Home | History | Annotate | Download | only in mntfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/file.h>
     27 #include <sys/stat.h>
     28 #include <sys/atomic.h>
     29 #include <sys/mntio.h>
     30 #include <sys/mnttab.h>
     31 #include <sys/mount.h>
     32 #include <sys/sunddi.h>
     33 #include <sys/sysmacros.h>
     34 #include <sys/systm.h>
     35 #include <sys/vfs.h>
     36 #include <sys/vfs_opreg.h>
     37 #include <sys/fs/mntdata.h>
     38 #include <fs/fs_subr.h>
     39 #include <sys/vmsystm.h>
     40 #include <vm/seg_vn.h>
     41 #include <sys/time.h>
     42 #include <sys/ksynch.h>
     43 #include <sys/sdt.h>
     44 
/*
 * NOTE(review): presumably the inode number reported for the root node of a
 * mntfs file system; its use is not visible in this part of the file.
 */
#define	MNTROOTINO	2

/*
 * Forward declaration. mntopen() calls this to obtain a fresh mntnode_t for
 * each open of the file.
 */
static mntnode_t *mntgetnode(vnode_t *);

/*
 * NOTE(review): presumably the vnode operations vector for mntfs; it is only
 * declared here and is initialised outside this part of the file.
 */
vnodeops_t *mntvnodeops;
/* Defined outside this file. */
extern void vfs_mnttab_readop(void);
     51 
     52 /*
     53  * Design of kernel mnttab accounting.
     54  *
     55  * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
     56  * the mounted resources: the read-only file /etc/mnttab, and a collection of
     57  * ioctl() commands. Most of these interfaces are public and are described in
     58  * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
     59  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
     60  * family of functions, allowing them to support white space in mount names.
     61  *
     62  * A significant feature of mntfs is that it provides a file descriptor with a
     63  * snapshot once it begins to consume mnttab data. Thus, as the process
     64  * continues to consume data, its view of the in-kernel mnttab does not change
     65  * even if resources are mounted or unmounted. The intent is to ensure that
     66  * processes are guaranteed to read self-consistent data even as the system
     67  * changes.
     68  *
     69  * The snapshot is implemented by a "database", unique to each zone, that
     70  * comprises a linked list of mntelem_ts. The database is identified by
     71  * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
     72  * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
     73  * marked with its time of "birth", i.e. creation. An element is "killed", and
     74  * marked with its time of death, when it is found to be out of date, e.g. when
     75  * the corresponding resource has been unmounted.
     76  *
     77  * When a process performs the first read() or ioctl() for a file descriptor for
     78  * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
     79  * that an element exists for each currently mounted resource. Following this,
     80  * the current time is written into a snapshot structure, a mntsnap_t, embedded
     81  * in the descriptor's mntnode_t.
     82  *
     83  * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
     84  * particular file descriptor by searching the database for entries that were
     85  * born before the appropriate snapshot and that either are still alive or died
     86  * after the snapshot was created. Consumers use the iterator function
     87  * mntfs_get_next_elem() to identify the next suitable element in the database.
     88  *
     89  * Each snapshot has a hold on its corresponding database elements, effected by
     90  * a per-element reference count. At last close(), a snapshot is destroyed in
     91  * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
     92  * its reference count becomes zero. Therefore the database never exists unless
     93  * there is at least one active consumer of /etc/mnttab.
     94  *
     95  * getmntent(3C) et al. "do not open, close or rewind the file." This implies
     96  * that getmntent() and read() must be able to operate without interaction on
     97  * the same file descriptor; this is accomplished by the use of separate
     98  * mntsnap_ts for both read() and ioctl().
     99  *
    100  * NOTE: The following variable enables the generation of the "dev=xxx"
    101  * in the option string for a mounted file system.  Really this should
    102  * be gotten rid of altogether, but for the sake of backwards compatibility
    103  * we had to leave it in.  It is defined as a 32-bit device number.  This
    104  * means that when 64-bit device numbers are in use, if either the major or
    105  * minor part of the device number will not fit in a 16 bit quantity, the
    106  * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
    107  * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
    108  * device number handles this check and assigns the proper value.
    109  */
/*
 * When non-zero, mntfs_optsize() and mntfs_optprint() append "dev=<hex>" to
 * the option string of every mounted resource.
 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */

/*
 * Defined outside this file. Used here to time-stamp element births, element
 * deaths and snapshots.
 */
extern void vfs_mono_time(timespec_t *);
/*
 * Results of mntfs_newest(): which of its two timespec arguments is the newer,
 * or MNTFS_NEITHER if they are equal.
 */
enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };
    114 
/*
 * Determine whether a field within a line from /etc/mnttab contains actual
 * content or simply the marker string "-". This never applies to the time,
 * therefore the delimiter must be a tab.
 *
 * The macro evaluates to non-zero unless the field is exactly "-" followed by
 * a tab. Note that the argument x is evaluated twice and that the byte after
 * *x may be read, so x must not have side effects and must point to at least
 * two readable bytes.
 */
#define	MNTFS_REAL_FIELD(x)	(*(x) != '-' || *((x) + 1) != '\t')
    121 
    122 static int
    123 mntfs_devsize(struct vfs *vfsp)
    124 {
    125 	dev32_t odev;
    126 
    127 	(void) cmpldev(&odev, vfsp->vfs_dev);
    128 	return (snprintf(NULL, 0, "dev=%x", odev));
    129 }
    130 
    131 static int
    132 mntfs_devprint(struct vfs *vfsp, char *buf)
    133 {
    134 	dev32_t odev;
    135 
    136 	(void) cmpldev(&odev, vfsp->vfs_dev);
    137 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
    138 }
    139 
    140 /* Identify which, if either, of two supplied timespec structs is newer. */
    141 static int
    142 mntfs_newest(timespec_t *a, timespec_t *b)
    143 {
    144 	if (a->tv_sec == b->tv_sec &&
    145 	    a->tv_nsec == b->tv_nsec) {
    146 		return (MNTFS_NEITHER);
    147 	} else if (b->tv_sec > a->tv_sec ||
    148 	    (b->tv_sec == a->tv_sec &&
    149 	    b->tv_nsec > a->tv_nsec)) {
    150 		return (MNTFS_SECOND);
    151 	} else {
    152 		return (MNTFS_FIRST);
    153 	}
    154 }
    155 
    156 static int
    157 mntfs_optsize(struct vfs *vfsp)
    158 {
    159 	int i, size = 0;
    160 	mntopt_t *mop;
    161 
    162 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
    163 		mop = &vfsp->vfs_mntopts.mo_list[i];
    164 		if (mop->mo_flags & MO_NODISPLAY)
    165 			continue;
    166 		if (mop->mo_flags & MO_SET) {
    167 			if (size)
    168 				size++; /* space for comma */
    169 			size += strlen(mop->mo_name);
    170 			/*
    171 			 * count option value if there is one
    172 			 */
    173 			if (mop->mo_arg != NULL) {
    174 				size += strlen(mop->mo_arg) + 1;
    175 			}
    176 		}
    177 	}
    178 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
    179 		/*
    180 		 * Add space for "zone=<zone_name>" if required.
    181 		 */
    182 		if (size)
    183 			size++;	/* space for comma */
    184 		size += sizeof ("zone=") - 1;
    185 		size += strlen(vfsp->vfs_zone->zone_name);
    186 	}
    187 	if (mntfs_enabledev) {
    188 		if (size != 0)
    189 			size++; /* space for comma */
    190 		size += mntfs_devsize(vfsp);
    191 	}
    192 	if (size == 0)
    193 		size = strlen("-");
    194 	return (size);
    195 }
    196 
    197 static int
    198 mntfs_optprint(struct vfs *vfsp, char *buf)
    199 {
    200 	int i, optinbuf = 0;
    201 	mntopt_t *mop;
    202 	char *origbuf = buf;
    203 
    204 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
    205 		mop = &vfsp->vfs_mntopts.mo_list[i];
    206 		if (mop->mo_flags & MO_NODISPLAY)
    207 			continue;
    208 		if (mop->mo_flags & MO_SET) {
    209 			if (optinbuf)
    210 				*buf++ = ',';
    211 			else
    212 				optinbuf = 1;
    213 			buf += snprintf(buf, MAX_MNTOPT_STR,
    214 			    "%s", mop->mo_name);
    215 			/*
    216 			 * print option value if there is one
    217 			 */
    218 			if (mop->mo_arg != NULL) {
    219 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
    220 				    mop->mo_arg);
    221 			}
    222 		}
    223 	}
    224 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
    225 		if (optinbuf)
    226 			*buf++ = ',';
    227 		else
    228 			optinbuf = 1;
    229 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
    230 		    vfsp->vfs_zone->zone_name);
    231 	}
    232 	if (mntfs_enabledev) {
    233 		if (optinbuf++)
    234 			*buf++ = ',';
    235 		buf += mntfs_devprint(vfsp, buf);
    236 	}
    237 	if (!optinbuf) {
    238 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
    239 	}
    240 	return (buf - origbuf);
    241 }
    242 
    243 void
    244 mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
    245 {
    246 	struct extmnttab *tabp = &elemp->mnte_tab;
    247 	const char *resource, *mntpt;
    248 	char *cp = elemp->mnte_text;
    249 	mntpt = refstr_value(vfsp->vfs_mntpt);
    250 	resource = refstr_value(vfsp->vfs_resource);
    251 
    252 	tabp->mnt_special = 0;
    253 	if (resource != NULL && resource[0] != '\0') {
    254 		if (resource[0] != '/') {
    255 			cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
    256 		} else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
    257 			/*
    258 			 * Use the mount point as the resource.
    259 			 */
    260 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
    261 			    ZONE_PATH_TRANSLATE(mntpt, zonep));
    262 		} else {
    263 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
    264 			    ZONE_PATH_TRANSLATE(resource, zonep));
    265 		}
    266 	} else {
    267 		cp += snprintf(cp, MAXPATHLEN, "-\t");
    268 	}
    269 
    270 	tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
    271 	if (mntpt != NULL && mntpt[0] != '\0') {
    272 		/*
    273 		 * We know the mount point is visible from within the zone,
    274 		 * otherwise it wouldn't be on the zone's vfs list.
    275 		 */
    276 		cp += snprintf(cp, MAXPATHLEN, "%s\t",
    277 		    ZONE_PATH_TRANSLATE(mntpt, zonep));
    278 	} else {
    279 		cp += snprintf(cp, MAXPATHLEN, "-\t");
    280 	}
    281 
    282 	tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
    283 	cp += snprintf(cp, MAXPATHLEN, "%s\t",
    284 	    vfssw[vfsp->vfs_fstype].vsw_name);
    285 
    286 	tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
    287 	cp += mntfs_optprint(vfsp, cp);
    288 	*cp++ = '\t';
    289 
    290 	tabp->mnt_time = (char *)(cp - elemp->mnte_text);
    291 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
    292 	*cp++ = '\n'; /* over-write snprintf's trailing null-byte */
    293 
    294 	tabp->mnt_major = getmajor(vfsp->vfs_dev);
    295 	tabp->mnt_minor = getminor(vfsp->vfs_dev);
    296 
    297 	elemp->mnte_text_size = cp - elemp->mnte_text;
    298 	elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
    299 	elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
    300 }
    301 
    302 /* Determine the length of the /etc/mnttab entry for this vfs_t. */
    303 static size_t
    304 mntfs_text_len(vfs_t *vfsp, zone_t *zone)
    305 {
    306 	size_t size = 0;
    307 	const char *resource, *mntpt;
    308 	size_t mntsize;
    309 
    310 	mntpt = refstr_value(vfsp->vfs_mntpt);
    311 	if (mntpt != NULL && mntpt[0] != '\0') {
    312 		mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
    313 	} else {
    314 		mntsize = 2;	/* "-\t" */
    315 	}
    316 	size += mntsize;
    317 
    318 	resource = refstr_value(vfsp->vfs_resource);
    319 	if (resource != NULL && resource[0] != '\0') {
    320 		if (resource[0] != '/') {
    321 			size += strlen(resource) + 1;
    322 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
    323 			/*
    324 			 * Same as the zone's view of the mount point.
    325 			 */
    326 			size += mntsize;
    327 		} else {
    328 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
    329 		}
    330 	} else {
    331 		size += 2;	/* "-\t" */
    332 	}
    333 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
    334 	size += mntfs_optsize(vfsp);
    335 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
    336 	return (size);
    337 }
    338 
    339 /* Destroy the resources associated with a snapshot element. */
    340 static void
    341 mntfs_destroy_elem(mntelem_t *elemp)
    342 {
    343 	kmem_free(elemp->mnte_text, elemp->mnte_text_size);
    344 	kmem_free(elemp, sizeof (mntelem_t));
    345 }
    346 
    347 /*
    348  * Return 1 if the given snapshot is in the range of the given element; return
    349  * 0 otherwise.
    350  */
    351 static int
    352 mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
    353 {
    354 	timespec_t	*stimep = &snapp->mnts_time;
    355 	timespec_t	*btimep = &elemp->mnte_birth;
    356 	timespec_t	*dtimep = &elemp->mnte_death;
    357 
    358 	/*
    359 	 * If a snapshot is in range of an element then the snapshot must have
    360 	 * been created after the birth of the element, and either the element
    361 	 * is still alive or it died after the snapshot was created.
    362 	 */
    363 	if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
    364 	    (MNTFS_ELEM_IS_ALIVE(elemp) ||
    365 	    mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
    366 		return (1);
    367 	else
    368 		return (0);
    369 }
    370 
    371 /*
    372  * Return the next valid database element, after the one provided, for a given
    373  * snapshot; return NULL if none exists. The caller must hold the zone's
    374  * database lock as a reader before calling this function.
    375  */
    376 static mntelem_t *
    377 mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
    378 {
    379 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
    380 
    381 	do {
    382 		elemp = elemp->mnte_next;
    383 	} while (elemp &&
    384 	    (!mntfs_elem_in_range(snapp, elemp) ||
    385 	    (!show_hidden && elemp->mnte_hidden)));
    386 	return (elemp);
    387 }
    388 
    389 /*
    390  * This function frees the resources associated with a mntsnap_t. It walks
    391  * through the database, decrementing the reference count of any element that
    392  * satisfies the snapshot. If the reference count of an element becomes zero
    393  * then it is removed from the database.
    394  */
    395 static void
    396 mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
    397 {
    398 	zone_t *zonep = MTOD(mnp)->mnt_zone;
    399 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
    400 	mntelem_t **elempp = &zonep->zone_mntfs_db;
    401 	mntelem_t *elemp;
    402 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
    403 	size_t number_decremented = 0;
    404 
    405 	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
    406 
    407 	/* Ignore an uninitialised snapshot. */
    408 	if (snapp->mnts_nmnts == 0)
    409 		return;
    410 
    411 	/* Drop the holds on any matching database elements. */
    412 	rw_enter(dblockp, RW_WRITER);
    413 	while ((elemp = *elempp) != NULL) {
    414 		if (mntfs_elem_in_range(snapp, elemp) &&
    415 		    (!elemp->mnte_hidden || show_hidden) &&
    416 		    ++number_decremented && --elemp->mnte_refcnt == 0) {
    417 			if ((*elempp = elemp->mnte_next) != NULL)
    418 				(*elempp)->mnte_prev = elemp->mnte_prev;
    419 			mntfs_destroy_elem(elemp);
    420 		} else {
    421 			elempp = &elemp->mnte_next;
    422 		}
    423 	}
    424 	rw_exit(dblockp);
    425 	ASSERT(number_decremented == snapp->mnts_nmnts);
    426 
    427 	/* Clear the snapshot data. */
    428 	bzero(snapp, sizeof (mntsnap_t));
    429 }
    430 
    431 /* Insert the new database element newp after the existing element prevp. */
    432 static void
    433 mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
    434 {
    435 	newp->mnte_prev = prevp;
    436 	newp->mnte_next = prevp->mnte_next;
    437 	prevp->mnte_next = newp;
    438 	if (newp->mnte_next != NULL)
    439 		newp->mnte_next->mnte_prev = newp;
    440 }
    441 
    442 /* Create and return a copy of a given database element. */
    443 static mntelem_t *
    444 mntfs_copy(mntelem_t *origp)
    445 {
    446 	mntelem_t *copyp;
    447 
    448 	copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
    449 	copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
    450 	copyp->mnte_text_size = origp->mnte_text_size;
    451 	copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
    452 	bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
    453 	copyp->mnte_tab = origp->mnte_tab;
    454 	copyp->mnte_hidden = origp->mnte_hidden;
    455 
    456 	return (copyp);
    457 }
    458 
    459 /*
    460  * Compare two database elements and determine whether or not the vfs_t payload
    461  * data of each are the same. Return 1 if so and 0 otherwise.
    462  */
    463 static int
    464 mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
    465 {
    466 	if (a->mnte_hidden == b->mnte_hidden &&
    467 	    a->mnte_text_size == b->mnte_text_size &&
    468 	    bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
    469 	    bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
    470 		return (1);
    471 	else
    472 		return (0);
    473 }
    474 
/*
 * mntfs_snapshot() updates the database, creating it if necessary, so that it
 * accurately reflects the state of the in-kernel mnttab. It also increments
 * the reference count on all database elements that correspond to currently-
 * mounted resources. Finally, it initialises the appropriate snapshot
 * structure.
 *
 * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
 * when it is inserted into the in-kernel mnttab. This time stamp is copied into
 * the corresponding database element when it is created, allowing the element
 * and the vfs_t to be identified as a pair. It is possible that some file
 * systems may make unadvertised changes to, for example, a resource's mount
 * options. Therefore, in order to determine whether a database element is an
 * up-to-date representation of a given vfs_t, it is compared with a temporary
 * element generated for this purpose. Although less efficient, this is safer
 * than implementing an mtime for a vfs_t.
 *
 * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
 * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
 * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
 *
 * Locking: the caller must hold mnp->mnt_contents as a writer. This function
 * takes the vfs list read lock for its whole duration and, while walking the
 * vfs_ts, the zone's database lock as a writer; both are dropped before
 * returning.
 *
 * If snapp already describes a snapshot (mnts_nmnts != 0), the call is treated
 * as a rewind: either the existing snapshot is simply reset to its beginning,
 * or it is released with mntfs_freesnap() and rebuilt from scratch.
 */
static void
mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
{
	zone_t		*zonep = MTOD(mnp)->mnt_zone;
	int		is_global_zone = (zonep == global_zone);
	int		show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
	vfs_t		*vfsp, *firstvfsp, *lastvfsp;
	vfs_t		dummyvfs;
	vfs_t		*dummyvfsp = NULL;
	krwlock_t	*dblockp = &zonep->zone_mntfs_db_lock;
	mntelem_t	**headpp = &zonep->zone_mntfs_db;
	mntelem_t	*elemp;
	mntelem_t	*prevp = NULL;
	int		order;
	mntelem_t	*tempelemp;
	mntelem_t	*newp;
	mntelem_t	*firstp = NULL;
	size_t		nmnts = 0;
	size_t		text_size = 0;
	int		insert_before;
	timespec_t	last_mtime;
	size_t		entry_length, new_entry_length;


	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
	vfs_list_read_lock();
	vfs_mnttab_modtime(&last_mtime);

	/*
	 * If this snapshot already exists then we must have been asked to
	 * rewind the file, i.e. discard the snapshot and create a new one in
	 * its place. In this case we first see if the in-kernel mnttab has
	 * advertised a change; if not then we simply reinitialise the metadata.
	 */
	if (snapp->mnts_nmnts) {
		if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
		    MNTFS_NEITHER) {
			/*
			 * An unchanged mtime is no guarantee that the
			 * in-kernel mnttab is unchanged; for example, a
			 * concurrent remount may be between calls to
			 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
			 * It follows that the database may have changed, and
			 * in particular that some elements in this snapshot
			 * may have been killed by another call to
			 * mntfs_snapshot(). It is therefore not merely
			 * unnecessary to update the snapshot's time but in
			 * fact dangerous; it needs to be left alone.
			 */
			snapp->mnts_next = snapp->mnts_first;
			snapp->mnts_flags &= ~MNTS_REWIND;
			snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
			vfs_list_unlock();
			return;
		} else {
			/* Drop the old holds; a new snapshot is built below. */
			mntfs_freesnap(mnp, snapp);
		}
	}

	/*
	 * Create a temporary database element. For each vfs_t, the temporary
	 * element will be populated with the corresponding text. If the vfs_t
	 * does not have a corresponding element within the database, or if
	 * there is such an element but it is stale, a copy of the temporary
	 * element is inserted into the database at the appropriate location.
	 *
	 * The text buffer starts at MNT_LINE_MAX bytes and is reallocated
	 * below whenever an entry needs more; entry_length always records the
	 * buffer's current allocated size so that it can be freed correctly.
	 */
	tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
	entry_length = MNT_LINE_MAX;
	tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);

	/* Find the first and last vfs_t for the given zone. */
	if (is_global_zone) {
		firstvfsp = rootvfs;
		lastvfsp = firstvfsp->vfs_prev;
	} else {
		firstvfsp = zonep->zone_vfslist;
		/*
		 * If there isn't already a vfs_t for root then we create a
		 * dummy which will be used as the head of the list (which will
		 * therefore no longer be circular).
		 */
		if (firstvfsp == NULL ||
		    strcmp(refstr_value(firstvfsp->vfs_mntpt),
		    zonep->zone_rootpath) != 0) {
			/*
			 * The zone's vfs_ts will have mount points relative to
			 * the zone's root path. The vfs_t for the zone's
			 * root file system would therefore have a mount point
			 * equal to the zone's root path. Since the zone's root
			 * path isn't a mount point, we copy the vfs_t of the
			 * zone's root vnode, and provide it with a fake mount
			 * point and resource.
			 *
			 * Note that by cloning another vfs_t we also acquire
			 * its high-resolution ctime. This might appear to
			 * violate the requirement that the ctimes in the list
			 * of vfs_ts are unique and monotonically increasing;
			 * this is not the case. The dummy vfs_t appears in only
			 * a non-global zone's vfs_t list, where the cloned
			 * vfs_t would not ordinarily be visible; the ctimes are
			 * therefore unique. The zone's root path must be
			 * available before the zone boots, and so its root
			 * vnode's vfs_t's ctime must be lower than those of any
			 * resources subsequently mounted by the zone. The
			 * ctimes are therefore monotonically increasing.
			 */
			dummyvfs = *zonep->zone_rootvp->v_vfsp;
			/*
			 * The refstr allocated here is shared by the mount
			 * point and the resource; it is released once, via
			 * vfs_mntpt, during clean-up at the end of this
			 * function.
			 */
			dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
			dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
			dummyvfsp = &dummyvfs;
			if (firstvfsp == NULL) {
				/* The dummy is the only entry in the list. */
				lastvfsp = dummyvfsp;
			} else {
				lastvfsp = firstvfsp->vfs_zone_prev;
				dummyvfsp->vfs_zone_next = firstvfsp;
			}
			firstvfsp = dummyvfsp;
		} else {
			lastvfsp = firstvfsp->vfs_zone_prev;
		}
	}

	/*
	 * Now walk through all the vfs_ts for this zone. For each one, find the
	 * corresponding database element, creating it first if necessary, and
	 * increment its reference count.
	 *
	 * The vfs_t list and the database are walked in step: elemp and prevp
	 * persist across iterations of the outer loop, so the database is
	 * traversed only once. The outer loop has no termination condition of
	 * its own; it ends when the last vfs_t has been processed.
	 */
	rw_enter(dblockp, RW_WRITER);
	elemp = zonep->zone_mntfs_db;
	/* CSTYLED */
	for (vfsp = firstvfsp;;
	    vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
		DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
		/* Consider only visible entries. */
		if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
			/*
			 * Walk through the existing database looking for either
			 * an element that matches the current vfs_t, or for the
			 * correct place in which to insert a new element.
			 */
			insert_before = 0;
			for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
				DTRACE_PROBE1(considering__elem, mntelem_t *,
				    elemp);

				/* Compare the vfs_t with the element. */
				order = mntfs_newest(&elemp->mnte_vfs_ctime,
				    &vfsp->vfs_hrctime);

				/*
				 * If we encounter a database element newer than
				 * this vfs_t then we've stepped over a gap
				 * where the element for this vfs_t must be
				 * inserted.
				 */
				if (order == MNTFS_FIRST) {
					insert_before = 1;
					break;
				}

				/* Dead elements no longer interest us. */
				if (MNTFS_ELEM_IS_DEAD(elemp))
					continue;

				/*
				 * If the time stamps are the same then the
				 * element is a potential match for the vfs_t,
				 * although it may later prove to be stale.
				 */
				if (order == MNTFS_NEITHER)
					break;

				/*
				 * This element must be older than the vfs_t.
				 * It must, therefore, correspond to a vfs_t
				 * that has been unmounted. Since the element is
				 * still alive, we kill it if it is visible.
				 */
				if (!elemp->mnte_hidden || show_hidden)
					vfs_mono_time(&elemp->mnte_death);
			}
			DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
			    mntelem_t *, elemp);

			/*
			 * Create a new database element if required.
			 *
			 * First generate the up-to-date text for this vfs_t in
			 * the temporary element, growing its buffer if this
			 * entry is longer than any seen so far.
			 */
			new_entry_length = mntfs_text_len(vfsp, zonep);
			if (new_entry_length > entry_length) {
				kmem_free(tempelemp->mnte_text, entry_length);
				tempelemp->mnte_text =
				    kmem_alloc(new_entry_length, KM_SLEEP);
				entry_length = new_entry_length;
			}
			mntfs_populate_text(vfsp, zonep, tempelemp);
			ASSERT(tempelemp->mnte_text_size == new_entry_length);
			if (elemp == NULL) {
				/*
				 * We ran off the end of the database. Insert a
				 * new element at the end.
				 */
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				if (prevp) {
					mntfs_insert_after(newp, prevp);
				} else {
					/* The database was empty. */
					newp->mnte_next = NULL;
					newp->mnte_prev = NULL;
					ASSERT(*headpp == NULL);
					*headpp = newp;
				}
				elemp = newp;
			} else if (insert_before) {
				/*
				 * Insert a new element before the current one.
				 */
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				if (prevp) {
					mntfs_insert_after(newp, prevp);
				} else {
					/* The new element becomes the head. */
					newp->mnte_next = elemp;
					newp->mnte_prev = NULL;
					elemp->mnte_prev = newp;
					ASSERT(*headpp == elemp);
					*headpp = newp;
				}
				elemp = newp;
			} else if (!mntfs_is_same_element(elemp, tempelemp)) {
				/*
				 * The element corresponds to the vfs_t, but the
				 * vfs_t has changed; it must have been
				 * remounted. Kill the old element and insert a
				 * new one after it.
				 */
				vfs_mono_time(&elemp->mnte_death);
				newp = mntfs_copy(tempelemp);
				vfs_mono_time(&newp->mnte_birth);
				mntfs_insert_after(newp, elemp);
				elemp = newp;
			}

			/* We've found the corresponding element. Hold it. */
			DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
			elemp->mnte_refcnt++;

			/*
			 * Update the parameters used to initialise the
			 * snapshot.
			 */
			nmnts++;
			text_size += elemp->mnte_text_size;
			if (!firstp)
				firstp = elemp;

			/* Resume the database walk after the held element. */
			prevp = elemp;
			elemp = elemp->mnte_next;
		}

		if (vfsp == lastvfsp)
			break;
	}

	/*
	 * Any remaining visible database elements that are still alive must be
	 * killed now, because their corresponding vfs_ts must have been
	 * unmounted.
	 */
	for (; elemp; elemp = elemp->mnte_next) {
		if (MNTFS_ELEM_IS_ALIVE(elemp) &&
		    (!elemp->mnte_hidden || show_hidden))
			vfs_mono_time(&elemp->mnte_death);
	}

	/*
	 * Initialise the snapshot. Its time is taken after every birth and
	 * death recorded above. Note that the total text size is also stored
	 * in the per-mount mnt_size.
	 */
	vfs_mono_time(&snapp->mnts_time);
	snapp->mnts_last_mtime = last_mtime;
	snapp->mnts_first = snapp->mnts_next = firstp;
	snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
	snapp->mnts_nmnts = nmnts;
	snapp->mnts_text_size = MTOD(mnp)->mnt_size = text_size;
	snapp->mnts_foffset = snapp->mnts_ieoffset = 0;

	/* Clean up. */
	rw_exit(dblockp);
	vfs_list_unlock();
	if (dummyvfsp != NULL)
		refstr_rele(dummyvfsp->vfs_mntpt);
	kmem_free(tempelemp->mnte_text, entry_length);
	kmem_free(tempelemp, sizeof (mntelem_t));
}
    785 
    786 /*
    787  * Public function to convert vfs_mntopts into a string.
    788  * A buffer of sufficient size is allocated, which is returned via bufp,
    789  * and whose length is returned via lenp.
    790  */
    791 void
    792 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
    793 {
    794 	size_t len;
    795 	char *buf;
    796 
    797 	vfs_list_read_lock();
    798 
    799 	len = mntfs_optsize(vfsp) + 1;
    800 	buf = kmem_alloc(len, KM_NOSLEEP);
    801 	if (buf == NULL) {
    802 		*bufp = NULL;
    803 		vfs_list_unlock();
    804 		return;
    805 	}
    806 	buf[len - 1] = '\0';
    807 	(void) mntfs_optprint(vfsp, buf);
    808 	ASSERT(buf[len - 1] == '\0');
    809 
    810 	vfs_list_unlock();
    811 	*bufp = buf;
    812 	*lenp = len;
    813 }
    814 
    815 /* ARGSUSED */
    816 static int
    817 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    818 {
    819 	vnode_t *vp = *vpp;
    820 	mntnode_t *nmnp;
    821 
    822 	/*
    823 	 * Not allowed to open for writing, return error.
    824 	 */
    825 	if (flag & FWRITE)
    826 		return (EPERM);
    827 	/*
    828 	 * Create a new mnt/vnode for each open, this will give us a handle to
    829 	 * hang the snapshot on.
    830 	 */
    831 	nmnp = mntgetnode(vp);
    832 
    833 	*vpp = MTOV(nmnp);
    834 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
    835 	VN_RELE(vp);
    836 	return (0);
    837 }
    838 
    839 /* ARGSUSED */
    840 static int
    841 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    842 	caller_context_t *ct)
    843 {
    844 	mntnode_t *mnp = VTOM(vp);
    845 
    846 	/* Clean up any locks or shares held by the current process */
    847 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    848 	cleanshares(vp, ttoproc(curthread)->p_pid);
    849 
    850 	if (count > 1)
    851 		return (0);
    852 	if (vp->v_count == 1) {
    853 		rw_enter(&mnp->mnt_contents, RW_WRITER);
    854 		mntfs_freesnap(mnp, &mnp->mnt_read);
    855 		mntfs_freesnap(mnp, &mnp->mnt_ioctl);
    856 		rw_exit(&mnp->mnt_contents);
    857 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
    858 	}
    859 	return (0);
    860 }
    861 
    862 /* ARGSUSED */
    863 static int
    864 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
    865 {
    866 	mntnode_t *mnp = VTOM(vp);
    867 	zone_t *zonep = MTOD(mnp)->mnt_zone;
    868 	mntsnap_t *snapp = &mnp->mnt_read;
    869 	off_t off = uio->uio_offset;
    870 	size_t len = uio->uio_resid;
    871 	char *bufferp;
    872 	size_t available, copylen;
    873 	size_t written = 0;
    874 	mntelem_t *elemp;
    875 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
    876 	int error = 0;
    877 	off_t	ieoffset;
    878 
    879 	rw_enter(&mnp->mnt_contents, RW_WRITER);
    880 	if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
    881 		mntfs_snapshot(mnp, snapp);
    882 
    883 	if ((size_t)(off + len) > snapp->mnts_text_size)
    884 		len = snapp->mnts_text_size - off;
    885 
    886 	if (off < 0 || len > snapp->mnts_text_size) {
    887 		rw_exit(&mnp->mnt_contents);
    888 		return (EFAULT);
    889 	}
    890 
    891 	if (len == 0) {
    892 		rw_exit(&mnp->mnt_contents);
    893 		return (0);
    894 	}
    895 
    896 	/*
    897 	 * For the file offset provided, locate the corresponding database
    898 	 * element and calculate the corresponding offset within its text. If
    899 	 * the file offset is the same as that reached during the last read(2)
    900 	 * then use the saved element and intra-element offset.
    901 	 */
    902 	rw_enter(dblockp, RW_READER);
    903 	if (off == 0 || (off == snapp->mnts_foffset)) {
    904 		elemp = snapp->mnts_next;
    905 		ieoffset = snapp->mnts_ieoffset;
    906 	} else {
    907 		off_t total_off;
    908 		/*
    909 		 * Find the element corresponding to the requested file offset
    910 		 * by walking through the database and summing the text sizes
    911 		 * of the individual elements. If the requested file offset is
    912 		 * greater than that reached on the last visit then we can start
    913 		 * at the last seen element; otherwise, we have to start at the
    914 		 * beginning.
    915 		 */
    916 		if (off > snapp->mnts_foffset) {
    917 			elemp = snapp->mnts_next;
    918 			total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
    919 		} else {
    920 			elemp = snapp->mnts_first;
    921 			total_off = 0;
    922 		}
    923 		while (off > total_off + elemp->mnte_text_size) {
    924 			total_off += elemp->mnte_text_size;
    925 			elemp = mntfs_get_next_elem(snapp, elemp);
    926 			ASSERT(elemp != NULL);
    927 		}
    928 		/* Calculate the intra-element offset. */
    929 		if (off > total_off)
    930 			ieoffset = off - total_off;
    931 		else
    932 			ieoffset = 0;
    933 	}
    934 
    935 	/*
    936 	 * Create a buffer and populate it with the text from successive
    937 	 * database elements until it is full.
    938 	 */
    939 	bufferp = kmem_alloc(len, KM_SLEEP);
    940 	while (written < len) {
    941 		available = elemp->mnte_text_size - ieoffset;
    942 		copylen = MIN(len - written, available);
    943 		bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
    944 		written += copylen;
    945 		if (copylen == available) {
    946 			elemp = mntfs_get_next_elem(snapp, elemp);
    947 			ASSERT(elemp != NULL || written == len);
    948 			ieoffset = 0;
    949 		} else {
    950 			ieoffset += copylen;
    951 		}
    952 	}
    953 	rw_exit(dblockp);
    954 
    955 	/*
    956 	 * Write the populated buffer, update the snapshot's state if
    957 	 * successful and then advertise our read.
    958 	 */
    959 	error = uiomove(bufferp, len, UIO_READ, uio);
    960 	if (error == 0) {
    961 		snapp->mnts_next = elemp;
    962 		snapp->mnts_foffset = off + len;
    963 		snapp->mnts_ieoffset = ieoffset;
    964 	}
    965 	vfs_mnttab_readop();
    966 	rw_exit(&mnp->mnt_contents);
    967 
    968 	/* Clean up. */
    969 	kmem_free(bufferp, len);
    970 	return (error);
    971 }
    972 
    973 static int
    974 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    975 	caller_context_t *ct)
    976 {
    977 	mntnode_t *mnp = VTOM(vp);
    978 	int error;
    979 	vnode_t *rvp;
    980 	extern timespec_t vfs_mnttab_ctime;
    981 	mntdata_t *mntdata = MTOD(VTOM(vp));
    982 	mntsnap_t *snap;
    983 
    984 	rw_enter(&mnp->mnt_contents, RW_READER);
    985 	snap = mnp->mnt_read.mnts_nmnts ? &mnp->mnt_read : &mnp->mnt_ioctl;
    986 	/*
    987 	 * Return all the attributes.  Should be refined
    988 	 * so that it returns only those asked for.
    989 	 * Most of this is complete fakery anyway.
    990 	 */
    991 	rvp = mnp->mnt_mountvp;
    992 	/*
    993 	 * Attributes are same as underlying file with modifications
    994 	 */
    995 	if (error = VOP_GETATTR(rvp, vap, flags, cr, ct)) {
    996 		rw_exit(&mnp->mnt_contents);
    997 		return (error);
    998 	}
    999 
   1000 	/*
   1001 	 * We always look like a regular file
   1002 	 */
   1003 	vap->va_type = VREG;
   1004 	/*
   1005 	 * mode should basically be read only
   1006 	 */
   1007 	vap->va_mode &= 07444;
   1008 	vap->va_fsid = vp->v_vfsp->vfs_dev;
   1009 	vap->va_blksize = DEV_BSIZE;
   1010 	vap->va_rdev = 0;
   1011 	vap->va_seq = 0;
   1012 	/*
   1013 	 * Set nlink to the number of open vnodes for mnttab info
   1014 	 * plus one for existing.
   1015 	 */
   1016 	vap->va_nlink = mntdata->mnt_nopen + 1;
   1017 	/*
   1018 	 * If we haven't taken a snapshot yet, set the
   1019 	 * size to the size of the latest snapshot.
   1020 	 */
   1021 	vap->va_size = snap->mnts_text_size ? snap->mnts_text_size :
   1022 	    mntdata->mnt_size;
   1023 	rw_exit(&mnp->mnt_contents);
   1024 	/*
   1025 	 * Fetch mtime from the vfs mnttab timestamp
   1026 	 */
   1027 	vap->va_ctime = vfs_mnttab_ctime;
   1028 	vfs_list_read_lock();
   1029 	vfs_mnttab_modtime(&vap->va_mtime);
   1030 	vap->va_atime = vap->va_mtime;
   1031 	vfs_list_unlock();
   1032 	/*
   1033 	 * Nodeid is always ROOTINO;
   1034 	 */
   1035 	vap->va_nodeid = (ino64_t)MNTROOTINO;
   1036 	vap->va_nblocks = btod(vap->va_size);
   1037 	return (0);
   1038 }
   1039 
   1040 
   1041 static int
   1042 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
   1043 	caller_context_t *ct)
   1044 {
   1045 	mntnode_t *mnp = VTOM(vp);
   1046 
   1047 	if (mode & (VWRITE|VEXEC))
   1048 		return (EROFS);
   1049 
   1050 	/*
   1051 	 * Do access check on the underlying directory vnode.
   1052 	 */
   1053 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
   1054 }
   1055 
   1056 
   1057 /*
   1058  * New /mntfs vnode required; allocate it and fill in most of the fields.
   1059  */
   1060 static mntnode_t *
   1061 mntgetnode(vnode_t *dp)
   1062 {
   1063 	mntnode_t *mnp;
   1064 	vnode_t *vp;
   1065 
   1066 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
   1067 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
   1068 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
   1069 	rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
   1070 	vp = MTOV(mnp);
   1071 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
   1072 	vn_setops(vp, mntvnodeops);
   1073 	vp->v_vfsp = dp->v_vfsp;
   1074 	vp->v_type = VREG;
   1075 	vp->v_data = (caddr_t)mnp;
   1076 
   1077 	return (mnp);
   1078 }
   1079 
   1080 /*
   1081  * Free the storage obtained from mntgetnode().
   1082  */
   1083 static void
   1084 mntfreenode(mntnode_t *mnp)
   1085 {
   1086 	vnode_t *vp = MTOV(mnp);
   1087 
   1088 	rw_destroy(&mnp->mnt_contents);
   1089 	vn_invalid(vp);
   1090 	vn_free(vp);
   1091 	kmem_free(mnp, sizeof (*mnp));
   1092 }
   1093 
   1094 
   1095 /* ARGSUSED */
   1096 static int
   1097 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
   1098 {
   1099 	return (0);
   1100 }
   1101 
   1102 /* ARGSUSED */
   1103 static void
   1104 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   1105 {
   1106 	mntnode_t *mnp = VTOM(vp);
   1107 
   1108 	mntfreenode(mnp);
   1109 }
   1110 
   1111 /*
   1112  * lseek(2) is supported only to rewind the file. Rewinding has a special
   1113  * meaning for /etc/mnttab: it forces mntfs to refresh the snapshot at the next
   1114  * read() or ioctl().
   1115  *
   1116  * The generic lseek() code will have already changed the file offset. Therefore
   1117  * mntread() can detect a rewind simply by looking for a zero offset. For the
   1118  * benefit of mntioctl() we advertise a rewind with a specific flag.
   1119  */
   1120 /* ARGSUSED */
   1121 static int
   1122 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
   1123 {
   1124 	mntnode_t *mnp = VTOM(vp);
   1125 
   1126 	if (*noffp == 0) {
   1127 		rw_enter(&mnp->mnt_contents, RW_WRITER);
   1128 		mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
   1129 		rw_exit(&mnp->mnt_contents);
   1130 	}
   1131 
   1132 	return (0);
   1133 }
   1134 
   1135 /*
   1136  * Return the answer requested to poll().
   1137  * POLLRDBAND will return when the mtime of the mnttab
   1138  * information is newer than the latest one read for this open.
   1139  */
   1140 /* ARGSUSED */
   1141 static int
   1142 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
   1143 	caller_context_t *ct)
   1144 {
   1145 	mntnode_t *mnp = VTOM(vp);
   1146 	mntsnap_t *snapp;
   1147 
   1148 	rw_enter(&mnp->mnt_contents, RW_READER);
   1149 	if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
   1150 	    &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
   1151 		snapp = &mnp->mnt_ioctl;
   1152 	else
   1153 		snapp = &mnp->mnt_read;
   1154 
   1155 	*revp = 0;
   1156 	*phpp = (pollhead_t *)NULL;
   1157 	if (ev & POLLIN)
   1158 		*revp |= POLLIN;
   1159 
   1160 	if (ev & POLLRDNORM)
   1161 		*revp |= POLLRDNORM;
   1162 
   1163 	if (ev & POLLRDBAND) {
   1164 		vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
   1165 		if (*phpp == (pollhead_t *)NULL)
   1166 			*revp |= POLLRDBAND;
   1167 	}
   1168 	rw_exit(&mnp->mnt_contents);
   1169 
   1170 	if (*revp || *phpp != NULL || any) {
   1171 		return (0);
   1172 	}
   1173 	/*
   1174 	 * If someone is polling an unsupported poll events (e.g.
   1175 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
   1176 	 * That way we will ensure that we don't return a 0
   1177 	 * revents with a NULL pollhead pointer.
   1178 	 */
   1179 	*revp = POLLERR;
   1180 	return (0);
   1181 }
   1182 
   1183 /*
   1184  * mntfs_same_word() returns 1 if two words are the same in the context of
   1185  * MNTIOC_GETMNTANY and 0 otherwise.
   1186  *
   1187  * worda is a memory address that lies somewhere in the buffer bufa; it cannot
   1188  * be NULL since this is used to indicate to getmntany(3C) that the user does
   1189  * not wish to match a particular field. The text to which worda points is
   1190  * supplied by the user; if it is not null-terminated then it cannot match.
   1191  *
   1192  * Buffer bufb contains a line from /etc/mnttab, in which the fields are
   1193  * delimited by tab or new-line characters. offb is the offset of the second
   1194  * word within this buffer.
   1195  *
   1196  * mntfs_same_word() returns 1 if the words are the same and 0 otherwise.
   1197  */
   1198 int
   1199 mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
   1200     size_t sizeb)
   1201 {
   1202 	char *wordb = bufb + offb;
   1203 	int bytes_remaining;
   1204 
   1205 	ASSERT(worda != NULL);
   1206 
   1207 	bytes_remaining = MIN(((bufa + sizea) - worda),
   1208 	    ((bufb + sizeb) - wordb));
   1209 	while (bytes_remaining && *worda == *wordb) {
   1210 		worda++;
   1211 		wordb++;
   1212 		bytes_remaining--;
   1213 	}
   1214 	if (bytes_remaining &&
   1215 	    *worda == '\0' && (*wordb == '\t' || *wordb == '\n'))
   1216 		return (1);
   1217 	else
   1218 		return (0);
   1219 }
   1220 
   1221 /*
   1222  * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
   1223  * corresponds to a supplied path. If the path is a special device then the
   1224  * function optionally sets the major and minor numbers.
   1225  */
   1226 vtype_t
   1227 mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
   1228 {
   1229 	vattr_t vattr;
   1230 	vnode_t *vp;
   1231 	vtype_t type;
   1232 	int error;
   1233 
   1234 	if (path == NULL || *path != '/' ||
   1235 	    lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
   1236 		return (0);
   1237 
   1238 	vattr.va_mask = AT_TYPE | AT_RDEV;
   1239 	error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
   1240 	VN_RELE(vp);
   1241 
   1242 	if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
   1243 		if (major && minor) {
   1244 			*major = getmajor(vattr.va_rdev);
   1245 			*minor = getminor(vattr.va_rdev);
   1246 		}
   1247 		return (type);
   1248 	} else {
   1249 		return (0);
   1250 	}
   1251 }
   1252 
   1253 /*
   1254  * mntfs_special_info_element() extracts the name of the mounted resource
   1255  * for a given element and copies it into a null-terminated string, which it
   1256  * then passes to mntfs_special_info_string().
   1257  */
   1258 vtype_t
   1259 mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
   1260 {
   1261 	char *newpath;
   1262 	vtype_t type;
   1263 
   1264 	newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
   1265 	bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
   1266 	*(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
   1267 	type = mntfs_special_info_string(newpath, NULL, NULL, cr);
   1268 	kmem_free(newpath, elemp->mnte_text_size);
   1269 
   1270 	return (type);
   1271 }
   1272 
   1273 /*
   1274  * Convert an address that points to a byte within a user buffer into an
   1275  * address that points to the corresponding offset within a kernel buffer. If
   1276  * the user address is NULL then make no conversion. If the address does not
   1277  * lie within the buffer then reset it to NULL.
   1278  */
   1279 char *
   1280 mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
   1281 {
   1282 	if (uaddr < ubufp || uaddr >= ubufp + bufsize)
   1283 		return (NULL);
   1284 	else
   1285 		return (kbufp + (uaddr - ubufp));
   1286 }
   1287 
/*
 * These 32-bit versions are to support STRUCT_DECL(9F) etc. in
 * mntfs_copyout_elem() and mntioctl(). They are only compiled when
 * _SYSCALL32_IMPL is defined.
 */
#ifdef _SYSCALL32_IMPL
/*
 * 32-bit counterpart of struct extmnttab. The string members are carried as
 * 32-bit values rather than native pointers.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;
	uint32_t	mnt_mountp;
	uint32_t	mnt_fstype;
	uint32_t	mnt_mntopts;
	uint32_t	mnt_time;
	uint_t		mnt_major;
	uint_t		mnt_minor;
} extmnttab32_t;

/*
 * 32-bit counterpart of struct mnttab: the same leading members as
 * extmnttab32, without the major and minor numbers.
 */
typedef struct mnttab32 {
	uint32_t	mnt_special;
	uint32_t	mnt_mountp;
	uint32_t	mnt_fstype;
	uint32_t	mnt_mntopts;
	uint32_t	mnt_time;
} mnttab32_t;

/*
 * 32-bit counterpart of struct mntentbuf, the argument block from which
 * mntioctl() reads mbuf_emp, mbuf_bufsize and mbuf_buf.
 */
struct mntentbuf32 {
	uint32_t	mbuf_emp;
	uint_t		mbuf_bufsize;
	uint32_t	mbuf_buf;
};
#endif
   1317 
   1318 /*
   1319  * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
   1320  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the
   1321  * database element desired by the user, this function copies out the text and
   1322  * the pointers to the relevant userland addresses. It returns 0 on success
   1323  * and non-zero otherwise.
   1324  */
   1325 int
   1326 mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
   1327     char *ubufp, int cmd, int datamodel)
   1328 {
   1329 		STRUCT_DECL(extmnttab, ktab);
   1330 		char *dbbufp = elemp->mnte_text;
   1331 		size_t dbbufsize = elemp->mnte_text_size;
   1332 		struct extmnttab *dbtabp = &elemp->mnte_tab;
   1333 		size_t ssize;
   1334 		char *kbufp;
   1335 		int error = 0;
   1336 
   1337 
   1338 		/*
   1339 		 * We create a struct extmnttab within the kernel of the size
   1340 		 * determined by the user's data model. We then populate its
   1341 		 * fields by combining the start address of the text buffer
   1342 		 * supplied by the user, ubufp, with the offsets stored for
   1343 		 * this database element within dbtabp, a pointer to a struct
   1344 		 * extmnttab.
   1345 		 *
   1346 		 * Note that if the corresponding field is "-" this signifies
   1347 		 * no real content, and we set the address to NULL. This does
   1348 		 * not apply to mnt_time.
   1349 		 */
   1350 		STRUCT_INIT(ktab, datamodel);
   1351 		STRUCT_FSETP(ktab, mnt_special,
   1352 		    MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
   1353 		STRUCT_FSETP(ktab, mnt_mountp,
   1354 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
   1355 		    ubufp + (off_t)dbtabp->mnt_mountp : NULL);
   1356 		STRUCT_FSETP(ktab, mnt_fstype,
   1357 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
   1358 		    ubufp + (off_t)dbtabp->mnt_fstype : NULL);
   1359 		STRUCT_FSETP(ktab, mnt_mntopts,
   1360 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
   1361 		    ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
   1362 		STRUCT_FSETP(ktab, mnt_time,
   1363 		    ubufp + (off_t)dbtabp->mnt_time);
   1364 		if (cmd == MNTIOC_GETEXTMNTENT) {
   1365 			STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
   1366 			STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
   1367 			ssize = SIZEOF_STRUCT(extmnttab, datamodel);
   1368 		} else {
   1369 			ssize = SIZEOF_STRUCT(mnttab, datamodel);
   1370 		}
   1371 		if (copyout(STRUCT_BUF(ktab), uemp, ssize))
   1372 			return (EFAULT);
   1373 
   1374 		/*
   1375 		 * We create a text buffer in the kernel into which we copy the
   1376 		 * /etc/mnttab entry for this element. We change the tab and
   1377 		 * new-line delimiters to null bytes before copying out the
   1378 		 * buffer.
   1379 		 */
   1380 		kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
   1381 		bcopy(elemp->mnte_text, kbufp, dbbufsize);
   1382 		*(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
   1383 		    *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
   1384 		    *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
   1385 		    *(kbufp + (off_t)dbtabp->mnt_time - 1) =
   1386 		    *(kbufp + dbbufsize - 1) = '\0';
   1387 		if (copyout(kbufp, ubufp, dbbufsize))
   1388 			error = EFAULT;
   1389 
   1390 		kmem_free(kbufp, dbbufsize);
   1391 		return (error);
   1392 }
   1393 
   1394 /* ARGSUSED */
   1395 static int
   1396 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
   1397     int *rvalp, caller_context_t *ct)
   1398 {
   1399 	uint_t *up = (uint_t *)arg;
   1400 	mntnode_t *mnp = VTOM(vp);
   1401 	mntsnap_t *snapp = &mnp->mnt_ioctl;
   1402 	int error = 0;
   1403 	zone_t *zonep = MTOD(mnp)->mnt_zone;
   1404 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
   1405 	model_t datamodel = flag & DATAMODEL_MASK;
   1406 
   1407 	switch (cmd) {
   1408 
   1409 	case MNTIOC_NMNTS:  		/* get no. of mounted resources */
   1410 	{
   1411 		rw_enter(&mnp->mnt_contents, RW_READER);
   1412 		if (snapp->mnts_nmnts == 0 ||
   1413 		    (snapp->mnts_flags & MNTS_REWIND)) {
   1414 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
   1415 				rw_exit(&mnp->mnt_contents);
   1416 				rw_enter(&mnp->mnt_contents, RW_WRITER);
   1417 			}
   1418 			if (snapp->mnts_nmnts == 0 ||
   1419 			    (snapp->mnts_flags & MNTS_REWIND))
   1420 				mntfs_snapshot(mnp, snapp);
   1421 		}
   1422 		rw_exit(&mnp->mnt_contents);
   1423 
   1424 		if (suword32(up, snapp->mnts_nmnts) != 0)
   1425 			error = EFAULT;
   1426 		break;
   1427 	}
   1428 
   1429 	case MNTIOC_GETDEVLIST:  	/* get mounted device major/minor nos */
   1430 	{
   1431 		size_t len;
   1432 		uint_t *devlist;
   1433 		mntelem_t *elemp;
   1434 		int i = 0;
   1435 
   1436 		rw_enter(&mnp->mnt_contents, RW_READER);
   1437 		if (snapp->mnts_nmnts == 0 ||
   1438 		    (snapp->mnts_flags & MNTS_REWIND)) {
   1439 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
   1440 				rw_exit(&mnp->mnt_contents);
   1441 				rw_enter(&mnp->mnt_contents, RW_WRITER);
   1442 			}
   1443 			if (snapp->mnts_nmnts == 0 ||
   1444 			    (snapp->mnts_flags & MNTS_REWIND))
   1445 				mntfs_snapshot(mnp, snapp);
   1446 			rw_downgrade(&mnp->mnt_contents);
   1447 		}
   1448 
   1449 		/* Create a local buffer to hold the device numbers. */
   1450 		len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
   1451 		devlist = kmem_alloc(len, KM_SLEEP);
   1452 
   1453 		/*
   1454 		 * Walk the database elements for this snapshot and add their
   1455 		 * major and minor numbers.
   1456 		 */
   1457 		rw_enter(dblockp, RW_READER);
   1458 		for (elemp = snapp->mnts_first; elemp;
   1459 		    elemp = mntfs_get_next_elem(snapp, elemp)) {
   1460 				devlist[2 * i] = elemp->mnte_tab.mnt_major;
   1461 				devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
   1462 				i++;
   1463 		}
   1464 		rw_exit(dblockp);
   1465 		ASSERT(i == snapp->mnts_nmnts);
   1466 		rw_exit(&mnp->mnt_contents);
   1467 
   1468 		error = xcopyout(devlist, up, len);
   1469 		kmem_free(devlist, len);
   1470 		break;
   1471 	}
   1472 
   1473 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
   1474 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
   1475 	{
   1476 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
   1477 		STRUCT_DECL(mnttagdesc, tagdesc);
   1478 		char *cptr;
   1479 		uint32_t major, minor;
   1480 		char tagbuf[MAX_MNTOPT_TAG];
   1481 		char *pbuf;
   1482 		size_t len;
   1483 		uint_t start = 0;
   1484 		mntdata_t *mntdata = MTOD(mnp);
   1485 		zone_t *zone = mntdata->mnt_zone;
   1486 
   1487 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
   1488 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
   1489 			error = EFAULT;
   1490 			break;
   1491 		}
   1492 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
   1493 		if (zone != global_zone) {
   1494 			(void) strcpy(pbuf, zone->zone_rootpath);
   1495 			/* truncate "/" and nul */
   1496 			start = zone->zone_rootpathlen - 2;
   1497 			ASSERT(pbuf[start] == '/');
   1498 		}
   1499 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
   1500 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
   1501 		if (error) {
   1502 			kmem_free(pbuf, MAXPATHLEN);
   1503 			break;
   1504 		}
   1505 		if (start != 0 && pbuf[start] != '/') {
   1506 			kmem_free(pbuf, MAXPATHLEN);
   1507 			error = EINVAL;
   1508 			break;
   1509 		}
   1510 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
   1511 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
   1512 			kmem_free(pbuf, MAXPATHLEN);
   1513 			break;
   1514 		}
   1515 		major = STRUCT_FGET(tagdesc, mtd_major);
   1516 		minor = STRUCT_FGET(tagdesc, mtd_minor);
   1517 		if (cmd == MNTIOC_SETTAG)
   1518 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
   1519 		else
   1520 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
   1521 		kmem_free(pbuf, MAXPATHLEN);
   1522 		break;
   1523 	}
   1524 
   1525 	case MNTIOC_SHOWHIDDEN:
   1526 	{
   1527 		mutex_enter(&vp->v_lock);
   1528 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
   1529 		mutex_exit(&vp->v_lock);
   1530 		break;
   1531 	}
   1532 
   1533 	case MNTIOC_GETMNTANY:
   1534 	{
   1535 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
   1536 		STRUCT_DECL(extmnttab, ktab);	/* Out copy of user's emp */
   1537 		struct extmnttab *uemp;		/* uaddr of user's emp */
   1538 		char *ubufp;			/* uaddr of user's text buf */
   1539 		size_t ubufsize;		/* size of the above */
   1540 		struct extmnttab preftab;	/* our version of user's emp */
   1541 		char *prefbuf;			/* our copy of user's text */
   1542 		mntelem_t *elemp;		/* a database element */
   1543 		struct extmnttab *dbtabp;	/* element's extmnttab */
   1544 		char *dbbufp;			/* element's text buf */
   1545 		size_t dbbufsize;		/* size of the above */
   1546 		vtype_t type;			/* type, if any, of special */
   1547 
   1548 
   1549 		/*
   1550 		 * embuf is a struct embuf within the kernel. We copy into it
   1551 		 * the struct embuf supplied by the user.
   1552 		 */
   1553 		STRUCT_INIT(embuf, datamodel);
   1554 		if (copyin((void *) arg, STRUCT_BUF(embuf),
   1555 		    STRUCT_SIZE(embuf))) {
   1556 			error = EFAULT;
   1557 			break;
   1558 		}
   1559 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
   1560 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
   1561 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
   1562 
   1563 		/*
   1564 		 * Check that the text buffer offered by the user is the
   1565 		 * agreed size.
   1566 		 */
   1567 		if (ubufsize != MNT_LINE_MAX) {
   1568 			error = EINVAL;
   1569 			break;
   1570 		}
   1571 
   1572 		/* Copy the user-supplied entry into a local buffer. */
   1573 		prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
   1574 		if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
   1575 			kmem_free(prefbuf, MNT_LINE_MAX);
   1576 			error = EFAULT;
   1577 			break;
   1578 		}
   1579 
   1580 		/* Ensure that any string within it is null-terminated. */
   1581 		*(prefbuf + MNT_LINE_MAX - 1) = 0;
   1582 
   1583 		/* Copy in the user-supplied mpref */
   1584 		STRUCT_INIT(ktab, datamodel);
   1585 		if (copyin(uemp, STRUCT_BUF(ktab),
   1586 		    SIZEOF_STRUCT(mnttab, datamodel))) {
   1587 			kmem_free(prefbuf, MNT_LINE_MAX);
   1588 			error = EFAULT;
   1589 			break;
   1590 		}
   1591 
   1592 		/*
   1593 		 * Copy the members of the user's pref struct into a local
   1594 		 * struct. The pointers need to be offset and verified to
   1595 		 * ensure that they lie within the bounds of the buffer.
   1596 		 */
   1597 		preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
   1598 		    mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
   1599 		preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
   1600 		    mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
   1601 		preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
   1602 		    mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
   1603 		preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
   1604 		    mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
   1605 		preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
   1606 		    mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
   1607 
   1608 		/*
   1609 		 * If the user specifies a mounted resource that is a special
   1610 		 * device then we capture its mode and major and minor numbers;
   1611 		 * c.f. the block comment below.
   1612 		 */
   1613 		type = mntfs_special_info_string(preftab.mnt_special,
   1614 		    &preftab.mnt_major, &preftab.mnt_minor, cr);
   1615 
   1616 		rw_enter(&mnp->mnt_contents, RW_WRITER);
   1617 		if (snapp->mnts_nmnts == 0 ||
   1618 		    (snapp->mnts_flags & MNTS_REWIND))
   1619 			mntfs_snapshot(mnp, snapp);
   1620 
   1621 		/*
   1622 		 * This is the core functionality that implements getmntany().
   1623 		 * We walk through the mntfs database until we find an element
   1624 		 * matching the user's preferences that are contained in
   1625 		 * preftab. Typically, this means checking that the text
   1626 		 * matches. However, the mounted resource is special: if the
   1627 		 * user is looking for a special device then we must find a
   1628 		 * database element with the same major and minor numbers and
   1629 		 * the same type, i.e. VBLK or VCHR. The type is not recorded
   1630 		 * in the element because it cannot be inferred from the vfs_t.
   1631 		 * We therefore check the type of suitable candidates via
   1632 		 * mntfs_special_info_element(); since this calls into the
   1633 		 * underlying file system we make sure to drop the database lock
   1634 		 * first.
   1635 		 */
   1636 		elemp = snapp->mnts_next;
   1637 		rw_enter(dblockp, RW_READER);
   1638 		for (;;) {
   1639 			for (; elemp; elemp = mntfs_get_next_elem(snapp,
   1640 			    elemp)) {
   1641 				dbtabp = &elemp->mnte_tab;
   1642 				dbbufp = elemp->mnte_text;
   1643 				dbbufsize = elemp->mnte_text_size;
   1644 
   1645 				if (((type &&
   1646 				    dbtabp->mnt_major == preftab.mnt_major &&
   1647 				    dbtabp->mnt_minor == preftab.mnt_minor &&
   1648 				    MNTFS_REAL_FIELD(dbbufp)) ||
   1649 				    (!type && (!preftab.mnt_special ||
   1650 				    mntfs_same_word(preftab.mnt_special,
   1651 				    prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
   1652 				    dbbufsize)))) &&
   1653 
   1654 				    (!preftab.mnt_mountp || mntfs_same_word(
   1655 				    preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
   1656 				    (off_t)dbtabp->mnt_mountp, dbbufp,
   1657 				    dbbufsize)) &&
   1658 
   1659 				    (!preftab.mnt_fstype || mntfs_same_word(
   1660 				    preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
   1661 				    (off_t)dbtabp->mnt_fstype, dbbufp,
   1662 				    dbbufsize)) &&
   1663 
   1664 				    (!preftab.mnt_mntopts || mntfs_same_word(
   1665 				    preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
   1666 				    (off_t)dbtabp->mnt_mntopts, dbbufp,
   1667 				    dbbufsize)) &&
   1668 
   1669 				    (!preftab.mnt_time || mntfs_same_word(
   1670 				    preftab.mnt_time, prefbuf, MNT_LINE_MAX,
   1671 				    (off_t)dbtabp->mnt_time, dbbufp,
   1672 				    dbbufsize)))
   1673 					break;
   1674 			}
   1675 			rw_exit(dblockp);
   1676 
   1677 			if (elemp == NULL || type == 0 ||
   1678 			    type == mntfs_special_info_element(elemp, cr))
   1679 				break;
   1680 
   1681 			rw_enter(dblockp, RW_READER);
   1682 			elemp = mntfs_get_next_elem(snapp, elemp);
   1683 		}
   1684 
   1685 		kmem_free(prefbuf, MNT_LINE_MAX);
   1686 
   1687 		/* If we failed to find a match then return EOF. */
   1688 		if (elemp == NULL) {
   1689 			rw_exit(&mnp->mnt_contents);
   1690 			*rvalp = MNTFS_EOF;
   1691 			break;
   1692 		}
   1693 
   1694 		/*
   1695 		 * Check that the text buffer offered by the user will be large
   1696 		 * enough to accommodate the text for this entry.
   1697 		 */
   1698 		if (elemp->mnte_text_size > MNT_LINE_MAX) {
   1699 			rw_exit(&mnp->mnt_contents);
   1700 			*rvalp = MNTFS_TOOLONG;
   1701 			break;
   1702 		}
   1703 
   1704 		/*
   1705 		 * Populate the user's struct mnttab and text buffer using the
   1706 		 * element's contents.
   1707 		 */
   1708 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
   1709 			error = EFAULT;
   1710 		} else {
   1711 			rw_enter(dblockp, RW_READER);
   1712 			elemp = mntfs_get_next_elem(snapp, elemp);
   1713 			rw_exit(dblockp);
   1714 			snapp->mnts_next = elemp;
   1715 		}
   1716 		rw_exit(&mnp->mnt_contents);
   1717 		break;
   1718 	}
   1719 
   1720 	case MNTIOC_GETMNTENT:
   1721 	case MNTIOC_GETEXTMNTENT:
   1722 	{
   1723 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
   1724 		struct extmnttab *uemp;		/* uaddr of user's emp */
   1725 		char *ubufp;			/* uaddr of user's text buf */
   1726 		size_t ubufsize;		/* size of the above */
   1727 		mntelem_t *elemp;		/* a database element */
   1728 
   1729 
   1730 		rw_enter(&mnp->mnt_contents, RW_WRITER);
   1731 		if (snapp->mnts_nmnts == 0 ||
   1732 		    (snapp->mnts_flags & MNTS_REWIND))
   1733 			mntfs_snapshot(mnp, snapp);
   1734 		if ((elemp = snapp->mnts_next) == NULL) {
   1735 			rw_exit(&mnp->mnt_contents);
   1736 			*rvalp = MNTFS_EOF;
   1737 			break;
   1738 		}
   1739 
   1740 		/*
   1741 		 * embuf is a struct mntentbuf within the kernel. We copy into
   1742 		 * it the struct mntentbuf supplied by the user.
   1743 		 */
   1744 		STRUCT_INIT(embuf, datamodel);
   1745 		if (copyin((void *) arg, STRUCT_BUF(embuf),
   1746 		    STRUCT_SIZE(embuf))) {
   1747 			rw_exit(&mnp->mnt_contents);
   1748 			error = EFAULT;
   1749 			break;
   1750 		}
   1751 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
   1752 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
   1753 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
   1754 
   1755 		/*
   1756 		 * Check that the text buffer offered by the user will be large
   1757 		 * enough to accommodate the text for this entry.
   1758 		 */
   1759 		if (elemp->mnte_text_size > ubufsize) {
   1760 			rw_exit(&mnp->mnt_contents);
   1761 			*rvalp = MNTFS_TOOLONG;
   1762 			break;
   1763 		}
   1764 
   1765 		/*
   1766 		 * Populate the user's struct mnttab and text buffer using the
   1767 		 * element's contents.
   1768 		 */
   1769 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
   1770 			error = EFAULT;
   1771 		} else {
   1772 			rw_enter(dblockp, RW_READER);
   1773 			elemp = mntfs_get_next_elem(snapp, elemp);
   1774 			rw_exit(dblockp);
   1775 			snapp->mnts_next = elemp;
   1776 		}
   1777 		rw_exit(&mnp->mnt_contents);
   1778 		break;
   1779 	}
   1780 
   1781 	default:
   1782 		error = EINVAL;
   1783 		break;
   1784 	}
   1785 
   1786 	return (error);
   1787 }
   1788 
   1789 /*
   1790  * Template pairing each vnode operation name with its mntfs handler.
   1791  */
   1792 const fs_operation_def_t mnt_vnodeops_template[] = {
   1793 	{ VOPNAME_OPEN,		{ .vop_open = mntopen } },
   1794 	{ VOPNAME_CLOSE,	{ .vop_close = mntclose } },
   1795 	{ VOPNAME_READ,		{ .vop_read = mntread } },
   1796 	{ VOPNAME_IOCTL,	{ .vop_ioctl = mntioctl } },
   1797 	{ VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr } },
   1798 	{ VOPNAME_ACCESS,	{ .vop_access = mntaccess } },
   1799 	{ VOPNAME_FSYNC,	{ .vop_fsync = mntfsync } },
   1800 	{ VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive } },
   1801 	{ VOPNAME_SEEK,		{ .vop_seek = mntseek } },
   1802 	{ VOPNAME_POLL,		{ .vop_poll = mntpoll } },
   1803 	{ VOPNAME_DISPOSE,	{ .error = fs_error } }, /* routed to fs_error */
   1804 	{ VOPNAME_SHRLOCK,	{ .error = fs_error } }, /* routed to fs_error */
   1805 	{ NULL,			{ NULL } }	/* closing all-NULL entry */
   1806 };
   1807