Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 
     40 #include <sys/types.h>
     41 #include <sys/t_lock.h>
     42 #include <sys/param.h>
     43 #include <sys/errno.h>
     44 #include <sys/user.h>
     45 #include <sys/fstyp.h>
     46 #include <sys/kmem.h>
     47 #include <sys/systm.h>
     48 #include <sys/proc.h>
     49 #include <sys/mount.h>
     50 #include <sys/vfs.h>
     51 #include <sys/vfs_opreg.h>
     52 #include <sys/fem.h>
     53 #include <sys/mntent.h>
     54 #include <sys/stat.h>
     55 #include <sys/statvfs.h>
     56 #include <sys/statfs.h>
     57 #include <sys/cred.h>
     58 #include <sys/vnode.h>
     59 #include <sys/rwstlock.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/file.h>
     62 #include <sys/time.h>
     63 #include <sys/atomic.h>
     64 #include <sys/cmn_err.h>
     65 #include <sys/buf.h>
     66 #include <sys/swap.h>
     67 #include <sys/debug.h>
     68 #include <sys/vnode.h>
     69 #include <sys/modctl.h>
     70 #include <sys/ddi.h>
     71 #include <sys/pathname.h>
     72 #include <sys/bootconf.h>
     73 #include <sys/dumphdr.h>
     74 #include <sys/dc_ki.h>
     75 #include <sys/poll.h>
     76 #include <sys/sunddi.h>
     77 #include <sys/sysmacros.h>
     78 #include <sys/zone.h>
     79 #include <sys/policy.h>
     80 #include <sys/ctfs.h>
     81 #include <sys/objfs.h>
     82 #include <sys/console.h>
     83 #include <sys/reboot.h>
     84 #include <sys/attr.h>
     85 #include <sys/spa.h>
     86 #include <sys/lofi.h>
     87 
     88 #include <vm/page.h>
     89 
     90 #include <fs/fs_subr.h>
     91 
     92 /* Private interfaces to create vopstats-related data structures */
     93 extern void		initialize_vopstats(vopstats_t *);
     94 extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
     95 extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);
     96 
     97 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
     98 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
     99     const char *, int, int);
    100 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
    101 static void vfs_freemnttab(struct vfs *);
    102 static void vfs_freeopt(mntopt_t *);
    103 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
    104 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
    105 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
    106 static void vfs_createopttbl_extend(mntopts_t *, const char *,
    107     const mntopts_t *);
    108 static char **vfs_copycancelopt_extend(char **const, int);
    109 static void vfs_freecancelopt(char **);
    110 static void getrootfs(char **, char **);
    111 static int getmacpath(dev_info_t *, void *);
    112 static void vfs_mnttabvp_setup(void);
    113 
    114 struct ipmnt {
    115 	struct ipmnt	*mip_next;
    116 	dev_t		mip_dev;
    117 	struct vfs	*mip_vfsp;
    118 };
    119 
    120 static kmutex_t		vfs_miplist_mutex;
    121 static struct ipmnt	*vfs_miplist = NULL;
    122 static struct ipmnt	*vfs_miplist_end = NULL;
    123 
    124 static kmem_cache_t *vfs_cache;	/* Pointer to VFS kmem cache */
    125 
    126 /*
    127  * VFS global data.
    128  */
    129 vnode_t *rootdir;		/* pointer to root inode vnode. */
    130 vnode_t *devicesdir;		/* pointer to inode of devices root */
    131 vnode_t	*devdir;		/* pointer to inode of dev root */
    132 
    133 char *server_rootpath;		/* root path for diskless clients */
    134 char *server_hostname;		/* hostname of diskless server */
    135 
    136 static struct vfs root;
    137 static struct vfs devices;
    138 static struct vfs dev;
    139 struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
    140 rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
    141 int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
    142 				/* must be power of 2!	*/
    143 timespec_t vfs_mnttab_ctime;	/* mnttab created time */
    144 timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
    145 char *vfs_dummyfstype = "\0";
    146 struct pollhead vfs_pollhd;	/* for mnttab pollers */
    147 struct vnode *vfs_mntdummyvp;	/* to fake mnttab read/write for file events */
    148 int	mntfstype;		/* will be set once mnt fs is mounted */
    149 
    150 /*
    151  * Table for generic options recognized in the VFS layer and acted
    152  * on at this level before parsing file system specific options.
    153  * The nosuid option is stronger than any of the devices and setuid
    154  * options, so those are canceled when nosuid is seen.
    155  *
    156  * All options which are added here need to be added to the
    157  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
    158  */
    159 /*
    160  * VFS Mount options table
    161  */
    162 static char *ro_cancel[] = { MNTOPT_RW, NULL };
    163 static char *rw_cancel[] = { MNTOPT_RO, NULL };
    164 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
    165 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    166     MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
    167 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
    168 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
    169 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
    170 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
    171 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
    172 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
    173 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
    174 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
    175 
    176 static const mntopt_t mntopts[] = {
    177 /*
    178  *	option name		cancel options		default arg	flags
    179  */
    180 	{ MNTOPT_REMOUNT,	NULL,			NULL,
    181 		MO_NODISPLAY, (void *)0 },
    182 	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
    183 		(void *)0 },
    184 	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
    185 		(void *)0 },
    186 	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
    187 		(void *)0 },
    188 	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
    189 		(void *)0 },
    190 	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
    191 		(void *)0 },
    192 	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
    193 		(void *)0 },
    194 	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
    195 		(void *)0 },
    196 	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
    197 		(void *)0 },
    198 	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
    199 		(void *)0 },
    200 	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
    201 		(void *)0 },
    202 	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
    203 		(void *)0 },
    204 	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
    205 		(void *)0 },
    206 };
    207 
    208 const mntopts_t vfs_mntopts = {
    209 	sizeof (mntopts) / sizeof (mntopt_t),
    210 	(mntopt_t *)&mntopts[0]
    211 };
    212 
    213 /*
    214  * File system operation dispatch functions.
    215  */
    216 
    217 int
    218 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
    219 {
    220 	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
    221 }
    222 
    223 int
    224 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
    225 {
    226 	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
    227 }
    228 
    229 int
    230 fsop_root(vfs_t *vfsp, vnode_t **vpp)
    231 {
    232 	refstr_t *mntpt;
    233 	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
    234 	/*
    235 	 * Make sure this root has a path.  With lofs, it is possible to have
    236 	 * a NULL mountpoint.
    237 	 */
    238 	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
    239 		mntpt = vfs_getmntpoint(vfsp);
    240 		vn_setpath_str(*vpp, refstr_value(mntpt),
    241 		    strlen(refstr_value(mntpt)));
    242 		refstr_rele(mntpt);
    243 	}
    244 
    245 	return (ret);
    246 }
    247 
    248 int
    249 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
    250 {
    251 	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
    252 }
    253 
    254 int
    255 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
    256 {
    257 	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
    258 }
    259 
    260 int
    261 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
    262 {
    263 	/*
    264 	 * In order to handle system attribute fids in a manner
    265 	 * transparent to the underlying fs, we embed the fid for
    266 	 * the sysattr parent object in the sysattr fid and tack on
    267 	 * some extra bytes that only the sysattr layer knows about.
    268 	 *
    269 	 * This guarantees that sysattr fids are larger than other fids
    270 	 * for this vfs. If the vfs supports the sysattr view interface
    271 	 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
    272 	 * collision with XATTR_FIDSZ.
    273 	 */
    274 	if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
    275 	    fidp->fid_len == XATTR_FIDSZ)
    276 		return (xattr_dir_vget(vfsp, vpp, fidp));
    277 
    278 	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
    279 }
    280 
    281 int
    282 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
    283 {
    284 	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
    285 }
    286 
    287 void
    288 fsop_freefs(vfs_t *vfsp)
    289 {
    290 	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
    291 }
    292 
    293 int
    294 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
    295 {
    296 	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
    297 }
    298 
    299 int
    300 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
    301 {
    302 	ASSERT((fstype >= 0) && (fstype < nfstype));
    303 
    304 	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
    305 		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
    306 	else
    307 		return (ENOTSUP);
    308 }
    309 
    310 /*
    311  * File system initialization.  vfs_setfsops() must be called from a file
    312  * system's init routine.
    313  */
    314 
    315 static int
    316 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
    317     int *unused_ops)
    318 {
    319 	static const fs_operation_trans_def_t vfs_ops_table[] = {
    320 		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
    321 			fs_nosys, fs_nosys,
    322 
    323 		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
    324 			fs_nosys, fs_nosys,
    325 
    326 		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
    327 			fs_nosys, fs_nosys,
    328 
    329 		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
    330 			fs_nosys, fs_nosys,
    331 
    332 		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
    333 			(fs_generic_func_p) fs_sync,
    334 			(fs_generic_func_p) fs_sync,	/* No errors allowed */
    335 
    336 		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
    337 			fs_nosys, fs_nosys,
    338 
    339 		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
    340 			fs_nosys, fs_nosys,
    341 
    342 		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
    343 			(fs_generic_func_p)fs_freevfs,
    344 			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
    345 
    346 		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
    347 			(fs_generic_func_p)fs_nosys,
    348 			(fs_generic_func_p)fs_nosys,
    349 
    350 		NULL, 0, NULL, NULL
    351 	};
    352 
    353 	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
    354 }
    355 
    356 void
    357 zfs_boot_init() {
    358 
    359 	if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
    360 		spa_boot_init();
    361 }
    362 
    363 int
    364 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
    365 {
    366 	int error;
    367 	int unused_ops;
    368 
    369 	/*
    370 	 * Verify that fstype refers to a valid fs.  Note that
    371 	 * 0 is valid since it's used to set "stray" ops.
    372 	 */
    373 	if ((fstype < 0) || (fstype >= nfstype))
    374 		return (EINVAL);
    375 
    376 	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
    377 		return (EINVAL);
    378 
    379 	/* Set up the operations vector. */
    380 
    381 	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
    382 
    383 	if (error != 0)
    384 		return (error);
    385 
    386 	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
    387 
    388 	if (actual != NULL)
    389 		*actual = &vfssw[fstype].vsw_vfsops;
    390 
    391 #if DEBUG
    392 	if (unused_ops != 0)
    393 		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
    394 		    "but not used", vfssw[fstype].vsw_name, unused_ops);
    395 #endif
    396 
    397 	return (0);
    398 }
    399 
    400 int
    401 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
    402 {
    403 	int error;
    404 	int unused_ops;
    405 
    406 	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
    407 
    408 	error = fs_copyfsops(template, *actual, &unused_ops);
    409 	if (error != 0) {
    410 		kmem_free(*actual, sizeof (vfsops_t));
    411 		*actual = NULL;
    412 		return (error);
    413 	}
    414 
    415 	return (0);
    416 }
    417 
    418 /*
    419  * Free a vfsops structure created as a result of vfs_makefsops().
    420  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
    421  * vfs_freevfsops_by_type().
    422  */
    423 void
    424 vfs_freevfsops(vfsops_t *vfsops)
    425 {
    426 	kmem_free(vfsops, sizeof (vfsops_t));
    427 }
    428 
    429 /*
    430  * Since the vfsops structure is part of the vfssw table and wasn't
    431  * really allocated, we're not really freeing anything.  We keep
    432  * the name for consistency with vfs_freevfsops().  We do, however,
    433  * need to take care of a little bookkeeping.
    434  * NOTE: For a vfsops structure created by vfs_setfsops(), use
    435  * vfs_freevfsops_by_type().
    436  */
    437 int
    438 vfs_freevfsops_by_type(int fstype)
    439 {
    440 
    441 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
    442 	if ((fstype <= 0) || (fstype >= nfstype))
    443 		return (EINVAL);
    444 
    445 	WLOCK_VFSSW();
    446 	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
    447 		WUNLOCK_VFSSW();
    448 		return (EINVAL);
    449 	}
    450 
    451 	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
    452 	WUNLOCK_VFSSW();
    453 
    454 	return (0);
    455 }
    456 
    457 /* Support routines used to reference vfs_op */
    458 
    459 /* Set the operations vector for a vfs */
    460 void
    461 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
    462 {
    463 	vfsops_t	*op;
    464 
    465 	ASSERT(vfsp != NULL);
    466 	ASSERT(vfsops != NULL);
    467 
    468 	op = vfsp->vfs_op;
    469 	membar_consumer();
    470 	if (vfsp->vfs_femhead == NULL &&
    471 	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
    472 		return;
    473 	}
    474 	fsem_setvfsops(vfsp, vfsops);
    475 }
    476 
    477 /* Retrieve the operations vector for a vfs */
    478 vfsops_t *
    479 vfs_getops(vfs_t *vfsp)
    480 {
    481 	vfsops_t	*op;
    482 
    483 	ASSERT(vfsp != NULL);
    484 
    485 	op = vfsp->vfs_op;
    486 	membar_consumer();
    487 	if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
    488 		return (op);
    489 	} else {
    490 		return (fsem_getvfsops(vfsp));
    491 	}
    492 }
    493 
    494 /*
    495  * Returns non-zero (1) if the vfsops matches that of the vfs.
    496  * Returns zero (0) if not.
    497  */
    498 int
    499 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
    500 {
    501 	return (vfs_getops(vfsp) == vfsops);
    502 }
    503 
    504 /*
    505  * Returns non-zero (1) if the file system has installed a non-default,
    506  * non-error vfs_sync routine.  Returns zero (0) otherwise.
    507  */
    508 int
    509 vfs_can_sync(vfs_t *vfsp)
    510 {
    511 	/* vfs_sync() routine is not the default/error function */
    512 	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
    513 }
    514 
    515 /*
    516  * Initialize a vfs structure.
    517  */
    518 void
    519 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
    520 {
    521 	/* Other initialization has been moved to vfs_alloc() */
    522 	vfsp->vfs_count = 0;
    523 	vfsp->vfs_next = vfsp;
    524 	vfsp->vfs_prev = vfsp;
    525 	vfsp->vfs_zone_next = vfsp;
    526 	vfsp->vfs_zone_prev = vfsp;
    527 	vfsp->vfs_lofi_minor = 0;
    528 	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
    529 	vfsimpl_setup(vfsp);
    530 	vfsp->vfs_data = (data);
    531 	vfs_setops((vfsp), (op));
    532 }
    533 
    534 /*
    535  * Allocate and initialize the vfs implementation private data
    536  * structure, vfs_impl_t.
    537  */
    538 void
    539 vfsimpl_setup(vfs_t *vfsp)
    540 {
    541 	int i;
    542 
    543 	if (vfsp->vfs_implp != NULL) {
    544 		return;
    545 	}
    546 
    547 	vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
    548 	/* Note that these are #define'd in vfs.h */
    549 	vfsp->vfs_vskap = NULL;
    550 	vfsp->vfs_fstypevsp = NULL;
    551 
    552 	/* Set size of counted array, then zero the array */
    553 	vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
    554 	for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
    555 		vfsp->vfs_featureset[i] = 0;
    556 	}
    557 }
    558 
    559 /*
    560  * Release the vfs_impl_t structure, if it exists. Some unbundled
    561  * filesystems may not use the newer version of vfs and thus
    562  * would not contain this implementation private data structure.
    563  */
    564 void
    565 vfsimpl_teardown(vfs_t *vfsp)
    566 {
    567 	vfs_impl_t	*vip = vfsp->vfs_implp;
    568 
    569 	if (vip == NULL)
    570 		return;
    571 
    572 	kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
    573 	vfsp->vfs_implp = NULL;
    574 }
    575 
    576 /*
    577  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
    578  * fstatvfs, and sysfs moved to common/syscall.
    579  */
    580 
    581 /*
    582  * Update every mounted file system.  We call the vfs_sync operation of
    583  * each file system type, passing it a NULL vfsp to indicate that all
    584  * mounted file systems of that type should be updated.
    585  */
    586 void
    587 vfs_sync(int flag)
    588 {
    589 	struct vfssw *vswp;
    590 	RLOCK_VFSSW();
    591 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
    592 		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
    593 			vfs_refvfssw(vswp);
    594 			RUNLOCK_VFSSW();
    595 			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
    596 			    CRED());
    597 			vfs_unrefvfssw(vswp);
    598 			RLOCK_VFSSW();
    599 		}
    600 	}
    601 	RUNLOCK_VFSSW();
    602 }
    603 
    604 void
    605 sync(void)
    606 {
    607 	vfs_sync(0);
    608 }
    609 
    610 /*
    611  * External routines.
    612  */
    613 
    614 krwlock_t vfssw_lock;	/* lock accesses to vfssw */
    615 
    616 /*
    617  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
    618  * but otherwise should be accessed only via vfs_list_lock() and
    619  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
    620  */
    621 static krwlock_t vfslist;
    622 
    623 /*
    624  * Mount devfs on /devices. This is done right after root is mounted
    625  * to provide device access support for the system
    626  */
    627 static void
    628 vfs_mountdevices(void)
    629 {
    630 	struct vfssw *vsw;
    631 	struct vnode *mvp;
    632 	struct mounta mounta = {	/* fake mounta for devfs_mount() */
    633 		NULL,
    634 		NULL,
    635 		MS_SYSSPACE,
    636 		NULL,
    637 		NULL,
    638 		0,
    639 		NULL,
    640 		0
    641 	};
    642 
    643 	/*
    644 	 * _init devfs module to fill in the vfssw
    645 	 */
    646 	if (modload("fs", "devfs") == -1)
    647 		panic("Cannot _init devfs module");
    648 
    649 	/*
    650 	 * Hold vfs
    651 	 */
    652 	RLOCK_VFSSW();
    653 	vsw = vfs_getvfsswbyname("devfs");
    654 	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
    655 	VFS_HOLD(&devices);
    656 
    657 	/*
    658 	 * Locate mount point
    659 	 */
    660 	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
    661 		panic("Cannot find /devices");
    662 
    663 	/*
    664 	 * Perform the mount of /devices
    665 	 */
    666 	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
    667 		panic("Cannot mount /devices");
    668 
    669 	RUNLOCK_VFSSW();
    670 
    671 	/*
    672 	 * Set appropriate members and add to vfs list for mnttab display
    673 	 */
    674 	vfs_setresource(&devices, "/devices");
    675 	vfs_setmntpoint(&devices, "/devices");
    676 
    677 	/*
    678 	 * Hold the root of /devices so it won't go away
    679 	 */
    680 	if (VFS_ROOT(&devices, &devicesdir))
    681 		panic("vfs_mountdevices: not devices root");
    682 
    683 	if (vfs_lock(&devices) != 0) {
    684 		VN_RELE(devicesdir);
    685 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
    686 		return;
    687 	}
    688 
    689 	if (vn_vfswlock(mvp) != 0) {
    690 		vfs_unlock(&devices);
    691 		VN_RELE(devicesdir);
    692 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
    693 		return;
    694 	}
    695 
    696 	vfs_add(mvp, &devices, 0);
    697 	vn_vfsunlock(mvp);
    698 	vfs_unlock(&devices);
    699 	VN_RELE(devicesdir);
    700 }
    701 
    702 /*
    703  * mount the first instance of /dev  to root and remain mounted
    704  */
    705 static void
    706 vfs_mountdev1(void)
    707 {
    708 	struct vfssw *vsw;
    709 	struct vnode *mvp;
    710 	struct mounta mounta = {	/* fake mounta for sdev_mount() */
    711 		NULL,
    712 		NULL,
    713 		MS_SYSSPACE | MS_OVERLAY,
    714 		NULL,
    715 		NULL,
    716 		0,
    717 		NULL,
    718 		0
    719 	};
    720 
    721 	/*
    722 	 * _init dev module to fill in the vfssw
    723 	 */
    724 	if (modload("fs", "dev") == -1)
    725 		cmn_err(CE_PANIC, "Cannot _init dev module\n");
    726 
    727 	/*
    728 	 * Hold vfs
    729 	 */
    730 	RLOCK_VFSSW();
    731 	vsw = vfs_getvfsswbyname("dev");
    732 	VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
    733 	VFS_HOLD(&dev);
    734 
    735 	/*
    736 	 * Locate mount point
    737 	 */
    738 	if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
    739 		cmn_err(CE_PANIC, "Cannot find /dev\n");
    740 
    741 	/*
    742 	 * Perform the mount of /dev
    743 	 */
    744 	if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
    745 		cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
    746 
    747 	RUNLOCK_VFSSW();
    748 
    749 	/*
    750 	 * Set appropriate members and add to vfs list for mnttab display
    751 	 */
    752 	vfs_setresource(&dev, "/dev");
    753 	vfs_setmntpoint(&dev, "/dev");
    754 
    755 	/*
    756 	 * Hold the root of /dev so it won't go away
    757 	 */
    758 	if (VFS_ROOT(&dev, &devdir))
    759 		cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
    760 
    761 	if (vfs_lock(&dev) != 0) {
    762 		VN_RELE(devdir);
    763 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
    764 		return;
    765 	}
    766 
    767 	if (vn_vfswlock(mvp) != 0) {
    768 		vfs_unlock(&dev);
    769 		VN_RELE(devdir);
    770 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
    771 		return;
    772 	}
    773 
    774 	vfs_add(mvp, &dev, 0);
    775 	vn_vfsunlock(mvp);
    776 	vfs_unlock(&dev);
    777 	VN_RELE(devdir);
    778 }
    779 
    780 /*
    781  * Mount required filesystem. This is done right after root is mounted.
    782  */
    783 static void
    784 vfs_mountfs(char *module, char *spec, char *path)
    785 {
    786 	struct vnode *mvp;
    787 	struct mounta mounta;
    788 	vfs_t *vfsp;
    789 
    790 	mounta.flags = MS_SYSSPACE | MS_DATA;
    791 	mounta.fstype = module;
    792 	mounta.spec = spec;
    793 	mounta.dir = path;
    794 	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
    795 		cmn_err(CE_WARN, "Cannot find %s", path);
    796 		return;
    797 	}
    798 	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
    799 		cmn_err(CE_WARN, "Cannot mount %s", path);
    800 	else
    801 		VFS_RELE(vfsp);
    802 	VN_RELE(mvp);
    803 }
    804 
    805 /*
    806  * vfs_mountroot is called by main() to mount the root filesystem.
    807  */
    808 void
    809 vfs_mountroot(void)
    810 {
    811 	struct vnode	*rvp = NULL;
    812 	char		*path;
    813 	size_t		plen;
    814 	struct vfssw	*vswp;
    815 
    816 	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
    817 	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
    818 
    819 	/*
    820 	 * Alloc the vfs hash bucket array and locks
    821 	 */
    822 	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
    823 
    824 	/*
    825 	 * Call machine-dependent routine "rootconf" to choose a root
    826 	 * file system type.
    827 	 */
    828 	if (rootconf())
    829 		panic("vfs_mountroot: cannot mount root");
    830 	/*
    831 	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
    832 	 * to point to it.  These are used by lookuppn() so that it
    833 	 * knows where to start from ('/' or '.').
    834 	 */
    835 	vfs_setmntpoint(rootvfs, "/");
    836 	if (VFS_ROOT(rootvfs, &rootdir))
    837 		panic("vfs_mountroot: no root vnode");
    838 	PTOU(curproc)->u_cdir = rootdir;
    839 	VN_HOLD(PTOU(curproc)->u_cdir);
    840 	PTOU(curproc)->u_rdir = NULL;
    841 
    842 	/*
    843 	 * Setup the global zone's rootvp, now that it exists.
    844 	 */
    845 	global_zone->zone_rootvp = rootdir;
    846 	VN_HOLD(global_zone->zone_rootvp);
    847 
    848 	/*
    849 	 * Notify the module code that it can begin using the
    850 	 * root filesystem instead of the boot program's services.
    851 	 */
    852 	modrootloaded = 1;
    853 
    854 	/*
    855 	 * Special handling for a ZFS root file system.
    856 	 */
    857 	zfs_boot_init();
    858 
    859 	/*
    860 	 * Set up mnttab information for root
    861 	 */
    862 	vfs_setresource(rootvfs, rootfs.bo_name);
    863 
    864 	/*
    865 	 * Notify cluster software that the root filesystem is available.
    866 	 */
    867 	clboot_mountroot();
    868 
    869 	/* Now that we're all done with the root FS, set up its vopstats */
    870 	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
    871 		/* Set flag for statistics collection */
    872 		if (vswp->vsw_flag & VSW_STATS) {
    873 			initialize_vopstats(&rootvfs->vfs_vopstats);
    874 			rootvfs->vfs_flag |= VFS_STATS;
    875 			rootvfs->vfs_fstypevsp =
    876 			    get_fstype_vopstats(rootvfs, vswp);
    877 			rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
    878 		}
    879 		vfs_unrefvfssw(vswp);
    880 	}
    881 
    882 	/*
    883 	 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
    884 	 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
    885 	 */
    886 	vfs_mountdevices();
    887 	vfs_mountdev1();
    888 
    889 	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
    890 	vfs_mountfs("proc", "/proc", "/proc");
    891 	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
    892 	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
    893 	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
    894 
    895 	if (getzoneid() == GLOBAL_ZONEID) {
    896 		vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
    897 	}
    898 
    899 #ifdef __sparc
    900 	/*
    901 	 * This bit of magic can go away when we convert sparc to
    902 	 * the new boot architecture based on ramdisk.
    903 	 *
    904 	 * Booting off a mirrored root volume:
    905 	 * At this point, we have booted and mounted root on a
    906 	 * single component of the mirror.  Complete the boot
    907 	 * by configuring SVM and converting the root to the
    908 	 * dev_t of the mirrored root device.  This dev_t conversion
    909 	 * only works because the underlying device doesn't change.
    910 	 */
    911 	if (root_is_svm) {
    912 		if (svm_rootconf()) {
    913 			panic("vfs_mountroot: cannot remount root");
    914 		}
    915 
    916 		/*
    917 		 * mnttab should reflect the new root device
    918 		 */
    919 		vfs_lock_wait(rootvfs);
    920 		vfs_setresource(rootvfs, rootfs.bo_name);
    921 		vfs_unlock(rootvfs);
    922 	}
    923 #endif /* __sparc */
    924 
    925 	/*
    926 	 * Look up the root device via devfs so that a dv_node is
    927 	 * created for it. The vnode is never VN_RELE()ed.
    928 	 * We allocate more than MAXPATHLEN so that the
    929 	 * buffer passed to i_ddi_prompath_to_devfspath() is
    930 	 * exactly MAXPATHLEN (the function expects a buffer
    931 	 * of that length).
    932 	 */
    933 	plen = strlen("/devices");
    934 	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
    935 	(void) strcpy(path, "/devices");
    936 
    937 	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
    938 	    != DDI_SUCCESS ||
    939 	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
    940 
    941 		/* NUL terminate in case "path" has garbage */
    942 		path[plen + MAXPATHLEN - 1] = '\0';
    943 #ifdef	DEBUG
    944 		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
    945 #endif
    946 	}
    947 	kmem_free(path, plen + MAXPATHLEN);
    948 	vfs_mnttabvp_setup();
    949 }
    950 
    951 /*
    952  * If remount failed and we're in a zone we need to check for the zone
    953  * root path and strip it before the call to vfs_setpath().
    954  *
    955  * If strpath doesn't begin with the zone_rootpath the original
    956  * strpath is returned unchanged.
    957  */
    958 static const char *
    959 stripzonepath(const char *strpath)
    960 {
    961 	char *str1, *str2;
    962 	int i;
    963 	zone_t *zonep = curproc->p_zone;
    964 
    965 	if (zonep->zone_rootpath == NULL || strpath == NULL) {
    966 		return (NULL);
    967 	}
    968 
    969 	/*
    970 	 * we check for the end of the string at one past the
    971 	 * current position because the zone_rootpath always
    972 	 * ends with "/" but we don't want to strip that off.
    973 	 */
    974 	str1 = zonep->zone_rootpath;
    975 	str2 = (char *)strpath;
    976 	ASSERT(str1[0] != '\0');
    977 	for (i = 0; str1[i + 1] != '\0'; i++) {
    978 		if (str1[i] != str2[i])
    979 			return ((char *)strpath);
    980 	}
    981 	return (&str2[i]);
    982 }
    983 
    984 /*
    985  * Check to see if our "block device" is actually a file.  If so,
    986  * automatically add a lofi device, and keep track of this fact.
    987  */
    988 static int
    989 lofi_add(const char *fsname, struct vfs *vfsp,
    990     mntopts_t *mntopts, struct mounta *uap)
    991 {
    992 	int fromspace = (uap->flags & MS_SYSSPACE) ?
    993 	    UIO_SYSSPACE : UIO_USERSPACE;
    994 	struct lofi_ioctl *li = NULL;
    995 	struct vnode *vp = NULL;
    996 	struct pathname	pn = { NULL };
    997 	ldi_ident_t ldi_id;
    998 	ldi_handle_t ldi_hdl;
    999 	vfssw_t *vfssw;
   1000 	int minor;
   1001 	int err = 0;
   1002 
   1003 	if (fsname == NULL ||
   1004 	    (vfssw = vfs_getvfssw(fsname)) == NULL)
   1005 		return (0);
   1006 
   1007 	if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
   1008 		vfs_unrefvfssw(vfssw);
   1009 		return (0);
   1010 	}
   1011 
   1012 	vfs_unrefvfssw(vfssw);
   1013 	vfssw = NULL;
   1014 
   1015 	if (pn_get(uap->spec, fromspace, &pn) != 0)
   1016 		return (0);
   1017 
   1018 	if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
   1019 		goto out;
   1020 
   1021 	if (vp->v_type != VREG)
   1022 		goto out;
   1023 
   1024 	/* OK, this is a lofi mount. */
   1025 
   1026 	if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
   1027 	    vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
   1028 	    vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
   1029 	    vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
   1030 		err = EINVAL;
   1031 		goto out;
   1032 	}
   1033 
   1034 	ldi_id = ldi_ident_from_anon();
   1035 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
   1036 	(void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN + 1);
   1037 
   1038 	/*
   1039 	 * The lofi control node is currently exclusive-open.  We'd like
   1040 	 * to improve this, but in the meantime, we'll loop waiting for
   1041 	 * access.
   1042 	 */
   1043 	for (;;) {
   1044 		err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
   1045 		    kcred, &ldi_hdl, ldi_id);
   1046 
   1047 		if (err != EBUSY)
   1048 			break;
   1049 
   1050 		if ((err = delay_sig(hz / 8)) == EINTR)
   1051 			break;
   1052 	}
   1053 
   1054 	if (err)
   1055 		goto out2;
   1056 
   1057 	err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
   1058 	    FREAD | FWRITE | FEXCL | FKIOCTL, kcred, &minor);
   1059 
   1060 	(void) ldi_close(ldi_hdl, FREAD | FWRITE | FEXCL, kcred);
   1061 
   1062 	if (!err)
   1063 		vfsp->vfs_lofi_minor = minor;
   1064 
   1065 out2:
   1066 	ldi_ident_release(ldi_id);
   1067 out:
   1068 	if (li != NULL)
   1069 		kmem_free(li, sizeof (*li));
   1070 	if (vp != NULL)
   1071 		VN_RELE(vp);
   1072 	pn_free(&pn);
   1073 	return (err);
   1074 }
   1075 
   1076 static void
   1077 lofi_remove(struct vfs *vfsp)
   1078 {
   1079 	struct lofi_ioctl *li = NULL;
   1080 	ldi_ident_t ldi_id;
   1081 	ldi_handle_t ldi_hdl;
   1082 	int err;
   1083 
   1084 	if (vfsp->vfs_lofi_minor == 0)
   1085 		return;
   1086 
   1087 	ldi_id = ldi_ident_from_anon();
   1088 
   1089 	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
   1090 	li->li_minor = vfsp->vfs_lofi_minor;
   1091 	li->li_cleanup = B_TRUE;
   1092 
   1093 	do {
   1094 		err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
   1095 		    kcred, &ldi_hdl, ldi_id);
   1096 	} while (err == EBUSY);
   1097 
   1098 	if (err)
   1099 		goto out;
   1100 
   1101 	err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
   1102 	    FREAD | FWRITE | FEXCL | FKIOCTL, kcred,