Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/param.h>
     30 #include <sys/systm.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/kmem.h>
     33 #include <sys/pathname.h>
     34 #include <sys/vnode.h>
     35 #include <sys/vfs.h>
     36 #include <sys/vfs_opreg.h>
     37 #include <sys/mntent.h>
     38 #include <sys/mount.h>
     39 #include <sys/cmn_err.h>
     40 #include "fs/fs_subr.h"
     41 #include <sys/zfs_znode.h>
     42 #include <sys/zfs_dir.h>
     43 #include <sys/zil.h>
     44 #include <sys/fs/zfs.h>
     45 #include <sys/dmu.h>
     46 #include <sys/dsl_prop.h>
     47 #include <sys/dsl_dataset.h>
     48 #include <sys/dsl_deleg.h>
     49 #include <sys/spa.h>
     50 #include <sys/zap.h>
     51 #include <sys/varargs.h>
     52 #include <sys/policy.h>
     53 #include <sys/atomic.h>
     54 #include <sys/mkdev.h>
     55 #include <sys/modctl.h>
     56 #include <sys/refstr.h>
     57 #include <sys/zfs_ioctl.h>
     58 #include <sys/zfs_ctldir.h>
     59 #include <sys/zfs_fuid.h>
     60 #include <sys/bootconf.h>
     61 #include <sys/sunddi.h>
     62 #include <sys/dnlc.h>
     63 #include <sys/dmu_objset.h>
     64 #include <sys/spa_boot.h>
     65 
     66 int zfsfstype;
     67 vfsops_t *zfs_vfsops = NULL;
     68 static major_t zfs_major;
     69 static minor_t zfs_minor;
     70 static kmutex_t	zfs_dev_mtx;
     71 
     72 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
     73 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
     74 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
     75 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
     76 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
     77 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
     78 static void zfs_freevfs(vfs_t *vfsp);
     79 
     80 static const fs_operation_def_t zfs_vfsops_template[] = {
     81 	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
     82 	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
     83 	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
     84 	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
     85 	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
     86 	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
     87 	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
     88 	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
     89 	NULL,			NULL
     90 };
     91 
     92 static const fs_operation_def_t zfs_vfsops_eio_template[] = {
     93 	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
     94 	NULL,			NULL
     95 };
     96 
     97 /*
     98  * We need to keep a count of active fs's.
     99  * This is necessary to prevent our module
    100  * from being unloaded after a umount -f
    101  */
    102 static uint32_t	zfs_active_fs_count = 0;
    103 
    104 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
    105 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
    106 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
    107 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
    108 
    109 /*
    110  * MO_DEFAULT is not used since the default value is determined
    111  * by the equivalent property.
    112  */
    113 static mntopt_t mntopts[] = {
    114 	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
    115 	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
    116 	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
    117 	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
    118 };
    119 
    120 static mntopts_t zfs_mntopts = {
    121 	sizeof (mntopts) / sizeof (mntopt_t),
    122 	mntopts
    123 };
    124 
    125 /*ARGSUSED*/
    126 int
    127 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
    128 {
    129 	/*
    130 	 * Data integrity is job one.  We don't want a compromised kernel
    131 	 * writing to the storage pool, so we never sync during panic.
    132 	 */
    133 	if (panicstr)
    134 		return (0);
    135 
    136 	/*
    137 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
    138 	 * to sync metadata, which they would otherwise cache indefinitely.
    139 	 * Semantically, the only requirement is that the sync be initiated.
    140 	 * The DMU syncs out txgs frequently, so there's nothing to do.
    141 	 */
    142 	if (flag & SYNC_ATTR)
    143 		return (0);
    144 
    145 	if (vfsp != NULL) {
    146 		/*
    147 		 * Sync a specific filesystem.
    148 		 */
    149 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    150 
    151 		ZFS_ENTER(zfsvfs);
    152 		if (zfsvfs->z_log != NULL)
    153 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
    154 		else
    155 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
    156 		ZFS_EXIT(zfsvfs);
    157 	} else {
    158 		/*
    159 		 * Sync all ZFS filesystems.  This is what happens when you
    160 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    161 		 * request by waiting for all pools to commit all dirty data.
    162 		 */
    163 		spa_sync_allpools();
    164 	}
    165 
    166 	return (0);
    167 }
    168 
    169 static int
    170 zfs_create_unique_device(dev_t *dev)
    171 {
    172 	major_t new_major;
    173 
    174 	do {
    175 		ASSERT3U(zfs_minor, <=, MAXMIN32);
    176 		minor_t start = zfs_minor;
    177 		do {
    178 			mutex_enter(&zfs_dev_mtx);
    179 			if (zfs_minor >= MAXMIN32) {
    180 				/*
    181 				 * If we're still using the real major
    182 				 * keep out of /dev/zfs and /dev/zvol minor
    183 				 * number space.  If we're using a getudev()'ed
    184 				 * major number, we can use all of its minors.
    185 				 */
    186 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
    187 					zfs_minor = ZFS_MIN_MINOR;
    188 				else
    189 					zfs_minor = 0;
    190 			} else {
    191 				zfs_minor++;
    192 			}
    193 			*dev = makedevice(zfs_major, zfs_minor);
    194 			mutex_exit(&zfs_dev_mtx);
    195 		} while (vfs_devismounted(*dev) && zfs_minor != start);
    196 		if (zfs_minor == start) {
    197 			/*
    198 			 * We are using all ~262,000 minor numbers for the
    199 			 * current major number.  Create a new major number.
    200 			 */
    201 			if ((new_major = getudev()) == (major_t)-1) {
    202 				cmn_err(CE_WARN,
    203 				    "zfs_mount: Can't get unique major "
    204 				    "device number.");
    205 				return (-1);
    206 			}
    207 			mutex_enter(&zfs_dev_mtx);
    208 			zfs_major = new_major;
    209 			zfs_minor = 0;
    210 
    211 			mutex_exit(&zfs_dev_mtx);
    212 		} else {
    213 			break;
    214 		}
    215 		/* CONSTANTCONDITION */
    216 	} while (1);
    217 
    218 	return (0);
    219 }
    220 
    221 static void
    222 atime_changed_cb(void *arg, uint64_t newval)
    223 {
    224 	zfsvfs_t *zfsvfs = arg;
    225 
    226 	if (newval == TRUE) {
    227 		zfsvfs->z_atime = TRUE;
    228 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    229 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    230 	} else {
    231 		zfsvfs->z_atime = FALSE;
    232 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    233 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    234 	}
    235 }
    236 
    237 static void
    238 xattr_changed_cb(void *arg, uint64_t newval)
    239 {
    240 	zfsvfs_t *zfsvfs = arg;
    241 
    242 	if (newval == TRUE) {
    243 		/* XXX locking on vfs_flag? */
    244 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    245 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    246 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    247 	} else {
    248 		/* XXX locking on vfs_flag? */
    249 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    250 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    251 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    252 	}
    253 }
    254 
    255 static void
    256 blksz_changed_cb(void *arg, uint64_t newval)
    257 {
    258 	zfsvfs_t *zfsvfs = arg;
    259 
    260 	if (newval < SPA_MINBLOCKSIZE ||
    261 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
    262 		newval = SPA_MAXBLOCKSIZE;
    263 
    264 	zfsvfs->z_max_blksz = newval;
    265 	zfsvfs->z_vfs->vfs_bsize = newval;
    266 }
    267 
    268 static void
    269 readonly_changed_cb(void *arg, uint64_t newval)
    270 {
    271 	zfsvfs_t *zfsvfs = arg;
    272 
    273 	if (newval) {
    274 		/* XXX locking on vfs_flag? */
    275 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    276 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    277 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    278 	} else {
    279 		/* XXX locking on vfs_flag? */
    280 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    281 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    282 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    283 	}
    284 }
    285 
    286 static void
    287 devices_changed_cb(void *arg, uint64_t newval)
    288 {
    289 	zfsvfs_t *zfsvfs = arg;
    290 
    291 	if (newval == FALSE) {
    292 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
    293 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
    294 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
    295 	} else {
    296 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
    297 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
    298 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
    299 	}
    300 }
    301 
    302 static void
    303 setuid_changed_cb(void *arg, uint64_t newval)
    304 {
    305 	zfsvfs_t *zfsvfs = arg;
    306 
    307 	if (newval == FALSE) {
    308 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    309 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    310 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    311 	} else {
    312 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    313 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    314 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    315 	}
    316 }
    317 
    318 static void
    319 exec_changed_cb(void *arg, uint64_t newval)
    320 {
    321 	zfsvfs_t *zfsvfs = arg;
    322 
    323 	if (newval == FALSE) {
    324 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    325 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    326 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    327 	} else {
    328 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    329 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    330 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    331 	}
    332 }
    333 
    334 /*
    335  * The nbmand mount option can be changed at mount time.
    336  * We can't allow it to be toggled on live file systems or incorrect
    337  * behavior may be seen from cifs clients
    338  *
    339  * This property isn't registered via dsl_prop_register(), but this callback
    340  * will be called when a file system is first mounted
    341  */
    342 static void
    343 nbmand_changed_cb(void *arg, uint64_t newval)
    344 {
    345 	zfsvfs_t *zfsvfs = arg;
    346 	if (newval == FALSE) {
    347 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    348 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    349 	} else {
    350 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    351 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    352 	}
    353 }
    354 
    355 static void
    356 snapdir_changed_cb(void *arg, uint64_t newval)
    357 {
    358 	zfsvfs_t *zfsvfs = arg;
    359 
    360 	zfsvfs->z_show_ctldir = newval;
    361 }
    362 
    363 static void
    364 vscan_changed_cb(void *arg, uint64_t newval)
    365 {
    366 	zfsvfs_t *zfsvfs = arg;
    367 
    368 	zfsvfs->z_vscan = newval;
    369 }
    370 
    371 static void
    372 acl_mode_changed_cb(void *arg, uint64_t newval)
    373 {
    374 	zfsvfs_t *zfsvfs = arg;
    375 
    376 	zfsvfs->z_acl_mode = newval;
    377 }
    378 
    379 static void
    380 acl_inherit_changed_cb(void *arg, uint64_t newval)
    381 {
    382 	zfsvfs_t *zfsvfs = arg;
    383 
    384 	zfsvfs->z_acl_inherit = newval;
    385 }
    386 
    387 static int
    388 zfs_register_callbacks(vfs_t *vfsp)
    389 {
    390 	struct dsl_dataset *ds = NULL;
    391 	objset_t *os = NULL;
    392 	zfsvfs_t *zfsvfs = NULL;
    393 	uint64_t nbmand;
    394 	int readonly, do_readonly = B_FALSE;
    395 	int setuid, do_setuid = B_FALSE;
    396 	int exec, do_exec = B_FALSE;
    397 	int devices, do_devices = B_FALSE;
    398 	int xattr, do_xattr = B_FALSE;
    399 	int atime, do_atime = B_FALSE;
    400 	int error = 0;
    401 
    402 	ASSERT(vfsp);
    403 	zfsvfs = vfsp->vfs_data;
    404 	ASSERT(zfsvfs);
    405 	os = zfsvfs->z_os;
    406 
    407 	/*
    408 	 * The act of registering our callbacks will destroy any mount
    409 	 * options we may have.  In order to enable temporary overrides
    410 	 * of mount options, we stash away the current values and
    411 	 * restore them after we register the callbacks.
    412 	 */
    413 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
    414 		readonly = B_TRUE;
    415 		do_readonly = B_TRUE;
    416 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
    417 		readonly = B_FALSE;
    418 		do_readonly = B_TRUE;
    419 	}
    420 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
    421 		devices = B_FALSE;
    422 		setuid = B_FALSE;
    423 		do_devices = B_TRUE;
    424 		do_setuid = B_TRUE;
    425 	} else {
    426 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
    427 			devices = B_FALSE;
    428 			do_devices = B_TRUE;
    429 		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
    430 			devices = B_TRUE;
    431 			do_devices = B_TRUE;
    432 		}
    433 
    434 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
    435 			setuid = B_FALSE;
    436 			do_setuid = B_TRUE;
    437 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
    438 			setuid = B_TRUE;
    439 			do_setuid = B_TRUE;
    440 		}
    441 	}
    442 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
    443 		exec = B_FALSE;
    444 		do_exec = B_TRUE;
    445 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
    446 		exec = B_TRUE;
    447 		do_exec = B_TRUE;
    448 	}
    449 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
    450 		xattr = B_FALSE;
    451 		do_xattr = B_TRUE;
    452 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
    453 		xattr = B_TRUE;
    454 		do_xattr = B_TRUE;
    455 	}
    456 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
    457 		atime = B_FALSE;
    458 		do_atime = B_TRUE;
    459 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
    460 		atime = B_TRUE;
    461 		do_atime = B_TRUE;
    462 	}
    463 
    464 	/*
    465 	 * nbmand is a special property.  It can only be changed at
    466 	 * mount time.
    467 	 *
    468 	 * This is weird, but it is documented to only be changeable
    469 	 * at mount time.
    470 	 */
    471 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
    472 		nbmand = B_FALSE;
    473 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
    474 		nbmand = B_TRUE;
    475 	} else {
    476 		char osname[MAXNAMELEN];
    477 
    478 		dmu_objset_name(os, osname);
    479 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
    480 		    NULL))
    481 		return (error);
    482 	}
    483 
    484 	/*
    485 	 * Register property callbacks.
    486 	 *
    487 	 * It would probably be fine to just check for i/o error from
    488 	 * the first prop_register(), but I guess I like to go
    489 	 * overboard...
    490 	 */
    491 	ds = dmu_objset_ds(os);
    492 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
    493 	error = error ? error : dsl_prop_register(ds,
    494 	    "xattr", xattr_changed_cb, zfsvfs);
    495 	error = error ? error : dsl_prop_register(ds,
    496 	    "recordsize", blksz_changed_cb, zfsvfs);
    497 	error = error ? error : dsl_prop_register(ds,
    498 	    "readonly", readonly_changed_cb, zfsvfs);
    499 	error = error ? error : dsl_prop_register(ds,
    500 	    "devices", devices_changed_cb, zfsvfs);
    501 	error = error ? error : dsl_prop_register(ds,
    502 	    "setuid", setuid_changed_cb, zfsvfs);
    503 	error = error ? error : dsl_prop_register(ds,
    504 	    "exec", exec_changed_cb, zfsvfs);
    505 	error = error ? error : dsl_prop_register(ds,
    506 	    "snapdir", snapdir_changed_cb, zfsvfs);
    507 	error = error ? error : dsl_prop_register(ds,
    508 	    "aclmode", acl_mode_changed_cb, zfsvfs);
    509 	error = error ? error : dsl_prop_register(ds,
    510 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
    511 	error = error ? error : dsl_prop_register(ds,
    512 	    "vscan", vscan_changed_cb, zfsvfs);
    513 	if (error)
    514 		goto unregister;
    515 
    516 	/*
    517 	 * Invoke our callbacks to restore temporary mount options.
    518 	 */
    519 	if (do_readonly)
    520 		readonly_changed_cb(zfsvfs, readonly);
    521 	if (do_setuid)
    522 		setuid_changed_cb(zfsvfs, setuid);
    523 	if (do_exec)
    524 		exec_changed_cb(zfsvfs, exec);
    525 	if (do_devices)
    526 		devices_changed_cb(zfsvfs, devices);
    527 	if (do_xattr)
    528 		xattr_changed_cb(zfsvfs, xattr);
    529 	if (do_atime)
    530 		atime_changed_cb(zfsvfs, atime);
    531 
    532 	nbmand_changed_cb(zfsvfs, nbmand);
    533 
    534 	return (0);
    535 
    536 unregister:
    537 	/*
    538 	 * We may attempt to unregister some callbacks that are not
    539 	 * registered, but this is OK; it will simply return ENOMSG,
    540 	 * which we will ignore.
    541 	 */
    542 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
    543 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
    544 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
    545 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
    546 	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
    547 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
    548 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
    549 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
    550 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
    551 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
    552 	    zfsvfs);
    553 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
    554 	return (error);
    555 
    556 }
    557 
    558 static int
    559 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
    560 {
    561 	uint_t readonly;
    562 	int error;
    563 
    564 	error = zfs_register_callbacks(zfsvfs->z_vfs);
    565 	if (error)
    566 		return (error);
    567 
    568 	/*
    569 	 * Set the objset user_ptr to track its zfsvfs.
    570 	 */
    571 	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
    572 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
    573 	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
    574 
    575 	/*
    576 	 * If we are not mounting (ie: online recv), then we don't
    577 	 * have to worry about replaying the log as we blocked all
    578 	 * operations out since we closed the ZIL.
    579 	 */
    580 	if (mounting) {
    581 		/*
    582 		 * During replay we remove the read only flag to
    583 		 * allow replays to succeed.
    584 		 */
    585 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
    586 		if (readonly != 0)
    587 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    588 		else
    589 			zfs_unlinked_drain(zfsvfs);
    590 
    591 		/*
    592 		 * Parse and replay the intent log.
    593 		 *
    594 		 * Because of ziltest, this must be done after
    595 		 * zfs_unlinked_drain().  (Further note: ziltest doesn't
    596 		 * use readonly mounts, where zfs_unlinked_drain() isn't
    597 		 * called.)  This is because ziltest causes spa_sync()
    598 		 * to think it's committed, but actually it is not, so
    599 		 * the intent log contains many txg's worth of changes.
    600 		 *
    601 		 * In particular, if object N is in the unlinked set in
    602 		 * the last txg to actually sync, then it could be
    603 		 * actually freed in a later txg and then reallocated in
    604 		 * a yet later txg.  This would write a "create object
    605 		 * N" record to the intent log.  Normally, this would be
    606 		 * fine because the spa_sync() would have written out
    607 		 * the fact that object N is free, before we could write
    608 		 * the "create object N" intent log record.
    609 		 *
    610 		 * But when we are in ziltest mode, we advance the "open
    611 		 * txg" without actually spa_sync()-ing the changes to
    612 		 * disk.  So we would see that object N is still
    613 		 * allocated and in the unlinked set, and there is an
    614 		 * intent log record saying to allocate it.
    615 		 */
    616 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
    617 		    zfs_replay_vector);
    618 
    619 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
    620 	}
    621 
    622 	if (!zil_disable)
    623 		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
    624 
    625 	return (0);
    626 }
    627 
    628 static void
    629 zfs_freezfsvfs(zfsvfs_t *zfsvfs)
    630 {
    631 	mutex_destroy(&zfsvfs->z_znodes_lock);
    632 	mutex_destroy(&zfsvfs->z_online_recv_lock);
    633 	list_destroy(&zfsvfs->z_all_znodes);
    634 	rrw_destroy(&zfsvfs->z_teardown_lock);
    635 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
    636 	rw_destroy(&zfsvfs->z_fuid_lock);
    637 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
    638 }
    639 
    640 static int
    641 zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
    642 {
    643 	dev_t mount_dev;
    644 	uint64_t recordsize, readonly;
    645 	int error = 0;
    646 	int mode;
    647 	zfsvfs_t *zfsvfs;
    648 	znode_t *zp = NULL;
    649 
    650 	ASSERT(vfsp);
    651 	ASSERT(osname);
    652 
    653 	/*
    654 	 * Initialize the zfs-specific filesystem structure.
    655 	 * Should probably make this a kmem cache, shuffle fields,
    656 	 * and just bzero up to z_hold_mtx[].
    657 	 */
    658 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
    659 	zfsvfs->z_vfs = vfsp;
    660 	zfsvfs->z_parent = zfsvfs;
    661 	zfsvfs->z_assign = TXG_NOWAIT;
    662 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
    663 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
    664 
    665 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
    666 	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
    667 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
    668 	    offsetof(znode_t, z_link_node));
    669 	rrw_init(&zfsvfs->z_teardown_lock);
    670 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
    671 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
    672 
    673 	/* Initialize the generic filesystem structure. */
    674 	vfsp->vfs_bcount = 0;
    675 	vfsp->vfs_data = NULL;
    676 
    677 	if (zfs_create_unique_device(&mount_dev) == -1) {
    678 		error = ENODEV;
    679 		goto out;
    680 	}
    681 	ASSERT(vfs_devismounted(mount_dev) == 0);
    682 
    683 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
    684 	    NULL))
    685 		goto out;
    686 
    687 	vfsp->vfs_dev = mount_dev;
    688 	vfsp->vfs_fstype = zfsfstype;
    689 	vfsp->vfs_bsize = recordsize;
    690 	vfsp->vfs_flag |= VFS_NOTRUNC;
    691 	vfsp->vfs_data = zfsvfs;
    692 
    693 	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
    694 		goto out;
    695 
    696 	mode = DS_MODE_OWNER;
    697 	if (readonly)
    698 		mode |= DS_MODE_READONLY;
    699 
    700 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
    701 	if (error == EROFS) {
    702 		mode = DS_MODE_OWNER | DS_MODE_READONLY;
    703 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
    704 		    &zfsvfs->z_os);
    705 	}
    706 
    707 	if (error)
    708 		goto out;
    709 
    710 	if (error = zfs_init_fs(zfsvfs, &zp, cr))
    711 		goto out;
    712 
    713 	/* The call to zfs_init_fs leaves the vnode held, release it here. */
    714 	VN_RELE(ZTOV(zp));
    715 
    716 	/*
    717 	 * Set features for file system.
    718 	 */
    719 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
    720 	if (zfsvfs->z_use_fuids) {
    721 		vfs_set_feature(vfsp, VFSFT_XVATTR);
    722 		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
    723 		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
    724 	}
    725 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
    726 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    727 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    728 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
    729 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
    730 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    731 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    732 	}
    733 
    734 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
    735 		uint64_t pval;
    736 
    737 		ASSERT(mode & DS_MODE_READONLY);
    738 		atime_changed_cb(zfsvfs, B_FALSE);
    739 		readonly_changed_cb(zfsvfs, B_TRUE);
    740 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
    741 			goto out;
    742 		xattr_changed_cb(zfsvfs, pval);
    743 		zfsvfs->z_issnap = B_TRUE;
    744 	} else {
    745 		error = zfsvfs_setup(zfsvfs, B_TRUE);
    746 	}
    747 
    748 	if (!zfsvfs->z_issnap)
    749 		zfsctl_create(zfsvfs);
    750 out:
    751 	if (error) {
    752 		if (zfsvfs->z_os)
    753 			dmu_objset_close(zfsvfs->z_os);
    754 		zfs_freezfsvfs(zfsvfs);
    755 	} else {
    756 		atomic_add_32(&zfs_active_fs_count, 1);
    757 	}
    758 
    759 	return (error);
    760 }
    761 
    762 void
    763 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
    764 {
    765 	objset_t *os = zfsvfs->z_os;
    766 	struct dsl_dataset *ds;
    767 
    768 	/*
    769 	 * Unregister properties.
    770 	 */
    771 	if (!dmu_objset_is_snapshot(os)) {
    772 		ds = dmu_objset_ds(os);
    773 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
    774 		    zfsvfs) == 0);
    775 
    776 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
    777 		    zfsvfs) == 0);
    778 
    779 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
    780 		    zfsvfs) == 0);
    781 
    782 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
    783 		    zfsvfs) == 0);
    784 
    785 		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
    786 		    zfsvfs) == 0);
    787 
    788 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
    789 		    zfsvfs) == 0);
    790 
    791 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
    792 		    zfsvfs) == 0);
    793 
    794 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
    795 		    zfsvfs) == 0);
    796 
    797 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
    798 		    zfsvfs) == 0);
    799 
    800 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
    801 		    acl_inherit_changed_cb, zfsvfs) == 0);
    802 
    803 		VERIFY(dsl_prop_unregister(ds, "vscan",
    804 		    vscan_changed_cb, zfsvfs) == 0);
    805 	}
    806 }
    807 
    808 /*
    809  * Convert a decimal digit string to a uint64_t integer.
    810  */
    811 static int
    812 str_to_uint64(char *str, uint64_t *objnum)
    813 {
    814 	uint64_t num = 0;
    815 
    816 	while (*str) {
    817 		if (*str < '0' || *str > '9')
    818 			return (EINVAL);
    819 
    820 		num = num*10 + *str++ - '0';
    821 	}
    822 
    823 	*objnum = num;
    824 	return (0);
    825 }
    826 
    827 /*
    828  * The boot path passed from the boot loader is in the form of
    829  * "rootpool-name/root-filesystem-object-number'. Convert this
    830  * string to a dataset name: "rootpool-name/root-filesystem-name".
    831  */
    832 static int
    833 zfs_parse_bootfs(char *bpath, char *outpath)
    834 {
    835 	char *slashp;
    836 	uint64_t objnum;
    837 	int error;
    838 
    839 	if (*bpath == 0 || *bpath == '/')
    840 		return (EINVAL);
    841 
    842 	slashp = strchr(bpath, '/');
    843 
    844 	/* if no '/', just return the pool name */
    845 	if (slashp == NULL) {
    846 		(void) strcpy(outpath, bpath);
    847 		return (0);
    848 	}
    849 
    850 	if (error = str_to_uint64(slashp+1, &objnum))
    851 		return (error);
    852 
    853 	*slashp = '\0';
    854 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
    855 	*slashp = '/';
    856 
    857 	return (error);
    858 }
    859 
    860 static int
    861 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
    862 {
    863 	int error = 0;
    864 	static int zfsrootdone = 0;
    865 	zfsvfs_t *zfsvfs = NULL;
    866 	znode_t *zp = NULL;
    867 	vnode_t *vp = NULL;
    868 	char *zfs_bootfs;
    869 
    870 	ASSERT(vfsp);
    871 
    872 	/*
    873 	 * The filesystem that we mount as root is defined in the
    874 	 * boot property "zfs-bootfs" with a format of
    875 	 * "poolname/root-dataset-objnum".
    876 	 */
    877 	if (why == ROOT_INIT) {
    878 		if (zfsrootdone++)
    879 			return (EBUSY);
    880 		/*
    881 		 * the process of doing a spa_load will require the
    882 		 * clock to be set before we could (for example) do
    883 		 * something better by looking at the timestamp on
    884 		 * an uberblock, so just set it to -1.
    885 		 */
    886 		clkset(-1);
    887 
    888 		if ((zfs_bootfs = spa_get_bootfs()) == NULL) {
    889 			cmn_err(CE_NOTE, "\nspa_get_bootfs: can not get "
    890 			    "bootfs name \n");
    891 			return (EINVAL);
    892 		}
    893 
    894 		if (error = spa_import_rootpool(rootfs.bo_name)) {
    895 			spa_free_bootfs(zfs_bootfs);
    896 			cmn_err(CE_NOTE, "\nspa_import_rootpool: error %d\n",
    897 			    error);
    898 			return (error);
    899 		}
    900 
    901 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
    902 			spa_free_bootfs(zfs_bootfs);
    903 			cmn_err(CE_NOTE, "\nzfs_parse_bootfs: error %d\n",
    904 			    error);
    905 			return (error);
    906 		}
    907 
    908 		spa_free_bootfs(zfs_bootfs);
    909 
    910 		if (error = vfs_lock(vfsp))
    911 			return (error);
    912 
    913 		if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) {
    914 			cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error);
    915 			goto out;
    916 		}
    917 
    918 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
    919 		ASSERT(zfsvfs);
    920 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
    921 			cmn_err(CE_NOTE, "\nzfs_zget: error %d\n", error);
    922 			goto out;
    923 		}
    924 
    925 		vp = ZTOV(zp);
    926 		mutex_enter(&vp->v_lock);
    927 		vp->v_flag |= VROOT;
    928 		mutex_exit(&vp->v_lock);
    929 		rootvp = vp;
    930 
    931 		/*
    932 		 * Leave rootvp held.  The root file system is never unmounted.
    933 		 */
    934 
    935 		vfs_add((struct vnode *)0, vfsp,
    936 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
    937 out:
    938 		vfs_unlock(vfsp);
    939 		return (error);
    940 	} else if (why == ROOT_REMOUNT) {
    941 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
    942 		vfsp->vfs_flag |= VFS_REMOUNT;
    943 
    944 		/* refresh mount options */
    945 		zfs_unregister_callbacks(vfsp->vfs_data);
    946 		return (zfs_register_callbacks(vfsp));
    947 
    948 	} else if (why == ROOT_UNMOUNT) {
    949 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
    950 		(void) zfs_sync(vfsp, 0, 0);
    951 		return (0);
    952 	}
    953 
    954 	/*
    955 	 * if "why" is equal to anything else other than ROOT_INIT,
    956 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
    957 	 */
    958 	return (ENOTSUP);
    959 }
    960 
    961 /*ARGSUSED*/
    962 static int
    963 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
    964 {
    965 	char		*osname;
    966 	pathname_t	spn;
    967 	int		error = 0;
    968 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
    969 	    UIO_SYSSPACE : UIO_USERSPACE;
    970 	int		canwrite;
    971 
    972 	if (mvp->v_type != VDIR)
    973 		return (ENOTDIR);
    974 
    975 	mutex_enter(&mvp->v_lock);
    976 	if ((uap->flags & MS_REMOUNT) == 0 &&
    977 	    (uap->flags & MS_OVERLAY) == 0 &&
    978 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
    979 		mutex_exit(&mvp->v_lock);
    980 		return (EBUSY);
    981 	}
    982 	mutex_exit(&mvp->v_lock);
    983 
    984 	/*
    985 	 * ZFS does not support passing unparsed data in via MS_DATA.
    986 	 * Users should use the MS_OPTIONSTR interface; this means
    987 	 * that all option parsing is already done and the options struct
    988 	 * can be interrogated.
    989 	 */
    990 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
    991 		return (EINVAL);
    992 
    993 	/*
    994 	 * Get the objset name (the "special" mount argument).
    995 	 */
    996 	if (error = pn_get(uap->spec, fromspace, &spn))
    997 		return (error);
    998 
    999 	osname = spn.pn_path;
   1000 
   1001 	/*
   1002 	 * Check for mount privilege?
   1003 	 *
   1004 	 * If we don't have privilege then see if
   1005 	 * we have local permission to allow it
   1006 	 */</