Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/sysmacros.h>
     30 #include <sys/kmem.h>
     31 #include <sys/pathname.h>
     32 #include <sys/vnode.h>
     33 #include <sys/vfs.h>
     34 #include <sys/vfs_opreg.h>
     35 #include <sys/mntent.h>
     36 #include <sys/mount.h>
     37 #include <sys/cmn_err.h>
     38 #include "fs/fs_subr.h"
     39 #include <sys/zfs_znode.h>
     40 #include <sys/zfs_dir.h>
     41 #include <sys/zil.h>
     42 #include <sys/fs/zfs.h>
     43 #include <sys/dmu.h>
     44 #include <sys/dsl_prop.h>
     45 #include <sys/dsl_dataset.h>
     46 #include <sys/dsl_deleg.h>
     47 #include <sys/spa.h>
     48 #include <sys/zap.h>
     49 #include <sys/varargs.h>
     50 #include <sys/policy.h>
     51 #include <sys/atomic.h>
     52 #include <sys/mkdev.h>
     53 #include <sys/modctl.h>
     54 #include <sys/refstr.h>
     55 #include <sys/zfs_ioctl.h>
     56 #include <sys/zfs_ctldir.h>
     57 #include <sys/zfs_fuid.h>
     58 #include <sys/bootconf.h>
     59 #include <sys/sunddi.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/dmu_objset.h>
     62 #include <sys/spa_boot.h>
     63 
     64 int zfsfstype;
     65 vfsops_t *zfs_vfsops = NULL;
     66 static major_t zfs_major;
     67 static minor_t zfs_minor;
     68 static kmutex_t	zfs_dev_mtx;
     69 
     70 extern int sys_shutdown;
     71 
     72 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
     73 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
     74 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
     75 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
     76 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
     77 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
     78 static void zfs_freevfs(vfs_t *vfsp);
     79 
     80 static const fs_operation_def_t zfs_vfsops_template[] = {
     81 	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
     82 	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
     83 	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
     84 	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
     85 	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
     86 	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
     87 	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
     88 	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
     89 	NULL,			NULL
     90 };
     91 
     92 static const fs_operation_def_t zfs_vfsops_eio_template[] = {
     93 	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
     94 	NULL,			NULL
     95 };
     96 
     97 /*
     98  * We need to keep a count of active fs's.
     99  * This is necessary to prevent our module
    100  * from being unloaded after a umount -f
    101  */
    102 static uint32_t	zfs_active_fs_count = 0;
    103 
    104 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
    105 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
    106 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
    107 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
    108 
    109 /*
    110  * MO_DEFAULT is not used since the default value is determined
    111  * by the equivalent property.
    112  */
    113 static mntopt_t mntopts[] = {
    114 	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
    115 	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
    116 	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
    117 	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
    118 };
    119 
    120 static mntopts_t zfs_mntopts = {
    121 	sizeof (mntopts) / sizeof (mntopt_t),
    122 	mntopts
    123 };
    124 
    125 /*ARGSUSED*/
    126 int
    127 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
    128 {
    129 	/*
    130 	 * Data integrity is job one.  We don't want a compromised kernel
    131 	 * writing to the storage pool, so we never sync during panic.
    132 	 */
    133 	if (panicstr)
    134 		return (0);
    135 
    136 	/*
    137 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
    138 	 * to sync metadata, which they would otherwise cache indefinitely.
    139 	 * Semantically, the only requirement is that the sync be initiated.
    140 	 * The DMU syncs out txgs frequently, so there's nothing to do.
    141 	 */
    142 	if (flag & SYNC_ATTR)
    143 		return (0);
    144 
    145 	if (vfsp != NULL) {
    146 		/*
    147 		 * Sync a specific filesystem.
    148 		 */
    149 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    150 		dsl_pool_t *dp;
    151 
    152 		ZFS_ENTER(zfsvfs);
    153 		dp = dmu_objset_pool(zfsvfs->z_os);
    154 
    155 		/*
    156 		 * If the system is shutting down, then skip any
    157 		 * filesystems which may exist on a suspended pool.
    158 		 */
    159 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
    160 			ZFS_EXIT(zfsvfs);
    161 			return (0);
    162 		}
    163 
    164 		if (zfsvfs->z_log != NULL)
    165 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
    166 		else
    167 			txg_wait_synced(dp, 0);
    168 		ZFS_EXIT(zfsvfs);
    169 	} else {
    170 		/*
    171 		 * Sync all ZFS filesystems.  This is what happens when you
    172 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    173 		 * request by waiting for all pools to commit all dirty data.
    174 		 */
    175 		spa_sync_allpools();
    176 	}
    177 
    178 	return (0);
    179 }
    180 
    181 static int
    182 zfs_create_unique_device(dev_t *dev)
    183 {
    184 	major_t new_major;
    185 
    186 	do {
    187 		ASSERT3U(zfs_minor, <=, MAXMIN32);
    188 		minor_t start = zfs_minor;
    189 		do {
    190 			mutex_enter(&zfs_dev_mtx);
    191 			if (zfs_minor >= MAXMIN32) {
    192 				/*
    193 				 * If we're still using the real major
    194 				 * keep out of /dev/zfs and /dev/zvol minor
    195 				 * number space.  If we're using a getudev()'ed
    196 				 * major number, we can use all of its minors.
    197 				 */
    198 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
    199 					zfs_minor = ZFS_MIN_MINOR;
    200 				else
    201 					zfs_minor = 0;
    202 			} else {
    203 				zfs_minor++;
    204 			}
    205 			*dev = makedevice(zfs_major, zfs_minor);
    206 			mutex_exit(&zfs_dev_mtx);
    207 		} while (vfs_devismounted(*dev) && zfs_minor != start);
    208 		if (zfs_minor == start) {
    209 			/*
    210 			 * We are using all ~262,000 minor numbers for the
    211 			 * current major number.  Create a new major number.
    212 			 */
    213 			if ((new_major = getudev()) == (major_t)-1) {
    214 				cmn_err(CE_WARN,
    215 				    "zfs_mount: Can't get unique major "
    216 				    "device number.");
    217 				return (-1);
    218 			}
    219 			mutex_enter(&zfs_dev_mtx);
    220 			zfs_major = new_major;
    221 			zfs_minor = 0;
    222 
    223 			mutex_exit(&zfs_dev_mtx);
    224 		} else {
    225 			break;
    226 		}
    227 		/* CONSTANTCONDITION */
    228 	} while (1);
    229 
    230 	return (0);
    231 }
    232 
    233 static void
    234 atime_changed_cb(void *arg, uint64_t newval)
    235 {
    236 	zfsvfs_t *zfsvfs = arg;
    237 
    238 	if (newval == TRUE) {
    239 		zfsvfs->z_atime = TRUE;
    240 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    241 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    242 	} else {
    243 		zfsvfs->z_atime = FALSE;
    244 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    245 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    246 	}
    247 }
    248 
    249 static void
    250 xattr_changed_cb(void *arg, uint64_t newval)
    251 {
    252 	zfsvfs_t *zfsvfs = arg;
    253 
    254 	if (newval == TRUE) {
    255 		/* XXX locking on vfs_flag? */
    256 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    257 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    258 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    259 	} else {
    260 		/* XXX locking on vfs_flag? */
    261 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    262 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    263 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    264 	}
    265 }
    266 
    267 static void
    268 blksz_changed_cb(void *arg, uint64_t newval)
    269 {
    270 	zfsvfs_t *zfsvfs = arg;
    271 
    272 	if (newval < SPA_MINBLOCKSIZE ||
    273 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
    274 		newval = SPA_MAXBLOCKSIZE;
    275 
    276 	zfsvfs->z_max_blksz = newval;
    277 	zfsvfs->z_vfs->vfs_bsize = newval;
    278 }
    279 
    280 static void
    281 readonly_changed_cb(void *arg, uint64_t newval)
    282 {
    283 	zfsvfs_t *zfsvfs = arg;
    284 
    285 	if (newval) {
    286 		/* XXX locking on vfs_flag? */
    287 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    288 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    289 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    290 	} else {
    291 		/* XXX locking on vfs_flag? */
    292 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    293 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    294 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    295 	}
    296 }
    297 
    298 static void
    299 devices_changed_cb(void *arg, uint64_t newval)
    300 {
    301 	zfsvfs_t *zfsvfs = arg;
    302 
    303 	if (newval == FALSE) {
    304 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
    305 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
    306 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
    307 	} else {
    308 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
    309 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
    310 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
    311 	}
    312 }
    313 
    314 static void
    315 setuid_changed_cb(void *arg, uint64_t newval)
    316 {
    317 	zfsvfs_t *zfsvfs = arg;
    318 
    319 	if (newval == FALSE) {
    320 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    321 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    322 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    323 	} else {
    324 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    325 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    326 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    327 	}
    328 }
    329 
    330 static void
    331 exec_changed_cb(void *arg, uint64_t newval)
    332 {
    333 	zfsvfs_t *zfsvfs = arg;
    334 
    335 	if (newval == FALSE) {
    336 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    337 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    338 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    339 	} else {
    340 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    341 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    342 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    343 	}
    344 }
    345 
    346 /*
    347  * The nbmand mount option can be changed at mount time.
    348  * We can't allow it to be toggled on live file systems or incorrect
    349  * behavior may be seen from cifs clients
    350  *
    351  * This property isn't registered via dsl_prop_register(), but this callback
    352  * will be called when a file system is first mounted
    353  */
    354 static void
    355 nbmand_changed_cb(void *arg, uint64_t newval)
    356 {
    357 	zfsvfs_t *zfsvfs = arg;
    358 	if (newval == FALSE) {
    359 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    360 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    361 	} else {
    362 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    363 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    364 	}
    365 }
    366 
    367 static void
    368 snapdir_changed_cb(void *arg, uint64_t newval)
    369 {
    370 	zfsvfs_t *zfsvfs = arg;
    371 
    372 	zfsvfs->z_show_ctldir = newval;
    373 }
    374 
    375 static void
    376 vscan_changed_cb(void *arg, uint64_t newval)
    377 {
    378 	zfsvfs_t *zfsvfs = arg;
    379 
    380 	zfsvfs->z_vscan = newval;
    381 }
    382 
    383 static void
    384 acl_mode_changed_cb(void *arg, uint64_t newval)
    385 {
    386 	zfsvfs_t *zfsvfs = arg;
    387 
    388 	zfsvfs->z_acl_mode = newval;
    389 }
    390 
    391 static void
    392 acl_inherit_changed_cb(void *arg, uint64_t newval)
    393 {
    394 	zfsvfs_t *zfsvfs = arg;
    395 
    396 	zfsvfs->z_acl_inherit = newval;
    397 }
    398 
    399 static int
    400 zfs_register_callbacks(vfs_t *vfsp)
    401 {
    402 	struct dsl_dataset *ds = NULL;
    403 	objset_t *os = NULL;
    404 	zfsvfs_t *zfsvfs = NULL;
    405 	uint64_t nbmand;
    406 	int readonly, do_readonly = B_FALSE;
    407 	int setuid, do_setuid = B_FALSE;
    408 	int exec, do_exec = B_FALSE;
    409 	int devices, do_devices = B_FALSE;
    410 	int xattr, do_xattr = B_FALSE;
    411 	int atime, do_atime = B_FALSE;
    412 	int error = 0;
    413 
    414 	ASSERT(vfsp);
    415 	zfsvfs = vfsp->vfs_data;
    416 	ASSERT(zfsvfs);
    417 	os = zfsvfs->z_os;
    418 
    419 	/*
    420 	 * The act of registering our callbacks will destroy any mount
    421 	 * options we may have.  In order to enable temporary overrides
    422 	 * of mount options, we stash away the current values and
    423 	 * restore them after we register the callbacks.
    424 	 */
    425 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
    426 		readonly = B_TRUE;
    427 		do_readonly = B_TRUE;
    428 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
    429 		readonly = B_FALSE;
    430 		do_readonly = B_TRUE;
    431 	}
    432 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
    433 		devices = B_FALSE;
    434 		setuid = B_FALSE;
    435 		do_devices = B_TRUE;
    436 		do_setuid = B_TRUE;
    437 	} else {
    438 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
    439 			devices = B_FALSE;
    440 			do_devices = B_TRUE;
    441 		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
    442 			devices = B_TRUE;
    443 			do_devices = B_TRUE;
    444 		}
    445 
    446 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
    447 			setuid = B_FALSE;
    448 			do_setuid = B_TRUE;
    449 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
    450 			setuid = B_TRUE;
    451 			do_setuid = B_TRUE;
    452 		}
    453 	}
    454 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
    455 		exec = B_FALSE;
    456 		do_exec = B_TRUE;
    457 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
    458 		exec = B_TRUE;
    459 		do_exec = B_TRUE;
    460 	}
    461 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
    462 		xattr = B_FALSE;
    463 		do_xattr = B_TRUE;
    464 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
    465 		xattr = B_TRUE;
    466 		do_xattr = B_TRUE;
    467 	}
    468 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
    469 		atime = B_FALSE;
    470 		do_atime = B_TRUE;
    471 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
    472 		atime = B_TRUE;
    473 		do_atime = B_TRUE;
    474 	}
    475 
    476 	/*
    477 	 * nbmand is a special property.  It can only be changed at
    478 	 * mount time.
    479 	 *
    480 	 * This is weird, but it is documented to only be changeable
    481 	 * at mount time.
    482 	 */
    483 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
    484 		nbmand = B_FALSE;
    485 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
    486 		nbmand = B_TRUE;
    487 	} else {
    488 		char osname[MAXNAMELEN];
    489 
    490 		dmu_objset_name(os, osname);
    491 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
    492 		    NULL)) {
    493 			return (error);
    494 		}
    495 	}
    496 
    497 	/*
    498 	 * Register property callbacks.
    499 	 *
    500 	 * It would probably be fine to just check for i/o error from
    501 	 * the first prop_register(), but I guess I like to go
    502 	 * overboard...
    503 	 */
    504 	ds = dmu_objset_ds(os);
    505 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
    506 	error = error ? error : dsl_prop_register(ds,
    507 	    "xattr", xattr_changed_cb, zfsvfs);
    508 	error = error ? error : dsl_prop_register(ds,
    509 	    "recordsize", blksz_changed_cb, zfsvfs);
    510 	error = error ? error : dsl_prop_register(ds,
    511 	    "readonly", readonly_changed_cb, zfsvfs);
    512 	error = error ? error : dsl_prop_register(ds,
    513 	    "devices", devices_changed_cb, zfsvfs);
    514 	error = error ? error : dsl_prop_register(ds,
    515 	    "setuid", setuid_changed_cb, zfsvfs);
    516 	error = error ? error : dsl_prop_register(ds,
    517 	    "exec", exec_changed_cb, zfsvfs);
    518 	error = error ? error : dsl_prop_register(ds,
    519 	    "snapdir", snapdir_changed_cb, zfsvfs);
    520 	error = error ? error : dsl_prop_register(ds,
    521 	    "aclmode", acl_mode_changed_cb, zfsvfs);
    522 	error = error ? error : dsl_prop_register(ds,
    523 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
    524 	error = error ? error : dsl_prop_register(ds,
    525 	    "vscan", vscan_changed_cb, zfsvfs);
    526 	if (error)
    527 		goto unregister;
    528 
    529 	/*
    530 	 * Invoke our callbacks to restore temporary mount options.
    531 	 */
    532 	if (do_readonly)
    533 		readonly_changed_cb(zfsvfs, readonly);
    534 	if (do_setuid)
    535 		setuid_changed_cb(zfsvfs, setuid);
    536 	if (do_exec)
    537 		exec_changed_cb(zfsvfs, exec);
    538 	if (do_devices)
    539 		devices_changed_cb(zfsvfs, devices);
    540 	if (do_xattr)
    541 		xattr_changed_cb(zfsvfs, xattr);
    542 	if (do_atime)
    543 		atime_changed_cb(zfsvfs, atime);
    544 
    545 	nbmand_changed_cb(zfsvfs, nbmand);
    546 
    547 	return (0);
    548 
    549 unregister:
    550 	/*
    551 	 * We may attempt to unregister some callbacks that are not
    552 	 * registered, but this is OK; it will simply return ENOMSG,
    553 	 * which we will ignore.
    554 	 */
    555 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
    556 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
    557 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
    558 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
    559 	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
    560 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
    561 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
    562 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
    563 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
    564 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
    565 	    zfsvfs);
    566 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
    567 	return (error);
    568 
    569 }
    570 
    571 static void
    572 uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
    573     int64_t delta, dmu_tx_t *tx)
    574 {
    575 	uint64_t used = 0;
    576 	char buf[32];
    577 	int err;
    578 	uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
    579 
    580 	if (delta == 0)
    581 		return;
    582 
    583 	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
    584 	err = zap_lookup(os, obj, buf, 8, 1, &used);
    585 	ASSERT(err == 0 || err == ENOENT);
    586 	/* no underflow/overflow */
    587 	ASSERT(delta > 0 || used >= -delta);
    588 	ASSERT(delta < 0 || used + delta > used);
    589 	used += delta;
    590 	if (used == 0)
    591 		err = zap_remove(os, obj, buf, tx);
    592 	else
    593 		err = zap_update(os, obj, buf, 8, 1, &used, tx);
    594 	ASSERT(err == 0);
    595 }
    596 
    597 static void
    598 zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
    599     void *oldbonus, void *newbonus,
    600     uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
    601 {
    602 	znode_phys_t *oldznp = oldbonus;
    603 	znode_phys_t *newznp = newbonus;
    604 
    605 	if (bonustype != DMU_OT_ZNODE)
    606 		return;
    607 
    608 	/* We charge 512 for the dnode (if it's allocated). */
    609 	if (oldznp->zp_gen != 0)
    610 		oldused += DNODE_SIZE;
    611 	if (newznp->zp_gen != 0)
    612 		newused += DNODE_SIZE;
    613 
    614 	if (oldznp->zp_uid == newznp->zp_uid) {
    615 		uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
    616 	} else {
    617 		uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
    618 		uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
    619 	}
    620 
    621 	if (oldznp->zp_gid == newznp->zp_gid) {
    622 		uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
    623 	} else {
    624 		uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
    625 		uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
    626 	}
    627 }
    628 
    629 static void
    630 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    631     char *domainbuf, int buflen, uid_t *ridp)
    632 {
    633 	extern uint64_t strtonum(const char *str, char **nptr);
    634 	uint64_t fuid;
    635 	const char *domain;
    636 
    637 	fuid = strtonum(fuidstr, NULL);
    638 
    639 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
    640 	if (domain)
    641 		(void) strlcpy(domainbuf, domain, buflen);
    642 	else
    643 		domainbuf[0] = '\0';
    644 	*ridp = FUID_RID(fuid);
    645 }
    646 
    647 static uint64_t
    648 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
    649 {
    650 	switch (type) {
    651 	case ZFS_PROP_USERUSED:
    652 		return (DMU_USERUSED_OBJECT);
    653 	case ZFS_PROP_GROUPUSED:
    654 		return (DMU_GROUPUSED_OBJECT);
    655 	case ZFS_PROP_USERQUOTA:
    656 		return (zfsvfs->z_userquota_obj);
    657 	case ZFS_PROP_GROUPQUOTA:
    658 		return (zfsvfs->z_groupquota_obj);
    659 	}
    660 	return (0);
    661 }
    662 
    663 int
    664 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    665     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
    666 {
    667 	int error;
    668 	zap_cursor_t zc;
    669 	zap_attribute_t za;
    670 	zfs_useracct_t *buf = vbuf;
    671 	uint64_t obj;
    672 
    673 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    674 		return (ENOTSUP);
    675 
    676 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    677 	if (obj == 0) {
    678 		*bufsizep = 0;
    679 		return (0);
    680 	}
    681 
    682 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
    683 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
    684 	    zap_cursor_advance(&zc)) {
    685 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
    686 		    *bufsizep)
    687 			break;
    688 
    689 		fuidstr_to_sid(zfsvfs, za.za_name,
    690 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
    691 
    692 		buf->zu_space = za.za_first_integer;
    693 		buf++;
    694 	}
    695 	if (error == ENOENT)
    696 		error = 0;
    697 
    698 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
    699 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
    700 	*cookiep = zap_cursor_serialize(&zc);
    701 	zap_cursor_fini(&zc);
    702 	return (error);
    703 }
    704 
    705 /*
    706  * buf must be big enough (eg, 32 bytes)
    707  */
    708 static int
    709 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    710     char *buf, boolean_t addok)
    711 {
    712 	uint64_t fuid;
    713 	int domainid = 0;
    714 
    715 	if (domain && domain[0]) {
    716 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
    717 		if (domainid == -1)
    718 			return (ENOENT);
    719 	}
    720 	fuid = FUID_ENCODE(domainid, rid);
    721 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
    722 	return (0);
    723 }
    724 
    725 int
    726 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    727     const char *domain, uint64_t rid, uint64_t *valp)
    728 {
    729 	char buf[32];
    730 	int err;
    731 	uint64_t obj;
    732 
    733 	*valp = 0;
    734 
    735 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    736 		return (ENOTSUP);
    737 
    738 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    739 	if (obj == 0)
    740 		return (0);
    741 
    742 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
    743 	if (err)
    744 		return (err);
    745 
    746 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
    747 	if (err == ENOENT)
    748 		err = 0;
    749 	return (err);
    750 }
    751 
    752 int
    753 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    754     const char *domain, uint64_t rid, uint64_t quota)
    755 {
    756 	char buf[32];
    757 	int err;
    758 	dmu_tx_t *tx;
    759 	uint64_t *objp;
    760 	boolean_t fuid_dirtied;
    761 
    762 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
    763 		return (EINVAL);
    764 
    765 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
    766 		return (ENOTSUP);
    767 
    768 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
    769 	    &zfsvfs->z_groupquota_obj;
    770 
    771 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
    772 	if (err)
    773 		return (err);
    774 	fuid_dirtied = zfsvfs->z_fuid_dirty;
    775 
    776 	tx = dmu_tx_create(zfsvfs->z_os);
    777 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
    778 	if (*objp == 0) {
    779 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
    780 		    zfs_userquota_prop_prefixes[type]);
    781 	}
    782 	if (fuid_dirtied)
    783 		zfs_fuid_txhold(zfsvfs, tx);
    784 	err = dmu_tx_assign(tx, TXG_WAIT);
    785 	if (err) {
    786 		dmu_tx_abort(tx);
    787 		return (err);
    788 	}
    789 
    790 	mutex_enter(&zfsvfs->z_lock);
    791 	if (*objp == 0) {
    792 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
    793 		    DMU_OT_NONE, 0, tx);
    794 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
    795 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
    796 	}
    797 	mutex_exit(&zfsvfs->z_lock);
    798 
    799 	if (quota == 0) {
    800 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
    801 		if (err == ENOENT)
    802 			err = 0;
    803 	} else {
    804 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
    805 	}
    806 	ASSERT(err == 0);
    807 	if (fuid_dirtied)
    808 		zfs_fuid_sync(zfsvfs, tx);
    809 	dmu_tx_commit(tx);
    810 	return (err);
    811 }
    812 
    813 boolean_t
    814 zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
    815 {
    816 	char buf[32];
    817 	uint64_t used, quota, usedobj, quotaobj;
    818 	int err;
    819 
    820 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
    821 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
    822 
    823 	if (quotaobj == 0 || zfsvfs->z_replay)
    824 		return (B_FALSE);
    825 
    826 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
    827 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
    828 	if (err != 0)
    829 		return (B_FALSE);
    830 
    831 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
    832 	if (err != 0)
    833 		return (B_FALSE);
    834 	return (used >= quota);
    835 }
    836 
    837 int
    838 zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
    839 {
    840 	objset_t *os;
    841 	zfsvfs_t *zfsvfs;
    842 	uint64_t zval;
    843 	int i, error;
    844 
    845 	if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
    846 		return (error);
    847 	if (zval)
    848 		mode |= DS_MODE_READONLY;
    849 
    850 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
    851 	if (error == EROFS) {
    852 		mode |= DS_MODE_READONLY;
    853 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
    854 	}
    855 	if (error)
    856 		return (error);
    857 
    858 	/*
    859 	 * Initialize the zfs-specific filesystem structure.
    860 	 * Should probably make this a kmem cache, shuffle fields,
    861 	 * and just bzero up to z_hold_mtx[].
    862 	 */
    863 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
    864 	zfsvfs->z_vfs = NULL;
    865 	zfsvfs->z_parent = zfsvfs;
    866 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
    867 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
    868 	zfsvfs->z_os = os;
    869 
    870 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
    871 	if (error) {
    872 		goto out;
    873 	} else if (zfsvfs->z_version > ZPL_VERSION) {
    874 		(void) printf("Mismatched versions:  File system "
    875 		    "is version %llu on-disk format, which is "
    876 		    "incompatible with this software version %lld!",
    877 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
    878 		error = ENOTSUP;
    879 		goto out;
    880 	}
    881 
    882 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
    883 		goto out;
    884 	zfsvfs->z_norm = (int)zval;
    885 
    886 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
    887 		goto out;
    888 	zfsvfs->z_utf8 = (zval != 0);
    889 
    890 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
    891 		goto out;
    892 	zfsvfs->z_case = (uint_t)zval;
    893 
    894 	/*
    895 	 * Fold case on file systems that are always or sometimes case
    896 	 * insensitive.
    897 	 */
    898 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
    899 	    zfsvfs->z_case == ZFS_CASE_MIXED)
    900 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
    901 
    902 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
    903 
    904 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
    905 	    &zfsvfs->z_root);
    906 	if (error)
    907 		goto out;
    908 	ASSERT(zfsvfs->z_root != 0);
    909 
    910 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
    911 	    &zfsvfs->z_unlinkedobj);
    912 	if (error)
    913 		goto out;
    914 
    915 	error = zap_lookup(os, MASTER_NODE_OBJ,
    916 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
    917 	    8, 1, &zfsvfs->z_userquota_obj);
    918 	if (error && error != ENOENT)
    919 		goto out;
    920 
    921 	error = zap_lookup(os, MASTER_NODE_OBJ,
    922 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
    923 	    8, 1, &zfsvfs->z_groupquota_obj);
    924 	if (error && error != ENOENT)
    925 		goto out;
    926 
    927 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
    928 	    &zfsvfs->z_fuid_obj);
    929 	if (error && error != ENOENT)
    930 		goto out;
    931 
    932 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
    933 	    &zfsvfs->z_shares_dir);
    934 	if (error && error != ENOENT)
    935 		goto out;
    936 
    937 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
    938 	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
    939 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
    940 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
    941 	    offsetof(znode_t, z_link_node));
    942 	rrw_init(&zfsvfs->z_teardown_lock);
    943 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
    944 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
    945 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
    946 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
    947 
    948 	*zvp = zfsvfs;
    949 	return (0);
    950 
    951 out:
    952 	dmu_objset_close(os);
    953 	*zvp = NULL;
    954 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
    955 	return (error);
    956 }
    957 
    958 static int
    959 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
    960 {
    961 	int error;
    962 
    963 	error = zfs_register_callbacks(zfsvfs->z_vfs);
    964 	if (error)
    965 		return (error);
    966 
    967 	/*
    968 	 * Set the objset user_ptr to track its zfsvfs.
    969 	 */
    970 	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
    971 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
    972 	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
    973 
    974 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
    975 	if (zil_disable) {
    976 		zil_destroy(zfsvfs->z_log, 0);
    977 		zfsvfs->z_log = NULL;
    978 	}
    979 
    980 	/*
    981 	 * If we are not mounting (ie: online recv), then we don't
    982 	 * have to worry about replaying the log as we blocked all
    983 	 * operations out since we closed the ZIL.
    984 	 */
    985 	if (mounting) {
    986 		boolean_t readonly;
    987 
    988 		/*
    989 		 * During replay we remove the read only flag to
    990 		 * allow replays to succeed.
    991 		 */
    992 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
    993 		if (readonly != 0)
    994 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    995 		else
    996 			zfs_unlinked_drain(zfsvfs);
    997 
    998 		if (zfsvfs->z_log) {
    999 			/*
   1000 			 * Parse and replay the intent log.
   1001 			 *
   1002 			 * Because of ziltest, this must be done after
   1003 			 * zfs_unlinked_drain().  (Further note: ziltest
   1004 			 * doesn't use readonly mounts, where
   1005 			 * zfs_unlinked_drain() isn't called.)  This is because
   1006 			 * ziltest causes spa_sync() to think it's committed,
   1007 			 * but actually it is not, so the intent log contains
   1008 			 * many txg's worth of changes.
   1009 			 *
   1010 			 * In particular, if object N is in the unlinked set in
   1011 			 * the last txg to actually sync, then it could be
   1012 			 * actually freed in a later txg and then reallocated
   1013 			 * in a yet later txg.  This would write a "create
   1014 			 * object N" record to the intent log.  Normally, this
   1015 			 * would be fine because the spa_sync() would have
   1016 			 * written out the fact that object N is free, before
   1017 			 * we could write the "create object N" intent log
   1018 			 * record.
   1019 			 *
   1020 			 * But when we are in ziltest mode, we advance the "open
   1021 			 * txg" without actually spa_sync()-ing the changes to
   1022 			 * disk.  So we would see that object N is still
   1023 			 * allocated and in the unlinked set, and there is an
   1024 			 * intent log record saying to allocate it.
   1025 			 */
   1026 			zfsvfs->z_replay = B_TRUE;
   1027 			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
   1028 			zfsvfs->z_replay = B_FALSE;
   1029 		}
   1030 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
   1031 	}
   1032 
   1033 	return (0);
   1034 }
   1035 
   1036 void
   1037 zfsvfs_free(zfsvfs_t *zfsvfs)
   1038 {
   1039 	int i;
   1040 	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
   1041 
   1042 	/*
   1043 	 * This is a barrier to prevent the filesystem from going away in
   1044 	 * zfs_znode_move() until we can safely ensure that the filesystem is
   1045 	 * not unmounted. We consider the filesystem valid before the barrier
   1046 	 * and invalid after the barrier.
   1047 	 */
   1048 	rw_enter(&zfsvfs_lock, RW_READER);
   1049 	rw_exit(&zfsvfs_lock);
   1050 
   1051 	zfs_fuid_destroy(zfsvfs);
   1052 
   1053 	mutex_destroy(&zfsvfs->z_znodes_lock);
   1054 	mutex_destroy(&zfsvfs->z_online_recv_lock);
   1055 	mutex_destroy(&zfsvfs->z_lock);
   1056 	list_destroy(&zfsvfs->z_all_znodes);
   1057 	rrw_destroy(&zfsvfs->z_teardown_lock);
   1058 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
   1059 	rw_destroy(&zfsvfs->z_fuid_lock);
   1060 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
   1061 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
   1062 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
   1063 }
   1064 
   1065 static void
   1066 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
   1067 {
   1068 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
   1069 	if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
   1070 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
   1071 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
   1072 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
   1073 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
   1074 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
   1075 	}
   1076 }
   1077 
   1078 static int
   1079 zfs_domount(vfs_t *vfsp, char *osname)
   1080 {
   1081 	dev_t mount_dev;
   1082 	uint64_t recordsize, fsid_guid;
   1083 	int error = 0;
   1084 	zfsvfs_t *zfsvfs;
   1085 
   1086 	ASSERT(vfsp);
   1087 	ASSERT(osname);
   1088 
   1089 	error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
   1090 	if (error)
   1091 		return (error);
   1092 	zfsvfs->z_vfs = vfsp;
   1093 
   1094 	/* Initialize the generic filesystem structure. */
   1095 	vfsp->vfs_bcount = 0;
   1096 	vfsp->vfs_data = NULL;
   1097 
   1098 	if (zfs_create_unique_device(&mount_dev) == -1) {
   1099 		error = ENODEV;
   1100 		goto out;
   1101 	}
   1102 	ASSERT(vfs_devismounted(mount_dev) == 0);
   1103 
   1104 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
   1105 	    NULL))
   1106 		goto out;
   1107 
   1108 	vfsp->vfs_dev = mount_dev;
   1109 	vfsp->vfs_fstype = zfsfstype;
   1110 	vfsp->vfs_bsize = recordsize;
   1111 	vfsp->vfs_flag |= VFS_NOTRUNC;
   1112 	vfsp->vfs_data = zfsvfs;
   1113 
   1114 	/*
   1115 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
   1116 	 * separates our fsid from any other filesystem types, and a
   1117 	 * 56-bit objset unique ID.  The objset unique ID is unique to
   1118 	 * all objsets open on this system, provided by unique_create().
   1119 	 * The 8-bit fs type must be put in the low bits of fsid[1]
   1120 	 * because that's where other Solaris filesystems put it.
   1121 	 */
   1122 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
   1123 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
   1124 	vfsp->vfs_fsid.val[0] = fsid_guid;
   1125 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
   1126 	    zfsfstype & 0xFF;
   1127 
   1128 	/*
   1129 	 * Set features for file system.
   1130 	 */
   1131 	zfs_set_fuid_feature(zfsvfs);
   1132 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
   1133 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1134 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1135 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
   1136 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
   1137 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1138 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1139 	}
   1140 
   1141 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
   1142 		uint64_t pval;
   1143 
   1144 		atime_changed_cb(zfsvfs, B_FALSE);
   1145 		readonly_changed_cb(zfsvfs, B_TRUE);
   1146 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
   1147 			goto out;
   1148 		xattr_changed_cb(zfsvfs, pval);
   1149 		zfsvfs->z_issnap = B_TRUE;
   1150 
   1151 		mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
   1152 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
   1153 		mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
   1154 	} else {
   1155 		error = zfsvfs_setup(zfsvfs, B_TRUE);
   1156 	}
   1157 
   1158 	if (!zfsvfs->z_issnap)
   1159 		zfsctl_create(zfsvfs);
   1160 out:
   1161 	if (error) {
   1162 		dmu_objset_close(zfsvfs->z_os);
   1163 		zfsvfs_free(zfsvfs);
   1164 	} else {
   1165 		atomic_add_32(&zfs_active_fs_count, 1);
   1166 	}
   1167 
   1168 	return (error);
   1169 }
   1170 
   1171 void
   1172 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
   1173 {
   1174 	objset_t *os = zfsvfs->z_os;
   1175 	struct dsl_dataset *ds;
   1176 
   1177 	/*
   1178 	 * Unregister properties.
   1179 	 */
   1180 	if (!dmu_objset_is_snapshot(os)) {
   1181 		ds = dmu_objset_ds(os);
   1182 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
   1183 		    zfsvfs) == 0);
   1184 
   1185 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
   1186 		    zfsvfs) == 0);
   1187 
   1188 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
   1189 		    zfsvfs) == 0);
   1190 
   1191 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
   1192 		    zfsvfs) == 0);
   1193 
   1194 		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
   1195 		    zfsvfs) == 0);
   1196 
   1197 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
   1198 		    zfsvfs) == 0);
   1199 
   1200 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
   1201 		    zfsvfs) == 0);
   1202 
   1203 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
   1204 		    zfsvfs) == 0);
   1205 
   1206 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
   1207 		    zfsvfs) == 0);
   1208 
   1209 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
   1210 		    acl_inherit_changed_cb, zfsvfs) == 0);
   1211 
   1212 		VERIFY(dsl_prop_unregister(ds, "vscan",
   1213 		    vscan_changed_cb, zfsvfs) == 0);
   1214 	}
   1215 }
   1216 
   1217 /*
   1218  * Convert a decimal digit string to a uint64_t integer.
   1219  */
   1220 static int
   1221 str_to_uint64(char *str, uint64_t *objnum)
   1222 {
   1223 	uint64_t num = 0;
   1224 
   1225 	while (*str) {
   1226 		if (*str < '0' || *str > '9')
   1227 			return (EINVAL);
   1228 
   1229 		num = num*10 + *str++ - '0';
   1230 	}
   1231 
   1232 	*objnum = num;
   1233 	return (0);
   1234 }
   1235 
   1236 /*
   1237  * The boot path passed from the boot loader is in the form of
   1238  * "rootpool-name/root-filesystem-object-number'. Convert this
   1239  * string to a dataset name: "rootpool-name/root-filesystem-name".
   1240  */
   1241 static int
   1242 zfs_parse_bootfs(char *bpath, char *outpath)
   1243 {
   1244 	char *slashp;
   1245 	uint64_t objnum;
   1246 	int error;
   1247 
   1248 	if (*bpath == 0 || *bpath == '/')
   1249 		return (EINVAL);
   1250 
   1251 	(void) strcpy(outpath, bpath);
   1252 
   1253 	slashp = strchr(bpath, '/');
   1254 
   1255 	/* if no '/', just return the pool name */
   1256 	if (slashp == NULL) {
   1257 		return (0);
   1258 	}
   1259 
   1260 	/* if not a number, just return the root dataset name */
   1261 	if (str_to_uint64(slashp+1, &objnum)) {
   1262 		return (0);
   1263 	}
   1264 
   1265 	*slashp = '\0';
   1266 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
   1267 	*slashp = '/';
   1268 
   1269 	return (error);
   1270 }
   1271 
   1272 static int
   1273 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
   1274 {
   1275 	int error = 0;
   1276 	static int zfsrootdone = 0;
   1277 	zfsvfs_t *zfsvfs = NULL;
   1278 	znode_t *zp = NULL;
   1279 	vnode_t *vp = NULL;
   1280 	char *zfs_bootfs;
   1281 	char *zfs_devid;
   1282 
   1283 	ASSERT(vfsp);
   1284 
   1285 	/*
   1286 	 * The filesystem that we mount as root is defined in the
   1287 	 * boot property "zfs-bootfs" with a format of
   1288 	 * "poolname/root-dataset-objnum".
   1289 	 */
   1290 	if (why == ROOT_INIT) {
   1291 		if (zfsrootdone++)
   1292 			return (EBUSY);
   1293 		/*
   1294 		 * the process of doing a spa_load will require the
   1295 		 * clock to be set before we could (for example) do
   1296 		 * something better by looking at the timestamp on
   1297 		 * an uberblock, so just set it to -1.
   1298 		 */
   1299 		clkset(-1);
   1300 
   1301 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
   1302 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
   1303 			    "bootfs name");
   1304 			return (EINVAL);
   1305 		}
   1306 		zfs_devid = spa_get_bootprop("diskdevid");
   1307 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
   1308 		if (zfs_devid)
   1309 			spa_free_bootprop(zfs_devid);
   1310 		if (error) {
   1311 			spa_free_bootprop(zfs_bootfs);
   1312 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
   1313 			    error);
   1314 			return (error);
   1315 		}
   1316 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
   1317 			spa_free_bootprop(zfs_bootfs);
   1318 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
   1319 			    error);
   1320 			return (error);
   1321 		}
   1322 
   1323 		spa_free_bootprop(zfs_bootfs);
   1324 
   1325 		if (error = vfs_lock(vfsp))
   1326 			return (error);
   1327 
   1328 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
   1329 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
   1330 			goto out;
   1331 		}
   1332 
   1333 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
   1334 		ASSERT(zfsvfs);
   1335 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
   1336 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
   1337 			goto out;
   1338 		}
   1339 
   1340 		vp = ZTOV(zp);
   1341 		mutex_enter(&vp->v_lock);
   1342 		vp->v_flag |= VROOT;
   1343 		mutex_exit(&vp->v_lock);
   1344 		rootvp = vp;
   1345 
   1346 		/*
   1347 		 * Leave rootvp held.  The root file system is never unmounted.
   1348 		 */
   1349 
   1350 		vfs_add((struct vnode *)0, vfsp,
   1351 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
   1352 out:
   1353 		vfs_unlock(vfsp);
   1354 		return (error);
   1355 	} else if (why == ROOT_REMOUNT) {
   1356 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
   1357 		vfsp->vfs_flag |= VFS_REMOUNT;
   1358 
   1359 		/* refresh mount options */
   1360 		zfs_unregister_callbacks(vfsp->vfs_data);
   1361 		return (zfs_register_callbacks(vfsp));
   1362 
   1363 	} else if (why == ROOT_UNMOUNT) {
   1364 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
   1365 		(void) zfs_sync(vfsp, 0, 0);
   1366 		return (0);
   1367 	}
   1368 
   1369 	/*
   1370 	 * if "why" is equal to anything else other than ROOT_INIT,
   1371 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
   1372 	 */
   1373 	return (ENOTSUP);
   1374 }
   1375 
   1376 /*ARGSUSED*/
   1377 static int
   1378 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
   1379 {
   1380 	char		*osname;
   1381 	pathname_t	spn;
   1382 	int		error = 0;
   1383 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
   1384 	    UIO_SYSSPACE : UIO_USERSPACE;
   1385 	int		canwrite;
   1386 
   1387 	if (mvp->v_type != VDIR)
   1388 		return (ENOTDIR);
   1389 
   1390 	mutex_enter(&mvp->v_lock);
   1391 	if ((uap->flags & MS_REMOUNT) == 0 &&
   1392 	    (uap->flags & MS_OVERLAY) == 0 &&
   1393 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
   1394 		mutex_exit(&mvp->v_lock);
   1395 		return (EBUSY);
   1396 	}
   1397 	mutex_exit(&mvp->v_lock);
   1398 
   1399 	/*
   1400 	 * ZFS does not support passing unparsed data in via MS_DATA.
   1401 	 * Users should use the MS_OPTIONSTR interface; this means
   1402 	 * that all option parsing is already done and the options struct
   1403 	 * can be interrogated.
   1404 	 */
   1405 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
   1406 		return (EINVAL);
   1407 
   1408 	/*
   1409 	 * Get the objset name (the "special" mount argument).
   1410 	 */
   1411 	if (error = pn_get(uap->spec, fromspace, &spn))
   1412 		return (error);
   1413 
   1414 	osname = spn.pn_path;
   1415 
   1416 	/*
   1417 	 * Check for mount privilege?
   1418 	 *
   1419 	 * If we don't have privilege then see if
   1420 	 * we have local permission to allow it
   1421 	 */
   1422 	error = secpolicy_fs_mount(cr, mvp, vfsp);
   1423 	if (error) {
   1424 		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
   1425 		if (error == 0) {
   1426 			vattr_t		vattr;
   1427 
   1428 			/*
   1429 			 * Make sure user is the owner of the mount point
   1430 			 * or has sufficient privileges.
   1431 			 */
   1432 
   1433 			vattr.va_mask = AT_UID;
   1434 
   1435 			if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
   1436 				goto out;
   1437 			}
   1438 
   1439 			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
   1440 			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
   1441 				error = EPERM;
   1442 				goto out;
   1443 			}
   1444 
   1445 			secpolicy_fs_mount_clearopts(cr, vfsp);
   1446 		} else {
   1447 			goto out;
   1448 		}
   1449 	}
   1450 
   1451 	/*
   1452 	 * Refuse to mount a filesystem if we are in a local zone and the
   1453 	 * dataset is not visible.
   1454 	 */
   1455 	if (!INGLOBALZONE(curproc) &&
   1456 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
   1457 		error = EPERM;
   1458 		goto out;
   1459 	}
   1460 
   1461 	/*
   1462 	 * When doing a remount, we simply refresh our temporary properties
   1463 	 * according to those options set in the current VFS options.
   1464 	 */
   1465 	if (uap->flags & MS_REMOUNT) {
   1466 		/* refresh mount options */
   1467 		zfs_unregister_callbacks(vfsp->vfs_data);
   1468 		error = zfs_register_callbacks(vfsp);
   1469 		goto out;
   1470 	}
   1471 
   1472 	error = zfs_domount(vfsp, osname);
   1473 
   1474 	/*
   1475 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
   1476 	 * disappear due to a forced unmount.
   1477 	 */
   1478 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
   1479 		VFS_HOLD(mvp->v_vfsp);
   1480 
   1481 out:
   1482 	pn_free(&spn);
   1483 	return (error);
   1484 }
   1485 
   1486 static int
   1487 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
   1488 {
   1489 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1490 	dev32_t d32;
   1491 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
   1492 
   1493 	ZFS_ENTER(zfsvfs);
   1494 
   1495 	dmu_objset_space(zfsvfs->z_os,
   1496 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
   1497 
   1498 	/*
   1499 	 * The underlying storage pool actually uses multiple block sizes.
   1500 	 * We report the fragsize as the smallest block size we support,
   1501 	 * and we report our blocksize as the filesystem's maximum blocksize.
   1502 	 */
   1503 	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
   1504 	statp->f_bsize = zfsvfs->z_max_blksz;
   1505 
   1506 	/*
   1507 	 * The following report "total" blocks of various kinds in the
   1508 	 * file system, but reported in terms of f_frsize - the
   1509 	 * "fragment" size.
   1510 	 */
   1511 
   1512 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
   1513 	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
   1514 	statp->f_bavail = statp->f_bfree; /* no root reservation */
   1515 
   1516 	/*
   1517 	 * statvfs() should really be called statufs(), because it assumes
   1518 	 * static metadata.  ZFS doesn't preallocate files, so the best
   1519 	 * we can do is report the max that could possibly fit in f_files,
   1520 	 * and that minus the number actually used in f_ffree.
   1521 	 * For f_ffree, report the smaller of the number of object available
   1522 	 * and the number of blocks (each object will take at least a block).
   1523 	 */
   1524 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
   1525 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
   1526 	statp->f_files = statp->f_ffree + usedobjs;
   1527 
   1528 	(void) cmpldev(&d32, vfsp->vfs_dev);
   1529 	statp->f_fsid = d32;
   1530 
   1531 	/*
   1532 	 * We're a zfs filesystem.
   1533 	 */
   1534 	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
   1535 
   1536 	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
   1537 
   1538 	statp->f_namemax = ZFS_MAXNAMELEN;
   1539 
   1540 	/*
   1541 	 * We have all of 32 characters to stuff a string here.
   1542 	 * Is there anything useful we could/should provide?
   1543 	 */
   1544 	bzero(statp->f_fstr, sizeof (statp->f_fstr));
   1545 
   1546 	ZFS_EXIT(zfsvfs);
   1547 	return (0);
   1548 }
   1549 
   1550 static int
   1551 zfs_root(vfs_t *vfsp, vnode_t **vpp)
   1552 {
   1553 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1554 	znode_t *rootzp;
   1555 	int error;
   1556 
   1557 	ZFS_ENTER(zfsvfs);
   1558 
   1559 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
   1560 	if (error == 0)
   1561 		*vpp = ZTOV(rootzp);
   1562 
   1563 	ZFS_EXIT(zfsvfs);
   1564 	return (error);
   1565 }
   1566 
   1567 /*
   1568  * Teardown the zfsvfs::z_os.
   1569  *
   1570  * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
   1571  * and 'z_teardown_inactive_lock' held.
   1572  */
   1573 static int
   1574 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
   1575 {
   1576 	znode_t	*zp;
   1577 
   1578 	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
   1579 
   1580 	if (!unmounting) {
   1581 		/*
   1582 		 * We purge the parent filesystem's vfsp as the parent
   1583 		 * filesystem and all of its snapshots have their vnode's
   1584 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
   1585 		 * 'z_parent' is self referential for non-snapshots.
   1586 		 */
   1587 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
   1588 	}
   1589 
   1590 	/*
   1591 	 * Close the zil. NB: Can't close the zil while zfs_inactive
   1592 	 * threads are blocked as zil_close can call zfs_inactive.
   1593 	 */
   1594 	if (zfsvfs->z_log) {
   1595 		zil_close(zfsvfs->z_log);
   1596 		zfsvfs->z_log = NULL;
   1597 	}
   1598 
   1599 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
   1600 
   1601 	/*
   1602 	 * If we are not unmounting (ie: online recv) and someone already
   1603 	 * unmounted this file system while we were doing the switcheroo,
   1604 	 * or a reopen of z_os failed then just bail out now.
   1605 	 */
   1606 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
   1607 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
   1608 		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
   1609 		return (EIO);
   1610 	}
   1611 
   1612 	/*
   1613 	 * At this point there are no vops active, and any new vops will
   1614 	 * fail with EIO since we have z_teardown_lock for writer (only
   1615 	 * relavent for forced unmount).
   1616 	 *
   1617 	 * Release all holds on dbufs.
   1618 	 */
   1619 	mutex_enter(&zfsvfs->z_znodes_lock);
   1620 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
   1621 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
   1622 		if (zp->z_dbuf) {
   1623 			ASSERT(ZTOV(zp)->v_count > 0);
   1624 			zfs_znode_dmu_fini(zp);
   1625 		}
   1626 	mutex_exit(&zfsvfs->z_znodes_lock);
   1627 
   1628 	/*
   1629 	 * If we are unmounting, set the unmounted flag and let new vops
   1630 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
   1631 	 * other vops will fail with EIO.
   1632 	 */
   1633 	if (unmounting) {
   1634 		zfsvfs->z_unmounted = B_TRUE;
   1635 		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
   1636 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
   1637 	}
   1638 
   1639 	/*
   1640 	 * z_os will be NULL if there was an error in attempting to reopen
   1641 	 * zfsvfs, so just return as the properties had already been
   1642 	 * unregistered and cached data had been evicted before.
   1643 	 */
   1644 	if (zfsvfs->z_os == NULL)
   1645 		return (0);
   1646 
   1647 	/*
   1648 	 * Unregister properties.
   1649 	 */
   1650 	zfs_unregister_callbacks(zfsvfs);
   1651 
   1652 	/*
   1653 	 * Evict cached data
   1654 	 */
   1655 	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
   1656 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
   1657 		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
   1658 	}
   1659 
   1660 	return (0);
   1661 }
   1662 
   1663 /*ARGSUSED*/
   1664 static int
   1665 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
   1666 {
   1667 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1668 	objset_t *os;
   1669 	int ret;
   1670 
   1671 	ret = secpolicy_fs_unmount(cr, vfsp);
   1672 	if (ret) {
   1673 		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
   1674 		    ZFS_DELEG_PERM_MOUNT, cr);
   1675 		if (ret)
   1676 			return (ret);
   1677 	}
   1678 
   1679 	/*
   1680 	 * We purge the parent filesystem's vfsp as the parent filesystem
   1681 	 * and all of its snapshots have their vnode's v_vfsp set to the
   1682 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
   1683 	 * referential for non-snapshots.
   1684 	 */
   1685 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
   1686 
   1687 	/*
   1688 	 * Unmount any snapshots mounted under .zfs before unmounting the
   1689 	 * dataset itself.
   1690 	 */
   1691 	if (zfsvfs->z_ctldir != NULL &&
   1692 	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
   1693 		return (ret);
   1694 	}
   1695 
   1696 	if (!(fflag & MS_FORCE)) {
   1697 		/*
   1698 		 * Check the number of active vnodes in the file system.
   1699 		 * Our count is maintained in the vfs structure, but the
   1700 		 * number is off by 1 to indicate a hold on the vfs
   1701 		 * structure itself.
   1702 		 *
   1703 		 * The '.zfs' directory maintains a reference of its
   1704 		 * own, and any active references underneath are
   1705 		 * reflected in the vnode count.
   1706 		 */
   1707 		if (zfsvfs->z_ctldir == NULL) {
   1708 			if (vfsp->vfs_count > 1)
   1709 				return (EBUSY);
   1710 		} else {
   1711 			if (vfsp->vfs_count > 2 ||
   1712 			    zfsvfs->z_ctldir->v_count > 1)
   1713 				return (EBUSY);
   1714 		}
   1715 	}
   1716 
   1717 	vfsp->vfs_flag |= VFS_UNMOUNTED;
   1718 
   1719 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
   1720 	os = zfsvfs->z_os;
   1721 
   1722 	/*
   1723 	 * z_os will be NULL if there was an error in
   1724 	 * attempting to reopen zfsvfs.
   1725 	 */
   1726 	if (os != NULL) {
   1727 		/*
   1728 		 * Unset the objset user_ptr.
   1729 		 */
   1730 		mutex_enter(&os->os->os_user_ptr_lock);
   1731 		dmu_objset_set_user(os, NULL);
   1732 		mutex_exit(&os->os->os_user_ptr_lock);
   1733 
   1734 		/*
   1735 		 * Finally release the objset
   1736 		 */
   1737 		dmu_objset_close(os);
   1738 	}
   1739 
   1740 	/*
   1741 	 * We can now safely destroy the '.zfs' directory node.
   1742 	 */
   1743 	if (zfsvfs->z_ctldir != NULL)
   1744 		zfsctl_destroy(zfsvfs);
   1745 
   1746 	return (0);
   1747 }
   1748 
   1749 static int
   1750 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
   1751 {
   1752 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
   1753 	znode_t		*zp;
   1754 	uint64_t	object = 0;
   1755 	uint64_t	fid_gen = 0;
   1756 	uint64_t	gen_mask;
   1757 	uint64_t	zp_gen;
   1758 	int 		i, err;
   1759 
   1760 	*vpp = NULL;
   1761 
   1762 	ZFS_ENTER(zfsvfs);
   1763 
   1764 	if (fidp->fid_len == LONG_FID_LEN) {
   1765 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
   1766 		uint64_t	objsetid = 0;
   1767 		uint64_t	setgen = 0;
   1768 
   1769 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
   1770 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
   1771 
   1772 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
   1773 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
   1774 
   1775 		ZFS_EXIT(zfsvfs);
   1776 
   1777 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
   1778 		if (err)
   1779 			return (EINVAL);
   1780 		ZFS_ENTER(zfsvfs);
   1781 	}
   1782 
   1783 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
   1784 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
   1785 
   1786 		for (i = 0; i < sizeof (zfid->zf_object); i++)
   1787 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
   1788 
   1789 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
   1790 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
   1791 	} else {
   1792 		ZFS_EXIT(zfsvfs);
   1793 		return (EINVAL);
   1794 	}
   1795 
   1796 	/* A zero fid_gen means we are in the .zfs control directories */
   1797 	if (fid_gen == 0 &&
   1798 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
   1799 		*vpp = zfsvfs->z_ctldir;
   1800 		ASSERT(*vpp != NULL);
   1801 		if (object == ZFSCTL_INO_SNAPDIR) {
   1802 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
   1803 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
   1804 		} else {
   1805 			VN_HOLD(*vpp);
   1806 		}
   1807 		ZFS_EXIT(zfsvfs);
   1808 		return (0);
   1809 	}
   1810 
   1811 	gen_mask = -1ULL >> (64 - 8 * i);
   1812 
   1813 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
   1814 	if (err = zfs_zget(zfsvfs, object, &zp)) {
   1815 		ZFS_EXIT(zfsvfs);
   1816 		return (err);
   1817 	}
   1818 	zp_gen = zp->z_phys->zp_gen & gen_mask;
   1819 	if (zp_gen == 0)
   1820 		zp_gen = 1;
   1821 	if (zp->z_unlinked || zp_gen != fid_gen) {
   1822 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
   1823 		VN_RELE(ZTOV(zp));
   1824 		ZFS_EXIT(zfsvfs);
   1825 		return (EINVAL);
   1826 	}
   1827 
   1828 	*vpp = ZTOV(zp);
   1829 	ZFS_EXIT(zfsvfs);
   1830 	return (0);
   1831 }
   1832 
   1833 /*
   1834  * Block out VOPs and close zfsvfs_t::z_os
   1835  *
   1836  * Note, if successful, then we return with the 'z_teardown_lock' and
   1837  * 'z_teardown_inactive_lock' write held.
   1838  */
   1839 int
   1840 zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
   1841 {
   1842 	int error;
   1843 
   1844 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
   1845 		return (error);
   1846 
   1847 	*modep = zfsvfs->z_os->os_mode;
   1848 	if (name)
   1849 		dmu_objset_name(zfsvfs->z_os, name);
   1850 	dmu_objset_close(zfsvfs->z_os);
   1851 
   1852 	return (0);
   1853 }
   1854 
   1855 /*
   1856  * Reopen zfsvfs_t::z_os and release VOPs.
   1857  */
   1858 int
   1859 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
   1860 {
   1861 	int err;
   1862 
   1863 	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
   1864 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
   1865 
   1866 	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
   1867 	if (err) {
   1868 		zfsvfs->z_os = NULL;
   1869 	} else {
   1870 		znode_t *zp;
   1871 
   1872 		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
   1873 
   1874 		/*
   1875 		 * Attempt to re-establish all the active znodes with
   1876 		 * their dbufs.  If a zfs_rezget() fails, then we'll let
   1877 		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
   1878 		 * when they try to use their znode.
   1879 		 */
   1880 		mutex_enter(&zfsvfs->z_znodes_lock);
   1881 		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
   1882 		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
   1883 			(void) zfs_rezget(zp);
   1884 		}
   1885 		mutex_exit(&zfsvfs->z_znodes_lock);
   1886 
   1887 	}
   1888 
   1889 	/* release the VOPs */
   1890 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
   1891 	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
   1892 
   1893 	if (err) {
   1894 		/*
   1895 		 * Since we couldn't reopen zfsvfs::z_os, force
   1896 		 * unmount this file system.
   1897 		 */
   1898 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
   1899 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
   1900 	}
   1901 	return (err);
   1902 }
   1903 
   1904 static void
   1905 zfs_freevfs(vfs_t *vfsp)
   1906 {
   1907 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1908 
   1909 	/*
   1910 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
   1911 	 * from zfs_mount().  Release it here.
   1912 	 */
   1913 	if (zfsvfs->z_issnap)
   1914 		VFS_RELE(zfsvfs->z_parent->z_vfs);
   1915 
   1916 	zfsvfs_free(zfsvfs);
   1917 
   1918 	atomic_add_32(&zfs_active_fs_count, -1);
   1919 }
   1920 
   1921 /*
   1922  * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
   1923  * so we can't safely do any non-idempotent initialization here.
   1924  * Leave that to zfs_init() and zfs_fini(), which are called
   1925  * from the module's _init() and _fini() entry points.
   1926  */
   1927 /*ARGSUSED*/
   1928 static int
   1929 zfs_vfsinit(int fstype, char *name)
   1930 {
   1931 	int error;
   1932 
   1933 	zfsfstype = fstype;
   1934 
   1935 	/*
   1936 	 * Setup vfsops and vnodeops tables.
   1937 	 */
   1938 	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
   1939 	if (error != 0) {
   1940 		cmn_err(CE_WARN, "zfs: bad vfs ops template");
   1941 	}
   1942 
   1943 	error = zfs_create_op_tables();
   1944 	if (error) {
   1945 		zfs_remove_op_tables();
   1946 		cmn_err(CE_WARN, "zfs: bad vnode ops template");
   1947 		(void) vfs_freevfsops_by_type(zfsfstype);
   1948 		return (error);
   1949 	}
   1950 
   1951 	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
   1952 
   1953 	/*
   1954 	 * Unique major number for all zfs mounts.
   1955 	 * If we run out of 32-bit minors, we'll getudev() another major.
   1956 	 */
   1957 	zfs_major = ddi_name_to_major(ZFS_DRIVER);
   1958 	zfs_minor = ZFS_MIN_MINOR;
   1959 
   1960 	return (0);
   1961 }
   1962 
   1963 void
   1964 zfs_init(void)
   1965 {
   1966 	/*
   1967 	 * Initialize .zfs directory structures
   1968 	 */
   1969 	zfsctl_init();
   1970 
   1971 	/*
   1972 	 * Initialize znode cache, vnode ops, etc...
   1973 	 */
   1974 	zfs_znode_init();
   1975 
   1976 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
   1977 }
   1978 
   1979 void
   1980 zfs_fini(void)
   1981 {
   1982 	zfsctl_fini();
   1983 	zfs_znode_fini();
   1984 }
   1985 
   1986 int
   1987 zfs_busy(void)
   1988 {
   1989 	return (zfs_active_fs_count != 0);
   1990 }
   1991 
   1992 int
   1993 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
   1994 {
   1995 	int error;
   1996 	objset_t *os = zfsvfs->z_os;
   1997 	dmu_tx_t *tx;
   1998 
   1999 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
   2000 		return (EINVAL);
   2001 
   2002 	if (newvers < zfsvfs->z_version)
   2003 		return (EINVAL);
   2004 
   2005 	tx = dmu_tx_create(os);
   2006 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
   2007 	error = dmu_tx_assign(tx, TXG_WAIT);
   2008 	if (error) {
   2009 		dmu_tx_abort(tx);
   2010 		return (error);
   2011 	}
   2012 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
   2013 	    8, 1, &newvers, tx);
   2014 
   2015 	if (error) {
   2016 		dmu_tx_commit(tx);
   2017 		return (error);
   2018 	}
   2019 
   2020 	spa_history_internal_log(LOG_DS_UPGRADE,
   2021 	    dmu_objset_spa(os), tx, CRED(),
   2022 	    "oldver=%llu newver=%llu dataset = %llu",
   2023 	    zfsvfs->z_version, newvers, dmu_objset_id(os));
   2024 
   2025 	dmu_tx_commit(tx);
   2026 
   2027 	zfsvfs->z_version = newvers;
   2028 
   2029 	if (zfsvfs->z_version >= ZPL_VERSION_FUID)
   2030 		zfs_set_fuid_feature(zfsvfs);
   2031 
   2032 	return (0);
   2033 }
   2034 
   2035 /*
   2036  * Read a property stored within the master node.
   2037  */
   2038 int
   2039 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
   2040 {
   2041 	const char *pname;
   2042 	int error = ENOENT;
   2043 
   2044 	/*
   2045 	 * Look up the file system's value for the property.  For the
   2046 	 * version property, we look up a slightly different string.
   2047 	 */
   2048 	if (prop == ZFS_PROP_VERSION)
   2049 		pname = ZPL_VERSION_STR;
   2050 	else
   2051 		pname = zfs_prop_to_name(prop);
   2052 
   2053 	if (os != NULL)
   2054 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
   2055 
   2056 	if (error == ENOENT) {
   2057 		/* No value set, use the default value */
   2058 		switch (prop) {
   2059 		case ZFS_PROP_VERSION:
   2060 			*value = ZPL_VERSION;
   2061 			break;
   2062 		case ZFS_PROP_NORMALIZE:
   2063 		case ZFS_PROP_UTF8ONLY:
   2064 			*value = 0;
   2065 			break;
   2066 		case ZFS_PROP_CASE:
   2067 			*value = ZFS_CASE_SENSITIVE;
   2068 			break;
   2069 		default:
   2070 			return (error);
   2071 		}
   2072 		error = 0;
   2073 	}
   2074 	return (error);
   2075 }
   2076 
   2077 static vfsdef_t vfw = {
   2078 	VFSDEF_VERSION,
   2079 	MNTTYPE_ZFS,
   2080 	zfs_vfsinit,
   2081 	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
   2082 	    VSW_XID,
   2083 	&zfs_mntopts
   2084 };
   2085 
   2086 struct modlfs zfs_modlfs = {
   2087 	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
   2088 };
   2089