Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/spa_impl.h>
     28 #include <sys/zio.h>
     29 #include <sys/zio_checksum.h>
     30 #include <sys/zio_compress.h>
     31 #include <sys/dmu.h>
     32 #include <sys/dmu_tx.h>
     33 #include <sys/zap.h>
     34 #include <sys/zil.h>
     35 #include <sys/vdev_impl.h>
     36 #include <sys/metaslab.h>
     37 #include <sys/uberblock_impl.h>
     38 #include <sys/txg.h>
     39 #include <sys/avl.h>
     40 #include <sys/unique.h>
     41 #include <sys/dsl_pool.h>
     42 #include <sys/dsl_dir.h>
     43 #include <sys/dsl_prop.h>
     44 #include <sys/fs/zfs.h>
     45 #include <sys/metaslab_impl.h>
     46 #include <sys/sunddi.h>
     47 #include <sys/arc.h>
     48 #include "zfs_prop.h"
     49 
     50 /*
     51  * SPA locking
     52  *
     53  * There are three basic locks for managing spa_t structures:
     54  *
     55  * spa_namespace_lock (global mutex)
     56  *
     57  *	This lock must be acquired to do any of the following:
     58  *
     59  *		- Lookup a spa_t by name
     60  *		- Add or remove a spa_t from the namespace
     61  *		- Increase spa_refcount from non-zero
     62  *		- Check if spa_refcount is zero
     63  *		- Rename a spa_t
     64  *		- add/remove/attach/detach devices
     65  *		- Held for the duration of create/destroy/import/export
     66  *
     67  *	It does not need to handle recursion.  A create or destroy may
     68  *	reference objects (files or zvols) in other pools, but by
     69  *	definition they must have an existing reference, and will never need
     70  *	to lookup a spa_t by name.
     71  *
     72  * spa_refcount (per-spa refcount_t protected by mutex)
     73  *
     74  *	This reference count keeps track of any active users of the spa_t.  The
     75  *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
     76  *	the refcount is never really 'zero' - opening a pool implicitly keeps
     77  *	some references in the DMU.  Internally we check against spa_minref, but
     78  *	present the image of a zero/non-zero value to consumers.
     79  *
     80  * spa_config_lock[] (per-spa array of rwlocks)
     81  *
     82  *	This protects the spa_t from config changes, and must be held in
     83  *	the following circumstances:
     84  *
     85  *		- RW_READER to perform I/O to the spa
     86  *		- RW_WRITER to change the vdev config
     87  *
     88  * The locking order is fairly straightforward:
     89  *
     90  *		spa_namespace_lock	->	spa_refcount
     91  *
     92  *	The namespace lock must be acquired to increase the refcount from 0
     93  *	or to check if it is zero.
     94  *
     95  *		spa_refcount		->	spa_config_lock[]
     96  *
     97  *	There must be at least one valid reference on the spa_t to acquire
     98  *	the config lock.
     99  *
    100  *		spa_namespace_lock	->	spa_config_lock[]
    101  *
    102  *	The namespace lock must always be taken before the config lock.
    103  *
    104  *
    105  * The spa_namespace_lock can be acquired directly and is globally visible.
    106  *
    107  * The namespace is manipulated using the following functions, all of which
    108  * require the spa_namespace_lock to be held.
    109  *
    110  *	spa_lookup()		Lookup a spa_t by name.
    111  *
    112  *	spa_add()		Create a new spa_t in the namespace.
    113  *
    114  *	spa_remove()		Remove a spa_t from the namespace.  This also
    115  *				frees up any memory associated with the spa_t.
    116  *
    117  *	spa_next()		Returns the next spa_t in the system, or the
    118  *				first if NULL is passed.
    119  *
    120  *	spa_evict_all()		Shutdown and remove all spa_t structures in
    121  *				the system.
    122  *
    123  *	spa_guid_exists()	Determine whether a pool/device guid exists.
    124  *
    125  * The spa_refcount is manipulated using the following functions:
    126  *
    127  *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
    128  *				called with spa_namespace_lock held if the
    129  *				refcount is currently zero.
    130  *
    131  *	spa_close()		Remove a reference from the spa_t.  This will
    132  *				not free the spa_t or remove it from the
    133  *				namespace.  No locking is required.
    134  *
    135  *	spa_refcount_zero()	Returns true if the refcount is currently
    136  *				zero.  Must be called with spa_namespace_lock
    137  *				held.
    138  *
    139  * The spa_config_lock[] is an array of rwlocks, ordered as follows:
    140  * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
    141  * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
    142  *
    143  * To read the configuration, it suffices to hold one of these locks as reader.
    144  * To modify the configuration, you must hold all locks as writer.  To modify
    145  * vdev state without altering the vdev tree's topology (e.g. online/offline),
    146  * you must hold SCL_STATE and SCL_ZIO as writer.
    147  *
    148  * We use these distinct config locks to avoid recursive lock entry.
    149  * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
    150  * block allocations (SCL_ALLOC), which may require reading space maps
    151  * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
    152  *
    153  * The spa config locks cannot be normal rwlocks because we need the
    154  * ability to hand off ownership.  For example, SCL_ZIO is acquired
    155  * by the issuing thread and later released by an interrupt thread.
    156  * They do, however, obey the usual write-wanted semantics to prevent
    157  * writer (i.e. system administrator) starvation.
    158  *
    159  * The lock acquisition rules are as follows:
    160  *
    161  * SCL_CONFIG
    162  *	Protects changes to the vdev tree topology, such as vdev
    163  *	add/remove/attach/detach.  Protects the dirty config list
    164  *	(spa_config_dirty_list) and the set of spares and l2arc devices.
    165  *
    166  * SCL_STATE
    167  *	Protects changes to pool state and vdev state, such as vdev
    168  *	online/offline/fault/degrade/clear.  Protects the dirty state list
    169  *	(spa_state_dirty_list) and global pool state (spa_state).
    170  *
    171  * SCL_ALLOC
    172  *	Protects changes to metaslab groups and classes.
    173  *	Held as reader by metaslab_alloc() and metaslab_claim().
    174  *
    175  * SCL_ZIO
    176  *	Held by bp-level zios (those which have no io_vd upon entry)
    177  *	to prevent changes to the vdev tree.  The bp-level zio implicitly
    178  *	protects all of its vdev child zios, which do not hold SCL_ZIO.
    179  *
    180  * SCL_FREE
    181  *	Protects changes to metaslab groups and classes.
    182  *	Held as reader by metaslab_free().  SCL_FREE is distinct from
    183  *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
    184  *	blocks in zio_done() while another i/o that holds either
    185  *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
    186  *
    187  * SCL_VDEV
    188  *	Held as reader to prevent changes to the vdev tree during trivial
    189  *	inquiries such as bp_get_dasize().  SCL_VDEV is distinct from the
    190  *	other locks, and lower than all of them, to ensure that it's safe
    191  *	to acquire regardless of caller context.
    192  *
    193  * In addition, the following rules apply:
    194  *
    195  * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
    196  *	The lock ordering is SCL_CONFIG > spa_props_lock.
    197  *
    198  * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
    199  *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
    200  *	or zio_write_phys() -- the caller must ensure that the config
    201  *	cannot change in the interim, and that the vdev cannot be reopened.
    202  *	SCL_STATE as reader suffices for both.
    203  *
    204  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
    205  *
    206  *	spa_vdev_enter()	Acquire the namespace lock and the config lock
    207  *				for writing.
    208  *
    209  *	spa_vdev_exit()		Release the config lock, wait for all I/O
    210  *				to complete, sync the updated configs to the
    211  *				cache, and release the namespace lock.
    212  *
    213  * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
    214  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
    215  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
    216  *
    217  * spa_rename() is also implemented within this file since it requires
    218  * manipulation of the namespace.
    219  */
    220 
    221 static avl_tree_t spa_namespace_avl;	/* every spa_t added by spa_add() */
    222 kmutex_t spa_namespace_lock;		/* global; see "SPA locking" above */
    223 static kcondvar_t spa_namespace_cv;	/* broadcast by spa_remove() */
    224 static int spa_active_count;		/* pools added with an altroot */
    225 int spa_max_replication_override = SPA_DVAS_PER_BP;	/* NOTE(review): consumer not in this part of the file */
    226 
    227 static kmutex_t spa_spare_lock;		/* protects spa_spare_avl */
    228 static avl_tree_t spa_spare_avl;	/* one spa_aux_t per known spare */
    229 static kmutex_t spa_l2cache_lock;	/* protects spa_l2cache_avl */
    230 static avl_tree_t spa_l2cache_avl;	/* one spa_aux_t per cache device */
    231 
    232 kmem_cache_t *spa_buffer_pool;		/* NOTE(review): set up and used elsewhere */
    233 int spa_mode_global;			/* NOTE(review): set up and used elsewhere */
    234 
    235 #ifdef ZFS_DEBUG
    236 /* Everything except dprintf is on by default in debug builds */
    237 int zfs_flags = ~ZFS_DEBUG_DPRINTF;
    238 #else
    239 int zfs_flags = 0;
    240 #endif
    241 
    242 /*
    243  * zfs_recover can be set to nonzero to attempt to recover from
    244  * otherwise-fatal errors, typically caused by on-disk corruption.  When
    245  * set, calls to zfs_panic_recover() will turn into warning messages.
    246  */
    247 int zfs_recover = 0;
    248 
    249 
    250 /*
    251  * ==========================================================================
    252  * SPA config locking
    253  * ==========================================================================
    254  */
    255 static void
    256 spa_config_lock_init(spa_t *spa)
    257 {
    258 	for (int i = 0; i < SCL_LOCKS; i++) {
    259 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    260 		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
    261 		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
    262 		refcount_create(&scl->scl_count);
    263 		scl->scl_writer = NULL;
    264 		scl->scl_write_wanted = 0;
    265 	}
    266 }
    267 
    268 static void
    269 spa_config_lock_destroy(spa_t *spa)
    270 {
    271 	for (int i = 0; i < SCL_LOCKS; i++) {
    272 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    273 		mutex_destroy(&scl->scl_lock);
    274 		cv_destroy(&scl->scl_cv);
    275 		refcount_destroy(&scl->scl_count);
    276 		ASSERT(scl->scl_writer == NULL);
    277 		ASSERT(scl->scl_write_wanted == 0);
    278 	}
    279 }
    280 
    281 int
    282 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
    283 {
    284 	for (int i = 0; i < SCL_LOCKS; i++) {
    285 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    286 		if (!(locks & (1 << i)))
    287 			continue;
    288 		mutex_enter(&scl->scl_lock);
    289 		if (rw == RW_READER) {
    290 			if (scl->scl_writer || scl->scl_write_wanted) {
    291 				mutex_exit(&scl->scl_lock);
    292 				spa_config_exit(spa, locks ^ (1 << i), tag);
    293 				return (0);
    294 			}
    295 		} else {
    296 			ASSERT(scl->scl_writer != curthread);
    297 			if (!refcount_is_zero(&scl->scl_count)) {
    298 				mutex_exit(&scl->scl_lock);
    299 				spa_config_exit(spa, locks ^ (1 << i), tag);
    300 				return (0);
    301 			}
    302 			scl->scl_writer = curthread;
    303 		}
    304 		(void) refcount_add(&scl->scl_count, tag);
    305 		mutex_exit(&scl->scl_lock);
    306 	}
    307 	return (1);
    308 }
    309 
    310 void
    311 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
    312 {
    313 	int wlocks_held = 0;
    314 
    315 	for (int i = 0; i < SCL_LOCKS; i++) {
    316 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    317 		if (scl->scl_writer == curthread)
    318 			wlocks_held |= (1 << i);
    319 		if (!(locks & (1 << i)))
    320 			continue;
    321 		mutex_enter(&scl->scl_lock);
    322 		if (rw == RW_READER) {
    323 			while (scl->scl_writer || scl->scl_write_wanted) {
    324 				cv_wait(&scl->scl_cv, &scl->scl_lock);
    325 			}
    326 		} else {
    327 			ASSERT(scl->scl_writer != curthread);
    328 			while (!refcount_is_zero(&scl->scl_count)) {
    329 				scl->scl_write_wanted++;
    330 				cv_wait(&scl->scl_cv, &scl->scl_lock);
    331 				scl->scl_write_wanted--;
    332 			}
    333 			scl->scl_writer = curthread;
    334 		}
    335 		(void) refcount_add(&scl->scl_count, tag);
    336 		mutex_exit(&scl->scl_lock);
    337 	}
    338 	ASSERT(wlocks_held <= locks);
    339 }
    340 
    341 void
    342 spa_config_exit(spa_t *spa, int locks, void *tag)
    343 {
    344 	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
    345 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    346 		if (!(locks & (1 << i)))
    347 			continue;
    348 		mutex_enter(&scl->scl_lock);
    349 		ASSERT(!refcount_is_zero(&scl->scl_count));
    350 		if (refcount_remove(&scl->scl_count, tag) == 0) {
    351 			ASSERT(scl->scl_writer == NULL ||
    352 			    scl->scl_writer == curthread);
    353 			scl->scl_writer = NULL;	/* OK in either case */
    354 			cv_broadcast(&scl->scl_cv);
    355 		}
    356 		mutex_exit(&scl->scl_lock);
    357 	}
    358 }
    359 
    360 int
    361 spa_config_held(spa_t *spa, int locks, krw_t rw)
    362 {
    363 	int locks_held = 0;
    364 
    365 	for (int i = 0; i < SCL_LOCKS; i++) {
    366 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
    367 		if (!(locks & (1 << i)))
    368 			continue;
    369 		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
    370 		    (rw == RW_WRITER && scl->scl_writer == curthread))
    371 			locks_held |= 1 << i;
    372 	}
    373 
    374 	return (locks_held);
    375 }
    376 
    377 /*
    378  * ==========================================================================
    379  * SPA namespace functions
    380  * ==========================================================================
    381  */
    382 
    383 /*
    384  * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
    385  * Returns NULL if no matching spa_t is found.
    386  */
    387 spa_t *
    388 spa_lookup(const char *name)
    389 {
    390 	static spa_t search;	/* spa_t is large; don't allocate on stack */
    391 	spa_t *spa;
    392 	avl_index_t where;
    393 	char c;
    394 	char *cp;
    395 
    396 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    397 
    398 	/*
    399 	 * If it's a full dataset name, figure out the pool name and
    400 	 * just use that.
    401 	 */
    402 	cp = strpbrk(name, "/@");
    403 	if (cp) {
    404 		c = *cp;
    405 		*cp = '\0';
    406 	}
    407 
    408 	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
    409 	spa = avl_find(&spa_namespace_avl, &search, &where);
    410 
    411 	if (cp)
    412 		*cp = c;
    413 
    414 	return (spa);
    415 }
    416 
    417 /*
    418  * Create an uninitialized spa_t with the given name.  Requires
    419  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
    420  * exist by calling spa_lookup() first.
    421  */
    422 spa_t *
    423 spa_add(const char *name, const char *altroot)
    424 {
    425 	spa_t *spa;
    426 	spa_config_dirent_t *dp;
    427 
    428 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    429 
    430 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
    431 
    432 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
    433 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
    434 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
    435 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
    436 	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
    437 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
    438 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
    439 
    440 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
    441 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
    442 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
    443 
    444 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
    445 	spa->spa_state = POOL_STATE_UNINITIALIZED;
    446 	spa->spa_freeze_txg = UINT64_MAX;
    447 	spa->spa_final_txg = UINT64_MAX;
    448 
    449 	refcount_create(&spa->spa_refcount);
    450 	spa_config_lock_init(spa);
    451 
    452 	avl_add(&spa_namespace_avl, spa);
    453 
    454 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
    455 
    456 	/*
    457 	 * Set the alternate root, if there is one.
    458 	 */
    459 	if (altroot) {
    460 		spa->spa_root = spa_strdup(altroot);
    461 		spa_active_count++;
    462 	}
    463 
    464 	/*
    465 	 * Every pool starts with the default cachefile
    466 	 */
    467 	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
    468 	    offsetof(spa_config_dirent_t, scd_link));
    469 
    470 	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
    471 	dp->scd_path = spa_strdup(spa_config_path);
    472 	list_insert_head(&spa->spa_config_list, dp);
    473 
    474 	return (spa);
    475 }
    476 
    477 /*
    478  * Removes a spa_t from the namespace, freeing up any memory used.  Requires
    479  * spa_namespace_lock.  This is called only after the spa_t has been closed and
    480  * deactivated.
    481  */
    482 void
    483 spa_remove(spa_t *spa)
    484 {
    485 	spa_config_dirent_t *dp;
    486 
    487 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    488 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
    489 
    490 	avl_remove(&spa_namespace_avl, spa);
    491 	cv_broadcast(&spa_namespace_cv);
    492 
    493 	if (spa->spa_root) {
    494 		spa_strfree(spa->spa_root);
    495 		spa_active_count--;
    496 	}
    497 
    498 	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
    499 		list_remove(&spa->spa_config_list, dp);
    500 		if (dp->scd_path != NULL)
    501 			spa_strfree(dp->scd_path);
    502 		kmem_free(dp, sizeof (spa_config_dirent_t));
    503 	}
    504 
    505 	list_destroy(&spa->spa_config_list);
    506 
    507 	spa_config_set(spa, NULL);
    508 
    509 	refcount_destroy(&spa->spa_refcount);
    510 
    511 	spa_config_lock_destroy(spa);
    512 
    513 	cv_destroy(&spa->spa_async_cv);
    514 	cv_destroy(&spa->spa_scrub_io_cv);
    515 	cv_destroy(&spa->spa_suspend_cv);
    516 
    517 	mutex_destroy(&spa->spa_async_lock);
    518 	mutex_destroy(&spa->spa_scrub_lock);
    519 	mutex_destroy(&spa->spa_errlog_lock);
    520 	mutex_destroy(&spa->spa_errlist_lock);
    521 	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
    522 	mutex_destroy(&spa->spa_history_lock);
    523 	mutex_destroy(&spa->spa_props_lock);
    524 	mutex_destroy(&spa->spa_suspend_lock);
    525 
    526 	kmem_free(spa, sizeof (spa_t));
    527 }
    528 
    529 /*
    530  * Given a pool, return the next pool in the namespace, or NULL if there is
    531  * none.  If 'prev' is NULL, return the first pool.
    532  */
    533 spa_t *
    534 spa_next(spa_t *prev)
    535 {
    536 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    537 
    538 	if (prev)
    539 		return (AVL_NEXT(&spa_namespace_avl, prev));
    540 	else
    541 		return (avl_first(&spa_namespace_avl));
    542 }
    543 
    544 /*
    545  * ==========================================================================
    546  * SPA refcount functions
    547  * ==========================================================================
    548  */
    549 
    550 /*
    551  * Add a reference to the given spa_t.  Must have at least one reference, or
    552  * have the namespace lock held.
    553  */
    554 void
    555 spa_open_ref(spa_t *spa, void *tag)
    556 {
    557 	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
    558 	    MUTEX_HELD(&spa_namespace_lock));
    559 	(void) refcount_add(&spa->spa_refcount, tag);
    560 }
    561 
    562 /*
    563  * Remove a reference to the given spa_t.  Must have at least one reference, or
    564  * have the namespace lock held.
    565  */
    566 void
    567 spa_close(spa_t *spa, void *tag)
    568 {
    569 	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
    570 	    MUTEX_HELD(&spa_namespace_lock));
    571 	(void) refcount_remove(&spa->spa_refcount, tag);
    572 }
    573 
    574 /*
    575  * Check to see if the spa refcount is zero.  Must be called with
    576  * spa_namespace_lock held.  We really compare against spa_minref, which is the
    577  * number of references acquired when opening a pool
    578  */
    579 boolean_t
    580 spa_refcount_zero(spa_t *spa)
    581 {
    582 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    583 
    584 	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
    585 }
    586 
    587 /*
    588  * ==========================================================================
    589  * SPA spare and l2cache tracking
    590  * ==========================================================================
    591  */
    592 
    593 /*
    594  * Hot spares and cache devices are tracked using the same code below,
    595  * for 'auxiliary' devices.
    596  */
    597 
    598 typedef struct spa_aux {
    599 	uint64_t	aux_guid;	/* vdev guid; what spa_aux_compare() orders by */
    600 	uint64_t	aux_pool;	/* guid of the pool that activated it, else 0 */
    601 	avl_node_t	aux_avl;	/* presumably the AVL tree linkage */
    602 	int		aux_count;	/* spa_aux_add() calls not yet removed */
    603 } spa_aux_t;
    604 
    605 static int
    606 spa_aux_compare(const void *a, const void *b)
    607 {
    608 	const spa_aux_t *sa = a;
    609 	const spa_aux_t *sb = b;
    610 
    611 	if (sa->aux_guid < sb->aux_guid)
    612 		return (-1);
    613 	else if (sa->aux_guid > sb->aux_guid)
    614 		return (1);
    615 	else
    616 		return (0);
    617 }
    618 
    619 void
    620 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
    621 {
    622 	avl_index_t where;
    623 	spa_aux_t search;
    624 	spa_aux_t *aux;
    625 
    626 	search.aux_guid = vd->vdev_guid;
    627 	if ((aux = avl_find(avl, &search, &where)) != NULL) {
    628 		aux->aux_count++;
    629 	} else {
    630 		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
    631 		aux->aux_guid = vd->vdev_guid;
    632 		aux->aux_count = 1;
    633 		avl_insert(avl, aux, where);
    634 	}
    635 }
    636 
    637 void
    638 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
    639 {
    640 	spa_aux_t search;
    641 	spa_aux_t *aux;
    642 	avl_index_t where;
    643 
    644 	search.aux_guid = vd->vdev_guid;
    645 	aux = avl_find(avl, &search, &where);
    646 
    647 	ASSERT(aux != NULL);
    648 
    649 	if (--aux->aux_count == 0) {
    650 		avl_remove(avl, aux);
    651 		kmem_free(aux, sizeof (spa_aux_t));
    652 	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
    653 		aux->aux_pool = 0ULL;
    654 	}
    655 }
    656 
    657 boolean_t
    658 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
    659 {
    660 	spa_aux_t search, *found;
    661 
    662 	search.aux_guid = guid;
    663 	found = avl_find(avl, &search, NULL);
    664 
    665 	if (pool) {
    666 		if (found)
    667 			*pool = found->aux_pool;
    668 		else
    669 			*pool = 0ULL;
    670 	}
    671 
    672 	if (refcnt) {
    673 		if (found)
    674 			*refcnt = found->aux_count;
    675 		else
    676 			*refcnt = 0;
    677 	}
    678 
    679 	return (found != NULL);
    680 }
    681 
    682 void
    683 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
    684 {
    685 	spa_aux_t search, *found;
    686 	avl_index_t where;
    687 
    688 	search.aux_guid = vd->vdev_guid;
    689 	found = avl_find(avl, &search, &where);
    690 	ASSERT(found != NULL);
    691 	ASSERT(found->aux_pool == 0ULL);
    692 
    693 	found->aux_pool = spa_guid(vd->vdev_spa);
    694 }
    695 
    696 /*
    697  * Spares are tracked globally due to the following constraints:
    698  *
    699  * 	- A spare may be part of multiple pools.
    700  * 	- A spare may be added to a pool even if it's actively in use within
    701  *	  another pool.
    702  * 	- A spare in use in any pool can only be the source of a replacement if
    703  *	  the target is a spare in the same pool.
    704  *
    705  * We keep track of all spares on the system through the use of a reference
    706  * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
    707  * spare, then we bump the reference count in the AVL tree.  In addition, we set
    708  * the 'vdev_isspare' member to indicate that the device is a spare (active or
    709  * inactive).  When a spare is made active (used to replace a device in the
    710  * pool), we also keep track of which pool it's been made a part of.
    711  *
    712  * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
    713  * called under the spa_namespace lock as part of vdev reconfiguration.  The
    714  * separate spare lock exists for the status query path, which does not need to
    715  * be completely consistent with respect to other vdev configuration changes.
    716  */
    717 
    718 static int
    719 spa_spare_compare(const void *a, const void *b)
    720 {
    721 	return (spa_aux_compare(a, b));
    722 }
    723 
    724 void
    725 spa_spare_add(vdev_t *vd)
    726 {
    727 	mutex_enter(&spa_spare_lock);
    728 	ASSERT(!vd->vdev_isspare);
    729 	spa_aux_add(vd, &spa_spare_avl);
    730 	vd->vdev_isspare = B_TRUE;
    731 	mutex_exit(&spa_spare_lock);
    732 }
    733 
    734 void
    735 spa_spare_remove(vdev_t *vd)
    736 {
    737 	mutex_enter(&spa_spare_lock);
    738 	ASSERT(vd->vdev_isspare);
    739 	spa_aux_remove(vd, &spa_spare_avl);
    740 	vd->vdev_isspare = B_FALSE;
    741 	mutex_exit(&spa_spare_lock);
    742 }
    743 
    744 boolean_t
    745 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
    746 {
    747 	boolean_t found;
    748 
    749 	mutex_enter(&spa_spare_lock);
    750 	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
    751 	mutex_exit(&spa_spare_lock);
    752 
    753 	return (found);
    754 }
    755 
    756 void
    757 spa_spare_activate(vdev_t *vd)
    758 {
    759 	mutex_enter(&spa_spare_lock);
    760 	ASSERT(vd->vdev_isspare);
    761 	spa_aux_activate(vd, &spa_spare_avl);
    762 	mutex_exit(&spa_spare_lock);
    763 }
    764 
    765 /*
    766  * Level 2 ARC devices are tracked globally for the same reasons as spares.
    767  * Cache devices currently only support one pool per cache device, and so
    768  * for these devices the aux reference count is currently unused beyond 1.
    769  */
    770 
    771 static int
    772 spa_l2cache_compare(const void *a, const void *b)
    773 {
    774 	return (spa_aux_compare(a, b));
    775 }
    776 
    777 void
    778 spa_l2cache_add(vdev_t *vd)
    779 {
    780 	mutex_enter(&spa_l2cache_lock);
    781 	ASSERT(!vd->vdev_isl2cache);
    782 	spa_aux_add(vd, &spa_l2cache_avl);
    783 	vd->vdev_isl2cache = B_TRUE;
    784 	mutex_exit(&spa_l2cache_lock);
    785 }
    786 
    787 void
    788 spa_l2cache_remove(vdev_t *vd)
    789 {
    790 	mutex_enter(&spa_l2cache_lock);
    791 	ASSERT(vd->vdev_isl2cache);
    792 	spa_aux_remove(vd, &spa_l2cache_avl);
    793 	vd->vdev_isl2cache = B_FALSE;
    794 	mutex_exit(&spa_l2cache_lock);
    795 }
    796 
    797 boolean_t
    798 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
    799 {
    800 	boolean_t found;
    801 
    802 	mutex_enter(&spa_l2cache_lock);
    803 	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
    804 	mutex_exit(&spa_l2cache_lock);
    805 
    806 	return (found);
    807 }
    808 
    809 void
    810 spa_l2cache_activate(vdev_t *vd)
    811 {
    812 	mutex_enter(&spa_l2cache_lock);
    813 	ASSERT(vd->vdev_isl2cache);
    814 	spa_aux_activate(vd, &spa_l2cache_avl);
    815 	mutex_exit(&spa_l2cache_lock);
    816 }
    817 
    818 void
    819 spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
    820 {
    821 	vdev_space_update(vd, space, alloc, B_FALSE);
    822 }
    823 
    824 /*
    825  * ==========================================================================
    826  * SPA vdev locking
    827  * ==========================================================================
    828  */
    829 
    830 /*
    831  * Lock the given spa_t for the purpose of adding or removing a vdev.
    832  * Grabs the global spa_namespace_lock plus the spa config lock for writing.
    833  * It returns the next transaction group for the spa_t.
    834  */
    835 uint64_t
    836 spa_vdev_enter(spa_t *spa)
    837 {
    838 	mutex_enter(&spa_namespace_lock);
    839 
    840 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
    841 
    842 	return (spa_last_synced_txg(spa) + 1);
    843 }
    844 
    845 /*
    846  * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
    847  * locking of spa_vdev_enter(), we also want make sure the transactions have
    848  * synced to disk, and then update the global configuration cache with the new
    849  * information.
    850  */
    851 int
    852 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
    853 {
    854 	int config_changed = B_FALSE;
    855 
    856 	ASSERT(txg > spa_last_synced_txg(spa));
    857 
    858 	spa->spa_pending_vdev = NULL;
    859 
    860 	/*
    861 	 * Reassess the DTLs.
    862 	 */
    863 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
    864 
    865 	/*
    866 	 * If the config changed, notify the scrub thread that it must restart.
    867 	 */
    868 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
    869 		dsl_pool_scrub_restart(spa->spa_dsl_pool);
    870 		config_changed = B_TRUE;
    871 	}
    872 
    873 	spa_config_exit(spa, SCL_ALL, spa);
    874 
    875 	/*
    876 	 * Note: this txg_wait_synced() is important because it ensures
    877 	 * that there won't be more than one config change per txg.
    878 	 * This allows us to use the txg as the generation number.
    879 	 */
    880 	if (error == 0)
    881 		txg_wait_synced(spa->spa_dsl_pool, txg);
    882 
    883 	if (vd != NULL) {
    884 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
    885 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
    886 		vdev_free(vd);
    887 		spa_config_exit(spa, SCL_ALL, spa);
    888 	}
    889 
    890 	/*
    891 	 * If the config changed, update the config cache.
    892 	 */
    893 	if (config_changed)
    894 		spa_config_sync(spa, B_FALSE, B_TRUE);
    895 
    896 	mutex_exit(&spa_namespace_lock);
    897 
    898 	return (error);
    899 }
    900 
    901 /*
    902  * Lock the given spa_t for the purpose of changing vdev state.
    903  */
    904 void
    905 spa_vdev_state_enter(spa_t *spa)
    906 {
    907 	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
    908 }
    909 
    910 int
    911 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
    912 {
    913 	if (vd != NULL)
    914 		vdev_state_dirty(vd->vdev_top);
    915 
    916 	spa_config_exit(spa, SCL_STATE_ALL, spa);
    917 
    918 	/*
    919 	 * If anything changed, wait for it to sync.  This ensures that,
    920 	 * from the system administrator's perspective, zpool(1M) commands
    921 	 * are synchronous.  This is important for things like zpool offline:
    922 	 * when the command completes, you expect no further I/O from ZFS.
    923 	 */
    924 	if (vd != NULL)
    925 		txg_wait_synced(spa->spa_dsl_pool, 0);
    926 
    927 	return (error);
    928 }
    929 
    930 /*
    931  * ==========================================================================
    932  * Miscellaneous functions
    933  * ==========================================================================
    934  */
    935 
    936 /*
    937  * Rename a spa_t.
    938  */
    939 int
    940 spa_rename(const char *name, const char *newname)
    941 {
    942 	spa_t *spa;
    943 	int err;
    944 
    945 	/*
    946 	 * Lookup the spa_t and grab the config lock for writing.  We need to
    947 	 * actually open the pool so that we can sync out the necessary labels.
    948 	 * It's OK to call spa_open() with the namespace lock held because we
    949 	 * allow recursive calls for other reasons.
    950 	 */
    951 	mutex_enter(&spa_namespace_lock);
    952 	if ((err = spa_open(name, &spa, FTAG)) != 0) {
    953 		mutex_exit(&spa_namespace_lock);
    954 		return (err);
    955 	}
    956 
    957 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
    958 
    959 	avl_remove(&spa_namespace_avl, spa);
    960 	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
    961 	avl_add(&spa_namespace_avl, spa);
    962 
    963 	/*
    964 	 * Sync all labels to disk with the new names by marking the root vdev
    965 	 * dirty and waiting for it to sync.  It will pick up the new pool name
    966 	 * during the sync.
    967 	 */
    968 	vdev_config_dirty(spa->spa_root_vdev);
    969 
    970 	spa_config_exit(spa, SCL_ALL, FTAG);
    971 
    972 	txg_wait_synced(spa->spa_dsl_pool, 0);
    973 
    974 	/*
    975 	 * Sync the updated config cache.
    976 	 */
    977 	spa_config_sync(spa, B_FALSE, B_TRUE);
    978 
    979 	spa_close(spa, FTAG);
    980 
    981 	mutex_exit(&spa_namespace_lock);
    982 
    983 	return (0);
    984 }
    985 
    986 
    987 /*
    988  * Determine whether a pool with given pool_guid exists.  If device_guid is
    989  * non-zero, determine whether the pool exists *and* contains a device with the
    990  * specified device_guid.
    991  */
    992 boolean_t
    993 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
    994 {
    995 	spa_t *spa;
    996 	avl_tree_t *t = &spa_namespace_avl;
    997 
    998 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
    999 
   1000 	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
   1001 		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
   1002 			continue;
   1003 		if (spa->spa_root_vdev == NULL)
   1004 			continue;
   1005 		if (spa_guid(spa) == pool_guid) {
   1006 			if (device_guid == 0)
   1007 				break;
   1008 
   1009 			if (vdev_lookup_by_guid(spa->spa_root_vdev,
   1010 			    device_guid) != NULL)
   1011 				break;
   1012 
   1013 			/*
   1014 			 * Check any devices we may be in the process of adding.
   1015 			 */
   1016 			if (spa->spa_pending_vdev) {
   1017 				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
   1018 				    device_guid) != NULL)
   1019 					break;
   1020 			}
   1021 		}
   1022 	}
   1023 
   1024 	return (spa != NULL);
   1025 }
   1026 
   1027 char *
   1028 spa_strdup(const char *s)
   1029 {
   1030 	size_t len;
   1031 	char *new;
   1032 
   1033 	len = strlen(s);
   1034 	new = kmem_alloc(len + 1, KM_SLEEP);
   1035 	bcopy(s, new, len);
   1036 	new[len] = '\0';
   1037 
   1038 	return (new);
   1039 }
   1040 
   1041 void
   1042 spa_strfree(char *s)
   1043 {
   1044 	kmem_free(s, strlen(s) + 1);
   1045 }
   1046 
   1047 uint64_t
   1048 spa_get_random(uint64_t range)
   1049 {
   1050 	uint64_t r;
   1051 
   1052 	ASSERT(range != 0);
   1053 
   1054 	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
   1055 
   1056 	return (r % range);
   1057 }
   1058 
   1059 void
   1060 sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
   1061 {
   1062 	int d;
   1063 
   1064 	if (bp == NULL) {
   1065 		(void) snprintf(buf, len, "<NULL>");
   1066 		return;
   1067 	}
   1068 
   1069 	if (BP_IS_HOLE(bp)) {
   1070 		(void) snprintf(buf, len, "<hole>");
   1071 		return;
   1072 	}
   1073 
   1074 	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
   1075 	    (u_longlong_t)BP_GET_LEVEL(bp),
   1076 	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
   1077 	    (u_longlong_t)BP_GET_LSIZE(bp),
   1078 	    (u_longlong_t)BP_GET_PSIZE(bp));
   1079 
   1080 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
   1081 		const dva_t *dva = &bp->blk_dva[d];
   1082 		(void) snprintf(buf + strlen(buf), len - strlen(buf),
   1083 		    "DVA[%d]=<%llu:%llx:%llx> ", d,
   1084 		    (u_longlong_t)DVA_GET_VDEV(dva),
   1085 		    (u_longlong_t)DVA_GET_OFFSET(dva),
   1086 		    (u_longlong_t)DVA_GET_ASIZE(dva));
   1087 	}
   1088 
   1089 	(void) snprintf(buf + strlen(buf), len - strlen(buf),
   1090 	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
   1091 	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
   1092 	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
   1093 	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
   1094 	    BP_IS_GANG(bp) ? "gang" : "contiguous",
   1095 	    (u_longlong_t)bp->blk_birth,
   1096 	    (u_longlong_t)bp->blk_fill,
   1097 	    (u_longlong_t)bp->blk_cksum.zc_word[0],
   1098 	    (u_longlong_t)bp->blk_cksum.zc_word[1],
   1099 	    (u_longlong_t)bp->blk_cksum.zc_word[2],
   1100 	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
   1101 }
   1102 
   1103 void
   1104 spa_freeze(spa_t *spa)
   1105 {
   1106 	uint64_t freeze_txg = 0;
   1107 
   1108 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
   1109 	if (spa->spa_freeze_txg == UINT64_MAX) {
   1110 		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
   1111 		spa->spa_freeze_txg = freeze_txg;
   1112 	}
   1113 	spa_config_exit(spa, SCL_ALL, FTAG);
   1114 	if (freeze_txg != 0)
   1115 		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
   1116 }
   1117 
   1118 void
   1119 zfs_panic_recover(const char *fmt, ...)
   1120 {
   1121 	va_list adx;
   1122 
   1123 	va_start(adx, fmt);
   1124 	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
   1125 	va_end(adx);
   1126 }
   1127 
   1128 /*
   1129  * ==========================================================================
   1130  * Accessor functions
   1131  * ==========================================================================
   1132  */
   1133 
   1134 boolean_t
   1135 spa_shutting_down(spa_t *spa)
   1136 {
   1137 	return (spa->spa_async_suspended);
   1138 }
   1139 
   1140 dsl_pool_t *
   1141 spa_get_dsl(spa_t *spa)
   1142 {
   1143 	return (spa->spa_dsl_pool);
   1144 }
   1145 
   1146 blkptr_t *
   1147 spa_get_rootblkptr(spa_t *spa)
   1148 {
   1149 	return (&spa->spa_ubsync.ub_rootbp);
   1150 }
   1151 
   1152 void
   1153 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
   1154 {
   1155 	spa->spa_uberblock.ub_rootbp = *bp;
   1156 }
   1157 
   1158 void
   1159 spa_altroot(spa_t *spa, char *buf, size_t buflen)
   1160 {
   1161 	if (spa->spa_root == NULL)
   1162 		buf[0] = '\0';
   1163 	else
   1164 		(void) strncpy(buf, spa->spa_root, buflen);
   1165 }
   1166 
   1167 int
   1168 spa_sync_pass(spa_t *spa)
   1169 {
   1170 	return (spa->spa_sync_pass);
   1171 }
   1172 
   1173 char *
   1174 spa_name(spa_t *spa)
   1175 {
   1176 	return (spa->spa_name);
   1177 }
   1178 
   1179 uint64_t
   1180 spa_guid(spa_t *spa)
   1181 {
   1182 	/*
   1183 	 * If we fail to parse the config during spa_load(), we can go through
   1184 	 * the error path (which posts an ereport) and end up here with no root
   1185 	 * vdev.  We stash the original pool guid in 'spa_load_guid' to handle
   1186 	 * this case.
   1187 	 */
   1188 	if (spa->spa_root_vdev != NULL)
   1189 		return (spa->spa_root_vdev->vdev_guid);
   1190 	else
   1191 		return (spa->spa_load_guid);
   1192 }
   1193 
   1194 uint64_t
   1195 spa_last_synced_txg(spa_t *spa)
   1196 {
   1197 	return (spa->spa_ubsync.ub_txg);
   1198 }
   1199 
   1200 uint64_t
   1201 spa_first_txg(spa_t *spa)
   1202 {
   1203 	return (spa->spa_first_txg);
   1204 }
   1205 
   1206 pool_state_t
   1207 spa_state(spa_t *spa)
   1208 {
   1209 	return (spa->spa_state);
   1210 }
   1211 
   1212 uint64_t
   1213 spa_freeze_txg(spa_t *spa)
   1214 {
   1215 	return (spa->spa_freeze_txg);
   1216 }
   1217 
   1218 /*
   1219  * Return how much space is allocated in the pool (ie. sum of all asize)
   1220  */
   1221 uint64_t
   1222 spa_get_alloc(spa_t *spa)
   1223 {
   1224 	return (spa->spa_root_vdev->vdev_stat.vs_alloc);
   1225 }
   1226 
   1227 /*
   1228  * Return how much (raid-z inflated) space there is in the pool.
   1229  */
   1230 uint64_t
   1231 spa_get_space(spa_t *spa)
   1232 {
   1233 	return (spa->spa_root_vdev->vdev_stat.vs_space);
   1234 }
   1235 
   1236 /*
   1237  * Return the amount of raid-z-deflated space in the pool.
   1238  */
   1239 uint64_t
   1240 spa_get_dspace(spa_t *spa)
   1241 {
   1242 	if (spa->spa_deflate)
   1243 		return (spa->spa_root_vdev->vdev_stat.vs_dspace);
   1244 	else
   1245 		return (spa->spa_root_vdev->vdev_stat.vs_space);
   1246 }
   1247 
   1248 /* ARGSUSED */
   1249 uint64_t
   1250 spa_get_asize(spa_t *spa, uint64_t lsize)
   1251 {
   1252 	/*
   1253 	 * For now, the worst case is 512-byte RAID-Z blocks, in which
   1254 	 * case the space requirement is exactly 2x; so just assume that.
   1255 	 * Add to this the fact that we can have up to 3 DVAs per bp, and
   1256 	 * we have to multiply by a total of 6x.
   1257 	 */
   1258 	return (lsize * 6);
   1259 }
   1260 
   1261 /*
   1262  * Return the failure mode that has been set to this pool. The default
   1263  * behavior will be to block all I/Os when a complete failure occurs.
   1264  */
   1265 uint8_t
   1266 spa_get_failmode(spa_t *spa)
   1267 {
   1268 	return (spa->spa_failmode);
   1269 }
   1270 
   1271 boolean_t
   1272 spa_suspended(spa_t *spa)
   1273 {
   1274 	return (spa->spa_suspended);
   1275 }
   1276 
   1277 uint64_t
   1278 spa_version(spa_t *spa)
   1279 {
   1280 	return (spa->spa_ubsync.ub_version);
   1281 }
   1282 
   1283 int
   1284 spa_max_replication(spa_t *spa)
   1285 {
   1286 	/*
   1287 	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
   1288 	 * handle BPs with more than one DVA allocated.  Set our max
   1289 	 * replication level accordingly.
   1290 	 */
   1291 	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
   1292 		return (1);
   1293 	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
   1294 }
   1295 
   1296 uint64_t
   1297 bp_get_dasize(spa_t *spa, const blkptr_t *bp)
   1298 {
   1299 	int sz = 0, i;
   1300 
   1301 	if (!spa->spa_deflate)
   1302 		return (BP_GET_ASIZE(bp));
   1303 
   1304 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
   1305 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
   1306 		vdev_t *vd =
   1307 		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
   1308 		if (vd)
   1309 			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
   1310 			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
   1311 	}
   1312 	spa_config_exit(spa, SCL_VDEV, FTAG);
   1313 	return (sz);
   1314 }
   1315 
   1316 /*
   1317  * ==========================================================================
   1318  * Initialization and Termination
   1319  * ==========================================================================
   1320  */
   1321 
   1322 static int
   1323 spa_name_compare(const void *a1, const void *a2)
   1324 {
   1325 	const spa_t *s1 = a1;
   1326 	const spa_t *s2 = a2;
   1327 	int s;
   1328 
   1329 	s = strcmp(s1->spa_name, s2->spa_name);
   1330 	if (s > 0)
   1331 		return (1);
   1332 	if (s < 0)
   1333 		return (-1);
   1334 	return (0);
   1335 }
   1336 
   1337 int
   1338 spa_busy(void)
   1339 {
   1340 	return (spa_active_count);
   1341 }
   1342 
   1343 void
   1344 spa_boot_init()
   1345 {
   1346 	spa_config_load();
   1347 }
   1348 
   1349 void
   1350 spa_init(int mode)
   1351 {
   1352 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
   1353 	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
   1354 	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
   1355 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
   1356 
   1357 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
   1358 	    offsetof(spa_t, spa_avl));
   1359 
   1360 	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
   1361 	    offsetof(spa_aux_t, aux_avl));
   1362 
   1363 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
   1364 	    offsetof(spa_aux_t, aux_avl));
   1365 
   1366 	spa_mode_global = mode;
   1367 
   1368 	refcount_init();
   1369 	unique_init();
   1370 	zio_init();
   1371 	dmu_init();
   1372 	zil_init();
   1373 	vdev_cache_stat_init();
   1374 	zfs_prop_init();
   1375 	zpool_prop_init();
   1376 	spa_config_load();
   1377 	l2arc_start();
   1378 }
   1379 
   1380 void
   1381 spa_fini(void)
   1382 {
   1383 	l2arc_stop();
   1384 
   1385 	spa_evict_all();
   1386 
   1387 	vdev_cache_stat_fini();
   1388 	zil_fini();
   1389 	dmu_fini();
   1390 	zio_fini();
   1391 	unique_fini();
   1392 	refcount_fini();
   1393 
   1394 	avl_destroy(&spa_namespace_avl);
   1395 	avl_destroy(&spa_spare_avl);
   1396 	avl_destroy(&spa_l2cache_avl);
   1397 
   1398 	cv_destroy(&spa_namespace_cv);
   1399 	mutex_destroy(&spa_namespace_lock);
   1400 	mutex_destroy(&spa_spare_lock);
   1401 	mutex_destroy(&spa_l2cache_lock);
   1402 }
   1403 
   1404 /*
   1405  * Return whether this pool has slogs. No locking needed.
   1406  * It's not a problem if the wrong answer is returned as it's only for
   1407  * performance and not correctness
   1408  */
   1409 boolean_t
   1410 spa_has_slogs(spa_t *spa)
   1411 {
   1412 	return (spa->spa_log_class->mc_rotor != NULL);
   1413 }
   1414 
   1415 /*
   1416  * Return whether this pool is the root pool.
   1417  */
   1418 boolean_t
   1419 spa_is_root(spa_t *spa)
   1420 {
   1421 	return (spa->spa_is_root);
   1422 }
   1423 
   1424 boolean_t
   1425 spa_writeable(spa_t *spa)
   1426 {
   1427 	return (!!(spa->spa_mode & FWRITE));
   1428 }
   1429 
   1430 int
   1431 spa_mode(spa_t *spa)
   1432 {
   1433 	return (spa->spa_mode);
   1434 }
   1435