OpenGrok

Cross Reference: spa_errlog.c
xref: /onnv/onnv-gate/usr/src/uts/common/fs/zfs/spa_errlog.c
Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
     23  */
     24 
     25 /*
     26  * Routines to manage the on-disk persistent error log.
     27  *
     28  * Each pool stores a log of all logical data errors seen during normal
     29  * operation.  This is actually the union of two distinct logs: the last log,
     30  * and the current log.  All errors seen are logged to the current log.  When a
     31  * scrub completes, the current log becomes the last log, the last log is thrown
     32  * out, and the current log is reinitialized.  This way, if an error is somehow
     33  * corrected, a new scrub will show that it no longer exists, and will be
     34  * deleted from the log when the scrub completes.
     35  *
     36  * The log is stored using a ZAP object whose key is a string form of the
     37  * zbookmark tuple (objset, object, level, blkid), and whose contents is an
     38  * optional 'objset:object' human-readable string describing the data.  When an
     39  * error is first logged, this string will be empty, indicating that no name is
     40  * known.  This prevents us from having to issue a potentially large amount of
     41  * I/O to discover the object name during an error path.  Instead, we do the
     42  * calculation when the data is requested, storing the result so future queries
     43  * will be faster.
     44  *
     45  * This log is then shipped into an nvlist where the key is the dataset name and
     46  * the value is the object name.  Userland is then responsible for uniquifying
     47  * this list and displaying it to the user.
     48  */
     49 
     50 #include <sys/dmu_tx.h>
     51 #include <sys/spa.h>
     52 #include <sys/spa_impl.h>
     53 #include <sys/zap.h>
     54 #include <sys/zio.h>
     55 
     56 
     57 /*
     58  * Convert a bookmark to a string.
     59  */
     60 static void
     61 bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
     62 {
     63 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
     64 	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
     65 	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
     66 }
     67 
/*
 * Parse a string of the form "objset:object:level:blkid" back into a
 * bookmark.  The separators and the terminating NUL are only checked via
 * ASSERT(), so the input is expected to be well formed.
 */
#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_t *zb)
{
	char *cur = buf;

	/* Each strtonum() call leaves 'cur' on the first unparsed character. */
	zb->zb_objset = strtonum(cur, &cur);
	ASSERT(*cur == ':');

	zb->zb_object = strtonum(cur + 1, &cur);
	ASSERT(*cur == ':');

	zb->zb_level = (int)strtonum(cur + 1, &cur);
	ASSERT(*cur == ':');

	zb->zb_blkid = strtonum(cur + 1, &cur);
	ASSERT(*cur == '\0');
}
#endif
     85 
     86 /*
     87  * Log an uncorrectable error to the persistent error log.  We add it to the
     88  * spa's list of pending errors.  The changes are actually synced out to disk
     89  * during spa_errlog_sync().
     90  */
     91 void
     92 spa_log_error(spa_t *spa, zio_t *zio)
     93 {
     94 	zbookmark_t *zb = &zio->io_logical->io_bookmark;
     95 	spa_error_entry_t search;
     96 	spa_error_entry_t *new;
     97 	avl_tree_t *tree;
     98 	avl_index_t where;
     99 
    100 	/*
    101 	 * If we are trying to import a pool, ignore any errors, as we won't be
    102 	 * writing to the pool any time soon.
    103 	 */
    104 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
    105 		return;
    106 
    107 	mutex_enter(&spa->spa_errlist_lock);
    108 
    109 	/*
    110 	 * If we have had a request to rotate the log, log it to the next list
    111 	 * instead of the current one.
    112 	 */
    113 	if (spa->spa_scrub_active || spa->spa_scrub_finished)
    114 		tree = &spa->spa_errlist_scrub;
    115 	else
    116 		tree = &spa->spa_errlist_last;
    117 
    118 	search.se_bookmark = *zb;
    119 	if (avl_find(tree, &search, &where) != NULL) {
    120 		mutex_exit(&spa->spa_errlist_lock);
    121 		return;
    122 	}
    123 
    124 	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
    125 	new->se_bookmark = *zb;
    126 	avl_insert(tree, new, where);
    127 
    128 	mutex_exit(&spa->spa_errlist_lock);
    129 }
    130 
    131 /*
    132  * Return the number of errors currently in the error log.  This is actually the
    133  * sum of both the last log and the current log, since we don't know the union
    134  * of these logs until we reach userland.
    135  */
    136 uint64_t
    137 spa_get_errlog_size(spa_t *spa)
    138 {
    139 	uint64_t total = 0, count;
    140 
    141 	mutex_enter(&spa->spa_errlog_lock);
    142 	if (spa->spa_errlog_scrub != 0 &&
    143 	    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
    144 	    &count) == 0)
    145 		total += count;
    146 
    147 	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
    148 	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
    149 	    &count) == 0)
    150 		total += count;
    151 	mutex_exit(&spa->spa_errlog_lock);
    152 
    153 	mutex_enter(&spa->spa_errlist_lock);
    154 	total += avl_numnodes(&spa->spa_errlist_last);
    155 	total += avl_numnodes(&spa->spa_errlist_scrub);
    156 	mutex_exit(&spa->spa_errlist_lock);
    157 
    158 	return (total);
    159 }
    160 
    161 #ifdef _KERNEL
    162 static int
    163 process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
    164 {
    165 	zap_cursor_t zc;
    166 	zap_attribute_t za;
    167 	zbookmark_t zb;
    168 
    169 	if (obj == 0)
    170 		return (0);
    171 
    172 	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
    173 	    zap_cursor_retrieve(&zc, &za) == 0;
    174 	    zap_cursor_advance(&zc)) {
    175 
    176 		if (*count == 0) {
    177 			zap_cursor_fini(&zc);
    178 			return (ENOMEM);
    179 		}
    180 
    181 		name_to_bookmark(za.za_name, &zb);
    182 
    183 		if (copyout(&zb, (char *)addr +
    184 		    (*count - 1) * sizeof (zbookmark_t),
    185 		    sizeof (zbookmark_t)) != 0)
    186 			return (EFAULT);
    187 
    188 		*count -= 1;
    189 	}
    190 
    191 	zap_cursor_fini(&zc);
    192 
    193 	return (0);
    194 }
    195 
    196 static int
    197 process_error_list(avl_tree_t *list, void *addr, size_t *count)
    198 {
    199 	spa_error_entry_t *se;
    200 
    201 	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
    202 
    203 		if (*count == 0)
    204 			return (ENOMEM);
    205 
    206 		if (copyout(&se->se_bookmark, (char *)addr +
    207 		    (*count - 1) * sizeof (zbookmark_t),
    208 		    sizeof (zbookmark_t)) != 0)
    209 			return (EFAULT);
    210 
    211 		*count -= 1;
    212 	}
    213 
    214 	return (0);
    215 }
    216 #endif
    217 
    218 /*
    219  * Copy all known errors to userland as an array of bookmarks.  This is
    220  * actually a union of the on-disk last log and current log, as well as any
    221  * pending error requests.
    222  *
    223  * Because the act of reading the on-disk log could cause errors to be
    224  * generated, we have two separate locks: one for the error log and one for the
    225  * in-core error lists.  We only need the error list lock to log and error, so
    226  * we grab the error log lock while we read the on-disk logs, and only pick up
    227  * the error list lock when we are finished.
    228  */
    229 int
    230 spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
    231 {
    232 	int ret = 0;
    233 
    234 #ifdef _KERNEL
    235 	mutex_enter(&spa->spa_errlog_lock);
    236 
    237 	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
    238 
    239 	if (!ret && !spa->spa_scrub_finished)
    240 		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
    241 		    count);
    242 
    243 	mutex_enter(&spa->spa_errlist_lock);
    244 	if (!ret)
    245 		ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
    246 		    count);
    247 	if (!ret)
    248 		ret = process_error_list(&spa->spa_errlist_last, uaddr,
    249 		    count);
    250 	mutex_exit(&spa->spa_errlist_lock);
    251 
    252 	mutex_exit(&spa->spa_errlog_lock);
    253 #endif
    254 
    255 	return (ret);
    256 }
    257 
    258 /*
    259  * Called when a scrub completes.  This simply set a bit which tells which AVL
    260  * tree to add new errors.  spa_errlog_sync() is responsible for actually
    261  * syncing the changes to the underlying objects.
    262  */
    263 void
    264 spa_errlog_rotate(spa_t *spa)
    265 {
    266 	mutex_enter(&spa->spa_errlist_lock);
    267 	spa->spa_scrub_finished = B_TRUE;
    268 	mutex_exit(&spa->spa_errlist_lock);
    269 }
    270 
    271 /*
    272  * Discard any pending errors from the spa_t.  Called when unloading a faulted
    273  * pool, as the errors encountered during the open cannot be synced to disk.
    274  */
    275 void
    276 spa_errlog_drain(spa_t *spa)
    277 {
    278 	spa_error_entry_t *se;
    279 	void *cookie;
    280 
    281 	mutex_enter(&spa->spa_errlist_lock);
    282 
    283 	cookie = NULL;
    284 	while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
    285 	    &cookie)) != NULL)
    286 		kmem_free(se, sizeof (spa_error_entry_t));
    287 	cookie = NULL;
    288 	while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
    289 	    &cookie)) != NULL)
    290 		kmem_free(se, sizeof (spa_error_entry_t));
    291 
    292 	mutex_exit(&spa->spa_errlist_lock);
    293 }
    294 
    295 /*
    296  * Process a list of errors into the current on-disk log.
    297  */
    298 static void
    299 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
    300 {
    301 	spa_error_entry_t *se;
    302 	char buf[64];
    303 	void *cookie;
    304 
    305 	if (avl_numnodes(t) != 0) {
    306 		/* create log if necessary */
    307 		if (*obj == 0)
    308 			*obj = zap_create(spa->spa_meta_objset,
    309 			    DMU_OT_ERROR_LOG, DMU_OT_NONE,
    310 			    0, tx);
    311 
    312 		/* add errors to the current log */
    313 		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
    314 			char *name = se->se_name ? se->se_name : "";
    315 
    316 			bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
    317 
    318 			(void) zap_update(spa->spa_meta_objset,
    319 			    *obj, buf, 1, strlen(name) + 1, name, tx);
    320 		}
    321 
    322 		/* purge the error list */
    323 		cookie = NULL;
    324 		while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
    325 			kmem_free(se, sizeof (spa_error_entry_t));
    326 	}
    327 }
    328 
    329 /*
    330  * Sync the error log out to disk.  This is a little tricky because the act of
    331  * writing the error log requires the spa_errlist_lock.  So, we need to lock the
    332  * error lists, take a copy of the lists, and then reinitialize them.  Then, we
    333  * drop the error list lock and take the error log lock, at which point we
    334  * do the errlog processing.  Then, if we encounter an I/O error during this
    335  * process, we can successfully add the error to the list.  Note that this will
    336  * result in the perpetual recycling of errors, but it is an unlikely situation
    337  * and not a performance critical operation.
    338  */
    339 void
    340 spa_errlog_sync(spa_t *spa, uint64_t txg)
    341 {
    342 	dmu_tx_t *tx;
    343 	avl_tree_t scrub, last;
    344 	int scrub_finished;
    345 
    346 	mutex_enter(&spa->spa_errlist_lock);
    347 
    348 	/*
    349 	 * Bail out early under normal circumstances.
    350 	 */
    351 	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
    352 	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
    353 	    !spa->spa_scrub_finished) {
    354 		mutex_exit(&spa->spa_errlist_lock);
    355 		return;
    356 	}
    357 
    358 	spa_get_errlists(spa, &last, &scrub);
    359 	scrub_finished = spa->spa_scrub_finished;
    360 	spa->spa_scrub_finished = B_FALSE;
    361 
    362 	mutex_exit(&spa->spa_errlist_lock);
    363 	mutex_enter(&spa->spa_errlog_lock);
    364 
    365 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
    366 
    367 	/*
    368 	 * Sync out the current list of errors.
    369 	 */
    370 	sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
    371 
    372 	/*
    373 	 * Rotate the log if necessary.
    374 	 */
    375 	if (scrub_finished) {
    376 		if (spa->spa_errlog_last != 0)
    377 			VERIFY(dmu_object_free(spa->spa_meta_objset,
    378 			    spa->spa_errlog_last, tx) == 0);
    379 		spa->spa_errlog_last = spa->spa_errlog_scrub;
    380 		spa->spa_errlog_scrub = 0;
    381 
    382 		sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
    383 	}
    384 
    385 	/*
    386 	 * Sync out any pending scrub errors.
    387 	 */
    388 	sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
    389 
    390 	/*
    391 	 * Update the MOS to reflect the new values.
    392 	 */
    393 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    394 	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
    395 	    &spa->spa_errlog_last, tx);
    396 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    397 	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
    398 	    &spa->spa_errlog_scrub, tx);
    399 
    400 	dmu_tx_commit(tx);
    401 
    402 	mutex_exit(&spa->spa_errlog_lock);
    403 }
    404