1 1544 eschrock /* 2 1544 eschrock * CDDL HEADER START 3 1544 eschrock * 4 1544 eschrock * The contents of this file are subject to the terms of the 5 1544 eschrock * Common Development and Distribution License (the "License"). 6 1544 eschrock * You may not use this file except in compliance with the License. 7 1544 eschrock * 8 1544 eschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 1544 eschrock * or http://www.opensolaris.org/os/licensing. 10 1544 eschrock * See the License for the specific language governing permissions 11 1544 eschrock * and limitations under the License. 12 1544 eschrock * 13 1544 eschrock * When distributing Covered Code, include this CDDL HEADER in each 14 1544 eschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 1544 eschrock * If applicable, add the following below this CDDL HEADER, with the 16 1544 eschrock * fields enclosed by brackets "[]" replaced with your own identifying 17 1544 eschrock * information: Portions Copyright [yyyy] [name of copyright owner] 18 1544 eschrock * 19 1544 eschrock * CDDL HEADER END 20 1544 eschrock */ 21 1544 eschrock /* 22 9396 Matthew * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 1544 eschrock * Use is subject to license terms. 24 1544 eschrock */ 25 1544 eschrock 26 1544 eschrock /* 27 1544 eschrock * Routines to manage the on-disk persistent error log. 28 1544 eschrock * 29 1544 eschrock * Each pool stores a log of all logical data errors seen during normal 30 1544 eschrock * operation. This is actually the union of two distinct logs: the last log, 31 1544 eschrock * and the current log. All errors seen are logged to the current log. When a 32 1544 eschrock * scrub completes, the current log becomes the last log, the last log is thrown 33 1544 eschrock * out, and the current log is reinitialized. 
This way, if an error is somehow 34 1544 eschrock * corrected, a new scrub will show that it no longer exists, and will be 35 1544 eschrock * deleted from the log when the scrub completes. 36 1544 eschrock * 37 1544 eschrock * The log is stored using a ZAP object whose key is a string form of the 38 1544 eschrock * zbookmark tuple (objset, object, level, blkid), and whose contents is an 39 1544 eschrock * optional 'objset:object' human-readable string describing the data. When an 40 1544 eschrock * error is first logged, this string will be empty, indicating that no name is 41 1544 eschrock * known. This prevents us from having to issue a potentially large amount of 42 1544 eschrock * I/O to discover the object name during an error path. Instead, we do the 43 1544 eschrock * calculation when the data is requested, storing the result so future queries 44 1544 eschrock * will be faster. 45 1544 eschrock * 46 1544 eschrock * This log is then shipped into an nvlist where the key is the dataset name and 47 1544 eschrock * the value is the object name. Userland is then responsible for uniquifying 48 1544 eschrock * this list and displaying it to the user. 49 1544 eschrock */ 50 1544 eschrock 51 1544 eschrock #include <sys/dmu_tx.h> 52 1544 eschrock #include <sys/spa.h> 53 1544 eschrock #include <sys/spa_impl.h> 54 1544 eschrock #include <sys/zap.h> 55 1544 eschrock #include <sys/zio.h> 56 1544 eschrock 57 1544 eschrock /* 58 1544 eschrock * This is a stripped-down version of strtoull, suitable only for converting 59 1544 eschrock * lowercase hexadecimal numbers that don't overflow. 
60 1544 eschrock */ 61 9396 Matthew uint64_t 62 9396 Matthew strtonum(const char *str, char **nptr) 63 1544 eschrock { 64 1544 eschrock uint64_t val = 0; 65 1544 eschrock char c; 66 1544 eschrock int digit; 67 1544 eschrock 68 1544 eschrock while ((c = *str) != '\0') { 69 1544 eschrock if (c >= '0' && c <= '9') 70 1544 eschrock digit = c - '0'; 71 1544 eschrock else if (c >= 'a' && c <= 'f') 72 1544 eschrock digit = 10 + c - 'a'; 73 1544 eschrock else 74 1544 eschrock break; 75 1544 eschrock 76 1544 eschrock val *= 16; 77 1544 eschrock val += digit; 78 1544 eschrock 79 1544 eschrock str++; 80 1544 eschrock } 81 1544 eschrock 82 9396 Matthew if (nptr) 83 9396 Matthew *nptr = (char *)str; 84 1544 eschrock 85 1544 eschrock return (val); 86 1544 eschrock } 87 1544 eschrock 88 1544 eschrock /* 89 1544 eschrock * Convert a bookmark to a string. 90 1544 eschrock */ 91 1544 eschrock static void 92 1544 eschrock bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) 93 1544 eschrock { 94 1544 eschrock (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", 95 1544 eschrock (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, 96 1544 eschrock (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); 97 1544 eschrock } 98 1544 eschrock 99 1544 eschrock /* 100 1544 eschrock * Convert a string to a bookmark 101 1544 eschrock */ 102 2856 nd150628 #ifdef _KERNEL 103 1544 eschrock static void 104 1544 eschrock name_to_bookmark(char *buf, zbookmark_t *zb) 105 1544 eschrock { 106 1544 eschrock zb->zb_objset = strtonum(buf, &buf); 107 1544 eschrock ASSERT(*buf == ':'); 108 1544 eschrock zb->zb_object = strtonum(buf + 1, &buf); 109 1544 eschrock ASSERT(*buf == ':'); 110 1544 eschrock zb->zb_level = (int)strtonum(buf + 1, &buf); 111 1544 eschrock ASSERT(*buf == ':'); 112 1544 eschrock zb->zb_blkid = strtonum(buf + 1, &buf); 113 1544 eschrock ASSERT(*buf == '\0'); 114 1544 eschrock } 115 2856 nd150628 #endif 116 1544 eschrock 117 1544 eschrock /* 118 1544 eschrock * Log an uncorrectable 
error to the persistent error log. We add it to the 119 1544 eschrock * spa's list of pending errors. The changes are actually synced out to disk 120 1544 eschrock * during spa_errlog_sync(). 121 1544 eschrock */ 122 1544 eschrock void 123 1544 eschrock spa_log_error(spa_t *spa, zio_t *zio) 124 1544 eschrock { 125 1544 eschrock zbookmark_t *zb = &zio->io_logical->io_bookmark; 126 1544 eschrock spa_error_entry_t search; 127 1544 eschrock spa_error_entry_t *new; 128 1544 eschrock avl_tree_t *tree; 129 1544 eschrock avl_index_t where; 130 1544 eschrock 131 1544 eschrock /* 132 1544 eschrock * If we are trying to import a pool, ignore any errors, as we won't be 133 1544 eschrock * writing to the pool any time soon. 134 1544 eschrock */ 135 11147 George if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) 136 1544 eschrock return; 137 1544 eschrock 138 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 139 1544 eschrock 140 1544 eschrock /* 141 1544 eschrock * If we have had a request to rotate the log, log it to the next list 142 1544 eschrock * instead of the current one. 143 1544 eschrock */ 144 1544 eschrock if (spa->spa_scrub_active || spa->spa_scrub_finished) 145 1544 eschrock tree = &spa->spa_errlist_scrub; 146 1544 eschrock else 147 1544 eschrock tree = &spa->spa_errlist_last; 148 1544 eschrock 149 1544 eschrock search.se_bookmark = *zb; 150 1544 eschrock if (avl_find(tree, &search, &where) != NULL) { 151 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 152 1544 eschrock return; 153 1544 eschrock } 154 1544 eschrock 155 1544 eschrock new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); 156 1544 eschrock new->se_bookmark = *zb; 157 1544 eschrock avl_insert(tree, new, where); 158 1544 eschrock 159 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 160 1544 eschrock } 161 1544 eschrock 162 1544 eschrock /* 163 1544 eschrock * Return the number of errors currently in the error log. 
This is actually the 164 1544 eschrock * sum of both the last log and the current log, since we don't know the union 165 1544 eschrock * of these logs until we reach userland. 166 1544 eschrock */ 167 1544 eschrock uint64_t 168 1544 eschrock spa_get_errlog_size(spa_t *spa) 169 1544 eschrock { 170 1544 eschrock uint64_t total = 0, count; 171 1544 eschrock 172 1544 eschrock mutex_enter(&spa->spa_errlog_lock); 173 1544 eschrock if (spa->spa_errlog_scrub != 0 && 174 1544 eschrock zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, 175 1544 eschrock &count) == 0) 176 1544 eschrock total += count; 177 1544 eschrock 178 1544 eschrock if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && 179 1544 eschrock zap_count(spa->spa_meta_objset, spa->spa_errlog_last, 180 1544 eschrock &count) == 0) 181 1544 eschrock total += count; 182 1544 eschrock mutex_exit(&spa->spa_errlog_lock); 183 1544 eschrock 184 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 185 1544 eschrock total += avl_numnodes(&spa->spa_errlist_last); 186 1544 eschrock total += avl_numnodes(&spa->spa_errlist_scrub); 187 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 188 1544 eschrock 189 1544 eschrock return (total); 190 1544 eschrock } 191 1544 eschrock 192 1544 eschrock #ifdef _KERNEL 193 1544 eschrock static int 194 1544 eschrock process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) 195 1544 eschrock { 196 1544 eschrock zap_cursor_t zc; 197 1544 eschrock zap_attribute_t za; 198 1544 eschrock zbookmark_t zb; 199 1544 eschrock 200 1544 eschrock if (obj == 0) 201 1544 eschrock return (0); 202 1544 eschrock 203 1544 eschrock for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); 204 1544 eschrock zap_cursor_retrieve(&zc, &za) == 0; 205 1544 eschrock zap_cursor_advance(&zc)) { 206 1544 eschrock 207 1544 eschrock if (*count == 0) { 208 1544 eschrock zap_cursor_fini(&zc); 209 1544 eschrock return (ENOMEM); 210 1544 eschrock } 211 1544 eschrock 212 1544 eschrock name_to_bookmark(za.za_name, 
&zb); 213 1544 eschrock 214 1544 eschrock if (copyout(&zb, (char *)addr + 215 1544 eschrock (*count - 1) * sizeof (zbookmark_t), 216 1544 eschrock sizeof (zbookmark_t)) != 0) 217 1544 eschrock return (EFAULT); 218 1544 eschrock 219 1544 eschrock *count -= 1; 220 1544 eschrock } 221 1544 eschrock 222 1544 eschrock zap_cursor_fini(&zc); 223 1544 eschrock 224 1544 eschrock return (0); 225 1544 eschrock } 226 1544 eschrock 227 1544 eschrock static int 228 1544 eschrock process_error_list(avl_tree_t *list, void *addr, size_t *count) 229 1544 eschrock { 230 1544 eschrock spa_error_entry_t *se; 231 1544 eschrock 232 1544 eschrock for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { 233 1544 eschrock 234 1544 eschrock if (*count == 0) 235 1544 eschrock return (ENOMEM); 236 1544 eschrock 237 1544 eschrock if (copyout(&se->se_bookmark, (char *)addr + 238 1544 eschrock (*count - 1) * sizeof (zbookmark_t), 239 1544 eschrock sizeof (zbookmark_t)) != 0) 240 1544 eschrock return (EFAULT); 241 1544 eschrock 242 1544 eschrock *count -= 1; 243 1544 eschrock } 244 1544 eschrock 245 1544 eschrock return (0); 246 1544 eschrock } 247 1544 eschrock #endif 248 1544 eschrock 249 1544 eschrock /* 250 1544 eschrock * Copy all known errors to userland as an array of bookmarks. This is 251 1544 eschrock * actually a union of the on-disk last log and current log, as well as any 252 1544 eschrock * pending error requests. 253 1544 eschrock * 254 1544 eschrock * Because the act of reading the on-disk log could cause errors to be 255 1544 eschrock * generated, we have two separate locks: one for the error log and one for the 256 1544 eschrock * in-core error lists. We only need the error list lock to log and error, so 257 1544 eschrock * we grab the error log lock while we read the on-disk logs, and only pick up 258 1544 eschrock * the error list lock when we are finished. 
259 1544 eschrock */ 260 1544 eschrock int 261 1544 eschrock spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) 262 1544 eschrock { 263 1544 eschrock int ret = 0; 264 1544 eschrock 265 1544 eschrock #ifdef _KERNEL 266 1544 eschrock mutex_enter(&spa->spa_errlog_lock); 267 1544 eschrock 268 1544 eschrock ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); 269 1544 eschrock 270 1544 eschrock if (!ret && !spa->spa_scrub_finished) 271 1544 eschrock ret = process_error_log(spa, spa->spa_errlog_last, uaddr, 272 1544 eschrock count); 273 1544 eschrock 274 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 275 1544 eschrock if (!ret) 276 1544 eschrock ret = process_error_list(&spa->spa_errlist_scrub, uaddr, 277 1544 eschrock count); 278 1544 eschrock if (!ret) 279 1544 eschrock ret = process_error_list(&spa->spa_errlist_last, uaddr, 280 1544 eschrock count); 281 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 282 1544 eschrock 283 1544 eschrock mutex_exit(&spa->spa_errlog_lock); 284 1544 eschrock #endif 285 1544 eschrock 286 1544 eschrock return (ret); 287 1544 eschrock } 288 1544 eschrock 289 1544 eschrock /* 290 1544 eschrock * Called when a scrub completes. This simply set a bit which tells which AVL 291 1544 eschrock * tree to add new errors. spa_errlog_sync() is responsible for actually 292 1544 eschrock * syncing the changes to the underlying objects. 293 1544 eschrock */ 294 1544 eschrock void 295 1544 eschrock spa_errlog_rotate(spa_t *spa) 296 1544 eschrock { 297 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 298 1544 eschrock spa->spa_scrub_finished = B_TRUE; 299 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 300 1544 eschrock } 301 1544 eschrock 302 1544 eschrock /* 303 1544 eschrock * Discard any pending errors from the spa_t. Called when unloading a faulted 304 1544 eschrock * pool, as the errors encountered during the open cannot be synced to disk. 
305 1544 eschrock */ 306 1544 eschrock void 307 1544 eschrock spa_errlog_drain(spa_t *spa) 308 1544 eschrock { 309 1544 eschrock spa_error_entry_t *se; 310 1544 eschrock void *cookie; 311 1544 eschrock 312 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 313 1544 eschrock 314 1544 eschrock cookie = NULL; 315 1544 eschrock while ((se = avl_destroy_nodes(&spa->spa_errlist_last, 316 1544 eschrock &cookie)) != NULL) 317 1544 eschrock kmem_free(se, sizeof (spa_error_entry_t)); 318 1544 eschrock cookie = NULL; 319 1544 eschrock while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, 320 1544 eschrock &cookie)) != NULL) 321 1544 eschrock kmem_free(se, sizeof (spa_error_entry_t)); 322 1544 eschrock 323 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 324 1544 eschrock } 325 1544 eschrock 326 1544 eschrock /* 327 1544 eschrock * Process a list of errors into the current on-disk log. 328 1544 eschrock */ 329 1544 eschrock static void 330 1544 eschrock sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) 331 1544 eschrock { 332 1544 eschrock spa_error_entry_t *se; 333 1544 eschrock char buf[64]; 334 1544 eschrock void *cookie; 335 1544 eschrock 336 1544 eschrock if (avl_numnodes(t) != 0) { 337 1544 eschrock /* create log if necessary */ 338 1544 eschrock if (*obj == 0) 339 1544 eschrock *obj = zap_create(spa->spa_meta_objset, 340 1544 eschrock DMU_OT_ERROR_LOG, DMU_OT_NONE, 341 1544 eschrock 0, tx); 342 1544 eschrock 343 1544 eschrock /* add errors to the current log */ 344 1544 eschrock for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { 345 1544 eschrock char *name = se->se_name ? 
se->se_name : ""; 346 1544 eschrock 347 1544 eschrock bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); 348 1544 eschrock 349 1544 eschrock (void) zap_update(spa->spa_meta_objset, 350 1544 eschrock *obj, buf, 1, strlen(name) + 1, name, tx); 351 1544 eschrock } 352 1544 eschrock 353 1544 eschrock /* purge the error list */ 354 1544 eschrock cookie = NULL; 355 1544 eschrock while ((se = avl_destroy_nodes(t, &cookie)) != NULL) 356 1544 eschrock kmem_free(se, sizeof (spa_error_entry_t)); 357 1544 eschrock } 358 1544 eschrock } 359 1544 eschrock 360 1544 eschrock /* 361 1544 eschrock * Sync the error log out to disk. This is a little tricky because the act of 362 1544 eschrock * writing the error log requires the spa_errlist_lock. So, we need to lock the 363 1544 eschrock * error lists, take a copy of the lists, and then reinitialize them. Then, we 364 1544 eschrock * drop the error list lock and take the error log lock, at which point we 365 1544 eschrock * do the errlog processing. Then, if we encounter an I/O error during this 366 1544 eschrock * process, we can successfully add the error to the list. Note that this will 367 1544 eschrock * result in the perpetual recycling of errors, but it is an unlikely situation 368 1544 eschrock * and not a performance critical operation. 369 1544 eschrock */ 370 1544 eschrock void 371 1544 eschrock spa_errlog_sync(spa_t *spa, uint64_t txg) 372 1544 eschrock { 373 1544 eschrock dmu_tx_t *tx; 374 1544 eschrock avl_tree_t scrub, last; 375 1544 eschrock int scrub_finished; 376 1544 eschrock 377 1544 eschrock mutex_enter(&spa->spa_errlist_lock); 378 1544 eschrock 379 1544 eschrock /* 380 1544 eschrock * Bail out early under normal circumstances. 
381 1544 eschrock */ 382 1544 eschrock if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && 383 1544 eschrock avl_numnodes(&spa->spa_errlist_last) == 0 && 384 1544 eschrock !spa->spa_scrub_finished) { 385 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 386 1544 eschrock return; 387 1544 eschrock } 388 1544 eschrock 389 1544 eschrock spa_get_errlists(spa, &last, &scrub); 390 1544 eschrock scrub_finished = spa->spa_scrub_finished; 391 1544 eschrock spa->spa_scrub_finished = B_FALSE; 392 1544 eschrock 393 1544 eschrock mutex_exit(&spa->spa_errlist_lock); 394 1544 eschrock mutex_enter(&spa->spa_errlog_lock); 395 1544 eschrock 396 1544 eschrock tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 397 1544 eschrock 398 1544 eschrock /* 399 1544 eschrock * Sync out the current list of errors. 400 1544 eschrock */ 401 1544 eschrock sync_error_list(spa, &last, &spa->spa_errlog_last, tx); 402 1544 eschrock 403 1544 eschrock /* 404 1544 eschrock * Rotate the log if necessary. 405 1544 eschrock */ 406 1544 eschrock if (scrub_finished) { 407 1544 eschrock if (spa->spa_errlog_last != 0) 408 1544 eschrock VERIFY(dmu_object_free(spa->spa_meta_objset, 409 1544 eschrock spa->spa_errlog_last, tx) == 0); 410 1544 eschrock spa->spa_errlog_last = spa->spa_errlog_scrub; 411 1544 eschrock spa->spa_errlog_scrub = 0; 412 1544 eschrock 413 1544 eschrock sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); 414 1544 eschrock } 415 1544 eschrock 416 1544 eschrock /* 417 1544 eschrock * Sync out any pending scrub errors. 418 1544 eschrock */ 419 1544 eschrock sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); 420 1544 eschrock 421 1544 eschrock /* 422 1544 eschrock * Update the MOS to reflect the new values. 
423 1544 eschrock */ 424 1544 eschrock (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 425 1544 eschrock DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, 426 1544 eschrock &spa->spa_errlog_last, tx); 427 1544 eschrock (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 428 1544 eschrock DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, 429 1544 eschrock &spa->spa_errlog_scrub, tx); 430 1544 eschrock 431 1544 eschrock dmu_tx_commit(tx); 432 1544 eschrock 433 1544 eschrock mutex_exit(&spa->spa_errlog_lock); 434 1544 eschrock } 435