Home | History | Annotate | Download | only in zfs-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <assert.h>
     27 #include <stddef.h>
     28 #include <strings.h>
     29 #include <libuutil.h>
     30 #include <libzfs.h>
     31 #include <fm/fmd_api.h>
     32 #include <fm/libtopo.h>
     33 #include <sys/fs/zfs.h>
     34 #include <sys/fm/protocol.h>
     35 #include <sys/fm/fs/zfs.h>
     36 
/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
 * #define reserves enough space for two 64-bit hex values (16 digits each)
 * plus the length of the longest fixed text.  Because sizeof is applied to
 * the string literal, the terminating NUL is included in the total.
 *
 * This value sizes the name arrays inside zfs_case_data_t, so changing it
 * changes the size of that structure.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
     43 
/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 *
 * The structure is written to and read from the CASE_DATA buffer as a raw
 * block of sizeof (zfs_case_data_t) bytes, so existing members must not be
 * reordered or resized.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;	/* CASE_DATA_VERSION_* value */
	uint64_t	zc_ena;		/* ENA of the ereport opening the case */
	uint64_t	zc_pool_guid;	/* pool this case is about */
	uint64_t	zc_vdev_guid;	/* vdev, or 0 for a pool-level case */
	int		zc_has_timer;		/* defunct */
	int		zc_pool_state;	/* pool context when case was opened */
	char		zc_serd_checksum[MAX_SERDLEN];	/* "" until created */
	char		zc_serd_io[MAX_SERDLEN];	/* "" until created */
	int		zc_has_remove_timer;	/* nonzero while timer pending */
} zfs_case_data_t;
     60 
/*
 * In-core case structure.  One of these is kept on the zfs_cases list for
 * each case and is attached to the fmd case as its case-specific data.
 */
typedef struct zfs_case {
	boolean_t	zc_present;	/* scratch flag for zfs_purge_cases() */
	uint32_t	zc_version;	/* unused in the code visible here */
	zfs_case_data_t	zc_data;	/* persistent state (CASE_DATA buffer) */
	fmd_case_t	*zc_case;	/* the fmd case this describes */
	uu_list_node_t	zc_node;	/* linkage on zfs_cases */
	id_t		zc_remove_timer; /* id from fmd_timer_install() */
	char		*zc_fru;	/* FRU FMRI string, or NULL if none */
} zfs_case_t;
     73 
/* Names of the per-case fmd buffers that hold the persistent case state. */
#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
/* Values stored in zfs_case_data_t.zc_version. */
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

/*
 * Delay used for the per-case remove timer; loaded from the "remove_timeout"
 * module property in _fmd_init().
 */
static hrtime_t zfs_remove_timeout;

/* List pool and list holding every in-core zfs_case_t. */
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

/*
 * Build a full event class name at compile time by string-literal
 * concatenation; 'type' must itself be a string literal.
 */
#define	ZFS_MAKE_RSRC(type)	\
    FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
    FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
     88 
     89 /*
     90  * Write out the persistent representation of an active case.
     91  */
     92 static void
     93 zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
     94 {
     95 	/*
     96 	 * Always update cases to the latest version, even if they were the
     97 	 * previous version when unserialized.
     98 	 */
     99 	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
    100 	fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data,
    101 	    sizeof (zcp->zc_data));
    102 
    103 	if (zcp->zc_fru != NULL)
    104 		fmd_buf_write(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
    105 		    strlen(zcp->zc_fru));
    106 }
    107 
    108 /*
    109  * Read back the persistent representation of an active case.
    110  */
    111 static zfs_case_t *
    112 zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
    113 {
    114 	zfs_case_t *zcp;
    115 	size_t frulen;
    116 
    117 	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
    118 	zcp->zc_case = cp;
    119 
    120 	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
    121 	    sizeof (zcp->zc_data));
    122 
    123 	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
    124 		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
    125 		return (NULL);
    126 	}
    127 
    128 	if ((frulen = fmd_buf_size(hdl, zcp->zc_case, CASE_FRU)) > 0) {
    129 		zcp->zc_fru = fmd_hdl_alloc(hdl, frulen + 1, FMD_SLEEP);
    130 		fmd_buf_read(hdl, zcp->zc_case, CASE_FRU, zcp->zc_fru,
    131 		    frulen);
    132 		zcp->zc_fru[frulen] = '\0';
    133 	}
    134 
    135 	/*
    136 	 * fmd_buf_read() will have already zeroed out the remainder of the
    137 	 * buffer, so we don't have to do anything special if the version
    138 	 * doesn't include the SERD engine name.
    139 	 */
    140 
    141 	if (zcp->zc_data.zc_has_remove_timer)
    142 		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
    143 		    NULL, zfs_remove_timeout);
    144 
    145 	(void) uu_list_insert_before(zfs_cases, NULL, zcp);
    146 
    147 	fmd_case_setspecific(hdl, cp, zcp);
    148 
    149 	return (zcp);
    150 }
    151 
    152 /*
    153  * Iterate over any active cases.  If any cases are associated with a pool or
    154  * vdev which is no longer present on the system, close the associated case.
    155  */
    156 static void
    157 zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd)
    158 {
    159 	uint64_t vdev_guid;
    160 	uint_t c, children;
    161 	nvlist_t **child;
    162 	zfs_case_t *zcp;
    163 	int ret;
    164 
    165 	ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
    166 	assert(ret == 0);
    167 
    168 	/*
    169 	 * Mark any cases associated with this (pool, vdev) pair.
    170 	 */
    171 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
    172 	    zcp = uu_list_next(zfs_cases, zcp)) {
    173 		if (zcp->zc_data.zc_pool_guid == pool_guid &&
    174 		    zcp->zc_data.zc_vdev_guid == vdev_guid)
    175 			zcp->zc_present = B_TRUE;
    176 	}
    177 
    178 	/*
    179 	 * Iterate over all children.
    180 	 */
    181 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
    182 	    &children) == 0) {
    183 		for (c = 0; c < children; c++)
    184 			zfs_mark_vdev(pool_guid, child[c]);
    185 	}
    186 
    187 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
    188 	    &children) == 0) {
    189 		for (c = 0; c < children; c++)
    190 			zfs_mark_vdev(pool_guid, child[c]);
    191 	}
    192 
    193 	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
    194 	    &children) == 0) {
    195 		for (c = 0; c < children; c++)
    196 			zfs_mark_vdev(pool_guid, child[c]);
    197 	}
    198 }
    199 
    200 /*ARGSUSED*/
    201 static int
    202 zfs_mark_pool(zpool_handle_t *zhp, void *unused)
    203 {
    204 	zfs_case_t *zcp;
    205 	uint64_t pool_guid;
    206 	nvlist_t *config, *vd;
    207 	int ret;
    208 
    209 	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
    210 	/*
    211 	 * Mark any cases associated with just this pool.
    212 	 */
    213 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
    214 	    zcp = uu_list_next(zfs_cases, zcp)) {
    215 		if (zcp->zc_data.zc_pool_guid == pool_guid &&
    216 		    zcp->zc_data.zc_vdev_guid == 0)
    217 			zcp->zc_present = B_TRUE;
    218 	}
    219 
    220 	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
    221 		zpool_close(zhp);
    222 		return (-1);
    223 	}
    224 
    225 	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
    226 	assert(ret == 0);
    227 
    228 	zfs_mark_vdev(pool_guid, vd);
    229 
    230 	zpool_close(zhp);
    231 
    232 	return (0);
    233 }
    234 
    235 static void
    236 zfs_purge_cases(fmd_hdl_t *hdl)
    237 {
    238 	zfs_case_t *zcp;
    239 	uu_list_walk_t *walk;
    240 	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
    241 
    242 	/*
    243 	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.  No
    244 	 * matter what we do, we're going to have to stomach a O(vdevs * cases)
    245 	 * algorithm.  In reality, both quantities are likely so small that
    246 	 * neither will matter. Given that iterating over pools is more
    247 	 * expensive than iterating over the in-memory case list, we opt for a
    248 	 * 'present' flag in each case that starts off cleared.  We then iterate
    249 	 * over all pools, marking those that are still present, and removing
    250 	 * those that aren't found.
    251 	 *
    252 	 * Note that we could also construct an FMRI and rely on
    253 	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
    254 	 */
    255 
    256 	/*
    257 	 * Mark the cases an not present.
    258 	 */
    259 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
    260 	    zcp = uu_list_next(zfs_cases, zcp))
    261 		zcp->zc_present = B_FALSE;
    262 
    263 	/*
    264 	 * Iterate over all pools and mark the pools and vdevs found.  If this
    265 	 * fails (most probably because we're out of memory), then don't close
    266 	 * any of the cases and we cannot be sure they are accurate.
    267 	 */
    268 	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
    269 		return;
    270 
    271 	/*
    272 	 * Remove those cases which were not found.
    273 	 */
    274 	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
    275 	while ((zcp = uu_list_walk_next(walk)) != NULL) {
    276 		if (!zcp->zc_present)
    277 			fmd_case_close(hdl, zcp->zc_case);
    278 	}
    279 	uu_list_walk_end(walk);
    280 }
    281 
    282 /*
    283  * Construct the name of a serd engine given the pool/vdev GUID and type (io or
    284  * checksum).
    285  */
    286 static void
    287 zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    288     const char *type)
    289 {
    290 	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool_guid,
    291 	    vdev_guid, type);
    292 }
    293 
    294 /*
    295  * Solve a given ZFS case.  This first checks to make sure the diagnosis is
    296  * still valid, as well as cleaning up any pending timer associated with the
    297  * case.
    298  */
    299 static void
    300 zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    301     boolean_t checkunusable)
    302 {
    303 	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
    304 	nvlist_t *detector, *fault;
    305 	boolean_t serialize;
    306 	nvlist_t *fmri, *fru;
    307 	topo_hdl_t *thp;
    308 	int err;
    309 
    310 	/*
    311 	 * Construct the detector from the case data.  The detector is in the
    312 	 * ZFS scheme, and is either the pool or the vdev, depending on whether
    313 	 * this is a vdev or pool fault.
    314 	 */
    315 	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
    316 
    317 	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
    318 	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
    319 	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
    320 	    zcp->zc_data.zc_pool_guid);
    321 	if (zcp->zc_data.zc_vdev_guid != 0) {
    322 		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
    323 		    zcp->zc_data.zc_vdev_guid);
    324 	}
    325 
    326 	/*
    327 	 * We also want to make sure that the detector (pool or vdev) properly
    328 	 * reflects the diagnosed state, when the fault corresponds to internal
    329 	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
    330 	 * device which was unavailable early in boot (because the driver/file
    331 	 * wasn't available) and is now healthy will be mis-diagnosed.
    332 	 */
    333 	if (!fmd_nvl_fmri_present(hdl, detector) ||
    334 	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
    335 		fmd_case_close(hdl, zcp->zc_case);
    336 		nvlist_free(detector);
    337 		return;
    338 	}
    339 
    340 
    341 	fru = NULL;
    342 	if (zcp->zc_fru != NULL &&
    343 	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
    344 		/*
    345 		 * If the vdev had an associated FRU, then get the FRU nvlist
    346 		 * from the topo handle and use that in the suspect list.  We
    347 		 * explicitly lookup the FRU because the fmri reported from the
    348 		 * kernel may not have up to date details about the disk itself
    349 		 * (serial, part, etc).
    350 		 */
    351 		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
    352 			/*
    353 			 * If the disk is part of the system chassis, but the
    354 			 * FRU indicates a different chassis ID than our
    355 			 * current system, then ignore the error.  This
    356 			 * indicates that the device was part of another
    357 			 * cluster head, and for obvious reasons cannot be
    358 			 * imported on this system.
    359 			 */
    360 			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
    361 				fmd_case_close(hdl, zcp->zc_case);
    362 				nvlist_free(fmri);
    363 				fmd_hdl_topo_rele(hdl, thp);
    364 				nvlist_free(detector);
    365 				return;
    366 			}
    367 
    368 			/*
    369 			 * If the device is no longer present on the system, or
    370 			 * topo_fmri_fru() fails for other reasons, then fall
    371 			 * back to the fmri specified in the vdev.
    372 			 */
    373 			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
    374 				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
    375 			nvlist_free(fmri);
    376 		}
    377 
    378 		fmd_hdl_topo_rele(hdl, thp);
    379 	}
    380 
    381 	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
    382 	    fru, detector);
    383 	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
    384 
    385 	nvlist_free(fru);
    386 
    387 	fmd_case_solve(hdl, zcp->zc_case);
    388 
    389 	serialize = B_FALSE;
    390 	if (zcp->zc_data.zc_has_remove_timer) {
    391 		fmd_timer_remove(hdl, zcp->zc_remove_timer);
    392 		zcp->zc_data.zc_has_remove_timer = 0;
    393 		serialize = B_TRUE;
    394 	}
    395 	if (serialize)
    396 		zfs_case_serialize(hdl, zcp);
    397 
    398 	nvlist_free(detector);
    399 }
    400 
    401 /*
    402  * Main fmd entry point.
    403  */
    404 /*ARGSUSED*/
    405 static void
    406 zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
    407 {
    408 	zfs_case_t *zcp, *dcp;
    409 	int32_t pool_state;
    410 	uint64_t ena, pool_guid, vdev_guid;
    411 	nvlist_t *detector;
    412 	boolean_t isresource;
    413 	char *fru, *type;
    414 
    415 	/*
    416 	 * We subscribe to notifications for vdev or pool removal.  In these
    417 	 * cases, there may be cases that no longer apply.  Purge any cases
    418 	 * that no longer apply.
    419 	 */
    420 	if (fmd_nvl_class_match(hdl, nvl, "resource.sysevent.EC_zfs.*")) {
    421 		zfs_purge_cases(hdl);
    422 		return;
    423 	}
    424 
    425 	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
    426 
    427 	if (isresource) {
    428 		/*
    429 		 * For resources, we don't have a normal payload.
    430 		 */
    431 		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
    432 		    &vdev_guid) != 0)
    433 			pool_state = SPA_LOAD_OPEN;
    434 		else
    435 			pool_state = SPA_LOAD_NONE;
    436 		detector = NULL;
    437 	} else {
    438 		(void) nvlist_lookup_nvlist(nvl,
    439 		    FM_EREPORT_DETECTOR, &detector);
    440 		(void) nvlist_lookup_int32(nvl,
    441 		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
    442 	}
    443 
    444 	/*
    445 	 * We also ignore all ereports generated during an import of a pool,
    446 	 * since the only possible fault (.pool) would result in import failure,
    447 	 * and hence no persistent fault.  Some day we may want to do something
    448 	 * with these ereports, so we continue generating them internally.
    449 	 */
    450 	if (pool_state == SPA_LOAD_IMPORT)
    451 		return;
    452 
    453 	/*
    454 	 * Device I/O errors are ignored during pool open.
    455 	 */
    456 	if (pool_state == SPA_LOAD_OPEN &&
    457 	    (fmd_nvl_class_match(hdl, nvl,
    458 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
    459 	    fmd_nvl_class_match(hdl, nvl,
    460 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
    461 	    fmd_nvl_class_match(hdl, nvl,
    462 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))))
    463 		return;
    464 
    465 	/*
    466 	 * We ignore ereports for anything except disks and files.
    467 	 */
    468 	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
    469 	    &type) == 0) {
    470 		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
    471 		    strcmp(type, VDEV_TYPE_FILE) != 0)
    472 			return;
    473 	}
    474 
    475 	/*
    476 	 * Determine if this ereport corresponds to an open case.  Previous
    477 	 * incarnations of this DE used the ENA to chain events together as
    478 	 * part of the same case.  The problem with this is that we rely on
    479 	 * global uniqueness of cases based on (pool_guid, vdev_guid) pair when
    480 	 * generating SERD engines.  Instead, we have a case for each vdev or
    481 	 * pool, regardless of the ENA.
    482 	 */
    483 	(void) nvlist_lookup_uint64(nvl,
    484 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
    485 	if (nvlist_lookup_uint64(nvl,
    486 	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
    487 		vdev_guid = 0;
    488 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
    489 		ena = 0;
    490 
    491 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
    492 	    zcp = uu_list_next(zfs_cases, zcp)) {
    493 		if (zcp->zc_data.zc_pool_guid == pool_guid &&
    494 		    zcp->zc_data.zc_vdev_guid == vdev_guid)
    495 			break;
    496 	}
    497 
    498 	if (zcp == NULL) {
    499 		fmd_case_t *cs;
    500 		zfs_case_data_t data = { 0 };
    501 
    502 		/*
    503 		 * If this is one of our 'fake' resource ereports, and there is
    504 		 * no case open, simply discard it.
    505 		 */
    506 		if (isresource)
    507 			return;
    508 
    509 		/*
    510 		 * Open a new case.
    511 		 */
    512 		cs = fmd_case_open(hdl, NULL);
    513 
    514 		/*
    515 		 * Initialize the case buffer.  To commonize code, we actually
    516 		 * create the buffer with existing data, and then call
    517 		 * zfs_case_unserialize() to instantiate the in-core structure.
    518 		 */
    519 		fmd_buf_create(hdl, cs, CASE_DATA,
    520 		    sizeof (zfs_case_data_t));
    521 
    522 		data.zc_version = CASE_DATA_VERSION_SERD;
    523 		data.zc_ena = ena;
    524 		data.zc_pool_guid = pool_guid;
    525 		data.zc_vdev_guid = vdev_guid;
    526 		data.zc_pool_state = (int)pool_state;
    527 
    528 		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
    529 
    530 		zcp = zfs_case_unserialize(hdl, cs);
    531 		assert(zcp != NULL);
    532 	}
    533 
    534 	/*
    535 	 * If this is an ereport for a case with an associated vdev FRU, make
    536 	 * sure it is accurate and up to date.
    537 	 */
    538 	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
    539 	    &fru) == 0) {
    540 		topo_hdl_t *thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
    541 		if (zcp->zc_fru == NULL ||
    542 		    !topo_fmri_strcmp(thp, zcp->zc_fru, fru)) {
    543 			if (zcp->zc_fru != NULL) {
    544 				fmd_hdl_strfree(hdl, zcp->zc_fru);
    545 				fmd_buf_destroy(hdl, zcp->zc_case, CASE_FRU);
    546 			}
    547 			zcp->zc_fru = fmd_hdl_strdup(hdl, fru, FMD_SLEEP);
    548 			zfs_case_serialize(hdl, zcp);
    549 		}
    550 		fmd_hdl_topo_rele(hdl, thp);
    551 	}
    552 
    553 	if (isresource) {
    554 		if (fmd_nvl_class_match(hdl, nvl,
    555 		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
    556 			/*
    557 			 * The 'resource.fs.zfs.autoreplace' event indicates
    558 			 * that the pool was loaded with the 'autoreplace'
    559 			 * property set.  In this case, any pending device
    560 			 * failures should be ignored, as the asynchronous
    561 			 * autoreplace handling will take care of them.
    562 			 */
    563 			fmd_case_close(hdl, zcp->zc_case);
    564 		} else if (fmd_nvl_class_match(hdl, nvl,
    565 		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
    566 			/*
    567 			 * The 'resource.fs.zfs.removed' event indicates that
    568 			 * device removal was detected, and the device was
    569 			 * closed asynchronously.  If this is the case, we
    570 			 * assume that any recent I/O errors were due to the
    571 			 * device removal, not any fault of the device itself.
    572 			 * We reset the SERD engine, and cancel any pending
    573 			 * timers.
    574 			 */
    575 			if (zcp->zc_data.zc_has_remove_timer) {
    576 				fmd_timer_remove(hdl, zcp->zc_remove_timer);
    577 				zcp->zc_data.zc_has_remove_timer = 0;
    578 				zfs_case_serialize(hdl, zcp);
    579 			}
    580 			if (zcp->zc_data.zc_serd_io[0] != '\0')
    581 				fmd_serd_reset(hdl,
    582 				    zcp->zc_data.zc_serd_io);
    583 			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
    584 				fmd_serd_reset(hdl,
    585 				    zcp->zc_data.zc_serd_checksum);
    586 		}
    587 		return;
    588 	}
    589 
    590 	/*
    591 	 * Associate the ereport with this case.
    592 	 */
    593 	fmd_case_add_ereport(hdl, zcp->zc_case, ep);
    594 
    595 	/*
    596 	 * Don't do anything else if this case is already solved.
    597 	 */
    598 	if (fmd_case_solved(hdl, zcp->zc_case))
    599 		return;
    600 
    601 	/*
    602 	 * Determine if we should solve the case and generate a fault.  We solve
    603 	 * a case if:
    604 	 *
    605 	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
    606 	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
    607 	 *	   was up and running.
    608 	 *
    609 	 * We may see a series of ereports associated with a pool open, all
    610 	 * chained together by the same ENA.  If the pool open succeeds, then
    611 	 * we'll see no further ereports.  To detect when a pool open has
    612 	 * succeeded, we associate a timer with the event.  When it expires, we
    613 	 * close the case.
    614 	 */
    615 	if (fmd_nvl_class_match(hdl, nvl,
    616 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
    617 		/*
    618 		 * Pool level fault.  Before solving the case, go through and
    619 		 * close any open device cases that may be pending.
    620 		 */
    621 		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
    622 		    dcp = uu_list_next(zfs_cases, dcp)) {
    623 			if (dcp->zc_data.zc_pool_guid ==
    624 			    zcp->zc_data.zc_pool_guid &&
    625 			    dcp->zc_data.zc_vdev_guid != 0)
    626 				fmd_case_close(hdl, dcp->zc_case);
    627 		}
    628 
    629 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
    630 	} else if (fmd_nvl_class_match(hdl, nvl,
    631 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
    632 		/*
    633 		 * Pool level fault for reading the intent logs.
    634 		 */
    635 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
    636 	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
    637 		/*
    638 		 * Device fault.
    639 		 */
    640 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device",  B_TRUE);
    641 	} else if (fmd_nvl_class_match(hdl, nvl,
    642 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
    643 	    fmd_nvl_class_match(hdl, nvl,
    644 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
    645 	    fmd_nvl_class_match(hdl, nvl,
    646 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
    647 	    fmd_nvl_class_match(hdl, nvl,
    648 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
    649 		char *failmode = NULL;
    650 		boolean_t checkremove = B_FALSE;
    651 
    652 		/*
    653 		 * If this is a checksum or I/O error, then toss it into the
    654 		 * appropriate SERD engine and check to see if it has fired.
    655 		 * Ideally, we want to do something more sophisticated,
    656 		 * (persistent errors for a single data block, etc).  For now,
    657 		 * a single SERD engine is sufficient.
    658 		 */
    659 		if (fmd_nvl_class_match(hdl, nvl,
    660 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
    661 			if (zcp->zc_data.zc_serd_io[0] == '\0') {
    662 				zfs_serd_name(zcp->zc_data.zc_serd_io,
    663 				    pool_guid, vdev_guid, "io");
    664 				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
    665 				    fmd_prop_get_int32(hdl, "io_N"),
    666 				    fmd_prop_get_int64(hdl, "io_T"));
    667 				zfs_case_serialize(hdl, zcp);
    668 			}
    669 			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
    670 				checkremove = B_TRUE;
    671 		} else if (fmd_nvl_class_match(hdl, nvl,
    672 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
    673 			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
    674 				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
    675 				    pool_guid, vdev_guid, "checksum");
    676 				fmd_serd_create(hdl,
    677 				    zcp->zc_data.zc_serd_checksum,
    678 				    fmd_prop_get_int32(hdl, "checksum_N"),
    679 				    fmd_prop_get_int64(hdl, "checksum_T"));
    680 				zfs_case_serialize(hdl, zcp);
    681 			}
    682 			if (fmd_serd_record(hdl,
    683 			    zcp->zc_data.zc_serd_checksum, ep)) {
    684 				zfs_case_solve(hdl, zcp,
    685 				    "fault.fs.zfs.vdev.checksum", B_FALSE);
    686 			}
    687 		} else if (fmd_nvl_class_match(hdl, nvl,
    688 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
    689 		    (nvlist_lookup_string(nvl,
    690 		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
    691 		    failmode != NULL) {
    692 			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
    693 			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
    694 				zfs_case_solve(hdl, zcp,
    695 				    "fault.fs.zfs.io_failure_continue",
    696 				    B_FALSE);
    697 			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
    698 			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
    699 				zfs_case_solve(hdl, zcp,
    700 				    "fault.fs.zfs.io_failure_wait", B_FALSE);
    701 			}
    702 		} else if (fmd_nvl_class_match(hdl, nvl,
    703 		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
    704 			checkremove = B_TRUE;
    705 		}
    706 
    707 		/*
    708 		 * Because I/O errors may be due to device removal, we postpone
    709 		 * any diagnosis until we're sure that we aren't about to
    710 		 * receive a 'resource.fs.zfs.removed' event.
    711 		 */
    712 		if (checkremove) {
    713 			if (zcp->zc_data.zc_has_remove_timer)
    714 				fmd_timer_remove(hdl, zcp->zc_remove_timer);
    715 			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
    716 			    zfs_remove_timeout);
    717 			if (!zcp->zc_data.zc_has_remove_timer) {
    718 				zcp->zc_data.zc_has_remove_timer = 1;
    719 				zfs_case_serialize(hdl, zcp);
    720 			}
    721 		}
    722 	}
    723 }
    724 
    725 /*
    726  * The timeout is fired when we diagnosed an I/O error, and it was not due to
    727  * device removal (which would cause the timeout to be cancelled).
    728  */
    729 /* ARGSUSED */
    730 static void
    731 zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
    732 {
    733 	zfs_case_t *zcp = data;
    734 
    735 	if (id == zcp->zc_remove_timer)
    736 		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
    737 }
    738 
    739 static void
    740 zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
    741 {
    742 	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
    743 
    744 	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
    745 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
    746 	if (zcp->zc_data.zc_serd_io[0] != '\0')
    747 		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
    748 	if (zcp->zc_data.zc_has_remove_timer)
    749 		fmd_timer_remove(hdl, zcp->zc_remove_timer);
    750 	uu_list_remove(zfs_cases, zcp);
    751 	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
    752 }
    753 
/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 * All of the work is done by zfs_purge_cases().
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}
    763 
/* Entry points handed to fmd when the module registers. */
static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

/*
 * Module properties and their default values.  The *_N and *_T pairs are
 * passed to fmd_serd_create() for the checksum and io SERD engines;
 * remove_timeout is the delay used for the per-case remove timer.
 */
static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};

/* Module description, version string, entry points and properties. */
static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};
    784 
    785 void
    786 _fmd_init(fmd_hdl_t *hdl)
    787 {
    788 	fmd_case_t *cp;
    789 	libzfs_handle_t *zhdl;
    790 
    791 	if ((zhdl = libzfs_init()) == NULL)
    792 		return;
    793 
    794 	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
    795 	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
    796 	    NULL, 0)) == NULL) {
    797 		libzfs_fini(zhdl);
    798 		return;
    799 	}
    800 
    801 	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) {
    802 		uu_list_pool_destroy(zfs_case_pool);
    803 		libzfs_fini(zhdl);
    804 		return;
    805 	}
    806 
    807 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
    808 		uu_list_destroy(zfs_cases);
    809 		uu_list_pool_destroy(zfs_case_pool);
    810 		libzfs_fini(zhdl);
    811 		return;
    812 	}
    813 
    814 	fmd_hdl_setspecific(hdl, zhdl);
    815 
    816 	/*
    817 	 * Iterate over all active cases and unserialize the associated buffers,
    818 	 * adding them to our list of open cases.
    819 	 */
    820 	for (cp = fmd_case_next(hdl, NULL);
    821 	    cp != NULL; cp = fmd_case_next(hdl, cp))
    822 		(void) zfs_case_unserialize(hdl, cp);
    823 
    824 	/*
    825 	 * Clear out any old cases that are no longer valid.
    826 	 */
    827 	zfs_purge_cases(hdl);
    828 
    829 	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
    830 }
    831 
    832 void
    833 _fmd_fini(fmd_hdl_t *hdl)
    834 {
    835 	zfs_case_t *zcp;
    836 	uu_list_walk_t *walk;
    837 	libzfs_handle_t *zhdl;
    838 
    839 	/*
    840 	 * Remove all active cases.
    841 	 */
    842 	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
    843 	while ((zcp = uu_list_walk_next(walk)) != NULL) {
    844 		uu_list_remove(zfs_cases, zcp);
    845 		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
    846 	}
    847 	uu_list_walk_end(walk);
    848 
    849 	uu_list_destroy(zfs_cases);
    850 	uu_list_pool_destroy(zfs_case_pool);
    851 
    852 	zhdl = fmd_hdl_getspecific(hdl);
    853 	libzfs_fini(zhdl);
    854 }
    855