Home | History | Annotate | Download | only in zfs_mod
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * ZFS syseventd module.
     30  *
     31  * The purpose of this module is to identify when devices are added to the
     32  * system, and appropriately online or replace the affected vdevs.
     33  *
     34  * When a device is added to the system:
     35  *
     36  * 	1. Search for any vdevs whose devid matches that of the newly added
     37  *	   device.
     38  *
     39  * 	2. If no vdevs are found, then search for any vdevs whose devfs path
     40  *	   matches that of the new device.
     41  *
     42  *	3. If no vdevs match by either method, then ignore the event.
     43  *
     44  * 	4. Attempt to online the device with a flag to indicate that it should
     45  *	   be unspared when resilvering completes.  If this succeeds, then the
     46  *	   same device was inserted and we should continue normally.
     47  *
     48  *	5. If the pool does not have the 'autoreplace' property set, attempt to
     49  *	   online the device again without the unspare flag, which will
     50  *	   generate an FMA fault.
     51  *
     52  *	6. If the pool has the 'autoreplace' property set, and the matching vdev
     53  *	   is a whole disk, then label the new disk and attempt a 'zpool
     54  *	   replace'.
     55  *
     56  * The module responds to EC_DEV_ADD events for both disks and lofi devices,
     57  * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
     58  * indicates that a device failed to open during pool load, but the autoreplace
     59  * property was set.  In this case, we deferred the associated FMA fault until
     60  * our module had a chance to process the autoreplace logic.  If the device
     61  * could not be replaced, then the second online attempt will trigger the FMA
     62  * fault that we skipped earlier.
     63  */
     64 
     65 #include <alloca.h>
     66 #include <devid.h>
     67 #include <fcntl.h>
     68 #include <libnvpair.h>
     69 #include <libsysevent.h>
     70 #include <libzfs.h>
     71 #include <limits.h>
     72 #include <stdlib.h>
     73 #include <string.h>
     74 #include <syslog.h>
     75 #include <sys/sunddi.h>
     76 #include <sys/sysevent/eventdefs.h>
     77 #include <sys/sysevent/dev.h>
     78 #include <unistd.h>
     79 
     80 #if defined(__i386) || defined(__amd64)
     81 #define	PHYS_PATH	":q"
     82 #define	RAW_SLICE	"p0"
     83 #elif defined(__sparc)
     84 #define	PHYS_PATH	":c"
     85 #define	RAW_SLICE	"s2"
     86 #else
     87 #error Unknown architecture
     88 #endif
     89 
     90 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
     91 
     92 libzfs_handle_t *g_zfshdl;
     93 
     94 /*
     95  * The device associated with the given vdev (either by devid or physical path)
     96  * has been added to the system.  If 'isdisk' is set, then we only attempt a
     97  * replacement if it's a whole disk.  This also implies that we should label the
     98  * disk first.
     99  *
    100  * First, we attempt to online the device (making sure to undo any spare
    101  * operation when finished).  If this succeeds, then we're done.  If it fails,
    102  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
    103  * but that the label was not what we expected.  If the 'autoreplace' property
    104  * is not set, then we relabel the disk (if specified), and attempt a 'zpool
    105  * replace'.  If the online is successful, but the new state is something else
    106  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
    107  * race, and we should avoid attempting to relabel the disk.
    108  */
    109 static void
    110 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
    111 {
    112 	char *path;
    113 	vdev_state_t newstate;
    114 	nvlist_t *nvroot, *newvd;
    115 	uint64_t wholedisk = 0ULL;
    116 	char *physpath = NULL;
    117 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
    118 	size_t len;
    119 
    120 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
    121 		return;
    122 
    123 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
    124 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
    125 
    126 	/*
    127 	 * We should have a way to online a device by guid.  With the current
    128 	 * interface, we are forced to chop off the 's0' for whole disks.
    129 	 */
    130 	(void) strlcpy(fullpath, path, sizeof (fullpath));
    131 	if (wholedisk)
    132 		fullpath[strlen(fullpath) - 2] = '\0';
    133 
    134 	/*
    135 	 * Attempt to online the device.  It would be nice to online this by
    136 	 * GUID, but the current interface only supports lookup by path.
    137 	 */
    138 	if (zpool_vdev_online(zhp, fullpath,
    139 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
    140 	    (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED))
    141 		return;
    142 
    143 	/*
    144 	 * If the pool doesn't have the autoreplace property set, then attempt a
    145 	 * true online (without the unspare flag), which will trigger a FMA
    146 	 * fault.
    147 	 */
    148 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
    149 	    (isdisk && !wholedisk)) {
    150 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
    151 		    &newstate);
    152 		return;
    153 	}
    154 
    155 	if (isdisk) {
    156 		/*
    157 		 * If this is a request to label a whole disk, then attempt to
    158 		 * write out the label.  Before we can label the disk, we need
    159 		 * access to a raw node.  Ideally, we'd like to walk the devinfo
    160 		 * tree and find a raw node from the corresponding parent node.
    161 		 * This is overly complicated, and since we know how we labeled
    162 		 * this device in the first place, we know it's save to switch
    163 		 * from /dev/dsk to /dev/rdsk and append the backup slice.
    164 		 *
    165 		 * If any part of this process fails, then do a force online to
    166 		 * trigger a ZFS fault for the device (and any hot spare
    167 		 * replacement).
    168 		 */
    169 		if (strncmp(path, "/dev/dsk/", 9) != 0) {
    170 			(void) zpool_vdev_online(zhp, fullpath,
    171 			    ZFS_ONLINE_FORCEFAULT, &newstate);
    172 			return;
    173 		}
    174 
    175 		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
    176 		len = strlen(rawpath);
    177 		rawpath[len - 2] = '\0';
    178 
    179 		if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) {
    180 			(void) zpool_vdev_online(zhp, fullpath,
    181 			    ZFS_ONLINE_FORCEFAULT, &newstate);
    182 			return;
    183 		}
    184 	}
    185 
    186 	/*
    187 	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
    188 	 * the entire vdev structure is harmless, we construct a reduced set of
    189 	 * path/physpath/wholedisk to keep it simple.
    190 	 */
    191 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
    192 		return;
    193 
    194 	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
    195 		nvlist_free(nvroot);
    196 		return;
    197 	}
    198 
    199 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
    200 	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
    201 	    (physpath != NULL && nvlist_add_string(newvd,
    202 	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
    203 	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
    204 	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
    205 	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
    206 	    1) != 0) {
    207 		nvlist_free(newvd);
    208 		nvlist_free(nvroot);
    209 		return;
    210 	}
    211 
    212 	nvlist_free(newvd);
    213 
    214 	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
    215 
    216 	nvlist_free(nvroot);
    217 
    218 }
    219 
    220 /*
    221  * Utility functions to find a vdev matching given criteria.
    222  */
    223 typedef struct dev_data {
    224 	const char		*dd_compare;
    225 	const char		*dd_prop;
    226 	zfs_process_func_t	dd_func;
    227 	boolean_t		dd_found;
    228 	boolean_t		dd_isdisk;
    229 	uint64_t		dd_pool_guid;
    230 	uint64_t		dd_vdev_guid;
    231 } dev_data_t;
    232 
    233 static void
    234 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
    235 {
    236 	dev_data_t *dp = data;
    237 	char *path;
    238 	uint_t c, children;
    239 	nvlist_t **child;
    240 	size_t len;
    241 	uint64_t guid;
    242 
    243 	/*
    244 	 * First iterate over any children.
    245 	 */
    246 	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
    247 	    &child, &children) == 0) {
    248 		for (c = 0; c < children; c++)
    249 			zfs_iter_vdev(zhp, child[c], data);
    250 		return;
    251 	}
    252 
    253 	if (dp->dd_vdev_guid != 0) {
    254 		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
    255 		    &guid) != 0 || guid != dp->dd_vdev_guid)
    256 			return;
    257 	} else {
    258 		len = strlen(dp->dd_compare);
    259 
    260 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
    261 		    strncmp(dp->dd_compare, path, len) != 0)
    262 			return;
    263 
    264 		/*
    265 		 * Normally, we want to have an exact match for the comparison
    266 		 * string.  However, we allow substring matches in the following
    267 		 * cases:
    268 		 *
    269 		 * 	<path>:		This is a devpath, and the target is one
    270 		 * 			of its children.
    271 		 *
    272 		 * 	<path/>		This is a devid for a whole disk, and
    273 		 * 			the target is one of its children.
    274 		 */
    275 		if (path[len] != '\0' && path[len] != ':' &&
    276 		    path[len - 1] != '/')
    277 			return;
    278 	}
    279 
    280 	(dp->dd_func)(zhp, nvl, dp->dd_isdisk);
    281 }
    282 
    283 static int
    284 zfs_iter_pool(zpool_handle_t *zhp, void *data)
    285 {
    286 	nvlist_t *config, *nvl;
    287 	dev_data_t *dp = data;
    288 	uint64_t pool_guid;
    289 
    290 	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
    291 		if (dp->dd_pool_guid == 0 ||
    292 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
    293 		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
    294 			(void) nvlist_lookup_nvlist(config,
    295 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
    296 			zfs_iter_vdev(zhp, nvl, data);
    297 		}
    298 	}
    299 
    300 	zpool_close(zhp);
    301 	return (0);
    302 }
    303 
    304 /*
    305  * Given a physical device path, iterate over all (pool, vdev) pairs which
    306  * correspond to the given path.
    307  */
    308 static boolean_t
    309 devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
    310 {
    311 	dev_data_t data = { 0 };
    312 
    313 	data.dd_compare = devpath;
    314 	data.dd_func = func;
    315 	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
    316 	data.dd_found = B_FALSE;
    317 	data.dd_isdisk = wholedisk;
    318 
    319 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
    320 
    321 	return (data.dd_found);
    322 }
    323 
    324 /*
    325  * Given a /devices path, lookup the corresponding devid for each minor node,
    326  * and find any vdevs with matching devids.  Doing this straight up would be
    327  * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
    328  * the fact that each devid ends with "/<minornode>".  Once we find any valid
    329  * minor node, we chop off the portion after the last slash, and then search for
    330  * matching vdevs, which is O(vdevs in system).
    331  */
    332 static boolean_t
    333 devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
    334 {
    335 	size_t len = strlen(devpath) + sizeof ("/devices") +
    336 	    sizeof (PHYS_PATH) - 1;
    337 	char *fullpath;
    338 	int fd;
    339 	ddi_devid_t devid;
    340 	char *devidstr, *fulldevid;
    341 	dev_data_t data = { 0 };
    342 
    343 	/*
    344 	 * Try to open a known minor node.
    345 	 */
    346 	fullpath = alloca(len);
    347 	(void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
    348 	if ((fd = open(fullpath, O_RDONLY)) < 0)
    349 		return (B_FALSE);
    350 
    351 	/*
    352 	 * Determine the devid as a string, with no trailing slash for the minor
    353 	 * node.
    354 	 */
    355 	if (devid_get(fd, &devid) != 0) {
    356 		(void) close(fd);
    357 		return (B_FALSE);
    358 	}
    359 	(void) close(fd);
    360 
    361 	if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
    362 		devid_free(devid);
    363 		return (B_FALSE);
    364 	}
    365 
    366 	len = strlen(devidstr) + 2;
    367 	fulldevid = alloca(len);
    368 	(void) snprintf(fulldevid, len, "%s/", devidstr);
    369 
    370 	data.dd_compare = fulldevid;
    371 	data.dd_func = func;
    372 	data.dd_prop = ZPOOL_CONFIG_DEVID;
    373 	data.dd_found = B_FALSE;
    374 	data.dd_isdisk = wholedisk;
    375 
    376 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
    377 
    378 	devid_str_free(devidstr);
    379 
    380 	return (data.dd_found);
    381 }
    382 
    383 /*
    384  * This function is called when we receive a devfs add event.  This can be
    385  * either a disk event or a lofi event, and the behavior is slightly different
    386  * depending on which it is.
    387  */
    388 static int
    389 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
    390 {
    391 	char *devpath, *devname;
    392 	char path[PATH_MAX], realpath[PATH_MAX];
    393 	char *colon, *raw;
    394 	int ret;
    395 
    396 	/*
    397 	 * The main unit of operation is the physical device path.  For disks,
    398 	 * this is the device node, as all minor nodes are affected.  For lofi
    399 	 * devices, this includes the minor path.  Unfortunately, this isn't
    400 	 * represented in the DEV_PHYS_PATH for various reasons.
    401 	 */
    402 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
    403 		return (-1);
    404 
    405 	/*
    406 	 * If this is a lofi device, then also get the minor instance name.
    407 	 * Unfortunately, the current payload doesn't include an easy way to get
    408 	 * this information.  So we cheat by resolving the 'dev_name' (which
    409 	 * refers to the raw device) and taking the portion between ':(*),raw'.
    410 	 */
    411 	(void) strlcpy(realpath, devpath, sizeof (realpath));
    412 	if (is_lofi) {
    413 		if (nvlist_lookup_string(nvl, DEV_NAME,
    414 		    &devname) == 0 &&
    415 		    (ret = resolvepath(devname, path,
    416 		    sizeof (path))) > 0) {
    417 			path[ret] = '\0';
    418 			colon = strchr(path, ':');
    419 			if (colon != NULL)
    420 				raw = strstr(colon + 1, ",raw");
    421 			if (colon != NULL && raw != NULL) {
    422 				*raw = '\0';
    423 				(void) snprintf(realpath,
    424 				    sizeof (realpath), "%s%s",
    425 				    devpath, colon);
    426 				*raw = ',';
    427 			}
    428 		}
    429 	}
    430 
    431 	/*
    432 	 * Iterate over all vdevs with a matching devid, and then those with a
    433 	 * matching /devices path.  For disks, we only want to pay attention to
    434 	 * vdevs marked as whole disks.  For lofi, we don't care (because we're
    435 	 * matching an exact minor name).
    436 	 */
    437 	if (!devid_iter(realpath, zfs_process_add, !is_lofi))
    438 		(void) devpath_iter(realpath, zfs_process_add, !is_lofi);
    439 
    440 	return (0);
    441 }
    442 
    443 /*
    444  * Called when we receive a VDEV_CHECK event, which indicates a device could not
    445  * be opened during initial pool open, but the autoreplace property was set on
    446  * the pool.  In this case, we treat it as if it were an add event.
    447  */
    448 static int
    449 zfs_deliver_check(nvlist_t *nvl)
    450 {
    451 	dev_data_t data = { 0 };
    452 
    453 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
    454 	    &data.dd_pool_guid) != 0 ||
    455 	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
    456 	    &data.dd_vdev_guid) != 0)
    457 		return (0);
    458 
    459 	data.dd_isdisk = B_TRUE;
    460 	data.dd_func = zfs_process_add;
    461 
    462 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
    463 
    464 	return (0);
    465 }
    466 
    467 /*ARGSUSED*/
    468 static int
    469 zfs_deliver_event(sysevent_t *ev, int unused)
    470 {
    471 	const char *class = sysevent_get_class_name(ev);
    472 	const char *subclass = sysevent_get_subclass_name(ev);
    473 	nvlist_t *nvl;
    474 	int ret;
    475 	boolean_t is_lofi, is_check;
    476 
    477 	if (strcmp(class, EC_DEV_ADD) == 0) {
    478 		/*
    479 		 * We're mainly interested in disk additions, but we also listen
    480 		 * for new lofi devices, to allow for simplified testing.
    481 		 */
    482 		if (strcmp(subclass, ESC_DISK) == 0)
    483 			is_lofi = B_FALSE;
    484 		else if (strcmp(subclass, ESC_LOFI) == 0)
    485 			is_lofi = B_TRUE;
    486 		else
    487 			return (0);
    488 
    489 		is_check = B_FALSE;
    490 	} else if (strcmp(class, EC_ZFS) == 0 &&
    491 	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
    492 		/*
    493 		 * This event signifies that a device failed to open during pool
    494 		 * load, but the 'autoreplace' property was set, so we should
    495 		 * pretend it's just been added.
    496 		 */
    497 		is_check = B_TRUE;
    498 	} else {
    499 		return (0);
    500 	}
    501 
    502 	if (sysevent_get_attr_list(ev, &nvl) != 0)
    503 		return (-1);
    504 
    505 	if (is_check)
    506 		ret = zfs_deliver_check(nvl);
    507 	else
    508 		ret = zfs_deliver_add(nvl, is_lofi);
    509 
    510 
    511 	nvlist_free(nvl);
    512 	return (ret);
    513 }
    514 
    515 static struct slm_mod_ops zfs_mod_ops = {
    516 	SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
    517 };
    518 
    519 struct slm_mod_ops *
    520 slm_init()
    521 {
    522 	if ((g_zfshdl = libzfs_init()) == NULL)
    523 		return (NULL);
    524 
    525 	return (&zfs_mod_ops);
    526 }
    527 
    528 void
    529 slm_fini()
    530 {
    531 }
    532