Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/spa.h>
     28 #include <sys/refcount.h>
     29 #include <sys/vdev_disk.h>
     30 #include <sys/vdev_impl.h>
     31 #include <sys/fs/zfs.h>
     32 #include <sys/zio.h>
     33 #include <sys/sunldi.h>
     34 #include <sys/fm/fs/zfs.h>
     35 
     36 /*
     37  * Virtual device vector for disks.
     38  */
     39 
     40 extern ldi_ident_t zfs_li;
     41 
     42 typedef struct vdev_disk_buf {
     43 	buf_t	vdb_buf;
     44 	zio_t	*vdb_io;
     45 } vdev_disk_buf_t;
     46 
     47 static int
     48 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
     49 {
     50 	spa_t *spa = vd->vdev_spa;
     51 	vdev_disk_t *dvd;
     52 	struct dk_minfo dkm;
     53 	int error;
     54 	dev_t dev;
     55 	int otyp;
     56 
     57 	/*
     58 	 * We must have a pathname, and it must be absolute.
     59 	 */
     60 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
     61 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
     62 		return (EINVAL);
     63 	}
     64 
     65 	/*
     66 	 * Reopen the device if it's not currently open. Otherwise,
     67 	 * just update the physical size of the device.
     68 	 */
     69 	if (vd->vdev_tsd != NULL) {
     70 		ASSERT(vd->vdev_reopening);
     71 		dvd = vd->vdev_tsd;
     72 		goto skip_open;
     73 	}
     74 
     75 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
     76 
     77 	/*
     78 	 * When opening a disk device, we want to preserve the user's original
     79 	 * intent.  We always want to open the device by the path the user gave
     80 	 * us, even if it is one of multiple paths to the save device.  But we
     81 	 * also want to be able to survive disks being removed/recabled.
     82 	 * Therefore the sequence of opening devices is:
     83 	 *
     84 	 * 1. Try opening the device by path.  For legacy pools without the
     85 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
     86 	 *
     87 	 * 2. If the devid of the device matches the stored value, return
     88 	 *    success.
     89 	 *
     90 	 * 3. Otherwise, the device may have moved.  Try opening the device
     91 	 *    by the devid instead.
     92 	 */
     93 	if (vd->vdev_devid != NULL) {
     94 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
     95 		    &dvd->vd_minor) != 0) {
     96 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
     97 			return (EINVAL);
     98 		}
     99 	}
    100 
    101 	error = EINVAL;		/* presume failure */
    102 
    103 	if (vd->vdev_path != NULL) {
    104 		ddi_devid_t devid;
    105 
    106 		if (vd->vdev_wholedisk == -1ULL) {
    107 			size_t len = strlen(vd->vdev_path) + 3;
    108 			char *buf = kmem_alloc(len, KM_SLEEP);
    109 			ldi_handle_t lh;
    110 
    111 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
    112 
    113 			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
    114 			    &lh, zfs_li) == 0) {
    115 				spa_strfree(vd->vdev_path);
    116 				vd->vdev_path = buf;
    117 				vd->vdev_wholedisk = 1ULL;
    118 				(void) ldi_close(lh, spa_mode(spa), kcred);
    119 			} else {
    120 				kmem_free(buf, len);
    121 			}
    122 		}
    123 
    124 		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
    125 		    &dvd->vd_lh, zfs_li);
    126 
    127 		/*
    128 		 * Compare the devid to the stored value.
    129 		 */
    130 		if (error == 0 && vd->vdev_devid != NULL &&
    131 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
    132 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
    133 				error = EINVAL;
    134 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
    135 				    kcred);
    136 				dvd->vd_lh = NULL;
    137 			}
    138 			ddi_devid_free(devid);
    139 		}
    140 
    141 		/*
    142 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
    143 		 * is not yet set, then this must be a slice.
    144 		 */
    145 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
    146 			vd->vdev_wholedisk = 0;
    147 	}
    148 
    149 	/*
    150 	 * If we were unable to open by path, or the devid check fails, open by
    151 	 * devid instead.
    152 	 */
    153 	if (error != 0 && vd->vdev_devid != NULL)
    154 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
    155 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
    156 
    157 	/*
    158 	 * If all else fails, then try opening by physical path (if available)
    159 	 * or the logical path (if we failed due to the devid check).  While not
    160 	 * as reliable as the devid, this will give us something, and the higher
    161 	 * level vdev validation will prevent us from opening the wrong device.
    162 	 */
    163 	if (error) {
    164 		if (vd->vdev_physpath != NULL &&
    165 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
    166 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
    167 			    kcred, &dvd->vd_lh, zfs_li);
    168 
    169 		/*
    170 		 * Note that we don't support the legacy auto-wholedisk support
    171 		 * as above.  This hasn't been used in a very long time and we
    172 		 * don't need to propagate its oddities to this edge condition.
    173 		 */
    174 		if (error && vd->vdev_path != NULL)
    175 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
    176 			    kcred, &dvd->vd_lh, zfs_li);
    177 	}
    178 
    179 	if (error) {
    180 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
    181 		return (error);
    182 	}
    183 
    184 	/*
    185 	 * Once a device is opened, verify that the physical device path (if
    186 	 * available) is up to date.
    187 	 */
    188 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
    189 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
    190 		char *physpath, *minorname;
    191 
    192 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
    193 		minorname = NULL;
    194 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
    195 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
    196 		    (vd->vdev_physpath == NULL ||
    197 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
    198 			if (vd->vdev_physpath)
    199 				spa_strfree(vd->vdev_physpath);
    200 			(void) strlcat(physpath, ":", MAXPATHLEN);
    201 			(void) strlcat(physpath, minorname, MAXPATHLEN);
    202 			vd->vdev_physpath = spa_strdup(physpath);
    203 		}
    204 		if (minorname)
    205 			kmem_free(minorname, strlen(minorname) + 1);
    206 		kmem_free(physpath, MAXPATHLEN);
    207 	}
    208 
    209 skip_open:
    210 	/*
    211 	 * Determine the actual size of the device.
    212 	 */
    213 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
    214 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
    215 		return (EINVAL);
    216 	}
    217 
    218 	/*
    219 	 * If we own the whole disk, try to enable disk write caching.
    220 	 * We ignore errors because it's OK if we can't do it.
    221 	 */
    222 	if (vd->vdev_wholedisk == 1) {
    223 		int wce = 1;
    224 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
    225 		    FKIOCTL, kcred, NULL);
    226 	}
    227 
    228 	/*
    229 	 * Determine the device's minimum transfer size.
    230 	 * If the ioctl isn't supported, assume DEV_BSIZE.
    231 	 */
    232 	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
    233 	    FKIOCTL, kcred, NULL) != 0)
    234 		dkm.dki_lbsize = DEV_BSIZE;
    235 
    236 	*ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
    237 
    238 	/*
    239 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
    240 	 * try again.
    241 	 */
    242 	vd->vdev_nowritecache = B_FALSE;
    243 
    244 	return (0);
    245 }
    246 
    247 static void
    248 vdev_disk_close(vdev_t *vd)
    249 {
    250 	vdev_disk_t *dvd = vd->vdev_tsd;
    251 
    252 	if (vd->vdev_reopening || dvd == NULL)
    253 		return;
    254 
    255 	if (dvd->vd_minor != NULL)
    256 		ddi_devid_str_free(dvd->vd_minor);
    257 
    258 	if (dvd->vd_devid != NULL)
    259 		ddi_devid_free(dvd->vd_devid);
    260 
    261 	if (dvd->vd_lh != NULL)
    262 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
    263 
    264 	kmem_free(dvd, sizeof (vdev_disk_t));
    265 	vd->vdev_tsd = NULL;
    266 }
    267 
    268 int
    269 vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
    270     uint64_t offset, int flags)
    271 {
    272 	buf_t *bp;
    273 	int error = 0;
    274 
    275 	if (vd_lh == NULL)
    276 		return (EINVAL);
    277 
    278 	ASSERT(flags & B_READ || flags & B_WRITE);
    279 
    280 	bp = getrbuf(KM_SLEEP);
    281 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
    282 	bp->b_bcount = size;
    283 	bp->b_un.b_addr = (void *)data;
    284 	bp->b_lblkno = lbtodb(offset);
    285 	bp->b_bufsize = size;
    286 
    287 	error = ldi_strategy(vd_lh, bp);
    288 	ASSERT(error == 0);
    289 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
    290 		error = EIO;
    291 	freerbuf(bp);
    292 
    293 	return (error);
    294 }
    295 
    296 static void
    297 vdev_disk_io_intr(buf_t *bp)
    298 {
    299 	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
    300 	zio_t *zio = vdb->vdb_io;
    301 
    302 	/*
    303 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
    304 	 * Rather than teach the rest of the stack about other error
    305 	 * possibilities (EFAULT, etc), we normalize the error value here.
    306 	 */
    307 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
    308 
    309 	if (zio->io_error == 0 && bp->b_resid != 0)
    310 		zio->io_error = EIO;
    311 
    312 	kmem_free(vdb, sizeof (vdev_disk_buf_t));
    313 
    314 	zio_interrupt(zio);
    315 }
    316 
    317 static void
    318 vdev_disk_ioctl_free(zio_t *zio)
    319 {
    320 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
    321 }
    322 
    323 static const zio_vsd_ops_t vdev_disk_vsd_ops = {
    324 	vdev_disk_ioctl_free,
    325 	zio_vsd_default_cksum_report
    326 };
    327 
    328 static void
    329 vdev_disk_ioctl_done(void *zio_arg, int error)
    330 {
    331 	zio_t *zio = zio_arg;
    332 
    333 	zio->io_error = error;
    334 
    335 	zio_interrupt(zio);
    336 }
    337 
    338 static int
    339 vdev_disk_io_start(zio_t *zio)
    340 {
    341 	vdev_t *vd = zio->io_vd;
    342 	vdev_disk_t *dvd = vd->vdev_tsd;
    343 	vdev_disk_buf_t *vdb;
    344 	struct dk_callback *dkc;
    345 	buf_t *bp;
    346 	int error;
    347 
    348 	if (zio->io_type == ZIO_TYPE_IOCTL) {
    349 		/* XXPOLICY */
    350 		if (!vdev_readable(vd)) {
    351 			zio->io_error = ENXIO;
    352 			return (ZIO_PIPELINE_CONTINUE);
    353 		}
    354 
    355 		switch (zio->io_cmd) {
    356 
    357 		case DKIOCFLUSHWRITECACHE:
    358 
    359 			if (zfs_nocacheflush)
    360 				break;
    361 
    362 			if (vd->vdev_nowritecache) {
    363 				zio->io_error = ENOTSUP;
    364 				break;
    365 			}
    366 
    367 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
    368 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
    369 
    370 			dkc->dkc_callback = vdev_disk_ioctl_done;
    371 			dkc->dkc_flag = FLUSH_VOLATILE;
    372 			dkc->dkc_cookie = zio;
    373 
    374 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
    375 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
    376 
    377 			if (error == 0) {
    378 				/*
    379 				 * The ioctl will be done asychronously,
    380 				 * and will call vdev_disk_ioctl_done()
    381 				 * upon completion.
    382 				 */
    383 				return (ZIO_PIPELINE_STOP);
    384 			}
    385 
    386 			if (error == ENOTSUP || error == ENOTTY) {
    387 				/*
    388 				 * If we get ENOTSUP or ENOTTY, we know that
    389 				 * no future attempts will ever succeed.
    390 				 * In this case we set a persistent bit so
    391 				 * that we don't bother with the ioctl in the
    392 				 * future.
    393 				 */
    394 				vd->vdev_nowritecache = B_TRUE;
    395 			}
    396 			zio->io_error = error;
    397 
    398 			break;
    399 
    400 		default:
    401 			zio->io_error = ENOTSUP;
    402 		}
    403 
    404 		return (ZIO_PIPELINE_CONTINUE);
    405 	}
    406 
    407 	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
    408 
    409 	vdb->vdb_io = zio;
    410 	bp = &vdb->vdb_buf;
    411 
    412 	bioinit(bp);
    413 	bp->b_flags = B_BUSY | B_NOCACHE |
    414 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
    415 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
    416 		bp->b_flags |= B_FAILFAST;
    417 	bp->b_bcount = zio->io_size;
    418 	bp->b_un.b_addr = zio->io_data;
    419 	bp->b_lblkno = lbtodb(zio->io_offset);
    420 	bp->b_bufsize = zio->io_size;
    421 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
    422 
    423 	/* ldi_strategy() will return non-zero only on programming errors */
    424 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
    425 
    426 	return (ZIO_PIPELINE_STOP);
    427 }
    428 
    429 static void
    430 vdev_disk_io_done(zio_t *zio)
    431 {
    432 	vdev_t *vd = zio->io_vd;
    433 
    434 	/*
    435 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
    436 	 * the device has been removed.  If this is the case, then we trigger an
    437 	 * asynchronous removal of the device. Otherwise, probe the device and
    438 	 * make sure it's still accessible.
    439 	 */
    440 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
    441 		vdev_disk_t *dvd = vd->vdev_tsd;
    442 		int state = DKIO_NONE;
    443 
    444 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
    445 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
    446 			/*
    447 			 * We post the resource as soon as possible, instead of
    448 			 * when the async removal actually happens, because the
    449 			 * DE is using this information to discard previous I/O
    450 			 * errors.
    451 			 */
    452 			zfs_post_remove(zio->io_spa, vd);
    453 			vd->vdev_remove_wanted = B_TRUE;
    454 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
    455 		}
    456 	}
    457 }
    458 
    459 vdev_ops_t vdev_disk_ops = {
    460 	vdev_disk_open,
    461 	vdev_disk_close,
    462 	vdev_default_asize,
    463 	vdev_disk_io_start,
    464 	vdev_disk_io_done,
    465 	NULL,
    466 	VDEV_TYPE_DISK,		/* name of this vdev type */
    467 	B_TRUE			/* leaf vdev */
    468 };
    469 
    470 /*
    471  * Given the root disk device devid or pathname, read the label from
    472  * the device, and construct a configuration nvlist.
    473  */
    474 int
    475 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
    476 {
    477 	ldi_handle_t vd_lh;
    478 	vdev_label_t *label;
    479 	uint64_t s, size;
    480 	int l;
    481 	ddi_devid_t tmpdevid;
    482 	int error = -1;
    483 	char *minor_name;
    484 
    485 	/*
    486 	 * Read the device label and build the nvlist.
    487 	 */
    488 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
    489 	    &minor_name) == 0) {
    490 		error = ldi_open_by_devid(tmpdevid, minor_name,
    491 		    FREAD, kcred, &vd_lh, zfs_li);
    492 		ddi_devid_free(tmpdevid);
    493 		ddi_devid_str_free(minor_name);
    494 	}
    495 
    496 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
    497 	    zfs_li)))
    498 		return (error);
    499 
    500 	if (ldi_get_size(vd_lh, &s)) {
    501 		(void) ldi_close(vd_lh, FREAD, kcred);
    502 		return (EIO);
    503 	}
    504 
    505 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
    506 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
    507 
    508 	*config = NULL;
    509 	for (l = 0; l < VDEV_LABELS; l++) {
    510 		uint64_t offset, state, txg = 0;
    511 
    512 		/* read vdev label */
    513 		offset = vdev_label_offset(size, l, 0);
    514 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
    515 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
    516 			continue;
    517 
    518 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
    519 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
    520 			*config = NULL;
    521 			continue;
    522 		}
    523 
    524 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
    525 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
    526 			nvlist_free(*config);
    527 			*config = NULL;
    528 			continue;
    529 		}
    530 
    531 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
    532 		    &txg) != 0 || txg == 0) {
    533 			nvlist_free(*config);
    534 			*config = NULL;
    535 			continue;
    536 		}
    537 
    538 		break;
    539 	}
    540 
    541 	kmem_free(label, sizeof (vdev_label_t));
    542 	(void) ldi_close(vd_lh, FREAD, kcred);
    543 	if (*config == NULL)
    544 		error = EIDRM;
    545 
    546 	return (error);
    547 }
    548