Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/spa.h>
     30 #include <sys/vdev_file.h>
     31 #include <sys/vdev_impl.h>
     32 #include <sys/zio.h>
     33 #include <sys/fs/zfs.h>
     34 #include <sys/fm/fs/zfs.h>
     35 
     36 /*
     37  * Virtual device vector for files.
     38  */
     39 
     40 static int
     41 vdev_file_open_common(vdev_t *vd)
     42 {
     43 	vdev_file_t *vf;
     44 	vnode_t *vp;
     45 	int error;
     46 
     47 	/*
     48 	 * We must have a pathname, and it must be absolute.
     49 	 */
     50 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
     51 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
     52 		return (EINVAL);
     53 	}
     54 
     55 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
     56 
     57 	/*
     58 	 * We always open the files from the root of the global zone, even if
     59 	 * we're in a local zone.  If the user has gotten to this point, the
     60 	 * administrator has already decided that the pool should be available
     61 	 * to local zone users, so the underlying devices should be as well.
     62 	 */
     63 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
     64 	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
     65 	    spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
     66 
     67 	if (error) {
     68 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
     69 		return (error);
     70 	}
     71 
     72 	vf->vf_vnode = vp;
     73 
     74 #ifdef _KERNEL
     75 	/*
     76 	 * Make sure it's a regular file.
     77 	 */
     78 	if (vp->v_type != VREG) {
     79 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
     80 		return (ENODEV);
     81 	}
     82 #endif
     83 
     84 	return (0);
     85 }
     86 
     87 static int
     88 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
     89 {
     90 	vdev_file_t *vf;
     91 	vattr_t vattr;
     92 	int error;
     93 
     94 	if ((error = vdev_file_open_common(vd)) != 0)
     95 		return (error);
     96 
     97 	vf = vd->vdev_tsd;
     98 
     99 	/*
    100 	 * Determine the physical size of the file.
    101 	 */
    102 	vattr.va_mask = AT_SIZE;
    103 	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
    104 	if (error) {
    105 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
    106 		return (error);
    107 	}
    108 
    109 	*psize = vattr.va_size;
    110 	*ashift = SPA_MINBLOCKSHIFT;
    111 
    112 	return (0);
    113 }
    114 
    115 static void
    116 vdev_file_close(vdev_t *vd)
    117 {
    118 	vdev_file_t *vf = vd->vdev_tsd;
    119 
    120 	if (vf == NULL)
    121 		return;
    122 
    123 	if (vf->vf_vnode != NULL) {
    124 		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
    125 		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
    126 		VN_RELE(vf->vf_vnode);
    127 	}
    128 
    129 	kmem_free(vf, sizeof (vdev_file_t));
    130 	vd->vdev_tsd = NULL;
    131 }
    132 
    133 static int
    134 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
    135     enum uio_rw rw)
    136 {
    137 	vdev_file_t *vf = vd ? vd->vdev_tsd : NULL;
    138 	ssize_t resid;
    139 	int error = 0;
    140 
    141 	if (vd == NULL || vf == NULL || vf->vf_vnode == NULL)
    142 		return (EINVAL);
    143 
    144 	ASSERT(rw == UIO_READ || rw ==  UIO_WRITE);
    145 
    146 	error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
    147 	    0, RLIM64_INFINITY, kcred, &resid);
    148 
    149 	if (error || resid != 0)
    150 		return (EIO);
    151 
    152 	if (zio_injection_enabled)
    153 		error = zio_handle_device_injection(vd, EIO);
    154 
    155 	return (error);
    156 }
    157 
    158 /*
    159  * Determine if the underlying device is accessible by reading and writing
    160  * to a known location. We must be able to do this during syncing context
    161  * and thus we cannot set the vdev state directly.
    162  */
    163 static int
    164 vdev_file_probe(vdev_t *vd)
    165 {
    166 	vdev_t *nvd;
    167 	char *vl_boot;
    168 	uint64_t offset;
    169 	int l, error = 0, retries = 0;
    170 
    171 	if (vd == NULL)
    172 		return (EINVAL);
    173 
    174 	/* Hijack the current vdev */
    175 	nvd = vd;
    176 
    177 	/*
    178 	 * Pick a random label to rewrite.
    179 	 */
    180 	l = spa_get_random(VDEV_LABELS);
    181 	ASSERT(l < VDEV_LABELS);
    182 
    183 	offset = vdev_label_offset(vd->vdev_psize, l,
    184 	    offsetof(vdev_label_t, vl_boot_header));
    185 
    186 	vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
    187 
    188 	while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
    189 	    offset, UIO_READ)) != 0 && retries == 0) {
    190 
    191 		/*
    192 		 * If we failed with the vdev that was passed in then
    193 		 * try allocating a new one and try again.
    194 		 */
    195 		nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
    196 		if (vd->vdev_path)
    197 			nvd->vdev_path = spa_strdup(vd->vdev_path);
    198 		nvd->vdev_guid = vd->vdev_guid;
    199 		retries++;
    200 
    201 		if (vdev_file_open_common(nvd) != 0)
    202 			break;
    203 	}
    204 
    205 	if ((spa_mode & FWRITE) && !error) {
    206 		error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
    207 		    offset, UIO_WRITE);
    208 	}
    209 
    210 	if (retries) {
    211 		vdev_file_close(nvd);
    212 		if (nvd->vdev_path)
    213 			spa_strfree(nvd->vdev_path);
    214 		kmem_free(nvd, sizeof (vdev_t));
    215 	}
    216 	kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
    217 
    218 	if (!error)
    219 		vd->vdev_is_failing = B_FALSE;
    220 
    221 	return (error);
    222 }
    223 
    224 static int
    225 vdev_file_io_start(zio_t *zio)
    226 {
    227 	vdev_t *vd = zio->io_vd;
    228 	vdev_file_t *vf = vd->vdev_tsd;
    229 	ssize_t resid;
    230 	int error;
    231 
    232 	if (zio->io_type == ZIO_TYPE_IOCTL) {
    233 		zio_vdev_io_bypass(zio);
    234 
    235 		/* XXPOLICY */
    236 		if (!vdev_readable(vd)) {
    237 			zio->io_error = ENXIO;
    238 			return (ZIO_PIPELINE_CONTINUE);
    239 		}
    240 
    241 		switch (zio->io_cmd) {
    242 		case DKIOCFLUSHWRITECACHE:
    243 			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
    244 			    kcred, NULL);
    245 			dprintf("fsync(%s) = %d\n", vdev_description(vd),
    246 			    zio->io_error);
    247 			break;
    248 		default:
    249 			zio->io_error = ENOTSUP;
    250 		}
    251 
    252 		return (ZIO_PIPELINE_CONTINUE);
    253 	}
    254 
    255 	/*
    256 	 * In the kernel, don't bother double-caching, but in userland,
    257 	 * we want to test the vdev_cache code.
    258 	 */
    259 #ifndef _KERNEL
    260 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
    261 		return (ZIO_PIPELINE_STOP);
    262 #endif
    263 
    264 	if ((zio = vdev_queue_io(zio)) == NULL)
    265 		return (ZIO_PIPELINE_STOP);
    266 
    267 	/* XXPOLICY */
    268 	if (zio->io_type == ZIO_TYPE_WRITE)
    269 		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
    270 	else
    271 		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
    272 	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
    273 	if (error) {
    274 		zio->io_error = error;
    275 		zio_interrupt(zio);
    276 		return (ZIO_PIPELINE_STOP);
    277 	}
    278 
    279 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
    280 	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
    281 	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
    282 	    0, RLIM64_INFINITY, kcred, &resid);
    283 
    284 	if (resid != 0 && zio->io_error == 0)
    285 		zio->io_error = ENOSPC;
    286 
    287 	zio_interrupt(zio);
    288 
    289 	return (ZIO_PIPELINE_STOP);
    290 }
    291 
    292 static int
    293 vdev_file_io_done(zio_t *zio)
    294 {
    295 	vdev_t *vd = zio->io_vd;
    296 
    297 	if (zio_injection_enabled && zio->io_error == 0)
    298 		zio->io_error = zio_handle_device_injection(vd, EIO);
    299 
    300 	/*
    301 	 * If an error has been encountered then attempt to probe the device
    302 	 * to determine if it's still accessible.
    303 	 */
    304 	if (zio->io_error == EIO && vdev_probe(vd) != 0) {
    305 		if (!vd->vdev_is_failing) {
    306 			vd->vdev_is_failing = B_TRUE;
    307 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
    308 			    vd->vdev_spa, vd, zio, 0, 0);
    309 		}
    310 	}
    311 
    312 	vdev_queue_io_done(zio);
    313 
    314 #ifndef _KERNEL
    315 	if (zio->io_type == ZIO_TYPE_WRITE)
    316 		vdev_cache_write(zio);
    317 #endif
    318 
    319 	return (ZIO_PIPELINE_CONTINUE);
    320 }
    321 
    322 vdev_ops_t vdev_file_ops = {
    323 	vdev_file_open,
    324 	vdev_file_close,
    325 	vdev_file_probe,
    326 	vdev_default_asize,
    327 	vdev_file_io_start,
    328 	vdev_file_io_done,
    329 	NULL,
    330 	VDEV_TYPE_FILE,		/* name of this vdev type */
    331 	B_TRUE			/* leaf vdev */
    332 };
    333 
    334 /*
    335  * From userland we access disks just like files.
    336  */
    337 #ifndef _KERNEL
    338 
    339 vdev_ops_t vdev_disk_ops = {
    340 	vdev_file_open,
    341 	vdev_file_close,
    342 	vdev_file_probe,
    343 	vdev_default_asize,
    344 	vdev_file_io_start,
    345 	vdev_file_io_done,
    346 	NULL,
    347 	VDEV_TYPE_DISK,		/* name of this vdev type */
    348 	B_TRUE			/* leaf vdev */
    349 };
    350 
    351 #endif
    352