Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/zfs_context.h>
     28 #include <sys/spa.h>
     29 #include <sys/vdev_impl.h>
     30 #include <sys/zio.h>
     31 #include <sys/zio_checksum.h>
     32 #include <sys/fs/zfs.h>
     33 #include <sys/fm/fs/zfs.h>
     34 
     35 /*
     36  * Virtual device vector for RAID-Z.
     37  *
     38  * This vdev supports single, double, and triple parity. For single parity,
     39  * we use a simple XOR of all the data columns. For double or triple parity,
     40  * we use a special case of Reed-Solomon coding. This extends the
     41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
     42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
     43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
     44  * former is also based. The latter is designed to provide higher performance
     45  * for writes.
     46  *
     47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
     48  * amended six years later identifying a critical flaw that invalidates its
     49  * claims. Nevertheless, the technique can be adapted to work for up to
     50  * triple parity. For additional parity, the amendment "Note: Correction to
     51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
     52  * is viable, but the additional complexity means that write performance will
     53  * suffer.
     54  *
     55  * All of the methods above operate on a Galois field, defined over the
     56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
     57  * can be expressed with a single byte. Briefly, the operations on the
     58  * field are defined as follows:
     59  *
     60  *   o addition (+) is represented by a bitwise XOR
     61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     62  *   o multiplication of A by 2 is defined by the following bitwise expression:
     63  *	(A * 2)_7 = A_6
     64  *	(A * 2)_6 = A_5
     65  *	(A * 2)_5 = A_4
     66  *	(A * 2)_4 = A_3 + A_7
     67  *	(A * 2)_3 = A_2 + A_7
     68  *	(A * 2)_2 = A_1 + A_7
     69  *	(A * 2)_1 = A_0
     70  *	(A * 2)_0 = A_7
     71  *
     72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     73  * As an aside, this multiplication is derived from the error correcting
     74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
     75  *
     76  * Observe that any number in the field (except for 0) can be expressed as a
     77  * power of 2 -- a generator for the field. We store a table of the powers of
     78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     80  * than field addition). The inverse of a field element A (A^-1) is therefore
     81  * A ^ (255 - 1) = A^254.
     82  *
     83  * The up-to-three parity columns, P, Q, R over several data columns,
     84  * D_0, ... D_n-1, can be expressed by field operations:
     85  *
     86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
     89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
     90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
     91  *
     92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
     93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
     94  * independent coefficients. (There are no additional coefficients that have
     95  * this property which is why the uncorrected Plank method breaks down.)
     96  *
     97  * See the reconstruction code below for how P, Q and R can used individually
     98  * or in concert to recover missing data columns.
     99  */
    100 
    101 typedef struct raidz_col {
    102 	uint64_t rc_devidx;		/* child device index for I/O */
    103 	uint64_t rc_offset;		/* device offset */
    104 	uint64_t rc_size;		/* I/O size */
    105 	void *rc_data;			/* I/O data */
    106 	void *rc_gdata;			/* used to store the "good" version */
    107 	int rc_error;			/* I/O error for this device */
    108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
    109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
    110 } raidz_col_t;
    111 
    112 typedef struct raidz_map {
    113 	uint64_t rm_cols;		/* Regular column count */
    114 	uint64_t rm_scols;		/* Count including skipped columns */
    115 	uint64_t rm_bigcols;		/* Number of oversized columns */
    116 	uint64_t rm_asize;		/* Actual total I/O size */
    117 	uint64_t rm_missingdata;	/* Count of missing data devices */
    118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
    119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
    120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
    121 	uint64_t rm_skipstart;	/* Column index of padding start */
    122 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
    123 	uintptr_t rm_reports;		/* # of referencing checksum reports */
    124 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
    125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
    126 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
    127 } raidz_map_t;
    128 
    129 #define	VDEV_RAIDZ_P		0
    130 #define	VDEV_RAIDZ_Q		1
    131 #define	VDEV_RAIDZ_R		2
    132 
    133 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
    134 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
    135 
    136 /*
    137  * We provide a mechanism to perform the field multiplication operation on a
    138  * 64-bit value all at once rather than a byte at a time. This works by
    139  * creating a mask from the top bit in each byte and using that to
    140  * conditionally apply the XOR of 0x1d.
    141  */
    142 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
    143 { \
    144 	(mask) = (x) & 0x8080808080808080ULL; \
    145 	(mask) = ((mask) << 1) - ((mask) >> 7); \
    146 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
    147 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
    148 }
    149 
    150 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
    151 { \
    152 	VDEV_RAIDZ_64MUL_2((x), mask); \
    153 	VDEV_RAIDZ_64MUL_2((x), mask); \
    154 }
    155 
    156 /*
    157  * Force reconstruction to use the general purpose method.
    158  */
    159 int vdev_raidz_default_to_general;
    160 
    161 /*
    162  * These two tables represent powers and logs of 2 in the Galois field defined
    163  * above. These values were computed by repeatedly multiplying by 2 as above.
    164  */
    165 static const uint8_t vdev_raidz_pow2[256] = {
    166 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    167 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    168 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    169 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    170 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    171 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    172 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    173 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    174 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    175 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    176 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    177 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    178 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    179 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    180 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    181 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    182 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    183 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    184 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    185 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    186 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    187 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    188 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    189 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    190 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    191 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    192 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    193 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    194 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    195 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    196 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    197 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
    198 };
    199 static const uint8_t vdev_raidz_log2[256] = {
    200 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    201 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    202 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    203 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    204 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    205 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    206 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    207 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    208 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    209 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    210 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    211 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    212 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    213 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    214 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    215 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    216 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    217 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    218 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    219 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    220 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    221 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    222 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    223 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    224 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    225 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    226 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    227 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    228 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    229 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    230 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    231 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
    232 };
    233 
    234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
    235 
    236 /*
    237  * Multiply a given number by 2 raised to the given power.
    238  */
    239 static uint8_t
    240 vdev_raidz_exp2(uint_t a, int exp)
    241 {
    242 	if (a == 0)
    243 		return (0);
    244 
    245 	ASSERT(exp >= 0);
    246 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    247 
    248 	exp += vdev_raidz_log2[a];
    249 	if (exp > 255)
    250 		exp -= 255;
    251 
    252 	return (vdev_raidz_pow2[exp]);
    253 }
    254 
    255 static void
    256 vdev_raidz_map_free(raidz_map_t *rm)
    257 {
    258 	int c;
    259 	size_t size;
    260 
    261 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    262 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
    263 
    264 		if (rm->rm_col[c].rc_gdata != NULL)
    265 			zio_buf_free(rm->rm_col[c].rc_gdata,
    266 			    rm->rm_col[c].rc_size);
    267 	}
    268 
    269 	size = 0;
    270 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    271 		size += rm->rm_col[c].rc_size;
    272 
    273 	if (rm->rm_datacopy != NULL)
    274 		zio_buf_free(rm->rm_datacopy, size);
    275 
    276 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
    277 }
    278 
    279 static void
    280 vdev_raidz_map_free_vsd(zio_t *zio)
    281 {
    282 	raidz_map_t *rm = zio->io_vsd;
    283 
    284 	ASSERT3U(rm->rm_freed, ==, 0);
    285 	rm->rm_freed = 1;
    286 
    287 	if (rm->rm_reports == 0)
    288 		vdev_raidz_map_free(rm);
    289 }
    290 
    291 /*ARGSUSED*/
    292 static void
    293 vdev_raidz_cksum_free(void *arg, size_t ignored)
    294 {
    295 	raidz_map_t *rm = arg;
    296 
    297 	ASSERT3U(rm->rm_reports, >, 0);
    298 
    299 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
    300 		vdev_raidz_map_free(rm);
    301 }
    302 
    303 static void
    304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
    305 {
    306 	raidz_map_t *rm = zcr->zcr_cbdata;
    307 	size_t c = zcr->zcr_cbinfo;
    308 	size_t x;
    309 
    310 	const char *good = NULL;
    311 	const char *bad = rm->rm_col[c].rc_data;
    312 
    313 	if (good_data == NULL) {
    314 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
    315 		return;
    316 	}
    317 
    318 	if (c < rm->rm_firstdatacol) {
    319 		/*
    320 		 * The first time through, calculate the parity blocks for
    321 		 * the good data (this relies on the fact that the good
    322 		 * data never changes for a given logical ZIO)
    323 		 */
    324 		if (rm->rm_col[0].rc_gdata == NULL) {
    325 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
    326 			char *buf;
    327 
    328 			/*
    329 			 * Set up the rm_col[]s to generate the parity for
    330 			 * good_data, first saving the parity bufs and
    331 			 * replacing them with buffers to hold the result.
    332 			 */
    333 			for (x = 0; x < rm->rm_firstdatacol; x++) {
    334 				bad_parity[x] = rm->rm_col[x].rc_data;
    335 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
    336 				    zio_buf_alloc(rm->rm_col[x].rc_size);
    337 			}
    338 
    339 			/* fill in the data columns from good_data */
    340 			buf = (char *)good_data;
    341 			for (; x < rm->rm_cols; x++) {
    342 				rm->rm_col[x].rc_data = buf;
    343 				buf += rm->rm_col[x].rc_size;
    344 			}
    345 
    346 			/*
    347 			 * Construct the parity from the good data.
    348 			 */
    349 			vdev_raidz_generate_parity(rm);
    350 
    351 			/* restore everything back to its original state */
    352 			for (x = 0; x < rm->rm_firstdatacol; x++)
    353 				rm->rm_col[x].rc_data = bad_parity[x];
    354 
    355 			buf = rm->rm_datacopy;
    356 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
    357 				rm->rm_col[x].rc_data = buf;
    358 				buf += rm->rm_col[x].rc_size;
    359 			}
    360 		}
    361 
    362 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
    363 		good = rm->rm_col[c].rc_gdata;
    364 	} else {
    365 		/* adjust good_data to point at the start of our column */
    366 		good = good_data;
    367 
    368 		for (x = rm->rm_firstdatacol; x < c; x++)
    369 			good += rm->rm_col[x].rc_size;
    370 	}
    371 
    372 	/* we drop the ereport if it ends up that the data was good */
    373 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
    374 }
    375 
    376 /*
    377  * Invoked indirectly by zfs_ereport_start_checksum(), called
    378  * below when our read operation fails completely.  The main point
    379  * is to keep a copy of everything we read from disk, so that at
    380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
    381  */
    382 static void
    383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
    384 {
    385 	size_t c = (size_t)(uintptr_t)arg;
    386 	caddr_t buf;
    387 
    388 	raidz_map_t *rm = zio->io_vsd;
    389 	size_t size;
    390 
    391 	/* set up the report and bump the refcount  */
    392 	zcr->zcr_cbdata = rm;
    393 	zcr->zcr_cbinfo = c;
    394 	zcr->zcr_finish = vdev_raidz_cksum_finish;
    395 	zcr->zcr_free = vdev_raidz_cksum_free;
    396 
    397 	rm->rm_reports++;
    398 	ASSERT3U(rm->rm_reports, >, 0);
    399 
    400 	if (rm->rm_datacopy != NULL)
    401 		return;
    402 
    403 	/*
    404 	 * It's the first time we're called for this raidz_map_t, so we need
    405 	 * to copy the data aside; there's no guarantee that our zio's buffer
    406 	 * won't be re-used for something else.
    407 	 *
    408 	 * Our parity data is already in separate buffers, so there's no need
    409 	 * to copy them.
    410 	 */
    411 
    412 	size = 0;
    413 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    414 		size += rm->rm_col[c].rc_size;
    415 
    416 	buf = rm->rm_datacopy = zio_buf_alloc(size);
    417 
    418 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    419 		raidz_col_t *col = &rm->rm_col[c];
    420 
    421 		bcopy(col->rc_data, buf, col->rc_size);
    422 		col->rc_data = buf;
    423 
    424 		buf += col->rc_size;
    425 	}
    426 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
    427 }
    428 
    429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
    430 	vdev_raidz_map_free_vsd,
    431 	vdev_raidz_cksum_report
    432 };
    433 
    434 static raidz_map_t *
    435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    436     uint64_t nparity)
    437 {
    438 	raidz_map_t *rm;
    439 	uint64_t b = zio->io_offset >> unit_shift;
    440 	uint64_t s = zio->io_size >> unit_shift;
    441 	uint64_t f = b % dcols;
    442 	uint64_t o = (b / dcols) << unit_shift;
    443 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
    444 
    445 	q = s / (dcols - nparity);
    446 	r = s - q * (dcols - nparity);
    447 	bc = (r == 0 ? 0 : r + nparity);
    448 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
    449 
    450 	if (q == 0) {
    451 		acols = bc;
    452 		scols = MIN(dcols, roundup(bc, nparity + 1));
    453 	} else {
    454 		acols = dcols;
    455 		scols = dcols;
    456 	}
    457 
    458 	ASSERT3U(acols, <=, scols);
    459 
    460 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
    461 
    462 	rm->rm_cols = acols;
    463 	rm->rm_scols = scols;
    464 	rm->rm_bigcols = bc;
    465 	rm->rm_skipstart = bc;
    466 	rm->rm_missingdata = 0;
    467 	rm->rm_missingparity = 0;
    468 	rm->rm_firstdatacol = nparity;
    469 	rm->rm_datacopy = NULL;
    470 	rm->rm_reports = 0;
    471 	rm->rm_freed = 0;
    472 	rm->rm_ecksuminjected = 0;
    473 
    474 	asize = 0;
    475 
    476 	for (c = 0; c < scols; c++) {
    477 		col = f + c;
    478 		coff = o;
    479 		if (col >= dcols) {
    480 			col -= dcols;
    481 			coff += 1ULL << unit_shift;
    482 		}
    483 		rm->rm_col[c].rc_devidx = col;
    484 		rm->rm_col[c].rc_offset = coff;
    485 		rm->rm_col[c].rc_data = NULL;
    486 		rm->rm_col[c].rc_gdata = NULL;
    487 		rm->rm_col[c].rc_error = 0;
    488 		rm->rm_col[c].rc_tried = 0;
    489 		rm->rm_col[c].rc_skipped = 0;
    490 
    491 		if (c >= acols)
    492 			rm->rm_col[c].rc_size = 0;
    493 		else if (c < bc)
    494 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
    495 		else
    496 			rm->rm_col[c].rc_size = q << unit_shift;
    497 
    498 		asize += rm->rm_col[c].rc_size;
    499 	}
    500 
    501 	ASSERT3U(asize, ==, tot << unit_shift);
    502 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
    503 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
    504 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
    505 	ASSERT3U(rm->rm_nskip, <=, nparity);
    506 
    507 	for (c = 0; c < rm->rm_firstdatacol; c++)
    508 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
    509 
    510 	rm->rm_col[c].rc_data = zio->io_data;
    511 
    512 	for (c = c + 1; c < acols; c++)
    513 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
    514 		    rm->rm_col[c - 1].rc_size;
    515 
    516 	/*
    517 	 * If all data stored spans all columns, there's a danger that parity
    518 	 * will always be on the same device and, since parity isn't read
    519 	 * during normal operation, that that device's I/O bandwidth won't be
    520 	 * used effectively. We therefore switch the parity every 1MB.
    521 	 *
    522 	 * ... at least that was, ostensibly, the theory. As a practical
    523 	 * matter unless we juggle the parity between all devices evenly, we
    524 	 * won't see any benefit. Further, occasional writes that aren't a
    525 	 * multiple of the LCM of the number of children and the minimum
    526 	 * stripe width are sufficient to avoid pessimal behavior.
    527 	 * Unfortunately, this decision created an implicit on-disk format
    528 	 * requirement that we need to support for all eternity, but only
    529 	 * for single-parity RAID-Z.
    530 	 *
    531 	 * If we intend to skip a sector in the zeroth column for padding
    532 	 * we must make sure to note this swap. We will never intend to
    533 	 * skip the first column since at least one data and one parity
    534 	 * column must appear in each row.
    535 	 */
    536 	ASSERT(rm->rm_cols >= 2);
    537 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    538 
    539 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
    540 		devidx = rm->rm_col[0].rc_devidx;
    541 		o = rm->rm_col[0].rc_offset;
    542 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    543 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    544 		rm->rm_col[1].rc_devidx = devidx;
    545 		rm->rm_col[1].rc_offset = o;
    546 
    547 		if (rm->rm_skipstart == 0)
    548 			rm->rm_skipstart = 1;
    549 	}
    550 
    551 	zio->io_vsd = rm;
    552 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
    553 	return (rm);
    554 }
    555 
    556 static void
    557 vdev_raidz_generate_parity_p(raidz_map_t *rm)
    558 {
    559 	uint64_t *p, *src, pcount, ccount, i;
    560 	int c;
    561 
    562 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    563 
    564 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    565 		src = rm->rm_col[c].rc_data;
    566 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    567 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    568 
    569 		if (c == rm->rm_firstdatacol) {
    570 			ASSERT(ccount == pcount);
    571 			for (i = 0; i < ccount; i++, src++, p++) {
    572 				*p = *src;
    573 			}
    574 		} else {
    575 			ASSERT(ccount <= pcount);
    576 			for (i = 0; i < ccount; i++, src++, p++) {
    577 				*p ^= *src;
    578 			}
    579 		}
    580 	}
    581 }
    582 
    583 static void
    584 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    585 {
    586 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
    587 	int c;
    588 
    589 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    590 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    591 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    592 
    593 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    594 		src = rm->rm_col[c].rc_data;
    595 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    596 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    597 
    598 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    599 
    600 		if (c == rm->rm_firstdatacol) {
    601 			ASSERT(ccnt == pcnt || ccnt == 0);
    602 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    603 				*p = *src;
    604 				*q = *src;
    605 			}
    606 			for (; i < pcnt; i++, src++, p++, q++) {
    607 				*p = 0;
    608 				*q = 0;
    609 			}
    610 		} else {
    611 			ASSERT(ccnt <= pcnt);
    612 
    613 			/*
    614 			 * Apply the algorithm described above by multiplying
    615 			 * the previous result and adding in the new value.
    616 			 */
    617 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    618 				*p ^= *src;
    619 
    620 				VDEV_RAIDZ_64MUL_2(*q, mask);
    621 				*q ^= *src;
    622 			}
    623 
    624 			/*
    625 			 * Treat short columns as though they are full of 0s.
    626 			 * Note that there's therefore nothing needed for P.
    627 			 */
    628 			for (; i < pcnt; i++, q++) {
    629 				VDEV_RAIDZ_64MUL_2(*q, mask);
    630 			}
    631 		}
    632 	}
    633 }
    634 
    635 static void
    636 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
    637 {
    638 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
    639 	int c;
    640 
    641 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    642 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    643 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    644 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    645 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
    646 
    647 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    648 		src = rm->rm_col[c].rc_data;
    649 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    650 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    651 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
    652 
    653 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    654 
    655 		if (c == rm->rm_firstdatacol) {
    656 			ASSERT(ccnt == pcnt || ccnt == 0);
    657 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    658 				*p = *src;
    659 				*q = *src;
    660 				*r = *src;
    661 			}
    662 			for (; i < pcnt; i++, src++, p++, q++, r++) {
    663 				*p = 0;
    664 				*q = 0;
    665 				*r = 0;
    666 			}
    667 		} else {
    668 			ASSERT(ccnt <= pcnt);
    669 
    670 			/*
    671 			 * Apply the algorithm described above by multiplying
    672 			 * the previous result and adding in the new value.
    673 			 */
    674 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    675 				*p ^= *src;
    676 
    677 				VDEV_RAIDZ_64MUL_2(*q, mask);
    678 				*q ^= *src;
    679 
    680 				VDEV_RAIDZ_64MUL_4(*r, mask);
    681 				*r ^= *src;
    682 			}
    683 
    684 			/*
    685 			 * Treat short columns as though they are full of 0s.
    686 			 * Note that there's therefore nothing needed for P.
    687 			 */
    688 			for (; i < pcnt; i++, q++, r++) {
    689 				VDEV_RAIDZ_64MUL_2(*q, mask);
    690 				VDEV_RAIDZ_64MUL_4(*r, mask);
    691 			}
    692 		}
    693 	}
    694 }
    695 
    696 /*
    697  * Generate RAID parity in the first virtual columns according to the number of
    698  * parity columns available.
    699  */
    700 static void
    701 vdev_raidz_generate_parity(raidz_map_t *rm)
    702 {
    703 	switch (rm->rm_firstdatacol) {
    704 	case 1:
    705 		vdev_raidz_generate_parity_p(rm);
    706 		break;
    707 	case 2:
    708 		vdev_raidz_generate_parity_pq(rm);
    709 		break;
    710 	case 3:
    711 		vdev_raidz_generate_parity_pqr(rm);
    712 		break;
    713 	default:
    714 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
    715 	}
    716 }
    717 
    718 static int
    719 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
    720 {
    721 	uint64_t *dst, *src, xcount, ccount, count, i;
    722 	int x = tgts[0];
    723 	int c;
    724 
    725 	ASSERT(ntgts == 1);
    726 	ASSERT(x >= rm->rm_firstdatacol);
    727 	ASSERT(x < rm->rm_cols);
    728 
    729 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    730 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    731 	ASSERT(xcount > 0);
    732 
    733 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    734 	dst = rm->rm_col[x].rc_data;
    735 	for (i = 0; i < xcount; i++, dst++, src++) {
    736 		*dst = *src;
    737 	}
    738 
    739 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    740 		src = rm->rm_col[c].rc_data;
    741 		dst = rm->rm_col[x].rc_data;
    742 
    743 		if (c == x)
    744 			continue;
    745 
    746 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    747 		count = MIN(ccount, xcount);
    748 
    749 		for (i = 0; i < count; i++, dst++, src++) {
    750 			*dst ^= *src;
    751 		}
    752 	}
    753 
    754 	return (1 << VDEV_RAIDZ_P);
    755 }
    756 
    757 static int
    758 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
    759 {
    760 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    761 	uint8_t *b;
    762 	int x = tgts[0];
    763 	int c, j, exp;
    764 
    765 	ASSERT(ntgts == 1);
    766 
    767 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    768 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    769 
    770 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    771 		src = rm->rm_col[c].rc_data;
    772 		dst = rm->rm_col[x].rc_data;
    773 
    774 		if (c == x)
    775 			ccount = 0;
    776 		else
    777 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    778 
    779 		count = MIN(ccount, xcount);
    780 
    781 		if (c == rm->rm_firstdatacol) {
    782 			for (i = 0; i < count; i++, dst++, src++) {
    783 				*dst = *src;
    784 			}
    785 			for (; i < xcount; i++, dst++) {
    786 				*dst = 0;
    787 			}
    788 
    789 		} else {
    790 			for (i = 0; i < count; i++, dst++, src++) {
    791 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    792 				*dst ^= *src;
    793 			}
    794 
    795 			for (; i < xcount; i++, dst++) {
    796 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    797 			}
    798 		}
    799 	}
    800 
    801 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    802 	dst = rm->rm_col[x].rc_data;
    803 	exp = 255 - (rm->rm_cols - 1 - x);
    804 
    805 	for (i = 0; i < xcount; i++, dst++, src++) {
    806 		*dst ^= *src;
    807 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    808 			*b = vdev_raidz_exp2(*b, exp);
    809 		}
    810 	}
    811 
    812 	return (1 << VDEV_RAIDZ_Q);
    813 }
    814 
    815 static int
    816 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
    817 {
    818 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    819 	void *pdata, *qdata;
    820 	uint64_t xsize, ysize, i;
    821 	int x = tgts[0];
    822 	int y = tgts[1];
    823 
    824 	ASSERT(ntgts == 2);
    825 	ASSERT(x < y);
    826 	ASSERT(x >= rm->rm_firstdatacol);
    827 	ASSERT(y < rm->rm_cols);
    828 
    829 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    830 
    831 	/*
    832 	 * Move the parity data aside -- we're going to compute parity as
    833 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    834 	 * reuse the parity generation mechanism without trashing the actual
    835 	 * parity so we make those columns appear to be full of zeros by
    836 	 * setting their lengths to zero.
    837 	 */
    838 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    839 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    840 	xsize = rm->rm_col[x].rc_size;
    841 	ysize = rm->rm_col[y].rc_size;
    842 
    843 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    844 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    845 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    846 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    847 	rm->rm_col[x].rc_size = 0;
    848 	rm->rm_col[y].rc_size = 0;
    849 
    850 	vdev_raidz_generate_parity_pq(rm);
    851 
    852 	rm->rm_col[x].rc_size = xsize;
    853 	rm->rm_col[y].rc_size = ysize;
    854 
    855 	p = pdata;
    856 	q = qdata;
    857 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    858 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    859 	xd = rm->rm_col[x].rc_data;
    860 	yd = rm->rm_col[y].rc_data;
    861 
    862 	/*
    863 	 * We now have:
    864 	 *	Pxy = P + D_x + D_y
    865 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    866 	 *
    867 	 * We can then solve for D_x:
    868 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    869 	 * where
    870 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    871 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    872 	 *
    873 	 * With D_x in hand, we can easily solve for D_y:
    874 	 *	D_y = P + Pxy + D_x
    875 	 */
    876 
    877 	a = vdev_raidz_pow2[255 + x - y];
    878 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    879 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    880 
    881 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    882 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    883 
    884 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    885 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    886 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    887 
    888 		if (i < ysize)
    889 			*yd = *p ^ *pxy ^ *xd;
    890 	}
    891 
    892 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    893 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    894 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    895 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    896 
    897 	/*
    898 	 * Restore the saved parity data.
    899 	 */
    900 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    901 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    902 
    903 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
    904 }
    905 
    906 /* BEGIN CSTYLED */
    907 /*
    908  * In the general case of reconstruction, we must solve the system of linear
    909  * equations defined by the coeffecients used to generate parity as well as
    910  * the contents of the data and parity disks. This can be expressed with
    911  * vectors for the original data (D) and the actual data (d) and parity (p)
    912  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
    913  *
    914  *            __   __                     __     __
    915  *            |     |         __     __   |  p_0  |
    916  *            |  V  |         |  D_0  |   | p_m-1 |
    917  *            |     |    x    |   :   | = |  d_0  |
    918  *            |  I  |         | D_n-1 |   |   :   |
    919  *            |     |         ~~     ~~   | d_n-1 |
    920  *            ~~   ~~                     ~~     ~~
    921  *
    922  * I is simply a square identity matrix of size n, and V is a vandermonde
    923  * matrix defined by the coeffecients we chose for the various parity columns
    924  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
    925  * computation as well as linear separability.
    926  *
    927  *      __               __               __     __
    928  *      |   1   ..  1 1 1 |               |  p_0  |
    929  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
    930  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
    931  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
    932  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
    933  *      |   :       : : : |   |   :   |   |  d_2  |
    934  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
    935  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
    936  *      |   0   ..  0 0 1 |               | d_n-1 |
    937  *      ~~               ~~               ~~     ~~
    938  *
    939  * Note that I, V, d, and p are known. To compute D, we must invert the
    940  * matrix and use the known data and parity values to reconstruct the unknown
    941  * data values. We begin by removing the rows in V|I and d|p that correspond
    942  * to failed or missing columns; we then make V|I square (n x n) and d|p
    943  * sized n by removing rows corresponding to unused parity from the bottom up
    944  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
    945  * using Gauss-Jordan elimination. In the example below we use m=3 parity
    946  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
    947  *           __                               __
    948  *           |  1   1   1   1   1   1   1   1  |
    949  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
    950  *           |  19 205 116  29  64  16  4   1  |      / /
    951  *           |  1   0   0   0   0   0   0   0  |     / /
    952  *           |  0   1   0   0   0   0   0   0  | <--' /
    953  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
    954  *           |  0   0   0   1   0   0   0   0  |
    955  *           |  0   0   0   0   1   0   0   0  |
    956  *           |  0   0   0   0   0   1   0   0  |
    957  *           |  0   0   0   0   0   0   1   0  |
    958  *           |  0   0   0   0   0   0   0   1  |
    959  *           ~~                               ~~
    960  *           __                               __
    961  *           |  1   1   1   1   1   1   1   1  |
    962  *           | 128  64  32  16  8   4   2   1  |
    963  *           |  19 205 116  29  64  16  4   1  |
    964  *           |  1   0   0   0   0   0   0   0  |
    965  *           |  0   1   0   0   0   0   0   0  |
    966  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
    967  *           |  0   0   0   1   0   0   0   0  |
    968  *           |  0   0   0   0   1   0   0   0  |
    969  *           |  0   0   0   0   0   1   0   0  |
    970  *           |  0   0   0   0   0   0   1   0  |
    971  *           |  0   0   0   0   0   0   0   1  |
    972  *           ~~                               ~~
    973  *
    974  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
    975  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
    976  * matrix is not singular.
    977  * __                                                                 __
    978  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
    979  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
    980  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    981  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
    982  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
    983  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
    984  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
    985  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
    986  * ~~                                                                 ~~
    987  * __                                                                 __
    988  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    989  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
    990  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
    991  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
    992  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
    993  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
    994  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
    995  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
    996  * ~~                                                                 ~~
    997  * __                                                                 __
    998  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    999  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1000  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
   1001  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1002  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1003  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1004  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1005  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1006  * ~~                                                                 ~~
   1007  * __                                                                 __
   1008  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1009  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1010  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
   1011  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1012  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1013  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1014  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1015  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1016  * ~~                                                                 ~~
   1017  * __                                                                 __
   1018  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1019  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1020  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1021  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1022  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1023  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1024  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1025  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1026  * ~~                                                                 ~~
   1027  * __                                                                 __
   1028  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1029  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
   1030  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1031  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1032  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1033  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1034  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1035  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1036  * ~~                                                                 ~~
   1037  *                   __                               __
   1038  *                   |  0   0   1   0   0   0   0   0  |
   1039  *                   | 167 100  5   41 159 169 217 208 |
   1040  *                   | 166 100  4   40 158 168 216 209 |
   1041  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
   1042  *                   |  0   0   0   0   1   0   0   0  |
   1043  *                   |  0   0   0   0   0   1   0   0  |
   1044  *                   |  0   0   0   0   0   0   1   0  |
   1045  *                   |  0   0   0   0   0   0   0   1  |
   1046  *                   ~~                               ~~
   1047  *
   1048  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
   1049  * of the missing data.
   1050  *
   1051  * As is apparent from the example above, the only non-trivial rows in the
   1052  * inverse matrix correspond to the data disks that we're trying to
   1053  * reconstruct. Indeed, those are the only rows we need as the others would
   1054  * only be useful for reconstructing data known or assumed to be valid. For
   1055  * that reason, we only build the coefficients in the rows that correspond to
   1056  * targeted columns.
   1057  */
   1058 /* END CSTYLED */
   1059 
   1060 static void
   1061 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
   1062     uint8_t **rows)
   1063 {
   1064 	int i, j;
   1065 	int pow;
   1066 
   1067 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
   1068 
   1069 	/*
   1070 	 * Fill in the missing rows of interest.
   1071 	 */
   1072 	for (i = 0; i < nmap; i++) {
   1073 		ASSERT3S(0, <=, map[i]);
   1074 		ASSERT3S(map[i], <=, 2);
   1075 
   1076 		pow = map[i] * n;
   1077 		if (pow > 255)
   1078 			pow -= 255;
   1079 		ASSERT(pow <= 255);
   1080 
   1081 		for (j = 0; j < n; j++) {
   1082 			pow -= map[i];
   1083 			if (pow < 0)
   1084 				pow += 255;
   1085 			rows[i][j] = vdev_raidz_pow2[pow];
   1086 		}
   1087 	}
   1088 }
   1089 
   1090 static void
   1091 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
   1092     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
   1093 {
   1094 	int i, j, ii, jj;
   1095 	uint8_t log;
   1096 
   1097 	/*
   1098 	 * Assert that the first nmissing entries from the array of used
   1099 	 * columns correspond to parity columns and that subsequent entries
   1100 	 * correspond to data columns.
   1101 	 */
   1102 	for (i = 0; i < nmissing; i++) {
   1103 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
   1104 	}
   1105 	for (; i < n; i++) {
   1106 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
   1107 	}
   1108 
   1109 	/*
   1110 	 * First initialize the storage where we'll compute the inverse rows.
   1111 	 */
   1112 	for (i = 0; i < nmissing; i++) {
   1113 		for (j = 0; j < n; j++) {
   1114 			invrows[i][j] = (i == j) ? 1 : 0;
   1115 		}
   1116 	}
   1117 
   1118 	/*
   1119 	 * Subtract all trivial rows from the rows of consequence.
   1120 	 */
   1121 	for (i = 0; i < nmissing; i++) {
   1122 		for (j = nmissing; j < n; j++) {
   1123 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
   1124 			jj = used[j] - rm->rm_firstdatacol;
   1125 			ASSERT3S(jj, <, n);
   1126 			invrows[i][j] = rows[i][jj];
   1127 			rows[i][jj] = 0;
   1128 		}
   1129 	}
   1130 
   1131 	/*
   1132 	 * For each of the rows of interest, we must normalize it and subtract
   1133 	 * a multiple of it from the other rows.
   1134 	 */
   1135 	for (i = 0; i < nmissing; i++) {
   1136 		for (j = 0; j < missing[i]; j++) {
   1137 			ASSERT3U(rows[i][j], ==, 0);
   1138 		}
   1139 		ASSERT3U(rows[i][missing[i]], !=, 0);
   1140 
   1141 		/*
   1142 		 * Compute the inverse of the first element and multiply each
   1143 		 * element in the row by that value.
   1144 		 */
   1145 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
   1146 
   1147 		for (j = 0; j < n; j++) {
   1148 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
   1149 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
   1150 		}
   1151 
   1152 		for (ii = 0; ii < nmissing; ii++) {
   1153 			if (i == ii)
   1154 				continue;
   1155 
   1156 			ASSERT3U(rows[ii][missing[i]], !=, 0);
   1157 
   1158 			log = vdev_raidz_log2[rows[ii][missing[i]]];
   1159 
   1160 			for (j = 0; j < n; j++) {
   1161 				rows[ii][j] ^=
   1162 				    vdev_raidz_exp2(rows[i][j], log);
   1163 				invrows[ii][j] ^=
   1164 				    vdev_raidz_exp2(invrows[i][j], log);
   1165 			}
   1166 		}
   1167 	}
   1168 
   1169 	/*
   1170 	 * Verify that the data that is left in the rows are properly part of
   1171 	 * an identity matrix.
   1172 	 */
   1173 	for (i = 0; i < nmissing; i++) {
   1174 		for (j = 0; j < n; j++) {
   1175 			if (j == missing[i]) {
   1176 				ASSERT3U(rows[i][j], ==, 1);
   1177 			} else {
   1178 				ASSERT3U(rows[i][j], ==, 0);
   1179 			}
   1180 		}
   1181 	}
   1182 }
   1183 
   1184 static void
   1185 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
   1186     int *missing, uint8_t **invrows, const uint8_t *used)
   1187 {
   1188 	int i, j, x, cc, c;
   1189 	uint8_t *src;
   1190 	uint64_t ccount;
   1191 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
   1192 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
   1193 	uint8_t log, val;
   1194 	int ll;
   1195 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
   1196 	uint8_t *p, *pp;
   1197 	size_t psize;
   1198 
   1199 	psize = sizeof (invlog[0][0]) * n * nmissing;
   1200 	p = kmem_alloc(psize, KM_SLEEP);
   1201 
   1202 	for (pp = p, i = 0; i < nmissing; i++) {
   1203 		invlog[i] = pp;
   1204 		pp += n;
   1205 	}
   1206 
   1207 	for (i = 0; i < nmissing; i++) {
   1208 		for (j = 0; j < n; j++) {
   1209 			ASSERT3U(invrows[i][j], !=, 0);
   1210 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
   1211 		}
   1212 	}
   1213 
   1214 	for (i = 0; i < n; i++) {
   1215 		c = used[i];
   1216 		ASSERT3U(c, <, rm->rm_cols);
   1217 
   1218 		src = rm->rm_col[c].rc_data;
   1219 		ccount = rm->rm_col[c].rc_size;
   1220 		for (j = 0; j < nmissing; j++) {
   1221 			cc = missing[j] + rm->rm_firstdatacol;
   1222 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
   1223 			ASSERT3U(cc, <, rm->rm_cols);
   1224 			ASSERT3U(cc, !=, c);
   1225 
   1226 			dst[j] = rm->rm_col[cc].rc_data;
   1227 			dcount[j] = rm->rm_col[cc].rc_size;
   1228 		}
   1229 
   1230 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
   1231 
   1232 		for (x = 0; x < ccount; x++, src++) {
   1233 			if (*src != 0)
   1234 				log = vdev_raidz_log2[*src];
   1235 
   1236 			for (cc = 0; cc < nmissing; cc++) {
   1237 				if (x >= dcount[cc])
   1238 					continue;
   1239 
   1240 				if (*src == 0) {
   1241 					val = 0;
   1242 				} else {
   1243 					if ((ll = log + invlog[cc][i]) >= 255)
   1244 						ll -= 255;
   1245 					val = vdev_raidz_pow2[ll];
   1246 				}
   1247 
   1248 				if (i == 0)
   1249 					dst[cc][x] = val;
   1250 				else
   1251 					dst[cc][x] ^= val;
   1252 			}
   1253 		}
   1254 	}
   1255 
   1256 	kmem_free(p, psize);
   1257 }
   1258 
   1259 static int
   1260 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
   1261 {
   1262 	int n, i, c, t, tt;
   1263 	int nmissing_rows;
   1264 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
   1265 	int parity_map[VDEV_RAIDZ_MAXPARITY];
   1266 
   1267 	uint8_t *p, *pp;
   1268 	size_t psize;
   1269 
   1270 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
   1271 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
   1272 	uint8_t *used;
   1273 
   1274 	int code = 0;
   1275 
   1276 
   1277 	n = rm->rm_cols - rm->rm_firstdatacol;
   1278 
   1279 	/*
   1280 	 * Figure out which data columns are missing.
   1281 	 */
   1282 	nmissing_rows = 0;
   1283 	for (t = 0; t < ntgts; t++) {
   1284 		if (tgts[t] >= rm->rm_firstdatacol) {
   1285 			missing_rows[nmissing_rows++] =
   1286 			    tgts[t] - rm->rm_firstdatacol;
   1287 		}
   1288 	}
   1289 
   1290 	/*
   1291 	 * Figure out which parity columns to use to help generate the missing
   1292 	 * data columns.
   1293 	 */
   1294 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
   1295 		ASSERT(tt < ntgts);
   1296 		ASSERT(c < rm->rm_firstdatacol);
   1297 
   1298 		/*
   1299 		 * Skip any targeted parity columns.
   1300 		 */
   1301 		if (c == tgts[tt]) {
   1302 			tt++;
   1303 			continue;
   1304 		}
   1305 
   1306 		code |= 1 << c;
   1307 
   1308 		parity_map[i] = c;
   1309 		i++;
   1310 	}
   1311 
   1312 	ASSERT(code != 0);
   1313 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
   1314 
   1315 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
   1316 	    nmissing_rows * n + sizeof (used[0]) * n;
   1317 	p = kmem_alloc(psize, KM_SLEEP);
   1318 
   1319 	for (pp = p, i = 0; i < nmissing_rows; i++) {
   1320 		rows[i] = pp;
   1321 		pp += n;
   1322 		invrows[i] = pp;
   1323 		pp += n;
   1324 	}
   1325 	used = pp;
   1326 
   1327 	for (i = 0; i < nmissing_rows; i++) {
   1328 		used[i] = parity_map[i];
   1329 	}
   1330 
   1331 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1332 		if (tt < nmissing_rows &&
   1333 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
   1334 			tt++;
   1335 			continue;
   1336 		}
   1337 
   1338 		ASSERT3S(i, <, n);
   1339 		used[i] = c;
   1340 		i++;
   1341 	}
   1342 
   1343 	/*
   1344 	 * Initialize the interesting rows of the matrix.
   1345 	 */
   1346 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
   1347 
   1348 	/*
   1349 	 * Invert the matrix.
   1350 	 */
   1351 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
   1352 	    invrows, used);
   1353 
   1354 	/*
   1355 	 * Reconstruct the missing data using the generated matrix.
   1356 	 */
   1357 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
   1358 	    invrows, used);
   1359 
   1360 	kmem_free(p, psize);
   1361 
   1362 	return (code);
   1363 }
   1364 
   1365 static int
   1366 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
   1367 {
   1368 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
   1369 	int ntgts;
   1370 	int i, c;
   1371 	int code;
   1372 	int nbadparity, nbaddata;
   1373 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
   1374 
   1375 	/*
   1376 	 * The tgts list must already be sorted.
   1377 	 */
   1378 	for (i = 1; i < nt; i++) {
   1379 		ASSERT(t[i] > t[i - 1]);
   1380 	}
   1381 
   1382 	nbadparity = rm->rm_firstdatacol;
   1383 	nbaddata = rm->rm_cols - nbadparity;
   1384 	ntgts = 0;
   1385 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
   1386 		if (c < rm->rm_firstdatacol)
   1387 			parity_valid[c] = B_FALSE;
   1388 
   1389 		if (i < nt && c == t[i]) {
   1390 			tgts[ntgts++] = c;
   1391 			i++;
   1392 		} else if (rm->rm_col[c].rc_error != 0) {
   1393 			tgts[ntgts++] = c;
   1394 		} else if (c >= rm->rm_firstdatacol) {
   1395 			nbaddata--;
   1396 		} else {
   1397 			parity_valid[c] = B_TRUE;
   1398 			nbadparity--;
   1399 		}
   1400 	}
   1401 
   1402 	ASSERT(ntgts >= nt);
   1403 	ASSERT(nbaddata >= 0);
   1404 	ASSERT(nbaddata + nbadparity == ntgts);
   1405 
   1406 	dt = &tgts[nbadparity];
   1407 
   1408 	/*
   1409 	 * See if we can use any of our optimized reconstruction routines.
   1410 	 */
   1411 	if (!vdev_raidz_default_to_general) {
   1412 		switch (nbaddata) {
   1413 		case 1:
   1414 			if (parity_valid[VDEV_RAIDZ_P])
   1415 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
   1416 
   1417 			ASSERT(rm->rm_firstdatacol > 1);
   1418 
   1419 			if (parity_valid[VDEV_RAIDZ_Q])
   1420 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
   1421 
   1422 			ASSERT(rm->rm_firstdatacol > 2);
   1423 			break;
   1424 
   1425 		case 2:
   1426 			ASSERT(rm->rm_firstdatacol > 1);
   1427 
   1428 			if (parity_valid[VDEV_RAIDZ_P] &&
   1429 			    parity_valid[VDEV_RAIDZ_Q])
   1430 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
   1431 
   1432 			ASSERT(rm->rm_firstdatacol > 2);
   1433 
   1434 			break;
   1435 		}
   1436 	}
   1437 
   1438 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
   1439 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
   1440 	ASSERT(code > 0);
   1441 	return (code);
   1442 }
   1443 
   1444 static int
   1445 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
   1446 {
   1447 	vdev_t *cvd;
   1448 	uint64_t nparity = vd->vdev_nparity;
   1449 	int c;
   1450 	int lasterror = 0;
   1451 	int numerrors = 0;
   1452 
   1453 	ASSERT(nparity > 0);
   1454 
   1455 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
   1456 	    vd->vdev_children < nparity + 1) {
   1457 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
   1458 		return (EINVAL);
   1459 	}
   1460 
   1461 	vdev_open_children(vd);
   1462 
   1463 	for (c = 0; c < vd->vdev_children; c++) {
   1464 		cvd = vd->vdev_child[c];
   1465 
   1466 		if (cvd->vdev_open_error != 0) {
   1467 			lasterror = cvd->vdev_open_error;
   1468 			numerrors++;
   1469 			continue;
   1470 		}
   1471 
   1472 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
   1473 		*ashift = MAX(*ashift, cvd->vdev_ashift);
   1474 	}
   1475 
   1476 	*asize *= vd->vdev_children;
   1477 
   1478 	if (numerrors > nparity) {
   1479 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
   1480 		return (lasterror);
   1481 	}
   1482 
   1483 	return (0);
   1484 }
   1485 
   1486 static void
   1487 vdev_raidz_close(vdev_t *vd)
   1488 {
   1489 	int c;
   1490 
   1491 	for (c = 0; c < vd->vdev_children; c++)
   1492 		vdev_close(vd->vdev_child[c]);
   1493 }
   1494 
   1495 static uint64_t
   1496 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
   1497 {
   1498 	uint64_t asize;
   1499 	uint64_t ashift = vd->vdev_top->vdev_ashift;
   1500 	uint64_t cols = vd->vdev_children;
   1501 	uint64_t nparity = vd->vdev_nparity;
   1502 
   1503 	asize = ((psize - 1) >> ashift) + 1;
   1504 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
   1505 	asize = roundup(asize, nparity + 1) << ashift;
   1506 
   1507 	return (asize);
   1508 }
   1509 
   1510 static void
   1511 vdev_raidz_child_done(zio_t *zio)
   1512 {
   1513 	raidz_col_t *rc = zio->io_private;
   1514 
   1515 	rc->rc_error = zio->io_error;
   1516 	rc->rc_tried = 1;
   1517 	rc->rc_skipped = 0;
   1518 }
   1519 
   1520 static int
   1521 vdev_raidz_io_start(zio_t *zio)
   1522 {
   1523 	vdev_t *vd = zio->io_vd;
   1524 	vdev_t *tvd = vd->vdev_top;
   1525 	vdev_t *cvd;
   1526 	raidz_map_t *rm;
   1527 	raidz_col_t *rc;
   1528 	int c, i;
   1529 
   1530 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
   1531 	    vd->vdev_nparity);
   1532 
   1533 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
   1534 
   1535 	if (zio->io_type == ZIO_TYPE_WRITE) {
   1536 		vdev_raidz_generate_parity(rm);
   1537 
   1538 		for (c = 0; c < rm->rm_cols; c++) {
   1539 			rc = &rm->rm_col[c];
   1540 			cvd = vd->vdev_child[rc->rc_devidx];
   1541 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1542 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1543 			    zio->io_type, zio->io_priority, 0,
   1544 			    vdev_raidz_child_done, rc));
   1545 		}
   1546 
   1547 		/*
   1548 		 * Generate optional I/Os for any skipped sectors to improve
   1549 		 * aggregation contiguity.
   1550 		 */
   1551 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
   1552 			ASSERT(c <= rm->rm_scols);
   1553 			if (c == rm->rm_scols)
   1554 				c = 0;
   1555 			rc = &rm->rm_col[c];
   1556 			cvd = vd->vdev_child[rc->rc_devidx];
   1557 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1558 			    rc->rc_offset + rc->rc_size, NULL,
   1559 			    1 << tvd->vdev_ashift,
   1560 			    zio->io_type, zio->io_priority,
   1561 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
   1562 		}
   1563 
   1564 		return (ZIO_PIPELINE_CONTINUE);
   1565 	}
   1566 
   1567 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   1568 
   1569 	/*
   1570 	 * Iterate over the columns in reverse order so that we hit the parity
   1571 	 * last -- any errors along the way will force us to read the parity.
   1572 	 */
   1573 	for (c = rm->rm_cols - 1; c >= 0; c--) {
   1574 		rc = &rm->rm_col[c];
   1575 		cvd = vd->vdev_child[rc->rc_devidx];
   1576 		if (!vdev_readable(cvd)) {
   1577 			if (c >= rm->rm_firstdatacol)
   1578 				rm->rm_missingdata++;
   1579 			else
   1580 				rm->rm_missingparity++;
   1581 			rc->rc_error = ENXIO;
   1582 			rc->rc_tried = 1;	/* don't even try */
   1583 			rc->rc_skipped = 1;
   1584 			continue;
   1585 		}
   1586 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
   1587 			if (c >= rm->rm_firstdatacol)
   1588 				rm->rm_missingdata++;
   1589 			else
   1590 				rm->rm_missingparity++;
   1591 			rc->rc_error = ESTALE;
   1592 			rc->rc_skipped = 1;
   1593 			continue;
   1594 		}
   1595 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
   1596 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
   1597 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1598 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1599 			    zio->io_type, zio->io_priority, 0,
   1600 			    vdev_raidz_child_done, rc));
   1601 		}
   1602 	}
   1603 
   1604 	return (ZIO_PIPELINE_CONTINUE);
   1605 }
   1606 
   1607 /*
   1608  * Report a checksum error for a child of a RAID-Z device.
   1609  */
   1610 static void
   1611 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
   1612 {
   1613 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
   1614 
   1615 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   1616 		zio_bad_cksum_t zbc;
   1617 		raidz_map_t *rm = zio->io_vsd;
   1618 
   1619 		mutex_enter(&vd->vdev_stat_lock);
   1620 		vd->vdev_stat.vs_checksum_errors++;
   1621 		mutex_exit(&vd->vdev_stat_lock);
   1622 
   1623 		zbc.zbc_has_cksum = 0;
   1624 		zbc.zbc_injected = rm->rm_ecksuminjected;
   1625 
   1626 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
   1627 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
   1628 		    &zbc);
   1629 	}
   1630 }
   1631 
   1632 /*
   1633  * We keep track of whether or not there were any injected errors, so that
   1634  * any ereports we generate can note it.
   1635  */
   1636 static int
   1637 raidz_checksum_verify(zio_t *zio)
   1638 {
   1639 	zio_bad_cksum_t zbc;
   1640 	raidz_map_t *rm = zio->io_vsd;
   1641 
   1642 	int ret = zio_checksum_error(zio, &zbc);
   1643 	if (ret != 0 && zbc.zbc_injected != 0)
   1644 		rm->rm_ecksuminjected = 1;
   1645 
   1646 	return (ret);
   1647 }
   1648 
   1649 /*
   1650  * Generate the parity from the data columns. If we tried and were able to
   1651  * read the parity without error, verify that the generated parity matches the
   1652  * data we read. If it doesn't, we fire off a checksum error. Return the
   1653  * number such failures.
   1654  */
   1655 static int
   1656 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
   1657 {
   1658 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1659 	int c, ret = 0;
   1660 	raidz_col_t *rc;
   1661 
   1662 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1663 		rc = &rm->rm_col[c];
   1664 		if (!rc->rc_tried || rc->rc_error != 0)
   1665 			continue;
   1666 		orig[c] = zio_buf_alloc(rc->rc_size);
   1667 		bcopy(rc->rc_data, orig[c], rc->rc_size);
   1668 	}
   1669 
   1670 	vdev_raidz_generate_parity(rm);
   1671 
   1672 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1673 		rc = &rm->rm_col[c];
   1674 		if (!rc->rc_tried || rc->rc_error != 0)
   1675 			continue;
   1676 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
   1677 			raidz_checksum_error(zio, rc, orig[c]);
   1678 			rc->rc_error = ECKSUM;
   1679 			ret++;
   1680 		}
   1681 		zio_buf_free(orig[c], rc->rc_size);
   1682 	}
   1683 
   1684 	return (ret);
   1685 }
   1686 
   1687 /*
   1688  * Keep statistics on all the ways that we used parity to correct data.
   1689  */
   1690 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
   1691 
   1692 static int
   1693 vdev_raidz_worst_error(raidz_map_t *rm)
   1694 {
   1695 	int error = 0;
   1696 
   1697 	for (int c = 0; c < rm->rm_cols; c++)
   1698 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
   1699 
   1700 	return (error);
   1701 }
   1702 
   1703 /*
   1704  * Iterate over all combinations of bad data and attempt a reconstruction.
   1705  * Note that the algorithm below is non-optimal because it doesn't take into
   1706  * account how reconstruction is actually performed. For example, with
   1707  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
   1708  * is targeted as invalid as if columns 1 and 4 are targeted since in both
   1709  * cases we'd only use parity information in column 0.
   1710  */
   1711 static int
   1712 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
   1713 {
   1714 	raidz_map_t *rm = zio->io_vsd;
   1715 	raidz_col_t *rc;
   1716 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1717 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
   1718 	int *tgts = &tstore[1];
   1719 	int current, next, i, c, n;
   1720 	int code, ret = 0;
   1721 
   1722 	ASSERT(total_errors < rm->rm_firstdatacol);
   1723 
   1724 	/*
   1725 	 * This simplifies one edge condition.
   1726 	 */
   1727 	tgts[-1] = -1;
   1728 
   1729 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
   1730 		/*
   1731 		 * Initialize the targets array by finding the first n columns
   1732 		 * that contain no error.
   1733 		 *
   1734 		 * If there were no data errors, we need to ensure that we're
   1735 		 * always explicitly attempting to reconstruct at least one
   1736 		 * data column. To do this, we simply push the highest target
   1737 		 * up into the data columns.
   1738 		 */
   1739 		for (c = 0, i = 0; i < n; i++) {
   1740 			if (i == n - 1 && data_errors == 0 &&
   1741 			    c < rm->rm_firstdatacol) {
   1742 				c = rm->rm_firstdatacol;
   1743 			}
   1744 
   1745 			while (rm->rm_col[c].rc_error != 0) {
   1746 				c++;
   1747 				ASSERT3S(c, <, rm->rm_cols);
   1748 			}
   1749 
   1750 			tgts[i] = c++;
   1751 		}
   1752 
   1753 		/*
   1754 		 * Setting tgts[n] simplifies the other edge condition.
   1755 		 */
   1756 		tgts[n] = rm->rm_cols;
   1757 
   1758 		/*
   1759 		 * These buffers were allocated in previous iterations.
   1760 		 */
   1761 		for (i = 0; i < n - 1; i++) {
   1762 			ASSERT(orig[i] != NULL);
   1763 		}
   1764 
   1765 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
   1766 
   1767 		current = 0;
   1768 		next = tgts[current];
   1769 
   1770 		while (current != n) {
   1771 			tgts[current] = next;
   1772 			current = 0;
   1773 
   1774 			/*
   1775 			 * Save off the original data that we're going to
   1776 			 * attempt to reconstruct.
   1777 			 */
   1778 			for (i = 0; i < n; i++) {
   1779 				ASSERT(orig[i] != NULL);
   1780 				c = tgts[i];
   1781 				ASSERT3S(c, >=, 0);
   1782 				ASSERT3S(c, <, rm->rm_cols);
   1783 				rc = &rm->rm_col[c];
   1784 				bcopy(rc->rc_data, orig[i], rc->rc_size);
   1785 			}
   1786 
   1787 			/*
   1788 			 * Attempt a reconstruction and exit the outer loop on
   1789 			 * success.
   1790 			 */
   1791 			code = vdev_raidz_reconstruct(rm, tgts, n);
   1792 			if (raidz_checksum_verify(zio) == 0) {
   1793 				atomic_inc_64(&raidz_corrected[code]);
   1794 
   1795 				for (i = 0; i < n; i++) {
   1796 					c = tgts[i];
   1797 					rc = &rm->rm_col[c];
   1798 					ASSERT(rc->rc_error == 0);
   1799 					if (rc->rc_tried)
   1800 						raidz_checksum_error(zio, rc,
   1801 						    orig[i]);
   1802 					rc->rc_error = ECKSUM;
   1803 				}
   1804 
   1805 				ret = code;
   1806 				goto done;
   1807 			}
   1808 
   1809 			/*
   1810 			 * Restore the original data.
   1811 			 */
   1812 			for (i = 0; i < n; i++) {
   1813 				c = tgts[i];
   1814 				rc = &rm->rm_col[c];
   1815 				bcopy(orig[i], rc->rc_data, rc->rc_size);
   1816 			}
   1817 
   1818 			do {
   1819 				/*
   1820 				 * Find the next valid column after the current
   1821 				 * position..
   1822 				 */
   1823 				for (next = tgts[current] + 1;
   1824 				    next < rm->rm_cols &&
   1825 				    rm->rm_col[next].rc_error != 0; next++)
   1826 					continue;
   1827 
   1828 				ASSERT(next <= tgts[current + 1]);
   1829 
   1830 				/*
   1831 				 * If that spot is available, we're done here.
   1832 				 */
   1833 				if (next != tgts[current + 1])
   1834 					break;
   1835 
   1836 				/*
   1837 				 * Otherwise, find the next valid column after
   1838 				 * the previous position.
   1839 				 */
   1840 				for (c = tgts[current - 1] + 1;
   1841 				    rm->rm_col[c].rc_error != 0; c++)
   1842 					continue;
   1843 
   1844 				tgts[current] = c;
   1845 				current++;
   1846 
   1847 			} while (current != n);
   1848 		}
   1849 	}
   1850 	n--;
   1851 done:
   1852 	for (i = 0; i < n; i++) {
   1853 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
   1854 	}
   1855 
   1856 	return (ret);
   1857 }
   1858 
   1859 static void
   1860 vdev_raidz_io_done(zio_t *zio)
   1861 {
   1862 	vdev_t *vd = zio->io_vd;
   1863 	vdev_t *cvd;
   1864 	raidz_map_t *rm = zio->io_vsd;
   1865 	raidz_col_t *rc;
   1866 	int unexpected_errors = 0;
   1867 	int parity_errors = 0;
   1868 	int parity_untried = 0;
   1869 	int data_errors = 0;
   1870 	int total_errors = 0;
   1871 	int n, c;
   1872 	int tgts[VDEV_RAIDZ_MAXPARITY];
   1873 	int code;
   1874 
   1875 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
   1876 
   1877 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
   1878 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
   1879 
   1880 	for (c = 0; c < rm->rm_cols; c++) {
   1881 		rc = &rm->rm_col[c];
   1882 
   1883 		if (rc->rc_error) {
   1884 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
   1885 
   1886 			if (c < rm->rm_firstdatacol)
   1887 				parity_errors++;
   1888 			else
   1889 				data_errors++;
   1890 
   1891 			if (!rc->rc_skipped)
   1892 				unexpected_errors++;
   1893 
   1894 			total_errors++;
   1895 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
   1896 			parity_untried++;
   1897 		}
   1898 	}
   1899 
   1900 	if (zio->io_type == ZIO_TYPE_WRITE) {
   1901 		/*
   1902 		 * XXX -- for now, treat partial writes as a success.
   1903 		 * (If we couldn't write enough columns to reconstruct
   1904 		 * the data, the I/O failed.  Otherwise, good enough.)
   1905 		 *
   1906 		 * Now that we support write reallocation, it would be better
   1907 		 * to treat partial failure as real failure unless there are
   1908 		 * no non-degraded top-level vdevs left, and not update DTLs
   1909 		 * if we intend to reallocate.
   1910 		 */
   1911 		/* XXPOLICY */
   1912 		if (total_errors > rm->rm_firstdatacol)
   1913 			zio->io_error = vdev_raidz_worst_error(rm);
   1914 
   1915 		return;
   1916 	}
   1917 
   1918 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   1919 	/*
   1920 	 * There are three potential phases for a read:
   1921 	 *	1. produce valid data from the columns read
   1922 	 *	2. read all disks and try again
   1923 	 *	3. perform combinatorial reconstruction
   1924 	 *
   1925 	 * Each phase is progressively both more expensive and less likely to
   1926 	 * occur. If we encounter more errors than we can repair or all phases
   1927 	 * fail, we have no choice but to return an error.
   1928 	 */
   1929 
   1930 	/*
   1931 	 * If the number of errors we saw was correctable -- less than or equal
   1932 	 * to the number of parity disks read -- attempt to produce data that
   1933 	 * has a valid checksum. Naturally, this case applies in the absence of
   1934 	 * any errors.
   1935 	 */
   1936 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
   1937 		if (data_errors == 0) {
   1938 			if (raidz_checksum_verify(zio) == 0) {
   1939 				/*
   1940 				 * If we read parity information (unnecessarily
   1941 				 * as it happens since no reconstruction was
   1942 				 * needed) regenerate and verify the parity.
   1943 				 * We also regenerate parity when resilvering
   1944 				 * so we can write it out to the failed device
   1945 				 * later.
   1946 				 */
   1947 				if (parity_errors + parity_untried <
   1948 				    rm->rm_firstdatacol ||
   1949 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   1950 					n = raidz_parity_verify(zio, rm);
   1951 					unexpected_errors += n;
   1952 					ASSERT(parity_errors + n <=
   1953 					    rm->rm_firstdatacol);
   1954 				}
   1955 				goto done;
   1956 			}
   1957 		} else {
   1958 			/*
   1959 			 * We either attempt to read all the parity columns or
   1960 			 * none of them. If we didn't try to read parity, we
   1961 			 * wouldn't be here in the correctable case. There must
   1962 			 * also have been fewer parity errors than parity
   1963 			 * columns or, again, we wouldn't be in this code path.
   1964 			 */
   1965 			ASSERT(parity_untried == 0);
   1966 			ASSERT(parity_errors < rm->rm_firstdatacol);
   1967 
   1968 			/*
   1969 			 * Identify the data columns that reported an error.
   1970 			 */
   1971 			n = 0;
   1972 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1973 				rc = &rm->rm_col[c];
   1974 				if (rc->rc_error != 0) {
   1975 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
   1976 					tgts[n++] = c;
   1977 				}
   1978 			}
   1979 
   1980 			ASSERT(rm->rm_firstdatacol >= n);
   1981 
   1982 			code = vdev_raidz_reconstruct(rm, tgts, n);
   1983 
   1984 			if (raidz_checksum_verify(zio) == 0) {
   1985 				atomic_inc_64(&raidz_corrected[code]);
   1986 
   1987 				/*
   1988 				 * If we read more parity disks than were used
   1989 				 * for reconstruction, confirm that the other
   1990 				 * parity disks produced correct data. This
   1991 				 * routine is suboptimal in that it regenerates
   1992 				 * the parity that we already used in addition
   1993 				 * to the parity that we're attempting to
   1994 				 * verify, but this should be a relatively
   1995 				 * uncommon case, and can be optimized if it
   1996 				 * becomes a problem. Note that we regenerate
   1997 				 * parity when resilvering so we can write it
   1998 				 * out to failed devices later.
   1999 				 */
   2000 				if (parity_errors < rm->rm_firstdatacol - n ||
   2001 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   2002 					n = raidz_parity_verify(zio, rm);
   2003 					unexpected_errors += n;
   2004 					ASSERT(parity_errors + n <=
   2005 					    rm->rm_firstdatacol);
   2006 				}
   2007 
   2008 				goto done;
   2009 			}
   2010 		}
   2011 	}
   2012 
   2013 	/*
   2014 	 * This isn't a typical situation -- either we got a read error or
   2015 	 * a child silently returned bad data. Read every block so we can
   2016 	 * try again with as much data and parity as we can track down. If
   2017 	 * we've already been through once before, all children will be marked
   2018 	 * as tried so we'll proceed to combinatorial reconstruction.
   2019 	 */
   2020 	unexpected_errors = 1;
   2021 	rm->rm_missingdata = 0;
   2022 	rm->rm_missingparity = 0;
   2023 
   2024 	for (c = 0; c < rm->rm_cols; c++) {
   2025 		if (rm->rm_col[c].rc_tried)
   2026 			continue;
   2027 
   2028 		zio_vdev_io_redone(zio);
   2029 		do {
   2030 			rc = &rm->rm_col[c];
   2031 			if (rc->rc_tried)
   2032 				continue;
   2033 			zio_nowait(zio_vdev_child_io(zio, NULL,
   2034 			    vd->vdev_child[rc->rc_devidx],
   2035 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2036 			    zio->io_type, zio->io_priority, 0,
   2037 			    vdev_raidz_child_done, rc));
   2038 		} while (++c < rm->rm_cols);
   2039 
   2040 		return;
   2041 	}
   2042 
   2043 	/*
   2044 	 * At this point we've attempted to reconstruct the data given the
   2045 	 * errors we detected, and we've attempted to read all columns. There
   2046 	 * must, therefore, be one or more additional problems -- silent errors
   2047 	 * resulting in invalid data rather than explicit I/O errors resulting
   2048 	 * in absent data. We check if there is enough additional data to
   2049 	 * possibly reconstruct the data and then perform combinatorial
   2050 	 * reconstruction over all possible combinations. If that fails,
   2051 	 * we're cooked.
   2052 	 */
   2053 	if (total_errors > rm->rm_firstdatacol) {
   2054 		zio->io_error = vdev_raidz_worst_error(rm);
   2055 
   2056 	} else if (total_errors < rm->rm_firstdatacol &&
   2057 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
   2058 		/*
   2059 		 * If we didn't use all the available parity for the
   2060 		 * combinatorial reconstruction, verify that the remaining
   2061 		 * parity is correct.
   2062 		 */
   2063 		if (code != (1 << rm->rm_firstdatacol) - 1)
   2064 			(void) raidz_parity_verify(zio, rm);
   2065 	} else {
   2066 		/*
   2067 		 * We're here because either:
   2068 		 *
   2069 		 *	total_errors == rm_first_datacol, or
   2070 		 *	vdev_raidz_combrec() failed
   2071 		 *
   2072 		 * In either case, there is enough bad data to prevent
   2073 		 * reconstruction.
   2074 		 *
   2075 		 * Start checksum ereports for all children which haven't
   2076 		 * failed.
   2077 		 */
   2078 		zio->io_error = ECKSUM;
   2079 
   2080 		for (c = 0; c < rm->rm_cols; c++) {
   2081 			rc = &rm->rm_col[c];
   2082 			if (rc->rc_error == 0) {
   2083 				zio_bad_cksum_t zbc;
   2084 				zbc.zbc_has_cksum = 0;
   2085 				zbc.zbc_injected = rm->rm_ecksuminjected;
   2086 
   2087 				zfs_ereport_start_checksum(
   2088 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
   2089 				    zio, rc->rc_offset, rc->rc_size,
   2090 				    (void *)(uintptr_t)c, &zbc);
   2091 			}
   2092 		}
   2093 	}
   2094 
   2095 done:
   2096 	zio_checksum_verified(zio);
   2097 
   2098 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
   2099 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
   2100 		/*
   2101 		 * Use the good data we have in hand to repair damaged children.
   2102 		 */
   2103 		for (c = 0; c < rm->rm_cols; c++) {
   2104 			rc = &rm->rm_col[c];
   2105 			cvd = vd->vdev_child[rc->rc_devidx];
   2106 
   2107 			if (rc->rc_error == 0)
   2108 				continue;
   2109 
   2110 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   2111 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2112 			    ZIO_TYPE_WRITE, zio->io_priority,
   2113 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
   2114 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
   2115 		}
   2116 	}
   2117 }
   2118 
   2119 static void
   2120 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
   2121 {
   2122 	if (faulted > vd->vdev_nparity)
   2123 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
   2124 		    VDEV_AUX_NO_REPLICAS);
   2125 	else if (degraded + faulted != 0)
   2126 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
   2127 	else
   2128 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
   2129 }
   2130 
   2131 vdev_ops_t vdev_raidz_ops = {
   2132 	vdev_raidz_open,
   2133 	vdev_raidz_close,
   2134 	vdev_raidz_asize,
   2135 	vdev_raidz_io_start,
   2136 	vdev_raidz_io_done,
   2137 	vdev_raidz_state_change,
   2138 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
   2139 	B_FALSE			/* not a leaf vdev */
   2140 };
   2141