Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/zfs_context.h>
     28 #include <sys/spa.h>
     29 #include <sys/vdev_impl.h>
     30 #include <sys/zio.h>
     31 #include <sys/zio_checksum.h>
     32 #include <sys/fs/zfs.h>
     33 #include <sys/fm/fs/zfs.h>
     34 
     35 /*
     36  * Virtual device vector for RAID-Z.
     37  *
     38  * This vdev supports both single and double parity. For single parity, we
     39  * use a simple XOR of all the data columns. For double parity, we use both
     40  * the simple XOR as well as a technique described in "The mathematics of
     41  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
     42  * over the integers expressible in a single byte. Briefly, the operations on
     43  * the field are defined as follows:
     44  *
     45  *   o addition (+) is represented by a bitwise XOR
     46  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     47  *   o multiplication of A by 2 is defined by the following bitwise expression:
     48  *	(A * 2)_7 = A_6
     49  *	(A * 2)_6 = A_5
     50  *	(A * 2)_5 = A_4
     51  *	(A * 2)_4 = A_3 + A_7
     52  *	(A * 2)_3 = A_2 + A_7
     53  *	(A * 2)_2 = A_1 + A_7
     54  *	(A * 2)_1 = A_0
     55  *	(A * 2)_0 = A_7
     56  *
     57  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     58  *
     59  * Observe that any number in the field (except for 0) can be expressed as a
     60  * power of 2 -- a generator for the field. We store a table of the powers of
     61  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     62  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     63  * than field addition). The inverse of a field element A (A^-1) is A^254.
     64  *
     65  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
     66  * can be expressed by field operations:
     67  *
     68  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     69  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     70  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
     71  *
     72  * See the reconstruction code below for how P and Q can be used individually
     73  * or in concert to recover missing data columns.
     74  */
     75 
     76 typedef struct raidz_col {
     77 	uint64_t rc_devidx;		/* child device index for I/O */
     78 	uint64_t rc_offset;		/* device offset */
     79 	uint64_t rc_size;		/* I/O size */
     80 	void *rc_data;			/* I/O data */
     81 	int rc_error;			/* I/O error for this device */
     82 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
     83 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
     84 } raidz_col_t;
     85 
     86 typedef struct raidz_map {
     87 	uint64_t rm_cols;		/* Column count */
     88 	uint64_t rm_bigcols;		/* Number of oversized columns */
     89 	uint64_t rm_asize;		/* Actual total I/O size */
     90 	uint64_t rm_missingdata;	/* Count of missing data devices */
     91 	uint64_t rm_missingparity;	/* Count of missing parity devices */
     92 	uint64_t rm_firstdatacol;	/* First data column/parity count */
     93 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
     94 } raidz_map_t;
     95 
     96 #define	VDEV_RAIDZ_P		0
     97 #define	VDEV_RAIDZ_Q		1
     98 
     99 #define	VDEV_RAIDZ_MAXPARITY	2
    100 
    101 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
    102 
    103 /*
    104  * These two tables represent powers and logs of 2 in the Galois field defined
    105  * above. These values were computed by repeatedly multiplying by 2 as above.
    106  */
    107 static const uint8_t vdev_raidz_pow2[256] = {
    108 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    109 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    110 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    111 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    112 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    113 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    114 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    115 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    116 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    117 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    118 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    119 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    120 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    121 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    122 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    123 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    124 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    125 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    126 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    127 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    128 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    129 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    130 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    131 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    132 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    133 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    134 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    135 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    136 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    137 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    138 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    139 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
    140 };
    141 static const uint8_t vdev_raidz_log2[256] = {
    142 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    143 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    144 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    145 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    146 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    147 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    148 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    149 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    150 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    151 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    152 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    153 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    154 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    155 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    156 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    157 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    158 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    159 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    160 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    161 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    162 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    163 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    164 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    165 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    166 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    167 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    168 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    169 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    170 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    171 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    172 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    173 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
    174 };
    175 
    176 /*
    177  * Multiply a given number by 2 raised to the given power.
    178  */
    179 static uint8_t
    180 vdev_raidz_exp2(uint_t a, int exp)
    181 {
    182 	if (a == 0)
    183 		return (0);
    184 
    185 	ASSERT(exp >= 0);
    186 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    187 
    188 	exp += vdev_raidz_log2[a];
    189 	if (exp > 255)
    190 		exp -= 255;
    191 
    192 	return (vdev_raidz_pow2[exp]);
    193 }
    194 
    195 static void
    196 vdev_raidz_map_free(zio_t *zio)
    197 {
    198 	raidz_map_t *rm = zio->io_vsd;
    199 	int c;
    200 
    201 	for (c = 0; c < rm->rm_firstdatacol; c++)
    202 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
    203 
    204 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
    205 }
    206 
    207 static raidz_map_t *
    208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    209     uint64_t nparity)
    210 {
    211 	raidz_map_t *rm;
    212 	uint64_t b = zio->io_offset >> unit_shift;
    213 	uint64_t s = zio->io_size >> unit_shift;
    214 	uint64_t f = b % dcols;
    215 	uint64_t o = (b / dcols) << unit_shift;
    216 	uint64_t q, r, c, bc, col, acols, coff, devidx;
    217 
    218 	q = s / (dcols - nparity);
    219 	r = s - q * (dcols - nparity);
    220 	bc = (r == 0 ? 0 : r + nparity);
    221 
    222 	acols = (q == 0 ? bc : dcols);
    223 
    224 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
    225 
    226 	rm->rm_cols = acols;
    227 	rm->rm_bigcols = bc;
    228 	rm->rm_asize = 0;
    229 	rm->rm_missingdata = 0;
    230 	rm->rm_missingparity = 0;
    231 	rm->rm_firstdatacol = nparity;
    232 
    233 	for (c = 0; c < acols; c++) {
    234 		col = f + c;
    235 		coff = o;
    236 		if (col >= dcols) {
    237 			col -= dcols;
    238 			coff += 1ULL << unit_shift;
    239 		}
    240 		rm->rm_col[c].rc_devidx = col;
    241 		rm->rm_col[c].rc_offset = coff;
    242 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
    243 		rm->rm_col[c].rc_data = NULL;
    244 		rm->rm_col[c].rc_error = 0;
    245 		rm->rm_col[c].rc_tried = 0;
    246 		rm->rm_col[c].rc_skipped = 0;
    247 		rm->rm_asize += rm->rm_col[c].rc_size;
    248 	}
    249 
    250 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
    251 
    252 	for (c = 0; c < rm->rm_firstdatacol; c++)
    253 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
    254 
    255 	rm->rm_col[c].rc_data = zio->io_data;
    256 
    257 	for (c = c + 1; c < acols; c++)
    258 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
    259 		    rm->rm_col[c - 1].rc_size;
    260 
    261 	/*
    262 	 * If all data stored spans all columns, there's a danger that parity
    263 	 * will always be on the same device and, since parity isn't read
    264 	 * during normal operation, that that device's I/O bandwidth won't be
    265 	 * used effectively. We therefore switch the parity every 1MB.
    266 	 *
    267 	 * ... at least that was, ostensibly, the theory. As a practical
    268 	 * matter unless we juggle the parity between all devices evenly, we
    269 	 * won't see any benefit. Further, occasional writes that aren't a
    270 	 * multiple of the LCM of the number of children and the minimum
    271 	 * stripe width are sufficient to avoid pessimal behavior.
    272 	 * Unfortunately, this decision created an implicit on-disk format
    273 	 * requirement that we need to support for all eternity, but only
    274 	 * for single-parity RAID-Z.
    275 	 */
    276 	ASSERT(rm->rm_cols >= 2);
    277 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    278 
    279 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
    280 		devidx = rm->rm_col[0].rc_devidx;
    281 		o = rm->rm_col[0].rc_offset;
    282 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    283 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    284 		rm->rm_col[1].rc_devidx = devidx;
    285 		rm->rm_col[1].rc_offset = o;
    286 	}
    287 
    288 	zio->io_vsd = rm;
    289 	zio->io_vsd_free = vdev_raidz_map_free;
    290 	return (rm);
    291 }
    292 
    293 static void
    294 vdev_raidz_generate_parity_p(raidz_map_t *rm)
    295 {
    296 	uint64_t *p, *src, pcount, ccount, i;
    297 	int c;
    298 
    299 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    300 
    301 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    302 		src = rm->rm_col[c].rc_data;
    303 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    304 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    305 
    306 		if (c == rm->rm_firstdatacol) {
    307 			ASSERT(ccount == pcount);
    308 			for (i = 0; i < ccount; i++, p++, src++) {
    309 				*p = *src;
    310 			}
    311 		} else {
    312 			ASSERT(ccount <= pcount);
    313 			for (i = 0; i < ccount; i++, p++, src++) {
    314 				*p ^= *src;
    315 			}
    316 		}
    317 	}
    318 }
    319 
    320 static void
    321 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    322 {
    323 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
    324 	int c;
    325 
    326 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    327 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    328 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    329 
    330 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    331 		src = rm->rm_col[c].rc_data;
    332 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    333 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    334 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    335 
    336 		if (c == rm->rm_firstdatacol) {
    337 			ASSERT(ccount == pcount || ccount == 0);
    338 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    339 				*q = *src;
    340 				*p = *src;
    341 			}
    342 			for (; i < pcount; i++, p++, q++, src++) {
    343 				*q = 0;
    344 				*p = 0;
    345 			}
    346 		} else {
    347 			ASSERT(ccount <= pcount);
    348 
    349 			/*
    350 			 * Rather than multiplying each byte individually (as
    351 			 * described above), we are able to handle 8 at once
    352 			 * by generating a mask based on the high bit in each
    353 			 * byte and using that to conditionally XOR in 0x1d.
    354 			 */
    355 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    356 				mask = *q & 0x8080808080808080ULL;
    357 				mask = (mask << 1) - (mask >> 7);
    358 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    359 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    360 				*q ^= *src;
    361 				*p ^= *src;
    362 			}
    363 
    364 			/*
    365 			 * Treat short columns as though they are full of 0s.
    366 			 */
    367 			for (; i < pcount; i++, q++) {
    368 				mask = *q & 0x8080808080808080ULL;
    369 				mask = (mask << 1) - (mask >> 7);
    370 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    371 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    372 			}
    373 		}
    374 	}
    375 }
    376 
    377 static void
    378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
    379 {
    380 	uint64_t *dst, *src, xcount, ccount, count, i;
    381 	int c;
    382 
    383 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    384 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    385 	ASSERT(xcount > 0);
    386 
    387 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    388 	dst = rm->rm_col[x].rc_data;
    389 	for (i = 0; i < xcount; i++, dst++, src++) {
    390 		*dst = *src;
    391 	}
    392 
    393 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    394 		src = rm->rm_col[c].rc_data;
    395 		dst = rm->rm_col[x].rc_data;
    396 
    397 		if (c == x)
    398 			continue;
    399 
    400 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    401 		count = MIN(ccount, xcount);
    402 
    403 		for (i = 0; i < count; i++, dst++, src++) {
    404 			*dst ^= *src;
    405 		}
    406 	}
    407 }
    408 
    409 static void
    410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
    411 {
    412 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    413 	uint8_t *b;
    414 	int c, j, exp;
    415 
    416 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    417 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    418 
    419 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    420 		src = rm->rm_col[c].rc_data;
    421 		dst = rm->rm_col[x].rc_data;
    422 
    423 		if (c == x)
    424 			ccount = 0;
    425 		else
    426 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    427 
    428 		count = MIN(ccount, xcount);
    429 
    430 		if (c == rm->rm_firstdatacol) {
    431 			for (i = 0; i < count; i++, dst++, src++) {
    432 				*dst = *src;
    433 			}
    434 			for (; i < xcount; i++, dst++) {
    435 				*dst = 0;
    436 			}
    437 
    438 		} else {
    439 			/*
    440 			 * For an explanation of this, see the comment in
    441 			 * vdev_raidz_generate_parity_pq() above.
    442 			 */
    443 			for (i = 0; i < count; i++, dst++, src++) {
    444 				mask = *dst & 0x8080808080808080ULL;
    445 				mask = (mask << 1) - (mask >> 7);
    446 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    447 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    448 				*dst ^= *src;
    449 			}
    450 
    451 			for (; i < xcount; i++, dst++) {
    452 				mask = *dst & 0x8080808080808080ULL;
    453 				mask = (mask << 1) - (mask >> 7);
    454 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    455 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    456 			}
    457 		}
    458 	}
    459 
    460 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    461 	dst = rm->rm_col[x].rc_data;
    462 	exp = 255 - (rm->rm_cols - 1 - x);
    463 
    464 	for (i = 0; i < xcount; i++, dst++, src++) {
    465 		*dst ^= *src;
    466 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    467 			*b = vdev_raidz_exp2(*b, exp);
    468 		}
    469 	}
    470 }
    471 
    472 static void
    473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
    474 {
    475 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    476 	void *pdata, *qdata;
    477 	uint64_t xsize, ysize, i;
    478 
    479 	ASSERT(x < y);
    480 	ASSERT(x >= rm->rm_firstdatacol);
    481 	ASSERT(y < rm->rm_cols);
    482 
    483 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    484 
    485 	/*
    486 	 * Move the parity data aside -- we're going to compute parity as
    487 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    488 	 * reuse the parity generation mechanism without trashing the actual
    489 	 * parity so we make those columns appear to be full of zeros by
    490 	 * setting their lengths to zero.
    491 	 */
    492 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    493 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    494 	xsize = rm->rm_col[x].rc_size;
    495 	ysize = rm->rm_col[y].rc_size;
    496 
    497 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    498 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    499 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    501 	rm->rm_col[x].rc_size = 0;
    502 	rm->rm_col[y].rc_size = 0;
    503 
    504 	vdev_raidz_generate_parity_pq(rm);
    505 
    506 	rm->rm_col[x].rc_size = xsize;
    507 	rm->rm_col[y].rc_size = ysize;
    508 
    509 	p = pdata;
    510 	q = qdata;
    511 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    512 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    513 	xd = rm->rm_col[x].rc_data;
    514 	yd = rm->rm_col[y].rc_data;
    515 
    516 	/*
    517 	 * We now have:
    518 	 *	Pxy = P + D_x + D_y
    519 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    520 	 *
    521 	 * We can then solve for D_x:
    522 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    523 	 * where
    524 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    525 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    526 	 *
    527 	 * With D_x in hand, we can easily solve for D_y:
    528 	 *	D_y = P + Pxy + D_x
    529 	 */
    530 
    531 	a = vdev_raidz_pow2[255 + x - y];
    532 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    533 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    534 
    535 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    536 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    537 
    538 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    539 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    540 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    541 
    542 		if (i < ysize)
    543 			*yd = *p ^ *pxy ^ *xd;
    544 	}
    545 
    546 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    547 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    549 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    550 
    551 	/*
    552 	 * Restore the saved parity data.
    553 	 */
    554 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    555 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    556 }
    557 
    558 
    559 static int
    560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
    561 {
    562 	uint64_t nparity = vd->vdev_nparity;
    563 	int lasterror = 0;
    564 	int numerrors = 0;
    565 
    566 	ASSERT(nparity > 0);
    567 
    568 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
    569 	    vd->vdev_children < nparity + 1) {
    570 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
    571 		return (EINVAL);
    572 	}
    573 
    574 	vdev_open_children(vd);
    575 
    576 	for (int c = 0; c < vd->vdev_children; c++) {
    577 		vdev_t *cvd = vd->vdev_child[c];
    578 
    579 		if (cvd->vdev_open_error) {
    580 			lasterror = cvd->vdev_open_error;
    581 			numerrors++;
    582 			continue;
    583 		}
    584 
    585 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
    586 		*ashift = MAX(*ashift, cvd->vdev_ashift);
    587 	}
    588 
    589 	*asize *= vd->vdev_children;
    590 
    591 	if (numerrors > nparity) {
    592 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
    593 		return (lasterror);
    594 	}
    595 
    596 	return (0);
    597 }
    598 
    599 static void
    600 vdev_raidz_close(vdev_t *vd)
    601 {
    602 	for (int c = 0; c < vd->vdev_children; c++)
    603 		vdev_close(vd->vdev_child[c]);
    604 }
    605 
    606 static uint64_t
    607 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
    608 {
    609 	uint64_t asize;
    610 	uint64_t ashift = vd->vdev_top->vdev_ashift;
    611 	uint64_t cols = vd->vdev_children;
    612 	uint64_t nparity = vd->vdev_nparity;
    613 
    614 	asize = ((psize - 1) >> ashift) + 1;
    615 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
    616 	asize = roundup(asize, nparity + 1) << ashift;
    617 
    618 	return (asize);
    619 }
    620 
    621 static void
    622 vdev_raidz_child_done(zio_t *zio)
    623 {
    624 	raidz_col_t *rc = zio->io_private;
    625 
    626 	rc->rc_error = zio->io_error;
    627 	rc->rc_tried = 1;
    628 	rc->rc_skipped = 0;
    629 }
    630 
    631 static int
    632 vdev_raidz_io_start(zio_t *zio)
    633 {
    634 	vdev_t *vd = zio->io_vd;
    635 	vdev_t *tvd = vd->vdev_top;
    636 	vdev_t *cvd;
    637 	blkptr_t *bp = zio->io_bp;
    638 	raidz_map_t *rm;
    639 	raidz_col_t *rc;
    640 	int c;
    641 
    642 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
    643 	    vd->vdev_nparity);
    644 
    645 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
    646 
    647 	if (zio->io_type == ZIO_TYPE_WRITE) {
    648 		/*
    649 		 * Generate RAID parity in the first virtual columns.
    650 		 */
    651 		if (rm->rm_firstdatacol == 1)
    652 			vdev_raidz_generate_parity_p(rm);
    653 		else
    654 			vdev_raidz_generate_parity_pq(rm);
    655 
    656 		for (c = 0; c < rm->rm_cols; c++) {
    657 			rc = &rm->rm_col[c];
    658 			cvd = vd->vdev_child[rc->rc_devidx];
    659 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    660 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    661 			    zio->io_type, zio->io_priority, 0,
    662 			    vdev_raidz_child_done, rc));
    663 		}
    664 
    665 		return (ZIO_PIPELINE_CONTINUE);
    666 	}
    667 
    668 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    669 
    670 	/*
    671 	 * Iterate over the columns in reverse order so that we hit the parity
    672 	 * last -- any errors along the way will force us to read the parity
    673 	 * data.
    674 	 */
    675 	for (c = rm->rm_cols - 1; c >= 0; c--) {
    676 		rc = &rm->rm_col[c];
    677 		cvd = vd->vdev_child[rc->rc_devidx];
    678 		if (!vdev_readable(cvd)) {
    679 			if (c >= rm->rm_firstdatacol)
    680 				rm->rm_missingdata++;
    681 			else
    682 				rm->rm_missingparity++;
    683 			rc->rc_error = ENXIO;
    684 			rc->rc_tried = 1;	/* don't even try */
    685 			rc->rc_skipped = 1;
    686 			continue;
    687 		}
    688 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
    689 			if (c >= rm->rm_firstdatacol)
    690 				rm->rm_missingdata++;
    691 			else
    692 				rm->rm_missingparity++;
    693 			rc->rc_error = ESTALE;
    694 			rc->rc_skipped = 1;
    695 			continue;
    696 		}
    697 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
    698 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
    699 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    700 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    701 			    zio->io_type, zio->io_priority, 0,
    702 			    vdev_raidz_child_done, rc));
    703 		}
    704 	}
    705 
    706 	return (ZIO_PIPELINE_CONTINUE);
    707 }
    708 
    709 /*
    710  * Report a checksum error for a child of a RAID-Z device.
    711  */
    712 static void
    713 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
    714 {
    715 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
    716 
    717 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
    718 		mutex_enter(&vd->vdev_stat_lock);
    719 		vd->vdev_stat.vs_checksum_errors++;
    720 		mutex_exit(&vd->vdev_stat_lock);
    721 	}
    722 
    723 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
    724 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
    725 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
    726 }
    727 
    728 /*
    729  * Generate the parity from the data columns. If we tried and were able to
    730  * read the parity without error, verify that the generated parity matches the
    731  * data we read. If it doesn't, we fire off a checksum error. Return the
    732  * number such failures.
    733  */
    734 static int
    735 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
    736 {
    737 	void *orig[VDEV_RAIDZ_MAXPARITY];
    738 	int c, ret = 0;
    739 	raidz_col_t *rc;
    740 
    741 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    742 		rc = &rm->rm_col[c];
    743 		if (!rc->rc_tried || rc->rc_error != 0)
    744 			continue;
    745 		orig[c] = zio_buf_alloc(rc->rc_size);
    746 		bcopy(rc->rc_data, orig[c], rc->rc_size);
    747 	}
    748 
    749 	if (rm->rm_firstdatacol == 1)
    750 		vdev_raidz_generate_parity_p(rm);
    751 	else
    752 		vdev_raidz_generate_parity_pq(rm);
    753 
    754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    755 		rc = &rm->rm_col[c];
    756 		if (!rc->rc_tried || rc->rc_error != 0)
    757 			continue;
    758 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
    759 			raidz_checksum_error(zio, rc);
    760 			rc->rc_error = ECKSUM;
    761 			ret++;
    762 		}
    763 		zio_buf_free(orig[c], rc->rc_size);
    764 	}
    765 
    766 	return (ret);
    767 }
    768 
    769 static uint64_t raidz_corrected_p;
    770 static uint64_t raidz_corrected_q;
    771 static uint64_t raidz_corrected_pq;
    772 
    773 static int
    774 vdev_raidz_worst_error(raidz_map_t *rm)
    775 {
    776 	int error = 0;
    777 
    778 	for (int c = 0; c < rm->rm_cols; c++)
    779 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
    780 
    781 	return (error);
    782 }
    783 
    784 static void
    785 vdev_raidz_io_done(zio_t *zio)
    786 {
    787 	vdev_t *vd = zio->io_vd;
    788 	vdev_t *cvd;
    789 	raidz_map_t *rm = zio->io_vsd;
    790 	raidz_col_t *rc, *rc1;
    791 	int unexpected_errors = 0;
    792 	int parity_errors = 0;
    793 	int parity_untried = 0;
    794 	int data_errors = 0;
    795 	int total_errors = 0;
    796 	int n, c, c1;
    797 
    798 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
    799 
    800 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
    801 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
    802 
    803 	for (c = 0; c < rm->rm_cols; c++) {
    804 		rc = &rm->rm_col[c];
    805 
    806 		if (rc->rc_error) {
    807 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
    808 
    809 			if (c < rm->rm_firstdatacol)
    810 				parity_errors++;
    811 			else
    812 				data_errors++;
    813 
    814 			if (!rc->rc_skipped)
    815 				unexpected_errors++;
    816 
    817 			total_errors++;
    818 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
    819 			parity_untried++;
    820 		}
    821 	}
    822 
    823 	if (zio->io_type == ZIO_TYPE_WRITE) {
    824 		/*
    825 		 * XXX -- for now, treat partial writes as a success.
    826 		 * (If we couldn't write enough columns to reconstruct
    827 		 * the data, the I/O failed.  Otherwise, good enough.)
    828 		 *
    829 		 * Now that we support write reallocation, it would be better
    830 		 * to treat partial failure as real failure unless there are
    831 		 * no non-degraded top-level vdevs left, and not update DTLs
    832 		 * if we intend to reallocate.
    833 		 */
    834 		/* XXPOLICY */
    835 		if (total_errors > rm->rm_firstdatacol)
    836 			zio->io_error = vdev_raidz_worst_error(rm);
    837 
    838 		return;
    839 	}
    840 
    841 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    842 	/*
    843 	 * There are three potential phases for a read:
    844 	 *	1. produce valid data from the columns read
    845 	 *	2. read all disks and try again
    846 	 *	3. perform combinatorial reconstruction
    847 	 *
    848 	 * Each phase is progressively both more expensive and less likely to
    849 	 * occur. If we encounter more errors than we can repair or all phases
    850 	 * fail, we have no choice but to return an error.
    851 	 */
    852 
    853 	/*
    854 	 * If the number of errors we saw was correctable -- less than or equal
    855 	 * to the number of parity disks read -- attempt to produce data that
    856 	 * has a valid checksum. Naturally, this case applies in the absence of
    857 	 * any errors.
    858 	 */
    859 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
    860 		switch (data_errors) {
    861 		case 0:
    862 			if (zio_checksum_error(zio) == 0) {
    863 				/*
    864 				 * If we read parity information (unnecessarily
    865 				 * as it happens since no reconstruction was
    866 				 * needed) regenerate and verify the parity.
    867 				 * We also regenerate parity when resilvering
    868 				 * so we can write it out to the failed device
    869 				 * later.
    870 				 */
    871 				if (parity_errors + parity_untried <
    872 				    rm->rm_firstdatacol ||
    873 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    874 					n = raidz_parity_verify(zio, rm);
    875 					unexpected_errors += n;
    876 					ASSERT(parity_errors + n <=
    877 					    rm->rm_firstdatacol);
    878 				}
    879 				goto done;
    880 			}
    881 			break;
    882 
    883 		case 1:
    884 			/*
    885 			 * We either attempt to read all the parity columns or
    886 			 * none of them. If we didn't try to read parity, we
    887 			 * wouldn't be here in the correctable case. There must
    888 			 * also have been fewer parity errors than parity
    889 			 * columns or, again, we wouldn't be in this code path.
    890 			 */
    891 			ASSERT(parity_untried == 0);
    892 			ASSERT(parity_errors < rm->rm_firstdatacol);
    893 
    894 			/*
    895 			 * Find the column that reported the error.
    896 			 */
    897 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    898 				rc = &rm->rm_col[c];
    899 				if (rc->rc_error != 0)
    900 					break;
    901 			}
    902 			ASSERT(c != rm->rm_cols);
    903 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    904 			    rc->rc_error == ESTALE);
    905 
    906 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
    907 				vdev_raidz_reconstruct_p(rm, c);
    908 			} else {
    909 				ASSERT(rm->rm_firstdatacol > 1);
    910 				vdev_raidz_reconstruct_q(rm, c);
    911 			}
    912 
    913 			if (zio_checksum_error(zio) == 0) {
    914 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
    915 					atomic_inc_64(&raidz_corrected_p);
    916 				else
    917 					atomic_inc_64(&raidz_corrected_q);
    918 
    919 				/*
    920 				 * If there's more than one parity disk that
    921 				 * was successfully read, confirm that the
    922 				 * other parity disk produced the correct data.
    923 				 * This routine is suboptimal in that it
    924 				 * regenerates both the parity we wish to test
    925 				 * as well as the parity we just used to
    926 				 * perform the reconstruction, but this should
    927 				 * be a relatively uncommon case, and can be
    928 				 * optimized if it becomes a problem.
    929 				 * We also regenerate parity when resilvering
    930 				 * so we can write it out to the failed device
    931 				 * later.
    932 				 */
    933 				if (parity_errors < rm->rm_firstdatacol - 1 ||
    934 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    935 					n = raidz_parity_verify(zio, rm);
    936 					unexpected_errors += n;
    937 					ASSERT(parity_errors + n <=
    938 					    rm->rm_firstdatacol);
    939 				}
    940 
    941 				goto done;
    942 			}
    943 			break;
    944 
    945 		case 2:
    946 			/*
    947 			 * Two data column errors require double parity.
    948 			 */
    949 			ASSERT(rm->rm_firstdatacol == 2);
    950 
    951 			/*
    952 			 * Find the two columns that reported errors.
    953 			 */
    954 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    955 				rc = &rm->rm_col[c];
    956 				if (rc->rc_error != 0)
    957 					break;
    958 			}
    959 			ASSERT(c != rm->rm_cols);
    960 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    961 			    rc->rc_error == ESTALE);
    962 
    963 			for (c1 = c++; c < rm->rm_cols; c++) {
    964 				rc = &rm->rm_col[c];
    965 				if (rc->rc_error != 0)
    966 					break;
    967 			}
    968 			ASSERT(c != rm->rm_cols);
    969 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    970 			    rc->rc_error == ESTALE);
    971 
    972 			vdev_raidz_reconstruct_pq(rm, c1, c);
    973 
    974 			if (zio_checksum_error(zio) == 0) {
    975 				atomic_inc_64(&raidz_corrected_pq);
    976 				goto done;
    977 			}
    978 			break;
    979 
    980 		default:
    981 			ASSERT(rm->rm_firstdatacol <= 2);
    982 			ASSERT(0);
    983 		}
    984 	}
    985 
    986 	/*
    987 	 * This isn't a typical situation -- either we got a read error or
    988 	 * a child silently returned bad data. Read every block so we can
    989 	 * try again with as much data and parity as we can track down. If
    990 	 * we've already been through once before, all children will be marked
    991 	 * as tried so we'll proceed to combinatorial reconstruction.
    992 	 */
    993 	unexpected_errors = 1;
    994 	rm->rm_missingdata = 0;
    995 	rm->rm_missingparity = 0;
    996 
    997 	for (c = 0; c < rm->rm_cols; c++) {
    998 		if (rm->rm_col[c].rc_tried)
    999 			continue;
   1000 
   1001 		zio_vdev_io_redone(zio);
   1002 		do {
   1003 			rc = &rm->rm_col[c];
   1004 			if (rc->rc_tried)
   1005 				continue;
   1006 			zio_nowait(zio_vdev_child_io(zio, NULL,
   1007 			    vd->vdev_child[rc->rc_devidx],
   1008 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1009 			    zio->io_type, zio->io_priority, 0,
   1010 			    vdev_raidz_child_done, rc));
   1011 		} while (++c < rm->rm_cols);
   1012 
   1013 		return;
   1014 	}
   1015 
   1016 	/*
   1017 	 * At this point we've attempted to reconstruct the data given the
   1018 	 * errors we detected, and we've attempted to read all columns. There
   1019 	 * must, therefore, be one or more additional problems -- silent errors
   1020 	 * resulting in invalid data rather than explicit I/O errors resulting
   1021 	 * in absent data. Before we attempt combinatorial reconstruction make
   1022 	 * sure we have a chance of coming up with the right answer.
   1023 	 */
   1024 	if (total_errors >= rm->rm_firstdatacol) {
   1025 		zio->io_error = vdev_raidz_worst_error(rm);
   1026 		/*
   1027 		 * If there were exactly as many device errors as parity
   1028 		 * columns, yet we couldn't reconstruct the data, then at
   1029 		 * least one device must have returned bad data silently.
   1030 		 */
   1031 		if (total_errors == rm->rm_firstdatacol)
   1032 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
   1033 		goto done;
   1034 	}
   1035 
   1036 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
   1037 		/*
   1038 		 * Attempt to reconstruct the data from parity P.
   1039 		 */
   1040 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1041 			void *orig;
   1042 			rc = &rm->rm_col[c];
   1043 
   1044 			orig = zio_buf_alloc(rc->rc_size);
   1045 			bcopy(rc->rc_data, orig, rc->rc_size);
   1046 			vdev_raidz_reconstruct_p(rm, c);
   1047 
   1048 			if (zio_checksum_error(zio) == 0) {
   1049 				zio_buf_free(orig, rc->rc_size);
   1050 				atomic_inc_64(&raidz_corrected_p);
   1051 
   1052 				/*
   1053 				 * If this child didn't know that it returned
   1054 				 * bad data, inform it.
   1055 				 */
   1056 				if (rc->rc_tried && rc->rc_error == 0)
   1057 					raidz_checksum_error(zio, rc);
   1058 				rc->rc_error = ECKSUM;
   1059 				goto done;
   1060 			}
   1061 
   1062 			bcopy(orig, rc->rc_data, rc->rc_size);
   1063 			zio_buf_free(orig, rc->rc_size);
   1064 		}
   1065 	}
   1066 
   1067 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
   1068 		/*
   1069 		 * Attempt to reconstruct the data from parity Q.
   1070 		 */
   1071 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1072 			void *orig;
   1073 			rc = &rm->rm_col[c];
   1074 
   1075 			orig = zio_buf_alloc(rc->rc_size);
   1076 			bcopy(rc->rc_data, orig, rc->rc_size);
   1077 			vdev_raidz_reconstruct_q(rm, c);
   1078 
   1079 			if (zio_checksum_error(zio) == 0) {
   1080 				zio_buf_free(orig, rc->rc_size);
   1081 				atomic_inc_64(&raidz_corrected_q);
   1082 
   1083 				/*
   1084 				 * If this child didn't know that it returned
   1085 				 * bad data, inform it.
   1086 				 */
   1087 				if (rc->rc_tried && rc->rc_error == 0)
   1088 					raidz_checksum_error(zio, rc);
   1089 				rc->rc_error = ECKSUM;
   1090 				goto done;
   1091 			}
   1092 
   1093 			bcopy(orig, rc->rc_data, rc->rc_size);
   1094 			zio_buf_free(orig, rc->rc_size);
   1095 		}
   1096 	}
   1097 
   1098 	if (rm->rm_firstdatacol > 1 &&
   1099 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
   1100 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
   1101 		/*
   1102 		 * Attempt to reconstruct the data from both P and Q.
   1103 		 */
   1104 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
   1105 			void *orig, *orig1;
   1106 			rc = &rm->rm_col[c];
   1107 
   1108 			orig = zio_buf_alloc(rc->rc_size);
   1109 			bcopy(rc->rc_data, orig, rc->rc_size);
   1110 
   1111 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
   1112 				rc1 = &rm->rm_col[c1];
   1113 
   1114 				orig1 = zio_buf_alloc(rc1->rc_size);
   1115 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
   1116 
   1117 				vdev_raidz_reconstruct_pq(rm, c, c1);
   1118 
   1119 				if (zio_checksum_error(zio) == 0) {
   1120 					zio_buf_free(orig, rc->rc_size);
   1121 					zio_buf_free(orig1, rc1->rc_size);
   1122 					atomic_inc_64(&raidz_corrected_pq);
   1123 
   1124 					/*
   1125 					 * If these children didn't know they
   1126 					 * returned bad data, inform them.
   1127 					 */
   1128 					if (rc->rc_tried && rc->rc_error == 0)
   1129 						raidz_checksum_error(zio, rc);
   1130 					if (rc1->rc_tried && rc1->rc_error == 0)
   1131 						raidz_checksum_error(zio, rc1);
   1132 
   1133 					rc->rc_error = ECKSUM;
   1134 					rc1->rc_error = ECKSUM;
   1135 
   1136 					goto done;
   1137 				}
   1138 
   1139 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
   1140 				zio_buf_free(orig1, rc1->rc_size);
   1141 			}
   1142 
   1143 			bcopy(orig, rc->rc_data, rc->rc_size);
   1144 			zio_buf_free(orig, rc->rc_size);
   1145 		}
   1146 	}
   1147 
   1148 	/*
   1149 	 * All combinations failed to checksum. Generate checksum ereports for
   1150 	 * all children.
   1151 	 */
   1152 	zio->io_error = ECKSUM;
   1153 
   1154 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   1155 		for (c = 0; c < rm->rm_cols; c++) {
   1156 			rc = &rm->rm_col[c];
   1157 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
   1158 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
   1159 			    rc->rc_offset, rc->rc_size);
   1160 		}
   1161 	}
   1162 
   1163 done:
   1164 	zio_checksum_verified(zio);
   1165 
   1166 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
   1167 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
   1168 		/*
   1169 		 * Use the good data we have in hand to repair damaged children.
   1170 		 */
   1171 		for (c = 0; c < rm->rm_cols; c++) {
   1172 			rc = &rm->rm_col[c];
   1173 			cvd = vd->vdev_child[rc->rc_devidx];
   1174 
   1175 			if (rc->rc_error == 0)
   1176 				continue;
   1177 
   1178 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1179 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1180 			    ZIO_TYPE_WRITE, zio->io_priority,
   1181 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
   1182 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
   1183 		}
   1184 	}
   1185 }
   1186 
   1187 static void
   1188 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
   1189 {
   1190 	if (faulted > vd->vdev_nparity)
   1191 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
   1192 		    VDEV_AUX_NO_REPLICAS);
   1193 	else if (degraded + faulted != 0)
   1194 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
   1195 	else
   1196 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
   1197 }
   1198 
   1199 vdev_ops_t vdev_raidz_ops = {
   1200 	vdev_raidz_open,
   1201 	vdev_raidz_close,
   1202 	vdev_raidz_asize,
   1203 	vdev_raidz_io_start,
   1204 	vdev_raidz_io_done,
   1205 	vdev_raidz_state_change,
   1206 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
   1207 	B_FALSE			/* not a leaf vdev */
   1208 };
   1209