Home | History | Annotate | Download | only in zfs
      1    789    ahrens /*
      2    789    ahrens  * CDDL HEADER START
      3    789    ahrens  *
      4    789    ahrens  * The contents of this file are subject to the terms of the
      5   1544  eschrock  * Common Development and Distribution License (the "License").
      6   1544  eschrock  * You may not use this file except in compliance with the License.
      7    789    ahrens  *
      8    789    ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789    ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789    ahrens  * See the License for the specific language governing permissions
     11    789    ahrens  * and limitations under the License.
     12    789    ahrens  *
     13    789    ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789    ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789    ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789    ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789    ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789    ahrens  *
     19    789    ahrens  * CDDL HEADER END
     20    789    ahrens  */
     21   2082  eschrock 
     22    789    ahrens /*
     23   9434      Mark  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24    789    ahrens  * Use is subject to license terms.
     25    789    ahrens  */
     26    789    ahrens 
     27    789    ahrens #include <sys/zfs_context.h>
     28    789    ahrens #include <sys/spa.h>
     29    789    ahrens #include <sys/vdev_impl.h>
     30    789    ahrens #include <sys/zio.h>
     31    789    ahrens #include <sys/zio_checksum.h>
     32    789    ahrens #include <sys/fs/zfs.h>
     33   1544  eschrock #include <sys/fm/fs/zfs.h>
     34    789    ahrens 
     35    789    ahrens /*
     36    789    ahrens  * Virtual device vector for RAID-Z.
     37   2082  eschrock  *
     38  10105      adam  * This vdev supports single, double, and triple parity. For single parity,
     39  10105      adam  * we use a simple XOR of all the data columns. For double or triple parity,
     40  10105      adam  * we use a special case of Reed-Solomon coding. This extends the
     41  10105      adam  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
     42  10105      adam  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
     43  10105      adam  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
     44  10105      adam  * former is also based. The latter is designed to provide higher performance
     45  10105      adam  * for writes.
     46  10105      adam  *
     47  10105      adam  * Note that the Plank paper claimed to support arbitrary N+M, but was then
     48  10105      adam  * amended six years later identifying a critical flaw that invalidates its
     49  10105      adam  * claims. Nevertheless, the technique can be adapted to work for up to
     50  10105      adam  * triple parity. For additional parity, the amendment "Note: Correction to
     51  10105      adam  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
     52  10105      adam  * is viable, but the additional complexity means that write performance will
     53  10105      adam  * suffer.
     54  10105      adam  *
     55  10105      adam  * All of the methods above operate on a Galois field, defined over the
     56  10105      adam  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
     57  10105      adam  * can be expressed with a single byte. Briefly, the operations on the
     58  10105      adam  * field are defined as follows:
     59   2082  eschrock  *
     60   2082  eschrock  *   o addition (+) is represented by a bitwise XOR
     61   2082  eschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     62   2082  eschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
     63   2082  eschrock  *	(A * 2)_7 = A_6
     64   2082  eschrock  *	(A * 2)_6 = A_5
     65   2082  eschrock  *	(A * 2)_5 = A_4
     66   2082  eschrock  *	(A * 2)_4 = A_3 + A_7
     67   2082  eschrock  *	(A * 2)_3 = A_2 + A_7
     68   2082  eschrock  *	(A * 2)_2 = A_1 + A_7
     69   2082  eschrock  *	(A * 2)_1 = A_0
     70   2082  eschrock  *	(A * 2)_0 = A_7
     71   2082  eschrock  *
     72   2082  eschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     73  10105      adam  * As an aside, this multiplication is derived from the error correcting
     74  10105      adam  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
     75   2082  eschrock  *
     76   2082  eschrock  * Observe that any number in the field (except for 0) can be expressed as a
     77   2082  eschrock  * power of 2 -- a generator for the field. We store a table of the powers of
     78   2082  eschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     79   2082  eschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     80  10105      adam  * than field addition). The inverse of a field element A (A^-1) is therefore
     81  10105      adam  * A ^ (255 - 1) = A^254.
     82   2082  eschrock  *
     83  10105      adam  * The up-to-three parity columns, P, Q, R over several data columns,
     84  10105      adam  * D_0, ... D_n-1, can be expressed by field operations:
     85   2082  eschrock  *
     86   2082  eschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     87   2082  eschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     88   2082  eschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
     89  10105      adam  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
     90  10105      adam  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
     91   2082  eschrock  *
     92  10105      adam  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
     93  10105      adam  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
     94  10105      adam  * independent coefficients. (There are no additional coefficients that have
     95  10105      adam  * this property which is why the uncorrected Plank method breaks down.)
     96  10105      adam  *
     97  10105      adam  * See the reconstruction code below for how P, Q and R can used individually
     98  10105      adam  * or in concert to recover missing data columns.
     99    789    ahrens  */
    100    789    ahrens 
    101    789    ahrens typedef struct raidz_col {
    102   2082  eschrock 	uint64_t rc_devidx;		/* child device index for I/O */
    103   2082  eschrock 	uint64_t rc_offset;		/* device offset */
    104   2082  eschrock 	uint64_t rc_size;		/* I/O size */
    105   2082  eschrock 	void *rc_data;			/* I/O data */
    106  10614  Jonathan 	void *rc_gdata;			/* used to store the "good" version */
    107   2082  eschrock 	int rc_error;			/* I/O error for this device */
    108   2082  eschrock 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
    109   2082  eschrock 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
    110    789    ahrens } raidz_col_t;
    111    789    ahrens 
    112    789    ahrens typedef struct raidz_map {
    113  10105      adam 	uint64_t rm_cols;		/* Regular column count */
    114  10105      adam 	uint64_t rm_scols;		/* Count including skipped columns */
    115   2082  eschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
    116   2082  eschrock 	uint64_t rm_asize;		/* Actual total I/O size */
    117   2082  eschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
    118   2082  eschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
    119   2082  eschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
    120  10450      adam 	uint64_t rm_nskip;		/* Skipped sectors for padding */
    121  10450      adam 	uint64_t rm_skipstart;	/* Column index of padding start */
    122  10614  Jonathan 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
    123  10614  Jonathan 	uintptr_t rm_reports;		/* # of referencing checksum reports */
    124  10614  Jonathan 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
    125  10614  Jonathan 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
    126   2082  eschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
    127    789    ahrens } raidz_map_t;
    128    789    ahrens 
    129   2082  eschrock #define	VDEV_RAIDZ_P		0
    130   2082  eschrock #define	VDEV_RAIDZ_Q		1
    131  10105      adam #define	VDEV_RAIDZ_R		2
    132   2082  eschrock 
    133  10105      adam #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
    134  10105      adam #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
    135   2082  eschrock 
    136  10105      adam /*
    137  10105      adam  * We provide a mechanism to perform the field multiplication operation on a
    138  10105      adam  * 64-bit value all at once rather than a byte at a time. This works by
    139  10105      adam  * creating a mask from the top bit in each byte and using that to
    140  10105      adam  * conditionally apply the XOR of 0x1d.
    141  10105      adam  */
    142  10105      adam #define	VDEV_RAIDZ_64MUL_2(x, mask) \
    143  10105      adam { \
    144  10105      adam 	(mask) = (x) & 0x8080808080808080ULL; \
    145  10105      adam 	(mask) = ((mask) << 1) - ((mask) >> 7); \
    146  10105      adam 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
    147  10105      adam 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
    148  10105      adam }
    149  10105      adam 
    150  10105      adam #define	VDEV_RAIDZ_64MUL_4(x, mask) \
    151  10105      adam { \
    152  10105      adam 	VDEV_RAIDZ_64MUL_2((x), mask); \
    153  10105      adam 	VDEV_RAIDZ_64MUL_2((x), mask); \
    154  10105      adam }
    155  10105      adam 
    156  10105      adam /*
    157  10105      adam  * Force reconstruction to use the general purpose method.
    158  10105      adam  */
    159  10105      adam int vdev_raidz_default_to_general;
    160   2082  eschrock 
    161   2082  eschrock /*
    162   2082  eschrock  * These two tables represent powers and logs of 2 in the Galois field defined
    163   2082  eschrock  * above. These values were computed by repeatedly multiplying by 2 as above.
    164   2082  eschrock  */
    165   2082  eschrock static const uint8_t vdev_raidz_pow2[256] = {
    166   2082  eschrock 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    167   2082  eschrock 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    168   2082  eschrock 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    169   2082  eschrock 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    170   2082  eschrock 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    171   2082  eschrock 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    172   2082  eschrock 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    173   2082  eschrock 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    174   2082  eschrock 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    175   2082  eschrock 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    176   2082  eschrock 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    177   2082  eschrock 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    178   2082  eschrock 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    179   2082  eschrock 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    180   2082  eschrock 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    181   2082  eschrock 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    182   2082  eschrock 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    183   2082  eschrock 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    184   2082  eschrock 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    185   2082  eschrock 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    186   2082  eschrock 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    187   2082  eschrock 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    188   2082  eschrock 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    189   2082  eschrock 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    190   2082  eschrock 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    191   2082  eschrock 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    192   2082  eschrock 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    193   2082  eschrock 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    194   2082  eschrock 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    195   2082  eschrock 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    196   2082  eschrock 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    197   2082  eschrock 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
    198   2082  eschrock };
    199   2082  eschrock static const uint8_t vdev_raidz_log2[256] = {
    200   2082  eschrock 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    201   2082  eschrock 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    202   2082  eschrock 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    203   2082  eschrock 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    204   2082  eschrock 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    205   2082  eschrock 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    206   2082  eschrock 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    207   2082  eschrock 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    208   2082  eschrock 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    209   2082  eschrock 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    210   2082  eschrock 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    211   2082  eschrock 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    212   2082  eschrock 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    213   2082  eschrock 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    214   2082  eschrock 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    215   2082  eschrock 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    216   2082  eschrock 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    217   2082  eschrock 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    218   2082  eschrock 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    219   2082  eschrock 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    220   2082  eschrock 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    221   2082  eschrock 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    222   2082  eschrock 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    223   2082  eschrock 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    224   2082  eschrock 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    225   2082  eschrock 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    226   2082  eschrock 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    227   2082  eschrock 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    228   2082  eschrock 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    229   2082  eschrock 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    230   2082  eschrock 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    231   2082  eschrock 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
    232   2082  eschrock };
    233   2082  eschrock 
    234  10614  Jonathan static void vdev_raidz_generate_parity(raidz_map_t *rm);
    235  10614  Jonathan 
    236   2082  eschrock /*
    237   2082  eschrock  * Multiply a given number by 2 raised to the given power.
    238   2082  eschrock  */
    239   2082  eschrock static uint8_t
    240   2082  eschrock vdev_raidz_exp2(uint_t a, int exp)
    241   2082  eschrock {
    242   2082  eschrock 	if (a == 0)
    243   2082  eschrock 		return (0);
    244   2082  eschrock 
    245   2082  eschrock 	ASSERT(exp >= 0);
    246   2082  eschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    247   2082  eschrock 
    248   2082  eschrock 	exp += vdev_raidz_log2[a];
    249   2082  eschrock 	if (exp > 255)
    250   2082  eschrock 		exp -= 255;
    251   2082  eschrock 
    252   2082  eschrock 	return (vdev_raidz_pow2[exp]);
    253   2082  eschrock }
    254   2082  eschrock 
    255   7754      Jeff static void
    256  10614  Jonathan vdev_raidz_map_free(raidz_map_t *rm)
    257   7754      Jeff {
    258   7754      Jeff 	int c;
    259  10653  Jonathan 	size_t size;
    260   7754      Jeff 
    261  10614  Jonathan 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    262   7754      Jeff 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
    263  10614  Jonathan 
    264  10614  Jonathan 		if (rm->rm_col[c].rc_gdata != NULL)
    265  10614  Jonathan 			zio_buf_free(rm->rm_col[c].rc_gdata,
    266  10614  Jonathan 			    rm->rm_col[c].rc_size);
    267  10614  Jonathan 	}
    268  10653  Jonathan 
    269  10653  Jonathan 	size = 0;
    270  10653  Jonathan 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    271  10653  Jonathan 		size += rm->rm_col[c].rc_size;
    272  10614  Jonathan 
    273  10614  Jonathan 	if (rm->rm_datacopy != NULL)
    274  10614  Jonathan 		zio_buf_free(rm->rm_datacopy, size);
    275   7754      Jeff 
    276  10105      adam 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
    277   7754      Jeff }
    278  10614  Jonathan 
    279  10614  Jonathan static void
    280  10614  Jonathan vdev_raidz_map_free_vsd(zio_t *zio)
    281  10614  Jonathan {
    282  10614  Jonathan 	raidz_map_t *rm = zio->io_vsd;
    283  10614  Jonathan 
    284  10614  Jonathan 	ASSERT3U(rm->rm_freed, ==, 0);
    285  10614  Jonathan 	rm->rm_freed = 1;
    286  10614  Jonathan 
    287  10614  Jonathan 	if (rm->rm_reports == 0)
    288  10614  Jonathan 		vdev_raidz_map_free(rm);
    289  10614  Jonathan }
    290  10614  Jonathan 
    291  10614  Jonathan /*ARGSUSED*/
    292  10614  Jonathan static void
    293  10614  Jonathan vdev_raidz_cksum_free(void *arg, size_t ignored)
    294  10614  Jonathan {
    295  10614  Jonathan 	raidz_map_t *rm = arg;
    296  10614  Jonathan 
    297  10614  Jonathan 	ASSERT3U(rm->rm_reports, >, 0);
    298  10614  Jonathan 
    299  10653  Jonathan 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
    300  10614  Jonathan 		vdev_raidz_map_free(rm);
    301  10614  Jonathan }
    302  10614  Jonathan 
    303  10614  Jonathan static void
    304  10614  Jonathan vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
    305  10614  Jonathan {
    306  10614  Jonathan 	raidz_map_t *rm = zcr->zcr_cbdata;
    307  10614  Jonathan 	size_t c = zcr->zcr_cbinfo;
    308  10614  Jonathan 	size_t x;
    309  10614  Jonathan 
    310  10614  Jonathan 	const char *good = NULL;
    311  10614  Jonathan 	const char *bad = rm->rm_col[c].rc_data;
    312  10614  Jonathan 
    313  10614  Jonathan 	if (good_data == NULL) {
    314  10614  Jonathan 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
    315  10614  Jonathan 		return;
    316  10614  Jonathan 	}
    317  10614  Jonathan 
    318  10614  Jonathan 	if (c < rm->rm_firstdatacol) {
    319  10614  Jonathan 		/*
    320  10614  Jonathan 		 * The first time through, calculate the parity blocks for
    321  10614  Jonathan 		 * the good data (this relies on the fact that the good
    322  10614  Jonathan 		 * data never changes for a given logical ZIO)
    323  10614  Jonathan 		 */
    324  10614  Jonathan 		if (rm->rm_col[0].rc_gdata == NULL) {
    325  10614  Jonathan 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
    326  10614  Jonathan 			char *buf;
    327  10614  Jonathan 
    328  10614  Jonathan 			/*
    329  10614  Jonathan 			 * Set up the rm_col[]s to generate the parity for
    330  10614  Jonathan 			 * good_data, first saving the parity bufs and
    331  10614  Jonathan 			 * replacing them with buffers to hold the result.
    332  10614  Jonathan 			 */
    333  10614  Jonathan 			for (x = 0; x < rm->rm_firstdatacol; x++) {
    334  10614  Jonathan 				bad_parity[x] = rm->rm_col[x].rc_data;
    335  10614  Jonathan 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
    336  10614  Jonathan 				    zio_buf_alloc(rm->rm_col[x].rc_size);
    337  10614  Jonathan 			}
    338  10614  Jonathan 
    339  10614  Jonathan 			/* fill in the data columns from good_data */
    340  10614  Jonathan 			buf = (char *)good_data;
    341  10614  Jonathan 			for (; x < rm->rm_cols; x++) {
    342  10614  Jonathan 				rm->rm_col[x].rc_data = buf;
    343  10614  Jonathan 				buf += rm->rm_col[x].rc_size;
    344  10614  Jonathan 			}
    345  10614  Jonathan 
    346  10614  Jonathan 			/*
    347  10614  Jonathan 			 * Construct the parity from the good data.
    348  10614  Jonathan 			 */
    349  10614  Jonathan 			vdev_raidz_generate_parity(rm);
    350  10614  Jonathan 
    351  10614  Jonathan 			/* restore everything back to its original state */
    352  10614  Jonathan 			for (x = 0; x < rm->rm_firstdatacol; x++)
    353  10614  Jonathan 				rm->rm_col[x].rc_data = bad_parity[x];
    354  10614  Jonathan 
    355  10614  Jonathan 			buf = rm->rm_datacopy;
    356  10614  Jonathan 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
    357  10614  Jonathan 				rm->rm_col[x].rc_data = buf;
    358  10614  Jonathan 				buf += rm->rm_col[x].rc_size;
    359  10614  Jonathan 			}
    360  10614  Jonathan 		}
    361  10614  Jonathan 
    362  10614  Jonathan 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
    363  10614  Jonathan 		good = rm->rm_col[c].rc_gdata;
    364  10614  Jonathan 	} else {
    365  10614  Jonathan 		/* adjust good_data to point at the start of our column */
    366  10614  Jonathan 		good = good_data;
    367  10614  Jonathan 
    368  10614  Jonathan 		for (x = rm->rm_firstdatacol; x < c; x++)
    369  10614  Jonathan 			good += rm->rm_col[x].rc_size;
    370  10614  Jonathan 	}
    371  10614  Jonathan 
    372  10614  Jonathan 	/* we drop the ereport if it ends up that the data was good */
    373  10614  Jonathan 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
    374  10614  Jonathan }
    375  10614  Jonathan 
    376  10614  Jonathan /*
    377  10614  Jonathan  * Invoked indirectly by zfs_ereport_start_checksum(), called
    378  10614  Jonathan  * below when our read operation fails completely.  The main point
    379  10614  Jonathan  * is to keep a copy of everything we read from disk, so that at
    380  10614  Jonathan  * vdev_raidz_cksum_finish() time we can compare it with the good data.
    381  10614  Jonathan  */
    382  10614  Jonathan static void
    383  10614  Jonathan vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
    384  10614  Jonathan {
    385  10614  Jonathan 	size_t c = (size_t)(uintptr_t)arg;
    386  10614  Jonathan 	caddr_t buf;
    387  10614  Jonathan 
    388  10614  Jonathan 	raidz_map_t *rm = zio->io_vsd;
    389  10614  Jonathan 	size_t size;
    390  10614  Jonathan 
    391  10614  Jonathan 	/* set up the report and bump the refcount  */
    392  10614  Jonathan 	zcr->zcr_cbdata = rm;
    393  10614  Jonathan 	zcr->zcr_cbinfo = c;
    394  10614  Jonathan 	zcr->zcr_finish = vdev_raidz_cksum_finish;
    395  10614  Jonathan 	zcr->zcr_free = vdev_raidz_cksum_free;
    396  10614  Jonathan 
    397  10614  Jonathan 	rm->rm_reports++;
    398  10614  Jonathan 	ASSERT3U(rm->rm_reports, >, 0);
    399  10614  Jonathan 
    400  10653  Jonathan 	if (rm->rm_datacopy != NULL)
    401  10614  Jonathan 		return;
    402  10614  Jonathan 
    403  10614  Jonathan 	/*
    404  10653  Jonathan 	 * It's the first time we're called for this raidz_map_t, so we need
    405  10653  Jonathan 	 * to copy the data aside; there's no guarantee that our zio's buffer
    406  10653  Jonathan 	 * won't be re-used for something else.
    407  10614  Jonathan 	 *
    408  10653  Jonathan 	 * Our parity data is already in separate buffers, so there's no need
    409  10614  Jonathan 	 * to copy them.
    410  10614  Jonathan 	 */
    411  10614  Jonathan 
    412  10653  Jonathan 	size = 0;
    413  10653  Jonathan 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    414  10653  Jonathan 		size += rm->rm_col[c].rc_size;
    415  10614  Jonathan 
    416  10614  Jonathan 	buf = rm->rm_datacopy = zio_buf_alloc(size);
    417  10653  Jonathan 
    418  10653  Jonathan 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    419  10614  Jonathan 		raidz_col_t *col = &rm->rm_col[c];
    420  10614  Jonathan 
    421  10614  Jonathan 		bcopy(col->rc_data, buf, col->rc_size);
    422  10614  Jonathan 		col->rc_data = buf;
    423  10614  Jonathan 
    424  10614  Jonathan 		buf += col->rc_size;
    425  10614  Jonathan 	}
    426  10614  Jonathan 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
    427  10614  Jonathan }
    428  10614  Jonathan 
    429  10614  Jonathan static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
    430  10614  Jonathan 	vdev_raidz_map_free_vsd,
    431  10614  Jonathan 	vdev_raidz_cksum_report
    432  10614  Jonathan };
    433   7754      Jeff 
    434    789    ahrens static raidz_map_t *
    435   2082  eschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    436   2082  eschrock     uint64_t nparity)
    437    789    ahrens {
    438    789    ahrens 	raidz_map_t *rm;
    439    789    ahrens 	uint64_t b = zio->io_offset >> unit_shift;
    440    789    ahrens 	uint64_t s = zio->io_size >> unit_shift;
    441    789    ahrens 	uint64_t f = b % dcols;
    442    789    ahrens 	uint64_t o = (b / dcols) << unit_shift;
    443  10105      adam 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
    444    789    ahrens 
    445   2082  eschrock 	q = s / (dcols - nparity);
    446   2082  eschrock 	r = s - q * (dcols - nparity);
    447   2082  eschrock 	bc = (r == 0 ? 0 : r + nparity);
    448  10105      adam 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
    449    789    ahrens 
    450  10105      adam 	if (q == 0) {
    451  10105      adam 		acols = bc;
    452  10105      adam 		scols = MIN(dcols, roundup(bc, nparity + 1));
    453  10105      adam 	} else {
    454  10105      adam 		acols = dcols;
    455  10105      adam 		scols = dcols;
    456  10105      adam 	}
    457    789    ahrens 
    458  10105      adam 	ASSERT3U(acols, <=, scols);
    459  10105      adam 
    460  10105      adam 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
    461    789    ahrens 
    462    789    ahrens 	rm->rm_cols = acols;
    463  10105      adam 	rm->rm_scols = scols;
    464    789    ahrens 	rm->rm_bigcols = bc;
    465  10450      adam 	rm->rm_skipstart = bc;
    466   2082  eschrock 	rm->rm_missingdata = 0;
    467   2082  eschrock 	rm->rm_missingparity = 0;
    468   2082  eschrock 	rm->rm_firstdatacol = nparity;
    469  10614  Jonathan 	rm->rm_datacopy = NULL;
    470  10614  Jonathan 	rm->rm_reports = 0;
    471  10614  Jonathan 	rm->rm_freed = 0;
    472  10614  Jonathan 	rm->rm_ecksuminjected = 0;
    473    789    ahrens 
    474  10105      adam 	asize = 0;
    475  10105      adam 
    476  10105      adam 	for (c = 0; c < scols; c++) {
    477    789    ahrens 		col = f + c;
    478    789    ahrens 		coff = o;
    479    789    ahrens 		if (col >= dcols) {
    480    789    ahrens 			col -= dcols;
    481    789    ahrens 			coff += 1ULL << unit_shift;
    482    789    ahrens 		}
    483   2082  eschrock 		rm->rm_col[c].rc_devidx = col;
    484    789    ahrens 		rm->rm_col[c].rc_offset = coff;
    485    789    ahrens 		rm->rm_col[c].rc_data = NULL;
    486  10614  Jonathan 		rm->rm_col[c].rc_gdata = NULL;
    487    789    ahrens 		rm->rm_col[c].rc_error = 0;
    488    789    ahrens 		rm->rm_col[c].rc_tried = 0;
    489    789    ahrens 		rm->rm_col[c].rc_skipped = 0;
    490  10105      adam 
    491  10105      adam 		if (c >= acols)
    492  10105      adam 			rm->rm_col[c].rc_size = 0;
    493  10105      adam 		else if (c < bc)
    494  10105      adam 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
    495  10105      adam 		else
    496  10105      adam 			rm->rm_col[c].rc_size = q << unit_shift;
    497  10105      adam 
    498  10105      adam 		asize += rm->rm_col[c].rc_size;
    499    789    ahrens 	}
    500    789    ahrens 
    501  10105      adam 	ASSERT3U(asize, ==, tot << unit_shift);
    502  10105      adam 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
    503  10450      adam 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
    504  10450      adam 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
    505  10450      adam 	ASSERT3U(rm->rm_nskip, <=, nparity);
    506    789    ahrens 
    507    789    ahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
    508    789    ahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
    509    789    ahrens 
    510    789    ahrens 	rm->rm_col[c].rc_data = zio->io_data;
    511    789    ahrens 
    512    789    ahrens 	for (c = c + 1; c < acols; c++)
    513    789    ahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
    514    789    ahrens 		    rm->rm_col[c - 1].rc_size;
    515    789    ahrens 
    516   1133  eschrock 	/*
    517   2082  eschrock 	 * If all data stored spans all columns, there's a danger that parity
    518   2082  eschrock 	 * will always be on the same device and, since parity isn't read
    519   2082  eschrock 	 * during normal operation, that that device's I/O bandwidth won't be
    520   2082  eschrock 	 * used effectively. We therefore switch the parity every 1MB.
    521   2082  eschrock 	 *
    522   2082  eschrock 	 * ... at least that was, ostensibly, the theory. As a practical
    523   2082  eschrock 	 * matter unless we juggle the parity between all devices evenly, we
    524   2082  eschrock 	 * won't see any benefit. Further, occasional writes that aren't a
    525   2082  eschrock 	 * multiple of the LCM of the number of children and the minimum
    526   2082  eschrock 	 * stripe width are sufficient to avoid pessimal behavior.
    527   2082  eschrock 	 * Unfortunately, this decision created an implicit on-disk format
    528   3456       ahl 	 * requirement that we need to support for all eternity, but only
    529   3456       ahl 	 * for single-parity RAID-Z.
    530  10450      adam 	 *
    531  10450      adam 	 * If we intend to skip a sector in the zeroth column for padding
    532  10450      adam 	 * we must make sure to note this swap. We will never intend to
    533  10450      adam 	 * skip the first column since at least one data and one parity
    534  10450      adam 	 * column must appear in each row.
    535   1133  eschrock 	 */
    536   1133  eschrock 	ASSERT(rm->rm_cols >= 2);
    537   1133  eschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    538    789    ahrens 
    539   2082  eschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
    540   2082  eschrock 		devidx = rm->rm_col[0].rc_devidx;
    541   1133  eschrock 		o = rm->rm_col[0].rc_offset;
    542   2082  eschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    543   1133  eschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    544   2082  eschrock 		rm->rm_col[1].rc_devidx = devidx;
    545   1133  eschrock 		rm->rm_col[1].rc_offset = o;
    546  10450      adam 
    547  10450      adam 		if (rm->rm_skipstart == 0)
    548  10450      adam 			rm->rm_skipstart = 1;
    549    789    ahrens 	}
    550    789    ahrens 
    551    789    ahrens 	zio->io_vsd = rm;
    552  10614  Jonathan 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
    553    789    ahrens 	return (rm);
    554    789    ahrens }
    555    789    ahrens 
    556    789    ahrens static void
    557   2082  eschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
    558    789    ahrens {
    559   2082  eschrock 	uint64_t *p, *src, pcount, ccount, i;
    560   2082  eschrock 	int c;
    561    789    ahrens 
    562   2082  eschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    563   2082  eschrock 
    564   2082  eschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    565    789    ahrens 		src = rm->rm_col[c].rc_data;
    566   2082  eschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    567   2082  eschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    568   2082  eschrock 
    569   2082  eschrock 		if (c == rm->rm_firstdatacol) {
    570   2082  eschrock 			ASSERT(ccount == pcount);
    571  10105      adam 			for (i = 0; i < ccount; i++, src++, p++) {
    572   2082  eschrock 				*p = *src;
    573   2082  eschrock 			}
    574    789    ahrens 		} else {
    575   2082  eschrock 			ASSERT(ccount <= pcount);
    576  10105      adam 			for (i = 0; i < ccount; i++, src++, p++) {
    577   2082  eschrock 				*p ^= *src;
    578   2082  eschrock 			}
    579    789    ahrens 		}
    580    789    ahrens 	}
    581    789    ahrens }
    582   2082  eschrock 
    583   2082  eschrock static void
    584   2082  eschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    585   2082  eschrock {
    586  10105      adam 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
    587   2082  eschrock 	int c;
    588   2082  eschrock 
    589  10105      adam 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    590   2082  eschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    591   2082  eschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    592   2082  eschrock 
    593   2082  eschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    594   2082  eschrock 		src = rm->rm_col[c].rc_data;
    595   2082  eschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    596   2082  eschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    597  10105      adam 
    598  10105      adam 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    599   2082  eschrock 
    600   2082  eschrock 		if (c == rm->rm_firstdatacol) {
    601  10105      adam 			ASSERT(ccnt == pcnt || ccnt == 0);
    602  10105      adam 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    603  10105      adam 				*p = *src;
    604   2082  eschrock 				*q = *src;
    605   2082  eschrock 			}
    606  10105      adam 			for (; i < pcnt; i++, src++, p++, q++) {
    607  10105      adam 				*p = 0;
    608   2082  eschrock 				*q = 0;
    609   2082  eschrock 			}
    610   2082  eschrock 		} else {
    611  10105      adam 			ASSERT(ccnt <= pcnt);
    612   2082  eschrock 
    613   2082  eschrock 			/*
    614  10105      adam 			 * Apply the algorithm described above by multiplying
    615  10105      adam 			 * the previous result and adding in the new value.
    616   2082  eschrock 			 */
    617  10105      adam 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    618  10105      adam 				*p ^= *src;
    619  10105      adam 
    620  10105      adam 				VDEV_RAIDZ_64MUL_2(*q, mask);
    621   2082  eschrock 				*q ^= *src;
    622   2082  eschrock 			}
    623   2082  eschrock 
    624   2082  eschrock 			/*
    625   2082  eschrock 			 * Treat short columns as though they are full of 0s.
    626  10105      adam 			 * Note that there's therefore nothing needed for P.
    627   2082  eschrock 			 */
    628  10105      adam 			for (; i < pcnt; i++, q++) {
    629  10105      adam 				VDEV_RAIDZ_64MUL_2(*q, mask);
    630   2082  eschrock 			}
    631   2082  eschrock 		}
    632   2082  eschrock 	}
    633   2082  eschrock }
    634   2082  eschrock 
    635   2082  eschrock static void
    636  10105      adam vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
    637  10105      adam {
    638  10105      adam 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
    639  10105      adam 	int c;
    640  10105      adam 
    641  10105      adam 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    642  10105      adam 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    643  10105      adam 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    644  10105      adam 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    645  10105      adam 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
    646  10105      adam 
    647  10105      adam 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    648  10105      adam 		src = rm->rm_col[c].rc_data;
    649  10105      adam 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    650  10105      adam 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    651  10105      adam 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
    652  10105      adam 
    653  10105      adam 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    654  10105      adam 
    655  10105      adam 		if (c == rm->rm_firstdatacol) {
    656  10105      adam 			ASSERT(ccnt == pcnt || ccnt == 0);
    657  10105      adam 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    658  10105      adam 				*p = *src;
    659  10105      adam 				*q = *src;
    660  10105      adam 				*r = *src;
    661  10105      adam 			}
    662  10105      adam 			for (; i < pcnt; i++, src++, p++, q++, r++) {
    663  10105      adam 				*p = 0;
    664  10105      adam 				*q = 0;
    665  10105      adam 				*r = 0;
    666  10105      adam 			}
    667  10105      adam 		} else {
    668  10105      adam 			ASSERT(ccnt <= pcnt);
    669  10105      adam 
    670  10105      adam 			/*
    671  10105      adam 			 * Apply the algorithm described above by multiplying
    672  10105      adam 			 * the previous result and adding in the new value.
    673  10105      adam 			 */
    674  10105      adam 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    675  10105      adam 				*p ^= *src;
    676  10105      adam 
    677  10105      adam 				VDEV_RAIDZ_64MUL_2(*q, mask);
    678  10105      adam 				*q ^= *src;
    679  10105      adam 
    680  10105      adam 				VDEV_RAIDZ_64MUL_4(*r, mask);
    681  10105      adam 				*r ^= *src;
    682  10105      adam 			}
    683  10105      adam 
    684  10105      adam 			/*
    685  10105      adam 			 * Treat short columns as though they are full of 0s.
    686  10105      adam 			 * Note that there's therefore nothing needed for P.
    687  10105      adam 			 */
    688  10105      adam 			for (; i < pcnt; i++, q++, r++) {
    689  10105      adam 				VDEV_RAIDZ_64MUL_2(*q, mask);
    690  10105      adam 				VDEV_RAIDZ_64MUL_4(*r, mask);
    691  10105      adam 			}
    692  10105      adam 		}
    693  10105      adam 	}
    694  10105      adam }
    695  10105      adam 
    696  10105      adam /*
    697  10105      adam  * Generate RAID parity in the first virtual columns according to the number of
    698  10105      adam  * parity columns available.
    699  10105      adam  */
    700  10105      adam static void
    701  10105      adam vdev_raidz_generate_parity(raidz_map_t *rm)
    702  10105      adam {
    703  10105      adam 	switch (rm->rm_firstdatacol) {
    704  10105      adam 	case 1:
    705  10105      adam 		vdev_raidz_generate_parity_p(rm);
    706  10105      adam 		break;
    707  10105      adam 	case 2:
    708  10105      adam 		vdev_raidz_generate_parity_pq(rm);
    709  10105      adam 		break;
    710  10105      adam 	case 3:
    711  10105      adam 		vdev_raidz_generate_parity_pqr(rm);
    712  10105      adam 		break;
    713  10105      adam 	default:
    714  10105      adam 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
    715  10105      adam 	}
    716  10105      adam }
    717  10105      adam 
    718  10105      adam static int
    719  10105      adam vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
    720   2082  eschrock {
    721   2082  eschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
    722  10105      adam 	int x = tgts[0];
    723   2082  eschrock 	int c;
    724  10105      adam 
    725  10105      adam 	ASSERT(ntgts == 1);
    726  10105      adam 	ASSERT(x >= rm->rm_firstdatacol);
    727  10105      adam 	ASSERT(x < rm->rm_cols);
    728   2082  eschrock 
    729   2082  eschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    730   2082  eschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    731   2082  eschrock 	ASSERT(xcount > 0);
    732   2082  eschrock 
    733   2082  eschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    734   2082  eschrock 	dst = rm->rm_col[x].rc_data;
    735   2082  eschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
    736   2082  eschrock 		*dst = *src;
    737   2082  eschrock 	}
    738   2082  eschrock 
    739   2082  eschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    740   2082  eschrock 		src = rm->rm_col[c].rc_data;
    741   2082  eschrock 		dst = rm->rm_col[x].rc_data;
    742   2082  eschrock 
    743   2082  eschrock 		if (c == x)
    744   2082  eschrock 			continue;
    745   2082  eschrock 
    746   2082  eschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    747   2082  eschrock 		count = MIN(ccount, xcount);
    748   2082  eschrock 
    749   2082  eschrock 		for (i = 0; i < count; i++, dst++, src++) {
    750   2082  eschrock 			*dst ^= *src;
    751   2082  eschrock 		}
    752   2082  eschrock 	}
    753  10105      adam 
    754  10105      adam 	return (1 << VDEV_RAIDZ_P);
    755   2082  eschrock }
    756   2082  eschrock 
    757  10105      adam static int
    758  10105      adam vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
    759   2082  eschrock {
    760   2082  eschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    761   2082  eschrock 	uint8_t *b;
    762  10105      adam 	int x = tgts[0];
    763   2082  eschrock 	int c, j, exp;
    764  10105      adam 
    765  10105      adam 	ASSERT(ntgts == 1);
    766   2082  eschrock 
    767   2082  eschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    768   2082  eschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    769   2082  eschrock 
    770   2082  eschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    771   2082  eschrock 		src = rm->rm_col[c].rc_data;
    772   2082  eschrock 		dst = rm->rm_col[x].rc_data;
    773   2082  eschrock 
    774   2082  eschrock 		if (c == x)
    775   2082  eschrock 			ccount = 0;
    776   2082  eschrock 		else
    777   2082  eschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    778   2082  eschrock 
    779   2082  eschrock 		count = MIN(ccount, xcount);
    780   2082  eschrock 
    781   2082  eschrock 		if (c == rm->rm_firstdatacol) {
    782   2082  eschrock 			for (i = 0; i < count; i++, dst++, src++) {
    783   2082  eschrock 				*dst = *src;
    784   2082  eschrock 			}
    785   2082  eschrock 			for (; i < xcount; i++, dst++) {
    786   2082  eschrock 				*dst = 0;
    787   2082  eschrock 			}
    788   2082  eschrock 
    789   2082  eschrock 		} else {
    790   2082  eschrock 			for (i = 0; i < count; i++, dst++, src++) {
    791  10105      adam 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    792   2082  eschrock 				*dst ^= *src;
    793   2082  eschrock 			}
    794   2082  eschrock 
    795   2082  eschrock 			for (; i < xcount; i++, dst++) {
    796  10105      adam 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    797   2082  eschrock 			}
    798   2082  eschrock 		}
    799   2082  eschrock 	}
    800   2082  eschrock 
    801   2082  eschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    802   2082  eschrock 	dst = rm->rm_col[x].rc_data;
    803   2082  eschrock 	exp = 255 - (rm->rm_cols - 1 - x);
    804   2082  eschrock 
    805   2082  eschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
    806   2082  eschrock 		*dst ^= *src;
    807   2082  eschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    808   2082  eschrock 			*b = vdev_raidz_exp2(*b, exp);
    809   2082  eschrock 		}
    810   2082  eschrock 	}
    811  10105      adam 
    812  10105      adam 	return (1 << VDEV_RAIDZ_Q);
    813   2082  eschrock }
    814   2082  eschrock 
    815  10105      adam static int
    816  10105      adam vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
    817   2082  eschrock {
    818   2082  eschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    819   2082  eschrock 	void *pdata, *qdata;
    820   2082  eschrock 	uint64_t xsize, ysize, i;
    821  10105      adam 	int x = tgts[0];
    822  10105      adam 	int y = tgts[1];
    823   2082  eschrock 
    824  10105      adam 	ASSERT(ntgts == 2);
    825   2082  eschrock 	ASSERT(x < y);
    826   2082  eschrock 	ASSERT(x >= rm->rm_firstdatacol);
    827   2082  eschrock 	ASSERT(y < rm->rm_cols);
    828   2082  eschrock 
    829   2082  eschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    830   2082  eschrock 
    831   2082  eschrock 	/*
    832   2082  eschrock 	 * Move the parity data aside -- we're going to compute parity as
    833   2082  eschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    834   2082  eschrock 	 * reuse the parity generation mechanism without trashing the actual
    835   2082  eschrock 	 * parity so we make those columns appear to be full of zeros by
    836   2082  eschrock 	 * setting their lengths to zero.
    837   2082  eschrock 	 */
    838   2082  eschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    839   2082  eschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    840   2082  eschrock 	xsize = rm->rm_col[x].rc_size;
    841   2082  eschrock 	ysize = rm->rm_col[y].rc_size;
    842   2082  eschrock 
    843   2082  eschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    844   2082  eschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    845   2082  eschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    846   2082  eschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    847   2082  eschrock 	rm->rm_col[x].rc_size = 0;
    848   2082  eschrock 	rm->rm_col[y].rc_size = 0;
    849   2082  eschrock 
    850   2082  eschrock 	vdev_raidz_generate_parity_pq(rm);
    851   2082  eschrock 
    852   2082  eschrock 	rm->rm_col[x].rc_size = xsize;
    853   2082  eschrock 	rm->rm_col[y].rc_size = ysize;
    854   2082  eschrock 
    855   2082  eschrock 	p = pdata;
    856   2082  eschrock 	q = qdata;
    857   2082  eschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    858   2082  eschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    859   2082  eschrock 	xd = rm->rm_col[x].rc_data;
    860   2082  eschrock 	yd = rm->rm_col[y].rc_data;
    861   2082  eschrock 
    862   2082  eschrock 	/*
    863   2082  eschrock 	 * We now have:
    864   2082  eschrock 	 *	Pxy = P + D_x + D_y
    865   2082  eschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    866   2082  eschrock 	 *
    867   2082  eschrock 	 * We can then solve for D_x:
    868   2082  eschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    869   2082  eschrock 	 * where
    870   2082  eschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    871   2082  eschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    872   2082  eschrock 	 *
    873   2082  eschrock 	 * With D_x in hand, we can easily solve for D_y:
    874   2082  eschrock 	 *	D_y = P + Pxy + D_x
    875   2082  eschrock 	 */
    876   2082  eschrock 
    877   2082  eschrock 	a = vdev_raidz_pow2[255 + x - y];
    878   2082  eschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    879   2082  eschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    880   2082  eschrock 
    881   2082  eschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    882   2082  eschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    883   2082  eschrock 
    884   2082  eschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    885   2082  eschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    886   2082  eschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    887   2082  eschrock 
    888   2082  eschrock 		if (i < ysize)
    889   2082  eschrock 			*yd = *p ^ *pxy ^ *xd;
    890   2082  eschrock 	}
    891   2082  eschrock 
    892   2082  eschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    893   2082  eschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    894   2082  eschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    895   2082  eschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    896   2082  eschrock 
    897   2082  eschrock 	/*
    898   2082  eschrock 	 * Restore the saved parity data.
    899   2082  eschrock 	 */
    900   2082  eschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    901   2082  eschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    902  10105      adam 
    903  10105      adam 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
    904   2082  eschrock }
    905   2082  eschrock 
    906  10105      adam /* BEGIN CSTYLED */
    907  10105      adam /*
    908  10105      adam  * In the general case of reconstruction, we must solve the system of linear
    909  10105      adam  * equations defined by the coeffecients used to generate parity as well as
    910  10105      adam  * the contents of the data and parity disks. This can be expressed with
    911  10105      adam  * vectors for the original data (D) and the actual data (d) and parity (p)
    912  10105      adam  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
    913  10105      adam  *
    914  10105      adam  *            __   __                     __     __
    915  10105      adam  *            |     |         __     __   |  p_0  |
    916  10105      adam  *            |  V  |         |  D_0  |   | p_m-1 |
    917  10105      adam  *            |     |    x    |   :   | = |  d_0  |
    918  10105      adam  *            |  I  |         | D_n-1 |   |   :   |
    919  10105      adam  *            |     |         ~~     ~~   | d_n-1 |
    920  10105      adam  *            ~~   ~~                     ~~     ~~
    921  10105      adam  *
    922  10105      adam  * I is simply a square identity matrix of size n, and V is a vandermonde
    923  10105      adam  * matrix defined by the coeffecients we chose for the various parity columns
    924  10105      adam  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
    925  10105      adam  * computation as well as linear separability.
    926  10105      adam  *
    927  10105      adam  *      __               __               __     __
    928  10105      adam  *      |   1   ..  1 1 1 |               |  p_0  |
    929  10105      adam  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
    930  10105      adam  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
    931  10105      adam  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
    932  10105      adam  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
    933  10105      adam  *      |   :       : : : |   |   :   |   |  d_2  |
    934  10105      adam  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
    935  10105      adam  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
    936  10105      adam  *      |   0   ..  0 0 1 |               | d_n-1 |
    937  10105      adam  *      ~~               ~~               ~~     ~~
    938  10105      adam  *
    939  10105      adam  * Note that I, V, d, and p are known. To compute D, we must invert the
    940  10105      adam  * matrix and use the known data and parity values to reconstruct the unknown
    941  10105      adam  * data values. We begin by removing the rows in V|I and d|p that correspond
    942  10105      adam  * to failed or missing columns; we then make V|I square (n x n) and d|p
    943  10105      adam  * sized n by removing rows corresponding to unused parity from the bottom up
    944  10105      adam  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
    945  10105      adam  * using Gauss-Jordan elimination. In the example below we use m=3 parity
    946  10105      adam  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
    947  10105      adam  *           __                               __
    948  10105      adam  *           |  1   1   1   1   1   1   1   1  |
    949  10105      adam  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
    950  10105      adam  *           |  19 205 116  29  64  16  4   1  |      / /
    951  10105      adam  *           |  1   0   0   0   0   0   0   0  |     / /
    952  10105      adam  *           |  0   1   0   0   0   0   0   0  | <--' /
    953  10105      adam  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
    954  10105      adam  *           |  0   0   0   1   0   0   0   0  |
    955  10105      adam  *           |  0   0   0   0   1   0   0   0  |
    956  10105      adam  *           |  0   0   0   0   0   1   0   0  |
    957  10105      adam  *           |  0   0   0   0   0   0   1   0  |
    958  10105      adam  *           |  0   0   0   0   0   0   0   1  |
    959  10105      adam  *           ~~                               ~~
    960  10105      adam  *           __                               __
    961  10105      adam  *           |  1   1   1   1   1   1   1   1  |
    962  10105      adam  *           | 128  64  32  16  8   4   2   1  |
    963  10105      adam  *           |  19 205 116  29  64  16  4   1  |
    964  10105      adam  *           |  1   0   0   0   0   0   0   0  |
    965  10105      adam  *           |  0   1   0   0   0   0   0   0  |
    966  10105      adam  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
    967  10105      adam  *           |  0   0   0   1   0   0   0   0  |
    968  10105      adam  *           |  0   0   0   0   1   0   0   0  |
    969  10105      adam  *           |  0   0   0   0   0   1   0   0  |
    970  10105      adam  *           |  0   0   0   0   0   0   1   0  |
    971  10105      adam  *           |  0   0   0   0   0   0   0   1  |
    972  10105      adam  *           ~~                               ~~
    973  10105      adam  *
    974  10105      adam  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
    975  10105      adam  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
    976  10105      adam  * matrix is not singular.
    977  10105      adam  * __                                                                 __
    978  10105      adam  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
    979  10105      adam  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
    980  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    981  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
    982  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
    983  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
    984  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
    985  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
    986  10105      adam  * ~~                                                                 ~~
    987  10105      adam  * __                                                                 __
    988  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    989  10105      adam  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
    990  10105      adam  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
    991  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
    992  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
    993  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
    994  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
    995  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
    996  10105      adam  * ~~                                                                 ~~
    997  10105      adam  * __                                                                 __
    998  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
    999  10105      adam  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1000  10105      adam  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
   1001  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1002  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1003  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1004  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1005  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1006  10105      adam  * ~~                                                                 ~~
   1007  10105      adam  * __                                                                 __
   1008  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1009  10105      adam  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1010  10105      adam  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
   1011  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1012  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1013  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1014  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1015  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1016  10105      adam  * ~~                                                                 ~~
   1017  10105      adam  * __                                                                 __
   1018  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1019  10105      adam  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1020  10105      adam  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1021  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1022  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1023  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1024  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1025  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1026  10105      adam  * ~~                                                                 ~~
   1027  10105      adam  * __                                                                 __
   1028  10105      adam  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1029  10105      adam  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
   1030  10105      adam  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1031  10105      adam  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1032  10105      adam  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1033  10105      adam  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1034  10105      adam  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1035  10105      adam  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1036  10105      adam  * ~~                                                                 ~~
   1037  10105      adam  *                   __                               __
   1038  10105      adam  *                   |  0   0   1   0   0   0   0   0  |
   1039  10105      adam  *                   | 167 100  5   41 159 169 217 208 |
   1040  10105      adam  *                   | 166 100  4   40 158 168 216 209 |
   1041  10105      adam  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
   1042  10105      adam  *                   |  0   0   0   0   1   0   0   0  |
   1043  10105      adam  *                   |  0   0   0   0   0   1   0   0  |
   1044  10105      adam  *                   |  0   0   0   0   0   0   1   0  |
   1045  10105      adam  *                   |  0   0   0   0   0   0   0   1  |
   1046  10105      adam  *                   ~~                               ~~
   1047  10105      adam  *
   1048  10105      adam  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
   1049  10105      adam  * of the missing data.
   1050  10105      adam  *
   1051  10105      adam  * As is apparent from the example above, the only non-trivial rows in the
   1052  10105      adam  * inverse matrix correspond to the data disks that we're trying to
   1053  10105      adam  * reconstruct. Indeed, those are the only rows we need as the others would
   1054  10105      adam  * only be useful for reconstructing data known or assumed to be valid. For
   1055  10105      adam  * that reason, we only build the coefficients in the rows that correspond to
   1056  10105      adam  * targeted columns.
   1057  10105      adam  */
   1058  10105      adam /* END CSTYLED */
   1059  10105      adam 
   1060  10105      adam static void
   1061  10105      adam vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
   1062  10105      adam     uint8_t **rows)
   1063  10105      adam {
   1064  10105      adam 	int i, j;
   1065  10105      adam 	int pow;
   1066  10105      adam 
   1067  10105      adam 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
   1068  10105      adam 
   1069  10105      adam 	/*
   1070  10105      adam 	 * Fill in the missing rows of interest.
   1071  10105      adam 	 */
   1072  10105      adam 	for (i = 0; i < nmap; i++) {
   1073  10105      adam 		ASSERT3S(0, <=, map[i]);
   1074  10105      adam 		ASSERT3S(map[i], <=, 2);
   1075  10105      adam 
   1076  10105      adam 		pow = map[i] * n;
   1077  10105      adam 		if (pow > 255)
   1078  10105      adam 			pow -= 255;
   1079  10105      adam 		ASSERT(pow <= 255);
   1080  10105      adam 
   1081  10105      adam 		for (j = 0; j < n; j++) {
   1082  10105      adam 			pow -= map[i];
   1083  10105      adam 			if (pow < 0)
   1084  10105      adam 				pow += 255;
   1085  10105      adam 			rows[i][j] = vdev_raidz_pow2[pow];
   1086  10105      adam 		}
   1087  10105      adam 	}
   1088  10105      adam }
   1089  10105      adam 
   1090  10105      adam static void
   1091  10105      adam vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
   1092  10105      adam     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
   1093  10105      adam {
   1094  10105      adam 	int i, j, ii, jj;
   1095  10105      adam 	uint8_t log;
   1096  10105      adam 
   1097  10105      adam 	/*
   1098  10105      adam 	 * Assert that the first nmissing entries from the array of used
   1099  10105      adam 	 * columns correspond to parity columns and that subsequent entries
   1100  10105      adam 	 * correspond to data columns.
   1101  10105      adam 	 */
   1102  10105      adam 	for (i = 0; i < nmissing; i++) {
   1103  10105      adam 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
   1104  10105      adam 	}
   1105  10105      adam 	for (; i < n; i++) {
   1106  10105      adam 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
   1107  10105      adam 	}
   1108  10105      adam 
   1109  10105      adam 	/*
   1110  10105      adam 	 * First initialize the storage where we'll compute the inverse rows.
   1111  10105      adam 	 */
   1112  10105      adam 	for (i = 0; i < nmissing; i++) {
   1113  10105      adam 		for (j = 0; j < n; j++) {
   1114  10105      adam 			invrows[i][j] = (i == j) ? 1 : 0;
   1115  10105      adam 		}
   1116  10105      adam 	}
   1117  10105      adam 
   1118  10105      adam 	/*
   1119  10105      adam 	 * Subtract all trivial rows from the rows of consequence.
   1120  10105      adam 	 */
   1121  10105      adam 	for (i = 0; i < nmissing; i++) {
   1122  10105      adam 		for (j = nmissing; j < n; j++) {
   1123  10105      adam 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
   1124  10105      adam 			jj = used[j] - rm->rm_firstdatacol;
   1125  10105      adam 			ASSERT3S(jj, <, n);
   1126  10105      adam 			invrows[i][j] = rows[i][jj];
   1127  10105      adam 			rows[i][jj] = 0;
   1128  10105      adam 		}
   1129  10105      adam 	}
   1130  10105      adam 
   1131  10105      adam 	/*
   1132  10105      adam 	 * For each of the rows of interest, we must normalize it and subtract
   1133  10105      adam 	 * a multiple of it from the other rows.
   1134  10105      adam 	 */
   1135  10105      adam 	for (i = 0; i < nmissing; i++) {
   1136  10105      adam 		for (j = 0; j < missing[i]; j++) {
   1137  10105      adam 			ASSERT3U(rows[i][j], ==, 0);
   1138  10105      adam 		}
   1139  10105      adam 		ASSERT3U(rows[i][missing[i]], !=, 0);
   1140  10105      adam 
   1141  10105      adam 		/*
   1142  10105      adam 		 * Compute the inverse of the first element and multiply each
   1143  10105      adam 		 * element in the row by that value.
   1144  10105      adam 		 */
   1145  10105      adam 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
   1146  10105      adam 
   1147  10105      adam 		for (j = 0; j < n; j++) {
   1148  10105      adam 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
   1149  10105      adam 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
   1150  10105      adam 		}
   1151  10105      adam 
   1152  10105      adam 		for (ii = 0; ii < nmissing; ii++) {
   1153  10105      adam 			if (i == ii)
   1154  10105      adam 				continue;
   1155  10105      adam 
   1156  10105      adam 			ASSERT3U(rows[ii][missing[i]], !=, 0);
   1157  10105      adam 
   1158  10105      adam 			log = vdev_raidz_log2[rows[ii][missing[i]]];
   1159  10105      adam 
   1160  10105      adam 			for (j = 0; j < n; j++) {
   1161  10105      adam 				rows[ii][j] ^=
   1162  10105      adam 				    vdev_raidz_exp2(rows[i][j], log);
   1163  10105      adam 				invrows[ii][j] ^=
   1164  10105      adam 				    vdev_raidz_exp2(invrows[i][j], log);
   1165  10105      adam 			}
   1166  10105      adam 		}
   1167  10105      adam 	}
   1168  10105      adam 
   1169  10105      adam 	/*
   1170  10105      adam 	 * Verify that the data that is left in the rows are properly part of
   1171  10105      adam 	 * an identity matrix.
   1172  10105      adam 	 */
   1173  10105      adam 	for (i = 0; i < nmissing; i++) {
   1174  10105      adam 		for (j = 0; j < n; j++) {
   1175  10105      adam 			if (j == missing[i]) {
   1176  10105      adam 				ASSERT3U(rows[i][j], ==, 1);
   1177  10105      adam 			} else {
   1178  10105      adam 				ASSERT3U(rows[i][j], ==, 0);
   1179  10105      adam 			}
   1180  10105      adam 		}
   1181  10105      adam 	}
   1182  10105      adam }
   1183  10105      adam 
   1184  10105      adam static void
   1185  10105      adam vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
   1186  10105      adam     int *missing, uint8_t **invrows, const uint8_t *used)
   1187  10105      adam {
   1188  10105      adam 	int i, j, x, cc, c;
   1189  10105      adam 	uint8_t *src;
   1190  10105      adam 	uint64_t ccount;
   1191  10105      adam 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
   1192  10105      adam 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
   1193  10105      adam 	uint8_t log, val;
   1194  10105      adam 	int ll;
   1195  10105      adam 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
   1196  10105      adam 	uint8_t *p, *pp;
   1197  10105      adam 	size_t psize;
   1198  10105      adam 
   1199  10105      adam 	psize = sizeof (invlog[0][0]) * n * nmissing;
   1200  10105      adam 	p = kmem_alloc(psize, KM_SLEEP);
   1201  10105      adam 
   1202  10105      adam 	for (pp = p, i = 0; i < nmissing; i++) {
   1203  10105      adam 		invlog[i] = pp;
   1204  10105      adam 		pp += n;
   1205  10105      adam 	}
   1206  10105      adam 
   1207  10105      adam 	for (i = 0; i < nmissing; i++) {
   1208  10105      adam 		for (j = 0; j < n; j++) {
   1209  10105      adam 			ASSERT3U(invrows[i][j], !=, 0);
   1210  10105      adam 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
   1211  10105      adam 		}
   1212  10105      adam 	}
   1213  10105      adam 
   1214  10105      adam 	for (i = 0; i < n; i++) {
   1215  10105      adam 		c = used[i];
   1216  10105      adam 		ASSERT3U(c, <, rm->rm_cols);
   1217  10105      adam 
   1218  10105      adam 		src = rm->rm_col[c].rc_data;
   1219  10105      adam 		ccount = rm->rm_col[c].rc_size;
   1220  10105      adam 		for (j = 0; j < nmissing; j++) {
   1221  10105      adam 			cc = missing[j] + rm->rm_firstdatacol;
   1222  10105      adam 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
   1223  10105      adam 			ASSERT3U(cc, <, rm->rm_cols);
   1224  10105      adam 			ASSERT3U(cc, !=, c);
   1225  10105      adam 
   1226  10105      adam 			dst[j] = rm->rm_col[cc].rc_data;
   1227  10105      adam 			dcount[j] = rm->rm_col[cc].rc_size;
   1228  10105      adam 		}
   1229  10105      adam 
   1230  10105      adam 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
   1231  10105      adam 
   1232  10105      adam 		for (x = 0; x < ccount; x++, src++) {
   1233  10105      adam 			if (*src != 0)
   1234  10105      adam 				log = vdev_raidz_log2[*src];
   1235  10105      adam 
   1236  10105      adam 			for (cc = 0; cc < nmissing; cc++) {
   1237  10105      adam 				if (x >= dcount[cc])
   1238  10105      adam 					continue;
   1239  10105      adam 
   1240  10105      adam 				if (*src == 0) {
   1241  10105      adam 					val = 0;
   1242  10105      adam 				} else {
   1243  10105      adam 					if ((ll = log + invlog[cc][i]) >= 255)
   1244  10105      adam 						ll -= 255;
   1245  10105      adam 					val = vdev_raidz_pow2[ll];
   1246  10105      adam 				}
   1247  10105      adam 
   1248  10105      adam 				if (i == 0)
   1249  10105      adam 					dst[cc][x] = val;
   1250  10105      adam 				else
   1251  10105      adam 					dst[cc][x] ^= val;
   1252  10105      adam 			}
   1253  10105      adam 		}
   1254  10105      adam 	}
   1255  10105      adam 
   1256  10105      adam 	kmem_free(p, psize);
   1257  10105      adam }
   1258  10105      adam 
   1259  10105      adam static int
   1260  10105      adam vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
   1261  10105      adam {
   1262  10105      adam 	int n, i, c, t, tt;
   1263  10105      adam 	int nmissing_rows;
   1264  10105      adam 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
   1265  10105      adam 	int parity_map[VDEV_RAIDZ_MAXPARITY];
   1266  10105      adam 
   1267  10105      adam 	uint8_t *p, *pp;
   1268  10105      adam 	size_t psize;
   1269  10105      adam 
   1270  10105      adam 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
   1271  10105      adam 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
   1272  10105      adam 	uint8_t *used;
   1273  10105      adam 
   1274  10105      adam 	int code = 0;
   1275  10105      adam 
   1276  10105      adam 
   1277  10105      adam 	n = rm->rm_cols - rm->rm_firstdatacol;
   1278  10105      adam 
   1279  10105      adam 	/*
   1280  10105      adam 	 * Figure out which data columns are missing.
   1281  10105      adam 	 */
   1282  10105      adam 	nmissing_rows = 0;
   1283  10105      adam 	for (t = 0; t < ntgts; t++) {
   1284  10105      adam 		if (tgts[t] >= rm->rm_firstdatacol) {
   1285  10105      adam 			missing_rows[nmissing_rows++] =
   1286  10105      adam 			    tgts[t] - rm->rm_firstdatacol;
   1287  10105      adam 		}
   1288  10105      adam 	}
   1289  10105      adam 
   1290  10105      adam 	/*
   1291  10105      adam 	 * Figure out which parity columns to use to help generate the missing
   1292  10105      adam 	 * data columns.
   1293  10105      adam 	 */
   1294  10105      adam 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
   1295  10105      adam 		ASSERT(tt < ntgts);
   1296  10105      adam 		ASSERT(c < rm->rm_firstdatacol);
   1297  10105      adam 
   1298  10105      adam 		/*
   1299  10105      adam 		 * Skip any targeted parity columns.
   1300  10105      adam 		 */
   1301  10105      adam 		if (c == tgts[tt]) {
   1302  10105      adam 			tt++;
   1303  10105      adam 			continue;
   1304  10105      adam 		}
   1305  10105      adam 
   1306  10105      adam 		code |= 1 << c;
   1307  10105      adam 
   1308  10105      adam 		parity_map[i] = c;
   1309  10105      adam 		i++;
   1310  10105      adam 	}
   1311  10105      adam 
   1312  10105      adam 	ASSERT(code != 0);
   1313  10105      adam 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
   1314  10105      adam 
   1315  10105      adam 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
   1316  10105      adam 	    nmissing_rows * n + sizeof (used[0]) * n;
   1317  10105      adam 	p = kmem_alloc(psize, KM_SLEEP);
   1318  10105      adam 
   1319  10105      adam 	for (pp = p, i = 0; i < nmissing_rows; i++) {
   1320  10105      adam 		rows[i] = pp;
   1321  10105      adam 		pp += n;
   1322  10105      adam 		invrows[i] = pp;
   1323  10105      adam 		pp += n;
   1324  10105      adam 	}
   1325  10105      adam 	used = pp;
   1326  10105      adam 
   1327  10105      adam 	for (i = 0; i < nmissing_rows; i++) {
   1328  10105      adam 		used[i] = parity_map[i];
   1329  10105      adam 	}
   1330  10105      adam 
   1331  10105      adam 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1332  10105      adam 		if (tt < nmissing_rows &&
   1333  10105      adam 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
   1334  10105      adam 			tt++;
   1335  10105      adam 			continue;
   1336  10105      adam 		}
   1337  10105      adam 
   1338  10105      adam 		ASSERT3S(i, <, n);
   1339  10105      adam 		used[i] = c;
   1340  10105      adam 		i++;
   1341  10105      adam 	}
   1342  10105      adam 
   1343  10105      adam 	/*
   1344  10105      adam 	 * Initialize the interesting rows of the matrix.
   1345  10105      adam 	 */
   1346  10105      adam 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
   1347  10105      adam 
   1348  10105      adam 	/*
   1349  10105      adam 	 * Invert the matrix.
   1350  10105      adam 	 */
   1351  10105      adam 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
   1352  10105      adam 	    invrows, used);
   1353  10105      adam 
   1354  10105      adam 	/*
   1355  10105      adam 	 * Reconstruct the missing data using the generated matrix.
   1356  10105      adam 	 */
   1357  10105      adam 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
   1358  10105      adam 	    invrows, used);
   1359  10105      adam 
   1360  10105      adam 	kmem_free(p, psize);
   1361  10105      adam 
   1362  10105      adam 	return (code);
   1363  10105      adam }
   1364  10105      adam 
   1365  10105      adam static int
   1366  10105      adam vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
   1367  10105      adam {
   1368  10105      adam 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
   1369  10105      adam 	int ntgts;
   1370  10105      adam 	int i, c;
   1371  10105      adam 	int code;
   1372  10105      adam 	int nbadparity, nbaddata;
   1373  10105      adam 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
   1374  10105      adam 
   1375  10105      adam 	/*
   1376  10105      adam 	 * The tgts list must already be sorted.
   1377  10105      adam 	 */
   1378  10105      adam 	for (i = 1; i < nt; i++) {
   1379  10105      adam 		ASSERT(t[i] > t[i - 1]);
   1380  10105      adam 	}
   1381  10105      adam 
   1382  10105      adam 	nbadparity = rm->rm_firstdatacol;
   1383  10105      adam 	nbaddata = rm->rm_cols - nbadparity;
   1384  10105      adam 	ntgts = 0;
   1385  10105      adam 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
   1386  10105      adam 		if (c < rm->rm_firstdatacol)
   1387  10105      adam 			parity_valid[c] = B_FALSE;
   1388  10105      adam 
   1389  10105      adam 		if (i < nt && c == t[i]) {
   1390  10105      adam 			tgts[ntgts++] = c;
   1391  10105      adam 			i++;
   1392  10105      adam 		} else if (rm->rm_col[c].rc_error != 0) {
   1393  10105      adam 			tgts[ntgts++] = c;
   1394  10105      adam 		} else if (c >= rm->rm_firstdatacol) {
   1395  10105      adam 			nbaddata--;
   1396  10105      adam 		} else {
   1397  10105      adam 			parity_valid[c] = B_TRUE;
   1398  10105      adam 			nbadparity--;
   1399  10105      adam 		}
   1400  10105      adam 	}
   1401  10105      adam 
   1402  10105      adam 	ASSERT(ntgts >= nt);
   1403  10105      adam 	ASSERT(nbaddata >= 0);
   1404  10105      adam 	ASSERT(nbaddata + nbadparity == ntgts);
   1405  10105      adam 
   1406  10105      adam 	dt = &tgts[nbadparity];
   1407  10105      adam 
   1408  10105      adam 	/*
   1409  10105      adam 	 * See if we can use any of our optimized reconstruction routines.
   1410  10105      adam 	 */
   1411  10105      adam 	if (!vdev_raidz_default_to_general) {
   1412  10105      adam 		switch (nbaddata) {
   1413  10105      adam 		case 1:
   1414  10105      adam 			if (parity_valid[VDEV_RAIDZ_P])
   1415  10105      adam 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
   1416  10105      adam 
   1417  10105      adam 			ASSERT(rm->rm_firstdatacol > 1);
   1418  10105      adam 
   1419  10105      adam 			if (parity_valid[VDEV_RAIDZ_Q])
   1420  10105      adam 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
   1421  10105      adam 
   1422  10105      adam 			ASSERT(rm->rm_firstdatacol > 2);
   1423  10105      adam 			break;
   1424  10105      adam 
   1425  10105      adam 		case 2:
   1426  10105      adam 			ASSERT(rm->rm_firstdatacol > 1);
   1427  10105      adam 
   1428  10105      adam 			if (parity_valid[VDEV_RAIDZ_P] &&
   1429  10105      adam 			    parity_valid[VDEV_RAIDZ_Q])
   1430  10105      adam 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
   1431  10105      adam 
   1432  10105      adam 			ASSERT(rm->rm_firstdatacol > 2);
   1433  10105      adam 
   1434  10105      adam 			break;
   1435  10105      adam 		}
   1436  10105      adam 	}
   1437  10105      adam 
   1438  10105      adam 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
   1439  10105      adam 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
   1440  10105      adam 	ASSERT(code > 0);
   1441  10105      adam 	return (code);
   1442  10105      adam }
   1443    789    ahrens 
   1444    789    ahrens static int
   1445    789    ahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
   1446    789    ahrens {
   1447  10105      adam 	vdev_t *cvd;
   1448   2082  eschrock 	uint64_t nparity = vd->vdev_nparity;
   1449  10105      adam 	int c;
   1450    789    ahrens 	int lasterror = 0;
   1451    789    ahrens 	int numerrors = 0;
   1452    789    ahrens 
   1453   2082  eschrock 	ASSERT(nparity > 0);
   1454   2082  eschrock 
   1455   2082  eschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
   1456   2082  eschrock 	    vd->vdev_children < nparity + 1) {
   1457    789    ahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
   1458    789    ahrens 		return (EINVAL);
   1459    789    ahrens 	}
   1460    789    ahrens 
   1461   9846      Eric 	vdev_open_children(vd);
   1462    789    ahrens 
   1463  10105      adam 	for (c = 0; c < vd->vdev_children; c++) {
   1464  10105      adam 		cvd = vd->vdev_child[c];
   1465   9846      Eric 
   1466  10105      adam 		if (cvd->vdev_open_error != 0) {
   1467   9846      Eric 			lasterror = cvd->vdev_open_error;
   1468    789    ahrens 			numerrors++;
   1469    789    ahrens 			continue;
   1470    789    ahrens 		}
   1471    789    ahrens 
   1472    789    ahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
   1473   1732   bonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
   1474    789    ahrens 	}
   1475    789    ahrens 
   1476    789    ahrens 	*asize *= vd->vdev_children;
   1477    789    ahrens 
   1478   2082  eschrock 	if (numerrors > nparity) {
   1479    789    ahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
   1480    789    ahrens 		return (lasterror);
   1481    789    ahrens 	}
   1482    789    ahrens 
   1483    789    ahrens 	return (0);
   1484    789    ahrens }
   1485    789    ahrens 
   1486    789    ahrens static void
   1487    789    ahrens vdev_raidz_close(vdev_t *vd)
   1488    789    ahrens {
   1489  10105      adam 	int c;
   1490  10105      adam 
   1491  10105      adam 	for (c = 0; c < vd->vdev_children; c++)
   1492    789    ahrens 		vdev_close(vd->vdev_child[c]);
   1493    789    ahrens }
   1494    789    ahrens 
   1495    789    ahrens static uint64_t
   1496    789    ahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
   1497    789    ahrens {
   1498    789    ahrens 	uint64_t asize;
   1499   1732   bonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
   1500    789    ahrens 	uint64_t cols = vd->vdev_children;
   1501   2082  eschrock 	uint64_t nparity = vd->vdev_nparity;
   1502    789    ahrens 
   1503   1732   bonwick 	asize = ((psize - 1) >> ashift) + 1;
   1504   2082  eschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
   1505   2082  eschrock 	asize = roundup(asize, nparity + 1) << ashift;
   1506    789    ahrens 
   1507    789    ahrens 	return (asize);
   1508    789    ahrens }
   1509    789    ahrens 
   1510    789    ahrens static void
   1511    789    ahrens vdev_raidz_child_done(zio_t *zio)
   1512    789    ahrens {
   1513    789    ahrens 	raidz_col_t *rc = zio->io_private;
   1514    789    ahrens 
   1515    789    ahrens 	rc->rc_error = zio->io_error;
   1516    789    ahrens 	rc->rc_tried = 1;
   1517    789    ahrens 	rc->rc_skipped = 0;
   1518    789    ahrens }
   1519    789    ahrens 
   1520   5530   bonwick static int
   1521    789    ahrens vdev_raidz_io_start(zio_t *zio)
   1522    789    ahrens {
   1523    789    ahrens 	vdev_t *vd = zio->io_vd;
   1524   1732   bonwick 	vdev_t *tvd = vd->vdev_top;
   1525    789    ahrens 	vdev_t *cvd;
   1526    789    ahrens 	raidz_map_t *rm;
   1527    789    ahrens 	raidz_col_t *rc;
   1528  10105      adam 	int c, i;
   1529    789    ahrens 
   1530   2082  eschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
   1531   2082  eschrock 	    vd->vdev_nparity);
   1532    789    ahrens 
   1533   1775     billm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
   1534    789    ahrens 
   1535    789    ahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
   1536  10105      adam 		vdev_raidz_generate_parity(rm);
   1537    789    ahrens 
   1538    789    ahrens 		for (c = 0; c < rm->rm_cols; c++) {
   1539    789    ahrens 			rc = &rm->rm_col[c];
   1540   2082  eschrock 			cvd = vd->vdev_child[rc->rc_devidx];
   1541    789    ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1542    789    ahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1543   7754      Jeff 			    zio->io_type, zio->io_priority, 0,
   1544    789    ahrens 			    vdev_raidz_child_done, rc));
   1545    789    ahrens 		}
   1546   5530   bonwick 
   1547  10105      adam 		/*
   1548  10105      adam 		 * Generate optional I/Os for any skipped sectors to improve
   1549  10105      adam 		 * aggregation contiguity.
   1550  10105      adam 		 */
   1551  10450      adam 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
   1552  10105      adam 			ASSERT(c <= rm->rm_scols);
   1553  10105      adam 			if (c == rm->rm_scols)
   1554  10105      adam 				c = 0;
   1555  10105      adam 			rc = &rm->rm_col[c];
   1556  10105      adam 			cvd = vd->vdev_child[rc->rc_devidx];
   1557  10105      adam 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1558  10105      adam 			    rc->rc_offset + rc->rc_size, NULL,
   1559  10105      adam 			    1 << tvd->vdev_ashift,
   1560  10105      adam 			    zio->io_type, zio->io_priority,
   1561  10105      adam 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
   1562  10105      adam 		}
   1563  10105      adam 
   1564   7754      Jeff 		return (ZIO_PIPELINE_CONTINUE);
   1565    789    ahrens 	}
   1566    789    ahrens 
   1567    789    ahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   1568    789    ahrens 
   1569   2082  eschrock 	/*
   1570   2082  eschrock 	 * Iterate over the columns in reverse order so that we hit the parity
   1571  10105      adam 	 * last -- any errors along the way will force us to read the parity.
   1572   2082  eschrock 	 */
   1573    789    ahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
   1574    789    ahrens 		rc = &rm->rm_col[c];
   1575   2082  eschrock 		cvd = vd->vdev_child[rc->rc_devidx];
   1576   5329   gw25295 		if (!vdev_readable(cvd)) {
   1577   2082  eschrock 			if (c >= rm->rm_firstdatacol)
   1578   2082  eschrock 				rm->rm_missingdata++;
   1579   2082  eschrock 			else
   1580   2082  eschrock 				rm->rm_missingparity++;
   1581    789    ahrens 			rc->rc_error = ENXIO;
   1582    789    ahrens 			rc->rc_tried = 1;	/* don't even try */
   1583    789    ahrens 			rc->rc_skipped = 1;
   1584    789    ahrens 			continue;
   1585    789    ahrens 		}
   1586  10922      Jeff 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
   1587   2082  eschrock 			if (c >= rm->rm_firstdatacol)
   1588   2082  eschrock 				rm->rm_missingdata++;
   1589   2082  eschrock 			else
   1590   2082  eschrock 				rm->rm_missingparity++;
   1591    789    ahrens 			rc->rc_error = ESTALE;
   1592    789    ahrens 			rc->rc_skipped = 1;
   1593    789    ahrens 			continue;
   1594    789    ahrens 		}
   1595   2082  eschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
   1596   9434      Mark 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
   1597    789    ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1598    789    ahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1599   7754      Jeff 			    zio->io_type, zio->io_priority, 0,
   1600    789    ahrens 			    vdev_raidz_child_done, rc));
   1601    789    ahrens 		}
   1602    789    ahrens 	}
   1603    789    ahrens 
   1604   7754      Jeff 	return (ZIO_PIPELINE_CONTINUE);
   1605    789    ahrens }
   1606    789    ahrens 
   1607   1544  eschrock /*
   1608   1544  eschrock  * Report a checksum error for a child of a RAID-Z device.
   1609   1544  eschrock  */
   1610   1544  eschrock static void
   1611  10614  Jonathan raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
   1612   1544  eschrock {
   1613   2082  eschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
   1614   1544  eschrock 
   1615   1544  eschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   1616  10614  Jonathan 		zio_bad_cksum_t zbc;
   1617  10614  Jonathan 		raidz_map_t *rm = zio->io_vsd;
   1618  10614  Jonathan 
   1619   1544  eschrock 		mutex_enter(&vd->vdev_stat_lock);
   1620   1544  eschrock 		vd->vdev_stat.vs_checksum_errors++;
   1621   1544  eschrock 		mutex_exit(&vd->vdev_stat_lock);
   1622  10614  Jonathan 
   1623  10614  Jonathan 		zbc.zbc_has_cksum = 0;
   1624  10614  Jonathan 		zbc.zbc_injected = rm->rm_ecksuminjected;
   1625  10614  Jonathan 
   1626  10614  Jonathan 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
   1627  10614  Jonathan 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
   1628  10614  Jonathan 		    &zbc);
   1629   1544  eschrock 	}
   1630  10614  Jonathan }
   1631   1544  eschrock 
   1632  10614  Jonathan /*
   1633  10614  Jonathan  * We keep track of whether or not there were any injected errors, so that
   1634  10614  Jonathan  * any ereports we generate can note it.
   1635  10614  Jonathan  */
   1636  10614  Jonathan static int
   1637  10614  Jonathan raidz_checksum_verify(zio_t *zio)
   1638  10614  Jonathan {
   1639  10614  Jonathan 	zio_bad_cksum_t zbc;
   1640  10614  Jonathan 	raidz_map_t *rm = zio->io_vsd;
   1641  10614  Jonathan 
   1642  10614  Jonathan 	int ret = zio_checksum_error(zio, &zbc);
   1643  10614  Jonathan 	if (ret != 0 && zbc.zbc_injected != 0)
   1644  10614  Jonathan 		rm->rm_ecksuminjected = 1;
   1645  10614  Jonathan 
   1646  10614  Jonathan 	return (ret);
   1647   1544  eschrock }
   1648   1544  eschrock 
   1649   2082  eschrock /*
   1650   2082  eschrock  * Generate the parity from the data columns. If we tried and were able to
   1651   2082  eschrock  * read the parity without error, verify that the generated parity matches the
   1652   2082  eschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
   1653   2082  eschrock  * number such failures.
   1654   2082  eschrock  */
   1655   2082  eschrock static int
   1656   2082  eschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
   1657   2082  eschrock {
   1658   2082  eschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1659   2082  eschrock 	int c, ret = 0;
   1660   2082  eschrock 	raidz_col_t *rc;
   1661   2082  eschrock 
   1662   2082  eschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1663   2082  eschrock 		rc = &rm->rm_col[c];
   1664   2082  eschrock 		if (!rc->rc_tried || rc->rc_error != 0)
   1665   2082  eschrock 			continue;
   1666   2082  eschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
   1667   2082  eschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
   1668   2082  eschrock 	}
   1669   2082  eschrock 
   1670  10105      adam 	vdev_raidz_generate_parity(rm);
   1671   2082  eschrock 
   1672   2082  eschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1673   2082  eschrock 		rc = &rm->rm_col[c];
   1674   2082  eschrock 		if (!rc->rc_tried || rc->rc_error != 0)
   1675   2082  eschrock 			continue;
   1676   2082  eschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
   1677  10614  Jonathan 			raidz_checksum_error(zio, rc, orig[c]);
   1678   2082  eschrock 			rc->rc_error = ECKSUM;
   1679   2082  eschrock 			ret++;
   1680   2082  eschrock 		}
   1681   2082  eschrock 		zio_buf_free(orig[c], rc->rc_size);
   1682   2082  eschrock 	}
   1683   2082  eschrock 
   1684   2082  eschrock 	return (ret);
   1685   2082  eschrock }
   1686   2082  eschrock 
   1687  10105      adam /*
   1688  10105      adam  * Keep statistics on all the ways that we used parity to correct data.
   1689  10105      adam  */
   1690  10105      adam static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
   1691   1544  eschrock 
   1692   5530   bonwick static int
   1693   7754      Jeff vdev_raidz_worst_error(raidz_map_t *rm)
   1694   7754      Jeff {
   1695   7754      Jeff 	int error = 0;
   1696   7754      Jeff 
   1697   7754      Jeff 	for (int c = 0; c < rm->rm_cols; c++)
   1698   7754      Jeff 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
   1699   7754      Jeff 
   1700   7754      Jeff 	return (error);
   1701   7754      Jeff }
   1702   7754      Jeff 
   1703  10105      adam /*
   1704  10105      adam  * Iterate over all combinations of bad data and attempt a reconstruction.
   1705  10105      adam  * Note that the algorithm below is non-optimal because it doesn't take into
   1706  10105      adam  * account how reconstruction is actually performed. For example, with
   1707  10105      adam  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
   1708  10105      adam  * is targeted as invalid as if columns 1 and 4 are targeted since in both
   1709  10105      adam  * cases we'd only use parity information in column 0.
   1710  10105      adam  */
   1711  10105      adam static int
   1712  10105      adam vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
   1713  10105      adam {
   1714  10105      adam 	raidz_map_t *rm = zio->io_vsd;
   1715  10105      adam 	raidz_col_t *rc;
   1716  10105      adam 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1717  10105      adam 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
   1718  10105      adam 	int *tgts = &tstore[1];
   1719  10105      adam 	int current, next, i, c, n;
   1720  10105      adam 	int code, ret = 0;
   1721  10105      adam 
   1722  10105      adam 	ASSERT(total_errors < rm->rm_firstdatacol);
   1723  10105      adam 
   1724  10105      adam 	/*
   1725  10105      adam 	 * This simplifies one edge condition.
   1726  10105      adam 	 */
   1727  10105      adam 	tgts[-1] = -1;
   1728  10105      adam 
   1729  10105      adam 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
   1730  10105      adam 		/*
   1731  10105      adam 		 * Initialize the targets array by finding the first n columns
   1732  10105      adam 		 * that contain no error.
   1733  10105      adam 		 *
   1734  10105      adam 		 * If there were no data errors, we need to ensure that we're
   1735  10105      adam 		 * always explicitly attempting to reconstruct at least one
   1736  10105      adam 		 * data column. To do this, we simply push the highest target
   1737  10105      adam 		 * up into the data columns.
   1738  10105      adam 		 */
   1739  10105      adam 		for (c = 0, i = 0; i < n; i++) {
   1740  10105      adam 			if (i == n - 1 && data_errors == 0 &&
   1741  10105      adam 			    c < rm->rm_firstdatacol) {
   1742  10105      adam 				c = rm->rm_firstdatacol;
   1743  10105      adam 			}
   1744  10105      adam 
   1745  10105      adam 			while (rm->rm_col[c].rc_error != 0) {
   1746  10105      adam 				c++;
   1747  10105      adam 				ASSERT3S(c, <, rm->rm_cols);
   1748  10105      adam 			}
   1749  10105      adam 
   1750  10105      adam 			tgts[i] = c++;
   1751  10105      adam 		}
   1752  10105      adam 
   1753  10105      adam 		/*
   1754  10105      adam 		 * Setting tgts[n] simplifies the other edge condition.
   1755  10105      adam 		 */
   1756  10105      adam 		tgts[n] = rm->rm_cols;
   1757  10105      adam 
   1758  10105      adam 		/*
   1759  10105      adam 		 * These buffers were allocated in previous iterations.
   1760  10105      adam 		 */
   1761  10105      adam 		for (i = 0; i < n - 1; i++) {
   1762  10105      adam 			ASSERT(orig[i] != NULL);
   1763  10105      adam 		}
   1764  10105      adam 
   1765  10105      adam 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
   1766  10105      adam 
   1767  10105      adam 		current = 0;
   1768  10105      adam 		next = tgts[current];
   1769  10105      adam 
   1770  10105      adam 		while (current != n) {
   1771  10105      adam 			tgts[current] = next;
   1772  10105      adam 			current = 0;
   1773  10105      adam 
   1774  10105      adam 			/*
   1775  10105      adam 			 * Save off the original data that we're going to
   1776  10105      adam 			 * attempt to reconstruct.
   1777  10105      adam 			 */
   1778  10105      adam 			for (i = 0; i < n; i++) {
   1779  10105      adam 				ASSERT(orig[i] != NULL);
   1780  10105      adam 				c = tgts[i];
   1781  10105      adam 				ASSERT3S(c, >=, 0);
   1782  10105      adam 				ASSERT3S(c, <, rm->rm_cols);
   1783  10105      adam 				rc = &rm->rm_col[c];
   1784  10105      adam 				bcopy(rc->rc_data, orig[i], rc->rc_size);
   1785  10105      adam 			}
   1786  10105      adam 
   1787  10105      adam 			/*
   1788  10105      adam 			 * Attempt a reconstruction and exit the outer loop on
   1789  10105      adam 			 * success.
   1790  10105      adam 			 */
   1791  10105      adam 			code = vdev_raidz_reconstruct(rm, tgts, n);
   1792  10614  Jonathan 			if (raidz_checksum_verify(zio) == 0) {
   1793  10105      adam 				atomic_inc_64(&raidz_corrected[code]);
   1794  10105      adam 
   1795  10105      adam 				for (i = 0; i < n; i++) {
   1796  10105      adam 					c = tgts[i];
   1797  10105      adam 					rc = &rm->rm_col[c];
   1798  10105      adam 					ASSERT(rc->rc_error == 0);
   1799  10614  Jonathan 					if (rc->rc_tried)
   1800  10614  Jonathan 						raidz_checksum_error(zio, rc,
   1801  10614  Jonathan 						    orig[i]);
   1802  10105      adam 					rc->rc_error = ECKSUM;
   1803  10105      adam 				}
   1804  10105      adam 
   1805  10105      adam 				ret = code;
   1806  10105      adam 				goto done;
   1807  10105      adam 			}
   1808  10105      adam 
   1809  10105      adam 			/*
   1810  10105      adam 			 * Restore the original data.
   1811  10105      adam 			 */
   1812  10105      adam 			for (i = 0; i < n; i++) {
   1813  10105      adam 				c = tgts[i];
   1814  10105      adam 				rc = &rm->rm_col[c];
   1815  10105      adam 				bcopy(orig[i], rc->rc_data, rc->rc_size);
   1816  10105      adam 			}
   1817  10105      adam 
   1818  10105      adam 			do {
   1819  10105      adam 				/*
   1820  10105      adam 				 * Find the next valid column after the current
   1821  10105      adam 				 * position..
   1822  10105      adam 				 */
   1823  10105      adam 				for (next = tgts[current] + 1;
   1824  10105      adam 				    next < rm->rm_cols &&
   1825  10105      adam 				    rm->rm_col[next].rc_error != 0; next++)
   1826  10105      adam 					continue;
   1827  10105      adam 
   1828  10105      adam 				ASSERT(next <= tgts[current + 1]);
   1829  10105      adam 
   1830  10105      adam 				/*
   1831  10105      adam 				 * If that spot is available, we're done here.
   1832  10105      adam 				 */
   1833  10105      adam 				if (next != tgts[current + 1])
   1834  10105      adam 					break;
   1835  10105      adam 
   1836  10105      adam 				/*
   1837  10105      adam 				 * Otherwise, find the next valid column after
   1838  10105      adam 				 * the previous position.
   1839  10105      adam 				 */
   1840  10105      adam 				for (c = tgts[current - 1] + 1;
   1841  10105      adam 				    rm->rm_col[c].rc_error != 0; c++)
   1842  10105      adam 					continue;
   1843  10105      adam 
   1844  10105      adam 				tgts[current] = c;
   1845  10105      adam 				current++;
   1846  10105      adam 
   1847  10105      adam 			} while (current != n);
   1848  10105      adam 		}
   1849  10105      adam 	}
   1850  10105      adam 	n--;
   1851  10105      adam done:
   1852  10105      adam 	for (i = 0; i < n; i++) {
   1853  10105      adam 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
   1854  10105      adam 	}
   1855  10105      adam 
   1856  10105      adam 	return (ret);
   1857  10105      adam }
   1858  10105      adam 
   1859   7754      Jeff static void
   1860    789    ahrens vdev_raidz_io_done(zio_t *zio)
   1861    789    ahrens {
   1862    789    ahrens 	vdev_t *vd = zio->io_vd;
   1863    789    ahrens 	vdev_t *cvd;
   1864    789    ahrens 	raidz_map_t *rm = zio->io_vsd;
   1865  10105      adam 	raidz_col_t *rc;
   1866    789    ahrens 	int unexpected_errors = 0;
   1867   2082  eschrock 	int parity_errors = 0;
   1868   3456       ahl 	int parity_untried = 0;
   1869   2082  eschrock 	int data_errors = 0;
   1870   7754      Jeff 	int total_errors = 0;
   1871  10105      adam 	int n, c;
   1872  10105      adam 	int tgts[VDEV_RAIDZ_MAXPARITY];
   1873  10105      adam 	int code;
   1874    789    ahrens 
   1875   1775     billm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
   1876   2082  eschrock 
   1877   2082  eschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
   1878   2082  eschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
   1879    789    ahrens 
   1880    789    ahrens 	for (c = 0; c < rm->rm_cols; c++) {
   1881    789    ahrens 		rc = &rm->rm_col[c];
   1882    789    ahrens 
   1883    789    ahrens 		if (rc->rc_error) {
   1884   7754      Jeff 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
   1885   2082  eschrock 
   1886   2082  eschrock 			if (c < rm->rm_firstdatacol)
   1887   2082  eschrock 				parity_errors++;
   1888   2082  eschrock 			else
   1889   2082  eschrock 				data_errors++;
   1890   2082  eschrock 
   1891    789    ahrens 			if (!rc->rc_skipped)
   1892    789    ahrens 				unexpected_errors++;
   1893   2082  eschrock 
   1894   7754      Jeff 			total_errors++;
   1895   3456       ahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
   1896   3456       ahl 			parity_untried++;
   1897    789    ahrens 		}
   1898    789    ahrens 	}
   1899    789    ahrens 
   1900    789    ahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
   1901    789    ahrens 		/*
   1902   7754      Jeff 		 * XXX -- for now, treat partial writes as a success.
   1903   7754      Jeff 		 * (If we couldn't write enough columns to reconstruct
   1904   7754      Jeff 		 * the data, the I/O failed.  Otherwise, good enough.)
   1905   7754      Jeff 		 *
   1906   7754      Jeff 		 * Now that we support write reallocation, it would be better
   1907   7754      Jeff 		 * to treat partial failure as real failure unless there are
   1908   7754      Jeff 		 * no non-degraded top-level vdevs left, and not update DTLs
   1909   7754      Jeff 		 * if we intend to reallocate.
   1910    789    ahrens 		 */
   1911    789    ahrens 		/* XXPOLICY */
   1912   7754      Jeff 		if (total_errors > rm->rm_firstdatacol)
   1913   7754      Jeff 			zio->io_error = vdev_raidz_worst_error(rm);
   1914    789    ahrens 
   1915   7754      Jeff 		return;
   1916    789    ahrens 	}
   1917    789    ahrens 
   1918    789    ahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   1919   2082  eschrock 	/*
   1920   2082  eschrock 	 * There are three potential phases for a read:
   1921   2082  eschrock 	 *	1. produce valid data from the columns read
   1922   2082  eschrock 	 *	2. read all disks and try again
   1923   2082  eschrock 	 *	3. perform combinatorial reconstruction
   1924   2082  eschrock 	 *
   1925   2082  eschrock 	 * Each phase is progressively both more expensive and less likely to
   1926   2082  eschrock 	 * occur. If we encounter more errors than we can repair or all phases
   1927   2082  eschrock 	 * fail, we have no choice but to return an error.
   1928   2082  eschrock 	 */
   1929    789    ahrens 
   1930    789    ahrens 	/*
   1931   2082  eschrock 	 * If the number of errors we saw was correctable -- less than or equal
   1932   3456       ahl 	 * to the number of parity disks read -- attempt to produce data that
   1933   3456       ahl 	 * has a valid checksum. Naturally, this case applies in the absence of
   1934   3456       ahl 	 * any errors.
   1935    789    ahrens 	 */
   1936   7754      Jeff 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
   1937  10105      adam 		if (data_errors == 0) {
   1938  10614  Jonathan 			if (raidz_checksum_verify(zio) == 0) {
   1939   4034       ahl 				/*
   1940   4034       ahl 				 * If we read parity information (unnecessarily
   1941   4034       ahl 				 * as it happens since no reconstruction was
   1942   4034       ahl 				 * needed) regenerate and verify the parity.
   1943   4034       ahl 				 * We also regenerate parity when resilvering
   1944   4034       ahl 				 * so we can write it out to the failed device
   1945   4034       ahl 				 * later.
   1946   4034       ahl 				 */
   1947   3456       ahl 				if (parity_errors + parity_untried <
   1948   4034       ahl 				    rm->rm_firstdatacol ||
   1949   4034       ahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   1950   3456       ahl 					n = raidz_parity_verify(zio, rm);
   1951   3456       ahl 					unexpected_errors += n;
   1952   3456       ahl 					ASSERT(parity_errors + n <=
   1953   3456       ahl 					    rm->rm_firstdatacol);
   1954   3456       ahl 				}
   1955   2082  eschrock 				goto done;
   1956   2082  eschrock 			}
   1957  10105      adam 		} else {
   1958   3456       ahl 			/*
   1959   3456       ahl 			 * We either attempt to read all the parity columns or
   1960   3456       ahl 			 * none of them. If we didn't try to read parity, we
   1961   3456       ahl 			 * wouldn't be here in the correctable case. There must
   1962   3456       ahl 			 * also have been fewer parity errors than parity
   1963   3456       ahl 			 * columns or, again, we wouldn't be in this code path.
   1964   3456       ahl 			 */
   1965   3456       ahl 			ASSERT(parity_untried == 0);
   1966   2082  eschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
   1967   2082  eschrock 
   1968   2082  eschrock 			/*
   1969  10105      adam 			 * Identify the data columns that reported an error.
   1970   2082  eschrock 			 */
   1971  10105      adam 			n = 0;
   1972   2082  eschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1973   2082  eschrock 				rc = &rm->rm_col[c];
   1974  10105      adam 				if (rc->rc_error != 0) {
   1975  10105      adam 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
   1976  10105      adam 					tgts[n++] = c;
   1977  10105      adam 				}
   1978   2082  eschrock 			}
   1979   2082  eschrock 
   1980  10105      adam 			ASSERT(rm->rm_firstdatacol >= n);
   1981  10105      adam 
   1982  10105      adam 			code = vdev_raidz_reconstruct(rm, tgts, n);
   1983  10105      adam 
   1984  10614  Jonathan 			if (raidz_checksum_verify(zio) == 0) {
   1985  10105      adam 				atomic_inc_64(&raidz_corrected[code]);
   1986   2082  eschrock 
   1987   2082  eschrock 				/*
   1988  10105      adam 				 * If we read more parity disks than were used
   1989  10105      adam 				 * for reconstruction, confirm that the other
   1990  10105      adam 				 * parity disks produced correct data. This
   1991  10105      adam 				 * routine is suboptimal in that it regenerates
   1992  10105      adam 				 * the parity that we already used in addition
   1993  10105      adam 				 * to the parity that we're attempting to
   1994  10105      adam 				 * verify, but this should be a relatively
   1995  10105      adam 				 * uncommon case, and can be optimized if it
   1996  10105      adam 				 * becomes a problem. Note that we regenerate
   1997  10105      adam 				 * parity when resilvering so we can write it
   1998  10105      adam 				 * out to failed devices later.
   1999   2082  eschrock 				 */
   2000  10105      adam 				if (parity_errors < rm->rm_firstdatacol - n ||
   2001   4034       ahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   2002   2082  eschrock 					n = raidz_parity_verify(zio, rm);
   2003   2082  eschrock 					unexpected_errors += n;
   2004   2082  eschrock 					ASSERT(parity_errors + n <=
   2005   2082  eschrock 					    rm->rm_firstdatacol);
   2006   2082  eschrock 				}
   2007   2082  eschrock 
   2008   2082  eschrock 				goto done;
   2009   2082  eschrock 			}
   2010    789    ahrens 		}
   2011    789    ahrens 	}
   2012    789    ahrens 
   2013    789    ahrens 	/*
   2014   2082  eschrock 	 * This isn't a typical situation -- either we got a read error or
   2015   2082  eschrock 	 * a child silently returned bad data. Read every block so we can
   2016   2082  eschrock 	 * try again with as much data and parity as we can track down. If
   2017   2082  eschrock 	 * we've already been through once before, all children will be marked
   2018   2082  eschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
   2019    789    ahrens 	 */
   2020    789    ahrens 	unexpected_errors = 1;
   2021   2082  eschrock 	rm->rm_missingdata = 0;
   2022   2082  eschrock 	rm->rm_missingparity = 0;
   2023    789    ahrens 
   2024   2082  eschrock 	for (c = 0; c < rm->rm_cols; c++) {
   2025   2082  eschrock 		if (rm->rm_col[c].rc_tried)
   2026   2082  eschrock 			continue;
   2027    789    ahrens 
   2028    789    ahrens 		zio_vdev_io_redone(zio);
   2029   2082  eschrock 		do {
   2030    789    ahrens 			rc = &rm->rm_col[c];
   2031    789    ahrens 			if (rc->rc_tried)
   2032    789    ahrens 				continue;
   2033    789    ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
   2034   2082  eschrock 			    vd->vdev_child[rc->rc_devidx],
   2035    789    ahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2036   7754      Jeff 			    zio->io_type, zio->io_priority, 0,
   2037    789    ahrens 			    vdev_raidz_child_done, rc));
   2038   2082  eschrock 		} while (++c < rm->rm_cols);
   2039   5530   bonwick 
   2040   7754      Jeff 		return;
   2041    789    ahrens 	}
   2042    789    ahrens 
   2043    789    ahrens 	/*
   2044   2082  eschrock 	 * At this point we've attempted to reconstruct the data given the
   2045   2082  eschrock 	 * errors we detected, and we've attempted to read all columns. There
   2046   2082  eschrock 	 * must, therefore, be one or more additional problems -- silent errors
   2047   2082  eschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
   2048  10105      adam 	 * in absent data. We check if there is enough additional data to
   2049  10105      adam 	 * possibly reconstruct the data and then perform combinatorial
   2050  10105      adam 	 * reconstruction over all possible combinations. If that fails,
   2051  10105      adam 	 * we're cooked.
   2052    789    ahrens 	 */
   2053  10614  Jonathan 	if (total_errors > rm->rm_firstdatacol) {
   2054   7754      Jeff 		zio->io_error = vdev_raidz_worst_error(rm);
   2055    789    ahrens 
   2056  10614  Jonathan 	} else if (total_errors < rm->rm_firstdatacol &&
   2057  10614  Jonathan 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
   2058   2082  eschrock 		/*
   2059  10105      adam 		 * If we didn't use all the available parity for the
   2060  10105      adam 		 * combinatorial reconstruction, verify that the remaining
   2061  10105      adam 		 * parity is correct.
   2062   2082  eschrock 		 */
   2063  10105      adam 		if (code != (1 << rm->rm_firstdatacol) - 1)
   2064  10105      adam 			(void) raidz_parity_verify(zio, rm);
   2065  10105      adam 	} else {
   2066  10105      adam 		/*
   2067  10614  Jonathan 		 * We're here because either:
   2068  10614  Jonathan 		 *
   2069  10614  Jonathan 		 *	total_errors == rm_first_datacol, or
   2070  10614  Jonathan 		 *	vdev_raidz_combrec() failed
   2071  10614  Jonathan 		 *
   2072  10614  Jonathan 		 * In either case, there is enough bad data to prevent
   2073  10614  Jonathan 		 * reconstruction.
   2074  10614  Jonathan 		 *
   2075  10614  Jonathan 		 * Start checksum ereports for all children which haven't
   2076  10614  Jonathan 		 * failed.
   2077  10105      adam 		 */
   2078  10105      adam 		zio->io_error = ECKSUM;
   2079   2082  eschrock 
   2080  10614  Jonathan 		for (c = 0; c < rm->rm_cols; c++) {
   2081  10614  Jonathan 			rc = &rm->rm_col[c];
   2082  10614  Jonathan 			if (rc->rc_error == 0) {
   2083  10614  Jonathan 				zio_bad_cksum_t zbc;
   2084  10614  Jonathan 				zbc.zbc_has_cksum = 0;
   2085  10614  Jonathan 				zbc.zbc_injected = rm->rm_ecksuminjected;
   2086  10614  Jonathan 
   2087  10614  Jonathan 				zfs_ereport_start_checksum(
   2088  10105      adam 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
   2089  10614  Jonathan 				    zio, rc->rc_offset, rc->rc_size,
   2090  10614  Jonathan 				    (void *)(uintptr_t)c, &zbc);
   2091   2082  eschrock 			}
   2092   1544  eschrock 		}
   2093   1544  eschrock 	}
   2094    789    ahrens 
   2095    789    ahrens done:
   2096    789    ahrens 	zio_checksum_verified(zio);
   2097    789    ahrens 
   2098   8241      Jeff 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
   2099    789    ahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
   2100    789    ahrens 		/*
   2101    789    ahrens 		 * Use the good data we have in hand to repair damaged children.
   2102    789    ahrens 		 */
   2103    789    ahrens 		for (c = 0; c < rm->rm_cols; c++) {
   2104    789    ahrens 			rc = &rm->rm_col[c];
   2105   2082  eschrock 			cvd = vd->vdev_child[rc->rc_devidx];
   2106    789    ahrens 
   2107   1732   bonwick 			if (rc->rc_error == 0)
   2108   1732   bonwick 				continue;
   2109    789    ahrens 
   2110   7754      Jeff 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   2111   1732   bonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2112   1732   bonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
   2113   8241      Jeff 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
   2114   8241      Jeff 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
   2115    789    ahrens 		}
   2116    789    ahrens 	}
   2117    789    ahrens }
   2118    789    ahrens 
   2119    789    ahrens static void
   2120    789    ahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
   2121    789    ahrens {
   2122   2082  eschrock 	if (faulted > vd->vdev_nparity)
   2123   1544  eschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
   2124   1544  eschrock 		    VDEV_AUX_NO_REPLICAS);
   2125    789    ahrens 	else if (degraded + faulted != 0)
   2126   1544  eschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
   2127    789    ahrens 	else
   2128   1544  eschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
   2129    789    ahrens }
   2130    789    ahrens 
   2131    789    ahrens vdev_ops_t vdev_raidz_ops = {
   2132    789    ahrens 	vdev_raidz_open,
   2133    789    ahrens 	vdev_raidz_close,
   2134    789    ahrens 	vdev_raidz_asize,
   2135    789    ahrens 	vdev_raidz_io_start,
   2136    789    ahrens 	vdev_raidz_io_done,
   2137    789    ahrens 	vdev_raidz_state_change,
   2138    789    ahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
   2139    789    ahrens 	B_FALSE			/* not a leaf vdev */
   2140    789    ahrens };
   2141