1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1544 eschrock * Common Development and Distribution License (the "License"). 6 1544 eschrock * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 2082 eschrock 22 789 ahrens /* 23 9434 Mark * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 789 ahrens * Use is subject to license terms. 25 789 ahrens */ 26 789 ahrens 27 789 ahrens #include <sys/zfs_context.h> 28 789 ahrens #include <sys/spa.h> 29 789 ahrens #include <sys/vdev_impl.h> 30 789 ahrens #include <sys/zio.h> 31 789 ahrens #include <sys/zio_checksum.h> 32 789 ahrens #include <sys/fs/zfs.h> 33 1544 eschrock #include <sys/fm/fs/zfs.h> 34 789 ahrens 35 789 ahrens /* 36 789 ahrens * Virtual device vector for RAID-Z. 37 2082 eschrock * 38 10105 adam * This vdev supports single, double, and triple parity. For single parity, 39 10105 adam * we use a simple XOR of all the data columns. For double or triple parity, 40 10105 adam * we use a special case of Reed-Solomon coding. This extends the 41 10105 adam * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 10105 adam * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 10105 adam * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 10105 adam * former is also based. The latter is designed to provide higher performance 45 10105 adam * for writes. 46 10105 adam * 47 10105 adam * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 10105 adam * amended six years later identifying a critical flaw that invalidates its 49 10105 adam * claims. Nevertheless, the technique can be adapted to work for up to 50 10105 adam * triple parity. For additional parity, the amendment "Note: Correction to 51 10105 adam * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 52 10105 adam * is viable, but the additional complexity means that write performance will 53 10105 adam * suffer. 54 10105 adam * 55 10105 adam * All of the methods above operate on a Galois field, defined over the 56 10105 adam * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 57 10105 adam * can be expressed with a single byte. Briefly, the operations on the 58 10105 adam * field are defined as follows: 59 2082 eschrock * 60 2082 eschrock * o addition (+) is represented by a bitwise XOR 61 2082 eschrock * o subtraction (-) is therefore identical to addition: A + B = A - B 62 2082 eschrock * o multiplication of A by 2 is defined by the following bitwise expression: 63 2082 eschrock * (A * 2)_7 = A_6 64 2082 eschrock * (A * 2)_6 = A_5 65 2082 eschrock * (A * 2)_5 = A_4 66 2082 eschrock * (A * 2)_4 = A_3 + A_7 67 2082 eschrock * (A * 2)_3 = A_2 + A_7 68 2082 eschrock * (A * 2)_2 = A_1 + A_7 69 2082 eschrock * (A * 2)_1 = A_0 70 2082 eschrock * (A * 2)_0 = A_7 71 2082 eschrock * 72 2082 eschrock * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 73 10105 adam * As an aside, this multiplication is derived from the error correcting 74 10105 adam * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 75 2082 eschrock * 76 2082 eschrock * Observe that any number in the field (except for 0) can be expressed as a 77 2082 eschrock * power of 2 -- a generator for the field. We store a table of the powers of 78 2082 eschrock * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 79 2082 eschrock * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 80 10105 adam * than field addition). The inverse of a field element A (A^-1) is therefore 81 10105 adam * A ^ (255 - 1) = A^254. 82 2082 eschrock * 83 10105 adam * The up-to-three parity columns, P, Q, R over several data columns, 84 10105 adam * D_0, ... D_n-1, can be expressed by field operations: 85 2082 eschrock * 86 2082 eschrock * P = D_0 + D_1 + ... + D_n-2 + D_n-1 87 2082 eschrock * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 88 2082 eschrock * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 89 10105 adam * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 90 10105 adam * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 91 2082 eschrock * 92 10105 adam * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 93 10105 adam * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 94 10105 adam * independent coefficients. (There are no additional coefficients that have 95 10105 adam * this property which is why the uncorrected Plank method breaks down.) 96 10105 adam * 97 10105 adam * See the reconstruction code below for how P, Q and R can used individually 98 10105 adam * or in concert to recover missing data columns. 99 789 ahrens */ 100 789 ahrens 101 789 ahrens typedef struct raidz_col { 102 2082 eschrock uint64_t rc_devidx; /* child device index for I/O */ 103 2082 eschrock uint64_t rc_offset; /* device offset */ 104 2082 eschrock uint64_t rc_size; /* I/O size */ 105 2082 eschrock void *rc_data; /* I/O data */ 106 10614 Jonathan void *rc_gdata; /* used to store the "good" version */ 107 2082 eschrock int rc_error; /* I/O error for this device */ 108 2082 eschrock uint8_t rc_tried; /* Did we attempt this I/O column? */ 109 2082 eschrock uint8_t rc_skipped; /* Did we skip this I/O column? */ 110 789 ahrens } raidz_col_t; 111 789 ahrens 112 789 ahrens typedef struct raidz_map { 113 10105 adam uint64_t rm_cols; /* Regular column count */ 114 10105 adam uint64_t rm_scols; /* Count including skipped columns */ 115 2082 eschrock uint64_t rm_bigcols; /* Number of oversized columns */ 116 2082 eschrock uint64_t rm_asize; /* Actual total I/O size */ 117 2082 eschrock uint64_t rm_missingdata; /* Count of missing data devices */ 118 2082 eschrock uint64_t rm_missingparity; /* Count of missing parity devices */ 119 2082 eschrock uint64_t rm_firstdatacol; /* First data column/parity count */ 120 10450 adam uint64_t rm_nskip; /* Skipped sectors for padding */ 121 10450 adam uint64_t rm_skipstart; /* Column index of padding start */ 122 10614 Jonathan void *rm_datacopy; /* rm_asize-buffer of copied data */ 123 10614 Jonathan uintptr_t rm_reports; /* # of referencing checksum reports */ 124 10614 Jonathan uint8_t rm_freed; /* map no longer has referencing ZIO */ 125 10614 Jonathan uint8_t rm_ecksuminjected; /* checksum error was injected */ 126 2082 eschrock raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 127 789 ahrens } raidz_map_t; 128 789 ahrens 129 2082 eschrock #define VDEV_RAIDZ_P 0 130 2082 eschrock #define VDEV_RAIDZ_Q 1 131 10105 adam #define VDEV_RAIDZ_R 2 132 2082 eschrock 133 10105 adam #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 134 10105 adam #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 135 2082 eschrock 136 10105 adam /* 137 10105 adam * We provide a mechanism to perform the field multiplication operation on a 138 10105 adam * 64-bit value all at once rather than a byte at a time. This works by 139 10105 adam * creating a mask from the top bit in each byte and using that to 140 10105 adam * conditionally apply the XOR of 0x1d. 141 10105 adam */ 142 10105 adam #define VDEV_RAIDZ_64MUL_2(x, mask) \ 143 10105 adam { \ 144 10105 adam (mask) = (x) & 0x8080808080808080ULL; \ 145 10105 adam (mask) = ((mask) << 1) - ((mask) >> 7); \ 146 10105 adam (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 147 10105 adam ((mask) & 0x1d1d1d1d1d1d1d1d); \ 148 10105 adam } 149 10105 adam 150 10105 adam #define VDEV_RAIDZ_64MUL_4(x, mask) \ 151 10105 adam { \ 152 10105 adam VDEV_RAIDZ_64MUL_2((x), mask); \ 153 10105 adam VDEV_RAIDZ_64MUL_2((x), mask); \ 154 10105 adam } 155 10105 adam 156 10105 adam /* 157 10105 adam * Force reconstruction to use the general purpose method. 158 10105 adam */ 159 10105 adam int vdev_raidz_default_to_general; 160 2082 eschrock 161 2082 eschrock /* 162 2082 eschrock * These two tables represent powers and logs of 2 in the Galois field defined 163 2082 eschrock * above. These values were computed by repeatedly multiplying by 2 as above. 164 2082 eschrock */ 165 2082 eschrock static const uint8_t vdev_raidz_pow2[256] = { 166 2082 eschrock 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 167 2082 eschrock 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 168 2082 eschrock 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 169 2082 eschrock 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 170 2082 eschrock 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 171 2082 eschrock 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 172 2082 eschrock 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 173 2082 eschrock 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 174 2082 eschrock 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 175 2082 eschrock 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 176 2082 eschrock 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 177 2082 eschrock 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 178 2082 eschrock 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 179 2082 eschrock 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 180 2082 eschrock 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 181 2082 eschrock 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 182 2082 eschrock 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 183 2082 eschrock 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 184 2082 eschrock 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 185 2082 eschrock 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 186 2082 eschrock 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 187 2082 eschrock 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 188 2082 eschrock 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 189 2082 eschrock 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 190 2082 eschrock 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 191 2082 eschrock 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 192 2082 eschrock 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 193 2082 eschrock 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 194 2082 eschrock 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 195 2082 eschrock 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 196 2082 eschrock 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 197 2082 eschrock 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 198 2082 eschrock }; 199 2082 eschrock static const uint8_t vdev_raidz_log2[256] = { 200 2082 eschrock 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 201 2082 eschrock 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 202 2082 eschrock 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 203 2082 eschrock 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 204 2082 eschrock 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 205 2082 eschrock 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 206 2082 eschrock 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 207 2082 eschrock 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 208 2082 eschrock 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 209 2082 eschrock 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 210 2082 eschrock 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 211 2082 eschrock 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 212 2082 eschrock 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 213 2082 eschrock 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 214 2082 eschrock 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 215 2082 eschrock 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 216 2082 eschrock 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 217 2082 eschrock 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 218 2082 eschrock 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 219 2082 eschrock 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 220 2082 eschrock 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 221 2082 eschrock 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 222 2082 eschrock 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 223 2082 eschrock 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 224 2082 eschrock 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 225 2082 eschrock 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 226 2082 eschrock 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 227 2082 eschrock 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 228 2082 eschrock 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 229 2082 eschrock 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 230 2082 eschrock 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 231 2082 eschrock 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 232 2082 eschrock }; 233 2082 eschrock 234 10614 Jonathan static void vdev_raidz_generate_parity(raidz_map_t *rm); 235 10614 Jonathan 236 2082 eschrock /* 237 2082 eschrock * Multiply a given number by 2 raised to the given power. 238 2082 eschrock */ 239 2082 eschrock static uint8_t 240 2082 eschrock vdev_raidz_exp2(uint_t a, int exp) 241 2082 eschrock { 242 2082 eschrock if (a == 0) 243 2082 eschrock return (0); 244 2082 eschrock 245 2082 eschrock ASSERT(exp >= 0); 246 2082 eschrock ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 247 2082 eschrock 248 2082 eschrock exp += vdev_raidz_log2[a]; 249 2082 eschrock if (exp > 255) 250 2082 eschrock exp -= 255; 251 2082 eschrock 252 2082 eschrock return (vdev_raidz_pow2[exp]); 253 2082 eschrock } 254 2082 eschrock 255 7754 Jeff static void 256 10614 Jonathan vdev_raidz_map_free(raidz_map_t *rm) 257 7754 Jeff { 258 7754 Jeff int c; 259 10653 Jonathan size_t size; 260 7754 Jeff 261 10614 Jonathan for (c = 0; c < rm->rm_firstdatacol; c++) { 262 7754 Jeff zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 263 10614 Jonathan 264 10614 Jonathan if (rm->rm_col[c].rc_gdata != NULL) 265 10614 Jonathan zio_buf_free(rm->rm_col[c].rc_gdata, 266 10614 Jonathan rm->rm_col[c].rc_size); 267 10614 Jonathan } 268 10653 Jonathan 269 10653 Jonathan size = 0; 270 10653 Jonathan for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 271 10653 Jonathan size += rm->rm_col[c].rc_size; 272 10614 Jonathan 273 10614 Jonathan if (rm->rm_datacopy != NULL) 274 10614 Jonathan zio_buf_free(rm->rm_datacopy, size); 275 7754 Jeff 276 10105 adam kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 277 7754 Jeff } 278 10614 Jonathan 279 10614 Jonathan static void 280 10614 Jonathan vdev_raidz_map_free_vsd(zio_t *zio) 281 10614 Jonathan { 282 10614 Jonathan raidz_map_t *rm = zio->io_vsd; 283 10614 Jonathan 284 10614 Jonathan ASSERT3U(rm->rm_freed, ==, 0); 285 10614 Jonathan rm->rm_freed = 1; 286 10614 Jonathan 287 10614 Jonathan if (rm->rm_reports == 0) 288 10614 Jonathan vdev_raidz_map_free(rm); 289 10614 Jonathan } 290 10614 Jonathan 291 10614 Jonathan /*ARGSUSED*/ 292 10614 Jonathan static void 293 10614 Jonathan vdev_raidz_cksum_free(void *arg, size_t ignored) 294 10614 Jonathan { 295 10614 Jonathan raidz_map_t *rm = arg; 296 10614 Jonathan 297 10614 Jonathan ASSERT3U(rm->rm_reports, >, 0); 298 10614 Jonathan 299 10653 Jonathan if (--rm->rm_reports == 0 && rm->rm_freed != 0) 300 10614 Jonathan vdev_raidz_map_free(rm); 301 10614 Jonathan } 302 10614 Jonathan 303 10614 Jonathan static void 304 10614 Jonathan vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 305 10614 Jonathan { 306 10614 Jonathan raidz_map_t *rm = zcr->zcr_cbdata; 307 10614 Jonathan size_t c = zcr->zcr_cbinfo; 308 10614 Jonathan size_t x; 309 10614 Jonathan 310 10614 Jonathan const char *good = NULL; 311 10614 Jonathan const char *bad = rm->rm_col[c].rc_data; 312 10614 Jonathan 313 10614 Jonathan if (good_data == NULL) { 314 10614 Jonathan zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 315 10614 Jonathan return; 316 10614 Jonathan } 317 10614 Jonathan 318 10614 Jonathan if (c < rm->rm_firstdatacol) { 319 10614 Jonathan /* 320 10614 Jonathan * The first time through, calculate the parity blocks for 321 10614 Jonathan * the good data (this relies on the fact that the good 322 10614 Jonathan * data never changes for a given logical ZIO) 323 10614 Jonathan */ 324 10614 Jonathan if (rm->rm_col[0].rc_gdata == NULL) { 325 10614 Jonathan char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 326 10614 Jonathan char *buf; 327 10614 Jonathan 328 10614 Jonathan /* 329 10614 Jonathan * Set up the rm_col[]s to generate the parity for 330 10614 Jonathan * good_data, first saving the parity bufs and 331 10614 Jonathan * replacing them with buffers to hold the result. 332 10614 Jonathan */ 333 10614 Jonathan for (x = 0; x < rm->rm_firstdatacol; x++) { 334 10614 Jonathan bad_parity[x] = rm->rm_col[x].rc_data; 335 10614 Jonathan rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 336 10614 Jonathan zio_buf_alloc(rm->rm_col[x].rc_size); 337 10614 Jonathan } 338 10614 Jonathan 339 10614 Jonathan /* fill in the data columns from good_data */ 340 10614 Jonathan buf = (char *)good_data; 341 10614 Jonathan for (; x < rm->rm_cols; x++) { 342 10614 Jonathan rm->rm_col[x].rc_data = buf; 343 10614 Jonathan buf += rm->rm_col[x].rc_size; 344 10614 Jonathan } 345 10614 Jonathan 346 10614 Jonathan /* 347 10614 Jonathan * Construct the parity from the good data. 348 10614 Jonathan */ 349 10614 Jonathan vdev_raidz_generate_parity(rm); 350 10614 Jonathan 351 10614 Jonathan /* restore everything back to its original state */ 352 10614 Jonathan for (x = 0; x < rm->rm_firstdatacol; x++) 353 10614 Jonathan rm->rm_col[x].rc_data = bad_parity[x]; 354 10614 Jonathan 355 10614 Jonathan buf = rm->rm_datacopy; 356 10614 Jonathan for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 357 10614 Jonathan rm->rm_col[x].rc_data = buf; 358 10614 Jonathan buf += rm->rm_col[x].rc_size; 359 10614 Jonathan } 360 10614 Jonathan } 361 10614 Jonathan 362 10614 Jonathan ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 363 10614 Jonathan good = rm->rm_col[c].rc_gdata; 364 10614 Jonathan } else { 365 10614 Jonathan /* adjust good_data to point at the start of our column */ 366 10614 Jonathan good = good_data; 367 10614 Jonathan 368 10614 Jonathan for (x = rm->rm_firstdatacol; x < c; x++) 369 10614 Jonathan good += rm->rm_col[x].rc_size; 370 10614 Jonathan } 371 10614 Jonathan 372 10614 Jonathan /* we drop the ereport if it ends up that the data was good */ 373 10614 Jonathan zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 374 10614 Jonathan } 375 10614 Jonathan 376 10614 Jonathan /* 377 10614 Jonathan * Invoked indirectly by zfs_ereport_start_checksum(), called 378 10614 Jonathan * below when our read operation fails completely. The main point 379 10614 Jonathan * is to keep a copy of everything we read from disk, so that at 380 10614 Jonathan * vdev_raidz_cksum_finish() time we can compare it with the good data. 381 10614 Jonathan */ 382 10614 Jonathan static void 383 10614 Jonathan vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 384 10614 Jonathan { 385 10614 Jonathan size_t c = (size_t)(uintptr_t)arg; 386 10614 Jonathan caddr_t buf; 387 10614 Jonathan 388 10614 Jonathan raidz_map_t *rm = zio->io_vsd; 389 10614 Jonathan size_t size; 390 10614 Jonathan 391 10614 Jonathan /* set up the report and bump the refcount */ 392 10614 Jonathan zcr->zcr_cbdata = rm; 393 10614 Jonathan zcr->zcr_cbinfo = c; 394 10614 Jonathan zcr->zcr_finish = vdev_raidz_cksum_finish; 395 10614 Jonathan zcr->zcr_free = vdev_raidz_cksum_free; 396 10614 Jonathan 397 10614 Jonathan rm->rm_reports++; 398 10614 Jonathan ASSERT3U(rm->rm_reports, >, 0); 399 10614 Jonathan 400 10653 Jonathan if (rm->rm_datacopy != NULL) 401 10614 Jonathan return; 402 10614 Jonathan 403 10614 Jonathan /* 404 10653 Jonathan * It's the first time we're called for this raidz_map_t, so we need 405 10653 Jonathan * to copy the data aside; there's no guarantee that our zio's buffer 406 10653 Jonathan * won't be re-used for something else. 407 10614 Jonathan * 408 10653 Jonathan * Our parity data is already in separate buffers, so there's no need 409 10614 Jonathan * to copy them. 410 10614 Jonathan */ 411 10614 Jonathan 412 10653 Jonathan size = 0; 413 10653 Jonathan for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 414 10653 Jonathan size += rm->rm_col[c].rc_size; 415 10614 Jonathan 416 10614 Jonathan buf = rm->rm_datacopy = zio_buf_alloc(size); 417 10653 Jonathan 418 10653 Jonathan for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 419 10614 Jonathan raidz_col_t *col = &rm->rm_col[c]; 420 10614 Jonathan 421 10614 Jonathan bcopy(col->rc_data, buf, col->rc_size); 422 10614 Jonathan col->rc_data = buf; 423 10614 Jonathan 424 10614 Jonathan buf += col->rc_size; 425 10614 Jonathan } 426 10614 Jonathan ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 427 10614 Jonathan } 428 10614 Jonathan 429 10614 Jonathan static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 430 10614 Jonathan vdev_raidz_map_free_vsd, 431 10614 Jonathan vdev_raidz_cksum_report 432 10614 Jonathan }; 433 7754 Jeff 434 789 ahrens static raidz_map_t * 435 2082 eschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 436 2082 eschrock uint64_t nparity) 437 789 ahrens { 438 789 ahrens raidz_map_t *rm; 439 789 ahrens uint64_t b = zio->io_offset >> unit_shift; 440 789 ahrens uint64_t s = zio->io_size >> unit_shift; 441 789 ahrens uint64_t f = b % dcols; 442 789 ahrens uint64_t o = (b / dcols) << unit_shift; 443 10105 adam uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 444 789 ahrens 445 2082 eschrock q = s / (dcols - nparity); 446 2082 eschrock r = s - q * (dcols - nparity); 447 2082 eschrock bc = (r == 0 ? 0 : r + nparity); 448 10105 adam tot = s + nparity * (q + (r == 0 ? 0 : 1)); 449 789 ahrens 450 10105 adam if (q == 0) { 451 10105 adam acols = bc; 452 10105 adam scols = MIN(dcols, roundup(bc, nparity + 1)); 453 10105 adam } else { 454 10105 adam acols = dcols; 455 10105 adam scols = dcols; 456 10105 adam } 457 789 ahrens 458 10105 adam ASSERT3U(acols, <=, scols); 459 10105 adam 460 10105 adam rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 461 789 ahrens 462 789 ahrens rm->rm_cols = acols; 463 10105 adam rm->rm_scols = scols; 464 789 ahrens rm->rm_bigcols = bc; 465 10450 adam rm->rm_skipstart = bc; 466 2082 eschrock rm->rm_missingdata = 0; 467 2082 eschrock rm->rm_missingparity = 0; 468 2082 eschrock rm->rm_firstdatacol = nparity; 469 10614 Jonathan rm->rm_datacopy = NULL; 470 10614 Jonathan rm->rm_reports = 0; 471 10614 Jonathan rm->rm_freed = 0; 472 10614 Jonathan rm->rm_ecksuminjected = 0; 473 789 ahrens 474 10105 adam asize = 0; 475 10105 adam 476 10105 adam for (c = 0; c < scols; c++) { 477 789 ahrens col = f + c; 478 789 ahrens coff = o; 479 789 ahrens if (col >= dcols) { 480 789 ahrens col -= dcols; 481 789 ahrens coff += 1ULL << unit_shift; 482 789 ahrens } 483 2082 eschrock rm->rm_col[c].rc_devidx = col; 484 789 ahrens rm->rm_col[c].rc_offset = coff; 485 789 ahrens rm->rm_col[c].rc_data = NULL; 486 10614 Jonathan rm->rm_col[c].rc_gdata = NULL; 487 789 ahrens rm->rm_col[c].rc_error = 0; 488 789 ahrens rm->rm_col[c].rc_tried = 0; 489 789 ahrens rm->rm_col[c].rc_skipped = 0; 490 10105 adam 491 10105 adam if (c >= acols) 492 10105 adam rm->rm_col[c].rc_size = 0; 493 10105 adam else if (c < bc) 494 10105 adam rm->rm_col[c].rc_size = (q + 1) << unit_shift; 495 10105 adam else 496 10105 adam rm->rm_col[c].rc_size = q << unit_shift; 497 10105 adam 498 10105 adam asize += rm->rm_col[c].rc_size; 499 789 ahrens } 500 789 ahrens 501 10105 adam ASSERT3U(asize, ==, tot << unit_shift); 502 10105 adam rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 503 10450 adam rm->rm_nskip = roundup(tot, nparity + 1) - tot; 504 10450 adam ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 505 10450 adam ASSERT3U(rm->rm_nskip, <=, nparity); 506 789 ahrens 507 789 ahrens for (c = 0; c < rm->rm_firstdatacol; c++) 508 789 ahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 509 789 ahrens 510 789 ahrens rm->rm_col[c].rc_data = zio->io_data; 511 789 ahrens 512 789 ahrens for (c = c + 1; c < acols; c++) 513 789 ahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 514 789 ahrens rm->rm_col[c - 1].rc_size; 515 789 ahrens 516 1133 eschrock /* 517 2082 eschrock * If all data stored spans all columns, there's a danger that parity 518 2082 eschrock * will always be on the same device and, since parity isn't read 519 2082 eschrock * during normal operation, that that device's I/O bandwidth won't be 520 2082 eschrock * used effectively. We therefore switch the parity every 1MB. 521 2082 eschrock * 522 2082 eschrock * ... at least that was, ostensibly, the theory. As a practical 523 2082 eschrock * matter unless we juggle the parity between all devices evenly, we 524 2082 eschrock * won't see any benefit. Further, occasional writes that aren't a 525 2082 eschrock * multiple of the LCM of the number of children and the minimum 526 2082 eschrock * stripe width are sufficient to avoid pessimal behavior. 527 2082 eschrock * Unfortunately, this decision created an implicit on-disk format 528 3456 ahl * requirement that we need to support for all eternity, but only 529 3456 ahl * for single-parity RAID-Z. 530 10450 adam * 531 10450 adam * If we intend to skip a sector in the zeroth column for padding 532 10450 adam * we must make sure to note this swap. We will never intend to 533 10450 adam * skip the first column since at least one data and one parity 534 10450 adam * column must appear in each row. 535 1133 eschrock */ 536 1133 eschrock ASSERT(rm->rm_cols >= 2); 537 1133 eschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 538 789 ahrens 539 2082 eschrock if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 540 2082 eschrock devidx = rm->rm_col[0].rc_devidx; 541 1133 eschrock o = rm->rm_col[0].rc_offset; 542 2082 eschrock rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 543 1133 eschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 544 2082 eschrock rm->rm_col[1].rc_devidx = devidx; 545 1133 eschrock rm->rm_col[1].rc_offset = o; 546 10450 adam 547 10450 adam if (rm->rm_skipstart == 0) 548 10450 adam rm->rm_skipstart = 1; 549 789 ahrens } 550 789 ahrens 551 789 ahrens zio->io_vsd = rm; 552 10614 Jonathan zio->io_vsd_ops = &vdev_raidz_vsd_ops; 553 789 ahrens return (rm); 554 789 ahrens } 555 789 ahrens 556 789 ahrens static void 557 2082 eschrock vdev_raidz_generate_parity_p(raidz_map_t *rm) 558 789 ahrens { 559 2082 eschrock uint64_t *p, *src, pcount, ccount, i; 560 2082 eschrock int c; 561 789 ahrens 562 2082 eschrock pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 563 2082 eschrock 564 2082 eschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 565 789 ahrens src = rm->rm_col[c].rc_data; 566 2082 eschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 567 2082 eschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 568 2082 eschrock 569 2082 eschrock if (c == rm->rm_firstdatacol) { 570 2082 eschrock ASSERT(ccount == pcount); 571 10105 adam for (i = 0; i < ccount; i++, src++, p++) { 572 2082 eschrock *p = *src; 573 2082 eschrock } 574 789 ahrens } else { 575 2082 eschrock ASSERT(ccount <= pcount); 576 10105 adam for (i = 0; i < ccount; i++, src++, p++) { 577 2082 eschrock *p ^= *src; 578 2082 eschrock } 579 789 ahrens } 580 789 ahrens } 581 789 ahrens } 582 2082 eschrock 583 2082 eschrock static void 584 2082 eschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm) 585 2082 eschrock { 586 10105 adam uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 587 2082 eschrock int c; 588 2082 eschrock 589 10105 adam pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 590 2082 eschrock ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 591 2082 eschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 592 2082 eschrock 593 2082 eschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 594 2082 eschrock src = rm->rm_col[c].rc_data; 595 2082 eschrock p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 596 2082 eschrock q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 597 10105 adam 598 10105 adam ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 599 2082 eschrock 600 2082 eschrock if (c == rm->rm_firstdatacol) { 601 10105 adam ASSERT(ccnt == pcnt || ccnt == 0); 602 10105 adam for (i = 0; i < ccnt; i++, src++, p++, q++) { 603 10105 adam *p = *src; 604 2082 eschrock *q = *src; 605 2082 eschrock } 606 10105 adam for (; i < pcnt; i++, src++, p++, q++) { 607 10105 adam *p = 0; 608 2082 eschrock *q = 0; 609 2082 eschrock } 610 2082 eschrock } else { 611 10105 adam ASSERT(ccnt <= pcnt); 612 2082 eschrock 613 2082 eschrock /* 614 10105 adam * Apply the algorithm described above by multiplying 615 10105 adam * the previous result and adding in the new value. 616 2082 eschrock */ 617 10105 adam for (i = 0; i < ccnt; i++, src++, p++, q++) { 618 10105 adam *p ^= *src; 619 10105 adam 620 10105 adam VDEV_RAIDZ_64MUL_2(*q, mask); 621 2082 eschrock *q ^= *src; 622 2082 eschrock } 623 2082 eschrock 624 2082 eschrock /* 625 2082 eschrock * Treat short columns as though they are full of 0s. 626 10105 adam * Note that there's therefore nothing needed for P. 627 2082 eschrock */ 628 10105 adam for (; i < pcnt; i++, q++) { 629 10105 adam VDEV_RAIDZ_64MUL_2(*q, mask); 630 2082 eschrock } 631 2082 eschrock } 632 2082 eschrock } 633 2082 eschrock } 634 2082 eschrock 635 2082 eschrock static void 636 10105 adam vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 637 10105 adam { 638 10105 adam uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 639 10105 adam int c; 640 10105 adam 641 10105 adam pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 642 10105 adam ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 643 10105 adam rm->rm_col[VDEV_RAIDZ_Q].rc_size); 644 10105 adam ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 645 10105 adam rm->rm_col[VDEV_RAIDZ_R].rc_size); 646 10105 adam 647 10105 adam for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 648 10105 adam src = rm->rm_col[c].rc_data; 649 10105 adam p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 650 10105 adam q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 651 10105 adam r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 652 10105 adam 653 10105 adam ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 654 10105 adam 655 10105 adam if (c == rm->rm_firstdatacol) { 656 10105 adam ASSERT(ccnt == pcnt || ccnt == 0); 657 10105 adam for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 658 10105 adam *p = *src; 659 10105 adam *q = *src; 660 10105 adam *r = *src; 661 10105 adam } 662 10105 adam for (; i < pcnt; i++, src++, p++, q++, r++) { 663 10105 adam *p = 0; 664 10105 adam *q = 0; 665 10105 adam *r = 0; 666 10105 adam } 667 10105 adam } else { 668 10105 adam ASSERT(ccnt <= pcnt); 669 10105 adam 670 10105 adam /* 671 10105 adam * Apply the algorithm described above by multiplying 672 10105 adam * the previous result and adding in the new value. 673 10105 adam */ 674 10105 adam for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 675 10105 adam *p ^= *src; 676 10105 adam 677 10105 adam VDEV_RAIDZ_64MUL_2(*q, mask); 678 10105 adam *q ^= *src; 679 10105 adam 680 10105 adam VDEV_RAIDZ_64MUL_4(*r, mask); 681 10105 adam *r ^= *src; 682 10105 adam } 683 10105 adam 684 10105 adam /* 685 10105 adam * Treat short columns as though they are full of 0s. 686 10105 adam * Note that there's therefore nothing needed for P. 687 10105 adam */ 688 10105 adam for (; i < pcnt; i++, q++, r++) { 689 10105 adam VDEV_RAIDZ_64MUL_2(*q, mask); 690 10105 adam VDEV_RAIDZ_64MUL_4(*r, mask); 691 10105 adam } 692 10105 adam } 693 10105 adam } 694 10105 adam } 695 10105 adam 696 10105 adam /* 697 10105 adam * Generate RAID parity in the first virtual columns according to the number of 698 10105 adam * parity columns available. 699 10105 adam */ 700 10105 adam static void 701 10105 adam vdev_raidz_generate_parity(raidz_map_t *rm) 702 10105 adam { 703 10105 adam switch (rm->rm_firstdatacol) { 704 10105 adam case 1: 705 10105 adam vdev_raidz_generate_parity_p(rm); 706 10105 adam break; 707 10105 adam case 2: 708 10105 adam vdev_raidz_generate_parity_pq(rm); 709 10105 adam break; 710 10105 adam case 3: 711 10105 adam vdev_raidz_generate_parity_pqr(rm); 712 10105 adam break; 713 10105 adam default: 714 10105 adam cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 715 10105 adam } 716 10105 adam } 717 10105 adam 718 10105 adam static int 719 10105 adam vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 720 2082 eschrock { 721 2082 eschrock uint64_t *dst, *src, xcount, ccount, count, i; 722 10105 adam int x = tgts[0]; 723 2082 eschrock int c; 724 10105 adam 725 10105 adam ASSERT(ntgts == 1); 726 10105 adam ASSERT(x >= rm->rm_firstdatacol); 727 10105 adam ASSERT(x < rm->rm_cols); 728 2082 eschrock 729 2082 eschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 730 2082 eschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 731 2082 eschrock ASSERT(xcount > 0); 732 2082 eschrock 733 2082 eschrock src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 734 2082 eschrock dst = rm->rm_col[x].rc_data; 735 2082 eschrock for (i = 0; i < xcount; i++, dst++, src++) { 736 2082 eschrock *dst = *src; 737 2082 eschrock } 738 2082 eschrock 739 2082 eschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 740 2082 eschrock src = rm->rm_col[c].rc_data; 741 2082 eschrock dst = rm->rm_col[x].rc_data; 742 2082 eschrock 743 2082 eschrock if (c == x) 744 2082 eschrock continue; 745 2082 eschrock 746 2082 eschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 747 2082 eschrock count = MIN(ccount, xcount); 748 2082 eschrock 749 2082 eschrock for (i = 0; i < count; i++, dst++, src++) { 750 2082 eschrock *dst ^= *src; 751 2082 eschrock } 752 2082 eschrock } 753 10105 adam 754 10105 adam return (1 << VDEV_RAIDZ_P); 755 2082 eschrock } 756 2082 eschrock 757 10105 adam static int 758 10105 adam vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 759 2082 eschrock { 760 2082 eschrock uint64_t *dst, *src, xcount, ccount, count, mask, i; 761 2082 eschrock uint8_t *b; 762 10105 adam int x = tgts[0]; 763 2082 eschrock int c, j, exp; 764 10105 adam 765 10105 adam ASSERT(ntgts == 1); 766 2082 eschrock 767 2082 eschrock xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 768 2082 eschrock ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 769 2082 eschrock 770 2082 eschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 771 2082 eschrock src = rm->rm_col[c].rc_data; 772 2082 eschrock dst = rm->rm_col[x].rc_data; 773 2082 eschrock 774 2082 eschrock if (c == x) 775 2082 eschrock ccount = 0; 776 2082 eschrock else 777 2082 eschrock ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 778 2082 eschrock 779 2082 eschrock count = MIN(ccount, xcount); 780 2082 eschrock 781 2082 eschrock if (c == rm->rm_firstdatacol) { 782 2082 eschrock for (i = 0; i < count; i++, dst++, src++) { 783 2082 eschrock *dst = *src; 784 2082 eschrock } 785 2082 eschrock for (; i < xcount; i++, dst++) { 786 2082 eschrock *dst = 0; 787 2082 eschrock } 788 2082 eschrock 789 2082 eschrock } else { 790 2082 eschrock for (i = 0; i < count; i++, dst++, src++) { 791 10105 adam VDEV_RAIDZ_64MUL_2(*dst, mask); 792 2082 eschrock *dst ^= *src; 793 2082 eschrock } 794 2082 eschrock 795 2082 eschrock for (; i < xcount; i++, dst++) { 796 10105 adam VDEV_RAIDZ_64MUL_2(*dst, mask); 797 2082 eschrock } 798 2082 eschrock } 799 2082 eschrock } 800 2082 eschrock 801 2082 eschrock src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 802 2082 eschrock dst = rm->rm_col[x].rc_data; 803 2082 eschrock exp = 255 - (rm->rm_cols - 1 - x); 804 2082 eschrock 805 2082 eschrock for (i = 0; i < xcount; i++, dst++, src++) { 806 2082 eschrock *dst ^= *src; 807 2082 eschrock for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 808 2082 eschrock *b = vdev_raidz_exp2(*b, exp); 809 2082 eschrock } 810 2082 eschrock } 811 10105 adam 812 10105 adam return (1 << VDEV_RAIDZ_Q); 813 2082 eschrock } 814 2082 eschrock 815 10105 adam static int 816 10105 adam vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 817 2082 eschrock { 818 2082 eschrock uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 819 2082 eschrock void *pdata, *qdata; 820 2082 eschrock uint64_t xsize, ysize, i; 821 10105 adam int x = tgts[0]; 822 10105 adam int y = tgts[1]; 823 2082 eschrock 824 10105 adam ASSERT(ntgts == 2); 825 2082 eschrock ASSERT(x < y); 826 2082 eschrock ASSERT(x >= rm->rm_firstdatacol); 827 2082 eschrock ASSERT(y < rm->rm_cols); 828 2082 eschrock 829 2082 eschrock ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 830 2082 eschrock 831 2082 eschrock /* 832 2082 eschrock * Move the parity data aside -- we're going to compute parity as 833 2082 eschrock * though columns x and y were full of zeros -- Pxy and Qxy. We want to 834 2082 eschrock * reuse the parity generation mechanism without trashing the actual 835 2082 eschrock * parity so we make those columns appear to be full of zeros by 836 2082 eschrock * setting their lengths to zero. 837 2082 eschrock */ 838 2082 eschrock pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 839 2082 eschrock qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 840 2082 eschrock xsize = rm->rm_col[x].rc_size; 841 2082 eschrock ysize = rm->rm_col[y].rc_size; 842 2082 eschrock 843 2082 eschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = 844 2082 eschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 845 2082 eschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = 846 2082 eschrock zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 847 2082 eschrock rm->rm_col[x].rc_size = 0; 848 2082 eschrock rm->rm_col[y].rc_size = 0; 849 2082 eschrock 850 2082 eschrock vdev_raidz_generate_parity_pq(rm); 851 2082 eschrock 852 2082 eschrock rm->rm_col[x].rc_size = xsize; 853 2082 eschrock rm->rm_col[y].rc_size = ysize; 854 2082 eschrock 855 2082 eschrock p = pdata; 856 2082 eschrock q = qdata; 857 2082 eschrock pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 858 2082 eschrock qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 859 2082 eschrock xd = rm->rm_col[x].rc_data; 860 2082 eschrock yd = rm->rm_col[y].rc_data; 861 2082 eschrock 862 2082 eschrock /* 863 2082 eschrock * We now have: 864 2082 eschrock * Pxy = P + D_x + D_y 865 2082 eschrock * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 866 2082 eschrock * 867 2082 eschrock * We can then solve for D_x: 868 2082 eschrock * D_x = A * (P + Pxy) + B * (Q + Qxy) 869 2082 eschrock * where 870 2082 eschrock * A = 2^(x - y) * (2^(x - y) + 1)^-1 871 2082 eschrock * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 872 2082 eschrock * 873 2082 eschrock * With D_x in hand, we can easily solve for D_y: 874 2082 eschrock * D_y = P + Pxy + D_x 875 2082 eschrock */ 876 2082 eschrock 877 2082 eschrock a = vdev_raidz_pow2[255 + x - y]; 878 2082 eschrock b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 879 2082 eschrock tmp = 255 - vdev_raidz_log2[a ^ 1]; 880 2082 eschrock 881 2082 eschrock aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 882 2082 eschrock bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 883 2082 eschrock 884 2082 eschrock for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 885 2082 eschrock *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 886 2082 eschrock vdev_raidz_exp2(*q ^ *qxy, bexp); 887 2082 eschrock 888 2082 eschrock if (i < ysize) 889 2082 eschrock *yd = *p ^ *pxy ^ *xd; 890 2082 eschrock } 891 2082 eschrock 892 2082 eschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 893 2082 eschrock rm->rm_col[VDEV_RAIDZ_P].rc_size); 894 2082 eschrock zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 895 2082 eschrock rm->rm_col[VDEV_RAIDZ_Q].rc_size); 896 2082 eschrock 897 2082 eschrock /* 898 2082 eschrock * Restore the saved parity data. 899 2082 eschrock */ 900 2082 eschrock rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 901 2082 eschrock rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 902 10105 adam 903 10105 adam return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 904 2082 eschrock } 905 2082 eschrock 906 10105 adam /* BEGIN CSTYLED */ 907 10105 adam /* 908 10105 adam * In the general case of reconstruction, we must solve the system of linear 909 10105 adam * equations defined by the coeffecients used to generate parity as well as 910 10105 adam * the contents of the data and parity disks. This can be expressed with 911 10105 adam * vectors for the original data (D) and the actual data (d) and parity (p) 912 10105 adam * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 913 10105 adam * 914 10105 adam * __ __ __ __ 915 10105 adam * | | __ __ | p_0 | 916 10105 adam * | V | | D_0 | | p_m-1 | 917 10105 adam * | | x | : | = | d_0 | 918 10105 adam * | I | | D_n-1 | | : | 919 10105 adam * | | ~~ ~~ | d_n-1 | 920 10105 adam * ~~ ~~ ~~ ~~ 921 10105 adam * 922 10105 adam * I is simply a square identity matrix of size n, and V is a vandermonde 923 10105 adam * matrix defined by the coeffecients we chose for the various parity columns 924 10105 adam * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 925 10105 adam * computation as well as linear separability. 926 10105 adam * 927 10105 adam * __ __ __ __ 928 10105 adam * | 1 .. 1 1 1 | | p_0 | 929 10105 adam * | 2^n-1 .. 4 2 1 | __ __ | : | 930 10105 adam * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 931 10105 adam * | 1 .. 0 0 0 | | D_1 | | d_0 | 932 10105 adam * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 933 10105 adam * | : : : : | | : | | d_2 | 934 10105 adam * | 0 .. 1 0 0 | | D_n-1 | | : | 935 10105 adam * | 0 .. 0 1 0 | ~~ ~~ | : | 936 10105 adam * | 0 .. 0 0 1 | | d_n-1 | 937 10105 adam * ~~ ~~ ~~ ~~ 938 10105 adam * 939 10105 adam * Note that I, V, d, and p are known. To compute D, we must invert the 940 10105 adam * matrix and use the known data and parity values to reconstruct the unknown 941 10105 adam * data values. We begin by removing the rows in V|I and d|p that correspond 942 10105 adam * to failed or missing columns; we then make V|I square (n x n) and d|p 943 10105 adam * sized n by removing rows corresponding to unused parity from the bottom up 944 10105 adam * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 945 10105 adam * using Gauss-Jordan elimination. In the example below we use m=3 parity 946 10105 adam * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 947 10105 adam * __ __ 948 10105 adam * | 1 1 1 1 1 1 1 1 | 949 10105 adam * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 950 10105 adam * | 19 205 116 29 64 16 4 1 | / / 951 10105 adam * | 1 0 0 0 0 0 0 0 | / / 952 10105 adam * | 0 1 0 0 0 0 0 0 | <--' / 953 10105 adam * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 954 10105 adam * | 0 0 0 1 0 0 0 0 | 955 10105 adam * | 0 0 0 0 1 0 0 0 | 956 10105 adam * | 0 0 0 0 0 1 0 0 | 957 10105 adam * | 0 0 0 0 0 0 1 0 | 958 10105 adam * | 0 0 0 0 0 0 0 1 | 959 10105 adam * ~~ ~~ 960 10105 adam * __ __ 961 10105 adam * | 1 1 1 1 1 1 1 1 | 962 10105 adam * | 128 64 32 16 8 4 2 1 | 963 10105 adam * | 19 205 116 29 64 16 4 1 | 964 10105 adam * | 1 0 0 0 0 0 0 0 | 965 10105 adam * | 0 1 0 0 0 0 0 0 | 966 10105 adam * (V|I)' = | 0 0 1 0 0 0 0 0 | 967 10105 adam * | 0 0 0 1 0 0 0 0 | 968 10105 adam * | 0 0 0 0 1 0 0 0 | 969 10105 adam * | 0 0 0 0 0 1 0 0 | 970 10105 adam * | 0 0 0 0 0 0 1 0 | 971 10105 adam * | 0 0 0 0 0 0 0 1 | 972 10105 adam * ~~ ~~ 973 10105 adam * 974 10105 adam * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 975 10105 adam * have carefully chosen the seed values 1, 2, and 4 to ensure that this 976 10105 adam * matrix is not singular. 977 10105 adam * __ __ 978 10105 adam * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 979 10105 adam * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 980 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 981 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 982 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 983 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 984 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 985 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 986 10105 adam * ~~ ~~ 987 10105 adam * __ __ 988 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 989 10105 adam * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 990 10105 adam * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 991 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 992 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 993 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 994 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 995 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 996 10105 adam * ~~ ~~ 997 10105 adam * __ __ 998 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 999 10105 adam * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1000 10105 adam * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1001 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1002 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1003 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1004 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1005 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1006 10105 adam * ~~ ~~ 1007 10105 adam * __ __ 1008 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1009 10105 adam * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1010 10105 adam * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1011 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1012 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1013 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1014 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1015 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1016 10105 adam * ~~ ~~ 1017 10105 adam * __ __ 1018 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1019 10105 adam * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1020 10105 adam * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1021 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1022 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1023 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1024 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1025 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1026 10105 adam * ~~ ~~ 1027 10105 adam * __ __ 1028 10105 adam * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1029 10105 adam * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1030 10105 adam * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1031 10105 adam * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1032 10105 adam * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1033 10105 adam * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1034 10105 adam * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1035 10105 adam * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1036 10105 adam * ~~ ~~ 1037 10105 adam * __ __ 1038 10105 adam * | 0 0 1 0 0 0 0 0 | 1039 10105 adam * | 167 100 5 41 159 169 217 208 | 1040 10105 adam * | 166 100 4 40 158 168 216 209 | 1041 10105 adam * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1042 10105 adam * | 0 0 0 0 1 0 0 0 | 1043 10105 adam * | 0 0 0 0 0 1 0 0 | 1044 10105 adam * | 0 0 0 0 0 0 1 0 | 1045 10105 adam * | 0 0 0 0 0 0 0 1 | 1046 10105 adam * ~~ ~~ 1047 10105 adam * 1048 10105 adam * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1049 10105 adam * of the missing data. 1050 10105 adam * 1051 10105 adam * As is apparent from the example above, the only non-trivial rows in the 1052 10105 adam * inverse matrix correspond to the data disks that we're trying to 1053 10105 adam * reconstruct. Indeed, those are the only rows we need as the others would 1054 10105 adam * only be useful for reconstructing data known or assumed to be valid. For 1055 10105 adam * that reason, we only build the coefficients in the rows that correspond to 1056 10105 adam * targeted columns. 1057 10105 adam */ 1058 10105 adam /* END CSTYLED */ 1059 10105 adam 1060 10105 adam static void 1061 10105 adam vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1062 10105 adam uint8_t **rows) 1063 10105 adam { 1064 10105 adam int i, j; 1065 10105 adam int pow; 1066 10105 adam 1067 10105 adam ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1068 10105 adam 1069 10105 adam /* 1070 10105 adam * Fill in the missing rows of interest. 1071 10105 adam */ 1072 10105 adam for (i = 0; i < nmap; i++) { 1073 10105 adam ASSERT3S(0, <=, map[i]); 1074 10105 adam ASSERT3S(map[i], <=, 2); 1075 10105 adam 1076 10105 adam pow = map[i] * n; 1077 10105 adam if (pow > 255) 1078 10105 adam pow -= 255; 1079 10105 adam ASSERT(pow <= 255); 1080 10105 adam 1081 10105 adam for (j = 0; j < n; j++) { 1082 10105 adam pow -= map[i]; 1083 10105 adam if (pow < 0) 1084 10105 adam pow += 255; 1085 10105 adam rows[i][j] = vdev_raidz_pow2[pow]; 1086 10105 adam } 1087 10105 adam } 1088 10105 adam } 1089 10105 adam 1090 10105 adam static void 1091 10105 adam vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1092 10105 adam uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1093 10105 adam { 1094 10105 adam int i, j, ii, jj; 1095 10105 adam uint8_t log; 1096 10105 adam 1097 10105 adam /* 1098 10105 adam * Assert that the first nmissing entries from the array of used 1099 10105 adam * columns correspond to parity columns and that subsequent entries 1100 10105 adam * correspond to data columns. 1101 10105 adam */ 1102 10105 adam for (i = 0; i < nmissing; i++) { 1103 10105 adam ASSERT3S(used[i], <, rm->rm_firstdatacol); 1104 10105 adam } 1105 10105 adam for (; i < n; i++) { 1106 10105 adam ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1107 10105 adam } 1108 10105 adam 1109 10105 adam /* 1110 10105 adam * First initialize the storage where we'll compute the inverse rows. 1111 10105 adam */ 1112 10105 adam for (i = 0; i < nmissing; i++) { 1113 10105 adam for (j = 0; j < n; j++) { 1114 10105 adam invrows[i][j] = (i == j) ? 1 : 0; 1115 10105 adam } 1116 10105 adam } 1117 10105 adam 1118 10105 adam /* 1119 10105 adam * Subtract all trivial rows from the rows of consequence. 1120 10105 adam */ 1121 10105 adam for (i = 0; i < nmissing; i++) { 1122 10105 adam for (j = nmissing; j < n; j++) { 1123 10105 adam ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1124 10105 adam jj = used[j] - rm->rm_firstdatacol; 1125 10105 adam ASSERT3S(jj, <, n); 1126 10105 adam invrows[i][j] = rows[i][jj]; 1127 10105 adam rows[i][jj] = 0; 1128 10105 adam } 1129 10105 adam } 1130 10105 adam 1131 10105 adam /* 1132 10105 adam * For each of the rows of interest, we must normalize it and subtract 1133 10105 adam * a multiple of it from the other rows. 1134 10105 adam */ 1135 10105 adam for (i = 0; i < nmissing; i++) { 1136 10105 adam for (j = 0; j < missing[i]; j++) { 1137 10105 adam ASSERT3U(rows[i][j], ==, 0); 1138 10105 adam } 1139 10105 adam ASSERT3U(rows[i][missing[i]], !=, 0); 1140 10105 adam 1141 10105 adam /* 1142 10105 adam * Compute the inverse of the first element and multiply each 1143 10105 adam * element in the row by that value. 1144 10105 adam */ 1145 10105 adam log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1146 10105 adam 1147 10105 adam for (j = 0; j < n; j++) { 1148 10105 adam rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1149 10105 adam invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1150 10105 adam } 1151 10105 adam 1152 10105 adam for (ii = 0; ii < nmissing; ii++) { 1153 10105 adam if (i == ii) 1154 10105 adam continue; 1155 10105 adam 1156 10105 adam ASSERT3U(rows[ii][missing[i]], !=, 0); 1157 10105 adam 1158 10105 adam log = vdev_raidz_log2[rows[ii][missing[i]]]; 1159 10105 adam 1160 10105 adam for (j = 0; j < n; j++) { 1161 10105 adam rows[ii][j] ^= 1162 10105 adam vdev_raidz_exp2(rows[i][j], log); 1163 10105 adam invrows[ii][j] ^= 1164 10105 adam vdev_raidz_exp2(invrows[i][j], log); 1165 10105 adam } 1166 10105 adam } 1167 10105 adam } 1168 10105 adam 1169 10105 adam /* 1170 10105 adam * Verify that the data that is left in the rows are properly part of 1171 10105 adam * an identity matrix. 1172 10105 adam */ 1173 10105 adam for (i = 0; i < nmissing; i++) { 1174 10105 adam for (j = 0; j < n; j++) { 1175 10105 adam if (j == missing[i]) { 1176 10105 adam ASSERT3U(rows[i][j], ==, 1); 1177 10105 adam } else { 1178 10105 adam ASSERT3U(rows[i][j], ==, 0); 1179 10105 adam } 1180 10105 adam } 1181 10105 adam } 1182 10105 adam } 1183 10105 adam 1184 10105 adam static void 1185 10105 adam vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1186 10105 adam int *missing, uint8_t **invrows, const uint8_t *used) 1187 10105 adam { 1188 10105 adam int i, j, x, cc, c; 1189 10105 adam uint8_t *src; 1190 10105 adam uint64_t ccount; 1191 10105 adam uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1192 10105 adam uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1193 10105 adam uint8_t log, val; 1194 10105 adam int ll; 1195 10105 adam uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1196 10105 adam uint8_t *p, *pp; 1197 10105 adam size_t psize; 1198 10105 adam 1199 10105 adam psize = sizeof (invlog[0][0]) * n * nmissing; 1200 10105 adam p = kmem_alloc(psize, KM_SLEEP); 1201 10105 adam 1202 10105 adam for (pp = p, i = 0; i < nmissing; i++) { 1203 10105 adam invlog[i] = pp; 1204 10105 adam pp += n; 1205 10105 adam } 1206 10105 adam 1207 10105 adam for (i = 0; i < nmissing; i++) { 1208 10105 adam for (j = 0; j < n; j++) { 1209 10105 adam ASSERT3U(invrows[i][j], !=, 0); 1210 10105 adam invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1211 10105 adam } 1212 10105 adam } 1213 10105 adam 1214 10105 adam for (i = 0; i < n; i++) { 1215 10105 adam c = used[i]; 1216 10105 adam ASSERT3U(c, <, rm->rm_cols); 1217 10105 adam 1218 10105 adam src = rm->rm_col[c].rc_data; 1219 10105 adam ccount = rm->rm_col[c].rc_size; 1220 10105 adam for (j = 0; j < nmissing; j++) { 1221 10105 adam cc = missing[j] + rm->rm_firstdatacol; 1222 10105 adam ASSERT3U(cc, >=, rm->rm_firstdatacol); 1223 10105 adam ASSERT3U(cc, <, rm->rm_cols); 1224 10105 adam ASSERT3U(cc, !=, c); 1225 10105 adam 1226 10105 adam dst[j] = rm->rm_col[cc].rc_data; 1227 10105 adam dcount[j] = rm->rm_col[cc].rc_size; 1228 10105 adam } 1229 10105 adam 1230 10105 adam ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1231 10105 adam 1232 10105 adam for (x = 0; x < ccount; x++, src++) { 1233 10105 adam if (*src != 0) 1234 10105 adam log = vdev_raidz_log2[*src]; 1235 10105 adam 1236 10105 adam for (cc = 0; cc < nmissing; cc++) { 1237 10105 adam if (x >= dcount[cc]) 1238 10105 adam continue; 1239 10105 adam 1240 10105 adam if (*src == 0) { 1241 10105 adam val = 0; 1242 10105 adam } else { 1243 10105 adam if ((ll = log + invlog[cc][i]) >= 255) 1244 10105 adam ll -= 255; 1245 10105 adam val = vdev_raidz_pow2[ll]; 1246 10105 adam } 1247 10105 adam 1248 10105 adam if (i == 0) 1249 10105 adam dst[cc][x] = val; 1250 10105 adam else 1251 10105 adam dst[cc][x] ^= val; 1252 10105 adam } 1253 10105 adam } 1254 10105 adam } 1255 10105 adam 1256 10105 adam kmem_free(p, psize); 1257 10105 adam } 1258 10105 adam 1259 10105 adam static int 1260 10105 adam vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1261 10105 adam { 1262 10105 adam int n, i, c, t, tt; 1263 10105 adam int nmissing_rows; 1264 10105 adam int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1265 10105 adam int parity_map[VDEV_RAIDZ_MAXPARITY]; 1266 10105 adam 1267 10105 adam uint8_t *p, *pp; 1268 10105 adam size_t psize; 1269 10105 adam 1270 10105 adam uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1271 10105 adam uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1272 10105 adam uint8_t *used; 1273 10105 adam 1274 10105 adam int code = 0; 1275 10105 adam 1276 10105 adam 1277 10105 adam n = rm->rm_cols - rm->rm_firstdatacol; 1278 10105 adam 1279 10105 adam /* 1280 10105 adam * Figure out which data columns are missing. 1281 10105 adam */ 1282 10105 adam nmissing_rows = 0; 1283 10105 adam for (t = 0; t < ntgts; t++) { 1284 10105 adam if (tgts[t] >= rm->rm_firstdatacol) { 1285 10105 adam missing_rows[nmissing_rows++] = 1286 10105 adam tgts[t] - rm->rm_firstdatacol; 1287 10105 adam } 1288 10105 adam } 1289 10105 adam 1290 10105 adam /* 1291 10105 adam * Figure out which parity columns to use to help generate the missing 1292 10105 adam * data columns. 1293 10105 adam */ 1294 10105 adam for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1295 10105 adam ASSERT(tt < ntgts); 1296 10105 adam ASSERT(c < rm->rm_firstdatacol); 1297 10105 adam 1298 10105 adam /* 1299 10105 adam * Skip any targeted parity columns. 1300 10105 adam */ 1301 10105 adam if (c == tgts[tt]) { 1302 10105 adam tt++; 1303 10105 adam continue; 1304 10105 adam } 1305 10105 adam 1306 10105 adam code |= 1 << c; 1307 10105 adam 1308 10105 adam parity_map[i] = c; 1309 10105 adam i++; 1310 10105 adam } 1311 10105 adam 1312 10105 adam ASSERT(code != 0); 1313 10105 adam ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1314 10105 adam 1315 10105 adam psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1316 10105 adam nmissing_rows * n + sizeof (used[0]) * n; 1317 10105 adam p = kmem_alloc(psize, KM_SLEEP); 1318 10105 adam 1319 10105 adam for (pp = p, i = 0; i < nmissing_rows; i++) { 1320 10105 adam rows[i] = pp; 1321 10105 adam pp += n; 1322 10105 adam invrows[i] = pp; 1323 10105 adam pp += n; 1324 10105 adam } 1325 10105 adam used = pp; 1326 10105 adam 1327 10105 adam for (i = 0; i < nmissing_rows; i++) { 1328 10105 adam used[i] = parity_map[i]; 1329 10105 adam } 1330 10105 adam 1331 10105 adam for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1332 10105 adam if (tt < nmissing_rows && 1333 10105 adam c == missing_rows[tt] + rm->rm_firstdatacol) { 1334 10105 adam tt++; 1335 10105 adam continue; 1336 10105 adam } 1337 10105 adam 1338 10105 adam ASSERT3S(i, <, n); 1339 10105 adam used[i] = c; 1340 10105 adam i++; 1341 10105 adam } 1342 10105 adam 1343 10105 adam /* 1344 10105 adam * Initialize the interesting rows of the matrix. 1345 10105 adam */ 1346 10105 adam vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1347 10105 adam 1348 10105 adam /* 1349 10105 adam * Invert the matrix. 1350 10105 adam */ 1351 10105 adam vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1352 10105 adam invrows, used); 1353 10105 adam 1354 10105 adam /* 1355 10105 adam * Reconstruct the missing data using the generated matrix. 1356 10105 adam */ 1357 10105 adam vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1358 10105 adam invrows, used); 1359 10105 adam 1360 10105 adam kmem_free(p, psize); 1361 10105 adam 1362 10105 adam return (code); 1363 10105 adam } 1364 10105 adam 1365 10105 adam static int 1366 10105 adam vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1367 10105 adam { 1368 10105 adam int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1369 10105 adam int ntgts; 1370 10105 adam int i, c; 1371 10105 adam int code; 1372 10105 adam int nbadparity, nbaddata; 1373 10105 adam int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1374 10105 adam 1375 10105 adam /* 1376 10105 adam * The tgts list must already be sorted. 1377 10105 adam */ 1378 10105 adam for (i = 1; i < nt; i++) { 1379 10105 adam ASSERT(t[i] > t[i - 1]); 1380 10105 adam } 1381 10105 adam 1382 10105 adam nbadparity = rm->rm_firstdatacol; 1383 10105 adam nbaddata = rm->rm_cols - nbadparity; 1384 10105 adam ntgts = 0; 1385 10105 adam for (i = 0, c = 0; c < rm->rm_cols; c++) { 1386 10105 adam if (c < rm->rm_firstdatacol) 1387 10105 adam parity_valid[c] = B_FALSE; 1388 10105 adam 1389 10105 adam if (i < nt && c == t[i]) { 1390 10105 adam tgts[ntgts++] = c; 1391 10105 adam i++; 1392 10105 adam } else if (rm->rm_col[c].rc_error != 0) { 1393 10105 adam tgts[ntgts++] = c; 1394 10105 adam } else if (c >= rm->rm_firstdatacol) { 1395 10105 adam nbaddata--; 1396 10105 adam } else { 1397 10105 adam parity_valid[c] = B_TRUE; 1398 10105 adam nbadparity--; 1399 10105 adam } 1400 10105 adam } 1401 10105 adam 1402 10105 adam ASSERT(ntgts >= nt); 1403 10105 adam ASSERT(nbaddata >= 0); 1404 10105 adam ASSERT(nbaddata + nbadparity == ntgts); 1405 10105 adam 1406 10105 adam dt = &tgts[nbadparity]; 1407 10105 adam 1408 10105 adam /* 1409 10105 adam * See if we can use any of our optimized reconstruction routines. 1410 10105 adam */ 1411 10105 adam if (!vdev_raidz_default_to_general) { 1412 10105 adam switch (nbaddata) { 1413 10105 adam case 1: 1414 10105 adam if (parity_valid[VDEV_RAIDZ_P]) 1415 10105 adam return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1416 10105 adam 1417 10105 adam ASSERT(rm->rm_firstdatacol > 1); 1418 10105 adam 1419 10105 adam if (parity_valid[VDEV_RAIDZ_Q]) 1420 10105 adam return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1421 10105 adam 1422 10105 adam ASSERT(rm->rm_firstdatacol > 2); 1423 10105 adam break; 1424 10105 adam 1425 10105 adam case 2: 1426 10105 adam ASSERT(rm->rm_firstdatacol > 1); 1427 10105 adam 1428 10105 adam if (parity_valid[VDEV_RAIDZ_P] && 1429 10105 adam parity_valid[VDEV_RAIDZ_Q]) 1430 10105 adam return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1431 10105 adam 1432 10105 adam ASSERT(rm->rm_firstdatacol > 2); 1433 10105 adam 1434 10105 adam break; 1435 10105 adam } 1436 10105 adam } 1437 10105 adam 1438 10105 adam code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1439 10105 adam ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1440 10105 adam ASSERT(code > 0); 1441 10105 adam return (code); 1442 10105 adam } 1443 789 ahrens 1444 789 ahrens static int 1445 789 ahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 1446 789 ahrens { 1447 10105 adam vdev_t *cvd; 1448 2082 eschrock uint64_t nparity = vd->vdev_nparity; 1449 10105 adam int c; 1450 789 ahrens int lasterror = 0; 1451 789 ahrens int numerrors = 0; 1452 789 ahrens 1453 2082 eschrock ASSERT(nparity > 0); 1454 2082 eschrock 1455 2082 eschrock if (nparity > VDEV_RAIDZ_MAXPARITY || 1456 2082 eschrock vd->vdev_children < nparity + 1) { 1457 789 ahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1458 789 ahrens return (EINVAL); 1459 789 ahrens } 1460 789 ahrens 1461 9846 Eric vdev_open_children(vd); 1462 789 ahrens 1463 10105 adam for (c = 0; c < vd->vdev_children; c++) { 1464 10105 adam cvd = vd->vdev_child[c]; 1465 9846 Eric 1466 10105 adam if (cvd->vdev_open_error != 0) { 1467 9846 Eric lasterror = cvd->vdev_open_error; 1468 789 ahrens numerrors++; 1469 789 ahrens continue; 1470 789 ahrens } 1471 789 ahrens 1472 789 ahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1473 1732 bonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 1474 789 ahrens } 1475 789 ahrens 1476 789 ahrens *asize *= vd->vdev_children; 1477 789 ahrens 1478 2082 eschrock if (numerrors > nparity) { 1479 789 ahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1480 789 ahrens return (lasterror); 1481 789 ahrens } 1482 789 ahrens 1483 789 ahrens return (0); 1484 789 ahrens } 1485 789 ahrens 1486 789 ahrens static void 1487 789 ahrens vdev_raidz_close(vdev_t *vd) 1488 789 ahrens { 1489 10105 adam int c; 1490 10105 adam 1491 10105 adam for (c = 0; c < vd->vdev_children; c++) 1492 789 ahrens vdev_close(vd->vdev_child[c]); 1493 789 ahrens } 1494 789 ahrens 1495 789 ahrens static uint64_t 1496 789 ahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1497 789 ahrens { 1498 789 ahrens uint64_t asize; 1499 1732 bonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 1500 789 ahrens uint64_t cols = vd->vdev_children; 1501 2082 eschrock uint64_t nparity = vd->vdev_nparity; 1502 789 ahrens 1503 1732 bonwick asize = ((psize - 1) >> ashift) + 1; 1504 2082 eschrock asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1505 2082 eschrock asize = roundup(asize, nparity + 1) << ashift; 1506 789 ahrens 1507 789 ahrens return (asize); 1508 789 ahrens } 1509 789 ahrens 1510 789 ahrens static void 1511 789 ahrens vdev_raidz_child_done(zio_t *zio) 1512 789 ahrens { 1513 789 ahrens raidz_col_t *rc = zio->io_private; 1514 789 ahrens 1515 789 ahrens rc->rc_error = zio->io_error; 1516 789 ahrens rc->rc_tried = 1; 1517 789 ahrens rc->rc_skipped = 0; 1518 789 ahrens } 1519 789 ahrens 1520 5530 bonwick static int 1521 789 ahrens vdev_raidz_io_start(zio_t *zio) 1522 789 ahrens { 1523 789 ahrens vdev_t *vd = zio->io_vd; 1524 1732 bonwick vdev_t *tvd = vd->vdev_top; 1525 789 ahrens vdev_t *cvd; 1526 789 ahrens raidz_map_t *rm; 1527 789 ahrens raidz_col_t *rc; 1528 10105 adam int c, i; 1529 789 ahrens 1530 2082 eschrock rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1531 2082 eschrock vd->vdev_nparity); 1532 789 ahrens 1533 1775 billm ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1534 789 ahrens 1535 789 ahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1536 10105 adam vdev_raidz_generate_parity(rm); 1537 789 ahrens 1538 789 ahrens for (c = 0; c < rm->rm_cols; c++) { 1539 789 ahrens rc = &rm->rm_col[c]; 1540 2082 eschrock cvd = vd->vdev_child[rc->rc_devidx]; 1541 789 ahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1542 789 ahrens rc->rc_offset, rc->rc_data, rc->rc_size, 1543 7754 Jeff zio->io_type, zio->io_priority, 0, 1544 789 ahrens vdev_raidz_child_done, rc)); 1545 789 ahrens } 1546 5530 bonwick 1547 10105 adam /* 1548 10105 adam * Generate optional I/Os for any skipped sectors to improve 1549 10105 adam * aggregation contiguity. 1550 10105 adam */ 1551 10450 adam for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1552 10105 adam ASSERT(c <= rm->rm_scols); 1553 10105 adam if (c == rm->rm_scols) 1554 10105 adam c = 0; 1555 10105 adam rc = &rm->rm_col[c]; 1556 10105 adam cvd = vd->vdev_child[rc->rc_devidx]; 1557 10105 adam zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1558 10105 adam rc->rc_offset + rc->rc_size, NULL, 1559 10105 adam 1 << tvd->vdev_ashift, 1560 10105 adam zio->io_type, zio->io_priority, 1561 10105 adam ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1562 10105 adam } 1563 10105 adam 1564 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 1565 789 ahrens } 1566 789 ahrens 1567 789 ahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1568 789 ahrens 1569 2082 eschrock /* 1570 2082 eschrock * Iterate over the columns in reverse order so that we hit the parity 1571 10105 adam * last -- any errors along the way will force us to read the parity. 1572 2082 eschrock */ 1573 789 ahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 1574 789 ahrens rc = &rm->rm_col[c]; 1575 2082 eschrock cvd = vd->vdev_child[rc->rc_devidx]; 1576 5329 gw25295 if (!vdev_readable(cvd)) { 1577 2082 eschrock if (c >= rm->rm_firstdatacol) 1578 2082 eschrock rm->rm_missingdata++; 1579 2082 eschrock else 1580 2082 eschrock rm->rm_missingparity++; 1581 789 ahrens rc->rc_error = ENXIO; 1582 789 ahrens rc->rc_tried = 1; /* don't even try */ 1583 789 ahrens rc->rc_skipped = 1; 1584 789 ahrens continue; 1585 789 ahrens } 1586 10922 Jeff if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1587 2082 eschrock if (c >= rm->rm_firstdatacol) 1588 2082 eschrock rm->rm_missingdata++; 1589 2082 eschrock else 1590 2082 eschrock rm->rm_missingparity++; 1591 789 ahrens rc->rc_error = ESTALE; 1592 789 ahrens rc->rc_skipped = 1; 1593 789 ahrens continue; 1594 789 ahrens } 1595 2082 eschrock if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1596 9434 Mark (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1597 789 ahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1598 789 ahrens rc->rc_offset, rc->rc_data, rc->rc_size, 1599 7754 Jeff zio->io_type, zio->io_priority, 0, 1600 789 ahrens vdev_raidz_child_done, rc)); 1601 789 ahrens } 1602 789 ahrens } 1603 789 ahrens 1604 7754 Jeff return (ZIO_PIPELINE_CONTINUE); 1605 789 ahrens } 1606 789 ahrens 1607 1544 eschrock /* 1608 1544 eschrock * Report a checksum error for a child of a RAID-Z device. 1609 1544 eschrock */ 1610 1544 eschrock static void 1611 10614 Jonathan raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1612 1544 eschrock { 1613 2082 eschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1614 1544 eschrock 1615 1544 eschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1616 10614 Jonathan zio_bad_cksum_t zbc; 1617 10614 Jonathan raidz_map_t *rm = zio->io_vsd; 1618 10614 Jonathan 1619 1544 eschrock mutex_enter(&vd->vdev_stat_lock); 1620 1544 eschrock vd->vdev_stat.vs_checksum_errors++; 1621 1544 eschrock mutex_exit(&vd->vdev_stat_lock); 1622 10614 Jonathan 1623 10614 Jonathan zbc.zbc_has_cksum = 0; 1624 10614 Jonathan zbc.zbc_injected = rm->rm_ecksuminjected; 1625 10614 Jonathan 1626 10614 Jonathan zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1627 10614 Jonathan rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1628 10614 Jonathan &zbc); 1629 1544 eschrock } 1630 10614 Jonathan } 1631 1544 eschrock 1632 10614 Jonathan /* 1633 10614 Jonathan * We keep track of whether or not there were any injected errors, so that 1634 10614 Jonathan * any ereports we generate can note it. 1635 10614 Jonathan */ 1636 10614 Jonathan static int 1637 10614 Jonathan raidz_checksum_verify(zio_t *zio) 1638 10614 Jonathan { 1639 10614 Jonathan zio_bad_cksum_t zbc; 1640 10614 Jonathan raidz_map_t *rm = zio->io_vsd; 1641 10614 Jonathan 1642 10614 Jonathan int ret = zio_checksum_error(zio, &zbc); 1643 10614 Jonathan if (ret != 0 && zbc.zbc_injected != 0) 1644 10614 Jonathan rm->rm_ecksuminjected = 1; 1645 10614 Jonathan 1646 10614 Jonathan return (ret); 1647 1544 eschrock } 1648 1544 eschrock 1649 2082 eschrock /* 1650 2082 eschrock * Generate the parity from the data columns. If we tried and were able to 1651 2082 eschrock * read the parity without error, verify that the generated parity matches the 1652 2082 eschrock * data we read. If it doesn't, we fire off a checksum error. Return the 1653 2082 eschrock * number such failures. 1654 2082 eschrock */ 1655 2082 eschrock static int 1656 2082 eschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1657 2082 eschrock { 1658 2082 eschrock void *orig[VDEV_RAIDZ_MAXPARITY]; 1659 2082 eschrock int c, ret = 0; 1660 2082 eschrock raidz_col_t *rc; 1661 2082 eschrock 1662 2082 eschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 1663 2082 eschrock rc = &rm->rm_col[c]; 1664 2082 eschrock if (!rc->rc_tried || rc->rc_error != 0) 1665 2082 eschrock continue; 1666 2082 eschrock orig[c] = zio_buf_alloc(rc->rc_size); 1667 2082 eschrock bcopy(rc->rc_data, orig[c], rc->rc_size); 1668 2082 eschrock } 1669 2082 eschrock 1670 10105 adam vdev_raidz_generate_parity(rm); 1671 2082 eschrock 1672 2082 eschrock for (c = 0; c < rm->rm_firstdatacol; c++) { 1673 2082 eschrock rc = &rm->rm_col[c]; 1674 2082 eschrock if (!rc->rc_tried || rc->rc_error != 0) 1675 2082 eschrock continue; 1676 2082 eschrock if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1677 10614 Jonathan raidz_checksum_error(zio, rc, orig[c]); 1678 2082 eschrock rc->rc_error = ECKSUM; 1679 2082 eschrock ret++; 1680 2082 eschrock } 1681 2082 eschrock zio_buf_free(orig[c], rc->rc_size); 1682 2082 eschrock } 1683 2082 eschrock 1684 2082 eschrock return (ret); 1685 2082 eschrock } 1686 2082 eschrock 1687 10105 adam /* 1688 10105 adam * Keep statistics on all the ways that we used parity to correct data. 1689 10105 adam */ 1690 10105 adam static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1691 1544 eschrock 1692 5530 bonwick static int 1693 7754 Jeff vdev_raidz_worst_error(raidz_map_t *rm) 1694 7754 Jeff { 1695 7754 Jeff int error = 0; 1696 7754 Jeff 1697 7754 Jeff for (int c = 0; c < rm->rm_cols; c++) 1698 7754 Jeff error = zio_worst_error(error, rm->rm_col[c].rc_error); 1699 7754 Jeff 1700 7754 Jeff return (error); 1701 7754 Jeff } 1702 7754 Jeff 1703 10105 adam /* 1704 10105 adam * Iterate over all combinations of bad data and attempt a reconstruction. 1705 10105 adam * Note that the algorithm below is non-optimal because it doesn't take into 1706 10105 adam * account how reconstruction is actually performed. For example, with 1707 10105 adam * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1708 10105 adam * is targeted as invalid as if columns 1 and 4 are targeted since in both 1709 10105 adam * cases we'd only use parity information in column 0. 1710 10105 adam */ 1711 10105 adam static int 1712 10105 adam vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1713 10105 adam { 1714 10105 adam raidz_map_t *rm = zio->io_vsd; 1715 10105 adam raidz_col_t *rc; 1716 10105 adam void *orig[VDEV_RAIDZ_MAXPARITY]; 1717 10105 adam int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1718 10105 adam int *tgts = &tstore[1]; 1719 10105 adam int current, next, i, c, n; 1720 10105 adam int code, ret = 0; 1721 10105 adam 1722 10105 adam ASSERT(total_errors < rm->rm_firstdatacol); 1723 10105 adam 1724 10105 adam /* 1725 10105 adam * This simplifies one edge condition. 1726 10105 adam */ 1727 10105 adam tgts[-1] = -1; 1728 10105 adam 1729 10105 adam for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1730 10105 adam /* 1731 10105 adam * Initialize the targets array by finding the first n columns 1732 10105 adam * that contain no error. 1733 10105 adam * 1734 10105 adam * If there were no data errors, we need to ensure that we're 1735 10105 adam * always explicitly attempting to reconstruct at least one 1736 10105 adam * data column. To do this, we simply push the highest target 1737 10105 adam * up into the data columns. 1738 10105 adam */ 1739 10105 adam for (c = 0, i = 0; i < n; i++) { 1740 10105 adam if (i == n - 1 && data_errors == 0 && 1741 10105 adam c < rm->rm_firstdatacol) { 1742 10105 adam c = rm->rm_firstdatacol; 1743 10105 adam } 1744 10105 adam 1745 10105 adam while (rm->rm_col[c].rc_error != 0) { 1746 10105 adam c++; 1747 10105 adam ASSERT3S(c, <, rm->rm_cols); 1748 10105 adam } 1749 10105 adam 1750 10105 adam tgts[i] = c++; 1751 10105 adam } 1752 10105 adam 1753 10105 adam /* 1754 10105 adam * Setting tgts[n] simplifies the other edge condition. 1755 10105 adam */ 1756 10105 adam tgts[n] = rm->rm_cols; 1757 10105 adam 1758 10105 adam /* 1759 10105 adam * These buffers were allocated in previous iterations. 1760 10105 adam */ 1761 10105 adam for (i = 0; i < n - 1; i++) { 1762 10105 adam ASSERT(orig[i] != NULL); 1763 10105 adam } 1764 10105 adam 1765 10105 adam orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 1766 10105 adam 1767 10105 adam current = 0; 1768 10105 adam next = tgts[current]; 1769 10105 adam 1770 10105 adam while (current != n) { 1771 10105 adam tgts[current] = next; 1772 10105 adam current = 0; 1773 10105 adam 1774 10105 adam /* 1775 10105 adam * Save off the original data that we're going to 1776 10105 adam * attempt to reconstruct. 1777 10105 adam */ 1778 10105 adam for (i = 0; i < n; i++) { 1779 10105 adam ASSERT(orig[i] != NULL); 1780 10105 adam c = tgts[i]; 1781 10105 adam ASSERT3S(c, >=, 0); 1782 10105 adam ASSERT3S(c, <, rm->rm_cols); 1783 10105 adam rc = &rm->rm_col[c]; 1784 10105 adam bcopy(rc->rc_data, orig[i], rc->rc_size); 1785 10105 adam } 1786 10105 adam 1787 10105 adam /* 1788 10105 adam * Attempt a reconstruction and exit the outer loop on 1789 10105 adam * success. 1790 10105 adam */ 1791 10105 adam code = vdev_raidz_reconstruct(rm, tgts, n); 1792 10614 Jonathan if (raidz_checksum_verify(zio) == 0) { 1793 10105 adam atomic_inc_64(&raidz_corrected[code]); 1794 10105 adam 1795 10105 adam for (i = 0; i < n; i++) { 1796 10105 adam c = tgts[i]; 1797 10105 adam rc = &rm->rm_col[c]; 1798 10105 adam ASSERT(rc->rc_error == 0); 1799 10614 Jonathan if (rc->rc_tried) 1800 10614 Jonathan raidz_checksum_error(zio, rc, 1801 10614 Jonathan orig[i]); 1802 10105 adam rc->rc_error = ECKSUM; 1803 10105 adam } 1804 10105 adam 1805 10105 adam ret = code; 1806 10105 adam goto done; 1807 10105 adam } 1808 10105 adam 1809 10105 adam /* 1810 10105 adam * Restore the original data. 1811 10105 adam */ 1812 10105 adam for (i = 0; i < n; i++) { 1813 10105 adam c = tgts[i]; 1814 10105 adam rc = &rm->rm_col[c]; 1815 10105 adam bcopy(orig[i], rc->rc_data, rc->rc_size); 1816 10105 adam } 1817 10105 adam 1818 10105 adam do { 1819 10105 adam /* 1820 10105 adam * Find the next valid column after the current 1821 10105 adam * position.. 1822 10105 adam */ 1823 10105 adam for (next = tgts[current] + 1; 1824 10105 adam next < rm->rm_cols && 1825 10105 adam rm->rm_col[next].rc_error != 0; next++) 1826 10105 adam continue; 1827 10105 adam 1828 10105 adam ASSERT(next <= tgts[current + 1]); 1829 10105 adam 1830 10105 adam /* 1831 10105 adam * If that spot is available, we're done here. 1832 10105 adam */ 1833 10105 adam if (next != tgts[current + 1]) 1834 10105 adam break; 1835 10105 adam 1836 10105 adam /* 1837 10105 adam * Otherwise, find the next valid column after 1838 10105 adam * the previous position. 1839 10105 adam */ 1840 10105 adam for (c = tgts[current - 1] + 1; 1841 10105 adam rm->rm_col[c].rc_error != 0; c++) 1842 10105 adam continue; 1843 10105 adam 1844 10105 adam tgts[current] = c; 1845 10105 adam current++; 1846 10105 adam 1847 10105 adam } while (current != n); 1848 10105 adam } 1849 10105 adam } 1850 10105 adam n--; 1851 10105 adam done: 1852 10105 adam for (i = 0; i < n; i++) { 1853 10105 adam zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1854 10105 adam } 1855 10105 adam 1856 10105 adam return (ret); 1857 10105 adam } 1858 10105 adam 1859 7754 Jeff static void 1860 789 ahrens vdev_raidz_io_done(zio_t *zio) 1861 789 ahrens { 1862 789 ahrens vdev_t *vd = zio->io_vd; 1863 789 ahrens vdev_t *cvd; 1864 789 ahrens raidz_map_t *rm = zio->io_vsd; 1865 10105 adam raidz_col_t *rc; 1866 789 ahrens int unexpected_errors = 0; 1867 2082 eschrock int parity_errors = 0; 1868 3456 ahl int parity_untried = 0; 1869 2082 eschrock int data_errors = 0; 1870 7754 Jeff int total_errors = 0; 1871 10105 adam int n, c; 1872 10105 adam int tgts[VDEV_RAIDZ_MAXPARITY]; 1873 10105 adam int code; 1874 789 ahrens 1875 1775 billm ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1876 2082 eschrock 1877 2082 eschrock ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1878 2082 eschrock ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1879 789 ahrens 1880 789 ahrens for (c = 0; c < rm->rm_cols; c++) { 1881 789 ahrens rc = &rm->rm_col[c]; 1882 789 ahrens 1883 789 ahrens if (rc->rc_error) { 1884 7754 Jeff ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1885 2082 eschrock 1886 2082 eschrock if (c < rm->rm_firstdatacol) 1887 2082 eschrock parity_errors++; 1888 2082 eschrock else 1889 2082 eschrock data_errors++; 1890 2082 eschrock 1891 789 ahrens if (!rc->rc_skipped) 1892 789 ahrens unexpected_errors++; 1893 2082 eschrock 1894 7754 Jeff total_errors++; 1895 3456 ahl } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1896 3456 ahl parity_untried++; 1897 789 ahrens } 1898 789 ahrens } 1899 789 ahrens 1900 789 ahrens if (zio->io_type == ZIO_TYPE_WRITE) { 1901 789 ahrens /* 1902 7754 Jeff * XXX -- for now, treat partial writes as a success. 1903 7754 Jeff * (If we couldn't write enough columns to reconstruct 1904 7754 Jeff * the data, the I/O failed. Otherwise, good enough.) 1905 7754 Jeff * 1906 7754 Jeff * Now that we support write reallocation, it would be better 1907 7754 Jeff * to treat partial failure as real failure unless there are 1908 7754 Jeff * no non-degraded top-level vdevs left, and not update DTLs 1909 7754 Jeff * if we intend to reallocate. 1910 789 ahrens */ 1911 789 ahrens /* XXPOLICY */ 1912 7754 Jeff if (total_errors > rm->rm_firstdatacol) 1913 7754 Jeff zio->io_error = vdev_raidz_worst_error(rm); 1914 789 ahrens 1915 7754 Jeff return; 1916 789 ahrens } 1917 789 ahrens 1918 789 ahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 1919 2082 eschrock /* 1920 2082 eschrock * There are three potential phases for a read: 1921 2082 eschrock * 1. produce valid data from the columns read 1922 2082 eschrock * 2. read all disks and try again 1923 2082 eschrock * 3. perform combinatorial reconstruction 1924 2082 eschrock * 1925 2082 eschrock * Each phase is progressively both more expensive and less likely to 1926 2082 eschrock * occur. If we encounter more errors than we can repair or all phases 1927 2082 eschrock * fail, we have no choice but to return an error. 1928 2082 eschrock */ 1929 789 ahrens 1930 789 ahrens /* 1931 2082 eschrock * If the number of errors we saw was correctable -- less than or equal 1932 3456 ahl * to the number of parity disks read -- attempt to produce data that 1933 3456 ahl * has a valid checksum. Naturally, this case applies in the absence of 1934 3456 ahl * any errors. 1935 789 ahrens */ 1936 7754 Jeff if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1937 10105 adam if (data_errors == 0) { 1938 10614 Jonathan if (raidz_checksum_verify(zio) == 0) { 1939 4034 ahl /* 1940 4034 ahl * If we read parity information (unnecessarily 1941 4034 ahl * as it happens since no reconstruction was 1942 4034 ahl * needed) regenerate and verify the parity. 1943 4034 ahl * We also regenerate parity when resilvering 1944 4034 ahl * so we can write it out to the failed device 1945 4034 ahl * later. 1946 4034 ahl */ 1947 3456 ahl if (parity_errors + parity_untried < 1948 4034 ahl rm->rm_firstdatacol || 1949 4034 ahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 1950 3456 ahl n = raidz_parity_verify(zio, rm); 1951 3456 ahl unexpected_errors += n; 1952 3456 ahl ASSERT(parity_errors + n <= 1953 3456 ahl rm->rm_firstdatacol); 1954 3456 ahl } 1955 2082 eschrock goto done; 1956 2082 eschrock } 1957 10105 adam } else { 1958 3456 ahl /* 1959 3456 ahl * We either attempt to read all the parity columns or 1960 3456 ahl * none of them. If we didn't try to read parity, we 1961 3456 ahl * wouldn't be here in the correctable case. There must 1962 3456 ahl * also have been fewer parity errors than parity 1963 3456 ahl * columns or, again, we wouldn't be in this code path. 1964 3456 ahl */ 1965 3456 ahl ASSERT(parity_untried == 0); 1966 2082 eschrock ASSERT(parity_errors < rm->rm_firstdatacol); 1967 2082 eschrock 1968 2082 eschrock /* 1969 10105 adam * Identify the data columns that reported an error. 1970 2082 eschrock */ 1971 10105 adam n = 0; 1972 2082 eschrock for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1973 2082 eschrock rc = &rm->rm_col[c]; 1974 10105 adam if (rc->rc_error != 0) { 1975 10105 adam ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1976 10105 adam tgts[n++] = c; 1977 10105 adam } 1978 2082 eschrock } 1979 2082 eschrock 1980 10105 adam ASSERT(rm->rm_firstdatacol >= n); 1981 10105 adam 1982 10105 adam code = vdev_raidz_reconstruct(rm, tgts, n); 1983 10105 adam 1984 10614 Jonathan if (raidz_checksum_verify(zio) == 0) { 1985 10105 adam atomic_inc_64(&raidz_corrected[code]); 1986 2082 eschrock 1987 2082 eschrock /* 1988 10105 adam * If we read more parity disks than were used 1989 10105 adam * for reconstruction, confirm that the other 1990 10105 adam * parity disks produced correct data. This 1991 10105 adam * routine is suboptimal in that it regenerates 1992 10105 adam * the parity that we already used in addition 1993 10105 adam * to the parity that we're attempting to 1994 10105 adam * verify, but this should be a relatively 1995 10105 adam * uncommon case, and can be optimized if it 1996 10105 adam * becomes a problem. Note that we regenerate 1997 10105 adam * parity when resilvering so we can write it 1998 10105 adam * out to failed devices later. 1999 2082 eschrock */ 2000 10105 adam if (parity_errors < rm->rm_firstdatacol - n || 2001 4034 ahl (zio->io_flags & ZIO_FLAG_RESILVER)) { 2002 2082 eschrock n = raidz_parity_verify(zio, rm); 2003 2082 eschrock unexpected_errors += n; 2004 2082 eschrock ASSERT(parity_errors + n <= 2005 2082 eschrock rm->rm_firstdatacol); 2006 2082 eschrock } 2007 2082 eschrock 2008 2082 eschrock goto done; 2009 2082 eschrock } 2010 789 ahrens } 2011 789 ahrens } 2012 789 ahrens 2013 789 ahrens /* 2014 2082 eschrock * This isn't a typical situation -- either we got a read error or 2015 2082 eschrock * a child silently returned bad data. Read every block so we can 2016 2082 eschrock * try again with as much data and parity as we can track down. If 2017 2082 eschrock * we've already been through once before, all children will be marked 2018 2082 eschrock * as tried so we'll proceed to combinatorial reconstruction. 2019 789 ahrens */ 2020 789 ahrens unexpected_errors = 1; 2021 2082 eschrock rm->rm_missingdata = 0; 2022 2082 eschrock rm->rm_missingparity = 0; 2023 789 ahrens 2024 2082 eschrock for (c = 0; c < rm->rm_cols; c++) { 2025 2082 eschrock if (rm->rm_col[c].rc_tried) 2026 2082 eschrock continue; 2027 789 ahrens 2028 789 ahrens zio_vdev_io_redone(zio); 2029 2082 eschrock do { 2030 789 ahrens rc = &rm->rm_col[c]; 2031 789 ahrens if (rc->rc_tried) 2032 789 ahrens continue; 2033 789 ahrens zio_nowait(zio_vdev_child_io(zio, NULL, 2034 2082 eschrock vd->vdev_child[rc->rc_devidx], 2035 789 ahrens rc->rc_offset, rc->rc_data, rc->rc_size, 2036 7754 Jeff zio->io_type, zio->io_priority, 0, 2037 789 ahrens vdev_raidz_child_done, rc)); 2038 2082 eschrock } while (++c < rm->rm_cols); 2039 5530 bonwick 2040 7754 Jeff return; 2041 789 ahrens } 2042 789 ahrens 2043 789 ahrens /* 2044 2082 eschrock * At this point we've attempted to reconstruct the data given the 2045 2082 eschrock * errors we detected, and we've attempted to read all columns. There 2046 2082 eschrock * must, therefore, be one or more additional problems -- silent errors 2047 2082 eschrock * resulting in invalid data rather than explicit I/O errors resulting 2048 10105 adam * in absent data. We check if there is enough additional data to 2049 10105 adam * possibly reconstruct the data and then perform combinatorial 2050 10105 adam * reconstruction over all possible combinations. If that fails, 2051 10105 adam * we're cooked. 2052 789 ahrens */ 2053 10614 Jonathan if (total_errors > rm->rm_firstdatacol) { 2054 7754 Jeff zio->io_error = vdev_raidz_worst_error(rm); 2055 789 ahrens 2056 10614 Jonathan } else if (total_errors < rm->rm_firstdatacol && 2057 10614 Jonathan (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2058 2082 eschrock /* 2059 10105 adam * If we didn't use all the available parity for the 2060 10105 adam * combinatorial reconstruction, verify that the remaining 2061 10105 adam * parity is correct. 2062 2082 eschrock */ 2063 10105 adam if (code != (1 << rm->rm_firstdatacol) - 1) 2064 10105 adam (void) raidz_parity_verify(zio, rm); 2065 10105 adam } else { 2066 10105 adam /* 2067 10614 Jonathan * We're here because either: 2068 10614 Jonathan * 2069 10614 Jonathan * total_errors == rm_first_datacol, or 2070 10614 Jonathan * vdev_raidz_combrec() failed 2071 10614 Jonathan * 2072 10614 Jonathan * In either case, there is enough bad data to prevent 2073 10614 Jonathan * reconstruction. 2074 10614 Jonathan * 2075 10614 Jonathan * Start checksum ereports for all children which haven't 2076 10614 Jonathan * failed. 2077 10105 adam */ 2078 10105 adam zio->io_error = ECKSUM; 2079 2082 eschrock 2080 10614 Jonathan for (c = 0; c < rm->rm_cols; c++) { 2081 10614 Jonathan rc = &rm->rm_col[c]; 2082 10614 Jonathan if (rc->rc_error == 0) { 2083 10614 Jonathan zio_bad_cksum_t zbc; 2084 10614 Jonathan zbc.zbc_has_cksum = 0; 2085 10614 Jonathan zbc.zbc_injected = rm->rm_ecksuminjected; 2086 10614 Jonathan 2087 10614 Jonathan zfs_ereport_start_checksum( 2088 10105 adam zio->io_spa, vd->vdev_child[rc->rc_devidx], 2089 10614 Jonathan zio, rc->rc_offset, rc->rc_size, 2090 10614 Jonathan (void *)(uintptr_t)c, &zbc); 2091 2082 eschrock } 2092 1544 eschrock } 2093 1544 eschrock } 2094 789 ahrens 2095 789 ahrens done: 2096 789 ahrens zio_checksum_verified(zio); 2097 789 ahrens 2098 8241 Jeff if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2099 789 ahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2100 789 ahrens /* 2101 789 ahrens * Use the good data we have in hand to repair damaged children. 2102 789 ahrens */ 2103 789 ahrens for (c = 0; c < rm->rm_cols; c++) { 2104 789 ahrens rc = &rm->rm_col[c]; 2105 2082 eschrock cvd = vd->vdev_child[rc->rc_devidx]; 2106 789 ahrens 2107 1732 bonwick if (rc->rc_error == 0) 2108 1732 bonwick continue; 2109 789 ahrens 2110 7754 Jeff zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2111 1732 bonwick rc->rc_offset, rc->rc_data, rc->rc_size, 2112 1732 bonwick ZIO_TYPE_WRITE, zio->io_priority, 2113 8241 Jeff ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2114 8241 Jeff ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2115 789 ahrens } 2116 789 ahrens } 2117 789 ahrens } 2118 789 ahrens 2119 789 ahrens static void 2120 789 ahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2121 789 ahrens { 2122 2082 eschrock if (faulted > vd->vdev_nparity) 2123 1544 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2124 1544 eschrock VDEV_AUX_NO_REPLICAS); 2125 789 ahrens else if (degraded + faulted != 0) 2126 1544 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2127 789 ahrens else 2128 1544 eschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2129 789 ahrens } 2130 789 ahrens 2131 789 ahrens vdev_ops_t vdev_raidz_ops = { 2132 789 ahrens vdev_raidz_open, 2133 789 ahrens vdev_raidz_close, 2134 789 ahrens vdev_raidz_asize, 2135 789 ahrens vdev_raidz_io_start, 2136 789 ahrens vdev_raidz_io_done, 2137 789 ahrens vdev_raidz_state_change, 2138 789 ahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2139 789 ahrens B_FALSE /* not a leaf vdev */ 2140 789 ahrens }; 2141