/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Support routines for managing per-CPU state.
30 */ 31 32 #include <cmd_cpu.h> 33 #include <cmd_mem.h> 34 #include <cmd.h> 35 36 #include <stdio.h> 37 #include <string.h> 38 #include <strings.h> 39 #include <errno.h> 40 #include <kstat.h> 41 #include <fm/fmd_api.h> 42 #include <sys/async.h> 43 #include <sys/fm/protocol.h> 44 #include <sys/fm/cpu/UltraSPARC-T1.h> 45 #include <sys/niagararegs.h> 46 #include <cmd_hc_sun4v.h> 47 48 int cmd_afsr_check(fmd_hdl_t *, uint64_t, cmd_errcl_t, uint8_t *); 49 50 const errdata_t l3errdata = 51 { &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_CPU_L3DATA }; 52 const errdata_t n1l2errdata = 53 { &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_CPU_L2DATA }; 54 const errdata_t n2ce_l2errdata = 55 { &cmd.cmd_l2data_serd, "l2data-c", CMD_PTR_CPU_L2DATA }; 56 const errdata_t n2ue_l2errdata = 57 { &cmd.cmd_l2data_serd, "l2data-u", CMD_PTR_CPU_L2DATA }; 58 const errdata_t miscregsdata = 59 { &cmd.cmd_miscregs_serd, "misc_reg", CMD_PTR_CPU_MISC_REGS }; 60 const errdata_t dcachedata = 61 { &cmd.cmd_dcache_serd, "dcache", CMD_PTR_CPU_DCACHE }; 62 const errdata_t icachedata = 63 { &cmd.cmd_icache_serd, "icache", CMD_PTR_CPU_ICACHE }; 64 65 static int 66 cmd_xr_error_type(cmd_errcl_t clcode) 67 { 68 if (CMD_ERRCL_ISMISCREGS(clcode)) 69 return (MISCREGS_ERR); 70 else if (CMD_ERRCL_ISL2XXCU(clcode)) 71 return (L2_ERR); 72 else if (CMD_ERRCL_ISL2ND(clcode)) 73 return (L2ND_ERR); 74 else if (CMD_ERRCL_ISMEM(clcode)) 75 return (MEM_ERR); 76 else if (CMD_ERRCL_ISDCDP(clcode)) 77 return (DCDP_ERR); 78 else if (CMD_ERRCL_ISICDP(clcode)) 79 return (ICDP_ERR); 80 else if (CMD_ERRCL_REMOTEL2(clcode)) 81 return (REMOTE_L2ERR); 82 else 83 return (UNKNOWN_ERR); 84 } 85 86 void 87 cmd_fill_errdata(cmd_errcl_t clcode, cmd_cpu_t *cpu, cmd_case_t **cc, 88 const errdata_t **ed) 89 { 90 int err_type; 91 92 err_type = cmd_xr_error_type(clcode); 93 switch (err_type) { 94 case MISCREGS_ERR: 95 *ed = &miscregsdata; 96 *cc = &cpu->cpu_misc_regs; 97 break; 98 case L2_ERR: 99 case REMOTE_L2ERR: 100 if (cpu->cpu_type == 
CPU_ULTRASPARC_T1) { 101 *ed = &n1l2errdata; 102 *cc = &cpu->cpu_l2data; 103 } else { 104 if (CMD_ERRCL_ISL2CE(clcode)) { 105 *ed = &n2ce_l2errdata; 106 *cc = &cpu->cpu_l2data; 107 } else { 108 *ed = &n2ue_l2errdata; 109 *cc = &cpu->cpu_l2data; 110 } 111 } 112 break; 113 case DCDP_ERR: 114 *ed = &dcachedata; 115 *cc = &cpu->cpu_dcache; 116 break; 117 case ICDP_ERR: 118 *ed = &icachedata; 119 *cc = &cpu->cpu_icache; 120 break; 121 /* 122 * When an error goes through the train, it requires 123 * to have cmd_case_t & errdata_t structures even it is not 124 * diagnosed when the error is resolved. Sun4v does 125 * does not have a L3 error, but the L3 cpu case was defined, 126 * so its data structures are used for the default cases. 127 */ 128 default: 129 *ed = &l3errdata; 130 *cc = &cpu->cpu_l3data; 131 break; 132 } 133 } 134 135 int 136 cmd_afar_status_check(uint8_t afar_status, cmd_errcl_t clcode) 137 { 138 139 /* 140 * There is no L2 data for a remote write back 141 * cache error in the ereport, so skip the status check 142 */ 143 if (clcode == CMD_ERRCL_WBUE) 144 return (0); 145 146 if (afar_status == AFLT_STAT_VALID) 147 return (0); 148 return (-1); 149 } 150 151 /* 152 * Search for the entry that matches the ena and the AFAR 153 * if we have a valid AFAR, otherwise search for the entry 154 * that its's ena is < delta ENA. 
155 */ 156 /*ARGSUSED*/ 157 cmd_xxcu_trw_t * 158 cmd_trw_lookup(uint64_t ena, uint8_t afar_status, uint64_t afar) 159 { 160 int i; 161 162 if (afar_status == AFLT_STAT_VALID) { 163 for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) { 164 if (cmd.cmd_xxcu_trw[i].trw_ena != 0) { 165 if ((llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena) < 166 cmd.cmd_delta_ena) && 167 (cmd.cmd_xxcu_trw[i].trw_afar == afar)) 168 return (&cmd.cmd_xxcu_trw[i]); 169 } 170 } 171 } 172 173 for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) { 174 if (cmd.cmd_xxcu_trw[i].trw_ena != 0) { 175 if (llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena) 176 < cmd.cmd_delta_ena) 177 return (&cmd.cmd_xxcu_trw[i]); 178 } 179 } 180 181 return (NULL); 182 } 183 184 cmd_errcl_t 185 cmd_get_nextbit(cmd_errcl_t trw_mask) 186 { 187 cmd_errcl_t tmp_mask = 0; 188 cmd_errcl_t tmp; 189 int i; 190 191 for (i = 0; i < 64; i++) { 192 tmp = (0x0000000000000001ULL << i); 193 if (tmp & trw_mask) { 194 tmp_mask = tmp; 195 break; 196 } 197 } 198 return (tmp_mask); 199 } 200 201 /* 202 * For a resolved error, its error code will be paired with 203 * each error code in the train mask and compared against the 204 * pre-defined trains in the cmd_cpu.c to determine if the error 205 * is in the train. 
206 */ 207 cmd_errcl_t 208 cmd_combine_two_train(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err) 209 { 210 cmd_errcl_t tmp_mask = 0; 211 cmd_errcl_t train_mask = 0; 212 cmd_errcl_t cause = 0; 213 cmd_errcl_t error_mask = trw_mask ^ resolved_err; 214 215 while (error_mask) { 216 tmp_mask = cmd_get_nextbit(error_mask); 217 if (tmp_mask == 0) 218 break; 219 train_mask = tmp_mask | resolved_err; 220 cause = cmd_xxcu_train_match(train_mask); 221 if (cause) { 222 return (cause); 223 } 224 error_mask = error_mask ^ tmp_mask; 225 } 226 return (0); 227 } 228 229 cmd_errcl_t 230 cmd_train_match(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err) 231 { 232 return (cmd_combine_two_train(trw_mask, resolved_err)); 233 } 234 235 int 236 cmd_xr_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr, cmd_errcl_t clcode) 237 { 238 uint64_t niagara_l2_afsr = 0; 239 int errtype; 240 241 errtype = cmd_xr_error_type(clcode); 242 /* 243 * skip the fill data for the errors which is not L2 errors. 244 */ 245 if (errtype != L2_ERR) { 246 fmd_hdl_debug(hdl, "Skip fill L2 data for errtype %d\n", 247 errtype); 248 return (0); 249 } 250 251 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR, 252 &niagara_l2_afsr) != 0 && 253 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR, 254 &niagara_l2_afsr) != 0) { 255 fmd_hdl_debug(hdl, "No L2 AFSR data"); 256 return (-1); 257 } 258 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFAR, 259 &xr->xr_afar) != 0 && 260 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_EAR, 261 &xr->xr_afar) != 0) { 262 fmd_hdl_debug(hdl, "No L2 AFAR data"); 263 return (-1); 264 } 265 if (nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_NAME_L2_SYND, 266 &xr->xr_synd) != 0) { 267 /* Niagara-2 doesn't provide separate (redundant) l2-synd */ 268 xr->xr_synd = niagara_l2_afsr & NI2_L2AFSR_SYND; 269 } 270 271 if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode, 272 &xr->xr_synd_status) != 0) { 273 fmd_hdl_debug(hdl, "Invalid L2 syndrome"); 274 return (-1); 275 } 276 
277 xr->xr_afar_status = xr->xr_synd_status; 278 return (0); 279 } 280 281 int 282 cmd_cpu_synd_check(uint32_t synd, cmd_errcl_t clcode) 283 { 284 int i; 285 286 /* 287 * Niagara L2 fetches from a memory location containing a UE 288 * are given a poison syndrome in one or more 7 bit subsyndromes 289 * each covering one of 4 4 byte checkwords. 290 * 291 * 0 is an invalid syndrome because it denotes no error, but 292 * is associated with an ereport -- meaning there WAS an error. 293 */ 294 /* 295 * HW does not store the syndrome value for write-back cache 296 * error, so skip the synd check for L2 write-back error 297 */ 298 if (CMD_ERRCL_L2UE_WRITEBACK(clcode)) 299 return (0); 300 301 if (synd == 0) 302 return (-1); 303 304 for (i = 0; i < 4; i++) { 305 if (((synd >> i*NI_L2_POISON_SYND_SIZE) & 306 NI_L2_POISON_SYND_MASK) == NI_L2_POISON_SYND_FROM_DAU) 307 return (-1); 308 } 309 return (0); 310 } 311 312 int 313 cmd_afsr_check(fmd_hdl_t *hdl, uint64_t afsr, 314 cmd_errcl_t clcode, uint8_t *stat_val) 315 { 316 /* 317 * Set Niagara afar and synd validity. 318 * For a given set of error registers, the payload value is valid iff 319 * no higher priority error status bit is set. See niagararegs.h 320 * for error status bit values and priority settings. 321 */ 322 switch (clcode) { 323 case CMD_ERRCL_LDAU: 324 case CMD_ERRCL_LDSU: 325 case CMD_ERRCL_DL2U: 326 case CMD_ERRCL_IL2U: 327 *stat_val = 328 ((afsr & NI_L2AFSR_P02) == 0) ? 329 AFLT_STAT_VALID: AFLT_STAT_INVALID; 330 break; 331 case CMD_ERRCL_LDWU: 332 *stat_val = 333 ((afsr & NI_L2AFSR_P03) == 0) ? 334 AFLT_STAT_VALID : AFLT_STAT_INVALID; 335 break; 336 case CMD_ERRCL_LDRU: 337 *stat_val = 338 ((afsr & NI_L2AFSR_P04) == 0) ? 339 AFLT_STAT_VALID : AFLT_STAT_INVALID; 340 break; 341 case CMD_ERRCL_LDAC: 342 case CMD_ERRCL_LDSC: 343 *stat_val = 344 ((afsr & NI_L2AFSR_P08) == 0) ? 345 AFLT_STAT_VALID : AFLT_STAT_INVALID; 346 break; 347 case CMD_ERRCL_LDWC: 348 *stat_val = 349 ((afsr & NI_L2AFSR_P09) == 0) ? 
350 AFLT_STAT_VALID : AFLT_STAT_INVALID; 351 break; 352 case CMD_ERRCL_LDRC: 353 *stat_val = 354 ((afsr & NI_L2AFSR_P10) == 0) ? 355 AFLT_STAT_VALID : AFLT_STAT_INVALID; 356 break; 357 default: 358 fmd_hdl_debug(hdl, "Niagara unrecognized l2cache error\n"); 359 return (-1); 360 } 361 return (0); 362 } 363 364 365 int 366 cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t clcode, 367 uint64_t *afar) 368 { 369 uint64_t niagara_l2_afsr = 0; 370 uint8_t stat_val; 371 372 /* 373 * In Niagara-1, we carried forward the register names afsr and afar 374 * in ereports from sun4u, even though the hardware registers were 375 * named esr and ear respectively. In Niagara-2 we decided to conform 376 * to the hardware names. 377 */ 378 379 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR, 380 &niagara_l2_afsr) != 0 && 381 nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR, 382 &niagara_l2_afsr) != 0) 383 return (-1); 384 385 if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode, &stat_val) != 0) 386 return (-1); 387 388 if (stat_val == AFLT_STAT_VALID) { 389 if (nvlist_lookup_uint64(nvl, 390 FM_EREPORT_PAYLOAD_NAME_L2_AFAR, afar) == 0 || 391 nvlist_lookup_uint64(nvl, 392 FM_EREPORT_PAYLOAD_NAME_L2_EAR, afar) == 0) 393 return (0); 394 } 395 return (-1); 396 } 397 398 /* 399 * sun4v cmd_cpu_get_frustr expects a 'cpufru' element in 'detector' FMRI 400 * of ereport (which is stored as 'asru' of cmd_cpu_t). For early sun4v, 401 * this was mistakenly spec'ed as "hc://MB" instead of "hc:///component=MB", 402 * so this situation must be remediated when found. 
403 */ 404 405 char * 406 cmd_cpu_getfrustr(fmd_hdl_t *hdl, cmd_cpu_t *cp) 407 { 408 char *frustr; 409 nvlist_t *asru = cp->cpu_asru_nvl; 410 411 if (nvlist_lookup_string(asru, FM_FMRI_CPU_CPUFRU, &frustr) == 0) { 412 fmd_hdl_debug(hdl, "cmd_cpu_getfrustr: cpufru=%s\n", frustr); 413 if (strncmp(frustr, CPU_FRU_FMRI, 414 sizeof (CPU_FRU_FMRI) -1) == 0) 415 return (fmd_hdl_strdup(hdl, frustr, FMD_SLEEP)); 416 else { 417 char *s1, *s2; 418 size_t frustrlen; 419 420 s2 = strstr(frustr, "MB"); 421 if ((s2 == NULL) || strcmp(s2, EMPTY_STR) == 0) { 422 fmd_hdl_debug(hdl, 423 "cmd_cpu_getfrustr: no cpufru"); 424 return (NULL); 425 } 426 frustrlen = strlen(s2) + sizeof (CPU_FRU_FMRI); 427 s1 = fmd_hdl_alloc(hdl, frustrlen, FMD_SLEEP); 428 s1 = strcpy(s1, CPU_FRU_FMRI); 429 s1 = strcat(s1, s2); 430 fmd_hdl_debug(hdl, "cmd_cpu_getfrustr frustr=%s\n", s1); 431 return (s1); 432 } 433 } 434 (void) cmd_set_errno(ENOENT); 435 return (NULL); 436 } 437 438 char * 439 cmd_cpu_getpartstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) { 440 char *partstr; 441 nvlist_t *asru = cp->cpu_asru_nvl; 442 443 if (nvlist_lookup_string(asru, FM_FMRI_HC_PART, &partstr) == 0) 444 return (fmd_hdl_strdup(hdl, partstr, FMD_SLEEP)); 445 else 446 return (NULL); 447 } 448 449 char * 450 cmd_cpu_getserialstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) { 451 char *serialstr; 452 nvlist_t *asru = cp->cpu_asru_nvl; 453 454 if (nvlist_lookup_string(asru, FM_FMRI_HC_SERIAL_ID, &serialstr) == 0) 455 return (fmd_hdl_strdup(hdl, serialstr, FMD_SLEEP)); 456 else 457 return (NULL); 458 } 459 460 nvlist_t * 461 cmd_cpu_mkfru(fmd_hdl_t *hdl, char *frustr, char *serialstr, char *partstr) 462 { 463 464 nvlist_t *fru; 465 if (strncmp(frustr, CPU_FRU_FMRI, sizeof (CPU_FRU_FMRI) - 1) != 0) 466 return (NULL); 467 fru = cmd_mkboard_fru(hdl, frustr, serialstr, partstr); 468 return (fru); 469 } 470