1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Ereport-handling routines for memory errors 28 */ 29 30 #include <cmd_mem.h> 31 #include <cmd_dimm.h> 32 #include <cmd_bank.h> 33 #include <cmd_page.h> 34 #include <cmd_cpu.h> 35 #ifdef sun4u 36 #include <cmd_dp.h> 37 #include <cmd_dp_page.h> 38 #endif 39 #include <cmd.h> 40 41 #include <strings.h> 42 #include <string.h> 43 #include <errno.h> 44 #include <fm/fmd_api.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/async.h> 47 #include <sys/errclassify.h> 48 #include <assert.h> 49 50 #ifdef sun4v 51 #include <cmd_hc_sun4v.h> 52 #endif /* sun4v */ 53 54 struct ce_name2type { 55 const char *name; 56 ce_dispact_t type; 57 }; 58 59 ce_dispact_t 60 cmd_mem_name2type(const char *name, int minorvers) 61 { 62 static const struct ce_name2type old[] = { 63 { ERR_TYPE_DESC_INTERMITTENT, CE_DISP_INTERMITTENT }, 64 { ERR_TYPE_DESC_PERSISTENT, CE_DISP_PERS }, 65 { ERR_TYPE_DESC_STICKY, CE_DISP_STICKY }, 66 { ERR_TYPE_DESC_UNKNOWN, CE_DISP_UNKNOWN }, 67 { NULL } 68 }; 69 static const struct ce_name2type new[] = { 70 { CE_DISP_DESC_U, CE_DISP_UNKNOWN }, 71 { CE_DISP_DESC_I, CE_DISP_INTERMITTENT }, 72 { CE_DISP_DESC_PP, CE_DISP_POSS_PERS }, 73 { CE_DISP_DESC_P, CE_DISP_PERS }, 74 { CE_DISP_DESC_L, CE_DISP_LEAKY }, 75 { CE_DISP_DESC_PS, CE_DISP_POSS_STICKY }, 76 { CE_DISP_DESC_S, CE_DISP_STICKY }, 77 { NULL } 78 }; 79 const struct ce_name2type *names = (minorvers == 0) ? &old[0] : &new[0]; 80 const struct ce_name2type *tp; 81 82 for (tp = names; tp->name != NULL; tp++) 83 if (strcasecmp(name, tp->name) == 0) 84 return (tp->type); 85 86 return (CE_DISP_UNKNOWN); 87 } 88 89 static void 90 ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 91 { 92 nvlist_t *flt; 93 fmd_case_t *cp; 94 cmd_dimm_t *d; 95 nvlist_t *dflt; 96 uint_t nret, dret; 97 int foundrw; 98 99 if (dimm->dimm_flags & CMD_MEM_F_FAULTING) { 100 /* We've already complained about this DIMM */ 101 return; 102 } 103 104 nret = dimm->dimm_nretired; 105 if (dimm->dimm_bank != NULL) 106 nret += dimm->dimm_bank->bank_nretired; 107 108 if (!cmd_mem_thresh_check(hdl, nret)) 109 return; /* Don't warn until over specified % of system memory */ 110 111 /* Look for CEs on DIMMs in other banks */ 112 for (foundrw = 0, dret = 0, d = cmd_list_next(&cmd.cmd_dimms); 113 d != NULL; d = cmd_list_next(d)) { 114 if (d == dimm) { 115 dret += d->dimm_nretired; 116 continue; 117 } 118 119 if (dimm->dimm_bank != NULL && d->dimm_bank == dimm->dimm_bank) 120 continue; 121 122 if (d->dimm_nretired > cmd.cmd_thresh_abs_badrw) { 123 foundrw = 1; 124 dret += d->dimm_nretired; 125 } 126 } 127 128 if (foundrw) { 129 /* 130 * Found a DIMM in another bank with a significant number of 131 * retirements. Something strange is going on, perhaps in the 132 * datapath or with a bad CPU. A real person will need to 133 * figure out what's really happening. Emit a fault designed 134 * to trigger just that. 135 */ 136 cp = fmd_case_open(hdl, NULL); 137 for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; 138 d = cmd_list_next(d)) { 139 140 if (d != dimm && d->dimm_bank != NULL && 141 d->dimm_bank == dimm->dimm_bank) 142 continue; 143 144 if (d->dimm_nretired <= cmd.cmd_thresh_abs_badrw) 145 continue; 146 147 if (!(d->dimm_flags & CMD_MEM_F_FAULTING)) { 148 d->dimm_flags |= CMD_MEM_F_FAULTING; 149 cmd_dimm_dirty(hdl, d); 150 } 151 152 flt = cmd_dimm_create_fault(hdl, d, 153 "fault.memory.datapath", 154 d->dimm_nretired * 100 / dret); 155 fmd_case_add_suspect(hdl, cp, flt); 156 } 157 158 fmd_case_solve(hdl, cp); 159 return; 160 } 161 162 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 163 cmd_dimm_dirty(hdl, dimm); 164 165 cp = fmd_case_open(hdl, NULL); 166 dflt = cmd_dimm_create_fault(hdl, dimm, 167 "fault.memory.dimm-page-retires-excessive", 168 CMD_FLTMAXCONF); 169 fmd_case_add_suspect(hdl, cp, dflt); 170 fmd_case_solve(hdl, cp); 171 } 172 173 /* Create a fresh index block for MQSC CE correlation. */ 174 175 cmd_mq_t * 176 mq_create(fmd_hdl_t *hdl, fmd_event_t *ep, 177 uint64_t afar, uint16_t upos, uint64_t now) 178 { 179 cmd_mq_t *cp; 180 uint16_t ckwd = (afar & 0x30) >> 4; 181 182 cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP); 183 cp->mq_tstamp = now; 184 cp->mq_ckwd = ckwd; 185 cp->mq_phys_addr = afar; 186 cp->mq_unit_position = upos; 187 cp->mq_dram = cmd_upos2dram(upos); 188 cp->mq_ep = ep; 189 cp->mq_serdnm = 190 cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos); 191 192 /* 193 * Create SERD to keep this event from being removed 194 * by fmd which may not know there is an event pointer 195 * saved here. This SERD is *never* meant to fire. 196 * NOTE: wouldn't need to do this if there were an fmd 197 * api to 'hold' an event. 198 */ 199 if (fmd_serd_exists(hdl, cp->mq_serdnm)) { 200 /* clean up dup */ 201 fmd_serd_destroy(hdl, cp->mq_serdnm); 202 } 203 fmd_serd_create(hdl, cp->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT); 204 (void) fmd_serd_record(hdl, cp->mq_serdnm, ep); 205 206 return (cp); 207 } 208 209 /* Destroy MQSC tracking block as well as event tracking SERD. */ 210 211 cmd_mq_t * 212 mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip) 213 { 214 cmd_mq_t *jp = cmd_list_next(ip); 215 216 if (ip->mq_serdnm != NULL) { 217 if (fmd_serd_exists(hdl, ip->mq_serdnm)) { 218 fmd_serd_destroy(hdl, ip->mq_serdnm); 219 } 220 fmd_hdl_strfree(hdl, ip->mq_serdnm); 221 ip->mq_serdnm = NULL; 222 } 223 cmd_list_delete(lp, &ip->mq_l); 224 fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t)); 225 226 return (jp); 227 } 228 229 /* 230 * Add an index block for a new CE, sorted 231 * a) by ascending unit position 232 * b) order of arrival (~= time order) 233 */ 234 235 void 236 mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep, 237 uint64_t afar, uint16_t synd, uint64_t now) 238 { 239 cmd_mq_t *ip, *jp; 240 int cw, unit_position; 241 242 cw = (afar & 0x30) >> 4; /* 0:3 */ 243 if ((unit_position = cmd_synd2upos(synd)) < 0) 244 return; /* not a CE */ 245 246 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) { 247 if (ip->mq_unit_position > unit_position) { 248 /* list is in unit position order */ 249 break; 250 } else if (ip->mq_unit_position == unit_position && 251 ip->mq_phys_addr == afar) { 252 /* 253 * Found a duplicate cw, unit_position, and afar. 254 * Delete this node, to be superseded by the new 255 * node added below. 256 */ 257 ip = mq_destroy(hdl, &dimm->mq_root[cw], ip); 258 } else { 259 ip = cmd_list_next(ip); 260 } 261 } 262 263 jp = mq_create(hdl, ep, afar, unit_position, now); 264 if (ip == NULL) 265 cmd_list_append(&dimm->mq_root[cw], jp); 266 else 267 cmd_list_insert_before(&dimm->mq_root[cw], ip, jp); 268 } 269 270 /* 271 * Prune the MQSC index lists (one for each checkword), by deleting 272 * outdated index blocks from each list. 273 */ 274 275 void 276 mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now) 277 { 278 cmd_mq_t *ip; 279 int cw; 280 281 for (cw = 0; cw < CMD_MAX_CKWDS; cw++) { 282 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) { 283 if (ip->mq_tstamp < now - CMD_MQ_TIMELIM) { 284 /* 285 * This event has timed out - delete the 286 * mq block as well as serd for the event. 287 */ 288 ip = mq_destroy(hdl, &dimm->mq_root[cw], ip); 289 } else { 290 /* tstamp < now - ce_t */ 291 ip = cmd_list_next(ip); 292 } 293 } /* per checkword */ 294 } /* cw = 0...3 */ 295 } 296 297 /* 298 * Check the MQSC index lists (one for each checkword) by making a 299 * complete pass through each list, checking if the criteria for either 300 * Rule 4A or 4B have been met. Rule 4A checking is done for each checkword; 301 * 4B check is done at end. 302 * 303 * Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from 304 * two or more different physical addresses on each of two or more different 305 * bit positions from the same DIMM within 72 hours of each other, and all 306 * the addresses are in the same relative checkword (that is, the AFARs 307 * are all the same modulo 64). [Note: This means at least 4 CEs; two 308 * from one bit position, with unique addresses, and two from another, 309 * also with unique addresses, and the lower 6 bits of all the addresses 310 * are the same." 311 * 312 * Rule 4B: fault a DIMM "whenever Solaris reports two or more CEs from 313 * two or more different physical addresses on each of three or more 314 * different outputs from the same DRAM within 72 hours of each other, as 315 * long as the three outputs do not all correspond to the same relative 316 * bit position in their respective checkwords. [Note: This means at least 317 * 6 CEs; two from one DRAM output signal, with unique addresses, two from 318 * another output from the same DRAM, also with unique addresses, and two 319 * more from yet another output from the same DRAM, again with unique 320 * addresses, as long as the three outputs do not all correspond to the 321 * same relative bit position in their respective checkwords.]" 322 */ 323 324 void 325 mq_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 326 { 327 int upos_pairs, curr_upos, cw, i, j, k; 328 nvlist_t *flt; 329 typedef struct upos_pair { 330 int upos; 331 int dram; 332 cmd_mq_t *mq1; 333 cmd_mq_t *mq2; 334 } upos_pair_t; 335 upos_pair_t upos_array[8]; /* max per cw = 2, * 4 cw's */ 336 cmd_mq_t *ip; 337 338 /* 339 * Each upos_array[] member represents a pair of CEs for the same 340 * unit position (symbol) which on a sun4u is a bit, and on sun4v 341 * is a (4 bit) nibble. 342 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM 343 * for rule 4A, and same DRAM for rule 4B) for a violation - this 344 * is why CE pairs are tracked. 345 */ 346 upos_pairs = 0; 347 upos_array[0].mq1 = NULL; 348 349 /* Loop through all checkwords */ 350 for (cw = 0; cw < CMD_MAX_CKWDS; cw++) { 351 i = upos_pairs; 352 curr_upos = -1; 353 354 /* 355 * mq_root[] is an array of cumulative lists of CEs 356 * indexed by checkword where the list is in unit position 357 * order. Loop through checking for duplicate unit position 358 * entries (filled in at mq_create()). 359 * The upos_array[] is filled in each time a duplicate 360 * unit position is found; the first time through the loop 361 * of a unit position sets curr_upos but does not fill in 362 * upos_array[] until the second symbol is found. 363 */ 364 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; 365 ip = cmd_list_next(ip)) { 366 if (curr_upos != ip->mq_unit_position) { 367 /* Set initial current position */ 368 curr_upos = ip->mq_unit_position; 369 } else if (i > upos_pairs && 370 curr_upos == upos_array[i-1].upos) { 371 /* 372 * Only keep track of CE pairs; skip 373 * triples, quads, etc... 374 */ 375 continue; 376 } else if (upos_array[i].mq1 == NULL) { 377 /* 378 * Have a pair, add to upos_array[]. 379 */ 380 upos_array[i].upos = curr_upos; 381 upos_array[i].dram = ip->mq_dram; 382 upos_array[i].mq1 = cmd_list_prev(ip); 383 upos_array[i].mq2 = ip; 384 upos_array[++i].mq1 = NULL; 385 } 386 } 387 388 if (i - upos_pairs >= 2) { 389 /* Rule 4A Violation. */ 390 flt = cmd_dimm_create_fault(hdl, 391 dimm, "fault.memory.dimm-ue-imminent", 392 CMD_FLTMAXCONF); 393 for (j = upos_pairs; j < i; j++) { 394 fmd_case_add_ereport(hdl, 395 dimm->dimm_case.cc_cp, 396 upos_array[j].mq1->mq_ep); 397 fmd_case_add_ereport(hdl, 398 dimm->dimm_case.cc_cp, 399 upos_array[j].mq2->mq_ep); 400 } 401 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 402 cmd_dimm_dirty(hdl, dimm); 403 fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt); 404 fmd_case_solve(hdl, dimm->dimm_case.cc_cp); 405 return; 406 } 407 upos_pairs = i; 408 assert(upos_pairs < 8); 409 } 410 411 if (upos_pairs < 3) 412 return; /* 4B violation needs at least 3 pairs */ 413 414 /* 415 * Walk through checking for a rule 4B violation. 416 * Since we only keep track of two CE pairs per CW we'll only have 417 * a max of potentially 8 elements in the array. So as not to run 418 * off the end of the array, need to be careful with i and j indexes. 419 */ 420 for (i = 0; i < (upos_pairs - 2); i++) { 421 if (upos_array[i].dram == -1) { 422 /* 423 * Don't match failure codes. There is 424 * no platform DRAM xlation - return. 425 */ 426 fmd_hdl_debug(hdl, "Unable to determine DRAM" 427 " from the unit position\n"); 428 return; 429 } 430 431 for (j = i+1; j < (upos_pairs - 1); j++) { 432 if (upos_array[i].dram != upos_array[j].dram) { 433 /* 434 * These two pairs aren't the same dram; 435 * continue looking for pairs that are. 436 */ 437 continue; 438 } 439 440 for (k = j+1; k < upos_pairs; k++) { 441 if (upos_array[j].dram != upos_array[k].dram) { 442 /* 443 * DRAMs must be the same for a rule 444 * 4B violation. Continue looking for 445 * pairs that have the same DRAMs. 446 */ 447 continue; 448 } 449 450 if ((upos_array[i].upos != 451 upos_array[j].upos) || 452 (upos_array[j].upos != 453 upos_array[k].upos)) { 454 /* 455 * We've determined that all the dram 456 * CEs are the same dram, if all the 457 * unit positions are not the same, 458 * then we have a rule 4B violation. 459 */ 460 flt = cmd_dimm_create_fault(hdl, dimm, 461 "fault.memory.dram-ue-imminent", 462 CMD_FLTMAXCONF); 463 fmd_case_add_ereport(hdl, 464 dimm->dimm_case.cc_cp, 465 upos_array[i].mq1->mq_ep); 466 fmd_case_add_ereport(hdl, 467 dimm->dimm_case.cc_cp, 468 upos_array[i].mq2->mq_ep); 469 fmd_case_add_ereport(hdl, 470 dimm->dimm_case.cc_cp, 471 upos_array[j].mq1->mq_ep); 472 fmd_case_add_ereport(hdl, 473 dimm->dimm_case.cc_cp, 474 upos_array[j].mq2->mq_ep); 475 fmd_case_add_ereport(hdl, 476 dimm->dimm_case.cc_cp, 477 upos_array[k].mq1->mq_ep); 478 fmd_case_add_ereport(hdl, 479 dimm->dimm_case.cc_cp, 480 upos_array[k].mq2->mq_ep); 481 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 482 cmd_dimm_dirty(hdl, dimm); 483 fmd_case_add_suspect(hdl, 484 dimm->dimm_case.cc_cp, flt); 485 fmd_case_solve(hdl, 486 dimm->dimm_case.cc_cp); 487 return; 488 } 489 } 490 } 491 } 492 } 493 494 /*ARGSUSED*/ 495 cmd_evdisp_t 496 cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 497 const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd, 498 uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru) 499 { 500 cmd_dimm_t *dimm; 501 cmd_page_t *page; 502 const char *uuid; 503 504 if (afar_status != AFLT_STAT_VALID || 505 synd_status != AFLT_STAT_VALID) 506 return (CMD_EVD_UNUSED); 507 508 if ((page = cmd_page_lookup(afar)) != NULL && 509 page->page_case.cc_cp != NULL && 510 fmd_case_solved(hdl, page->page_case.cc_cp)) 511 return (CMD_EVD_REDUND); 512 513 #ifdef sun4u 514 if (cmd_dp_error(hdl) || cmd_dp_fault(hdl, afar)) { 515 CMD_STAT_BUMP(dp_ignored_ce); 516 return (CMD_EVD_UNUSED); 517 } 518 #endif /* sun4u */ 519 520 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 521 CMD_STAT_BUMP(bad_mem_asru); 522 return (CMD_EVD_BAD); 523 } 524 525 if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL && 526 (dimm = cmd_dimm_create(hdl, asru)) == NULL) 527 return (CMD_EVD_UNUSED); 528 529 if (dimm->dimm_case.cc_cp == NULL) { 530 dimm->dimm_case.cc_cp = cmd_case_create(hdl, 531 &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid); 532 } 533 534 /* 535 * Add to MQSC correlation lists all CEs which pass validity 536 * checks above. 537 */ 538 if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) { 539 uint64_t *now; 540 uint_t nelem; 541 if (nvlist_lookup_uint64_array(nvl, 542 "__tod", &now, &nelem) == 0) { 543 544 mq_add(hdl, dimm, ep, afar, synd, *now); 545 mq_prune(hdl, dimm, *now); 546 mq_check(hdl, dimm); 547 } 548 } 549 550 switch (type) { 551 case CE_DISP_UNKNOWN: 552 CMD_STAT_BUMP(ce_unknown); 553 return (CMD_EVD_UNUSED); 554 case CE_DISP_INTERMITTENT: 555 CMD_STAT_BUMP(ce_interm); 556 return (CMD_EVD_UNUSED); 557 case CE_DISP_POSS_PERS: 558 CMD_STAT_BUMP(ce_ppersis); 559 break; 560 case CE_DISP_PERS: 561 CMD_STAT_BUMP(ce_persis); 562 break; 563 case CE_DISP_LEAKY: 564 CMD_STAT_BUMP(ce_leaky); 565 break; 566 case CE_DISP_POSS_STICKY: 567 { 568 uchar_t ptnrinfo = CE_XDIAG_PTNRINFO(disp); 569 570 if (CE_XDIAG_TESTVALID(ptnrinfo)) { 571 int ce1 = CE_XDIAG_CE1SEEN(ptnrinfo); 572 int ce2 = CE_XDIAG_CE2SEEN(ptnrinfo); 573 574 if (ce1 && ce2) { 575 /* Should have been CE_DISP_STICKY */ 576 return (CMD_EVD_BAD); 577 } else if (ce1) { 578 /* Partner could see and could fix CE */ 579 CMD_STAT_BUMP(ce_psticky_ptnrclrd); 580 } else { 581 /* Partner could not see ce1 (ignore ce2) */ 582 CMD_STAT_BUMP(ce_psticky_ptnrnoerr); 583 } 584 } else { 585 CMD_STAT_BUMP(ce_psticky_noptnr); 586 } 587 return (CMD_EVD_UNUSED); 588 } 589 case CE_DISP_STICKY: 590 CMD_STAT_BUMP(ce_sticky); 591 break; 592 default: 593 return (CMD_EVD_BAD); 594 } 595 596 if (page == NULL) 597 page = cmd_page_create(hdl, asru, afar); 598 599 if (page->page_case.cc_cp == NULL) { 600 page->page_case.cc_cp = cmd_case_create(hdl, 601 &page->page_header, CMD_PTR_PAGE_CASE, &uuid); 602 } 603 604 switch (type) { 605 case CE_DISP_POSS_PERS: 606 case CE_DISP_PERS: 607 fmd_hdl_debug(hdl, "adding %sPersistent event to CE serd " 608 "engine\n", type == CE_DISP_POSS_PERS ? "Possible-" : ""); 609 610 if (page->page_case.cc_serdnm == NULL) { 611 page->page_case.cc_serdnm = cmd_page_serdnm_create(hdl, 612 "page", page->page_physbase); 613 614 fmd_serd_create(hdl, page->page_case.cc_serdnm, 615 fmd_prop_get_int32(hdl, "ce_n"), 616 fmd_prop_get_int64(hdl, "ce_t")); 617 } 618 619 if (fmd_serd_record(hdl, page->page_case.cc_serdnm, ep) == 620 FMD_B_FALSE) 621 return (CMD_EVD_OK); /* engine hasn't fired */ 622 623 fmd_hdl_debug(hdl, "ce page serd fired\n"); 624 fmd_case_add_serd(hdl, page->page_case.cc_cp, 625 page->page_case.cc_serdnm); 626 fmd_serd_reset(hdl, page->page_case.cc_serdnm); 627 break; /* to retire */ 628 629 case CE_DISP_LEAKY: 630 case CE_DISP_STICKY: 631 fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep); 632 break; /* to retire */ 633 } 634 635 dimm->dimm_nretired++; 636 dimm->dimm_retstat.fmds_value.ui64++; 637 cmd_dimm_dirty(hdl, dimm); 638 639 cmd_page_fault(hdl, asru, cmd_dimm_fru(dimm), ep, afar); 640 ce_thresh_check(hdl, dimm); 641 642 return (CMD_EVD_OK); 643 } 644 645 /* 646 * Solve a bank case with suspect "fault.memory.bank". The caller must 647 * have populated bank->bank_case.cc_cp and is also responsible for adding 648 * associated ereport(s) to that case. 649 */ 650 void 651 cmd_bank_fault(fmd_hdl_t *hdl, cmd_bank_t *bank) 652 { 653 fmd_case_t *cp = bank->bank_case.cc_cp; 654 nvlist_t *flt; 655 656 if (bank->bank_flags & CMD_MEM_F_FAULTING) 657 return; /* Only complain once per bank */ 658 659 bank->bank_flags |= CMD_MEM_F_FAULTING; 660 cmd_bank_dirty(hdl, bank); 661 662 #ifdef sun4u 663 flt = cmd_bank_create_fault(hdl, bank, "fault.memory.bank", 664 CMD_FLTMAXCONF); 665 fmd_case_add_suspect(hdl, cp, flt); 666 #else /* sun4v */ 667 { 668 cmd_bank_memb_t *d; 669 670 /* create separate fault for each dimm in bank */ 671 672 for (d = cmd_list_next(&bank->bank_dimms); 673 d != NULL; d = cmd_list_next(d)) { 674 flt = cmd_dimm_create_fault(hdl, d->bm_dimm, 675 "fault.memory.bank", CMD_FLTMAXCONF); 676 fmd_case_add_suspect(hdl, cp, flt); 677 } 678 } 679 #endif /* sun4u */ 680 fmd_case_solve(hdl, cp); 681 } 682 683 /*ARGSUSED*/ 684 cmd_evdisp_t 685 cmd_ue_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 686 const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd, 687 uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru) 688 { 689 cmd_page_t *page; 690 cmd_bank_t *bank; 691 cmd_cpu_t *cpu; 692 693 #ifdef sun4u 694 /* 695 * Note: Currently all sun4u processors using this code share 696 * L2 and L3 cache at CMD_CPU_LEVEL_CORE. 697 */ 698 cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class, 699 CMD_CPU_LEVEL_CORE); 700 #else /* sun4v */ 701 cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class, 702 CMD_CPU_LEVEL_THREAD); 703 #endif /* sun4u */ 704 705 if (cpu == NULL) { 706 fmd_hdl_debug(hdl, "cmd_ue_common: cpu not found\n"); 707 return (CMD_EVD_UNUSED); 708 } 709 710 /* 711 * The following code applies only to sun4u, because sun4u does 712 * not poison data in L2 cache resulting from the fetch of a 713 * memory UE. 714 */ 715 716 #ifdef sun4u 717 if (afar_status != AFLT_STAT_VALID) { 718 /* 719 * Had this report's AFAR been valid, it would have 720 * contributed an address to the UE cache. We don't 721 * know what the AFAR would have been, and thus we can't 722 * add anything to the cache. If a xxU is caused by 723 * this UE, we won't be able to detect it, and will thus 724 * erroneously offline the CPU. To prevent this 725 * situation, we need to assume that all xxUs generated 726 * through the next E$ flush are attributable to the UE. 727 */ 728 cmd_cpu_uec_set_allmatch(hdl, cpu); 729 } else { 730 cmd_cpu_uec_add(hdl, cpu, afar); 731 } 732 #endif /* sun4u */ 733 734 if (synd_status != AFLT_STAT_VALID) { 735 fmd_hdl_debug(hdl, "cmd_ue_common: syndrome not valid\n"); 736 return (CMD_EVD_UNUSED); 737 } 738 739 if (cmd_mem_synd_check(hdl, afar, afar_status, synd, synd_status, 740 cpu) == CMD_EVD_UNUSED) 741 return (CMD_EVD_UNUSED); 742 743 if (afar_status != AFLT_STAT_VALID) 744 return (CMD_EVD_UNUSED); 745 746 if ((page = cmd_page_lookup(afar)) != NULL && 747 page->page_case.cc_cp != NULL && 748 fmd_case_solved(hdl, page->page_case.cc_cp)) 749 return (CMD_EVD_REDUND); 750 751 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 752 CMD_STAT_BUMP(bad_mem_asru); 753 return (NULL); 754 } 755 756 if ((bank = cmd_bank_lookup(hdl, asru)) == NULL && 757 (bank = cmd_bank_create(hdl, asru)) == NULL) 758 return (CMD_EVD_UNUSED); 759 760 #ifdef sun4v 761 { 762 nvlist_t *fmri; 763 char **snarray; 764 unsigned int i, n; 765 766 /* 767 * 1: locate the array of serial numbers inside the bank asru. 768 * 2: for each serial #, lookup its mem: FMRI in libtopo 769 * 3: ensure that each DIMM's FMRI is on bank's dimmlist 770 */ 771 772 if (nvlist_lookup_string_array(asru, 773 FM_FMRI_MEM_SERIAL_ID, &snarray, &n) != 0) 774 fmd_hdl_abort(hdl, "Cannot locate serial #s for bank"); 775 776 for (i = 0; i < n; i++) { 777 fmri = cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_MEM, 778 snarray[i]); 779 /* 780 * If dimm structure doesn't already exist for 781 * each dimm, create and link to bank. 782 */ 783 if (cmd_dimm_lookup(hdl, fmri) == NULL) 784 (void) cmd_dimm_create(hdl, fmri); 785 nvlist_free(fmri); 786 } 787 } 788 #endif /* sun4v */ 789 790 if (bank->bank_case.cc_cp == NULL) { 791 const char *uuid; 792 bank->bank_case.cc_cp = cmd_case_create(hdl, &bank->bank_header, 793 CMD_PTR_BANK_CASE, &uuid); 794 } 795 796 #ifdef sun4u 797 if (cmd_dp_error(hdl)) { 798 CMD_STAT_BUMP(dp_deferred_ue); 799 cmd_dp_page_defer(hdl, asru, ep, afar); 800 return (CMD_EVD_OK); 801 } else if (cmd_dp_fault(hdl, afar)) { 802 CMD_STAT_BUMP(dp_ignored_ue); 803 return (CMD_EVD_UNUSED); 804 } 805 #endif /* sun4u */ 806 807 fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep); 808 809 bank->bank_nretired++; 810 bank->bank_retstat.fmds_value.ui64++; 811 cmd_bank_dirty(hdl, bank); 812 813 cmd_page_fault(hdl, bank->bank_asru_nvl, cmd_bank_fru(bank), ep, afar); 814 cmd_bank_fault(hdl, bank); 815 816 return (CMD_EVD_OK); 817 } 818 819 void 820 cmd_dimm_close(fmd_hdl_t *hdl, void *arg) 821 { 822 cmd_dimm_destroy(hdl, arg); 823 } 824 825 void 826 cmd_bank_close(fmd_hdl_t *hdl, void *arg) 827 { 828 cmd_bank_destroy(hdl, arg); 829 } 830