Home | History | Annotate | Download | only in cpumem-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Ereport-handling routines for memory errors
     28  */
     29 
     30 #include <cmd_mem.h>
     31 #include <cmd_dimm.h>
     32 #include <cmd_bank.h>
     33 #include <cmd_page.h>
     34 #include <cmd_cpu.h>
     35 #ifdef sun4u
     36 #include <cmd_dp.h>
     37 #include <cmd_dp_page.h>
     38 #endif
     39 #include <cmd.h>
     40 
     41 #include <strings.h>
     42 #include <string.h>
     43 #include <errno.h>
     44 #include <fm/fmd_api.h>
     45 #include <sys/fm/protocol.h>
     46 #include <sys/async.h>
     47 #include <sys/errclassify.h>
     48 #include <assert.h>
     49 
     50 #ifdef sun4v
     51 #include <cmd_hc_sun4v.h>
     52 #endif /* sun4v */
     53 
     54 struct ce_name2type {
     55 	const char *name;
     56 	ce_dispact_t type;
     57 };
     58 
     59 ce_dispact_t
     60 cmd_mem_name2type(const char *name, int minorvers)
     61 {
     62 	static const struct ce_name2type old[] = {
     63 		{ ERR_TYPE_DESC_INTERMITTENT,	CE_DISP_INTERMITTENT },
     64 		{ ERR_TYPE_DESC_PERSISTENT,	CE_DISP_PERS },
     65 		{ ERR_TYPE_DESC_STICKY,		CE_DISP_STICKY },
     66 		{ ERR_TYPE_DESC_UNKNOWN,	CE_DISP_UNKNOWN },
     67 		{ NULL }
     68 	};
     69 	static const struct ce_name2type new[] = {
     70 		{ CE_DISP_DESC_U,		CE_DISP_UNKNOWN },
     71 		{ CE_DISP_DESC_I,		CE_DISP_INTERMITTENT },
     72 		{ CE_DISP_DESC_PP,		CE_DISP_POSS_PERS },
     73 		{ CE_DISP_DESC_P,		CE_DISP_PERS },
     74 		{ CE_DISP_DESC_L,		CE_DISP_LEAKY },
     75 		{ CE_DISP_DESC_PS,		CE_DISP_POSS_STICKY },
     76 		{ CE_DISP_DESC_S,		CE_DISP_STICKY },
     77 		{ NULL }
     78 	};
     79 	const struct ce_name2type *names = (minorvers == 0) ? &old[0] : &new[0];
     80 	const struct ce_name2type *tp;
     81 
     82 	for (tp = names; tp->name != NULL; tp++)
     83 		if (strcasecmp(name, tp->name) == 0)
     84 			return (tp->type);
     85 
     86 	return (CE_DISP_UNKNOWN);
     87 }
     88 
     89 static void
     90 ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
     91 {
     92 	nvlist_t *flt;
     93 	fmd_case_t *cp;
     94 	cmd_dimm_t *d;
     95 	nvlist_t *dflt;
     96 	uint_t nret, dret;
     97 	int foundrw;
     98 
     99 	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
    100 		/* We've already complained about this DIMM */
    101 		return;
    102 	}
    103 
    104 	nret = dimm->dimm_nretired;
    105 	if (dimm->dimm_bank != NULL)
    106 		nret += dimm->dimm_bank->bank_nretired;
    107 
    108 	if (!cmd_mem_thresh_check(hdl, nret))
    109 		return; /* Don't warn until over specified % of system memory */
    110 
    111 	/* Look for CEs on DIMMs in other banks */
    112 	for (foundrw = 0, dret = 0, d = cmd_list_next(&cmd.cmd_dimms);
    113 	    d != NULL; d = cmd_list_next(d)) {
    114 		if (d == dimm) {
    115 			dret += d->dimm_nretired;
    116 			continue;
    117 		}
    118 
    119 		if (dimm->dimm_bank != NULL && d->dimm_bank == dimm->dimm_bank)
    120 			continue;
    121 
    122 		if (d->dimm_nretired > cmd.cmd_thresh_abs_badrw) {
    123 			foundrw = 1;
    124 			dret += d->dimm_nretired;
    125 		}
    126 	}
    127 
    128 	if (foundrw) {
    129 		/*
    130 		 * Found a DIMM in another bank with a significant number of
    131 		 * retirements.  Something strange is going on, perhaps in the
    132 		 * datapath or with a bad CPU.  A real person will need to
    133 		 * figure out what's really happening.  Emit a fault designed
    134 		 * to trigger just that.
    135 		 */
    136 		cp = fmd_case_open(hdl, NULL);
    137 		for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL;
    138 		    d = cmd_list_next(d)) {
    139 
    140 			if (d != dimm && d->dimm_bank != NULL &&
    141 			    d->dimm_bank == dimm->dimm_bank)
    142 				continue;
    143 
    144 			if (d->dimm_nretired <= cmd.cmd_thresh_abs_badrw)
    145 				continue;
    146 
    147 			if (!(d->dimm_flags & CMD_MEM_F_FAULTING)) {
    148 				d->dimm_flags |= CMD_MEM_F_FAULTING;
    149 				cmd_dimm_dirty(hdl, d);
    150 			}
    151 
    152 			flt = cmd_dimm_create_fault(hdl, d,
    153 			    "fault.memory.datapath",
    154 			    d->dimm_nretired * 100 / dret);
    155 			fmd_case_add_suspect(hdl, cp, flt);
    156 		}
    157 
    158 		fmd_case_solve(hdl, cp);
    159 		return;
    160 	}
    161 
    162 	dimm->dimm_flags |= CMD_MEM_F_FAULTING;
    163 	cmd_dimm_dirty(hdl, dimm);
    164 
    165 	cp = fmd_case_open(hdl, NULL);
    166 	dflt = cmd_dimm_create_fault(hdl, dimm,
    167 	    "fault.memory.dimm-page-retires-excessive",
    168 	    CMD_FLTMAXCONF);
    169 	fmd_case_add_suspect(hdl, cp, dflt);
    170 	fmd_case_solve(hdl, cp);
    171 }
    172 
    173 /* Create a fresh index block for MQSC CE correlation. */
    174 
    175 cmd_mq_t *
    176 mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
    177     uint64_t afar, uint16_t upos, uint64_t now)
    178 {
    179 	cmd_mq_t *cp;
    180 	uint16_t ckwd = (afar & 0x30) >> 4;
    181 
    182 	cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP);
    183 	cp->mq_tstamp = now;
    184 	cp->mq_ckwd = ckwd;
    185 	cp->mq_phys_addr = afar;
    186 	cp->mq_unit_position = upos;
    187 	cp->mq_dram = cmd_upos2dram(upos);
    188 	cp->mq_ep = ep;
    189 	cp->mq_serdnm =
    190 	    cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
    191 
    192 	/*
    193 	 * Create SERD to keep this event from being removed
    194 	 * by fmd which may not know there is an event pointer
    195 	 * saved here. This SERD is *never* meant to fire.
    196 	 * NOTE: wouldn't need to do this if there were an fmd
    197 	 * api to 'hold' an event.
    198 	 */
    199 	if (fmd_serd_exists(hdl, cp->mq_serdnm)) {
    200 		/* clean up dup */
    201 		fmd_serd_destroy(hdl, cp->mq_serdnm);
    202 	}
    203 	fmd_serd_create(hdl, cp->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
    204 	(void) fmd_serd_record(hdl, cp->mq_serdnm, ep);
    205 
    206 	return (cp);
    207 }
    208 
    209 /* Destroy MQSC tracking block as well as event tracking SERD. */
    210 
    211 cmd_mq_t *
    212 mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip)
    213 {
    214 	cmd_mq_t *jp = cmd_list_next(ip);
    215 
    216 	if (ip->mq_serdnm != NULL) {
    217 		if (fmd_serd_exists(hdl, ip->mq_serdnm)) {
    218 			fmd_serd_destroy(hdl, ip->mq_serdnm);
    219 		}
    220 		fmd_hdl_strfree(hdl, ip->mq_serdnm);
    221 		ip->mq_serdnm = NULL;
    222 	}
    223 	cmd_list_delete(lp, &ip->mq_l);
    224 	fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t));
    225 
    226 	return (jp);
    227 }
    228 
    229 /*
    230  * Add an index block for a new CE, sorted
    231  * a) by ascending unit position
    232  * b) order of arrival (~= time order)
    233  */
    234 
    235 void
    236 mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep,
    237     uint64_t afar, uint16_t synd, uint64_t now)
    238 {
    239 	cmd_mq_t *ip, *jp;
    240 	int cw, unit_position;
    241 
    242 	cw = (afar & 0x30) >> 4;		/* 0:3 */
    243 	if ((unit_position = cmd_synd2upos(synd)) < 0)
    244 		return;				/* not a CE */
    245 
    246 	for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
    247 		if (ip->mq_unit_position > unit_position) {
    248 			/* list is in unit position order */
    249 			break;
    250 		} else if (ip->mq_unit_position == unit_position &&
    251 		    ip->mq_phys_addr == afar) {
    252 			/*
    253 			 * Found a duplicate cw, unit_position, and afar.
    254 			 * Delete this node, to be superseded by the new
    255 			 * node added below.
    256 			 */
    257 			ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
    258 		} else {
    259 			ip = cmd_list_next(ip);
    260 		}
    261 	}
    262 
    263 	jp = mq_create(hdl, ep, afar, unit_position, now);
    264 	if (ip == NULL)
    265 		cmd_list_append(&dimm->mq_root[cw], jp);
    266 	else
    267 		cmd_list_insert_before(&dimm->mq_root[cw], ip, jp);
    268 }
    269 
    270 /*
    271  * Prune the MQSC index lists (one for each checkword), by deleting
    272  * outdated index blocks from each list.
    273  */
    274 
    275 void
    276 mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now)
    277 {
    278 	cmd_mq_t *ip;
    279 	int cw;
    280 
    281 	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
    282 		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
    283 			if (ip->mq_tstamp < now - CMD_MQ_TIMELIM) {
    284 				/*
    285 				 * This event has timed out - delete the
    286 				 * mq block as well as serd for the event.
    287 				 */
    288 				ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
    289 			} else {
    290 				/* tstamp < now - ce_t */
    291 				ip = cmd_list_next(ip);
    292 			}
    293 		} /* per checkword */
    294 	} /* cw = 0...3 */
    295 }
    296 
    297 /*
    298  * Check the MQSC index lists (one for each checkword) by making a
    299  * complete pass through each list, checking if the criteria for either
    300  * Rule 4A or 4B have been met.  Rule 4A checking is done for each checkword;
    301  * 4B check is done at end.
    302  *
    303  * Rule 4A: fault a DIMM  "whenever Solaris reports two or more CEs from
    304  * two or more different physical addresses on each of two or more different
    305  * bit positions from the same DIMM within 72 hours of each other, and all
    306  * the addresses are in the same relative checkword (that is, the AFARs
    307  * are all the same modulo 64).  [Note: This means at least 4 CEs; two
    308  * from one bit position, with unique addresses, and two from another,
    309  * also with unique addresses, and the lower 6 bits of all the addresses
    310  * are the same."
    311  *
    312  * Rule 4B: fault a DIMM "whenever Solaris reports two or more CEs from
    313  * two or more different physical addresses on each of three or more
    314  * different outputs from the same DRAM within 72 hours of each other, as
    315  * long as the three outputs do not all correspond to the same relative
    316  * bit position in their respective checkwords.  [Note: This means at least
    317  * 6 CEs; two from one DRAM output signal, with unique addresses, two from
    318  * another output from the same DRAM, also with unique addresses, and two
    319  * more from yet another output from the same DRAM, again with unique
    320  * addresses, as long as the three outputs do not all correspond to the
    321  * same relative bit position in their respective checkwords.]"
    322  */
    323 
    324 void
    325 mq_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
    326 {
    327 	int upos_pairs, curr_upos, cw, i, j, k;
    328 	nvlist_t *flt;
    329 	typedef struct upos_pair {
    330 		int upos;
    331 		int dram;
    332 		cmd_mq_t *mq1;
    333 		cmd_mq_t *mq2;
    334 	} upos_pair_t;
    335 	upos_pair_t upos_array[8]; /* max per cw = 2, * 4 cw's */
    336 	cmd_mq_t *ip;
    337 
    338 	/*
    339 	 * Each upos_array[] member represents a pair of CEs for the same
    340 	 * unit position (symbol) which on a sun4u is a bit, and on sun4v
    341 	 * is a (4 bit) nibble.
    342 	 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
    343 	 * for rule 4A, and same DRAM for rule 4B) for a violation - this
    344 	 * is why CE pairs are tracked.
    345 	 */
    346 	upos_pairs = 0;
    347 	upos_array[0].mq1 = NULL;
    348 
    349 	/* Loop through all checkwords */
    350 	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
    351 		i = upos_pairs;
    352 		curr_upos = -1;
    353 
    354 		/*
    355 		 * mq_root[] is an array of cumulative lists of CEs
    356 		 * indexed by checkword where the list is in unit position
    357 		 * order. Loop through checking for duplicate unit position
    358 		 * entries (filled in at mq_create()).
    359 		 * The upos_array[] is filled in each time a duplicate
    360 		 * unit position is found; the first time through the loop
    361 		 * of a unit position sets curr_upos but does not fill in
    362 		 * upos_array[] until the second symbol is found.
    363 		 */
    364 		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL;
    365 		    ip = cmd_list_next(ip)) {
    366 			if (curr_upos != ip->mq_unit_position) {
    367 				/* Set initial current position */
    368 				curr_upos = ip->mq_unit_position;
    369 			} else if (i > upos_pairs &&
    370 			    curr_upos == upos_array[i-1].upos) {
    371 				/*
    372 				 * Only keep track of CE pairs; skip
    373 				 * triples, quads, etc...
    374 				 */
    375 				continue;
    376 			} else if (upos_array[i].mq1 == NULL) {
    377 				/*
    378 				 * Have a pair, add to upos_array[].
    379 				 */
    380 				upos_array[i].upos = curr_upos;
    381 				upos_array[i].dram = ip->mq_dram;
    382 				upos_array[i].mq1 = cmd_list_prev(ip);
    383 				upos_array[i].mq2 = ip;
    384 				upos_array[++i].mq1 = NULL;
    385 			}
    386 		}
    387 
    388 		if (i - upos_pairs >= 2) {
    389 			/* Rule 4A Violation. */
    390 			flt = cmd_dimm_create_fault(hdl,
    391 			    dimm, "fault.memory.dimm-ue-imminent",
    392 			    CMD_FLTMAXCONF);
    393 			for (j = upos_pairs; j < i; j++) {
    394 				fmd_case_add_ereport(hdl,
    395 				    dimm->dimm_case.cc_cp,
    396 				    upos_array[j].mq1->mq_ep);
    397 				fmd_case_add_ereport(hdl,
    398 				    dimm->dimm_case.cc_cp,
    399 				    upos_array[j].mq2->mq_ep);
    400 			}
    401 			dimm->dimm_flags |= CMD_MEM_F_FAULTING;
    402 			cmd_dimm_dirty(hdl, dimm);
    403 			fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
    404 			fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
    405 			return;
    406 		}
    407 		upos_pairs = i;
    408 		assert(upos_pairs < 8);
    409 	}
    410 
    411 	if (upos_pairs < 3)
    412 		return; /* 4B violation needs at least 3 pairs */
    413 
    414 	/*
    415 	 * Walk through checking for a rule 4B violation.
    416 	 * Since we only keep track of two CE pairs per CW we'll only have
    417 	 * a max of potentially 8 elements in the array. So as not to run
    418 	 * off the end of the array, need to be careful with i and j indexes.
    419 	 */
    420 	for (i = 0; i < (upos_pairs - 2); i++) {
    421 		if (upos_array[i].dram == -1) {
    422 			/*
    423 			 * Don't match failure codes. There is
    424 			 * no platform DRAM xlation - return.
    425 			 */
    426 			fmd_hdl_debug(hdl, "Unable to determine DRAM"
    427 			    " from the unit position\n");
    428 			return;
    429 		}
    430 
    431 		for (j = i+1; j < (upos_pairs - 1); j++) {
    432 			if (upos_array[i].dram != upos_array[j].dram) {
    433 				/*
    434 				 * These two pairs aren't the same dram;
    435 				 * continue looking for pairs that are.
    436 				 */
    437 				continue;
    438 			}
    439 
    440 			for (k = j+1; k < upos_pairs; k++) {
    441 				if (upos_array[j].dram != upos_array[k].dram) {
    442 					/*
    443 					 * DRAMs must be the same for a rule
    444 					 * 4B violation. Continue looking for
    445 					 * pairs that have the same DRAMs.
    446 					 */
    447 					continue;
    448 				}
    449 
    450 				if ((upos_array[i].upos !=
    451 				    upos_array[j].upos) ||
    452 				    (upos_array[j].upos !=
    453 				    upos_array[k].upos)) {
    454 					/*
    455 					 * We've determined that all the dram
    456 					 * CEs are the same dram, if all the
    457 					 * unit positions are not the same,
    458 					 * then we have a rule 4B violation.
    459 					 */
    460 					flt = cmd_dimm_create_fault(hdl, dimm,
    461 					    "fault.memory.dram-ue-imminent",
    462 					    CMD_FLTMAXCONF);
    463 					fmd_case_add_ereport(hdl,
    464 					    dimm->dimm_case.cc_cp,
    465 					    upos_array[i].mq1->mq_ep);
    466 					fmd_case_add_ereport(hdl,
    467 					    dimm->dimm_case.cc_cp,
    468 					    upos_array[i].mq2->mq_ep);
    469 					fmd_case_add_ereport(hdl,
    470 					    dimm->dimm_case.cc_cp,
    471 					    upos_array[j].mq1->mq_ep);
    472 					fmd_case_add_ereport(hdl,
    473 					    dimm->dimm_case.cc_cp,
    474 					    upos_array[j].mq2->mq_ep);
    475 					fmd_case_add_ereport(hdl,
    476 					    dimm->dimm_case.cc_cp,
    477 					    upos_array[k].mq1->mq_ep);
    478 					fmd_case_add_ereport(hdl,
    479 					    dimm->dimm_case.cc_cp,
    480 					    upos_array[k].mq2->mq_ep);
    481 					dimm->dimm_flags |= CMD_MEM_F_FAULTING;
    482 					cmd_dimm_dirty(hdl, dimm);
    483 					fmd_case_add_suspect(hdl,
    484 					    dimm->dimm_case.cc_cp, flt);
    485 					fmd_case_solve(hdl,
    486 					    dimm->dimm_case.cc_cp);
    487 					return;
    488 				}
    489 			}
    490 		}
    491 	}
    492 }
    493 
    494 /*ARGSUSED*/
    495 cmd_evdisp_t
    496 cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    497     const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
    498     uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
    499 {
    500 	cmd_dimm_t *dimm;
    501 	cmd_page_t *page;
    502 	const char *uuid;
    503 
    504 	if (afar_status != AFLT_STAT_VALID ||
    505 	    synd_status != AFLT_STAT_VALID)
    506 		return (CMD_EVD_UNUSED);
    507 
    508 	if ((page = cmd_page_lookup(afar)) != NULL &&
    509 	    page->page_case.cc_cp != NULL &&
    510 	    fmd_case_solved(hdl, page->page_case.cc_cp))
    511 		return (CMD_EVD_REDUND);
    512 
    513 #ifdef sun4u
    514 	if (cmd_dp_error(hdl) || cmd_dp_fault(hdl, afar)) {
    515 		CMD_STAT_BUMP(dp_ignored_ce);
    516 		return (CMD_EVD_UNUSED);
    517 	}
    518 #endif /* sun4u */
    519 
    520 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
    521 		CMD_STAT_BUMP(bad_mem_asru);
    522 		return (CMD_EVD_BAD);
    523 	}
    524 
    525 	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
    526 	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
    527 		return (CMD_EVD_UNUSED);
    528 
    529 	if (dimm->dimm_case.cc_cp == NULL) {
    530 		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
    531 		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
    532 	}
    533 
    534 	/*
    535 	 * Add to MQSC correlation lists all CEs which pass validity
    536 	 * checks above.
    537 	 */
    538 	if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
    539 		uint64_t *now;
    540 		uint_t nelem;
    541 		if (nvlist_lookup_uint64_array(nvl,
    542 		    "__tod", &now, &nelem) == 0) {
    543 
    544 			mq_add(hdl, dimm, ep, afar, synd, *now);
    545 			mq_prune(hdl, dimm, *now);
    546 			mq_check(hdl, dimm);
    547 		}
    548 	}
    549 
    550 	switch (type) {
    551 	case CE_DISP_UNKNOWN:
    552 		CMD_STAT_BUMP(ce_unknown);
    553 		return (CMD_EVD_UNUSED);
    554 	case CE_DISP_INTERMITTENT:
    555 		CMD_STAT_BUMP(ce_interm);
    556 		return (CMD_EVD_UNUSED);
    557 	case CE_DISP_POSS_PERS:
    558 		CMD_STAT_BUMP(ce_ppersis);
    559 		break;
    560 	case CE_DISP_PERS:
    561 		CMD_STAT_BUMP(ce_persis);
    562 		break;
    563 	case CE_DISP_LEAKY:
    564 		CMD_STAT_BUMP(ce_leaky);
    565 		break;
    566 	case CE_DISP_POSS_STICKY:
    567 	{
    568 		uchar_t ptnrinfo = CE_XDIAG_PTNRINFO(disp);
    569 
    570 		if (CE_XDIAG_TESTVALID(ptnrinfo)) {
    571 			int ce1 = CE_XDIAG_CE1SEEN(ptnrinfo);
    572 			int ce2 = CE_XDIAG_CE2SEEN(ptnrinfo);
    573 
    574 			if (ce1 && ce2) {
    575 				/* Should have been CE_DISP_STICKY */
    576 				return (CMD_EVD_BAD);
    577 			} else if (ce1) {
    578 				/* Partner could see and could fix CE */
    579 				CMD_STAT_BUMP(ce_psticky_ptnrclrd);
    580 			} else {
    581 				/* Partner could not see ce1 (ignore ce2) */
    582 				CMD_STAT_BUMP(ce_psticky_ptnrnoerr);
    583 			}
    584 		} else {
    585 			CMD_STAT_BUMP(ce_psticky_noptnr);
    586 		}
    587 		return (CMD_EVD_UNUSED);
    588 	}
    589 	case CE_DISP_STICKY:
    590 		CMD_STAT_BUMP(ce_sticky);
    591 		break;
    592 	default:
    593 		return (CMD_EVD_BAD);
    594 	}
    595 
    596 	if (page == NULL)
    597 		page = cmd_page_create(hdl, asru, afar);
    598 
    599 	if (page->page_case.cc_cp == NULL) {
    600 		page->page_case.cc_cp = cmd_case_create(hdl,
    601 		    &page->page_header, CMD_PTR_PAGE_CASE, &uuid);
    602 	}
    603 
    604 	switch (type) {
    605 	case CE_DISP_POSS_PERS:
    606 	case CE_DISP_PERS:
    607 		fmd_hdl_debug(hdl, "adding %sPersistent event to CE serd "
    608 		    "engine\n", type == CE_DISP_POSS_PERS ? "Possible-" : "");
    609 
    610 		if (page->page_case.cc_serdnm == NULL) {
    611 			page->page_case.cc_serdnm = cmd_page_serdnm_create(hdl,
    612 			    "page", page->page_physbase);
    613 
    614 			fmd_serd_create(hdl, page->page_case.cc_serdnm,
    615 			    fmd_prop_get_int32(hdl, "ce_n"),
    616 			    fmd_prop_get_int64(hdl, "ce_t"));
    617 		}
    618 
    619 		if (fmd_serd_record(hdl, page->page_case.cc_serdnm, ep) ==
    620 		    FMD_B_FALSE)
    621 				return (CMD_EVD_OK); /* engine hasn't fired */
    622 
    623 		fmd_hdl_debug(hdl, "ce page serd fired\n");
    624 		fmd_case_add_serd(hdl, page->page_case.cc_cp,
    625 		    page->page_case.cc_serdnm);
    626 		fmd_serd_reset(hdl, page->page_case.cc_serdnm);
    627 		break;	/* to retire */
    628 
    629 	case CE_DISP_LEAKY:
    630 	case CE_DISP_STICKY:
    631 		fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
    632 		break;	/* to retire */
    633 	}
    634 
    635 	dimm->dimm_nretired++;
    636 	dimm->dimm_retstat.fmds_value.ui64++;
    637 	cmd_dimm_dirty(hdl, dimm);
    638 
    639 	cmd_page_fault(hdl, asru, cmd_dimm_fru(dimm), ep, afar);
    640 	ce_thresh_check(hdl, dimm);
    641 
    642 	return (CMD_EVD_OK);
    643 }
    644 
    645 /*
    646  * Solve a bank case with suspect "fault.memory.bank".  The caller must
    647  * have populated bank->bank_case.cc_cp and is also responsible for adding
    648  * associated ereport(s) to that case.
    649  */
    650 void
    651 cmd_bank_fault(fmd_hdl_t *hdl, cmd_bank_t *bank)
    652 {
    653 	fmd_case_t *cp = bank->bank_case.cc_cp;
    654 	nvlist_t *flt;
    655 
    656 	if (bank->bank_flags & CMD_MEM_F_FAULTING)
    657 		return; /* Only complain once per bank */
    658 
    659 	bank->bank_flags |= CMD_MEM_F_FAULTING;
    660 	cmd_bank_dirty(hdl, bank);
    661 
    662 #ifdef	sun4u
    663 	flt = cmd_bank_create_fault(hdl, bank, "fault.memory.bank",
    664 	    CMD_FLTMAXCONF);
    665 	fmd_case_add_suspect(hdl, cp, flt);
    666 #else /* sun4v */
    667 	{
    668 		cmd_bank_memb_t *d;
    669 
    670 		/* create separate fault for each dimm in bank */
    671 
    672 		for (d = cmd_list_next(&bank->bank_dimms);
    673 		    d != NULL; d = cmd_list_next(d)) {
    674 			flt = cmd_dimm_create_fault(hdl, d->bm_dimm,
    675 			    "fault.memory.bank", CMD_FLTMAXCONF);
    676 			fmd_case_add_suspect(hdl, cp, flt);
    677 		}
    678 	}
    679 #endif /* sun4u */
    680 	fmd_case_solve(hdl, cp);
    681 }
    682 
    683 /*ARGSUSED*/
    684 cmd_evdisp_t
    685 cmd_ue_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    686     const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
    687     uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
    688 {
    689 	cmd_page_t *page;
    690 	cmd_bank_t *bank;
    691 	cmd_cpu_t *cpu;
    692 
    693 #ifdef sun4u
    694 	/*
    695 	 * Note: Currently all sun4u processors using this code share
    696 	 * L2 and L3 cache at CMD_CPU_LEVEL_CORE.
    697 	 */
    698 	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
    699 	    CMD_CPU_LEVEL_CORE);
    700 #else /* sun4v */
    701 	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
    702 	    CMD_CPU_LEVEL_THREAD);
    703 #endif /* sun4u */
    704 
    705 	if (cpu == NULL) {
    706 		fmd_hdl_debug(hdl, "cmd_ue_common: cpu not found\n");
    707 		return (CMD_EVD_UNUSED);
    708 	}
    709 
    710 	/*
    711 	 * The following code applies only to sun4u, because sun4u does
    712 	 * not poison data in L2 cache resulting from the fetch of a
    713 	 * memory UE.
    714 	 */
    715 
    716 #ifdef sun4u
    717 	if (afar_status != AFLT_STAT_VALID) {
    718 		/*
    719 		 * Had this report's AFAR been valid, it would have
    720 		 * contributed an address to the UE cache.  We don't
    721 		 * know what the AFAR would have been, and thus we can't
    722 		 * add anything to the cache.  If a xxU is caused by
    723 		 * this UE, we won't be able to detect it, and will thus
    724 		 * erroneously offline the CPU.  To prevent this
    725 		 * situation, we need to assume that all xxUs generated
    726 		 * through the next E$ flush are attributable to the UE.
    727 		 */
    728 		cmd_cpu_uec_set_allmatch(hdl, cpu);
    729 	} else {
    730 		cmd_cpu_uec_add(hdl, cpu, afar);
    731 	}
    732 #endif /* sun4u */
    733 
    734 	if (synd_status != AFLT_STAT_VALID) {
    735 		fmd_hdl_debug(hdl, "cmd_ue_common: syndrome not valid\n");
    736 		return (CMD_EVD_UNUSED);
    737 	}
    738 
    739 	if (cmd_mem_synd_check(hdl, afar, afar_status, synd, synd_status,
    740 	    cpu) == CMD_EVD_UNUSED)
    741 		return (CMD_EVD_UNUSED);
    742 
    743 	if (afar_status != AFLT_STAT_VALID)
    744 		return (CMD_EVD_UNUSED);
    745 
    746 	if ((page = cmd_page_lookup(afar)) != NULL &&
    747 	    page->page_case.cc_cp != NULL &&
    748 	    fmd_case_solved(hdl, page->page_case.cc_cp))
    749 		return (CMD_EVD_REDUND);
    750 
    751 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
    752 		CMD_STAT_BUMP(bad_mem_asru);
    753 		return (NULL);
    754 	}
    755 
    756 	if ((bank = cmd_bank_lookup(hdl, asru)) == NULL &&
    757 	    (bank = cmd_bank_create(hdl, asru)) == NULL)
    758 		return (CMD_EVD_UNUSED);
    759 
    760 #ifdef sun4v
    761 	{
    762 		nvlist_t *fmri;
    763 		char **snarray;
    764 		unsigned int i, n;
    765 
    766 		/*
    767 		 * 1: locate the array of serial numbers inside the bank asru.
    768 		 * 2: for each serial #, lookup its mem: FMRI in libtopo
    769 		 * 3: ensure that each DIMM's FMRI is on bank's dimmlist
    770 		 */
    771 
    772 		if (nvlist_lookup_string_array(asru,
    773 		    FM_FMRI_MEM_SERIAL_ID, &snarray, &n) != 0)
    774 			fmd_hdl_abort(hdl, "Cannot locate serial #s for bank");
    775 
    776 		for (i = 0; i < n; i++) {
    777 			fmri = cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_MEM,
    778 			    snarray[i]);
    779 			/*
    780 			 * If dimm structure doesn't already exist for
    781 			 * each dimm, create and link to bank.
    782 			 */
    783 			if (cmd_dimm_lookup(hdl, fmri) == NULL)
    784 				(void) cmd_dimm_create(hdl, fmri);
    785 			nvlist_free(fmri);
    786 		}
    787 	}
    788 #endif /* sun4v */
    789 
    790 	if (bank->bank_case.cc_cp == NULL) {
    791 		const char *uuid;
    792 		bank->bank_case.cc_cp = cmd_case_create(hdl, &bank->bank_header,
    793 		    CMD_PTR_BANK_CASE, &uuid);
    794 	}
    795 
    796 #ifdef sun4u
    797 	if (cmd_dp_error(hdl)) {
    798 		CMD_STAT_BUMP(dp_deferred_ue);
    799 		cmd_dp_page_defer(hdl, asru, ep, afar);
    800 		return (CMD_EVD_OK);
    801 	} else if (cmd_dp_fault(hdl, afar)) {
    802 		CMD_STAT_BUMP(dp_ignored_ue);
    803 		return (CMD_EVD_UNUSED);
    804 	}
    805 #endif /* sun4u */
    806 
    807 	fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
    808 
    809 	bank->bank_nretired++;
    810 	bank->bank_retstat.fmds_value.ui64++;
    811 	cmd_bank_dirty(hdl, bank);
    812 
    813 	cmd_page_fault(hdl, bank->bank_asru_nvl, cmd_bank_fru(bank), ep, afar);
    814 	cmd_bank_fault(hdl, bank);
    815 
    816 	return (CMD_EVD_OK);
    817 }
    818 
    819 void
    820 cmd_dimm_close(fmd_hdl_t *hdl, void *arg)
    821 {
    822 	cmd_dimm_destroy(hdl, arg);
    823 }
    824 
    825 void
    826 cmd_bank_close(fmd_hdl_t *hdl, void *arg)
    827 {
    828 	cmd_bank_destroy(hdl, arg);
    829 }
    830