Home | History | Annotate | Download | only in generic_cpu
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/mca_x86.h>
     28 #include <sys/cpu_module_impl.h>
     29 #include <sys/cpu_module_ms.h>
     30 #include <sys/cmn_err.h>
     31 #include <sys/cpuvar.h>
     32 #include <sys/pghw.h>
     33 #include <sys/x86_archext.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/regset.h>
     36 #include <sys/privregs.h>
     37 #include <sys/systm.h>
     38 #include <sys/types.h>
     39 #include <sys/log.h>
     40 #include <sys/psw.h>
     41 #include <sys/fm/protocol.h>
     42 #include <sys/fm/util.h>
     43 #include <sys/errorq.h>
     44 #include <sys/mca_x86.h>
     45 #include <sys/fm/cpu/GMCA.h>
     46 #include <sys/fm/smb/fmsmb.h>
     47 #include <sys/sysevent.h>
     48 #include <sys/ontrap.h>
     49 
     50 #include "gcpu.h"
     51 
     52 extern int x86gentopo_legacy;	/* x86 generic topology support */
     53 
     54 static uint_t gcpu_force_addr_in_payload = 0;
     55 
     56 /*
     57  * Clear to log telemetry found at initialization.  While processor docs
     58  * say you should process this telemetry on all but Intel family 0x6
     59  * there are way too many exceptions and we want to avoid bogus
     60  * diagnoses.
     61  */
     62 int gcpu_suppress_log_on_init = 1;
     63 
     64 /*
     65  * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
     66  * error logout time.  The stack will be included in the ereport if the
     67  * error type selects stack inclusion, or in all cases if
     68  * gcpu_mca_stack_ereport_include is nonzero.
     69  */
     70 int gcpu_mca_stack_flag = 0;
     71 int gcpu_mca_stack_ereport_include = 0;
     72 
     73 /*
     74  * The number of times to re-read MCA telemetry to try to obtain a
     75  * consistent snapshot if we find it to be changing under our feet.
     76  */
     77 int gcpu_mca_telemetry_retries = 5;
     78 
     79 #ifndef __xpv
     80 int gcpu_mca_cmci_throttling_threshold = 10;
     81 int gcpu_mca_cmci_reenable_threshold = 1000;
     82 #endif
     83 
     84 static gcpu_error_disp_t gcpu_errtypes[] = {
     85 
     86 	/*
     87 	 * Unclassified
     88 	 */
     89 	{
     90 		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
     91 		NULL,
     92 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
     93 		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
     94 		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
     95 	},
     96 
     97 	/*
     98 	 * Microcode ROM Parity Error
     99 	 */
    100 	{
    101 		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
    102 		NULL,
    103 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    104 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
    105 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
    106 	},
    107 
    108 	/*
    109 	 * External - BINIT# from another processor during power-on config
    110 	 */
    111 	{
    112 		FM_EREPORT_CPU_GENERIC_EXTERNAL,
    113 		NULL,
    114 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    115 		MCAX86_SIMPLE_EXTERNAL_MASKON,
    116 		MCAX86_SIMPLE_EXTERNAL_MASKOFF
    117 	},
    118 
    119 	/*
    120 	 * Functional redundancy check master/slave error
    121 	 */
    122 	{
    123 		FM_EREPORT_CPU_GENERIC_FRC,
    124 		NULL,
    125 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    126 		MCAX86_SIMPLE_FRC_MASKON,
    127 		MCAX86_SIMPLE_FRC_MASKOFF
    128 	},
    129 
    130 	/*
    131 	 * Internal parity error
    132 	 */
    133 	{
    134 		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
    135 		NULL,
    136 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    137 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
    138 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
    139 	},
    140 
    141 
    142 	/*
    143 	 * Internal timer error
    144 	 */
    145 	{
    146 		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
    147 		NULL,
    148 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    149 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
    150 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
    151 	},
    152 
    153 	/*
    154 	 * Internal unclassified
    155 	 */
    156 	{
    157 		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
    158 		NULL,
    159 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    160 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
    161 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
    162 	},
    163 
    164 	/*
    165 	 * Compound error codes - generic memory hierarchy
    166 	 */
    167 	{
    168 		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
    169 		NULL,
    170 		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
    171 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
    172 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
    173 	},
    174 
    175 	/*
    176 	 * Compound error codes - TLB errors
    177 	 */
    178 	{
    179 		FM_EREPORT_CPU_GENERIC_TLB,
    180 		"%1$s" "TLB" "%2$s" "_ERR",
    181 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
    182 		MCAX86_COMPOUND_TLB_MASKON,
    183 		MCAX86_COMPOUND_TLB_MASKOFF
    184 	},
    185 
    186 	/*
    187 	 * Compound error codes - memory hierarchy
    188 	 */
    189 	{
    190 		FM_EREPORT_CPU_GENERIC_MEMHIER,
    191 		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
    192 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
    193 		MCAX86_COMPOUND_MEMHIER_MASKON,
    194 		MCAX86_COMPOUND_MEMHIER_MASKOFF
    195 	},
    196 
    197 	/*
    198 	 * Compound error codes - bus and interconnect errors
    199 	 */
    200 	{
    201 		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
    202 		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
    203 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
    204 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
    205 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
    206 	},
    207 	/*
    208 	 * Compound error codes - memory controller errors
    209 	 */
    210 	{
    211 		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
    212 		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
    213 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
    214 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
    215 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
    216 	},
    217 };
    218 
    219 static gcpu_error_disp_t gcpu_unknown = {
    220 	FM_EREPORT_CPU_GENERIC_UNKNOWN,
    221 	"UNKNOWN",
    222 	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
    223 	0,
    224 	0
    225 };
    226 
    227 static errorq_t *gcpu_mca_queue;
    228 static kmutex_t gcpu_mca_queue_lock;
    229 
    230 #ifdef __xpv
    231 static int isxpv = 1;
    232 #else
    233 static int isxpv = 0;
    234 #endif
    235 
    236 static const gcpu_error_disp_t *
    237 gcpu_disp_match(uint16_t code)
    238 {
    239 	const gcpu_error_disp_t *ged = gcpu_errtypes;
    240 	int i;
    241 
    242 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
    243 	    i++, ged++) {
    244 		uint16_t on = ged->ged_errcode_mask_on;
    245 		uint16_t off = ged->ged_errcode_mask_off;
    246 
    247 		if ((code & on) == on && (code & off) == 0)
    248 			return (ged);
    249 	}
    250 
    251 	return (NULL);
    252 }
    253 
    254 static uint16_t
    255 bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
    256 {
    257 	return ((code & mask) >> shift);
    258 }
    259 
    260 #define	BIT_STRIP(code, name) \
    261 	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
    262 	MCAX86_ERRCODE_##name##_SHIFT)
    263 
    264 #define	GCPU_MNEMONIC_UNDEF	"undefined"
    265 #define	GCPU_MNEMONIC_RESVD	"reserved"
    266 
    267 /*
    268  * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
    269  * mnemonics and to ereport class name components.
    270  */
    271 
    272 struct gcpu_mnexp {
    273 	const char *mne_compound;	/* used in expanding compound errname */
    274 	const char *mne_ereport;	/* used in expanding ereport class */
    275 };
    276 
    277 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
    278 	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
    279 	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
    280 	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
    281 	{ GCPU_MNEMONIC_UNDEF, "" }
    282 };
    283 
    284 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
    285 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
    286 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
    287 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
    288 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
    289 };
    290 
    291 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
    292 	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
    293 	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
    294 	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
    295 	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
    296 	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
    297 	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
    298 	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
    299 	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
    300 	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
    301 };
    302 
    303 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
    304 	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
    305 	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
    306 	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
    307 	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
    308 };
    309 
    310 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
    311 	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
    312 	{ GCPU_MNEMONIC_RESVD, "" },
    313 	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
    314 	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
    315 };
    316 
    317 static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
    318 	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
    319 	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
    320 };
    321 
    322 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
    323 	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
    324 	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
    325 	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
    326 	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
    327 	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
    328 	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
    329 	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
    330 	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
    331 	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
    332 	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
    333 	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
    334 	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
    335 	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
    336 	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
    337 	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
    338 	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
    339 };
    340 
    341 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
    342 	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
    343 	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
    344 	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
    345 	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
    346 	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
    347 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
    348 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
    349 	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
    350 };
    351 
    352 enum gcpu_mn_namespace {
    353 	GCPU_MN_NAMESPACE_COMPOUND,
    354 	GCPU_MN_NAMESPACE_EREPORT
    355 };
    356 
    357 static const char *
    358 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
    359     enum gcpu_mn_namespace nspace)
    360 {
    361 	if (val >= tbl_sz || val > 0xff)
    362 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
    363 
    364 	switch (nspace) {
    365 	case GCPU_MN_NAMESPACE_COMPOUND:
    366 		return (tbl[val].mne_compound);
    367 		/*NOTREACHED*/
    368 
    369 	case GCPU_MN_NAMESPACE_EREPORT:
    370 		return (tbl[val].mne_ereport);
    371 		/*NOTREACHED*/
    372 
    373 	default:
    374 		return (GCPU_MNEMONIC_UNDEF);
    375 		/*NOTREACHED*/
    376 	}
    377 }
    378 
    379 /*
    380  * The ereport class leaf component is either a simple string with no
    381  * format specifiers, or a string with one or more embedded %n$s specifiers -
    382  * positional selection for string arguments.  The kernel snprintf does
    383  * not support %n$ (and teaching it to do so is too big a headache) so
    384  * we will expand this restricted format string ourselves.
    385  */
    386 
    387 #define	GCPU_CLASS_VARCOMPS	9
    388 
    389 #define	GCPU_MNEMONIC(code, name, nspace) \
    390 	gcpu_mnemonic(gcpu_##name##_mnemonics, \
    391 	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
    392 	BIT_STRIP(code, name), nspace)
    393 
    394 static void
    395 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
    396     enum gcpu_mn_namespace nspace)
    397 {
    398 	uint16_t code = MCAX86_ERRCODE(status);
    399 	const char *mn[GCPU_CLASS_VARCOMPS];
    400 	char *p = buf;			/* current position in buf */
    401 	char *q = buf + buflen;		/* pointer past last char in buf */
    402 	int which, expfmtchar, error;
    403 	char c;
    404 
    405 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
    406 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
    407 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
    408 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
    409 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
    410 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
    411 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
    412 	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
    413 	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
    414 
    415 	while (p < q - 1 && (c = *fmt++) != '\0') {
    416 		if (c != '%') {
    417 			/* not the beginning of a format specifier - copy */
    418 			*p++ = c;
    419 			continue;
    420 		}
    421 
    422 		error = 0;
    423 		which = -1;
    424 		expfmtchar = -1;
    425 
    426 nextfmt:
    427 		if ((c = *fmt++) == '\0')
    428 			break;	/* early termination of fmt specifier */
    429 
    430 		switch (c) {
    431 		case '1':
    432 		case '2':
    433 		case '3':
    434 		case '4':
    435 		case '5':
    436 		case '6':
    437 		case '7':
    438 		case '8':
    439 		case '9':
    440 			if (which != -1) { /* allow only one positional digit */
    441 				error++;
    442 				break;
    443 			}
    444 			which = c - '1';
    445 			goto nextfmt;
    446 			/*NOTREACHED*/
    447 
    448 		case '$':
    449 			if (which == -1) { /* no position specified */
    450 				error++;
    451 				break;
    452 			}
    453 			expfmtchar = 's';
    454 			goto nextfmt;
    455 			/*NOTREACHED*/
    456 
    457 		case 's':
    458 			if (expfmtchar != 's') {
    459 				error++;
    460 				break;
    461 			}
    462 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
    463 			    mn[which]);
    464 			p += strlen(p);
    465 			break;
    466 
    467 		default:
    468 			error++;
    469 			break;
    470 		}
    471 
    472 		if (error)
    473 			break;
    474 	}
    475 
    476 	*p = '\0';	/* NUL termination */
    477 }
    478 
    479 static void
    480 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
    481     const char *cpuclass, const char *leafclass)
    482 {
    483 	char *p = buf;			/* current position in buf */
    484 	char *q = buf + buflen;		/* pointer past last char in buf */
    485 
    486 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
    487 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
    488 
    489 	p += strlen(p);
    490 	if (p >= q)
    491 		return;
    492 
    493 	if (leafclass == NULL) {
    494 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
    495 		    GCPU_MN_NAMESPACE_EREPORT);
    496 	} else {
    497 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
    498 		    leafclass);
    499 	}
    500 }
    501 
    502 /*
    503  * Create an "hc" scheme FMRI identifying the given cpu with
    504  * motherboard/chip/core/strand instance numbers.
    505  */
    506 static nvlist_t *
    507 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
    508 {
    509 	nvlist_t *nvl, *fmri;
    510 
    511 	if ((nvl = fm_nvlist_create(nva)) == NULL)
    512 		return (NULL);
    513 
    514 	if (!x86gentopo_legacy) {
    515 		fmri = cmi_hdl_smb_bboard(hdl);
    516 		if (fmri == NULL)
    517 			return (NULL);
    518 
    519 		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
    520 		    NULL, NULL, fmri, 3,
    521 		    "chip", cmi_hdl_smb_chipid(hdl),
    522 		    "core", cmi_hdl_coreid(hdl),
    523 		    "strand", cmi_hdl_strandid(hdl));
    524 	} else {
    525 		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
    526 		    "motherboard", 0,
    527 		    "chip", cmi_hdl_chipid(hdl),
    528 		    "core", cmi_hdl_coreid(hdl),
    529 		    "strand", cmi_hdl_strandid(hdl));
    530 	}
    531 
    532 	return (nvl);
    533 }
    534 
    535 int gcpu_bleat_count_thresh = 5;
    536 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
    537 
    538 /*
    539  * Called when we are unable to propogate a logout structure onto an
    540  * errorq for subsequent ereport preparation and logging etc.  The caller
    541  * should usually only decide to call this for severe errors - those we
    542  * suspect we may need to panic for.
    543  */
    544 static void
    545 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
    546 {
    547 	hrtime_t now  = gethrtime_waitfree();
    548 	static hrtime_t gcpu_last_bleat;
    549 	gcpu_bank_logout_t *gbl;
    550 	static int bleatcount;
    551 	int i;
    552 
    553 	/*
    554 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
    555 	 * can come as fast as we like, but once we've spammed that many
    556 	 * to the console we require a minimum interval to pass before
    557 	 * any more complaints.
    558 	 */
    559 	if (++bleatcount > gcpu_bleat_count_thresh) {
    560 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
    561 			return;
    562 		else
    563 			bleatcount = 0;
    564 	}
    565 	gcpu_last_bleat = now;
    566 
    567 	cmn_err(CE_WARN,
    568 	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
    569 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
    570 	    cmi_hdl_strandid(hdl));
    571 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
    572 	    (u_longlong_t)gcl->gcl_mcg_status);
    573 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
    574 		uint64_t status = gbl->gbl_status;
    575 
    576 		if (!(status & MSR_MC_STATUS_VAL))
    577 			continue;
    578 
    579 		/* Force ADDRV for AMD Family 0xf and above */
    580 		if (gcpu_force_addr_in_payload)
    581 			status = status | MSR_MC_STATUS_ADDRV;
    582 
    583 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
    584 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
    585 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
    586 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
    587 			    i, IA32_MSR_MC(i, STATUS),
    588 			    (u_longlong_t)gbl->gbl_status,
    589 			    (u_longlong_t)gbl->gbl_addr,
    590 			    (u_longlong_t)gbl->gbl_misc);
    591 			break;
    592 
    593 		case MSR_MC_STATUS_ADDRV:
    594 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
    595 			    "STAT 0x%016llx ADDR 0x%016llx",
    596 			    i, IA32_MSR_MC(i, STATUS),
    597 			    (u_longlong_t)gbl->gbl_status,
    598 			    (u_longlong_t)gbl->gbl_addr);
    599 			break;
    600 
    601 		case MSR_MC_STATUS_MISCV:
    602 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
    603 			    "STAT 0x%016llx MISC 0x%016llx",
    604 			    i, IA32_MSR_MC(i, STATUS),
    605 			    (u_longlong_t)gbl->gbl_status,
    606 			    (u_longlong_t)gbl->gbl_misc);
    607 			break;
    608 
    609 		default:
    610 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
    611 			    "STAT 0x%016llx",
    612 			    i, IA32_MSR_MC(i, STATUS),
    613 			    (u_longlong_t)gbl->gbl_status);
    614 			break;
    615 
    616 		}
    617 	}
    618 }
    619 
    620 #define	_GCPU_BSTATUS(status, what) \
    621 	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
    622 	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
    623 
    624 static void
    625 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
    626     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
    627 {
    628 	uint64_t members = ged ? ged->ged_ereport_members :
    629 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
    630 	uint64_t mcg = gcl->gcl_mcg_status;
    631 	int mcip = mcg & MCG_STATUS_MCIP;
    632 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
    633 	uint64_t bstat = gbl->gbl_status;
    634 
    635 	/*
    636 	 * Include the compound error name if requested and if this
    637 	 * is a compound error type.
    638 	 */
    639 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
    640 	    ged->ged_compound_fmt != NULL) {
    641 		char buf[FM_MAX_CLASS];
    642 
    643 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
    644 		    GCPU_MN_NAMESPACE_COMPOUND);
    645 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
    646 		    DATA_TYPE_STRING, buf, NULL);
    647 	}
    648 
    649 	/*
    650 	 * Include disposition information for this error
    651 	 */
    652 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
    653 	    gbl->gbl_disp != 0) {
    654 		int i, empty = 1;
    655 		char buf[128];
    656 		char *p = buf, *q = buf + 128;
    657 		static struct _gcpu_disp_name {
    658 			uint64_t dv;
    659 			const char *dn;
    660 		} disp_names[] = {
    661 			{ CMI_ERRDISP_CURCTXBAD,
    662 			    "processor_context_corrupt" },
    663 			{ CMI_ERRDISP_RIPV_INVALID,
    664 			    "return_ip_invalid" },
    665 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
    666 			    "unconstrained" },
    667 			{ CMI_ERRDISP_FORCEFATAL,
    668 			    "forcefatal" },
    669 			{ CMI_ERRDISP_IGNORED,
    670 			    "ignored" },
    671 			{ CMI_ERRDISP_PCC_CLEARED,
    672 			    "corrupt_context_cleared" },
    673 			{ CMI_ERRDISP_UC_CLEARED,
    674 			    "uncorrected_data_cleared" },
    675 			{ CMI_ERRDISP_POISONED,
    676 			    "poisoned" },
    677 			{ CMI_ERRDISP_INCONSISTENT,
    678 			    "telemetry_unstable" },
    679 		};
    680 
    681 		for (i = 0; i < sizeof (disp_names) /
    682 		    sizeof (struct _gcpu_disp_name); i++) {
    683 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
    684 				continue;
    685 
    686 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
    687 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
    688 			p += strlen(p);
    689 			empty = 0;
    690 		}
    691 
    692 		if (p != buf)
    693 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
    694 			    DATA_TYPE_STRING, buf, NULL);
    695 	}
    696 
    697 	/*
    698 	 * If MCG_STATUS is included add that and an indication of whether
    699 	 * this ereport was the result of a machine check or poll.
    700 	 */
    701 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
    702 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
    703 		    DATA_TYPE_UINT64, mcg, NULL);
    704 
    705 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
    706 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
    707 	}
    708 
    709 	/*
    710 	 * If an instruction pointer is to be included add one provided
    711 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
    712 	 */
    713 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
    714 	    mcg & MCG_STATUS_EIPV) {
    715 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
    716 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
    717 	}
    718 
    719 	/*
    720 	 * Add an indication of whether the trap occured during privileged code.
    721 	 */
    722 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
    723 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
    724 		    DATA_TYPE_BOOLEAN_VALUE,
    725 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
    726 	}
    727 
    728 	/*
    729 	 * If requested, add the index of the MCA bank.  This indicates the
    730 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
    731 	 * to MCi_* - use the bank offset to correlate
    732 	 */
    733 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
    734 		fm_payload_set(ereport,
    735 		    /* Bank number */
    736 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
    737 		    /* Offset of MCi_CTL */
    738 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
    739 		    IA32_MSR_MC(bankno, CTL),
    740 		    NULL);
    741 	}
    742 
    743 	/*
    744 	 * Add MCi_STATUS if requested, and decode it.
    745 	 */
    746 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
    747 		const char *tbes[] = {
    748 			"No tracking",			/* 00 */
    749 			"Green - below threshold",	/* 01 */
    750 			"Yellow - above threshold",	/* 10 */
    751 			"Reserved"			/* 11 */
    752 		};
    753 
    754 		fm_payload_set(ereport,
    755 		    /* Bank MCi_STATUS */
    756 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
    757 		    /* Overflow? */
    758 		    _GCPU_BSTATUS(bstat, OVER),
    759 		    /* Uncorrected? */
    760 		    _GCPU_BSTATUS(bstat, UC),
    761 		    /* Enabled? */
    762 		    _GCPU_BSTATUS(bstat, EN),
    763 		    /* Processor context corrupt? */
    764 		    _GCPU_BSTATUS(bstat, PCC),
    765 		    /* Error code */
    766 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
    767 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
    768 		    /* Model-specific error code */
    769 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
    770 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
    771 		    NULL);
    772 
    773 		/*
    774 		 * If MCG_CAP.TES_P indicates that that thresholding info
    775 		 * is present in the architural component of the bank status
    776 		 * then include threshold information for this bank.
    777 		 */
    778 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
    779 			fm_payload_set(ereport,
    780 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
    781 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
    782 			    NULL);
    783 		}
    784 	}
    785 
    786 	/*
    787 	 * Add MCi_ADDR info if requested and valid. We force addition of
    788 	 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
    789 	 * to aid in analysis of ereports, for WatchDog errors.
    790 	 */
    791 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
    792 	    ((bstat & MSR_MC_STATUS_ADDRV) ||
    793 	    gcpu_force_addr_in_payload)) {
    794 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
    795 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
    796 	}
    797 
    798 	/*
    799 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
    800 	 */
    801 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
    802 	    bstat & MSR_MC_STATUS_MISCV) {
    803 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
    804 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
    805 	}
    806 
    807 }
    808 
    809 /*
    810  * Construct and post an ereport based on the logout information from a
    811  * single MCA bank.  We are not necessarily running on the cpu that
    812  * detected the error.
    813  */
    814 static void
    815 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
    816     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
    817 {
    818 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
    819 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
    820 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
    821 	const char *cpuclass = NULL, *leafclass = NULL;
    822 	uint16_t code = MCAX86_ERRCODE(status);
    823 	errorq_elem_t *eqep, *scr_eqep;
    824 	nvlist_t *ereport, *detector;
    825 	char buf[FM_MAX_CLASS];
    826 	const char *classfmt;
    827 	nv_alloc_t *nva;
    828 
    829 	if (panicstr) {
    830 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
    831 			return;
    832 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
    833 
    834 		/*
    835 		 * Allocate another element for scratch space, but fallback
    836 		 * to the one we have if that fails.  We'd like to use the
    837 		 * additional scratch space for nvlist construction.
    838 		 */
    839 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
    840 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
    841 		else
    842 			nva = errorq_elem_nva(ereport_errorq, eqep);
    843 	} else {
    844 		ereport = fm_nvlist_create(NULL);
    845 		nva = NULL;
    846 	}
    847 
    848 	if (ereport == NULL)
    849 		return;
    850 
    851 	/*
    852 	 * Common payload data required by the protocol:
    853 	 *	- ereport class
    854 	 *	- detector
    855 	 *	- ENA
    856 	 */
    857 
    858 	/*
    859 	 * Ereport class - call into model-specific support to allow it to
    860 	 * provide a cpu class or leaf class, otherwise calculate our own.
    861 	 */
    862 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
    863 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
    864 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
    865 	    leafclass);
    866 
    867 	/*
    868 	 * The detector FMRI.
    869 	 */
    870 	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
    871 	    nva)) == NULL)
    872 		detector = gcpu_fmri_create(hdl, nva);
    873 
    874 	/*
    875 	 * Should we define a new ENA format 3?? for chip/core/strand?
    876 	 * It will be better when virtualized.
    877 	 */
    878 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
    879 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
    880 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
    881 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
    882 
    883 	if (panicstr) {
    884 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
    885 		nv_alloc_reset(nva);
    886 	} else {
    887 		fm_nvlist_destroy(detector, FM_NVA_FREE);
    888 	}
    889 
    890 	/*
    891 	 * Add the architectural ereport class-specific payload data.
    892 	 */
    893 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
    894 
    895 	/*
    896 	 * Allow model-specific code to add ereport members.
    897 	 */
    898 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
    899 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
    900 
    901 	/*
    902 	 * Include stack if options is turned on and either selected in
    903 	 * the payload member bitmask or inclusion is forced.
    904 	 */
    905 	if (gcpu_mca_stack_flag &&
    906 	    (cms_ereport_includestack(hdl, mscookie) ==
    907 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
    908 		fm_payload_stack_add(ereport, gcl->gcl_stack,
    909 		    gcl->gcl_stackdepth);
    910 	}
    911 
    912 	/*
    913 	 * If injection has taken place anytime in the past then note this
    914 	 * on the ereport.
    915 	 */
    916 	if (cmi_inj_tainted() == B_TRUE) {
    917 		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
    918 		    B_TRUE, NULL);
    919 	}
    920 
    921 	/*
    922 	 * Post ereport.
    923 	 */
    924 	if (panicstr) {
    925 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
    926 		if (scr_eqep)
    927 			errorq_cancel(ereport_errorq, scr_eqep);
    928 	} else {
    929 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
    930 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
    931 	}
    932 
    933 }
    934 
    935 /*ARGSUSED*/
    936 void
    937 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
    938 {
    939 	const gcpu_logout_t *gcl = data;
    940 	const gcpu_bank_logout_t *gbl;
    941 	int i;
    942 
    943 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
    944 		const gcpu_error_disp_t *gened;
    945 		cms_cookie_t mscookie;
    946 
    947 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
    948 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
    949 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
    950 
    951 			/*
    952 			 * Perform a match based on IA32 MCA architectural
    953 			 * components alone.
    954 			 */
    955 			gened = gcpu_disp_match(code); /* may be NULL */
    956 
    957 			/*
    958 			 * Now see if an model-specific match can be made.
    959 			 */
    960 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
    961 			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
    962 			    gcl->gcl_ms_logout);
    963 
    964 			/*
    965 			 * Prepare and dispatch an ereport for logging and
    966 			 * diagnosis.
    967 			 */
    968 			gcpu_ereport_post(gcl, i, gened, mscookie,
    969 			    gbl->gbl_status);
    970 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
    971 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
    972 			/*
    973 			 * Telemetry kept changing as we tried to read
    974 			 * it.  Force an unknown ereport leafclass but
    975 			 * keep the telemetry unchanged for logging.
    976 			 */
    977 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
    978 			    gbl->gbl_status);
    979 		}
    980 	}
    981 }
    982 
    983 static size_t gcpu_mca_queue_datasz = 0;
    984 
    985 /*
    986  * The following code is ready to make a weak attempt at growing the
    987  * errorq structure size.  Since it is not foolproof (we don't know
    988  * who may already be producing to the outgoing errorq) our caller
    989  * instead assures that we'll always be called with no greater data
    990  * size than on our first call.
    991  */
    992 static void
    993 gcpu_errorq_init(size_t datasz)
    994 {
    995 	int slots;
    996 
    997 	mutex_enter(&gcpu_mca_queue_lock);
    998 
    999 	if (gcpu_mca_queue_datasz >= datasz) {
   1000 		mutex_exit(&gcpu_mca_queue_lock);
   1001 		return;
   1002 	}
   1003 
   1004 	membar_producer();
   1005 	if (gcpu_mca_queue) {
   1006 		gcpu_mca_queue_datasz = 0;
   1007 		errorq_destroy(gcpu_mca_queue);
   1008 	}
   1009 
   1010 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
   1011 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
   1012 
   1013 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
   1014 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
   1015 
   1016 	if (gcpu_mca_queue != NULL)
   1017 		gcpu_mca_queue_datasz = datasz;
   1018 
   1019 	mutex_exit(&gcpu_mca_queue_lock);
   1020 }
   1021 
   1022 /*
   1023  * Perform MCA initialization as described in section 14.6 of Intel 64
   1024  * and IA-32 Architectures Software Developer's Manual Volume 3A.
   1025  */
   1026 
   1027 static uint_t global_nbanks;
   1028 
   1029 void
   1030 gcpu_mca_init(cmi_hdl_t hdl)
   1031 {
   1032 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
   1033 	uint64_t cap;
   1034 	uint_t vendor = cmi_hdl_vendor(hdl);
   1035 	uint_t family = cmi_hdl_family(hdl);
   1036 	uint_t rev = cmi_hdl_chiprev(hdl);
   1037 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
   1038 	int mcg_ctl_present;
   1039 	uint_t nbanks;
   1040 	uint32_t ctl_skip_mask = 0;
   1041 	uint32_t status_skip_mask = 0;
   1042 	size_t mslsz;
   1043 	int i;
   1044 #ifndef __xpv
   1045 	int mcg_ctl2_present;
   1046 	uint32_t cmci_capable = 0;
   1047 #endif
   1048 	if (gcpu == NULL)
   1049 		return;
   1050 
   1051 	/* We add MCi_ADDR always for AMD Family 0xf and above */
   1052 	if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
   1053 		gcpu_force_addr_in_payload = 1;
   1054 
   1055 	/*
   1056 	 * Protect from some silly /etc/system settings.
   1057 	 */
   1058 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
   1059 		gcpu_mca_telemetry_retries = 5;
   1060 
   1061 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
   1062 		return;
   1063 
   1064 	/*
   1065 	 * CPU startup code only calls cmi_mca_init if x86_feature indicates
   1066 	 * both MCA and MCE support (i.e., X86_MCA).  P5, K6, and earlier
   1067 	 * processors, which have their own * more primitive way of doing
   1068 	 * machine checks, will not have cmi_mca_init called since their
   1069 	 * CPUID information will not indicate both MCA and MCE features.
   1070 	 */
   1071 	ASSERT(x86_feature & X86_MCA);
   1072 
   1073 	/*
   1074 	 * Determine whether the IA32_MCG_CTL register is present.  If it
   1075 	 * is we will enable all features by writing -1 to it towards
   1076 	 * the end of this initialization;  if it is absent then volume 3A
   1077 	 * says we must nonetheless continue to initialize the individual
   1078 	 * banks.
   1079 	 */
   1080 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
   1081 #ifndef __xpv
   1082 	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
   1083 #endif
   1084 
   1085 	/*
   1086 	 * We squirell values away for inspection/debugging.
   1087 	 */
   1088 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
   1089 	if (mcg_ctl_present)
   1090 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
   1091 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
   1092 
   1093 	/*
   1094 	 * Determine the number of error-reporting banks implemented.
   1095 	 */
   1096 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
   1097 
   1098 	if (nbanks != 0 && global_nbanks == 0)
   1099 		global_nbanks = nbanks;	/* no race - BSP will get here first */
   1100 
   1101 	/*
   1102 	 * If someone is hiding the number of banks (perhaps we are fully
   1103 	 * virtualized?) or if this processor has more banks than the
   1104 	 * first to set global_nbanks then bail.  The latter requirement
   1105 	 * is because we need to size our errorq data structure and we
   1106 	 * don't want to have to grow the errorq (destroy and recreate)
   1107 	 * which may just lose some telemetry.
   1108 	 */
   1109 	if (nbanks == 0 || nbanks > global_nbanks)
   1110 		return;
   1111 
   1112 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
   1113 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
   1114 
   1115 	/*
   1116 	 * Calculate the size we need to allocate for a gcpu_logout_t
   1117 	 * with a gcl_data array big enough for all banks of this cpu.
   1118 	 * Add any space requested by the model-specific logout support.
   1119 	 */
   1120 	mslsz = cms_logout_size(hdl);
   1121 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
   1122 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
   1123 
   1124 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
   1125 		gcpu_logout_t *gcl;
   1126 
   1127 		mca->gcpu_mca_logout[i] = gcl =
   1128 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
   1129 		gcl->gcl_gcpu = gcpu;
   1130 		gcl->gcl_nbanks = nbanks;
   1131 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
   1132 		    (char *)(&gcl->gcl_data[0]) + nbanks *
   1133 		    sizeof (gcpu_bank_logout_t);
   1134 
   1135 	}
   1136 
   1137 #ifdef __xpv
   1138 	gcpu_xpv_mca_init(nbanks);
   1139 #endif
   1140 
   1141 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
   1142 
   1143 #ifndef __xpv
   1144 	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
   1145 	    KM_SLEEP);
   1146 #endif
   1147 
   1148 	/*
   1149 	 * Create our errorq to transport the logout structures.  This
   1150 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
   1151 	 */
   1152 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
   1153 
   1154 	/*
   1155 	 * Not knowing which, if any, banks are shared between cores we
   1156 	 * assure serialization of MCA bank initialization by each cpu
   1157 	 * on the chip.  On chip architectures in which some banks are
   1158 	 * shared this will mean the shared resource is initialized more
   1159 	 * than once - we're simply aiming to avoid simultaneous MSR writes
   1160 	 * to the shared resource.
   1161 	 *
   1162 	 * Even with these precautions, some platforms may yield a GP fault
   1163 	 * if a core other than a designated master tries to write anything
   1164 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
   1165 	 * those writes under on_trap protection.
   1166 	 */
   1167 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
   1168 
   1169 	/*
   1170 	 * Initialize poller data, but don't start polling yet.
   1171 	 */
   1172 	gcpu_mca_poll_init(hdl);
   1173 
   1174 	/*
   1175 	 * Work out which MCA banks we will initialize.  In MCA logout
   1176 	 * code we will only read those banks which we initialize here.
   1177 	 */
   1178 	for (i = 0; i < nbanks; i++) {
   1179 		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
   1180 		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
   1181 
   1182 		if (!cms_present(hdl)) {
   1183 			/*
   1184 			 * Model-specific support is not present, try to use
   1185 			 * sane defaults.
   1186 			 *
   1187 			 * On AMD family 6 processors, reports about spurious
   1188 			 * machine checks indicate that bank 0 should be
   1189 			 * skipped.
   1190 			 *
   1191 			 * On Intel family 6 processors, the documentation tells
   1192 			 * us not to write to MC0_CTL.
   1193 			 *
   1194 			 */
   1195 			if (i == 0 && family == 6) {
   1196 				switch (vendor) {
   1197 				case X86_VENDOR_AMD:
   1198 					skipstatus = B_TRUE;
   1199 					/*FALLTHRU*/
   1200 				case X86_VENDOR_Intel:
   1201 					skipctl = B_TRUE;
   1202 					break;
   1203 				}
   1204 			}
   1205 		}
   1206 
   1207 		ctl_skip_mask |= skipctl << i;
   1208 		status_skip_mask |= skipstatus << i;
   1209 
   1210 		if (skipctl && skipstatus)
   1211 			continue;
   1212 
   1213 		/*
   1214 		 * Record which MCA banks were enabled, from the point of view
   1215 		 * of the whole chip (if some cores share a bank we must be
   1216 		 * sure either can logout from it).
   1217 		 */
   1218 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
   1219 
   1220 #ifndef __xpv
   1221 		/*
   1222 		 * check CMCI capability
   1223 		 */
   1224 		if (mcg_ctl2_present) {
   1225 			uint64_t ctl2;
   1226 			uint32_t cap = 0;
   1227 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
   1228 			if (ctl2 & MSR_MC_CTL2_EN)
   1229 				continue;
   1230 			ctl2 |= MSR_MC_CTL2_EN;
   1231 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
   1232 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
   1233 			mca->gcpu_bank_cmci[i].cmci_cap = cap =
   1234 			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
   1235 			if (cap)
   1236 				cmci_capable ++;
   1237 			/*
   1238 			 * Set threshold to 1 while unset the en field, to avoid
   1239 			 * CMCI trigged before APIC LVT entry init.
   1240 			 */
   1241 			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
   1242 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
   1243 
   1244 			/*
   1245 			 * init cmci related count
   1246 			 */
   1247 			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
   1248 			mca->gcpu_bank_cmci[i].drtcmci = 0;
   1249 			mca->gcpu_bank_cmci[i].ncmci = 0;
   1250 		}
   1251 #endif
   1252 	}
   1253 
   1254 #ifndef __xpv
   1255 	if (cmci_capable)
   1256 		cmi_enable_cmci = 1;
   1257 #endif
   1258 
   1259 #ifndef __xpv
   1260 	/*
   1261 	 * Log any valid telemetry lurking in the MCA banks, but do not
   1262 	 * clear the status registers.  Ignore the disposition returned -
   1263 	 * we have already paniced or reset for any nasty errors found here.
   1264 	 *
   1265 	 * Intel vol 3A says that we should not do this on family 0x6,
   1266 	 * and that for any extended family the BIOS clears things
   1267 	 * on power-on reset so you'll only potentially find valid telemetry
   1268 	 * on warm reset (we do it for both - on power-on reset we should
   1269 	 * just see zeroes).
   1270 	 *
   1271 	 * AMD docs since K7 say we should process anything we find here.
   1272 	 */
   1273 	if (!gcpu_suppress_log_on_init &&
   1274 	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
   1275 	    vendor == X86_VENDOR_AMD))
   1276 		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
   1277 		    GCPU_MPT_WHAT_POKE_ERR);
   1278 
   1279 	/*
   1280 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
   1281 	 * model-specific module the power of veto.
   1282 	 */
   1283 	for (i = 0; i < nbanks; i++) {
   1284 		struct gcpu_bios_bankcfg *bcfgp =
   1285 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
   1286 
   1287 		/*
   1288 		 * Stash inherited bank MCA state, even for banks we will
   1289 		 * not initialize ourselves.  Do not read the MISC register
   1290 		 * unconditionally - on some processors that will #GP on
   1291 		 * banks that do not implement the MISC register (would be
   1292 		 * caught by on_trap, anyway).
   1293 		 */
   1294 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
   1295 		    &bcfgp->bios_bank_ctl);
   1296 
   1297 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
   1298 		    &bcfgp->bios_bank_status);
   1299 
   1300 		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
   1301 		    gcpu_force_addr_in_payload) {
   1302 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
   1303 			    &bcfgp->bios_bank_addr);
   1304 		}
   1305 
   1306 		/*
   1307 		 * In some old BIOS the status value after boot can indicate
   1308 		 * MISCV when there is actually no MISC register for
   1309 		 * that bank.  The following read could therefore
   1310 		 * aggravate a general protection fault.  This should be
   1311 		 * caught by on_trap, but the #GP fault handler is busted
   1312 		 * and can suffer a double fault even before we get to
   1313 		 * trap() to check for on_trap protection.  Until that
   1314 		 * issue is fixed we remove the one access that we know
   1315 		 * can cause a #GP.
   1316 		 *
   1317 		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
   1318 		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
   1319 		 *	    &bcfgp->bios_bank_misc);
   1320 		 */
   1321 		bcfgp->bios_bank_misc = 0;
   1322 
   1323 		if (!(ctl_skip_mask & (1 << i))) {
   1324 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
   1325 			    cms_bankctl_val(hdl, i, -1ULL));
   1326 		}
   1327 
   1328 		if (!(status_skip_mask & (1 << i))) {
   1329 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
   1330 			    cms_bankstatus_val(hdl, i, 0ULL));
   1331 		}
   1332 	}
   1333 #endif
   1334 	/*
   1335 	 * Now let the model-specific support perform further initialization
   1336 	 * of non-architectural features.
   1337 	 */
   1338 	cms_mca_init(hdl, nbanks);
   1339 
   1340 #ifndef __xpv
   1341 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
   1342 	membar_producer();
   1343 
   1344 	/* enable all machine-check features */
   1345 	if (mcg_ctl_present)
   1346 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
   1347 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
   1348 #endif
   1349 
   1350 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
   1351 
   1352 #ifndef __xpv
   1353 	/* enable machine-check exception in CR4 */
   1354 	cmi_hdl_enable_mce(hdl);
   1355 #endif
   1356 }
   1357 
   1358 static uint64_t
   1359 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
   1360     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
   1361 {
   1362 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
   1363 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
   1364 	int nbanks = mca->gcpu_mca_nbanks;
   1365 	gcpu_mce_status_t mce;
   1366 	gcpu_bank_logout_t *gbl;
   1367 	uint64_t disp = 0;
   1368 	int i;
   1369 
   1370 	if (mcesp == NULL)
   1371 		mcesp = &mce;
   1372 
   1373 	mcesp->mce_nerr = nerr;
   1374 
   1375 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
   1376 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
   1377 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
   1378 
   1379 	/*
   1380 	 * If this a machine check then if the return instruction pointer
   1381 	 * is not valid the current context is lost.
   1382 	 */
   1383 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
   1384 		disp |= CMI_ERRDISP_RIPV_INVALID;
   1385 
   1386 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
   1387 		uint64_t mcistatus = gbl->gbl_status;
   1388 		uint32_t ms_scope;
   1389 		int pcc, uc;
   1390 		int poisoned;
   1391 
   1392 		if (!(mcistatus & MSR_MC_STATUS_VAL))
   1393 			continue;
   1394 
   1395 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
   1396 			continue;
   1397 
   1398 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
   1399 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
   1400 		mcesp->mce_npcc += pcc;
   1401 		mcesp->mce_nuc += uc;
   1402 
   1403 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
   1404 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
   1405 
   1406 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
   1407 			pcc = 0;
   1408 			mcesp->mce_npcc_ok++;
   1409 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
   1410 		}
   1411 
   1412 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
   1413 			uc = 0;
   1414 			mcesp->mce_nuc_ok++;
   1415 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
   1416 		}
   1417 
   1418 		if (uc) {
   1419 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
   1420 			if (poisoned) {
   1421 				mcesp->mce_nuc_poisoned++;
   1422 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
   1423 			}
   1424 		}
   1425 
   1426 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
   1427 			/*
   1428 			 * We're not being instructed to ignore the error,
   1429 			 * so apply our standard disposition logic to it.
   1430 			 */
   1431 			if (uc && !poisoned) {
   1432 				unconstrained++;
   1433 				gbl->gbl_disp |= disp |
   1434 				    CMI_ERRDISP_UC_UNCONSTRAINED;
   1435 			}
   1436 
   1437 			if (pcc && ismc) {
   1438 				curctxbad++;
   1439 				gbl->gbl_disp |= disp |
   1440 				    CMI_ERRDISP_CURCTXBAD;
   1441 			}
   1442 
   1443 			/*
   1444 			 * Even if the above may not indicate that the error
   1445 			 * is terminal, model-specific support may insist
   1446 			 * that we treat it as such.  Such errors wil be
   1447 			 * fatal even if discovered via poll.
   1448 			 */
   1449 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
   1450 				forcefatal++;
   1451 				mcesp->mce_forcefatal++;
   1452 				gbl->gbl_disp |= disp |
   1453 				    CMI_ERRDISP_FORCEFATAL;
   1454 			}
   1455 		} else {
   1456 			mcesp->mce_ignored++;
   1457 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
   1458 		}
   1459 	}
   1460 
   1461 	if (unconstrained > 0)
   1462 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
   1463 
   1464 	if (curctxbad > 0)
   1465 		disp |= CMI_ERRDISP_CURCTXBAD;
   1466 
   1467 	if (forcefatal > 0)
   1468 		disp |= CMI_ERRDISP_FORCEFATAL;
   1469 
   1470 	if (gcpu_mca_queue != NULL) {
   1471 		int how;
   1472 
   1473 		if (ismc) {
   1474 			how = cmi_mce_response(rp, disp) ?
   1475 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
   1476 			    ERRORQ_SYNC;	/* panic flow will drain */
   1477 		} else {
   1478 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
   1479 			    cmi_panic_on_ue()) ?
   1480 			    ERRORQ_SYNC :	/* poller will panic */
   1481 			    ERRORQ_ASYNC;	/* no panic */
   1482 		}
   1483 
   1484 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
   1485 	} else if (disp != 0) {
   1486 		gcpu_bleat(hdl, gcl);
   1487 	}
   1488 
   1489 	mcesp->mce_disp = disp;
   1490 
   1491 	return (disp);
   1492 }
   1493 
   1494 /*
   1495  * Gather error telemetry from our source, and then submit it for
   1496  * processing.
   1497  */
   1498 
   1499 #define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
   1500 	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)
   1501 
   1502 #define	STATUS_EQV(s1, s2) \
   1503 	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))
   1504 
   1505 static uint32_t gcpu_deferrred_polled_clears;
   1506 
   1507 #ifndef __xpv
   1508 static void
   1509 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
   1510     uint64_t status, int what)
   1511 {
   1512 	uint64_t ctl2;
   1513 
   1514 	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
   1515 	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
   1516 	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
   1517 
   1518 		if (!(bank_cmci_p->cmci_enabled)) {
   1519 			/*
   1520 			 * when cmci is disabled, and the bank has no error or
   1521 			 * no corrected error for
   1522 			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
   1523 			 * turn on this bank's cmci.
   1524 			 */
   1525 
   1526 			bank_cmci_p->drtcmci ++;
   1527 
   1528 			if (bank_cmci_p->drtcmci >=
   1529 			    gcpu_mca_cmci_reenable_threshold) {
   1530 
   1531 				/* turn on cmci */
   1532 
   1533 				(void) cmi_hdl_rdmsr(hdl,
   1534 				    IA32_MSR_MC_CTL2(bank), &ctl2);
   1535 				ctl2 |= MSR_MC_CTL2_EN;
   1536 				(void) cmi_hdl_wrmsr(hdl,
   1537 				    IA32_MSR_MC_CTL2(bank), ctl2);
   1538 
   1539 				/* reset counter and set flag */
   1540 				bank_cmci_p->drtcmci = 0;
   1541 				bank_cmci_p->cmci_enabled = 1;
   1542 			}
   1543 		} else {
   1544 			/*
   1545 			 * when cmci is enabled,if is in cyclic poll and the
   1546 			 * bank has no error or no corrected error, reset ncmci
   1547 			 * counter
   1548 			 */
   1549 			bank_cmci_p->ncmci = 0;
   1550 		}
   1551 	}
   1552 }
   1553 
   1554 static void
   1555 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
   1556     int what)
   1557 {
   1558 	uint64_t ctl2 = 0;
   1559 
   1560 	/*
   1561 	 * if cmci of this bank occurred beyond
   1562 	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
   1563 	 * turn off this bank's CMCI;
   1564 	 */
   1565 	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
   1566 
   1567 		/* if it is cmci trap, increase the count */
   1568 		bank_cmci_p->ncmci++;
   1569 
   1570 		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
   1571 
   1572 			/* turn off cmci */
   1573 
   1574 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
   1575 			    &ctl2);
   1576 			ctl2 &= ~MSR_MC_CTL2_EN;
   1577 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
   1578 			    ctl2);
   1579 
   1580 			/* clear the flag and count */
   1581 
   1582 			bank_cmci_p->cmci_enabled = 0;
   1583 			bank_cmci_p->ncmci = 0;
   1584 		}
   1585 	}
   1586 }
   1587 #endif
   1588 
   1589 static void
   1590 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
   1591     cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
   1592 {
   1593 	int i;
   1594 	gcpu_bank_logout_t *gbl, *pgbl;
   1595 	uint64_t status;
   1596 
   1597 	if (first < 0 || last < 0)
   1598 		return;
   1599 
   1600 	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
   1601 		status = gbl->gbl_status;
   1602 		if (status == 0)
   1603 			continue;
   1604 		if (clrstatus == B_FALSE)
   1605 			goto serialize;
   1606 
   1607 		/*
   1608 		 * For i86xpv we always clear status in order to invalidate
   1609 		 * the interposed telemetry.
   1610 		 *
   1611 		 * For native machine checks we always clear status here.  For
   1612 		 * native polls we must be a little more cautious since there
   1613 		 * is an outside chance that we may clear telemetry from a
   1614 		 * shared MCA bank on which a sibling core is machine checking.
   1615 		 *
   1616 		 * For polled observations of errors that look like they may
   1617 		 * produce a machine check (UC/PCC and ENabled, although these
   1618 		 * do not guarantee a machine check on error occurence)
   1619 		 * we will not clear the status at this wakeup unless
   1620 		 * we saw the same status at the previous poll.	 We will
   1621 		 * always process and log the current observations - it
   1622 		 * is only the clearing of MCi_STATUS which may be
   1623 		 * deferred until the next wakeup.
   1624 		 */
   1625 		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
   1626 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
   1627 			goto serialize;
   1628 		}
   1629 
   1630 		/*
   1631 		 * We have a polled observation of a machine check
   1632 		 * candidate.  If we saw essentially the same status at the
   1633 		 * last poll then clear the status now since this appears
   1634 		 * not to be a #MC candidate after all.	 If we see quite
   1635 		 * different status now then do not clear, but reconsider at
   1636 		 * the next poll.  In no actual machine check clears
   1637 		 * the status in the interim then the status should not
   1638 		 * keep changing forever (meaning we'd never clear it)
   1639 		 * since before long we'll simply have latched the highest-
   1640 		 * priority error and set the OVerflow bit.  Nonetheless
   1641 		 * we count how many times we defer clearing and after
   1642 		 * a while insist on clearing the status.
   1643 		 */
   1644 		pgbl = &pgcl->gcl_data[i];
   1645 		if (pgbl->gbl_clrdefcnt != 0) {
   1646 			/* We deferred clear on this bank at last wakeup */
   1647 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
   1648 			    pgbl->gbl_clrdefcnt > 5) {
   1649 				/*
   1650 				 * Status is unchanged so clear it now and,
   1651 				 * since we have already logged this info,
   1652 				 * avoid logging it again.
   1653 				 */
   1654 				gbl->gbl_status = 0;
   1655 				(void) cmi_hdl_wrmsr(hdl,
   1656 				    IA32_MSR_MC(i, STATUS), 0ULL);
   1657 			} else {
   1658 				/* Record deferral for next wakeup */
   1659 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
   1660 			}
   1661 		} else {
   1662 			/* Record initial deferral for next wakeup */
   1663 			gbl->gbl_clrdefcnt = 1;
   1664 			gcpu_deferrred_polled_clears++;
   1665 		}
   1666 
   1667 serialize:
   1668 		{
   1669 #ifdef __xpv
   1670 			;
   1671 #else
   1672 			/*
   1673 			 * Intel Vol 3A says to execute a serializing
   1674 			 * instruction here, ie CPUID.	Well WRMSR is also
   1675 			 * defined to be serializing, so the status clear above
   1676 			 * should suffice.  To be a good citizen, and since
   1677 			 * some clears are deferred, we'll execute a CPUID
   1678 			 * instruction here.
   1679 			 */
   1680 			struct cpuid_regs tmp;
   1681 			(void) __cpuid_insn(&tmp);
   1682 #endif
   1683 		}
   1684 	}
   1685 }
   1686 
   1687 /*ARGSUSED5*/
   1688 void
   1689 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
   1690     gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
   1691 {
   1692 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
   1693 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
   1694 	int nbanks = mca->gcpu_mca_nbanks;
   1695 	gcpu_bank_logout_t *gbl, *pgbl;
   1696 	gcpu_logout_t *gcl, *pgcl;
   1697 	int ismc = (rp != NULL);
   1698 	int ispoll = !ismc;
   1699 	int i, nerr = 0;
   1700 	cmi_errno_t err;
   1701 	uint64_t mcg_status;
   1702 	uint64_t disp;
   1703 	uint64_t cap;
   1704 	int first = -1;
   1705 	int last = -1;
   1706 	int willpanic = 0;
   1707 
   1708 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
   1709 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
   1710 	    CMI_SUCCESS) {
   1711 		if (mcesp != NULL)
   1712 			mcesp->mce_nerr = mcesp->mce_disp = 0;
   1713 		return;
   1714 	}
   1715 
   1716 	if (ismc) {
   1717 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
   1718 	} else {
   1719 		int pidx = mca->gcpu_mca_nextpoll_idx;
   1720 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
   1721 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
   1722 
   1723 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
   1724 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
   1725 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
   1726 	}
   1727 
   1728 	gcl->gcl_timestamp = gethrtime_waitfree();
   1729 	gcl->gcl_mcg_status = mcg_status;
   1730 	gcl->gcl_ip = rp ? rp->r_pc : 0;
   1731 
   1732 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
   1733 	if (cap & MCG_CAP_TES_P)
   1734 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
   1735 
   1736 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
   1737 		uint64_t status, status2, addr, misc;
   1738 		int retries = gcpu_mca_telemetry_retries;
   1739 
   1740 		gbl->gbl_status = 0;
   1741 		gbl->gbl_disp = 0;
   1742 		gbl->gbl_clrdefcnt = 0;
   1743 
   1744 		/*
   1745 		 * Only logout from MCA banks we have initialized from at
   1746 		 * least one core.  If a core shares an MCA bank with another
   1747 		 * but perhaps lost the race to initialize it, then it must
   1748 		 * still be allowed to logout from the shared bank.
   1749 		 */
   1750 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
   1751 			continue;
   1752 
   1753 		/*
   1754 		 * On a poll look only at the banks we've been asked to check.
   1755 		 */
   1756 		if (rp == NULL && !(bankmask & 1 << i))
   1757 			continue;
   1758 
   1759 
   1760 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
   1761 		    CMI_SUCCESS)
   1762 			continue;
   1763 
   1764 #ifndef __xpv
   1765 		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
   1766 #endif
   1767 
   1768 retry:
   1769 		if (!(status & MSR_MC_STATUS_VAL))
   1770 			continue;
   1771 
   1772 		/* First and last bank that have valid status */
   1773 		if (first < 0)
   1774 			first = i;
   1775 		last = i;
   1776 
   1777 		addr = -1;
   1778 		misc = 0;
   1779 
   1780 		if ((status & MSR_MC_STATUS_ADDRV) ||
   1781 		    gcpu_force_addr_in_payload)
   1782 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
   1783 
   1784 		if (status & MSR_MC_STATUS_MISCV)
   1785 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
   1786 
   1787 #ifndef __xpv
   1788 		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
   1789 #endif
   1790 
   1791 		/*
   1792 		 * Allow the model-specific code to extract bank telemetry.
   1793 		 */
   1794 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
   1795 
   1796 		/*
   1797 		 * Not all cpu models assure us that the status/address/misc
   1798 		 * data will not change during the above sequence of MSR reads,
   1799 		 * or that it can only change by the addition of the OVerflow
   1800 		 * bit to the status register.  If the status has changed
   1801 		 * other than in the overflow bit then we attempt to reread
   1802 		 * for a consistent snapshot, but eventually give up and
   1803 		 * go with what we've got.  We only perform this check
   1804 		 * for a poll - a further #MC during a #MC will reset, and
   1805 		 * polled errors should not overwrite higher-priority
   1806 		 * trapping errors (but could set the overflow bit).
   1807 		 */
   1808 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
   1809 		    &status2)) == CMI_SUCCESS) {
   1810 			if (!STATUS_EQV(status, status2)) {
   1811 				if (retries-- > 0) {
   1812 					status = status2;
   1813 					goto retry;
   1814 				} else {
   1815 					gbl->gbl_disp |=
   1816 					    CMI_ERRDISP_INCONSISTENT;
   1817 				}
   1818 			}
   1819 		} else if (ispoll && err != CMI_SUCCESS) {
   1820 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
   1821 		}
   1822 
   1823 		nerr++;
   1824 		gbl->gbl_status = status;
   1825 		gbl->gbl_addr = addr;
   1826 		gbl->gbl_misc = misc;
   1827 
   1828 		/*
   1829 		 * For polled observation, if the count of deferred status
   1830 		 * clears updated in the clear_mc() is nonzero and the
   1831 		 * MCi_STATUS has not changed, the last wakeup has produced
   1832 		 * the ereport of the error. Therefore, clear the status in
   1833 		 * this wakeup to avoid duplicate ereport.
   1834 		 */
   1835 		pgbl = &pgcl->gcl_data[i];
   1836 		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
   1837 		    pgbl->gbl_clrdefcnt != 0) {
   1838 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
   1839 				gbl->gbl_status = 0;
   1840 				(void) cmi_hdl_wrmsr(hdl,
   1841 				    IA32_MSR_MC(i, STATUS), 0ULL);
   1842 			}
   1843 		}
   1844 	}
   1845 
   1846 	if (gcpu_mca_stack_flag)
   1847 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
   1848 	else
   1849 		gcl->gcl_stackdepth = 0;
   1850 
   1851 	/*
   1852 	 * Decide our disposition for this error or errors, and submit for
   1853 	 * logging and subsequent diagnosis.
   1854 	 */
   1855 	if (nerr != 0) {
   1856 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
   1857 
   1858 		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
   1859 
   1860 		if (!willpanic)
   1861 			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
   1862 	} else {
   1863 		disp = 0;
   1864 		if (mcesp) {
   1865 			mcesp->mce_nerr = mcesp->mce_disp = 0;
   1866 		}
   1867 	}
   1868 
   1869 	/*
   1870 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
   1871 	 * If a second #MC had occurred before now the system would have
   1872 	 * reset.  We can only do this once gcpu_mca_process has copied
   1873 	 * the logout structure.
   1874 	 */
   1875 	if (ismc && mcg_status & MCG_STATUS_MCIP)
   1876 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
   1877 
   1878 	/*
   1879 	 * At this point we have read and logged all telemetry that is visible
   1880 	 * under the MCA.  On architectures for which the NorthBridge is
   1881 	 * on-chip this may include NB-observed errors, but where the NB
   1882 	 * is off chip it may have been the source of the #MC request and
   1883 	 * so we must call into the memory-controller driver to give it
   1884 	 * a chance to log errors.
   1885 	 */
   1886 	if (ismc) {
   1887 		cmi_mc_logout(hdl, 1, willpanic);
   1888 	}
   1889 }
   1890 
   1891 #ifndef __xpv
   1892 int gcpu_mca_trap_vomit_summary = 0;
   1893 
   1894 /*
   1895  * On a native machine check exception we come here from mcetrap via
   1896  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
   1897  * cpus of the chip, so it is possible that another cpu on this chip could
   1898  * initiate a poll while we're in the #mc handler;  it is also possible that
   1899  * this trap has occured during a poll on this cpu.  So we must acquire
   1900  * the chip-wide poll lock, but be careful to avoid deadlock.
   1901  *
   1902  * The 'data' pointer cannot be NULL due to init order.
   1903  */
   1904 uint64_t
   1905 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
   1906 {
   1907 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
   1908 	kmutex_t *poll_lock = NULL;
   1909 	gcpu_mce_status_t mce;
   1910 	uint64_t mcg_status;
   1911 	int tooklock = 0;
   1912 
   1913 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
   1914 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
   1915 		return (0);
   1916 
   1917 	/*
   1918 	 * Synchronize with any poller from another core that may happen
   1919 	 * to share access to one or more of the MCA banks.
   1920 	 */
   1921 	if (gcpu->gcpu_shared != NULL)
   1922 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
   1923 
   1924 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
   1925 		/*
   1926 		 * The lock is not owned by the thread we have
   1927 		 * interrupted.  Spin for this adaptive lock.
   1928 		 */
   1929 		while (!mutex_tryenter(poll_lock)) {
   1930 			while (mutex_owner(poll_lock) != NULL)
   1931 				;
   1932 		}
   1933 		tooklock = 1;
   1934 	}
   1935 
   1936 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
   1937 
   1938 	if (tooklock)
   1939 		mutex_exit(poll_lock);
   1940 
   1941 	/*
   1942 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
   1943 	 */
   1944 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
   1945 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
   1946 		    "%u PCC (%u ok), "
   1947 		    "%u UC (%d ok, %u poisoned), "
   1948 		    "%u forcefatal, %u ignored",
   1949 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
   1950 		    mce.mce_npcc, mce.mce_npcc_ok,
   1951 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
   1952 		    mce.mce_forcefatal, mce.mce_ignored);
   1953 	}
   1954 
   1955 	return (mce.mce_disp);
   1956 }
   1957 #endif
   1958 
   1959 /*ARGSUSED*/
   1960 void
   1961 gcpu_faulted_enter(cmi_hdl_t hdl)
   1962 {
   1963 	/* Nothing to do here */
   1964 }
   1965 
   1966 /*ARGSUSED*/
   1967 void
   1968 gcpu_faulted_exit(cmi_hdl_t hdl)
   1969 {
   1970 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
   1971 
   1972 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
   1973 }
   1974 
   1975 /*
   1976  * Write the requested values to the indicated MSRs.  Having no knowledge
   1977  * of the model-specific requirements for writing to these model-specific
   1978  * registers, we will only blindly write to those MSRs if the 'force'
   1979  * argument is nonzero.  That option should only be used in prototyping
   1980  * and debugging.
   1981  */
   1982 /*ARGSUSED*/
   1983 cmi_errno_t
   1984 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
   1985     int force)
   1986 {
   1987 	int i, errs = 0;
   1988 
   1989 	for (i = 0; i < nregs; i++) {
   1990 		uint_t msr = regs[i].cmr_msrnum;
   1991 		uint64_t val = regs[i].cmr_msrval;
   1992 
   1993 		if (cms_present(hdl)) {
   1994 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
   1995 				errs++;
   1996 		} else if (force) {
   1997 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
   1998 		} else {
   1999 			errs++;
   2000 		}
   2001 	}
   2002 
   2003 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
   2004 }
   2005