Home | History | Annotate | Download | only in cpumem-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #ifndef _CMD_CPU_H
     27 #define	_CMD_CPU_H
     28 
     29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     30 
     31 /*
     32  * Each CPU of interest has a cmd_cpu_t structure.  CPUs become of interest when
     33  * they are the focus of ereports, or when they detect UEs.  CPUs may be the
     34  * target of several different kinds of ereport, each of which is tracked
     35  * differently.  cpu_cases lists the types of cases that can be open against a
     36  * given CPU.  The life of a CPU is complicated by the fact that xxCs and xxUs
     37  * received by the DE may in fact be side-effects of earlier UEs, xxCs, or xxUs.
     38  * Causes of side-effects, and actions taken to resolve them, can be found below
     39  * and in cmd_memerr.h.
     40  *
     41  * Data structures:
     42  *      ________                                   CMD_PTR_CPU_ICACHE
     43  *     /        \       ,--------.                 CMD_PTR_CPU_DCACHE
     44  *     |CPU     | <---- |case_ptr| (one or more of CMD_PTR_CPU_PCACHE         )
     45  *     |        |       `--------'                 CMD_PTR_CPU_ITLB
     46  *     |,-------|       ,-------.                  CMD_PTR_CPU_DTLB
     47  *     ||asru   | ----> |fmri_t |                  CMD_PTR_CPU_L2DATA
     48  *     |:-------|       :-------:                  CMD_PTR_CPU_L2DATA_UERETRY
     49  *     ||fru    | ----> |fmri_t |                  CMD_PTR_CPU_L2TAG
     50  *     |`-------|       `-------'                  CMD_PTR_CPU_L3DATA
     51  *     |        |       ,---------.                CMD_PTR_CPU_L3DATA_UERETRY
     52  *     | uec    | ----> |UE cache |                CMD_PTR_CPU_L3TAG
     53  *     \________/       `---------'                CMD_PTR_CPU_FPU
     54  *						   CMD_PTR_CPU_IREG
     55  *						   CMD_PTR_CPU_FREG
     56  *						   CMD_PTR_CPU_MAU
     57  *						   CMD_PTR_CPU_L2CTL
     58  *
     59  *      ________
     60  *     /        \       ,--------.
     61  *     | xr     | <---- |case_ptr| (CMD_PTR_XR_WAITER)
     62  *     |        |       `--------'
     63  *     |,-------|       ,-------.
     64  *     ||rsrc   | ----> |fmri_t |
     65  *     |`-------|       `-------'
     66  *     | cpu    | ----> detecting CPU
     67  *     \________/
     68  *
     69  * Data structure	P?  Case- Notes
     70  *                          Rel?
     71  * ----------------	--- ----- --------------------------------------
     72  * cmd_cpu_t		Yes No    Name is derived from CPU ID ("cpu_%d")
     73  * cmd_case_ptr_t	Yes Yes   Name is case's UUID
     74  * cpu_asru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_asru_%d")
     75  * cpu_fru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_fru_%d")
     76  * cpu_uec		Yes No    Name is derived from CPU ID ("cpu_uec_%d")
     77  * cmd_xr_t		Yes Yes   Name is `redelivery'
     78  * xr_rsrc (fmri_t)     Yes No    Name is derived from case's UUID ("%s_rsrc")
     79  */
     80 
     81 #include <cmd.h>
     82 #include <cmd_state.h>
     83 #include <cmd_fmri.h>
     84 
     85 #ifdef __cplusplus
     86 extern "C" {
     87 #endif
     88 
     89 #define	CPU_FRU_FMRI		FM_FMRI_SCHEME_HC":///" \
     90     FM_FMRI_LEGACY_HC"="
     91 
     92 #define	BK_LFUFAULT_CERT	50
     93 
     94 typedef struct cmd_cpu cmd_cpu_t;
     95 
     96 typedef enum cmd_cpu_type {
     97 	CPU_ULTRASPARC_III = 1,
     98 	CPU_ULTRASPARC_IIIplus,
     99 	CPU_ULTRASPARC_IIIi,
    100 	CPU_ULTRASPARC_IV,
    101 	CPU_ULTRASPARC_IVplus,
    102 	CPU_ULTRASPARC_IIIiplus,
    103 	CPU_ULTRASPARC_T1,
    104 	CPU_SPARC64_VI,
    105 	CPU_SPARC64_VII,
    106 	CPU_ULTRASPARC_T2,
    107 	CPU_ULTRASPARC_T2plus
    108 } cmd_cpu_type_t;
    109 
    110 typedef struct cmd_cpu_cases {
    111 	cmd_case_t cpuc_icache;		/* All I$ errors (IPE, IDSPE, etc) */
    112 	cmd_case_t cpuc_dcache;		/* All D$ errors (DPE, DDSPE, etc) */
    113 	cmd_case_t cpuc_pcache;		/* All P$ errors (PDSPE) */
    114 	cmd_case_t cpuc_itlb;		/* ITLB errors (ITLBPE) */
    115 	cmd_case_t cpuc_dtlb;		/* DTLB errors (DTLBPE) */
    116 	cmd_case_t cpuc_l2data;		/* All correctable L2$ data errors */
    117 	cmd_case_t cpuc_l2tag;		/* All correctable L2$ tag errors */
    118 	cmd_case_t cpuc_l3data;		/* All correctable L3$ data errors */
    119 	cmd_case_t cpuc_l3tag;		/* All correctable L3$ tag errors */
    120 	cmd_case_t cpuc_fpu;		/* FPU errors */
    121 	cmd_case_t cpuc_ireg;		/* Integer reg errors (IRC, IRU) */
    122 	cmd_case_t cpuc_freg;		/* Floatpnt reg errors (frc, fru) */
    123 	cmd_case_t cpuc_mau;		/* Modular arith errors (MAU) */
    124 	cmd_case_t cpuc_l2ctl;		/* L2$ directory, VUAD parity */
    125 	cmd_case_t cpuc_misc_regs;	/* Scratchpad array (SCA) */
    126 					/* Tick compare (TC) */
    127 					/* Store buffer (SBD) */
    128 					/* Trap stack array errors (TSA) */
    129 	cmd_case_t cpuc_lfu;		/* Coherency link error (LFU) */
    130 #ifdef sun4u
    131 	cmd_case_t cpuc_opl_invsfsr;	/* Olympus-C cpu inv-sfsr errors */
    132 	cmd_case_t cpuc_oplue_detcpu;	/* Olympus-C cpu det. ue (eid=CPU) */
    133 	cmd_case_t cpuc_oplue_detio;	/* Olympus-C io det. ue (eid=CPU) */
    134 	cmd_case_t cpuc_opl_mtlb;	/* Olympus-C mtlb errors */
    135 	cmd_case_t cpuc_opl_tlbp;	/* Olympus-C tlbp errors */
    136 	cmd_case_t cpuc_opl_inv_urg;	/* Olympus-C inv-urg invalid urgent */
    137 	cmd_case_t cpuc_opl_cre;	/* Olympus-C cre urgent errors */
    138 	cmd_case_t cpuc_opl_tsb_ctx;	/* Olympus-C tsb_ctx urgent errors */
    139 	cmd_case_t cpuc_opl_tsbp;	/* Olympus-C tsbp urgent errors */
    140 	cmd_case_t cpuc_opl_pstate;	/* Olympus-C pstate urgent errors */
    141 	cmd_case_t cpuc_opl_tstate;	/* Olympus-C tstate urgent errors */
    142 	cmd_case_t cpuc_opl_iug_f;	/* Olympus-C iug_f urgent errors */
    143 	cmd_case_t cpuc_opl_iug_r;	/* Olympus-C iug_r urgent errors */
    144 	cmd_case_t cpuc_opl_sdc;	/* Olympus-C sdc urgent errors */
    145 	cmd_case_t cpuc_opl_wdt;	/* Olympus-C wdt urgent errors */
    146 	cmd_case_t cpuc_opl_dtlb;	/* Olympus-C dtlb urgent errors */
    147 	cmd_case_t cpuc_opl_itlb;	/* Olympus-C itlb urgent errors */
    148 	cmd_case_t cpuc_opl_core_err;	/* Olympus-C core-err urgent errors */
    149 	cmd_case_t cpuc_opl_dae;	/* Olympus-C dae urgent errors */
    150 	cmd_case_t cpuc_opl_iae;	/* Olympus-C iae urgent errors */
    151 	cmd_case_t cpuc_opl_uge;	/* Olympus-C uge urgent errors */
    152 #endif	/* sun4u */
    153 } cmd_cpu_cases_t;
    154 
    155 /*
    156  * The UE cache.  We actually have two UE caches - the current one and the old
    157  * one.  When it's time to flush the UE cache, we move the current UE cache to
    158  * the old position and flush the E$.  Then, we schedule the removal of the old
    159  * UE cache.  This allows a) xxUs triggered by the flush to match against the
    160  * old cache, while b) still allowing new UEs to be added to the current UE
    161  * cache.  UE matches will always search in both caches (if present), but
    162  * additions will only end up in the current cache.  We go to all of this
    163  * effort because the cost of a missed ereport (discarding due to a false match
    164  * in the cache) is much less than that of a missed match.  In the latter case,
    165  * the CPU will be erroneously offlined.
    166  *
    167  * A special case is triggered if we see a UE with a not valid AFAR.  Without
    168  * the AFAR, we aren't able to properly match subsequent xxU's.  As a result,
    169  * we need to throw the cache into all-match mode, wherein all subsequent match
    170  * attempts will succeed until the UE cache is flushed.
    171  */
    172 
    173 #define	CPU_UEC_F_ALLMATCH	0x1	/* all-match mode active */
    174 
    175 typedef struct cmd_cpu_uec {
    176 	uint64_t *uec_cache;		/* The UE cache */
    177 	uint_t uec_nent;		/* Number of allocated slots in cache */
    178 	uint_t uec_flags;		/* CPU_UEC_F_* */
    179 	char uec_bufname[CMD_BUFNMLEN];	/* Name of buffer used for cache */
    180 } cmd_cpu_uec_t;
    181 
    182 extern const char *cmd_cpu_type2name(fmd_hdl_t *, cmd_cpu_type_t);
    183 extern void cmd_cpu_uec_add(fmd_hdl_t *, cmd_cpu_t *, uint64_t);
    184 extern int cmd_cpu_uec_match(cmd_cpu_t *, uint64_t);
    185 extern void cmd_cpu_uec_clear(fmd_hdl_t *, cmd_cpu_t *);
    186 extern void cmd_cpu_uec_set_allmatch(fmd_hdl_t *, cmd_cpu_t *);
    187 
    188 /*
    189  * Certain types of xxC and xxU can trigger other types as side-effects.  These
    190  * secondary ereports need to be discarded, as treating them as legitimate
    191  * ereports in their own right will cause erroneous diagnosis.  As an example
    192  * (see cmd_xxcu_trains for more), an L2$ UCC will usually trigger an L2$ WDC
    193  * resulting from the trap handler's flushing of the L2$.  If we treat both as
    194  * legitimate, we'll end up adding two ereports to the SERD engine,
    195  * significantly cutting the threshold for retiring the CPU.
    196  *
    197  * Our saving grace is the fact that the side-effect ereports will have the same
    198  * ENA as the primary.  As such, we can keep track of groups of ereports by ENA.
    199  * These groups, which we'll call trains, can then be matched against a list of
    200  * known trains.  The list (an array of cmd_xxcu_train_t structures) has both a
    201  * description of the composition of the train and an indication as to which of
    202  * the received ereports is the primary.
    203  *
    204  * The cmd_xxcu_trw_t is used to gather the members of the train.  When the
    205  * first member comes in, we allocate a trw, recording the ENA of the ereport,
    206  * as well as noting its class in trw_mask.  We then reschedule the delivery of
    207  * the ereport for some configurable time in the future, trusting that all
    208  * members of the train will have arrived by that time.  Subsequent ereports in
    209  * the same train match the recorded ENA, and add themselves to the mask.
    210  * When the first ereport is redelivered, trw_mask is used to determine whether
    211  * or not a train has been seen.  An exact match is required.  If a match is
    212  * made, the ereport indicated as the primary cause is used for diagnosis.
    213  */
    214 
    215 #define	CMD_TRW_F_DELETING	0x1	/* reclaiming events */
    216 #define	CMD_TRW_F_CAUSESEEN	0x2	/* cause of train already processed */
    217 #define	CMD_TRW_F_GCSEEN	0x4	/* seen by GC, erased next time */
    218 
    219 typedef struct cmd_xxcu_trw {
    220 	uint64_t trw_ena;	/* the ENA for this group of ereports */
    221 	uint64_t trw_afar;	/* the AFAR for this group of ereports */
    222 	cmd_errcl_t trw_mask;	/* ereports seen thus far with this ENA */
    223 	uint16_t trw_cpuid;	/* CPU to which this watcher belongs */
    224 	uint8_t	 trw_ref;	/* number of ereports with this ENA */
    225 	uint8_t	 trw_flags;	/* CMD_TRW_F_* */
    226 	uint32_t trw_pad;
    227 } cmd_xxcu_trw_t;
    228 
    229 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
    230 extern cmd_xxcu_trw_t *cmd_trw_alloc(uint64_t, uint64_t);
    231 extern void cmd_trw_restore(fmd_hdl_t *);
    232 extern void cmd_trw_write(fmd_hdl_t *);
    233 extern void cmd_trw_ref(fmd_hdl_t *, cmd_xxcu_trw_t *, cmd_errcl_t);
    234 extern void cmd_trw_deref(fmd_hdl_t *, cmd_xxcu_trw_t *);
    235 
    236 extern cmd_errcl_t cmd_xxcu_train_match(cmd_errcl_t);
    237 
    238 /*
    239  * We don't have access to ereport nvlists when they are redelivered via timer.
    240  * As such, we have to retrieve everything we might need for diagnosis when we
    241  * first receive the ereport.  The retrieved information is stored in the
    242  * cmd_xr_t, which is persisted.
    243  */
    244 
    245 typedef struct cmd_xr cmd_xr_t;
    246 
    247 /*
    248  * xr_hdlr can't be persisted, so we use these in xr_hdlrid to indicate the
    249  * handler to be used.  xr_hdlr is then updated so it can be used directly.
    250  */
    251 #define	CMD_XR_HDLR_XXC		1
    252 #define	CMD_XR_HDLR_XXU		2
    253 #define	CMD_XR_HDLR_NOP		3
    254 
    255 typedef void cmd_xr_hdlr_f(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
    256 
    257 /*
    258  * For sun4v, the size of xr_synd is expanded to 32 bits in order to
    259  * accomodate the Niagara L2 syndrome (4x7 bits).
    260  */
    261 
    262 struct cmd_xr {
    263 	cmd_list_t xr_list;
    264 	id_t xr_id;		/* ID of timer used for redelivery */
    265 	cmd_cpu_t *xr_cpu;	/* Detecting CPU, recalc'd from cpuid */
    266 	uint32_t xr_cpuid;	/* ID of detecting CPU */
    267 	uint64_t xr_ena;	/* ENA from ereport */
    268 	uint64_t xr_afar;	/* AFAR from ereport nvlist */
    269 #ifdef sun4u
    270 	uint16_t xr_synd;	/* syndrome from ereport nvlist */
    271 #else /* sun4u */
    272 	uint32_t xr_synd;	/* for Niagara, enlarged to 32 bits */
    273 #endif /* sun4u */
    274 	uint8_t xr_afar_status;	/* AFAR status from ereport nvlist */
    275 	uint8_t xr_synd_status;	/* syndrome status from ereport nvlist */
    276 	cmd_fmri_t xr_rsrc;	/* resource from ereport nvlist */
    277 	cmd_errcl_t xr_clcode;	/* CMD_ERRCL_* for this ereport */
    278 	cmd_xr_hdlr_f *xr_hdlr;	/* handler, recalc'd from hdlrid on restart */
    279 	uint_t xr_hdlrid;	/* CMD_XR_HDLR_*, used for recalc of hdlr */
    280 	fmd_case_t *xr_case;	/* Throwaway case used to track redelivery */
    281 	uint_t xr_ref;		/* Number of references to this struct */
    282 #ifdef sun4u
    283 	uint64_t xr_afsr;	/* AFSR from ereport nvlist */
    284 	uint8_t  xr_num_ways;   /* Number of Cache ways reporting from nvlist */
    285 	uint32_t xr_error_way;  /* The way from the ereport nvlist payload */
    286 	uint64_t xr_error_tag;  /* The tag from the ereport nvlist payload */
    287 	uint32_t xr_error_index; /* the index from the ereport payload */
    288 	uint64_t *xr_cache_data; /* The cache data */
    289 	nvlist_t *xr_detector_nvlist; /* The detecting resource */
    290 #endif
    291 };
    292 
    293 #define	xr_rsrc_nvl		xr_rsrc.fmri_nvl
    294 
    295 extern cmd_xr_t *cmd_xr_create(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    296     cmd_cpu_t *, cmd_errcl_t);
    297 extern cmd_evdisp_t cmd_xr_reschedule(fmd_hdl_t *, cmd_xr_t *, uint_t);
    298 extern void cmd_xr_deref(fmd_hdl_t *, cmd_xr_t *);
    299 extern void cmd_xr_write(fmd_hdl_t *, cmd_xr_t *);
    300 
    301 extern void cmd_xxc_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
    302 extern void cmd_xxu_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
    303 extern void cmd_nop_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
    304 extern cmd_evdisp_t cmd_xxcu_initial(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    305     const char *, cmd_errcl_t,  uint_t);
    306 
    307 /*
    308  * The master structure containing or referencing all of the state for a given
    309  * CPU.
    310  */
    311 
    312 /*
    313  * We periodically flush the E$, thus allowing us to flush the UE cache (see
    314  * above for a description of the UE cache).  In particular, we flush it
    315  * whenever we see a UE with a non-valid AFAR.  To keep from overflushing the
    316  * CPU, we cap the number of flushes that we'll do in response to UEs with
    317  * non-valid AFARs.  The cap is the number of permitted flushes per GC/restart
    318  * cycle, and was determined arbitrarily.
    319  */
    320 #define	CPU_UEC_FLUSH_MAX	3
    321 
    322 /*
    323  * The CPU structure started life without a version number.  Making things more
    324  * complicated, the version number in the new struct occupies the space used for
    325  * cpu_cpuid in the non-versioned struct.  We therefore have to use somewhat
    326  * unorthodox version numbers to distinguish between the two types of struct
    327  * (pre- and post-versioning) -- version numbers that can't be mistaken for
    328  * CPUIDs.  Our version numbers, therefore, will be negative.
    329  *
    330  * For future expansion, the version member must always stay where it is.  At
    331  * some point in the future, when more structs get versions, the version member
    332  * should move into the cmd_header_t.
    333  */
    334 #define	CPU_MKVERSION(version)	((uint_t)(0 - (version)))
    335 
    336 #define	CMD_CPU_VERSION_1	CPU_MKVERSION(1)	/* -1 */
    337 #define	CMD_CPU_VERSION_2	CPU_MKVERSION(2)	/* -2 */
    338 #define	CMD_CPU_VERSION_3	CPU_MKVERSION(3)	/* -3 */
    339 #define	CMD_CPU_VERSION		CMD_CPU_VERSION_3
    340 
    341 #define	CMD_CPU_VERSIONED(cpu)	((int)(cpu)->cpu_version < 0)
    342 
    343 #define	CMD_CPU_F_DELETING	0x1
    344 
    345 typedef struct cmd_cpu_0 {
    346 	cmd_header_t cpu0_header;	/* Nodetype must be CMD_NT_CPU */
    347 	uint32_t cpu0_cpuid;		/* Logical ID for this CPU */
    348 	cmd_cpu_type_t cpu0_type;	/* CPU model */
    349 	fmd_case_t *cpu0_cases[4];	/* v0 had embedded case_t w/4 cases */
    350 	uint8_t cpu0_faulting;		/* Set if fault has been issued */
    351 	cmd_fmri_t cpu0_asru;		/* ASRU for this CPU */
    352 	cmd_fmri_t cpu0_fru;		/* FRU for this CPU */
    353 	cmd_cpu_uec_t cpu0_uec;		/* UE cache */
    354 	cmd_cpu_uec_t cpu0_olduec;	/* To-be-flushed UE cache */
    355 	id_t cpu0_uec_flush;		/* Timer ID for UE cache flush */
    356 	uint_t cpu0_uec_nflushes;	/* # of flushes since last restart/GC */
    357 	cmd_list_t cpu0_xxu_retries;	/* List of pending xxU retries */
    358 } cmd_cpu_0_t;
    359 
    360 typedef struct cmd_cpu_1 {
    361 	cmd_header_t cpu1_header;	/* Nodetype must be CMD_NT_CPU */
    362 	uint_t cpu1_version;		/* struct version - must follow hdr */
    363 	uint32_t cpu1_cpuid;		/* Logical ID for this CPU */
    364 	cmd_cpu_type_t cpu1_type;	/* CPU model */
    365 	uintptr_t *cpu1_cases;		/* v1 had a pointer to a case array */
    366 	uint8_t cpu1_faulting;		/* Set if fault has been issued */
    367 	cmd_fmri_t cpu1_asru;		/* ASRU for this CPU */
    368 	cmd_fmri_t cpu1_fru;		/* FRU for this CPU */
    369 	cmd_cpu_uec_t cpu1_uec;		/* UE cache */
    370 	cmd_cpu_uec_t cpu1_olduec;	/* To-be-flushed UE cache */
    371 	id_t cpu1_uec_flush;		/* Timer ID for UE cache flush */
    372 	uint_t cpu1_uec_nflushes;	/* # of flushes since last restart/GC */
    373 	cmd_list_t cpu1_xxu_retries;	/* List of pending xxU retries */
    374 } cmd_cpu_1_t;
    375 
    376 typedef struct cmd_cpu_2 {
    377 	cmd_header_t cpu2_header;	/* Nodetype must be CMD_NT_CPU */
    378 	uint_t cpu2_version;		/* struct version - must follow hdr */
    379 	uint32_t cpu2_cpuid;		/* Logical ID for this CPU */
    380 	cmd_cpu_type_t cpu2_type;	/* CPU model */
    381 	uint8_t cpu2_faulting;		/* Set if fault has been issued */
    382 	cmd_fmri_t cpu2_asru;		/* ASRU for this CPU */
    383 	cmd_fmri_t cpu2_fru;		/* FRU for this CPU */
    384 	cmd_cpu_uec_t cpu2_uec;		/* UE cache */
    385 	cmd_cpu_uec_t cpu2_olduec;	/* To-be-flushed UE cache */
    386 } cmd_cpu_2_t;
    387 
    388 /* Portion of the cpu structure which must be persisted */
    389 typedef struct cmd_cpu_pers {
    390 	cmd_header_t cpup_header;	/* Nodetype must be CMD_NT_CPU */
    391 	uint_t cpup_version;		/* struct version - must follow hdr */
    392 	uint32_t cpup_cpuid;		/* Logical ID for this CPU */
    393 	cmd_cpu_type_t cpup_type;	/* CPU model */
    394 	uint8_t cpup_faulting;		/* Set if fault has been issued */
    395 	uint8_t cpup_level;		/* cpu group level - 0 == thread */
    396 	cmd_fmri_t cpup_asru;		/* ASRU for this CPU */
    397 	cmd_fmri_t cpup_fru;		/* FRU for this CPU */
    398 	cmd_cpu_uec_t cpup_uec;		/* UE cache */
    399 	cmd_cpu_uec_t cpup_olduec;	/* To-be-flushed UE cache */
    400 } cmd_cpu_pers_t;
    401 
    402 /* Persistent and dynamic CPU data */
    403 struct cmd_cpu {
    404 	cmd_cpu_pers_t cpu_pers;
    405 	cmd_cpu_cases_t cpu_cases;
    406 	id_t cpu_uec_flush;		/* Timer ID for UE cache flush */
    407 	uint_t cpu_uec_nflushes;	/* # of flushes since last restart/GC */
    408 	cmd_list_t cpu_xxu_retries;	/* List of pending xxU retries */
    409 	uint_t cpu_flags;
    410 	cmd_list_t cpu_Lxcaches;	/* List of Lxcache state structures */
    411 	fmd_stat_t Lxcache_creat;	/* num of Lxcache states created */
    412 };
    413 
    414 #define	CMD_CPU_MAXSIZE \
    415 	MAX(MAX(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
    416 	    MAX(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
    417 #define	CMD_CPU_MINSIZE \
    418 	MIN(MIN(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
    419 	    MIN(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
    420 
    421 #define	cpu_header		cpu_pers.cpup_header
    422 #define	cpu_nodetype		cpu_pers.cpup_header.hdr_nodetype
    423 #define	cpu_bufname		cpu_pers.cpup_header.hdr_bufname
    424 #define	cpu_version		cpu_pers.cpup_version
    425 #define	cpu_cpuid		cpu_pers.cpup_cpuid
    426 #define	cpu_type		cpu_pers.cpup_type
    427 #define	cpu_faulting		cpu_pers.cpup_faulting
    428 #define	cpu_level		cpu_pers.cpup_level
    429 #define	cpu_asru		cpu_pers.cpup_asru
    430 #define	cpu_fru			cpu_pers.cpup_fru
    431 #define	cpu_uec			cpu_pers.cpup_uec
    432 #define	cpu_olduec		cpu_pers.cpup_olduec
    433 #define	cpu_icache		cpu_cases.cpuc_icache
    434 #define	cpu_dcache		cpu_cases.cpuc_dcache
    435 #define	cpu_pcache		cpu_cases.cpuc_pcache
    436 #define	cpu_itlb		cpu_cases.cpuc_itlb
    437 #define	cpu_dtlb		cpu_cases.cpuc_dtlb
    438 #define	cpu_l2data		cpu_cases.cpuc_l2data
    439 #define	cpu_l2tag		cpu_cases.cpuc_l2tag
    440 #define	cpu_l3data		cpu_cases.cpuc_l3data
    441 #define	cpu_l3tag		cpu_cases.cpuc_l3tag
    442 #define	cpu_fpu			cpu_cases.cpuc_fpu
    443 #define	cpu_ireg 		cpu_cases.cpuc_ireg
    444 #define	cpu_freg		cpu_cases.cpuc_freg
    445 #define	cpu_mau			cpu_cases.cpuc_mau
    446 #define	cpu_l2ctl		cpu_cases.cpuc_l2ctl
    447 #define	cpu_misc_regs		cpu_cases.cpuc_misc_regs
    448 #define	cpu_lfu			cpu_cases.cpuc_lfu
    449 #ifdef sun4u
    450 #define	cpu_opl_invsfsr		cpu_cases.cpuc_opl_invsfsr
    451 #define	cpu_oplue_detcpu	cpu_cases.cpuc_oplue_detcpu
    452 #define	cpu_oplue_detio		cpu_cases.cpuc_oplue_detio
    453 #define	cpu_opl_mtlb		cpu_cases.cpuc_opl_mtlb
    454 #define	cpu_opl_tlbp		cpu_cases.cpuc_opl_tlbp
    455 #define	cpu_opl_inv_urg		cpu_cases.cpuc_opl_inv_urg
    456 #define	cpu_opl_cre		cpu_cases.cpuc_opl_cre
    457 #define	cpu_opl_tsb_ctx		cpu_cases.cpuc_opl_tsb_ctx
    458 #define	cpu_opl_tsbp		cpu_cases.cpuc_opl_tsbp
    459 #define	cpu_opl_pstate		cpu_cases.cpuc_opl_pstate
    460 #define	cpu_opl_tstate		cpu_cases.cpuc_opl_tstate
    461 #define	cpu_opl_iug_f		cpu_cases.cpuc_opl_iug_f
    462 #define	cpu_opl_iug_r		cpu_cases.cpuc_opl_iug_r
    463 #define	cpu_opl_sdc		cpu_cases.cpuc_opl_sdc
    464 #define	cpu_opl_wdt		cpu_cases.cpuc_opl_wdt
    465 #define	cpu_opl_dtlb		cpu_cases.cpuc_opl_dtlb
    466 #define	cpu_opl_itlb		cpu_cases.cpuc_opl_itlb
    467 #define	cpu_opl_core_err	cpu_cases.cpuc_opl_core_err
    468 #define	cpu_opl_dae		cpu_cases.cpuc_opl_dae
    469 #define	cpu_opl_iae		cpu_cases.cpuc_opl_iae
    470 #define	cpu_opl_uge		cpu_cases.cpuc_opl_uge
    471 #endif	/* sun4u */
    472 
    473 #define	cpu_asru_nvl		cpu_asru.fmri_nvl
    474 #define	cpu_fru_nvl		cpu_fru.fmri_nvl
    475 
    476 /*
    477  * L2$ and L3$ Data errors
    478  *
    479  *          SERD name
    480  *   Type   (if any)   Fault
    481  *  ------ ----------- -------------------------------
    482  *   xxC   l2cachedata fault.cpu.<cputype>.l2cachedata
    483  *   xxU        -      fault.cpu.<cputype>.l2cachedata
    484  *  L3_xxC l3cachedata fault.cpu.<cputype>.l3cachedata
    485  *  L3_xxU      -      fault.cpu.<cputype>.l3cachedata
    486  *
    487  * NOTE: For the purposes of the discussion below, xxC and xxU refer to both
    488  *       L2$ and L3$ data errors.
    489  *
    490  * These ereports will be dropped if (among other things) they are side-effects
    491  * of UEs (xxUs only) or other xxCs or xxUs.  Whenever UEs are detected, they
    492  * are added to a per-CPU cache.  xxUs are then compared to this cache.  If a
    493  * xxU's AFAR refers to an address which recently saw a UE, the xxU is dropped,
    494  * as it was most likely caused by the UE.  When multiple xxCs and xxUs are seen
    495  * with the same ENA, all save one are generally side-effects.  We track these
    496  * groups (referred to as trains), matching them against a premade list.  If one
    497  * of the trains matches, we drop all but the primary, which is indicated in the
    498  * list.
    499  *
    500  * The expected resolution of l2cachedata and l3cachedata faults is the
    501  * disabling of the indicated CPU.
    502  */
    503 extern cmd_evdisp_t cmd_xxc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    504     const char *, cmd_errcl_t);
    505 extern cmd_evdisp_t cmd_xxu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    506     const char *, cmd_errcl_t);
    507 
    508 /*
    509  * As of Niagara-2, we ignore writeback (ldwc, ldwu) errors.  Since these were
    510  * the only defined follow-on errors for sun4v trains, sun4v L2 cache data
    511  * errors no longer need to use the train mechanism.
    512  */
    513 
    514 extern cmd_evdisp_t cmd_l2c(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    515     const char *, cmd_errcl_t);
    516 extern cmd_evdisp_t cmd_l2u(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    517     const char *, cmd_errcl_t);
    518 
    519 /*
    520  * Common Errdata structure for SERD engines
    521  */
    522 typedef struct errdata {
    523 	cmd_serd_t *ed_serd;
    524 	const char *ed_fltnm;
    525 	const cmd_ptrsubtype_t ed_pst;
    526 } errdata_t;
    527 
    528 /*
    529  * L2$ and L3$ Tag errors
    530  *
    531  *           SERD name
    532  *   Type    (if any)   Fault
    533  *  ------- ----------- -------------------------------
    534  *   TxCE   l2cachetag  fault.cpu.<cputype>.l2cachetag
    535  *  L3_THCE l3cachetag  fault.cpu.<cputype>.l3cachetag
    536  *    LTC   l2cachetag	fault.cpu.<cputype>.l2cachetag
    537  *
    538  * We'll never see the uncorrectable Tag errors - they'll cause the machine to
    539  * reset, and we'll be ne'er the wiser.
    540  *
    541  * The expected resolution of l2cachetag and l3cachetag faults is the disabling
    542  * of the indicated CPU.
    543  */
    544 extern cmd_evdisp_t cmd_txce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    545     const char *, cmd_errcl_t);
    546 
    547 extern cmd_evdisp_t cmd_l3_thce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    548     const char *, cmd_errcl_t);
    549 
    550 /*
    551  * L1$ errors
    552  *
    553  *          SERD name
    554  *   Type   (if any)   Fault
    555  *  ------- --------- -------------------------------
    556  *   IPE     icache   fault.cpu.<cputype>.icache
    557  *   IxSPE   icache   fault.cpu.<cputype>.icache
    558  *   DPE     dcache   fault.cpu.<cputype>.dcache
    559  *   DxSPE   dcache   fault.cpu.<cputype>.dcache
    560  *   PDSPE   pcache   fault.cpu.<cputype>.pcache
    561  *
    562  * The I$, D$, and P$ are clean, and thus have no uncorrectable errors.
    563  *
    564  * The expected resolution of icache, dcache, and pcache faults is the disabling
    565  * of the indicated CPU.
    566  */
    567 extern cmd_evdisp_t cmd_icache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    568     const char *, cmd_errcl_t);
    569 extern cmd_evdisp_t cmd_dcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    570     const char *, cmd_errcl_t);
    571 extern cmd_evdisp_t cmd_pcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    572     const char *, cmd_errcl_t);
    573 
    574 /*
    575  * TLB errors
    576  *
    577  *         SERD name
    578  *   Type  (if any)   Fault
    579  *  ------ --------- -------------------------------
    580  *  ITLBPE   itlb    fault.cpu.<cputype>.itlb
    581  *  DTLBPE   dtlb    fault.cpu.<cputype>.dtlb
    582  *
    583  * The expected resolution of itlb and dtlb faults is the disabling of the
    584  * indicated CPU.
    585  */
    586 extern cmd_evdisp_t cmd_itlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    587     const char *, cmd_errcl_t);
    588 extern cmd_evdisp_t cmd_dtlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    589     const char *, cmd_errcl_t);
    590 
    591 extern void cmd_cpuerr_close(fmd_hdl_t *, void *);
    592 
    593 /*
    594  * FPU errors
    595  *
    596  *         SERD name
    597  *   Type  (if any)   Fault
    598  *  ------ --------- -------------------------------
    599  *   FPU       -     fault.cpu.<cputype>.fpu
    600  *
    601  * The expected resolution of FPU faults is the disabling of the indicated CPU.
    602  */
    603 extern cmd_evdisp_t cmd_fpu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    604     const char *, cmd_errcl_t);
    605 
    606 
    607 
    608 /*
    609  * FPU (FP-Scrubber) errors
    610  *
    611  *         SERD name
    612  *   Type  (if any)   Fault
    613  *  ------ --------- -------------------------------
    614  *   FPU       -     fault.cpu.<cputype>.fpu
    615  *
    616  * The expected resolution of FPU faults is the disabling of the CPU
    617  * indicted in the resource FMRI.
    618  */
    619 extern cmd_evdisp_t cmd_fps(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    620     const char *, cmd_errcl_t);
    621 
    622 
    623 
    624 
    625 
    626 /*
    627  * ireg errors
    628  *
    629  *         SERD name
    630  *   Type  (if any)   Fault
    631  *  ------ --------- -------------------------------
    632  *   IRC     ireg    fault.cpu.<cputype>.ireg
    633  *   IRU      -				 "
    634  *
    635  * The expected resolution of ireg faults is the disabling of the indicated CPU.
    636  */
    637 extern cmd_evdisp_t cmd_irc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    638     const char *, cmd_errcl_t);
    639 extern cmd_evdisp_t cmd_iru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    640     const char *, cmd_errcl_t);
    641 
    642 /*
    643  * freg errors
    644  *
    645  *         SERD name
    646  *   Type  (if any)   Fault
    647  *  ------ --------- -------------------------------
    648  *   FRC     freg    fault.cpu.ultraSPARC-T1.frc
    649  *   FRU      -                           " .fru
    650  *
    651  * The expected resolution of freg faults is the repair of the indicated CPU.
    652  */
    653 extern cmd_evdisp_t cmd_frc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    654     const char *, cmd_errcl_t);
    655 extern cmd_evdisp_t cmd_fru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    656     const char *, cmd_errcl_t);
    657 
    658 /*
    659  * MAU errors
    660  *
    661  *         SERD name
    662  *   Type  (if any)   Fault
    663  *  ------ --------- -------------------------------
    664  *   MAU     mau    fault.cpu.<cputype>.mau
    665  *
    666  * The expected resolution of mau faults is the repair of the indicated CPU.
    667  */
    668 extern cmd_evdisp_t cmd_mau(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    669     const char *, cmd_errcl_t);
    670 
    671 /*
    672  * L2CTL errors
    673  *
    674  *         SERD name
    675  *   Type  (if any)   Fault
    676  *  ------ --------- -------------------------------
    677  *  L2CTL     -     fault.cpu.<cputype>.l2ctl
    678  *
    679  * The expected resolution of l2ctl faults is the repair of the indicated CPU.
    680  */
    681 extern cmd_evdisp_t cmd_l2ctl(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    682     const char *, cmd_errcl_t);
    683 
    684 /*
    685  * SBD (Store Buffer Data) errors
    686  * SCA (Scratchpad Array) errors
    687  * TC (Tick compare) errors
    688  * TSA (Trap stack Array) errors
    689  *
    690  *         SERD name
    691  *   Type  (if any)   Fault
    692  *  ------ --------- -------------------------------
    693  *   SBDC     misc_regs    fault.cpu.<cputype>.misc_regs
    694  *   SBDU
    695  *   SCAC, SCAU
    696  *   TCC, TCU
    697  *   TSAC, TSAU
    698  *
    699  * The expected resolution of misc_regs faults is the repair of
    700  * the indicated CPU.
    701  */
    702 extern cmd_evdisp_t cmd_miscregs_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    703     const char *, cmd_errcl_t);
    704 extern cmd_evdisp_t cmd_miscregs_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    705     const char *, cmd_errcl_t);
    706 
    707 extern cmd_evdisp_t cmd_miscregs_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    708     const char *, cmd_errcl_t);
    709 
    710 /*
    711  * Type                                          Fault
    712  * ---------------------------------------------------------------------
    713  * LFU-RTF   uncorrectable link retrain fail error    fault.cpu.T2plus.lfu-u
    714  * LFU-TTO   uncorrectable training timeout error
    715  * LFU-CTO   uncorrectable config timeout error
    716  * LFU-MLF   uncorrectable multi lanes link fail error
    717  * LFU-SLF   correctable single lane failover	      fault.cpu.T2plus.lfu-f
    718  *
    719  * The expected resolution of lfu faults is the repair of the indicated CPU.
    720  */
    721 extern cmd_evdisp_t cmd_lfu_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    722     const char *, cmd_errcl_t);
    723 extern cmd_evdisp_t cmd_lfu_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    724     const char *, cmd_errcl_t);
    725 /*
    726  * Type                                          Fault
    727  * ---------------------------------------------------------------------
    728  * Coherency link protocol errors
    729  * to        Transaction timed out  		fault.cpu.T2plus.lfu-p
    730  * frack     Invalid or redundant request ack
    731  * fsr       Invalid or redundant snoop response
    732  * fdr       Invalid or redundant data return
    733  * snptyp    Invalid snoop type received from
    734  *           coherency link
    735  *
    736  * The expected resolution of lfu faults is the repair of the indicated CPU.
    737  */
    738 extern cmd_evdisp_t cmd_lfu_pe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    739     const char *, cmd_errcl_t);
    740 
    741 /*
    742  * CPUs are described by FMRIs.  This routine will retrieve the CPU state
    743  * structure (creating a new one if necessary) described by the detector
    744  * FMRI in the passed ereport.
    745  */
    746 extern cmd_cpu_t *cmd_cpu_lookup_from_detector(fmd_hdl_t *, nvlist_t *,
    747     const char *, uint8_t);
    748 
    749 extern char *cmd_cpu_getfrustr(fmd_hdl_t *, cmd_cpu_t *);
    750 extern char *cmd_cpu_getpartstr(fmd_hdl_t *, cmd_cpu_t *);
    751 
    752 extern char *cmd_cpu_getserialstr(fmd_hdl_t *, cmd_cpu_t *);
    753 extern nvlist_t *cmd_cpu_mkfru(fmd_hdl_t *, char *, char *, char *);
    754 
    755 extern cmd_cpu_t *cmd_cpu_lookup(fmd_hdl_t *, nvlist_t *, const char *,
    756     uint8_t);
    757 
    758 extern void cmd_cpu_create_faultlist(fmd_hdl_t *, fmd_case_t *, cmd_cpu_t *,
    759     const char *, nvlist_t *, uint_t);
    760 
    761 extern cmd_cpu_t *cmd_restore_cpu_only(fmd_hdl_t *, fmd_case_t *, char *);
    762 extern void cmd_cpu_destroy(fmd_hdl_t *, cmd_cpu_t *);
    763 extern void *cmd_cpu_restore(fmd_hdl_t *, fmd_case_t *, cmd_case_ptr_t *);
    764 extern void cmd_cpu_validate(fmd_hdl_t *);
    765 extern void cmd_cpu_timeout(fmd_hdl_t *, id_t, void *);
    766 extern void cmd_cpu_gc(fmd_hdl_t *);
    767 extern void cmd_cpu_fini(fmd_hdl_t *hdl);
    768 extern char *cmd_cpu_serdnm_create(fmd_hdl_t *, cmd_cpu_t *, const char *);
    769 extern nvlist_t *cmd_cpu_fmri_create(uint32_t, uint8_t);
    770 
    771 extern uint32_t cmd_cpu2core(uint32_t, cmd_cpu_type_t, uint8_t);
    772 
    773 #define	CMD_CPU_LEVEL_THREAD		0
    774 #define	CMD_CPU_LEVEL_CORE		1
    775 #define	CMD_CPU_LEVEL_CHIP		2
    776 #define	CMD_CPU_STAT_BUMP(cpu, name)    cpu->name.fmds_value.ui64++
    777 
    778 typedef enum {
    779     CMD_CPU_FAM_UNSUPPORTED,
    780     CMD_CPU_FAM_CHEETAH,
    781     CMD_CPU_FAM_NIAGARA,
    782     CMD_CPU_FAM_SPARC64
    783 } cpu_family_t;
    784 
    785 typedef struct faminfo {	/* per-CPU-family information record */
    786 	cpu_family_t fam_value;		/* which cpu_family_t this describes */
    787 	boolean_t ecache_flush_needed;	/* presumably: E$ flush required -- confirm */
    788 } faminfo_t;
    789 
    790 extern cpu_family_t cmd_cpu_check_support(void);
    791 extern boolean_t cmd_cpu_ecache_support(void);
    792 
    793 extern int cmd_xr_fill(fmd_hdl_t *, nvlist_t *, cmd_xr_t *, cmd_errcl_t);
    794 extern void cmd_fill_errdata(cmd_errcl_t, cmd_cpu_t *, cmd_case_t **,
    795     const errdata_t **);
    796 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
    797 extern cmd_evdisp_t cmd_nop_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    798     const char *, cmd_errcl_t);
    799 extern cmd_errcl_t cmd_train_match(cmd_errcl_t, cmd_errcl_t);
    800 extern int cmd_afar_status_check(uint8_t, cmd_errcl_t);
    801 
    802 #ifdef sun4u
    803 extern int cmd_cpu_synd_check(uint16_t, cmd_errcl_t clcode);
    804 #else /* sun4u */
    805 extern int cmd_cpu_synd_check(uint32_t, cmd_errcl_t clcode);
    806 #endif /* sun4u */
    807 
    808 extern int cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t,
    809     uint64_t *afar);
    810 
    811 #ifdef __cplusplus
    812 }
    813 #endif
    814 
    815 #endif /* _CMD_CPU_H */
    816