Home | History | Annotate | Download | only in i86pc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma dictionary "AMD"
     28 
     29 /*
     30  * Eversholt rules for the AMD Opteron CPU/Memory
     31  */
     32 
     33 #define	MAX(x, y) ((y) > (x) ? (y) : (x))	/* larger of x and y */
     34 #define	MIN(x, y) ((y) < (x) ? (y) : (x))	/* smaller of x and y */
     35 
     36 /*
     37  * SET_ADDR and SET_OFFSET record the faulting page in the fault that we
     38  * diagnose: "asru-physaddr" is copied from the ereport's IA32_MCi_ADDR
     39  * member and "asru-offset" from its resource[0].hc-specific.offset member.
     40  */
     41 #define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
     42 
     43 #define	SET_OFFSET (setpayloadprop("asru-offset", \
     44 	payloadprop("resource[0].hc-specific.offset")))
     45 
     46 /*
     47  * RESOURCE_EXISTS is true if the ereport payload has a member named
     48  * "resource", whatever its type (e.g., nvlist or nvlist array) or value.
     49  */
     50 #define	RESOURCE_EXISTS	(payloadprop_defined("resource"))
     51 
     52 /*
     53  * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
     54  * ereports) exists and one of its members matches the path for the rank
     55  * node or for its parent dimm node.  Our memory propagations are of the form
     56  *
     57  * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
     58  *
     59  * since cpus detect memory errors;  in eversholt such a propagation, where
     60  * the lhs path and rhs path do not match, expands to the cross-product of
     61  * all dimms, ranks and cpus on the same chip (since chip appears in the
     62  * path on both sides).  We use CONTAINS_RANK to constrain the propagation
     63  * such that it only happens if the payload resource matches the rank.
     64  */
     65 #define	CONTAINS_RANK (payloadprop_contains("resource", \
     66 	asru(chip/memory-controller/dimm/rank)) \
     67 	|| payloadprop_contains("resource", \
     68 	asru(chip/memory-controller/dimm)))
     69 
     70 /*
     71  * Classify a correctable syndrome (from a mem_ce ereport) as single-bit or
     72  * multi-bit.  syndrome-type "E" always counts as single-bit.  For type "C"
     73  * (ChipKill) the syndrome is single-bit if it is zero or if exactly one bit
     74  * is set in its lowest nibble; any other "C" syndrome is multi-bit.
     75  */
     76 
     77 #define	CBITMASK(synd) ((synd) & 0xf)
     78 
     79 #define	CKSINGLE(synd)							\
     80 	((synd) == 0 ||							\
     81 	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
     82 	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))
     83 
     84 #define	SINGLE_BIT_CE							\
     85 	(payloadprop("syndrome-type") == "E" ||				\
     86 	(payloadprop("syndrome-type") == "C" &&				\
     87 	CKSINGLE(payloadprop("syndrome"))))
     88 
     89 #define	MULTI_BIT_CE							\
     90 	(payloadprop("syndrome-type") == "C" &&				\
     91 	!CKSINGLE(payloadprop("syndrome")))
     92 
     93 /*								#PAGE#
     94  *								#DIMM_SCU#
     95  * A single bit fault in a memory rank can cause:
     96  *
     97  *  - mem_ce : reported by nb
     98  *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
     99  *    ic do not record a syndrome; these errors will not be triggered in
    100  *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
    101  *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
    102  *    ECC mode (the NB corrects all ECC in that mode)
    103  *
    104  * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
    105  * trips we diagnose a fault.memory.page so that the response agent can
    106  * retire the page that caused the trip.  If the total number of pages
    107  * faulted in this way on a single rank exceeds a threshold we will
    108  * diagnose a fault.memory.dimm_sb against the containing dimm.
    109  *
    110  * Multibit ChipKill-correctable errors are treated identically to
    111  * single-bit errors, but via separate serd engines to allow distinct
    112  * parameters if desired.
    113  *
    114  * Uncorrectable errors produce an immediate page fault and corresponding
    115  * fault.memory.dimm_ue.
    116  *
    117  * Page faults are essentially internal - action is only required when
    118  * they are accompanied by a dimm fault.  As such we include message=0
    119  * on page faults.
    120  */
    121 
    122 event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};	/* no syndrome */
    123 event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
    124 event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
    125 event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};
    126 
    127 /*
    128  * Single-bit correctable errors feed into per-rank
    129  * SERD engines which diagnose fault.memory.page_sb if they trip.
    130  *
    131  * Multi-bit correctable (via ChipKill) errors feed
    132  * into additional per-rank SERD engines which diagnose fault.memory.page_ck
    133  * if they trip.
    134  *
    135  * The number of fault.memory.page_sb and fault.memory.page_ck diagnosed is
    136  * counted in stat engines for each type.  These are used in deciding
    137  * whether to declare a dimm faulty after repeated page faults.
    138  */
    139 
    140 #define PAGE_SB_COUNT		2	/* SERD N: page_sb and dimm_sb */
    141 #define PAGE_SB_TIME		72h	/* SERD T: page_sb and dimm_sb */
    142 #define	PAGE_CK_COUNT		2	/* SERD N: page_ck and dimm_ck */
    143 #define	PAGE_CK_TIME		72h	/* SERD T: page_ck and dimm_ck */
    144 
    145 engine stat.sbpgflt@chip/memory-controller/dimm/rank;	/* page_sb faults */
    146 engine stat.ckpgflt@chip/memory-controller/dimm/rank;	/* page_ck faults */
    147 engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    148     N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
    149 engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    150     N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
    151 engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    152     N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
    153 engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    154     N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
    155 event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    156     count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    157     engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
    158 event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    159     count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    160     engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
    161 event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    162     engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
    163 event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    164     engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;
    165 
    166 /*
    167  * The fraction of pages on a single rank that must be diagnosed as faulty
    168  * with single correctable unit faults before we will fault the rank.
    169  * Once we have faulted the rank we will continue to diagnose any further page
    170  * faults on the rank up to some maximum multiple of the threshold at which
    171  * we faulted the dimm.  This allows us to potentially contain some fairly
    172  * far-reaching but still limited-extent fault (such as a partial column
    173  * failure) without getting carried away and allowing a single faulty rank to
    174  * use up the entire system-imposed page retirement limit (which, once
    175  * reached, causes retirement requests to have no effect other than to fill
    176  * the fault manager cache and logs).
    177  *
    178  * This fraction is specified in basis points, where 100 basis points are
    179  * equivalent to 1 percent.  It is applied on a per-rank basis.
    180  *
    181  * The system imposes an absolute maximum on the number of pages it will
    182  * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
    183  * that 'physmem' is reduced from installed memory pages by an amount
    184  * reflecting permanent kernel memory allocations.  This system page retire
    185  * limit bounds the maximum real response to page faults across all ranks
    186  * that fault manager response agents can effect, but it should not be confused
    187  * with any diagnosis threshold (i.e., the number of faulty pages we are
    188  * prepared to tolerate from a single rank before faulting the rank is
    189  * distinct from the total number of pages we are prepared to retire from use
    190  * in response to that and other faults).  It is, however, desirable to
    191  * arrange that the maximum number of pages we are prepared to fault from
    192  * any one rank is less than the system-wide quota.
    193  */
    194 #define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB */
    195 
    196 /*
    197  * A macro to manipulate the above fraction.  Given a size in bytes convert
    198  * this to 4K pages and calculate the number of those pages indicated by
    199  * PAGE_RETIRE_LIMIT_BPS basis points (e.g., 1GB is 262144 pages, so ~131).
    200  */
    201 #define	_BPS_PGCNT(totalbytes) \
    202 	((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)
    203 
    204 /*
    205  * The single-correctable-unit threshold: the number of faulted pages on a
    206  * rank at which we fault the rank, derived from the rank's "size" confprop
    207  * and clamped to be at least 128 and never more than 512.
    208  */
    209 #define	RANK_THRESH MIN(512, MAX(128, \
    210 	_BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))
    211 
    212 /*
    213  * The maximum number of single-correctable-unit page faults we will diagnose
    214  * on a single rank (must be greater than RANK_THRESH).  We set
    215  * this at twice the rank fault threshold.
    216  */
    217 #define	RANK_PGFLT_MAX (2 * RANK_THRESH)
    218 /* Per-rank counts of page_sb and page_ck faults diagnosed so far. */
    219 #define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
    220 #define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))
    221 
    222 /*
    223  * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
    224  * page faults (diagnosed from repeated single-bit or multibit-chipkills)
    225  * from any one rank on that DIMM reaches a threshold.  A "correctable unit"
    226  * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
    227  * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
    228  *
    229  * We do not stop diagnosing further single-bit page faults once we have
    230  * declared a single-bit DIMM fault - we continue diagnosing them and
    231  * response agents can continue to retire those pages up to the system-imposed
    232  * retirement limit.
    233  *
    234  * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
    235  * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
    236  * have reached the threshold for a majority of single-bit page faults or
    237  * multibit page faults.
    238  *
    239  * Implementation: we maintain parallel SERD engines to the page_sb and
    240  * page_ck engines, which trip in unison.  On trip it generates a distinct
    241  * ereport which we diagnose to a fault if the threshold has been reached.
    242  */
    243 prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    244     { CONTAINS_RANK && SINGLE_BIT_CE &&
    245       SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    246     ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    247     ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    248     ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
    249 /* Correctable errors that are not single-bit feed fault.memory.page_ck. */
    250 prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    251     { CONTAINS_RANK && !SINGLE_BIT_CE &&
    252       SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    253     ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    254     ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    255     ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
    256 /* Rank faults: total page faults > RANK_THRESH and this type > half of it. */
    257 prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    258     { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    259       SB_PGFLTS > RANK_THRESH / 2 } (1)->
    260     ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    261     ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    262     ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
    263 /* As for dimm_sb but counting page_ck faults; only nb.mem_ce feeds it. */
    264 prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    265     { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    266       CK_PGFLTS > RANK_THRESH / 2 } (1)->
    267     ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
    268 
    269 /*
    270  * If the address is not valid then no resource member will be included
    271  * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
    272  * We will also discard all inf_sys_ecc1 events detected at the ic since they
    273  * have no syndrome and therefore no resource information.
    274  * We will discard such ereports.  An alternative may be to SERD them
    275  * on a per MC basis and trip if we see too many such events.
    276  */
    277 event upset.memory.discard1@chip/core/strand;	/* CE ereports with no resource */
    278 prop upset.memory.discard1@chip/core/strand
    279     { !RESOURCE_EXISTS } (1)->
    280     ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
    281     ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
    282     ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
    283     ereport.cpu.amd.nb.mem_ce@chip/core/strand;
    284 
    285 /* 								#DIMM_UE#
    286  *								#PAGE_UE#
    287  * An uncorrectable multi-bit fault in a memory dimm can cause:
    288  *
    289  *  - mem_ue    	   : reported by nb for an access from a remote cpu
    290  *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
    291  *  - s_eccm	   : reported by bu
    292  *
    293  * Since on production systems we force HT Sync Flood on uncorrectable
    294  * memory errors (if not already set as such by the BIOS, as it should be)
    295  * we won't actually receive these ereports since the system will be reset.
    296  */
    297 
    298 event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
    299 event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
    300 event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
    301 event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};
    302 /* Neither UE fault names a SERD engine. */
    303 event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
    304 event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    305     response=0;
    306 
    307 prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    308     { CONTAINS_RANK } (1)->
    309     ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    310     ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    311     ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    312     ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;
    313 
    314 prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    315     { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    316     ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    317     ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    318     ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    319     ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;
    320 /* UE ereports with no "resource" member go to upset.memory.discard3. */
    321 event upset.memory.discard3@chip/core/strand;
    322 prop upset.memory.discard3@chip/core/strand
    323     { !RESOURCE_EXISTS } (1)->
    324     ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
    325     ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
    326     ereport.cpu.amd.bu.s_eccm@chip/core/strand,
    327     ereport.cpu.amd.nb.mem_ue@chip/core/strand;
    328 
    329 /*								#CSTESTFAIL#
    330  * If the BIOS fails a chip-select during POST, or perhaps after a
    331  * sync flood from an uncorrectable error, then on revision F and G it
    332  * should mark that chip-select as TestFail in the CS Base register.
    333  * When the memory-controller driver discovers all the MC configuration
    334  * it notes such failed chip-selects and creates topology nodes for the
    335  * chip-select and associated dimms and ranks, and produces an ereport for each
    336  * failed chip-select with detector set to the memory-controller node
    337  * and resource indicating the failed chip-select.
    338  */
    339 
    340 event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
    341 event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
    342 event error.memory.cs_testfail@chip/memory-controller/chip-select;
    343 /* True if the ereport "resource" names this chip-select node. */
    344 #define	CONTAINS_CS (payloadprop_contains("resource", \
    345 	asru(chip/memory-controller/chip-select)))
    346 
    347 prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    348     ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    349     { CONTAINS_CS };
    350 /* True if chip-select property s is defined and equals the rank "csname". */
    351 #define CSMATCH(s) \
    352 	(confprop_defined(chip/memory-controller/chip-select, s) && \
    353 	confprop(chip/memory-controller/chip-select, s) == \
    354 	confprop(chip/memory-controller/dimm/rank, "csname"))
    355 
    356 prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)->
    357     error.memory.cs_testfail@chip/memory-controller/chip-select
    358     { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")};
    359 
    360 /*								#ADDRPAR#
    361  * DRAM Command/Address Parity Errors.
    362  *
    363  *  - dramaddr_par : reported by the nb; the NB status register includes
    364  *    a bit indicating which dram controller channel (A or B) experienced
    365  *    the error.
    366  */
    367 
    368 event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
    369 event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel,
    370     response=0;
    371 prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    372     ereport.cpu.amd.nb.dramaddr_par@chip/core/strand
    373     { (((payloadprop("IA32_MCi_STATUS") >> 32) & 0x200) ? 1 : 0) == y };
    374 
    375 /* 								#L2D_SINGLE#
    376  * A single bit data array fault in an l2 cache can cause:
    377  *
    378  *  - inf_l2_ecc1 : reported by ic on this cpu
    379  *  - inf_l2_ecc1 : reported by dc on this cpu
    380  *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
    381  */
    382 
    383 #define L2CACHEDATA_SB_COUNT	3
    384 #define L2CACHEDATA_SB_TIME	12h
    385 
    386 event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
    387 event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
    388 event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};
    389 engine serd.cpu.amd.l2d_sb@chip/core/strand,
    390     N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
    391 event fault.cpu.amd.l2cachedata@chip/core/strand, engine=serd.cpu.amd.l2d_sb@chip/core/strand;
    392 /* All three single-bit ereports feed the serd.cpu.amd.l2d_sb engine. */
    393 prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
    394     ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
    395     ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
    396     ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;
    397 
    398 /* 								#L2D_MULTI#
    399  * A multi-bit data array fault in an l2 cache can cause:
    400  *
    401  *  - inf_l2_eccm : reported by ic on this cpu
    402  *  - inf_l2_eccm : reported by dc on this cpu
    403  *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
    404  */
    405 
    406 event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
    407 event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
    408 event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};
    409 /* SERD increment of N + 1, so one multi-bit ereport should trip l2d_sb. */
    410 prop fault.cpu.amd.l2cachedata@chip/core/strand
    411     { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    412     ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
    413     ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
    414     ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;
    415 
    416 /* 								#L2T_SINGLE#
    417  * A single bit tag array fault in an l2 cache can cause:
    418  *
    419  *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
    420  *  - l2t_par : reported by bu on this cpu when detected other than during snoop
    421  */
    422 
    423 #define L2CACHETAG_SB_COUNT	3
    424 #define L2CACHETAG_SB_TIME	12h
    425 
    426 event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
    427 event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
    428 engine serd.cpu.amd.l2t_sb@chip/core/strand,
    429     N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
    430 event fault.cpu.amd.l2cachetag@chip/core/strand, engine=serd.cpu.amd.l2t_sb@chip/core/strand;
    431 /* l2t_par is declared here but diagnosed by the setserdincrement rule. */
    432 prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
    433     ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;
    434 
    435 /* 								#L2T_MULTI#
    436  * A multi-bit tag array fault in an l2 cache can cause:
    437  *
    438  *  - l2t_eccm : reported by bu on this cpu when detected during snoop
    439  *  - l2t_par : reported by bu on this cpu when detected other than during snoop
    440  */
    441 
    442 event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};
    443 /* SERD increment of N + 1, so one of these ereports should trip l2t_sb. */
    444 prop fault.cpu.amd.l2cachetag@chip/core/strand
    445     { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    446     ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
    447     ereport.cpu.amd.bu.l2t_par@chip/core/strand;
    448 
    449 /* 								#ICD_PAR#
    450  * A data array parity fault in an I cache can cause:
    451  *
    452  *  - data_par : reported by ic on this cpu
    453  */
    454 
    455 #define ICACHEDATA_SB_COUNT	2
    456 #define ICACHEDATA_SB_TIME	168h
    457 
    458 event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
    459 engine serd.cpu.amd.icachedata@chip/core/strand,
    460     N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
    461 event fault.cpu.amd.icachedata@chip/core/strand,
    462     engine=serd.cpu.amd.icachedata@chip/core/strand;
    463 /* data_par ereports feed serd.cpu.amd.icachedata (N=2, T=168h). */
    464 prop fault.cpu.amd.icachedata@chip/core/strand (0)->
    465     ereport.cpu.amd.ic.data_par@chip/core/strand;
    466 
    467 /* 								#ICT_PAR#
    468  * A tag array parity fault in an I cache can cause:
    469  *
    470  *  - tag_par : reported by ic on this cpu
    471  */
    472 
    473 #define ICACHETAG_SB_COUNT	2
    474 #define ICACHETAG_SB_TIME	168h
    475 
    476 event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
    477 engine serd.cpu.amd.icachetag@chip/core/strand,
    478     N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
    479 event fault.cpu.amd.icachetag@chip/core/strand, engine=serd.cpu.amd.icachetag@chip/core/strand;
    480 /* tag_par ereports feed serd.cpu.amd.icachetag (N=2, T=168h). */
    481 prop fault.cpu.amd.icachetag@chip/core/strand (0)->
    482     ereport.cpu.amd.ic.tag_par@chip/core/strand;
    483 
    484 /* 								#ICT_SNOOP#
    485  * A snoop tag array parity fault in an I cache can cause:
    486  *
    487  *  - stag_par : reported by ic on this cpu
    488  */
    489 
    490 event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
    491 event fault.cpu.amd.icachestag@chip/core/strand;
    492 /* No SERD engine is attached to this fault. */
    493 prop fault.cpu.amd.icachestag@chip/core/strand (1)->
    494     ereport.cpu.amd.ic.stag_par@chip/core/strand;
    495 
    496 /* 								#ICTLB_1#
    497  * An l1tlb parity fault in an I cache can cause:
    498  *
    499  *  - l1tlb_par : reported by ic on this cpu
    500  */
    501 
    502 #define ICACHEL1TLB_SB_COUNT	2
    503 #define ICACHEL1TLB_SB_TIME	168h
    504 
    505 event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
    506 engine serd.cpu.amd.l1itlb@chip/core/strand,
    507     N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
    508 event fault.cpu.amd.l1itlb@chip/core/strand, engine=serd.cpu.amd.l1itlb@chip/core/strand;
    509 /* l1tlb_par ereports feed serd.cpu.amd.l1itlb (N=2, T=168h). */
    510 prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
    511     ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;
    512 
    513 /* 								#ICTLB_2#
    514  * An l2tlb parity fault in an I cache can cause:
    515  *
    516  *  - l2tlb_par : reported by ic on this cpu
    517  */
    518 
    519 #define ICACHEL2TLB_SB_COUNT	2
    520 #define ICACHEL2TLB_SB_TIME	168h
    521 
    522 event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
    523 engine serd.cpu.amd.l2itlb@chip/core/strand,
    524     N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
    525 event fault.cpu.amd.l2itlb@chip/core/strand, engine=serd.cpu.amd.l2itlb@chip/core/strand;
    526 /* l2tlb_par ereports feed serd.cpu.amd.l2itlb (N=2, T=168h). */
    527 prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
    528     ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;
    529 
    530 /* 								#DCD_SINGLE#
    531  * A single bit data array fault in an D cache can cause:
    532  *
    533  *  - data_ecc1 : reported by dc on this cpu by scrubber
    534  *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
    535  *
    536  * Make data_ecc1_uc fault immediately as it may have caused a panic, so
    537  * it is handled by the multi-bit case in the following section.
    538  */
    539 
    540 #define DCACHEDATA_SB_COUNT	2
    541 #define DCACHEDATA_SB_TIME	168h
    542 
    543 event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
    544 event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
    545 engine serd.cpu.amd.dc_sb@chip/core/strand,
    546     N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
    547 event fault.cpu.amd.dcachedata@chip/core/strand, engine=serd.cpu.amd.dc_sb@chip/core/strand;
    548 /* data_ecc1_uc is declared here but diagnosed with data_eccm below. */
    549 prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
    550     ereport.cpu.amd.dc.data_ecc1@chip/core/strand;
    551 
    552 /* 								#DCD_MULTI#
    553  * A multi-bit data array fault in an D cache can cause:
    554  *
    555  *  - data_eccm : reported by dc on this cpu
    556  */
    557 
    558 event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};
    559 /* Increment dc_sb by its own N + 1 so that one such ereport trips it. */
    560 prop fault.cpu.amd.dcachedata@chip/core/strand
    561     { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    562     ereport.cpu.amd.dc.data_eccm@chip/core/strand,
    563     ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;
    564 
    565 /* 								#DCT_PAR#
    566  * A tag array parity fault in an D cache can cause:
    567  *
    568  *  - tag_par : reported by dc on this cpu
    569  */
    570 
    571 event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
    572 event fault.cpu.amd.dcachetag@chip/core/strand;
    573 /* No SERD engine is attached to this fault. */
    574 prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
    575     ereport.cpu.amd.dc.tag_par@chip/core/strand;
    576 
    577 /* 								#DCT_SNOOP#
    578  * A snoop tag array parity fault in an D cache can cause:
    579  *
    580  *  - stag_par : reported by dc on this cpu
    581  */
    582 
    583 event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};
    584 event fault.cpu.amd.dcachestag@chip/core/strand;
    585 /* No SERD engine is attached to this fault. */
    586 prop fault.cpu.amd.dcachestag@chip/core/strand (1)->
    587     ereport.cpu.amd.dc.stag_par@chip/core/strand;
    588 
    589 /* 								#DCTLB_1#
    590  * An l1tlb parity fault in an D cache can cause:
    591  *
    592  *  - l1tlb_par : reported by dc on this cpu
    593  */
    594 
    595 event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};
    596 event fault.cpu.amd.l1dtlb@chip/core/strand;
    597 /* No SERD engine is attached to this fault. */
    598 prop fault.cpu.amd.l1dtlb@chip/core/strand (1)->
    599     ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;
    600 
    601 /* 								#DCTLB_2#
    602  * An l2tlb parity fault in an D cache can cause:
    603  *
    604  *  - l2tlb_par : reported by dc on this cpu
    605  */
    606 
    607 event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};
    608 event fault.cpu.amd.l2dtlb@chip/core/strand;
    609 /* No SERD engine is attached to this fault. */
    610 prop fault.cpu.amd.l2dtlb@chip/core/strand (1)->
    611     ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;
    612 
    613 /*								#MISC#
    614  * Ereports that should not normally happen and which we will discard
    615  * without diagnosis if they do.  These fall into a few categories:
    616  *
    617  *	- the corresponding detector is not enabled, typically because
    618  *	  detection/handling of the event is taking place elsewhere
    619  *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
    620  *	- the event is associated with a sync flood so even if the detector is
    621  *	  enabled we will never handle the event and generate an ereport *and*
    622  *	  even if the ereport did arrive we could perform no useful diagnosis
    623  *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
    624  *	  but we don't choose to discard that ereport here since we could have
    625  *	  made a useful diagnosis from it had it been delivered
    626  *	  (nb.ht_sync, nb.ht_crc)
    627  *	- events that will be accompanied by an immediate panic and
    628  *	  delivery of the ereport during subsequent reboot but from
    629  *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
    630  *
    631  * Ereports for all of these can be generated by error simulation and
    632  * injection.  We will perform a null diagnosis of all these ereports in order
    633  * to avoid "no subscription" complaints during test harness runs.
    634  */
    635 
    636 event ereport.cpu.amd.nb.ma@strand{within(5s)};
    637 event ereport.cpu.amd.nb.ta@strand{within(5s)};
    638 event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
    639 event ereport.cpu.amd.ic.rdde@strand{within(5s)};
    640 event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
    641 event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};
    642 event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
    643 event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};
    644 event ereport.cpu.amd.nb.rmw@strand{within(5s)};
    645 event ereport.cpu.amd.nb.wdog@strand{within(5s)};
    646 event ereport.cpu.amd.unknown@strand{within(5s)};
    647 /* Single upset that subscribes to every ereport declared above. */
    648 event upset.null_diag@strand;
    649 
    650 prop upset.null_diag@strand (1)->
    651     ereport.cpu.amd.nb.ma@strand,
    652     ereport.cpu.amd.nb.ta@strand,
    653     ereport.cpu.amd.ls.s_rde@strand,
    654     ereport.cpu.amd.ic.rdde@strand,
    655     ereport.cpu.amd.bu.s_rde@strand,
    656     ereport.cpu.amd.nb.gart_walk@strand,
    657     ereport.cpu.amd.nb.ht_sync@strand,
    658     ereport.cpu.amd.nb.ht_crc@strand,
    659     ereport.cpu.amd.nb.rmw@strand,
    660     ereport.cpu.amd.nb.wdog@strand,
    661     ereport.cpu.amd.unknown@strand;
    662