Home | History | Annotate | Download | only in i86pc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
     29  * in AMD family 0xf and 0x10.
     30  *
     31  * In the absence of any model-specific support, any memory errors that
     32  * are observed via MCA (typically through an on-chip memory-controller)
     33  * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
     34  * ereports and are diagnosed via generic rules in gcpu.esc.
     35  *
     36  * If full model-specific support is available, including full NorthBridge
     37  * support, then memory ereports will surface in a more-specific subclass
     38  * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
     39  *
     40  * In the case where some "vendor generic" support is present, memory errors
     41  * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
     42  * syndrome and syndrome-type, and usually also a resource FMRI to identify
     43  * the affected resource.  In the AMD case a resource FMRI is included for
     44  * those chip versions that include an Online Spare Control register; this
     45  * register provides counts of ECC errors seen per channel and chip-select
     46  * on a NorthBridge node.  The resource FMRI has form
     47  * 	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
     48  * in these cases.
     49  */
     50 
     51 #pragma dictionary "GMCA"
     52 
      53 /*
      54  * The number of pages that must be faulted on a chip-select for repeated
      55  * correctable errors before we will consider one of the component dimms
      56  * faulty.
          * CS_DIMMSB_THRESH_REACHED tests the per-chip-select stat.cepgflt count
          * against this value using ">=".
      57  */
      58 #define	CS_DIMMSB_THRESH	64
     59 
      60 /*
      61  * The maximum number of pages we will diagnose as faulty on any one
      62  * chip-select (must be at least CS_DIMMSB_THRESH).  If a chip-select
      63  * has a fault that will affect zillions of pages this limit stops us
      64  * diagnosing excessive numbers of page faults.
          * CS_PGFLT_LIMIT_REACHED is true once the stat.cepgflt count exceeds it.
      65  */
      66 #define	CS_PAGEFLT_MAX		(2 * CS_DIMMSB_THRESH)
     67 
      68 /*
      69  * SERD parameters for individual page faults.  When more than PAGE_SB_COUNT
      70  * correctable ereports are experienced on a single chip-select within
      71  * PAGE_SB_TIME the engine will fire and we will fault the most recent
      72  * page.  The same N and T values are also given to the dimm_ce SERD engine.
      73  */
      74 #define	PAGE_SB_COUNT		3
      75 #define	PAGE_SB_TIME		24h
      76 
         /*
          * CSPATH is the path of a chip-select beneath its chip, memory-controller
          * and dram-channel.  The engines, memory fault events and the discard3
          * upset below are all declared at this path.
          */
      77 #define	CSPATH	chip/memory-controller/dram-channel/chip-select
     78 
      79 /*
      80  * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
          * Every memory fault propagation below requires it; ereports without an
          * address match the discard upsets instead, since the corresponding
          * page cannot be faulted.
      81  */
      82 #define	ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))
     83 
      84 /*
      85  * CONTAINS_CS is true if the resource nvlist array exists and one of its
      86  * members matches the chip-select path.  This is used to constrain
      87  * propagations to those for which a resource element matches the
      88  * chip-select path of the propagation.  This is necessary because the
      89  * detector element of memory ereports is a cpu and not the chip-select itself.
      90  */
      91 #define	CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))
      92 
         /*
          * SET_ADDR passes the IA32_MCi_ADDR payload value to setpayloadprop()
          * under the name "asru-physaddr".  It is the final term of the page_ce
          * and page_ue constraints, placed after ADDR_VALID.
          * NOTE(review): presumably the property is attached to the resulting
          * fault event - confirm against the Eversholt reference.
          */
      93 #define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
      94 /*
          * Generic memory ereports: mem_ce for correctable and mem_ue for
          * uncorrectable errors.  Both are declared at chip/core/strand (the
          * detector is a cpu, not the chip-select) with a within(1s) constraint.
          */
      95 event ereport.cpu.generic-x86.mem_ce@chip/core/strand { within(1s) };
      96 event ereport.cpu.generic-x86.mem_ue@chip/core/strand { within(1s) };
     97 
     98 /*
      99  *	 ========= Propagations for correctable memory faults ==========
    100  *	|								|
    101  *	| Discard mem_ce with no resource in the ereport payload.	|
    102  *	| Discard mem_ce with no address info - we can't fault the	|
    103  *	| corresponding page without it.				|
    104  *	|								|
    105  *	| For a mem_ce ereport detected by a given chip/cpu (as per	|
    106  *	| the payload detector info) whose resource payload member	|
    107  *	| includes a chip/memory-controller/dram-channel/chip-select	|
     108  *	| (CSPATH) for the same chip number, diagnose to a fault event	|
    109  *	| associated with a per-CSPATH SERD engine as long as we are	|
    110  *	| below the page fault limit for this CSPATH (defined below);	|
    111  *	| if we are over that limit then discard the event since we	|
    112  *	| will already have faulted a dimm and there is no point in	|
    113  *	| continuing to diagnose endless page faults from a dimm with	|
    114  *	| something like a pin failure.					|
    115  *	|								|
    116  *	| When the per-CSPATH SERD engine fires we fault the page	|
    117  *	| containing the address included in the ereport that caused	|
    118  *	| the trip, and increment a per-CSPATH counter to count page	|
    119  *	| faults on that chip-select from repeated correctable errors.	|
    120  *	|								|
    121  *	| A dimm_ce fault is diagnosed when we have faulted an		|
     122  *	| excessive number of page_ce faults on a chip-select - at	|
     123  *	| least CS_DIMMSB_THRESH.					|
    124  *	|===============================================================|
    125  */
     126 
         /*
          * Both predicates test count(stat.cepgflt@CSPATH), which is incremented
          * on each page fault diagnosis for the chip-select:
          *   CS_PGFLT_LIMIT_REACHED   - count is greater than CS_PAGEFLT_MAX
          *   CS_DIMMSB_THRESH_REACHED - count is at least CS_DIMMSB_THRESH
          */
     127 #define	CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
     128 #define	CS_DIMMSB_THRESH_REACHED \
     129 	(count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)
     130 
         /*
          * Per-chip-select engines and fault events for correctable errors.
          * stat.cepgflt counts page fault diagnoses.  page_ce and dimm_ce each
          * have their own SERD engine, both using N=PAGE_SB_COUNT and
          * T=PAGE_SB_TIME.  Only page_ce sets message=0/response=0 and
          * increments the counter.
          */
     131 engine stat.cepgflt@CSPATH;
     132 engine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
     133 event fault.memory.generic-x86.page_ce@CSPATH,
     134     message=0, response=0,		/* do not message individual pageflts */
     135     count=stat.cepgflt@CSPATH,		/* increment on pageflt diagnosis */
     136     engine=serd.memory.generic-x86.page_ce@CSPATH;
     137 engine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
     138 event fault.memory.generic-x86.dimm_ce@CSPATH,
     139     engine=serd.memory.generic-x86.dimm_ce@CSPATH;
     140 
         /*
          * page_ce on a chip-select propagates to a mem_ce ereport from the same
          * chip when the ereport carries an address, its resource list contains
          * this chip-select, and the chip-select is not over its page fault
          * limit.  SET_ADDR, the last term, supplies "asru-physaddr".
          */
     141 prop fault.memory.generic-x86.page_ce@CSPATH
     142     { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)->
     143     ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;
     144 
         /*
          * dimm_ce on a chip-select propagates to a mem_ce ereport from the same
          * chip only once CS_DIMMSB_THRESH_REACHED holds for that chip-select.
          * SET_ADDR is not part of this constraint.
          */
     145 prop fault.memory.generic-x86.dimm_ce@CSPATH
     146     { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)->
     147     ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;
     148 
         /*
          * Discard mem_ce ereports that have no "resource" payload member or no
          * IA32_MCi_ADDR.  The upset is declared on the detecting
          * chip/core/strand.
          */
     149 event upset.memory.generic-x86.discard@chip/core/strand;
     150 prop upset.memory.generic-x86.discard@chip/core/strand
     151     { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
     152     ereport.cpu.generic-x86.mem_ce@chip/core/strand;
    153 
    154 /*
     155  *	 ========= Propagations for uncorrectable page faults ==========
     156  *	|								|
     157  *	| A UE produces an immediate page fault.			|
    158  *	|===============================================================|
    159  */
     160 
         /*
          * Fault events for uncorrectable errors; neither names a SERD engine.
          * NOTE(review): page_ue increments stat.cepgflt, the same counter that
          * CS_PGFLT_LIMIT_REACHED and CS_DIMMSB_THRESH_REACHED test for the
          * correctable-error rules - confirm this sharing is intended.
          */
     161 event fault.memory.generic-x86.page_ue@CSPATH,
     162     message=0, response=0,		/* do not message individual pageflts */
     163     count=stat.cepgflt@CSPATH;		/* increment on pageflt diagnosis */
     164 event fault.memory.generic-x86.dimm_ue@CSPATH;
     165 
         /*
          * page_ue on a chip-select propagates to a mem_ue ereport from the same
          * chip when the ereport carries an address and its resource list
          * contains this chip-select.  SET_ADDR supplies "asru-physaddr".
          * Unlike page_ce there is no page fault limit term.
          */
     166 prop fault.memory.generic-x86.page_ue@CSPATH
     167     { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)->
     168     ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;
     169 
         /*
          * dimm_ue propagates to the same mem_ue ereports as page_ue, under the
          * same address and chip-select constraints but without SET_ADDR and
          * without any threshold term.
          */
     170 prop fault.memory.generic-x86.dimm_ue@CSPATH
     171     { ADDR_VALID && CONTAINS_CS } (1)->
     172     ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;
     173 
         /*
          * Discard mem_ue ereports that have no "resource" payload member or no
          * IA32_MCi_ADDR.  Unlike the mem_ce discard upset, this one is declared
          * at CSPATH rather than at chip/core/strand.
          */
     174 event upset.memory.generic-x86.discard3@CSPATH;
     175 prop upset.memory.generic-x86.discard3@CSPATH
     176     { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
     177     ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;
    178 
    179 /*
     180  *	 ========= Propagations for GART Table Walk Errors =============
    181  *	|								|
    182  *	| These are usually due to software mis-programming of the GART	|
    183  *	| TLB rather than from hardware errors.  It would be incorrect	|
    184  *	| to fault and potentially offline a cpu in response to these	|
    185  *	| so they have their own fault class to facilitate us ignoring	|
    186  *	| them.								|
    187  *	|===============================================================|
    188  */
     189 
         /*
          * gart_tbl_walk ereports propagate unconditionally from an upset on the
          * same chip/core/strand; no fault event is declared for them here.
          */
     190 event ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand { within(1s) };
     191 event upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand;
     192 
     193 prop upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand (1)->
     194     ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand;
    195