Home | History | Annotate | Download | only in sys
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License, Version 1.0 only
      6  * (the "License").  You may not use this file except in compliance
      7  * with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 /*
     23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #ifndef	_SYS_ASYNC_H
     28 #define	_SYS_ASYNC_H
     29 
     30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     31 
     32 #include <sys/privregs.h>
     33 
     34 #ifdef	__cplusplus
     35 extern "C" {
     36 #endif
     37 
     38 #ifndef	_ASM
     39 
     40 #include <sys/errorq.h>
     41 
     42 /*
     43  * The async_flt structure is used to record all pertinent information about
     44  * an asynchronous CPU or bus-related memory error.  Typically, the structure
     45  * is initialized by a high-level interrupt or trap handler, and then enqueued
     46  * for later processing.  Separate queues are maintained for correctable and
     47  * uncorrectable errors.  The current CPU module determines the size of the
     48  * queue elements, so that it may declare a CPU-specific fault structure
     49  * which contains a struct async_flt as its first member.  Each async_flt also
     50  * contains a callback function (flt_func) that is invoked by the processing
     51  * code in order to actually log messages when the event is dequeued.  This
     52  * function may be called from a softint, from trap() as part of AST handling
     53  * before the victim thread returns to userland, or as part of panic().  As
     54  * such, the flt_func should basically only be calling cmn_err (but NOT with
     55  * the CE_PANIC flag).  It must not call panic(), acquire locks, or block.
     56  * The owner of the event is responsible for determining whether the event is
     57  * fatal; if so, the owner should set flt_panic and panic() after enqueuing
     58  * the event.  The event will then be dequeued and logged as part of panic
     59  * processing.  If flt_panic is not set, the queue function will schedule a
     60  * soft interrupt to process the event.
     61  */
     62 
     63 struct async_flt;	/* forward declaration for the typedef below */
     64 typedef void (*async_func_t)(struct async_flt *, char *); /* flt_func type */
     65 
     66 struct async_flt {
     67 	uint64_t	flt_id;		/* gethrtime() at time of fault */
     68 	uint64_t	flt_stat;	/* async fault status register */
     69 	uint64_t	flt_addr;	/* fault address or AFLT_INV_ADDR */
     70 	caddr_t		flt_pc;		/* program counter from error trap */
     71 	async_func_t	flt_func;	/* logging callback run at dequeue */
     72 	uint_t		flt_bus_id;	/* hardware bus id# of cpu/sbus/pci */
     73 	uint_t		flt_inst;	/* software instance of cpu/sbus/pci */
     74 	ushort_t	flt_status;	/* ECC_* error status flags */
     75 	ushort_t	flt_synd;	/* ECC syndrome */
     76 	uchar_t		flt_in_memory;	/* fault occurred in memory if != 0 */
     77 	uchar_t		flt_class;	/* fault class (see *_FAULT below) */
     78 	uchar_t		flt_prot;	/* fault protection (AFLT_PROT_*) */
     79 	uchar_t		flt_priv;	/* fault occurred in kernel if != 0 */
     80 	uchar_t		flt_panic;	/* set by owner if fault is fatal */
     81 	uchar_t		flt_tl;		/* fault occurred at TL > 0 */
     82 	uchar_t		flt_core;	/* fault occurred during core() dump */
     83 	uchar_t		flt_pad;	/* reserved for future use */
     84 	uint64_t	flt_disp;	/* error disposition information */
     85 	uint64_t	flt_payload;	/* ereport payload information */
     86 	char		*flt_erpt_class; /* ereport class string */
     87 };
     88 
     89 /*
     90  * Bus nexus drivers can use the bus_func_register() interface to register
     91  * callback functions for error handling and panic handling.  The handler
     92  * functions should be registered and unregistered from driver attach and
     93  * detach context, where it is safe to perform a sleeping allocation.  The
     94  * callbacks themselves can be invoked from panic, or from the CPU module's
     95  * asynchronous trap handler at high PIL.  As such, these routines may only
     96  * test for errors and enqueue async_flt events.  They may not grab adaptive
     97  * locks, call panic(), or invoke bus_func_register() or bus_func_unregister().
     98  * Each callback function should return one of the BF_* return status values
     99  * below.  The bus_func_invoke() function calls all the registered handlers of
    100  * the specified type, and returns the maximum of their return values (e.g.
    101  * BF_FATAL if any callback returned BF_FATAL).  If any callback returns
    102  * BF_FATAL, the system will panic at the end of callback processing.
    103  */
    104 
    105 typedef	uint_t (*busfunc_t)(void *);	/* returns a BF_* status */
    106 
    107 #define	BF_TYPE_UE		1	/* check for uncorrectable errors */
    108 #define	BF_TYPE_ERRDIS		2	/* disable error detection */
    109 #define	BF_TYPE_RESINTR		3	/* reset interrupts */
    110 
    111 #define	BF_NONE			0	/* no errors were detected */
    112 #define	BF_NONFATAL		1	/* one or more non-fatal errors found */
    113 #define	BF_FATAL		2	/* fatal error(s) found; will panic */
    114 
    115 typedef struct bus_func_desc {
    116 	int bf_type;			/* BF_TYPE_* function type */
    117 	busfunc_t bf_func;		/* callback function to call */
    118 	void *bf_arg;			/* function argument */
    119 	struct bus_func_desc *bf_next;	/* pointer to next registered desc */
    120 } bus_func_desc_t;
    121 
    122 extern void bus_func_register(int, busfunc_t, void *);
    123 extern void bus_func_unregister(int, busfunc_t, void *);
    124 extern void bus_async_log_err(struct async_flt *);
    125 extern uint_t bus_func_invoke(int);	/* max BF_* status of handlers */
    126 
    127 extern void ecc_cpu_call(struct async_flt *, char *, int);
    128 
    129 extern void ce_scrub(struct async_flt *);
    130 extern void ecc_page_zero(void *);
    131 
    132 extern void error_init(void);
    133 
    134 extern	int	ce_verbose_memory;
    135 extern	int	ce_verbose_other;
    136 extern	int	ce_show_data;
    137 extern	int	ce_debug;
    138 extern	int	ue_debug;
    139 
    140 extern	int	aft_verbose;
    141 extern	int	aft_panic;
    142 extern	int	aft_testfatal;
    143 
    144 extern struct async_flt panic_aflt;
    145 
    146 extern errorq_t *ce_queue;	/* presumably correctable errors; verify */
    147 extern errorq_t *ue_queue;	/* presumably uncorrectable errors; verify */
    148 
    149 #endif	/* !_ASM */
    150 
    151 /*
    152  * ECC or parity error status for async_flt.flt_status.
    153  */
    154 #define	ECC_C_TRAP		0x0001	/* Trap 0x63 Corrected ECC Error */
    155 #define	ECC_I_TRAP		0x0002	/* Trap 0x0A Instr Access Error */
    156 #define	ECC_ECACHE		0x0004	/* Ecache ECC Error */
    157 #define	ECC_IOBUS		0x0008	/* PCI or sysio ECC Error */
    158 #define	ECC_INTERMITTENT	0x0010	/* Intermittent ECC Error */
    159 #define	ECC_PERSISTENT		0x0020	/* Persistent ECC Error */
    160 #define	ECC_STICKY		0x0040	/* Sticky ECC Error */
    161 #define	ECC_D_TRAP		0x0080	/* Trap 0x32 Data Access Error */
    162 #define	ECC_F_TRAP		0x0100	/* Cheetah Trap 0x70 Fast ECC Error */
    163 #define	ECC_DP_TRAP		0x0200	/* Cheetah+ Trap 0x71 D$ Parity Error */
    164 #define	ECC_IP_TRAP		0x0400	/* Cheetah+ Trap 0x72 I$ Parity Error */
    165 #define	ECC_ITLB_TRAP		0x0800	/* Panther ITLB Parity Error */
    166 #define	ECC_DTLB_TRAP		0x1000	/* Panther DTLB Parity Error */
    167 #define	ECC_IO_CE		0x2000	/* PCI or sysio CE */
    168 #define	ECC_IO_UE		0x4000	/* PCI or sysio UE */
    169 
    170 /*
    171  * Trap type numbers corresponding to the fault types defined above.
    172  */
    173 #define	TRAP_TYPE_ECC_I		0x0A	/* Instr Access Error */
    174 #define	TRAP_TYPE_ECC_D		0x32	/* Data Access Error */
    175 #define	TRAP_TYPE_ECC_F		0x70	/* Cheetah Fast ECC Error */
    176 #define	TRAP_TYPE_ECC_C		0x63	/* Corrected ECC Error */
    177 #define	TRAP_TYPE_ECC_DP	0x71	/* Cheetah+ D$ Parity Error */
    178 #define	TRAP_TYPE_ECC_IP	0x72	/* Cheetah+ I$ Parity Error */
    179 #define	TRAP_TYPE_ECC_ITLB	0x08	/* Panther ITLB Parity Error */
    180 #define	TRAP_TYPE_ECC_DTLB	0x30	/* Panther DTLB Parity Error */
    181 #define	TRAP_TYPE_UNKNOWN	0
    182 
    183 /*
    184  * Fault classes for async_flt.flt_class.
    185  */
    186 #define	BUS_FAULT		0	/* originating from bus drivers */
    187 #define	CPU_FAULT		1	/* originating from CPUs */
    188 #define	RECIRC_BUS_FAULT	2	/* scheduled diagnostic */
    189 #define	RECIRC_CPU_FAULT	3	/* scheduled diagnostic */
    190 
    191 /*
    192  * Invalid or unknown physical address for async_flt.flt_addr.
    193  */
    194 #define	AFLT_INV_ADDR	(-1ULL)	/* all bits set */
    195 
    196 /*
    197  * Fault protection values for async_flt.flt_prot.  The async error handling
    198  * code may be able to recover from errors when kernel code has explicitly
    199  * protected itself using one of the mechanisms specified here.
    200  */
    201 #define	AFLT_PROT_NONE		0	/* no protection active */
    202 #define	AFLT_PROT_ACCESS	1	/* on_trap OT_DATA_ACCESS protection */
    203 #define	AFLT_PROT_EC		2	/* on_trap OT_DATA_EC protection */
    204 #define	AFLT_PROT_COPY		3	/* t_lofault protection (ucopy, etc.) */
    205 
    206 /*
    207  * These flags are used to indicate the validity of certain data based on
    208  * the various overwrite priority features of the AFSR/AFAR:
    209  * AFAR, ESYND and MSYND, each of which has a different overwrite priority.
    210  *
    211  * Given a specific afsr error bit and the entire afsr, there are three cases:
    212  *   INVALID:	The specified bit is lower overwrite priority than some other
    213  *		error bit which is on in the afsr (or IVU/IVC).
    214  *   VALID:	The specified bit is higher priority than all other error bits
    215  *		which are on in the afsr.
    216  *   AMBIGUOUS: Another error bit (or bits) of equal priority to the specified
    217  *		bit is on in the afsr.
    218  *
    219  * NB: The domain-to-SC communications depend on these values. If they are
    220  * changed, plat_ecc_unum.[ch] must be updated to match.
    221  */
    222 #define	AFLT_STAT_INVALID	0	/* higher priority afsr bit is on */
    223 #define	AFLT_STAT_VALID		1	/* this is highest priority afsr bit */
    224 #define	AFLT_STAT_AMBIGUOUS	2	/* equal-priority afsr bit(s) also on */
    225 
    226 /*
    227  * Maximum length of unum string.
    228  */
    229 #define	UNUM_NAMLEN	60
    230 
    231 /*
    232  * Maximum length of a DIMM serial id string, including the null
    233  */
    234 #define	DIMM_SERIAL_ID_LEN	16
    235 
    236 #ifdef	__cplusplus
    237 }
    238 #endif
    239 
    240 #endif	/* _SYS_ASYNC_H */
    241