Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma dictionary "DISK"
     27 /* P abbreviates the "disk" node name used after '@' in the rules below. */
     28 #define	P			disk
     29 /* Declare the disk as both a FRU and an ASRU. */
     30 fru P;
     31 asru P;
     32 
     33 /*
     34  * Overall comments for this file:
     35  * <disk-as-detector> The disk-as-detector DE provides the mapping between
     36  * ereports generated by a kernel disk driver sd(7D) and resulting faults.
     37  */
     38 
     39 /*
     40  * SERD engine for media error fault propagation:
     41  *
     42  * This strategy is designed to give a file system, like ZFS, the
     43  * ability to attempt data recovery/relocation without faulting a disk.
     44  * This implementation depends on a file system retry to the same LBA
     45  * to trigger a fault when recovery/relocation is not possible.
     46  *
     47  * We let the engine propagate at most one error per minute; if we then still
     48  * get 2 or more errors within 24 hours (N=1, T=24h) for the same LBA, there
     49  * is a fault. */
     50 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
     51 
     52 /*
     53  * disk-as-detector: fault events.  Only merr is tied to a SERD engine.
     54  */
     55 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
     56 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
     57     engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
     58 
     59 /*
     60  * The uderr fault will be defined at some future time.
     61  * event fault.io.scsi.cmd.disk.dev.uderr@P;
     62  */
     63 
     64 /*
     65  * disk-as-detector: upset events.
     66  * NOTE: For now, discarding an ereport is implemented by defining an upset.
     67  */
     68 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
     69 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
     70 event upset.io.scsi.cmd.disk.dev.uderr@P;
     71 event upset.io.scsi.cmd.disk.dev.serr@P;
     72 event upset.io.scsi.cmd.disk.tran@P;
     73 event upset.io.scsi.cmd.disk.recovered@P;
     74 
     75 /*
     76  * disk-as-detector: ereports from the kernel.
     77  *
     78  * We don't know the topology for all SCSI disks, but the kernel will always
     79  * generate ereport telemetry assuming that we do. We define these ereports
     80  * with 'discard_if_config_unknown=1', which permits ereports against things
     81  * with unknown topology to be silently discarded.  The ereport data is logged
     82  * in either case, and can be viewed via 'fmdump -eV'.
     83  */
     84 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
     85 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
     86 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
     87 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
     88 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
     89 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
     90 
     91 /*
     92  * For some ereports the 'driver-assessment' in the ereport payload decides
     93  * fault vs. upset via propagation constraints ("fatal" => fault, else upset).
     94  */
     95 #define DRIVER_ASSESSMENT_FATAL		\
     96 	    (payloadprop_contains("driver-assessment", "fatal"))
     97 #define DRIVER_ASSESSMENT_NONFATAL	(!DRIVER_ASSESSMENT_FATAL)
     98 
     99 /*
    100  * disk-as-detector: propagations from faults (taken only when
    101  * DRIVER_ASSESSMENT_FATAL holds).
    102  * Fault details are copied from the ereport payload into the fault payload.
    103  * Members that may be needed (NOTE: op_code is listed but not yet copied):
    104  * fault.io.scsi.cmd.disk.dev.rqs.derr
    105  *     op_code, key, asc, ascq
    106  * fault.io.scsi.cmd.disk.dev.rqs.merr
    107  *     op_code, key, asc, ascq, lba
    108  */
    109 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    110     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    111     setpayloadprop("key", payloadprop("key")) &&
    112     setpayloadprop("asc", payloadprop("asc")) &&
    113     setpayloadprop("ascq", payloadprop("ascq"))};
    114 
    115 /*
    116  * setserdsuffix() is given the ereport's LBA, so that the SERD engine
    117  * only triggers if the error recurs on the same LBA.
    118  */
    119 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    120     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    121     setserdsuffix(payloadprop("lba")) &&
    122     setpayloadprop("key", payloadprop("key")) &&
    123     setpayloadprop("asc", payloadprop("asc")) &&
    124     setpayloadprop("ascq", payloadprop("ascq")) &&
    125     setpayloadprop("lba", payloadprop("lba"))};
    126 
    127 /*
    128  * NOTE: this propagation uses the "may" propagation of eversholt.
    129  * The ereport need never exist. It's just a way of making
    130  * the diagnosis wait for the 'within' time (60s) on that ereport
    131  * to complete. Once it has completed, the diagnosis continues
    132  * even though the dummy ereport didn't occur.
    133  */
    134 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
    135 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    136 	ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
    137 
    138 /*
    139  * The uderr fault will be propagated at some future time.
    140  * prop fault.io.scsi.cmd.disk.dev.uderr@P->
    141  *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
    142  */
    143 
    144 /*
    145  * disk-as-detector: propagations from upsets (taken only when
    146  * DRIVER_ASSESSMENT_NONFATAL holds).
    147  */
    148 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    149     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
    150 
    151 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    152     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
    153 
    154 /*
    155  * disk-as-detector: propagations from upsets (independent of
    156  * driver-assessment): these rules carry no constraint on the ereport.
    157  */
    158 
    159 prop upset.io.scsi.cmd.disk.dev.serr@P->
    160     ereport.io.scsi.cmd.disk.dev.serr@P;
    161 
    162 prop upset.io.scsi.cmd.disk.dev.uderr@P->
    163     ereport.io.scsi.cmd.disk.dev.uderr@P;
    164 
    165 prop upset.io.scsi.cmd.disk.recovered@P->
    166     ereport.io.scsi.cmd.disk.recovered@P;
    167 
    168 prop upset.io.scsi.cmd.disk.tran@P->
    169     ereport.io.scsi.cmd.disk.tran@P;
    170 
    171 /*
    172  * --------------------------------------
    173  * The remainder of this file contains rules associated with the operation of
    174  * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
    175  *
    176  * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
    177  * generated by the disk-transport fmd module, and the resulting faults.
    178  */
    179 
    180 /*
    181  * Fault events.  Each one names the disk as FRU and ASRU, with FITrate=10.
    182  */
    183 event fault.io.disk.over-temperature@P,
    184     FITrate=10, FRU=P, ASRU=P;
    185 event fault.io.disk.predictive-failure@P,
    186     FITrate=10, FRU=P, ASRU=P;
    187 event fault.io.disk.self-test-failure@P,
    188     FITrate=10, FRU=P, ASRU=P;
    189 
    190 /*
    191  * ereports; one is declared for each fault.io.disk.* event in this file.
    192  */
    193 event ereport.io.scsi.disk.over-temperature@P;
    194 event ereport.io.scsi.disk.predictive-failure@P;
    195 event ereport.io.scsi.disk.self-test-failure@P;
    196 
    197 /*
    198  * Propagations: each fault maps 1-to-1 to the like-named ereport.
    199  */
    200 prop fault.io.disk.over-temperature@P ->
    201     ereport.io.scsi.disk.over-temperature@P;
    202 
    203 prop fault.io.disk.self-test-failure@P ->
    204     ereport.io.scsi.disk.self-test-failure@P;
    205 
    206 prop fault.io.disk.predictive-failure@P ->
    207     ereport.io.scsi.disk.predictive-failure@P;
    208