Home | History | Annotate | Download | only in common
      1   4198  eschrock /*
      2   4198  eschrock  * CDDL HEADER START
      3   4198  eschrock  *
      4   4198  eschrock  * The contents of this file are subject to the terms of the
      5   4198  eschrock  * Common Development and Distribution License (the "License").
      6   4198  eschrock  * You may not use this file except in compliance with the License.
      7   4198  eschrock  *
      8   4198  eschrock  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   4198  eschrock  * or http://www.opensolaris.org/os/licensing.
     10   4198  eschrock  * See the License for the specific language governing permissions
     11   4198  eschrock  * and limitations under the License.
     12   4198  eschrock  *
     13   4198  eschrock  * When distributing Covered Code, include this CDDL HEADER in each
     14   4198  eschrock  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   4198  eschrock  * If applicable, add the following below this CDDL HEADER, with the
     16   4198  eschrock  * fields enclosed by brackets "[]" replaced with your own identifying
     17   4198  eschrock  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   4198  eschrock  *
     19   4198  eschrock  * CDDL HEADER END
     20   4198  eschrock  */
     21   4198  eschrock /*
     22  11031     David  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23   4198  eschrock  * Use is subject to license terms.
     24   4198  eschrock  */
     25   4198  eschrock 
     26   4198  eschrock #pragma dictionary "DISK"
     27   4198  eschrock 
     28  11031     David #define	P			disk
     29   4198  eschrock 
     30   4198  eschrock fru P;
     31   4198  eschrock asru P;
     32   7570     David 
     33   7570     David /*
     34  11031     David  * Overall comments for this file:
     35  11031     David  * <disk-as-detector> The disk-as-detector DE provides the mapping between
     36   7570     David  * ereports generated by a kernel disk driver sd(7D) and resulting faults.
     37   7570     David  */
     38  11031     David 
     39  11031     David /*
     40  11031     David  * SERD engine for media error fault propagation:
     41  11031     David  *
     42  11031     David  * This strategy is designed to give a file system, like ZFS, the
     43  11031     David  * ability to attempt data recovery/relocation without faulting a disk.
     44  11031     David  * This implementation depends on a file system retry to the same LBA
     45  11031     David  * to trigger a fault when recovery/relocation is not possible.
     46  11031     David  *
     47  11031     David  * We let the engine propagate one error only once every minute, and if we
     48  11031     David  * still get 2 or more errors within 24 hours for the same LBA, there is a fault.
     49  11031     David  */
     50  11031     David engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
     51   7570     David 
     52   7570     David /*
     53   7570     David  * disk-as-detector: fault events (merr is tied to the SERD engine above).
     54   7570     David  */
     55   7570     David event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
     56  11031     David event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
     57  11031     David     engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
     58  11031     David 
     59   7570     David /*
     60   7570     David  * The uderr fault will be defined at some future time.
     61   7570     David  * event fault.io.scsi.cmd.disk.dev.uderr@P;
     62   7570     David  */
     63   7570     David 
     64   7570     David /*
     65   7570     David  * disk-as-detector: upset events.
     66   7570     David  * NOTE: For now we define upsets as the means of implementing discard.
     67   7570     David  */
     68   7570     David event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
     69   7570     David event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
     70   7570     David event upset.io.scsi.cmd.disk.dev.uderr@P;
     71   7570     David event upset.io.scsi.cmd.disk.dev.serr@P;
     72   7570     David event upset.io.scsi.cmd.disk.tran@P;
     73   7570     David event upset.io.scsi.cmd.disk.recovered@P;
     74   7570     David 
     75   7570     David /*
     76   7570     David  * disk-as-detector: ereports from the kernel.
     77   7570     David  *
     78   7570     David  * We don't know the topology for all SCSI disks, but the kernel will always
     79   7570     David  * generate ereport telemetry assuming that we do. We define these ereports
     80   7570     David  * with 'discard_if_config_unknown=1', which permits ereports against things
     81   7570     David  * with unknown topology to be silently discarded.  The ereport data is logged
     82   7570     David  * in either case, and can be viewed via 'fmdump -eV'.
     83   7570     David  */
     84   7570     David event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
     85   7570     David event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
     86   7570     David event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
     87   7570     David event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
     88   7570     David event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
     89   7570     David event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
     90   7570     David 
     91   7570     David /*
     92   7570     David  * For some ereports we let the 'driver-assessment', communicated as part of
     93   7570     David  * the ereport payload, determine fault vs. upset via propagation constraints.
     94   7570     David  */
     95   7570     David #define DRIVER_ASSESSMENT_FATAL		\
     96   7570     David 	    (payloadprop_contains("driver-assessment", "fatal"))
     97   7570     David #define DRIVER_ASSESSMENT_NONFATAL	(!DRIVER_ASSESSMENT_FATAL)
     98   7570     David 
     99   7570     David /*
    100   7570     David  * disk-as-detector: propagations from faults (based on
    101   7570     David  * DRIVER_ASSESSMENT_FATAL).
    102   7570     David  * We need to set additional fault payloads to indicate fault details.
    103   7570     David  * The payloads we may need are listed below (op_code is not currently set):
    104   7570     David  * fault.io.scsi.cmd.disk.dev.rqs.derr
    105   7570     David  *     op_code, key, asc, ascq
    106   7570     David  * fault.io.scsi.cmd.disk.dev.rqs.merr
    107   7570     David  *     op_code, key, asc, ascq, lba
    108   7570     David  */
    109   7570     David prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    110   7570     David     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    111   7570     David     setpayloadprop("key", payloadprop("key")) &&
    112   7570     David     setpayloadprop("asc", payloadprop("asc")) &&
    113   7570     David     setpayloadprop("ascq", payloadprop("ascq"))};
    114   7570     David 
    115  11031     David /*
    116  11031     David  * Use setserdsuffix with the specific LBA so that
    117  11031     David  * the SERD engine only triggers if the fault recurs on the same LBA.
    118  11031     David  */
    119   7570     David prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    120   7570     David     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    121  11031     David     setserdsuffix(payloadprop("lba")) &&
    122   7570     David     setpayloadprop("key", payloadprop("key")) &&
    123   7570     David     setpayloadprop("asc", payloadprop("asc")) &&
    124   7570     David     setpayloadprop("ascq", payloadprop("ascq")) &&
    125   7570     David     setpayloadprop("lba", payloadprop("lba"))};
    126  11031     David 
    127  11031     David /*
    128  11031     David  * NOTE: this propagation uses the "may" propagation of eversholt.
    129  11031     David  * The ereport need never exist. It's just a way of making
    130  11031     David  * the diagnosis wait for the 'within' time (60s) on that ereport
    131  11031     David  * to complete. Once it has completed, the diagnosis continues
    132  11031     David  * even though the dummy ereport didn't occur.
    133  11031     David  */
    134  11031     David event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
    135  11031     David prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    136  11031     David 	ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
    137   7570     David 
    138   7570     David /*
    139   7570     David  * The uderr fault will be propagated at some future time.
    140   7570     David  * prop fault.io.scsi.cmd.disk.dev.uderr@P->
    141   7570     David  *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
    142   7570     David  */
    143   7570     David 
    144   7570     David /*
    145   7570     David  * disk-as-detector: propagations from upsets (based on
    146   7570     David  * DRIVER_ASSESSMENT_NONFATAL).
    147   7570     David  */
    148   7570     David prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    149   7570     David     ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
    150   7570     David 
    151   7570     David prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    152   7570     David     ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
    153   7570     David 
    154   7570     David /*
    155   7570     David  * disk-as-detector: propagations from upsets (independent of
    156   7570     David  * driver-assessment).
    157   7570     David  */
    158   7570     David 
    159   7570     David prop upset.io.scsi.cmd.disk.dev.serr@P->
    160   7570     David     ereport.io.scsi.cmd.disk.dev.serr@P;
    161   7570     David 
    162   7570     David prop upset.io.scsi.cmd.disk.dev.uderr@P->
    163   7570     David     ereport.io.scsi.cmd.disk.dev.uderr@P;
    164   7570     David 
    165   7570     David prop upset.io.scsi.cmd.disk.recovered@P->
    166   7570     David     ereport.io.scsi.cmd.disk.recovered@P;
    167   7570     David 
    168   7570     David prop upset.io.scsi.cmd.disk.tran@P->
    169   7570     David     ereport.io.scsi.cmd.disk.tran@P;
    170   7570     David 
    171   7570     David /*
    172   7570     David  * --------------------------------------
    173   7570     David  * The remainder of this file contains rules associated with the operation of
    174   7570     David  * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
    175   7570     David  *
    176   7570     David  * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
    177   7570     David  * generated by the disk-transport fmd module, and the resulting faults.
    178   7570     David  */
    179   4198  eschrock 
    180   4198  eschrock /*
    181   4198  eschrock  * Fault events: each names the disk as both FRU and ASRU, with FITrate=10.
    182   4198  eschrock  */
    183   4198  eschrock event fault.io.disk.over-temperature@P,
    184   4198  eschrock     FITrate=10, FRU=P, ASRU=P;
    185   4198  eschrock event fault.io.disk.predictive-failure@P,
    186   4198  eschrock     FITrate=10, FRU=P, ASRU=P;
    187   4198  eschrock event fault.io.disk.self-test-failure@P,
    188   4198  eschrock     FITrate=10, FRU=P, ASRU=P;
    189   4198  eschrock 
    190   4198  eschrock /*
    191   4198  eschrock  * ereports (one per fault event declared above).
    192   4198  eschrock  */
    193   4198  eschrock event ereport.io.scsi.disk.over-temperature@P;
    194   4198  eschrock event ereport.io.scsi.disk.predictive-failure@P;
    195   4198  eschrock event ereport.io.scsi.disk.self-test-failure@P;
    196   4198  eschrock 
    197   4198  eschrock /*
    198   4198  eschrock  * Propagations: each fault maps 1-to-1 onto the like-named ereport.
    199   4198  eschrock  */
    200   4198  eschrock prop fault.io.disk.over-temperature@P ->
    201   4198  eschrock     ereport.io.scsi.disk.over-temperature@P;
    202   4198  eschrock 
    203   4198  eschrock prop fault.io.disk.self-test-failure@P ->
    204   4198  eschrock     ereport.io.scsi.disk.self-test-failure@P;
    205   4198  eschrock 
    206   4198  eschrock prop fault.io.disk.predictive-failure@P ->
    207   4198  eschrock     ereport.io.scsi.disk.predictive-failure@P;
    208