/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma dictionary "DISK"

#define P disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute and then
 * if we still get 2 or more errors within 24 hours for the same LBA,
 * there is a fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 *
 * The merr fault is gated by the SERD engine declared above; the derr
 * fault has no engine attached.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 * NONFATAL is defined as the exact negation of FATAL, so every such ereport
 * satisfies exactly one of the two constraints.
 */
#define DRIVER_ASSESSMENT_FATAL \
    (payloadprop_contains("driver-assessment", "fatal"))
#define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are listed below:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *	op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *	op_code, key, asc, ascq, lba
 * NOTE: op_code is listed above but is not copied by the propagations
 * below; only key, asc, ascq (and lba for merr) are set.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Utilize setserdsuffix with specific LBA,
 * the serd engine would only trigger if the fault recurred on the same LBA
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    setserdsuffix(payloadprop("lba")) &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq")) &&
    setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment)
 */

prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 * All three faults carry the same FITrate and name the disk itself as both
 * FRU and ASRU.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P;