1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1493 gavinm * Common Development and Distribution License (the "License"). 6 1493 gavinm * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 1193 mws 22 0 stevel /* 23 8740 Sean * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 0 stevel * Use is subject to license terms. 25 0 stevel */ 26 0 stevel 27 0 stevel #include <cma.h> 28 0 stevel 29 7532 Sean #include <unistd.h> 30 7532 Sean #include <fcntl.h> 31 0 stevel #include <strings.h> 32 0 stevel #include <errno.h> 33 0 stevel #include <time.h> 34 0 stevel #include <fm/fmd_api.h> 35 0 stevel #include <sys/fm/protocol.h> 36 5084 johnlev #include <sys/systeminfo.h> 37 5084 johnlev #include <sys/utsname.h> 38 0 stevel 39 6111 cy152378 #ifdef sun4v 40 6111 cy152378 #include <sys/fm/ldom.h> 41 6111 cy152378 42 6111 cy152378 static fmd_hdl_t *init_hdl; 43 6111 cy152378 ldom_hdl_t *cma_lhp; 44 7532 Sean #endif 45 7532 Sean 46 7532 Sean #ifdef i386 47 7532 Sean boolean_t cma_is_native; 48 6111 cy152378 #endif 49 6111 cy152378 50 6111 cy152378 extern const char *fmd_fmri_get_platform(); 51 6111 cy152378 52 0 stevel cma_t cma; 53 0 stevel 54 0 stevel cma_stats_t cma_stats = { 55 0 stevel { "cpu_flts", FMD_TYPE_UINT64, "cpu faults resolved" }, 56 6111 cy152378 { "cpu_repairs", FMD_TYPE_UINT64, "cpu faults repaired" }, 57 0 stevel { "cpu_fails", FMD_TYPE_UINT64, "cpu faults unresolveable" }, 58 0 stevel { "cpu_blfails", FMD_TYPE_UINT64, "failed cpu blacklists" }, 59 0 stevel { "cpu_supp", FMD_TYPE_UINT64, "cpu offlines suppressed" }, 60 0 stevel { "cpu_blsupp", FMD_TYPE_UINT64, "cpu blacklists suppressed" }, 61 0 stevel { "page_flts", FMD_TYPE_UINT64, "page faults resolved" }, 62 6111 cy152378 { "page_repairs", FMD_TYPE_UINT64, "page faults repaired" }, 63 0 stevel { "page_fails", FMD_TYPE_UINT64, "page faults unresolveable" }, 64 0 stevel { "page_supp", FMD_TYPE_UINT64, "page retires suppressed" }, 65 0 stevel { "page_nonent", FMD_TYPE_UINT64, "retires for non-existent fmris" }, 66 0 stevel { "bad_flts", FMD_TYPE_UINT64, "invalid fault events received" }, 67 0 stevel { "nop_flts", FMD_TYPE_UINT64, "inapplicable fault events received" }, 68 0 stevel { "auto_flts", FMD_TYPE_UINT64, "auto-close faults received" } 69 0 stevel }; 70 0 stevel 71 0 stevel typedef struct cma_subscriber { 72 0 stevel const char *subr_class; 73 0 stevel const char *subr_sname; 74 0 stevel uint_t subr_svers; 75 6111 cy152378 int (*subr_func)(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *, 76 6111 cy152378 boolean_t); 77 0 stevel } cma_subscriber_t; 78 0 stevel 79 0 stevel static const cma_subscriber_t cma_subrs[] = { 80 7532 Sean #if defined(i386) 81 7532 Sean /* 82 7532 Sean * On x86, the ASRUs are expected to be in hc scheme. When 83 7532 Sean * cpumem-retire wants to retire a cpu or mem page, it calls the 84 7532 Sean * methods registered in the topo node to do that. The topo 85 7532 Sean * enumerator, which necessarily knows all the config info that 86 7532 Sean * we'd ever need in deciding what/how to retire etc. This takes 87 7532 Sean * away much of that complexity from the agent into the entity 88 7532 Sean * that knows all config/topo information. 89 7532 Sean */ 90 7532 Sean { "fault.memory.page", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 91 7532 Sean cma_page_retire }, 92 7532 Sean { "fault.memory.page_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 93 7532 Sean cma_page_retire }, 94 7532 Sean { "fault.memory.page_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 95 7532 Sean cma_page_retire }, 96 7532 Sean { "fault.memory.page_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 97 7532 Sean cma_page_retire }, 98 7532 Sean { "fault.memory.generic-x86.page_ce", FM_FMRI_SCHEME_HC, 99 7532 Sean FM_HC_SCHEME_VERSION, cma_page_retire }, 100 7532 Sean { "fault.memory.generic-x86.page_ue", FM_FMRI_SCHEME_HC, 101 7532 Sean FM_HC_SCHEME_VERSION, cma_page_retire }, 102 7532 Sean { "fault.memory.intel.page_ce", FM_FMRI_SCHEME_HC, 103 7532 Sean FM_HC_SCHEME_VERSION, cma_page_retire }, 104 7532 Sean { "fault.memory.intel.page_ue", FM_FMRI_SCHEME_HC, 105 7532 Sean FM_HC_SCHEME_VERSION, cma_page_retire }, 106 7532 Sean { "fault.memory.dimm", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 107 7532 Sean NULL }, 108 7532 Sean { "fault.memory.dimm_sb", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 109 7532 Sean NULL }, 110 7532 Sean { "fault.memory.dimm_ck", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 111 7532 Sean NULL }, 112 7532 Sean { "fault.memory.dimm_ue", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 113 7532 Sean NULL }, 114 7532 Sean { "fault.memory.generic-x86.dimm_ce", FM_FMRI_SCHEME_HC, 115 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 116 7532 Sean { "fault.memory.generic-x86.dimm_ue", FM_FMRI_SCHEME_HC, 117 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 118 7532 Sean { "fault.memory.intel.dimm_ce", FM_FMRI_SCHEME_HC, 119 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 120 7532 Sean { "fault.memory.intel.dimm_ue", FM_FMRI_SCHEME_HC, 121 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 122 7532 Sean { "fault.memory.intel.fbd.*", FM_FMRI_SCHEME_HC, 123 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 124 7532 Sean { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_HC, 125 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 126 7532 Sean { "fault.memory.bank", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 127 7532 Sean NULL }, 128 7532 Sean { "fault.memory.datapath", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 129 7532 Sean NULL }, 130 10831 Yanmin { "fault.cpu.intel.quickpath.mem_scrubbing", FM_FMRI_SCHEME_HC, 131 10831 Yanmin FM_HC_SCHEME_VERSION, cma_page_retire }, 132 7532 Sean { "fault.cpu.intel.quickpath.*", FM_FMRI_SCHEME_HC, 133 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 134 7532 Sean { "fault.cpu.generic-x86.mc", FM_FMRI_SCHEME_HC, 135 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 136 7532 Sean { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, 137 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 138 7532 Sean { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, 139 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 140 7532 Sean 141 7532 Sean /* 142 7532 Sean * The ASRU for cpu faults are in cpu scheme on native and in hc 143 7532 Sean * scheme on xpv. So each cpu fault class needs to be listed twice. 144 7532 Sean */ 145 7532 Sean 146 7532 Sean /* 147 7532 Sean * The following faults do NOT retire a cpu thread, 148 7532 Sean * and therefore must be intercepted before 149 8221 Sean * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 150 7532 Sean */ 151 7532 Sean { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 152 7532 Sean NULL }, 153 7532 Sean { "fault.cpu.amd.dramchannel", FM_FMRI_SCHEME_CPU, 154 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 155 7532 Sean { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 156 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 157 7532 Sean { "fault.cpu.generic-x86.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 158 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 159 7532 Sean { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_HC, 160 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 161 7532 Sean { "fault.cpu.generic-x86.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 162 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 163 7532 Sean { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_HC, 164 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 165 7532 Sean { "fault.cpu.generic-x86.bus_interconnect", FM_FMRI_SCHEME_CPU, 166 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 167 7532 Sean { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_HC, 168 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 169 7532 Sean { "fault.cpu.intel.bus_interconnect_memory", FM_FMRI_SCHEME_CPU, 170 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 171 7532 Sean { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_HC, 172 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 173 7532 Sean { "fault.cpu.intel.bus_interconnect_io", FM_FMRI_SCHEME_CPU, 174 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 175 7532 Sean { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_HC, 176 7532 Sean FM_HC_SCHEME_VERSION, NULL }, 177 7532 Sean { "fault.cpu.intel.bus_interconnect", FM_FMRI_SCHEME_CPU, 178 7532 Sean FM_CPU_SCHEME_VERSION, NULL }, 179 7532 Sean { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 180 7532 Sean NULL }, 181 7532 Sean { "fault.cpu.intel.nb.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 182 7532 Sean NULL }, 183 7532 Sean { "fault.cpu.intel.dma", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 184 7532 Sean NULL }, 185 7532 Sean { "fault.cpu.intel.dma", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 186 7532 Sean NULL }, 187 7532 Sean { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 188 7532 Sean cma_cpu_hc_retire }, 189 7532 Sean { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 190 7532 Sean cma_cpu_hc_retire }, 191 7532 Sean #elif defined(sun4v) 192 8346 Scott /* 193 8346 Scott * The following are PI sun4v faults 194 8346 Scott */ 195 8346 Scott { "fault.memory.memlink", FM_FMRI_SCHEME_HC, 196 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 197 8346 Scott { "fault.memory.memlink-uc", FM_FMRI_SCHEME_HC, 198 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 199 8346 Scott { "fault.memory.memlink-failover", FM_FMRI_SCHEME_HC, 200 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 201 8346 Scott { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_HC, 202 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 203 8346 Scott { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_HC, 204 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 205 8346 Scott { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_HC, 206 8346 Scott FM_HC_SCHEME_VERSION, NULL }, 207 6111 cy152378 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 208 6111 cy152378 cma_page_retire }, 209 6111 cy152378 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 210 6111 cy152378 NULL }, 211 6111 cy152378 { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 212 6111 cy152378 NULL }, 213 6111 cy152378 { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 214 6111 cy152378 NULL }, 215 6111 cy152378 { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 216 6111 cy152378 NULL }, 217 8297 Tom { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 218 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 219 8297 Tom { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 220 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 221 8297 Tom { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 222 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 223 6111 cy152378 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 224 6111 cy152378 NULL }, 225 6111 cy152378 { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 226 6111 cy152378 NULL }, 227 6111 cy152378 { "fault.memory.link-c", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 228 6111 cy152378 NULL }, 229 6111 cy152378 { "fault.memory.link-u", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 230 6111 cy152378 NULL }, 231 6111 cy152378 { "fault.memory.link-f", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 232 6111 cy152378 NULL }, 233 7374 Jakub { "fault.memory.link-c", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 234 7374 Jakub NULL }, 235 7374 Jakub { "fault.memory.link-u", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 236 7374 Jakub NULL }, 237 7374 Jakub { "fault.memory.link-f", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 238 7374 Jakub NULL }, 239 6111 cy152378 240 6111 cy152378 /* 241 6111 cy152378 * The following ultraSPARC-T1/T2 faults do NOT retire a cpu thread, 242 6111 cy152378 * and therefore must be intercepted before 243 8221 Sean * the default "fault.cpu.*" dispatch to cma_cpu_hc_retire. 244 6111 cy152378 */ 245 6111 cy152378 { "fault.cpu.*.l2cachedata", FM_FMRI_SCHEME_CPU, 246 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 247 6111 cy152378 { "fault.cpu.*.l2cachetag", FM_FMRI_SCHEME_CPU, 248 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 249 6111 cy152378 { "fault.cpu.*.l2cachectl", FM_FMRI_SCHEME_CPU, 250 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 251 6111 cy152378 { "fault.cpu.*.l2data-c", FM_FMRI_SCHEME_CPU, 252 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 253 6111 cy152378 { "fault.cpu.*.l2data-u", FM_FMRI_SCHEME_CPU, 254 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 255 6111 cy152378 { "fault.cpu.*.mau", FM_FMRI_SCHEME_CPU, 256 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 257 6111 cy152378 { "fault.cpu.*.lfu-u", FM_FMRI_SCHEME_CPU, 258 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 259 6111 cy152378 { "fault.cpu.*.lfu-f", FM_FMRI_SCHEME_CPU, 260 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 261 6111 cy152378 { "fault.cpu.*.lfu-p", FM_FMRI_SCHEME_CPU, 262 6111 cy152378 FM_CPU_SCHEME_VERSION, NULL }, 263 7374 Jakub { "fault.cpu.ultraSPARC-T1.freg", FM_FMRI_SCHEME_CPU, 264 7374 Jakub FM_CPU_SCHEME_VERSION, NULL }, 265 7374 Jakub { "fault.cpu.ultraSPARC-T1.l2cachedata", FM_FMRI_SCHEME_CPU, 266 7374 Jakub FM_CPU_SCHEME_VERSION, NULL }, 267 7374 Jakub { "fault.cpu.ultraSPARC-T1.l2cachetag", FM_FMRI_SCHEME_CPU, 268 7374 Jakub FM_CPU_SCHEME_VERSION, NULL }, 269 7374 Jakub { "fault.cpu.ultraSPARC-T1.l2cachectl", FM_FMRI_SCHEME_CPU, 270 7374 Jakub FM_CPU_SCHEME_VERSION, NULL }, 271 7374 Jakub { "fault.cpu.ultraSPARC-T1.mau", FM_FMRI_SCHEME_CPU, 272 7374 Jakub FM_CPU_SCHEME_VERSION, NULL }, 273 7374 Jakub { "fault.cpu.ultraSPARC-T2plus.chip", FM_FMRI_SCHEME_HC, 274 7374 Jakub FM_HC_SCHEME_VERSION, NULL }, 275 8221 Sean { "fault.cpu.*", FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, 276 8221 Sean cma_cpu_hc_retire }, 277 6111 cy152378 { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 278 8221 Sean cma_cpu_hc_retire }, 279 6111 cy152378 #elif defined(opl) 280 6111 cy152378 { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 281 6111 cy152378 cma_page_retire }, 282 6111 cy152378 { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 283 6111 cy152378 NULL }, 284 8297 Tom { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 285 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 286 8297 Tom { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 287 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 288 8297 Tom { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 289 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 290 6111 cy152378 { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 291 6111 cy152378 NULL }, 292 6111 cy152378 { "fault.cpu.SPARC64-VI.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 293 8221 Sean cma_cpu_cpu_retire }, 294 6111 cy152378 { "fault.cpu.SPARC64-VII.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 295 8221 Sean cma_cpu_cpu_retire }, 296 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se", 297 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 298 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.se-offlinereq", 299 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 300 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce", 301 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 302 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VI.core.ce-offlinereq", 303 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 304 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se", 305 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 306 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.se-offlinereq", 307 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 308 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce", 309 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 310 6111 cy152378 { "fault.chassis.SPARC-Enterprise.cpu.SPARC64-VII.core.ce-offlinereq", 311 6111 cy152378 FM_FMRI_SCHEME_HC, FM_HC_SCHEME_VERSION, cma_cpu_hc_retire }, 312 7532 Sean #else 313 7532 Sean /* 314 7532 Sean * For platforms excluding i386, sun4v and opl. 315 7532 Sean */ 316 0 stevel { "fault.memory.page", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 317 0 stevel cma_page_retire }, 318 2869 gavinm { "fault.memory.page_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 319 2869 gavinm cma_page_retire }, 320 2869 gavinm { "fault.memory.page_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 321 2869 gavinm cma_page_retire }, 322 2869 gavinm { "fault.memory.page_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 323 2869 gavinm cma_page_retire }, 324 0 stevel { "fault.memory.dimm", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 325 1414 cindi NULL }, 326 1493 gavinm { "fault.memory.dimm_sb", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 327 1414 cindi NULL }, 328 1493 gavinm { "fault.memory.dimm_ck", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 329 1414 cindi NULL }, 330 1493 gavinm { "fault.memory.dimm_ue", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 331 0 stevel NULL }, 332 8297 Tom { "fault.memory.dimm-page-retires-excessive", FM_FMRI_SCHEME_MEM, 333 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 334 8297 Tom { "fault.memory.dimm-ue-imminent", FM_FMRI_SCHEME_MEM, 335 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 336 8297 Tom { "fault.memory.dram-ue-imminent", FM_FMRI_SCHEME_MEM, 337 8297 Tom FM_MEM_SCHEME_VERSION, NULL }, 338 2869 gavinm { "fault.memory.dimm_testfail", FM_FMRI_SCHEME_MEM, 339 2869 gavinm FM_MEM_SCHEME_VERSION, NULL }, 340 0 stevel { "fault.memory.bank", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 341 0 stevel NULL }, 342 0 stevel { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, 343 0 stevel NULL }, 344 962 tsien 345 962 tsien /* 346 2869 gavinm * The following faults do NOT retire a cpu thread, 347 962 tsien * and therefore must be intercepted before 348 8221 Sean * the default "fault.cpu.*" dispatch to cma_cpu_cpu_retire. 349 962 tsien */ 350 6330 jc25722 { "fault.cpu.ultraSPARC-IVplus.l2cachedata-line", 351 6330 jc25722 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 352 6330 jc25722 cma_cache_way_retire }, 353 6330 jc25722 { "fault.cpu.ultraSPARC-IVplus.l3cachedata-line", 354 6330 jc25722 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 355 6330 jc25722 cma_cache_way_retire }, 356 6330 jc25722 { "fault.cpu.ultraSPARC-IVplus.l2cachetag-line", 357 6330 jc25722 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 358 6330 jc25722 cma_cache_way_retire }, 359 6330 jc25722 { "fault.cpu.ultraSPARC-IVplus.l3cachetag-line", 360 6330 jc25722 FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 361 6330 jc25722 cma_cache_way_retire }, 362 7374 Jakub 363 5254 gavinm /* 364 7532 Sean * Default "fault.cpu.*" for "cpu" scheme ASRU dispatch. 365 5254 gavinm */ 366 962 tsien { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, 367 8221 Sean cma_cpu_cpu_retire }, 368 6111 cy152378 #endif 369 0 stevel { NULL, NULL, 0, NULL } 370 0 stevel }; 371 0 stevel 372 0 stevel static const cma_subscriber_t * 373 0 stevel nvl2subr(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t **asrup) 374 0 stevel { 375 0 stevel const cma_subscriber_t *sp; 376 0 stevel nvlist_t *asru; 377 0 stevel char *scheme; 378 0 stevel uint8_t version; 379 7197 stephh boolean_t retire; 380 7197 stephh 381 7197 stephh if (nvlist_lookup_boolean_value(nvl, FM_SUSPECT_RETIRE, &retire) == 0 && 382 7197 stephh retire == 0) { 383 7197 stephh fmd_hdl_debug(hdl, "cma_recv: retire suppressed"); 384 7197 stephh return (NULL); 385 7197 stephh } 386 0 stevel 387 0 stevel if (nvlist_lookup_nvlist(nvl, FM_FAULT_ASRU, &asru) != 0 || 388 0 stevel nvlist_lookup_string(asru, FM_FMRI_SCHEME, &scheme) != 0 || 389 0 stevel nvlist_lookup_uint8(asru, FM_VERSION, &version) != 0) { 390 0 stevel cma_stats.bad_flts.fmds_value.ui64++; 391 0 stevel return (NULL); 392 0 stevel } 393 0 stevel 394 0 stevel for (sp = cma_subrs; sp->subr_class != NULL; sp++) { 395 0 stevel if (fmd_nvl_class_match(hdl, nvl, sp->subr_class) && 396 0 stevel strcmp(scheme, sp->subr_sname) == 0 && 397 0 stevel version <= sp->subr_svers) { 398 0 stevel *asrup = asru; 399 0 stevel return (sp); 400 0 stevel } 401 0 stevel } 402 0 stevel 403 0 stevel cma_stats.nop_flts.fmds_value.ui64++; 404 0 stevel return (NULL); 405 0 stevel } 406 0 stevel 407 0 stevel static void 408 7275 stephh cma_recv_list(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) 409 0 stevel { 410 0 stevel char *uuid = NULL; 411 10656 Stephen nvlist_t **nva, **save_nva; 412 10656 Stephen uint_t nvc = 0, save_nvc; 413 1772 jl139090 uint_t keepopen; 414 0 stevel int err = 0; 415 7470 Scott nvlist_t *asru = NULL; 416 7256 jc25722 uint32_t index; 417 0 stevel 418 0 stevel err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 419 0 stevel err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 420 0 stevel &nva, &nvc); 421 0 stevel if (err != 0) { 422 0 stevel cma_stats.bad_flts.fmds_value.ui64++; 423 0 stevel return; 424 0 stevel } 425 0 stevel 426 10656 Stephen save_nvc = keepopen = nvc; 427 10656 Stephen save_nva = nva; 428 7275 stephh while (nvc-- != 0 && (strcmp(class, FM_LIST_SUSPECT_CLASS) != 0 || 429 7275 stephh !fmd_case_uuclosed(hdl, uuid))) { 430 0 stevel nvlist_t *nvl = *nva++; 431 0 stevel const cma_subscriber_t *subr; 432 7275 stephh int has_fault; 433 0 stevel 434 0 stevel if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 435 0 stevel continue; 436 0 stevel 437 1772 jl139090 /* 438 1772 jl139090 * A handler returns CMA_RA_SUCCESS to indicate that 439 1772 jl139090 * from this suspects point-of-view the case may be 440 1772 jl139090 * closed, CMA_RA_FAILURE otherwise. 441 1772 jl139090 * A handler must not close the case itself. 442 1772 jl139090 */ 443 1772 jl139090 if (subr->subr_func != NULL) { 444 7275 stephh has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 445 7275 stephh FMD_HAS_FAULT_ASRU, NULL); 446 7275 stephh if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0) { 447 7275 stephh if (has_fault == 1) 448 7275 stephh err = subr->subr_func(hdl, nvl, asru, 449 7275 stephh uuid, 0); 450 7275 stephh } else { 451 7275 stephh if (has_fault == 0) 452 7275 stephh err = subr->subr_func(hdl, nvl, asru, 453 7275 stephh uuid, 1); 454 7275 stephh } 455 1772 jl139090 if (err == CMA_RA_SUCCESS) 456 1772 jl139090 keepopen--; 457 10656 Stephen } 458 10656 Stephen } 459 10656 Stephen 460 10656 Stephen /* 461 10656 Stephen * Run though again to catch any new faults in list.updated. 462 10656 Stephen */ 463 10656 Stephen while (save_nvc-- != 0 && (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)) { 464 10656 Stephen nvlist_t *nvl = *save_nva++; 465 10656 Stephen const cma_subscriber_t *subr; 466 10656 Stephen int has_fault; 467 10656 Stephen 468 10656 Stephen if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 469 10656 Stephen continue; 470 10656 Stephen if (subr->subr_func != NULL) { 471 10656 Stephen has_fault = fmd_nvl_fmri_has_fault(hdl, asru, 472 10656 Stephen FMD_HAS_FAULT_ASRU, NULL); 473 10656 Stephen if (has_fault == 1) 474 10656 Stephen err = subr->subr_func(hdl, nvl, asru, uuid, 0); 475 1772 jl139090 } 476 0 stevel } 477 7532 Sean 478 7256 jc25722 /* 479 7256 jc25722 * Do not close the case if we are handling cache faults. 480 7256 jc25722 */ 481 7470 Scott if (asru != NULL) { 482 7470 Scott if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_CACHE_INDEX, 483 7470 Scott &index) != 0) { 484 7470 Scott if (!keepopen && strcmp(class, 485 7470 Scott FM_LIST_SUSPECT_CLASS) == 0) { 486 7470 Scott fmd_case_uuclose(hdl, uuid); 487 7470 Scott } 488 7256 jc25722 } 489 7256 jc25722 } 490 7532 Sean 491 7275 stephh if (!keepopen && strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 492 7275 stephh fmd_case_uuresolved(hdl, uuid); 493 0 stevel } 494 0 stevel 495 0 stevel static void 496 0 stevel cma_recv_one(fmd_hdl_t *hdl, nvlist_t *nvl) 497 0 stevel { 498 0 stevel const cma_subscriber_t *subr; 499 0 stevel nvlist_t *asru; 500 0 stevel 501 0 stevel if ((subr = nvl2subr(hdl, nvl, &asru)) == NULL) 502 0 stevel return; 503 0 stevel 504 7275 stephh if (subr->subr_func != NULL) { 505 7275 stephh if (fmd_nvl_fmri_has_fault(hdl, asru, 506 7275 stephh FMD_HAS_FAULT_ASRU, NULL) == 1) 507 7275 stephh (void) subr->subr_func(hdl, nvl, asru, NULL, 0); 508 7275 stephh } 509 0 stevel } 510 0 stevel 511 0 stevel /*ARGSUSED*/ 512 0 stevel static void 513 0 stevel cma_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 514 0 stevel { 515 0 stevel fmd_hdl_debug(hdl, "received %s\n", class); 516 0 stevel 517 9120 Stephen if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 518 9120 Stephen return; 519 9120 Stephen 520 6111 cy152378 if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || 521 7275 stephh strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 || 522 7275 stephh strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 523 7275 stephh cma_recv_list(hdl, nvl, class); 524 0 stevel else 525 0 stevel cma_recv_one(hdl, nvl); 526 0 stevel } 527 0 stevel 528 0 stevel /*ARGSUSED*/ 529 0 stevel static void 530 0 stevel cma_timeout(fmd_hdl_t *hdl, id_t id, void *arg) 531 0 stevel { 532 0 stevel if (id == cma.cma_page_timerid) 533 0 stevel cma_page_retry(hdl); 534 6111 cy152378 #ifdef sun4v 535 6111 cy152378 /* 536 6111 cy152378 * cpu offline/online needs to be retried on sun4v because 537 6111 cy152378 * ldom request can be asynchronous. 538 6111 cy152378 */ 539 6111 cy152378 else if (id == cma.cma_cpu_timerid) 540 6111 cy152378 cma_cpu_retry(hdl); 541 6111 cy152378 #endif 542 0 stevel } 543 6111 cy152378 544 6111 cy152378 #ifdef sun4v 545 6111 cy152378 static void * 546 6111 cy152378 cma_init_alloc(size_t size) 547 6111 cy152378 { 548 6111 cy152378 return (fmd_hdl_alloc(init_hdl, size, FMD_SLEEP)); 549 6111 cy152378 } 550 6111 cy152378 551 6111 cy152378 static void 552 6111 cy152378 cma_init_free(void *addr, size_t size) 553 6111 cy152378 { 554 6111 cy152378 fmd_hdl_free(init_hdl, addr, size); 555 6111 cy152378 } 556 6111 cy152378 #endif 557 0 stevel 558 0 stevel static const fmd_hdl_ops_t fmd_ops = { 559 0 stevel cma_recv, /* fmdo_recv */ 560 0 stevel cma_timeout, /* fmdo_timeout */ 561 0 stevel NULL, /* fmdo_close */ 562 0 stevel NULL, /* fmdo_stats */ 563 0 stevel NULL, /* fmdo_gc */ 564 0 stevel }; 565 0 stevel 566 0 stevel static const fmd_prop_t fmd_props[] = { 567 0 stevel { "cpu_tries", FMD_TYPE_UINT32, "10" }, 568 0 stevel { "cpu_delay", FMD_TYPE_TIME, "1sec" }, 569 6111 cy152378 #ifdef sun4v 570 6111 cy152378 { "cpu_ret_mindelay", FMD_TYPE_TIME, "5sec" }, 571 6111 cy152378 { "cpu_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 572 6111 cy152378 #endif /* sun4v */ 573 0 stevel { "cpu_offline_enable", FMD_TYPE_BOOL, "true" }, 574 6111 cy152378 { "cpu_online_enable", FMD_TYPE_BOOL, "true" }, 575 0 stevel { "cpu_forced_offline", FMD_TYPE_BOOL, "true" }, 576 6111 cy152378 #ifdef opl 577 6111 cy152378 { "cpu_blacklist_enable", FMD_TYPE_BOOL, "false" }, 578 6111 cy152378 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "false" }, 579 6111 cy152378 #else 580 0 stevel { "cpu_blacklist_enable", FMD_TYPE_BOOL, "true" }, 581 6111 cy152378 { "cpu_unblacklist_enable", FMD_TYPE_BOOL, "true" }, 582 6111 cy152378 #endif /* opl */ 583 0 stevel { "page_ret_mindelay", FMD_TYPE_TIME, "1sec" }, 584 0 stevel { "page_ret_maxdelay", FMD_TYPE_TIME, "5min" }, 585 0 stevel { "page_retire_enable", FMD_TYPE_BOOL, "true" }, 586 6111 cy152378 { "page_unretire_enable", FMD_TYPE_BOOL, "true" }, 587 0 stevel { NULL, 0, NULL } 588 0 stevel }; 589 0 stevel 590 0 stevel static const fmd_hdl_info_t fmd_info = { 591 0 stevel "CPU/Memory Retire Agent", CMA_VERSION, &fmd_ops, fmd_props 592 0 stevel }; 593 0 stevel 594 0 stevel void 595 0 stevel _fmd_init(fmd_hdl_t *hdl) 596 0 stevel { 597 0 stevel hrtime_t nsec; 598 6111 cy152378 #ifdef i386 599 7532 Sean char buf[BUFSIZ]; 600 7532 Sean const char *dom0 = "control_d"; 601 7532 Sean 602 5084 johnlev /* 603 7532 Sean * Abort the cpumem-retire module if Solaris is running under DomU. 604 5084 johnlev */ 605 7532 Sean if (sysinfo(SI_PLATFORM, buf, sizeof (buf)) == -1) 606 5084 johnlev return; 607 7532 Sean 608 7532 Sean if (strncmp(buf, "i86pc", sizeof (buf)) == 0) { 609 7532 Sean cma_is_native = B_TRUE; 610 7532 Sean } else if (strncmp(buf, "i86xpv", sizeof (buf)) != 0) { 611 7532 Sean return; 612 7532 Sean } else { 613 7532 Sean int fd = open("/dev/xen/domcaps", O_RDONLY); 614 7532 Sean 615 7532 Sean if (fd != -1) { 616 7532 Sean if (read(fd, buf, sizeof (buf)) <= 0 || 617 7532 Sean strncmp(buf, dom0, strlen(dom0)) != 0) { 618 7532 Sean (void) close(fd); 619 7532 Sean return; 620 7532 Sean } 621 7532 Sean (void) close(fd); 622 7532 Sean } 623 7532 Sean cma_is_native = B_FALSE; 624 7532 Sean } 625 7532 Sean #endif /* i386 */ 626 0 stevel 627 0 stevel if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 628 0 stevel return; /* invalid data in configuration file */ 629 0 stevel 630 0 stevel fmd_hdl_subscribe(hdl, "fault.cpu.*"); 631 0 stevel fmd_hdl_subscribe(hdl, "fault.memory.*"); 632 6111 cy152378 #ifdef opl 633 6111 cy152378 fmd_hdl_subscribe(hdl, "fault.chassis.SPARC-Enterprise.cpu.*"); 634 6111 cy152378 #endif 635 0 stevel 636 0 stevel (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (cma_stats) / 637 0 stevel sizeof (fmd_stat_t), (fmd_stat_t *)&cma_stats); 638 0 stevel 639 0 stevel cma.cma_cpu_tries = fmd_prop_get_int32(hdl, "cpu_tries"); 640 0 stevel 641 0 stevel nsec = fmd_prop_get_int64(hdl, "cpu_delay"); 642 0 stevel cma.cma_cpu_delay.tv_sec = nsec / NANOSEC; 643 0 stevel cma.cma_cpu_delay.tv_nsec = nsec % NANOSEC; 644 0 stevel 645 0 stevel cma.cma_page_mindelay = fmd_prop_get_int64(hdl, "page_ret_mindelay"); 646 0 stevel cma.cma_page_maxdelay = fmd_prop_get_int64(hdl, "page_ret_maxdelay"); 647 0 stevel 648 6111 cy152378 #ifdef sun4v 649 6111 cy152378 cma.cma_cpu_mindelay = fmd_prop_get_int64(hdl, "cpu_ret_mindelay"); 650 6111 cy152378 cma.cma_cpu_maxdelay = fmd_prop_get_int64(hdl, "cpu_ret_maxdelay"); 651 6111 cy152378 #endif 652 6111 cy152378 653 0 stevel cma.cma_cpu_dooffline = fmd_prop_get_int32(hdl, "cpu_offline_enable"); 654 0 stevel cma.cma_cpu_forcedoffline = fmd_prop_get_int32(hdl, 655 0 stevel "cpu_forced_offline"); 656 6111 cy152378 cma.cma_cpu_doonline = fmd_prop_get_int32(hdl, "cpu_online_enable"); 657 0 stevel cma.cma_cpu_doblacklist = fmd_prop_get_int32(hdl, 658 0 stevel "cpu_blacklist_enable"); 659 6111 cy152378 cma.cma_cpu_dounblacklist = fmd_prop_get_int32(hdl, 660 6111 cy152378 "cpu_unblacklist_enable"); 661 0 stevel cma.cma_page_doretire = fmd_prop_get_int32(hdl, "page_retire_enable"); 662 6111 cy152378 cma.cma_page_dounretire = fmd_prop_get_int32(hdl, 663 6111 cy152378 "page_unretire_enable"); 664 0 stevel 665 0 stevel if (cma.cma_page_maxdelay < cma.cma_page_mindelay) 666 0 stevel fmd_hdl_abort(hdl, "page retirement delays conflict\n"); 667 6111 cy152378 668 6111 cy152378 #ifdef sun4v 669 6111 cy152378 init_hdl = hdl; 670 6111 cy152378 cma_lhp = ldom_init(cma_init_alloc, cma_init_free); 671 6111 cy152378 #endif 672 0 stevel } 673 0 stevel 674 0 stevel void 675 0 stevel _fmd_fini(fmd_hdl_t *hdl) 676 0 stevel { 677 6111 cy152378 #ifdef sun4v 678 6111 cy152378 ldom_fini(cma_lhp); 679 6111 cy152378 cma_cpu_fini(hdl); 680 6111 cy152378 #endif 681 0 stevel cma_page_fini(hdl); 682 0 stevel } 683