Home | History | Annotate | Download | only in cpumem-retire
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <cma.h>
     27 
     28 #include <fcntl.h>
     29 #include <unistd.h>
     30 #include <strings.h>
     31 #include <errno.h>
     32 #include <time.h>
     33 #include <fm/fmd_api.h>
     34 #include <fm/fmd_agent.h>
     35 #include <sys/fm/protocol.h>
     36 #include <sys/bl.h>
     37 #include <sys/processor.h>
     38 
     39 static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
     40     uint32_t, boolean_t);
     41 
     42 #ifndef opl
/*
 * Perform retire/unretire by invoking the topo methods registered in the
 * hc-scheme resource.
 *
 * If the fault was diagnosed under the old topology, the resource will not
 * exist in the current topology, so we fall back to legacy retire (using
 * the "cpu" scheme ASRU).
 */
     51 
     52 static boolean_t
     53 old_topo_fault(nvlist_t *nvl)
     54 {
     55 	nvlist_t *rsrc;
     56 #ifdef i386
     57 	nvlist_t **hcl;
     58 	uint_t nhcl = 0;
     59 	char *name;
     60 #endif
     61 
     62 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
     63 		return (B_TRUE);
     64 #ifdef i386
     65 	/*
     66 	 * x86 has moved from "motherboard/chip/cpu" topo to
     67 	 * "motherboard/chip/core/strand"
     68 	 */
     69 	if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
     70 	    == 0 && nhcl == 3 &&
     71 	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
     72 	    strcmp(name, "motherboard") == 0 &&
     73 	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
     74 	    strcmp(name, "chip") == 0 &&
     75 	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
     76 	    strcmp(name, "cpu") == 0)
     77 		return (B_TRUE);
     78 #endif
     79 
     80 	return (B_FALSE);
     81 }
     82 
     83 /* ARGSUSED */
     84 int
     85 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
     86     const char *uuid, boolean_t repair)
     87 {
     88 	int i, err;
     89 	int rc = CMA_RA_SUCCESS;
     90 	nvlist_t *rsrc;
     91 
     92 	/*
     93 	 * For the cached faults which were diagnosed under the old
     94 	 * topology,  we fall back to retire by using cpu-scheme ASRUs.
     95 	 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
     96 	 * sense, the fault should be ignored.
     97 	 */
     98 	if (old_topo_fault(nvl)) {
     99 #ifdef i386
    100 		if (! cma_is_native)
    101 			return (CMA_RA_FAILURE);
    102 #endif
    103 		return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
    104 	}
    105 
    106 	/*
    107 	 * Lookup the resource and call its topo methods to do retire/unretire
    108 	 */
    109 	if ((! repair && ! cma.cma_cpu_dooffline) ||
    110 	    (repair && ! cma.cma_cpu_doonline)) {
    111 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
    112 		    repair ? "unretire" : "retire");
    113 		cma_stats.cpu_supp.fmds_value.ui64++;
    114 	} else {
    115 		err = FMD_AGENT_RETIRE_FAIL;
    116 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
    117 			if (repair) {
    118 				err = fmd_nvl_fmri_unretire(hdl, rsrc);
    119 			} else {
    120 				for (i = 0; i < cma.cma_cpu_tries; i++) {
    121 					err = fmd_nvl_fmri_retire(hdl, rsrc);
    122 					if (err == FMD_AGENT_RETIRE_DONE)
    123 						break;
    124 					(void) nanosleep(&cma.cma_cpu_delay,
    125 					    NULL);
    126 				}
    127 			}
    128 		}
    129 		if (err == FMD_AGENT_RETIRE_DONE) {
    130 			if (repair)
    131 				cma_stats.cpu_repairs.fmds_value.ui64++;
    132 			else
    133 				cma_stats.cpu_flts.fmds_value.ui64++;
    134 		} else {
    135 			rc = CMA_RA_FAILURE;
    136 			cma_stats.bad_flts.fmds_value.ui64++;
    137 #ifdef sun4v
    138 			/* libldom requests are processed asynchronously */
    139 			cma_cpu_start_retry(hdl, nvl, uuid, repair);
    140 #endif
    141 		}
    142 	}
    143 
    144 	if ((! repair && ! cma.cma_cpu_doblacklist) ||
    145 	    (repair && ! cma.cma_cpu_dounblacklist)) {
    146 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
    147 		    repair ? "unblacklist" : "blacklist");
    148 		cma_stats.cpu_blsupp.fmds_value.ui64++;
    149 	} else {
    150 		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
    151 			cma_stats.cpu_blfails.fmds_value.ui64++;
    152 	}
    153 
    154 	return (rc);
    155 }
    156 
    157 #else /* opl */
    158 
    159 /* ARGSUSED 4 */
    160 int
    161 cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    162     const char *uuid, boolean_t repair)
    163 {
    164 	uint_t cpuid;
    165 	uint_t i, nprs;
    166 	nvlist_t **hc_prs = NULL, *hc_spec_nvl;
    167 
    168 	/* OPL has ASRU in "hc" scheme */
    169 	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
    170 	    &hc_spec_nvl) != 0) {
    171 		cma_stats.bad_flts.fmds_value.ui64++;
    172 		fmd_hdl_debug(hdl,
    173 		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
    174 		return (CMA_RA_FAILURE);
    175 	}
    176 
    177 	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
    178 	    &hc_prs, &nprs) != 0) {
    179 		cma_stats.bad_flts.fmds_value.ui64++;
    180 		fmd_hdl_debug(hdl,
    181 		    "cma_cpu_hc_retire lookup cpuid array failed\n");
    182 		return (CMA_RA_FAILURE);
    183 	}
    184 
    185 	for (i = 0; i < nprs; i++) {
    186 		if (nvlist_lookup_uint32(hc_prs[i],
    187 		    FM_FMRI_CPU_ID, &cpuid) != 0) {
    188 			cma_stats.bad_flts.fmds_value.ui64++;
    189 			return (CMA_RA_FAILURE);
    190 		}
    191 
    192 		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
    193 		    != CMA_RA_SUCCESS) {
    194 			cma_stats.bad_flts.fmds_value.ui64++;
    195 			return (CMA_RA_FAILURE);
    196 		}
    197 	}
    198 
    199 	return (CMA_RA_SUCCESS);
    200 }
    201 #endif /* opl */
    202 
/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in circumstances where retire via
 * topo methods can't work, i.e.
 *
 * 1) There are legacy platforms which don't have full topology.
 * 2) The resources in the FMD cached faults may not be set or may not
 *    exist in the up-to-date topology.
 */
    212 
    213 /* ARGSUSED */
    214 static int
    215 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    216     uint32_t cpuid)
    217 {
    218 	int err = CMA_RA_SUCCESS;
    219 
    220 	if (cma.cma_cpu_doonline) {
    221 		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
    222 		    B_TRUE);
    223 	} else {
    224 		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
    225 		    cpuid);
    226 		cma_stats.cpu_supp.fmds_value.ui64++;
    227 	}
    228 
    229 	/* OPL performs the blacklist in the service processor */
    230 #ifndef opl
    231 	if (cma.cma_cpu_dounblacklist) {
    232 		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
    233 			cma_stats.cpu_blfails.fmds_value.ui64++;
    234 	} else {
    235 		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
    236 		cma_stats.cpu_blsupp.fmds_value.ui64++;
    237 	}
    238 #endif /* opl */
    239 
    240 	return (err);
    241 }
    242 
    243 /* ARGSUSED */
    244 static int
    245 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    246     uint32_t cpuid)
    247 {
    248 	int err = CMA_RA_FAILURE;
    249 
    250 	if (cma.cma_cpu_dooffline) {
    251 		int cpustate = P_FAULTED;
    252 
    253 		if (cma.cma_cpu_forcedoffline)
    254 			cpustate |= P_FORCED;
    255 		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
    256 		    B_FALSE);
    257 	} else {
    258 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
    259 		    cpuid);
    260 		cma_stats.cpu_supp.fmds_value.ui64++;
    261 	}
    262 
    263 	/* OPL performs the blacklist in the service processor */
    264 #ifndef opl
    265 	if (cma.cma_cpu_doblacklist) {
    266 		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
    267 			cma_stats.cpu_blfails.fmds_value.ui64++;
    268 	} else {
    269 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
    270 		    cpuid);
    271 		cma_stats.cpu_blsupp.fmds_value.ui64++;
    272 	}
    273 #endif /* opl */
    274 
    275 	return (err);
    276 }
    277 
    278 static int
    279 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
    280     uint32_t cpuid, boolean_t repair)
    281 {
    282 	if (repair)
    283 		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
    284 	else
    285 		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
    286 }
    287 
    288 const char *
    289 p_online_state_fmt(int state)
    290 {
    291 	state &= ~P_FORCED;
    292 	switch (state) {
    293 	case P_OFFLINE:
    294 		return (PS_OFFLINE);
    295 	case P_ONLINE:
    296 		return (PS_ONLINE);
    297 	case P_FAULTED:
    298 		return (PS_FAULTED);
    299 	case P_POWEROFF:
    300 		return (PS_POWEROFF);
    301 	case P_NOINTR:
    302 		return (PS_NOINTR);
    303 	case P_SPARE:
    304 		return (PS_SPARE);
    305 	default:
    306 		return ("unknown");
    307 	}
    308 }
    309 
    310 int
    311 cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    312     const char *uuid, boolean_t repair)
    313 {
    314 	uint_t cpuid;
    315 
    316 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
    317 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
    318 		cma_stats.bad_flts.fmds_value.ui64++;
    319 		return (CMA_RA_FAILURE);
    320 	}
    321 
    322 	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
    323 }
    324