Home | History | Annotate | Download | only in cpumem-retire
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <cma.h>
     27 
     28 #include <sys/fm/ldom.h>
     29 #include <sys/fm/protocol.h>
     30 #include <fm/fmd_fmri.h>
     31 #include <fm/libtopo.h>
     32 
     33 #include <assert.h>
     34 #include <fcntl.h>
     35 #include <unistd.h>
     36 #include <errno.h>
     37 #include <strings.h>
     38 
     39 #include <sys/types.h>
     40 #include <sys/processor.h>
     41 
     42 extern ldom_hdl_t *cma_lhp;
     43 
     44 /*ARGSUSED*/
     45 int
     46 cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair)
     47 {
     48 	if (repair)
     49 		return (ldom_fmri_unblacklist(cma_lhp, fmri));
     50 	else
     51 		return (ldom_fmri_blacklist(cma_lhp, fmri));
     52 }
     53 
     54 int
     55 cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
     56     boolean_t repair)
     57 {
     58 	nvlist_t *fmri;
     59 	int rc, err;
     60 
     61 	/*
     62 	 * Some platforms have special unums for the E$ DIMMs.	If we're dealing
     63 	 * with a platform that has these unums, one will have been added to the
     64 	 * fault as the resource.  We'll use that for the blacklisting.  If we
     65 	 * can't find a resource, we'll fall back to the ASRU.
     66 	 */
     67 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
     68 		fmri = asru;
     69 
     70 	rc = cpu_blacklist_cmd(hdl, fmri, repair);
     71 	err = errno;
     72 
     73 	if (rc < 0 && err != ENOTSUP) {
     74 		errno = err;
     75 		return (-1);
     76 	}
     77 
     78 	return (0);
     79 }
     80 
     81 /*ARGSUSED*/
     82 static int
     83 cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd)
     84 {
     85 	int rc = 0;
     86 	char *scheme;
     87 
     88 	/*
     89 	 * We're using topo retire if the fmri is in "hc" scheme.
     90 	 */
     91 	if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 &&
     92 	    strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) {
     93 		if (cmd != P_STATUS) {
     94 			errno = EINVAL;
     95 			return (-1);
     96 		}
     97 		rc = fmd_nvl_fmri_service_state(hdl, fmri);
     98 		switch (rc) {
     99 		case FMD_SERVICE_STATE_UNUSABLE:
    100 			return (P_FAULTED);
    101 		case -1:
    102 			return (-1);
    103 		default:
    104 			return (P_ONLINE);
    105 		}
    106 	}
    107 
    108 	switch (cmd & ~P_FORCED) {
    109 	case P_STATUS:
    110 		rc = ldom_fmri_status(cma_lhp, fmri);
    111 		break;
    112 	case P_FAULTED:
    113 		rc = ldom_fmri_retire(cma_lhp, fmri);
    114 		break;
    115 	case P_ONLINE:
    116 		rc = ldom_fmri_unretire(cma_lhp, fmri);
    117 		break;
    118 	default:
    119 		errno = EINVAL;
    120 		return (-1);
    121 	}
    122 
    123 	if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) {
    124 		errno = rc;
    125 		return (-1);
    126 	}
    127 
    128 	return (rc);
    129 }
    130 
    131 void
    132 cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
    133     boolean_t repair)
    134 {
    135 	cma_cpu_t *cpu;
    136 	char *scheme;
    137 	uint_t cpuid;
    138 	nvlist_t *asru = NULL;
    139 	topo_hdl_t *thp;
    140 	int err;
    141 
    142 	if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
    143 		return;
    144 	if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
    145 		if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
    146 			return;
    147 	} else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
    148 		return;
    149 	} else {
    150 		/* lookup cpuid from ASRU */
    151 		thp = fmd_fmri_topo_hold(TOPO_VERSION);
    152 		if (thp != NULL) {
    153 			(void) topo_fmri_asru(thp, fmri, &asru, &err);
    154 			fmd_fmri_topo_rele(thp);
    155 		}
    156 		if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
    157 			nvlist_free(asru);
    158 			return;
    159 		}
    160 	}
    161 
    162 	/*
    163 	 * check to see if the cpu has been offline.
    164 	 */
    165 	fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);
    166 
    167 	/*
    168 	 * Create a cpu node and add to the head of the cpu list
    169 	 */
    170 	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
    171 	(void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
    172 	if (uuid != NULL)
    173 		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
    174 
    175 	cpu->cpuid = cpuid;
    176 	cpu->cpu_next = cma.cma_cpus;
    177 	cma.cma_cpus = cpu;
    178 
    179 	if (cma.cma_cpu_timerid != 0)
    180 		fmd_timer_remove(hdl, cma.cma_cpu_timerid);
    181 
    182 	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
    183 
    184 	cma.cma_cpu_timerid =
    185 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
    186 }
    187 
    188 
    189 int
    190 cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid,
    191     int cpustate, boolean_t repair)
    192 {
    193 	int i;
    194 	uint_t cpuid;
    195 
    196 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
    197 		fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
    198 		cma_stats.bad_flts.fmds_value.ui64++;
    199 		return (CMA_RA_FAILURE);
    200 	}
    201 
    202 	/*
    203 	 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
    204 	 * have to set the timer and check the cpu status later.
    205 	 */
    206 	for (i = 0; i < cma.cma_cpu_tries;
    207 	    i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
    208 		if (cpu_cmd(hdl, asru, cpustate) != -1) {
    209 			if (repair)
    210 				cma_stats.cpu_repairs.fmds_value.ui64++;
    211 			else
    212 				cma_stats.cpu_flts.fmds_value.ui64++;
    213 			break;
    214 		}
    215 	}
    216 
    217 	if (i >= cma.cma_cpu_tries) {
    218 		cma_stats.cpu_fails.fmds_value.ui64++;
    219 	}
    220 
    221 	cma_cpu_start_retry(hdl, asru, uuid, repair);
    222 
    223 	return (CMA_RA_FAILURE);
    224 }
    225 
    226 static int
    227 cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
    228 {
    229 	int rc = 0;
    230 
    231 	fmd_hdl_debug(hdl, "cpu_retry()\n");
    232 
    233 	if (cpu->cpu_fmri == NULL) {
    234 		return (1);
    235 	}
    236 
    237 	if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
    238 		fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
    239 		return (1);
    240 	}
    241 
    242 	rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
    243 	if (rc == P_FAULTED || rc == P_OFFLINE) {
    244 		fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
    245 		    cpu->cpuid, cpu->cpu_nretries);
    246 		cma_stats.cpu_flts.fmds_value.ui64++;
    247 
    248 		if (cpu->cpu_uuid != NULL)
    249 			fmd_case_uuclose(hdl, cpu->cpu_uuid);
    250 		return (1); /* success */
    251 	}
    252 
    253 	if (rc == -1) {
    254 		fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
    255 		cma_stats.page_fails.fmds_value.ui64++;
    256 		return (1); /* give up */
    257 	}
    258 
    259 	return (0);
    260 }
    261 
    262 static void
    263 cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu)
    264 {
    265 	if (cpu->cpu_fmri != NULL)
    266 		nvlist_free(cpu->cpu_fmri);
    267 	if (cpu->cpu_uuid != NULL)
    268 		fmd_hdl_strfree(hdl, cpu->cpu_uuid);
    269 	fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t));
    270 }
    271 
    272 void
    273 cma_cpu_retry(fmd_hdl_t *hdl)
    274 {
    275 	cma_cpu_t **cpup;
    276 
    277 	fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");
    278 
    279 	cma.cma_cpu_timerid = 0;
    280 
    281 	cpup = &cma.cma_cpus;
    282 	while (*cpup != NULL) {
    283 		cma_cpu_t *cpu = *cpup;
    284 
    285 		if (cpu_retry(hdl, cpu)) {
    286 			/*
    287 			 * Successful retry or we're giving up - remove from
    288 			 * the list
    289 			 */
    290 			*cpup = cpu->cpu_next;
    291 
    292 			cma_cpu_free(hdl, cpu);
    293 		} else {
    294 			cpu->cpu_nretries++;
    295 			cpup = &cpu->cpu_next;
    296 		}
    297 	}
    298 
    299 	if (cma.cma_cpus == NULL)
    300 		return; /* no more cpus */
    301 
    302 	/*
    303 	 * We still have cpus to check.  Back the delay
    304 	 * off, and schedule a retry.
    305 	 */
    306 	cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
    307 	    cma.cma_cpu_maxdelay);
    308 
    309 	fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
    310 	    (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));
    311 
    312 	cma.cma_cpu_timerid =
    313 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
    314 }
    315 
    316 void
    317 cma_cpu_fini(fmd_hdl_t *hdl)
    318 {
    319 	cma_cpu_t *cpu;
    320 
    321 	while ((cpu = cma.cma_cpus) != NULL) {
    322 		cma.cma_cpus = cpu->cpu_next;
    323 		cma_cpu_free(hdl, cpu);
    324 	}
    325 }
    326