Home | History | Annotate | Download | only in cpumem-retire
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Page retirement can be an extended process due to the fact that a retirement
     29  * may not be possible when the original request is made.  The kernel will
     30  * repeatedly attempt to retire a given page, but will not let us know when the
     31  * page has been retired.  We therefore have to poll to see if the retirement
     32  * has been completed.  This poll is implemented with a bounded exponential
     33  * backoff to reduce the burden which we impose upon the system.
     34  *
     35  * To reduce the burden on fmd in the face of retirement storms, we schedule
     36  * all retries as a group.  In the simplest case, we attempt to retire a single
     37  * page.  When forced to retry, we initially schedule a retry at a configurable
     38  * interval t.  If the retry fails, we schedule another at 2 * t, and so on,
     39  * until t reaches the maximum interval (also configurable).  Future retries
     40  * for that page will occur with t equal to the maximum interval value.  We
     41  * will never give up on a retirement.
     42  *
     43  * With multiple retirements, the situation gets slightly more complicated.  As
     44  * indicated above, we schedule retries as a group.  We don't want to deny new
     45  * pages their short retry intervals, so we'll (re)set the retry interval to the
     46  * value appropriate for the newest page.
     47  */
     48 
     49 #include <cma.h>
     50 
     51 #include <time.h>
     52 #include <errno.h>
     53 #include <unistd.h>
     54 #include <strings.h>
     55 #include <fm/fmd_api.h>
     56 #include <fm/libtopo.h>
     57 #include <fm/fmd_fmri.h>
     58 #include <fm/fmd_agent.h>
     59 #include <sys/fm/protocol.h>
     60 
     61 static void
     62 cma_page_free(fmd_hdl_t *hdl, cma_page_t *page)
     63 {
     64 	nvlist_free(page->pg_asru);
     65 	nvlist_free(page->pg_rsrc);
     66 	fmd_hdl_free(hdl, page, sizeof (cma_page_t));
     67 }
     68 
     69 /*
     70  * Retire the specified ASRU, referring to a memory page by PA or by DIMM
     71  * offset (i.e. the encoded coordinates internal bank, row, and column).
     72  * In the initial FMA implementation, fault.memory.page exported an ASRU
     73  * with an explicit physical address, which is valid at the initial time of
     74  * diagnosis but may not be later following DR, DIMM removal, or interleave
     75  * changes.  On SPARC, this issue was solved by exporting the DIMM offset
     76  * and pushing the entire FMRI to the platform memory controller through
     77  * /dev/fm so it can derive the current PA from the DIMM and offset.
     78  * On x86, we also encode DIMM and offset in hc-specific, which is then used
     79  * by the x64 memory controller driver.
     80  * At some point these three approaches need to be rationalized: all platforms
     81  * should use the same scheme, either with decoding in the kernel or decoding
     82  * in userland (i.e. with a libtopo method to compute and update the PA).
     83  */
     84 /*ARGSUSED*/
     85 int
     86 cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
     87     const char *uuid, boolean_t repair)
     88 {
     89 	cma_page_t *page;
     90 	uint64_t pageaddr;
     91 	const char *action = repair ? "unretire" : "retire";
     92 	int rc;
     93 	nvlist_t *rsrc = NULL, *asrucp = NULL, *hcsp;
     94 
     95 	(void) nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc);
     96 
     97 	if (nvlist_dup(asru, &asrucp, 0) != 0) {
     98 		fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
     99 		return (CMA_RA_FAILURE);
    100 	}
    101 
    102 	/* It should already be expanded, but we'll do it again anyway */
    103 	if (fmd_nvl_fmri_expand(hdl, asrucp) < 0) {
    104 		fmd_hdl_debug(hdl, "failed to expand page asru\n");
    105 		cma_stats.bad_flts.fmds_value.ui64++;
    106 		nvlist_free(asrucp);
    107 		return (CMA_RA_FAILURE);
    108 	}
    109 
    110 	if (!repair && !fmd_nvl_fmri_present(hdl, asrucp)) {
    111 		fmd_hdl_debug(hdl, "page retire overtaken by events\n");
    112 		cma_stats.page_nonent.fmds_value.ui64++;
    113 		nvlist_free(asrucp);
    114 		return (CMA_RA_SUCCESS);
    115 	}
    116 
    117 	/* Figure out physaddr from resource or asru */
    118 	if (rsrc == NULL ||
    119 	    nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
    120 	    (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
    121 	    &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
    122 	    FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
    123 		if (nvlist_lookup_uint64(asrucp, FM_FMRI_MEM_PHYSADDR,
    124 		    &pageaddr) != 0) {
    125 			fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
    126 			cma_stats.bad_flts.fmds_value.ui64++;
    127 			nvlist_free(asrucp);
    128 			return (CMA_RA_FAILURE);
    129 		}
    130 	}
    131 
    132 	if (repair) {
    133 		if (!cma.cma_page_dounretire) {
    134 			fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
    135 			    (u_longlong_t)pageaddr);
    136 			cma_stats.page_supp.fmds_value.ui64++;
    137 			nvlist_free(asrucp);
    138 			return (CMA_RA_SUCCESS);
    139 		}
    140 		/* If unretire via topo fails, we fall back to legacy way */
    141 		if (rsrc == NULL || (rc = fmd_nvl_fmri_unretire(hdl, rsrc)) < 0)
    142 			rc = cma_fmri_page_unretire(hdl, asrucp);
    143 	} else {
    144 		if (!cma.cma_page_doretire) {
    145 			fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
    146 			    (u_longlong_t)pageaddr);
    147 			cma_stats.page_supp.fmds_value.ui64++;
    148 			nvlist_free(asrucp);
    149 			return (CMA_RA_FAILURE);
    150 		}
    151 		/* If retire via topo fails, we fall back to legacy way */
    152 		if (rsrc == NULL || (rc = fmd_nvl_fmri_retire(hdl, rsrc)) < 0)
    153 			rc = cma_fmri_page_retire(hdl, asrucp);
    154 	}
    155 
    156 	if (rc == FMD_AGENT_RETIRE_DONE) {
    157 		fmd_hdl_debug(hdl, "%sd page 0x%llx\n",
    158 		    action, (u_longlong_t)pageaddr);
    159 		if (repair)
    160 			cma_stats.page_repairs.fmds_value.ui64++;
    161 		else
    162 			cma_stats.page_flts.fmds_value.ui64++;
    163 		nvlist_free(asrucp);
    164 		return (CMA_RA_SUCCESS);
    165 	} else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
    166 		fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
    167 		    "retry: %s\n", action, (u_longlong_t)pageaddr,
    168 		    strerror(errno));
    169 
    170 		cma_stats.page_fails.fmds_value.ui64++;
    171 		nvlist_free(asrucp);
    172 		return (CMA_RA_FAILURE);
    173 	}
    174 
    175 	/*
    176 	 * The page didn't immediately retire.  We'll need to periodically
    177 	 * check to see if it has been retired.
    178 	 */
    179 	fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");
    180 
    181 	page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
    182 	page->pg_addr = pageaddr;
    183 	if (rsrc != NULL)
    184 		(void) nvlist_dup(rsrc, &page->pg_rsrc, 0);
    185 	page->pg_asru = asrucp;
    186 	if (uuid != NULL)
    187 		page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
    188 
    189 	page->pg_next = cma.cma_pages;
    190 	cma.cma_pages = page;
    191 
    192 	if (cma.cma_page_timerid != 0)
    193 		fmd_timer_remove(hdl, cma.cma_page_timerid);
    194 
    195 	cma.cma_page_curdelay = cma.cma_page_mindelay;
    196 
    197 	cma.cma_page_timerid =
    198 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
    199 
    200 	/* Don't free asrucp here.  This FMRI will be needed for retry. */
    201 	return (CMA_RA_FAILURE);
    202 }
    203 
    204 static int
    205 page_retry(fmd_hdl_t *hdl, cma_page_t *page)
    206 {
    207 	int rc;
    208 
    209 	if (page->pg_asru != NULL &&
    210 	    !fmd_nvl_fmri_present(hdl, page->pg_asru)) {
    211 		fmd_hdl_debug(hdl, "page retire overtaken by events");
    212 		cma_stats.page_nonent.fmds_value.ui64++;
    213 
    214 		if (page->pg_uuid != NULL)
    215 			fmd_case_uuclose(hdl, page->pg_uuid);
    216 		return (1); /* no longer a page to retire */
    217 	}
    218 
    219 	if (page->pg_rsrc == NULL ||
    220 	    (rc = fmd_nvl_fmri_service_state(hdl, page->pg_rsrc)) < 0)
    221 		rc = cma_fmri_page_service_state(hdl, page->pg_asru);
    222 
    223 	if (rc == FMD_SERVICE_STATE_UNUSABLE) {
    224 		fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n",
    225 		    page->pg_addr, page->pg_nretries);
    226 		cma_stats.page_flts.fmds_value.ui64++;
    227 
    228 		if (page->pg_uuid != NULL)
    229 			fmd_case_uuclose(hdl, page->pg_uuid);
    230 		return (1); /* page retired */
    231 	}
    232 
    233 	if (rc == FMD_SERVICE_STATE_ISOLATE_PENDING) {
    234 		fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n",
    235 		    page->pg_addr);
    236 		return (0); /* schedule another retry */
    237 	} else {
    238 		fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
    239 		    "retirement: %s\n", page->pg_addr,
    240 		    strerror(errno));
    241 
    242 		cma_stats.page_fails.fmds_value.ui64++;
    243 		return (1); /* give up */
    244 	}
    245 }
    246 
    247 void
    248 cma_page_retry(fmd_hdl_t *hdl)
    249 {
    250 	cma_page_t **pagep;
    251 
    252 	cma.cma_page_timerid = 0;
    253 
    254 	fmd_hdl_debug(hdl, "page_retry: timer fired\n");
    255 
    256 	pagep = &cma.cma_pages;
    257 	while (*pagep != NULL) {
    258 		cma_page_t *page = *pagep;
    259 
    260 		if (page_retry(hdl, page)) {
    261 			/*
    262 			 * Successful retry or we're giving up - remove from
    263 			 * the list
    264 			 */
    265 			*pagep = page->pg_next;
    266 
    267 			if (page->pg_uuid != NULL)
    268 				fmd_hdl_strfree(hdl, page->pg_uuid);
    269 
    270 			cma_page_free(hdl, page);
    271 		} else {
    272 			page->pg_nretries++;
    273 			pagep = &page->pg_next;
    274 		}
    275 	}
    276 
    277 	if (cma.cma_pages == NULL)
    278 		return; /* no more retirements */
    279 
    280 	/*
    281 	 * We still have retirements that haven't completed.  Back the delay
    282 	 * off, and schedule a retry.
    283 	 */
    284 	cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2,
    285 	    cma.cma_page_maxdelay);
    286 
    287 	fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n",
    288 	    (u_longlong_t)(cma.cma_page_curdelay / NANOSEC));
    289 
    290 	cma.cma_page_timerid =
    291 	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
    292 }
    293 
    294 void
    295 cma_page_fini(fmd_hdl_t *hdl)
    296 {
    297 	cma_page_t *page;
    298 
    299 	while ((page = cma.cma_pages) != NULL) {
    300 		cma.cma_pages = page->pg_next;
    301 		if (page->pg_uuid != NULL)
    302 			fmd_hdl_strfree(hdl, page->pg_uuid);
    303 		cma_page_free(hdl, page);
    304 	}
    305 }
    306