Home | History | Annotate | Download | only in cpumem-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <strings.h>
     29 #include <string.h>
     30 #include <errno.h>
     31 #include <fm/fmd_api.h>
     32 #include <sys/fm/protocol.h>
     33 #include <sys/async.h>
     34 #include <sys/time.h>
     35 #include <cmd.h>
     36 #include <cmd_state.h>
     37 #include <cmd_mem.h>
     38 #include <cmd_dp.h>
     39 #include <cmd_dp_page.h>
     40 #include <libnvpair.h>
     41 #include <fcntl.h>
     42 #include <unistd.h>
     43 #include <sys/mem.h>
     44 #include <sys/plat_datapath.h>
     45 
     46 /*ARGSUSED*/
     47 static nvlist_t *
     48 dp_cpu_fmri(fmd_hdl_t *hdl, uint32_t cpuid, uint64_t serial_id)
     49 {
     50 	nvlist_t	*nvl = NULL;
     51 	int		err;
     52 	char sbuf[21]; /* sizeof (UINT64_MAX) + '\0' */
     53 
     54 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
     55 		return (NULL);
     56 
     57 	err = nvlist_add_string(nvl, FM_FMRI_SCHEME, FM_FMRI_SCHEME_CPU);
     58 	err |= nvlist_add_uint8(nvl, FM_VERSION, FM_CPU_SCHEME_VERSION);
     59 	err |= nvlist_add_uint32(nvl, FM_FMRI_CPU_ID, cpuid);
     60 
     61 	/*
     62 	 * Version 1 calls for a string-based serial number
     63 	 */
     64 	(void) snprintf(sbuf, sizeof (sbuf), "%llX", (u_longlong_t)serial_id);
     65 	err |= nvlist_add_string(nvl, FM_FMRI_CPU_SERIAL_ID, sbuf);
     66 	if (err != 0) {
     67 		nvlist_free(nvl);
     68 		return (NULL);
     69 	}
     70 	return (nvl);
     71 }
     72 
     73 cmd_dp_t *
     74 cmd_dp_lookup_fault(fmd_hdl_t *hdl, uint32_t cpuid)
     75 {
     76 	cmd_dp_t	*ptr;
     77 	int		i, found = 0;
     78 
     79 	/*
     80 	 * Scan the cmd.cmd_datapaths list to see if there is
     81 	 * a fault event present that impacts 'cpuid'
     82 	 */
     83 	for (ptr = cmd_list_next(&cmd.cmd_datapaths); ptr != NULL;
     84 	    ptr = cmd_list_next(ptr)) {
     85 		if (ptr->dp_erpt_type == DP_FAULT) {
     86 			for (i = 0; i < ptr->dp_ncpus; i++) {
     87 				if (ptr->dp_cpuid_list[i] == cpuid) {
     88 					found = 1;
     89 					break;
     90 				}
     91 			}
     92 		}
     93 		if (found)
     94 			break;
     95 	}
     96 
     97 	/*
     98 	 * Check if the FMRI for the found cpuid exists in the domain.
     99 	 * If it does not, it implies a DR has been done and this DP_FAULT
    100 	 * is no longer needed.
    101 	 */
    102 	if (ptr != NULL) {
    103 		nvlist_t	*nvl;
    104 
    105 		nvl = dp_cpu_fmri(hdl, ptr->dp_cpuid_list[i],
    106 		    ptr->dp_serid_list[i]);
    107 
    108 		if (nvl != NULL) {
    109 			if (!fmd_nvl_fmri_present(hdl, nvl)) {
    110 				cmd_dp_destroy(hdl, ptr);
    111 				ptr = NULL;
    112 			}
    113 			nvlist_free(nvl);
    114 		}
    115 	}
    116 	return (ptr);
    117 }
    118 
    119 cmd_dp_t *
    120 cmd_dp_lookup_error(cmd_dp_t *dp)
    121 {
    122 	cmd_dp_t	*ptr;
    123 
    124 	/*
    125 	 * Scan the cmd.cmd_datapaths list to see if there is
    126 	 * an existing error that matches 'dp'. A match is if
    127 	 * both dp_err and the base cpuid are identical
    128 	 */
    129 	for (ptr = cmd_list_next(&cmd.cmd_datapaths); ptr != NULL;
    130 	    ptr = cmd_list_next(ptr)) {
    131 		if (ptr->dp_erpt_type == DP_ERROR) {
    132 			if ((ptr->dp_err == dp->dp_err) &&
    133 			    (ptr->dp_cpuid_list[0] == dp->dp_cpuid_list[0]))
    134 				return (ptr);
    135 		}
    136 	}
    137 	return (NULL);
    138 }
    139 
    140 /*
    141  * Allocates an nvlist_t, and sets ASRU information according to
    142  * the cmd_dp_t provided.
    143  */
    144 /*ARGSUSED*/
    145 nvlist_t *
    146 cmd_dp_setasru(fmd_hdl_t *hdl, cmd_dp_t *dpt)
    147 {
    148 	nvlist_t	*asru, *hcelem[DP_MAX_ASRUS];
    149 	int		i, j, sz, err;
    150 	char		buf[DP_MAX_BUF];
    151 
    152 	sz = dpt->dp_ncpus;
    153 
    154 	/* put ASRUs in an nvlist */
    155 	for (i = 0; i < sz; i++) {
    156 		(void) snprintf(buf, DP_MAX_BUF, "%d", dpt->dp_cpuid_list[i]);
    157 		if (nvlist_alloc(&hcelem[i], NV_UNIQUE_NAME, 0) != 0)
    158 			return (NULL);
    159 
    160 		err = nvlist_add_string(hcelem[i], FM_FMRI_HC_NAME,
    161 		    FM_FMRI_CPU_ID);
    162 		err |= nvlist_add_string(hcelem[i], FM_FMRI_HC_ID, buf);
    163 		if (err != 0) {
    164 			for (j = 0; j < i + 1; j++)
    165 				nvlist_free(hcelem[j]);
    166 			return (NULL);
    167 		}
    168 	}
    169 
    170 	/* put it in an HC scheme */
    171 	if (nvlist_alloc(&asru, NV_UNIQUE_NAME, 0) != 0) {
    172 		for (j = 0; j < sz; j++)
    173 			nvlist_free(hcelem[j]);
    174 		return (NULL);
    175 	}
    176 	err = nvlist_add_uint8(asru, FM_VERSION, FM_HC_SCHEME_VERSION);
    177 	err |= nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
    178 	err |= nvlist_add_string(asru, FM_FMRI_HC_ROOT, "");
    179 	err |= nvlist_add_uint32(asru, FM_FMRI_HC_LIST_SZ, sz);
    180 	err |= nvlist_add_nvlist_array(asru, FM_FMRI_HC_LIST, &hcelem[0],
    181 	    dpt->dp_ncpus);
    182 	if (err != 0) {
    183 		for (j = 0; j < sz; j++)
    184 			nvlist_free(hcelem[j]);
    185 		nvlist_free(asru);
    186 		return (NULL);
    187 	}
    188 
    189 	/* free up memory */
    190 	for (j = 0; j < sz; j++)
    191 		nvlist_free(hcelem[j]);
    192 
    193 	/* return the ASRU */
    194 	return (asru);
    195 }
    196 
    197 void
    198 dp_buf_write(fmd_hdl_t *hdl, cmd_dp_t *dp)
    199 {
    200 	size_t sz;
    201 
    202 	if ((sz = fmd_buf_size(hdl, NULL, dp->dp_bufname)) != 0 &&
    203 	    sz != sizeof (cmd_dp_pers_t))
    204 		fmd_buf_destroy(hdl, NULL, dp->dp_bufname);
    205 
    206 	fmd_buf_write(hdl, NULL, dp->dp_bufname, &dp->dp_pers,
    207 	    sizeof (cmd_dp_pers_t));
    208 }
    209 
    210 static cmd_dp_t *
    211 dp_wrapv0(fmd_hdl_t *hdl, cmd_dp_pers_t *pers, size_t psz)
    212 {
    213 	cmd_dp_t *dp;
    214 
    215 	if (psz != sizeof (cmd_dp_pers_t)) {
    216 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
    217 		    "version 1 state (%u bytes).\n", sizeof (cmd_dp_pers_t));
    218 	}
    219 
    220 	dp = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP);
    221 	bcopy(pers, dp, sizeof (cmd_dp_pers_t));
    222 	fmd_hdl_free(hdl, pers, psz);
    223 	return (dp);
    224 }
    225 
    226 void *
    227 cmd_dp_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
    228 {
    229 	cmd_dp_t *dp;
    230 
    231 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL;
    232 	    dp = cmd_list_next(dp)) {
    233 		if (dp->dp_case == cp)
    234 			break;
    235 	}
    236 
    237 	if (dp == NULL) {
    238 		size_t dpsz;
    239 
    240 		fmd_hdl_debug(hdl, "restoring dp from %s\n", ptr->ptr_name);
    241 
    242 		if ((dpsz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
    243 			if (fmd_case_solved(hdl, cp) ||
    244 			    fmd_case_closed(hdl, cp)) {
    245 				fmd_hdl_debug(hdl, "dp %s from case %s not "
    246 				    "found. Case is already solved or closed\n",
    247 				    ptr->ptr_name, fmd_case_uuid(hdl, cp));
    248 				return (NULL);
    249 			} else {
    250 				fmd_hdl_abort(hdl, "dp referenced by case %s "
    251 				    "does not exist in saved state\n",
    252 				    fmd_case_uuid(hdl, cp));
    253 			}
    254 		} else if (dpsz > CMD_DP_MAXSIZE ||
    255 		    dpsz < CMD_DP_MINSIZE) {
    256 			fmd_hdl_abort(hdl, "dp buffer referenced by "
    257 			    "case %s is out of bounds (is %u bytes, "
    258 			    "max %u, min %u)\n", fmd_case_uuid(hdl, cp),
    259 			    dpsz, CMD_DP_MAXSIZE, CMD_DP_MINSIZE);
    260 		}
    261 
    262 		if ((dp = cmd_buf_read(hdl, NULL, ptr->ptr_name, dpsz)) == NULL)
    263 			fmd_hdl_abort(hdl, "failed to read dp buf %s",
    264 			    ptr->ptr_name);
    265 
    266 		switch (dp->dp_version) {
    267 		case CMD_DP_VERSION_0:
    268 			dp = dp_wrapv0(hdl, (cmd_dp_pers_t *)dp, dpsz);
    269 			break;
    270 		default:
    271 			fmd_hdl_abort(hdl, "unknown version (found %d) "
    272 			    "for dp state referenced by case %s.\n",
    273 			    dp->dp_version, fmd_case_uuid(hdl, cp));
    274 			break;
    275 		}
    276 
    277 		dp->dp_case = cp;
    278 
    279 		if (dp->dp_erpt_type == DP_ERROR) {
    280 			fmd_event_t *ep = fmd_case_getprincipal(hdl, cp);
    281 
    282 			++cmd.cmd_dp_flag;
    283 
    284 			dp->dp_id = fmd_timer_install(hdl,
    285 			    (void *)CMD_TIMERTYPE_DP, ep,
    286 			    (hrtime_t)NANOSEC * (dp->dp_t_value + 120));
    287 		}
    288 
    289 		cmd_list_append(&cmd.cmd_datapaths, dp);
    290 	}
    291 
    292 	return (dp);
    293 }
    294 
    295 void
    296 cmd_dp_close(fmd_hdl_t *hdl, void *arg)
    297 {
    298 	cmd_dp_destroy(hdl, arg);
    299 }
    300 
    301 void
    302 cmd_dp_timeout(fmd_hdl_t *hdl, id_t id)
    303 {
    304 	cmd_dp_t		*dp;
    305 
    306 	/* close case associated with the timer */
    307 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL;
    308 	    dp = cmd_list_next(dp)) {
    309 		if (dp->dp_id == id) {
    310 			cmd_dp_destroy(hdl, dp);
    311 			break;
    312 		}
    313 	}
    314 
    315 	fmd_hdl_debug(hdl, "cmd_dp_timeout() complete\n");
    316 }
    317 
    318 /*
    319  * Validate by matching each cmd_dp_t cpu and serial id to what is
    320  * installed and active on this machine or domain. Delete the cmd_dp_t
    321  * if no match is made.
    322  */
    323 void
    324 cmd_dp_validate(fmd_hdl_t *hdl)
    325 {
    326 	cmd_dp_t *dp, *next;
    327 	nvlist_t *nvl;
    328 	int i, no_match;
    329 
    330 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL; dp = next) {
    331 		next = cmd_list_next(dp);
    332 
    333 		for (i = 0, no_match = 0; i < dp->dp_ncpus; i++) {
    334 			nvl = dp_cpu_fmri(hdl, dp->dp_cpuid_list[i],
    335 			    dp->dp_serid_list[i]);
    336 
    337 			if (nvl == NULL)
    338 				fmd_hdl_abort(hdl, "could not make CPU fmri");
    339 
    340 			if (!fmd_nvl_fmri_present(hdl, nvl))
    341 				no_match = 1;
    342 
    343 			nvlist_free(nvl);
    344 
    345 			if (no_match) {
    346 				cmd_dp_destroy(hdl, dp);
    347 				break;
    348 			}
    349 		}
    350 	}
    351 }
    352 
    353 static void
    354 cmd_dp_free(fmd_hdl_t *hdl, cmd_dp_t *dp, int destroy)
    355 {
    356 	if (dp->dp_case != NULL)
    357 		cmd_case_fini(hdl, dp->dp_case, destroy);
    358 
    359 	if (destroy && dp->dp_erpt_type == DP_ERROR) {
    360 		--cmd.cmd_dp_flag;
    361 		/*
    362 		 * If there are no active datapath events, replay any
    363 		 * pages that were deferred.
    364 		 */
    365 		if (cmd.cmd_dp_flag == 0)
    366 			cmd_dp_page_replay(hdl);
    367 	}
    368 
    369 	if (destroy)
    370 		fmd_buf_destroy(hdl, NULL, dp->dp_bufname);
    371 
    372 	cmd_list_delete(&cmd.cmd_datapaths, dp);
    373 	fmd_hdl_free(hdl, dp, sizeof (cmd_dp_t));
    374 }
    375 
    376 void
    377 cmd_dp_destroy(fmd_hdl_t *hdl, cmd_dp_t *dp)
    378 {
    379 	cmd_dp_free(hdl, dp, FMD_B_TRUE);
    380 }
    381 
    382 /*ARGSUSED*/
    383 int
    384 cmd_dp_error(fmd_hdl_t *hdl)
    385 {
    386 	if (cmd.cmd_dp_flag)
    387 		return (1);
    388 	else
    389 		return (0);
    390 }
    391 
    392 int
    393 cmd_dp_get_mcid(uint64_t addr, int *mcid)
    394 {
    395 	int fd, rc;
    396 	mem_info_t data;
    397 
    398 	if ((fd = open("/dev/mem", O_RDONLY)) < 0)
    399 		return (-1);
    400 
    401 	data.m_addr = addr;
    402 	data.m_synd = 0;
    403 	if ((rc = ioctl(fd, MEM_INFO, &data)) < 0) {
    404 		(void) close(fd);
    405 		return (rc);
    406 	}
    407 
    408 	(void) close(fd);
    409 	*mcid = data.m_mcid;
    410 
    411 	return (0);
    412 }
    413 
    414 /*ARGSUSED*/
    415 int
    416 cmd_dp_fault(fmd_hdl_t *hdl, uint64_t addr)
    417 {
    418 	int mcid;
    419 
    420 	if (cmd_dp_get_mcid(addr, &mcid) < 0)
    421 		fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed");
    422 
    423 	if (cmd_dp_lookup_fault(hdl, mcid) != NULL)
    424 		return (1);
    425 	else
    426 		return (0);
    427 }
    428 
    429 void
    430 cmd_dp_fini(fmd_hdl_t *hdl)
    431 {
    432 	cmd_dp_t *dp;
    433 	cmd_dp_defer_t *dpage;
    434 
    435 	while ((dp = cmd_list_next(&cmd.cmd_datapaths)) != NULL)
    436 		cmd_dp_free(hdl, dp, FMD_B_FALSE);
    437 
    438 	while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) {
    439 		cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
    440 		fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
    441 	}
    442 }
    443