Home | History | Annotate | Download | only in cpumem-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * OPL platform specific functions for
     31  * CPU/Memory error diagnosis engine.
     32  */
     33 #include <cmd.h>
     34 #include <cmd_dimm.h>
     35 #include <cmd_bank.h>
     36 #include <cmd_page.h>
     37 #include <cmd_opl.h>
     38 #include <string.h>
     39 #include <errno.h>
     40 #include <fcntl.h>
     41 #include <unistd.h>
     42 #include <dirent.h>
     43 #include <sys/stat.h>
     44 
     45 #include <sys/fm/protocol.h>
     46 #include <sys/fm/io/opl_mc_fm.h>
     47 #include <sys/async.h>
     48 #include <sys/opl_olympus_regs.h>
     49 #include <sys/fm/cpu/SPARC64-VI.h>
     50 #include <sys/int_const.h>
     51 #include <sys/mutex.h>
     52 #include <sys/dditypes.h>
     53 #include <opl/sys/mc-opl.h>
     54 
     55 /*
     56  * The following is the common function for handling
     57  * memory UE with EID=MEM.
     58  * The error could be detected by either CPU/IO.
     59  */
     60 cmd_evdisp_t
     61 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
     62     int hdlr_type)
     63 {
     64 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
     65 	uint64_t ubc_ue_log_reg, pa;
     66 	cmd_page_t *page;
     67 
     68 	if (nvlist_lookup_nvlist(nvl,
     69 	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
     70 		return (CMD_EVD_BAD);
     71 
     72 	switch (hdlr_type) {
     73 	case CMD_OPL_HDLR_CPU:
     74 
     75 		if (nvlist_lookup_uint64(nvl,
     76 		    FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
     77 			return (CMD_EVD_BAD);
     78 
     79 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
     80 		    (u_longlong_t)pa);
     81 		break;
     82 
     83 	case CMD_OPL_HDLR_IO:
     84 
     85 		if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
     86 		    &ubc_ue_log_reg) != 0)
     87 			return (CMD_EVD_BAD);
     88 
     89 		pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
     90 
     91 		fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
     92 		    (u_longlong_t)ubc_ue_log_reg);
     93 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
     94 		    (u_longlong_t)pa);
     95 		break;
     96 
     97 	default:
     98 
     99 		return (CMD_EVD_BAD);
    100 	}
    101 
    102 	if ((page = cmd_page_lookup(pa)) != NULL &&
    103 	    page->page_case.cc_cp != NULL &&
    104 	    fmd_case_solved(hdl, page->page_case.cc_cp))
    105 		return (CMD_EVD_REDUND);
    106 
    107 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
    108 		fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
    109 		return (CMD_EVD_BAD);
    110 	}
    111 
    112 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
    113 		nvlist_free(asru);
    114 		CMD_STAT_BUMP(bad_mem_asru);
    115 		return (CMD_EVD_BAD);
    116 	}
    117 
    118 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
    119 		nvlist_free(asru);
    120 		return (CMD_EVD_BAD);
    121 	}
    122 
    123 	cmd_page_fault(hdl, asru, fru, ep, pa);
    124 	nvlist_free(asru);
    125 	nvlist_free(fru);
    126 	return (CMD_EVD_OK);
    127 }
    128 
    129 /*
    130  * The following is the main function to handle generating
    131  * the sibling cpu suspect list for the CPU detected UE
    132  * error cases.  This is to handle the
    133  * multiple strand/core architecture on the OPL platform.
    134  */
    135 cmd_evdisp_t
    136 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
    137     const char *class, const char *fltname,
    138     cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
    139     cmd_case_t *cc, uint8_t cpumask)
    140 {
    141 	const char *uuid;
    142 	cmd_cpu_t *main_cpu, *sib_cpu;
    143 	nvlist_t *fmri;
    144 	cmd_list_t *cpu_list;
    145 	opl_cpu_t *opl_cpu;
    146 	uint32_t main_cpuid, nsusp = 1;
    147 	uint8_t cert;
    148 
    149 	fmd_hdl_debug(hdl,
    150 	    "Enter OPL_CPUUE_HANDLER for class %x\n", class);
    151 
    152 	main_cpu = cpu;
    153 	main_cpuid = cpu->cpu_cpuid;
    154 
    155 	if (strcmp(fltname, "core") == 0)
    156 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
    157 		    IS_CORE);
    158 	else if (strcmp(fltname, "chip") == 0)
    159 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
    160 		    IS_CHIP);
    161 	else
    162 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
    163 		    IS_STRAND);
    164 
    165 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
    166 	    opl_cpu = cmd_list_next(opl_cpu)) {
    167 		if (opl_cpu->oc_cpuid == main_cpuid) {
    168 			sib_cpu = main_cpu;
    169 			opl_cpu->oc_cmd_cpu = main_cpu;
    170 		} else {
    171 			fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
    172 			if (fmri == NULL) {
    173 				opl_cpu->oc_cmd_cpu = NULL;
    174 				fmd_hdl_debug(hdl,
    175 				    "missing asru, cpuid %u excluded\n",
    176 				    opl_cpu->oc_cpuid);
    177 				continue;
    178 			}
    179 
    180 			sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
    181 			    CMD_CPU_LEVEL_THREAD);
    182 			if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
    183 				if (fmri != NULL)
    184 					nvlist_free(fmri);
    185 				opl_cpu->oc_cmd_cpu = NULL;
    186 				fmd_hdl_debug(hdl,
    187 				"cpu not present, cpuid %u excluded\n",
    188 				    opl_cpu->oc_cpuid);
    189 				continue;
    190 			}
    191 			opl_cpu->oc_cmd_cpu = sib_cpu;
    192 			if (fmri != NULL)
    193 				nvlist_free(fmri);
    194 			nsusp++;
    195 		}
    196 		if (cpu->cpu_cpuid == main_cpuid) {
    197 			if (cc->cc_cp != NULL &&
    198 			    fmd_case_solved(hdl, cc->cc_cp)) {
    199 				if (cpu_list != NULL)
    200 					opl_cpulist_free(hdl, cpu_list);
    201 				return (CMD_EVD_REDUND);
    202 			}
    203 
    204 			if (cc->cc_cp == NULL)
    205 				cc->cc_cp = cmd_case_create(hdl,
    206 				    &cpu->cpu_header, ptr, &uuid);
    207 
    208 			if (cc->cc_serdnm != NULL) {
    209 				fmd_hdl_debug(hdl,
    210 			"destroying existing %s state for class %x\n",
    211 				    cc->cc_serdnm, class);
    212 				fmd_serd_destroy(hdl, cc->cc_serdnm);
    213 				fmd_hdl_strfree(hdl, cc->cc_serdnm);
    214 				cc->cc_serdnm = NULL;
    215 				fmd_case_reset(hdl, cc->cc_cp);
    216 			}
    217 			fmd_case_add_ereport(hdl, cc->cc_cp, ep);
    218 		}
    219 	}
    220 	cert = opl_avg(100, nsusp);
    221 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
    222 	    opl_cpu = cmd_list_next(opl_cpu)) {
    223 		if (opl_cpu->oc_cmd_cpu != NULL) {
    224 			nvlist_t *cpu_rsrc;
    225 
    226 			cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
    227 			if (cpu_rsrc == NULL) {
    228 				fmd_hdl_debug(hdl,
    229 				"missing rsrc, cpuid %u excluded\n",
    230 				    opl_cpu->oc_cpuid);
    231 				continue;
    232 			}
    233 			cmd_cpu_create_faultlist(hdl, cc->cc_cp,
    234 			    opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
    235 			nvlist_free(cpu_rsrc);
    236 		}
    237 	}
    238 	fmd_case_solve(hdl, cc->cc_cp);
    239 	if (cpu_list != NULL)
    240 		opl_cpulist_free(hdl, cpu_list);
    241 	return (CMD_EVD_OK);
    242 }
    243 
    244 /*
    245  * Generates DIMM fault if the number of Permanent CE
    246  * threshold is exceeded.
    247  */
    248 static void
    249 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
    250 {
    251 	nvlist_t *dflt;
    252 	fmd_case_t *cp;
    253 
    254 	fmd_hdl_debug(hdl,
    255 	    "Permanent CE event threshold checking.\n");
    256 
    257 	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
    258 		/* We've already complained about this DIMM */
    259 		return;
    260 	}
    261 
    262 	if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
    263 	    "max_perm_ce_dimm")) {
    264 		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
    265 		cp = fmd_case_open(hdl, NULL);
    266 		dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
    267 		    CMD_FLTMAXCONF);
    268 		fmd_case_add_suspect(hdl, cp, dflt);
    269 		fmd_case_solve(hdl, cp);
    270 	}
    271 }
    272 
    273 /*
    274  * Notify fault page information (pa and errlog) to XSCF via mc-opl
    275  */
    276 #define	MC_PHYDEV_DIR	"/devices"
    277 #define	MC_PHYPREFIX	"pseudo-mc@"
    278 static int
    279 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
    280 {
    281 	uint32_t *eadd, *elog;
    282 	uint_t n;
    283 	uint64_t pa;
    284 	char path[MAXPATHLEN];
    285 	char *unum;
    286 	nvlist_t *rsrc;
    287 	DIR *mcdir;
    288 	struct dirent *dp;
    289 	mc_flt_page_t flt_page;
    290 	cmd_page_t *page;
    291 	struct stat statbuf;
    292 
    293 	/*
    294 	 * Extract ereport.
    295 	 * Sanity check of pa is already done at cmd_opl_mac_common().
    296 	 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
    297 	 * and MC_OPL_BANK.
    298 	 */
    299 	if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
    300 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
    301 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
    302 		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
    303 		return (-1);
    304 	}
    305 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
    306 	    &rsrc) != 0) {
    307 		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
    308 		return (-1);
    309 	}
    310 	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
    311 		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
    312 		return (-1);
    313 	}
    314 
    315 	page = cmd_page_lookup(pa);
    316 	if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
    317 		/*
    318 		 * fault.memory.page will not be created.
    319 		 */
    320 		return (0);
    321 	}
    322 
    323 	flt_page.err_add = eadd[0];
    324 	flt_page.err_log = elog[0];
    325 	flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
    326 	flt_page.fmri_sz = strlen(unum) + 1;
    327 
    328 	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
    329 	    unum, strlen(unum) + 1);
    330 	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
    331 	    pa, eadd[0], elog[0]);
    332 
    333 	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
    334 		while ((dp = readdir(mcdir)) != NULL) {
    335 			int fd;
    336 
    337 			if (strncmp(dp->d_name, MC_PHYPREFIX,
    338 			    strlen(MC_PHYPREFIX)) != 0)
    339 				continue;
    340 
    341 			(void) snprintf(path, sizeof (path),
    342 			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);
    343 
    344 			if (stat(path, &statbuf) != 0 ||
    345 			    (statbuf.st_mode & S_IFCHR) == 0) {
    346 				/* skip if not a character device */
    347 				continue;
    348 			}
    349 
    350 			if ((fd = open(path, O_RDONLY)) < 0)
    351 				continue;
    352 
    353 			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
    354 				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
    355 				    path);
    356 				(void) close(fd);
    357 				(void) closedir(mcdir);
    358 				return (0);
    359 			}
    360 			(void) close(fd);
    361 		}
    362 		(void) closedir(mcdir);
    363 	}
    364 
    365 	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
    366 
    367 	return (-1);
    368 }
    369 
    370 /*
    371  * This is the common function for processing MAC detected
    372  * Intermittent and Permanent CEs.
    373  */
    374 
    375 cmd_evdisp_t
    376 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
    377     nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
    378 {
    379 	cmd_dimm_t *dimm;
    380 	const char *uuid;
    381 
    382 	fmd_hdl_debug(hdl,
    383 	    "Processing CE ereport\n");
    384 
    385 	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
    386 	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
    387 		return (CMD_EVD_UNUSED);
    388 
    389 	if (dimm->dimm_case.cc_cp == NULL) {
    390 		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
    391 		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
    392 	}
    393 
    394 	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
    395 		CMD_STAT_BUMP(ce_interm);
    396 		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
    397 		    "to CE serd engine\n");
    398 
    399 		if (dimm->dimm_case.cc_serdnm == NULL) {
    400 			dimm->dimm_case.cc_serdnm =
    401 			    cmd_mem_serdnm_create(hdl,
    402 			    "dimm", dimm->dimm_unum);
    403 			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
    404 			    fmd_prop_get_int32(hdl, "ce_n"),
    405 			    fmd_prop_get_int64(hdl, "ce_t"));
    406 		}
    407 
    408 		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
    409 		    FMD_B_FALSE) {
    410 			return (CMD_EVD_OK); /* engine hasn't fired */
    411 		}
    412 		fmd_hdl_debug(hdl, "ce serd fired\n");
    413 		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
    414 		    dimm->dimm_case.cc_serdnm);
    415 		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
    416 
    417 		(void) opl_scf_log(hdl, nvl);
    418 	} else {
    419 		CMD_STAT_BUMP(ce_sticky);
    420 	}
    421 
    422 	dimm->dimm_nretired++;
    423 	dimm->dimm_retstat.fmds_value.ui64++;
    424 	cmd_dimm_dirty(hdl, dimm);
    425 
    426 	cmd_page_fault(hdl, asru, fru, ep, pa);
    427 	opl_ce_thresh_check(hdl, dimm);
    428 
    429 	return (CMD_EVD_OK);
    430 }
    431 
    432 /*
    433  * This is the common entry for processing MAC detected errors.
    434  * It is responsible for generating the memory page fault event.
    435  * The permanent CE (sticky) in normal mode is handled here also
    436  * in the same way as in the UE case.
    437  */
    438 /*ARGSUSED*/
    439 cmd_evdisp_t
    440 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    441     const char *class, cmd_errcl_t clcode)
    442 {
    443 	uint64_t pa;
    444 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
    445 	cmd_page_t *page;
    446 
    447 	fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode);
    448 
    449 	if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
    450 		return (CMD_EVD_BAD);
    451 
    452 	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
    453 	    != 0)
    454 		return (CMD_EVD_BAD);
    455 
    456 	/*
    457 	 * Check for invalid pa.
    458 	 * The most sig. bit should not be on.
    459 	 * It would be out of the range of possible pa
    460 	 * in MAC's view.
    461 	 */
    462 	if (((uint64_t)1 << 63) & pa)
    463 		return (CMD_EVD_BAD);
    464 
    465 	if ((page = cmd_page_lookup(pa)) != NULL &&
    466 	    page->page_case.cc_cp != NULL &&
    467 	    fmd_case_solved(hdl, page->page_case.cc_cp))
    468 		return (CMD_EVD_REDUND);
    469 
    470 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
    471 		fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
    472 		return (CMD_EVD_BAD);
    473 	}
    474 
    475 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
    476 		fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
    477 		nvlist_free(asru);
    478 		CMD_STAT_BUMP(bad_mem_asru);
    479 		return (CMD_EVD_BAD);
    480 	}
    481 
    482 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
    483 		fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
    484 		nvlist_free(asru);
    485 		return (CMD_EVD_BAD);
    486 	}
    487 
    488 	/*
    489 	 * process PCE and ICE to create DIMM fault
    490 	 */
    491 	if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
    492 	    strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
    493 	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
    494 		cmd_evdisp_t ret;
    495 
    496 		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
    497 		nvlist_free(asru);
    498 		nvlist_free(fru);
    499 		if (ret != CMD_EVD_OK) {
    500 			fmd_hdl_debug(hdl,
    501 			    "cmd_opl_mac_common: mac_ce failed\n");
    502 			return (CMD_EVD_BAD);
    503 		} else
    504 			return (CMD_EVD_OK);
    505 	}
    506 
    507 	/* The following code handles page retires for UEs and CMPEs.  */
    508 
    509 	cmd_page_fault(hdl, asru, fru, ep, pa);
    510 	nvlist_free(asru);
    511 	nvlist_free(fru);
    512 	return (CMD_EVD_OK);
    513 }
    514 
    515 /*
    516  * Common entry points for handling CPU/IO detected UE with
    517  * respect to EID=MEM.
    518  */
    519 /*ARGSUSED*/
    520 cmd_evdisp_t
    521 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    522     const char *class, cmd_errcl_t clcode)
    523 {
    524 	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
    525 }
    526 
    527 /*ARGSUSED*/
    528 cmd_evdisp_t
    529 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    530     const char *class, cmd_errcl_t clcode)
    531 {
    532 	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
    533 }
    534