Home | History | Annotate | Download | only in cpumem-diagnosis
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * Support routines for managing per-CPU state.
     30  */
     31 
     32 #include <cmd_cpu.h>
     33 #include <cmd_mem.h>
     34 #include <cmd.h>
     35 
     36 #include <stdio.h>
     37 #include <string.h>
     38 #include <strings.h>
     39 #include <errno.h>
     40 #include <kstat.h>
     41 #include <fm/fmd_api.h>
     42 #include <sys/async.h>
     43 #include <sys/fm/protocol.h>
     44 #include <sys/fm/cpu/UltraSPARC-T1.h>
     45 #include <sys/niagararegs.h>
     46 #include <cmd_hc_sun4v.h>
     47 
     48 int cmd_afsr_check(fmd_hdl_t *,  uint64_t, cmd_errcl_t, uint8_t *);
     49 
     50 const errdata_t l3errdata =
     51 	{ &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_CPU_L3DATA  };
     52 const errdata_t n1l2errdata =
     53 	{ &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_CPU_L2DATA };
     54 const errdata_t n2ce_l2errdata =
     55 	{ &cmd.cmd_l2data_serd, "l2data-c", CMD_PTR_CPU_L2DATA };
     56 const errdata_t n2ue_l2errdata =
     57 	{ &cmd.cmd_l2data_serd, "l2data-u", CMD_PTR_CPU_L2DATA };
     58 const errdata_t miscregsdata =
     59 	{ &cmd.cmd_miscregs_serd, "misc_reg", CMD_PTR_CPU_MISC_REGS };
     60 const errdata_t dcachedata =
     61 	{ &cmd.cmd_dcache_serd, "dcache", CMD_PTR_CPU_DCACHE };
     62 const errdata_t icachedata =
     63 	{ &cmd.cmd_icache_serd, "icache", CMD_PTR_CPU_ICACHE };
     64 
     65 static int
     66 cmd_xr_error_type(cmd_errcl_t clcode)
     67 {
     68 	if (CMD_ERRCL_ISMISCREGS(clcode))
     69 		return (MISCREGS_ERR);
     70 	else if (CMD_ERRCL_ISL2XXCU(clcode))
     71 		return (L2_ERR);
     72 	else if (CMD_ERRCL_ISL2ND(clcode))
     73 		return (L2ND_ERR);
     74 	else if (CMD_ERRCL_ISMEM(clcode))
     75 		return (MEM_ERR);
     76 	else if (CMD_ERRCL_ISDCDP(clcode))
     77 		return (DCDP_ERR);
     78 	else if (CMD_ERRCL_ISICDP(clcode))
     79 		return (ICDP_ERR);
     80 	else if (CMD_ERRCL_REMOTEL2(clcode))
     81 		return (REMOTE_L2ERR);
     82 	else
     83 		return (UNKNOWN_ERR);
     84 }
     85 
     86 void
     87 cmd_fill_errdata(cmd_errcl_t clcode, cmd_cpu_t *cpu, cmd_case_t **cc,
     88     const errdata_t **ed)
     89 {
     90 	int err_type;
     91 
     92 	err_type = cmd_xr_error_type(clcode);
     93 	switch (err_type) {
     94 		case MISCREGS_ERR:
     95 			*ed = &miscregsdata;
     96 			*cc = &cpu->cpu_misc_regs;
     97 			break;
     98 		case L2_ERR:
     99 		case REMOTE_L2ERR:
    100 			if (cpu->cpu_type == CPU_ULTRASPARC_T1) {
    101 				*ed = &n1l2errdata;
    102 				*cc = &cpu->cpu_l2data;
    103 			} else {
    104 				if (CMD_ERRCL_ISL2CE(clcode)) {
    105 					*ed = &n2ce_l2errdata;
    106 					*cc = &cpu->cpu_l2data;
    107 				} else {
    108 					*ed = &n2ue_l2errdata;
    109 					*cc = &cpu->cpu_l2data;
    110 				}
    111 			}
    112 			break;
    113 		case DCDP_ERR:
    114 			*ed = &dcachedata;
    115 			*cc = &cpu->cpu_dcache;
    116 			break;
    117 		case ICDP_ERR:
    118 			*ed = &icachedata;
    119 			*cc = &cpu->cpu_icache;
    120 			break;
    121 		/*
    122 		 * When an error goes through the train, it requires
    123 		 * to have cmd_case_t & errdata_t structures even it is not
    124 		 * diagnosed when the error is resolved. Sun4v does
    125 		 * does not have a L3 error, but the L3 cpu case was defined,
    126 		 * so its data structures are used for the default cases.
    127 		 */
    128 		default:
    129 			*ed = &l3errdata;
    130 			*cc = &cpu->cpu_l3data;
    131 			break;
    132 	}
    133 }
    134 
    135 int
    136 cmd_afar_status_check(uint8_t afar_status, cmd_errcl_t clcode)
    137 {
    138 
    139 	/*
    140 	 * There is no L2 data for a remote write back
    141 	 * cache error in the ereport, so skip the status check
    142 	 */
    143 	if (clcode == CMD_ERRCL_WBUE)
    144 		return (0);
    145 
    146 	if (afar_status == AFLT_STAT_VALID)
    147 		return (0);
    148 	return (-1);
    149 }
    150 
    151 /*
    152  * Search for the entry that matches the ena and the AFAR
    153  * if we have a valid AFAR, otherwise search for the entry
    154  * that its's ena is < delta ENA.
    155  */
    156 /*ARGSUSED*/
    157 cmd_xxcu_trw_t *
    158 cmd_trw_lookup(uint64_t ena, uint8_t afar_status, uint64_t afar)
    159 {
    160 	int i;
    161 
    162 	if (afar_status == AFLT_STAT_VALID) {
    163 		for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
    164 			if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
    165 				if ((llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena) <
    166 				    cmd.cmd_delta_ena) &&
    167 				    (cmd.cmd_xxcu_trw[i].trw_afar == afar))
    168 					return (&cmd.cmd_xxcu_trw[i]);
    169 			}
    170 		}
    171 	}
    172 
    173 	for (i = 0; i < cmd.cmd_xxcu_ntrw; i++) {
    174 		if (cmd.cmd_xxcu_trw[i].trw_ena != 0) {
    175 			if (llabs(ena - cmd.cmd_xxcu_trw[i].trw_ena)
    176 			    < cmd.cmd_delta_ena)
    177 				return (&cmd.cmd_xxcu_trw[i]);
    178 		}
    179 	}
    180 
    181 	return (NULL);
    182 }
    183 
    184 cmd_errcl_t
    185 cmd_get_nextbit(cmd_errcl_t trw_mask)
    186 {
    187 	cmd_errcl_t tmp_mask = 0;
    188 	cmd_errcl_t tmp;
    189 	int i;
    190 
    191 	for (i = 0; i < 64; i++) {
    192 		tmp = (0x0000000000000001ULL << i);
    193 		if (tmp & trw_mask) {
    194 			tmp_mask = tmp;
    195 			break;
    196 		}
    197 	}
    198 	return (tmp_mask);
    199 }
    200 
    201 /*
    202  * For a resolved error, its error code will be paired with
    203  * each error code in the train mask and compared against the
    204  * pre-defined trains in the cmd_cpu.c to determine if the error
    205  * is in the train.
    206  */
    207 cmd_errcl_t
    208 cmd_combine_two_train(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
    209 {
    210 	cmd_errcl_t tmp_mask = 0;
    211 	cmd_errcl_t train_mask = 0;
    212 	cmd_errcl_t cause = 0;
    213 	cmd_errcl_t error_mask = trw_mask ^ resolved_err;
    214 
    215 	while (error_mask) {
    216 		tmp_mask = cmd_get_nextbit(error_mask);
    217 		if (tmp_mask == 0)
    218 			break;
    219 		train_mask = tmp_mask | resolved_err;
    220 		cause = cmd_xxcu_train_match(train_mask);
    221 		if (cause) {
    222 			return (cause);
    223 		}
    224 		error_mask = error_mask ^ tmp_mask;
    225 	}
    226 	return (0);
    227 }
    228 
    229 cmd_errcl_t
    230 cmd_train_match(cmd_errcl_t trw_mask, cmd_errcl_t resolved_err)
    231 {
    232 	return (cmd_combine_two_train(trw_mask, resolved_err));
    233 }
    234 
    235 int
    236 cmd_xr_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr, cmd_errcl_t clcode)
    237 {
    238 	uint64_t niagara_l2_afsr = 0;
    239 	int errtype;
    240 
    241 	errtype = cmd_xr_error_type(clcode);
    242 	/*
    243 	 * skip the fill data for the errors which is not L2 errors.
    244 	 */
    245 	if (errtype != L2_ERR) {
    246 		fmd_hdl_debug(hdl, "Skip fill L2 data for errtype %d\n",
    247 		    errtype);
    248 		return (0);
    249 	}
    250 
    251 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
    252 	    &niagara_l2_afsr) != 0 &&
    253 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
    254 	    &niagara_l2_afsr) != 0) {
    255 		fmd_hdl_debug(hdl, "No L2 AFSR data");
    256 		return (-1);
    257 	}
    258 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFAR,
    259 	    &xr->xr_afar) != 0 &&
    260 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_EAR,
    261 	    &xr->xr_afar) != 0) {
    262 		fmd_hdl_debug(hdl, "No L2 AFAR data");
    263 		return (-1);
    264 	}
    265 	if (nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_NAME_L2_SYND,
    266 	    &xr->xr_synd) != 0) {
    267 		/* Niagara-2 doesn't provide separate (redundant) l2-synd */
    268 		xr->xr_synd = niagara_l2_afsr & NI2_L2AFSR_SYND;
    269 	}
    270 
    271 	if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode,
    272 	    &xr->xr_synd_status) != 0) {
    273 		fmd_hdl_debug(hdl, "Invalid L2 syndrome");
    274 		return (-1);
    275 	}
    276 
    277 	xr->xr_afar_status = xr->xr_synd_status;
    278 	return (0);
    279 }
    280 
    281 int
    282 cmd_cpu_synd_check(uint32_t synd, cmd_errcl_t clcode)
    283 {
    284 	int i;
    285 
    286 	/*
    287 	 * Niagara L2 fetches from a memory location containing a UE
    288 	 * are given a poison syndrome in one or more 7 bit subsyndromes
    289 	 * each covering one of 4 4 byte checkwords.
    290 	 *
    291 	 * 0 is an invalid syndrome because it denotes no error, but
    292 	 * is associated with an ereport -- meaning there WAS an error.
    293 	 */
    294 	/*
    295 	 * HW does not store the syndrome value for write-back cache
    296 	 * error, so skip the synd check for L2 write-back error
    297 	 */
    298 	if (CMD_ERRCL_L2UE_WRITEBACK(clcode))
    299 		return (0);
    300 
    301 	if (synd == 0)
    302 		return (-1);
    303 
    304 	for (i = 0; i < 4; i++) {
    305 		if (((synd >> i*NI_L2_POISON_SYND_SIZE) &
    306 		    NI_L2_POISON_SYND_MASK) == NI_L2_POISON_SYND_FROM_DAU)
    307 			return (-1);
    308 	}
    309 	return (0);
    310 }
    311 
    312 int
    313 cmd_afsr_check(fmd_hdl_t *hdl, uint64_t afsr,
    314     cmd_errcl_t clcode, uint8_t *stat_val)
    315 {
    316 	/*
    317 	 * Set Niagara afar and synd validity.
    318 	 * For a given set of error registers, the payload value is valid iff
    319 	 * no higher priority error status bit is set.  See niagararegs.h
    320 	 * for error status bit values and priority settings.
    321 	 */
    322 	switch (clcode) {
    323 	case CMD_ERRCL_LDAU:
    324 	case CMD_ERRCL_LDSU:
    325 	case CMD_ERRCL_DL2U:
    326 	case CMD_ERRCL_IL2U:
    327 		*stat_val =
    328 		    ((afsr & NI_L2AFSR_P02) == 0) ?
    329 		    AFLT_STAT_VALID: AFLT_STAT_INVALID;
    330 		break;
    331 	case CMD_ERRCL_LDWU:
    332 		*stat_val =
    333 		    ((afsr & NI_L2AFSR_P03) == 0) ?
    334 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
    335 		break;
    336 	case CMD_ERRCL_LDRU:
    337 		*stat_val =
    338 		    ((afsr & NI_L2AFSR_P04) == 0) ?
    339 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
    340 		break;
    341 	case CMD_ERRCL_LDAC:
    342 	case CMD_ERRCL_LDSC:
    343 		*stat_val =
    344 		    ((afsr & NI_L2AFSR_P08) == 0) ?
    345 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
    346 		break;
    347 	case CMD_ERRCL_LDWC:
    348 		*stat_val =
    349 		    ((afsr & NI_L2AFSR_P09) == 0) ?
    350 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
    351 		break;
    352 	case CMD_ERRCL_LDRC:
    353 		*stat_val =
    354 		    ((afsr & NI_L2AFSR_P10) == 0) ?
    355 		    AFLT_STAT_VALID : AFLT_STAT_INVALID;
    356 		break;
    357 	default:
    358 		fmd_hdl_debug(hdl, "Niagara unrecognized l2cache error\n");
    359 		return (-1);
    360 	}
    361 	return (0);
    362 }
    363 
    364 
    365 int
    366 cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t clcode,
    367     uint64_t *afar)
    368 {
    369 	uint64_t niagara_l2_afsr = 0;
    370 	uint8_t stat_val;
    371 
    372 	/*
    373 	 * In Niagara-1, we carried forward the register names afsr and afar
    374 	 * in ereports from sun4u, even though the hardware registers were
    375 	 * named esr and ear respectively.  In Niagara-2 we decided to conform
    376 	 * to the hardware names.
    377 	 */
    378 
    379 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_AFSR,
    380 	    &niagara_l2_afsr) != 0 &&
    381 	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESR,
    382 	    &niagara_l2_afsr) != 0)
    383 		return (-1);
    384 
    385 	if (cmd_afsr_check(hdl, niagara_l2_afsr, clcode, &stat_val) != 0)
    386 		return (-1);
    387 
    388 	if (stat_val == AFLT_STAT_VALID) {
    389 		if (nvlist_lookup_uint64(nvl,
    390 		    FM_EREPORT_PAYLOAD_NAME_L2_AFAR, afar) == 0 ||
    391 		    nvlist_lookup_uint64(nvl,
    392 		    FM_EREPORT_PAYLOAD_NAME_L2_EAR, afar) == 0)
    393 			return (0);
    394 	}
    395 	return (-1);
    396 }
    397 
    398 /*
    399  * sun4v cmd_cpu_get_frustr expects a 'cpufru' element in 'detector' FMRI
    400  * of ereport (which is stored as 'asru' of cmd_cpu_t).  For early sun4v,
    401  * this was mistakenly spec'ed as "hc://MB" instead of "hc:///component=MB",
    402  * so this situation must be remediated when found.
    403  */
    404 
    405 char *
    406 cmd_cpu_getfrustr(fmd_hdl_t *hdl, cmd_cpu_t *cp)
    407 {
    408 	char *frustr;
    409 	nvlist_t *asru = cp->cpu_asru_nvl;
    410 
    411 	if (nvlist_lookup_string(asru, FM_FMRI_CPU_CPUFRU, &frustr) == 0) {
    412 		fmd_hdl_debug(hdl, "cmd_cpu_getfrustr: cpufru=%s\n", frustr);
    413 		if (strncmp(frustr, CPU_FRU_FMRI,
    414 		    sizeof (CPU_FRU_FMRI) -1) == 0)
    415 			return (fmd_hdl_strdup(hdl, frustr, FMD_SLEEP));
    416 		else {
    417 			char *s1, *s2;
    418 			size_t frustrlen;
    419 
    420 			s2 = strstr(frustr, "MB");
    421 			if ((s2 == NULL) || strcmp(s2, EMPTY_STR) == 0) {
    422 				fmd_hdl_debug(hdl,
    423 				    "cmd_cpu_getfrustr: no cpufru");
    424 				return (NULL);
    425 			}
    426 			frustrlen = strlen(s2) + sizeof (CPU_FRU_FMRI);
    427 			s1 = fmd_hdl_alloc(hdl, frustrlen, FMD_SLEEP);
    428 			s1 = strcpy(s1, CPU_FRU_FMRI);
    429 			s1 = strcat(s1, s2);
    430 			fmd_hdl_debug(hdl, "cmd_cpu_getfrustr frustr=%s\n", s1);
    431 			return (s1);
    432 		}
    433 	}
    434 	(void) cmd_set_errno(ENOENT);
    435 	return (NULL);
    436 }
    437 
    438 char *
    439 cmd_cpu_getpartstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
    440 	char *partstr;
    441 	nvlist_t *asru = cp->cpu_asru_nvl;
    442 
    443 	if (nvlist_lookup_string(asru, FM_FMRI_HC_PART, &partstr) == 0)
    444 		return (fmd_hdl_strdup(hdl, partstr, FMD_SLEEP));
    445 	else
    446 		return (NULL);
    447 }
    448 
    449 char *
    450 cmd_cpu_getserialstr(fmd_hdl_t *hdl, cmd_cpu_t *cp) {
    451 	char *serialstr;
    452 	nvlist_t *asru = cp->cpu_asru_nvl;
    453 
    454 	if (nvlist_lookup_string(asru, FM_FMRI_HC_SERIAL_ID, &serialstr) == 0)
    455 		return (fmd_hdl_strdup(hdl, serialstr, FMD_SLEEP));
    456 	else
    457 		return (NULL);
    458 }
    459 
    460 nvlist_t *
    461 cmd_cpu_mkfru(fmd_hdl_t *hdl, char *frustr, char *serialstr, char *partstr)
    462 {
    463 
    464 	nvlist_t *fru;
    465 	if (strncmp(frustr, CPU_FRU_FMRI, sizeof (CPU_FRU_FMRI) - 1) != 0)
    466 		return (NULL);
    467 	fru = cmd_mkboard_fru(hdl, frustr, serialstr, partstr);
    468 	return (fru);
    469 }
    470