Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * FMD Case Subsystem
     29  *
     30  * Diagnosis engines are expected to group telemetry events related to the
     31  * diagnosis of a particular problem on the system into a set of cases.  The
     32  * diagnosis engine may have any number of cases open at a given point in time.
     33  * Some cases may eventually be *solved* by associating a suspect list of one
     34  * or more problems with the case, at which point fmd publishes a list.suspect
     35  * event for the case and it becomes visible to administrators and agents.
     36  *
     37  * Every case is named using a UUID, and is globally visible in the case hash.
     38  * Cases are reference-counted, except for the reference from the case hash
     39  * itself.  Consumers of case references include modules, which store active
     40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
     41  *
     42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
     43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
     44  * or transport) and the case is referenced by the mod_cases list.  Once the
     45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
     46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
     47  *
     48  *			+------------+
     49  *	     +----------|  UNSOLVED  |
     50  *	     |		+------------+
     51  *	     |		      1 |
     52  *	     |			|
     53  *	     |		+-------v----+
     54  *	   2 |		|    SOLVED  |
     55  *	     |		+------------+
     56  *	     |		    3 |  5 |
     57  *	     +------------+   |    |
     58  *			  |   |    |
     59  *			+-v---v----v-+
     60  *			| CLOSE_WAIT |
     61  *			+------------+
     62  *			  |   |    |
     63  *	      +-----------+   |    +------------+
     64  *	      |		    4 |			|
     65  *	      v		+-----v------+		|
     66  *	   discard      |   CLOSED   |	      6	|
     67  *			+------------+		|
     68  *			      |			|
     69  *			      |	   +------------+
     70  *			    7 |	   |
     71  *			+-----v----v-+
     72  *			|  REPAIRED  |
     73  *			+------------+
     74  *			      |
     75  *			    8 |
     76  *			+-----v------+
     77  *			|  RESOLVED  |
     78  *			+------------+
     79  *			      |
     80  *			      v
     81  *			   discard
     82  *
     83  * The state machine changes are triggered by calls to fmd_case_transition()
     84  * from various locations inside of fmd, as described below:
     85  *
     86  * [1] Called by: fmd_case_solve()
     87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
     88  *                conviction policy is applied to suspect list
     89  *                suspects convicted are marked faulty (F) in R$
     90  *                list.suspect event logged and dispatched
     91  *
     92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
     93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
     94  *                case discarded upon exit from CLOSE_WAIT
     95  *
     96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
     97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
     98  *                suspects convicted (F) are marked unusable (U) in R$
     99  *                diagnosis engine fmdo_close() entry point scheduled
    100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
    101  *
    102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
    103  *       Actions: list.isolated event dispatched
    104  *                case deleted from module's list of open cases
    105  *
    106  * [5] Called by: fmd_case_repair(), fmd_case_update()
    107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
    108  *                diagnosis engine fmdo_close() entry point scheduled
    109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
    110  *
    111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
    112  *       Actions: suspects convicted are marked non faulty (!F) in R$
    113  *                list.repaired or list.updated event dispatched
    114  *
    115  * [7] Called by: fmd_case_repair(), fmd_case_update()
    116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
    117  *                suspects convicted are marked non faulty (!F) in R$
    118  *                list.repaired or list.updated event dispatched
    119  *
    120  * [8] Called by: fmd_case_uuresolve()
    121  *       Actions: list.resolved event dispatched
    122  *		  case is discarded
    123  */
    124 
    125 #include <sys/fm/protocol.h>
    126 #include <uuid/uuid.h>
    127 #include <alloca.h>
    128 
    129 #include <fmd_alloc.h>
    130 #include <fmd_module.h>
    131 #include <fmd_error.h>
    132 #include <fmd_conf.h>
    133 #include <fmd_case.h>
    134 #include <fmd_string.h>
    135 #include <fmd_subr.h>
    136 #include <fmd_protocol.h>
    137 #include <fmd_event.h>
    138 #include <fmd_eventq.h>
    139 #include <fmd_dispq.h>
    140 #include <fmd_buf.h>
    141 #include <fmd_log.h>
    142 #include <fmd_asru.h>
    143 #include <fmd_fmri.h>
    144 #include <fmd_xprt.h>
    145 
    146 #include <fmd.h>
    147 
/*
 * Printable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* state constant it corresponds to.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};
    156 
/* Forward declaration: fmd_case_hash_apply() calls this ahead of its definition. */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
    158 
    159 fmd_case_hash_t *
    160 fmd_case_hash_create(void)
    161 {
    162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
    163 
    164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
    165 	chp->ch_hashlen = fmd.d_str_buckets;
    166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
    167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
    168 	    FMD_SLEEP);
    169 	chp->ch_count = 0;
    170 
    171 	return (chp);
    172 }
    173 
    174 /*
    175  * Destroy the case hash.  Unlike most of our hash tables, no active references
    176  * are kept by the case hash itself; all references come from other subsystems.
    177  * The hash must be destroyed after all modules are unloaded; if anything was
    178  * present in the hash it would be by definition a reference count leak.
    179  */
    180 void
    181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
    182 {
    183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
    184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
    185 	fmd_free(chp, sizeof (fmd_case_hash_t));
    186 }
    187 
    188 /*
    189  * Take a snapshot of the case hash by placing an additional hold on each
    190  * member in an auxiliary array, and then call 'func' for each case.
    191  */
    192 void
    193 fmd_case_hash_apply(fmd_case_hash_t *chp,
    194     void (*func)(fmd_case_t *, void *), void *arg)
    195 {
    196 	fmd_case_impl_t *cp, **cps, **cpp;
    197 	uint_t cpc, i;
    198 
    199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
    200 
    201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
    202 	cpc = chp->ch_count;
    203 
    204 	for (i = 0; i < chp->ch_hashlen; i++) {
    205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
    206 			*cpp++ = fmd_case_tryhold(cp);
    207 	}
    208 
    209 	ASSERT(cpp == cps + cpc);
    210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
    211 
    212 	for (i = 0; i < cpc; i++) {
    213 		if (cps[i] != NULL) {
    214 			func((fmd_case_t *)cps[i], arg);
    215 			fmd_case_rele((fmd_case_t *)cps[i]);
    216 		}
    217 	}
    218 
    219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
    220 }
    221 
    222 static void
    223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
    224 {
    225 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
    226 
    227 	cip->ci_code_next = chp->ch_code_hash[h];
    228 	chp->ch_code_hash[h] = cip;
    229 }
    230 
    231 static void
    232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
    233 {
    234 	fmd_case_impl_t **pp, *cp;
    235 
    236 	if (cip->ci_code) {
    237 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
    238 
    239 		pp = &chp->ch_code_hash[h];
    240 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
    241 			if (cp != cip)
    242 				pp = &cp->ci_code_next;
    243 			else
    244 				break;
    245 		}
    246 		if (cp != NULL) {
    247 			*pp = cp->ci_code_next;
    248 			cp->ci_code_next = NULL;
    249 		}
    250 	}
    251 }
    252 
    253 /*
    254  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
    255  * were defined for this case or if the lookup fails, the event dictionary or
    256  * module code is broken, and we set the event code to a precomputed default.
    257  */
    258 static const char *
    259 fmd_case_mkcode(fmd_case_t *cp)
    260 {
    261 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    262 	fmd_case_susp_t *cis;
    263 	fmd_case_hash_t *chp = fmd.d_cases;
    264 
    265 	char **keys, **keyp;
    266 	const char *s;
    267 
    268 	ASSERT(MUTEX_HELD(&cip->ci_lock));
    269 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
    270 
    271 	/*
    272 	 * delete any existing entry from code hash if it is on it
    273 	 */
    274 	fmd_case_code_hash_delete(chp, cip);
    275 
    276 	fmd_free(cip->ci_code, cip->ci_codelen);
    277 	cip->ci_codelen = cip->ci_mod->mod_codelen;
    278 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
    279 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
    280 
    281 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    282 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
    283 			keyp++;
    284 	}
    285 
    286 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
    287 
    288 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
    289 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
    290 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
    291 		fmd_free(cip->ci_code, cip->ci_codelen);
    292 		cip->ci_codelen = strlen(s) + 1;
    293 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
    294 		(void) strcpy(cip->ci_code, s);
    295 	}
    296 
    297 	/*
    298 	 * add into hash of solved cases
    299 	 */
    300 	fmd_case_code_hash_insert(chp, cip);
    301 
    302 	return (cip->ci_code);
    303 }
    304 
/*
 * Accumulator passed to fmd_case_set_lst() while walking the ASRU links of a
 * case.  The two arrays are filled in parallel, one slot per link visited.
 */
typedef struct {
	int	*fcl_countp;	/* number of slots filled so far */
	int	fcl_maxcount;	/* capacity of fcl_ba[] and fcl_nva[] */
	uint8_t *fcl_ba;	/* per-suspect FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-suspect fault event (al_event) */
	int	*fcl_msgp;	/* set to B_FALSE if any suspect opts out */
} fmd_case_lst_t;
    312 
    313 static void
    314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
    315 {
    316 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
    317 	boolean_t b;
    318 	int state;
    319 
    320 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
    321 		return;
    322 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
    323 	    &b) == 0 && b == B_FALSE)
    324 		*entryp->fcl_msgp = B_FALSE;
    325 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
    326 	state = fmd_asru_al_getstate(alp);
    327 	if (state & FMD_ASRU_DEGRADED)
    328 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
    329 	if (state & FMD_ASRU_UNUSABLE)
    330 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
    331 	if (state & FMD_ASRU_FAULTY)
    332 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
    333 	if (!(state & FMD_ASRU_PRESENT))
    334 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
    335 	if (alp->al_reason == FMD_ASRU_REPAIRED)
    336 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
    337 	else if (alp->al_reason == FMD_ASRU_REPLACED)
    338 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
    339 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
    340 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
    341 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
    342 	(*entryp->fcl_countp)++;
    343 }
    344 
    345 static void
    346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
    347 {
    348 	int *faultyp = (int *)arg;
    349 
    350 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
    351 }
    352 
    353 static void
    354 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
    355 {
    356 	int *usablep = (int *)arg;
    357 
    358 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
    359 }
    360 
    361 static void
    362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
    363 {
    364 	int *not_faultyp = (int *)arg;
    365 
    366 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
    367 }
    368 
    369 /*
    370  * Have we got any suspects with an asru that are still unusable and present?
    371  */
    372 static void
    373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
    374 {
    375 	int *rvalp = (int *)arg;
    376 	int state;
    377 	nvlist_t *asru;
    378 
    379 	/*
    380 	 * if this a proxy case and this suspect doesn't have an local asru
    381 	 * then state is unknown so we must assume it may still be unusable.
    382 	 */
    383 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
    384 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
    385 		*rvalp |= B_TRUE;
    386 		return;
    387 	}
    388 
    389 	state = fmd_asru_al_getstate(alp);
    390 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
    391 		return;
    392 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
    393 }
    394 
    395 nvlist_t *
    396 fmd_case_mkevent(fmd_case_t *cp, const char *class)
    397 {
    398 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    399 	nvlist_t **nva, *nvl;
    400 	uint8_t *ba;
    401 	int msg = B_TRUE;
    402 	const char *code;
    403 	fmd_case_lst_t fcl;
    404 	int count = 0;
    405 
    406 	(void) pthread_mutex_lock(&cip->ci_lock);
    407 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
    408 
    409 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
    410 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
    411 
    412 	/*
    413 	 * For each suspect associated with the case, store its fault event
    414 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
    415 	 * have asked not to be messaged.  If any of them have made such a
    416 	 * request, propagate that attribute to the composite list.* event.
    417 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
    418 	 */
    419 	fcl.fcl_countp = &count;
    420 	fcl.fcl_maxcount = cip->ci_nsuspects;
    421 	fcl.fcl_msgp = &msg;
    422 	fcl.fcl_ba = ba;
    423 	fcl.fcl_nva = nva;
    424 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
    425 
    426 	if (cip->ci_code == NULL)
    427 		(void) fmd_case_mkcode(cp);
    428 	/*
    429 	 * For repair and updated event, we lookup diagcode from dict using key
    430 	 * "list.repaired" or "list.updated" or "list.resolved".
    431 	 */
    432 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
    433 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
    434 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
    435 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
    436 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
    437 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
    438 	else
    439 		code = cip->ci_code;
    440 
    441 	if (msg == B_FALSE)
    442 		cip->ci_flags |= FMD_CF_INVISIBLE;
    443 
    444 	/*
    445 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
    446 	 * Otherwise use the authority for the current module.
    447 	 */
    448 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
    449 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
    450 	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
    451 
    452 	(void) pthread_mutex_unlock(&cip->ci_lock);
    453 	return (nvl);
    454 }
    455 
/*
 * Tunables consulted by fmd_case_check_for_dups() when comparing a newly
 * solved case against existing ones.
 */
/* treat overlapping faulty, non-isolated suspects as a match */
static int fmd_case_match_on_faulty_overlap = 1;
/* treat overlapping acquitted suspects as a match if none are faulty */
static int fmd_case_match_on_acquit_overlap = 1;
/* allow isolated suspects in the old case to be acquitted automatically */
static int fmd_case_auto_acquit_isolated = 1;
/* request adjustment of the new case when the old one is repaired/resolved */
static int fmd_case_auto_acquit_non_acquitted = 1;
static int fmd_case_too_recent = 10; /* time in seconds */
    461 
    462 static boolean_t
    463 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
    464 {
    465 	nvlist_t *new_rsrc;
    466 	nvlist_t *rsrc;
    467 	char *new_name = NULL;
    468 	char *name = NULL;
    469 	ssize_t new_namelen;
    470 	ssize_t namelen;
    471 	int fmri_present = 1;
    472 	int new_fmri_present = 1;
    473 	int match = B_FALSE;
    474 	fmd_topo_t *ftp = fmd_topo_hold();
    475 
    476 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
    477 		fmri_present = 0;
    478 	else {
    479 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
    480 			goto done;
    481 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
    482 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
    483 			goto done;
    484 	}
    485 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
    486 		new_fmri_present = 0;
    487 	else {
    488 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
    489 			goto done;
    490 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
    491 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
    492 			goto done;
    493 	}
    494 	match = (fmri_present == new_fmri_present &&
    495 	    (fmri_present == 0 ||
    496 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
    497 done:
    498 	if (name != NULL)
    499 		fmd_free(name, namelen + 1);
    500 	if (new_name != NULL)
    501 		fmd_free(new_name, new_namelen + 1);
    502 	fmd_topo_rele(ftp);
    503 	return (match);
    504 }
    505 
    506 static int
    507 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
    508 {
    509 	char *class, *new_class;
    510 
    511 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
    512 		return (0);
    513 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
    514 		return (0);
    515 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
    516 		return (0);
    517 	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
    518 	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
    519 	return (strcmp(class, new_class) == 0);
    520 }
    521 
/*
 * Argument block for fmd_case_match_suspects(), which is applied to each ASRU
 * link of an existing ("old") case while comparing it to a new case.
 */
typedef struct {
	int	*fcms_countp;	/* number of old suspects visited so far */
	int	fcms_maxcount;	/* capacity of the fcms_old_* arrays */
	fmd_case_impl_t *fcms_cip;	/* the new case being compared */
	uint8_t *fcms_new_susp_state;	/* per new suspect: SUSPECT_STATE_* */
	uint8_t *fcms_old_susp_state;	/* per old suspect: SUSPECT_STATE_* */
	uint8_t *fcms_old_match_state;	/* per old suspect: NO_MATCH flag */
} fcms_t;
/* values stored in the *_susp_state arrays */
#define	SUSPECT_STATE_FAULTY				0x1
#define	SUSPECT_STATE_ISOLATED				0x2
#define	SUSPECT_STATE_REMOVED				0x4
#define	SUSPECT_STATE_ACQUITED				0x8
#define	SUSPECT_STATE_REPAIRED				0x10
#define	SUSPECT_STATE_REPLACED				0x20
/*
 * Flag recorded in fcms_old_match_state[] by fmd_case_match_suspects().  It
 * has the same value as SUSPECT_STATE_FAULTY, so it must not be mixed into a
 * *_susp_state array.
 */
#define	SUSPECT_STATE_NO_MATCH				0x1
    537 
    538 /*
    539  * This is called for each suspect in the old case. Compare it against each
    540  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
    541  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
    542  * found in the old case.
    543  */
    544 static void
    545 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
    546 {
    547 	fcms_t *fcmsp = (fcms_t *)arg;
    548 	fmd_case_impl_t *cip = fcmsp->fcms_cip;
    549 	fmd_case_susp_t *cis;
    550 	int i = 0;
    551 	int state = fmd_asru_al_getstate(alp);
    552 
    553 	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
    554 		return;
    555 
    556 	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
    557 	    alp->al_reason == FMD_ASRU_REMOVED))
    558 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    559 		    SUSPECT_STATE_REMOVED;
    560 	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
    561 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    562 		    SUSPECT_STATE_ISOLATED;
    563 	else if (state & FMD_ASRU_FAULTY)
    564 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    565 		    SUSPECT_STATE_FAULTY;
    566 	else if (alp->al_reason == FMD_ASRU_REPLACED)
    567 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    568 		    SUSPECT_STATE_REPLACED;
    569 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
    570 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    571 		    SUSPECT_STATE_ACQUITED;
    572 	else
    573 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
    574 		    SUSPECT_STATE_REPAIRED;
    575 
    576 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
    577 		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
    578 			break;
    579 	if (cis != NULL)
    580 		fcmsp->fcms_new_susp_state[i] =
    581 		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
    582 	else
    583 		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
    584 		    SUSPECT_STATE_NO_MATCH;
    585 	(*fcmsp->fcms_countp)++;
    586 }
    587 
/*
 * Argument block for the re-fault / acquit callbacks that are applied to the
 * ASRU links of an old case.
 */
typedef struct {
	int	*fca_do_update;	/* set to 1 when a callback changes a link */
	fmd_case_impl_t *fca_cip;	/* case whose suspects are matched against */
} fca_t;
    592 
    593 /*
    594  * Re-fault all acquitted suspects that are still present in the new list.
    595  */
    596 static void
    597 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
    598 {
    599 	fca_t *fcap = (fca_t *)arg;
    600 	fmd_case_impl_t *cip = fcap->fca_cip;
    601 	fmd_case_susp_t *cis;
    602 	int state = fmd_asru_al_getstate(alp);
    603 
    604 	if (!(state & FMD_ASRU_FAULTY) &&
    605 	    alp->al_reason == FMD_ASRU_ACQUITTED) {
    606 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
    607 			if (fmd_case_match_suspect(cis->cis_nvl,
    608 			    alp->al_event) == 1)
    609 				break;
    610 		if (cis != NULL) {
    611 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
    612 			*fcap->fca_do_update = 1;
    613 		}
    614 	}
    615 }
    616 
    617 /*
    618  * Re-fault all suspects that are still present in the new list.
    619  */
    620 static void
    621 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
    622 {
    623 	fca_t *fcap = (fca_t *)arg;
    624 	fmd_case_impl_t *cip = fcap->fca_cip;
    625 	fmd_case_susp_t *cis;
    626 	int state = fmd_asru_al_getstate(alp);
    627 
    628 	if (!(state & FMD_ASRU_FAULTY)) {
    629 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
    630 			if (fmd_case_match_suspect(cis->cis_nvl,
    631 			    alp->al_event) == 1)
    632 				break;
    633 		if (cis != NULL) {
    634 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
    635 			*fcap->fca_do_update = 1;
    636 		}
    637 	}
    638 }
    639 
    640 /*
    641  * Acquit all suspects that are no longer present in the new list.
    642  */
    643 static void
    644 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
    645 {
    646 	fca_t *fcap = (fca_t *)arg;
    647 	fmd_case_impl_t *cip = fcap->fca_cip;
    648 	fmd_case_susp_t *cis;
    649 	int state = fmd_asru_al_getstate(alp);
    650 
    651 	if (state & FMD_ASRU_FAULTY) {
    652 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
    653 			if (fmd_case_match_suspect(cis->cis_nvl,
    654 			    alp->al_event) == 1)
    655 				break;
    656 		if (cis == NULL) {
    657 			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
    658 			    FMD_ASRU_ACQUITTED);
    659 			*fcap->fca_do_update = 1;
    660 		}
    661 	}
    662 }
    663 
    664 /*
    665  * Acquit all isolated suspects.
    666  */
    667 static void
    668 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
    669 {
    670 	int *do_update = (int *)arg;
    671 	int state = fmd_asru_al_getstate(alp);
    672 
    673 	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
    674 	    (state & FMD_ASRU_FAULTY)) {
    675 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
    676 		    FMD_ASRU_ACQUITTED);
    677 		*do_update = 1;
    678 	}
    679 }
    680 
    681 /*
    682  * Acquit suspect which matches specified nvlist
    683  */
    684 static void
    685 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
    686 {
    687 	nvlist_t *nvl = (nvlist_t *)arg;
    688 	int state = fmd_asru_al_getstate(alp);
    689 
    690 	if ((state & FMD_ASRU_FAULTY) &&
    691 	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
    692 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
    693 		    FMD_ASRU_ACQUITTED);
    694 }
    695 
/*
 * Argument block for fmd_case_check_for_dups(), which is applied to each
 * existing case while looking for one that duplicates a new case.
 */
typedef struct {
	fmd_case_impl_t *fccd_cip;	/* the new case */
	uint8_t *fccd_new_susp_state;	/* per new suspect: SUSPECT_STATE_* */
	/* NOTE(review): presumably per-suspect NO_MATCH flags - confirm */
	uint8_t *fccd_new_match_state;
	int *fccd_discard_new;	/* set to 1 if the new case is a duplicate */
	int *fccd_adjust_new;	/* set to 1 if old case is repaired/resolved */
} fccd_t;
    703 
    704 /*
    705  * see if a matching suspect list already exists in the cache
    706  */
    707 static void
    708 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
    709 {
    710 	fccd_t *fccdp = (fccd_t *)arg;
    711 	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
    712 	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
    713 	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
    714 	int got_faulty_overlap = 0;
    715 	int got_acquit_overlap = 0;
    716 	boolean_t too_recent;
    717 	uint64_t most_recent = 0;
    718 	fcms_t fcms;
    719 	fca_t fca;
    720 	uint8_t *new_susp_state;
    721 	uint8_t *old_susp_state;
    722 	uint8_t *old_match_state;
    723 
    724 	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
    725 	for (i = 0; i < new_cip->ci_nsuspects; i++)
    726 		new_susp_state[i] = 0;
    727 	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
    728 	for (i = 0; i < old_cip->ci_nsuspects; i++)
    729 		old_susp_state[i] = 0;
    730 	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
    731 	for (i = 0; i < old_cip->ci_nsuspects; i++)
    732 		old_match_state[i] = 0;
    733 
    734 	/*
    735 	 * Compare with each suspect in the existing case.
    736 	 */
    737 	fcms.fcms_countp = &count;
    738 	fcms.fcms_maxcount = old_cip->ci_nsuspects;
    739 	fcms.fcms_cip = new_cip;
    740 	fcms.fcms_new_susp_state = new_susp_state;
    741 	fcms.fcms_old_susp_state = old_susp_state;
    742 	fcms.fcms_old_match_state = old_match_state;
    743 	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
    744 	    fmd_case_match_suspects, &fcms);
    745 
    746 	/*
    747 	 * If we have some faulty, non-isolated suspects that overlap, then most
    748 	 * likely it is the suspects that overlap in the suspect lists that are
    749 	 * to blame. So we can consider this to be a match.
    750 	 */
    751 	for (i = 0; i < new_cip->ci_nsuspects; i++)
    752 		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
    753 			got_faulty_overlap = 1;
    754 	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
    755 		goto got_match;
    756 
    757 	/*
    758 	 * If we have no faulty, non-isolated suspects in the old case, but we
    759 	 * do have some acquitted suspects that overlap, then most likely it is
    760 	 * the acquitted suspects that overlap in the suspect lists that are
    761 	 * to blame. So we can consider this to be a match.
    762 	 */
    763 	for (i = 0; i < new_cip->ci_nsuspects; i++)
    764 		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
    765 			got_acquit_overlap = 1;
    766 	for (i = 0; i < old_cip->ci_nsuspects; i++)
    767 		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
    768 			got_acquit_overlap = 0;
    769 	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
    770 		goto got_match;
    771 
    772 	/*
    773 	 * Check that all suspects in the new list are present in the old list.
    774 	 * Return if we find one that isn't.
    775 	 */
    776 	for (i = 0; i < new_cip->ci_nsuspects; i++)
    777 		if (new_susp_state[i] == 0)
    778 			return;
    779 
    780 	/*
    781 	 * Check that all suspects in the old list are present in the new list
    782 	 * *or* they are isolated or removed/replaced (which would explain why
    783 	 * they are not present in the new list). Return if we find one that is
    784 	 * faulty and unisolated or repaired or acquitted, and that is not
    785 	 * present in the new case.
    786 	 */
    787 	for (i = 0; i < old_cip->ci_nsuspects; i++)
    788 		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
    789 		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
    790 		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
    791 		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
    792 			return;
    793 
    794 got_match:
    795 	/*
    796 	 * If the old case is already in repaired/resolved state, we can't
    797 	 * do anything more with it, so keep the new case, but acquit some
    798 	 * of the suspects if appropriate.
    799 	 */
    800 	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
    801 		if (fmd_case_auto_acquit_non_acquitted) {
    802 			*fccdp->fccd_adjust_new = 1;
    803 			for (i = 0; i < new_cip->ci_nsuspects; i++) {
    804 				fccdp->fccd_new_susp_state[i] |=
    805 				    new_susp_state[i];
    806 				if (new_susp_state[i] == 0)
    807 					fccdp->fccd_new_susp_state[i] =
    808 					    SUSPECT_STATE_NO_MATCH;
    809 			}
    810 		}
    811 		return;
    812 	}
    813 
    814 	/*
    815 	 * Otherwise discard the new case and keep the old, again updating the
    816 	 * state of the suspects as appropriate
    817 	 */
    818 	*fccdp->fccd_discard_new = 1;
    819 	fca.fca_cip = new_cip;
    820 	fca.fca_do_update = &do_update;
    821 
    822 	/*
    823 	 * See if new case occurred within fmd_case_too_recent seconds of the
    824 	 * most recent modification to the old case and if so don't do
    825 	 * auto-acquit. This avoids problems if a flood of ereports come in and
    826 	 * they don't all get diagnosed before the first case causes some of
    827 	 * the devices to be isolated making it appear that an isolated device
    828 	 * was in the suspect list.
    829 	 */
    830 	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    831 	    fmd_asru_most_recent, &most_recent);
    832 	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
    833 	    fmd_case_too_recent);
    834 
    835 	if (got_faulty_overlap) {
    836 		/*
    837 		 * Acquit any suspects not present in the new list, plus
    838 		 * any that are are present but are isolated.
    839 		 */
    840 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    841 		    fmd_case_acquit_no_match, &fca);
    842 		if (fmd_case_auto_acquit_isolated && !too_recent)
    843 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    844 			    fmd_case_acquit_isolated, &do_update);
    845 	} else if (got_acquit_overlap) {
    846 		/*
    847 		 * Re-fault the acquitted matching suspects and acquit all
    848 		 * isolated suspects.
    849 		 */
    850 		if (fmd_case_auto_acquit_isolated && !too_recent) {
    851 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    852 			    fmd_case_fault_acquitted_matching, &fca);
    853 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    854 			    fmd_case_acquit_isolated, &do_update);
    855 		}
    856 	} else if (fmd_case_auto_acquit_isolated) {
    857 		/*
    858 		 * To get here, there must be no faulty or acquitted suspects,
    859 		 * but there must be at least one isolated suspect. Just acquit
    860 		 * non-matching isolated suspects. If there are no matching
    861 		 * isolated suspects, then re-fault all matching suspects.
    862 		 */
    863 		for (i = 0; i < new_cip->ci_nsuspects; i++)
    864 			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
    865 				got_isolated_overlap = 1;
    866 		if (!got_isolated_overlap)
    867 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    868 			    fmd_case_fault_all_matching, &fca);
    869 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
    870 		    fmd_case_acquit_no_match, &fca);
    871 	}
    872 
    873 	/*
    874 	 * If we've updated anything in the old case, call fmd_case_update()
    875 	 */
    876 	if (do_update)
    877 		fmd_case_update(old_cp);
    878 }
    879 
    880 /*
    881  * Convict suspects in a case by applying a conviction policy and updating the
    882  * resource cache prior to emitting the list.suspect event for the given case.
    883  * At present, our policy is very simple: convict every suspect in the case.
    884  * In the future, this policy can be extended and made configurable to permit:
    885  *
    886  * - convicting the suspect with the highest FIT rate
    887  * - convicting the suspect with the cheapest FRU
    888  * - convicting the suspect with the FRU that is in a depot's inventory
    889  * - convicting the suspect with the longest lifetime
    890  *
 * and so forth.  A word to the wise: this problem is significantly harder than
    892  * it seems at first glance.  Future work should heed the following advice:
    893  *
    894  * Hacking the policy into C code here is a very bad idea.  The policy needs to
    895  * be decided upon very carefully and fundamentally encodes knowledge of what
    896  * suspect list combinations can be emitted by what diagnosis engines.  As such
    897  * fmd's code is the wrong location, because that would require fmd itself to
    898  * be updated for every diagnosis engine change, defeating the entire design.
    899  * The FMA Event Registry knows the suspect list combinations: policy inputs
    900  * can be derived from it and used to produce per-module policy configuration.
    901  *
    902  * If the policy needs to be dynamic and not statically fixed at either fmd
    903  * startup or module load time, any implementation of dynamic policy retrieval
    904  * must employ some kind of caching mechanism or be part of a built-in module.
    905  * The fmd_case_convict() function is called with locks held inside of fmd and
    906  * is not a place where unbounded blocking on some inter-process or inter-
    907  * system communication to another service (e.g. another daemon) can occur.
    908  */
static int
fmd_case_convict(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_asru_hash_t *ahp = fmd.d_asrus;
	int discard_new = 0, i;
	fmd_case_susp_t *cis;
	fmd_asru_link_t *alp;
	uint8_t *new_susp_state;
	uint8_t *new_match_state;
	int adjust_new = 0;
	fccd_t fccd;
	fmd_case_impl_t *ncp, **cps, **cpp;
	uint_t cpc;
	fmd_case_hash_t *chp;

	/*
	 * Return value: 1 if the duplicate check asked for this case to be
	 * discarded (discard_new was set), otherwise 0 once resource cache
	 * entries have been created for the suspects.
	 */

	/*
	 * First we must see if any matching cases already exist.
	 *
	 * The two per-suspect state arrays live on the stack, start out
	 * zeroed, and are handed to fmd_case_check_for_dups() through 'fccd'
	 * together with pointers to the adjust_new/discard_new result flags.
	 */
	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_susp_state[i] = 0;
	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_match_state[i] = 0;
	fccd.fccd_cip = cip;
	fccd.fccd_adjust_new = &adjust_new;
	fccd.fccd_new_susp_state = new_susp_state;
	fccd.fccd_new_match_state = new_match_state;
	fccd.fccd_discard_new = &discard_new;

	/*
	 * Hold all cases
	 *
	 * A snapshot of every case on the hash is taken under the read lock.
	 * fmd_case_tryhold() returns NULL for a case that is being deleted,
	 * so individual slots of 'cps' may be NULL.
	 */
	chp = fmd.d_cases;
	(void) pthread_rwlock_rdlock(&chp->ch_lock);
	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
	cpc = chp->ch_count;
	for (i = 0; i < chp->ch_hashlen; i++)
		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
			*cpp++ = fmd_case_tryhold(ncp);
	ASSERT(cpp == cps + cpc);
	(void) pthread_rwlock_unlock(&chp->ch_lock);

	/*
	 * Run fmd_case_check_for_dups() on all cases except the current one.
	 * Each hold taken above is dropped as soon as its case has been
	 * examined; the hash lock is no longer held at this point.
	 */
	for (i = 0; i < cpc; i++) {
		if (cps[i] != NULL) {
			if (cps[i] != (fmd_case_impl_t *)cp)
				fmd_case_check_for_dups((fmd_case_t *)cps[i],
				    &fccd);
			fmd_case_rele((fmd_case_t *)cps[i]);
		}
	}
	fmd_free(cps, cpc * sizeof (fmd_case_t *));

	/*
	 * Make sure the case has a code: generate one if it has none, or, for
	 * a precanned case that already carries one, just enter it into the
	 * code hash.  This happens even when the case is about to be
	 * discarded as a duplicate below.
	 */
	(void) pthread_mutex_lock(&cip->ci_lock);
	if (cip->ci_code == NULL)
		(void) fmd_case_mkcode(cp);
	else if (cip->ci_precanned)
		fmd_case_code_hash_insert(fmd.d_cases, cip);

	if (discard_new) {
		/*
		 * We've found an existing case that is a match and it is not
		 * already in repaired or resolved state. So we can close this
		 * one as a duplicate.
		 */
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return (1);
	}

	/*
	 * Allocate new cache entries
	 *
	 * A suspect whose entry cannot be created is reported and skipped;
	 * the remaining suspects are still convicted.
	 */
	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if ((alp = fmd_asru_hash_create_entry(ahp,
		    cp, cis->cis_nvl)) == NULL) {
			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
			continue;
		}
		alp->al_flags |= FMD_ASRU_PRESENT;
		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
	}

	if (adjust_new) {
		int some_suspect = 0, some_not_suspect = 0;

		/*
		 * There is one or more matching case but they are already in
		 * repaired or resolved state. So we need to keep the new
		 * case, but we can adjust it. Repaired/removed/replaced
		 * suspects are unlikely to be to blame (unless there are
		 * actually two separate faults). So if we have a combination of
		 * repaired/replaced/removed suspects and acquitted suspects in
		 * the old lists, then we should acquit in the new list those
		 * that were repaired/replaced/removed in the old.
		 */
		for (i = 0; i < cip->ci_nsuspects; i++) {
			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
				some_not_suspect = 1;
			else
				some_suspect = 1;
		}
		/*
		 * Only acquit when the list is mixed; if every suspect would
		 * be acquitted (or none would) the case is left untouched.
		 * Index i walks the state arrays in step with the suspect
		 * list.
		 */
		if (some_suspect && some_not_suspect) {
			for (cis = cip->ci_suspects, i = 0; cis != NULL;
			    cis = cis->cis_next, i++)
				if ((new_susp_state[i] &
				    SUSPECT_STATE_REPLACED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REPAIRED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REMOVED) ||
				    (new_match_state[i] &
				    SUSPECT_STATE_NO_MATCH))
					fmd_asru_hash_apply_by_case(fmd.d_asrus,
					    cp, fmd_case_acquit_suspect,
					    cis->cis_nvl);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
	return (0);
}
   1040 
   1041 void
   1042 fmd_case_publish(fmd_case_t *cp, uint_t state)
   1043 {
   1044 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1045 	fmd_event_t *e;
   1046 	nvlist_t *nvl;
   1047 	char *class;
   1048 
   1049 	if (state == FMD_CASE_CURRENT)
   1050 		state = cip->ci_state; /* use current state */
   1051 
   1052 	switch (state) {
   1053 	case FMD_CASE_SOLVED:
   1054 		(void) pthread_mutex_lock(&cip->ci_lock);
   1055 
   1056 		/*
   1057 		 * If we already have a code, then case is already solved.
   1058 		 */
   1059 		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
   1060 		    cip->ci_code != NULL) {
   1061 			(void) pthread_mutex_unlock(&cip->ci_lock);
   1062 			break;
   1063 		}
   1064 
   1065 		if (cip->ci_tv_valid == 0) {
   1066 			fmd_time_gettimeofday(&cip->ci_tv);
   1067 			cip->ci_tv_valid = 1;
   1068 		}
   1069 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1070 
   1071 		if (fmd_case_convict(cp) == 1) { /* dupclose */
   1072 			cip->ci_flags &= ~FMD_CF_SOLVED;
   1073 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
   1074 			break;
   1075 		}
   1076 		if (cip->ci_xprt != NULL) {
   1077 			/*
   1078 			 * For proxy, save some information about the transport
   1079 			 * in the resource cache.
   1080 			 */
   1081 			int count = 0;
   1082 			fmd_asru_set_on_proxy_t fasp;
   1083 			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
   1084 
   1085 			fasp.fasp_countp = &count;
   1086 			fasp.fasp_maxcount = cip->ci_nsuspects;
   1087 			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
   1088 			fasp.fasp_proxy_external = xip->xi_flags &
   1089 			    FMD_XPRT_EXTERNAL;
   1090 			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
   1091 			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
   1092 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
   1093 			    fmd_asru_set_on_proxy, &fasp);
   1094 		}
   1095 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
   1096 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   1097 
   1098 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   1099 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
   1100 		fmd_log_append(fmd.d_fltlog, e, cp);
   1101 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
   1102 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   1103 
   1104 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   1105 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
   1106 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   1107 
   1108 		break;
   1109 
   1110 	case FMD_CASE_CLOSE_WAIT:
   1111 		fmd_case_hold(cp);
   1112 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
   1113 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
   1114 
   1115 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   1116 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
   1117 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   1118 
   1119 		break;
   1120 
   1121 	case FMD_CASE_CLOSED:
   1122 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
   1123 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   1124 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   1125 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   1126 		break;
   1127 
   1128 	case FMD_CASE_REPAIRED:
   1129 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
   1130 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   1131 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   1132 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
   1133 		fmd_log_append(fmd.d_fltlog, e, cp);
   1134 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
   1135 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   1136 		break;
   1137 
   1138 	case FMD_CASE_RESOLVED:
   1139 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
   1140 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   1141 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   1142 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
   1143 		fmd_log_append(fmd.d_fltlog, e, cp);
   1144 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
   1145 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   1146 		break;
   1147 	}
   1148 }
   1149 
   1150 fmd_case_t *
   1151 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
   1152 {
   1153 	fmd_case_impl_t *cip;
   1154 	uint_t h;
   1155 
   1156 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
   1157 	h = fmd_strhash(uuid) % chp->ch_hashlen;
   1158 
   1159 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
   1160 		if (strcmp(cip->ci_uuid, uuid) == 0)
   1161 			break;
   1162 	}
   1163 
   1164 	/*
   1165 	 * If deleting bit is set, treat the case as if it doesn't exist.
   1166 	 */
   1167 	if (cip != NULL)
   1168 		cip = fmd_case_tryhold(cip);
   1169 
   1170 	if (cip == NULL)
   1171 		(void) fmd_set_errno(EFMD_CASE_INVAL);
   1172 
   1173 	(void) pthread_rwlock_unlock(&chp->ch_lock);
   1174 	return ((fmd_case_t *)cip);
   1175 }
   1176 
   1177 static fmd_case_impl_t *
   1178 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
   1179 {
   1180 	fmd_case_impl_t *eip;
   1181 	uint_t h;
   1182 
   1183 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
   1184 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
   1185 
   1186 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
   1187 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
   1188 		    fmd_case_tryhold(eip) != NULL) {
   1189 			(void) pthread_rwlock_unlock(&chp->ch_lock);
   1190 			return (eip); /* uuid already present */
   1191 		}
   1192 	}
   1193 
   1194 	cip->ci_next = chp->ch_hash[h];
   1195 	chp->ch_hash[h] = cip;
   1196 
   1197 	chp->ch_count++;
   1198 	ASSERT(chp->ch_count != 0);
   1199 
   1200 	(void) pthread_rwlock_unlock(&chp->ch_lock);
   1201 	return (cip);
   1202 }
   1203 
   1204 static void
   1205 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
   1206 {
   1207 	fmd_case_impl_t *cp, **pp;
   1208 	uint_t h;
   1209 
   1210 	ASSERT(MUTEX_HELD(&cip->ci_lock));
   1211 
   1212 	cip->ci_flags |= FMD_CF_DELETING;
   1213 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1214 
   1215 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
   1216 
   1217 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
   1218 	pp = &chp->ch_hash[h];
   1219 
   1220 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
   1221 		if (cp != cip)
   1222 			pp = &cp->ci_next;
   1223 		else
   1224 			break;
   1225 	}
   1226 
   1227 	if (cp == NULL) {
   1228 		fmd_panic("case %p (%s) not found on hash chain %u\n",
   1229 		    (void *)cip, cip->ci_uuid, h);
   1230 	}
   1231 
   1232 	*pp = cp->ci_next;
   1233 	cp->ci_next = NULL;
   1234 
   1235 	/*
   1236 	 * delete from code hash if it is on it
   1237 	 */
   1238 	fmd_case_code_hash_delete(chp, cip);
   1239 
   1240 	ASSERT(chp->ch_count != 0);
   1241 	chp->ch_count--;
   1242 
   1243 	(void) pthread_rwlock_unlock(&chp->ch_lock);
   1244 
   1245 	(void) pthread_mutex_lock(&cip->ci_lock);
   1246 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
   1247 }
   1248 
   1249 fmd_case_t *
   1250 fmd_case_create(fmd_module_t *mp, void *data)
   1251 {
   1252 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
   1253 	fmd_case_impl_t *eip = NULL;
   1254 	uuid_t uuid;
   1255 
   1256 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
   1257 	fmd_buf_hash_create(&cip->ci_bufs);
   1258 
   1259 	fmd_module_hold(mp);
   1260 	cip->ci_mod = mp;
   1261 	cip->ci_refs = 1;
   1262 	cip->ci_state = FMD_CASE_UNSOLVED;
   1263 	cip->ci_flags = FMD_CF_DIRTY;
   1264 	cip->ci_data = data;
   1265 
   1266 	/*
   1267 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
   1268 	 * define any constant for the length of an unparse string, and do not
   1269 	 * permit the caller to specify a buffer length for safety.  The spec
   1270 	 * says it will be 36 bytes, but we make it tunable just in case.
   1271 	 */
   1272 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
   1273 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
   1274 
   1275 	/*
   1276 	 * We expect this loop to execute only once, but code it defensively
   1277 	 * against the possibility of libuuid bugs.  Keep generating uuids and
   1278 	 * attempting to do a hash insert until we get a unique one.
   1279 	 */
   1280 	do {
   1281 		if (eip != NULL)
   1282 			fmd_case_rele((fmd_case_t *)eip);
   1283 		uuid_generate(uuid);
   1284 		uuid_unparse(uuid, cip->ci_uuid);
   1285 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
   1286 
   1287 	ASSERT(fmd_module_locked(mp));
   1288 	fmd_list_append(&mp->mod_cases, cip);
   1289 	fmd_module_setcdirty(mp);
   1290 
   1291 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   1292 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
   1293 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   1294 
   1295 	return ((fmd_case_t *)cip);
   1296 }
   1297 
   1298 static void
   1299 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
   1300 {
   1301 	fmd_case_susp_t *cis, *ncis;
   1302 
   1303 	ASSERT(MUTEX_HELD(&cip->ci_lock));
   1304 
   1305 	if (cip->ci_proxy_asru)
   1306 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
   1307 		    cip->ci_nsuspects);
   1308 	if (cip->ci_diag_de)
   1309 		nvlist_free(cip->ci_diag_de);
   1310 	if (cip->ci_diag_asru)
   1311 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
   1312 		    cip->ci_nsuspects);
   1313 
   1314 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
   1315 		ncis = cis->cis_next;
   1316 		nvlist_free(cis->cis_nvl);
   1317 		fmd_free(cis, sizeof (fmd_case_susp_t));
   1318 	}
   1319 
   1320 	cip->ci_suspects = NULL;
   1321 	cip->ci_nsuspects = 0;
   1322 }
   1323 
/*
 * Recreate a case with a caller-supplied UUID, state and code on behalf of
 * module 'mp' (and, optionally, transport 'xp').  A new fmd_case_impl_t is
 * built and offered to the global case hash:
 *
 * - If the UUID was not already present, the new case is kept, entered in the
 *   code hash when it has a code, appended to mp's case list and returned.
 *
 * - If the UUID is already present, the new case is destroyed and the
 *   existing one is examined instead (see the comments in the body): it is
 *   either handed back to the root module, rejected as a UUID conflict (NULL
 *   is returned), or reclaimed from the root module by 'mp'.
 *
 * The hold that fmd_case_hash_insert() places on an existing case is dropped
 * again before returning.  On the paths that append to mp->mod_cases the
 * caller is asserted to hold the module lock.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	/* fmd_case_destroy() frees ci_uuidlen + 1 bytes, i.e. the NUL too */
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* unlike ci_uuidlen, ci_codelen counts the terminating NUL */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The new case never made it into the hash, so it is torn
		 * down with visible == B_FALSE, which skips the hash delete.
		 * fmd_case_destroy() expects ci_lock held and ci_refs == 0.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			/*
			 * In case the case has already been created from
			 * a checkpoint file we need to set up code now.
			 */
			if (cip->ci_state < FMD_CASE_CLOSED) {
				if (code != NULL && cip->ci_code == NULL) {
					cip->ci_code = fmd_strdup(code,
					    FMD_SLEEP);
					cip->ci_codelen = cip->ci_code ?
					    strlen(cip->ci_code) + 1 : 0;
					fmd_case_code_hash_insert(fmd.d_cases,
					    cip);
				}
			}

			/*
			 * When recreating an orphan case, state passed in may
			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
			 * any suspects are still CLOSED (faulty) then the
			 * overall state needs to be CLOSED.
			 */
			if ((cip->ci_state == FMD_CASE_REPAIRED ||
			    cip->ci_state == FMD_CASE_RESOLVED) &&
			    state == FMD_CASE_CLOSED)
				cip->ci_state = FMD_CASE_CLOSED;
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/* drop the hold taken by fmd_case_hash_insert() */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* swap the case's module hold from the root module to 'mp' */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		/*
		 * It's possible that fmd crashed or was restarted during a
		 * previous solve operation between the asru cache being created
		 * and the ckpt file being updated to SOLVED. Thus when the DE
		 * recreates the case here from the checkpoint file, the state
		 * will be UNSOLVED and yet we are having to reclaim because
		 * the case was in the asru cache. If this happens, revert the
		 * case back to the UNSOLVED state and let the DE solve it again
		 */
		if (state == FMD_CASE_UNSOLVED) {
			fmd_asru_hash_delete_case(fmd.d_asrus,
			    (fmd_case_t *)cip);
			fmd_case_destroy_suspects(cip);
			fmd_case_code_hash_delete(fmd.d_cases, cip);
			fmd_free(cip->ci_code, cip->ci_codelen);
			cip->ci_code = NULL;
			cip->ci_codelen = 0;
			cip->ci_tv_valid = 0;
		}

		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		/* drop the hold taken by fmd_case_hash_insert() */
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	/* Reached for a freshly inserted case and for a reclaimed orphan. */
	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
   1459 
   1460 void
   1461 fmd_case_destroy(fmd_case_t *cp, int visible)
   1462 {
   1463 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1464 	fmd_case_item_t *cit, *ncit;
   1465 
   1466 	ASSERT(MUTEX_HELD(&cip->ci_lock));
   1467 	ASSERT(cip->ci_refs == 0);
   1468 
   1469 	if (visible) {
   1470 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
   1471 		fmd_case_hash_delete(fmd.d_cases, cip);
   1472 	}
   1473 
   1474 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
   1475 		ncit = cit->cit_next;
   1476 		fmd_event_rele(cit->cit_event);
   1477 		fmd_free(cit, sizeof (fmd_case_item_t));
   1478 	}
   1479 
   1480 	fmd_case_destroy_suspects(cip);
   1481 
   1482 	if (cip->ci_principal != NULL)
   1483 		fmd_event_rele(cip->ci_principal);
   1484 
   1485 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
   1486 	fmd_free(cip->ci_code, cip->ci_codelen);
   1487 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
   1488 
   1489 	fmd_module_rele(cip->ci_mod);
   1490 	fmd_free(cip, sizeof (fmd_case_impl_t));
   1491 }
   1492 
   1493 void
   1494 fmd_case_hold(fmd_case_t *cp)
   1495 {
   1496 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1497 
   1498 	(void) pthread_mutex_lock(&cip->ci_lock);
   1499 	fmd_case_hold_locked(cp);
   1500 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1501 }
   1502 
   1503 void
   1504 fmd_case_hold_locked(fmd_case_t *cp)
   1505 {
   1506 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1507 
   1508 	ASSERT(MUTEX_HELD(&cip->ci_lock));
   1509 	if (cip->ci_flags & FMD_CF_DELETING)
   1510 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
   1511 		    (void *)cip, cip->ci_uuid);
   1512 	cip->ci_refs++;
   1513 	ASSERT(cip->ci_refs != 0);
   1514 }
   1515 
   1516 static fmd_case_impl_t *
   1517 fmd_case_tryhold(fmd_case_impl_t *cip)
   1518 {
   1519 	/*
   1520 	 * If the case's "deleting" bit is unset, hold and return case,
   1521 	 * otherwise, return NULL.
   1522 	 */
   1523 	(void) pthread_mutex_lock(&cip->ci_lock);
   1524 	if (cip->ci_flags & FMD_CF_DELETING) {
   1525 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1526 		cip = NULL;
   1527 	} else {
   1528 		fmd_case_hold_locked((fmd_case_t *)cip);
   1529 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1530 	}
   1531 	return (cip);
   1532 }
   1533 
   1534 void
   1535 fmd_case_rele(fmd_case_t *cp)
   1536 {
   1537 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1538 
   1539 	(void) pthread_mutex_lock(&cip->ci_lock);
   1540 	ASSERT(cip->ci_refs != 0);
   1541 
   1542 	if (--cip->ci_refs == 0)
   1543 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
   1544 	else
   1545 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1546 }
   1547 
   1548 void
   1549 fmd_case_rele_locked(fmd_case_t *cp)
   1550 {
   1551 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1552 
   1553 	ASSERT(MUTEX_HELD(&cip->ci_lock));
   1554 	--cip->ci_refs;
   1555 	ASSERT(cip->ci_refs != 0);
   1556 }
   1557 
   1558 int
   1559 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
   1560 {
   1561 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1562 	fmd_case_item_t *cit;
   1563 	fmd_event_t *oep;
   1564 	uint_t state;
   1565 	int new;
   1566 
   1567 	fmd_event_hold(ep);
   1568 	(void) pthread_mutex_lock(&cip->ci_lock);
   1569 
   1570 	if (cip->ci_flags & FMD_CF_SOLVED)
   1571 		state = FMD_EVS_DIAGNOSED;
   1572 	else
   1573 		state = FMD_EVS_ACCEPTED;
   1574 
   1575 	oep = cip->ci_principal;
   1576 	cip->ci_principal = ep;
   1577 
   1578 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
   1579 		if (cit->cit_event == ep)
   1580 			break;
   1581 	}
   1582 
   1583 	cip->ci_flags |= FMD_CF_DIRTY;
   1584 	new = cit == NULL && ep != oep;
   1585 
   1586 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1587 
   1588 	fmd_module_setcdirty(cip->ci_mod);
   1589 	fmd_event_transition(ep, state);
   1590 
   1591 	if (oep != NULL)
   1592 		fmd_event_rele(oep);
   1593 
   1594 	return (new);
   1595 }
   1596 
   1597 int
   1598 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
   1599 {
   1600 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1601 	fmd_case_item_t *cit;
   1602 	uint_t state;
   1603 	int new;
   1604 	boolean_t injected;
   1605 
   1606 	(void) pthread_mutex_lock(&cip->ci_lock);
   1607 
   1608 	if (cip->ci_flags & FMD_CF_SOLVED)
   1609 		state = FMD_EVS_DIAGNOSED;
   1610 	else
   1611 		state = FMD_EVS_ACCEPTED;
   1612 
   1613 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
   1614 		if (cit->cit_event == ep)
   1615 			break;
   1616 	}
   1617 
   1618 	new = cit == NULL && ep != cip->ci_principal;
   1619 
   1620 	/*
   1621 	 * If the event is already in the case or the case is already solved,
   1622 	 * there is no reason to save it: just transition it appropriately.
   1623 	 */
   1624 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
   1625 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1626 		fmd_event_transition(ep, state);
   1627 		return (new);
   1628 	}
   1629 
   1630 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
   1631 	fmd_event_hold(ep);
   1632 
   1633 	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
   1634 	    "__injected", &injected) == 0 && injected)
   1635 		fmd_case_set_injected(cp);
   1636 
   1637 	cit->cit_next = cip->ci_items;
   1638 	cit->cit_event = ep;
   1639 
   1640 	cip->ci_items = cit;
   1641 	cip->ci_nitems++;
   1642 
   1643 	cip->ci_flags |= FMD_CF_DIRTY;
   1644 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1645 
   1646 	fmd_module_setcdirty(cip->ci_mod);
   1647 	fmd_event_transition(ep, state);
   1648 
   1649 	return (new);
   1650 }
   1651 
   1652 void
   1653 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
   1654 {
   1655 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1656 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
   1657 
   1658 	(void) pthread_mutex_lock(&cip->ci_lock);
   1659 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
   1660 	cip->ci_flags |= FMD_CF_DIRTY;
   1661 
   1662 	cis->cis_next = cip->ci_suspects;
   1663 	cis->cis_nvl = nvl;
   1664 
   1665 	cip->ci_suspects = cis;
   1666 	cip->ci_nsuspects++;
   1667 
   1668 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1669 	if (cip->ci_xprt == NULL)
   1670 		fmd_module_setcdirty(cip->ci_mod);
   1671 }
   1672 
   1673 void
   1674 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
   1675 {
   1676 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1677 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
   1678 	boolean_t b;
   1679 
   1680 	(void) pthread_mutex_lock(&cip->ci_lock);
   1681 
   1682 	cis->cis_next = cip->ci_suspects;
   1683 	cis->cis_nvl = nvl;
   1684 
   1685 	if (nvlist_lookup_boolean_value(nvl,
   1686 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
   1687 		cip->ci_flags |= FMD_CF_INVISIBLE;
   1688 
   1689 	cip->ci_suspects = cis;
   1690 	cip->ci_nsuspects++;
   1691 
   1692 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1693 }
   1694 
   1695 void
   1696 fmd_case_reset_suspects(fmd_case_t *cp)
   1697 {
   1698 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1699 
   1700 	(void) pthread_mutex_lock(&cip->ci_lock);
   1701 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
   1702 
   1703 	fmd_case_destroy_suspects(cip);
   1704 	cip->ci_flags |= FMD_CF_DIRTY;
   1705 
   1706 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1707 	fmd_module_setcdirty(cip->ci_mod);
   1708 }
   1709 
/*
 * Set FMD_ASRU_UNUSABLE on the given resource cache link.  'arg' is unused;
 * the (link, arg) signature presumably exists so that this routine can be
 * handed to fmd_asru_hash_apply_by_case() as a per-link callback -- confirm
 * against the caller.
 */
/*ARGSUSED*/
static void
fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
{
	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
}
   1716 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested target state and must not exceed
 * FMD_CASE_RESOLVED.  'flags' are FMD_CF_* bits to OR into ci_flags; the
 * ISOLATED, REPAIRED and RESOLVED bits are dropped unless FMD_CF_SOLVED is
 * already set on the case or is being set by this call.  The flags are applied
 * even when the case is already at or beyond 'state', in which case nothing
 * else is done.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;	/* set once the case has reached RESOLVED */
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/* An unsolved case may not pick up isolated/repaired/resolved flags. */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	cip->ci_flags |= flags;

	/* State only ever moves forward; a request to go backward is a no-op. */
	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* Proxy cases and cases owned by the root module are skipped here. */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event in the case, and the principal, is now DIAGNOSED. */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED or
		 * CASE_REPAIRED as appropriate.
		 */
		if (fmd_case_orphaned(cp)) {
			if (cip->ci_flags & FMD_CF_REPAIRED) {
				state = cip->ci_state = FMD_CASE_REPAIRED;
				TRACE((FMD_DBG_CASE, "case %s %s->%s",
				    cip->ci_uuid,
				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
				    _fmd_case_snames[FMD_CASE_REPAIRED]));
				goto do_repair;
			} else {
				state = cip->ci_state = FMD_CASE_CLOSED;
				TRACE((FMD_DBG_CASE, "case %s %s->%s",
				    cip->ci_uuid,
				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
				    _fmd_case_snames[FMD_CASE_CLOSED]));
			}
		}
		break;

	case FMD_CASE_REPAIRED:
do_repair:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			if (cip->ci_xprt != NULL)
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;
			}
		}

		/*
		 * Move on to RESOLVED.  'state' still holds FMD_CASE_REPAIRED
		 * at the fmd_case_publish() call below, so that call publishes
		 * for the REPAIRED state; the common code after the switch
		 * then handles RESOLVED.  ci_lock is dropped across the
		 * publish and retaken afterwards (presumably because
		 * fmd_case_publish() takes ci_lock itself -- TODO confirm).
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 *
		 * NOTE(review): on the early return below nothing is published,
		 * yet ci_state was already set to FMD_CASE_RESOLVED above --
		 * confirm that this is intended.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* The hold taken here accompanies the queued PUBLISH event. */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		if (cip->ci_xprt != NULL) {
			/*
			 * If we transitioned to RESOLVED, adjust the reference
			 * count to reflect our removal from
			 * fmd.d_rmod->mod_cases above.  If the caller has not
			 * placed an additional hold on the case, it will now
			 * be freed.
			 */
			(void) pthread_mutex_lock(&cip->ci_lock);
			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele(cp);
		} else {
			/*
			 * Non-proxy case: apply fmd_asru_log_resolved to the
			 * case's resource cache entries, then flag the case so
			 * that fmd_case_discard_resolved() will act on it.
			 */
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_asru_log_resolved, NULL);
			(void) pthread_mutex_lock(&cip->ci_lock);
			/* mark as "ready to be discarded" */
			cip->ci_flags |= FMD_CF_RES_CMPL;
			(void) pthread_mutex_unlock(&cip->ci_lock);
		}
	}
}
   1910 
   1911 /*
   1912  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
   1913  * is set if all suspects have passed the rsrc.aged time).
   1914  */
   1915 void
   1916 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
   1917 {
   1918 	int check_if_aged = *(int *)arg;
   1919 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1920 
   1921 	/*
   1922 	 * First check if case has completed transition to resolved.
   1923 	 */
   1924 	(void) pthread_mutex_lock(&cip->ci_lock);
   1925 	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
   1926 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1927 		return;
   1928 	}
   1929 
   1930 	/*
   1931 	 * Now if check_is_aged is set, see if all suspects have aged.
   1932 	 */
   1933 	if (check_if_aged) {
   1934 		int aged = 1;
   1935 
   1936 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
   1937 		    fmd_asru_check_if_aged, &aged);
   1938 		if (!aged) {
   1939 			(void) pthread_mutex_unlock(&cip->ci_lock);
   1940 			return;
   1941 		}
   1942 	}
   1943 
   1944 	/*
   1945 	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
   1946 	 * do it twice.
   1947 	 */
   1948 	fmd_module_lock(cip->ci_mod);
   1949 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
   1950 	fmd_module_unlock(cip->ci_mod);
   1951 	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
   1952 	cip->ci_flags &= ~FMD_CF_RES_CMPL;
   1953 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1954 	fmd_case_rele(cp);
   1955 }
   1956 
   1957 /*
   1958  * Transition the specified case to *at least* the specified state by first
   1959  * re-validating the suspect list using the resource cache.  This function is
   1960  * employed by the checkpoint code when restoring a saved, solved case to see
   1961  * if the state of the case has effectively changed while fmd was not running
   1962  * or the module was not loaded.
   1963  */
   1964 void
   1965 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
   1966 {
   1967 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1968 
   1969 	int usable = 0;		/* are any suspects usable? */
   1970 
   1971 	ASSERT(state >= FMD_CASE_SOLVED);
   1972 	(void) pthread_mutex_lock(&cip->ci_lock);
   1973 
   1974 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
   1975 
   1976 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1977 
   1978 	if (!usable) {
   1979 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
   1980 		flags |= FMD_CF_ISOLATED;
   1981 	}
   1982 
   1983 	fmd_case_transition(cp, state, flags);
   1984 }
   1985 
   1986 void
   1987 fmd_case_setdirty(fmd_case_t *cp)
   1988 {
   1989 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1990 
   1991 	(void) pthread_mutex_lock(&cip->ci_lock);
   1992 	cip->ci_flags |= FMD_CF_DIRTY;
   1993 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1994 
   1995 	fmd_module_setcdirty(cip->ci_mod);
   1996 }
   1997 
   1998 void
   1999 fmd_case_clrdirty(fmd_case_t *cp)
   2000 {
   2001 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2002 
   2003 	(void) pthread_mutex_lock(&cip->ci_lock);
   2004 	cip->ci_flags &= ~FMD_CF_DIRTY;
   2005 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2006 }
   2007 
   2008 void
   2009 fmd_case_commit(fmd_case_t *cp)
   2010 {
   2011 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2012 	fmd_case_item_t *cit;
   2013 
   2014 	(void) pthread_mutex_lock(&cip->ci_lock);
   2015 
   2016 	if (cip->ci_flags & FMD_CF_DIRTY) {
   2017 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
   2018 			fmd_event_commit(cit->cit_event);
   2019 
   2020 		if (cip->ci_principal != NULL)
   2021 			fmd_event_commit(cip->ci_principal);
   2022 
   2023 		fmd_buf_hash_commit(&cip->ci_bufs);
   2024 		cip->ci_flags &= ~FMD_CF_DIRTY;
   2025 	}
   2026 
   2027 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2028 }
   2029 
   2030 /*
   2031  * On proxy side, send back repair/acquit/etc request to diagnosing side
   2032  */
   2033 void
   2034 fmd_case_xprt_updated(fmd_case_t *cp)
   2035 {
   2036 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2037 	nvlist_t **nva;
   2038 	uint8_t *ba;
   2039 	int msg = B_TRUE;
   2040 	int count = 0;
   2041 	fmd_case_lst_t fcl;
   2042 
   2043 	ASSERT(cip->ci_xprt != NULL);
   2044 	(void) pthread_mutex_lock(&cip->ci_lock);
   2045 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
   2046 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
   2047 	fcl.fcl_countp = &count;
   2048 	fcl.fcl_maxcount = cip->ci_nsuspects;
   2049 	fcl.fcl_msgp = &msg;
   2050 	fcl.fcl_ba = ba;
   2051 	fcl.fcl_nva = nva;
   2052 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
   2053 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2054 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
   2055 	    count);
   2056 }
   2057 
   2058 /*
   2059  * fmd_case_update_status() can be called on either the proxy side when a
   2060  * list.suspect is received, or on the diagnosing side when an update request
   2061  * is received from the proxy. It updates the status in the resource cache.
   2062  */
   2063 void
   2064 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
   2065     uint8_t *diag_asrup)
   2066 {
   2067 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2068 	int count = 0;
   2069 	fmd_asru_update_status_t faus;
   2070 
   2071 	/*
   2072 	 * update status of resource cache entries
   2073 	 */
   2074 	faus.faus_countp = &count;
   2075 	faus.faus_maxcount = cip->ci_nsuspects;
   2076 	faus.faus_ba = statusp;
   2077 	faus.faus_proxy_asru = proxy_asrup;
   2078 	faus.faus_diag_asru = diag_asrup;
   2079 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
   2080 	(void) pthread_mutex_lock(&cip->ci_lock);
   2081 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
   2082 	    &faus);
   2083 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2084 }
   2085 
   2086 /*
   2087  * Called on either the proxy side or the diag side when a repair has taken
   2088  * place on the other side but this side may know the asru "contains"
   2089  * relationships.
   2090  */
   2091 void
   2092 fmd_case_update_containees(fmd_case_t *cp)
   2093 {
   2094 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2095 
   2096 	(void) pthread_mutex_lock(&cip->ci_lock);
   2097 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
   2098 	    fmd_asru_update_containees, NULL);
   2099 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2100 }
   2101 
   2102 /*
   2103  * fmd_case_close_status() is called on diagnosing side when proxy side
   2104  * has had a uuclose. It updates the status in the resource cache.
   2105  */
   2106 void
   2107 fmd_case_close_status(fmd_case_t *cp)
   2108 {
   2109 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2110 	int count = 0;
   2111 	fmd_asru_close_status_t facs;
   2112 
   2113 	/*
   2114 	 * update status of resource cache entries
   2115 	 */
   2116 	facs.facs_countp = &count;
   2117 	facs.facs_maxcount = cip->ci_nsuspects;
   2118 	(void) pthread_mutex_lock(&cip->ci_lock);
   2119 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
   2120 	    &facs);
   2121 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2122 }
   2123 
   2124 /*
   2125  * Indicate that the case may need to change state because one or more of the
   2126  * ASRUs named as a suspect has changed state.  We examine all the suspects
   2127  * and if none are still faulty, we initiate a case close transition.
   2128  */
   2129 void
   2130 fmd_case_update(fmd_case_t *cp)
   2131 {
   2132 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2133 	uint_t cstate;
   2134 	int faulty = 0;
   2135 
   2136 	(void) pthread_mutex_lock(&cip->ci_lock);
   2137 	cstate = cip->ci_state;
   2138 
   2139 	if (cip->ci_state < FMD_CASE_SOLVED) {
   2140 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2141 		return; /* update is not appropriate */
   2142 	}
   2143 
   2144 	if (cip->ci_flags & FMD_CF_REPAIRED) {
   2145 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2146 		return; /* already repaired */
   2147 	}
   2148 
   2149 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
   2150 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
   2151 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2152 
   2153 	if (faulty) {
   2154 		nvlist_t *nvl;
   2155 		fmd_event_t *e;
   2156 		char *class;
   2157 
   2158 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
   2159 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
   2160 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   2161 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   2162 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
   2163 		fmd_log_append(fmd.d_fltlog, e, cp);
   2164 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
   2165 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   2166 		return; /* one or more suspects are still marked faulty */
   2167 	}
   2168 
   2169 	if (cstate == FMD_CASE_CLOSED)
   2170 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
   2171 	else
   2172 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
   2173 }
   2174 
   2175 /*
   2176  * Delete a closed case from the module's case list once the fmdo_close() entry
   2177  * point has run to completion.  If the case is owned by a transport module,
   2178  * tell the transport to proxy a case close on the other end of the transport.
   2179  * Transition to the appropriate next state based on ci_flags.  This
   2180  * function represents the end of CLOSE_WAIT and transitions the case to either
   2181  * CLOSED or REPAIRED or discards it entirely because it was never solved;
   2182  * refer to the topmost block comment explaining the state machine for details.
   2183  */
   2184 void
   2185 fmd_case_delete(fmd_case_t *cp)
   2186 {
   2187 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2188 	fmd_modstat_t *msp;
   2189 	size_t buftotal;
   2190 
   2191 	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
   2192 	ASSERT(fmd_module_locked(cip->ci_mod));
   2193 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
   2194 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
   2195 
   2196 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   2197 	msp = cip->ci_mod->mod_stats;
   2198 
   2199 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
   2200 	msp->ms_caseopen.fmds_value.ui64--;
   2201 
   2202 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
   2203 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
   2204 
   2205 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   2206 
   2207 	if (cip->ci_xprt == NULL)
   2208 		fmd_module_setcdirty(cip->ci_mod);
   2209 
   2210 	fmd_module_rele(cip->ci_mod);
   2211 	cip->ci_mod = fmd.d_rmod;
   2212 	fmd_module_hold(cip->ci_mod);
   2213 
   2214 	/*
   2215 	 * If the case has been solved, then retain it
   2216 	 * on the root module's case list at least until we're transitioned.
   2217 	 * Otherwise free the case with our final fmd_case_rele() below.
   2218 	 */
   2219 	if (cip->ci_flags & FMD_CF_SOLVED) {
   2220 		fmd_module_lock(cip->ci_mod);
   2221 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
   2222 		fmd_module_unlock(cip->ci_mod);
   2223 		fmd_case_hold(cp);
   2224 	}
   2225 
   2226 	/*
   2227 	 * Transition onwards to REPAIRED or CLOSED as originally requested.
   2228 	 * Note that for proxy case if we're transitioning to CLOSED it means
   2229 	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
   2230 	 * the diagnosing side. No need to notify the diagnosing side if we are
   2231 	 * transitioning to REPAIRED as we only do this when requested to do
   2232 	 * so by the diagnosing side anyway.
   2233 	 */
   2234 	if (cip->ci_flags & FMD_CF_REPAIRED)
   2235 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
   2236 	else if (cip->ci_flags & FMD_CF_ISOLATED) {
   2237 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
   2238 		if (cip->ci_xprt != NULL)
   2239 			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
   2240 	}
   2241 
   2242 	fmd_case_rele(cp);
   2243 }
   2244 
   2245 void
   2246 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
   2247 {
   2248 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2249 
   2250 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   2251 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
   2252 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   2253 
   2254 	ASSERT(fmd_module_locked(cip->ci_mod));
   2255 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
   2256 	if (delete_from_asru_cache) {
   2257 		(void) pthread_mutex_lock(&cip->ci_lock);
   2258 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
   2259 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2260 	}
   2261 	fmd_case_rele(cp);
   2262 }
   2263 
   2264 /*
   2265  * Indicate that the problem corresponding to a case has been repaired by
   2266  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
   2267  * already been closed, this function initiates the transition to CLOSE_WAIT.
   2268  * The caller must have the case held from fmd_case_hash_lookup(), so we can
   2269  * grab and drop ci_lock without the case being able to be freed in between.
   2270  */
   2271 int
   2272 fmd_case_repair(fmd_case_t *cp)
   2273 {
   2274 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2275 	uint_t cstate;
   2276 	fmd_asru_rep_arg_t fara;
   2277 
   2278 	(void) pthread_mutex_lock(&cip->ci_lock);
   2279 	cstate = cip->ci_state;
   2280 
   2281 	if (cstate < FMD_CASE_SOLVED) {
   2282 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2283 		return (fmd_set_errno(EFMD_CASE_STATE));
   2284 	}
   2285 
   2286 	if (cip->ci_flags & FMD_CF_REPAIRED) {
   2287 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2288 		return (0); /* already repaired */
   2289 	}
   2290 
   2291 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
   2292 	fara.fara_reason = FMD_ASRU_REPAIRED;
   2293 	fara.fara_bywhat = FARA_BY_CASE;
   2294 	fara.fara_rval = NULL;
   2295 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
   2296 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2297 
   2298 	/*
   2299 	 * if this is a proxied case, send the repair across the transport.
   2300 	 * The remote side will then do the repair and send a list.repaired back
   2301 	 * again such that we can finally repair the case on this side.
   2302 	 */
   2303 	if (cip->ci_xprt != NULL) {
   2304 		fmd_case_xprt_updated(cp);
   2305 		return (0);
   2306 	}
   2307 
   2308 	if (cstate == FMD_CASE_CLOSED)
   2309 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
   2310 	else
   2311 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
   2312 
   2313 	return (0);
   2314 }
   2315 
   2316 int
   2317 fmd_case_acquit(fmd_case_t *cp)
   2318 {
   2319 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2320 	uint_t cstate;
   2321 	fmd_asru_rep_arg_t fara;
   2322 
   2323 	(void) pthread_mutex_lock(&cip->ci_lock);
   2324 	cstate = cip->ci_state;
   2325 
   2326 	if (cstate < FMD_CASE_SOLVED) {
   2327 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2328 		return (fmd_set_errno(EFMD_CASE_STATE));
   2329 	}
   2330 
   2331 	if (cip->ci_flags & FMD_CF_REPAIRED) {
   2332 		(void) pthread_mutex_unlock(&cip->ci_lock);
   2333 		return (0); /* already repaired */
   2334 	}
   2335 
   2336 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
   2337 	fara.fara_reason = FMD_ASRU_ACQUITTED;
   2338 	fara.fara_bywhat = FARA_BY_CASE;
   2339 	fara.fara_rval = NULL;
   2340 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
   2341 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2342 
   2343 	/*
   2344 	 * if this is a proxied case, send the repair across the transport.
   2345 	 * The remote side will then do the repair and send a list.repaired back
   2346 	 * again such that we can finally repair the case on this side.
   2347 	 */
   2348 	if (cip->ci_xprt != NULL) {
   2349 		fmd_case_xprt_updated(cp);
   2350 		return (0);
   2351 	}
   2352 
   2353 	if (cstate == FMD_CASE_CLOSED)
   2354 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
   2355 	else
   2356 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
   2357 
   2358 	return (0);
   2359 }
   2360 
   2361 int
   2362 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
   2363 {
   2364 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2365 	fmd_case_item_t *cit;
   2366 	uint_t state;
   2367 	int rv = 0;
   2368 
   2369 	(void) pthread_mutex_lock(&cip->ci_lock);
   2370 
   2371 	if (cip->ci_state >= FMD_CASE_SOLVED)
   2372 		state = FMD_EVS_DIAGNOSED;
   2373 	else
   2374 		state = FMD_EVS_ACCEPTED;
   2375 
   2376 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
   2377 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
   2378 			break;
   2379 	}
   2380 
   2381 	if (rv == 0 && cip->ci_principal != NULL)
   2382 		rv = fmd_event_equal(ep, cip->ci_principal);
   2383 
   2384 	(void) pthread_mutex_unlock(&cip->ci_lock);
   2385 
   2386 	if (rv != 0)
   2387 		fmd_event_transition(ep, state);
   2388 
   2389 	return (rv);
   2390 }
   2391 
   2392 int
   2393 fmd_case_orphaned(fmd_case_t *cp)
   2394 {
   2395 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
   2396 }
   2397 
   2398 void
   2399 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
   2400 {
   2401 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
   2402 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
   2403 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
   2404 }
   2405 
   2406 void
   2407 fmd_case_set_injected(fmd_case_t *cp)
   2408 {
   2409 	((fmd_case_impl_t *)cp)->ci_injected = 1;
   2410 }
   2411 
   2412 void
   2413 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
   2414 {
   2415 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2416 
   2417 	if (cip->ci_diag_de)
   2418 		nvlist_free(cip->ci_diag_de);
   2419 	cip->ci_diag_de = nvl;
   2420 }
   2421 
   2422 void
   2423 fmd_case_setcode(fmd_case_t *cp, char *code)
   2424 {
   2425 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2426 
   2427 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
   2428 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
   2429 }
   2430 
   2431 /*ARGSUSED*/
   2432 static void
   2433 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
   2434 {
   2435 	int not_faulty = 0;
   2436 	int faulty = 0;
   2437 	nvlist_t *nvl;
   2438 	fmd_event_t *e;
   2439 	char *class;
   2440 	int any_unusable_and_present = 0;
   2441 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   2442 
   2443 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
   2444 		return;
   2445 
   2446 	if (cip->ci_state == FMD_CASE_RESOLVED) {
   2447 		cip->ci_flags |= FMD_CF_RES_CMPL;
   2448 		return;
   2449 	}
   2450 
   2451 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
   2452 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
   2453 	    &not_faulty);
   2454 
   2455 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
   2456 		/*
   2457 		 * If none of the suspects is faulty, replay the list.repaired.
   2458 		 * If all suspects are already either usable or not present then
   2459 		 * also transition straight to RESOLVED state.
   2460 		 */
   2461 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
   2462 		    fmd_case_unusable_and_present, &any_unusable_and_present);
   2463 		if (!any_unusable_and_present) {
   2464 			cip->ci_state = FMD_CASE_RESOLVED;
   2465 
   2466 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
   2467 			    cip->ci_uuid));
   2468 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
   2469 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   2470 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
   2471 			    class);
   2472 			fmd_dispq_dispatch(fmd.d_disp, e, class);
   2473 
   2474 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
   2475 			    cip->ci_uuid));
   2476 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
   2477 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
   2478 			    fmd_asru_log_resolved, NULL);
   2479 			cip->ci_flags |= FMD_CF_RES_CMPL;
   2480 		} else {
   2481 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
   2482 			    cip->ci_uuid));
   2483 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
   2484 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   2485 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
   2486 			    class);
   2487 			fmd_dispq_dispatch(fmd.d_disp, e, class);
   2488 		}
   2489 	} else if (faulty && not_faulty) {
   2490 		/*
   2491 		 * if some but not all of the suspects are not faulty, replay
   2492 		 * the list.updated.
   2493 		 */
   2494 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
   2495 		    cip->ci_uuid));
   2496 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
   2497 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
   2498 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
   2499 		fmd_dispq_dispatch(fmd.d_disp, e, class);
   2500 	}
   2501 }
   2502 
   2503 void
   2504 fmd_case_repair_replay()
   2505 {
   2506 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
   2507 }
   2508