Home | History | Annotate | Download | only in disk-monitor
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Disk Monitor
     29  */
     30 #include <sys/types.h>
     31 #include <sys/stat.h>
     32 #include <fcntl.h>
     33 #include <time.h>
     34 #include <stdio.h>
     35 #include <stdlib.h>
     36 #include <strings.h>
     37 #include <stdarg.h>
     38 #include <errno.h>
     39 #include <signal.h>
     40 #include <unistd.h>
     41 #include <pthread.h>
     42 #include <libnvpair.h>
     43 #include <fm/fmd_api.h>
     44 #include <fm/fmd_fmri.h>
     45 #include <sys/fm/protocol.h>
     46 #include <sys/fm/io/disk.h>
     47 #include <fm/libtopo.h>
     48 
     49 #include "disk_monitor.h"
     50 #include "hotplug_mgr.h"
     51 #include "schg_mgr.h"
     52 #include "topo_gather.h"
     53 #include "dm_platform.h"
     54 
     55 #define	THIS_FMD_MODULE_NAME "disk-monitor"
     56 
     57 static enum disk_init_state {
     58 	INIT_STATE_NONE = 0,
     59 	STATE_CHANGE_MGR_INITTED = 2,
     60 	HOTPLUG_MGR_INITTED = 4
     61 } g_init_state = INIT_STATE_NONE;
     62 
     63 typedef enum {
     64 	LT_SUSPECT,
     65 	LT_REPAIRED
     66 } fm_list_type_t;
     67 
     68 /*
     69  * Global verbosity flag -- controls chattiness of debug messages and
     70  * warnings.  Its value is determined by the fmd property "log-level"
     71  * settable in the DE's .conf file.
     72  */
     73 log_class_t			g_verbose = 0;
     74 cfgdata_t			*config_data = NULL;
     75 fmd_hdl_t			*g_fm_hdl = NULL;
     76 
     77 static const fmd_prop_t		fmd_props[];
     78 
     79 static void
     80 diskmon_teardown_all(void)
     81 {
     82 	cleanup_hotplug_manager();
     83 	cleanup_state_change_manager(config_data);
     84 	config_fini();
     85 }
     86 
     87 static int
     88 count_disks(diskmon_t *disklistp)
     89 {
     90 	int i = 0;
     91 
     92 	while (disklistp != NULL) {
     93 		i++;
     94 		disklistp = disklistp->next;
     95 	}
     96 
     97 	return (i);
     98 }
     99 
    100 static int
    101 diskmon_init(void)
    102 {
    103 	/*
    104 	 * Block the generation of state change events (generated by the
    105 	 * hotplug manager thread) here; they will be unblocked after the
    106 	 * state change manager thread is ready to accept state changes
    107 	 * (shortly after it starts).
    108 	 */
    109 	block_state_change_events();
    110 
    111 	if (dm_platform_init() != 0)
    112 		goto cleanup;
    113 
    114 	if (init_hotplug_manager() != 0)
    115 		goto cleanup;
    116 	else
    117 		g_init_state |= HOTPLUG_MGR_INITTED;
    118 
    119 	if (init_state_change_manager(config_data) != 0)
    120 		goto cleanup;
    121 	else
    122 		g_init_state |= STATE_CHANGE_MGR_INITTED;
    123 
    124 	return (E_SUCCESS);
    125 
    126 cleanup:
    127 
    128 	unblock_state_change_events();
    129 
    130 	/*
    131 	 * The cleanup order here does matter, due to dependencies between the
    132 	 * managers.
    133 	 */
    134 	if (g_init_state & HOTPLUG_MGR_INITTED)
    135 		cleanup_hotplug_manager();
    136 	if (g_init_state & STATE_CHANGE_MGR_INITTED)
    137 		cleanup_state_change_manager(config_data);
    138 	dm_platform_fini();
    139 
    140 	return (E_ERROR);
    141 }
    142 
    143 static void
    144 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
    145 {
    146 	const char		*action_prop = NULL;
    147 	const char		*action_string;
    148 
    149 	/*
    150 	 * The predictive failure action is the activation of the fault
    151 	 * indicator.
    152 	 */
    153 	if (fmd_nvl_class_match(hdl, nvl,
    154 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
    155 		action_prop = DISK_PROP_OTEMPACTION;
    156 
    157 	if (fmd_nvl_class_match(hdl, nvl,
    158 	    DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
    159 		action_prop = DISK_PROP_STFAILACTION;
    160 
    161 	dm_fault_indicator_set(diskp, INDICATOR_ON);
    162 
    163 	if (action_prop != NULL &&
    164 	    (action_string = dm_prop_lookup(diskp->props, action_prop))
    165 	    != NULL) {
    166 
    167 		if (dm_platform_indicator_execute(action_string) != 0) {
    168 			log_warn("Fault action `%s' did not successfully "
    169 			    "complete.\n", action_string);
    170 		}
    171 	}
    172 }
    173 
    174 static void
    175 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
    176 {
    177 	char		*uuid = NULL;
    178 	nvlist_t	**nva;
    179 	uint_t		nvc;
    180 	diskmon_t	*diskp;
    181 	nvlist_t	*fmri;
    182 	nvlist_t	*fltnvl;
    183 	int		err = 0;
    184 
    185 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
    186 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
    187 	    &nva, &nvc);
    188 	if (err != 0)
    189 		return;
    190 
    191 	while (nvc-- != 0) {
    192 
    193 		fltnvl = *nva++;
    194 
    195 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
    196 		    != 0)
    197 			continue;
    198 
    199 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
    200 			continue;
    201 
    202 		log_msg(MM_MAIN, "Disk %s repaired!\n",
    203 		    diskp->location);
    204 
    205 		dm_fault_indicator_set(diskp, INDICATOR_OFF);
    206 
    207 		dm_state_change(diskp, HPS_REPAIRED);
    208 	}
    209 
    210 	if (repair)
    211 		fmd_case_uuresolved(hdl, uuid);
    212 
    213 }
    214 
    215 static void
    216 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
    217 {
    218 	char		*uuid = NULL;
    219 	nvlist_t	**nva;
    220 	uint_t		nvc;
    221 	diskmon_t	*diskp;
    222 	nvlist_t	*fmri;
    223 	nvlist_t	*fltnvl;
    224 	int		err = 0;
    225 
    226 	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
    227 	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
    228 	    &nva, &nvc);
    229 	if (err != 0)
    230 		return;
    231 
    232 	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
    233 
    234 		fltnvl = *nva++;
    235 
    236 		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
    237 			continue;
    238 
    239 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
    240 			continue;
    241 
    242 		/* Execute the actions associated with this fault */
    243 		dm_fault_execute_actions(hdl, diskp,  fltnvl);
    244 
    245 		/*
    246 		 * Send a state change event to the state change manager
    247 		 */
    248 		dm_state_change(diskp, HPS_FAULTED);
    249 	}
    250 
    251 	if (!fmd_case_uuclosed(hdl, uuid)) {
    252 		/* Case is closed */
    253 		fmd_case_uuclose(hdl, uuid);
    254 	}
    255 }
    256 
    257 /*ARGSUSED*/
    258 static void
    259 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
    260 {
    261 	diskmon_t	*diskp;
    262 	nvlist_t	*fmri;
    263 
    264 	if (g_verbose & MM_MAIN)
    265 		nvlist_print(stderr, nvl);
    266 
    267 	/*
    268 	 * Act on the fault suspect list or repaired list (embedded agent
    269 	 * action).
    270 	 */
    271 	if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
    272 
    273 		diskmon_agent_repair(hdl, nvl, 1);
    274 		return;
    275 
    276 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
    277 
    278 		diskmon_agent_repair(hdl, nvl, 0);
    279 		return;
    280 
    281 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
    282 
    283 		diskmon_agent_suspect(hdl, nvl);
    284 		return;
    285 	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
    286 		return;
    287 	}
    288 
    289 	/*
    290 	 * If we get any replayed faults, set the diskmon's faulted
    291 	 * flag for the appropriate fault, then change the diskmon's state
    292 	 * to faulted.
    293 	 */
    294 	if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
    295 
    296 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
    297 		    &fmri) != 0)
    298 			return;
    299 
    300 		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
    301 			return;
    302 
    303 		/* Execute the actions associated with this fault */
    304 		dm_fault_execute_actions(hdl, diskp, nvl);
    305 
    306 		/*
    307 		 * If the fault wasn't generated by this module, send a
    308 		 * state change event to the state change manager
    309 		 */
    310 		dm_state_change(diskp, HPS_FAULTED);
    311 		return;
    312 	}
    313 }
    314 
    315 static const fmd_hdl_ops_t fmd_ops = {
    316 	diskmon_recv,	/* fmdo_recv */
    317 	NULL,		/* fmdo_timeout */
    318 	NULL,		/* fmdo_close */
    319 	NULL,		/* fmdo_stats */
    320 	NULL,		/* fmdo_gc */
    321 };
    322 
    323 static const fmd_prop_t fmd_props[] = {
    324 	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
    325 	{ NULL, 0, NULL }
    326 };
    327 
    328 static const fmd_hdl_info_t fmd_info = {
    329 	"Disk Monitor",
    330 	DISK_MONITOR_MODULE_VERSION,
    331 	&fmd_ops,
    332 	fmd_props
    333 };
    334 
    335 void
    336 _fmd_init(fmd_hdl_t *hdl)
    337 {
    338 	fmd_case_t	*cp;
    339 	int		disk_count;
    340 
    341 	g_fm_hdl = hdl;
    342 
    343 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
    344 		return;
    345 	}
    346 
    347 	if (config_init()) {
    348 		log_err("Could not initialize configuration!\n");
    349 		fmd_hdl_unregister(hdl);
    350 		return;
    351 	}
    352 
    353 	if (config_get(hdl, fmd_props)) {
    354 		config_fini();
    355 		log_err("Could not retrieve configuration from libtopo!\n");
    356 		fmd_hdl_unregister(hdl);
    357 		return;
    358 	}
    359 
    360 	/*
    361 	 * If there are no disks to monitor, bail out
    362 	 */
    363 	if ((disk_count = count_disks(config_data->disk_list)) == 0) {
    364 		config_fini();
    365 		fmd_hdl_unregister(hdl);
    366 		return;
    367 	}
    368 
    369 	if (diskmon_init() == E_ERROR) {
    370 		config_fini();
    371 		fmd_hdl_unregister(hdl);
    372 		return;
    373 	}
    374 
    375 	log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
    376 
    377 	/*
    378 	 * Iterate over all active cases.
    379 	 * Since we automatically solve all cases, these cases must have
    380 	 * had the fault added, but the DE must have been interrupted
    381 	 * before they were solved.
    382 	 */
    383 	for (cp = fmd_case_next(hdl, NULL);
    384 	    cp != NULL; cp = fmd_case_next(hdl, cp)) {
    385 
    386 		if (!fmd_case_solved(hdl, cp))
    387 			fmd_case_solve(hdl, cp);
    388 	}
    389 }
    390 
    391 /*ARGSUSED*/
    392 void
    393 _fmd_fini(fmd_hdl_t *hdl)
    394 {
    395 	diskmon_teardown_all();
    396 	g_fm_hdl = NULL;
    397 }
    398