Home | History | Annotate | Download | only in sp-monitor
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * /dev/bmc IPMI monitor
     31  *
     32  * The purpose of this module is to monitor the connection between the system
     33  * and the service processor attached via /dev/bmc.  The module assumes the SP
     34  * supports the Sun OEM uptime IPMI command.  If the BMC connection does not
     35  * exist, or the uptime function is not implemented, then the module unloads
     36  * without doing anything.
     37  *
     38  * When the module is first loaded, or a reset is detected, the module will
     39  * generate the ESC_PLATFORM_SP_RESET sysevent as a system-wide notification to
     40  * indicate that this event has occurred.
     41  *
     42  * Note that this event generation is not guaranteed to have a one-to-one
     43  * correspondence with an SP reset.  There is no persistence, so if fmd is
     44  * restarted we will generate this event again.  Thus the event only indicates
     45  * the possibility that the SP has been reset.  This could be enhanced using fmd
     46  * checkpoints to have some persistent state to avoid this scenario.  However,
     47  * it currently serves the useful dual purpose of notifying consumers of system
     48  * startup as well as SP reset through a single channel.
     49  */
     50 
     51 #include <errno.h>
     52 #include <libipmi.h>
     53 #include <libsysevent.h>
     54 #include <string.h>
     55 #include <fm/fmd_api.h>
     56 #include <sys/sysevent/eventdefs.h>
     57 
/*
 * Per-module state, allocated in _fmd_init() and attached to the fmd handle
 * with fmd_hdl_setspecific().
 */
typedef struct sp_monitor {
	ipmi_handle_t	*sm_hdl;	/* IPMI handle from ipmi_open() */
	uint32_t	sm_seconds;	/* last uptime value seen from the SP */
	uint32_t	sm_generation;	/* last generation number seen */
	hrtime_t	sm_interval;	/* timer delay ("interval" property) */
} sp_monitor_t;
     64 
     65 static void
     66 sp_post_sysevent(fmd_hdl_t *hdl)
     67 {
     68 	sp_monitor_t *smp = fmd_hdl_getspecific(hdl);
     69 	sysevent_id_t eid;
     70 
     71 	fmd_hdl_debug(hdl, "SP reset detected, posting sysevent");
     72 
     73 	if (sysevent_post_event(EC_PLATFORM, ESC_PLATFORM_SP_RESET,
     74 	    SUNW_VENDOR, "fmd", NULL, &eid) != 0) {
     75 		fmd_hdl_debug(hdl, "failed to send sysevent: %s",
     76 		    strerror(errno));
     77 		/*
     78 		 * We reset the seconds and generation so that the next time
     79 		 * through we will try to post the sysevent again.
     80 		 */
     81 		smp->sm_seconds = -1U;
     82 		smp->sm_generation = -1U;
     83 	}
     84 }
     85 
     86 /*ARGSUSED*/
     87 static void
     88 sp_timeout(fmd_hdl_t *hdl, id_t id, void *data)
     89 {
     90 	sp_monitor_t *smp = fmd_hdl_getspecific(hdl);
     91 	uint32_t seconds, generation;
     92 
     93 	if (ipmi_sunoem_uptime(smp->sm_hdl, &seconds, &generation) != 0) {
     94 		/*
     95 		 * Ignore uptime failures.  We will generate the appropriate
     96 		 * event when it comes back online.
     97 		 */
     98 		fmd_hdl_debug(hdl, "failed to get uptime: %s",
     99 		    ipmi_errmsg(smp->sm_hdl));
    100 	} else {
    101 		/*
    102 		 * We want to catch cases where the generation number is
    103 		 * explicitly reset, or when the SP configuration is reset after
    104 		 * a reboot (and the generation number is 0).  We also post a
    105 		 * sysevent when the module initially loads, since we can't be
    106 		 * sure if we missed a SP reset or not.
    107 		 */
    108 		if (seconds < smp->sm_seconds ||
    109 		    generation != smp->sm_generation ||
    110 		    smp->sm_seconds == 0)
    111 			sp_post_sysevent(hdl);
    112 
    113 		smp->sm_seconds = seconds;
    114 		smp->sm_generation = generation;
    115 	}
    116 
    117 	(void) fmd_timer_install(hdl, NULL, NULL, smp->sm_interval);
    118 }
    119 
/*
 * Module entry points.  This module is driven purely by its timer, so only
 * the timeout entry point is supplied; every other slot is NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	NULL,		/* fmdo_recv */
	sp_timeout,	/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
    127 
/*
 * Module properties.  "interval" is a time-typed property with a default of
 * "60sec"; _fmd_init() reads it with fmd_prop_get_int64() and sp_timeout()
 * uses the result as the delay when re-arming the timer.  The all-NULL entry
 * terminates the table.
 */
static const fmd_prop_t fmd_props[] = {
	{ "interval", FMD_TYPE_TIME, "60sec" },
	{ NULL, 0, NULL }
};
    132 
/*
 * Registration record handed to fmd_hdl_register() in _fmd_init().  It ties
 * together the entry points and the property table; the two strings are
 * presumably the module description and version.
 */
static const fmd_hdl_info_t fmd_info = {
	"Service Processor Monitor", "1.0", &fmd_ops, fmd_props
};
    136 
    137 void
    138 _fmd_init(fmd_hdl_t *hdl)
    139 {
    140 	sp_monitor_t *smp;
    141 	int error;
    142 	char *msg;
    143 
    144 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
    145 		return;
    146 
    147 	smp = fmd_hdl_zalloc(hdl, sizeof (sp_monitor_t), FMD_SLEEP);
    148 	fmd_hdl_setspecific(hdl, smp);
    149 
    150 	if ((smp->sm_hdl = ipmi_open(&error, &msg)) == NULL) {
    151 		/*
    152 		 * If /dev/bmc doesn't exist on the system, then unload the
    153 		 * module without doing anything.
    154 		 */
    155 		if (error != EIPMI_BMC_OPEN_FAILED)
    156 			fmd_hdl_abort(hdl, "failed to initialize IPMI "
    157 			    "connection: %s\n", msg);
    158 		fmd_hdl_debug(hdl, "failed to load: no IPMI connection "
    159 		    "present");
    160 		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
    161 		fmd_hdl_unregister(hdl);
    162 		return;
    163 	}
    164 
    165 	/*
    166 	 * Attempt an initial uptime() call.  If the IPMI command is
    167 	 * unrecognized, then this is an unsupported platform and the module
    168 	 * should be unloaded.  Any other error is treated is transient failure.
    169 	 */
    170 	if ((error = ipmi_sunoem_uptime(smp->sm_hdl, &smp->sm_seconds,
    171 	    &smp->sm_generation)) != 0 &&
    172 	    ipmi_errno(smp->sm_hdl) == EIPMI_INVALID_COMMAND) {
    173 		fmd_hdl_debug(hdl, "failed to load: uptime command "
    174 		    "not supported");
    175 		ipmi_close(smp->sm_hdl);
    176 		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
    177 		fmd_hdl_unregister(hdl);
    178 		return;
    179 	}
    180 
    181 	smp->sm_interval = fmd_prop_get_int64(hdl, "interval");
    182 
    183 	if (error == 0)
    184 		fmd_hdl_debug(hdl, "successfully loaded, uptime = %u seconds "
    185 		    "(generation %u)", smp->sm_seconds, smp->sm_generation);
    186 	else
    187 		fmd_hdl_debug(hdl, "successfully loaded, but uptime call "
    188 		    "failed: %s", ipmi_errmsg(smp->sm_hdl));
    189 
    190 	/*
    191 	 * Setup the recurring timer.
    192 	 */
    193 	(void) fmd_timer_install(hdl, NULL, NULL, 0);
    194 }
    195 
    196 void
    197 _fmd_fini(fmd_hdl_t *hdl)
    198 {
    199 	sp_monitor_t *smp = fmd_hdl_getspecific(hdl);
    200 
    201 	if (smp) {
    202 		ipmi_close(smp->sm_hdl);
    203 		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
    204 	}
    205 }
    206