Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/mkdev.h>
     29 #include <sys/stat.h>
     30 
     31 #include <strings.h>
     32 #include <unistd.h>
     33 #include <limits.h>
     34 #include <fcntl.h>
     35 
     36 #include <fmd_module.h>
     37 #include <fmd_error.h>
     38 #include <fmd_alloc.h>
     39 #include <fmd_case.h>
     40 #include <fmd_serd.h>
     41 #include <fmd_subr.h>
     42 #include <fmd_conf.h>
     43 #include <fmd_event.h>
     44 #include <fmd_log.h>
     45 #include <fmd_api.h>
     46 #include <fmd_ckpt.h>
     47 
     48 #include <fmd.h>
     49 
/*
 * Round x up to the next multiple of align, where align must be a power of
 * two.  The mask is formed by negating align in align's own type, so both
 * operands should have the same width: an unsigned align that is narrower
 * than x is zero-extended after negation and clears the high bits of x.
 */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
/*
 * Evaluate to non-zero if v is a multiple of a, where a must be a power of
 * two.  Both operands are converted to uintptr_t before the test.
 */
#define	IS_P2ALIGNED(v, a)	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
     52 
/*
 * The fmd_ckpt_t structure is used to manage all of the state needed by the
 * various subroutines that save and restore checkpoints.  The structure is
 * initialized using fmd_ckpt_create() or fmd_ckpt_open() and is destroyed
 * by fmd_ckpt_destroy().  Refer to the subroutines below for more details.
 *
 * Saving is done in two passes over the module state: the fmd_ckpt_resv*()
 * routines accumulate ckp_size, ckp_secs and ckp_strn, fmd_ckpt_alloc() then
 * allocates the buffer, and the fmd_ckpt_save*() routines fill it in while
 * advancing ckp_ptr, ckp_secp, ckp_secs and ckp_strp.
 */
typedef struct fmd_ckpt {
	char ckp_src[PATH_MAX];	/* file being read, or temp file being written */
	char ckp_dst[PATH_MAX];	/* name ckp_src is renamed to on commit */
	uchar_t *ckp_buf;	/* data buffer base address */
	fcf_hdr_t *ckp_hdr;	/* file header pointer (start of ckp_buf) */
	uchar_t *ckp_ptr;	/* current write position within ckp_buf */
	size_t ckp_size;	/* data buffer size in bytes */
	fcf_sec_t *ckp_secp;	/* section header table; next header on save */
	fcf_sec_t *ckp_modp;	/* section header for module */
	uint_t ckp_secs;	/* number of sections */
	char *ckp_strs;		/* string table base pointer */
	char *ckp_strp;		/* next free byte in string table on save */
	size_t ckp_strn;	/* string table size in bytes */
	int ckp_fd;		/* descriptor for ckp_src, or -1 once closed */
	fmd_module_t *ckp_mp;	/* checkpoint module */
	void *ckp_arg;		/* private arg (record cursor) for callbacks */
} fmd_ckpt_t;
     76 
/*
 * Per-section-type constraints.  One of these exists for every entry of
 * _fmd_ckpt_sections[]; fmd_ckpt_open() rejects sections that are smaller
 * than secd_size or secd_entsize or whose alignment differs from secd_align,
 * and fmd_ckpt_section() stamps secd_entsize and secd_align into new headers.
 */
typedef struct fmd_ckpt_desc {
	uint64_t secd_size;	/* minimum section size */
	uint32_t secd_entsize;	/* minimum section entry size */
	uint32_t secd_align;	/* section alignment */
} fmd_ckpt_desc_t;
     82 
/*
 * Table of FCF section descriptions.  Here we record the minimum size for each
 * section (for use during restore) and the expected entry size and alignment
 * for each section (for use during both checkpoint and restore).
 *
 * The table is indexed directly by the section type (fcfs_type), so the row
 * order must follow the numeric order of the section type constants named in
 * the trailing comments.  Each row is { secd_size, secd_entsize, secd_align }.
 */
static const fmd_ckpt_desc_t _fmd_ckpt_sections[] = {
{ 0, 0, sizeof (uint8_t) },					   /* NONE */
{ 1, 0, sizeof (char) },					   /* STRTAB */
{ sizeof (fcf_module_t), 0, sizeof (uint32_t) },		   /* MODULE */
{ sizeof (fcf_case_t), 0, sizeof (uint32_t) },			   /* CASE */
{ sizeof (fcf_buf_t), sizeof (fcf_buf_t), sizeof (uint32_t) },	   /* BUFS */
{ 0, 0, _MAX_ALIGNMENT },					   /* BUFFER */
{ sizeof (fcf_serd_t), sizeof (fcf_serd_t), sizeof (uint64_t) },   /* SERD */
{ sizeof (fcf_event_t), sizeof (fcf_event_t), sizeof (uint64_t) }, /* EVENTS */
{ sizeof (fcf_nvl_t), sizeof (fcf_nvl_t), sizeof (uint64_t) },	   /* NVLISTS */
};
     99 
    100 static int
    101 fmd_ckpt_create(fmd_ckpt_t *ckp, fmd_module_t *mp)
    102 {
    103 	const char *dir = mp->mod_ckpt;
    104 	const char *name = mp->mod_name;
    105 	mode_t mode;
    106 
    107 	bzero(ckp, sizeof (fmd_ckpt_t));
    108 	ckp->ckp_mp = mp;
    109 
    110 	ckp->ckp_size = sizeof (fcf_hdr_t);
    111 	ckp->ckp_strn = 1; /* for \0 */
    112 
    113 	(void) snprintf(ckp->ckp_src, PATH_MAX, "%s/%s+", dir, name);
    114 	(void) snprintf(ckp->ckp_dst, PATH_MAX, "%s/%s", dir, name);
    115 
    116 	(void) unlink(ckp->ckp_src);
    117 	(void) fmd_conf_getprop(fmd.d_conf, "ckpt.mode", &mode);
    118 	ckp->ckp_fd = open64(ckp->ckp_src, O_WRONLY | O_CREAT | O_EXCL, mode);
    119 
    120 	return (ckp->ckp_fd);
    121 }
    122 
    123 /*PRINTFLIKE2*/
    124 static int
    125 fmd_ckpt_inval(fmd_ckpt_t *ckp, const char *format, ...)
    126 {
    127 	va_list ap;
    128 
    129 	va_start(ap, format);
    130 	fmd_verror(EFMD_CKPT_INVAL, format, ap);
    131 	va_end(ap);
    132 
    133 	fmd_free(ckp->ckp_buf, ckp->ckp_size);
    134 	return (fmd_set_errno(EFMD_CKPT_INVAL));
    135 }
    136 
    137 static int
    138 fmd_ckpt_open(fmd_ckpt_t *ckp, fmd_module_t *mp)
    139 {
    140 	struct stat64 st;
    141 	uint64_t seclen;
    142 	uint_t i;
    143 	int err;
    144 
    145 	bzero(ckp, sizeof (fmd_ckpt_t));
    146 	ckp->ckp_mp = mp;
    147 
    148 	(void) snprintf(ckp->ckp_src, PATH_MAX, "%s/%s",
    149 	    mp->mod_ckpt, mp->mod_name);
    150 
    151 	if ((ckp->ckp_fd = open(ckp->ckp_src, O_RDONLY)) == -1)
    152 		return (-1); /* failed to open checkpoint file */
    153 
    154 	if (fstat64(ckp->ckp_fd, &st) == -1) {
    155 		err = errno;
    156 		(void) close(ckp->ckp_fd);
    157 		return (fmd_set_errno(err));
    158 	}
    159 
    160 	ckp->ckp_buf = fmd_alloc(st.st_size, FMD_SLEEP);
    161 	ckp->ckp_hdr = (void *)ckp->ckp_buf;
    162 	ckp->ckp_size = read(ckp->ckp_fd, ckp->ckp_buf, st.st_size);
    163 
    164 	if (ckp->ckp_size != st.st_size || ckp->ckp_size < sizeof (fcf_hdr_t) ||
    165 	    ckp->ckp_size != ckp->ckp_hdr->fcfh_filesz) {
    166 		err = ckp->ckp_size == (size_t)-1L ? errno : EFMD_CKPT_SHORT;
    167 		fmd_free(ckp->ckp_buf, st.st_size);
    168 		(void) close(ckp->ckp_fd);
    169 		return (fmd_set_errno(err));
    170 	}
    171 
    172 	(void) close(ckp->ckp_fd);
    173 	ckp->ckp_fd = -1;
    174 
    175 	/*
    176 	 * Once we've read in a consistent copy of the FCF file and we're sure
    177 	 * the header can be accessed, go through it and make sure everything
    178 	 * is valid.  We also check that unused bits are zero so we can expand
    179 	 * to use them safely in the future and support old files if needed.
    180 	 */
    181 	if (bcmp(&ckp->ckp_hdr->fcfh_ident[FCF_ID_MAG0],
    182 	    FCF_MAG_STRING, FCF_MAG_STRLEN) != 0)
    183 		return (fmd_ckpt_inval(ckp, "bad checkpoint magic string\n"));
    184 
    185 	if (ckp->ckp_hdr->fcfh_ident[FCF_ID_MODEL] != FCF_MODEL_NATIVE)
    186 		return (fmd_ckpt_inval(ckp, "bad checkpoint data model\n"));
    187 
    188 	if (ckp->ckp_hdr->fcfh_ident[FCF_ID_ENCODING] != FCF_ENCODE_NATIVE)
    189 		return (fmd_ckpt_inval(ckp, "bad checkpoint data encoding\n"));
    190 
    191 	if (ckp->ckp_hdr->fcfh_ident[FCF_ID_VERSION] != FCF_VERSION_1) {
    192 		return (fmd_ckpt_inval(ckp, "bad checkpoint version %u\n",
    193 		    ckp->ckp_hdr->fcfh_ident[FCF_ID_VERSION]));
    194 	}
    195 
    196 	for (i = FCF_ID_PAD; i < FCF_ID_SIZE; i++) {
    197 		if (ckp->ckp_hdr->fcfh_ident[i] != 0) {
    198 			return (fmd_ckpt_inval(ckp,
    199 			    "bad checkpoint padding at id[%d]", i));
    200 		}
    201 	}
    202 
    203 	if (ckp->ckp_hdr->fcfh_flags & ~FCF_FL_VALID)
    204 		return (fmd_ckpt_inval(ckp, "bad checkpoint flags\n"));
    205 
    206 	if (ckp->ckp_hdr->fcfh_pad != 0)
    207 		return (fmd_ckpt_inval(ckp, "reserved field in use\n"));
    208 
    209 	if (ckp->ckp_hdr->fcfh_hdrsize < sizeof (fcf_hdr_t) ||
    210 	    ckp->ckp_hdr->fcfh_secsize < sizeof (fcf_sec_t)) {
    211 		return (fmd_ckpt_inval(ckp,
    212 		    "bad header and/or section size\n"));
    213 	}
    214 
    215 	seclen = (uint64_t)ckp->ckp_hdr->fcfh_secnum *
    216 	    (uint64_t)ckp->ckp_hdr->fcfh_secsize;
    217 
    218 	if (ckp->ckp_hdr->fcfh_secoff > ckp->ckp_size ||
    219 	    seclen > ckp->ckp_size ||
    220 	    ckp->ckp_hdr->fcfh_secoff + seclen > ckp->ckp_size ||
    221 	    ckp->ckp_hdr->fcfh_secoff + seclen < ckp->ckp_hdr->fcfh_secoff)
    222 		return (fmd_ckpt_inval(ckp, "truncated section headers\n"));
    223 
    224 	if (!IS_P2ALIGNED(ckp->ckp_hdr->fcfh_secoff, sizeof (uint64_t)) ||
    225 	    !IS_P2ALIGNED(ckp->ckp_hdr->fcfh_secsize, sizeof (uint64_t)))
    226 		return (fmd_ckpt_inval(ckp, "misaligned section headers\n"));
    227 
    228 	/*
    229 	 * Once the header is validated, iterate over the section headers
    230 	 * ensuring that each one is valid w.r.t. offset, alignment, and size.
    231 	 * We also pick up the string table pointer during this pass.
    232 	 */
    233 	ckp->ckp_secp = (void *)(ckp->ckp_buf + ckp->ckp_hdr->fcfh_secoff);
    234 	ckp->ckp_secs = ckp->ckp_hdr->fcfh_secnum;
    235 
    236 	for (i = 0; i < ckp->ckp_secs; i++) {
    237 		fcf_sec_t *sp = (void *)(ckp->ckp_buf +
    238 		    ckp->ckp_hdr->fcfh_secoff + ckp->ckp_hdr->fcfh_secsize * i);
    239 
    240 		const fmd_ckpt_desc_t *dp = &_fmd_ckpt_sections[sp->fcfs_type];
    241 
    242 		if (sp->fcfs_flags != 0) {
    243 			return (fmd_ckpt_inval(ckp, "section %u has invalid "
    244 			    "section flags (0x%x)\n", i, sp->fcfs_flags));
    245 		}
    246 
    247 		if (sp->fcfs_align & (sp->fcfs_align - 1)) {
    248 			return (fmd_ckpt_inval(ckp, "section %u has invalid "
    249 			    "alignment (%u)\n", i, sp->fcfs_align));
    250 		}
    251 
    252 		if (sp->fcfs_offset & (sp->fcfs_align - 1)) {
    253 			return (fmd_ckpt_inval(ckp, "section %u is not properly"
    254 			    " aligned (offset %llu)\n", i, sp->fcfs_offset));
    255 		}
    256 
    257 		if (sp->fcfs_entsize != 0 &&
    258 		    (sp->fcfs_entsize & (sp->fcfs_align - 1)) != 0) {
    259 			return (fmd_ckpt_inval(ckp, "section %u has misaligned "
    260 			    "entsize %u\n", i, sp->fcfs_entsize));
    261 		}
    262 
    263 		if (sp->fcfs_offset > ckp->ckp_size ||
    264 		    sp->fcfs_size > ckp->ckp_size ||
    265 		    sp->fcfs_offset + sp->fcfs_size > ckp->ckp_size ||
    266 		    sp->fcfs_offset + sp->fcfs_size < sp->fcfs_offset) {
    267 			return (fmd_ckpt_inval(ckp, "section %u has corrupt "
    268 			    "size or offset\n", i));
    269 		}
    270 
    271 		if (sp->fcfs_type >= sizeof (_fmd_ckpt_sections) /
    272 		    sizeof (_fmd_ckpt_sections[0])) {
    273 			return (fmd_ckpt_inval(ckp, "section %u has unknown "
    274 			    "section type %u\n", i, sp->fcfs_type));
    275 		}
    276 
    277 		if (sp->fcfs_align != dp->secd_align) {
    278 			return (fmd_ckpt_inval(ckp, "section %u has align %u "
    279 			    "(not %u)\n", i, sp->fcfs_align, dp->secd_align));
    280 		}
    281 
    282 		if (sp->fcfs_size < dp->secd_size ||
    283 		    sp->fcfs_entsize < dp->secd_entsize) {
    284 			return (fmd_ckpt_inval(ckp, "section %u has short "
    285 			    "size or entsize\n", i));
    286 		}
    287 
    288 		switch (sp->fcfs_type) {
    289 		case FCF_SECT_STRTAB:
    290 			if (ckp->ckp_strs != NULL) {
    291 				return (fmd_ckpt_inval(ckp, "multiple string "
    292 				    "tables are present in checkpoint file\n"));
    293 			}
    294 
    295 			ckp->ckp_strs = (char *)ckp->ckp_buf + sp->fcfs_offset;
    296 			ckp->ckp_strn = sp->fcfs_size;
    297 
    298 			if (ckp->ckp_strs[ckp->ckp_strn - 1] != '\0') {
    299 				return (fmd_ckpt_inval(ckp, "string table %u "
    300 				    "is missing terminating nul byte\n", i));
    301 			}
    302 			break;
    303 
    304 		case FCF_SECT_MODULE:
    305 			if (ckp->ckp_modp != NULL) {
    306 				return (fmd_ckpt_inval(ckp, "multiple module "
    307 				    "sects are present in checkpoint file\n"));
    308 			}
    309 			ckp->ckp_modp = sp;
    310 			break;
    311 		}
    312 	}
    313 
    314 	/*
    315 	 * Ensure that the first section is an empty one of type FCF_SECT_NONE.
    316 	 * This is done to ensure that links can use index 0 as a null section.
    317 	 */
    318 	if (ckp->ckp_secs == 0 || ckp->ckp_secp->fcfs_type != FCF_SECT_NONE ||
    319 	    ckp->ckp_secp->fcfs_entsize != 0 || ckp->ckp_secp->fcfs_size != 0) {
    320 		return (fmd_ckpt_inval(ckp, "section 0 is not of the "
    321 		    "appropriate size and/or attributes (SECT_NONE)\n"));
    322 	}
    323 
    324 	if (ckp->ckp_modp == NULL) {
    325 		return (fmd_ckpt_inval(ckp,
    326 		    "no module section found in file\n"));
    327 	}
    328 
    329 	return (0);
    330 }
    331 
    332 static void
    333 fmd_ckpt_destroy(fmd_ckpt_t *ckp)
    334 {
    335 	if (ckp->ckp_buf != NULL)
    336 		fmd_free(ckp->ckp_buf, ckp->ckp_size);
    337 	if (ckp->ckp_fd >= 0)
    338 		(void) close(ckp->ckp_fd);
    339 }
    340 
    341 /*
    342  * fmd_ckpt_error() is used as a wrapper around fmd_error() for ckpt routines.
    343  * It calls fmd_module_unlock() on behalf of its caller, logs the error, and
    344  * then aborts the API call and the surrounding module entry point by doing an
    345  * fmd_module_abort(), which longjmps to the place where we entered the module.
    346  * Depending on the type of error and conf settings, we will reset or fail.
    347  */
    348 /*PRINTFLIKE3*/
    349 static void
    350 fmd_ckpt_error(fmd_ckpt_t *ckp, int err, const char *format, ...)
    351 {
    352 	fmd_module_t *mp = ckp->ckp_mp;
    353 	va_list ap;
    354 
    355 	va_start(ap, format);
    356 	fmd_verror(err, format, ap);
    357 	va_end(ap);
    358 
    359 	if (fmd_module_locked(mp))
    360 		fmd_module_unlock(mp);
    361 
    362 	fmd_ckpt_destroy(ckp);
    363 	fmd_module_abort(mp, err);
    364 }
    365 
    366 static fcf_secidx_t
    367 fmd_ckpt_section(fmd_ckpt_t *ckp, const void *data, uint_t type, uint64_t size)
    368 {
    369 	const fmd_ckpt_desc_t *dp;
    370 
    371 	ASSERT(type < sizeof (_fmd_ckpt_sections) / sizeof (fmd_ckpt_desc_t));
    372 	dp = &_fmd_ckpt_sections[type];
    373 
    374 	ckp->ckp_ptr = (uchar_t *)
    375 	    P2ROUNDUP((uintptr_t)ckp->ckp_ptr, dp->secd_align);
    376 
    377 	ckp->ckp_secp->fcfs_type = type;
    378 	ckp->ckp_secp->fcfs_align = dp->secd_align;
    379 	ckp->ckp_secp->fcfs_flags = 0;
    380 	ckp->ckp_secp->fcfs_entsize = dp->secd_entsize;
    381 	ckp->ckp_secp->fcfs_offset = (size_t)(ckp->ckp_ptr - ckp->ckp_buf);
    382 	ckp->ckp_secp->fcfs_size = size;
    383 
    384 	/*
    385 	 * If the data pointer is non-NULL, copy the data to our buffer; else
    386 	 * the caller is responsible for doing so and updating ckp->ckp_ptr.
    387 	 */
    388 	if (data != NULL) {
    389 		bcopy(data, ckp->ckp_ptr, size);
    390 		ckp->ckp_ptr += size;
    391 	}
    392 
    393 	ckp->ckp_secp++;
    394 	return (ckp->ckp_secs++);
    395 }
    396 
    397 static fcf_stridx_t
    398 fmd_ckpt_string(fmd_ckpt_t *ckp, const char *s)
    399 {
    400 	fcf_stridx_t idx = (fcf_stridx_t)(ckp->ckp_strp - ckp->ckp_strs);
    401 
    402 	(void) strcpy(ckp->ckp_strp, s);
    403 	ckp->ckp_strp += strlen(s) + 1;
    404 
    405 	return (idx);
    406 }
    407 
    408 static int
    409 fmd_ckpt_alloc(fmd_ckpt_t *ckp, uint64_t gen)
    410 {
    411 	/*
    412 	 * We've added up all the sections by now: add two more for SECT_NONE
    413 	 * and SECT_STRTAB, and add the size of the section header table and
    414 	 * string table to the total size.  We know that the fcf_hdr_t is
    415 	 * aligned so that that fcf_sec_t's can follow it, and that fcf_sec_t
    416 	 * is aligned so that any section can follow it, so no extra padding
    417 	 * bytes need to be allocated between any of these items.
    418 	 */
    419 	ckp->ckp_secs += 2; /* for FCF_SECT_NONE and FCF_SECT_STRTAB */
    420 	ckp->ckp_size += sizeof (fcf_sec_t) * ckp->ckp_secs;
    421 	ckp->ckp_size += ckp->ckp_strn;
    422 
    423 	TRACE((FMD_DBG_CKPT, "alloc fcf buf size %u", ckp->ckp_size));
    424 	ckp->ckp_buf = fmd_zalloc(ckp->ckp_size, FMD_NOSLEEP);
    425 
    426 	if (ckp->ckp_buf == NULL)
    427 		return (-1); /* errno is set for us */
    428 
    429 	ckp->ckp_hdr = (void *)ckp->ckp_buf;
    430 
    431 	ckp->ckp_hdr->fcfh_ident[FCF_ID_MAG0] = FCF_MAG_MAG0;
    432 	ckp->ckp_hdr->fcfh_ident[FCF_ID_MAG1] = FCF_MAG_MAG1;
    433 	ckp->ckp_hdr->fcfh_ident[FCF_ID_MAG2] = FCF_MAG_MAG2;
    434 	ckp->ckp_hdr->fcfh_ident[FCF_ID_MAG3] = FCF_MAG_MAG3;
    435 	ckp->ckp_hdr->fcfh_ident[FCF_ID_MODEL] = FCF_MODEL_NATIVE;
    436 	ckp->ckp_hdr->fcfh_ident[FCF_ID_ENCODING] = FCF_ENCODE_NATIVE;
    437 	ckp->ckp_hdr->fcfh_ident[FCF_ID_VERSION] = FCF_VERSION;
    438 
    439 	ckp->ckp_hdr->fcfh_hdrsize = sizeof (fcf_hdr_t);
    440 	ckp->ckp_hdr->fcfh_secsize = sizeof (fcf_sec_t);
    441 	ckp->ckp_hdr->fcfh_secnum = ckp->ckp_secs;
    442 	ckp->ckp_hdr->fcfh_secoff = sizeof (fcf_hdr_t);
    443 	ckp->ckp_hdr->fcfh_filesz = ckp->ckp_size;
    444 	ckp->ckp_hdr->fcfh_cgen = gen;
    445 
    446 	ckp->ckp_secs = 0; /* reset section counter for second pass */
    447 	ckp->ckp_secp = (void *)(ckp->ckp_buf + sizeof (fcf_hdr_t));
    448 	ckp->ckp_strs = (char *)ckp->ckp_buf + ckp->ckp_size - ckp->ckp_strn;
    449 	ckp->ckp_strp = ckp->ckp_strs + 1; /* use first byte as \0 */
    450 	ckp->ckp_ptr = (uchar_t *)(ckp->ckp_secp + ckp->ckp_hdr->fcfh_secnum);
    451 
    452 	(void) fmd_ckpt_section(ckp, NULL, FCF_SECT_NONE, 0);
    453 	return (0);
    454 }
    455 
    456 static int
    457 fmd_ckpt_commit(fmd_ckpt_t *ckp)
    458 {
    459 	fcf_sec_t *secbase = (void *)(ckp->ckp_buf + sizeof (fcf_hdr_t));
    460 	size_t stroff = ckp->ckp_size - ckp->ckp_strn;
    461 
    462 	/*
    463 	 * Before committing the checkpoint, we assert that fmd_ckpt_t's sizes
    464 	 * and current pointer locations all add up appropriately.  Any ASSERTs
    465 	 * which trip here likely indicate an inconsistency in the code for the
    466 	 * reservation pass and the buffer update pass of the FCF subroutines.
    467 	 */
    468 	ASSERT((size_t)(ckp->ckp_ptr - ckp->ckp_buf) == stroff);
    469 	(void) fmd_ckpt_section(ckp, NULL, FCF_SECT_STRTAB, ckp->ckp_strn);
    470 	ckp->ckp_ptr += ckp->ckp_strn; /* string table is already filled in */
    471 
    472 	ASSERT(ckp->ckp_secs == ckp->ckp_hdr->fcfh_secnum);
    473 	ASSERT(ckp->ckp_secp == secbase + ckp->ckp_hdr->fcfh_secnum);
    474 	ASSERT(ckp->ckp_ptr == ckp->ckp_buf + ckp->ckp_hdr->fcfh_filesz);
    475 
    476 	if (write(ckp->ckp_fd, ckp->ckp_buf, ckp->ckp_size) != ckp->ckp_size ||
    477 	    fsync(ckp->ckp_fd) != 0 || close(ckp->ckp_fd) != 0)
    478 		return (-1); /* errno is set for us */
    479 
    480 	ckp->ckp_fd = -1; /* fd is now closed */
    481 	return (rename(ckp->ckp_src, ckp->ckp_dst) != 0);
    482 }
    483 
    484 static void
    485 fmd_ckpt_resv(fmd_ckpt_t *ckp, size_t size, size_t align)
    486 {
    487 	if (size != 0) {
    488 		ckp->ckp_size = P2ROUNDUP(ckp->ckp_size, align) + size;
    489 		ckp->ckp_secs++;
    490 	}
    491 }
    492 
    493 static void
    494 fmd_ckpt_resv_buf(fmd_buf_t *bp, fmd_ckpt_t *ckp)
    495 {
    496 	ckp->ckp_size = P2ROUNDUP(ckp->ckp_size, _MAX_ALIGNMENT) + bp->buf_size;
    497 	ckp->ckp_strn += strlen(bp->buf_name) + 1;
    498 	ckp->ckp_secs++;
    499 }
    500 
    501 static void
    502 fmd_ckpt_save_buf(fmd_buf_t *bp, fmd_ckpt_t *ckp)
    503 {
    504 	fcf_buf_t *fcfb = ckp->ckp_arg;
    505 
    506 	fcfb->fcfb_name = fmd_ckpt_string(ckp, bp->buf_name);
    507 	fcfb->fcfb_data = fmd_ckpt_section(ckp,
    508 	    bp->buf_data, FCF_SECT_BUFFER, bp->buf_size);
    509 
    510 	ckp->ckp_arg = fcfb + 1;
    511 }
    512 
    513 static void
    514 fmd_ckpt_save_event(fmd_ckpt_t *ckp, fmd_event_t *e)
    515 {
    516 	fcf_event_t *fcfe = (void *)ckp->ckp_ptr;
    517 	fmd_event_impl_t *ep = (fmd_event_impl_t *)e;
    518 	fmd_log_t *lp = ep->ev_log;
    519 
    520 	fcfe->fcfe_todsec = ep->ev_time.ftv_sec;
    521 	fcfe->fcfe_todnsec = ep->ev_time.ftv_nsec;
    522 	fcfe->fcfe_major = lp ? major(lp->log_stat.st_dev) : -1U;
    523 	fcfe->fcfe_minor = lp ? minor(lp->log_stat.st_dev) : -1U;
    524 	fcfe->fcfe_inode = lp ? lp->log_stat.st_ino : -1ULL;
    525 	fcfe->fcfe_offset = ep->ev_off;
    526 
    527 	ckp->ckp_ptr += sizeof (fcf_event_t);
    528 }
    529 
    530 static void
    531 fmd_ckpt_save_nvlist(fmd_ckpt_t *ckp, nvlist_t *nvl)
    532 {
    533 	fcf_nvl_t *fcfn = (void *)ckp->ckp_ptr;
    534 	char *nvbuf = (char *)ckp->ckp_ptr + sizeof (fcf_nvl_t);
    535 	size_t nvsize = 0;
    536 
    537 	(void) nvlist_size(nvl, &nvsize, NV_ENCODE_NATIVE);
    538 	fcfn->fcfn_size = (uint64_t)nvsize;
    539 
    540 	(void) nvlist_pack(nvl, &nvbuf, &nvsize, NV_ENCODE_NATIVE, 0);
    541 	ckp->ckp_ptr += sizeof (fcf_nvl_t) + nvsize;
    542 
    543 	ckp->ckp_ptr = (uchar_t *)
    544 	    P2ROUNDUP((uintptr_t)ckp->ckp_ptr, sizeof (uint64_t));
    545 }
    546 
    547 static void
    548 fmd_ckpt_resv_serd(fmd_serd_eng_t *sgp, fmd_ckpt_t *ckp)
    549 {
    550 	fmd_ckpt_resv(ckp,
    551 	    sizeof (fcf_event_t) * sgp->sg_count, sizeof (uint64_t));
    552 
    553 	ckp->ckp_strn += strlen(sgp->sg_name) + 1;
    554 }
    555 
    556 static void
    557 fmd_ckpt_save_serd(fmd_serd_eng_t *sgp, fmd_ckpt_t *ckp)
    558 {
    559 	fcf_serd_t *fcfd = ckp->ckp_arg;
    560 	fcf_secidx_t evsec = FCF_SECT_NONE;
    561 	fmd_serd_elem_t *sep;
    562 
    563 	if (sgp->sg_count != 0) {
    564 		evsec = fmd_ckpt_section(ckp, NULL, FCF_SECT_EVENTS,
    565 		    sizeof (fcf_event_t) * sgp->sg_count);
    566 
    567 		for (sep = fmd_list_next(&sgp->sg_list);
    568 		    sep != NULL; sep = fmd_list_next(sep))
    569 			fmd_ckpt_save_event(ckp, sep->se_event);
    570 	}
    571 
    572 	fcfd->fcfd_name = fmd_ckpt_string(ckp, sgp->sg_name);
    573 	fcfd->fcfd_events = evsec;
    574 	fcfd->fcfd_pad = 0;
    575 	fcfd->fcfd_n = sgp->sg_n;
    576 	fcfd->fcfd_t = sgp->sg_t;
    577 
    578 	ckp->ckp_arg = fcfd + 1;
    579 }
    580 
    581 static void
    582 fmd_ckpt_resv_case(fmd_ckpt_t *ckp, fmd_case_t *cp)
    583 {
    584 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    585 	fmd_case_susp_t *cis;
    586 	uint_t n;
    587 
    588 	if (cip->ci_xprt != NULL)
    589 		return; /* do not checkpoint cases from remote transports */
    590 
    591 	n = fmd_buf_hash_count(&cip->ci_bufs);
    592 	fmd_buf_hash_apply(&cip->ci_bufs, (fmd_buf_f *)fmd_ckpt_resv_buf, ckp);
    593 	fmd_ckpt_resv(ckp, sizeof (fcf_buf_t) * n, sizeof (uint32_t));
    594 
    595 	if (cip->ci_principal != NULL)
    596 		fmd_ckpt_resv(ckp, sizeof (fcf_event_t), sizeof (uint64_t));
    597 
    598 	fmd_ckpt_resv(ckp,
    599 	    sizeof (fcf_event_t) * cip->ci_nitems, sizeof (uint64_t));
    600 
    601 	if (cip->ci_nsuspects != 0)
    602 		ckp->ckp_size = P2ROUNDUP(ckp->ckp_size, sizeof (uint64_t));
    603 
    604 	cip->ci_nvsz = 0; /* compute size of packed suspect nvlist array */
    605 
    606 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    607 		size_t nvsize = 0;
    608 
    609 		(void) nvlist_size(cis->cis_nvl, &nvsize, NV_ENCODE_NATIVE);
    610 		cip->ci_nvsz += sizeof (fcf_nvl_t) + nvsize;
    611 		cip->ci_nvsz = P2ROUNDUP(cip->ci_nvsz, sizeof (uint64_t));
    612 	}
    613 
    614 	fmd_ckpt_resv(ckp, cip->ci_nvsz, sizeof (uint64_t));
    615 	fmd_ckpt_resv(ckp, sizeof (fcf_case_t), sizeof (uint32_t));
    616 	ckp->ckp_strn += strlen(cip->ci_uuid) + 1;
    617 }
    618 
    619 static void
    620 fmd_ckpt_save_case(fmd_ckpt_t *ckp, fmd_case_t *cp)
    621 {
    622 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    623 
    624 	fmd_case_item_t *cit;
    625 	fmd_case_susp_t *cis;
    626 	fcf_case_t fcfc;
    627 	uint_t n;
    628 
    629 	fcf_secidx_t bufsec = FCF_SECIDX_NONE;
    630 	fcf_secidx_t evsec = FCF_SECIDX_NONE;
    631 	fcf_secidx_t nvsec = FCF_SECIDX_NONE;
    632 	fcf_secidx_t prsec = FCF_SECIDX_NONE;
    633 
    634 	if (cip->ci_xprt != NULL)
    635 		return; /* do not checkpoint cases from remote transports */
    636 
    637 	if ((n = fmd_buf_hash_count(&cip->ci_bufs)) != 0) {
    638 		size_t size = sizeof (fcf_buf_t) * n;
    639 		fcf_buf_t *bufs = ckp->ckp_arg = fmd_alloc(size, FMD_SLEEP);
    640 
    641 		fmd_buf_hash_apply(&cip->ci_bufs,
    642 		    (fmd_buf_f *)fmd_ckpt_save_buf, ckp);
    643 
    644 		bufsec = fmd_ckpt_section(ckp, bufs, FCF_SECT_BUFS, size);
    645 		fmd_free(bufs, size);
    646 	}
    647 
    648 	if (cip->ci_principal != NULL) {
    649 		prsec = fmd_ckpt_section(ckp, NULL, FCF_SECT_EVENTS,
    650 		    sizeof (fcf_event_t));
    651 
    652 		fmd_ckpt_save_event(ckp, cip->ci_principal);
    653 	}
    654 
    655 	if (cip->ci_nitems != 0) {
    656 		evsec = fmd_ckpt_section(ckp, NULL, FCF_SECT_EVENTS,
    657 		    sizeof (fcf_event_t) * cip->ci_nitems);
    658 
    659 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
    660 			fmd_ckpt_save_event(ckp, cit->cit_event);
    661 	}
    662 
    663 	if (cip->ci_nsuspects != 0) {
    664 		nvsec = fmd_ckpt_section(ckp, NULL,
    665 		    FCF_SECT_NVLISTS, cip->ci_nvsz);
    666 
    667 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
    668 			fmd_ckpt_save_nvlist(ckp, cis->cis_nvl);
    669 	}
    670 
    671 	fcfc.fcfc_uuid = fmd_ckpt_string(ckp, cip->ci_uuid);
    672 	fcfc.fcfc_bufs = bufsec;
    673 	fcfc.fcfc_principal = prsec;
    674 	fcfc.fcfc_events = evsec;
    675 	fcfc.fcfc_suspects = nvsec;
    676 
    677 	switch (cip->ci_state) {
    678 	case FMD_CASE_UNSOLVED:
    679 		fcfc.fcfc_state = FCF_CASE_UNSOLVED;
    680 		break;
    681 	case FMD_CASE_SOLVED:
    682 		fcfc.fcfc_state = FCF_CASE_SOLVED;
    683 		break;
    684 	case FMD_CASE_CLOSE_WAIT:
    685 		fcfc.fcfc_state = FCF_CASE_CLOSE_WAIT;
    686 		break;
    687 	default:
    688 		fmd_panic("case %p (%s) has invalid state %u",
    689 		    (void *)cp, cip->ci_uuid, cip->ci_state);
    690 	}
    691 
    692 	(void) fmd_ckpt_section(ckp, &fcfc, FCF_SECT_CASE, sizeof (fcf_case_t));
    693 }
    694 
    695 static void
    696 fmd_ckpt_resv_module(fmd_ckpt_t *ckp, fmd_module_t *mp)
    697 {
    698 	fmd_case_t *cp;
    699 	uint_t n;
    700 
    701 	for (cp = fmd_list_next(&mp->mod_cases); cp; cp = fmd_list_next(cp))
    702 		fmd_ckpt_resv_case(ckp, cp);
    703 
    704 	n = fmd_serd_hash_count(&mp->mod_serds);
    705 	fmd_serd_hash_apply(&mp->mod_serds,
    706 	    (fmd_serd_eng_f *)fmd_ckpt_resv_serd, ckp);
    707 	fmd_ckpt_resv(ckp, sizeof (fcf_serd_t) * n, sizeof (uint64_t));
    708 
    709 	n = fmd_buf_hash_count(&mp->mod_bufs);
    710 	fmd_buf_hash_apply(&mp->mod_bufs, (fmd_buf_f *)fmd_ckpt_resv_buf, ckp);
    711 	fmd_ckpt_resv(ckp, sizeof (fcf_buf_t) * n, sizeof (uint32_t));
    712 
    713 	fmd_ckpt_resv(ckp, sizeof (fcf_module_t), sizeof (uint32_t));
    714 	ckp->ckp_strn += strlen(mp->mod_name) + 1;
    715 	ckp->ckp_strn += strlen(mp->mod_path) + 1;
    716 	ckp->ckp_strn += strlen(mp->mod_info->fmdi_desc) + 1;
    717 	ckp->ckp_strn += strlen(mp->mod_info->fmdi_vers) + 1;
    718 }
    719 
    720 static void
    721 fmd_ckpt_save_module(fmd_ckpt_t *ckp, fmd_module_t *mp)
    722 {
    723 	fcf_secidx_t bufsec = FCF_SECIDX_NONE;
    724 	fcf_module_t fcfm;
    725 	fmd_case_t *cp;
    726 	uint_t n;
    727 
    728 	for (cp = fmd_list_next(&mp->mod_cases); cp; cp = fmd_list_next(cp))
    729 		fmd_ckpt_save_case(ckp, cp);
    730 
    731 	if ((n = fmd_serd_hash_count(&mp->mod_serds)) != 0) {
    732 		size_t size = sizeof (fcf_serd_t) * n;
    733 		fcf_serd_t *serds = ckp->ckp_arg = fmd_alloc(size, FMD_SLEEP);
    734 
    735 		fmd_serd_hash_apply(&mp->mod_serds,
    736 		    (fmd_serd_eng_f *)fmd_ckpt_save_serd, ckp);
    737 
    738 		(void) fmd_ckpt_section(ckp, serds, FCF_SECT_SERD, size);
    739 		fmd_free(serds, size);
    740 	}
    741 
    742 	if ((n = fmd_buf_hash_count(&mp->mod_bufs)) != 0) {
    743 		size_t size = sizeof (fcf_buf_t) * n;
    744 		fcf_buf_t *bufs = ckp->ckp_arg = fmd_alloc(size, FMD_SLEEP);
    745 
    746 		fmd_buf_hash_apply(&mp->mod_bufs,
    747 		    (fmd_buf_f *)fmd_ckpt_save_buf, ckp);
    748 
    749 		bufsec = fmd_ckpt_section(ckp, bufs, FCF_SECT_BUFS, size);
    750 		fmd_free(bufs, size);
    751 	}
    752 
    753 	fcfm.fcfm_name = fmd_ckpt_string(ckp, mp->mod_name);
    754 	fcfm.fcfm_path = fmd_ckpt_string(ckp, mp->mod_path);
    755 	fcfm.fcfm_desc = fmd_ckpt_string(ckp, mp->mod_info->fmdi_desc);
    756 	fcfm.fcfm_vers = fmd_ckpt_string(ckp, mp->mod_info->fmdi_vers);
    757 	fcfm.fcfm_bufs = bufsec;
    758 
    759 	(void) fmd_ckpt_section(ckp, &fcfm,
    760 	    FCF_SECT_MODULE, sizeof (fcf_module_t));
    761 }
    762 
    763 void
    764 fmd_ckpt_save(fmd_module_t *mp)
    765 {
    766 	struct stat64 st;
    767 	char path[PATH_MAX];
    768 	mode_t dirmode;
    769 
    770 	hrtime_t now = gethrtime();
    771 	fmd_ckpt_t ckp;
    772 	int err;
    773 
    774 	ASSERT(fmd_module_locked(mp));
    775 
    776 	/*
    777 	 * If checkpointing is disabled for the module, just return.  We must
    778 	 * commit the module state anyway to transition pending log events.
    779 	 */
    780 	if (mp->mod_stats->ms_ckpt_save.fmds_value.bool == FMD_B_FALSE) {
    781 		fmd_module_commit(mp);
    782 		return;
    783 	}
    784 
    785 	if (!(mp->mod_flags & (FMD_MOD_MDIRTY | FMD_MOD_CDIRTY)))
    786 		return; /* no checkpoint is necessary for this module */
    787 
    788 	TRACE((FMD_DBG_CKPT, "ckpt save begin %s %llu",
    789 	    mp->mod_name, mp->mod_gen + 1));
    790 
    791 	/*
    792 	 * If the per-module checkpoint directory isn't found or isn't of type
    793 	 * directory, move aside whatever is there (if anything) and attempt
    794 	 * to mkdir(2) a new module checkpoint directory.  If this fails, we
    795 	 * have no choice but to abort the checkpoint and try again later.
    796 	 */
    797 	if (stat64(mp->mod_ckpt, &st) != 0 || !S_ISDIR(st.st_mode)) {
    798 		(void) snprintf(path, sizeof (path), "%s-", mp->mod_ckpt);
    799 		(void) rename(mp->mod_ckpt, path);
    800 		(void) fmd_conf_getprop(fmd.d_conf, "ckpt.dirmode", &dirmode);
    801 
    802 		if (mkdir(mp->mod_ckpt, dirmode) != 0) {
    803 			fmd_error(EFMD_CKPT_MKDIR,
    804 			    "failed to mkdir %s", mp->mod_ckpt);
    805 			return; /* return without clearing dirty bits */
    806 		}
    807 	}
    808 
    809 	/*
    810 	 * Create a temporary file to write out the checkpoint into, and create
    811 	 * a fmd_ckpt_t structure to manage construction of the checkpoint.  We
    812 	 * then figure out how much space will be required, and allocate it.
    813 	 */
    814 	if (fmd_ckpt_create(&ckp, mp) == -1) {
    815 		fmd_error(EFMD_CKPT_CREATE, "failed to create %s", ckp.ckp_src);
    816 		return;
    817 	}
    818 
    819 	fmd_ckpt_resv_module(&ckp, mp);
    820 
    821 	if (fmd_ckpt_alloc(&ckp, mp->mod_gen + 1) != 0) {
    822 		fmd_error(EFMD_CKPT_NOMEM, "failed to build %s", ckp.ckp_src);
    823 		fmd_ckpt_destroy(&ckp);
    824 		return;
    825 	}
    826 
    827 	/*
    828 	 * Fill in the checkpoint content, write it to disk, sync it, and then
    829 	 * atomically rename it to the destination path.  If this fails, we
    830 	 * have no choice but to leave all our dirty bits set and return.
    831 	 */
    832 	fmd_ckpt_save_module(&ckp, mp);
    833 	err = fmd_ckpt_commit(&ckp);
    834 	fmd_ckpt_destroy(&ckp);
    835 
    836 	if (err != 0) {
    837 		fmd_error(EFMD_CKPT_COMMIT, "failed to commit %s", ckp.ckp_dst);
    838 		return; /* return without clearing dirty bits */
    839 	}
    840 
    841 	fmd_module_commit(mp);
    842 	TRACE((FMD_DBG_CKPT, "ckpt save end %s", mp->mod_name));
    843 
    844 	mp->mod_stats->ms_ckpt_cnt.fmds_value.ui64++;
    845 	mp->mod_stats->ms_ckpt_time.fmds_value.ui64 += gethrtime() - now;
    846 
    847 	fmd_dprintf(FMD_DBG_CKPT, "saved checkpoint of %s (%llu)\n",
    848 	    mp->mod_name, mp->mod_gen);
    849 }
    850 
    851 /*
    852  * Utility function to retrieve a pointer to a section's header and verify that
    853  * it is of the expected type or it is a FCF_SECT_NONE reference.
    854  */
    855 static const fcf_sec_t *
    856 fmd_ckpt_secptr(fmd_ckpt_t *ckp, fcf_secidx_t sid, uint_t type)
    857 {
    858 	const fcf_sec_t *sp = (void *)(ckp->ckp_buf +
    859 	    ckp->ckp_hdr->fcfh_secoff + ckp->ckp_hdr->fcfh_secsize * sid);
    860 
    861 	return (sid < ckp->ckp_secs && (sp->fcfs_type == type ||
    862 	    sp->fcfs_type == FCF_SECT_NONE) ? sp : NULL);
    863 }
    864 
    865 /*
    866  * Utility function to retrieve the data pointer for a particular section.  The
    867  * validity of the header values has already been checked by fmd_ckpt_open().
    868  */
    869 static const void *
    870 fmd_ckpt_dataptr(fmd_ckpt_t *ckp, const fcf_sec_t *sp)
    871 {
    872 	return (ckp->ckp_buf + sp->fcfs_offset);
    873 }
    874 
    875 /*
    876  * Utility function to retrieve the end of the data region for a particular
    877  * section.  The validity of this value has been confirmed by fmd_ckpt_open().
    878  */
    879 static const void *
    880 fmd_ckpt_datalim(fmd_ckpt_t *ckp, const fcf_sec_t *sp)
    881 {
    882 	return (ckp->ckp_buf + sp->fcfs_offset + sp->fcfs_size);
    883 }
    884 
    885 /*
    886  * Utility function to retrieve a string pointer (fcf_stridx_t).  If the string
    887  * index is valid, the string data is returned; otherwise 'defstr' is returned.
    888  */
    889 static const char *
    890 fmd_ckpt_strptr(fmd_ckpt_t *ckp, fcf_stridx_t sid, const char *defstr)
    891 {
    892 	return (sid < ckp->ckp_strn ? ckp->ckp_strs + sid : defstr);
    893 }
    894 
    895 static void
    896 fmd_ckpt_restore_events(fmd_ckpt_t *ckp, fcf_secidx_t sid,
    897     void (*func)(void *, fmd_event_t *), void *arg)
    898 {
    899 	const fcf_event_t *fcfe;
    900 	const fcf_sec_t *sp;
    901 	fmd_timeval_t ftv;
    902 	fmd_log_t *lp, *errlp;
    903 	uint_t i, n;
    904 	uint32_t e_maj, e_min;
    905 	uint64_t e_ino;
    906 
    907 	if ((sp = fmd_ckpt_secptr(ckp, sid, FCF_SECT_EVENTS)) == NULL) {
    908 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
    909 		    "invalid link to section %u: expected events\n", sid);
    910 	}
    911 
    912 	if (sp->fcfs_size == 0)
    913 		return; /* empty events section or type none */
    914 
    915 	fcfe = fmd_ckpt_dataptr(ckp, sp);
    916 	n = sp->fcfs_size / sp->fcfs_entsize;
    917 
    918 	/*
    919 	 * Hold the reader lock on log pointers to block log rotation during
    920 	 * the section restore so that we can safely insert refs to d_errlog.
    921 	 */
    922 	(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
    923 	errlp = fmd.d_errlog;
    924 
    925 	e_maj = major(errlp->log_stat.st_dev);
    926 	e_min = minor(errlp->log_stat.st_dev);
    927 	e_ino = errlp->log_stat.st_ino;
    928 
    929 	for (i = 0; i < n; i++) {
    930 		fmd_event_t *ep;
    931 
    932 		ftv.ftv_sec = fcfe->fcfe_todsec;
    933 		ftv.ftv_nsec = fcfe->fcfe_todnsec;
    934 
    935 		if (e_ino == fcfe->fcfe_inode &&
    936 		    e_maj == fcfe->fcfe_major &&
    937 		    e_min == fcfe->fcfe_minor)
    938 			lp = errlp;
    939 		else
    940 			lp = NULL;
    941 
    942 		ep = fmd_event_recreate(FMD_EVT_PROTOCOL,
    943 		    &ftv, NULL, NULL, lp, fcfe->fcfe_offset, 0);
    944 		fmd_event_hold(ep);
    945 		func(arg, ep);
    946 		fmd_event_rele(ep);
    947 
    948 		fcfe = (fcf_event_t *)((uintptr_t)fcfe + sp->fcfs_entsize);
    949 	}
    950 
    951 	(void) pthread_rwlock_unlock(&fmd.d_log_lock);
    952 }
    953 
    954 static int
    955 fmd_ckpt_restore_suspects(fmd_ckpt_t *ckp, fmd_case_t *cp, fcf_secidx_t sid)
    956 {
    957 	const fcf_nvl_t *fcfn, *endn;
    958 	const fcf_sec_t *sp;
    959 	nvlist_t *nvl;
    960 	int err, i;
    961 
    962 	if ((sp = fmd_ckpt_secptr(ckp, sid, FCF_SECT_NVLISTS)) == NULL) {
    963 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
    964 		    "invalid link to section %u: expected nvlists\n", sid);
    965 	}
    966 
    967 	fcfn = fmd_ckpt_dataptr(ckp, sp);
    968 	endn = fmd_ckpt_datalim(ckp, sp);
    969 
    970 	for (i = 0; fcfn < endn; i++) {
    971 		char *data = (char *)fcfn + sp->fcfs_entsize;
    972 		size_t size = (size_t)fcfn->fcfn_size;
    973 
    974 		if (fcfn->fcfn_size > (size_t)((char *)endn - data)) {
    975 			fmd_ckpt_error(ckp, EFMD_CKPT_INVAL, "nvlist %u [%d] "
    976 			    "size %u exceeds buffer\n", sid, i, size);
    977 		}
    978 
    979 		if ((err = nvlist_xunpack(data, size, &nvl, &fmd.d_nva)) != 0) {
    980 			fmd_ckpt_error(ckp, EFMD_CKPT_INVAL, "failed to "
    981 			    "unpack nvlist %u [%d]: %s\n", sid, i,
    982 			    fmd_strerror(err));
    983 		}
    984 
    985 		fmd_case_insert_suspect(cp, nvl);
    986 
    987 		size = sp->fcfs_entsize + fcfn->fcfn_size;
    988 		size = P2ROUNDUP(size, sizeof (uint64_t));
    989 		fcfn = (fcf_nvl_t *)((uintptr_t)fcfn + size);
    990 	}
    991 
    992 	return (i);
    993 }
    994 
    995 static void
    996 fmd_ckpt_restore_bufs(fmd_ckpt_t *ckp, fmd_module_t *mp,
    997     fmd_case_t *cp, fcf_secidx_t sid)
    998 {
    999 	const fcf_sec_t *sp, *dsp;
   1000 	const fcf_buf_t *fcfb;
   1001 	uint_t i, n;
   1002 
   1003 	if ((sp = fmd_ckpt_secptr(ckp, sid, FCF_SECT_BUFS)) == NULL) {
   1004 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
   1005 		    "invalid link to section %u: expected bufs\n", sid);
   1006 	}
   1007 
   1008 	if (sp->fcfs_size == 0)
   1009 		return; /* empty events section or type none */
   1010 
   1011 	fcfb = fmd_ckpt_dataptr(ckp, sp);
   1012 	n = sp->fcfs_size / sp->fcfs_entsize;
   1013 
   1014 	for (i = 0; i < n; i++) {
   1015 		dsp = fmd_ckpt_secptr(ckp, fcfb->fcfb_data, FCF_SECT_BUFFER);
   1016 
   1017 		if (dsp == NULL) {
   1018 			fmd_ckpt_error(ckp, EFMD_CKPT_INVAL, "invalid %u "
   1019 			    "buffer link %u\n", sid, fcfb->fcfb_data);
   1020 		}
   1021 
   1022 		fmd_buf_write((fmd_hdl_t *)mp, cp,
   1023 		    fmd_ckpt_strptr(ckp, fcfb->fcfb_name, "<CORRUPT>"),
   1024 		    ckp->ckp_buf + dsp->fcfs_offset, dsp->fcfs_size);
   1025 
   1026 		fcfb = (fcf_buf_t *)((uintptr_t)fcfb + sp->fcfs_entsize);
   1027 	}
   1028 }
   1029 
   1030 static void
   1031 fmd_ckpt_restore_case(fmd_ckpt_t *ckp, fmd_module_t *mp, const fcf_sec_t *sp)
   1032 {
   1033 	const fcf_case_t *fcfc = fmd_ckpt_dataptr(ckp, sp);
   1034 	const char *uuid = fmd_ckpt_strptr(ckp, fcfc->fcfc_uuid, NULL);
   1035 	fmd_case_t *cp;
   1036 	int n;
   1037 
   1038 	if (uuid == NULL || fcfc->fcfc_state > FCF_CASE_CLOSE_WAIT) {
   1039 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL, "corrupt %u case uuid "
   1040 		    "and/or state\n", (uint_t)(sp - ckp->ckp_secp));
   1041 	}
   1042 
   1043 	fmd_module_lock(mp);
   1044 
   1045 	if ((cp = fmd_case_recreate(mp, NULL,
   1046 	    fcfc->fcfc_state != FCF_CASE_UNSOLVED ? FCF_CASE_SOLVED :
   1047 	    FMD_CASE_UNSOLVED, uuid, NULL)) == NULL) {
   1048 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
   1049 		    "duplicate case uuid: %s\n", uuid);
   1050 	}
   1051 
   1052 	fmd_ckpt_restore_events(ckp, fcfc->fcfc_principal,
   1053 	    (void (*)(void *, fmd_event_t *))fmd_case_insert_principal, cp);
   1054 
   1055 	fmd_ckpt_restore_events(ckp, fcfc->fcfc_events,
   1056 	    (void (*)(void *, fmd_event_t *))fmd_case_insert_event, cp);
   1057 
   1058 	/*
   1059 	 * Once solved, treat suspects from resource cache as master copy.
   1060 	 *
   1061 	 * If !fmd.d_running, this module must be a builtin, and so we don't
   1062 	 * want to restore suspects or call fmd_case_transition_update() at this
   1063 	 * stage. The suspects will be added later from the resource cache.
   1064 	 * Calling fmd_case_transition("SOLVED") is OK here as the state is
   1065 	 * already solved, so all it does is update the case flags.
   1066 	 */
   1067 	if (fmd.d_running && (n = ((fmd_case_impl_t *)cp)->ci_nsuspects) == 0)
   1068 		n = fmd_ckpt_restore_suspects(ckp, cp, fcfc->fcfc_suspects);
   1069 
   1070 	if (!fmd.d_running)
   1071 		fmd_case_transition(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED);
   1072 	else if (fcfc->fcfc_state == FCF_CASE_SOLVED)
   1073 		fmd_case_transition_update(cp, FMD_CASE_SOLVED, FMD_CF_SOLVED);
   1074 	else if (fcfc->fcfc_state == FCF_CASE_CLOSE_WAIT && n != 0)
   1075 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_SOLVED);
   1076 	else if (fcfc->fcfc_state == FCF_CASE_CLOSE_WAIT && n == 0)
   1077 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
   1078 
   1079 	fmd_module_unlock(mp);
   1080 	fmd_ckpt_restore_bufs(ckp, mp, cp, fcfc->fcfc_bufs);
   1081 }
   1082 
   1083 static void
   1084 fmd_ckpt_restore_serd(fmd_ckpt_t *ckp, fmd_module_t *mp, const fcf_sec_t *sp)
   1085 {
   1086 	const fcf_serd_t *fcfd = fmd_ckpt_dataptr(ckp, sp);
   1087 	uint_t i, n = sp->fcfs_size / sp->fcfs_entsize;
   1088 	const fcf_sec_t *esp;
   1089 	const char *s;
   1090 
   1091 	for (i = 0; i < n; i++) {
   1092 		esp = fmd_ckpt_secptr(ckp, fcfd->fcfd_events, FCF_SECT_EVENTS);
   1093 
   1094 		if (esp == NULL) {
   1095 			fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
   1096 			    "invalid events link %u\n", fcfd->fcfd_events);
   1097 		}
   1098 
   1099 		if ((s = fmd_ckpt_strptr(ckp, fcfd->fcfd_name, NULL)) == NULL) {
   1100 			fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
   1101 			    "serd name %u is corrupt\n", fcfd->fcfd_name);
   1102 		}
   1103 
   1104 		fmd_serd_create((fmd_hdl_t *)mp, s, fcfd->fcfd_n, fcfd->fcfd_t);
   1105 		fmd_module_lock(mp);
   1106 
   1107 		fmd_ckpt_restore_events(ckp, fcfd->fcfd_events,
   1108 		    (void (*)(void *, fmd_event_t *))fmd_serd_eng_record,
   1109 		    fmd_serd_eng_lookup(&mp->mod_serds, s));
   1110 
   1111 		fmd_module_unlock(mp);
   1112 		fcfd = (fcf_serd_t *)((uintptr_t)fcfd + sp->fcfs_entsize);
   1113 	}
   1114 }
   1115 
   1116 static void
   1117 fmd_ckpt_restore_module(fmd_ckpt_t *ckp, fmd_module_t *mp)
   1118 {
   1119 	const fcf_module_t *fcfm = fmd_ckpt_dataptr(ckp, ckp->ckp_modp);
   1120 	const fcf_sec_t *sp;
   1121 	uint_t i;
   1122 
   1123 	if (strcmp(mp->mod_name, fmd_ckpt_strptr(ckp, fcfm->fcfm_name, "")) ||
   1124 	    strcmp(mp->mod_path, fmd_ckpt_strptr(ckp, fcfm->fcfm_path, ""))) {
   1125 		fmd_ckpt_error(ckp, EFMD_CKPT_INVAL,
   1126 		    "checkpoint is not for module %s\n", mp->mod_name);
   1127 	}
   1128 
   1129 	for (i = 0; i < ckp->ckp_secs; i++) {
   1130 		sp = (void *)(ckp->ckp_buf +
   1131 		    ckp->ckp_hdr->fcfh_secoff + ckp->ckp_hdr->fcfh_secsize * i);
   1132 
   1133 		switch (sp->fcfs_type) {
   1134 		case FCF_SECT_CASE:
   1135 			fmd_ckpt_restore_case(ckp, mp, sp);
   1136 			break;
   1137 		case FCF_SECT_SERD:
   1138 			fmd_ckpt_restore_serd(ckp, mp, sp);
   1139 			break;
   1140 		}
   1141 	}
   1142 
   1143 	fmd_ckpt_restore_bufs(ckp, mp, NULL, fcfm->fcfm_bufs);
   1144 	mp->mod_gen = ckp->ckp_hdr->fcfh_cgen;
   1145 }
   1146 
   1147 /*
   1148  * Restore a checkpoint for the specified module.  Any errors which occur
   1149  * during restore will call fmd_ckpt_error() or trigger an fmd_api_error(),
   1150  * either of which will automatically unlock the module and trigger an abort.
   1151  */
   1152 void
   1153 fmd_ckpt_restore(fmd_module_t *mp)
   1154 {
   1155 	fmd_ckpt_t ckp;
   1156 
   1157 	if (mp->mod_stats->ms_ckpt_restore.fmds_value.bool == FMD_B_FALSE)
   1158 		return; /* never restore checkpoints for this module */
   1159 
   1160 	TRACE((FMD_DBG_CKPT, "ckpt restore begin %s", mp->mod_name));
   1161 
   1162 	if (fmd_ckpt_open(&ckp, mp) == -1) {
   1163 		if (errno != ENOENT)
   1164 			fmd_error(EFMD_CKPT_OPEN, "can't open %s", ckp.ckp_src);
   1165 		TRACE((FMD_DBG_CKPT, "ckpt restore end %s", mp->mod_name));
   1166 		return;
   1167 	}
   1168 
   1169 	ASSERT(!fmd_module_locked(mp));
   1170 	fmd_ckpt_restore_module(&ckp, mp);
   1171 	fmd_ckpt_destroy(&ckp);
   1172 	fmd_module_clrdirty(mp);
   1173 
   1174 	TRACE((FMD_DBG_CKPT, "ckpt restore end %s", mp->mod_name));
   1175 	fmd_dprintf(FMD_DBG_CKPT, "restored checkpoint of %s\n", mp->mod_name);
   1176 }
   1177 
   1178 /*
   1179  * Delete the module's checkpoint file.  This is used by the ckpt.zero property
   1180  * code or by the fmadm reset RPC service path to force a checkpoint delete.
   1181  */
   1182 void
   1183 fmd_ckpt_delete(fmd_module_t *mp)
   1184 {
   1185 	char path[PATH_MAX];
   1186 
   1187 	(void) snprintf(path, sizeof (path),
   1188 	    "%s/%s", mp->mod_ckpt, mp->mod_name);
   1189 
   1190 	TRACE((FMD_DBG_CKPT, "delete %s ckpt", mp->mod_name));
   1191 
   1192 	if (unlink(path) != 0 && errno != ENOENT)
   1193 		fmd_error(EFMD_CKPT_DELETE, "failed to delete %s", path);
   1194 }
   1195 
   1196 /*
   1197  * Move aside the module's checkpoint file if checkpoint restore has failed.
   1198  * We rename the file rather than deleting it in the hopes that someone might
   1199  * send it to us for post-mortem analysis of whether we have a checkpoint bug.
   1200  */
   1201 void
   1202 fmd_ckpt_rename(fmd_module_t *mp)
   1203 {
   1204 	char src[PATH_MAX], dst[PATH_MAX];
   1205 
   1206 	(void) snprintf(src, sizeof (src), "%s/%s", mp->mod_ckpt, mp->mod_name);
   1207 	(void) snprintf(dst, sizeof (dst), "%s-", src);
   1208 
   1209 	TRACE((FMD_DBG_CKPT, "rename %s ckpt", mp->mod_name));
   1210 
   1211 	if (rename(src, dst) != 0 && errno != ENOENT)
   1212 		fmd_error(EFMD_CKPT_DELETE, "failed to rename %s", src);
   1213 }
   1214