Home | History | Annotate | Download | only in cpr
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/errno.h>
     30 #include <sys/cpuvar.h>
     31 #include <sys/vfs.h>
     32 #include <sys/vnode.h>
     33 #include <sys/pathname.h>
     34 #include <sys/callb.h>
     35 #include <sys/fs/ufs_inode.h>
     36 #include <vm/anon.h>
     37 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
     38 #include <sys/kmem.h>
     39 #include <sys/cpr.h>
     40 #include <sys/conf.h>
     41 #include <sys/machclock.h>
     42 
/*
 * CPR miscellaneous support routines
 */

/*
 * cpr_open: vn_open() on a kernel-space pathname; the file mode argument
 * is fixed at 0600 and the create flag at CRCREAT.
 */
#define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
/*
 * cpr_rdwr: vn_rdwr() of cnt bytes at file offset 0 to/from a kernel
 * buffer; the residual-count pointer passed down is NULL.
 */
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

/* routines defined outside this file */
extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

/* CPR state defined outside this file */
extern kmutex_t cpr_slock;	/* taken in cpr_init(), dropped in cpr_done() */
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
/* in-core copy of the config file; valid once cprconfig_loaded is set */
static struct cprconfig cprconfig;
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);
int cpr_is_zfs(struct vfs *);

/* pathname of the cpr default file, see cpr_open_deffile() */
char cpr_default_path[] = CPR_DEFAULT;

/* tunables for the statefile size estimate in cpr_statefile_ok() */
#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */
     81 
     82 
     83 /*
     84  * cmn_err() followed by a 1/4 second delay; this gives the
     85  * logging service a chance to flush messages and helps avoid
     86  * intermixing output from prom_printf().
     87  */
     88 /*PRINTFLIKE2*/
     89 void
     90 cpr_err(int ce, const char *fmt, ...)
     91 {
     92 	va_list adx;
     93 
     94 	va_start(adx, fmt);
     95 	vcmn_err(ce, fmt, adx);
     96 	va_end(adx);
     97 	drv_usecwait(MICROSEC >> 2);
     98 }
     99 
    100 
    101 int
    102 cpr_init(int fcn)
    103 {
    104 	/*
    105 	 * Allow only one suspend/resume process.
    106 	 */
    107 	if (mutex_tryenter(&cpr_slock) == 0)
    108 		return (EBUSY);
    109 
    110 	CPR->c_flags = 0;
    111 	CPR->c_substate = 0;
    112 	CPR->c_cprboot_magic = 0;
    113 	CPR->c_alloc_cnt = 0;
    114 
    115 	CPR->c_fcn = fcn;
    116 	if (fcn == AD_CPR_REUSABLE)
    117 		CPR->c_flags |= C_REUSABLE;
    118 	else
    119 		CPR->c_flags |= C_SUSPENDING;
    120 	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
    121 		return (0);
    122 	}
    123 #if defined(__sparc)
    124 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
    125 		CPR->c_flags |= C_COMPRESSING;
    126 	/*
    127 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
    128 	 */
    129 	CPR->c_mapping_area = i_cpr_map_setup();
    130 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
    131 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
    132 		mutex_exit(&cpr_slock);
    133 		return (EAGAIN);
    134 	}
    135 	if (cpr_debug & CPR_DEBUG3)
    136 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
    137 		    "kas\n", (void *)CPR->c_mapping_area);
    138 #endif
    139 
    140 	return (0);
    141 }
    142 
    143 /*
    144  * This routine releases any resources used during the checkpoint.
    145  */
    146 void
    147 cpr_done(void)
    148 {
    149 	cpr_stat_cleanup();
    150 	i_cpr_bitmap_cleanup();
    151 
    152 	/*
    153 	 * Free pages used by cpr buffers.
    154 	 */
    155 	if (cpr_buf) {
    156 		kmem_free(cpr_buf, cpr_buf_size);
    157 		cpr_buf = NULL;
    158 	}
    159 	if (cpr_pagedata) {
    160 		kmem_free(cpr_pagedata, cpr_pagedata_size);
    161 		cpr_pagedata = NULL;
    162 	}
    163 
    164 	i_cpr_free_memory_resources();
    165 	mutex_exit(&cpr_slock);
    166 	cpr_err(CE_CONT, "System has been resumed.\n");
    167 }
    168 
    169 
    170 #if defined(__sparc)
    171 /*
    172  * reads config data into cprconfig
    173  */
    174 static int
    175 cpr_get_config(void)
    176 {
    177 	static char config_path[] = CPR_CONFIG;
    178 	struct cprconfig *cf = &cprconfig;
    179 	struct vnode *vp;
    180 	char *fmt;
    181 	int err;
    182 
    183 	if (cprconfig_loaded)
    184 		return (0);
    185 
    186 	fmt = "cannot %s config file \"%s\", error %d\n";
    187 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
    188 		cpr_err(CE_CONT, fmt, "open", config_path, err);
    189 		return (err);
    190 	}
    191 
    192 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
    193 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
    194 	VN_RELE(vp);
    195 	if (err) {
    196 		cpr_err(CE_CONT, fmt, "read", config_path, err);
    197 		return (err);
    198 	}
    199 
    200 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
    201 		cprconfig_loaded = 1;
    202 	else {
    203 		cpr_err(CE_CONT, "invalid config file \"%s\", "
    204 		    "rerun pmconfig(1M)\n", config_path);
    205 		err = EINVAL;
    206 	}
    207 
    208 	return (err);
    209 }
    210 
    211 
    212 /*
    213  * concat fs and path fields of the cprconfig structure;
    214  * returns pointer to the base of static data
    215  */
    216 static char *
    217 cpr_cprconfig_to_path(void)
    218 {
    219 	static char full_path[MAXNAMELEN];
    220 	struct cprconfig *cf = &cprconfig;
    221 	char *ptr;
    222 
    223 	/*
    224 	 * build /fs/path without extra '/'
    225 	 */
    226 	(void) strcpy(full_path, cf->cf_fs);
    227 	if (strcmp(cf->cf_fs, "/"))
    228 		(void) strcat(full_path, "/");
    229 	ptr = cf->cf_path;
    230 	if (*ptr == '/')
    231 		ptr++;
    232 	(void) strcat(full_path, ptr);
    233 	return (full_path);
    234 }
    235 
    236 
    237 /*
    238  * Verify that the information in the configuration file regarding the
    239  * location for the statefile is still valid, depending on cf_type.
    240  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
    241  *	mounted on the same device as when pmconfig was last run,
    242  *	and the translation of that device to a node in the prom's
    243  *	device tree must be the same as when pmconfig was last run.
    244  * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
    245  *      special file, it must have no file system mounted on it,
    246  *	and the translation of that device to a node in the prom's
    247  *	device tree must be the same as when pmconfig was last run.
    248  */
    249 static int
    250 cpr_verify_statefile_path(void)
    251 {
    252 	struct cprconfig *cf = &cprconfig;
    253 	static const char long_name[] = "Statefile pathname is too long.\n";
    254 	static const char lookup_fmt[] = "Lookup failed for "
    255 	    "cpr statefile device %s.\n";
    256 	static const char path_chg_fmt[] = "Device path for statefile "
    257 	    "has changed from %s to %s.\t%s\n";
    258 	static const char rerun[] = "Please rerun pmconfig(1m).";
    259 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
    260 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
    261 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
    262 	int error;
    263 	struct vnode *vp;
    264 	char *slash, *tail, *longest;
    265 	char *errstr;
    266 	int found = 0;
    267 	union {
    268 		char un_devpath[OBP_MAXPATHLEN];
    269 		char un_sfpath[MAXNAMELEN];
    270 	} un;
    271 #define	devpath	un.un_devpath
    272 #define	sfpath	un.un_sfpath
    273 
    274 	ASSERT(cprconfig_loaded);
    275 	/*
    276 	 * We need not worry about locking or the timing of releasing
    277 	 * the vnode, since we are single-threaded now.
    278 	 */
    279 
    280 	switch (cf->cf_type) {
    281 	case CFT_SPEC:
    282 		error = i_devname_to_promname(cf->cf_devfs, devpath,
    283 		    OBP_MAXPATHLEN);
    284 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
    285 			cpr_err(CE_CONT, path_chg_fmt,
    286 			    cf->cf_dev_prom, devpath, rerun);
    287 			return (error);
    288 		}
    289 		/*FALLTHROUGH*/
    290 	case CFT_ZVOL:
    291 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
    292 			cpr_err(CE_CONT, long_name);
    293 			return (ENAMETOOLONG);
    294 		}
    295 		if ((error = lookupname(cf->cf_devfs,
    296 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    297 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
    298 			return (error);
    299 		}
    300 		if (vp->v_type != VBLK)
    301 			errstr = "statefile must be a block device";
    302 		else if (vfs_devismounted(vp->v_rdev))
    303 			errstr = "statefile device must not "
    304 			    "have a file system mounted on it";
    305 		else if (IS_SWAPVP(vp))
    306 			errstr = "statefile device must not "
    307 			    "be configured as swap file";
    308 		else
    309 			errstr = NULL;
    310 
    311 		VN_RELE(vp);
    312 		if (errstr) {
    313 			cpr_err(CE_CONT, "%s.\n", errstr);
    314 			return (ENOTSUP);
    315 		}
    316 
    317 		return (error);
    318 	case CFT_UFS:
    319 		break;		/* don't indent all the original code */
    320 	default:
    321 		cpr_err(CE_PANIC, "invalid cf_type");
    322 	}
    323 
    324 	/*
    325 	 * The original code for UFS statefile
    326 	 */
    327 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
    328 		cpr_err(CE_CONT, long_name);
    329 		return (ENAMETOOLONG);
    330 	}
    331 
    332 	bzero(sfpath, sizeof (sfpath));
    333 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
    334 
    335 	if (*sfpath != '/') {
    336 		cpr_err(CE_CONT, "Statefile pathname %s "
    337 		    "must begin with a /\n", sfpath);
    338 		return (EINVAL);
    339 	}
    340 
    341 	/*
    342 	 * Find the longest prefix of the statefile pathname which
    343 	 * is the mountpoint of a filesystem.  This string must
    344 	 * match the cf_fs field we read from the config file.  Other-
    345 	 * wise the user has changed things without running pmconfig.
    346 	 */
    347 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
    348 	while ((slash = strchr(tail, '/')) != NULL) {
    349 		*slash = '\0';	  /* temporarily terminate the string */
    350 		if ((error = lookupname(sfpath,
    351 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    352 			*slash = '/';
    353 			cpr_err(CE_CONT, "A directory in the "
    354 			    "statefile path %s was not found.\n", sfpath);
    355 			VN_RELE(vp);
    356 
    357 			return (error);
    358 		}
    359 
    360 		vfs_list_read_lock();
    361 		vfsp = rootvfs;
    362 		do {
    363 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
    364 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
    365 				found = 1;
    366 				break;
    367 			}
    368 			vfsp = vfsp->vfs_next;
    369 		} while (vfsp != rootvfs);
    370 		vfs_list_unlock();
    371 
    372 		/*
    373 		 * If we have found a filesystem mounted on the current
    374 		 * path prefix, remember the end of the string in
    375 		 * "longest".  If it happens to be the the exact fs
    376 		 * saved in the configuration file, save the current
    377 		 * ufsvfsp so we can make additional checks further down.
    378 		 */
    379 		if (found) {
    380 			longest = slash;
    381 			if (strcmp(cf->cf_fs, sfpath) == 0) {
    382 				ufsvfsp_save = ufsvfsp;
    383 				vfsp_save = vfsp;
    384 			}
    385 			found = 0;
    386 		}
    387 
    388 		VN_RELE(vp);
    389 		*slash = '/';
    390 		tail = slash + 1;
    391 	}
    392 	*longest = '\0';
    393 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
    394 		cpr_err(CE_CONT, "Filesystem containing "
    395 		    "the statefile when pmconfig was run (%s) has "
    396 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
    397 		return (EINVAL);
    398 	}
    399 
    400 	if ((error = lookupname(cf->cf_devfs,
    401 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    402 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
    403 		return (error);
    404 	}
    405 
    406 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
    407 		cpr_err(CE_CONT, "Filesystem containing "
    408 		    "statefile no longer mounted on device %s. "
    409 		    "See power.conf(4).", cf->cf_devfs);
    410 		VN_RELE(vp);
    411 		return (ENXIO);
    412 	}
    413 	VN_RELE(vp);
    414 
    415 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
    416 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
    417 		cpr_err(CE_CONT, path_chg_fmt,
    418 		    cf->cf_dev_prom, devpath, rerun);
    419 		return (error);
    420 	}
    421 
    422 	return (0);
    423 }
    424 
    425 /*
    426  * Make sure that the statefile can be used as a block special statefile
    427  * (meaning that is exists and has nothing mounted on it)
    428  * Returns errno if not a valid statefile.
    429  */
    430 int
    431 cpr_check_spec_statefile(void)
    432 {
    433 	int err;
    434 
    435 	if (err = cpr_get_config())
    436 		return (err);
    437 	ASSERT(cprconfig.cf_type == CFT_SPEC ||
    438 	    cprconfig.cf_type == CFT_ZVOL);
    439 
    440 	if (cprconfig.cf_devfs == NULL)
    441 		return (ENXIO);
    442 
    443 	return (cpr_verify_statefile_path());
    444 
    445 }
    446 
    447 int
    448 cpr_alloc_statefile(int alloc_retry)
    449 {
    450 	register int rc = 0;
    451 	char *str;
    452 
    453 	/*
    454 	 * Statefile size validation. If checkpoint the first time, disk blocks
    455 	 * allocation will be done; otherwise, just do file size check.
    456 	 * if statefile allocation is being retried, C_VP will be inited
    457 	 */
    458 	if (alloc_retry) {
    459 		str = "\n-->Retrying statefile allocation...";
    460 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
    461 			prom_printf(str);
    462 		if (C_VP->v_type != VBLK)
    463 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
    464 	} else {
    465 		/*
    466 		 * Open an exiting file for writing, the state file needs to be
    467 		 * pre-allocated since we can't and don't want to do allocation
    468 		 * during checkpoint (too much of the OS is disabled).
    469 		 *    - do a preliminary size checking here, if it is too small,
    470 		 *	allocate more space internally and retry.
    471 		 *    - check the vp to make sure it's the right type.
    472 		 */
    473 		char *path = cpr_build_statefile_path();
    474 
    475 		if (path == NULL)
    476 			return (ENXIO);
    477 		else if (rc = cpr_verify_statefile_path())
    478 			return (rc);
    479 
    480 		if (rc = vn_open(path, UIO_SYSSPACE,
    481 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
    482 			cpr_err(CE_WARN, "cannot open statefile %s", path);
    483 			return (rc);
    484 		}
    485 	}
    486 
    487 	/*
    488 	 * Only ufs and block special statefiles supported
    489 	 */
    490 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
    491 		cpr_err(CE_CONT,
    492 		    "Statefile must be regular file or block special file.");
    493 		return (EACCES);
    494 	}
    495 
    496 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
    497 		return (rc);
    498 
    499 	if (C_VP->v_type != VBLK) {
    500 		/*
    501 		 * sync out the fs change due to the statefile reservation.
    502 		 */
    503 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
    504 
    505 		/*
    506 		 * Validate disk blocks allocation for the state file.
    507 		 * Ask the file system prepare itself for the dump operation.
    508 		 */
    509 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
    510 			cpr_err(CE_CONT, "Error allocating "
    511 			    "blocks for cpr statefile.");
    512 			return (rc);
    513 		}
    514 	}
    515 	return (0);
    516 }
    517 
    518 
    519 /*
    520  * Lookup device size and return available space in bytes.
    521  * NOTE: Since prop_op(9E) can't tell the difference between a character
    522  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
    523  */
    524 size_t
    525 cpr_get_devsize(dev_t dev)
    526 {
    527 	size_t bytes = 0;
    528 
    529 	bytes = cdev_Size(dev);
    530 	if (bytes == 0)
    531 		bytes = cdev_size(dev);
    532 
    533 	if (bytes > CPR_SPEC_OFFSET)
    534 		bytes -= CPR_SPEC_OFFSET;
    535 	else
    536 		bytes = 0;
    537 
    538 	return (bytes);
    539 }
    540 
    541 
    542 /*
    543  * increase statefile size
    544  */
    545 static int
    546 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
    547 {
    548 	extern uchar_t cpr_pagecopy[];
    549 	struct inode *ip = VTOI(vp);
    550 	u_longlong_t offset;
    551 	int error, increase;
    552 	ssize_t resid;
    553 
    554 	rw_enter(&ip->i_contents, RW_READER);
    555 	increase = (ip->i_size < newsize);
    556 	offset = ip->i_size;
    557 	rw_exit(&ip->i_contents);
    558 
    559 	if (increase == 0)
    560 		return (0);
    561 
    562 	/*
    563 	 * write to each logical block to reserve disk space
    564 	 */
    565 	error = 0;
    566 	cpr_pagecopy[0] = '1';
    567 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
    568 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
    569 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
    570 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
    571 			if (error == ENOSPC) {
    572 				cpr_err(CE_WARN, "error %d while reserving "
    573 				    "disk space for statefile %s\n"
    574 				    "wanted %lld bytes, file is %lld short",
    575 				    error, cpr_cprconfig_to_path(),
    576 				    newsize, newsize - offset);
    577 			}
    578 			break;
    579 		}
    580 	}
    581 	return (error);
    582 }
    583 
    584 
    585 /*
    586  * do a simple estimate of the space needed to hold the statefile
    587  * taking compression into account, but be fairly conservative
    588  * so we have a better chance of completing; when dump fails,
    589  * the retry cost is fairly high.
    590  *
    591  * Do disk blocks allocation for the state file if no space has
    592  * been allocated yet. Since the state file will not be removed,
    593  * allocation should only be done once.
    594  */
    595 static int
    596 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
    597 {
    598 	extern size_t cpr_bitmap_size;
    599 	struct inode *ip = VTOI(vp);
    600 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
    601 	u_longlong_t size, isize, ksize, raw_data;
    602 	char *str, *est_fmt;
    603 	size_t space;
    604 	int error;
    605 
    606 	/*
    607 	 * number of pages short for swapping.
    608 	 */
    609 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
    610 	if (STAT->cs_nosw_pages < 0)
    611 		STAT->cs_nosw_pages = 0;
    612 
    613 	str = "cpr_statefile_ok:";
    614 
    615 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
    616 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
    617 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
    618 	    MAX(availrmem - swapfs_minfree, 0),
    619 	    k_anoninfo.ani_mem_resv);
    620 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
    621 	    CURRENT_TOTAL_AVAILABLE_SWAP);
    622 
    623 	/*
    624 	 * try increasing filesize by 15%
    625 	 */
    626 	if (alloc_retry) {
    627 		/*
    628 		 * block device doesn't get any bigger
    629 		 */
    630 		if (vp->v_type == VBLK) {
    631 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    632 				prom_printf(
    633 				    "Retry statefile on special file\n");
    634 			return (ENOMEM);
    635 		} else {
    636 			rw_enter(&ip->i_contents, RW_READER);
    637 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
    638 			rw_exit(&ip->i_contents);
    639 		}
    640 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    641 			prom_printf("Retry statefile size = %lld\n", size);
    642 	} else {
    643 		u_longlong_t cpd_size;
    644 		pgcnt_t npages, nback;
    645 		int ndvram;
    646 
    647 		ndvram = 0;
    648 		(void) callb_execute_class(CB_CL_CPR_FB,
    649 		    (int)(uintptr_t)&ndvram);
    650 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    651 			prom_printf("ndvram size = %d\n", ndvram);
    652 
    653 		/*
    654 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
    655 		 */
    656 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
    657 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
    658 		raw_data = cpd_size + cpr_bitmap_size;
    659 		ksize = ndvram + mmu_ptob(npages);
    660 
    661 		est_fmt = "%s estimated size with "
    662 		    "%scompression %lld, ksize %lld\n";
    663 		nback = mmu_ptob(STAT->cs_nosw_pages);
    664 		if (CPR->c_flags & C_COMPRESSING) {
    665 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
    666 			    raw_data + ((nback * 10) / UCOMP_RATE);
    667 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
    668 		} else {
    669 			size = ksize + raw_data + nback;
    670 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
    671 			    size, ksize);
    672 		}
    673 	}
    674 
    675 	/*
    676 	 * All this is much simpler for a block device
    677 	 */
    678 	if (vp->v_type == VBLK) {
    679 		space = cpr_get_devsize(vp->v_rdev);
    680 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    681 			prom_printf("statefile dev size %lu\n", space);
    682 
    683 		/*
    684 		 * Export the estimated filesize info, this value will be
    685 		 * compared before dumping out the statefile in the case of
    686 		 * no compression.
    687 		 */
    688 		STAT->cs_est_statefsz = size;
    689 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    690 			prom_printf("%s Estimated statefile size %llu, "
    691 			    "space %lu\n", str, size, space);
    692 		if (size > space) {
    693 			cpr_err(CE_CONT, "Statefile partition too small.");
    694 			return (ENOMEM);
    695 		}
    696 		return (0);
    697 	} else {
    698 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
    699 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
    700 			return (ENOMEM);
    701 		}
    702 
    703 		/*
    704 		 * Estimate space needed for the state file.
    705 		 *
    706 		 * State file size in bytes:
    707 		 * 	kernel size + non-cache pte seg +
    708 		 *	bitmap size + cpr state file headers size
    709 		 * (round up to fs->fs_bsize)
    710 		 */
    711 		size = blkroundup(ip->i_fs, size);
    712 
    713 		/*
    714 		 * Export the estimated filesize info, this value will be
    715 		 * compared before dumping out the statefile in the case of
    716 		 * no compression.
    717 		 */
    718 		STAT->cs_est_statefsz = size;
    719 		error = cpr_grow_statefile(vp, size);
    720 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
    721 			rw_enter(&ip->i_contents, RW_READER);
    722 			isize = ip->i_size;
    723 			rw_exit(&ip->i_contents);
    724 			prom_printf("%s Estimated statefile size %lld, "
    725 			    "i_size %lld\n", str, size, isize);
    726 		}
    727 
    728 		return (error);
    729 	}
    730 }
    731 
    732 
    733 void
    734 cpr_statef_close(void)
    735 {
    736 	if (C_VP) {
    737 		if (!cpr_reusable_mode)
    738 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
    739 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
    740 		VN_RELE(C_VP);
    741 		C_VP = 0;
    742 	}
    743 }
    744 
    745 
    746 /*
    747  * open cpr default file and display error
    748  */
    749 int
    750 cpr_open_deffile(int mode, vnode_t **vpp)
    751 {
    752 	int error;
    753 
    754 	if (error = cpr_open(cpr_default_path, mode, vpp))
    755 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
    756 		    cpr_default_path, error);
    757 	return (error);
    758 }
    759 
    760 
    761 /*
    762  * write cdef_t to disk.  This contains the original values of prom
    763  * properties that we modify.  We fill in the magic number of the file
    764  * here as a signal to the booter code that the state file is valid.
    765  * Be sure the file gets synced, since we may be shutting down the OS.
    766  */
    767 int
    768 cpr_write_deffile(cdef_t *cdef)
    769 {
    770 	struct vnode *vp;
    771 	char *str;
    772 	int rc;
    773 
    774 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
    775 		return (rc);
    776 
    777 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
    778 		str = "write";
    779 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
    780 		str = "fsync";
    781 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
    782 	VN_RELE(vp);
    783 
    784 	if (rc) {
    785 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
    786 		    str, rc, cpr_default_path);
    787 	}
    788 	return (rc);
    789 }
    790 
    791 /*
    792  * Clear the magic number in the defaults file.  This tells the booter
    793  * program that the state file is not current and thus prevents
    794  * any attempt to restore from an obsolete state file.
    795  */
    796 void
    797 cpr_clear_definfo(void)
    798 {
    799 	struct vnode *vp;
    800 	cmini_t mini;
    801 
    802 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
    803 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
    804 		return;
    805 	mini.magic = mini.reusable = 0;
    806 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
    807 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
    808 	VN_RELE(vp);
    809 }
    810 
    811 /*
    812  * If the cpr default file is invalid, then we must not be in reusable mode
    813  * if it is valid, it tells us our mode
    814  */
    815 int
    816 cpr_get_reusable_mode(void)
    817 {
    818 	struct vnode *vp;
    819 	cmini_t mini;
    820 	int rc;
    821 
    822 	if (cpr_open(cpr_default_path, FREAD, &vp))
    823 		return (0);
    824 
    825 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
    826 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
    827 	VN_RELE(vp);
    828 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
    829 		return (mini.reusable);
    830 
    831 	return (0);
    832 }
    833 #endif
    834 
/*
 * clock/time related routines
 */
/* seconds value stored by cpr_save_time(), handed to clkset() on restore */
static time_t   cpr_time_stamp;
    839 
    840 
    841 void
    842 cpr_tod_get(cpr_time_t *ctp)
    843 {
    844 	timestruc_t ts;
    845 
    846 	mutex_enter(&tod_lock);
    847 	ts = TODOP_GET(tod_ops);
    848 	mutex_exit(&tod_lock);
    849 	ctp->tv_sec = (time32_t)ts.tv_sec;
    850 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
    851 }
    852 
/*
 * Call tod_fault_reset() with tod_lock held.
 */
void
cpr_tod_fault_reset(void)
{
	mutex_enter(&tod_lock);
	tod_fault_reset();
	mutex_exit(&tod_lock);
}
    860 
/*
 * Record the current gethrestime_sec() value in cpr_time_stamp so that
 * cpr_restore_time() can hand it to clkset() later.
 */
void
cpr_save_time(void)
{
	cpr_time_stamp = gethrestime_sec();
}
    866 
/*
 * correct time based on saved time stamp or hardware clock
 *
 * The value saved by cpr_save_time() is passed to clkset().
 * NOTE(review): any use of the hardware clock would happen inside
 * clkset(), which is not visible here -- confirm.
 */
void
cpr_restore_time(void)
{
	clkset(cpr_time_stamp);
}
    875 
    876 #if defined(__sparc)
    877 /*
    878  * CPU ONLINE/OFFLINE CODE
    879  */
    880 int
    881 cpr_mp_offline(void)
    882 {
    883 	cpu_t *cp, *bootcpu;
    884 	int rc = 0;
    885 	int brought_up_boot = 0;
    886 
    887 	/*
    888 	 * Do nothing for UP.
    889 	 */
    890 	if (ncpus == 1)
    891 		return (0);
    892 
    893 	mutex_enter(&cpu_lock);
    894 
    895 	cpr_save_mp_state();
    896 
    897 	bootcpu = i_cpr_bootcpu();
    898 	if (!CPU_ACTIVE(bootcpu)) {
    899 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
    900 			mutex_exit(&cpu_lock);
    901 			return (rc);
    902 		}
    903 		brought_up_boot = 1;
    904 	}
    905 
    906 	cp = cpu_list;
    907 	do {
    908 		if (cp == bootcpu)
    909 			continue;
    910 		if (cp->cpu_flags & CPU_OFFLINE)
    911 			continue;
    912 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
    913 			mutex_exit(&cpu_lock);
    914 			return (rc);
    915 		}
    916 	} while ((cp = cp->cpu_next) != cpu_list);
    917 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
    918 		prom_printf("changed cpu %p to state %d\n",
    919 		    (void *)bootcpu, CPU_CPR_ONLINE);
    920 	mutex_exit(&cpu_lock);
    921 
    922 	return (rc);
    923 }
    924 
    925 int
    926 cpr_mp_online(void)
    927 {
    928 	cpu_t *cp, *bootcpu = CPU;
    929 	int rc = 0;
    930 
    931 	/*
    932 	 * Do nothing for UP.
    933 	 */
    934 	if (ncpus == 1)
    935 		return (0);
    936 
    937 	/*
    938 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
    939 	 * to indicate a cpu was online at the time of cpr_suspend();
    940 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
    941 	 * and actually are offline.
    942 	 */
    943 	mutex_enter(&cpu_lock);
    944 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
    945 		/*
    946 		 * Clear the CPU_FROZEN flag in all cases.
    947 		 */
    948 		cp->cpu_flags &= ~CPU_FROZEN;
    949 
    950 		if (CPU_CPR_IS_OFFLINE(cp))
    951 			continue;
    952 		if (CPU_ACTIVE(cp))
    953 			continue;
    954 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
    955 			mutex_exit(&cpu_lock);
    956 			return (rc);
    957 		}
    958 	}
    959 
    960 	/*
    961 	 * turn off the boot cpu if it was offlined
    962 	 */
    963 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
    964 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
    965 			mutex_exit(&cpu_lock);
    966 			return (rc);
    967 		}
    968 	}
    969 	mutex_exit(&cpu_lock);
    970 	return (0);
    971 }
    972 
    973 static void
    974 cpr_save_mp_state(void)
    975 {
    976 	cpu_t *cp;
    977 
    978 	ASSERT(MUTEX_HELD(&cpu_lock));
    979 
    980 	cp = cpu_list;
    981 	do {
    982 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
    983 		if (CPU_ACTIVE(cp))
    984 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
    985 	} while ((cp = cp->cpu_next) != cpu_list);
    986 }
    987 
    988 /*
    989  * change cpu to online/offline
    990  */
    991 static int
    992 cpr_p_online(cpu_t *cp, int state)
    993 {
    994 	int rc;
    995 
    996 	ASSERT(MUTEX_HELD(&cpu_lock));
    997 
    998 	switch (state) {
    999 	case CPU_CPR_ONLINE:
   1000 		rc = cpu_online(cp);
   1001 		break;
   1002 	case CPU_CPR_OFFLINE:
   1003 		rc = cpu_offline(cp, CPU_FORCED);
   1004 		break;
   1005 	}
   1006 	if (rc) {
   1007 		cpr_err(CE_WARN, "Failed to change processor %d to "
   1008 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
   1009 	}
   1010 	return (rc);
   1011 }
   1012 
   1013 /*
   1014  * Construct the pathname of the state file and return a pointer to
   1015  * caller.  Read the config file to get the mount point of the
   1016  * filesystem and the pathname within fs.
   1017  */
   1018 char *
   1019 cpr_build_statefile_path(void)
   1020 {
   1021 	struct cprconfig *cf = &cprconfig;
   1022 
   1023 	if (cpr_get_config())
   1024 		return (NULL);
   1025 
   1026 	switch (cf->cf_type) {
   1027 	case CFT_UFS:
   1028 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
   1029 			cpr_err(CE_CONT, "Statefile path is too long.\n");
   1030 			return (NULL);
   1031 		}
   1032 		return (cpr_cprconfig_to_path());
   1033 	case CFT_ZVOL:
   1034 		/*FALLTHROUGH*/
   1035 	case CFT_SPEC:
   1036 		return (cf->cf_devfs);
   1037 	default:
   1038 		cpr_err(CE_PANIC, "invalid statefile type");
   1039 		/*NOTREACHED*/
   1040 		return (NULL);
   1041 	}
   1042 }
   1043 
   1044 int
   1045 cpr_statefile_is_spec(void)
   1046 {
   1047 	if (cpr_get_config())
   1048 		return (0);
   1049 	return (cprconfig.cf_type == CFT_SPEC);
   1050 }
   1051 
   1052 char *
   1053 cpr_get_statefile_prom_path(void)
   1054 {
   1055 	struct cprconfig *cf = &cprconfig;
   1056 
   1057 	ASSERT(cprconfig_loaded);
   1058 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
   1059 	ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
   1060 	return (cf->cf_dev_prom);
   1061 }
   1062 
   1063 
   1064 /*
   1065  * XXX The following routines need to be in the vfs source code.
   1066  */
   1067 
   1068 int
   1069 cpr_is_ufs(struct vfs *vfsp)
   1070 {
   1071 	char *fsname;
   1072 
   1073 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
   1074 	return (strcmp(fsname, "ufs") == 0);
   1075 }
   1076 
   1077 int
   1078 cpr_is_zfs(struct vfs *vfsp)
   1079 {
   1080 	char *fsname;
   1081 
   1082 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
   1083 	return (strcmp(fsname, "zfs") == 0);
   1084 }
   1085 
   1086 /*
   1087  * This is a list of file systems that are allowed to be writeable when a
   1088  * reusable statefile checkpoint is taken.  They must not have any state that
   1089  * cannot be restored to consistency by simply rebooting using the checkpoint.
   1090  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
   1091  * out of sync with the in-kernel data).
   1092  */
   1093 int
   1094 cpr_reusable_mount_check(void)
   1095 {
   1096 	struct vfs *vfsp;
   1097 	char *fsname;
   1098 	char **cpp;
   1099 	static char *cpr_writeok_fss[] = {
   1100 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
   1101 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
   1102 	};
   1103 
   1104 	vfs_list_read_lock();
   1105 	vfsp = rootvfs;
   1106 	do {
   1107 		if (vfsp->vfs_flag & VFS_RDONLY) {
   1108 			vfsp = vfsp->vfs_next;
   1109 			continue;
   1110 		}
   1111 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
   1112 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
   1113 			if (strcmp(fsname, *cpp) == 0)
   1114 				break;
   1115 		}
   1116 		/*
   1117 		 * if the inner loop reached the NULL terminator,
   1118 		 * the current fs-type does not match any OK-type
   1119 		 */
   1120 		if (*cpp == NULL) {
   1121 			cpr_err(CE_CONT, "a filesystem of type %s is "
   1122 			    "mounted read/write.\nReusable statefile requires "
   1123 			    "no writeable filesystem of this type be mounted\n",
   1124 			    fsname);
   1125 			vfs_list_unlock();
   1126 			return (EINVAL);
   1127 		}
   1128 		vfsp =