Home | History | Annotate | Download | only in cpr
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * Fill in and write out the cpr state file
     30  *	1. Allocate and write headers, ELF and cpr dump header
     31  *	2. Allocate bitmaps according to phys_install
     32  *	3. Tag kernel pages into corresponding bitmap
     33  *	4. Write bitmaps to state file
     34  *	5. Write actual physical page data to state file
     35  */
     36 
     37 #include <sys/types.h>
     38 #include <sys/systm.h>
     39 #include <sys/vm.h>
     40 #include <sys/memlist.h>
     41 #include <sys/kmem.h>
     42 #include <sys/vnode.h>
     43 #include <sys/fs/ufs_inode.h>
     44 #include <sys/errno.h>
     45 #include <sys/cmn_err.h>
     46 #include <sys/debug.h>
     47 #include <vm/page.h>
     48 #include <vm/seg.h>
     49 #include <vm/seg_kmem.h>
     50 #include <vm/seg_kpm.h>
     51 #include <vm/hat.h>
     52 #include <sys/cpr.h>
     53 #include <sys/conf.h>
     54 #include <sys/ddi.h>
     55 #include <sys/panic.h>
     56 #include <sys/thread.h>
     57 #include <sys/note.h>
     58 
     59 /* Local defines and variables */
     60 #define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
     61 #define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */
     62 
     63 #if defined(__sparc)
     64 static uint_t cpr_pages_tobe_dumped;
     65 static uint_t cpr_regular_pgs_dumped;
     66 static int cpr_dump_regular_pages(vnode_t *);
     67 static int cpr_count_upages(int, bitfunc_t);
     68 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
     69 #endif
     70 
     71 int cpr_flush_write(vnode_t *);
     72 
     73 int cpr_contig_pages(vnode_t *, int);
     74 
     75 void cpr_clear_bitmaps();
     76 
     77 extern size_t cpr_get_devsize(dev_t);
     78 extern int i_cpr_dump_setup(vnode_t *);
     79 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
     80 extern int cpr_test_mode;
     81 int cpr_setbit(pfn_t, int);
     82 int cpr_clrbit(pfn_t, int);
     83 
/* statefile terminator record; filled in and written by cpr_write_terminator */
ctrm_t cpr_term;

/* statefile write buffer: start and one-past-the-end (see cpr_alloc_bufs) */
char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;		/* bitmap size in bytes (cpr_set_bitmap_size) */
int cpr_nbitmaps;		/* descriptor count recorded in dump header */

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

#if defined(__sparc)
static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
/* cross check: cpr_write refuses to call the disk driver while this is 0 */
static int cpr_disk_writes_ok;
/* cached device size for VBLK statefiles; 0 means "not looked up yet" */
static size_t cpr_dev_space = 0;
#endif

/*
 * scratch area used to merge bitmaps before writing them and, on
 * DEBUG kernels, to hold a copy of page data for checksumming
 */
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
    103 
    104 #if defined(__sparc)
    105 /*
    106  * On some platforms bcopy may modify the thread structure
    107  * during bcopy (eg, to prevent cpu migration).  If the
    108  * range we are currently writing out includes our own
    109  * thread structure then it will be snapshotted by bcopy
    110  * including those modified members - and the updates made
    111  * on exit from bcopy will no longer be seen when we later
    112  * restore the mid-bcopy kthread_t.  So if the range we
    113  * need to copy overlaps with our thread structure we will
    114  * use a simple byte copy.
    115  */
    116 void
    117 cprbcopy(void *from, void *to, size_t bytes)
    118 {
    119 	extern int curthreadremapped;
    120 	caddr_t kthrend;
    121 
    122 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
    123 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
    124 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
    125 		caddr_t src = from, dst = to;
    126 
    127 		while (bytes-- > 0)
    128 			*dst++ = *src++;
    129 	} else {
    130 		bcopy(from, to, bytes);
    131 	}
    132 }
    133 
    134 /*
    135  * Allocate pages for buffers used in writing out the statefile
    136  */
    137 static int
    138 cpr_alloc_bufs(void)
    139 {
    140 	char *allocerr = "Unable to allocate memory for cpr buffer";
    141 	size_t size;
    142 
    143 	/*
    144 	 * set the cpr write buffer size to at least the historic
    145 	 * size (128k) or large enough to store the both the early
    146 	 * set of statefile structures (well under 0x800) plus the
    147 	 * bitmaps, and roundup to the next pagesize.
    148 	 */
    149 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
    150 	cpr_buf_size = MAX(size, CPRBUFSZ);
    151 	cpr_buf_blocks = btodb(cpr_buf_size);
    152 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
    153 	if (cpr_buf == NULL) {
    154 		cpr_err(CE_WARN, allocerr);
    155 		return (ENOMEM);
    156 	}
    157 	cpr_buf_end = cpr_buf + cpr_buf_size;
    158 
    159 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
    160 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
    161 	if (cpr_pagedata == NULL) {
    162 		kmem_free(cpr_buf, cpr_buf_size);
    163 		cpr_buf = NULL;
    164 		cpr_err(CE_WARN, allocerr);
    165 		return (ENOMEM);
    166 	}
    167 
    168 	return (0);
    169 }
    170 
    171 
    172 /*
    173  * Set bitmap size in bytes based on phys_install.
    174  */
    175 void
    176 cpr_set_bitmap_size(void)
    177 {
    178 	struct memlist *pmem;
    179 	size_t size = 0;
    180 
    181 	memlist_read_lock();
    182 	for (pmem = phys_install; pmem; pmem = pmem->next)
    183 		size += pmem->size;
    184 	memlist_read_unlock();
    185 	cpr_bitmap_size = BITMAP_BYTES(size);
    186 }
    187 
    188 
/*
 * CPR dump header contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. kernel return pc & ppn for resume
 *	3. current thread info
 *	4. debug level and test mode
 *	5. number of bitmaps allocated
 *	6. number of page records
 *
 * Besides filling in and writing the dump descriptor, this routine
 * clears the bitmaps, tags the pages to be saved into the regular
 * bitmaps, tags the volatile pages into the volatile bitmaps, and
 * records the uncompressed statefile size in STAT->cs_nocomp_statefsz.
 *
 * Returns ENOSPC when compression is off and the uncompressed size
 * exceeds the estimated statefile size; otherwise returns the result
 * of cpr_write() for the descriptor.
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;
	pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	/* start from empty bitmaps before any tagging is done */
	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to statefile.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	/* kept for the page-count check done in cpr_write_statefile */
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	CPR_DEBUG(CPR_DEBUG7,
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize);

	/*
	 * Some pages contain volatile data (cpr_buf and storage area for
	 * sensitive kpages), which are no longer needed after the statefile
	 * is dumped to disk.  We have already untagged them from regular
	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
	 * volatile bitmaps will be claimed during resume, and the resumed
	 * kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Roundup will be done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * go retry now to save unnecessary operations.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf("cpr_write_header: "
			    "STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}
    269 
    270 
    271 /*
    272  * CPR dump tail record contains the following information:
    273  *	1. header magic -- unique to cpr state file
    274  *	2. all misc info that needs to be passed to cprboot or resumed kernel
    275  */
    276 static int
    277 cpr_write_terminator(vnode_t *vp)
    278 {
    279 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
    280 	cpr_term.va = (cpr_ptr)&cpr_term;
    281 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
    282 
    283 	/* count the last one (flush) */
    284 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
    285 	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
    286 
    287 	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
    288 	    STAT->cs_real_statefsz);
    289 
    290 	cpr_tod_get(&cpr_term.tm_shutdown);
    291 
    292 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
    293 }
    294 
    295 /*
    296  * Write bitmap descriptor array, followed by merged bitmaps.
    297  */
    298 static int
    299 cpr_write_bitmap(vnode_t *vp)
    300 {
    301 	char *rmap, *vmap, *dst, *tail;
    302 	size_t size, bytes;
    303 	cbd_t *dp;
    304 	int err;
    305 
    306 	dp = CPR->c_bmda;
    307 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
    308 		return (err);
    309 
    310 	/*
    311 	 * merge regular and volatile bitmaps into tmp space
    312 	 * and write to disk
    313 	 */
    314 	for (; dp->cbd_size; dp++) {
    315 		rmap = (char *)dp->cbd_reg_bitmap;
    316 		vmap = (char *)dp->cbd_vlt_bitmap;
    317 		for (size = dp->cbd_size; size; size -= bytes) {
    318 			bytes = min(size, sizeof (cpr_pagecopy));
    319 			tail = &cpr_pagecopy[bytes];
    320 			for (dst = cpr_pagecopy; dst < tail; dst++)
    321 				*dst = *rmap++ | *vmap++;
    322 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
    323 				break;
    324 		}
    325 	}
    326 
    327 	return (err);
    328 }
    329 
    330 
/*
 * Write the page data portion of the statefile:
 *	1. untag the sensitive kpages from the regular bitmaps
 *	2. enable disk writes (cpr_disk_writes_ok)
 *	3. dump the saved sensitive kpages
 *	4. dump the remaining regular pages
 *	5. verify the number of pages dumped against the number planned
 *
 * Returns 0 on success, or the error from the first step that failed.
 */
static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern	int	i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * to get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);

	/*
	 * now it's OK to call a driver that makes allocations
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * now write out the clean sensitive kpages
	 * according to the sensitive descriptors
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_regular_pages() failed!\n", str);
		return (error);
	}

	/*
	 * sanity check to verify the right number of pages were dumped
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		prom_printf("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		/* on DEBUG kernels in test mode, stop in the debugger */
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}
    395 #endif
    396 
    397 
/*
 * creates the CPR state file, the following sections are
 * written out in sequence:
 *    - writes the cpr dump header
 *    - writes the memory usage bitmaps
 *    - writes the platform dependent info
 *    - writes the remaining user pages
 *    - writes the kernel pages
 *
 * All of the work is compiled only for __sparc; on other platforms
 * this routine just returns 0.  On sparc it returns 0 on success or
 * the error from the first step that failed.
 */
#if defined(__x86)
	_NOTE(ARGSUSED(0))
#endif
int
cpr_dump(vnode_t *vp)
{
#if defined(__sparc)
	int error;

	/* buffers are allocated on first use only */
	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to top of internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * sensitive pages are saved
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 *	    statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations happen
	 * during sddump, which is not supposed to happen after the system
	 * is quiesced. Let's call the kernel pages that tend to be affected
	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
	 * will allocate some storage space to save the clean sensitive pages
	 * aside before statefile dumping takes place. Since there may not be
	 * much memory left at this stage, the sensitive pages will be
	 * compressed before they are saved into the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: save_sensitive_kpages failed!\n");
		return (error);
	}

	/*
	 * since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_header() failed!\n");
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_statefile() failed!\n");
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	/* push out whatever is still sitting in cpr_buf */
	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);
#endif

	return (0);
}
    506 
    507 
    508 #if defined(__sparc)
    509 /*
    510  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
    511  * a page-count from each range is accumulated at arg->pages.
    512  */
    513 static void
    514 cpr_xwalk(void *arg, void *base, size_t size)
    515 {
    516 	struct cpr_walkinfo *cwip = arg;
    517 
    518 	cwip->pages += cpr_count_pages(base, size,
    519 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
    520 	cwip->size += size;
    521 	cwip->ranges++;
    522 }
    523 
    524 /*
    525  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
    526  * a page-count from each range is accumulated at arg->pages.
    527  */
    528 static void
    529 cpr_walk(void *arg, void *base, size_t size)
    530 {
    531 	caddr_t addr = base;
    532 	caddr_t addr_end = addr + size;
    533 
    534 	/*
    535 	 * If we are about to start walking the range of addresses we
    536 	 * carved out of the kernel heap for the large page heap walk
    537 	 * heap_lp_arena to find what segments are actually populated
    538 	 */
    539 	if (SEGKMEM_USE_LARGEPAGES &&
    540 	    addr == heap_lp_base && addr_end == heap_lp_end &&
    541 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
    542 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
    543 	} else {
    544 		cpr_xwalk(arg, base, size);
    545 	}
    546 }
    547 
    548 
    549 /*
    550  * faster scan of kvseg using vmem_walk() to visit
    551  * allocated ranges.
    552  */
    553 pgcnt_t
    554 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
    555 {
    556 	struct cpr_walkinfo cwinfo;
    557 
    558 	bzero(&cwinfo, sizeof (cwinfo));
    559 	cwinfo.mapflag = mapflag;
    560 	cwinfo.bitfunc = bitfunc;
    561 
    562 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
    563 
    564 	if (cpr_debug & CPR_DEBUG7) {
    565 		prom_printf("walked %d sub-ranges, total pages %ld\n",
    566 		    cwinfo.ranges, mmu_btop(cwinfo.size));
    567 		cpr_show_range(seg->s_base, seg->s_size,
    568 		    mapflag, bitfunc, cwinfo.pages);
    569 	}
    570 
    571 	return (cwinfo.pages);
    572 }
    573 
    574 
    575 /*
    576  * cpr_walk_kpm() is called for every used area within the large
    577  * segkpm virtual address window. A page-count is accumulated at
    578  * arg->pages.
    579  */
    580 static void
    581 cpr_walk_kpm(void *arg, void *base, size_t size)
    582 {
    583 	struct cpr_walkinfo *cwip = arg;
    584 
    585 	cwip->pages += cpr_count_pages(base, size,
    586 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
    587 	cwip->size += size;
    588 	cwip->ranges++;
    589 }
    590 
    591 
    592 /*
    593  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
    594  */
    595 /*ARGSUSED*/
    596 static pgcnt_t
    597 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
    598 {
    599 	struct cpr_walkinfo cwinfo;
    600 
    601 	if (kpm_enable == 0)
    602 		return (0);
    603 
    604 	bzero(&cwinfo, sizeof (cwinfo));
    605 	cwinfo.mapflag = mapflag;
    606 	cwinfo.bitfunc = bitfunc;
    607 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
    608 
    609 	if (cpr_debug & CPR_DEBUG7) {
    610 		prom_printf("walked %d sub-ranges, total pages %ld\n",
    611 		    cwinfo.ranges, mmu_btop(cwinfo.size));
    612 		cpr_show_range(segkpm->s_base, segkpm->s_size,
    613 		    mapflag, bitfunc, cwinfo.pages);
    614 	}
    615 
    616 	return (cwinfo.pages);
    617 }
    618 
    619 
/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 *
 * Each entry pairs a segment with the scan function to use for it.
 * st_addrtype tells how st_seg must be interpreted: for KSEG_SEG_ADDR
 * it is the address of the struct seg itself (cast to fit the field),
 * for KSEG_PTR_ADDR it is the address of a pointer to the struct seg
 * and has to be dereferenced.  The table ends with a NULL st_seg.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}	/* end */
};
    639 
    640 
    641 /*
    642  * Compare seg with each entry in kseg_table; when there is a match
    643  * return the entry pointer, otherwise return NULL.
    644  */
    645 static ksegtbl_entry_t *
    646 cpr_sparse_seg_check(struct seg *seg)
    647 {
    648 	ksegtbl_entry_t *ste = &kseg_table[0];
    649 	struct seg *tseg;
    650 
    651 	for (; ste->st_seg; ste++) {
    652 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
    653 		    *ste->st_seg : (struct seg *)ste->st_seg;
    654 
    655 		if (seg == tseg)
    656 			return (ste);
    657 	}
    658 
    659 	return ((ksegtbl_entry_t *)NULL);
    660 }
    661 
    662 
    663 /*
    664  * Count pages within each kernel segment; call cpr_sparse_seg_check()
    665  * to find out whether a sparsely filled segment needs special
    666  * treatment (e.g. kvseg).
    667  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
    668  *       module shouldn't need to know segment details like if it is
    669  *       sparsely filled or not (makes kseg_table obsolete).
    670  */
    671 pgcnt_t
    672 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
    673 {
    674 	struct seg *segp;
    675 	pgcnt_t pages;
    676 	ksegtbl_entry_t *ste;
    677 
    678 	pages = 0;
    679 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
    680 		if (ste = cpr_sparse_seg_check(segp)) {
    681 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
    682 		} else {
    683 			pages += cpr_count_pages(segp->s_base,
    684 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
    685 		}
    686 	}
    687 
    688 	return (pages);
    689 }
    690 
    691 
    692 /*
    693  * count kernel pages within kas and any special ranges
    694  */
    695 pgcnt_t
    696 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
    697 {
    698 	pgcnt_t kas_cnt;
    699 
    700 	/*
    701 	 * Some pages need to be taken care of differently.
    702 	 * eg: panicbuf pages of sun4m are not in kas but they need
    703 	 * to be saved.  On sun4u, the physical pages of panicbuf are
    704 	 * allocated via prom_retain().
    705 	 */
    706 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
    707 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
    708 
    709 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
    710 	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
    711 	    kas_cnt, mmu_ptob(kas_cnt));
    712 
    713 	return (kas_cnt);
    714 }
    715 
    716 
    717 /*
    718  * Set a bit corresponding to the arg phys page number;
    719  * returns 0 when the ppn is valid and the corresponding
    720  * map bit was clear, otherwise returns 1.
    721  */
    722 int
    723 cpr_setbit(pfn_t ppn, int mapflag)
    724 {
    725 	char *bitmap;
    726 	cbd_t *dp;
    727 	pfn_t rel;
    728 	int clr;
    729 
    730 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    731 		if (PPN_IN_RANGE(ppn, dp)) {
    732 			bitmap = DESC_TO_MAP(dp, mapflag);
    733 			rel = ppn - dp->cbd_spfn;
    734 			if ((clr = isclr(bitmap, rel)) != 0)
    735 				setbit(bitmap, rel);
    736 			return (clr == 0);
    737 		}
    738 	}
    739 
    740 	return (1);
    741 }
    742 
    743 
    744 /*
    745  * Clear a bit corresponding to the arg phys page number.
    746  */
    747 int
    748 cpr_clrbit(pfn_t ppn, int mapflag)
    749 {
    750 	char *bitmap;
    751 	cbd_t *dp;
    752 	pfn_t rel;
    753 	int set;
    754 
    755 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    756 		if (PPN_IN_RANGE(ppn, dp)) {
    757 			bitmap = DESC_TO_MAP(dp, mapflag);
    758 			rel = ppn - dp->cbd_spfn;
    759 			if ((set = isset(bitmap, rel)) != 0)
    760 				clrbit(bitmap, rel);
    761 			return (set == 0);
    762 		}
    763 	}
    764 
    765 	return (1);
    766 }
    767 
    768 
/*
 * Bit function that touches no bitmap: both arguments are ignored
 * and 0 is always returned.
 */
/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}
    775 
    776 
    777 /*
    778  * Lookup a bit corresponding to the arg phys page number.
    779  */
    780 int
    781 cpr_isset(pfn_t ppn, int mapflag)
    782 {
    783 	char *bitmap;
    784 	cbd_t *dp;
    785 	pfn_t rel;
    786 
    787 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    788 		if (PPN_IN_RANGE(ppn, dp)) {
    789 			bitmap = DESC_TO_MAP(dp, mapflag);
    790 			rel = ppn - dp->cbd_spfn;
    791 			return (isset(bitmap, rel));
    792 		}
    793 	}
    794 
    795 	return (0);
    796 }
    797 
    798 
/*
 * Go thru all pages and pick up any page not caught during the invalidation
 * stage. This is also used to save pages with cow lock or phys page lock held
 * (non-zero p_lckcnt or p_cowcnt)
 *
 * Every page that passes the filter below and is real memory has
 * "bitfunc" applied to it.  The return value, also stored in
 * STAT->cs_upage2statef, is the number of pages for which bitfunc
 * returned 0.
 */
static	int
cpr_count_upages(int mapflag, bitfunc_t bitfunc)
{
	page_t *pp, *page0;
	pgcnt_t dcnt = 0, tcnt = 0;
	pfn_t pfn;

	page0 = pp = page_first();

	do {
		/*
		 * skip pages with no vnode, kas pages, (sparc only) prom
		 * pages, and pages that are both free and aged
		 */
#if defined(__sparc)
		extern struct vnode prom_ppages;
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    pp->p_vnode == &prom_ppages ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#else
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    PP_ISFREE(pp) && PP_ISAGED(pp))
#endif /* __sparc */
			continue;

		pfn = page_pptonum(pp);
		if (pf_is_memory(pfn)) {
			tcnt++;
			if ((*bitfunc)(pfn, mapflag) == 0)
				dcnt++; /* dirty count */
		}
	/* the page list is circular; stop once we are back at the start */
	} while ((pp = page_next(pp)) != page0);

	STAT->cs_upage2statef = dcnt;
	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
	    dcnt, tcnt);
	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
	    dcnt, mmu_ptob(dcnt));

	return (dcnt);
}
    841 
    842 
/*
 * try compressing pages based on cflag,
 * and for DEBUG kernels, verify uncompressed data checksum;
 *
 * this routine replaces common code from
 * i_cpr_compress_and_save() and cpr_compress_and_write()
 *
 * The input is the "pages" pages currently mapped at
 * CPR->c_mapping_area.  On return dp->cpd_length, dp->cpd_flag,
 * dp->cpd_usum and dp->cpd_csum have been filled in.
 *
 * Returns a pointer to the data to be written: cpr_pagedata when
 * compression was requested and made the data smaller, otherwise the
 * uncompressed data (cpr_pagecopy on DEBUG kernels, the mapping area
 * on non-DEBUG kernels).
 */
char *
cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
{
	/* len and test_sum are only used on DEBUG kernels */
	size_t nbytes, clen, len;
	uint32_t test_sum;
	char *datap;

	nbytes = mmu_ptob(pages);

	/*
	 * set length to the original uncompressed data size;
	 * always init cpd_flag to zero
	 */
	dp->cpd_length = nbytes;
	dp->cpd_flag = 0;

#ifdef	DEBUG
	/*
	 * Make a copy of the uncompressed data so we can checksum it.
	 * Compress that copy so the checksum works at the other end
	 */
	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
	dp->cpd_flag |= CPD_USUM;
	datap = cpr_pagecopy;
#else
	datap = CPR->c_mapping_area;
	dp->cpd_usum = 0;
#endif

	/*
	 * try compressing the raw data to cpr_pagedata;
	 * if there was a size reduction: record the new length,
	 * flag the compression, and point to the compressed data.
	 */
	dp->cpd_csum = 0;
	if (cflag) {
		clen = compress(datap, cpr_pagedata, nbytes);
		if (clen < nbytes) {
			dp->cpd_flag |= CPD_COMPRESS;
			dp->cpd_length = clen;
			datap = cpr_pagedata;
#ifdef	DEBUG
			dp->cpd_csum = checksum32(datap, clen);
			dp->cpd_flag |= CPD_CSUM;

			/*
			 * decompress the data back to a scratch area
			 * and compare the new checksum with the original
			 * checksum to verify the compression.
			 */
			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
			len = decompress(datap, cpr_pagecopy,
			    clen, sizeof (cpr_pagecopy));
			test_sum = checksum32(cpr_pagecopy, len);
			ASSERT(test_sum == dp->cpd_usum);
#endif
		}
	}

	return (datap);
}
    912 
    913 
    914 /*
    915  * 1. Prepare cpr page descriptor and write it to file
    916  * 2. Compress page data and write it out
    917  */
    918 static int
    919 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
    920 {
    921 	int error = 0;
    922 	char *datap;
    923 	cpd_t cpd;	/* cpr page descriptor */
    924 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
    925 	extern void i_cpr_mapout(caddr_t, uint_t);
    926 
    927 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
    928 
    929 	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
    930 	    npg, (void *)CPR->c_mapping_area, pfn);
    931 
    932 	/*
    933 	 * Fill cpr page descriptor.
    934 	 */
    935 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
    936 	cpd.cpd_pfn = pfn;
    937 	cpd.cpd_pages = npg;
    938 
    939 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
    940 
    941 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
    942 
    943 	/* Write cpr page descriptor */
    944 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
    945 
    946 	/* Write compressed page data */
    947 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
    948 
    949 	/*
    950 	 * Unmap the pages for tlb and vac flushing
    951 	 */
    952 	i_cpr_mapout(CPR->c_mapping_area, npg);
    953 
    954 	if (error) {
    955 		CPR_DEBUG(CPR_DEBUG1,
    956 		    "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
    957 		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
    958 		    pfn, cpr_file_bn, error);
    959 	} else {
    960 		cpr_regular_pgs_dumped += npg;
    961 	}
    962 
    963 	return (error);
    964 }
    965 
    966 
    967 int
    968 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
    969 {
    970 	caddr_t	fromp = buffer;
    971 	size_t bytes, wbytes;
    972 	int error;
    973 
    974 	if (cpr_dev_space == 0) {
    975 		if (vp->v_type == VBLK) {
    976 			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
    977 			ASSERT(cpr_dev_space);
    978 		} else
    979 			cpr_dev_space = 1;	/* not used in this case */
    980 	}
    981 
    982 	/*
    983 	 * break the write into multiple part if request is large,
    984 	 * calculate count up to buf page boundary, then write it out.
    985 	 * repeat until done.
    986 	 */
    987 	while (size) {
    988 		bytes = MIN(size, cpr_buf_end - cpr_wptr);
    989 		cprbcopy(fromp, cpr_wptr, bytes);
    990 		cpr_wptr += bytes;
    991 		fromp += bytes;
    992 		size -= bytes;
    993 		if (cpr_wptr < cpr_buf_end)
    994 			return (0);	/* buffer not full yet */
    995 		ASSERT(cpr_wptr == cpr_buf_end);
    996 
    997 		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
    998 		if (vp->v_type == VBLK) {
    999 			if (wbytes > cpr_dev_space)
   1000 				return (ENOSPC);
   1001 		} else {
   1002 			if (wbytes > VTOI(vp)->i_size)
   1003 				return (ENOSPC);
   1004 		}
   1005 
   1006 		CPR_DEBUG(CPR_DEBUG3,
   1007 		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
   1008 		    (void *)fromp, (void *)cpr_wptr, bytes);
   1009 		/*
   1010 		 * cross check, this should not happen!
   1011 		 */
   1012 		if (cpr_disk_writes_ok == 0) {
   1013 			prom_printf("cpr_write: disk write too early!\n");
   1014 			return (EINVAL);
   1015 		}
   1016 
   1017 		do_polled_io = 1;
   1018 		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
   1019 		    NULL);
   1020 		do_polled_io = 0;
   1021 		CPR_DEBUG(CPR_DEBUG3, "done\n");
   1022 
   1023 		STAT->cs_real_statefsz += cpr_buf_size;
   1024 
   1025 		if (error) {
   1026 			cpr_err(CE_WARN, "cpr_write error %d", error);
   1027 			return (error);
   1028 		}
   1029 		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
   1030 		cpr_wptr = cpr_buf;		/* back to top of buffer */
   1031 	}
   1032 	return (0);
   1033 }
   1034 
   1035 
   1036 int
   1037 cpr_flush_write(vnode_t *vp)
   1038 {
   1039 	int	nblk;
   1040 	int	error;
   1041 
   1042 	/*
   1043 	 * Calculate remaining blocks in buffer, rounded up to nearest
   1044 	 * disk block
   1045 	 */
   1046 	nblk = btod(cpr_wptr - cpr_buf);
   1047 
   1048 	do_polled_io = 1;
   1049 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
   1050 	do_polled_io = 0;
   1051 
   1052 	cpr_file_bn += nblk;
   1053 	if (error)
   1054 		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
   1055 		    error);
   1056 	return (error);
   1057 }
   1058 
   1059 void
   1060 cpr_clear_bitmaps(void)
   1061 {
   1062 	cbd_t *dp;
   1063 
   1064 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
   1065 		bzero((void *)dp->cbd_reg_bitmap,
   1066 		    (size_t)dp->cbd_size * 2);
   1067 	}
   1068 	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
   1069 }
   1070 
   1071 int
   1072 cpr_contig_pages(vnode_t *vp, int flag)
   1073 {
   1074 	int chunks = 0, error = 0;
   1075 	pgcnt_t i, j, totbit;
   1076 	pfn_t spfn;
   1077 	cbd_t *dp;
   1078 	uint_t	spin_cnt = 0;
   1079 	extern	int i_cpr_compress_and_save();
   1080 
   1081 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
   1082 		spfn = dp->cbd_spfn;
   1083 		totbit = BTOb(dp->cbd_size);
   1084 		i = 0; /* Beginning of bitmap */
   1085 		j = 0;
   1086 		while (i < totbit) {
   1087 			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
   1088 				if (isset((char *)dp->cbd_reg_bitmap, j+i))
   1089 					j++;
   1090 				else /* not contiguous anymore */
   1091 					break;
   1092 			}
   1093 
   1094 			if (j) {
   1095 				chunks++;
   1096 				if (flag == SAVE_TO_STORAGE) {
   1097 					error = i_cpr_compress_and_save(
   1098 					    chunks, spfn + i, j);
   1099 					if (error)
   1100 						return (error);
   1101 				} else if (flag == WRITE_TO_STATEFILE) {
   1102 					error = cpr_compress_and_write(vp, 0,
   1103 					    spfn + i, j);
   1104 					if (error)
   1105 						return (error);
   1106 					else {
   1107 						spin_cnt++;
   1108 						if ((spin_cnt & 0x5F) == 1)
   1109 							cpr_spinning_bar();
   1110 					}
   1111 				}
   1112 			}
   1113 
   1114 			i += j;
   1115 			if (j != CPR_MAXCONTIG) {
   1116 				/* Stopped on a non-tagged page */
   1117 				i++;
   1118 			}
   1119 
   1120 			j = 0;
   1121 		}
   1122 	}
   1123 
   1124 	if (flag == STORAGE_DESC_ALLOC)
   1125 		return (chunks);
   1126 	else
   1127 		return (0);
   1128 }
   1129 
   1130 
   1131 void
   1132 cpr_show_range(caddr_t vaddr, size_t size,
   1133     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
   1134 {
   1135