Home | History | Annotate | Download | only in cpr
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Fill in and write out the cpr state file
     28  *	1. Allocate and write headers, ELF and cpr dump header
     29  *	2. Allocate bitmaps according to phys_install
     30  *	3. Tag kernel pages into corresponding bitmap
     31  *	4. Write bitmaps to state file
     32  *	5. Write actual physical page data to state file
     33  */
     34 
     35 #include <sys/types.h>
     36 #include <sys/systm.h>
     37 #include <sys/vm.h>
     38 #include <sys/memlist.h>
     39 #include <sys/kmem.h>
     40 #include <sys/vnode.h>
     41 #include <sys/fs/ufs_inode.h>
     42 #include <sys/errno.h>
     43 #include <sys/cmn_err.h>
     44 #include <sys/debug.h>
     45 #include <vm/page.h>
     46 #include <vm/seg.h>
     47 #include <vm/seg_kmem.h>
     48 #include <vm/seg_kpm.h>
     49 #include <vm/hat.h>
     50 #include <sys/cpr.h>
     51 #include <sys/conf.h>
     52 #include <sys/ddi.h>
     53 #include <sys/panic.h>
     54 #include <sys/thread.h>
     55 #include <sys/note.h>
     56 
     57 /* Local defines and variables */
     58 #define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
     59 #define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */
     60 
     61 #if defined(__sparc)
     62 static uint_t cpr_pages_tobe_dumped;
     63 static uint_t cpr_regular_pgs_dumped;
     64 static int cpr_dump_regular_pages(vnode_t *);
     65 static int cpr_count_upages(int, bitfunc_t);
     66 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
     67 #endif
     68 
     69 int cpr_flush_write(vnode_t *);
     70 
     71 int cpr_contig_pages(vnode_t *, int);
     72 
     73 void cpr_clear_bitmaps();
     74 
     75 extern size_t cpr_get_devsize(dev_t);
     76 extern int i_cpr_dump_setup(vnode_t *);
     77 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
     78 extern int cpr_test_mode;
     79 int cpr_setbit(pfn_t, int);
     80 int cpr_clrbit(pfn_t, int);
     81 
     82 ctrm_t cpr_term;
     83 
     84 char *cpr_buf, *cpr_buf_end;
     85 int cpr_buf_blocks;		/* size of cpr_buf in blocks */
     86 size_t cpr_buf_size;		/* size of cpr_buf in bytes */
     87 size_t cpr_bitmap_size;
     88 int cpr_nbitmaps;
     89 
     90 char *cpr_pagedata;		/* page buffer for compression / tmp copy */
     91 size_t cpr_pagedata_size;	/* page buffer size in bytes */
     92 
     93 #if defined(__sparc)
     94 static char *cpr_wptr;		/* keep track of where to write to next */
     95 static int cpr_file_bn;		/* cpr state-file block offset */
     96 static int cpr_disk_writes_ok;
     97 static size_t cpr_dev_space = 0;
     98 #endif
     99 
    100 char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
    101 
    102 #if defined(__sparc)
    103 /*
    104  * On some platforms bcopy may modify the thread structure
    105  * during bcopy (eg, to prevent cpu migration).  If the
    106  * range we are currently writing out includes our own
    107  * thread structure then it will be snapshotted by bcopy
    108  * including those modified members - and the updates made
    109  * on exit from bcopy will no longer be seen when we later
    110  * restore the mid-bcopy kthread_t.  So if the range we
    111  * need to copy overlaps with our thread structure we will
    112  * use a simple byte copy.
    113  */
    114 void
    115 cprbcopy(void *from, void *to, size_t bytes)
    116 {
    117 	extern int curthreadremapped;
    118 	caddr_t kthrend;
    119 
    120 	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
    121 	if (curthreadremapped || (kthrend >= (caddr_t)from &&
    122 	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
    123 		caddr_t src = from, dst = to;
    124 
    125 		while (bytes-- > 0)
    126 			*dst++ = *src++;
    127 	} else {
    128 		bcopy(from, to, bytes);
    129 	}
    130 }
    131 
    132 /*
    133  * Allocate pages for buffers used in writing out the statefile
    134  */
    135 static int
    136 cpr_alloc_bufs(void)
    137 {
    138 	char *allocerr = "Unable to allocate memory for cpr buffer";
    139 	size_t size;
    140 
    141 	/*
    142 	 * set the cpr write buffer size to at least the historic
    143 	 * size (128k) or large enough to store the both the early
    144 	 * set of statefile structures (well under 0x800) plus the
    145 	 * bitmaps, and roundup to the next pagesize.
    146 	 */
    147 	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
    148 	cpr_buf_size = MAX(size, CPRBUFSZ);
    149 	cpr_buf_blocks = btodb(cpr_buf_size);
    150 	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
    151 	if (cpr_buf == NULL) {
    152 		cpr_err(CE_WARN, allocerr);
    153 		return (ENOMEM);
    154 	}
    155 	cpr_buf_end = cpr_buf + cpr_buf_size;
    156 
    157 	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
    158 	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
    159 	if (cpr_pagedata == NULL) {
    160 		kmem_free(cpr_buf, cpr_buf_size);
    161 		cpr_buf = NULL;
    162 		cpr_err(CE_WARN, allocerr);
    163 		return (ENOMEM);
    164 	}
    165 
    166 	return (0);
    167 }
    168 
    169 
    170 /*
    171  * Set bitmap size in bytes based on phys_install.
    172  */
    173 void
    174 cpr_set_bitmap_size(void)
    175 {
    176 	struct memlist *pmem;
    177 	size_t size = 0;
    178 
    179 	memlist_read_lock();
    180 	for (pmem = phys_install; pmem; pmem = pmem->ml_next)
    181 		size += pmem->ml_size;
    182 	memlist_read_unlock();
    183 	cpr_bitmap_size = BITMAP_BYTES(size);
    184 }
    185 
    186 
    187 /*
    188  * CPR dump header contains the following information:
    189  *	1. header magic -- unique to cpr state file
    190  *	2. kernel return pc & ppn for resume
    191  *	3. current thread info
    192  *	4. debug level and test mode
    193  *	5. number of bitmaps allocated
    194  *	6. number of page records
    195  */
    196 static int
    197 cpr_write_header(vnode_t *vp)
    198 {
    199 	extern ushort_t cpr_mach_type;
    200 	struct cpr_dump_desc cdump;
    201 	pgcnt_t bitmap_pages;
    202 	pgcnt_t kpages, vpages, upages;
    203 	pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);
    204 
    205 	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
    206 	cdump.cdd_version = CPR_VERSION;
    207 	cdump.cdd_machine = cpr_mach_type;
    208 	cdump.cdd_debug = cpr_debug;
    209 	cdump.cdd_test_mode = cpr_test_mode;
    210 	cdump.cdd_bitmaprec = cpr_nbitmaps;
    211 
    212 	cpr_clear_bitmaps();
    213 
    214 	/*
    215 	 * Remember how many pages we plan to save to statefile.
    216 	 * This information will be used for sanity checks.
    217 	 * Untag those pages that will not be saved to statefile.
    218 	 */
    219 	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
    220 	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
    221 	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
    222 	cdump.cdd_dumppgsize = kpages - vpages + upages;
    223 	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
    224 	CPR_DEBUG(CPR_DEBUG7,
    225 	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
    226 	    kpages, vpages, upages, cdump.cdd_dumppgsize);
    227 
    228 	/*
    229 	 * Some pages contain volatile data (cpr_buf and storage area for
    230 	 * sensitive kpages), which are no longer needed after the statefile
    231 	 * is dumped to disk.  We have already untagged them from regular
    232 	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
    233 	 * volatile bitmaps will be claimed during resume, and the resumed
    234 	 * kernel will free them.
    235 	 */
    236 	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
    237 
    238 	bitmap_pages = mmu_btopr(cpr_bitmap_size);
    239 
    240 	/*
    241 	 * Export accurate statefile size for statefile allocation retry.
    242 	 * statefile_size = all the headers + total pages +
    243 	 * number of pages used by the bitmaps.
    244 	 * Roundup will be done in the file allocation code.
    245 	 */
    246 	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
    247 	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
    248 	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
    249 	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
    250 
    251 	/*
    252 	 * If the estimated statefile is not big enough,
    253 	 * go retry now to save un-necessary operations.
    254 	 */
    255 	if (!(CPR->c_flags & C_COMPRESSING) &&
    256 	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
    257 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
    258 			prom_printf("cpr_write_header: "
    259 			    "STAT->cs_nocomp_statefsz > "
    260 			    "STAT->cs_est_statefsz\n");
    261 		return (ENOSPC);
    262 	}
    263 
    264 	/* now write cpr dump descriptor */
    265 	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
    266 }
    267 
    268 
    269 /*
    270  * CPR dump tail record contains the following information:
    271  *	1. header magic -- unique to cpr state file
    272  *	2. all misc info that needs to be passed to cprboot or resumed kernel
    273  */
    274 static int
    275 cpr_write_terminator(vnode_t *vp)
    276 {
    277 	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
    278 	cpr_term.va = (cpr_ptr)&cpr_term;
    279 	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
    280 
    281 	/* count the last one (flush) */
    282 	cpr_term.real_statef_size = STAT->cs_real_statefsz +
    283 	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
    284 
    285 	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
    286 	    STAT->cs_real_statefsz);
    287 
    288 	cpr_tod_get(&cpr_term.tm_shutdown);
    289 
    290 	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
    291 }
    292 
    293 /*
    294  * Write bitmap descriptor array, followed by merged bitmaps.
    295  */
    296 static int
    297 cpr_write_bitmap(vnode_t *vp)
    298 {
    299 	char *rmap, *vmap, *dst, *tail;
    300 	size_t size, bytes;
    301 	cbd_t *dp;
    302 	int err;
    303 
    304 	dp = CPR->c_bmda;
    305 	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
    306 		return (err);
    307 
    308 	/*
    309 	 * merge regular and volatile bitmaps into tmp space
    310 	 * and write to disk
    311 	 */
    312 	for (; dp->cbd_size; dp++) {
    313 		rmap = (char *)dp->cbd_reg_bitmap;
    314 		vmap = (char *)dp->cbd_vlt_bitmap;
    315 		for (size = dp->cbd_size; size; size -= bytes) {
    316 			bytes = min(size, sizeof (cpr_pagecopy));
    317 			tail = &cpr_pagecopy[bytes];
    318 			for (dst = cpr_pagecopy; dst < tail; dst++)
    319 				*dst = *rmap++ | *vmap++;
    320 			if (err = cpr_write(vp, cpr_pagecopy, bytes))
    321 				break;
    322 		}
    323 	}
    324 
    325 	return (err);
    326 }
    327 
    328 
    329 static int
    330 cpr_write_statefile(vnode_t *vp)
    331 {
    332 	uint_t error = 0;
    333 	extern	int	i_cpr_check_pgs_dumped();
    334 	void flush_windows(void);
    335 	pgcnt_t spages;
    336 	char *str;
    337 
    338 	flush_windows();
    339 
    340 	/*
    341 	 * to get an accurate view of kas, we need to untag sensitive
    342 	 * pages *before* dumping them because the disk driver makes
    343 	 * allocations and changes kas along the way.  The remaining
    344 	 * pages referenced in the bitmaps are dumped out later as
    345 	 * regular kpages.
    346 	 */
    347 	str = "cpr_write_statefile:";
    348 	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
    349 	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);
    350 
    351 	/*
    352 	 * now it's OK to call a driver that makes allocations
    353 	 */
    354 	cpr_disk_writes_ok = 1;
    355 
    356 	/*
    357 	 * now write out the clean sensitive kpages
    358 	 * according to the sensitive descriptors
    359 	 */
    360 	error = i_cpr_dump_sensitive_kpages(vp);
    361 	if (error) {
    362 		CPR_DEBUG(CPR_DEBUG7,
    363 		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
    364 		return (error);
    365 	}
    366 
    367 	/*
    368 	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
    369 	 */
    370 	error = cpr_dump_regular_pages(vp);
    371 	if (error) {
    372 		CPR_DEBUG(CPR_DEBUG7,
    373 		    "%s cpr_dump_regular_pages() failed!\n", str);
    374 		return (error);
    375 	}
    376 
    377 	/*
    378 	 * sanity check to verify the right number of pages were dumped
    379 	 */
    380 	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
    381 	    cpr_regular_pgs_dumped);
    382 
    383 	if (error) {
    384 		prom_printf("\n%s page count mismatch!\n", str);
    385 #ifdef DEBUG
    386 		if (cpr_test_mode)
    387 			debug_enter(NULL);
    388 #endif
    389 	}
    390 
    391 	return (error);
    392 }
    393 #endif
    394 
    395 
    396 /*
    397  * creates the CPR state file, the following sections are
    398  * written out in sequence:
    399  *    - writes the cpr dump header
    400  *    - writes the memory usage bitmaps
    401  *    - writes the platform dependent info
    402  *    - writes the remaining user pages
    403  *    - writes the kernel pages
    404  */
    405 #if defined(__x86)
    406 	_NOTE(ARGSUSED(0))
    407 #endif
    408 int
    409 cpr_dump(vnode_t *vp)
    410 {
    411 #if defined(__sparc)
    412 	int error;
    413 
    414 	if (cpr_buf == NULL) {
    415 		ASSERT(cpr_pagedata == NULL);
    416 		if (error = cpr_alloc_bufs())
    417 			return (error);
    418 	}
    419 	/* point to top of internal buffer */
    420 	cpr_wptr = cpr_buf;
    421 
    422 	/* initialize global variables used by the write operation */
    423 	cpr_file_bn = cpr_statefile_offset();
    424 	cpr_dev_space = 0;
    425 
    426 	/* allocate bitmaps */
    427 	if (CPR->c_bmda == NULL) {
    428 		if (error = i_cpr_alloc_bitmaps()) {
    429 			cpr_err(CE_WARN, "cannot allocate bitmaps");
    430 			return (error);
    431 		}
    432 	}
    433 
    434 	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
    435 		return (error);
    436 
    437 	if (error = i_cpr_dump_setup(vp))
    438 		return (error);
    439 
    440 	/*
    441 	 * set internal cross checking; we dont want to call
    442 	 * a disk driver that makes allocations until after
    443 	 * sensitive pages are saved
    444 	 */
    445 	cpr_disk_writes_ok = 0;
    446 
    447 	/*
    448 	 * 1253112: heap corruption due to memory allocation when dumpping
    449 	 *	    statefile.
    450 	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
    451 	 * kvseg segments can be contaminated should memory allocations happen
    452 	 * during sddump, which is not supposed to happen after the system
    453 	 * is quiesced. Let's call the kernel pages that tend to be affected
    454 	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
    455 	 * will allocate some storage space to save the clean sensitive pages
    456 	 * aside before statefile dumping takes place. Since there may not be
    457 	 * much memory left at this stage, the sensitive pages will be
    458 	 * compressed before they are saved into the storage area.
    459 	 */
    460 	if (error = i_cpr_save_sensitive_kpages()) {
    461 		CPR_DEBUG(CPR_DEBUG7,
    462 		    "cpr_dump: save_sensitive_kpages failed!\n");
    463 		return (error);
    464 	}
    465 
    466 	/*
    467 	 * since all cpr allocations are done (space for sensitive kpages,
    468 	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
    469 	 * count regular and sensitive kpages.
    470 	 */
    471 	if (error = cpr_write_header(vp)) {
    472 		CPR_DEBUG(CPR_DEBUG7,
    473 		    "cpr_dump: cpr_write_header() failed!\n");
    474 		return (error);
    475 	}
    476 
    477 	if (error = i_cpr_write_machdep(vp))
    478 		return (error);
    479 
    480 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
    481 		return (error);
    482 
    483 	if (error = cpr_write_bitmap(vp))
    484 		return (error);
    485 
    486 	if (error = cpr_write_statefile(vp)) {
    487 		CPR_DEBUG(CPR_DEBUG7,
    488 		    "cpr_dump: cpr_write_statefile() failed!\n");
    489 		return (error);
    490 	}
    491 
    492 	if (error = cpr_write_terminator(vp))
    493 		return (error);
    494 
    495 	if (error = cpr_flush_write(vp))
    496 		return (error);
    497 
    498 	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
    499 		return (error);
    500 #endif
    501 
    502 	return (0);
    503 }
    504 
    505 
    506 #if defined(__sparc)
    507 /*
    508  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
    509  * a page-count from each range is accumulated at arg->pages.
    510  */
    511 static void
    512 cpr_xwalk(void *arg, void *base, size_t size)
    513 {
    514 	struct cpr_walkinfo *cwip = arg;
    515 
    516 	cwip->pages += cpr_count_pages(base, size,
    517 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
    518 	cwip->size += size;
    519 	cwip->ranges++;
    520 }
    521 
    522 /*
    523  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
    524  * a page-count from each range is accumulated at arg->pages.
    525  */
    526 static void
    527 cpr_walk(void *arg, void *base, size_t size)
    528 {
    529 	caddr_t addr = base;
    530 	caddr_t addr_end = addr + size;
    531 
    532 	/*
    533 	 * If we are about to start walking the range of addresses we
    534 	 * carved out of the kernel heap for the large page heap walk
    535 	 * heap_lp_arena to find what segments are actually populated
    536 	 */
    537 	if (SEGKMEM_USE_LARGEPAGES &&
    538 	    addr == heap_lp_base && addr_end == heap_lp_end &&
    539 	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
    540 		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
    541 	} else {
    542 		cpr_xwalk(arg, base, size);
    543 	}
    544 }
    545 
    546 
    547 /*
    548  * faster scan of kvseg using vmem_walk() to visit
    549  * allocated ranges.
    550  */
    551 pgcnt_t
    552 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
    553 {
    554 	struct cpr_walkinfo cwinfo;
    555 
    556 	bzero(&cwinfo, sizeof (cwinfo));
    557 	cwinfo.mapflag = mapflag;
    558 	cwinfo.bitfunc = bitfunc;
    559 
    560 	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
    561 
    562 	if (cpr_debug & CPR_DEBUG7) {
    563 		prom_printf("walked %d sub-ranges, total pages %ld\n",
    564 		    cwinfo.ranges, mmu_btop(cwinfo.size));
    565 		cpr_show_range(seg->s_base, seg->s_size,
    566 		    mapflag, bitfunc, cwinfo.pages);
    567 	}
    568 
    569 	return (cwinfo.pages);
    570 }
    571 
    572 
    573 /*
    574  * cpr_walk_kpm() is called for every used area within the large
    575  * segkpm virtual address window. A page-count is accumulated at
    576  * arg->pages.
    577  */
    578 static void
    579 cpr_walk_kpm(void *arg, void *base, size_t size)
    580 {
    581 	struct cpr_walkinfo *cwip = arg;
    582 
    583 	cwip->pages += cpr_count_pages(base, size,
    584 	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
    585 	cwip->size += size;
    586 	cwip->ranges++;
    587 }
    588 
    589 
    590 /*
    591  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
    592  */
    593 /*ARGSUSED*/
    594 static pgcnt_t
    595 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
    596 {
    597 	struct cpr_walkinfo cwinfo;
    598 
    599 	if (kpm_enable == 0)
    600 		return (0);
    601 
    602 	bzero(&cwinfo, sizeof (cwinfo));
    603 	cwinfo.mapflag = mapflag;
    604 	cwinfo.bitfunc = bitfunc;
    605 	hat_kpm_walk(cpr_walk_kpm, &cwinfo);
    606 
    607 	if (cpr_debug & CPR_DEBUG7) {
    608 		prom_printf("walked %d sub-ranges, total pages %ld\n",
    609 		    cwinfo.ranges, mmu_btop(cwinfo.size));
    610 		cpr_show_range(segkpm->s_base, segkpm->s_size,
    611 		    mapflag, bitfunc, cwinfo.pages);
    612 	}
    613 
    614 	return (cwinfo.pages);
    615 }
    616 
    617 
    618 /*
    619  * Sparsely filled kernel segments are registered in kseg_table for
    620  * easier lookup. See also block comment for cpr_count_seg_pages.
    621  */
    622 
    623 #define	KSEG_SEG_ADDR	0	/* address of struct seg */
    624 #define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */
    625 
    626 typedef struct {
    627 	struct seg **st_seg;		/* segment pointer or segment address */
    628 	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
    629 	int	st_addrtype;		/* address type in st_seg */
    630 } ksegtbl_entry_t;
    631 
    632 ksegtbl_entry_t kseg_table[] = {
    633 	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
    634 	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
    635 	{NULL,				0,			0}
    636 };
    637 
    638 
    639 /*
    640  * Compare seg with each entry in kseg_table; when there is a match
    641  * return the entry pointer, otherwise return NULL.
    642  */
    643 static ksegtbl_entry_t *
    644 cpr_sparse_seg_check(struct seg *seg)
    645 {
    646 	ksegtbl_entry_t *ste = &kseg_table[0];
    647 	struct seg *tseg;
    648 
    649 	for (; ste->st_seg; ste++) {
    650 		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
    651 		    *ste->st_seg : (struct seg *)ste->st_seg;
    652 
    653 		if (seg == tseg)
    654 			return (ste);
    655 	}
    656 
    657 	return ((ksegtbl_entry_t *)NULL);
    658 }
    659 
    660 
    661 /*
    662  * Count pages within each kernel segment; call cpr_sparse_seg_check()
    663  * to find out whether a sparsely filled segment needs special
    664  * treatment (e.g. kvseg).
    665  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
    666  *       module shouldn't need to know segment details like if it is
    667  *       sparsely filled or not (makes kseg_table obsolete).
    668  */
    669 pgcnt_t
    670 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
    671 {
    672 	struct seg *segp;
    673 	pgcnt_t pages;
    674 	ksegtbl_entry_t *ste;
    675 
    676 	pages = 0;
    677 	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
    678 		if (ste = cpr_sparse_seg_check(segp)) {
    679 			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
    680 		} else {
    681 			pages += cpr_count_pages(segp->s_base,
    682 			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
    683 		}
    684 	}
    685 
    686 	return (pages);
    687 }
    688 
    689 
    690 /*
    691  * count kernel pages within kas and any special ranges
    692  */
    693 pgcnt_t
    694 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
    695 {
    696 	pgcnt_t kas_cnt;
    697 
    698 	/*
    699 	 * Some pages need to be taken care of differently.
    700 	 * eg: panicbuf pages of sun4m are not in kas but they need
    701 	 * to be saved.  On sun4u, the physical pages of panicbuf are
    702 	 * allocated via prom_retain().
    703 	 */
    704 	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
    705 	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
    706 
    707 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
    708 	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
    709 	    kas_cnt, mmu_ptob(kas_cnt));
    710 
    711 	return (kas_cnt);
    712 }
    713 
    714 
    715 /*
    716  * Set a bit corresponding to the arg phys page number;
    717  * returns 0 when the ppn is valid and the corresponding
    718  * map bit was clear, otherwise returns 1.
    719  */
    720 int
    721 cpr_setbit(pfn_t ppn, int mapflag)
    722 {
    723 	char *bitmap;
    724 	cbd_t *dp;
    725 	pfn_t rel;
    726 	int clr;
    727 
    728 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    729 		if (PPN_IN_RANGE(ppn, dp)) {
    730 			bitmap = DESC_TO_MAP(dp, mapflag);
    731 			rel = ppn - dp->cbd_spfn;
    732 			if ((clr = isclr(bitmap, rel)) != 0)
    733 				setbit(bitmap, rel);
    734 			return (clr == 0);
    735 		}
    736 	}
    737 
    738 	return (1);
    739 }
    740 
    741 
    742 /*
    743  * Clear a bit corresponding to the arg phys page number.
    744  */
    745 int
    746 cpr_clrbit(pfn_t ppn, int mapflag)
    747 {
    748 	char *bitmap;
    749 	cbd_t *dp;
    750 	pfn_t rel;
    751 	int set;
    752 
    753 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    754 		if (PPN_IN_RANGE(ppn, dp)) {
    755 			bitmap = DESC_TO_MAP(dp, mapflag);
    756 			rel = ppn - dp->cbd_spfn;
    757 			if ((set = isset(bitmap, rel)) != 0)
    758 				clrbit(bitmap, rel);
    759 			return (set == 0);
    760 		}
    761 	}
    762 
    763 	return (1);
    764 }
    765 
    766 
/*
 * Bit function that leaves the bitmaps untouched: both arguments are
 * ignored and 0 is always returned.
 */
/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}
    773 
    774 
    775 /*
    776  * Lookup a bit corresponding to the arg phys page number.
    777  */
    778 int
    779 cpr_isset(pfn_t ppn, int mapflag)
    780 {
    781 	char *bitmap;
    782 	cbd_t *dp;
    783 	pfn_t rel;
    784 
    785 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
    786 		if (PPN_IN_RANGE(ppn, dp)) {
    787 			bitmap = DESC_TO_MAP(dp, mapflag);
    788 			rel = ppn - dp->cbd_spfn;
    789 			return (isset(bitmap, rel));
    790 		}
    791 	}
    792 
    793 	return (0);
    794 }
    795 
    796 
    797 /*
    798  * Go thru all pages and pick up any page not caught during the invalidation
    799  * stage. This is also used to save pages with cow lock or phys page lock held
    800  * (none zero p_lckcnt or p_cowcnt)
    801  */
    802 static	int
    803 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
    804 {
    805 	page_t *pp, *page0;
    806 	pgcnt_t dcnt = 0, tcnt = 0;
    807 	pfn_t pfn;
    808 
    809 	page0 = pp = page_first();
    810 
    811 	do {
    812 		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
    813 		    PP_ISFREE(pp) && PP_ISAGED(pp))
    814 			continue;
    815 
    816 		pfn = page_pptonum(pp);
    817 		if (pf_is_memory(pfn)) {
    818 			tcnt++;
    819 			if ((*bitfunc)(pfn, mapflag) == 0)
    820 				dcnt++; /* dirty count */
    821 		}
    822 	} while ((pp = page_next(pp)) != page0);
    823 
    824 	STAT->cs_upage2statef = dcnt;
    825 	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
    826 	    dcnt, tcnt);
    827 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
    828 	    dcnt, mmu_ptob(dcnt));
    829 	page0 = NULL; /* for Lint */
    830 	return (dcnt);
    831 }
    832 
    833 
    834 /*
    835  * try compressing pages based on cflag,
    836  * and for DEBUG kernels, verify uncompressed data checksum;
    837  *
    838  * this routine replaces common code from
    839  * i_cpr_compress_and_save() and cpr_compress_and_write()
    840  */
    841 char *
    842 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
    843 {
    844 	size_t nbytes, clen, len;
    845 	uint32_t test_sum;
    846 	char *datap;
    847 
    848 	nbytes = mmu_ptob(pages);
    849 
    850 	/*
    851 	 * set length to the original uncompressed data size;
    852 	 * always init cpd_flag to zero
    853 	 */
    854 	dp->cpd_length = nbytes;
    855 	dp->cpd_flag = 0;
    856 
    857 #ifdef	DEBUG
    858 	/*
    859 	 * Make a copy of the uncompressed data so we can checksum it.
    860 	 * Compress that copy so the checksum works at the other end
    861 	 */
    862 	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
    863 	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
    864 	dp->cpd_flag |= CPD_USUM;
    865 	datap = cpr_pagecopy;
    866 #else
    867 	datap = CPR->c_mapping_area;
    868 	dp->cpd_usum = 0;
    869 #endif
    870 
    871 	/*
    872 	 * try compressing the raw data to cpr_pagedata;
    873 	 * if there was a size reduction: record the new length,
    874 	 * flag the compression, and point to the compressed data.
    875 	 */
    876 	dp->cpd_csum = 0;
    877 	if (cflag) {
    878 		clen = compress(datap, cpr_pagedata, nbytes);
    879 		if (clen < nbytes) {
    880 			dp->cpd_flag |= CPD_COMPRESS;
    881 			dp->cpd_length = clen;
    882 			datap = cpr_pagedata;
    883 #ifdef	DEBUG
    884 			dp->cpd_csum = checksum32(datap, clen);
    885 			dp->cpd_flag |= CPD_CSUM;
    886 
    887 			/*
    888 			 * decompress the data back to a scratch area
    889 			 * and compare the new checksum with the original
    890 			 * checksum to verify the compression.
    891 			 */
    892 			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
    893 			len = decompress(datap, cpr_pagecopy,
    894 			    clen, sizeof (cpr_pagecopy));
    895 			test_sum = checksum32(cpr_pagecopy, len);
    896 			ASSERT(test_sum == dp->cpd_usum);
    897 #endif
    898 		}
    899 	}
    900 
    901 	return (datap);
    902 }
    903 
    904 
    905 /*
    906  * 1. Prepare cpr page descriptor and write it to file
    907  * 2. Compress page data and write it out
    908  */
    909 static int
    910 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
    911 {
    912 	int error = 0;
    913 	char *datap;
    914 	cpd_t cpd;	/* cpr page descriptor */
    915 	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
    916 	extern void i_cpr_mapout(caddr_t, uint_t);
    917 
    918 	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
    919 
    920 	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
    921 	    npg, (void *)CPR->c_mapping_area, pfn);
    922 
    923 	/*
    924 	 * Fill cpr page descriptor.
    925 	 */
    926 	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
    927 	cpd.cpd_pfn = pfn;
    928 	cpd.cpd_pages = npg;
    929 
    930 	STAT->cs_dumped_statefsz += mmu_ptob(npg);
    931 
    932 	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
    933 
    934 	/* Write cpr page descriptor */
    935 	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
    936 
    937 	/* Write compressed page data */
    938 	error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
    939 
    940 	/*
    941 	 * Unmap the pages for tlb and vac flushing
    942 	 */
    943 	i_cpr_mapout(CPR->c_mapping_area, npg);
    944 
    945 	if (error) {
    946 		CPR_DEBUG(CPR_DEBUG1,
    947 		    "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
    948 		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
    949 		    pfn, cpr_file_bn, error);
    950 	} else {
    951 		cpr_regular_pgs_dumped += npg;
    952 	}
    953 
    954 	return (error);
    955 }
    956 
    957 
    958 int
    959 cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
    960 {
    961 	caddr_t	fromp = buffer;
    962 	size_t bytes, wbytes;
    963 	int error;
    964 
    965 	if (cpr_dev_space == 0) {
    966 		if (vp->v_type == VBLK) {
    967 			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
    968 			ASSERT(cpr_dev_space);
    969 		} else
    970 			cpr_dev_space = 1;	/* not used in this case */
    971 	}
    972 
    973 	/*
    974 	 * break the write into multiple part if request is large,
    975 	 * calculate count up to buf page boundary, then write it out.
    976 	 * repeat until done.
    977 	 */
    978 	while (size) {
    979 		bytes = MIN(size, cpr_buf_end - cpr_wptr);
    980 		cprbcopy(fromp, cpr_wptr, bytes);
    981 		cpr_wptr += bytes;
    982 		fromp += bytes;
    983 		size -= bytes;
    984 		if (cpr_wptr < cpr_buf_end)
    985 			return (0);	/* buffer not full yet */
    986 		ASSERT(cpr_wptr == cpr_buf_end);
    987 
    988 		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
    989 		if (vp->v_type == VBLK) {
    990 			if (wbytes > cpr_dev_space)
    991 				return (ENOSPC);
    992 		} else {
    993 			if (wbytes > VTOI(vp)->i_size)
    994 				return (ENOSPC);
    995 		}
    996 
    997 		CPR_DEBUG(CPR_DEBUG3,
    998 		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
    999 		    (void *)fromp, (void *)cpr_wptr, bytes);
   1000 		/*
   1001 		 * cross check, this should not happen!
   1002 		 */
   1003 		if (cpr_disk_writes_ok == 0) {
   1004 			prom_printf("cpr_write: disk write too early!\n");
   1005 			return (EINVAL);
   1006 		}
   1007 
   1008 		do_polled_io = 1;
   1009 		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
   1010 		    NULL);
   1011 		do_polled_io = 0;
   1012 		CPR_DEBUG(CPR_DEBUG3, "done\n");
   1013 
   1014 		STAT->cs_real_statefsz += cpr_buf_size;
   1015 
   1016 		if (error) {
   1017 			cpr_err(CE_WARN, "cpr_write error %d", error);
   1018 			return (error);
   1019 		}
   1020 		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
   1021 		cpr_wptr = cpr_buf;		/* back to top of buffer */
   1022 	}
   1023 	return (0);
   1024 }
   1025 
   1026 
   1027 int
   1028 cpr_flush_write(vnode_t *vp)
   1029 {
   1030 	int	nblk;
   1031 	int	error;
   1032 
   1033 	/*
   1034 	 * Calculate remaining blocks in buffer, rounded up to nearest
   1035 	 * disk block
   1036 	 */
   1037 	nblk = btod(cpr_wptr - cpr_buf);
   1038 
   1039 	do_polled_io = 1;
   1040 	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
   1041 	do_polled_io = 0;
   1042 
   1043 	cpr_file_bn += nblk;
   1044 	if (error)
   1045 		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
   1046 		    error);
   1047 	return (error);
   1048 }
   1049 
   1050 void
   1051 cpr_clear_bitmaps(void)
   1052 {
   1053 	cbd_t *dp;
   1054 
   1055 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
   1056 		bzero((void *)dp->cbd_reg_bitmap,
   1057 		    (size_t)dp->cbd_size * 2);
   1058 	}
   1059 	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
   1060 }
   1061 
   1062 int
   1063 cpr_contig_pages(vnode_t *vp, int flag)
   1064 {
   1065 	int chunks = 0, error = 0;
   1066 	pgcnt_t i, j, totbit;
   1067 	pfn_t spfn;
   1068 	cbd_t *dp;
   1069 	uint_t	spin_cnt = 0;
   1070 	extern	int i_cpr_compress_and_save();
   1071 
   1072 	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
   1073 		spfn = dp->cbd_spfn;
   1074 		totbit = BTOb(dp->cbd_size);
   1075 		i = 0; /* Beginning of bitmap */
   1076 		j = 0;
   1077 		while (i < totbit) {
   1078 			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
   1079 				if (isset((char *)dp->cbd_reg_bitmap, j+i))
   1080 					j++;
   1081 				else /* not contiguous anymore */
   1082 					break;
   1083 			}
   1084 
   1085 			if (j) {
   1086 				chunks++;
   1087 				if (flag == SAVE_TO_STORAGE) {
   1088 					error = i_cpr_compress_and_save(
   1089 					    chunks, spfn + i, j);
   1090 					if (error)
   1091 						return (error);
   1092 				} else if (flag == WRITE_TO_STATEFILE) {
   1093 					error = cpr_compress_and_write(vp, 0,
   1094 					    spfn + i, j);
   1095 					if (error)
   1096 						return (error);
   1097 					else {
   1098 						spin_cnt++;
   1099 						if ((spin_cnt & 0x5F) == 1)
   1100 							cpr_spinning_bar();
   1101 					}
   1102 				}
   1103 			}
   1104 
   1105 			i += j;
   1106 			if (j != CPR_MAXCONTIG) {
   1107 				/* Stopped on a non-tagged page */
   1108 				i++;
   1109 			}
   1110 
   1111 			j = 0;
   1112 		}
   1113 	}
   1114 
   1115 	if (flag == STORAGE_DESC_ALLOC)
   1116 		return (chunks);
   1117 	else
   1118 		return (0);
   1119 }
   1120 
   1121 
   1122 void
   1123 cpr_show_range(caddr_t vaddr, size_t size,
   1124     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
   1125 {
   1126 	char *action, *bname;
   1127 
   1128 	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
   1129 	if (bitfunc == cpr_setbit)
   1130 		action = "tag";
   1131 	else if (bitfunc == cpr_clrbit)
   1132 		action = "untag";
   1133 	else
   1134 		action = "none";
   1135 	prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
   1136 	    (void *)vaddr, (void *)(vaddr + size), bname, action, count);
   1137 }
   1138 
   1139 
   1140 pgcnt_t
   1141 cpr_count_pages(caddr_t sva, size_t size,
   1142     int mapflag, bitfunc_t bitfunc, int showrange)
   1143 {
   1144 	caddr_t	va, eva;
   1145 	pfn_t pfn;
   1146 	pgcnt_t count = 0;
   1147 
   1148 	eva = sva + PAGE_ROUNDUP(size);
   1149 	for (va = sva; va < eva; va += MMU_PAGESIZE) {
   1150 		pfn = va_to_pfn(va);
   1151 		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
   1152 			if ((*bitfunc)(pfn, mapflag) == 0)
   1153 				count++;
   1154 		}
   1155 	}
   1156 
   1157 	if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
   1158 		cpr_show_range(sva, size, mapflag, bitfunc, count);
   1159 
   1160 	return (count);
   1161 }
   1162 
   1163 
   1164 pgcnt_t
   1165 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
   1166 {
   1167 	pgcnt_t count = 0;
   1168 
   1169 	if (cpr_buf) {
   1170 		count += cpr_count_pages(cpr_buf, cpr_buf_size,
   1171 		    mapflag, bitfunc, DBG_SHOWRANGE);
   1172 	}
   1173 	if (cpr_pagedata) {
   1174 		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
   1175 		    mapflag, bitfunc, DBG_SHOWRANGE);
   1176 	}
   1177 	count += i_cpr_count_storage_pages(mapflag, bitfunc);
   1178 
   1179 	CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
   1180 	    count, mmu_ptob(count));
   1181 	return (count);
   1182 }
   1183 
   1184 
   1185 static int
   1186 cpr_dump_regular_pages(vnode_t *vp)
   1187 {
   1188 	int error;
   1189 
   1190 	cpr_regular_pgs_dumped = 0;
   1191 	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
   1192 	if (!error)
   1193 		CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
   1194 	return (error);
   1195 }
   1196 #endif
   1197