Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/cmn_err.h>
     28 #include <sys/vmem.h>
     29 #include <sys/kmem.h>
     30 #include <sys/systm.h>
     31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
     32 #include <sys/errno.h>
     33 #include <sys/memnode.h>
     34 #include <sys/memlist.h>
     35 #include <sys/memlist_impl.h>
     36 #include <sys/tuneable.h>
     37 #include <sys/proc.h>
     38 #include <sys/disp.h>
     39 #include <sys/debug.h>
     40 #include <sys/vm.h>
     41 #include <sys/callb.h>
     42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
     43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
     44 #include <sys/dumphdr.h>	/* for dump_resize() */
     45 #include <sys/atomic.h>		/* for use in stats collection */
     46 #include <sys/rwlock.h>
     47 #include <sys/cpuvar.h>
     48 #include <vm/seg_kmem.h>
     49 #include <vm/seg_kpm.h>
     50 #include <vm/page.h>
     51 #include <vm/vm_dep.h>
     52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
     53 #include <sys/sunddi.h>
     54 #include <sys/mem_config.h>
     55 #include <sys/mem_cage.h>
     56 #include <sys/lgrp.h>
     57 #include <sys/ddi.h>
     58 #include <sys/modctl.h>
     59 
     60 extern struct memlist *phys_avail;
     61 
     62 extern void mem_node_add(pfn_t, pfn_t);
     63 extern void mem_node_del(pfn_t, pfn_t);
     64 
     65 extern uint_t page_ctrs_adjust(int);
     66 void page_ctrs_cleanup(void);
     67 static void kphysm_setup_post_add(pgcnt_t);
     68 static int kphysm_setup_pre_del(pgcnt_t);
     69 static void kphysm_setup_post_del(pgcnt_t, int);
     70 
     71 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
     72 
     73 static int delspan_reserve(pfn_t, pgcnt_t);
     74 static void delspan_unreserve(pfn_t, pgcnt_t);
     75 
     76 kmutex_t memseg_lists_lock;
     77 struct memseg *memseg_va_avail;
     78 struct memseg *memseg_alloc(void);
     79 static struct memseg *memseg_delete_junk;
     80 static struct memseg *memseg_edit_junk;
     81 void memseg_remap_init(void);
     82 static void memseg_remap_to_dummy(struct memseg *);
     83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
     84 static struct memseg *memseg_reuse(pgcnt_t);
     85 
     86 static struct kmem_cache *memseg_cache;
     87 
     88 /*
     89  * Interfaces to manage externally allocated
     90  * page_t memory (metadata) for a memseg.
     91  */
     92 #pragma weak	memseg_alloc_meta
     93 #pragma weak	memseg_free_meta
     94 #pragma weak	memseg_get_metapfn
     95 #pragma weak	memseg_remap_meta
     96 
     97 extern int ppvm_enable;
     98 extern page_t *ppvm_base;
     99 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
    100 extern void memseg_free_meta(void *, pgcnt_t);
    101 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
    102 extern void memseg_remap_meta(struct memseg *);
    103 static int memseg_is_dynamic(struct memseg *);
    104 static int memseg_includes_meta(struct memseg *);
    105 pfn_t memseg_get_start(struct memseg *);
    106 static void memseg_cpu_vm_flush(void);
    107 
    108 int meta_alloc_enable;
    109 
    110 /*
    111  * Add a chunk of memory to the system.
    112  * base: starting PAGESIZE page of new memory.
    113  * npgs: length in PAGESIZE pages.
    114  *
    115  * Adding mem this way doesn't increase the size of the hash tables;
    116  * growing them would be too hard.  This should be OK, but adding memory
    117  * dynamically most likely means more hash misses, since the tables will
    118  * be smaller than they otherwise would be.
    119  */
    120 #ifdef	DEBUG
    121 static int memseg_debug;
    122 #define	MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
    123 #else
    124 #define	MEMSEG_DEBUG(...)
    125 #endif
    126 
    127 int
    128 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
    129 {
    130 	page_t *pp;
    131 	page_t		*opp, *oepp, *segpp;
    132 	struct memseg	*seg;
    133 	uint64_t	avmem;
    134 	pfn_t		pfn;
    135 	pfn_t		pt_base = base;
    136 	pgcnt_t		tpgs = npgs;
    137 	pgcnt_t		metapgs = 0;
    138 	int		exhausted;
    139 	pfn_t		pnum;
    140 	int		mnode;
    141 	caddr_t		vaddr;
    142 	int		reuse;
    143 	int		mlret;
    144 	int		rv;
    145 	int		flags;
    146 	int		meta_alloc = 0;
    147 	void		*mapva;
    148 	void		*metabase = (void *)base;
    149 	pgcnt_t		nkpmpgs = 0;
    150 	offset_t	kpm_pages_off;
    151 
    152 	cmn_err(CE_CONT,
    153 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
    154 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
    155 
    156 	/*
    157 	 * Add this span in the delete list to prevent interactions.
    158 	 */
    159 	if (!delspan_reserve(base, npgs)) {
    160 		return (KPHYSM_ESPAN);
    161 	}
    162 	/*
    163 	 * Check to see if any of the memory span has been added
    164 	 * by trying an add to the installed memory list. This
    165 	 * forms the interlocking process for add.
    166 	 */
    167 
    168 	memlist_write_lock();
    169 
    170 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
    171 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    172 
    173 	if (mlret == MEML_SPANOP_OK)
    174 		installed_top_size(phys_install, &physmax, &physinstalled);
    175 
    176 	memlist_write_unlock();
    177 
    178 	if (mlret != MEML_SPANOP_OK) {
    179 		if (mlret == MEML_SPANOP_EALLOC) {
    180 			delspan_unreserve(pt_base, tpgs);
    181 			return (KPHYSM_ERESOURCE);
    182 		} else if (mlret == MEML_SPANOP_ESPAN) {
    183 			delspan_unreserve(pt_base, tpgs);
    184 			return (KPHYSM_ESPAN);
    185 		} else {
    186 			delspan_unreserve(pt_base, tpgs);
    187 			return (KPHYSM_ERESOURCE);
    188 		}
    189 	}
    190 
    191 	if (meta_alloc_enable) {
    192 		/*
    193 		 * Allocate the page_t's from existing memory;
    194 		 * if that fails, allocate from the incoming memory.
    195 		 */
    196 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
    197 		if (rv == KPHYSM_OK) {
    198 			ASSERT(metapgs);
    199 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
    200 			meta_alloc = 1;
    201 			goto mapalloc;
    202 		}
    203 	}
    204 
    205 	/*
    206 	 * We store the page_t's for this new memory in the first
    207 	 * few pages of the chunk. Here, we go and get'em ...
    208 	 */
    209 
    210 	/*
    211 	 * The expression after the '-' gives the number of pages
    212 	 * that will fit in the new memory based on a requirement
    213 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
    214 	 */
    215 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
    216 	    (PAGESIZE + sizeof (page_t)));
    217 
    218 	npgs -= metapgs;
    219 	base += metapgs;
    220 
    221 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
    222 
    223 	exhausted = (metapgs == 0 || npgs == 0);
    224 
    225 	if (kpm_enable && !exhausted) {
    226 		pgcnt_t start, end, nkpmpgs_prelim;
    227 		size_t	ptsz;
    228 
    229 		/*
    230 		 * A viable kpm large page mapping must not overlap two
    231 		 * dynamic memsegs. Therefore the total size is checked
    232 		 * to be at least kpm_pgsz and also whether start and end
    233 		 * points are at least kpm_pgsz aligned.
    234 		 */
    235 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
    236 		    pmodkpmp(base + npgs)) {
    237 
    238 			kphysm_addmem_error_undospan(pt_base, tpgs);
    239 
    240 			/*
    241 			 * There is no specific error code for violating
    242 			 * kpm granularity constraints.
    243 			 */
    244 			return (KPHYSM_ENOTVIABLE);
    245 		}
    246 
    247 		start = kpmptop(ptokpmp(base));
    248 		end = kpmptop(ptokpmp(base + npgs));
    249 		nkpmpgs_prelim = ptokpmp(end - start);
    250 		ptsz = npgs * sizeof (page_t);
    251 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
    252 		exhausted = (tpgs <= metapgs);
    253 		if (!exhausted) {
    254 			npgs = tpgs - metapgs;
    255 			base = pt_base + metapgs;
    256 
    257 			/* final nkpmpgs */
    258 			start = kpmptop(ptokpmp(base));
    259 			nkpmpgs = ptokpmp(end - start);
    260 			kpm_pages_off = ptsz +
    261 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
    262 		}
    263 	}
    264 
    265 	/*
    266 	 * Is memory area supplied too small?
    267 	 */
    268 	if (exhausted) {
    269 		kphysm_addmem_error_undospan(pt_base, tpgs);
    270 		/*
    271 		 * There is no specific error code for 'too small'.
    272 		 */
    273 		return (KPHYSM_ERESOURCE);
    274 	}
    275 
    276 mapalloc:
    277 	/*
    278 	 * We may re-use a previously allocated VA space for the page_ts
    279 	 * eventually, but we need to initialize and lock the pages first.
    280 	 */
    281 
    282 	/*
    283 	 * Get an address in the kernel address map, map
    284 	 * the page_t pages and see if we can touch them.
    285 	 */
    286 
    287 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
    288 	if (mapva == NULL) {
    289 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
    290 		    " Can't allocate VA for page_ts");
    291 
    292 		if (meta_alloc)
    293 			memseg_free_meta(metabase, metapgs);
    294 		kphysm_addmem_error_undospan(pt_base, tpgs);
    295 
    296 		return (KPHYSM_ERESOURCE);
    297 	}
    298 	pp = mapva;
    299 
    300 	if (physmax < (pt_base + tpgs))
    301 		physmax = (pt_base + tpgs);
    302 
    303 	/*
    304 	 * In the remapping code we map one page at a time so we must do
    305 	 * the same here to match mapping sizes.
    306 	 */
    307 	pfn = pt_base;
    308 	vaddr = (caddr_t)pp;
    309 	for (pnum = 0; pnum < metapgs; pnum++) {
    310 		if (meta_alloc)
    311 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
    312 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    313 		    PROT_READ | PROT_WRITE,
    314 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
    315 		pfn++;
    316 		vaddr += ptob(1);
    317 	}
    318 
    319 	if (ddi_peek32((dev_info_t *)NULL,
    320 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
    321 
    322 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
    323 		    " Can't access pp array at 0x%p [phys 0x%lx]",
    324 		    (void *)pp, pt_base);
    325 
    326 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    327 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    328 
    329 		vmem_free(heap_arena, mapva, ptob(metapgs));
    330 		if (meta_alloc)
    331 			memseg_free_meta(metabase, metapgs);
    332 		kphysm_addmem_error_undospan(pt_base, tpgs);
    333 
    334 		return (KPHYSM_EFAULT);
    335 	}
    336 
    337 	/*
    338 	 * Add this memory slice to its memory node translation.
    339 	 *
    340 	 * Note that right now, each node may have only one slice;
    341 	 * this may change with COD or in larger SSM systems with
    342 	 * nested latency groups, so we must not assume that the
    343 	 * node does not yet exist.
    344 	 */
    345 	pnum = pt_base + tpgs - 1;
    346 	mem_node_add_range(pt_base, pnum);
    347 
    348 	/*
    349 	 * Allocate or resize page counters as necessary to accommodate
    350 	 * the increase in memory pages.
    351 	 */
    352 	mnode = PFN_2_MEM_NODE(pnum);
    353 	PAGE_CTRS_ADJUST(base, npgs, rv);
    354 	if (rv) {
    355 
    356 		mem_node_del_range(pt_base, pnum);
    357 
    358 		/* cleanup the  page counters */
    359 		page_ctrs_cleanup();
    360 
    361 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    362 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    363 
    364 		vmem_free(heap_arena, mapva, ptob(metapgs));
    365 		if (meta_alloc)
    366 			memseg_free_meta(metabase, metapgs);
    367 		kphysm_addmem_error_undospan(pt_base, tpgs);
    368 
    369 		return (KPHYSM_ERESOURCE);
    370 	}
    371 
    372 	/*
    373 	 * Update the phys_avail memory list.
    374 	 * The phys_install list was done at the start.
    375 	 */
    376 
    377 	memlist_write_lock();
    378 
    379 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
    380 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
    381 	ASSERT(mlret == MEML_SPANOP_OK);
    382 
    383 	memlist_write_unlock();
    384 
    385 	/* See if we can find a memseg to re-use. */
    386 	if (meta_alloc) {
    387 		seg = memseg_reuse(0);
    388 		reuse = 1;	/* force unmapping of temp mapva */
    389 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
    390 		/*
    391 		 * There is a 1:1 fixed relationship between a pfn
    392 		 * and a page_t VA.  The pfn is used as an index into
    393 		 * the ppvm_base page_t table in order to calculate
    394 		 * the page_t base address for a given pfn range.
    395 		 */
    396 		segpp = ppvm_base + base;
    397 	} else {
    398 		seg = memseg_reuse(metapgs);
    399 		reuse = (seg != NULL);
    400 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
    401 		segpp = pp;
    402 	}
    403 
    404 	/*
    405 	 * Initialize the memseg structure representing this memory
    406 	 * and add it to the existing list of memsegs. Do some basic
    407 	 * initialization and add the memory to the system.
    408 	 * In order to prevent lock deadlocks, the add_physmem()
    409 	 * code is repeated here, but split into several stages.
    410 	 *
    411 	 * If a memseg is reused, invalidate memseg pointers in
    412 	 * all cpu vm caches.  We need to do this this since the check
    413 	 * 	pp >= seg->pages && pp < seg->epages
    414 	 * used in various places is not atomic and so the first compare
    415 	 * can happen before reuse and the second compare after reuse.
    416 	 * The invalidation ensures that a memseg is not deferenced while
    417 	 * it's page/pfn pointers are changing.
    418 	 */
    419 	if (seg == NULL) {
    420 		seg = memseg_alloc();
    421 		ASSERT(seg != NULL);
    422 		seg->msegflags = flags;
    423 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
    424 		    (void *)seg, (void *)(seg->pages));
    425 		seg->pages = segpp;
    426 	} else {
    427 		ASSERT(seg->msegflags == flags);
    428 		ASSERT(seg->pages_base == seg->pages_end);
    429 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
    430 		    (void *)seg, (void *)(seg->pages));
    431 		if (meta_alloc) {
    432 			memseg_cpu_vm_flush();
    433 			seg->pages = segpp;
    434 		}
    435 	}
    436 
    437 	seg->epages = seg->pages + npgs;
    438 	seg->pages_base = base;
    439 	seg->pages_end = base + npgs;
    440 
    441 	/*
    442 	 * Initialize metadata. The page_ts are set to locked state
    443 	 * ready to be freed.
    444 	 */
    445 	bzero((caddr_t)pp, ptob(metapgs));
    446 
    447 	pfn = seg->pages_base;
    448 	/* Save the original pp base in case we reuse a memseg. */
    449 	opp = pp;
    450 	oepp = opp + npgs;
    451 	for (pp = opp; pp < oepp; pp++) {
    452 		pp->p_pagenum = pfn;
    453 		pfn++;
    454 		page_iolock_init(pp);
    455 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
    456 			continue;
    457 		pp->p_offset = (u_offset_t)-1;
    458 	}
    459 
    460 	if (reuse) {
    461 		/* Remap our page_ts to the re-used memseg VA space. */
    462 		pfn = pt_base;
    463 		vaddr = (caddr_t)seg->pages;
    464 		for (pnum = 0; pnum < metapgs; pnum++) {
    465 			if (meta_alloc)
    466 				pfn = memseg_get_metapfn(metabase,
    467 				    (pgcnt_t)pnum);
    468 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    469 			    PROT_READ | PROT_WRITE,
    470 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
    471 			pfn++;
    472 			vaddr += ptob(1);
    473 		}
    474 
    475 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
    476 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    477 
    478 		vmem_free(heap_arena, mapva, ptob(metapgs));
    479 	}
    480 
    481 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
    482 
    483 	memsegs_lock(1);
    484 
    485 	/*
    486 	 * The new memseg is inserted at the beginning of the list.
    487 	 * Not only does this save searching for the tail, but in the
    488 	 * case of a re-used memseg, it solves the problem of what
    489 	 * happens if some process has still got a pointer to the
    490 	 * memseg and follows the next pointer to continue traversing
    491 	 * the memsegs list.
    492 	 */
    493 
    494 	hat_kpm_addmem_mseg_insert(seg);
    495 
    496 	seg->next = memsegs;
    497 	membar_producer();
    498 
    499 	hat_kpm_addmem_memsegs_update(seg);
    500 
    501 	memsegs = seg;
    502 
    503 	build_pfn_hash();
    504 
    505 	total_pages += npgs;
    506 
    507 	/*
    508 	 * Recalculate the paging parameters now total_pages has changed.
    509 	 * This will also cause the clock hands to be reset before next use.
    510 	 */
    511 	setupclock(1);
    512 
    513 	memsegs_unlock(1);
    514 
    515 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
    516 
    517 	/*
    518 	 * Free the pages outside the lock to avoid locking loops.
    519 	 */
    520 	for (pp = seg->pages; pp < seg->epages; pp++) {
    521 		page_free(pp, 1);
    522 	}
    523 
    524 	/*
    525 	 * Now that we've updated the appropriate memory lists we
    526 	 * need to reset a number of globals, since we've increased memory.
    527 	 * Several have already been updated for us as noted above. The
    528 	 * globals we're interested in at this point are:
    529 	 *   physmax - highest page frame number.
    530 	 *   physinstalled - number of pages currently installed (done earlier)
    531 	 *   maxmem - max free pages in the system
    532 	 *   physmem - physical memory pages available
    533 	 *   availrmem - real memory available
    534 	 */
    535 
    536 	mutex_enter(&freemem_lock);
    537 	maxmem += npgs;
    538 	physmem += npgs;
    539 	availrmem += npgs;
    540 	availrmem_initial += npgs;
    541 
    542 	mutex_exit(&freemem_lock);
    543 
    544 	dump_resize();
    545 
    546 	page_freelist_coalesce_all(mnode);
    547 
    548 	kphysm_setup_post_add(npgs);
    549 
    550 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
    551 	    "(0x%" PRIx64 ")\n",
    552 	    physinstalled << (PAGESHIFT - 10),
    553 	    (uint64_t)physinstalled << PAGESHIFT);
    554 
    555 	avmem = (uint64_t)freemem << PAGESHIFT;
    556 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
    557 	    "avail mem = %" PRId64 "\n", avmem);
    558 
    559 	/*
    560 	 * Update lgroup generation number on single lgroup systems
    561 	 */
    562 	if (nlgrps == 1)
    563 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
    564 
    565 	delspan_unreserve(pt_base, tpgs);
    566 	return (KPHYSM_OK);		/* Successfully added system memory */
    567 
    568 }
    569 
    570 /*
    571  * There are various error conditions in kphysm_add_memory_dynamic()
    572  * which require a rollback of already changed global state.
    573  */
    574 static void
    575 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
    576 {
    577 	int mlret;
    578 
    579 	/* Unreserve memory span. */
    580 	memlist_write_lock();
    581 
    582 	mlret = memlist_delete_span(
    583 	    (uint64_t)(pt_base) << PAGESHIFT,
    584 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    585 
    586 	ASSERT(mlret == MEML_SPANOP_OK);
    587 	phys_install_has_changed();
    588 	installed_top_size(phys_install, &physmax, &physinstalled);
    589 
    590 	memlist_write_unlock();
    591 	delspan_unreserve(pt_base, tpgs);
    592 }
    593 
    594 /*
    595  * Only return an available memseg of exactly the right size
    596  * if size is required.
    597  * When the meta data area has it's own virtual address space
    598  * we will need to manage this more carefully and do best fit
    599  * allocations, possibly splitting an available area.
    600  */
    601 struct memseg *
    602 memseg_reuse(pgcnt_t metapgs)
    603 {
    604 	int type;
    605 	struct memseg **segpp, *seg;
    606 
    607 	mutex_enter(&memseg_lists_lock);
    608 
    609 	segpp = &memseg_va_avail;
    610 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
    611 		caddr_t end;
    612 
    613 		/*
    614 		 * Make sure we are reusing the right segment type.
    615 		 */
    616 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
    617 
    618 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
    619 		    != type)
    620 			continue;
    621 
    622 		if (kpm_enable)
    623 			end = hat_kpm_mseg_reuse(seg);
    624 		else
    625 			end = (caddr_t)seg->epages;
    626 
    627 		/*
    628 		 * Check for the right size if it is provided.
    629 		 */
    630 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
    631 			*segpp = seg->lnext;
    632 			seg->lnext = NULL;
    633 			break;
    634 		}
    635 	}
    636 	mutex_exit(&memseg_lists_lock);
    637 
    638 	return (seg);
    639 }
    640 
    641 static uint_t handle_gen;
    642 
/*
 * One contiguous span of page frames that is part of an add or delete
 * operation.  Spans are kept on singly linked lists through mds_next.
 */
struct memdelspan {
	/* next span on the list, or NULL */
	struct memdelspan *mds_next;
	/* first page frame number of the span */
	pfn_t		mds_base;
	/* number of pages in the span */
	pgcnt_t		mds_npgs;
	/*
	 * NOTE(review): presumably per-page bitmaps sized with
	 * MDS_BITMAPBYTES(); their use is not visible here - confirm.
	 */
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one uint_t bitmap word (NBBY bits per byte). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Size in bytes of a bitmap with one bit per page of the span, rounded
 * up to a whole number of uint_t words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
    654 
/*
 * A set of spans belonging to one add/delete operation.  The sets are
 * chained through trl_next from transit_list_head.
 */
struct transit_list {
	/* next transit list on the global chain */
	struct transit_list	*trl_next;
	/* spans owned by this list, linked through mds_next */
	struct memdelspan	*trl_spans;
	/* NOTE(review): meaning not visible in this part of the file */
	int			trl_collect;
};

/*
 * Head of the chain of transit lists; trh_lock is held while the chain
 * and the span lists hanging off it are searched or changed.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};
    665 
    666 static struct transit_list_head transit_list_head;
    667 
    668 struct mem_handle;
    669 static void transit_list_collect(struct mem_handle *, int);
    670 static void transit_list_insert(struct transit_list *);
    671 static void transit_list_remove(struct transit_list *);
    672 
    673 #ifdef DEBUG
    674 #define	MEM_DEL_STATS
    675 #endif /* DEBUG */
    676 
    677 #ifdef MEM_DEL_STATS
    678 static int mem_del_stat_print = 0;
/*
 * Statistics gathered for a memory delete when MEM_DEL_STATS is defined.
 * One instance is embedded in each mem_handle (mh_delstat) and the
 * fields are updated through the MDSTAT_INCR(), MDSTAT_TOTAL() and
 * MDSTAT_PGRP() macros.
 * NOTE(review): the meaning of the individual counters is defined by
 * the code that increments them, which is not visible here.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	/* tick totals accumulated by MDSTAT_TOTAL() and MDSTAT_PGRP() */
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
    712 /*
    713  * The stat values are only incremented in the delete thread
    714  * so no locking or atomic required.
    715  */
    716 #define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
    717 #define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
    718 #define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
    719 static void mem_del_stat_print_func(struct mem_handle *);
    720 #define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
    721 #else /* MEM_DEL_STATS */
    722 #define	MDSTAT_INCR(MHP, FLD)
    723 #define	MDSTAT_TOTAL(MHP, ntck)
    724 #define	MDSTAT_PGRP(MHP, ntck)
    725 #define	MDSTAT_PRINT(MHP)
    726 #endif /* MEM_DEL_STATS */
    727 
/*
 * State of a mem_handle.  MHND_FREE is 0, so a handle that has just been
 * zero-allocated is in the FREE state; kphysm_del_gethandle() moves it
 * to MHND_INIT before handing it out.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;
    730 
/*
 * Internal representation of a memory delete handle.
 *
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	/* link on the mem_handle_head list (mem_handle_list_mutex) */
	struct mem_handle *mh_next;
	/* external handle value given to callers, taken from handle_gen */
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	/* NOTE(review): presumably this handle's set of spans - confirm */
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	/* NOTE(review): presumably the completion callback and its argument */
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	/* per-handle delete statistics, see the MDSTAT_* macros */
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};
    756 
    757 static struct mem_handle *mem_handle_head;
    758 static kmutex_t mem_handle_list_mutex;
    759 
    760 static struct mem_handle *
    761 kphysm_allocate_mem_handle()
    762 {
    763 	struct mem_handle *mhp;
    764 
    765 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
    766 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
    767 	mutex_enter(&mem_handle_list_mutex);
    768 	mutex_enter(&mhp->mh_mutex);
    769 	/* handle_gen is protected by list mutex. */
    770 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
    771 	mhp->mh_next = mem_handle_head;
    772 	mem_handle_head = mhp;
    773 	mutex_exit(&mem_handle_list_mutex);
    774 
    775 	return (mhp);
    776 }
    777 
    778 static void
    779 kphysm_free_mem_handle(struct mem_handle *mhp)
    780 {
    781 	struct mem_handle **mhpp;
    782 
    783 	ASSERT(mutex_owned(&mhp->mh_mutex));
    784 	ASSERT(mhp->mh_state == MHND_FREE);
    785 	/*
    786 	 * Exit the mutex to preserve locking order. This is OK
    787 	 * here as once in the FREE state, the handle cannot
    788 	 * be found by a lookup.
    789 	 */
    790 	mutex_exit(&mhp->mh_mutex);
    791 
    792 	mutex_enter(&mem_handle_list_mutex);
    793 	mhpp = &mem_handle_head;
    794 	while (*mhpp != NULL && *mhpp != mhp)
    795 		mhpp = &(*mhpp)->mh_next;
    796 	ASSERT(*mhpp == mhp);
    797 	/*
    798 	 * No need to lock the handle (mh_mutex) as only
    799 	 * mh_next changing and this is the only thread that
    800 	 * can be referncing mhp.
    801 	 */
    802 	*mhpp = mhp->mh_next;
    803 	mutex_exit(&mem_handle_list_mutex);
    804 
    805 	mutex_destroy(&mhp->mh_mutex);
    806 	kmem_free(mhp, sizeof (struct mem_handle));
    807 }
    808 
    809 /*
    810  * This function finds the internal mem_handle corresponding to an
    811  * external handle and returns it with the mh_mutex held.
    812  */
    813 static struct mem_handle *
    814 kphysm_lookup_mem_handle(memhandle_t handle)
    815 {
    816 	struct mem_handle *mhp;
    817 
    818 	mutex_enter(&mem_handle_list_mutex);
    819 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
    820 		if (mhp->mh_exthandle == handle) {
    821 			mutex_enter(&mhp->mh_mutex);
    822 			/*
    823 			 * The state of the handle could have been changed
    824 			 * by kphysm_del_release() while waiting for mh_mutex.
    825 			 */
    826 			if (mhp->mh_state == MHND_FREE) {
    827 				mutex_exit(&mhp->mh_mutex);
    828 				continue;
    829 			}
    830 			break;
    831 		}
    832 	}
    833 	mutex_exit(&mem_handle_list_mutex);
    834 	return (mhp);
    835 }
    836 
    837 int
    838 kphysm_del_gethandle(memhandle_t *xmhp)
    839 {
    840 	struct mem_handle *mhp;
    841 
    842 	mhp = kphysm_allocate_mem_handle();
    843 	/*
    844 	 * The handle is allocated using KM_SLEEP, so cannot fail.
    845 	 * If the implementation is changed, the correct error to return
    846 	 * here would be KPHYSM_ENOHANDLES.
    847 	 */
    848 	ASSERT(mhp->mh_state == MHND_FREE);
    849 	mhp->mh_state = MHND_INIT;
    850 	*xmhp = mhp->mh_exthandle;
    851 	mutex_exit(&mhp->mh_mutex);
    852 	return (KPHYSM_OK);
    853 }
    854 
    855 static int
    856 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
    857 {
    858 	pfn_t e1, e2;
    859 
    860 	e1 = b1 + l1;
    861 	e2 = b2 + l2;
    862 
    863 	return (!(b2 >= e1 || b1 >= e2));
    864 }
    865 
    866 static int can_remove_pgs(pgcnt_t);
    867 
    868 static struct memdelspan *
    869 span_to_install(pfn_t base, pgcnt_t npgs)
    870 {
    871 	struct memdelspan *mdsp;
    872 	struct memdelspan *mdsp_new;
    873 	uint64_t address, size, thislen;
    874 	struct memlist *mlp;
    875 
    876 	mdsp_new = NULL;
    877 
    878 	address = (uint64_t)base << PAGESHIFT;
    879 	size = (uint64_t)npgs << PAGESHIFT;
    880 	while (size != 0) {
    881 		memlist_read_lock();
    882 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
    883 			if (address >= (mlp->address + mlp->size))
    884 				continue;
    885 			if ((address + size) > mlp->address)
    886 				break;
    887 		}
    888 		if (mlp == NULL) {
    889 			address += size;
    890 			size = 0;
    891 			thislen = 0;
    892 		} else {
    893 			if (address < mlp->address) {
    894 				size -= (mlp->address - address);
    895 				address = mlp->address;
    896 			}
    897 			ASSERT(address >= mlp->address);
    898 			if ((address + size) > (mlp->address + mlp->size)) {
    899 				thislen = mlp->size - (address - mlp->address);
    900 			} else {
    901 				thislen = size;
    902 			}
    903 		}
    904 		memlist_read_unlock();
    905 		/* TODO: phys_install could change now */
    906 		if (thislen == 0)
    907 			continue;
    908 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
    909 		mdsp->mds_base = btop(address);
    910 		mdsp->mds_npgs = btop(thislen);
    911 		mdsp->mds_next = mdsp_new;
    912 		mdsp_new = mdsp;
    913 		address += thislen;
    914 		size -= thislen;
    915 	}
    916 	return (mdsp_new);
    917 }
    918 
    919 static void
    920 free_delspans(struct memdelspan *mdsp)
    921 {
    922 	struct memdelspan *amdsp;
    923 
    924 	while ((amdsp = mdsp) != NULL) {
    925 		mdsp = amdsp->mds_next;
    926 		kmem_free(amdsp, sizeof (struct memdelspan));
    927 	}
    928 }
    929 
    930 /*
    931  * Concatenate lists. No list ordering is required.
    932  */
    933 
    934 static void
    935 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
    936 {
    937 	while (*mdspp != NULL)
    938 		mdspp = &(*mdspp)->mds_next;
    939 
    940 	*mdspp = mdsp;
    941 }
    942 
    943 /*
    944  * Given a new list of delspans, check there is no overlap with
    945  * all existing span activity (add or delete) and then concatenate
    946  * the new spans to the given list.
    947  * Return 1 for OK, 0 if overlapping.
    948  */
    949 static int
    950 delspan_insert(
    951 	struct transit_list *my_tlp,
    952 	struct memdelspan *mdsp_new)
    953 {
    954 	struct transit_list_head *trh;
    955 	struct transit_list *tlp;
    956 	int ret;
    957 
    958 	trh = &transit_list_head;
    959 
    960 	ASSERT(my_tlp != NULL);
    961 	ASSERT(mdsp_new != NULL);
    962 
    963 	ret = 1;
    964 	mutex_enter(&trh->trh_lock);
    965 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
    966 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
    967 		struct memdelspan *mdsp;
    968 
    969 		for (mdsp = tlp->trl_spans; mdsp != NULL;
    970 		    mdsp = mdsp->mds_next) {
    971 			struct memdelspan *nmdsp;
    972 
    973 			for (nmdsp = mdsp_new; nmdsp != NULL;
    974 			    nmdsp = nmdsp->mds_next) {
    975 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
    976 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
    977 					ret = 0;
    978 					goto done;
    979 				}
    980 			}
    981 		}
    982 	}
    983 done:
    984 	if (ret != 0) {
    985 		if (my_tlp->trl_spans == NULL)
    986 			transit_list_insert(my_tlp);
    987 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
    988 	}
    989 	mutex_exit(&trh->trh_lock);
    990 	return (ret);
    991 }
    992 
    993 static void
    994 delspan_remove(
    995 	struct transit_list *my_tlp,
    996 	pfn_t base,
    997 	pgcnt_t npgs)
    998 {
    999 	struct transit_list_head *trh;
   1000 	struct memdelspan *mdsp;
   1001 
   1002 	trh = &transit_list_head;
   1003 
   1004 	ASSERT(my_tlp != NULL);
   1005 
   1006 	mutex_enter(&trh->trh_lock);
   1007 	if ((mdsp = my_tlp->trl_spans) != NULL) {
   1008 		if (npgs == 0) {
   1009 			my_tlp->trl_spans = NULL;
   1010 			free_delspans(mdsp);
   1011 			transit_list_remove(my_tlp);
   1012 		} else {
   1013 			struct memdelspan **prv;
   1014 
   1015 			prv = &my_tlp->trl_spans;
   1016 			while (mdsp != NULL) {
   1017 				pfn_t p_end;
   1018 
   1019 				p_end = mdsp->mds_base + mdsp->mds_npgs;
   1020 				if (mdsp->mds_base >= base &&
   1021 				    p_end <= (base + npgs)) {
   1022 					*prv = mdsp->mds_next;
   1023 					mdsp->mds_next = NULL;
   1024 					free_delspans(mdsp);
   1025 				} else {
   1026 					prv = &mdsp->mds_next;
   1027 				}
   1028 				mdsp = *prv;
   1029 			}
   1030 			if (my_tlp->trl_spans == NULL)
   1031 				transit_list_remove(my_tlp);
   1032 		}
   1033 	}
   1034 	mutex_exit(&trh->trh_lock);
   1035 }
   1036 
   1037 /*
   1038  * Reserve interface for add to stop delete before add finished.
   1039  * This list is only accessed through the delspan_insert/remove
   1040  * functions and so is fully protected by the mutex in struct transit_list.
   1041  */
   1042 
   1043 static struct transit_list reserve_transit;
   1044 
   1045 static int
   1046 delspan_reserve(pfn_t base, pgcnt_t npgs)
   1047 {
   1048 	struct memdelspan *mdsp;
   1049 	int ret;
   1050 
   1051 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
   1052 	mdsp->mds_base = base;
   1053 	mdsp->mds_npgs = npgs;
   1054 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
   1055 		free_delspans(mdsp);
   1056 	}
   1057 	return (ret);
   1058 }
   1059 
   1060 static void
   1061 delspan_unreserve(pfn_t base, pgcnt_t npgs)
   1062 {
   1063 	delspan_remove(&reserve_transit, base, npgs);
   1064 }
   1065 
   1066 /*
   1067  * Return whether memseg was created by kphysm_add_memory_dynamic().
   1068  */
   1069 static int
   1070 memseg_is_dynamic(struct memseg *seg)
   1071 {
   1072 	return (seg->msegflags & MEMSEG_DYNAMIC);
   1073 }
   1074 
   1075 int
   1076 kphysm_del_span(
   1077 	memhandle_t handle,
   1078 	pfn_t base,
   1079 	pgcnt_t npgs)
   1080 {
   1081 	struct mem_handle *mhp;
   1082 	struct memseg *seg;
   1083 	struct memdelspan *mdsp;
   1084 	struct memdelspan *mdsp_new;
   1085 	pgcnt_t phys_pages, vm_pages;
   1086 	pfn_t p_end;
   1087 	page_t *pp;
   1088 	int ret;
   1089 
   1090 	mhp = kphysm_lookup_mem_handle(handle);
   1091 	if (mhp == NULL) {
   1092 		return (KPHYSM_EHANDLE);
   1093 	}
   1094 	if (mhp->mh_state != MHND_INIT) {
   1095 		mutex_exit(&mhp->mh_mutex);
   1096 		return (KPHYSM_ESEQUENCE);
   1097 	}
   1098 
   1099 	/*
   1100 	 * Intersect the span with the installed memory list (phys_install).
   1101 	 */
   1102 	mdsp_new = span_to_install(base, npgs);
   1103 	if (mdsp_new == NULL) {
   1104 		/*
   1105 		 * No physical memory in this range. Is this an
   1106 		 * error? If an attempt to start the delete is made
   1107 		 * for OK returns from del_span such as this, start will
   1108 		 * return an error.
   1109 		 * Could return KPHYSM_ENOWORK.
   1110 		 */
   1111 		/*
   1112 		 * It is assumed that there are no error returns
   1113 		 * from span_to_install() due to kmem_alloc failure.
   1114 		 */
   1115 		mutex_exit(&mhp->mh_mutex);
   1116 		return (KPHYSM_OK);
   1117 	}
   1118 	/*
   1119 	 * Does this span overlap an existing span?
   1120 	 */
   1121 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
   1122 		/*
   1123 		 * Differentiate between already on list for this handle
   1124 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
   1125 		 */
   1126 		ret = KPHYSM_EBUSY;
   1127 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1128 		    mdsp = mdsp->mds_next) {
   1129 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
   1130 			    base, npgs)) {
   1131 				ret = KPHYSM_EDUP;
   1132 				break;
   1133 			}
   1134 		}
   1135 		mutex_exit(&mhp->mh_mutex);
   1136 		free_delspans(mdsp_new);
   1137 		return (ret);
   1138 	}
   1139 	/*
   1140 	 * At this point the spans in mdsp_new have been inserted into the
   1141 	 * list of spans for this handle and thereby to the global list of
   1142 	 * spans being processed. Each of these spans must now be checked
   1143 	 * for relocatability. As a side-effect segments in the memseg list
   1144 	 * may be split.
   1145 	 *
   1146 	 * Note that mdsp_new can no longer be used as it is now part of
   1147 	 * a larger list. Select elements of this larger list based
   1148 	 * on base and npgs.
   1149 	 */
   1150 restart:
   1151 	phys_pages = 0;
   1152 	vm_pages = 0;
   1153 	ret = KPHYSM_OK;
   1154 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1155 	    mdsp = mdsp->mds_next) {
   1156 		pgcnt_t pages_checked;
   1157 
   1158 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
   1159 			continue;
   1160 		}
   1161 		p_end = mdsp->mds_base + mdsp->mds_npgs;
   1162 		/*
   1163 		 * The pages_checked count is a hack. All pages should be
   1164 		 * checked for relocatability. Those not covered by memsegs
   1165 		 * should be tested with arch_kphysm_del_span_ok().
   1166 		 */
   1167 		pages_checked = 0;
   1168 		for (seg = memsegs; seg; seg = seg->next) {
   1169 			pfn_t mseg_start;
   1170 
   1171 			if (seg->pages_base >= p_end ||
   1172 			    seg->pages_end <= mdsp->mds_base) {
   1173 				/* Span and memseg don't overlap. */
   1174 				continue;
   1175 			}
   1176 			mseg_start = memseg_get_start(seg);
   1177 			/* Check that segment is suitable for delete. */
   1178 			if (memseg_includes_meta(seg)) {
   1179 				/*
   1180 				 * Check that this segment is completely
   1181 				 * within the span.
   1182 				 */
   1183 				if (mseg_start < mdsp->mds_base ||
   1184 				    seg->pages_end > p_end) {
   1185 					ret = KPHYSM_EBUSY;
   1186 					break;
   1187 				}
   1188 				pages_checked += seg->pages_end - mseg_start;
   1189 			} else {
   1190 				/*
   1191 				 * If this segment is larger than the span,
   1192 				 * try to split it. After the split, it
   1193 				 * is necessary to restart.
   1194 				 */
   1195 				if (seg->pages_base < mdsp->mds_base ||
   1196 				    seg->pages_end > p_end) {
   1197 					pfn_t abase;
   1198 					pgcnt_t anpgs;
   1199 					int s_ret;
   1200 
   1201 					/* Split required.  */
   1202 					if (mdsp->mds_base < seg->pages_base)
   1203 						abase = seg->pages_base;
   1204 					else
   1205 						abase = mdsp->mds_base;
   1206 					if (p_end > seg->pages_end)
   1207 						anpgs = seg->pages_end - abase;
   1208 					else
   1209 						anpgs = p_end - abase;
   1210 					s_ret = kphysm_split_memseg(abase,
   1211 					    anpgs);
   1212 					if (s_ret == 0) {
   1213 						/* Split failed. */
   1214 						ret = KPHYSM_ERESOURCE;
   1215 						break;
   1216 					}
   1217 					goto restart;
   1218 				}
   1219 				pages_checked +=
   1220 				    seg->pages_end - seg->pages_base;
   1221 			}
   1222 			/*
   1223 			 * The memseg is wholly within the delete span.
   1224 			 * The individual pages can now be checked.
   1225 			 */
   1226 			/* Cage test. */
   1227 			for (pp = seg->pages; pp < seg->epages; pp++) {
   1228 				if (PP_ISNORELOC(pp)) {
   1229 					ret = KPHYSM_ENONRELOC;
   1230 					break;
   1231 				}
   1232 			}
   1233 			if (ret != KPHYSM_OK) {
   1234 				break;
   1235 			}
   1236 			phys_pages += (seg->pages_end - mseg_start);
   1237 			vm_pages += MSEG_NPAGES(seg);
   1238 		}
   1239 		if (ret != KPHYSM_OK)
   1240 			break;
   1241 		if (pages_checked != mdsp->mds_npgs) {
   1242 			ret = KPHYSM_ENONRELOC;
   1243 			break;
   1244 		}
   1245 	}
   1246 
   1247 	if (ret == KPHYSM_OK) {
   1248 		mhp->mh_phys_pages += phys_pages;
   1249 		mhp->mh_vm_pages += vm_pages;
   1250 	} else {
   1251 		/*
   1252 		 * Keep holding the mh_mutex to prevent it going away.
   1253 		 */
   1254 		delspan_remove(&mhp->mh_transit, base, npgs);
   1255 	}
   1256 	mutex_exit(&mhp->mh_mutex);
   1257 	return (ret);
   1258 }
   1259 
   1260 int
   1261 kphysm_del_span_query(
   1262 	pfn_t base,
   1263 	pgcnt_t npgs,
   1264 	memquery_t *mqp)
   1265 {
   1266 	struct memdelspan *mdsp;
   1267 	struct memdelspan *mdsp_new;
   1268 	int done_first_nonreloc;
   1269 
   1270 	mqp->phys_pages = 0;
   1271 	mqp->managed = 0;
   1272 	mqp->nonrelocatable = 0;
   1273 	mqp->first_nonrelocatable = 0;
   1274 	mqp->last_nonrelocatable = 0;
   1275 
   1276 	mdsp_new = span_to_install(base, npgs);
   1277 	/*
   1278 	 * It is OK to proceed here if mdsp_new == NULL.
   1279 	 */
   1280 	done_first_nonreloc = 0;
   1281 	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
   1282 		pfn_t sbase;
   1283 		pgcnt_t snpgs;
   1284 
   1285 		mqp->phys_pages += mdsp->mds_npgs;
   1286 		sbase = mdsp->mds_base;
   1287 		snpgs = mdsp->mds_npgs;
   1288 		while (snpgs != 0) {
   1289 			struct memseg *lseg, *seg;
   1290 			pfn_t p_end;
   1291 			page_t *pp;
   1292 			pfn_t mseg_start;
   1293 
   1294 			p_end = sbase + snpgs;
   1295 			/*
   1296 			 * Find the lowest addressed memseg that starts
   1297 			 * after sbase and account for it.
   1298 			 * This is to catch dynamic memsegs whose start
   1299 			 * is hidden.
   1300 			 */
   1301 			seg = NULL;
   1302 			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
   1303 				if ((lseg->pages_base >= sbase) ||
   1304 				    (lseg->pages_base < p_end &&
   1305 				    lseg->pages_end > sbase)) {
   1306 					if (seg == NULL ||
   1307 					    seg->pages_base > lseg->pages_base)
   1308 						seg = lseg;
   1309 				}
   1310 			}
   1311 			if (seg != NULL) {
   1312 				mseg_start = memseg_get_start(seg);
   1313 				/*
   1314 				 * Now have the full extent of the memseg so
   1315 				 * do the range check.
   1316 				 */
   1317 				if (mseg_start >= p_end ||
   1318 				    seg->pages_end <= sbase) {
   1319 					/* Span does not overlap memseg. */
   1320 					seg = NULL;
   1321 				}
   1322 			}
   1323 			/*
   1324 			 * Account for gap either before the segment if
   1325 			 * there is one or to the end of the span.
   1326 			 */
   1327 			if (seg == NULL || mseg_start > sbase) {
   1328 				pfn_t a_end;
   1329 
   1330 				a_end = (seg == NULL) ? p_end : mseg_start;
   1331 				/*
   1332 				 * Check with arch layer for relocatability.
   1333 				 */
   1334 				if (arch_kphysm_del_span_ok(sbase,
   1335 				    (a_end - sbase))) {
   1336 					/*
   1337 					 * No non-relocatble pages in this
   1338 					 * area, avoid the fine-grained
   1339 					 * test.
   1340 					 */
   1341 					snpgs -= (a_end - sbase);
   1342 					sbase = a_end;
   1343 				}
   1344 				while (sbase < a_end) {
   1345 					if (!arch_kphysm_del_span_ok(sbase,
   1346 					    1)) {
   1347 						mqp->nonrelocatable++;
   1348 						if (!done_first_nonreloc) {
   1349 							mqp->
   1350 							    first_nonrelocatable
   1351 							    = sbase;
   1352 							done_first_nonreloc = 1;
   1353 						}
   1354 						mqp->last_nonrelocatable =
   1355 						    sbase;
   1356 					}
   1357 					sbase++;
   1358 					snpgs--;
   1359 				}
   1360 			}
   1361 			if (seg != NULL) {
   1362 				ASSERT(mseg_start <= sbase);
   1363 				if (seg->pages_base != mseg_start &&
   1364 				    seg->pages_base > sbase) {
   1365 					pgcnt_t skip_pgs;
   1366 
   1367 					/*
   1368 					 * Skip the page_t area of a
   1369 					 * dynamic memseg.
   1370 					 */
   1371 					skip_pgs = seg->pages_base - sbase;
   1372 					if (snpgs <= skip_pgs) {
   1373 						sbase += snpgs;
   1374 						snpgs = 0;
   1375 						continue;
   1376 					}
   1377 					snpgs -= skip_pgs;
   1378 					sbase += skip_pgs;
   1379 				}
   1380 				ASSERT(snpgs != 0);
   1381 				ASSERT(seg->pages_base <= sbase);
   1382 				/*
   1383 				 * The individual pages can now be checked.
   1384 				 */
   1385 				for (pp = seg->pages +
   1386 				    (sbase - seg->pages_base);
   1387 				    snpgs != 0 && pp < seg->epages; pp++) {
   1388 					mqp->managed++;
   1389 					if (PP_ISNORELOC(pp)) {
   1390 						mqp->nonrelocatable++;
   1391 						if (!done_first_nonreloc) {
   1392 							mqp->
   1393 							    first_nonrelocatable
   1394 							    = sbase;
   1395 							done_first_nonreloc = 1;
   1396 						}
   1397 						mqp->last_nonrelocatable =
   1398 						    sbase;
   1399 					}
   1400 					sbase++;
   1401 					snpgs--;
   1402 				}
   1403 			}
   1404 		}
   1405 	}
   1406 
   1407 	free_delspans(mdsp_new);
   1408 
   1409 	return (KPHYSM_OK);
   1410 }
   1411 
   1412 /*
   1413  * This release function can be called at any stage as follows:
   1414  *	_gethandle only called
   1415  *	_span(s) only called
   1416  *	_start called but failed
   1417  *	delete thread exited
   1418  */
   1419 int
   1420 kphysm_del_release(memhandle_t handle)
   1421 {
   1422 	struct mem_handle *mhp;
   1423 
   1424 	mhp = kphysm_lookup_mem_handle(handle);
   1425 	if (mhp == NULL) {
   1426 		return (KPHYSM_EHANDLE);
   1427 	}
   1428 	switch (mhp->mh_state) {
   1429 	case MHND_STARTING:
   1430 	case MHND_RUNNING:
   1431 		mutex_exit(&mhp->mh_mutex);
   1432 		return (KPHYSM_ENOTFINISHED);
   1433 	case MHND_FREE:
   1434 		ASSERT(mhp->mh_state != MHND_FREE);
   1435 		mutex_exit(&mhp->mh_mutex);
   1436 		return (KPHYSM_EHANDLE);
   1437 	case MHND_INIT:
   1438 		break;
   1439 	case MHND_DONE:
   1440 		break;
   1441 	case MHND_RELEASE:
   1442 		mutex_exit(&mhp->mh_mutex);
   1443 		return (KPHYSM_ESEQUENCE);
   1444 	default:
   1445 #ifdef DEBUG
   1446 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
   1447 		    (void *)mhp, mhp->mh_state);
   1448 #endif /* DEBUG */
   1449 		mutex_exit(&mhp->mh_mutex);
   1450 		return (KPHYSM_EHANDLE);
   1451 	}
   1452 	/*
   1453 	 * Set state so that we can wait if necessary.
   1454 	 * Also this means that we have read/write access to all
   1455 	 * fields except mh_exthandle and mh_state.
   1456 	 */
   1457 	mhp->mh_state = MHND_RELEASE;
   1458 	/*
   1459 	 * The mem_handle cannot be de-allocated by any other operation
   1460 	 * now, so no need to hold mh_mutex.
   1461 	 */
   1462 	mutex_exit(&mhp->mh_mutex);
   1463 
   1464 	delspan_remove(&mhp->mh_transit, 0, 0);
   1465 	mhp->mh_phys_pages = 0;
   1466 	mhp->mh_vm_pages = 0;
   1467 	mhp->mh_hold_todo = 0;
   1468 	mhp->mh_delete_complete = NULL;
   1469 	mhp->mh_delete_complete_arg = NULL;
   1470 	mhp->mh_cancel = 0;
   1471 
   1472 	mutex_enter(&mhp->mh_mutex);
   1473 	ASSERT(mhp->mh_state == MHND_RELEASE);
   1474 	mhp->mh_state = MHND_FREE;
   1475 
   1476 	kphysm_free_mem_handle(mhp);
   1477 
   1478 	return (KPHYSM_OK);
   1479 }
   1480 
   1481 /*
   1482  * This cancel function can only be called with the thread running.
   1483  */
   1484 int
   1485 kphysm_del_cancel(memhandle_t handle)
   1486 {
   1487 	struct mem_handle *mhp;
   1488 
   1489 	mhp = kphysm_lookup_mem_handle(handle);
   1490 	if (mhp == NULL) {
   1491 		return (KPHYSM_EHANDLE);
   1492 	}
   1493 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
   1494 		mutex_exit(&mhp->mh_mutex);
   1495 		return (KPHYSM_ENOTRUNNING);
   1496 	}
   1497 	/*
   1498 	 * Set the cancel flag and wake the delete thread up.
   1499 	 * The thread may be waiting on I/O, so the effect of the cancel
   1500 	 * may be delayed.
   1501 	 */
   1502 	if (mhp->mh_cancel == 0) {
   1503 		mhp->mh_cancel = KPHYSM_ECANCELLED;
   1504 		cv_signal(&mhp->mh_cv);
   1505 	}
   1506 	mutex_exit(&mhp->mh_mutex);
   1507 	return (KPHYSM_OK);
   1508 }
   1509 
   1510 int
   1511 kphysm_del_status(
   1512 	memhandle_t handle,
   1513 	memdelstat_t *mdstp)
   1514 {
   1515 	struct mem_handle *mhp;
   1516 
   1517 	mhp = kphysm_lookup_mem_handle(handle);
   1518 	if (mhp == NULL) {
   1519 		return (KPHYSM_EHANDLE);
   1520 	}
   1521 	/*
   1522 	 * Calling kphysm_del_status() is allowed before the delete
   1523 	 * is started to allow for status display.
   1524 	 */
   1525 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
   1526 	    mhp->mh_state != MHND_RUNNING) {
   1527 		mutex_exit(&mhp->mh_mutex);
   1528 		return (KPHYSM_ENOTRUNNING);
   1529 	}
   1530 	mdstp->phys_pages = mhp->mh_phys_pages;
   1531 	mdstp->managed = mhp->mh_vm_pages;
   1532 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
   1533 	mutex_exit(&mhp->mh_mutex);
   1534 	return (KPHYSM_OK);
   1535 }
   1536 
   1537 static int mem_delete_additional_pages = 100;
   1538 
   1539 static int
   1540 can_remove_pgs(pgcnt_t npgs)
   1541 {
   1542 	/*
   1543 	 * If all pageable pages were paged out, freemem would
   1544 	 * equal availrmem.  There is a minimum requirement for
   1545 	 * availrmem.
   1546 	 */
   1547 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
   1548 	    < npgs)
   1549 		return (0);
   1550 	/* TODO: check swap space, etc. */
   1551 	return (1);
   1552 }
   1553 
   1554 static int
   1555 get_availrmem(pgcnt_t npgs)
   1556 {
   1557 	int ret;
   1558 
   1559 	mutex_enter(&freemem_lock);
   1560 	ret = can_remove_pgs(npgs);
   1561 	if (ret != 0)
   1562 		availrmem -= npgs;
   1563 	mutex_exit(&freemem_lock);
   1564 	return (ret);
   1565 }
   1566 
   1567 static void
   1568 put_availrmem(pgcnt_t npgs)
   1569 {
   1570 	mutex_enter(&freemem_lock);
   1571 	availrmem += npgs;
   1572 	mutex_exit(&freemem_lock);
   1573 }
   1574 
   1575 #define	FREEMEM_INCR	100
   1576 static pgcnt_t freemem_incr = FREEMEM_INCR;
   1577 #define	DEL_FREE_WAIT_FRAC	4
   1578 #define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
   1579 
   1580 #define	DEL_BUSY_WAIT_FRAC	20
   1581 #define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
   1582 
   1583 static void kphysm_del_cleanup(struct mem_handle *);
   1584 
   1585 static void page_delete_collect(page_t *, struct mem_handle *);
   1586 
   1587 static pgcnt_t
   1588 delthr_get_freemem(struct mem_handle *mhp)
   1589 {
   1590 	pgcnt_t free_get;
   1591 	int ret;
   1592 
   1593 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
   1594 
   1595 	MDSTAT_INCR(mhp, need_free);
   1596 	/*
   1597 	 * Get up to freemem_incr pages.
   1598 	 */
   1599 	free_get = freemem_incr;
   1600 	if (free_get > mhp->mh_hold_todo)
   1601 		free_get = mhp->mh_hold_todo;
   1602 	/*
   1603 	 * Take free_get pages away from freemem,
   1604 	 * waiting if necessary.
   1605 	 */
   1606 
   1607 	while (!mhp->mh_cancel) {
   1608 		mutex_exit(&mhp->mh_mutex);
   1609 		MDSTAT_INCR(mhp, free_loop);
   1610 		/*
   1611 		 * Duplicate test from page_create_throttle()
   1612 		 * but don't override with !PG_WAIT.
   1613 		 */
   1614 		if (freemem < (free_get + throttlefree)) {
   1615 			MDSTAT_INCR(mhp, free_low);
   1616 			ret = 0;
   1617 		} else {
   1618 			ret = page_create_wait(free_get, 0);
   1619 			if (ret == 0) {
   1620 				/* EMPTY */
   1621 				MDSTAT_INCR(mhp, free_failed);
   1622 			}
   1623 		}
   1624 		if (ret != 0) {
   1625 			mutex_enter(&mhp->mh_mutex);
   1626 			return (free_get);
   1627 		}
   1628 
   1629 		/*
   1630 		 * Put pressure on pageout.
   1631 		 */
   1632 		page_needfree(free_get);
   1633 		cv_signal(&proc_pageout->p_cv);
   1634 
   1635 		mutex_enter(&mhp->mh_mutex);
   1636 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
   1637 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
   1638 		mutex_exit(&mhp->mh_mutex);
   1639 		page_needfree(-(spgcnt_t)free_get);
   1640 
   1641 		mutex_enter(&mhp->mh_mutex);
   1642 	}
   1643 	return (0);
   1644 }
   1645 
   1646 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
   1647 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
   1648 /*
   1649  * This function is run as a helper thread for delete_memory_thread.
   1650  * It is needed in order to force kaio cleanup, so that pages used in kaio
   1651  * will be unlocked and subsequently relocated by delete_memory_thread.
   1652  * The address of the delete_memory_threads's mem_handle is passed in to
   1653  * this thread function, and is used to set the mh_aio_cleanup_done member
   1654  * prior to calling thread_exit().
   1655  */
   1656 static void
   1657 dr_aio_cleanup_thread(caddr_t amhp)
   1658 {
   1659 	proc_t *procp;
   1660 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
   1661 	int cleaned;
   1662 	int n = 0;
   1663 	struct mem_handle *mhp;
   1664 	volatile uint_t *pcancel;
   1665 
   1666 	mhp = (struct mem_handle *)amhp;
   1667 	ASSERT(mhp != NULL);
   1668 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
   1669 	if (modload("sys", "kaio") == -1) {
   1670 		mhp->mh_aio_cleanup_done = 1;
   1671 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
   1672 		thread_exit();
   1673 	}
   1674 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
   1675 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
   1676 	if (aio_cleanup_dr_delete_memory == NULL) {
   1677 		mhp->mh_aio_cleanup_done = 1;
   1678 		cmn_err(CE_WARN,
   1679 	    "aio_cleanup_dr_delete_memory not found in kaio");
   1680 		thread_exit();
   1681 	}
   1682 	do {
   1683 		cleaned = 0;
   1684 		mutex_enter(&pidlock);
   1685 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
   1686 		    procp = procp->p_next) {
   1687 			mutex_enter(&procp->p_lock);
   1688 			if (procp->p_aio != NULL) {
   1689 				/* cleanup proc's outstanding kaio */
   1690 				cleaned +=
   1691 				    (*aio_cleanup_dr_delete_memory)(procp);
   1692 			}
   1693 			mutex_exit(&procp->p_lock);
   1694 		}
   1695 		mutex_exit(&pidlock);
   1696 		if ((*pcancel == 0) &&
   1697 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
   1698 			/* delay a bit before retrying all procs again */
   1699 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
   1700 			n = 0;
   1701 		}
   1702 	} while (*pcancel == 0);
   1703 	mhp->mh_aio_cleanup_done = 1;
   1704 	thread_exit();
   1705 }
   1706 
   1707 static void
   1708 delete_memory_thread(caddr_t amhp)
   1709 {
   1710 	struct mem_handle *mhp;
   1711 	struct memdelspan *mdsp;
   1712 	callb_cpr_t cprinfo;
   1713 	page_t *pp_targ;
   1714 	spgcnt_t freemem_left;
   1715 	void (*del_complete_funcp)(void *, int error);
   1716 	void *del_complete_arg;
   1717 	int comp_code;
   1718 	int ret;
   1719 	int first_scan;
   1720 	uint_t szc;
   1721 #ifdef MEM_DEL_STATS
   1722 	uint64_t start_total, ntick_total;
   1723 	uint64_t start_pgrp, ntick_pgrp;
   1724 #endif /* MEM_DEL_STATS */
   1725 
   1726 	mhp = (struct mem_handle *)amhp;
   1727 
   1728 #ifdef MEM_DEL_STATS
   1729 	start_total = ddi_get_lbolt();
   1730 #endif /* MEM_DEL_STATS */
   1731 
   1732 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
   1733 	    callb_generic_cpr, "memdel");
   1734 
   1735 	mutex_enter(&mhp->mh_mutex);
   1736 	ASSERT(mhp->mh_state == MHND_STARTING);
   1737 
   1738 	mhp->mh_state = MHND_RUNNING;
   1739 	mhp->mh_thread_id = curthread;
   1740 
   1741 	mhp->mh_hold_todo = mhp->mh_vm_pages;
   1742 	mutex_exit(&mhp->mh_mutex);
   1743 
   1744 	/* Allocate the remap pages now, if necessary. */
   1745 	memseg_remap_init();
   1746 
   1747 	/*
   1748 	 * Subtract from availrmem now if possible as availrmem
   1749 	 * may not be available by the end of the delete.
   1750 	 */
   1751 	if (!get_availrmem(mhp->mh_vm_pages)) {
   1752 		comp_code = KPHYSM_ENOTVIABLE;
   1753 		mutex_enter(&mhp->mh_mutex);
   1754 		goto early_exit;
   1755 	}
   1756 
   1757 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
   1758 
   1759 	mutex_enter(&mhp->mh_mutex);
   1760 
   1761 	if (ret != 0) {
   1762 		mhp->mh_cancel = KPHYSM_EREFUSED;
   1763 		goto refused;
   1764 	}
   1765 
   1766 	transit_list_collect(mhp, 1);
   1767 
   1768 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1769 	    mdsp = mdsp->mds_next) {
   1770 		ASSERT(mdsp->mds_bitmap == NULL);
   1771 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
   1772 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
   1773 		    KM_SLEEP);
   1774 	}
   1775 
   1776 	first_scan = 1;
   1777 	freemem_left = 0;
   1778 	/*
   1779 	 * Start dr_aio_cleanup_thread, which periodically iterates
   1780 	 * through the process list and invokes aio cleanup.  This
   1781 	 * is needed in order to avoid a deadly embrace between the
   1782 	 * delete_memory_thread (waiting on writer lock for page, with the
   1783 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
   1784 	 * reader lock on the same page that is wanted by the
   1785 	 * delete_memory_thread), and threads waiting for kaio completion
   1786 	 * (blocked on spt_amp->lock).
   1787 	 */
   1788 	mhp->mh_dr_aio_cleanup_cancel = 0;
   1789 	mhp->mh_aio_cleanup_done = 0;
   1790 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
   1791 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
   1792 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
   1793 		pgcnt_t collected;
   1794 
   1795 		MDSTAT_INCR(mhp, nloop);
   1796 		collected = 0;
   1797 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
   1798 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
   1799 			pfn_t pfn, p_end;
   1800 
   1801 			p_end = mdsp->mds_base + mdsp->mds_npgs;
   1802 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
   1803 			    (mhp->mh_cancel == 0); pfn++) {
   1804 				page_t *pp, *tpp, *tpp_targ;
   1805 				pgcnt_t bit;
   1806 				struct vnode *vp;
   1807 				u_offset_t offset;
   1808 				int mod, result;
   1809 				spgcnt_t pgcnt;
   1810 
   1811 				bit = pfn - mdsp->mds_base;
   1812 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
   1813 				    (1 << (bit % NBPBMW))) != 0) {
   1814 					MDSTAT_INCR(mhp, already_done);
   1815 					continue;
   1816 				}
   1817 				if (freemem_left == 0) {
   1818 					freemem_left += delthr_get_freemem(mhp);
   1819 					if (freemem_left == 0)
   1820 						break;
   1821 				}
   1822 
   1823 				/*
   1824 				 * Release mh_mutex - some of this
   1825 				 * stuff takes some time (eg PUTPAGE).
   1826 				 */
   1827 
   1828 				mutex_exit(&mhp->mh_mutex);
   1829 				MDSTAT_INCR(mhp, ncheck);
   1830 
   1831 				pp = page_numtopp_nolock(pfn);
   1832 				if (pp == NULL) {
   1833 					/*
   1834 					 * Not covered by a page_t - will
   1835 					 * be dealt with elsewhere.
   1836 					 */
   1837 					MDSTAT_INCR(mhp, nopaget);
   1838 					mutex_enter(&mhp->mh_mutex);
   1839 					mdsp->mds_bitmap[bit / NBPBMW] |=
   1840 					    (1 << (bit % NBPBMW));
   1841 					continue;
   1842 				}
   1843 
   1844 				if (!page_try_reclaim_lock(pp, SE_EXCL,
   1845 				    SE_EXCL_WANTED | SE_RETIRED)) {
   1846 					/*
   1847 					 * Page in use elsewhere.  Skip it.
   1848 					 */
   1849 					MDSTAT_INCR(mhp, lockfail);
   1850 					mutex_enter(&mhp->mh_mutex);
   1851 					continue;
   1852 				}
   1853 				/*
   1854 				 * See if the cage expanded into the delete.
   1855 				 * This can happen as we have to allow the
   1856 				 * cage to expand.
   1857 				 */
   1858 				if (PP_ISNORELOC(pp)) {
   1859 					page_unlock(pp);
   1860 					mutex_enter(&mhp->mh_mutex);
   1861 					mhp->mh_cancel = KPHYSM_ENONRELOC;
   1862 					break;
   1863 				}
   1864 				if (PP_RETIRED(pp)) {
   1865 					/*
   1866 					 * Page has been retired and is
   1867 					 * not part of the cage so we
   1868 					 * can now do the accounting for
   1869 					 * it.
   1870 					 */
   1871 					MDSTAT_INCR(mhp, retired);
   1872 					mutex_enter(&mhp->mh_mutex);
   1873 					mdsp->mds_bitmap[bit / NBPBMW]
   1874 					    |= (1 << (bit % NBPBMW));
   1875 					mdsp->mds_bitmap_retired[bit /
   1876 					    NBPBMW] |=
   1877 					    (1 << (bit % NBPBMW));
   1878 					mhp->mh_hold_todo--;
   1879 					continue;
   1880 				}
   1881 				ASSERT(freemem_left != 0);
   1882 				if (PP_ISFREE(pp)) {
   1883 					/*
   1884 					 * Like page_reclaim() only 'freemem'
   1885 					 * processing is already done.
   1886 					 */
   1887 					MDSTAT_INCR(mhp, nfree);
   1888 				free_page_collect:
   1889 					if (PP_ISAGED(pp)) {
   1890 						page_list_sub(pp,
   1891 						    PG_FREE_LIST);
   1892 					} else {
   1893 						page_list_sub(pp,
   1894 						    PG_CACHE_LIST);
   1895 					}
   1896 					PP_CLRFREE(pp);
   1897 					PP_CLRAGED(pp);
   1898 					collected++;
   1899 					mutex_enter(&mhp->mh_mutex);
   1900 					page_delete_collect(pp, mhp);
   1901 					mdsp->mds_bitmap[bit / NBPBMW] |=
   1902 					    (1 << (bit % NBPBMW));
   1903 					freemem_left--;
   1904 					continue;
   1905 				}
   1906 				ASSERT(pp->p_vnode != NULL);
   1907 				if (first_scan) {
   1908 					MDSTAT_INCR(mhp, first_notfree);
   1909 					page_unlock(pp);
   1910 					mutex_enter(&mhp->mh_mutex);
   1911 					continue;
   1912 				}
   1913 				/*
   1914 				 * Keep stats on pages encountered that
   1915 				 * are marked for retirement.
   1916 				 */
   1917 				if (PP_TOXIC(pp)) {
   1918 					MDSTAT_INCR(mhp, toxic);
   1919 				} else if (PP_PR_REQ(pp)) {
   1920 					MDSTAT_INCR(mhp, failing);
   1921 				}
   1922 				/*
   1923 				 * In certain cases below, special exceptions
   1924 				 * are made for pages that are toxic.  This
   1925 				 * is because the current meaning of toxic
   1926 				 * is that an uncorrectable error has been
   1927 				 * previously associated with the page.
   1928 				 */
   1929 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
   1930 					if (!PP_TOXIC(pp)) {
   1931 						/*
   1932 						 * Must relocate locked in
   1933 						 * memory pages.
   1934 						 */
   1935 #ifdef MEM_DEL_STATS
   1936 						start_pgrp = ddi_get_lbolt();
   1937 #endif /* MEM_DEL_STATS */
   1938 						/*
   1939 						 * Lock all constituent pages
   1940 						 * of a large page to ensure
   1941 						 * that p_szc won't change.
   1942 						 */
   1943 						if (!group_page_trylock(pp,
   1944 						    SE_EXCL)) {
   1945 							MDSTAT_INCR(mhp,
   1946 							    gptllckfail);
   1947 							page_unlock(pp);
   1948 							mutex_enter(
   1949 							    &mhp->mh_mutex);
   1950 							continue;
   1951 						}
   1952 						MDSTAT_INCR(mhp, npplocked);
   1953 						pp_targ =
   1954 						    page_get_replacement_page(
   1955 						    pp, NULL, 0);
   1956 						if (pp_targ != NULL) {
   1957 #ifdef MEM_DEL_STATS
   1958 							ntick_pgrp =
   1959 							    (uint64_t)
   1960 							    ddi_get_lbolt() -
   1961 							    start_pgrp;
   1962 #endif /* MEM_DEL_STATS */
   1963 							MDSTAT_PGRP(mhp,
   1964 							    ntick_pgrp);
   1965 							MDSTAT_INCR(mhp,
   1966 							    nlockreloc);
   1967 							goto reloc;
   1968 						}
   1969 						group_page_unlock(pp);
   1970 						page_unlock(pp);
   1971 #ifdef MEM_DEL_STATS
   1972 						ntick_pgrp =
   1973 						    (uint64_t)ddi_get_lbolt() -
   1974 						    start_pgrp;
   1975 #endif /* MEM_DEL_STATS */
   1976 						MDSTAT_PGRP(mhp, ntick_pgrp);
   1977 						MDSTAT_INCR(mhp, nnorepl);
   1978 						mutex_enter(&mhp->mh_mutex);
   1979 						continue;
   1980 					} else {
   1981 						/*
   1982 						 * Cannot do anything about
   1983 						 * this page because it is
   1984 						 * toxic.
   1985 						 */
   1986 						MDSTAT_INCR(mhp, npplkdtoxic);
   1987 						page_unlock(pp);
   1988 						mutex_enter(&mhp->mh_mutex);
   1989 						continue;
   1990 					}
   1991 				}
   1992 				/*
   1993 				 * Unload the mappings and check if mod bit
   1994 				 * is set.
   1995 				 */
   1996 				ASSERT(!PP_ISKAS(pp));
   1997 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
   1998 				mod = hat_ismod(pp);
   1999 
   2000 #ifdef MEM_DEL_STATS
   2001 				start_pgrp = ddi_get_lbolt();
   2002 #endif /* MEM_DEL_STATS */
   2003 				if (mod && !PP_TOXIC(pp)) {
   2004 					/*
   2005 					 * Lock all constituent pages
   2006 					 * of a large page to ensure
   2007 					 * that p_szc won't change.
   2008 					 */
   2009 					if (!group_page_trylock(pp, SE_EXCL)) {
   2010 						MDSTAT_INCR(mhp, gptlmodfail);
   2011 						page_unlock(pp);
   2012 						mutex_enter(&mhp->mh_mutex);
   2013 						continue;
   2014 					}
   2015 					pp_targ = page_get_replacement_page(pp,
   2016 					    NULL, 0);
   2017 					if (pp_targ != NULL) {
   2018 						MDSTAT_INCR(mhp, nmodreloc);
   2019 #ifdef MEM_DEL_STATS
   2020 						ntick_pgrp =
   2021 						    (uint64_t)ddi_get_lbolt() -
   2022 						    start_pgrp;
   2023 #endif /* MEM_DEL_STATS */
   2024 						MDSTAT_PGRP(mhp, ntick_pgrp);
   2025 						goto reloc;
   2026 					}
   2027 					group_page_unlock(pp);
   2028 				}
   2029 
   2030 				if (!page_try_demote_pages(pp)) {
   2031 					MDSTAT_INCR(mhp, demotefail);
   2032 					page_unlock(pp);
   2033 #ifdef MEM_DEL_STATS
   2034 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2035 					    start_pgrp;
   2036 #endif /* MEM_DEL_STATS */
   2037 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2038 					mutex_enter(&mhp->mh_mutex);
   2039 					continue;
   2040 				}
   2041 
   2042 				/*
   2043 				 * Regular 'page-out'.
   2044 				 */
   2045 				if (!mod) {
   2046 					MDSTAT_INCR(mhp, ndestroy);
   2047 					page_destroy(pp, 1);
   2048 					/*
   2049 					 * page_destroy was called with
   2050 					 * dontfree. As long as p_lckcnt
   2051 					 * and p_cowcnt are both zero, the
   2052 					 * only additional action of
   2053 					 * page_destroy with !dontfree is to
   2054 					 * call page_free, so we can collect
   2055 					 * the page here.
   2056 					 */
   2057 					collected++;
   2058 #ifdef MEM_DEL_STATS
   2059 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2060 					    start_pgrp;
   2061 #endif /* MEM_DEL_STATS */
   2062 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2063 					mutex_enter(&mhp->mh_mutex);
   2064 					page_delete_collect(pp, mhp);
   2065 					mdsp->mds_bitmap[bit / NBPBMW] |=
   2066 					    (1 << (bit % NBPBMW));
   2067 					continue;
   2068 				}
   2069 				/*
   2070 				 * The page is toxic and the mod bit is
   2071 				 * set, we cannot do anything here to deal
   2072 				 * with it.
   2073 				 */
   2074 				if (PP_TOXIC(pp)) {
   2075 					page_unlock(pp);
   2076 #ifdef MEM_DEL_STATS
   2077 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2078 					    start_pgrp;
   2079 #endif /* MEM_DEL_STATS */
   2080 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2081 					MDSTAT_INCR(mhp, modtoxic);
   2082 					mutex_enter(&mhp->mh_mutex);
   2083 					continue;
   2084 				}
   2085 				MDSTAT_INCR(mhp, nputpage);
   2086 				vp = pp->p_vnode;
   2087 				offset = pp->p_offset;
   2088 				VN_HOLD(vp);
   2089 				page_unlock(pp);
   2090 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
   2091 				    B_INVAL|B_FORCE, kcred, NULL);
   2092 				VN_RELE(vp);
   2093 #ifdef MEM_DEL_STATS
   2094 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2095 				    start_pgrp;
   2096 #endif /* MEM_DEL_STATS */
   2097 				MDSTAT_PGRP(mhp, ntick_pgrp);
   2098 				/*
   2099 				 * Try to get the page back immediately
   2100 				 * so that it can be collected.
   2101 				 */
   2102 				pp = page_numtopp_nolock(pfn);
   2103 				if (pp == NULL) {
   2104 					MDSTAT_INCR(mhp, nnoreclaim);
   2105 					/*
   2106 					 * This should not happen as this
   2107 					 * thread is deleting the page.
   2108 					 * If this code is generalized, this
   2109 					 * becomes a reality.
   2110 					 */
   2111 #ifdef DEBUG
   2112 					cmn_err(CE_WARN,
   2113 					    "delete_memory_thread(0x%p) "
   2114 					    "pfn 0x%lx has no page_t",
   2115 					    (void *)mhp, pfn);
   2116 #endif /* DEBUG */
   2117 					mutex_enter(&mhp->mh_mutex);
   2118 					continue;
   2119 				}
   2120 				if (page_try_reclaim_lock(pp, SE_EXCL,
   2121 				    SE_EXCL_WANTED | SE_RETIRED)) {
   2122 					if (PP_ISFREE(pp)) {
   2123 						goto free_page_collect;
   2124 					}
   2125 					page_unlock(pp);
   2126 				}
   2127 				MDSTAT_INCR(mhp, nnoreclaim);
   2128 				mutex_enter(&mhp->mh_mutex);
   2129 				continue;
   2130 
   2131 			reloc:
   2132 				/*
   2133 				 * Got some freemem and a target
   2134 				 * page, so move the data to avoid
   2135 				 * I/O and lock problems.
   2136 				 */
   2137 				ASSERT(!page_iolock_assert(pp));
   2138 				MDSTAT_INCR(mhp, nreloc);
   2139 				/*
   2140 				 * page_relocate() will return pgcnt: the
   2141 				 * number of consecutive pages relocated.
   2142 				 * If it is successful, pp will be a
   2143 				 * linked list of the page structs that
   2144 				 * were relocated. If page_relocate() is
   2145 				 * unsuccessful, pp will be unmodified.
   2146 				 */
   2147 #ifdef MEM_DEL_STATS
   2148 				start_pgrp = ddi_get_lbolt();
   2149 #endif /* MEM_DEL_STATS */
   2150 				result = page_relocate(&pp, &pp_targ, 0, 0,
   2151 				    &pgcnt, NULL);
   2152 #ifdef MEM_DEL_STATS
   2153 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2154 				    start_pgrp;
   2155 #endif /* MEM_DEL_STATS */
   2156 				MDSTAT_PGRP(mhp, ntick_pgrp);
   2157 				if (result != 0) {
   2158 					MDSTAT_INCR(mhp, nrelocfail);
   2159 					/*
   2160 					 * We did not succeed. We need
   2161 					 * to give the pp_targ pages back.
   2162 					 * page_free(pp_targ, 1) without
   2163 					 * the freemem accounting.
   2164 					 */
   2165 					group_page_unlock(pp);
   2166 					page_free_replacement_page(pp_targ);
   2167 					page_unlock(pp);
   2168 					mutex_enter(&mhp->mh_mutex);
   2169 					continue;
   2170 				}
   2171 
   2172 				/*
   2173 				 * We will then collect pgcnt pages.
   2174 				 */
   2175 				ASSERT(pgcnt > 0);
   2176 				mutex_enter(&mhp->mh_mutex);
   2177 				/*
   2178 				 * We need to make sure freemem_left is
   2179 				 * large enough.
   2180 				 */
   2181 				while ((freemem_left < pgcnt) &&
   2182 				    (!mhp->mh_cancel)) {
   2183 					freemem_left +=
   2184 					    delthr_get_freemem(mhp);
   2185 				}
   2186 
   2187 				/*
   2188 				 * Do not proceed if mh_cancel is set.
   2189 				 */
   2190 				if (mhp->mh_cancel) {
   2191 					while (pp_targ != NULL) {
   2192 						/*
   2193 						 * Unlink and unlock each page.
   2194 						 */
   2195 						tpp_targ = pp_targ;
   2196 						page_sub(&pp_targ, tpp_targ);
   2197 						page_unlock(tpp_targ);
   2198 					}
   2199 					/*
   2200 					 * We need to give the pp pages back.
   2201 					 * page_free(pp, 1) without the
   2202 					 * freemem accounting.
   2203 					 */
   2204 					page_free_replacement_page(pp);
   2205 					break;
   2206 				}
   2207 
   2208 				/* Now remove pgcnt from freemem_left */
   2209 				freemem_left -= pgcnt;
   2210 				ASSERT(freemem_left >= 0);
   2211 				szc = pp->p_szc;
   2212 				while (pp != NULL) {
   2213 					/*
   2214 					 * pp and pp_targ were passed back as
   2215 					 * a linked list of pages.
   2216 					 * Unlink and unlock each page.
   2217 					 */
   2218 					tpp_targ = pp_targ;
   2219 					page_sub(&pp_targ, tpp_targ);
   2220 					page_unlock(tpp_targ);
   2221 					/*
   2222 					 * The original page is now free
   2223 					 * so remove it from the linked
   2224 					 * list and collect it.
   2225 					 */
   2226 					tpp = pp;
   2227 					page_sub(&pp, tpp);
   2228 					pfn = page_pptonum(tpp);
   2229 					collected++;
   2230 					ASSERT(PAGE_EXCL(tpp));
   2231 					ASSERT(tpp->p_vnode == NULL);
   2232 					ASSERT(!hat_page_is_mapped(tpp));
   2233 					ASSERT(tpp->p_szc == szc);
   2234 					tpp->p_szc = 0;
   2235 					page_delete_collect(tpp, mhp);
   2236 					bit = pfn - mdsp->mds_base;
   2237 					mdsp->mds_bitmap[bit / NBPBMW] |=
   2238 					    (1 << (bit % NBPBMW));
   2239 				}
   2240 				ASSERT(pp_targ == NULL);
   2241 			}
   2242 		}
   2243 		first_scan = 0;
   2244 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
   2245 		    (collected == 0)) {
   2246 			/*
   2247 			 * This code is needed as we cannot wait
   2248 			 * for a page to be locked OR the delete to
   2249 			 * be cancelled.  Also, we must delay so
   2250 			 * that other threads get a chance to run
   2251 			 * on our cpu, otherwise page locks may be
   2252 			 * held indefinitely by those threads.
   2253 			 */
   2254 			MDSTAT_INCR(mhp, ndelay);
   2255 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2256 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
   2257 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
   2258 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
   2259 		}
   2260 	}
   2261 	/* stop the dr aio cleanup thread */
   2262 	mhp->mh_dr_aio_cleanup_cancel = 1;
   2263 	transit_list_collect(mhp, 0);
   2264 	if (freemem_left != 0) {
   2265 		/* Return any surplus. */
   2266 		page_create_putback(freemem_left);
   2267 		freemem_left = 0;
   2268 	}
   2269 #ifdef MEM_DEL_STATS
   2270 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
   2271 #endif /* MEM_DEL_STATS */
   2272 	MDSTAT_TOTAL(mhp, ntick_total);
   2273 	MDSTAT_PRINT(mhp);
   2274 
   2275 	/*
   2276 	 * If the memory delete was cancelled, exclusive-wanted bits must
   2277 	 * be cleared. If there are retired pages being deleted, they need
   2278 	 * to be unretired.
   2279 	 */
   2280 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2281 	    mdsp = mdsp->mds_next) {
   2282 		pfn_t pfn, p_end;
   2283 
   2284 		p_end = mdsp->mds_base + mdsp->mds_npgs;
   2285 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
   2286 			page_t *pp;
   2287 			pgcnt_t bit;
   2288 
   2289 			bit = pfn - mdsp->mds_base;
   2290 			if (mhp->mh_cancel) {
   2291 				pp = page_numtopp_nolock(pfn);
   2292 				if (pp != NULL) {
   2293 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
   2294 					    (1 << (bit % NBPBMW))) == 0) {
   2295 						page_lock_clr_exclwanted(pp);
   2296 					}
   2297 				}
   2298 			} else {
   2299 				pp = NULL;
   2300 			}
   2301 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
   2302 			    (1 << (bit % NBPBMW))) != 0) {
   2303 				/* do we already have pp? */
   2304 				if (pp == NULL) {
   2305 					pp = page_numtopp_nolock(pfn);
   2306 				}
   2307 				ASSERT(pp != NULL);
   2308 				ASSERT(PP_RETIRED(pp));
   2309 				if (mhp->mh_cancel != 0) {
   2310 					page_unlock(pp);
   2311 					/*
   2312 					 * To satisfy ASSERT below in
   2313 					 * cancel code.
   2314 					 */
   2315 					mhp->mh_hold_todo++;
   2316 				} else {
   2317 					(void) page_unretire_pp(pp,
   2318 					    PR_UNR_CLEAN);
   2319 				}
   2320 			}
   2321 		}
   2322 	}
   2323 	/*
   2324 	 * Free retired page bitmap and collected page bitmap
   2325 	 */
   2326 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2327 	    mdsp = mdsp->mds_next) {
   2328 		ASSERT(mdsp->mds_bitmap_retired != NULL);
   2329 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
   2330 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
   2331 		ASSERT(mdsp->mds_bitmap != NULL);
   2332 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
   2333 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
   2334 	}
   2335 
   2336 	/* wait for our dr aio cancel thread to exit */
   2337 	while (!(mhp->mh_aio_cleanup_done)) {
   2338 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2339 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
   2340 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
   2341 	}
   2342 refused:
   2343 	if (mhp->mh_cancel != 0) {
   2344 		page_t *pp;
   2345 
   2346 		comp_code = mhp->mh_cancel;
   2347 		/*
   2348 		 * Go through list of deleted pages (mh_deleted) freeing
   2349 		 * them.
   2350 		 */
   2351 		while ((pp = mhp->mh_deleted) != NULL) {
   2352 			mhp->mh_deleted = pp->p_next;
   2353 			mhp->mh_hold_todo++;
   2354 			mutex_exit(&mhp->mh_mutex);
   2355 			/* Restore p_next. */
   2356 			pp->p_next = pp->p_prev;
   2357 			if (PP_ISFREE(pp)) {
   2358 				cmn_err(CE_PANIC,
   2359 				    "page %p is free",
   2360 				    (void *)pp);
   2361 			}
   2362 			page_free(pp, 1);
   2363 			mutex_enter(&mhp->mh_mutex);
   2364 		}
   2365 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
   2366 
   2367 		mutex_exit(&mhp->mh_mutex);
   2368 		put_availrmem(mhp->mh_vm_pages);
   2369 		mutex_enter(&mhp->mh_mutex);
   2370 
   2371 		goto t_exit;
   2372 	}
   2373 
   2374 	/*
   2375 	 * All the pages are no longer in use and are exclusively locked.
   2376 	 */
   2377 
   2378 	mhp->mh_deleted = NULL;
   2379 
   2380 	kphysm_del_cleanup(mhp);
   2381 
   2382 	/*
   2383 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
   2384 	 * that the mem_node_config[] will remain intact for the cleanup.
   2385 	 */
   2386 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2387 	    mdsp = mdsp->mds_next) {
   2388 		mem_node_del_range(mdsp->mds_base,
   2389 		    mdsp->mds_base + mdsp->mds_npgs - 1);
   2390 	}
   2391 	/* cleanup the page counters */
   2392 	page_ctrs_cleanup();
   2393 
   2394 	comp_code = KPHYSM_OK;
   2395 
   2396 t_exit:
   2397 	mutex_exit(&mhp->mh_mutex);
   2398 	kphysm_setup_post_del(mhp->mh_vm_pages,
   2399 	    (comp_code == KPHYSM_OK) ? 0 : 1);
   2400 	mutex_enter(&mhp->mh_mutex);
   2401 
   2402 early_exit:
   2403 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
   2404 	mhp->mh_state = MHND_DONE;
   2405 	del_complete_funcp = mhp->mh_delete_complete;
   2406 	del_complete_arg = mhp->mh_delete_complete_arg;
   2407 	CALLB_CPR_EXIT(&cprinfo);
   2408 	(*del_complete_funcp)(del_complete_arg, comp_code);
   2409 	thread_exit();
   2410 	/*NOTREACHED*/
   2411 }
   2412 
   2413 /*
   2414  * Start the delete of the memory from the system.
   2415  */
   2416 int
   2417 kphysm_del_start(
   2418 	memhandle_t handle,
   2419 	void (*complete)(void *, int),
   2420 	void *complete_arg)
   2421 {
   2422 	struct mem_handle *mhp;
   2423 
   2424 	mhp = kphysm_lookup_mem_handle(handle);
   2425 	if (mhp == NULL) {
   2426 		return (KPHYSM_EHANDLE);
   2427 	}
   2428 	switch (mhp->mh_state) {
   2429 	case MHND_FREE:
   2430 		ASSERT(mhp->mh_state != MHND_FREE);
   2431 		mutex_exit(&mhp->mh_mutex);
   2432 		return (KPHYSM_EHANDLE);
   2433 	case MHND_INIT:
   2434 		break;
   2435 	case MHND_STARTING:
   2436 	case MHND_RUNNING:
   2437 		mutex_exit(&mhp->mh_mutex);
   2438 		return (KPHYSM_ESEQUENCE);
   2439 	case MHND_DONE:
   2440 		mutex_exit(&mhp->mh_mutex);
   2441 		return (KPHYSM_ESEQUENCE);
   2442 	case MHND_RELEASE:
   2443 		mutex_exit(&mhp->mh_mutex);
   2444 		return (KPHYSM_ESEQUENCE);
   2445 	default:
   2446 #ifdef DEBUG
   2447 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
   2448 		    (void *)mhp, mhp->mh_state);
   2449 #endif /* DEBUG */
   2450 		mutex_exit(&mhp->mh_mutex);
   2451 		return (KPHYSM_EHANDLE);
   2452 	}
   2453 
   2454 	if (mhp->mh_transit.trl_spans == NULL) {
   2455 		mutex_exit(&mhp->mh_mutex);
   2456 		return (KPHYSM_ENOWORK);
   2457 	}
   2458 
   2459 	ASSERT(complete != NULL);
   2460 	mhp->mh_delete_complete = complete;
   2461 	mhp->mh_delete_complete_arg = complete_arg;
   2462 	mhp->mh_state = MHND_STARTING;
   2463 	/*
   2464 	 * Release the mutex in case thread_create sleeps.
   2465 	 */
   2466 	mutex_exit(&mhp->mh_mutex);
   2467 
   2468 	/*
   2469 	 * The "obvious" process for this thread is pageout (proc_pageout)
   2470 	 * but this gives the thread too much power over freemem
   2471 	 * which results in freemem starvation.
   2472 	 */
   2473 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
   2474 	    TS_RUN, maxclsyspri - 1);
   2475 
   2476 	return (KPHYSM_OK);
   2477 }
   2478 
/*
 * State for the shared "dummy" page_t pages.  These are set up once by
 * memseg_remap_init() and used by remap_to_dummy() as the backing frames
 * for the page_t ranges of deleted memsegs.
 */
static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
static caddr_t pp_dummy;	/* VA of dummy pages; non-NULL once set up. */
static pgcnt_t pp_dummy_npages;	/* Number of dummy pages. */
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */
   2483 
   2484 static void
   2485 memseg_remap_init_pages(page_t *pages, page_t *epages)
   2486 {
   2487 	page_t *pp;
   2488 
   2489 	for (pp = pages; pp < epages; pp++) {
   2490 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
   2491 		pp->p_offset = (u_offset_t)-1;
   2492 		page_iolock_init(pp);
   2493 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
   2494 			continue;
   2495 		page_lock_delete(pp);
   2496 	}
   2497 }
   2498 
   2499 void
   2500 memseg_remap_init()
   2501 {
   2502 	mutex_enter(&pp_dummy_lock);
   2503 	if (pp_dummy == NULL) {
   2504 		uint_t dpages;
   2505 		int i;
   2506 
   2507 		/*
   2508 		 * dpages starts off as the size of the structure and
   2509 		 * ends up as the minimum number of pages that will
   2510 		 * hold a whole number of page_t structures.
   2511 		 */
   2512 		dpages = sizeof (page_t);
   2513 		ASSERT(dpages != 0);
   2514 		ASSERT(dpages <= MMU_PAGESIZE);
   2515 
   2516 		while ((dpages & 1) == 0)
   2517 			dpages >>= 1;
   2518 
   2519 		pp_dummy_npages = dpages;
   2520 		/*
   2521 		 * Allocate pp_dummy pages directly from static_arena,
   2522 		 * since these are whole page allocations and are
   2523 		 * referenced by physical address.  This also has the
   2524 		 * nice fringe benefit of hiding the memory from
   2525 		 * ::findleaks since it doesn't deal well with allocated
   2526 		 * kernel heap memory that doesn't have any mappings.
   2527 		 */
   2528 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
   2529 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
   2530 		bzero(pp_dummy, ptob(pp_dummy_npages));
   2531 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
   2532 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
   2533 		    pp_dummy_npages, KM_SLEEP);
   2534 		for (i = 0; i < pp_dummy_npages; i++) {
   2535 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
   2536 			    &pp_dummy[MMU_PAGESIZE * i]);
   2537 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
   2538 		}
   2539 		/*
   2540 		 * Initialize the page_t's to a known 'deleted' state
   2541 		 * that matches the state of deleted pages.
   2542 		 */
   2543 		memseg_remap_init_pages((page_t *)pp_dummy,
   2544 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
   2545 		/* Remove kmem mappings for the pages for safety. */
   2546 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
   2547 		    HAT_UNLOAD_UNLOCK);
   2548 		/* Leave pp_dummy pointer set as flag that init is done. */
   2549 	}
   2550 	mutex_exit(&pp_dummy_lock);
   2551 }
   2552 
   2553 /*
   2554  * Remap a page-aglined range of page_t's to dummy pages.
   2555  */
   2556 void
   2557 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
   2558 {
   2559 	int phase;
   2560 
   2561 	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));
   2562 
   2563 	/*
   2564 	 * We may start remapping at a non-zero page offset
   2565 	 * within the dummy pages since the low/high ends
   2566 	 * of the outgoing pp's could be shared by other
   2567 	 * memsegs (see memseg_remap_meta).
   2568 	 */
   2569 	phase = btop((uint64_t)va) % pp_dummy_npages;
   2570 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
   2571 
   2572 	while (metapgs != 0) {
   2573 		pgcnt_t n;
   2574 		int i, j;
   2575 
   2576 		n = pp_dummy_npages;
   2577 		if (n > metapgs)
   2578 			n = metapgs;
   2579 		for (i = 0; i < n; i++) {
   2580 			j = (i + phase) % pp_dummy_npages;
   2581 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
   2582 			    PROT_READ,
   2583 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
   2584 			    HAT_LOAD_REMAP);
   2585 			va += ptob(1);
   2586 		}
   2587 		metapgs -= n;
   2588 	}
   2589 }
   2590 
   2591 static void
   2592 memseg_remap_to_dummy(struct memseg *seg)
   2593 {
   2594 	caddr_t pp;
   2595 	pgcnt_t metapgs;
   2596 
   2597 	ASSERT(memseg_is_dynamic(seg));
   2598 	ASSERT(pp_dummy != NULL);
   2599 
   2600 
   2601 	if (!memseg_includes_meta(seg)) {
   2602 		memseg_remap_meta(seg);
   2603 		return;
   2604 	}
   2605 
   2606 	pp = (caddr_t)seg->pages;
   2607 	metapgs = seg->pages_base - memseg_get_start(seg);
   2608 	ASSERT(metapgs != 0);
   2609 
   2610 	seg->pages_end = seg->pages_base;
   2611 
   2612 	remap_to_dummy(pp, metapgs);
   2613 }
   2614 
   2615 /*
   2616  * Transition all the deleted pages to the deleted state so that
   2617  * page_lock will not wait. The page_lock_delete call will
   2618  * also wake up any waiters.
   2619  */
   2620 static void
   2621 memseg_lock_delete_all(struct memseg *seg)
   2622 {
   2623 	page_t *pp;
   2624 
   2625 	for (pp = seg->pages; pp < seg->epages; pp++) {
   2626 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
   2627 		page_lock_delete(pp);
   2628 	}
   2629 }
   2630 
/*
 * Final bookkeeping for a memory delete, once all pages of the handle's
 * spans have been collected.
 *
 * In order, this:
 *   - unlinks every memseg covered by the handle's spans from the global
 *     memsegs list (under memsegs_lock) and rebuilds the pfn hash;
 *   - reduces total_pages and recomputes the paging parameters;
 *   - puts the page_t's of each removed memseg into the deleted state,
 *     remaps or parks the memseg, and removes its range from the
 *     phys_avail and phys_install memlists;
 *   - adjusts maxmem, physmem and availrmem_initial, resizes the dump
 *     and logs the new memory totals.
 *
 * mhp->mh_mutex is dropped after the memsegs list has been updated and
 * is re-acquired just before returning, so the caller must hold it on
 * entry and will hold it again on return.
 */
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan	*mdsp;
	struct memseg		*seg;
	struct memseg   	**segpp;
	struct memseg		*seglist;
	pfn_t			p_end;
	uint64_t		avmem;
	pgcnt_t			avpgs;
	pgcnt_t			npgs;

	/* Sampled up front; used for the global counter updates below. */
	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * remove from main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * segpp always points at the link that refers to seg, so
		 * a memseg can be unlinked in place; it is only advanced
		 * when the current memseg is kept.
		 */
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			/* An overlapping memseg must lie wholly in the span. */
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			/*
			 * NOTE(review): base - end is the negated page
			 * count of the memseg; presumably the macro takes
			 * a signed delta -- confirm.
			 */
			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			/* Chain removed memsegs on a private list via lnext. */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	/* The handle mutex is not held for the rest of the cleanup. */
	mutex_exit(&mhp->mh_mutex);

	/* Process each memseg that was unlinked above. */
	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect up information based on pages_base and pages_end
		 * early so that we can flag early that the memseg has been
		 * deleted by setting pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		mseg_start = memseg_get_start(seg);

		if (memseg_is_dynamic(seg)) {
			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy(seg);

			/* Park the memseg on the memseg_va_avail list. */
			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */
			seg->pages_end = seg->pages_base;

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		/* phys_avail loses [pages_base, pages_base + npages). */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		/* phys_install loses [memseg start, pages_end). */
		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	/* Recompute physmax and physinstalled from the new phys_install. */
	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	/* Shifting by (PAGESHIFT - 10) converts pages to kilobytes. */
	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	/* Re-take the handle mutex that was dropped above. */
	mutex_enter(&mhp->mh_mutex);
}
   2800 
/*
 * Number of times page_delete_collect() found threads waiting on the
 * p_cv of a page that has no vnode.
 */
static uint_t mdel_nullvp_waiter;
   2802 
   2803 static void
   2804 page_delete_collect(
   2805 	page_t *pp,
   2806 	struct mem_handle *mhp)
   2807 {
   2808 	if (pp->p_vnode) {
   2809 		page_hashout(pp, (kmutex_t *)NULL);
   2810 		/* do not do PP_SETAGED(pp); */
   2811 	} else {
   2812 		kmutex_t *sep;
   2813 
   2814 		sep = page_se_mutex(pp);
   2815 		mutex_enter(sep);
   2816 		if (CV_HAS_WAITERS(&pp->p_cv)) {
   2817 			mdel_nullvp_waiter++;
   2818 			cv_broadcast(&pp->p_cv);
   2819 		}
   2820 		mutex_exit(sep);
   2821 	}
   2822 	ASSERT(pp->p_next == pp->p_prev);
   2823 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
   2824 	pp->p_next = mhp->mh_deleted;
   2825 	mhp->mh_deleted = pp;
   2826 	ASSERT(mhp->mh_hold_todo != 0);
   2827 	mhp->mh_hold_todo--;
   2828 }
   2829 
   2830 static void
   2831 transit_list_collect(struct mem_handle *mhp, int v)
   2832 {
   2833 	struct transit_list_head *trh;
   2834 
   2835 	trh = &transit_list_head;
   2836 	mutex_enter(&trh->trh_lock);
   2837 	mhp->mh_transit.trl_collect = v;
   2838 	mutex_exit(&trh->trh_lock);
   2839 }
   2840 
   2841 static void
   2842 transit_list_insert(struct transit_list *tlp)
   2843 {
   2844 	struct transit_list_head *trh;
   2845 
   2846 	trh = &transit_list_head;
   2847 	ASSERT(MUTEX_HELD(&trh->trh_lock));
   2848 	tlp->trl_next = trh->trh_head;
   2849 	trh->trh_head = tlp;
   2850 }
   2851 
   2852 static void
   2853 transit_list_remove(struct transit_list *tlp)
   2854 {
   2855 	struct transit_list_head *trh;
   2856 	struct transit_list **tlpp;
   2857 
   2858 	trh = &transit_list_head;
   2859 	tlpp = &trh->trh_head;
   2860 	ASSERT(MUTEX_HELD(&trh->trh_lock));
   2861 	while (*tlpp != NULL && *tlpp != tlp)
   2862 		tlpp = &(*tlpp)->trl_next;
   2863 	ASSERT(*tlpp != NULL);
   2864 	if (*tlpp == tlp)
   2865 		*tlpp = tlp->trl_next;
   2866 	tlp->trl_next = NULL;
   2867 }
   2868 
   2869 static struct transit_list *
   2870 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
   2871 {
   2872 	struct transit_list *tlp;
   2873 
   2874 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
   2875 		struct memdelspan *mdsp;
   2876 
   2877 		for (mdsp = tlp->trl_spans; mdsp != NULL;
   2878 		    mdsp = mdsp->mds_next) {
   2879 			if (pfnum >= mdsp->mds_base &&
   2880 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
   2881 				return (tlp);
   2882 			}
   2883 		}
   2884 	}
   2885 	return (NULL);
   2886 }
   2887 
   2888 int
   2889 pfn_is_being_deleted(pfn_t pfnum)
   2890 {
   2891 	struct transit_list_head *trh;
   2892 	struct transit_list *tlp;
   2893 	int ret;
   2894 
   2895 	trh = &transit_list_head;
   2896 	if (trh->trh_head == NULL)
   2897 		return (0);
   2898 
   2899 	mutex_enter(&trh->trh_lock);
   2900 	tlp = pfnum_to_transit_list(trh, pfnum);
   2901 	ret = (tlp != NULL && tlp->trl_collect);
   2902 	mutex_exit(&trh->trh_lock);
   2903 
   2904 	return (ret);
   2905 }
   2906 
   2907 #ifdef MEM_DEL_STATS
   2908 extern int hz;
   2909 static void
   2910 mem_del_stat_print_func(struct mem_handle *mhp)
   2911 {
   2912 	uint64_t tmp;
   2913 
   2914 	if (mem_del_stat_print) {
   2915 		printf("memory delete loop %x/%x, statistics%s\n",
   2916 		    (uint_t)mhp->mh_transit.trl_spans->mds_base,
   2917 		    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
   2918 		    (mhp->mh_cancel ? " (cancelled)" : ""));
   2919 		printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
   2920 		printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
   2921 		printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
   2922 		printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
   2923 		printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
   2924 		printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
   2925 		printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
   2926 		printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
   2927 		printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
   2928 		printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
   2929 		printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
   2930 		printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
   2931 		printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
   2932 		printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
   2933 		printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
   2934 		printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
   2935 		printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
   2936 		printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
   2937 		printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
   2938 		printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
   2939 		printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
   2940 		printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
   2941 		printf("\t%8u retired\n", mhp->mh_delstat.retired);
   2942 		printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
   2943 		printf("\t%8u failing\n", mhp->mh_delstat.failing);
   2944 		printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
   2945 		printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
   2946 		printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
   2947 		printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
   2948 		tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
   2949 		printf(
   2950 		    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
   2951 		    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);
   2952 
   2953 		tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
   2954 		printf(
   2955 		    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
   2956 		    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
   2957 	}
   2958 }
   2959 #endif /* MEM_DEL_STATS */
   2960 
/*
 * One entry in the table of registered memory-configuration callbacks:
 * the client's handler vector together with the opaque argument that is
 * handed back to each handler.  An entry whose vec is NULL is unused.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;	/* handler vector, NULL if free */
	void			*arg;	/* first argument to each handler */
};

/* Capacity of the fixed-size callback table. */
#define	NMEMCALLBACKS		100

/*
 * The callback table itself.  nmemcallbacks is the number of leading
 * entries of mem_callbacks[] that are scanned; entries below that index
 * may still be free (vec == NULL) after an unregister.  The table and the
 * count are modified with mem_callback_rwlock held as writer and walked
 * with it held as reader.
 */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
   2971 
   2972 int
   2973 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
   2974 {
   2975 	uint_t i, found;
   2976 
   2977 	/*
   2978 	 * This test will become more complicated when the version must
   2979 	 * change.
   2980 	 */
   2981 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
   2982 		return (EINVAL);
   2983 
   2984 	if (vec->post_add == NULL || vec->pre_del == NULL ||
   2985 	    vec->post_del == NULL)
   2986 		return (EINVAL);
   2987 
   2988 	rw_enter(&mem_callback_rwlock, RW_WRITER);
   2989 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
   2990 		if (mem_callbacks[i].vec == NULL && found == 0)
   2991 			found = i + 1;
   2992 		if (mem_callbacks[i].vec == vec &&
   2993 		    mem_callbacks[i].arg == arg) {
   2994 #ifdef DEBUG
   2995 			/* Catch this in DEBUG kernels. */
   2996 			cmn_err(CE_WARN, "kphysm_setup_func_register"
   2997 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
   2998 			    (void *)vec, arg, (void *)caller());
   2999 #endif /* DEBUG */
   3000 			rw_exit(&mem_callback_rwlock);
   3001 			return (EEXIST);
   3002 		}
   3003 	}
   3004 	if (found != 0) {
   3005 		i = found - 1;
   3006 	} else {
   3007 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
   3008 		if (nmemcallbacks == NMEMCALLBACKS) {
   3009 			rw_exit(&mem_callback_rwlock);
   3010 			return (ENOMEM);
   3011 		}
   3012 		i = nmemcallbacks++;
   3013 	}
   3014 	mem_callbacks[i].vec = vec;
   3015 	mem_callbacks[i].arg = arg;
   3016 	rw_exit(&mem_callback_rwlock);
   3017 	return (0);
   3018 }
   3019 
   3020 void
   3021 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
   3022 {
   3023 	uint_t i;
   3024 
   3025 	rw_enter(&mem_callback_rwlock, RW_WRITER);
   3026 	for (i = 0; i < nmemcallbacks; i++) {
   3027 		if (mem_callbacks[i].vec == vec &&
   3028 		    mem_callbacks[i].arg == arg) {
   3029 			mem_callbacks[i].vec = NULL;
   3030 			mem_callbacks[i].arg = NULL;
   3031 			if (i == (nmemcallbacks - 1))
   3032 				nmemcallbacks--;
   3033 			break;
   3034 		}
   3035 	}
   3036 	rw_exit(&mem_callback_rwlock);
   3037 }
   3038 
   3039 static void
   3040 kphysm_setup_post_add(pgcnt_t delta_pages)
   3041 {
   3042 	uint_t i;
   3043 
   3044 	rw_enter(&mem_callback_rwlock, RW_READER);
   3045 	for (i = 0; i < nmemcallbacks; i++) {
   3046 		if (mem_callbacks[i].vec != NULL) {
   3047 			(*mem_callbacks[i].vec->post_add)
   3048 			    (mem_callbacks[i].arg, delta_pages);
   3049 		}
   3050 	}
   3051 	rw_exit(&mem_callback_rwlock);
   3052 }
   3053 
   3054 /*
   3055  * Note the locking between pre_del and post_del: The reader lock is held
   3056  * between the two calls to stop the set of functions from changing.
   3057  */
   3058 
   3059 static int
   3060 kphysm_setup_pre_del(pgcnt_t delta_pages)
   3061 {
   3062 	uint_t i;
   3063 	int ret;
   3064 	int aret;
   3065 
   3066 	ret = 0;
   3067 	rw_enter(&mem_callback_rwlock, RW_READER);
   3068 	for (i = 0; i < nmemcallbacks; i++) {
   3069 		if (mem_callbacks[i].vec != NULL) {
   3070 			aret = (*mem_callbacks[i].vec->pre_del)
   3071 			    (mem_callbacks[i].arg, delta_pages);
   3072 			ret |= aret;
   3073 		}
   3074 	}
   3075 
   3076 	return (ret);
   3077 }
   3078 
   3079 static void
   3080 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
   3081 {
   3082 	uint_t i;
   3083 
   3084 	for (i = 0; i < nmemcallbacks; i++) {
   3085 		if (mem_callbacks[i].vec != NULL) {
   3086 			(*mem_callbacks[i].vec->post_del)
   3087 			    (mem_callbacks[i].arg, delta_pages, cancelled);
   3088 		}
   3089 	}
   3090 	rw_exit(&mem_callback_rwlock);
   3091 }
   3092 
/*
 * Split the memseg that contains the page range [base, base + npgs) so
 * that the range gets a memseg of its own (seg_mid), with seg_low and/or
 * seg_high covering whatever is left of the original memseg below and
 * above the range.
 *
 * Returns 1 if the split was performed.  Returns 0, leaving the memsegs
 * list unchanged, when no memseg contains 'base', when the memseg found
 * carries its own page_t metadata (memseg_includes_meta()), when the range
 * extends past the end of that memseg, or when the range already covers
 * the whole memseg so there is nothing to split off.
 *
 * The original memseg is never freed; it is parked on memseg_edit_junk.
 */
static int
kphysm_split_memseg(
	pfn_t base,
	pgcnt_t npgs)
{
	struct memseg *seg;
	struct memseg **segpp;
	pgcnt_t size_low, size_high;
	struct memseg *seg_low, *seg_mid, *seg_high;

	/*
	 * Lock the memsegs list against other updates now
	 */
	memsegs_lock(1);

	/*
	 * Find boot time memseg that wholly covers this area.
	 */

	/*
	 * First find the memseg with page 'base' in it.  segpp is left
	 * pointing at the link that refers to seg, so the replacement chain
	 * can later be spliced in with a single store.
	 */
	for (segpp = &memsegs; (seg = *segpp) != NULL;
	    segpp = &((*segpp)->next)) {
		if (base >= seg->pages_base && base < seg->pages_end)
			break;
	}
	if (seg == NULL) {
		memsegs_unlock(1);
		return (0);
	}
	/* Memsegs that hold their own page_t metadata are not split. */
	if (memseg_includes_meta(seg)) {
		memsegs_unlock(1);
		return (0);
	}
	/* The whole range must lie within this one memseg. */
	if ((base + npgs) > seg->pages_end) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Work out the size of the two segments that will
	 * surround the new segment, one for low address
	 * and one for high.
	 */
	ASSERT(base >= seg->pages_base);
	size_low = base - seg->pages_base;
	ASSERT(seg->pages_end >= (base + npgs));
	size_high = seg->pages_end - (base + npgs);

	/*
	 * Sanity check.  Both sizes zero means the range is exactly the
	 * existing memseg, so there is nothing to split.
	 */
	if ((size_low + size_high) == 0) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Allocate the new structures. The old memseg will not be freed
	 * as there may be a reference to it.
	 */
	seg_low = NULL;
	seg_high = NULL;

	if (size_low != 0)
		seg_low = memseg_alloc();

	seg_mid = memseg_alloc();

	if (size_high != 0)
		seg_high = memseg_alloc();

	/*
	 * All allocation done now.  Fill in the new memsegs; each inherits
	 * the flags of the original, and they are chained low -> mid ->
	 * high -> seg->next.
	 */
	if (size_low != 0) {
		seg_low->pages = seg->pages;
		seg_low->epages = seg_low->pages + size_low;
		seg_low->pages_base = seg->pages_base;
		seg_low->pages_end = seg_low->pages_base + size_low;
		seg_low->next = seg_mid;
		seg_low->msegflags = seg->msegflags;
	}
	if (size_high != 0) {
		seg_high->pages = seg->epages - size_high;
		seg_high->epages = seg_high->pages + size_high;
		seg_high->pages_base = seg->pages_end - size_high;
		seg_high->pages_end = seg_high->pages_base + size_high;
		seg_high->next = seg->next;
		seg_high->msegflags = seg->msegflags;
	}

	seg_mid->pages = seg->pages + size_low;
	seg_mid->pages_base = seg->pages_base + size_low;
	seg_mid->epages = seg->epages - size_high;
	seg_mid->pages_end = seg->pages_end - size_high;
	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
	seg_mid->msegflags = seg->msegflags;

	/*
	 * Update hat_kpm specific info of all involved memsegs and
	 * allow hat_kpm specific global chain updates.
	 */
	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

	/*
	 * At this point we have two equivalent memseg sub-chains,
	 * seg and seg_low/seg_mid/seg_high, which both chain on to
	 * the same place in the global chain. By re-writing the pointer
	 * in the previous element we switch atomically from using the old
	 * (seg) to the new.
	 */
	*segpp = (seg_low != NULL) ? seg_low : seg_mid;

	membar_enter();

	/* Rebuild the pfn hash now that the list has changed, then unlock. */
	build_pfn_hash();
	memsegs_unlock(1);

	/*
	 * We leave the old segment, 'seg', intact as there may be
	 * references to it. Also, as the value of total_pages has not
	 * changed and the memsegs list is effectively the same when
	 * accessed via the old or the new pointer, we do not have to
	 * cause pageout_scanner() to re-evaluate its hand pointers.
	 *
	 * We currently do not re-use or reclaim the page_t memory.
	 * If we do, then this may have to change.
	 */

	/* Park the superseded memseg on the junk list instead of freeing it. */
	mutex_enter(&memseg_lists_lock);
	seg->lnext = memseg_edit_junk;
	memseg_edit_junk = seg;
	mutex_exit(&memseg_lists_lock);

	return (1);
}
   3229 
   3230 /*
   3231  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
   3232  * structure using physical addresses. Therefore a kmem_cache is
   3233  * used with KMC_NOHASH to avoid page crossings within a memseg
   3234  * structure. KMC_NOHASH requires that no external (outside of
   3235  * slab) information is allowed. This, in turn, implies that the
   3236  * cache's slabsize must be exactly a single page, since per-slab
   3237  * information (e.g. the freelist for the slab) is kept at the
   3238  * end of the slab, where it is easy to locate. Should be changed
   3239  * when a more obvious kmem_cache interface/flag will become
   3240  * available.
   3241  */
   3242 void
   3243 mem_config_init()
   3244 {
   3245 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
   3246 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
   3247 }
   3248 
   3249 struct memseg *
   3250 memseg_alloc()
   3251 {
   3252 	struct memseg *seg;
   3253 
   3254 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
   3255 	bzero(seg, sizeof (struct memseg));
   3256 
   3257 	return (seg);
   3258 }
   3259 
   3260 /*
   3261  * Return whether the page_t memory for this memseg
   3262  * is included in the memseg itself.
   3263  */
   3264 static int
   3265 memseg_includes_meta(struct memseg *seg)
   3266 {
   3267 	return (seg->msegflags & MEMSEG_META_INCL);
   3268 }
   3269 
   3270 pfn_t
   3271 memseg_get_start(struct memseg *seg)
   3272 {
   3273 	pfn_t		pt_start;
   3274 
   3275 	if (memseg_includes_meta(seg)) {
   3276 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
   3277 
   3278 		/* Meta data is required to be at the beginning */
   3279 		ASSERT(pt_start < seg->pages_base);
   3280 	} else
   3281 		pt_start = seg->pages_base;
   3282 
   3283 	return (pt_start);
   3284 }
   3285 
   3286 /*
   3287  * Invalidate memseg pointers in cpu private vm data caches.
   3288  */
   3289 static void
   3290 memseg_cpu_vm_flush()
   3291 {
   3292 	cpu_t *cp;
   3293 	vm_cpu_data_t *vc;
   3294 
   3295 	mutex_enter(&cpu_lock);
   3296 	pause_cpus(NULL);
   3297 
   3298 	cp = cpu_list;
   3299 	do {
   3300 		vc = cp->cpu_vm_data;
   3301 		vc->vc_pnum_memseg = NULL;
   3302 		vc->vc_pnext_memseg = NULL;
   3303 
   3304 	} while ((cp = cp->cpu_next) != cpu_list);
   3305 
   3306 	start_cpus();
   3307 	mutex_exit(&cpu_lock);
   3308 }
   3309