Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/param.h>
     28 #include <sys/thread.h>
     29 #include <sys/proc.h>
     30 #include <sys/callb.h>
     31 #include <sys/vnode.h>
     32 #include <sys/debug.h>
     33 #include <sys/systm.h>		/* for bzero */
     34 #include <sys/memlist.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
     38 #include <sys/atomic.h>		/* used to update kcage_freemem */
     39 #include <sys/kmem.h>		/* for kmem_reap */
     40 #include <sys/errno.h>
     41 #include <sys/mem_cage.h>
     42 #include <vm/seg_kmem.h>
     43 #include <vm/page.h>
     44 #include <vm/hat.h>
     45 #include <vm/vm_dep.h>
     46 #include <sys/mem_config.h>
     47 #include <sys/lgrp.h>
     48 #include <sys/rwlock.h>
     49 #include <sys/cpupart.h>
     50 
     51 extern pri_t maxclsyspri;
     52 
     53 #ifdef DEBUG
     54 #define	KCAGE_STATS
     55 #endif
     56 
     57 #ifdef KCAGE_STATS
     58 
     59 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
     60 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
     61 
/*
 * Statistics gathered for a single scan. struct kcage_stats keeps an
 * array of KCAGE_STATS_NSCANS of these (scans[]); the slot currently being
 * filled is selected by kcage_stats.scan_index and is updated through the
 * KCAGE_STAT_*_SCAN macros.
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;	/* ddi_get_lbolt() when the slot was closed */
	uint_t	scan_id;	/* value of scan_index when slot was closed */

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};
     97 
/*
 * Global cage statistics (KCAGE_STATS builds only). The scalar counters
 * are bumped through KCAGE_STAT_INCR/NINCR/SET/SETZ; the trailing scans[]
 * array is a circular buffer of per-scan records that is advanced by
 * KCAGE_STAT_INC_SCAN_INDEX.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;	/* KCAGE_STATS_VERSION */
	uint_t	size;		/* sizeof (kcage_stats) */

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;	/* KCAGE_STATS_NSCANS */
	uint_t	scan_index;		/* slot of scans[] being filled */
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};
    142 
    143 static struct kcage_stats kcage_stats;
    144 static struct kcage_stats_scan kcage_stats_scan_zero;
    145 
    146 /*
    147  * No real need for atomics here. For the most part the incs and sets are
    148  * done by the kernel cage thread. There are a few that are done by any
    149  * number of other threads. Those cases are noted by comments.
    150  */
    151 #define	KCAGE_STAT_INCR(m)	kcage_stats.m++
    152 
    153 #define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)
    154 
    155 #define	KCAGE_STAT_INCR_SCAN(m)	\
    156 	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)
    157 
    158 #define	KCAGE_STAT_NINCR_SCAN(m, v) \
    159 	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)
    160 
    161 #define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)
    162 
    163 #define	KCAGE_STAT_SETZ(m, v)	\
    164 	if (kcage_stats.m == 0) kcage_stats.m = (v)
    165 
    166 #define	KCAGE_STAT_SET_SCAN(m, v)	\
    167 	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)
    168 
    169 #define	KCAGE_STAT_SETZ_SCAN(m, v)	\
    170 	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)
    171 
    172 #define	KCAGE_STAT_INC_SCAN_INDEX \
    173 	KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \
    174 	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
    175 	kcage_stats.scan_index = \
    176 	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
    177 	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero
    178 
    179 #define	KCAGE_STAT_INIT_SCAN_INDEX \
    180 	kcage_stats.version = KCAGE_STATS_VERSION; \
    181 	kcage_stats.size = sizeof (kcage_stats); \
    182 	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
    183 	kcage_stats.scan_index = 0
    184 
    185 #else /* KCAGE_STATS */
    186 
    187 #define	KCAGE_STAT_INCR(v)
    188 #define	KCAGE_STAT_NINCR(m, v)
    189 #define	KCAGE_STAT_INCR_SCAN(v)
    190 #define	KCAGE_STAT_NINCR_SCAN(m, v)
    191 #define	KCAGE_STAT_SET(m, v)
    192 #define	KCAGE_STAT_SETZ(m, v)
    193 #define	KCAGE_STAT_SET_SCAN(m, v)
    194 #define	KCAGE_STAT_SETZ_SCAN(m, v)
    195 #define	KCAGE_STAT_INC_SCAN_INDEX
    196 #define	KCAGE_STAT_INIT_SCAN_INDEX
    197 
    198 #endif /* KCAGE_STATS */
    199 
    200 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
    201 static kcondvar_t kcage_throttle_cv;
    202 
    203 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
    204 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
    205 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
    206 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
    207 
    208 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
    209 
/*
 * Cage expansion happens within a range.
 *
 * Each element of the growth list describes one pfn range [base, lim)
 * and how far the cage has grown into it. For an incrementing range
 * (decr == 0) the pfns in [base, curr) have been handed out; for a
 * decrementing range (decr != 0) the pfns in [curr, lim) have been
 * handed out.
 */
struct kcage_glist {
	struct kcage_glist	*next;	/* next range on the list */
	pfn_t			base;	/* first pfn of the range */
	pfn_t			lim;	/* one past the last pfn of the range */
	pfn_t			curr;	/* current growth point */
	int			decr;	/* nonzero: grows towards base */
};
    220 
    221 static struct kcage_glist *kcage_glist;
    222 static struct kcage_glist *kcage_current_glist;
    223 
    224 /*
    225  * The firstfree element is provided so that kmem_alloc can be avoided
    226  * until that cage has somewhere to go. This is not currently a problem
    227  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
    228  */
    229 static vmem_t *kcage_arena;
    230 static struct kcage_glist kcage_glist_firstfree;
    231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
    232 
    233 /*
    234  * Miscellaneous forward references
    235  */
    236 static struct kcage_glist *kcage_glist_alloc(void);
    237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
    238 static void kcage_cageout(void);
    239 static int kcage_invalidate_page(page_t *, pgcnt_t *);
    240 static int kcage_setnoreloc_pages(page_t *, se_t);
    241 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
    242 static void kcage_init(pgcnt_t preferred_size);
    243 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
    244 
    245 /*
    246  * Kernel Memory Cage counters and thresholds.
    247  */
    248 int kcage_on = 0;
    249 pgcnt_t kcage_freemem;
    250 pgcnt_t kcage_needfree;
    251 pgcnt_t kcage_lotsfree;
    252 pgcnt_t kcage_desfree;
    253 pgcnt_t kcage_minfree;
    254 pgcnt_t kcage_throttlefree;
    255 pgcnt_t	kcage_reserve;
    256 int kcage_maxwait = 10;	/* in seconds */
    257 
    258 /* when we use lp for kmem we start the cage at a higher initial value */
    259 pgcnt_t kcage_kmemlp_mincage;
    260 
    261 #ifdef DEBUG
    262 pgcnt_t	kcage_pagets;
    263 #define	KCAGEPAGETS_INC()	kcage_pagets++
    264 #else
    265 #define	KCAGEPAGETS_INC()
    266 #endif
    267 
    268 /* kstats to export what pages are currently caged */
    269 kmutex_t kcage_kstat_lock;
    270 static int kcage_kstat_update(kstat_t *ksp, int rw);
    271 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
    272 
    273 /*
    274  * Startup and Dynamic Reconfiguration interfaces.
    275  * kcage_range_add()
 * kcage_range_delete()
    277  * kcage_range_delete_post_mem_del()
    278  * kcage_range_init()
    279  * kcage_set_thresholds()
    280  */
    281 
    282 /*
    283  * Called from page_get_contig_pages to get the approximate kcage pfn range
    284  * for exclusion from search for contiguous pages. This routine is called
    285  * without kcage_range lock (kcage routines can call page_get_contig_pages
    286  * through page_relocate) and with the assumption, based on kcage_range_add,
    287  * that kcage_current_glist always contain a valid pointer.
    288  */
    289 
    290 int
    291 kcage_current_pfn(pfn_t *pfncur)
    292 {
    293 	struct kcage_glist *lp = kcage_current_glist;
    294 
    295 	ASSERT(kcage_on);
    296 
    297 	ASSERT(lp != NULL);
    298 
    299 	*pfncur = lp->curr;
    300 
    301 	return (lp->decr);
    302 }
    303 
    304 /*
    305  * Called from vm_pagelist.c during coalesce to find kernel cage regions
    306  * within an mnode. Looks for the lowest range between lo and hi.
    307  *
    308  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
    309  * Non-cage memory is defined between kcage_current_glist and list end.
    310  *
    311  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
    312  * non-cage range.
    313  *
    314  * Returns zero on success and nlo, nhi:
    315  * 	lo <= nlo < nhi <= hi
    316  * Returns non-zero if no overlapping range is found.
    317  */
    318 int
    319 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
    320     pfn_t *nlo, pfn_t *nhi)
    321 {
    322 	struct kcage_glist *lp;
    323 	pfn_t tlo = hi;
    324 	pfn_t thi = hi;
    325 
    326 	ASSERT(lo <= hi);
    327 
    328 	/*
    329 	 * Reader lock protects the list, but kcage_get_pfn
    330 	 * running concurrently may advance kcage_current_glist
    331 	 * and also update kcage_current_glist->curr. Page
    332 	 * coalesce can handle this race condition.
    333 	 */
    334 	rw_enter(&kcage_range_rwlock, RW_READER);
    335 
    336 	for (lp = incage ? kcage_glist : kcage_current_glist;
    337 	    lp != NULL; lp = lp->next) {
    338 
    339 		pfn_t klo, khi;
    340 
    341 		/* find the range limits in this element */
    342 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
    343 			klo = lp->curr;
    344 			khi = lp->lim;
    345 		} else {
    346 			klo = lp->base;
    347 			khi = lp->curr;
    348 		}
    349 
    350 		/* handle overlap */
    351 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
    352 			tlo = MAX(lo, klo);
    353 			thi = MIN(hi, khi);
    354 			if (tlo == lo)
    355 				break;
    356 		}
    357 
    358 		/* check end of kcage */
    359 		if (incage && lp == kcage_current_glist) {
    360 			break;
    361 		}
    362 	}
    363 
    364 	rw_exit(&kcage_range_rwlock);
    365 
    366 	/* return non-zero if no overlapping range found */
    367 	if (tlo == thi)
    368 		return (1);
    369 
    370 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
    371 
    372 	/* return overlapping range */
    373 	*nlo = tlo;
    374 	*nhi = thi;
    375 	return (0);
    376 }
    377 
    378 void
    379 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
    380 {
    381 	int ret = 0;
    382 
    383 	ASSERT(kcage_arena == NULL);
    384 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
    385 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
    386 	ASSERT(kcage_arena != NULL);
    387 
    388 	if (d == KCAGE_DOWN) {
    389 		while (ml->next != NULL)
    390 			ml = ml->next;
    391 	}
    392 
    393 	rw_enter(&kcage_range_rwlock, RW_WRITER);
    394 
    395 	while (ml != NULL) {
    396 		ret = kcage_range_add_internal(btop(ml->address),
    397 		    btop(ml->size), d);
    398 		if (ret)
    399 			panic("kcage_range_add_internal failed: "
    400 			    "ml=%p, ret=0x%x\n", (void *)ml, ret);
    401 
    402 		ml = (d == KCAGE_DOWN ? ml->prev : ml->next);
    403 	}
    404 
    405 	rw_exit(&kcage_range_rwlock);
    406 
    407 	if (ret == 0)
    408 		kcage_init(preferred_size);
    409 }
    410 
    411 /*
    412  * Third arg controls direction of growth: 0: increasing pfns,
    413  * 1: decreasing.
    414  */
    415 static int
    416 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
    417 {
    418 	struct kcage_glist *new, **lpp;
    419 	pfn_t lim;
    420 
    421 	ASSERT(rw_write_held(&kcage_range_rwlock));
    422 
    423 	ASSERT(npgs != 0);
    424 	if (npgs == 0)
    425 		return (EINVAL);
    426 
    427 	lim = base + npgs;
    428 
    429 	ASSERT(lim > base);
    430 	if (lim <= base)
    431 		return (EINVAL);
    432 
    433 	new = kcage_glist_alloc();
    434 	if (new == NULL) {
    435 		return (ENOMEM);
    436 	}
    437 
    438 	new->base = base;
    439 	new->lim = lim;
    440 	new->decr = (d == KCAGE_DOWN);
    441 	if (new->decr != 0)
    442 		new->curr = new->lim;
    443 	else
    444 		new->curr = new->base;
    445 	/*
    446 	 * Any overlapping existing ranges are removed by deleting
    447 	 * from the new list as we search for the tail.
    448 	 */
    449 	lpp = &kcage_glist;
    450 	while (*lpp != NULL) {
    451 		int ret;
    452 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
    453 		if (ret != 0)
    454 			return (ret);
    455 		lpp = &(*lpp)->next;
    456 	}
    457 
    458 	*lpp = new;
    459 
    460 	if (kcage_current_glist == NULL) {
    461 		kcage_current_glist = kcage_glist;
    462 	}
    463 
    464 	return (0);
    465 }
    466 
    467 int
    468 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
    469 {
    470 	int ret;
    471 
    472 	rw_enter(&kcage_range_rwlock, RW_WRITER);
    473 	ret = kcage_range_add_internal(base, npgs, d);
    474 	rw_exit(&kcage_range_rwlock);
    475 	return (ret);
    476 }
    477 
    478 /*
    479  * Calls to add and delete must be protected by kcage_range_rwlock
    480  */
    481 static int
    482 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
    483 {
    484 	struct kcage_glist *lp;
    485 	pfn_t lim;
    486 
    487 	ASSERT(rw_write_held(&kcage_range_rwlock));
    488 
    489 	ASSERT(npgs != 0);
    490 	if (npgs == 0)
    491 		return (EINVAL);
    492 
    493 	lim = base + npgs;
    494 
    495 	ASSERT(lim > base);
    496 	if (lim <= base)
    497 		return (EINVAL);
    498 
    499 	/*
    500 	 * Check if the delete is OK first as a number of elements
    501 	 * might be involved and it will be difficult to go
    502 	 * back and undo (can't just add the range back in).
    503 	 */
    504 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
    505 		/*
    506 		 * If there have been no pages allocated from this
    507 		 * element, we don't need to check it.
    508 		 */
    509 		if ((lp->decr == 0 && lp->curr == lp->base) ||
    510 		    (lp->decr != 0 && lp->curr == lp->lim))
    511 			continue;
    512 		/*
    513 		 * If the element does not overlap, its OK.
    514 		 */
    515 		if (base >= lp->lim || lim <= lp->base)
    516 			continue;
    517 		/*
    518 		 * Overlapping element: Does the range to be deleted
    519 		 * overlap the area already used? If so fail.
    520 		 */
    521 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
    522 			return (EBUSY);
    523 		}
    524 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
    525 			return (EBUSY);
    526 		}
    527 	}
    528 	return (kcage_glist_delete(base, lim, &kcage_glist));
    529 }
    530 
    531 int
    532 kcage_range_delete(pfn_t base, pgcnt_t npgs)
    533 {
    534 	int ret;
    535 
    536 	rw_enter(&kcage_range_rwlock, RW_WRITER);
    537 	ret = kcage_range_delete_internal(base, npgs);
    538 	rw_exit(&kcage_range_rwlock);
    539 	return (ret);
    540 }
    541 
    542 /*
    543  * Calls to add and delete must be protected by kcage_range_rwlock.
    544  * This routine gets called after successful Solaris memory
    545  * delete operation from DR post memory delete routines.
    546  */
    547 static int
    548 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
    549 {
    550 	pfn_t lim;
    551 
    552 	ASSERT(rw_write_held(&kcage_range_rwlock));
    553 
    554 	ASSERT(npgs != 0);
    555 	if (npgs == 0)
    556 		return (EINVAL);
    557 
    558 	lim = base + npgs;
    559 
    560 	ASSERT(lim > base);
    561 	if (lim <= base)
    562 		return (EINVAL);
    563 
    564 	return (kcage_glist_delete(base, lim, &kcage_glist));
    565 }
    566 
    567 int
    568 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
    569 {
    570 	int ret;
    571 
    572 	rw_enter(&kcage_range_rwlock, RW_WRITER);
    573 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
    574 	rw_exit(&kcage_range_rwlock);
    575 	return (ret);
    576 }
    577 
    578 /*
    579  * No locking is required here as the whole operation is covered
    580  * by kcage_range_rwlock writer lock.
    581  */
    582 static struct kcage_glist *
    583 kcage_glist_alloc(void)
    584 {
    585 	struct kcage_glist *new;
    586 
    587 	if ((new = kcage_glist_freelist) != NULL) {
    588 		kcage_glist_freelist = new->next;
    589 	} else if (kernel_cage_enable) {
    590 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
    591 	} else {
    592 		/*
    593 		 * On DR supported platforms we allow memory add
    594 		 * even when kernel cage is disabled. "kcage_arena" is
    595 		 * created only when kernel cage is enabled.
    596 		 */
    597 		new = kmem_zalloc(sizeof (*new), KM_NOSLEEP);
    598 	}
    599 
    600 	if (new != NULL)
    601 		bzero(new, sizeof (*new));
    602 
    603 	return (new);
    604 }
    605 
    606 static void
    607 kcage_glist_free(struct kcage_glist *lp)
    608 {
    609 	lp->next = kcage_glist_freelist;
    610 	kcage_glist_freelist = lp;
    611 }
    612 
    613 static int
    614 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
    615 {
    616 	struct kcage_glist *lp, *prev = *lpp;
    617 
    618 	while ((lp = *lpp) != NULL) {
    619 		if (lim > lp->base && base < lp->lim) {
    620 			/* The delete range overlaps this element. */
    621 			if (base <= lp->base && lim >= lp->lim) {
    622 				/* Delete whole element. */
    623 				*lpp = lp->next;
    624 				if (lp == kcage_current_glist) {
    625 					/* This can never happen. */
    626 					ASSERT(kcage_current_glist != prev);
    627 					kcage_current_glist = prev;
    628 				}
    629 				kcage_glist_free(lp);
    630 				continue;
    631 			}
    632 
    633 			/* Partial delete. */
    634 			if (base > lp->base && lim < lp->lim) {
    635 				struct kcage_glist *new;
    636 
    637 				/*
    638 				 * Remove a section from the middle,
    639 				 * need to allocate a new element.
    640 				 */
    641 				new = kcage_glist_alloc();
    642 				if (new == NULL) {
    643 					return (ENOMEM);
    644 				}
    645 
    646 				/*
    647 				 * Tranfser unused range to new.
    648 				 * Edit lp in place to preserve
    649 				 * kcage_current_glist.
    650 				 */
    651 				new->decr = lp->decr;
    652 				if (new->decr != 0) {
    653 					new->base = lp->base;
    654 					new->lim = base;
    655 					new->curr = base;
    656 
    657 					lp->base = lim;
    658 				} else {
    659 					new->base = lim;
    660 					new->lim = lp->lim;
    661 					new->curr = new->base;
    662 
    663 					lp->lim = base;
    664 				}
    665 
    666 				/* Insert new. */
    667 				new->next = lp->next;
    668 				lp->next = new;
    669 				lpp = &lp->next;
    670 			} else {
    671 				/* Delete part of current block. */
    672 				if (base > lp->base) {
    673 					ASSERT(lim >= lp->lim);
    674 					ASSERT(base < lp->lim);
    675 					if (lp->decr != 0 &&
    676 					    lp->curr == lp->lim)
    677 						lp->curr = base;
    678 					lp->lim = base;
    679 				} else {
    680 					ASSERT(base <= lp->base);
    681 					ASSERT(lim > lp->base);
    682 					if (lp->decr == 0 &&
    683 					    lp->curr == lp->base)
    684 						lp->curr = lim;
    685 					lp->base = lim;
    686 				}
    687 			}
    688 		}
    689 		prev = *lpp;
    690 		lpp = &(*lpp)->next;
    691 	}
    692 
    693 	return (0);
    694 }
    695 
    696 /*
    697  * If lockit is 1, kcage_get_pfn holds the
    698  * reader lock for kcage_range_rwlock.
    699  * Changes to lp->curr can cause race conditions, but
    700  * they are handled by higher level code (see kcage_next_range.)
    701  */
    702 static pfn_t
    703 kcage_get_pfn(int lockit)
    704 {
    705 	struct kcage_glist *lp;
    706 	pfn_t pfn = PFN_INVALID;
    707 
    708 	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
    709 		return (pfn);
    710 
    711 	lp = kcage_current_glist;
    712 	while (lp != NULL) {
    713 		if (lp->decr != 0) {
    714 			if (lp->curr != lp->base) {
    715 				pfn = --lp->curr;
    716 				break;
    717 			}
    718 		} else {
    719 			if (lp->curr != lp->lim) {
    720 				pfn = lp->curr++;
    721 				break;
    722 			}
    723 		}
    724 
    725 		lp = lp->next;
    726 		if (lp)
    727 			kcage_current_glist = lp;
    728 	}
    729 
    730 	if (lockit)
    731 		rw_exit(&kcage_range_rwlock);
    732 	return (pfn);
    733 }
    734 
    735 /*
    736  * Walk the physical address space of the cage.
    737  * This routine does not guarantee to return PFNs in the order
    738  * in which they were allocated to the cage. Instead, it walks
    739  * each range as they appear on the growth list returning the PFNs
    740  * range in ascending order.
    741  *
    742  * To begin scanning at lower edge of cage, reset should be nonzero.
    743  * To step through cage, reset should be zero.
    744  *
    745  * PFN_INVALID will be returned when the upper end of the cage is
    746  * reached -- indicating a full scan of the cage has been completed since
    747  * previous reset. PFN_INVALID will continue to be returned until
    748  * kcage_walk_cage is reset.
    749  *
    750  * It is possible to receive a PFN_INVALID result on reset if a growth
    751  * list is not installed or if none of the PFNs in the installed list have
    752  * been allocated to the cage. In otherwords, there is no cage.
    753  *
    754  * Caller need not hold kcage_range_rwlock while calling this function
    755  * as the front part of the list is static - pages never come out of
    756  * the cage.
    757  *
    758  * The caller is expected to only be kcage_cageout().
    759  */
    760 static pfn_t
    761 kcage_walk_cage(int reset)
    762 {
    763 	static struct kcage_glist *lp = NULL;
    764 	static pfn_t pfn;
    765 
    766 	if (reset)
    767 		lp = NULL;
    768 	if (lp == NULL) {
    769 		lp = kcage_glist;
    770 		pfn = PFN_INVALID;
    771 	}
    772 again:
    773 	if (pfn == PFN_INVALID) {
    774 		if (lp == NULL)
    775 			return (PFN_INVALID);
    776 
    777 		if (lp->decr != 0) {
    778 			/*
    779 			 * In this range the cage grows from the highest
    780 			 * address towards the lowest.
    781 			 * Arrange to return pfns from curr to lim-1,
    782 			 * inclusive, in ascending order.
    783 			 */
    784 
    785 			pfn = lp->curr;
    786 		} else {
    787 			/*
    788 			 * In this range the cage grows from the lowest
    789 			 * address towards the highest.
    790 			 * Arrange to return pfns from base to curr,
    791 			 * inclusive, in ascending order.
    792 			 */
    793 
    794 			pfn = lp->base;
    795 		}
    796 	}
    797 
    798 	if (lp->decr != 0) {		/* decrementing pfn */
    799 		if (pfn == lp->lim) {
    800 			/* Don't go beyond the static part of the glist. */
    801 			if (lp == kcage_current_glist)
    802 				lp = NULL;
    803 			else
    804 				lp = lp->next;
    805 			pfn = PFN_INVALID;
    806 			goto again;
    807 		}
    808 
    809 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
    810 	} else {			/* incrementing pfn */
    811 		if (pfn == lp->curr) {
    812 			/* Don't go beyond the static part of the glist. */
    813 			if (lp == kcage_current_glist)
    814 				lp = NULL;
    815 			else
    816 				lp = lp->next;
    817 			pfn = PFN_INVALID;
    818 			goto again;
    819 		}
    820 
    821 		ASSERT(pfn >= lp->base && pfn < lp->curr);
    822 	}
    823 
    824 	return (pfn++);
    825 }
    826 
    827 /*
    828  * Callback functions for to recalc cage thresholds after
    829  * Kphysm memory add/delete operations.
    830  */
    831 /*ARGSUSED*/
    832 static void
    833 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
    834 {
    835 	kcage_recalc_thresholds();
    836 }
    837 
    838 /*ARGSUSED*/
    839 static int
    840 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
    841 {
    842 	/* TODO: when should cage refuse memory delete requests? */
    843 	return (0);
    844 }
    845 
    846 /*ARGSUSED*/
    847 static  void
    848 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
    849 {
    850 	kcage_recalc_thresholds();
    851 }
    852 
/*
 * Callback vector handed to kphysm_setup_func_register() by kcage_init()
 * so that the cage thresholds are recalculated when memory is added or
 * deleted.
 */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};
    859 
    860 /*
    861  * This is called before a CPR suspend and after a CPR resume.  We have to
    862  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
    863  * restart.
    864  */
    865 /*ARGSUSED*/
    866 static boolean_t
    867 kcage_cageout_cpr(void *arg, int code)
    868 {
    869 	if (code == CB_CODE_CPR_CHKPT) {
    870 		ASSERT(kcage_cageout_ready);
    871 		kcage_cageout_ready = 0;
    872 		return (B_TRUE);
    873 	} else if (code == CB_CODE_CPR_RESUME) {
    874 		ASSERT(kcage_cageout_ready == 0);
    875 		kcage_cageout_ready = 1;
    876 		return (B_TRUE);
    877 	}
    878 	return (B_FALSE);
    879 }
    880 
    881 /*
    882  * kcage_recalc_preferred_size() increases initial cage size to improve large
    883  * page availability when lp for kmem is enabled and kpr is disabled
    884  */
    885 static pgcnt_t
    886 kcage_recalc_preferred_size(pgcnt_t preferred_size)
    887 {
    888 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
    889 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
    890 		if (lpmincage == 0) {
    891 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
    892 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
    893 		}
    894 		kcage_kmemlp_mincage = MIN(lpmincage,
    895 		    (segkmem_kmemlp_max / PAGESIZE));
    896 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
    897 	}
    898 	return (preferred_size);
    899 }
    900 
    901 /*
    902  * Kcage_init() builds the cage and initializes the cage thresholds.
    903  * The size of the cage is determined by the argument preferred_size.
    904  * or the actual amount of memory, whichever is smaller.
    905  */
    906 static void
    907 kcage_init(pgcnt_t preferred_size)
    908 {
    909 	pgcnt_t wanted;
    910 	pfn_t pfn;
    911 	page_t *pp;
    912 	kstat_t *ksp;
    913 
    914 	extern void page_list_noreloc_startup(page_t *);
    915 
    916 	ASSERT(!kcage_on);
    917 
    918 	/* increase preferred cage size for lp for kmem */
    919 	preferred_size = kcage_recalc_preferred_size(preferred_size);
    920 
    921 	/* Debug note: initialize this now so early expansions can stat */
    922 	KCAGE_STAT_INIT_SCAN_INDEX;
    923 
    924 	/*
    925 	 * Initialize cage thresholds and install kphysm callback.
    926 	 * If we can't arrange to have the thresholds track with
    927 	 * available physical memory, then the cage thresholds may
    928 	 * end up over time at levels that adversly effect system
    929 	 * performance; so, bail out.
    930 	 */
    931 	kcage_recalc_thresholds();
    932 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
    933 		ASSERT(0);		/* Catch this in DEBUG kernels. */
    934 		return;
    935 	}
    936 
    937 	/*
    938 	 * Limit startup cage size within the range of kcage_minfree
    939 	 * and availrmem, inclusively.
    940 	 */
    941 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
    942 
    943 	/*
    944 	 * Construct the cage. PFNs are allocated from the glist. It
    945 	 * is assumed that the list has been properly ordered for the
    946 	 * platform by the platform code. Typically, this is as simple
    947 	 * as calling kcage_range_init(phys_avail, decr), where decr is
    948 	 * 1 if the kernel has been loaded into upper end of physical
    949 	 * memory, or 0 if the kernel has been loaded at the low end.
    950 	 *
    951 	 * Note: it is assumed that we are in the startup flow, so there
    952 	 * is no reason to grab the page lock.
    953 	 */
    954 	kcage_freemem = 0;
    955 	pfn = PFN_INVALID;			/* prime for alignment test */
    956 	while (wanted != 0) {
    957 		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
    958 			break;
    959 
    960 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
    961 			KCAGEPAGETS_INC();
    962 			/*
    963 			 * Set the noreloc state on the page.
    964 			 * If the page is free and not already
    965 			 * on the noreloc list then move it.
    966 			 */
    967 			if (PP_ISFREE(pp)) {
    968 				if (PP_ISNORELOC(pp) == 0)
    969 					page_list_noreloc_startup(pp);
    970 			} else {
    971 				ASSERT(pp->p_szc == 0);
    972 				PP_SETNORELOC(pp);
    973 			}
    974 		}
    975 		PLCNT_XFER_NORELOC(pp);
    976 		wanted -= 1;
    977 	}
    978 
    979 	/*
    980 	 * Need to go through and find kernel allocated pages
    981 	 * and capture them into the Cage.  These will primarily
    982 	 * be pages gotten through boot_alloc().
    983 	 */
    984 	if (kvp.v_pages) {
    985 
    986 		pp = kvp.v_pages;
    987 		do {
    988 			ASSERT(!PP_ISFREE(pp));
    989 			ASSERT(pp->p_szc == 0);
    990 			if (PP_ISNORELOC(pp) == 0) {
    991 				PP_SETNORELOC(pp);
    992 				PLCNT_XFER_NORELOC(pp);
    993 			}
    994 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
    995 
    996 	}
    997 
    998 	kcage_on = 1;
    999 
   1000 	/*
   1001 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
   1002 	 * after the cageout thread is blocked, and executes from cpr_resume()
   1003 	 * before the cageout thread is restarted.  By executing in this class,
   1004 	 * we are assured that the kernel cage thread won't miss wakeup calls
   1005 	 * and also CPR's larger kmem_alloc requests will not fail after
   1006 	 * CPR shuts down the cageout kernel thread.
   1007 	 */
   1008 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
   1009 	    "cageout");
   1010 
   1011 	/*
   1012 	 * Coalesce pages to improve large page availability. A better fix
   1013 	 * would to coalesce pages as they are included in the cage
   1014 	 */
   1015 	if (SEGKMEM_USE_LARGEPAGES) {
   1016 		extern void page_freelist_coalesce_all(int mnode);
   1017 		page_freelist_coalesce_all(-1);	/* do all mnodes */
   1018 	}
   1019 
   1020 	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
   1021 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
   1022 	if (ksp != NULL) {
   1023 		ksp->ks_update = kcage_kstat_update;
   1024 		ksp->ks_snapshot = kcage_kstat_snapshot;
   1025 		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
   1026 		kstat_install(ksp);
   1027 	}
   1028 }
   1029 
   1030 static int
   1031 kcage_kstat_update(kstat_t *ksp, int rw)
   1032 {
   1033 	struct kcage_glist *lp;
   1034 	uint_t count;
   1035 
   1036 	if (rw == KSTAT_WRITE)
   1037 		return (EACCES);
   1038 
   1039 	count = 0;
   1040 	rw_enter(&kcage_range_rwlock, RW_WRITER);
   1041 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
   1042 		if (lp->decr) {
   1043 			if (lp->curr != lp->lim) {
   1044 				count++;
   1045 			}
   1046 		} else {
   1047 			if (lp->curr != lp->base) {
   1048 				count++;
   1049 			}
   1050 		}
   1051 	}
   1052 	rw_exit(&kcage_range_rwlock);
   1053 
   1054 	ksp->ks_ndata = count;
   1055 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
   1056 
   1057 	return (0);
   1058 }
   1059 
   1060 static int
   1061 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
   1062 {
   1063 	struct kcage_glist *lp;
   1064 	struct memunit {
   1065 		uint64_t address;
   1066 		uint64_t size;
   1067 	} *kspmem;
   1068 
   1069 	if (rw == KSTAT_WRITE)
   1070 		return (EACCES);
   1071 
   1072 	ksp->ks_snaptime = gethrtime();
   1073 
   1074 	kspmem = (struct memunit *)buf;
   1075 	rw_enter(&kcage_range_rwlock, RW_WRITER);
   1076 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
   1077 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
   1078 			break;
   1079 
   1080 		if (lp->decr) {
   1081 			if (lp->curr != lp->lim) {
   1082 				kspmem->address = ptob(lp->curr);
   1083 				kspmem->size = ptob(lp->lim - lp->curr);
   1084 			}
   1085 		} else {
   1086 			if (lp->curr != lp->base) {
   1087 				kspmem->address = ptob(lp->base);
   1088 				kspmem->size = ptob(lp->curr - lp->base);
   1089 			}
   1090 		}
   1091 	}
   1092 	rw_exit(&kcage_range_rwlock);
   1093 
   1094 	return (0);
   1095 }
   1096 
   1097 void
   1098 kcage_recalc_thresholds()
   1099 {
   1100 	static int first = 1;
   1101 	static pgcnt_t init_lotsfree;
   1102 	static pgcnt_t init_desfree;
   1103 	static pgcnt_t init_minfree;
   1104 	static pgcnt_t init_throttlefree;
   1105 	static pgcnt_t init_reserve;
   1106 
   1107 	/* TODO: any reason to take more care than this with live editing? */
   1108 	mutex_enter(&kcage_cageout_mutex);
   1109 	mutex_enter(&freemem_lock);
   1110 
   1111 	if (first) {
   1112 		first = 0;
   1113 		init_lotsfree = kcage_lotsfree;
   1114 		init_desfree = kcage_desfree;
   1115 		init_minfree = kcage_minfree;
   1116 		init_throttlefree = kcage_throttlefree;
   1117 		init_reserve = kcage_reserve;
   1118 	} else {
   1119 		kcage_lotsfree = init_lotsfree;
   1120 		kcage_desfree = init_desfree;
   1121 		kcage_minfree = init_minfree;
   1122 		kcage_throttlefree = init_throttlefree;
   1123 		kcage_reserve = init_reserve;
   1124 	}
   1125 
   1126 	if (kcage_lotsfree == 0)
   1127 		kcage_lotsfree = MAX(32, total_pages / 256);
   1128 
   1129 	if (kcage_minfree == 0)
   1130 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
   1131 
   1132 	if (kcage_desfree == 0)
   1133 		kcage_desfree = MAX(32, kcage_minfree);
   1134 
   1135 	if (kcage_throttlefree == 0)
   1136 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
   1137 
   1138 	if (kcage_reserve == 0)
   1139 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
   1140 
   1141 	mutex_exit(&freemem_lock);
   1142 	mutex_exit(&kcage_cageout_mutex);
   1143 
   1144 	if (kcage_cageout_ready) {
   1145 		if (kcage_freemem < kcage_desfree)
   1146 			kcage_cageout_wakeup();
   1147 
   1148 		if (kcage_needfree) {
   1149 			mutex_enter(&kcage_throttle_mutex);
   1150 			cv_broadcast(&kcage_throttle_cv);
   1151 			mutex_exit(&kcage_throttle_mutex);
   1152 		}
   1153 	}
   1154 }
   1155 
   1156 /*
   1157  * Pageout interface:
   1158  * kcage_cageout_init()
   1159  */
   1160 void
   1161 kcage_cageout_init()
   1162 {
   1163 	if (kcage_on) {
   1164 		(void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL,
   1165 		    TS_RUN, maxclsyspri - 1);
   1166 	}
   1167 }
   1168 
   1169 
   1170 /*
   1171  * VM Interfaces:
   1172  * kcage_create_throttle()
   1173  * kcage_freemem_add()
   1174  * kcage_freemem_sub()
   1175  */
   1176 
   1177 /*
   1178  * Wakeup cageout thread and throttle waiting for the number of pages
   1179  * requested to become available.  For non-critical requests, a
   1180  * timeout is added, since freemem accounting is separate from cage
   1181  * freemem accounting: it's possible for us to get stuck and not make
   1182  * forward progress even though there was sufficient freemem before
   1183  * arriving here.
   1184  */
   1185 int
   1186 kcage_create_throttle(pgcnt_t npages, int flags)
   1187 {
   1188 	int niter = 0;
   1189 	pgcnt_t lastfree;
   1190 	int enough = kcage_freemem > kcage_throttlefree + npages;
   1191 
   1192 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
   1193 
   1194 	kcage_cageout_wakeup();			/* just to be sure */
   1195 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
   1196 
   1197 	/*
   1198 	 * Obviously, we can't throttle the cageout thread since
   1199 	 * we depend on it.  We also can't throttle the panic thread.
   1200 	 */
   1201 	if (curthread == kcage_cageout_thread || panicstr) {
   1202 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
   1203 		return (KCT_CRIT);
   1204 	}
   1205 
   1206 	/*
   1207 	 * Don't throttle threads which are critical for proper
   1208 	 * vm management if we're above kcage_throttlefree or
   1209 	 * if freemem is very low.
   1210 	 */
   1211 	if (NOMEMWAIT()) {
   1212 		if (enough) {
   1213 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
   1214 			return (KCT_CRIT);
   1215 		} else if (freemem < minfree) {
   1216 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
   1217 			return (KCT_CRIT);
   1218 		}
   1219 	}
   1220 
   1221 	/*
   1222 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
   1223 	 */
   1224 	if (DISP_PRIO(curthread) > maxclsyspri &&
   1225 	    kcage_freemem > kcage_reserve) {
   1226 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
   1227 		return (KCT_CRIT);
   1228 	}
   1229 
   1230 	/*
   1231 	 * Cause all other threads (which are assumed to not be
   1232 	 * critical to cageout) to wait here until their request
   1233 	 * can be satisfied. Be a little paranoid and wake the
   1234 	 * kernel cage on each loop through this logic.
   1235 	 */
   1236 	while (kcage_freemem < kcage_throttlefree + npages) {
   1237 		ASSERT(kcage_on);
   1238 
   1239 		lastfree = kcage_freemem;
   1240 
   1241 		if (kcage_cageout_ready) {
   1242 			mutex_enter(&kcage_throttle_mutex);
   1243 
   1244 			kcage_needfree += npages;
   1245 			KCAGE_STAT_INCR(kct_wait);
   1246 
   1247 			kcage_cageout_wakeup();
   1248 			KCAGE_STAT_INCR(kct_cagewake);
   1249 
   1250 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
   1251 
   1252 			kcage_needfree -= npages;
   1253 
   1254 			mutex_exit(&kcage_throttle_mutex);
   1255 		} else {
   1256 			/*
   1257 			 * NOTE: atomics are used just in case we enter
   1258 			 * mp operation before the cageout thread is ready.
   1259 			 */
   1260 			atomic_add_long(&kcage_needfree, npages);
   1261 
   1262 			kcage_cageout_wakeup();
   1263 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
   1264 
   1265 			atomic_add_long(&kcage_needfree, -npages);
   1266 		}
   1267 
   1268 		if ((flags & PG_WAIT) == 0) {
   1269 			if (kcage_freemem > lastfree) {
   1270 				KCAGE_STAT_INCR(kct_progress);
   1271 				niter = 0;
   1272 			} else {
   1273 				KCAGE_STAT_INCR(kct_noprogress);
   1274 				if (++niter >= kcage_maxwait) {
   1275 					KCAGE_STAT_INCR(kct_timeout);
   1276 					return (KCT_FAILURE);
   1277 				}
   1278 			}
   1279 		}
   1280 
   1281 		if (NOMEMWAIT() && freemem < minfree) {
   1282 			return (KCT_CRIT);
   1283 		}
   1284 
   1285 	}
   1286 	return (KCT_NONCRIT);
   1287 }
   1288 
   1289 void
   1290 kcage_freemem_add(pgcnt_t npages)
   1291 {
   1292 	extern void wakeup_pcgs(void);
   1293 
   1294 	atomic_add_long(&kcage_freemem, npages);
   1295 
   1296 	wakeup_pcgs();  /* wakeup threads in pcgs() */
   1297 
   1298 	if (kcage_needfree != 0 &&
   1299 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
   1300 
   1301 		mutex_enter(&kcage_throttle_mutex);
   1302 		cv_broadcast(&kcage_throttle_cv);
   1303 		KCAGE_STAT_INCR(kfa_trottlewake);
   1304 		mutex_exit(&kcage_throttle_mutex);
   1305 	}
   1306 }
   1307 
   1308 void
   1309 kcage_freemem_sub(pgcnt_t npages)
   1310 {
   1311 	atomic_add_long(&kcage_freemem, -npages);
   1312 
   1313 	if (kcage_freemem < kcage_desfree) {
   1314 		kcage_cageout_wakeup();
   1315 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
   1316 	}
   1317 }
   1318 
   1319 /*
   1320  * return 0 on failure and 1 on success.
   1321  */
   1322 static int
   1323 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
   1324 {
   1325 	pgcnt_t npgs, i;
   1326 	page_t *pp;
   1327 	pfn_t rootpfn = page_pptonum(rootpp);
   1328 	uint_t szc;
   1329 
   1330 	ASSERT(!PP_ISFREE(rootpp));
   1331 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
   1332 	if (!group_page_trylock(rootpp, se)) {
   1333 		return (0);
   1334 	}
   1335 	szc = rootpp->p_szc;
   1336 	if (szc == 0) {
   1337 		/*
   1338 		 * The szc of a locked page can only change for pages that are
   1339 		 * non-swapfs (i.e. anonymous memory) file system pages.
   1340 		 */
   1341 		ASSERT(rootpp->p_vnode != NULL &&
   1342 		    !PP_ISKAS(rootpp) &&
   1343 		    !IS_SWAPFSVP(rootpp->p_vnode));
   1344 		PP_SETNORELOC(rootpp);
   1345 		return (1);
   1346 	}
   1347 	npgs = page_get_pagecnt(szc);
   1348 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
   1349 	pp = rootpp;
   1350 	for (i = 0; i < npgs; i++, pp++) {
   1351 		ASSERT(PAGE_LOCKED_SE(pp, se));
   1352 		ASSERT(!PP_ISFREE(pp));
   1353 		ASSERT(pp->p_szc == szc);
   1354 		PP_SETNORELOC(pp);
   1355 	}
   1356 	group_page_unlock(rootpp);
   1357 	return (1);
   1358 }
   1359 
/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and the page is free, move the page to the tail of
 * whichever list it is on.  If successful and the page is not free,
 * hand it to kcage_invalidate_page() to try to free it.
 *
 * The page lock is never held on return from the paths handled directly
 * in this function; the last path returns whatever
 * kcage_invalidate_page() returns.
 *
 * Returns:
 *   EBUSY  page already locked, assimilated but not free.
 *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
 *   EAGAIN page not assimilated. Page not free.
 *   ERANGE page assimilated. Page not root.
 *   0      page assimilated. Page free.
 *   *nfreedp number of pages freed.
 * This function itself only produces 0, EBUSY and EAGAIN; any other
 * value is passed through from kcage_invalidate_page().  Likewise this
 * function stores *nfreedp only on its own paths that return 0.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
			/*
			 * Already caged.  This label is also the target for
			 * the shared-lock path below, so the page may be held
			 * either exclusively or shared at this point.
			 */
check_free_and_return:
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		/* Could not get it exclusively; settle for a shared lock. */
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else
			return (EAGAIN);

		/* Under a shared lock only free pages are pursued further. */
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit. If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage. There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	/* From here on the page is exclusively locked and not yet NORELOC. */
	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		/* Remember which list the page came from so it goes back. */
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		page_list_sub(pp, which);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		PLCNT_XFER_NORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		if (pp->p_szc != 0) {
			/* Large page: flag every constituent page. */
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		PLCNT_XFER_NORELOC(pp);
		/* In use: try to free it; the callee releases the lock. */
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}
   1451 
   1452 static int
   1453 kcage_expand()
   1454 {
   1455 	int did_something = 0;
   1456 
   1457 	spgcnt_t wanted;
   1458 	pfn_t pfn;
   1459 	page_t *pp;
   1460 	/* TODO: we don't really need n any more? */
   1461 	pgcnt_t n;
   1462 	pgcnt_t nf, nfreed;
   1463 
   1464 	/*
   1465 	 * Expand the cage if available cage memory is really low. Calculate
   1466 	 * the amount required to return kcage_freemem to the level of
   1467 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
   1468 	 * more.  It is rare for their sum to create an artificial threshold
   1469 	 * above kcage_lotsfree, but it is possible.
   1470 	 *
   1471 	 * Exit early if expansion amount is equal to or less than zero.
   1472 	 * (<0 is possible if kcage_freemem rises suddenly.)
   1473 	 *
   1474 	 * Exit early when the global page pool (apparently) does not
   1475 	 * have enough free pages to page_relocate() even a single page.
   1476 	 */
   1477 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
   1478 	    - kcage_freemem;
   1479 	if (wanted <= 0)
   1480 		return (0);
   1481 	else if (freemem < pageout_reserve + 1) {
   1482 		KCAGE_STAT_INCR(ke_lowfreemem);
   1483 		return (0);
   1484 	}
   1485 
   1486 	KCAGE_STAT_INCR(ke_calls);
   1487 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
   1488 
   1489 	/*
   1490 	 * Assimilate more pages from the global page pool into the cage.
   1491 	 */
   1492 	n = 0;				/* number of pages PP_SETNORELOC'd */
   1493 	nf = 0;				/* number of those actually free */
   1494 	while (kcage_on && nf < wanted) {
   1495 		pfn = kcage_get_pfn(1);
   1496 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
   1497 			KCAGE_STAT_INCR(ke_nopfn);
   1498 			goto terminate;
   1499 		}
   1500 
   1501 		KCAGE_STAT_INCR_SCAN(ke_examined);
   1502 
   1503 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
   1504 			KCAGE_STAT_INCR(ke_nopaget);
   1505 			continue;
   1506 		}
   1507 		KCAGEPAGETS_INC();
   1508 		/*
   1509 		 * Sanity check. Skip this pfn if it is
   1510 		 * being deleted.
   1511 		 */
   1512 		if (pfn_is_being_deleted(pfn)) {
   1513 			KCAGE_STAT_INCR(ke_deleting);
   1514 			continue;
   1515 		}
   1516 
   1517 		if (PP_ISNORELOC(pp)) {
   1518 			KCAGE_STAT_INCR(ke_isnoreloc);
   1519 			continue;
   1520 		}
   1521 
   1522 		switch (kcage_assimilate_page(pp, &nfreed)) {
   1523 			case 0:		/* assimilated, page is free */
   1524 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
   1525 				did_something = 1;
   1526 				nf += nfreed;
   1527 				n++;
   1528 				break;
   1529 
   1530 			case EBUSY:	/* assimilated, page not free */
   1531 			case ERANGE:	/* assimilated, page not root */
   1532 				KCAGE_STAT_INCR_SCAN(ke_gotone);
   1533 				did_something = 1;
   1534 				n++;
   1535 				break;
   1536 
   1537 			case ENOMEM:	/* assimilated, but no mem */
   1538 				KCAGE_STAT_INCR(ke_terminate);
   1539 				did_something = 1;
   1540 				n++;
   1541 				goto terminate;
   1542 
   1543 			case EAGAIN:	/* can't assimilate */
   1544 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
   1545 				break;
   1546 
   1547 			default:	/* catch this with debug kernels */
   1548 				ASSERT(0);
   1549 				break;
   1550 		}
   1551 	}
   1552 
   1553 	/*
   1554 	 * Realign cage edge with the nearest physical address
   1555 	 * boundry for big pages. This is done to give us a
   1556 	 * better chance of actually getting usable big pages
   1557 	 * in the cage.
   1558 	 */
   1559 
   1560 terminate:
   1561 
   1562 	return (did_something);
   1563 }
   1564 
   1565 /*
   1566  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
   1567  * (Replacement Page Pointer) in the global pool. Page opp will be freed
   1568  * if relocation is successful, otherwise it is only unlocked.
   1569  * On entry, page opp must be exclusively locked and not free.
   1570  * *nfreedp: number of pages freed.
   1571  */
   1572 static int
   1573 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
   1574 {
   1575 	page_t *opp = pp;
   1576 	page_t *rpp = NULL;
   1577 	spgcnt_t npgs;
   1578 	int result;
   1579 
   1580 	ASSERT(!PP_ISFREE(opp));
   1581 	ASSERT(PAGE_EXCL(opp));
   1582 
   1583 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
   1584 	*nfreedp = npgs;
   1585 	if (result == 0) {
   1586 		while (npgs-- > 0) {
   1587 			page_t *tpp;
   1588 
   1589 			ASSERT(rpp != NULL);
   1590 			tpp = rpp;
   1591 			page_sub(&rpp, tpp);
   1592 			page_unlock(tpp);
   1593 		}
   1594 
   1595 		ASSERT(rpp == NULL);
   1596 
   1597 		return (0);		/* success */
   1598 	}
   1599 
   1600 	page_unlock(opp);
   1601 	return (result);
   1602 }
   1603 
   1604 /*
   1605  * Based on page_invalidate_pages()
   1606  *
   1607  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
   1608  * of use must be updated to match the new page_relocate() when it
   1609  * becomes available.
   1610  *
   1611  * Return result of kcage_relocate_page or zero if page was directly freed.
   1612  * *nfreedp: number of pages freed.
   1613  */
   1614 static int
   1615 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
   1616 {
   1617 	int result;
   1618 
   1619 #if defined(__sparc)
   1620 	ASSERT(pp->p_vnode != &promvp);
   1621 #endif /* __sparc */
   1622 	ASSERT(!PP_ISFREE(pp));
   1623 	ASSERT(PAGE_EXCL(pp));
   1624 
   1625 	/*
   1626 	 * Is this page involved in some I/O? shared?
   1627 	 * The page_struct_lock need not be acquired to
   1628 	 * examine these fields since the page has an
   1629 	 * "exclusive" lock.
   1630 	 */
   1631 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
   1632 		result = kcage_relocate_page(pp, nfreedp);
   1633 #ifdef KCAGE_STATS
   1634 		if (result == 0)
   1635 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
   1636 		else if (result == ENOMEM)
   1637 			KCAGE_STAT_INCR_SCAN(kip_nomem);
   1638 #endif
   1639 		return (result);
   1640 	}
   1641 
   1642 	ASSERT(pp->p_vnode->v_type != VCHR);
   1643 
   1644 	/*
   1645 	 * Unload the mappings and check if mod bit is set.
   1646 	 */
   1647 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
   1648 
   1649 	if (hat_ismod(pp)) {
   1650 		result = kcage_relocate_page(pp, nfreedp);
   1651 #ifdef KCAGE_STATS
   1652 		if (result == 0)
   1653 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
   1654 		else if (result == ENOMEM)
   1655 			KCAGE_STAT_INCR_SCAN(kip_nomem);
   1656 #endif
   1657 		return (result);
   1658 	}
   1659 
   1660 	if (!page_try_demote_pages(pp)) {
   1661 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
   1662 		page_unlock(pp);
   1663 		return (EAGAIN);
   1664 	}
   1665 
   1666 	/* LINTED: constant in conditional context */
   1667 	VN_DISPOSE(pp, B_INVAL, 0, kcred);
   1668 	KCAGE_STAT_INCR_SCAN(kip_destroy);
   1669 	*nfreedp = 1;
   1670 	return (0);
   1671 }
   1672 
/*
 * Body of the kernel cage scanner ("cageout") thread.  Never returns.
 *
 * After registering for CPR and publishing itself through
 * kcage_cageout_thread / kcage_cageout_ready, the thread sleeps on
 * kcage_cageout_cv.  On each wakeup it walks the cage with
 * kcage_walk_cage() for as long as cage memory is short
 * (kcage_freemem < kcage_lotsfree, or kcage_needfree is set):
 *   - pages without P_NORELOC are offered to kcage_assimilate_page();
 *   - P_NORELOC pages that are in use are freed with criteria that get
 *     more aggressive as the pass number rises: referenced pages are
 *     skipped through pass 1, only small unmodified unlocked pages are
 *     disposed of through pass 2, and from pass 3 on
 *     kcage_invalidate_page() is used.
 * When the walk stops it either rescans, expands the cage via
 * kcage_expand(), or goes back to sleep.
 *
 * kcage_cageout_mutex is acquired once at the top and is never released
 * explicitly by this function; it is the mutex passed to cv_wait().
 */
static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;
	int scan_again;
	pfn_t start_pfn;
	int pass;
	int last_pass;
	int pages_skipped;
	int shared_skipped;
	ulong_t shared_level = 8;	/* threshold for hat_page_checkshare */
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);
	kcage_cageout_thread = curthread;

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
	/* A fresh wakeup starts over at pass 0. */
	pass = 0;
	last_pass = 0;

#ifdef KCAGE_STATS
	scan_start = ddi_get_lbolt();
#endif

again:
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	did_something = 0;
	pages_skipped = 0;
	shared_skipped = 0;
	/*
	 * kcage_walk_cage() is asked to restart its walk whenever the
	 * previous pfn was PFN_INVALID.
	 */
	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			/* Back at the first pfn: one full walk completed. */
			last_pass = pass;
			pass += 1;
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is active then
			 * stop kernel cage walk and try expanding.
			 */
			if (cp_default.cp_ncpus == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged-pages. These pages can exist in the cage
		 * because, if during cage expansion, a page is
		 * encountered that is long-term locked the lock prevents the
		 * expansion logic from setting the P_NORELOC flag. Hence,
		 * non-caged-pages surrounded by caged-pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			switch (kcage_assimilate_page(pp, &nfreed)) {
				case 0:
					did_something = 1;
					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
					    nfreed);
					break;

				case EBUSY:
				case ERANGE:
					did_something = 1;
					KCAGE_STAT_INCR_SCAN(kt_gotone);
					break;

				case EAGAIN:
				case ENOMEM:
					break;

				default:
					/* catch this with debug kernels */
					ASSERT(0);
					break;
			}

			continue;
		} else {
			int prm;

			if (PP_ISFREE(pp)) {
				continue;
			}

			/* Leave locked-down kernel pages alone. */
			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			/* Re-check under the lock what was tested without it. */
			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
			if (hat_page_checkshare(pp, shared_level)) {
				page_unlock(pp);
				pages_skipped = 1;
				shared_skipped = 1;
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			/*
			 * In pass {0, 1}, skip page if ref bit is set.
			 * In pass {0, 1, 2}, skip page if mod bit is set.
			 */
			prm = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

			/* In passes 0 and 1, ignore ref'd pages */
			if (pass <= 1 && (prm & P_REF)) {
				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
				pages_skipped = 1;
				page_unlock(pp);
				continue;
			}

			/*
			 * Through pass 2, VN_DISPOSE only small pages whose
			 * mod bit is not set and that are not locked down or
			 * COW-referenced; skip everything else.
			 */
			if (pass <= 2) {
				if (pp->p_szc != 0 || (prm & P_MOD) ||
				    pp->p_lckcnt || pp->p_cowcnt) {
					pages_skipped = 1;
					page_unlock(pp);
				} else {

					/*
					 * unload the mappings before
					 * checking if mod bit is set
					 */
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);

					/*
					 * skip this page if modified
					 */
					if (hat_ismod(pp)) {
						pages_skipped = 1;
						page_unlock(pp);
						continue;
					}

					KCAGE_STAT_INCR_SCAN(kt_destroy);
					/* constant in conditional context */
					/* LINTED */
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
					did_something = 1;
				}
				continue;
			}

			/* Pass 3 and later: relocate or dispose as needed. */
			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	/*
	 * Expand the cage only if available cage memory is really low.
	 * This test is done only after a complete scan of the cage.
	 * The reason for not checking and expanding more often is to
	 * avoid rapid expansion of the cage. Naturally, scanning the
	 * cage takes time. So by scanning first, we use that work as a
	 * delay loop in between expand decisions.
	 */

	scan_again = 0;
	if (kcage_freemem < kcage_minfree || kcage_needfree) {
		/*
		 * Kcage_expand() will return a non-zero value if it was
		 * able to expand the cage -- whether or not the new
		 * pages are free and immediately usable. If non-zero,
		 * we do another scan of the cage. The pages might be
		 * freed during that scan or by time we get back here.
		 * If not, we will attempt another expansion.
		 * However, if kcage_expand() returns zero, then it was
		 * unable to expand the cage. This is the case when the
		 * growth list is exhausted, therefore no work was done
		 * and there is no reason to scan the cage again.
		 * Note: Kernel cage scan is not repeated when only one
		 * cpu is active to avoid kernel cage thread hogging cpu.
		 */
		if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1)
			scan_again = 1;
		else
			(void) kcage_expand(); /* don't scan again */
	} else if (kcage_freemem < kcage_lotsfree) {
		/*
		 * If available cage memory is less than abundant
		 * and a full scan of the cage has not yet been completed,
		 * or a scan has completed and some work was performed,
		 * or pages were skipped because of sharing,
		 * or we simply have not yet completed two passes,
		 * then do another scan.
		 */
		if (pass <= 2 && pages_skipped)
			scan_again = 1;
		if (pass == last_pass || did_something)
			scan_again = 1;
		else if (shared_skipped && shared_level < (8<<24)) {
			/* Tolerate more sharing on the next scan. */
			shared_level <<= 1;
			scan_again = 1;
		}
	}

	if (scan_again && cp_default.cp_ncpus > 1)
		goto again;
	else {
		/* Ease the sharing threshold back down, but not below 8. */
		if (shared_level > 8)
			shared_level >>= 1;

		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
		KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
		KCAGE_STAT_INC_SCAN_INDEX;
		goto loop;
	}

	/*NOTREACHED*/
}
   1949 
   1950 void
   1951 kcage_cageout_wakeup()
   1952 {
   1953 	if (mutex_tryenter(&kcage_cageout_mutex)) {
   1954 		if (kcage_cageout_ready) {
   1955 			cv_signal(&kcage_cageout_cv);
   1956 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
   1957 			/*
   1958 			 * Available cage memory is really low. Time to
   1959 			 * start expanding the cage. However, the
   1960 			 * kernel cage thread is not yet ready to
   1961 			 * do the work. Use *this* thread, which is
   1962 			 * most likely to be t0, to do the work.
   1963 			 */
   1964 			KCAGE_STAT_INCR(kcw_expandearly);
   1965 			(void) kcage_expand();
   1966 			KCAGE_STAT_INC_SCAN_INDEX;
   1967 		}
   1968 
   1969 		mutex_exit(&kcage_cageout_mutex);
   1970 	}
   1971 	/* else, kernel cage thread is already running */
   1972 }
   1973 
   1974 void
   1975 kcage_tick()
   1976 {
   1977 	/*
   1978 	 * Once per second we wake up all the threads throttled
   1979 	 * waiting for cage memory, in case we've become stuck
   1980 	 * and haven't made forward progress expanding the cage.
   1981 	 */
   1982 	if (kcage_on && kcage_cageout_ready)
   1983 		cv_broadcast(&kcage_throttle_cv);
   1984 }
   1985