Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 /*
     40  * VM - anonymous pages.
     41  *
     42  * This layer sits immediately above the vm_swap layer.  It manages
     43  * physical pages that have no permanent identity in the file system
     44  * name space, using the services of the vm_swap layer to allocate
     45  * backing storage for these pages.  Since these pages have no external
     46  * identity, they are discarded when the last reference is removed.
     47  *
     48  * An important function of this layer is to manage low-level sharing
     49  * of pages that are logically distinct but that happen to be
     50  * physically identical (e.g., the corresponding pages of the processes
     51  * resulting from a fork before one process or the other changes their
     52  * contents).  This pseudo-sharing is present only as an optimization
     53  * and is not to be confused with true sharing in which multiple
     54  * address spaces deliberately contain references to the same object;
     55  * such sharing is managed at a higher level.
     56  *
     57  * The key data structure here is the anon struct, which contains a
     58  * reference count for its associated physical page and a hint about
     59  * the identity of that page.  Anon structs typically live in arrays,
     60  * with an instance's position in its array determining where the
     61  * corresponding backing storage is allocated; however, the swap_xlate()
     62  * routine abstracts away this representation information so that the
     63  * rest of the anon layer need not know it.  (See the swap layer for
     64  * more details on anon struct layout.)
     65  *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, where we can make
 * smarter allocation decisions to improve anonymous klustering.
     73  *
     74  * Many of the routines defined here take a (struct anon **) argument,
     75  * which allows the code at this level to manage anon pages directly,
     76  * so that callers can regard anon structs as opaque objects and not be
     77  * concerned with assigning or inspecting their contents.
     78  *
     79  * Clients of this layer refer to anon pages indirectly.  That is, they
     80  * maintain arrays of pointers to anon structs rather than maintaining
     81  * anon structs themselves.  The (struct anon **) arguments mentioned
     82  * above are pointers to entries in these arrays.  It is these arrays
     83  * that capture the mapping between offsets within a given segment and
     84  * the corresponding anonymous backing storage address.
     85  */
     86 
     87 #ifdef DEBUG
     88 #define	ANON_DEBUG
     89 #endif
     90 
     91 #include <sys/types.h>
     92 #include <sys/t_lock.h>
     93 #include <sys/param.h>
     94 #include <sys/systm.h>
     95 #include <sys/mman.h>
     96 #include <sys/cred.h>
     97 #include <sys/thread.h>
     98 #include <sys/vnode.h>
     99 #include <sys/cpuvar.h>
    100 #include <sys/swap.h>
    101 #include <sys/cmn_err.h>
    102 #include <sys/vtrace.h>
    103 #include <sys/kmem.h>
    104 #include <sys/sysmacros.h>
    105 #include <sys/bitmap.h>
    106 #include <sys/vmsystm.h>
    107 #include <sys/tuneable.h>
    108 #include <sys/debug.h>
    109 #include <sys/fs/swapnode.h>
    110 #include <sys/tnf_probe.h>
    111 #include <sys/lgrp.h>
    112 #include <sys/policy.h>
    113 #include <sys/condvar_impl.h>
    114 #include <sys/mutex_impl.h>
    115 #include <sys/rctl.h>
    116 
    117 #include <vm/as.h>
    118 #include <vm/hat.h>
    119 #include <vm/anon.h>
    120 #include <vm/page.h>
    121 #include <vm/vpage.h>
    122 #include <vm/seg.h>
    123 #include <vm/rm.h>
    124 
    125 #include <fs/fs_subr.h>
    126 
/*
 * Vnode set up by anon_init() with the swap vnode ops, type VREG and
 * the VISSWAP|VISSWAPFS flags.
 */
struct vnode *anon_vp;

/* NOTE(review): presumably selects ANON_PRINT output; not used here. */
int anon_debug;

/*
 * anoninfo_lock is held by anon_resvmem()/anon_unresvmem() while the
 * reservation counters in k_anoninfo are examined and updated.
 * ani_free_pool[] holds the per-pool counts that set_anoninfo() sums
 * into k_anoninfo.ani_free.  anon_array_lock[]/anon_array_cv[] are
 * initialized by anon_init().
 */
kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	ani_free_pool[ANI_MAX_POOL];
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/* Defined elsewhere; anon_init() sets it to 1MB worth of pages. */
extern	int swap_maxcontig;
/*
 * Global hash table for (vp, off) -> anon slot.  anon_hash_size is the
 * number of buckets; anon_init() picks a power of two based on physmem.
 */
size_t	anon_hash_size;
struct anon **anon_hash;

/* kmem caches for struct anon and struct anon_map (see anon_init()). */
static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;
    146 
#ifdef VM_STATS
/*
 * Statistics counters, present only when VM_STATS is defined.  Each
 * array is named after the routine family it belongs to; the code that
 * increments the individual elements is not in this part of the file.
 */
static struct anonvmstats_str {
	ulong_t getpages[30];
	ulong_t privatepages[10];
	ulong_t demotepages[9];
	ulong_t decrefpages[9];
	ulong_t	dupfillholes[4];
	ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */
    157 
    158 /*ARGSUSED*/
    159 static int
    160 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
    161 {
    162 	struct anon_map *amp = buf;
    163 
    164 	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
    165 	cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
    166 	mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
    167 	mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
    168 	return (0);
    169 }
    170 
    171 /*ARGSUSED1*/
    172 static void
    173 anonmap_cache_destructor(void *buf, void *cdrarg)
    174 {
    175 	struct anon_map *amp = buf;
    176 
    177 	rw_destroy(&amp->a_rwlock);
    178 	cv_destroy(&amp->a_purgecv);
    179 	mutex_destroy(&amp->a_pmtx);
    180 	mutex_destroy(&amp->a_purgemtx);
    181 }
    182 
/*
 * anonhash_lock[] protects the global anon hash chains; the entry for a
 * slot is selected with AH_LOCK(an_vp, an_off) and must be held around
 * anon_addhash()/anon_rmhash().  anonpages_hash_lock[] is initialized
 * alongside it in anon_init().
 */
kmutex_t	anonhash_lock[AH_LOCK_SIZE];
kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];
    185 
    186 void
    187 anon_init(void)
    188 {
    189 	int i;
    190 
    191 	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);
    192 
    193 	for (i = 0; i < AH_LOCK_SIZE; i++) {
    194 		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
    195 		mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
    196 	}
    197 
    198 	for (i = 0; i < ANON_LOCKSIZE; i++) {
    199 		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
    200 		    MUTEX_DEFAULT, NULL);
    201 		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
    202 	}
    203 
    204 	anon_hash = (struct anon **)
    205 	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
    206 	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
    207 	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
    208 	anonmap_cache = kmem_cache_create("anonmap_cache",
    209 	    sizeof (struct anon_map), 0,
    210 	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
    211 	    NULL, NULL, 0);
    212 	swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */
    213 
    214 	anon_vp = vn_alloc(KM_SLEEP);
    215 	vn_setops(anon_vp, swap_vnodeops);
    216 	anon_vp->v_type = VREG;
    217 	anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
    218 }
    219 
    220 /*
    221  * Global anon slot hash table manipulation.
    222  */
    223 
    224 static void
    225 anon_addhash(struct anon *ap)
    226 {
    227 	int index;
    228 
    229 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
    230 	index = ANON_HASH(ap->an_vp, ap->an_off);
    231 	ap->an_hash = anon_hash[index];
    232 	anon_hash[index] = ap;
    233 }
    234 
    235 static void
    236 anon_rmhash(struct anon *ap)
    237 {
    238 	struct anon **app;
    239 
    240 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
    241 
    242 	for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
    243 	    *app; app = &((*app)->an_hash)) {
    244 		if (*app == ap) {
    245 			*app = ap->an_hash;
    246 			break;
    247 		}
    248 	}
    249 }
    250 
    251 /*
    252  * The anon array interfaces. Functions allocating,
    253  * freeing array of pointers, and returning/setting
    254  * entries in the array of pointers for a given offset.
    255  *
    256  * Create the list of pointers
    257  */
    258 struct anon_hdr *
    259 anon_create(pgcnt_t npages, int flags)
    260 {
    261 	struct anon_hdr *ahp;
    262 	ulong_t nchunks;
    263 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    264 
    265 	if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
    266 		return (NULL);
    267 	}
    268 
    269 	mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
    270 	/*
    271 	 * Single level case.
    272 	 */
    273 	ahp->size = npages;
    274 	if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
    275 
    276 		if (flags & ANON_ALLOC_FORCE)
    277 			ahp->flags |= ANON_ALLOC_FORCE;
    278 
    279 		ahp->array_chunk = kmem_zalloc(
    280 		    ahp->size * sizeof (struct anon *), kmemflags);
    281 
    282 		if (ahp->array_chunk == NULL) {
    283 			kmem_free(ahp, sizeof (struct anon_hdr));
    284 			return (NULL);
    285 		}
    286 	} else {
    287 		/*
    288 		 * 2 Level case.
    289 		 * anon hdr size needs to be rounded off  to be a multiple
    290 		 * of ANON_CHUNK_SIZE. This is important as various anon
    291 		 * related functions depend on this.
    292 		 * NOTE -
    293 		 * anon_grow()  makes anon hdr size a multiple of
    294 		 * ANON_CHUNK_SIZE.
    295 		 * amp size is <= anon hdr size.
    296 		 * anon_index + seg_pgs <= anon hdr size.
    297 		 */
    298 		ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
    299 		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
    300 
    301 		ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
    302 		    kmemflags);
    303 
    304 		if (ahp->array_chunk == NULL) {
    305 			kmem_free(ahp, sizeof (struct anon_hdr));
    306 			return (NULL);
    307 		}
    308 	}
    309 	return (ahp);
    310 }
    311 
    312 /*
    313  * Free the array of pointers
    314  */
    315 void
    316 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
    317 {
    318 	ulong_t i;
    319 	void **ppp;
    320 	ulong_t nchunks;
    321 
    322 	ASSERT(npages <= ahp->size);
    323 
    324 	/*
    325 	 * Single level case.
    326 	 */
    327 	if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    328 		kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
    329 	} else {
    330 		/*
    331 		 * 2 level case.
    332 		 */
    333 		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
    334 		for (i = 0; i < nchunks; i++) {
    335 			ppp = &ahp->array_chunk[i];
    336 			if (*ppp != NULL)
    337 				kmem_free(*ppp, PAGESIZE);
    338 		}
    339 		kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
    340 	}
    341 	mutex_destroy(&ahp->serial_lock);
    342 	kmem_free(ahp, sizeof (struct anon_hdr));
    343 }
    344 
    345 /*
    346  * Return the pointer from the list for a
    347  * specified anon index.
    348  */
    349 struct anon *
    350 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
    351 {
    352 	struct anon **app;
    353 
    354 	ASSERT(an_idx < ahp->size);
    355 
    356 	/*
    357 	 * Single level case.
    358 	 */
    359 	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
    360 		return ((struct anon *)
    361 		    ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
    362 	} else {
    363 
    364 		/*
    365 		 * 2 level case.
    366 		 */
    367 		app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    368 		if (app) {
    369 			return ((struct anon *)
    370 			    ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
    371 			    ANON_PTRMASK));
    372 		} else {
    373 			return (NULL);
    374 		}
    375 	}
    376 }
    377 
    378 /*
    379  * Return the anon pointer for the first valid entry in the anon list,
    380  * starting from the given index.
    381  */
    382 struct anon *
    383 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
    384 {
    385 	struct anon *ap;
    386 	struct anon **app;
    387 	ulong_t chunkoff;
    388 	ulong_t i;
    389 	ulong_t j;
    390 	pgcnt_t size;
    391 
    392 	i = *index;
    393 	size = ahp->size;
    394 
    395 	ASSERT(i < size);
    396 
    397 	if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
    398 		/*
    399 		 * 1 level case
    400 		 */
    401 		while (i < size) {
    402 			ap = (struct anon *)
    403 			    ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
    404 			if (ap) {
    405 				*index = i;
    406 				return (ap);
    407 			}
    408 			i++;
    409 		}
    410 	} else {
    411 		/*
    412 		 * 2 level case
    413 		 */
    414 		chunkoff = i & ANON_CHUNK_OFF;
    415 		while (i < size) {
    416 			app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
    417 			if (app)
    418 				for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
    419 					ap = (struct anon *)
    420 					    ((uintptr_t)app[j] & ANON_PTRMASK);
    421 					if (ap) {
    422 						*index = i + (j - chunkoff);
    423 						return (ap);
    424 					}
    425 				}
    426 			chunkoff = 0;
    427 			i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
    428 		}
    429 	}
    430 	*index = size;
    431 	return (NULL);
    432 }
    433 
    434 /*
    435  * Set list entry with a given pointer for a specified offset
    436  */
    437 int
    438 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
    439 {
    440 	void		**ppp;
    441 	struct anon	**app;
    442 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    443 	uintptr_t	*ap_addr;
    444 
    445 	ASSERT(an_idx < ahp->size);
    446 
    447 	/*
    448 	 * Single level case.
    449 	 */
    450 	if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    451 		ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
    452 	} else {
    453 
    454 		/*
    455 		 * 2 level case.
    456 		 */
    457 		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    458 
    459 		ASSERT(ppp != NULL);
    460 		if (*ppp == NULL) {
    461 			mutex_enter(&ahp->serial_lock);
    462 			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    463 			if (*ppp == NULL) {
    464 				*ppp = kmem_zalloc(PAGESIZE, kmemflags);
    465 				if (*ppp == NULL) {
    466 					mutex_exit(&ahp->serial_lock);
    467 					return (ENOMEM);
    468 				}
    469 			}
    470 			mutex_exit(&ahp->serial_lock);
    471 		}
    472 		app = *ppp;
    473 		ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
    474 	}
    475 	*ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
    476 	return (0);
    477 }
    478 
    479 /*
    480  * Copy anon array into a given new anon array
    481  */
    482 int
    483 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
    484 	struct anon_hdr *dahp, ulong_t d_idx,
    485 	pgcnt_t npages, int flags)
    486 {
    487 	void **sapp, **dapp;
    488 	void *ap;
    489 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    490 
    491 	ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
    492 	ASSERT((npages <= sahp->size) && (npages <= dahp->size));
    493 
    494 	/*
    495 	 * Both arrays are 1 level.
    496 	 */
    497 	if (((sahp->size <= ANON_CHUNK_SIZE) &&
    498 	    (dahp->size <= ANON_CHUNK_SIZE)) ||
    499 	    ((sahp->flags & ANON_ALLOC_FORCE) &&
    500 	    (dahp->flags & ANON_ALLOC_FORCE))) {
    501 
    502 		bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
    503 		    npages * sizeof (struct anon *));
    504 		return (0);
    505 	}
    506 
    507 	/*
    508 	 * Both arrays are 2 levels.
    509 	 */
    510 	if (sahp->size > ANON_CHUNK_SIZE &&
    511 	    dahp->size > ANON_CHUNK_SIZE &&
    512 	    ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
    513 	    ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
    514 
    515 		ulong_t sapidx, dapidx;
    516 		ulong_t *sap, *dap;
    517 		ulong_t chknp;
    518 
    519 		while (npages != 0) {
    520 
    521 			sapidx = s_idx & ANON_CHUNK_OFF;
    522 			dapidx = d_idx & ANON_CHUNK_OFF;
    523 			chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
    524 			if (chknp > npages)
    525 				chknp = npages;
    526 
    527 			sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
    528 			if ((sap = *sapp) != NULL) {
    529 				dapp = &dahp->array_chunk[d_idx
    530 				    >> ANON_CHUNK_SHIFT];
    531 				if ((dap = *dapp) == NULL) {
    532 					*dapp = kmem_zalloc(PAGESIZE,
    533 					    kmemflags);
    534 					if ((dap = *dapp) == NULL)
    535 						return (ENOMEM);
    536 				}
    537 				bcopy((sap + sapidx), (dap + dapidx),
    538 				    chknp << ANON_PTRSHIFT);
    539 			}
    540 			s_idx += chknp;
    541 			d_idx += chknp;
    542 			npages -= chknp;
    543 		}
    544 		return (0);
    545 	}
    546 
    547 	/*
    548 	 * At least one of the arrays is 2 level.
    549 	 */
    550 	while (npages--) {
    551 		if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
    552 			ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
    553 			if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
    554 					return (ENOMEM);
    555 		}
    556 		s_idx++;
    557 		d_idx++;
    558 	}
    559 	return (0);
    560 }
    561 
    562 
    563 /*
    564  * ANON_INITBUF is a convenience macro for anon_grow() below. It
    565  * takes a buffer dst, which is at least as large as buffer src. It
    566  * does a bcopy from src into dst, and then bzeros the extra bytes
    567  * of dst. If tail is set, the data in src is tail aligned within
    568  * dst instead of head aligned.
    569  */
    570 
    571 #define	ANON_INITBUF(src, srclen, dst, dstsize, tail)			      \
    572 	if (tail) {							      \
    573 		bzero((dst), (dstsize) - (srclen));			      \
    574 		bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
    575 	} else {							      \
    576 		bcopy((src), (dst), (srclen));				      \
    577 		bzero((char *)(dst) + (srclen), (dstsize) - (srclen));	      \
    578 	}
    579 
    580 #define	ANON_1_LEVEL_INC	(ANON_CHUNK_SIZE / 8)
    581 #define	ANON_2_LEVEL_INC	(ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
    582 
    583 /*
    584  * anon_grow() is used to efficiently extend an existing anon array.
    585  * startidx_p points to the index into the anon array of the first page
    586  * that is in use. oldseg_pgs is the number of pages in use, starting at
    587  * *startidx_p. newpages is the number of additional pages desired.
    588  *
    589  * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
    590  *
    591  * The growth is done by creating a new top level of the anon array,
    592  * and (if the array is 2-level) reusing the existing second level arrays.
    593  *
    594  * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
    595  *
    596  * Returns the new number of pages in the anon array.
    597  */
    598 pgcnt_t
    599 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
    600     pgcnt_t newseg_pgs, int flags)
    601 {
    602 	ulong_t startidx = startidx_p ? *startidx_p : 0;
    603 	pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
    604 	pgcnt_t oelems, nelems, totpages;
    605 	void **level1;
    606 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    607 	int growdown = (flags & ANON_GROWDOWN);
    608 	size_t newarrsz, oldarrsz;
    609 	void *level2;
    610 
    611 	ASSERT(!(startidx_p == NULL && growdown));
    612 	ASSERT(startidx + oldseg_pgs <= ahp->size);
    613 
    614 	/*
    615 	 * Determine the total number of pages needed in the new
    616 	 * anon array. If growing down, totpages is all pages from
    617 	 * startidx through the end of the array, plus <newseg_pgs>
    618 	 * pages. If growing up, keep all pages from page 0 through
    619 	 * the last page currently in use, plus <newseg_pgs> pages.
    620 	 */
    621 	if (growdown)
    622 		totpages = oldamp_pgs - startidx + newseg_pgs;
    623 	else
    624 		totpages = startidx + oldseg_pgs + newseg_pgs;
    625 
    626 	/* If the array is already large enough, just return. */
    627 
    628 	if (oldamp_pgs >= totpages) {
    629 		if (growdown)
    630 			*startidx_p = oldamp_pgs - totpages;
    631 		return (oldamp_pgs);
    632 	}
    633 
    634 	/*
    635 	 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
    636 	 * by the corresponding arrays.
    637 	 * oelems/nelems are the number of pointers in the top level arrays
    638 	 * which may be either level 1 or level 2.
    639 	 * Will the new anon array be one level or two levels?
    640 	 */
    641 	if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    642 		newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
    643 		oelems = oldamp_pgs;
    644 		nelems = newamp_pgs;
    645 	} else {
    646 		newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
    647 		oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
    648 		nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
    649 	}
    650 
    651 	newarrsz = nelems * sizeof (void *);
    652 	level1 = kmem_alloc(newarrsz, kmemflags);
    653 	if (level1 == NULL)
    654 		return (0);
    655 
    656 	/* Are we converting from a one level to a two level anon array? */
    657 
    658 	if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
    659 	    !(ahp->flags & ANON_ALLOC_FORCE)) {
    660 
    661 		/*
    662 		 * Yes, we're converting to a two level. Reuse old level 1
    663 		 * as new level 2 if it is exactly PAGESIZE. Otherwise
    664 		 * alloc a new level 2 and copy the old level 1 data into it.
    665 		 */
    666 		if (oldamp_pgs == ANON_CHUNK_SIZE) {
    667 			level2 = (void *)ahp->array_chunk;
    668 		} else {
    669 			level2 = kmem_alloc(PAGESIZE, kmemflags);
    670 			if (level2 == NULL) {
    671 				kmem_free(level1, newarrsz);
    672 				return (0);
    673 			}
    674 			oldarrsz = oldamp_pgs * sizeof (void *);
    675 
    676 			ANON_INITBUF(ahp->array_chunk, oldarrsz,
    677 			    level2, PAGESIZE, growdown);
    678 			kmem_free(ahp->array_chunk, oldarrsz);
    679 		}
    680 		bzero(level1, newarrsz);
    681 		if (growdown)
    682 			level1[nelems - 1] = level2;
    683 		else
    684 			level1[0] = level2;
    685 	} else {
    686 		oldarrsz = oelems * sizeof (void *);
    687 
    688 		ANON_INITBUF(ahp->array_chunk, oldarrsz,
    689 		    level1, newarrsz, growdown);
    690 		kmem_free(ahp->array_chunk, oldarrsz);
    691 	}
    692 
    693 	ahp->array_chunk = level1;
    694 	ahp->size = newamp_pgs;
    695 	if (growdown)
    696 		*startidx_p = newamp_pgs - totpages;
    697 
    698 	return (newamp_pgs);
    699 }
    700 
    701 
    702 /*
    703  * Called from clock handler to sync ani_free value.
    704  */
    705 
    706 void
    707 set_anoninfo(void)
    708 {
    709 	int	ix;
    710 	pgcnt_t	total = 0;
    711 
    712 	for (ix = 0; ix < ANI_MAX_POOL; ix++) {
    713 		total += ani_free_pool[ix].ani_count;
    714 	}
    715 	k_anoninfo.ani_free = total;
    716 }
    717 
    718 /*
    719  * Reserve anon space.
    720  *
    721  * It's no longer simply a matter of incrementing ani_resv to
    722  * reserve swap space, we need to check memory-based as well
    723  * as disk-backed (physical) swap.  The following algorithm
    724  * is used:
    725  * 	Check the space on physical swap
    726  * 		i.e. amount needed < ani_max - ani_phys_resv
    727  * 	If we are swapping on swapfs check
    728  *		amount needed < (availrmem - swapfs_minfree)
    729  * Since the algorithm to check for the quantity of swap space is
    730  * almost the same as that for reserving it, we'll just use anon_resvmem
    731  * with a flag to decrement availrmem.
    732  *
    733  * Return non-zero on success.
    734  */
    735 int
    736 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
    737 {
    738 	pgcnt_t npages = btopr(size);
    739 	pgcnt_t mswap_pages = 0;
    740 	pgcnt_t pswap_pages = 0;
    741 	proc_t *p = curproc;
    742 
    743 	if (zone != NULL && takemem) {
    744 		/* test zone.max-swap resource control */
    745 		mutex_enter(&p->p_lock);
    746 		if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
    747 			mutex_exit(&p->p_lock);
    748 			return (0);
    749 		}
    750 		mutex_exit(&p->p_lock);
    751 	}
    752 	mutex_enter(&anoninfo_lock);
    753 
    754 	/*
    755 	 * pswap_pages is the number of pages we can take from
    756 	 * physical (i.e. disk-backed) swap.
    757 	 */
    758 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    759 	pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
    760 
    761 	ANON_PRINT(A_RESV,
    762 	    ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
    763 	    npages, takemem, pswap_pages, (void *)caller()));
    764 
    765 	if (npages <= pswap_pages) {
    766 		/*
    767 		 * we have enough space on a physical swap
    768 		 */
    769 		if (takemem)
    770 			k_anoninfo.ani_phys_resv += npages;
    771 		mutex_exit(&anoninfo_lock);
    772 		return (1);
    773 	} else if (pswap_pages != 0) {
    774 		/*
    775 		 * we have some space on a physical swap
    776 		 */
    777 		if (takemem) {
    778 			/*
    779 			 * use up remainder of phys swap
    780 			 */
    781 			k_anoninfo.ani_phys_resv += pswap_pages;
    782 			ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
    783 		}
    784 	}
    785 	/*
    786 	 * since (npages > pswap_pages) we need mem swap
    787 	 * mswap_pages is the number of pages needed from availrmem
    788 	 */
    789 	ASSERT(npages > pswap_pages);
    790 	mswap_pages = npages - pswap_pages;
    791 
    792 	ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
    793 	    mswap_pages));
    794 
    795 	/*
    796 	 * priv processes can reserve memory as swap as long as availrmem
    797 	 * remains greater than swapfs_minfree; in the case of non-priv
    798 	 * processes, memory can be reserved as swap only if availrmem
    799 	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
    800 	 * swapfs_reserve amount of memswap is not available to non-priv
    801 	 * processes. This protects daemons such as automounter dying
    802 	 * as a result of application processes eating away almost entire
    803 	 * membased swap. This safeguard becomes useless if apps are run
    804 	 * with root access.
    805 	 *
    806 	 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
    807 	 *
    808 	 */
    809 	if (tryhard) {
    810 		pgcnt_t floor_pages;
    811 
    812 		if (secpolicy_resource_anon_mem(CRED())) {
    813 			floor_pages = swapfs_minfree;
    814 		} else {
    815 			floor_pages = swapfs_minfree + swapfs_reserve;
    816 		}
    817 
    818 		mutex_exit(&anoninfo_lock);
    819 		(void) page_reclaim_mem(mswap_pages, floor_pages, 0);
    820 		mutex_enter(&anoninfo_lock);
    821 	}
    822 
    823 	mutex_enter(&freemem_lock);
    824 	if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
    825 	    (availrmem > (swapfs_minfree + mswap_pages) &&
    826 	    secpolicy_resource(CRED()) == 0)) {
    827 
    828 		if (takemem) {
    829 			/*
    830 			 * Take the memory from the rest of the system.
    831 			 */
    832 			availrmem -= mswap_pages;
    833 			mutex_exit(&freemem_lock);
    834 			k_anoninfo.ani_mem_resv += mswap_pages;
    835 			ANI_ADD(mswap_pages);
    836 			ANON_PRINT((A_RESV | A_MRESV),
    837 			    ("anon_resvmem: took %ld pages of availrmem\n",
    838 			    mswap_pages));
    839 		} else {
    840 			mutex_exit(&freemem_lock);
    841 		}
    842 
    843 		ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    844 		mutex_exit(&anoninfo_lock);
    845 		return (1);
    846 	} else {
    847 		/*
    848 		 * Fail if not enough memory
    849 		 */
    850 		if (takemem) {
    851 			k_anoninfo.ani_phys_resv -= pswap_pages;
    852 		}
    853 
    854 		mutex_exit(&freemem_lock);
    855 		mutex_exit(&anoninfo_lock);
    856 		ANON_PRINT(A_RESV,
    857 		    ("anon_resvmem: not enough space from swapfs\n"));
    858 		if (zone != NULL && takemem)
    859 			rctl_decr_swap(zone, ptob(npages));
    860 		return (0);
    861 	}
    862 }
    863 
    864 /*
    865  * Give back an anon reservation.
    866  */
    867 void
    868 anon_unresvmem(size_t size, zone_t *zone)
    869 {
    870 	pgcnt_t npages = btopr(size);
    871 	spgcnt_t mem_free_pages = 0;
    872 	pgcnt_t phys_free_slots;
    873 #ifdef	ANON_DEBUG
    874 	pgcnt_t mem_resv;
    875 #endif
    876 	if (zone != NULL)
    877 		rctl_decr_swap(zone, ptob(npages));
    878 
    879 	mutex_enter(&anoninfo_lock);
    880 
    881 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
    882 
    883 	/*
    884 	 * If some of this reservation belonged to swapfs
    885 	 * give it back to availrmem.
    886 	 * ani_mem_resv is the amount of availrmem swapfs has reserved.
    887 	 * but some of that memory could be locked by segspt so we can only
    888 	 * return non locked ani_mem_resv back to availrmem
    889 	 */
    890 	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
    891 		ANON_PRINT((A_RESV | A_MRESV),
    892 		    ("anon_unresv: growing availrmem by %ld pages\n",
    893 		    MIN(k_anoninfo.ani_mem_resv, npages)));
    894 
    895 		mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
    896 		    k_anoninfo.ani_locked_swap), npages);
    897 		mutex_enter(&freemem_lock);
    898 		availrmem += mem_free_pages;
    899 		mutex_exit(&freemem_lock);
    900 		k_anoninfo.ani_mem_resv -= mem_free_pages;
    901 
    902 		ANI_ADD(-mem_free_pages);
    903 	}
    904 	/*
    905 	 * The remainder of the pages is returned to phys swap
    906 	 */
    907 	ASSERT(npages >= mem_free_pages);
    908 	phys_free_slots = npages - mem_free_pages;
    909 
    910 	if (phys_free_slots) {
    911 		k_anoninfo.ani_phys_resv -= phys_free_slots;
    912 	}
    913 
    914 #ifdef	ANON_DEBUG
    915 	mem_resv = k_anoninfo.ani_mem_resv;
    916 #endif
    917 
    918 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
    919 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    920 
    921 	mutex_exit(&anoninfo_lock);
    922 
    923 	ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
    924 	    npages, mem_resv, (void *)caller()));
    925 }
    926 
    927 /*
    928  * Allocate an anon slot and return it with the lock held.
    929  */
    930 struct anon *
    931 anon_alloc(struct vnode *vp, anoff_t off)
    932 {
    933 	struct anon	*ap;
    934 	kmutex_t	*ahm;
    935 
    936 	ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
    937 	if (vp == NULL) {
    938 		swap_alloc(ap);
    939 	} else {
    940 		ap->an_vp = vp;
    941 		ap->an_off = off;
    942 	}
    943 	ap->an_refcnt = 1;
    944 	ap->an_pvp = NULL;
    945 	ap->an_poff = 0;
    946 	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    947 	mutex_enter(ahm);
    948 	anon_addhash(ap);
    949 	mutex_exit(ahm);
    950 	ANI_ADD(-1);
    951 	ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
    952 	    (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
    953 	return (ap);
    954 }
    955 
    956 /*
    957  * Called for pages locked in memory via softlock/pagelock/mlock to make sure
    958  * such pages don't consume any physical swap resources needed for swapping
    959  * unlocked pages.
    960  */
    961 void
    962 anon_swap_free(struct anon *ap, page_t *pp)
    963 {
    964 	kmutex_t *ahm;
    965 
    966 	ASSERT(ap != NULL);
    967 	ASSERT(pp != NULL);
    968 	ASSERT(PAGE_LOCKED(pp));
    969 	ASSERT(pp->p_vnode != NULL);
    970 	ASSERT(IS_SWAPFSVP(pp->p_vnode));
    971 	ASSERT(ap->an_refcnt != 0);
    972 	ASSERT(pp->p_vnode == ap->an_vp);
    973 	ASSERT(pp->p_offset == ap->an_off);
    974 
    975 	if (ap->an_pvp == NULL)
    976 		return;
    977 
    978 	page_io_lock(pp);
    979 	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    980 	mutex_enter(ahm);
    981 
    982 	ASSERT(ap->an_refcnt != 0);
    983 	ASSERT(pp->p_vnode == ap->an_vp);
    984 	ASSERT(pp->p_offset == ap->an_off);
    985 
    986 	if (ap->an_pvp != NULL) {
    987 		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
    988 		ap->an_pvp = NULL;
    989 		ap->an_poff = 0;
    990 		mutex_exit(ahm);
    991 		hat_setmod(pp);
    992 	} else {
    993 		mutex_exit(ahm);
    994 	}
    995 	page_io_unlock(pp);
    996 }
    997 
    998 /*
    999  * Decrement the reference count of an anon page.
   1000  * If reference count goes to zero, free it and
   1001  * its associated page (if any).
   1002  */
   1003 void
   1004 anon_decref(struct anon *ap)
   1005 {
   1006 	page_t *pp;
   1007 	struct vnode *vp;
   1008 	anoff_t off;
   1009 	kmutex_t *ahm;
   1010 
   1011 	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1012 	mutex_enter(ahm);
   1013 	ASSERT(ap->an_refcnt != 0);
   1014 	if (ap->an_refcnt == 0)
   1015 		panic("anon_decref: slot count 0");
   1016 	if (--ap->an_refcnt == 0) {
   1017 		swap_xlate(ap, &vp, &off);
   1018 		anon_rmhash(ap);
   1019 		if (ap->an_pvp != NULL)
   1020 			swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
   1021 		mutex_exit(ahm);
   1022 
   1023 		/*
   1024 		 * If there is a page for this anon slot we will need to
   1025 		 * call VN_DISPOSE to get rid of the vp association and
   1026 		 * put the page back on the free list as really free.
   1027 		 * Acquire the "exclusive" lock to ensure that any
   1028 		 * pending i/o always completes before the swap slot
   1029 		 * is freed.
   1030 		 */
   1031 		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
   1032 		if (pp != NULL) {
   1033 			/*LINTED: constant in conditional context */
   1034 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
   1035 		}
   1036 		ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
   1037 		    (void *)ap, (void *)ap->an_vp));
   1038 
   1039 		kmem_cache_free(anon_cache, ap);
   1040 
   1041 		ANI_ADD(1);
   1042 	} else {
   1043 		mutex_exit(ahm);
   1044 	}
   1045 }
   1046 
   1047 
   1048 /*
   1049  * check an_refcnt of the root anon slot (anon_index argument is aligned at
   1050  * seg->s_szc level) to determine whether COW processing is required.
   1051  * anonpages_hash_lock[] held on the root ap ensures that if root's
   1052  * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
   1053  * later since this process can't fork while its AS lock is held).
   1054  *
   1055  * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
   1056  */
   1057 int
   1058 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
   1059 {
   1060 	struct anon	*ap;
   1061 	kmutex_t	*ahmpages = NULL;
   1062 
   1063 	ap = anon_get_ptr(ahp, anon_index);
   1064 	if (ap == NULL)
   1065 		return (0);
   1066 
   1067 	ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1068 	mutex_enter(ahmpages);
   1069 	ASSERT(ap->an_refcnt >= 1);
   1070 	if (ap->an_refcnt == 1) {
   1071 		mutex_exit(ahmpages);
   1072 		return (0);
   1073 	}
   1074 	mutex_exit(ahmpages);
   1075 	return (1);
   1076 }
   1077 /*
   1078  * Check 'nslots' anon slots for refcnt > 1.
   1079  *
   1080  * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
   1081  * returns 0.
   1082  */
   1083 static int
   1084 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
   1085 {
   1086 	struct anon *ap;
   1087 
   1088 	while (nslots-- > 0) {
   1089 		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
   1090 		    ap->an_refcnt > 1)
   1091 			return (1);
   1092 		anon_index++;
   1093 	}
   1094 
   1095 	return (0);
   1096 }
   1097 
    /*
     * Drop one reference from every anon slot of the large page region that
     * starts at an_idx in ahp, and clear the slot pointers in ahp.
     *
     * The region is page_get_pagecnt(szc) slots long, clamped to the end of
     * the anon array.  The root slot's refcnt is sampled under its
     * anonpages_hash_lock:
     *
     *  - refcnt == 1 (or no root slot): the lock is dropped and each slot is
     *    freed together with its page.  Missing or small (p_szc == 0) pages
     *    are handled one slot at a time; larger pages are collected in a
     *    temporary array and destroyed as a unit, or disposed page by page
     *    when a backing vnode's dispose op is not fs_dispose.
     *  - refcnt > 1: the lock is kept for the whole walk and each slot only
     *    has its refcnt decremented.
     */
    1098 static void
    1099 anon_decref_pages(
    1100 	struct anon_hdr *ahp,
    1101 	ulong_t an_idx,
    1102 	uint_t szc)
    1103 {
    1104 	struct anon *ap = anon_get_ptr(ahp, an_idx);
    1105 	kmutex_t *ahmpages = NULL;
    1106 	page_t *pp;
    1107 	pgcnt_t pgcnt = page_get_pagecnt(szc);
    1108 	pgcnt_t i;
    1109 	struct vnode *vp;
    1110 	anoff_t   off;
    1111 	kmutex_t *ahm;
         	/* refcnt is referenced only from ASSERT()s below. */
    1112 #ifdef DEBUG
    1113 	int refcnt = 1;
    1114 #endif
    1115 
    1116 	ASSERT(szc != 0);
    1117 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
    1118 	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
    1119 	ASSERT(an_idx < ahp->size);
    1120 
    1121 	if (ahp->size - an_idx < pgcnt) {
    1122 		/*
    1123 		 * In case of shared mappings total anon map size may not be
    1124 		 * the largest page size aligned.
    1125 		 */
    1126 		pgcnt = ahp->size - an_idx;
    1127 	}
    1128 
    1129 	VM_STAT_ADD(anonvmstats.decrefpages[0]);
    1130 
         	/*
         	 * Look at the root slot under anonpages_hash_lock.  The lock is
         	 * kept (ahmpages != NULL) only when the root is shared.
         	 */
    1131 	if (ap != NULL) {
    1132 		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    1133 		mutex_enter(ahmpages);
    1134 		ASSERT((refcnt = ap->an_refcnt) != 0);
    1135 		VM_STAT_ADD(anonvmstats.decrefpages[1]);
    1136 		if (ap->an_refcnt == 1) {
    1137 			VM_STAT_ADD(anonvmstats.decrefpages[2]);
    1138 			ASSERT(!anon_share(ahp, an_idx, pgcnt));
    1139 			mutex_exit(ahmpages);
    1140 			ahmpages = NULL;
    1141 		}
    1142 	}
    1143 
    1144 	i = 0;
    1145 	while (i < pgcnt) {
    1146 		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
    1147 			ASSERT(refcnt == 1 && ahmpages == NULL);
    1148 			i++;
    1149 			continue;
    1150 		}
    1151 		ASSERT(ap->an_refcnt == refcnt);
    1152 		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
    1153 		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
    1154 
    1155 		if (ahmpages == NULL) {
         			/* Unshared: this is the last reference to the slot. */
    1156 			swap_xlate(ap, &vp, &off);
    1157 			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
    1158 			if (pp == NULL || pp->p_szc == 0) {
         				/*
         				 * No page found, or a small page: free just
         				 * this one slot and its page.
         				 */
    1159 				VM_STAT_ADD(anonvmstats.decrefpages[3]);
    1160 				ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
    1161 				    ap->an_off)];
    1162 				(void) anon_set_ptr(ahp, an_idx + i, NULL,
    1163 				    ANON_SLEEP);
    1164 				mutex_enter(ahm);
    1165 				ap->an_refcnt--;
    1166 				ASSERT(ap->an_refcnt == 0);
    1167 				anon_rmhash(ap);
    1168 				if (ap->an_pvp)
    1169 					swap_phys_free(ap->an_pvp, ap->an_poff,
    1170 					    PAGESIZE);
    1171 				mutex_exit(ahm);
         				/*
         				 * Look the page up again if the first lookup
         				 * found nothing.
         				 */
    1172 				if (pp == NULL) {
    1173 					pp = page_lookup(vp, (u_offset_t)off,
    1174 					    SE_EXCL);
    1175 					ASSERT(pp == NULL || pp->p_szc == 0);
    1176 				}
    1177 				if (pp != NULL) {
    1178 					VM_STAT_ADD(anonvmstats.decrefpages[4]);
    1179 					/*LINTED*/
    1180 					VN_DISPOSE(pp, B_INVAL, 0, kcred);
    1181 				}
    1182 				kmem_cache_free(anon_cache, ap);
    1183 				ANI_ADD(1);
    1184 				i++;
    1185 			} else {
         				/*
         				 * Large page: gather all constituent pages,
         				 * each looked up SE_EXCL, into ppa[] before
         				 * freeing any of the slots.
         				 */
    1186 				pgcnt_t j;
    1187 				pgcnt_t curpgcnt =
    1188 				    page_get_pagecnt(pp->p_szc);
    1189 				size_t ppasize = curpgcnt * sizeof (page_t *);
    1190 				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
    1191 				int dispose = 0;
    1192 
    1193 				VM_STAT_ADD(anonvmstats.decrefpages[5]);
    1194 
    1195 				ASSERT(pp->p_szc <= szc);
    1196 				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
    1197 				ASSERT(IS_P2ALIGNED(i, curpgcnt));
    1198 				ASSERT(i + curpgcnt <= pgcnt);
    1199 				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
    1200 				ppa[0] = pp;
         				/*
         				 * NOTE(review): this loop starts at i + 1, so
         				 * the page and slot at index i are neither
         				 * unloaded nor checked for "dispose" here --
         				 * confirm that is intended.
         				 */
    1201 				for (j = i + 1; j < i + curpgcnt; j++) {
    1202 					ap = anon_get_ptr(ahp, an_idx + j);
    1203 					ASSERT(ap != NULL &&
    1204 					    ap->an_refcnt == 1);
    1205 					swap_xlate(ap, &vp, &off);
    1206 					pp = page_lookup(vp, (u_offset_t)off,
    1207 					    SE_EXCL);
    1208 					if (pp == NULL)
    1209 						panic("anon_decref_pages: "
    1210 						    "no page");
    1211 
    1212 					(void) hat_pageunload(pp,
    1213 					    HAT_FORCE_PGUNLOAD);
    1214 					ASSERT(pp->p_szc == ppa[0]->p_szc);
    1215 					ASSERT(page_pptonum(pp) - 1 ==
    1216 					    page_pptonum(ppa[j - i - 1]));
    1217 					ppa[j - i] = pp;
         					/*
         					 * A backing vnode whose dispose op is
         					 * not fs_dispose forces the per-page
         					 * VN_DISPOSE path below.
         					 */
    1218 					if (ap->an_pvp != NULL &&
    1219 					    !vn_matchopval(ap->an_pvp,
    1220 					    VOPNAME_DISPOSE,
    1221 					    (fs_generic_func_p)fs_dispose))
    1222 						dispose = 1;
    1223 				}
         				/* Free every slot of this large page. */
    1224 				for (j = i; j < i + curpgcnt; j++) {
    1225 					ap = anon_get_ptr(ahp, an_idx + j);
    1226 					ASSERT(ap != NULL &&
    1227 					    ap->an_refcnt == 1);
    1228 					ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
    1229 					    ap->an_off)];
    1230 					(void) anon_set_ptr(ahp, an_idx + j,
    1231 					    NULL, ANON_SLEEP);
    1232 					mutex_enter(ahm);
    1233 					ap->an_refcnt--;
    1234 					ASSERT(ap->an_refcnt == 0);
    1235 					anon_rmhash(ap);
    1236 					if (ap->an_pvp)
    1237 						swap_phys_free(ap->an_pvp,
    1238 						    ap->an_poff, PAGESIZE);
    1239 					mutex_exit(ahm);
    1240 					kmem_cache_free(anon_cache, ap);
    1241 					ANI_ADD(1);
    1242 				}
         				/*
         				 * Get rid of the pages: as one large page, or
         				 * one by one after resetting p_szc to 0.
         				 */
    1243 				if (!dispose) {
    1244 					VM_STAT_ADD(anonvmstats.decrefpages[6]);
    1245 					page_destroy_pages(ppa[0]);
    1246 				} else {
    1247 					VM_STAT_ADD(anonvmstats.decrefpages[7]);
    1248 					for (j = 0; j < curpgcnt; j++) {
    1249 						ASSERT(PAGE_EXCL(ppa[j]));
    1250 						ppa[j]->p_szc = 0;
    1251 					}
    1252 					for (j = 0; j < curpgcnt; j++) {
    1253 						ASSERT(!hat_page_is_mapped(
    1254 						    ppa[j]));
    1255 						/*LINTED*/
    1256 						VN_DISPOSE(ppa[j], B_INVAL, 0,
    1257 						    kcred);
    1258 					}
    1259 				}
    1260 				kmem_free(ppa, ppasize);
    1261 				i += curpgcnt;
    1262 			}
    1263 		} else {
         			/*
         			 * Shared region: only drop our reference; the slot
         			 * and its page are left in place.
         			 */
    1264 			VM_STAT_ADD(anonvmstats.decrefpages[8]);
    1265 			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
    1266 			ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    1267 			mutex_enter(ahm);
    1268 			ap->an_refcnt--;
    1269 			mutex_exit(ahm);
    1270 			i++;
    1271 		}
    1272 	}
    1273 
    1274 	if (ahmpages != NULL) {
    1275 		mutex_exit(ahmpages);
    1276 	}
    1277 }
   1278 
   1279 /*
   1280  * Duplicate references to size bytes worth of anon pages.
   1281  * Used when duplicating a segment that contains private anon pages.
   1282  * This code assumes that procedure calling this one has already used
   1283  * hat_chgprot() to disable write access to the range of addresses that
   1284  * that *old actually refers to.
   1285  */
   1286 void
   1287 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
   1288 			ulong_t new_idx, size_t size)
   1289 {
   1290 	spgcnt_t npages;
   1291 	kmutex_t *ahm;
   1292 	struct anon *ap;
   1293 	ulong_t off;
   1294 	ulong_t index;
   1295 
   1296 	npages = btopr(size);
   1297 	while (npages > 0) {
   1298 		index = old_idx;
   1299 		if ((ap = anon_get_next_ptr(old, &index)) == NULL)
   1300 			break;
   1301 
   1302 		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
   1303 		off = index - old_idx;
   1304 		npages -= off;
   1305 		if (npages <= 0)
   1306 			break;
   1307 
   1308 		(void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
   1309 		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1310 
   1311 		mutex_enter(ahm);
   1312 		ap->an_refcnt++;
   1313 		mutex_exit(ahm);
   1314 
   1315 		off++;
   1316 		new_idx += off;
   1317 		old_idx += off;
   1318 		npages--;
   1319 	}
   1320 }
   1321 
    1322 /*
    1323  * Just like anon_dup but also guarantees there are no holes (unallocated anon
    1324  * slots) within any large page region. That means if a large page region is
    1325  * empty in the old array it will skip it. If there are 1 or more valid slots
    1326  * in the large page region of the old array it will make sure to fill in any
    1327  * unallocated ones and also copy them to the new array. If noalloc is 1 large
    1328  * page region should either have no valid anon slots or all slots should be
    1329  * valid.
          *
          * old/old_idx and new/new_idx name the source and destination anon
          * arrays and starting slots; size is in bytes and szc selects the large
          * page size (page_get_pagecnt(szc) slots per region).  old_idx and the
          * page count derived from size are asserted to be region aligned.
          * With noalloc set, finding a hole inside a partly populated region
          * panics instead of allocating a slot.
    1330  */
    1331 void
    1332 anon_dup_fill_holes(
    1333 	struct anon_hdr *old,
    1334 	ulong_t old_idx,
    1335 	struct anon_hdr *new,
    1336 	ulong_t new_idx,
    1337 	size_t size,
    1338 	uint_t szc,
    1339 	int noalloc)
    1340 {
    1341 	struct anon	*ap;
    1342 	spgcnt_t	npages;
    1343 	kmutex_t	*ahm, *ahmpages = NULL;
    1344 	pgcnt_t		pgcnt, i;
    1345 	ulong_t		index, off;
         	/* refcnt is referenced only from ASSERT()s below. */
    1346 #ifdef DEBUG
    1347 	int		refcnt;
    1348 #endif
    1349 
    1350 	ASSERT(szc != 0);
    1351 	pgcnt = page_get_pagecnt(szc);
    1352 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
    1353 	npages = btopr(size);
    1354 	ASSERT(IS_P2ALIGNED(npages, pgcnt));
    1355 	ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
    1356 
    1357 	VM_STAT_ADD(anonvmstats.dupfillholes[0]);
    1358 
    1359 	while (npages > 0) {
    1360 		index = old_idx;
    1361 
    1362 		/*
    1363 		 * Find the next valid slot.
    1364 		 */
    1365 		if (anon_get_next_ptr(old, &index) == NULL)
    1366 			break;
    1367 
    1368 		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
    1369 		/*
    1370 		 * Now backup index to the beginning of the
    1371 		 * current large page region of the old array.
    1372 		 */
    1373 		index = P2ALIGN(index, pgcnt);
         		/* off = number of slots in the empty regions skipped. */
    1374 		off = index - old_idx;
    1375 		ASSERT(IS_P2ALIGNED(off, pgcnt));
    1376 		npages -= off;
    1377 		if (npages <= 0)
    1378 			break;
    1379 
    1380 		/*
    1381 		 * Fill and copy a large page regions worth
    1382 		 * of anon slots.
    1383 		 */
    1384 		for (i = 0; i < pgcnt; i++) {
    1385 			if ((ap = anon_get_ptr(old, index + i)) == NULL) {
         				/*
         				 * Hole: allocate a new slot and install it in
         				 * the old array (fatal when noalloc is set).
         				 */
    1386 				if (noalloc) {
    1387 					panic("anon_dup_fill_holes: "
    1388 					    "empty anon slot\n");
    1389 				}
    1390 				VM_STAT_ADD(anonvmstats.dupfillholes[1]);
    1391 				ap = anon_alloc(NULL, 0);
    1392 				(void) anon_set_ptr(old, index + i, ap,
    1393 				    ANON_SLEEP);
    1394 			} else if (i == 0) {
    1395 				/*
    1396 				 * make the increment of all refcnts of all
    1397 				 * anon slots of a large page appear atomic by
    1398 				 * getting an anonpages_hash_lock for the
    1399 				 * first anon slot of a large page.
    1400 				 */
    1401 				int hash = AH_LOCK(ap->an_vp, ap->an_off);
    1402 
    1403 				VM_STAT_ADD(anonvmstats.dupfillholes[2]);
    1404 
    1405 				ahmpages = &anonpages_hash_lock[hash];
    1406 				mutex_enter(ahmpages);
    1407 				/*LINTED*/
    1408 				ASSERT(refcnt = ap->an_refcnt);
    1409 
    1410 				VM_STAT_COND_ADD(ap->an_refcnt > 1,
    1411 				    anonvmstats.dupfillholes[3]);
    1412 			}
         			/*
         			 * Enter the slot in the new array and take a
         			 * reference on it under its anonhash lock.
         			 */
    1413 			(void) anon_set_ptr(new, new_idx + off + i, ap,
    1414 			    ANON_SLEEP);
    1415 			ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    1416 			mutex_enter(ahm);
    1417 			ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
    1418 			ASSERT(i == 0 || ahmpages == NULL ||
    1419 			    refcnt == ap->an_refcnt);
    1420 			ap->an_refcnt++;
    1421 			mutex_exit(ahm);
    1422 		}
         		/* Release the region lock taken for the first slot, if any. */
    1423 		if (ahmpages != NULL) {
    1424 			mutex_exit(ahmpages);
    1425 			ahmpages = NULL;
    1426 		}
         		/* Step both arrays past the skipped slots and this region. */
    1427 		off += pgcnt;
    1428 		new_idx += off;
    1429 		old_idx += off;
    1430 		npages -= pgcnt;
    1431 	}
    1432 }
   1433 
   1434 /*
   1435  * Used when a segment with a vnode changes szc. similarly to
   1436  * anon_dup_fill_holes() makes sure each large page region either has no anon
   1437  * slots or all of them. but new slots are created by COWing the file
   1438  * pages. on entrance no anon slots should be shared.
   1439  */
   1440 int
   1441 anon_fill_cow_holes(
   1442 	struct seg *seg,
   1443 	caddr_t addr,
   1444 	struct anon_hdr *ahp,
   1445 	ulong_t an_idx,
   1446 	struct vnode *vp,
   1447 	u_offset_t vp_off,
   1448 	size_t size,
   1449 	uint_t szc,
   1450 	uint_t prot,
   1451 	struct vpage vpage[],
   1452 	struct cred *cred)
   1453 {
   1454 	struct anon	*ap;
   1455 	spgcnt_t	npages;
   1456 	pgcnt_t		pgcnt, i;
   1457 	ulong_t		index, off;
   1458 	int		err = 0;
   1459 	int		pageflags = 0;
   1460 
   1461 	ASSERT(szc != 0);
   1462 	pgcnt = page_get_pagecnt(szc);
   1463 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   1464 	npages = btopr(size);
   1465 	ASSERT(IS_P2ALIGNED(npages, pgcnt));
   1466 	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
   1467 
   1468 	while (npages > 0) {
   1469 		index = an_idx;
   1470 
   1471 		/*
   1472 		 * Find the next valid slot.
   1473 		 */
   1474 		if (anon_get_next_ptr(ahp, &index) == NULL) {
   1475 			break;
   1476 		}
   1477 
   1478 		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
   1479 		/*
   1480 		 * Now backup index to the beginning of the
   1481 		 * current large page region of the anon array.
   1482 		 */
   1483 		index = P2ALIGN(index, pgcnt);
   1484 		off = index - an_idx;
   1485 		ASSERT(IS_P2ALIGNED(off, pgcnt));
   1486 		npages -= off;
   1487 		if (npages <= 0)
   1488 			break;
   1489 		an_idx += off;
   1490 		vp_off += ptob(off);
   1491 		addr += ptob(off);
   1492 		if (vpage != NULL) {
   1493 			vpage += off;
   1494 		}
   1495 
   1496 		for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
   1497 			if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
   1498 				page_t *pl[1 + 1];
   1499 				page_t *pp;
   1500 
   1501 				err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
   1502 				    pl, PAGESIZE, seg, addr, S_READ, cred,
   1503 				    NULL);
   1504 				if (err) {
   1505 					break;
   1506 				}
   1507 				if (vpage != NULL) {
   1508 					prot = VPP_PROT(vpage);
   1509 					pageflags = VPP_ISPPLOCK(vpage) ?
   1510 					    LOCK_PAGE : 0;
   1511 				}
   1512 				pp = anon_private(&ap, seg, addr, prot, pl[0],
   1513 				    pageflags, cred);
   1514 				if (pp == NULL) {
   1515 					err = ENOMEM;
   1516 					break;
   1517 				}
   1518 				(void) anon_set_ptr(ahp, an_idx, ap,
   1519 				    ANON_SLEEP);
   1520 				page_unlock(pp);
   1521 			}
   1522 			ASSERT(ap->an_refcnt == 1);
   1523 			addr += PAGESIZE;
   1524 			if (vpage != NULL) {
   1525 				vpage++;
   1526 			}
   1527 		}
   1528 		npages -= pgcnt;
   1529 	}
   1530 
   1531 	return (err);
   1532 }
   1533 
   1534 /*
   1535  * Free a group of "size" anon pages, size in bytes,
   1536  * and clear out the pointers to the anon entries.
   1537  */
   1538 void
   1539 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
   1540 {
   1541 	spgcnt_t npages;
   1542 	struct anon *ap;
   1543 	ulong_t old;
   1544 
   1545 	npages = btopr(size);
   1546 
   1547 	while (npages > 0) {
   1548 		old = index;
   1549 		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
   1550 			break;
   1551 
   1552 		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
   1553 		npages -= index - old;
   1554 		if (npages <= 0)
   1555 			break;
   1556 
   1557 		(void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
   1558 		anon_decref(ap);
   1559 		/*
   1560 		 * Bump index and decrement page count
   1561 		 */
   1562 		index++;
   1563 		npages--;
   1564 	}
   1565 }
   1566 
   1567 void
   1568 anon_free_pages(
   1569 	struct anon_hdr *ahp,
   1570 	ulong_t an_idx,
   1571 	size_t size,
   1572 	uint_t szc)
   1573 {
   1574 	spgcnt_t	npages;
   1575 	pgcnt_t		pgcnt;
   1576 	ulong_t		index, off;
   1577 
   1578 	ASSERT(szc != 0);
   1579 	pgcnt = page_get_pagecnt(szc);
   1580 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   1581 	npages = btopr(size);
   1582 	ASSERT(IS_P2ALIGNED(npages, pgcnt));
   1583 	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
   1584 	ASSERT(an_idx < ahp->size);
   1585 
   1586 	VM_STAT_ADD(anonvmstats.freepages[0]);
   1587 
   1588 	while (npages > 0) {
   1589 		index = an_idx;
   1590 
   1591 		/*
   1592 		 * Find the next valid slot.
   1593 		 */
   1594 		if (anon_get_next_ptr(ahp, &index) == NULL)
   1595 			break;
   1596 
   1597 		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
   1598 		/*
   1599 		 * Now backup index to the beginning of the
   1600 		 * current large page region of the old array.
   1601 		 */
   1602 		index = P2ALIGN(index, pgcnt);
   1603 		off = index - an_idx;
   1604 		ASSERT(IS_P2ALIGNED(off, pgcnt));
   1605 		npages -= off;
   1606 		if (npages <= 0)
   1607 			break;
   1608 
   1609 		anon_decref_pages(ahp, index, szc);
   1610 
   1611 		off += pgcnt;
   1612 		an_idx += off;
   1613 		npages -= pgcnt;
   1614 	}
   1615 }
   1616 
    1617 /*
    1618  * Make anonymous pages discardable
          *
          * Walks "size" bytes worth of anon slots of amp starting at "index".
          * For each populated slot whose page can be locked SE_EXCL without
          * waiting, is not held by p_lckcnt/p_cowcnt and whose slot refcnt is 1,
          * the physical swap slot (if any) is released and the page is freed.
          * Small pages go through VN_DISPOSE(B_FREE); a large page is destroyed
          * as a unit when the range covers it completely, otherwise a demotion
          * is attempted and only the one page is freed.  Anything that cannot
          * be freed is skipped.  Outcomes are counted in
          * segadvstat.MADV_FREE_hit / MADV_FREE_miss.
          *
          * The caller must hold amp->a_rwlock as reader (asserted).
    1619  */
    1620 void
    1621 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size)
    1622 {
    1623 	spgcnt_t npages = btopr(size);
    1624 	struct anon *ap;
    1625 	struct vnode *vp;
    1626 	anoff_t off;
    1627 	page_t *pp, *root_pp;
    1628 	kmutex_t *ahm;
    1629 	pgcnt_t pgcnt;
    1630 	ulong_t old_idx, idx, i;
    1631 	struct anon_hdr *ahp = amp->ahp;
    1632 	anon_sync_obj_t cookie;
    1633 
    1634 	ASSERT(RW_READ_HELD(&amp->a_rwlock));
    1635 	pgcnt = 1;
         	/*
         	 * pgcnt is the number of slots the current iteration accounts for.
         	 * The loop step moves to the next slot when it is 1, otherwise to
         	 * the next pgcnt-aligned index.
         	 */
    1636 	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
    1637 	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
    1638 
    1639 		/*
    1640 		 * get anon pointer and index for the first valid entry
    1641 		 * in the anon list, starting from "index"
    1642 		 */
    1643 		old_idx = index;
    1644 		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
    1645 			break;
    1646 
    1647 		/*
    1648 		 * decrement npages by number of NULL anon slots we skipped
    1649 		 */
    1650 		npages -= index - old_idx;
    1651 		if (npages <= 0)
    1652 			break;
    1653 
         		/* Re-read the slot after entering the anon array for index. */
    1654 		anon_array_enter(amp, index, &cookie);
    1655 		ap = anon_get_ptr(ahp, index);
    1656 		ASSERT(ap != NULL);
    1657 
    1658 		/*
    1659 		 * Get anonymous page and try to lock it SE_EXCL;
    1660 		 * if we couldn't grab the lock we skip to next page.
    1661 		 */
    1662 		swap_xlate(ap, &vp, &off);
    1663 		pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
    1664 		if (pp == NULL) {
    1665 			segadvstat.MADV_FREE_miss.value.ul++;
    1666 			pgcnt = 1;
    1667 			anon_array_exit(&cookie);
    1668 			continue;
    1669 		}
    1670 		pgcnt = page_get_pagecnt(pp->p_szc);
    1671 
    1672 		/*
    1673 		 * we cannot free a page which is permanently locked.
    1674 		 * The page_struct_lock need not be acquired to examine
    1675 		 * these fields since the page has an "exclusive" lock.
    1676 		 */
    1677 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
    1678 			page_unlock(pp);
    1679 			segadvstat.MADV_FREE_miss.value.ul++;
    1680 			anon_array_exit(&cookie);
    1681 			continue;
    1682 		}
    1683 
    1684 		ahm = &anonhash_lock[AH_LOCK(vp, off)];
    1685 		mutex_enter(ahm);
    1686 		ASSERT(ap->an_refcnt != 0);
    1687 		/*
    1688 		 * skip this one if copy-on-write is not yet broken.
    1689 		 */
    1690 		if (ap->an_refcnt > 1) {
    1691 			mutex_exit(ahm);
    1692 			page_unlock(pp);
    1693 			segadvstat.MADV_FREE_miss.value.ul++;
    1694 			anon_array_exit(&cookie);
    1695 			continue;
    1696 		}
    1697 
         		/* Small page: release its swap slot and free the page. */
    1698 		if (pp->p_szc == 0) {
    1699 			pgcnt = 1;
    1700 
    1701 			/*
    1702 			 * free swap slot;
    1703 			 */
    1704 			if (ap->an_pvp) {
    1705 				swap_phys_free(ap->an_pvp, ap->an_poff,
    1706 				    PAGESIZE);
    1707 				ap->an_pvp = NULL;
    1708 				ap->an_poff = 0;
    1709 			}
    1710 			mutex_exit(ahm);
    1711 			segadvstat.MADV_FREE_hit.value.ul++;
    1712 
    1713 			/*
    1714 			 * while we are at it, unload all the translations
    1715 			 * and attempt to free the page.
    1716 			 */
    1717 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    1718 			/*LINTED: constant in conditional context */
    1719 			VN_DISPOSE(pp, B_FREE, 0, kcred);
    1720 			anon_array_exit(&cookie);
    1721 			continue;
    1722 		}
    1723 
    1724 		pgcnt = page_get_pagecnt(pp->p_szc);
         		/*
         		 * The range starts inside this large page or ends before
         		 * it does: try to demote the page and free only this one
         		 * constituent page.
         		 */
    1725 		if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
    1726 			if (!page_try_demote_pages(pp)) {
    1727 				mutex_exit(ahm);
    1728 				page_unlock(pp);
    1729 				segadvstat.MADV_FREE_miss.value.ul++;
    1730 				anon_array_exit(&cookie);
    1731 				continue;
    1732 			} else {
    1733 				pgcnt = 1;
    1734 				if (ap->an_pvp) {
    1735 					swap_phys_free(ap->an_pvp,
    1736 					    ap->an_poff, PAGESIZE);
    1737 					ap->an_pvp = NULL;
    1738 					ap->an_poff = 0;
    1739 				}
    1740 				mutex_exit(ahm);
    1741 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    1742 				/*LINTED*/
    1743 				VN_DISPOSE(pp, B_FREE, 0, kcred);
    1744 				segadvstat.MADV_FREE_hit.value.ul++;
    1745 				anon_array_exit(&cookie);
    1746 				continue;
    1747 			}
    1748 		}
    1749 		mutex_exit(ahm);
    1750 		root_pp = pp;
    1751 
    1752 		/*
    1753 		 * try to lock remaining pages
    1754 		 */
    1755 		for (idx = 1; idx < pgcnt; idx++) {
    1756 			pp++;
    1757 			if (!page_trylock(pp, SE_EXCL))
    1758 				break;
    1759 			if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
    1760 				page_unlock(pp);
    1761 				break;
    1762 			}
    1763 		}
    1764 
         		/*
         		 * idx == pgcnt means every constituent page is now locked:
         		 * release the swap slot of each anon slot in the region and
         		 * destroy the large page.
         		 */
    1765 		if (idx == pgcnt) {
    1766 			for (i = 0; i < pgcnt; i++) {
    1767 				ap = anon_get_ptr(ahp, index + i);
    1768 				if (ap == NULL)
    1769 					break;
    1770 				swap_xlate(ap, &vp, &off);
    1771 				ahm = &anonhash_lock[AH_LOCK(vp, off)];
    1772 				mutex_enter(ahm);
    1773 				ASSERT(ap->an_refcnt != 0);
    1774 
    1775 				/*
    1776 				 * skip this one if copy-on-write
    1777 				 * is not yet broken.
    1778 				 */
    1779 				if (ap->an_refcnt > 1) {
    1780 					mutex_exit(ahm);
    1781 					goto skiplp;
    1782 				}
    1783 				if (ap->an_pvp) {
    1784 					swap_phys_free(ap->an_pvp,
    1785 					    ap->an_poff, PAGESIZE);
    1786 					ap->an_pvp = NULL;
    1787 					ap->an_poff = 0;
    1788 				}
    1789 				mutex_exit(ahm);
    1790 			}
    1791 			page_destroy_pages(root_pp);
    1792 			segadvstat.MADV_FREE_hit.value.ul += pgcnt;
    1793 			anon_array_exit(&cookie);
    1794 			continue;
    1795 		}
         		/*
         		 * Large page could not be freed: unlock the first idx
         		 * constituent pages, which are the ones still locked here.
         		 */
    1796 skiplp:
    1797 		segadvstat.MADV_FREE_miss.value.ul += pgcnt;
    1798 		for (i = 0, pp = root_pp; i < idx; pp++, i++)
    1799 			page_unlock(pp);
    1800 		anon_array_exit(&cookie);
    1801 	}
    1802 }
   1803 
   1804 /*
   1805  * Return the kept page(s) and protections back to the segment driver.
   1806  */
   1807 int
   1808 anon_getpage(
   1809 	struct anon **app,
   1810 	uint_t *protp,
   1811 	page_t *pl[],
   1812 	size_t plsz,
   1813 	struct seg *seg,
   1814 	caddr_t addr,
   1815 	enum seg_rw rw,
   1816 	struct cred *cred)
   1817 {
   1818 	page_t *pp;
   1819 	struct anon *ap = *app;
   1820 	struct vnode *vp;
   1821 	anoff_t off;
   1822 	int err;
   1823 	kmutex_t *ahm;
   1824 
   1825 	swap_xlate(ap, &vp, &off);
   1826 
   1827 	/*
   1828 	 * Lookup the page. If page is being paged in,
   1829 	 * wait for it to finish as we must return a list of
   1830 	 * pages since this routine acts like the VOP_GETPAGE
   1831 	 * routine does.
   1832 	 */
   1833 	if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
   1834 		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1835 		mutex_enter(ahm);
   1836 		if (ap->an_refcnt == 1)
   1837 			*protp = PROT_ALL;
   1838 		else
   1839 			*protp = PROT_ALL & ~PROT_WRITE;
   1840 		mutex_exit(ahm);
   1841 		pl[0] = pp;
   1842 		pl[1] = NULL;
   1843 		return (0);
   1844 	}
   1845 
   1846 	/*
   1847 	 * Simply treat it as a vnode fault on the anon vp.
   1848 	 */
   1849 
   1850 	TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
   1851 	    "anon_getpage:seg %x addr %x vp %x",
   1852 	    seg, addr, vp);
   1853 
   1854 	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
   1855 	    seg, addr, rw, cred, NULL);
   1856 
   1857 	if (err == 0 && pl != NULL) {
   1858 		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1859 		mutex_enter(ahm);
   1860 		if (ap->an_refcnt != 1)
   1861 			*protp &= ~PROT_WRITE;	/* make read-only */
   1862 		mutex_exit(ahm);
   1863 	}
   1864 	return (err);
   1865 }
   1866 
   1867 /*
   1868  * Creates or returns kept pages to the segment driver.  returns -1 if a large
   1869  * page cannot be allocated. returns -2 if some other process has allocated a
   1870  * larger page.
   1871  *
   1872  * For cowfault it will allocate any size pages to fill the requested area to
   1873  * avoid partially overwriting anon slots (i.e. sharing only some of the anon
   1874  * slots within a large page with other processes). This policy greatly
   1875  * simplifies large page freeing (which is only freed when all anon slot
   1876  * refcnts are 0).
   1877  */
   1878 int
   1879 anon_map_getpages(
   1880 	struct anon_map *amp,
   1881 	ulong_t	start_idx,
   1882 	uint_t	szc,
   1883 	struct seg *seg,
   1884 	caddr_t	addr,
   1885 	uint_t prot,
   1886 	uint_t *protp,
   1887 	page_t	*ppa[],
   1888 	uint_t	*ppa_szc,
   1889 	struct vpage vpage[],
   1890 	enum seg_rw rw,
   1891 	int brkcow,
   1892 	int anypgsz,
   1893 	int pgflags,
   1894 	struct cred *cred)
   1895 {
        	/*
        	 * Summary of the return values produced by the code below:
        	 *
        	 *	0	ppa[] has been filled in and *protp holds the
        	 *		protections the caller may use.
        	 *	-1	a large page of size szc could not be allocated
        	 *		(or swap_getconpage() returned -1) and the request
        	 *		does not have to be treated as a cow fault here.
        	 *	-2	a page larger than szc was found while
        	 *		szc < seg->s_szc; on the paths in this function
        	 *		*ppa_szc is set to MIN(found szc, seg->s_szc).
        	 *	> 0	error from anon_getpage()/swap_getconpage(), or
        	 *		ENOMEM when anon_zero() fails.
        	 *
        	 * On the cow paths the result of anon_map_privatepages() is
        	 * returned unchanged.
        	 */
   1896 	pgcnt_t		pgcnt;
   1897 	struct anon	*ap;
   1898 	struct vnode	*vp;
   1899 	anoff_t		off;
   1900 	page_t		*pp, *pl[2], *conpp = NULL;
   1901 	caddr_t		vaddr;
   1902 	ulong_t		pg_idx, an_idx, i;
   1903 	spgcnt_t	nreloc = 0;
   1904 	int		prealloc = 1;
   1905 	int		err, slotcreate;
   1906 	uint_t		vpprot;
        	/* nonzero when the request is smaller than the segment's szc */
   1907 	int		upsize = (szc < seg->s_szc);
   1908 
   1909 #if !defined(__i386) && !defined(__amd64)
   1910 	ASSERT(seg->s_szc != 0);
   1911 #endif
   1912 	ASSERT(szc <= seg->s_szc);
   1913 	ASSERT(ppa_szc != NULL);
   1914 	ASSERT(rw != S_CREATE);
   1915 
   1916 	*protp = PROT_ALL;
   1917 
   1918 	VM_STAT_ADD(anonvmstats.getpages[0]);
   1919 
        	/*
        	 * PAGESIZE request: either fetch the page behind the existing
        	 * anon slot or create a zero-filled page and a new slot.
        	 */
   1920 	if (szc == 0) {
   1921 		VM_STAT_ADD(anonvmstats.getpages[1]);
   1922 		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
   1923 			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
   1924 			    addr, rw, cred);
   1925 			if (err)
   1926 				return (err);
   1927 			ppa[0] = pl[0];
   1928 			if (brkcow == 0 || (*protp & PROT_WRITE)) {
   1929 				VM_STAT_ADD(anonvmstats.getpages[2]);
        				/*
        				 * The page found is part of a larger page
        				 * and the segment allows a larger size:
        				 * report that size and let the caller retry.
        				 */
   1930 				if (ppa[0]->p_szc != 0 && upsize) {
   1931 					VM_STAT_ADD(anonvmstats.getpages[3]);
   1932 					*ppa_szc = MIN(ppa[0]->p_szc,
   1933 					    seg->s_szc);
   1934 					page_unlock(ppa[0]);
   1935 					return (-2);
   1936 				}
   1937 				return (0);
   1938 			}
        			/* brkcow is set but the page is not writable */
   1939 			panic("anon_map_getpages: cowfault for szc 0");
   1940 		} else {
   1941 			VM_STAT_ADD(anonvmstats.getpages[4]);
   1942 			ppa[0] = anon_zero(seg, addr, &ap, cred);
   1943 			if (ppa[0] == NULL)
   1944 				return (ENOMEM);
   1945 			(void) anon_set_ptr(amp->ahp, start_idx, ap,
   1946 			    ANON_SLEEP);
   1947 			return (0);
   1948 		}
   1949 	}
   1950 
   1951 	pgcnt = page_get_pagecnt(szc);
   1952 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   1953 	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
   1954 
   1955 	/*
   1956 	 * First we check for the case that the requested large
   1957 	 * page or larger page already exists in the system.
   1958 	 * Actually we only check if the first constituent page
   1959 	 * exists and only preallocate if it's not found.
   1960 	 */
   1961 	ap = anon_get_ptr(amp->ahp, start_idx);
   1962 	if (ap) {
   1963 		uint_t pszc;
   1964 		swap_xlate(ap, &vp, &off);
   1965 		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
   1966 			if (pszc > szc && upsize) {
   1967 				*ppa_szc = MIN(pszc, seg->s_szc);
   1968 				return (-2);
   1969 			}
   1970 			if (pszc >= szc) {
   1971 				prealloc = 0;
   1972 			}
   1973 		}
   1974 	}
   1975 
   1976 	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
   1977 	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
   1978 
        	/*
        	 * Re-entered from the main loop below (with prealloc forced to 1)
        	 * when the first page found turns out to be smaller than szc.
        	 */
   1979 top:
   1980 	/*
   1981 	 * If a smaller page or no page at all was found,
   1982 	 * grab a large page off the freelist.
   1983 	 */
   1984 	if (prealloc) {
   1985 		ASSERT(conpp == NULL);
   1986 		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
   1987 		    szc, 0, pgflags) != 0) {
   1988 			VM_STAT_ADD(anonvmstats.getpages[7]);
   1989 			if (brkcow == 0 || szc < seg->s_szc ||
   1990 			    !anon_szcshare(amp->ahp, start_idx)) {
   1991 				/*
   1992 				 * If the refcnt's of all anon slots are <= 1
   1993 				 * they can't increase since we are holding
   1994 				 * the address space's lock. So segvn can
   1995 				 * safely decrease szc without risking to
   1996 				 * generate a cow fault for the region smaller
   1997 				 * than the segment's largest page size.
   1998 				 */
   1999 				VM_STAT_ADD(anonvmstats.getpages[8]);
   2000 				return (-1);
   2001 			}
        		/* also the target of the "goto docow" at the end of io_err */
   2002 		docow:
   2003 			/*
   2004 			 * This is a cow fault. Copy away the entire 1 large
   2005 			 * page region of this segment.
   2006 			 */
   2007 			if (szc != seg->s_szc)
   2008 				panic("anon_map_getpages: cowfault for szc %d",
   2009 				    szc);
   2010 			vaddr = addr;
   2011 			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
   2012 			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
   2013 				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
   2014 				    NULL) {
   2015 					err = anon_getpage(&ap, &vpprot, pl,
   2016 					    PAGESIZE, seg, vaddr, rw, cred);
   2017 					if (err) {
        						/*
        						 * Unlock the pages obtained
        						 * by earlier iterations.
        						 */
   2018 						for (i = 0; i < pg_idx; i++) {
   2019 							if ((pp = ppa[i]) !=
   2020 							    NULL)
   2021 								page_unlock(pp);
   2022 						}
   2023 						return (err);
   2024 					}
   2025 					ppa[pg_idx] = pl[0];
   2026 				} else {
   2027 					/*
   2028 					 * Since this is a cowfault we know
   2029 					 * that this address space has a
   2030 					 * parent or children which means
   2031 					 * anon_dup_fill_holes() has initialized
   2032 					 * all anon slots within a large page
   2033 					 * region that had at least one anon
   2034 					 * slot at the time of fork().
   2035 					 */
   2036 					panic("anon_map_getpages: "
   2037 					    "cowfault but anon slot is empty");
   2038 				}
   2039 			}
   2040 			VM_STAT_ADD(anonvmstats.getpages[9]);
   2041 			*protp = PROT_ALL;
   2042 			return (anon_map_privatepages(amp, start_idx, szc, seg,
   2043 			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
   2044 		}
   2045 	}
   2046 
   2047 	VM_STAT_ADD(anonvmstats.getpages[10]);
   2048 
        	/*
        	 * Walk the pgcnt anon slots of the region, getting (or creating)
        	 * the page for each slot and recording it in ppa[].
        	 */
   2049 	an_idx = start_idx;
   2050 	pg_idx = 0;
   2051 	vaddr = addr;
   2052 	while (pg_idx < pgcnt) {
   2053 		slotcreate = 0;
   2054 		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
   2055 			VM_STAT_ADD(anonvmstats.getpages[11]);
   2056 			/*
   2057 			 * For us to have decided not to preallocate
   2058 			 * would have meant that a large page
   2059 			 * was found. Which also means that all of the
   2060 			 * anon slots for that page would have been
   2061 			 * already created for us.
   2062 			 */
   2063 			if (prealloc == 0)
   2064 				panic("anon_map_getpages: prealloc = 0");
   2065 
   2066 			slotcreate = 1;
   2067 			ap = anon_alloc(NULL, 0);
   2068 		}
   2069 		swap_xlate(ap, &vp, &off);
   2070 
   2071 		/*
   2072 		 * Now setup our preallocated page to pass down
   2073 		 * to swap_getpage().
   2074 		 */
   2075 		if (prealloc) {
   2076 			ASSERT(ppa[pg_idx]->p_szc == szc);
   2077 			conpp = ppa[pg_idx];
   2078 		}
   2079 		ASSERT(prealloc || conpp == NULL);
   2080 
   2081 		/*
   2082 		 * If we just created this anon slot then call
   2083 		 * with S_CREATE to prevent doing IO on the page.
   2084 		 * Similar to the anon_zero case.
   2085 		 */
   2086 		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
   2087 		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
   2088 		    slotcreate == 1 ? S_CREATE : rw, cred);
   2089 
   2090 		if (err) {
   2091 			ASSERT(err != -2 || upsize);
   2092 			VM_STAT_ADD(anonvmstats.getpages[12]);
   2093 			ASSERT(slotcreate == 0);
   2094 			goto io_err;
   2095 		}
   2096 
   2097 		pp = pl[0];
   2098 
        		/*
        		 * The page found has the wrong size: either it is larger
        		 * and the caller should retry at that size (-2), or it is
        		 * smaller and we must start over with a preallocated page.
        		 */
   2099 		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
   2100 			VM_STAT_ADD(anonvmstats.getpages[13]);
   2101 			ASSERT(slotcreate == 0);
   2102 			ASSERT(prealloc == 0);
   2103 			ASSERT(pg_idx == 0);
   2104 			if (pp->p_szc > szc) {
   2105 				ASSERT(upsize);
   2106 				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
   2107 				page_unlock(pp);
   2108 				VM_STAT_ADD(anonvmstats.getpages[14]);
   2109 				return (-2);
   2110 			}
   2111 			page_unlock(pp);
   2112 			prealloc = 1;
   2113 			goto top;
   2114 		}
   2115 
   2116 		/*
   2117 		 * If we decided to preallocate but VOP_GETPAGE
   2118 		 * found a page in the system that satisfies our
   2119 		 * request then free up our preallocated large page
   2120 		 * and continue looping across the existing large
   2121 		 * page via VOP_GETPAGE.
   2122 		 */
   2123 		if (prealloc && pp != ppa[pg_idx]) {
   2124 			VM_STAT_ADD(anonvmstats.getpages[15]);
   2125 			ASSERT(slotcreate == 0);
   2126 			ASSERT(pg_idx == 0);
   2127 			conpp = NULL;
   2128 			prealloc = 0;
   2129 			page_free_pages(ppa[0]);
   2130 		}
   2131 
   2132 		if (prealloc && nreloc > 1) {
   2133 			/*
   2134 			 * We have relocated out of a smaller large page.
   2135 			 * Advance the loop indices past all npgs pages that
   2136 			 * were relocated and restart the loop.
   2137 			 */
   2138 			spgcnt_t npgs = nreloc;
   2139 
   2140 			VM_STAT_ADD(anonvmstats.getpages[16]);
   2141 
   2142 			ASSERT(pp == ppa[pg_idx]);
   2143 			ASSERT(slotcreate == 0);
   2144 			ASSERT(pg_idx + npgs <= pgcnt);
        			/*
        			 * NOTE(review): anon_share() presumably reports
        			 * whether any of these npgs slots is shared; if so
        			 * write permission is withheld -- confirm.
        			 */
   2145 			if ((*protp & PROT_WRITE) &&
   2146 			    anon_share(amp->ahp, an_idx, npgs)) {
   2147 				*protp &= ~PROT_WRITE;
   2148 			}
   2149 			pg_idx += npgs;
   2150 			an_idx += npgs;
   2151 			vaddr += PAGESIZE * npgs;
   2152 			continue;
   2153 		}
   2154 
   2155 		VM_STAT_ADD(anonvmstats.getpages[17]);
   2156 
   2157 		/*
   2158 		 * Anon_zero case.
   2159 		 */
   2160 		if (slotcreate) {
   2161 			ASSERT(prealloc);
   2162 			pagezero(pp, 0, PAGESIZE);
   2163 			CPU_STATS_ADD_K(vm, zfod, 1);
   2164 			hat_setrefmod(pp);
   2165 		}
   2166 
   2167 		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
   2168 		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
   2169 		ASSERT(prealloc == 0 || PAGE_EXCL(pp));
   2170 
        		/*
        		 * Sanity check: every page after the first must be
        		 * physically contiguous with, and of the same size code
        		 * as, its predecessor; the first page's page frame number
        		 * must be aligned to pgcnt.
        		 */
   2171 		if (pg_idx > 0 &&
   2172 		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
   2173 		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
   2174 			panic("anon_map_getpages: unexpected page");
   2175 		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
   2176 			panic("anon_map_getpages: unaligned page");
   2177 		}
   2178 
   2179 		if (prealloc == 0) {
   2180 			ppa[pg_idx] = pp;
   2181 		}
   2182 
        		/* A slot with more than one reference must not be written. */
   2183 		if (ap->an_refcnt > 1) {
   2184 			VM_STAT_ADD(anonvmstats.getpages[18]);
   2185 			*protp &= ~PROT_WRITE;
   2186 		}
   2187 
   2188 		/*
   2189 		 * If this is a new anon slot then initialize
   2190 		 * the anon array entry.
   2191 		 */
   2192 		if (slotcreate) {
   2193 			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
   2194 		}
   2195 		pg_idx++;
   2196 		an_idx++;
   2197 		vaddr += PAGESIZE;
   2198 	}
   2199 
   2200 	/*
   2201 	 * Since preallocated pages come off the freelist
   2202 	 * they are locked SE_EXCL. Simply downgrade and return.
   2203 	 */
   2204 	if (prealloc) {
   2205 		VM_STAT_ADD(anonvmstats.getpages[19]);
   2206 		conpp = NULL;
   2207 		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
   2208 			page_downgrade(ppa[pg_idx]);
   2209 		}
   2210 	}
   2211 	ASSERT(conpp == NULL);
   2212 
        	/* Not a cow fault, or the pages are already writable: done. */
   2213 	if (brkcow == 0 || (*protp & PROT_WRITE)) {
   2214 		VM_STAT_ADD(anonvmstats.getpages[20]);
   2215 		return (0);
   2216 	}
   2217 
   2218 	if (szc < seg->s_szc)
   2219 		panic("anon_map_getpages: cowfault for szc %d", szc);
   2220 
   2221 	VM_STAT_ADD(anonvmstats.getpages[21]);
   2222 
        	/* Cow fault on the whole large page: make private copies. */
   2223 	*protp = PROT_ALL;
   2224 	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
   2225 	    ppa, vpage, anypgsz, pgflags, cred));
   2226 io_err:
   2227 	/*
   2228 	 * We got an IO error somewhere in our large page.
   2229 	 * If we were using a preallocated page then just demote
   2230 	 * all the constituent pages that we've succeeded with so far
   2231 	 * to PAGESIZE pages and leave them in the system
   2232 	 * unlocked.
   2233 	 */
   2234 
   2235 	ASSERT(err != -2 || ((pg_idx == 0) && upsize));
   2236 
   2237 	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
   2238 	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
   2239 	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
   2240 
   2241 	if (prealloc) {
   2242 		conpp = NULL;
   2243 		if (pg_idx > 0) {
   2244 			VM_STAT_ADD(anonvmstats.getpages[25]);
        			/* demote every constituent page to szc 0 */
   2245 			for (i = 0; i < pgcnt; i++) {
   2246 				pp = ppa[i];
   2247 				ASSERT(PAGE_EXCL(pp));
   2248 				ASSERT(pp->p_szc == szc);
   2249 				pp->p_szc = 0;
   2250 			}
        			/* pages [0, pg_idx) were set up: just unlock them */
   2251 			for (i = 0; i < pg_idx; i++) {
   2252 				ASSERT(!hat_page_is_mapped(ppa[i]));
   2253 				page_unlock(ppa[i]);
   2254 			}
   2255 			/*
   2256 			 * Now free up the remaining unused constituent
   2257 			 * pages.
   2258 			 */
   2259 			while (pg_idx < pgcnt) {
   2260 				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
   2261 				page_free(ppa[pg_idx], 0);
   2262 				pg_idx++;
   2263 			}
   2264 		} else {
        			/* nothing was set up yet: free the whole large page */
   2265 			VM_STAT_ADD(anonvmstats.getpages[26]);
   2266 			page_free_pages(ppa[0]);
   2267 		}
   2268 	} else {
        		/* existing pages: unlock the ones collected so far */
   2269 		VM_STAT_ADD(anonvmstats.getpages[27]);
   2270 		ASSERT(err > 0);
   2271 		for (i = 0; i < pg_idx; i++)
   2272 			page_unlock(ppa[i]);
   2273 	}
   2274 	ASSERT(conpp == NULL);
   2275 	if (err != -1)
   2276 		return (err);
   2277 	/*
   2278 	 * We are here because we failed to relocate.
   2279 	 */
   2280 	ASSERT(prealloc);
        	/* same decision as when page_alloc_pages() fails above */
   2281 	if (brkcow == 0 || szc < seg->s_szc ||
   2282 	    !anon_szcshare(amp->ahp, start_idx)) {
   2283 		VM_STAT_ADD(anonvmstats.getpages[28]);
   2284 		return (-1);
   2285 	}
   2286 	VM_STAT_ADD(anonvmstats.getpages[29]);
   2287 	goto docow;
   2288 }
   2289 
   2290 
   2291 /*
   2292  * Turn a reference to an object or shared anon page
   2293  * into a private page with a copy of the data from the
   2294  * original page which is always locked by the caller.
   2295  * This routine unloads the translation and unlocks the
   2296  * original page, if it isn't being stolen, before returning
   2297  * to the caller.
   2298  *
   2299  * NOTE:  The original anon slot is not freed by this routine
   2300  *	  It must be freed by the caller while holding the
   2301  *	  "anon_map" lock to prevent races which can occur if
   2302  *	  a process has multiple lwps in its address space.
   2303  */
   2304 page_t *
   2305 anon_private(
   2306 	struct anon **app,
   2307 	struct seg *seg,
   2308 	caddr_t addr,
   2309 	uint_t	prot,
   2310 	page_t *opp,
   2311 	int oppflags,
   2312 	struct cred *cred)
   2313 {
   2314 	struct anon *old = *app;
   2315 	struct anon *new;
   2316 	page_t *pp = NULL;
   2317 	struct vnode *vp;
   2318 	anoff_t off;
   2319 	page_t *anon_pl[1 + 1];
   2320 	int err;
   2321 
   2322 	if (oppflags & STEAL_PAGE)
   2323 		ASSERT(PAGE_EXCL(opp));
   2324 	else
   2325 		ASSERT(PAGE_LOCKED(opp));
   2326 
   2327 	CPU_STATS_ADD_K(vm, cow_fault, 1);
   2328 
   2329 	/* Kernel probe */
   2330 	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
   2331 		tnf_opaque,	address,	addr);
   2332 
   2333 	*app = new = anon_alloc(NULL, 0);
   2334 	swap_xlate(new, &vp, &off);
   2335 
   2336 	if (oppflags & STEAL_PAGE) {
   2337 		page_rename(opp, vp, (u_offset_t)off);
   2338 		pp = opp;
   2339 		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
   2340 		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
   2341 		    seg, addr, pp, vp, off);
   2342 		hat_setmod(pp);
   2343 
   2344 		/* bug 4026339 */
   2345 		page_downgrade(pp);
   2346 		return (pp);
   2347 	}
   2348 
   2349 	/*
   2350 	 * Call the VOP_GETPAGE routine to create the page, thereby
   2351 	 * enabling the vnode driver to allocate any filesystem
   2352 	 * space (e.g., disk block allocation for UFS).  This also
   2353 	 * prevents more than one page from being added to the
   2354 	 * vnode at the same time.
   2355 	 */
   2356 	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
   2357 	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
   2358 	if (err)
   2359 		goto out;
   2360 
   2361 	pp = anon_pl[0];
   2362 
   2363 	/*
   2364 	 * If the original page was locked, we need to move the lock
   2365 	 * to the new page by transfering 'cowcnt/lckcnt' of the original
   2366 	 * page to 'cowcnt/lckcnt' of the new page.
   2367 	 *
   2368 	 * See Statement at the beginning of segvn_lockop() and
   2369 	 * comments in page_pp_useclaim() regarding the way
   2370 	 * cowcnts/lckcnts are handled.
   2371 	 *
   2372 	 * Also availrmem must be decremented up front for read only mapping
   2373 	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
   2374 	 * if availrmem did not need to be decremented after all.
   2375 	 */
   2376 	if (oppflags & LOCK_PAGE) {
   2377 		if ((prot & PROT_WRITE) == 0) {
   2378 			mutex_enter(&freemem_lock);
   2379 			if (availrmem > pages_pp_maximum) {
   2380 				availrmem--;
   2381 				pages_useclaim++;
   2382 			} else {
   2383 				mutex_exit(&freemem_lock);
   2384 				goto out;
   2385 			}
   2386 			mutex_exit(&freemem_lock);
   2387 		}
   2388 		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
   2389 	}
   2390 
   2391 	/*
   2392 	 * Now copy the contents from the original page,
   2393 	 * which is locked and loaded in the MMU by
   2394 	 * the caller to prevent yet another page fault.
   2395 	 */
   2396 	/* XXX - should set mod bit in here */
   2397 	if (ppcopy(opp, pp) == 0) {
   2398 		/*
   2399 		 * Before ppcopy could hanlde UE or other faults, we
   2400 		 * would have panicked here, and still have no option
   2401 		 * but to do so now.
   2402 		 */
   2403 		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
   2404 		    (void *)opp, (void *)pp);
   2405 	}
   2406 
   2407 	hat_setrefmod(pp);		/* mark as modified */
   2408 
   2409 	/*
   2410 	 * Unload the old translation.
   2411 	 */
   2412 	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
   2413 
   2414 	/*
   2415 	 * Free unmapped, unmodified original page.
   2416 	 * or release the lock on the original page,
   2417 	 * otherwise the process will sleep forever in
   2418 	 * anon_decref() waiting for the "exclusive" lock
   2419 	 * on the page.
   2420 	 */
   2421 	(void) page_release(opp, 1);
   2422 
   2423 	/*
   2424 	 * we are done with page creation so downgrade the new
   2425 	 * page's selock to shared, this helps when multiple
   2426 	 * as_fault(...SOFTLOCK...) are done to the same
   2427 	 * page(aio)
   2428 	 */
   2429 	page_downgrade(pp);
   2430 
   2431 	/*
   2432 	 * NOTE:  The original anon slot must be freed by the
   2433 	 * caller while holding the "anon_map" lock, if we
   2434 	 * copied away from an anonymous page.
   2435 	 */
   2436 	return (pp);
   2437 
   2438 out:
   2439 	*app = old;
   2440 	if (pp)
   2441 		page_unlock(pp);
   2442 	anon_decref(new);
   2443 	page_unlock(opp);
   2444 	return ((page_t *)NULL);
   2445 }
   2446 
   2447 int
   2448 anon_map_privatepages(
   2449 	struct anon_map *amp,
   2450 	ulong_t	start_idx,
   2451 	uint_t	szc,
   2452 	struct seg *seg,
   2453 	caddr_t addr,
   2454 	uint_t	prot,
   2455 	page_t	*ppa[],
   2456 	struct vpage vpage[],
   2457 	int anypgsz,
   2458 	int pgflags,
   2459 	struct cred *cred)
   2460 {
        	/*
        	 * Replace the pgcnt pages of the large page region starting at
        	 * anon index start_idx with private copies.
        	 *
        	 * On entry ppa[] holds the original pages.  For each of them a
        	 * new anon slot is allocated, a page is created for it with
        	 * swap_getconpage(S_CREATE), the original contents are copied
        	 * with ppcopy(), the original page is unlocked, the old slot (if
        	 * any) loses one reference and the new slot is stored in
        	 * amp->ahp.  On success ppa[] holds the new pages, downgraded
        	 * from their exclusive lock.
        	 *
        	 * Returns:
        	 *	0	copies were made, or no copy was needed because
        	 *		the first slot's refcnt is 1 and ppa[0] already
        	 *		has size code szc.
        	 *	-1	the first slot's refcnt is 1 but ppa[0] is smaller
        	 *		than szc; all ppa[] pages have been unlocked.
        	 *	ENOMEM	availrmem could not be reserved for page-locked
        	 *		pages; the non-NULL ppa[] pages have been unlocked.
        	 */
   2461 	pgcnt_t		pgcnt;
   2462 	struct vnode	*vp;
   2463 	anoff_t		off;
   2464 	page_t		*pl[2], *conpp = NULL;
   2465 	int		err;
   2466 	int		prealloc = 1;
   2467 	struct anon	*ap, *oldap;
   2468 	caddr_t		vaddr;
   2469 	page_t		*pplist, *pp;
   2470 	ulong_t		pg_idx, an_idx;
   2471 	spgcnt_t	nreloc = 0;
   2472 	int		pagelock = 0;
   2473 	kmutex_t	*ahmpages = NULL;
   2474 #ifdef DEBUG
        	/* refcnt of the first slot, used only by the ASSERTs below */
   2475 	int		refcnt;
   2476 #endif
   2477 
   2478 	ASSERT(szc != 0);
   2479 	ASSERT(szc == seg->s_szc);
   2480 
   2481 	VM_STAT_ADD(anonvmstats.privatepages[0]);
   2482 
   2483 	pgcnt = page_get_pagecnt(szc);
   2484 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   2485 	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
   2486 
   2487 	ASSERT(amp != NULL);
   2488 	ap = anon_get_ptr(amp->ahp, start_idx);
   2489 	ASSERT(ap == NULL || ap->an_refcnt >= 1);
   2490 
   2491 	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
   2492 
   2493 	/*
   2494 	 * Now try to allocate the large page. If we fail then just
   2495 	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
   2496 	 * the caller make this decision but to avoid added complexity
   2497 	 * it's simpler to handle that case here.
   2498 	 */
        	/* anypgsz == -1 means: do not try to allocate a large page */
   2499 	if (anypgsz == -1) {
   2500 		VM_STAT_ADD(anonvmstats.privatepages[2]);
   2501 		prealloc = 0;
   2502 	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
   2503 	    anypgsz, pgflags) != 0) {
   2504 		VM_STAT_ADD(anonvmstats.privatepages[3]);
   2505 		prealloc = 0;
   2506 	}
   2507 
   2508 	/*
   2509 	 * make the decrement of all refcnts of all
   2510 	 * anon slots of a large page appear atomic by
   2511 	 * getting an anonpages_hash_lock for the
   2512 	 * first anon slot of a large page.
   2513 	 */
   2514 	if (ap != NULL) {
   2515 		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
   2516 		    ap->an_off)];
   2517 		mutex_enter(ahmpages);
        		/*
        		 * The first slot has a single reference, so no copy is
        		 * made: give back the preallocated page and either keep
        		 * the caller's pages (already size szc) or unlock them
        		 * and return -1.
        		 */
   2518 		if (ap->an_refcnt == 1) {
   2519 			VM_STAT_ADD(anonvmstats.privatepages[4]);
   2520 			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
   2521 			mutex_exit(ahmpages);
   2522 
   2523 			if (prealloc) {
   2524 				page_free_replacement_page(pplist);
   2525 				page_create_putback(pgcnt);
   2526 			}
   2527 			ASSERT(ppa[0]->p_szc <= szc);
   2528 			if (ppa[0]->p_szc == szc) {
   2529 				VM_STAT_ADD(anonvmstats.privatepages[5]);
   2530 				return (0);
   2531 			}
   2532 			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
   2533 				ASSERT(ppa[pg_idx] != NULL);
   2534 				page_unlock(ppa[pg_idx]);
   2535 			}
   2536 			return (-1);
   2537 		}
   2538 	}
   2539 
   2540 	/*
   2541 	 * If we are passed in the vpage array and this is
   2542 	 * not PROT_WRITE then we need to decrement availrmem
   2543 	 * up front before we try anything. If we need to and
   2544 	 * can't decrement availrmem then it's better to fail now
   2545 	 * than in the middle of processing the new large page.
   2546 	 * page_pp_useclaim() on behalf of each constituent page
   2547 	 * below will adjust availrmem back for the cases not needed.
   2548 	 */
   2549 	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
        		/* is at least one page of the region page-locked? */
   2550 		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
   2551 			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
   2552 				pagelock = 1;
   2553 				break;
   2554 			}
   2555 		}
   2556 		if (pagelock) {
   2557 			VM_STAT_ADD(anonvmstats.privatepages[6]);
   2558 			mutex_enter(&freemem_lock);
   2559 			if (availrmem >= pages_pp_maximum + pgcnt) {
   2560 				availrmem -= pgcnt;
   2561 				pages_useclaim += pgcnt;
   2562 			} else {
        				/*
        				 * Cannot reserve: release everything held
        				 * so far and fail with ENOMEM.
        				 */
   2563 				VM_STAT_ADD(anonvmstats.privatepages[7]);
   2564 				mutex_exit(&freemem_lock);
   2565 				if (ahmpages != NULL) {
   2566 					mutex_exit(ahmpages);
   2567 				}
   2568 				if (prealloc) {
   2569 					page_free_replacement_page(pplist);
   2570 					page_create_putback(pgcnt);
   2571 				}
   2572 				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
   2573 					if (ppa[pg_idx] != NULL)
   2574 						page_unlock(ppa[pg_idx]);
   2575 				return (ENOMEM);
   2576 			}
   2577 			mutex_exit(&freemem_lock);
   2578 		}
   2579 	}
   2580 
   2581 	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
   2582 
   2583 	VM_STAT_ADD(anonvmstats.privatepages[8]);
   2584 
   2585 	an_idx = start_idx;
   2586 	pg_idx = 0;
   2587 	vaddr = addr;
   2588 	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
   2589 		ASSERT(ppa[pg_idx] != NULL);
   2590 		oldap = anon_get_ptr(amp->ahp, an_idx);
        		/*
        		 * Either every slot of the region exists (and ahmpages is
        		 * held) or none does.  Under DEBUG the first iteration
        		 * records the first slot's refcnt and later iterations
        		 * check that every slot has the same count.
        		 */
   2591 		ASSERT(ahmpages != NULL || oldap == NULL);
   2592 		ASSERT(ahmpages == NULL || oldap != NULL);
   2593 		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
   2594 		ASSERT(ahmpages == NULL || pg_idx != 0 ||
   2595 		    (refcnt = oldap->an_refcnt));
   2596 		ASSERT(ahmpages == NULL || pg_idx == 0 ||
   2597 		    refcnt == oldap->an_refcnt);
   2598 
   2599 		ap = anon_alloc(NULL, 0);
   2600 
   2601 		swap_xlate(ap, &vp, &off);
   2602 
   2603 		/*
   2604 		 * Now setup our preallocated page to pass down to
   2605 		 * swap_getpage().
   2606 		 */
   2607 		if (prealloc) {
   2608 			pp = pplist;
   2609 			page_sub(&pplist, pp);
   2610 			conpp = pp;
   2611 		}
   2612 
   2613 		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
   2614 		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
   2615 		    S_CREATE, cred);
   2616 
   2617 		/*
   2618 		 * Impossible to fail this is S_CREATE.
   2619 		 */
   2620 		if (err)
   2621 			panic("anon_map_privatepages: VOP_GETPAGE failed");
   2622 
   2623 		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
   2624 		ASSERT(prealloc == 0 || nreloc == 1);
   2625 
   2626 		pp = pl[0];
   2627 
   2628 		/*
   2629 		 * If the original page was locked, we need to move
   2630 		 * the lock to the new page by transferring
   2631 		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
   2632 		 * of the new page. pg_idx can be used to index
   2633 		 * into the vpage array since the caller will guarantee
   2634 		 * that vpage struct passed in corresponds to addr
   2635 		 * and forward.
   2636 		 */
   2637 		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
   2638 			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
   2639 		} else if (pagelock) {
        			/*
        			 * availrmem was reserved above for all pgcnt
        			 * pages; this page is not locked, so give its
        			 * share back.
        			 */
   2640 			mutex_enter(&freemem_lock);
   2641 			availrmem++;
   2642 			pages_useclaim--;
   2643 			mutex_exit(&freemem_lock);
   2644 		}
   2645 
   2646 		/*
   2647 		 * Now copy the contents from the original page.
   2648 		 */
   2649 		if (ppcopy(ppa[pg_idx], pp) == 0) {
   2650 			/*
   2651 			 * Before ppcopy could handle UE or other faults, we
   2652 			 * would have panicked here, and still have no option
   2653 			 * but to do so now.
   2654 			 */
   2655 			panic("anon_map_privatepages, ppcopy failed");
   2656 		}
   2657 
   2658 		hat_setrefmod(pp);		/* mark as modified */
   2659 
   2660 		/*
   2661 		 * Release the lock on the original page,
   2662 		 * decrement the old slot, and downgrade the lock
   2663 		 * on the new copy.
   2664 		 */
   2665 		page_unlock(ppa[pg_idx]);
   2666 
        		/*
        		 * Preallocated pages are downgraded together after the
        		 * loop, once the whole large page has been filled in.
        		 */
   2667 		if (!prealloc)
   2668 			page_downgrade(pp);
   2669 
   2670 		ppa[pg_idx] = pp;
   2671 
   2672 		/*
   2673 		 * Now reflect the copy in the new anon array.
   2674 		 */
   2675 		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
   2676 		if (oldap != NULL)
   2677 			anon_decref(oldap);
   2678 		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
   2679 	}
   2680 
   2681 	/*
   2682 	 * Unload the old large page translation.
   2683 	 */
   2684 	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
   2685 
   2686 	if (ahmpages != NULL) {
   2687 		mutex_exit(ahmpages);
   2688 	}
   2689 	ASSERT(prealloc == 0 || pplist == NULL);
   2690 	if (prealloc) {
   2691 		VM_STAT_ADD(anonvmstats.privatepages[9]);
   2692 		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
   2693 			page_downgrade(ppa[pg_idx]);
   2694 		}
   2695 	}
   2696 
   2697 	return (0);
   2698 }
   2699 
   2700 /*
   2701  * Allocate a private zero-filled anon page.
   2702  */
   2703 page_t *
   2704 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
   2705 {
   2706 	struct anon *ap;
   2707 	page_t *pp;
   2708 	struct vnode *vp;
   2709 	anoff_t off;
   2710 	page_t *anon_pl[1 + 1];
   2711 	int err;
   2712 
   2713 	/* Kernel probe */
   2714 	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
   2715 		tnf_opaque,	address,	addr);
   2716 
   2717 	*app = ap = anon_alloc(NULL, 0);
   2718 	swap_xlate(ap, &vp, &off);
   2719 
   2720 	/*
   2721 	 * Call the VOP_GETPAGE routine to create the page, thereby
   2722 	 * enabling the vnode driver to allocate any filesystem
   2723 	 * dependent structures (e.g., disk block allocation for UFS).
   2724 	 * This also prevents more than on page from being added to
   2725 	 * the vnode at the same time since it is locked.
   2726 	 */
   2727 	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
   2728 	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
   2729 	if (err) {
   2730 		*app = NULL;
   2731 		anon_decref(ap);
   2732 		return (NULL);
   2733 	}
   2734 	pp = anon_pl[0];
   2735 
   2736 	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
   2737 	page_downgrade(pp);
   2738 	CPU_STATS_ADD_K(vm, zfod, 1);
   2739 	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
   2740 	return (pp);
   2741 }
   2742 
   2743 
   2744 /*
   2745  * Allocate array of private zero-filled anon pages for empty slots
   2746  * and kept pages for non empty slots within given range.
   2747  *
   2748  * NOTE: This routine will try to use large pages
   2749  *	if available and supported by the underlying platform.
   2750  */
   2751 int
   2752 anon_map_createpages(
   2753 	struct anon_map *amp,
   2754 	ulong_t start_index,
   2755 	size_t len,
   2756 	page_t *ppa[],
   2757 	struct seg *seg,
   2758 	caddr_t addr,
   2759 	enum seg_rw rw,
   2760 	struct cred *cred)
   2761 {
   2762 
   2763 	struct anon	*ap;
   2764 	struct vnode	*ap_vp;
   2765 	page_t		*pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
   2766 	int		err = 0;
   2767 	ulong_t		p_index, index;
   2768 	pgcnt_t		npgs, pg_cnt;
   2769 	spgcnt_t	nreloc = 0;
   2770 	uint_t		l_szc, szc, prot;
   2771 	anoff_t		ap_off;
   2772 	size_t		pgsz;
   2773 	lgrp_t		*lgrp;
   2774 	kmutex_t	*ahm;
   2775 
   2776 	/*
   2777 	 * XXX For now only handle S_CREATE.
   2778 	 */
   2779 	ASSERT(rw == S_CREATE);
   2780 
   2781 	index	= start_index;
   2782 	p_index	= 0;
   2783 	npgs = btopr(len);
   2784 
   2785 	/*
   2786 	 * If this platform supports multiple page sizes
   2787 	 * then try and allocate directly from the free
   2788 	 * list for pages larger than PAGESIZE.
   2789 	 *
   2790 	 * NOTE:When we have page_create_ru we can stop
   2791 	 *	directly allocating from the freelist.
   2792 	 */
   2793 	l_szc  = seg->s_szc;
   2794 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   2795 	while (npgs) {
   2796 
   2797 		/*
   2798 		 * if anon slot already exists
   2799 		 *   (means page has been created)
   2800 		 * so 1) look up the page
   2801 		 *    2) if the page is still in memory, get it.
   2802 		 *    3) if not, create a page and
   2803 		 *	  page in from physical swap device.
   2804 		 * These are done in anon_getpage().
   2805 		 */
   2806 		ap = anon_get_ptr(amp->ahp, index);
   2807 		if (ap) {
   2808 			err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
   2809 			    seg, addr, S_READ, cred);
   2810 			if (err) {
   2811 				ANON_LOCK_EXIT(&amp->a_rwlock);
   2812 				panic("anon_map_createpages: anon_getpage");
   2813 			}
   2814 			pp = anon_pl[0];
   2815 			ppa[p_index++] = pp;
   2816 
   2817 			/*
   2818 			 * an_pvp can become non-NULL after SysV's page was
   2819 			 * paged out before ISM was attached to this SysV
   2820 			 * shared memory segment. So free swap slot if needed.
   2821 			 */
   2822 			if (ap->an_pvp != NULL) {
   2823 				page_io_lock(pp);
   2824 				ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
   2825 				    ap->an_off)];
   2826 				mutex_enter(ahm);
   2827 				if (ap->an_pvp != NULL) {
   2828 					swap_phys_free(ap->an_pvp,
   2829 					    ap->an_poff, PAGESIZE);
   2830 					ap->an_pvp = NULL;
   2831 					ap->an_poff = 0;
   2832 					mutex_exit(ahm);
   2833 					hat_setmod(pp);
   2834 				} else {
   2835 					mutex_exit(ahm);
   2836 				}
   2837 				page_io_unlock(pp);
   2838 			}
   2839 
   2840 			addr += PAGESIZE;
   2841 			index++;
   2842 			npgs--;
   2843 			continue;
   2844 		}
   2845 		/*
   2846 		 * Now try and allocate the largest page possible
   2847 		 * for the current address and range.
   2848 		 * Keep dropping down in page size until:
   2849 		 *
   2850 		 *	1) Properly aligned
   2851 		 *	2) Does not overlap existing anon pages
   2852 		 *	3) Fits in remaining range.
   2853 		 *	4) able to allocate one.
   2854 		 *
   2855 		 * NOTE: XXX When page_create_ru is completed this code
   2856 		 *	 will change.
   2857 		 */
   2858 		szc    = l_szc;
   2859 		pplist = NULL;
   2860 		pg_cnt = 0;
   2861 		while (szc) {
   2862 			pgsz	= page_get_pagesize(szc);
   2863 			pg_cnt	= pgsz >> PAGESHIFT;
   2864 			if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
   2865 			    anon_pages(amp->ahp, index, pg_cnt) == 0) {
   2866 				/*
   2867 				 * XXX
   2868 				 * Since we are faking page_create()
   2869 				 * we also need to do the freemem and
   2870 				 * pcf accounting.
   2871 				 */
   2872 				(void) page_create_wait(pg_cnt, PG_WAIT);
   2873 
   2874 				/*
   2875 				 * Get lgroup to allocate next page of shared
   2876 				 * memory from and use it to specify where to
   2877 				 * allocate the physical memory
   2878 				 */
   2879 				lgrp = lgrp_mem_choose(seg, addr, pgsz);
   2880 
   2881 				pplist = page_get_freelist(
   2882 				    anon_vp, (u_offset_t)0, seg,
   2883 				    addr, pgsz, 0, lgrp);
   2884 
   2885 				if (pplist == NULL) {
   2886 					page_create_putback(pg_cnt);
   2887 				}
   2888 
   2889 				/*
   2890 				 * If a request for a page of size
   2891 				 * larger than PAGESIZE failed
   2892 				 * then don't try that size anymore.
   2893 				 */
   2894 				if (pplist == NULL) {
   2895 					l_szc = szc - 1;
   2896 				} else {
   2897 					break;
   2898 				}
   2899 			}
   2900 			szc--;
   2901 		}
   2902 
   2903 		/*
   2904 		 * If just using PAGESIZE pages then don't
   2905 		 * directly allocate from the free list.
   2906 		 */
   2907 		if (pplist == NULL) {
   2908 			ASSERT(szc == 0);
   2909 			pp = anon_zero(seg, addr, &ap, cred);
   2910 			if (pp == NULL) {
   2911 				ANON_LOCK_EXIT(&amp->a_rwlock);
   2912 				panic("anon_map_createpages: anon_zero");
   2913 			}
   2914 			ppa[p_index++] = pp;
   2915 
   2916 			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
   2917 			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
   2918 
   2919 			addr += PAGESIZE;
   2920 			index++;
   2921 			npgs--;
   2922 			continue;
   2923 		}
   2924 
   2925 		/*
   2926 		 * pplist is a list of pg_cnt PAGESIZE pages.
   2927 		 * These pages are locked SE_EXCL since they
   2928 		 * came directly off the free list.
   2929 		 */
   2930 		ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
   2931 		ASSERT(IS_P2ALIGNED(index, pg_cnt));
   2932 		ASSERT(conpp == NULL);
   2933 		while (pg_cnt--) {
   2934 
   2935 			ap = anon_alloc(NULL, 0);
   2936 			swap_xlate(ap, &ap_vp, &ap_off);
   2937 
   2938 			ASSERT(pplist != NULL);
   2939 			pp = pplist;
   2940 			page_sub(&pplist, pp);
   2941 			PP_CLRFREE(pp);
   2942 			PP_CLRAGED(pp);
   2943 			conpp = pp;
   2944 
   2945 			err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
   2946 			    (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
   2947 			    &nreloc, seg, addr, S_CREATE, cred);
   2948 
   2949 			if (err) {
   2950 				ANON_LOCK_EXIT(&amp->a_rwlock);
   2951 				panic("anon_map_createpages: S_CREATE");
   2952 			}
   2953 
   2954 			ASSERT(anon_pl[0] == pp);
   2955 			ASSERT(nreloc == 1);
   2956 			pagezero(pp, 0, PAGESIZE);
   2957 			CPU_STATS_ADD_K(vm, zfod, 1);
   2958 			hat_setrefmod(pp);
   2959 
   2960 			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
   2961 			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
   2962 
   2963 			ppa[p_index++] = pp;
   2964 
   2965 			addr += PAGESIZE;
   2966 			index++;
   2967 			npgs--;
   2968 		}
   2969 		conpp = NULL;
   2970 		pg_cnt	= pgsz >> PAGESHIFT;
   2971 		p_index = p_index - pg_cnt;
   2972 		while (pg_cnt--) {
   2973 			page_downgrade(ppa[p_index++]);
   2974 		}
   2975 	}
   2976 	ANON_LOCK_EXIT(&amp->a_rwlock);
   2977 	return (0);
   2978 }
   2979 
   2980 static int
   2981 anon_try_demote_pages(
   2982 	struct anon_hdr *ahp,
   2983 	ulong_t sidx,
   2984 	uint_t szc,
   2985 	page_t **ppa,
   2986 	int private)
   2987 {
   2988 	struct anon	*ap;
   2989 	pgcnt_t		pgcnt = page_get_pagecnt(szc);
   2990 	page_t		*pp;
   2991 	pgcnt_t		i;
   2992 	kmutex_t	*ahmpages = NULL;
   2993 	int		root = 0;
   2994 	pgcnt_t		npgs;
   2995 	pgcnt_t		curnpgs = 0;
   2996 	size_t		ppasize = 0;
   2997 
   2998 	ASSERT(szc != 0);
   2999 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   3000 	ASSERT(IS_P2ALIGNED(sidx, pgcnt));
   3001 	ASSERT(sidx < ahp->size);
   3002 
   3003 	if (ppa == NULL) {
   3004 		ppasize = pgcnt * sizeof (page_t *);
   3005 		ppa = kmem_alloc(ppasize, KM_SLEEP);
   3006 	}
   3007 
   3008 	ap = anon_get_ptr(ahp, sidx);
   3009 	if (ap != NULL && private) {
   3010 		VM_STAT_ADD(anonvmstats.demotepages[1]);
   3011 		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   3012 		mutex_enter(ahmpages);
   3013 	}
   3014 
   3015 	if (ap != NULL && ap->an_refcnt > 1) {
   3016 		if (ahmpages != NULL) {
   3017 			VM_STAT_ADD(anonvmstats.demotepages[2]);
   3018 			mutex_exit(ahmpages);
   3019 		}
   3020 		if (ppasize != 0) {
   3021 			kmem_free(ppa, ppasize);
   3022 		}
   3023 		return (0);
   3024 	}
   3025 	if (ahmpages != NULL) {
   3026 		mutex_exit(ahmpages);
   3027 	}
   3028 	if (ahp->size - sidx < pgcnt) {
   3029 		ASSERT(private == 0);
   3030 		pgcnt = ahp->size - sidx;
   3031 	}
   3032 	for (i = 0; i < pgcnt; i++, sidx++) {
   3033 		ap = anon_get_ptr(ahp, sidx);
   3034 		if (ap != NULL) {
   3035 			if (ap->an_refcnt != 1) {
   3036 				panic("anon_try_demote_pages: an_refcnt != 1");
   3037 			}
   3038 			pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
   3039 			    SE_EXCL);
   3040 			if (pp != NULL) {
   3041 				(void) hat_pageunload(pp,
   3042 				    HAT_FORCE_PGUNLOAD);
   3043 			}
   3044 		} else {
   3045 			ppa[i] = NULL;
   3046 		}
   3047 	}
   3048 	for (i = 0; i < pgcnt; i++) {
   3049 		if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
   3050 			ASSERT(pp->p_szc <= szc);
   3051 			if (!root) {
   3052 				VM_STAT_ADD(anonvmstats.demotepages[3]);
   3053 				if (curnpgs != 0)
   3054 					panic("anon_try_demote_pages: "
   3055 					    "bad large page");
   3056 
   3057 				root = 1;
   3058 				curnpgs = npgs =
   3059 				    page_get_pagecnt(pp->p_szc);
   3060 
   3061 				ASSERT(npgs <= pgcnt);
   3062 				ASSERT(IS_P2ALIGNED(npgs, npgs));
   3063 				ASSERT(!(page_pptonum(pp) & (npgs - 1)));
   3064 			} else {
   3065 				ASSERT(i > 0);
   3066 				ASSERT(page_pptonum(pp) - 1 ==
   3067 				    page_pptonum(ppa[i - 1]));
   3068 				if ((page_pptonum(pp) & (npgs - 1)) ==
   3069 				    npgs - 1)
   3070 					root = 0;
   3071 			}
   3072 			ASSERT(PAGE_EXCL(pp));
   3073 			pp->p_szc = 0;
   3074 			ASSERT(curnpgs > 0);
   3075 			curnpgs--;
   3076 		}
   3077 	}
   3078 	if (root != 0 || curnpgs != 0)
   3079 		panic("anon_try_demote_pages: bad large page");
   3080 
   3081 	for (i = 0; i < pgcnt; i++) {
   3082 		if ((pp = ppa[i]) != NULL) {
   3083 			ASSERT(!hat_page_is_mapped(pp));
   3084 			ASSERT(pp->p_szc == 0);
   3085 			page_unlock(pp);
   3086 		}
   3087 	}
   3088 	if (ppasize != 0) {
   3089 		kmem_free(ppa, ppasize);
   3090 	}
   3091 	return (1);
   3092 }
   3093 
   3094 /*
   3095  * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
   3096  */
   3097 int
   3098 anon_map_demotepages(
   3099 	struct anon_map *amp,
   3100 	ulong_t	start_idx,
   3101 	struct seg *seg,
   3102 	caddr_t addr,
   3103 	uint_t prot,
   3104 	struct vpage vpage[],
   3105 	struct cred *cred)
   3106 {
   3107 	struct anon	*ap;
   3108 	uint_t		szc = seg->s_szc;
   3109 	pgcnt_t		pgcnt = page_get_pagecnt(szc);
   3110 	size_t		ppasize = pgcnt * sizeof (page_t *);
   3111 	page_t		**ppa = kmem_alloc(ppasize, KM_SLEEP);
   3112 	page_t		*pp;
   3113 	page_t		*pl[2];
   3114 	pgcnt_t		i, pg_idx;
   3115 	ulong_t		an_idx;
   3116 	caddr_t		vaddr;
   3117 	int 		err;
   3118 	int		retry = 0;
   3119 	uint_t		vpprot;
   3120 
   3121 	ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
   3122 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   3123 	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
   3124 	ASSERT(ppa != NULL);
   3125 	ASSERT(szc != 0);
   3126 	ASSERT(szc == amp->a_szc);
   3127 
   3128 	VM_STAT_ADD(anonvmstats.demotepages[0]);
   3129 
   3130 top:
   3131 	if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
   3132 		kmem_free(ppa, ppasize);
   3133 		return (0);
   3134 	}
   3135 
   3136 	VM_STAT_ADD(anonvmstats.demotepages[4]);
   3137 
   3138 	ASSERT(retry == 0); /* we can be here only once */
   3139 
   3140 	vaddr = addr;
   3141 	for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
   3142 	    pg_idx++, an_idx++, vaddr += PAGESIZE) {
   3143 		ap = anon_get_ptr(amp->ahp, an_idx);
   3144 		if (ap == NULL)
   3145 			panic("anon_map_demotepages: no anon slot");
   3146 		err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
   3147 		    S_READ, cred);
   3148 		if (err) {
   3149 			for (i = 0; i < pg_idx; i++) {
   3150 				if ((pp = ppa[i]) != NULL)
   3151 					page_unlock(pp);
   3152 			}
   3153 			kmem_free(ppa, ppasize);
   3154 			return (err);
   3155 		}
   3156 		ppa[pg_idx] = pl[0];
   3157 	}
   3158 
   3159 	err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
   3160 	    vpage, -1, 0, cred);
   3161 	if (err > 0) {
   3162 		VM_STAT_ADD(anonvmstats.demotepages[5]);
   3163 		kmem_free(ppa, ppasize);
   3164 		return (err);
   3165 	}
   3166 	ASSERT(err == 0 || err == -1);
   3167 	if (err == -1) {
   3168 		VM_STAT_ADD(anonvmstats.demotepages[6]);
   3169 		retry = 1;
   3170 		goto top;
   3171 	}
   3172 	for (i = 0; i < pgcnt; i++) {
   3173 		ASSERT(ppa[i] != NULL);
   3174 		if (ppa[i]->p_szc != 0)
   3175 			retry = 1;
   3176 		page_unlock(ppa[i]);
   3177 	}
   3178 	if (retry) {
   3179 		VM_STAT_ADD(anonvmstats.demotepages[7]);
   3180 		goto top;
   3181 	}
   3182 
   3183 	VM_STAT_ADD(anonvmstats.demotepages[8]);
   3184 
   3185 	kmem_free(ppa, ppasize);
   3186 
   3187 	return (0);
   3188 }
   3189 
   3190 /*
   3191  * Free pages of shared anon map. It's assumed that anon maps don't share anon
   3192  * structures with private anon maps. Therefore all anon structures should
   3193  * have at most one reference at this point. This means underlying pages can
   3194  * be exclusively locked and demoted or freed.  If not freeing the entire
   3195  * large pages demote the ends of the region we free to be able to free
   3196  * subpages. Page roots correspond to aligned index positions in anon map.
   3197  */
   3198 void
   3199 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
   3200 {
   3201 	ulong_t eidx = sidx + btopr(len);
   3202 	pgcnt_t pages = page_get_pagecnt(amp->a_szc);
   3203 	struct anon_hdr *ahp = amp->ahp;
   3204 	ulong_t tidx;
   3205 	size_t size;
   3206 	ulong_t sidx_aligned;
   3207 	ulong_t eidx_aligned;
   3208 
   3209 	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
   3210 	ASSERT(amp->refcnt <= 1);
   3211 	ASSERT(amp->a_szc > 0);
   3212 	ASSERT(eidx <= ahp->size);
   3213 	ASSERT(!anon_share(ahp, sidx, btopr(len)));
   3214 
   3215 	if (len == 0) {	/* XXX */
   3216 		return;
   3217 	}
   3218 
   3219 	sidx_aligned = P2ALIGN(sidx, pages);
   3220 	if (sidx_aligned != sidx ||
   3221 	    (eidx < sidx_aligned + pages && eidx < ahp->size)) {
   3222 		if (!anon_try_demote_pages(ahp, sidx_aligned,
   3223 		    amp->a_szc, NULL, 0)) {
   3224 			panic("anon_shmap_free_pages: demote failed");
   3225 		}
   3226 		size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
   3227 		    P2NPHASE(sidx, pages);
   3228 		size <<= PAGESHIFT;
   3229 		anon_free(ahp, sidx, size);
   3230 		sidx = sidx_aligned + pages;
   3231 		if (eidx <= sidx) {
   3232 			return;
   3233 		}
   3234 	}
   3235 	eidx_aligned = P2ALIGN(eidx, pages);
   3236 	if (sidx < eidx_aligned) {
   3237 		anon_free_pages(ahp, sidx,
   3238 		    (eidx_aligned - sidx) << PAGESHIFT,
   3239 		    amp->a_szc);
   3240 		sidx = eidx_aligned;
   3241 	}
   3242 	ASSERT(sidx == eidx_aligned);
   3243 	if (eidx == eidx_aligned) {
   3244 		return;
   3245 	}
   3246 	tidx = eidx;
   3247 	if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
   3248 	    tidx - sidx < pages) {
   3249 		if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
   3250 			panic("anon_shmap_free_pages: demote failed");
   3251 		}
   3252 		size = (eidx - sidx) << PAGESHIFT;
   3253 		anon_free(ahp, sidx, size);
   3254 	} else {
   3255 		anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
   3256 	}
   3257 }
   3258 
   3259 /*
   3260  * This routine should be called with amp's writer lock when there're no other
   3261  * users of amp.  All pcache entries of this amp must have been already
   3262  * inactivated. We must not drop a_rwlock here to prevent new users from
   3263  * attaching to this amp.
   3264  */
   3265 void
   3266 anonmap_purge(struct anon_map *amp)
   3267 {
   3268 	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
   3269 	ASSERT(amp->refcnt <= 1);
   3270 
   3271 	if (amp->a_softlockcnt != 0) {
   3272 		seg_ppurge(NULL, amp, 0);
   3273 	}
   3274 
   3275 	/*
   3276 	 * Since all pcache entries were already inactive before this routine
   3277 	 * was called seg_ppurge() couldn't return while there're still
   3278 	 * entries that can be found via the list anchored at a_phead. So we
   3279 	 * can assert this list is empty now. a_softlockcnt may be still non 0
   3280 	 * if asynchronous thread that manages pcache already removed pcache
   3281 	 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
   3282 	 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
   3283 	 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
   3284 	 * before shamp_reclaim() is done with it. a_purgemtx also taken by
   3285 	 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
   3286 	 * barrier that prevents anonmap_purge() to complete while
   3287 	 * shamp_reclaim() may still be referencing this amp.
   3288 	 */
   3289 	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
   3290 	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
   3291 
   3292 	mutex_enter(&amp->a_purgemtx);
   3293 	while (amp->a_softlockcnt != 0) {
   3294 		ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
   3295 		ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
   3296 		amp->a_purgewait = 1;
   3297 		cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
   3298 	}
   3299 	mutex_exit(&amp->a_purgemtx);
   3300 
   3301 	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
   3302 	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
   3303 	ASSERT(amp->a_softlockcnt == 0);
   3304 }
   3305 
   3306 /*
   3307  * Allocate and initialize an anon_map structure for seg
   3308  * associating the given swap reservation with the new anon_map.
   3309  */
   3310 struct anon_map *
   3311 anonmap_alloc(size_t size, size_t swresv, int flags)
   3312 {
   3313 	struct anon_map *amp;
   3314 	int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
   3315 
   3316 	amp = kmem_cache_alloc(anonmap_cache, kmflags);
   3317 	if (amp == NULL) {
   3318 		ASSERT(kmflags == KM_NOSLEEP);
   3319 		return (NULL);
   3320 	}
   3321 
   3322 	amp->ahp = anon_create(btopr(size), flags);
   3323 	if (amp->ahp == NULL) {
   3324 		ASSERT(flags == ANON_NOSLEEP);
   3325 		kmem_cache_free(anonmap_cache, amp);
   3326 		return (NULL);
   3327 	}
   3328 	amp->refcnt = 1;
   3329 	amp->size = size;
   3330 	amp->swresv = swresv;
   3331 	amp->locality = 0;
   3332 	amp->a_szc = 0;
   3333 	amp->a_sp = NULL;
   3334 	amp->a_softlockcnt = 0;
   3335 	amp->a_purgewait = 0;
   3336 	amp->a_phead.p_lnext = &amp->a_phead;
   3337 	amp->a_phead.p_lprev = &amp->a_phead;
   3338 
   3339 	return (amp);
   3340 }
   3341 
   3342 void
   3343 anonmap_free(struct anon_map *amp)
   3344 {
   3345 	ASSERT(amp->ahp != NULL);
   3346 	ASSERT(amp->refcnt == 0);
   3347 	ASSERT(amp->a_softlockcnt == 0);
   3348 	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
   3349 	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
   3350 
   3351 	lgrp_shm_policy_fini(amp, NULL);
   3352 	anon_release(amp->ahp, btopr(amp->size));
   3353 	kmem_cache_free(anonmap_cache, amp);
   3354 }
   3355 
   3356 /*
   3357  * Returns true if the app array has some empty slots.
   3358  * The offp and lenp parameters are in/out parameters.  On entry
   3359  * these values represent the starting offset and length of the
   3360  * mapping.  When true is returned, these values may be modified
   3361  * to be the largest range which includes empty slots.
   3362  */
   3363 int
   3364 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
   3365 				size_t *lenp)
   3366 {
   3367 	ulong_t i, el;
   3368 	ssize_t low, high;
   3369 	struct anon *ap;
   3370 
   3371 	low = -1;
   3372 	for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
   3373 		ap = anon_get_ptr(ahp, anon_idx);
   3374 		if (ap == NULL) {
   3375 			if (low == -1)
   3376 				low = i;
   3377 			high = i;
   3378 		}
   3379 	}
   3380 	if (low != -1) {
   3381 		/*
   3382 		 * Found at least one non-anon page.
   3383 		 * Set up the off and len return values.
   3384 		 */
   3385 		if (low != 0)
   3386 			*offp += low;
   3387 		*lenp = high - low + PAGESIZE;
   3388 		return (1);
   3389 	}
   3390 	return (0);
   3391 }
   3392 
   3393 /*
   3394  * Return a count of the number of existing anon pages in the anon array
   3395  * app in the range (off, off+len). The array and slots must be guaranteed
   3396  * stable by the caller.
   3397  */
   3398 pgcnt_t
   3399 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
   3400 {
   3401 	pgcnt_t cnt = 0;
   3402 
   3403 	while (nslots-- > 0) {
   3404 		if ((anon_get_ptr(ahp, anon_index)) != NULL)
   3405 			cnt++;
   3406 		anon_index++;
   3407 	}
   3408 	return (cnt);
   3409 }
   3410 
   3411 /*
   3412  * Move reserved phys swap into memory swap (unreserve phys swap
   3413  * and reserve mem swap by the same amount).
   3414  * Used by segspt when it needs to lock reserved swap npages in memory
   3415  */
   3416 int
   3417 anon_swap_adjust(pgcnt_t npages)
   3418 {
   3419 	pgcnt_t unlocked_mem_swap;
   3420 
   3421 	mutex_enter(&anoninfo_lock);
   3422 
   3423 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
   3424 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
   3425 
   3426 	unlocked_mem_swap = k_anoninfo.ani_mem_resv
   3427 	    - k_anoninfo.ani_locked_swap;
   3428 	if (npages > unlocked_mem_swap) {
   3429 		spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
   3430 
   3431 		/*
   3432 		 * if there is not enough unlocked mem swap we take missing
   3433 		 * amount from phys swap and give it to mem swap
   3434 		 */
   3435 		if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
   3436 			mutex_exit(&anoninfo_lock);
   3437 			return (ENOMEM);
   3438 		}
   3439 
   3440 		k_anoninfo.ani_mem_resv += adjusted_swap;
   3441 		ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
   3442 		k_anoninfo.ani_phys_resv -= adjusted_swap;
   3443 
   3444 		ANI_ADD(adjusted_swap);
   3445 	}
   3446 	k_anoninfo.ani_locked_swap += npages;
   3447 
   3448 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
   3449 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
   3450 
   3451 	mutex_exit(&anoninfo_lock);
   3452 
   3453 	return (0);
   3454 }
   3455 
   3456 /*
   3457  * 'unlocked' reserved mem swap so when it is unreserved it
   3458  * can be moved back phys (disk) swap
   3459  */
   3460 void
   3461 anon_swap_restore(pgcnt_t npages)
   3462 {
   3463 	mutex_enter(&anoninfo_lock);
   3464 
   3465 	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
   3466 
   3467 	ASSERT(k_anoninfo.ani_locked_swap >= npages);
   3468 	k_anoninfo.ani_locked_swap -= npages;
   3469 
   3470 	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
   3471 
   3472 	mutex_exit(&anoninfo_lock);
   3473 }
   3474 
   3475 /*
   3476  * Return the pointer from the list for a
   3477  * specified anon index.
   3478  */
   3479 ulong_t *
   3480 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
   3481 {
   3482 	struct anon	**app;
   3483 	void 		**ppp;
   3484 
   3485 	ASSERT(an_idx < ahp->size);
   3486 
   3487 	/*
   3488 	 * Single level case.
   3489 	 */
   3490 	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
   3491 		return ((ulong_t *)&ahp->array_chunk[an_idx]);
   3492 	} else {
   3493 
   3494 		/*
   3495 		 * 2 level case.
   3496 		 */
   3497 		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
   3498 		if (*ppp == NULL) {
   3499 			mutex_enter(&ahp->serial_lock);
   3500 			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
   3501 			if (*ppp == NULL)
   3502 				*ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
   3503 			mutex_exit(&ahp->serial_lock);
   3504 		}
   3505 		app = *ppp;
   3506 		return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
   3507 	}
   3508 }
   3509 
   3510 void
   3511 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
   3512 {
   3513 	ulong_t		*ap_slot;
   3514 	kmutex_t	*mtx;
   3515 	kcondvar_t	*cv;
   3516 	int		hash;
   3517 
   3518 	/*
   3519 	 * Use szc to determine anon slot(s) to appear atomic.
   3520 	 * If szc = 0, then lock the anon slot and mark it busy.
   3521 	 * If szc > 0, then lock the range of slots by getting the
   3522 	 * anon_array_lock for the first anon slot, and mark only the
   3523 	 * first anon slot busy to represent whole range being busy.
   3524 	 */
   3525 
   3526 	ASSERT(RW_READ_HELD(&amp->a_rwlock));
   3527 	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
   3528 	hash = ANON_ARRAY_HASH(amp, an_idx);
   3529 	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
   3530 	sobj->sync_cv = cv = &anon_array_cv[hash];
   3531 	mutex_enter(mtx);
   3532 	ap_slot = anon_get_slot(amp->ahp, an_idx);
   3533 	while (ANON_ISBUSY(ap_slot))
   3534 		cv_wait(cv, mtx);
   3535 	ANON_SETBUSY(ap_slot);
   3536 	sobj->sync_data = ap_slot;
   3537 	mutex_exit(mtx);
   3538 }
   3539 
   3540 int
   3541 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
   3542 			anon_sync_obj_t *sobj)
   3543 {
   3544 	ulong_t		*ap_slot;
   3545 	kmutex_t	*mtx;
   3546 	int		hash;
   3547 
   3548 	/*
   3549 	 * Try to lock a range of anon slots.
   3550 	 * Use szc to determine anon slot(s) to appear atomic.
   3551 	 * If szc = 0, then lock the anon slot and mark it busy.
   3552 	 * If szc > 0, then lock the range of slots by getting the
   3553 	 * anon_array_lock for the first anon slot, and mark only the
   3554 	 * first anon slot busy to represent whole range being busy.
   3555 	 * Fail if the mutex or the anon_array are busy.
   3556 	 */
   3557 
   3558 	ASSERT(RW_READ_HELD(&amp->a_rwlock));
   3559 	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
   3560 	hash = ANON_ARRAY_HASH(amp, an_idx);
   3561 	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
   3562 	sobj->sync_cv = &anon_array_cv[hash];
   3563 	if (!mutex_tryenter(mtx)) {
   3564 		return (EWOULDBLOCK);
   3565 	}
   3566 	ap_slot = anon_get_slot(amp->ahp, an_idx);
   3567 	if (ANON_ISBUSY(ap_slot)) {
   3568 		mutex_exit(mtx);
   3569 		return (EWOULDBLOCK);
   3570 	}
   3571 	ANON_SETBUSY(ap_slot);
   3572 	sobj->sync_data = ap_slot;
   3573 	mutex_exit(mtx);
   3574 	return (0);
   3575 }
   3576 
   3577 void
   3578 anon_array_exit(anon_sync_obj_t *sobj)
   3579 {
   3580 	mutex_enter(sobj->sync_mutex);
   3581 	ASSERT(ANON_ISBUSY(sobj->sync_data));
   3582 	ANON_CLRBUSY(sobj->sync_data);
   3583 	if (CV_HAS_WAITERS(sobj->sync_cv))
   3584 		cv_broadcast(sobj->sync_cv);
   3585 	mutex_exit(sobj->sync_mutex);
   3586 }
   3587