Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 
     27 /*
     28  * VM - page locking primitives
     29  */
     30 #include <sys/param.h>
     31 #include <sys/t_lock.h>
     32 #include <sys/vtrace.h>
     33 #include <sys/debug.h>
     34 #include <sys/cmn_err.h>
     35 #include <sys/bitmap.h>
     36 #include <sys/lockstat.h>
     37 #include <sys/sysmacros.h>
     38 #include <sys/condvar_impl.h>
     39 #include <vm/page.h>
     40 #include <vm/seg_enum.h>
     41 #include <vm/vm_dep.h>
     42 #include <vm/seg_kmem.h>
     43 
/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 *
 * Only the definition lives here.
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * As with page_llock, only the definition lives here.
 */
kmutex_t  page_freelock;
     60 
     61 /*
     62  * The hash table, page_hash[], the p_selock fields, and the
     63  * list of pages associated with vnodes are protected by arrays of mutexes.
     64  *
     65  * Unless the hashes are changed radically, the table sizes must be
     66  * a power of two.  Also, we typically need more mutexes for the
     67  * vnodes since these locks are occasionally held for long periods.
     68  * And since there seem to be two special vnodes (kvp and swapvp),
     69  * we make room for private mutexes for them.
     70  *
     71  * The pse_mutex[] array holds the mutexes to protect the p_selock
     72  * fields of all page_t structures.
     73  *
     74  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
     75  * when given a pointer to a page_t.
     76  *
     77  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
     78  * should go to the trouble of setting it up at run time and base it
     79  * on memory size rather than the number of compile time CPUs.
     80  *
     81  * XX64	We should be using physmem size to calculate PIO_SHIFT.
     82  *
     83  *	These might break in 64 bit world.
     84  */
     85 #define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
     86 #define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */
     87 
     88 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
     89 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
     90 
     91 #define	PAGE_IO_MUTEX(pp) \
     92 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
     93 
     94 /*
     95  * The pse_mutex[] array is allocated in the platform startup code
     96  * based on the size of the machine at startup.
     97  */
     98 extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
     99 extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
    100 extern int pse_shift;			/* log2(pse_table_size) */
    101 #define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
    102 	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
    103 	(pse_table_size - 1)].pad_mutex
    104 
    105 #define	PSZC_MTX_TABLE_SIZE	128
    106 #define	PSZC_MTX_TABLE_SHIFT	7
    107 
    108 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
    109 
    110 #define	PAGE_SZC_MUTEX(_pp) \
    111 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
    112 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
    113 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
    114 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
    115 
    116 /*
    117  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
    118  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
    119  * and p_vpnext).
    120  *
    121  * The page_vnode_mutex(vp) function returns the address of the appropriate
    122  * mutex from this array given a pointer to a vnode.  It is complicated
    123  * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
    125  *
    126  * The VP_HASH_FUNC returns the index into the vph_mutex array given
    127  * an address of a vnode.
    128  */
    129 
/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 *
 * VPH_TABLE_SIZE is the number of hashed vnode mutexes.  Both variants
 * are a single bit shifted left by an amount derived from VP_SHIFT, i.e.
 * a power of two, which VP_HASH_FUNC() depends on when it masks with
 * (VPH_TABLE_SIZE - 1).
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

/*
 * Hash a vnode address to an index in [0, VPH_TABLE_SIZE): the address is
 * shifted right by 6, 8, 10 and 12 bits, the four values are summed, and
 * the sum is masked to the table size.
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))
    146 
/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 *
 * Slots [0, VPH_TABLE_SIZE) are selected by VP_HASH_FUNC(); see
 * page_vnode_mutex() for the lookup.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
    154 
    155 /*
    156  * Initialize the locks used by the Virtual Memory Management system.
    157  */
    158 void
    159 page_lock_init()
    160 {
    161 }
    162 
    163 /*
    164  * Return a value for pse_shift based on npg (the number of physical pages)
    165  * and ncpu (the maximum number of CPUs).  This is called by platform startup
    166  * code.
    167  *
    168  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
    169  * locks grew approximately as the square of the number of threads executing.
    170  * So the primary scaling factor used is NCPU^2.  The size of the machine in
    171  * megabytes is used as an upper bound, particularly for sun4v machines which
    172  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
    173  * (128) is used as a minimum.  Since the size of the table has to be a power
    174  * of two, the calculated size is rounded up to the next power of two.
    175  */
    176 /*ARGSUSED*/
    177 int
    178 size_pse_array(pgcnt_t npg, int ncpu)
    179 {
    180 	size_t size;
    181 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
    182 
    183 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
    184 	size += (1 << (highbit(size) - 1)) - 1;
    185 	return (highbit(size) - 1);
    186 }
    187 
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 *
 * SE_WRITER is the value stored in p_selock while the page is held
 * exclusively: the current thread pointer truncated to selock_t, with
 * INT_MIN or'ed in and the SE_EWANTED bit cleared.
 * SE_READER is the increment added to p_selock for each shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 *
 * Code that tests for SE_DELETED generally masks SE_EWANTED out of
 * p_selock first, since that bit may be set alongside the marker.
 */
#define	SE_DELETED	(1 | INT_MIN)
    209 
/*
 * Statistics counters, compiled in only when VM_STATS is defined.
 */
#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

/*
 * Counters for the page lock routines.  Several of these are bumped
 * through VM_STAT_ADD() in page_lock_es(), page_try_reclaim_lock() and
 * page_trylock().
 */
#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
    234 
    235 /*
    236  * Acquire the "shared/exclusive" lock on a page.
    237  *
    238  * Returns 1 on success and locks the page appropriately.
    239  *	   0 on failure and does not lock the page.
    240  *
    241  * If `lock' is non-NULL, it will be dropped and reacquired in the
    242  * failure case.  This routine can block, and if it does
    243  * it will always return a failure since the page identity [vp, off]
    244  * or state may have changed.
    245  */
    246 
    247 int
    248 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
    249 {
    250 	return (page_lock_es(pp, se, lock, reclaim, 0));
    251 }
    252 
    253 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) holders by setting the es parameter to
 * SE_EXCL_WANTED.  In this case, when an exclusive lock cannot be acquired,
 * p_selock's SE_EWANTED bit is set. Shared-lock (reader) requests are also
 * denied if the page is slated for retirement.
    260  *
    261  * The se and es parameters determine if the lock should be granted
    262  * based on the following decision table:
    263  *
    264  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
    265  * ----------- -------------- -------------------  ---------
    266  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
    267  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
    268  * SE_EXCL        none         any lock/any        deny
    269  * SE_SHARED      n/a [2]        shared/0          grant
    270  * SE_SHARED      n/a [2]      unlocked/0          grant
    271  * SE_SHARED      n/a            shared/1          deny
    272  * SE_SHARED      n/a          unlocked/1          deny
    273  * SE_SHARED      n/a              excl/any        deny
    274  *
    275  * Notes:
    276  * [1] The code grants an exclusive lock to the caller and clears the bit
    277  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
    278  *   bit's value.  This was deemed acceptable as we are not concerned about
    279  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
    280  *   fifo mechanism should also be implemented. Meantime, the thread that
    281  *   set SE_EWANTED should be prepared to catch this condition and reset it
    282  *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
    285  *
    286  * Notes on values of "es":
    287  *
    288  *   es & 1: page_lookup_create will attempt page relocation
    289  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
    290  *       memory thread); this prevents reader-starvation of waiting
    291  *       writer thread(s) by giving priority to writers over readers.
    292  *   es & SE_RETIRED: caller wants to lock pages even if they are
    293  *       retired.  Default is to deny the lock if the page is retired.
    294  *
    295  * And yes, we know, the semantics of this function are too complicated.
    296  * It's on the list to be cleaned up.
    297  */
/*
 * page_lock_es: try to take pp's p_selock in the mode selected by `se'
 * and `es'.  Returns 1 with the page locked, or 0 with it unlocked.  A
 * retired page (without SE_RETIRED) or a deleted page fails immediately;
 * any other failure first sleeps on pp->p_cv and then returns 0 without
 * retrying.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* retired pages are refused outright unless SE_RETIRED was passed */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * A shared request with es == 1 on a completely idle page
	 * (no holders, SE_EWANTED clear) is taken exclusively instead.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		/*
		 * Shared request: granted only when p_selock is non-negative
		 * (SE_WRITER and SE_DELETED both have INT_MIN or'ed in) and
		 * SE_EWANTED is clear.
		 */
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/* a deleted page will never be unlocked; fail without waiting */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		/* drop the caller's mutex before sleeping */
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/* caller asked for shared; give it back */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
    473 
    474 /*
    475  * Clear the SE_EWANTED bit from p_selock.  This function allows
    476  * callers of page_lock_es and page_try_reclaim_lock to clear
    477  * their setting of this bit if they decide they no longer wish
    478  * to gain exclusive access to the page.  Currently only
    479  * delete_memory_thread uses this when the delete memory
    480  * operation is cancelled.
    481  */
    482 void
    483 page_lock_clr_exclwanted(page_t *pp)
    484 {
    485 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    486 
    487 	mutex_enter(pse);
    488 	pp->p_selock &= ~SE_EWANTED;
    489 	if (CV_HAS_WAITERS(&pp->p_cv))
    490 		cv_broadcast(&pp->p_cv);
    491 	mutex_exit(pse);
    492 }
    493 
    494 /*
    495  * Read the comments inside of page_lock_es() carefully.
    496  *
    497  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
    498  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
    499  * This is used by threads subject to reader-starvation (eg. memory delete).
    500  *
    501  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
    502  * it is expected that it will retry at a later time.  Threads that will
    503  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
    504  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
    505  * the bit is cleared.)
    506  */
    507 int
    508 page_try_reclaim_lock(page_t *pp, se_t se, int es)
    509 {
    510 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    511 	selock_t old;
    512 
    513 	mutex_enter(pse);
    514 
    515 	old = pp->p_selock;
    516 
    517 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
    518 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
    519 
    520 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
    521 		mutex_exit(pse);
    522 		VM_STAT_ADD(page_trylock_failed);
    523 		return (0);
    524 	}
    525 
    526 	if (se == SE_SHARED && es == 1 && old == 0) {
    527 		se = SE_EXCL;
    528 	}
    529 
    530 	if (se == SE_SHARED) {
    531 		if (!PP_ISFREE(pp)) {
    532 			if (old >= 0) {
    533 				/*
    534 				 * Readers are not allowed when excl wanted
    535 				 */
    536 				if ((old & SE_EWANTED) == 0) {
    537 					pp->p_selock = old + SE_READER;
    538 					mutex_exit(pse);
    539 					return (1);
    540 				}
    541 			}
    542 			mutex_exit(pse);
    543 			return (0);
    544 		}
    545 		/*
    546 		 * The page is free, so we really want SE_EXCL (below)
    547 		 */
    548 		VM_STAT_ADD(page_try_reclaim_upgrade);
    549 	}
    550 
    551 	/*
    552 	 * The caller wants a writer lock.  We try for it only if
    553 	 * SE_EWANTED is not set, or if the caller specified
    554 	 * SE_EXCL_WANTED.
    555 	 */
    556 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
    557 		if ((old & ~SE_EWANTED) == 0) {
    558 			/* no reader/writer lock held */
    559 			THREAD_KPRI_REQUEST();
    560 			/* this clears out our setting of the SE_EWANTED bit */
    561 			pp->p_selock = SE_WRITER;
    562 			mutex_exit(pse);
    563 			return (1);
    564 		}
    565 	}
    566 	if (es & SE_EXCL_WANTED) {
    567 		/* page is locked, set the SE_EWANTED bit */
    568 		pp->p_selock |= SE_EWANTED;
    569 	}
    570 	mutex_exit(pse);
    571 	return (0);
    572 }
    573 
    574 /*
    575  * Acquire a page's "shared/exclusive" lock, but never block.
    576  * Returns 1 on success, 0 on failure.
    577  */
    578 int
    579 page_trylock(page_t *pp, se_t se)
    580 {
    581 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    582 
    583 	mutex_enter(pse);
    584 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
    585 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
    586 		/*
    587 		 * Fail if a thread wants exclusive access and page is
    588 		 * retired, if the page is slated for retirement, or a
    589 		 * share lock is requested.
    590 		 */
    591 		mutex_exit(pse);
    592 		VM_STAT_ADD(page_trylock_failed);
    593 		return (0);
    594 	}
    595 
    596 	if (se == SE_EXCL) {
    597 		if (pp->p_selock == 0) {
    598 			THREAD_KPRI_REQUEST();
    599 			pp->p_selock = SE_WRITER;
    600 			mutex_exit(pse);
    601 			return (1);
    602 		}
    603 	} else {
    604 		if (pp->p_selock >= 0) {
    605 			pp->p_selock += SE_READER;
    606 			mutex_exit(pse);
    607 			return (1);
    608 		}
    609 	}
    610 	mutex_exit(pse);
    611 	return (0);
    612 }
    613 
    614 /*
    615  * Variant of page_unlock() specifically for the page freelist
    616  * code. The mere existence of this code is a vile hack that
    617  * has resulted due to the backwards locking order of the page
    618  * freelist manager; please don't call it.
    619  */
    620 void
    621 page_unlock_nocapture(page_t *pp)
    622 {
    623 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    624 	selock_t old;
    625 
    626 	mutex_enter(pse);
    627 
    628 	old = pp->p_selock;
    629 	if ((old & ~SE_EWANTED) == SE_READER) {
    630 		pp->p_selock = old & ~SE_READER;
    631 		if (CV_HAS_WAITERS(&pp->p_cv))
    632 			cv_broadcast(&pp->p_cv);
    633 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
    634 		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
    635 	} else if (old < 0) {
    636 		THREAD_KPRI_RELEASE();
    637 		pp->p_selock &= SE_EWANTED;
    638 		if (CV_HAS_WAITERS(&pp->p_cv))
    639 			cv_broadcast(&pp->p_cv);
    640 	} else if ((old & ~SE_EWANTED) > SE_READER) {
    641 		pp->p_selock = old - SE_READER;
    642 	} else {
    643 		panic("page_unlock_nocapture: page %p is not locked",
    644 		    (void *)pp);
    645 	}
    646 
    647 	mutex_exit(pse);
    648 }
    649 
    650 /*
    651  * Release the page's "shared/exclusive" lock and wake up anyone
    652  * who might be waiting for it.
    653  */
    654 void
    655 page_unlock(page_t *pp)
    656 {
    657 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    658 	selock_t old;
    659 
    660 	mutex_enter(pse);
    661 
    662 	old = pp->p_selock;
    663 	if ((old & ~SE_EWANTED) == SE_READER) {
    664 		pp->p_selock = old & ~SE_READER;
    665 		if (CV_HAS_WAITERS(&pp->p_cv))
    666 			cv_broadcast(&pp->p_cv);
    667 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
    668 		panic("page_unlock: page %p is deleted", (void *)pp);
    669 	} else if (old < 0) {
    670 		THREAD_KPRI_RELEASE();
    671 		pp->p_selock &= SE_EWANTED;
    672 		if (CV_HAS_WAITERS(&pp->p_cv))
    673 			cv_broadcast(&pp->p_cv);
    674 	} else if ((old & ~SE_EWANTED) > SE_READER) {
    675 		pp->p_selock = old - SE_READER;
    676 	} else {
    677 		panic("page_unlock: page %p is not locked", (void *)pp);
    678 	}
    679 
    680 	if (pp->p_selock == 0) {
    681 		/*
    682 		 * If the T_CAPTURING bit is set, that means that we should
    683 		 * not try and capture the page again as we could recurse
    684 		 * which could lead to a stack overflow panic or spending a
    685 		 * relatively long time in the kernel making no progress.
    686 		 */
    687 		if ((pp->p_toxic & PR_CAPTURE) &&
    688 		    !(curthread->t_flag & T_CAPTURING) &&
    689 		    !PP_RETIRED(pp)) {
    690 			THREAD_KPRI_REQUEST();
    691 			pp->p_selock = SE_WRITER;
    692 			mutex_exit(pse);
    693 			page_unlock_capture(pp);
    694 		} else {
    695 			mutex_exit(pse);
    696 		}
    697 	} else {
    698 		mutex_exit(pse);
    699 	}
    700 }
    701 
    702 /*
    703  * Try to upgrade the lock on the page from a "shared" to an
    704  * "exclusive" lock.  Since this upgrade operation is done while
    705  * holding the mutex protecting this page, no one else can acquire this page's
    706  * lock and change the page. Thus, it is safe to drop the "shared"
    707  * lock and attempt to acquire the "exclusive" lock.
    708  *
    709  * Returns 1 on success, 0 on failure.
    710  */
    711 int
    712 page_tryupgrade(page_t *pp)
    713 {
    714 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    715 
    716 	mutex_enter(pse);
    717 	if (!(pp->p_selock & SE_EWANTED)) {
    718 		/* no threads want exclusive access, try upgrade */
    719 		if (pp->p_selock == SE_READER) {
    720 			THREAD_KPRI_REQUEST();
    721 			/* convert to exclusive lock */
    722 			pp->p_selock = SE_WRITER;
    723 			mutex_exit(pse);
    724 			return (1);
    725 		}
    726 	}
    727 	mutex_exit(pse);
    728 	return (0);
    729 }
    730 
    731 /*
    732  * Downgrade the "exclusive" lock on the page to a "shared" lock
    733  * while holding the mutex protecting this page's p_selock field.
    734  */
    735 void
    736 page_downgrade(page_t *pp)
    737 {
    738 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    739 	int excl_waiting;
    740 
    741 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
    742 	ASSERT(PAGE_EXCL(pp));
    743 
    744 	mutex_enter(pse);
    745 	excl_waiting =  pp->p_selock & SE_EWANTED;
    746 	THREAD_KPRI_RELEASE();
    747 	pp->p_selock = SE_READER | excl_waiting;
    748 	if (CV_HAS_WAITERS(&pp->p_cv))
    749 		cv_broadcast(&pp->p_cv);
    750 	mutex_exit(pse);
    751 }
    752 
    753 void
    754 page_lock_delete(page_t *pp)
    755 {
    756 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    757 
    758 	ASSERT(PAGE_EXCL(pp));
    759 	ASSERT(pp->p_vnode == NULL);
    760 	ASSERT(pp->p_offset == (u_offset_t)-1);
    761 	ASSERT(!PP_ISFREE(pp));
    762 
    763 	mutex_enter(pse);
    764 	THREAD_KPRI_RELEASE();
    765 	pp->p_selock = SE_DELETED;
    766 	if (CV_HAS_WAITERS(&pp->p_cv))
    767 		cv_broadcast(&pp->p_cv);
    768 	mutex_exit(pse);
    769 }
    770 
    771 int
    772 page_deleted(page_t *pp)
    773 {
    774 	return (pp->p_selock == SE_DELETED);
    775 }
    776 
    777 /*
    778  * Implement the io lock for pages
    779  */
    780 void
    781 page_iolock_init(page_t *pp)
    782 {
    783 	pp->p_iolock_state = 0;
    784 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
    785 }
    786 
    787 /*
    788  * Acquire the i/o lock on a page.
    789  */
    790 void
    791 page_io_lock(page_t *pp)
    792 {
    793 	kmutex_t *pio;
    794 
    795 	pio = PAGE_IO_MUTEX(pp);
    796 	mutex_enter(pio);
    797 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
    798 		cv_wait(&(pp->p_io_cv), pio);
    799 	}
    800 	pp->p_iolock_state |= PAGE_IO_INUSE;
    801 	mutex_exit(pio);
    802 }
    803 
    804 /*
    805  * Release the i/o lock on a page.
    806  */
    807 void
    808 page_io_unlock(page_t *pp)
    809 {
    810 	kmutex_t *pio;
    811 
    812 	pio = PAGE_IO_MUTEX(pp);
    813 	mutex_enter(pio);
    814 	cv_broadcast(&pp->p_io_cv);
    815 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
    816 	mutex_exit(pio);
    817 }
    818 
    819 /*
    820  * Try to acquire the i/o lock on a page without blocking.
    821  * Returns 1 on success, 0 on failure.
    822  */
    823 int
    824 page_io_trylock(page_t *pp)
    825 {
    826 	kmutex_t *pio;
    827 
    828 	if (pp->p_iolock_state & PAGE_IO_INUSE)
    829 		return (0);
    830 
    831 	pio = PAGE_IO_MUTEX(pp);
    832 	mutex_enter(pio);
    833 
    834 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
    835 		mutex_exit(pio);
    836 		return (0);
    837 	}
    838 	pp->p_iolock_state |= PAGE_IO_INUSE;
    839 	mutex_exit(pio);
    840 
    841 	return (1);
    842 }
    843 
    844 /*
    845  * Wait until the i/o lock is not held.
    846  */
    847 void
    848 page_io_wait(page_t *pp)
    849 {
    850 	kmutex_t *pio;
    851 
    852 	pio = PAGE_IO_MUTEX(pp);
    853 	mutex_enter(pio);
    854 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
    855 		cv_wait(&(pp->p_io_cv), pio);
    856 	}
    857 	mutex_exit(pio);
    858 }
    859 
    860 /*
    861  * Returns 1 on success, 0 on failure.
    862  */
    863 int
    864 page_io_locked(page_t *pp)
    865 {
    866 	return (pp->p_iolock_state & PAGE_IO_INUSE);
    867 }
    868 
    869 /*
    870  * Assert that the i/o lock on a page is held.
    871  * Returns 1 on success, 0 on failure.
    872  */
    873 int
    874 page_iolock_assert(page_t *pp)
    875 {
    876 	return (page_io_locked(pp));
    877 }
    878 
    879 /*
    880  * Wrapper exported to kernel routines that are built
    881  * platform-independent (the macro is platform-dependent;
    882  * the size of vph_mutex[] is based on NCPU).
    883  *
    884  * Note that you can do stress testing on this by setting the
    885  * variable page_vnode_mutex_stress to something other than
    886  * zero in a DEBUG kernel in a debugger after loading the kernel.
    887  * Setting it after the kernel is running may not work correctly.
    888  */
    889 #ifdef DEBUG
    890 static int page_vnode_mutex_stress = 0;
    891 #endif
    892 
    893 kmutex_t *
    894 page_vnode_mutex(vnode_t *vp)
    895 {
    896 	if (vp == &kvp)
    897 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
    898 
    899 	if (vp == &zvp)
    900 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
    901 #ifdef DEBUG
    902 	if (page_vnode_mutex_stress != 0)
    903 		return (&vph_mutex[0]);
    904 #endif
    905 
    906 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
    907 }
    908 
    909 kmutex_t *
    910 page_se_mutex(page_t *pp)
    911 {
    912 	return (PAGE_SE_MUTEX(pp));
    913 }
    914 
    915 #ifdef VM_STATS
    916 uint_t pszclck_stat[4];
    917 #endif
    918 /*
    919  * Find, take and return a mutex held by hat_page_demote().
    920  * Called by page_demote_vp_pages() before hat_page_demote() call and by
    921  * routines that want to block hat_page_demote() but can't do it
    922  * via locking all constituent pages.
    923  *
    924  * Return NULL if p_szc is 0.
    925  *
    926  * It should only be used for pages that can be demoted by hat_page_demote()
    927  * i.e. non swapfs file system pages.  The logic here is lifted from
    928  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
    929  * since the page is locked and not free.
    930  *
    931  * Hash of the root page is used to find the lock.
    932  * To find the root in the presense of hat_page_demote() chageing the location
    933  * of the root this routine relies on the fact that hat_page_demote() changes
    934  * root last.
    935  *
    936  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
    937  * returned pp's p_szc may be any value.
    938  */
    939 kmutex_t *
    940 page_szc_lock(page_t *pp)
    941 {
    942 	kmutex_t	*mtx;
    943 	page_t		*rootpp;
    944 	uint_t		szc;
    945 	uint_t		rszc;
    946 	uint_t		pszc = pp->p_szc;
    947 
    948 	ASSERT(pp != NULL);
    949 	ASSERT(PAGE_LOCKED(pp));
    950 	ASSERT(!PP_ISFREE(pp));
    951 	ASSERT(pp->p_vnode != NULL);
    952 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
    953 	ASSERT(!PP_ISKAS(pp));
    954 
    955 again:
    956 	if (pszc == 0) {
    957 		VM_STAT_ADD(pszclck_stat[0]);
    958 		return (NULL);
    959 	}
    960 
    961 	/* The lock lives in the root page */
    962 
    963 	rootpp = PP_GROUPLEADER(pp, pszc);
    964 	mtx = PAGE_SZC_MUTEX(rootpp);
    965 	mutex_enter(mtx);
    966 
    967 	/*
    968 	 * since p_szc can only decrease if pp == rootpp
    969 	 * rootpp will be always the same i.e we have the right root
    970 	 * regardless of rootpp->p_szc.
    971 	 * If location of pp's root didn't change after we took
    972 	 * the lock we have the right root. return mutex hashed off it.
    973 	 */
    974 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
    975 		VM_STAT_ADD(pszclck_stat[1]);
    976 		return (mtx);
    977 	}
    978 
    979 	/*
    980 	 * root location changed because page got demoted.
    981 	 * locate the new root.
    982 	 */
    983 	if (rszc < pszc) {
    984 		szc = pp->p_szc;
    985 		ASSERT(szc < pszc);
    986 		mutex_exit(mtx);
    987 		pszc = szc;
    988 		VM_STAT_ADD(pszclck_stat[2]);
    989 		goto again;
    990 	}
    991 
    992 	VM_STAT_ADD(pszclck_stat[3]);
    993 	/*
    994 	 * current hat_page_demote not done yet.
    995 	 * wait for it to finish.
    996 	 */
    997 	mutex_exit(mtx);
    998 	rootpp = PP_GROUPLEADER(rootpp, rszc);
    999 	mtx = PAGE_SZC_MUTEX(rootpp);
   1000 	mutex_enter(mtx);
   1001 	mutex_exit(mtx);
   1002 	ASSERT(rootpp->p_szc < rszc);
   1003 	goto again;
   1004 }
   1005 
   1006 int
   1007 page_szc_lock_assert(page_t *pp)
   1008 {
   1009 	page_t *rootpp = PP_PAGEROOT(pp);
   1010 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
   1011 
   1012 	return (MUTEX_HELD(mtx));
   1013 }
   1014 
/*
 * memseg locking
 *
 * Reader/writer lock manipulated through memsegs_trylock(),
 * memsegs_lock(), memsegs_unlock() and memsegs_lock_held() below.
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 *
 * Reader/writer lock manipulated through memlist_read_lock(),
 * memlist_read_unlock(), memlist_write_lock() and
 * memlist_write_unlock() below.
 */
static krwlock_t memlists_lock;
   1024 
   1025 int
   1026 memsegs_trylock(int writer)
   1027 {
   1028 	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
   1029 }
   1030 
   1031 void
   1032 memsegs_lock(int writer)
   1033 {
   1034 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
   1035 }
   1036 
/*
 * Release memsegslock.  The `writer' argument is accepted for symmetry
 * with memsegs_lock() but is not consulted: the same rw_exit() call is
 * made in either case.
 */
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}
   1043 
   1044 int
   1045 memsegs_lock_held(void)
   1046 {
   1047 	return (RW_LOCK_HELD(&memsegslock));
   1048 }
   1049 
/*
 * Enter memlists_lock as a reader.
 */
void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}
   1055 
/*
 * Exit memlists_lock; counterpart of memlist_read_lock().
 */
void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}
   1061 
/*
 * Enter memlists_lock as a writer.
 */
void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}
   1067 
/*
 * Exit memlists_lock; counterpart of memlist_write_lock().
 */
void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
   1073