Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
/*
 * VM - generic vnode mapping segment.
 *
 * The segmap driver is used only by the kernel to get faster (than seg_vn)
 * mappings [lower routine overhead; more persistent cache] to random
 * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
 */
     41 
     42 #include <sys/types.h>
     43 #include <sys/t_lock.h>
     44 #include <sys/param.h>
     45 #include <sys/sysmacros.h>
     46 #include <sys/buf.h>
     47 #include <sys/systm.h>
     48 #include <sys/vnode.h>
     49 #include <sys/mman.h>
     50 #include <sys/errno.h>
     51 #include <sys/cred.h>
     52 #include <sys/kmem.h>
     53 #include <sys/vtrace.h>
     54 #include <sys/cmn_err.h>
     55 #include <sys/debug.h>
     56 #include <sys/thread.h>
     57 #include <sys/dumphdr.h>
     58 #include <sys/bitmap.h>
     59 #include <sys/lgrp.h>
     60 
     61 #include <vm/seg_kmem.h>
     62 #include <vm/hat.h>
     63 #include <vm/as.h>
     64 #include <vm/seg.h>
     65 #include <vm/seg_kpm.h>
     66 #include <vm/seg_map.h>
     67 #include <vm/page.h>
     68 #include <vm/pvn.h>
     69 #include <vm/rm.h>
     70 
/*
 * Private seg op routines.
 *
 * These are the entry points referenced by the segmap_ops vector below.
 * All of them have internal linkage except segmap_fault, which is
 * declared without "static".
 */
static void	segmap_free(struct seg *seg);
faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
			size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
			uint_t prot);
static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
			uint_t *protv);
static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
static int	segmap_gettype(struct seg *seg, caddr_t addr);
static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static void	segmap_dump(struct seg *seg);
static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
			struct page ***ppp, enum lock_type type,
			enum seg_rw rw);
/* Panicking placeholder used (via SEGMAP_BADOP) for unsupported ops. */
static void	segmap_badop(void);
static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
    caddr_t addr);
static int	segmap_capable(struct seg *seg, segcapability_t capability);

/*
 * segkpm support
 *
 * NOTE(review): get_smap_kpm appears to return the smap with its mutex
 * held (callers in this file only ever mutex_exit it) -- confirm against
 * its definition.
 */
static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
			struct smap *, enum seg_rw);
struct smap	*get_smap_kpm(caddr_t, page_t **);
    100 
/*
 * SEGMAP_BADOP(t) yields segmap_badop cast to "pointer to function
 * returning t", so the same routine can fill seg_ops slots with
 * different return types.  segmap_badop panics, so a slot filled this
 * way marks an operation that must never be invoked on this segment.
 */
#define	SEGMAP_BADOP(t)	(t(*)())segmap_badop

/*
 * Operations vector for the segmap segment; segmap_create stores its
 * address in seg->s_ops.
 */
static struct seg_ops segmap_ops = {
	SEGMAP_BADOP(int),	/* dup */
	SEGMAP_BADOP(int),	/* unmap */
	segmap_free,		/* free */
	segmap_fault,		/* fault */
	segmap_faulta,		/* faulta */
	SEGMAP_BADOP(int),	/* setprot */
	segmap_checkprot,	/* checkprot */
	segmap_kluster,		/* kluster */
	SEGMAP_BADOP(size_t),	/* swapout */
	SEGMAP_BADOP(int),	/* sync */
	SEGMAP_BADOP(size_t),	/* incore */
	SEGMAP_BADOP(int),	/* lockop */
	segmap_getprot,		/* getprot */
	segmap_getoffset,	/* getoffset */
	segmap_gettype,		/* gettype */
	segmap_getvp,		/* getvp */
	SEGMAP_BADOP(int),	/* advise */
	segmap_dump,		/* dump */
	segmap_pagelock,	/* pagelock */
	SEGMAP_BADOP(int),	/* setpgsz */
	segmap_getmemid,	/* getmemid */
	segmap_getpolicy,	/* getpolicy */
	segmap_capable,		/* capable */
};
    128 
/*
 * Private segmap routines.
 *
 * segmap_unlock   - F_SOFTUNLOCK handling for a previously locked range.
 * segmap_smapadd  - put an unreferenced smap slot back on a free list.
 * segmap_hashin / segmap_hashout - maintain the (vp, off) hash chains.
 */
static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
			size_t len, enum seg_rw rw, struct smap *smp);
static void	segmap_smapadd(struct smap *smp);
static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
			u_offset_t off, int hashid);
static void	segmap_hashout(struct smap *smp);
    138 
    139 
/*
 * Statistics for segmap operations.
 *
 * No explicit locking to protect these stats.
 *
 * Six of these counters (getmap, release, get_reclaim, fault, pagecreate
 * and get_reuse) are accumulated per cpu and only copied into this table
 * by segmap_kstat_update().
 */
struct segmapcnt segmapcnt = {
	{ "fault",		KSTAT_DATA_ULONG },
	{ "faulta",		KSTAT_DATA_ULONG },
	{ "getmap",		KSTAT_DATA_ULONG },
	{ "get_use",		KSTAT_DATA_ULONG },
	{ "get_reclaim",	KSTAT_DATA_ULONG },
	{ "get_reuse",		KSTAT_DATA_ULONG },
	{ "get_unused",		KSTAT_DATA_ULONG },
	{ "get_nofree",		KSTAT_DATA_ULONG },
	{ "rel_async",		KSTAT_DATA_ULONG },
	{ "rel_write",		KSTAT_DATA_ULONG },
	{ "rel_free",		KSTAT_DATA_ULONG },
	{ "rel_abort",		KSTAT_DATA_ULONG },
	{ "rel_dontneed",	KSTAT_DATA_ULONG },
	{ "release",		KSTAT_DATA_ULONG },
	{ "pagecreate",		KSTAT_DATA_ULONG },
	{ "free_notfree",	KSTAT_DATA_ULONG },
	{ "free_dirty",		KSTAT_DATA_ULONG },
	{ "free",		KSTAT_DATA_ULONG },
	{ "stolen",		KSTAT_DATA_ULONG },
	{ "get_nomtx",		KSTAT_DATA_ULONG }
};

/*
 * The same table viewed as a flat array of kstat_named_t, together with
 * its element count.
 */
kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
    170 
    171 /*
    172  * Return number of map pages in segment.
    173  */
    174 #define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)
    175 
    176 /*
    177  * Translate addr into smap number within segment.
    178  */
    179 #define	MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
    180 
    181 /*
    182  * Translate addr in seg into struct smap pointer.
    183  */
    184 #define	GET_SMAP(seg, addr)	\
    185 	&(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
    186 
    187 /*
    188  * Bit in map (16 bit bitmap).
    189  */
    190 #define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
    191 
/*
 * File-scope copies of the segment parameters computed by segmap_create().
 */
static int smd_colormsk = 0;	/* smd_ncolor - 1 */
static int smd_ncolor = 0;	/* number of virtual colors (power of 2) */
static int smd_nfree = 0;	/* number of freelists: ncolor * nfreelist */
static int smd_freemsk = 0;	/* smd_nfree - 1 */
#ifdef DEBUG
/* smd_nfree counters used to track which colors are used more often */
static int *colors_used;
#endif
static struct smap *smd_smap;		/* base of the smap slot array */
static struct smaphash *smd_hash;	/* hash chain headers */
#ifdef SEGMAP_HASHSTATS
/* one counter per hash chain; presumably chain lengths -- TODO confirm */
static unsigned int *smd_hash_len;
#endif
static struct smfree *smd_free;		/* freelist headers */
static ulong_t smd_hashmsk = 0;		/* hash size - 1 */
    206 
#define	SEGMAP_MAXCOLOR		2	/* upper bound asserted on smd_ncolor */
#define	SEGMAP_CACHE_PAD	64	/* minimum size of one per-cpu entry */

/*
 * Per-cpu segmap state.  segmap_create() allocates max_ncpus of these in
 * smd_cpu[].  The union with scpu_pad makes each entry at least
 * SEGMAP_CACHE_PAD bytes.
 */
union segmap_cpu {
	struct {
		/* per-color rotor index into the smap freelists */
		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
		/* initialised to the first smap so it is never NULL */
		struct smap	*scpu_last_smap;
		/* counters below are summed by segmap_kstat_update() */
		ulong_t		scpu_getmap;
		ulong_t		scpu_release;
		ulong_t		scpu_get_reclaim;
		ulong_t		scpu_fault;
		ulong_t		scpu_pagecreate;
		ulong_t		scpu_get_reuse;
	} scpu;
	char	scpu_pad[SEGMAP_CACHE_PAD];
};
static union segmap_cpu *smd_cpu;
    224 
    225 /*
    226  * There are three locks in seg_map:
    227  *	- per freelist mutexes
    228  *	- per hashchain mutexes
    229  *	- per smap mutexes
    230  *
    231  * The lock ordering is to get the smap mutex to lock down the slot
    232  * first then the hash lock (for hash in/out (vp, off) list) or the
    233  * freelist lock to put the slot back on the free list.
    234  *
    235  * The hash search is done by only holding the hashchain lock, when a wanted
    236  * slot is found, we drop the hashchain lock then lock the slot so there
    237  * is no overlapping of hashchain and smap locks. After the slot is
    238  * locked, we verify again if the slot is still what we are looking
    239  * for.
    240  *
    241  * Allocation of a free slot is done by holding the freelist lock,
    242  * then locking the smap slot at the head of the freelist. This is
    243  * in reversed lock order so mutex_tryenter() is used.
    244  *
    245  * The smap lock protects all fields in smap structure except for
    246  * the link fields for hash/free lists which are protected by
    247  * hashchain and freelist locks.
    248  */
    249 
    250 #define	SHASHMTX(hashid)	(&smd_hash[hashid].sh_mtx)
    251 
    252 #define	SMP2SMF(smp)		(&smd_free[(smp - smd_smap) & smd_freemsk])
    253 #define	SMP2SMF_NDX(smp)	(ushort_t)((smp - smd_smap) & smd_freemsk)
    254 
    255 #define	SMAPMTX(smp) (&smp->sm_mtx)
    256 
    257 #define	SMAP_HASHFUNC(vp, off, hashid) \
    258 	{ \
    259 	hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
    260 		((off) >> MAXBSHIFT)) & smd_hashmsk); \
    261 	}
    262 
    263 /*
    264  * The most frequently updated kstat counters are kept in the
    265  * per cpu array to avoid hot cache blocks. The update function
    266  * sums the cpu local counters to update the global counters.
    267  */
    268 
    269 /* ARGSUSED */
    270 int
    271 segmap_kstat_update(kstat_t *ksp, int rw)
    272 {
    273 	int i;
    274 	ulong_t	getmap, release, get_reclaim;
    275 	ulong_t	fault, pagecreate, get_reuse;
    276 
    277 	if (rw == KSTAT_WRITE)
    278 		return (EACCES);
    279 	getmap = release = get_reclaim = (ulong_t)0;
    280 	fault = pagecreate = get_reuse = (ulong_t)0;
    281 	for (i = 0; i < max_ncpus; i++) {
    282 		getmap += smd_cpu[i].scpu.scpu_getmap;
    283 		release  += smd_cpu[i].scpu.scpu_release;
    284 		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
    285 		fault  += smd_cpu[i].scpu.scpu_fault;
    286 		pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
    287 		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
    288 	}
    289 	segmapcnt.smp_getmap.value.ul = getmap;
    290 	segmapcnt.smp_release.value.ul = release;
    291 	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
    292 	segmapcnt.smp_fault.value.ul = fault;
    293 	segmapcnt.smp_pagecreate.value.ul = pagecreate;
    294 	segmapcnt.smp_get_reuse.value.ul = get_reuse;
    295 	return (0);
    296 }
    297 
    298 int
    299 segmap_create(struct seg *seg, void *argsp)
    300 {
    301 	struct segmap_data *smd;
    302 	struct smap *smp;
    303 	struct smfree *sm;
    304 	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
    305 	struct smaphash *shashp;
    306 	union segmap_cpu *scpu;
    307 	long i, npages;
    308 	size_t hashsz;
    309 	uint_t nfreelist;
    310 	extern void prefetch_smap_w(void *);
    311 	extern int max_ncpus;
    312 
    313 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
    314 
    315 	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
    316 		panic("segkmap not MAXBSIZE aligned");
    317 		/*NOTREACHED*/
    318 	}
    319 
    320 	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
    321 
    322 	seg->s_data = (void *)smd;
    323 	seg->s_ops = &segmap_ops;
    324 	smd->smd_prot = a->prot;
    325 
    326 	/*
    327 	 * Scale the number of smap freelists to be
    328 	 * proportional to max_ncpus * number of virtual colors.
    329 	 * The caller can over-ride this scaling by providing
    330 	 * a non-zero a->nfreelist argument.
    331 	 */
    332 	nfreelist = a->nfreelist;
    333 	if (nfreelist == 0)
    334 		nfreelist = max_ncpus;
    335 	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
    336 		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
    337 		"%d, using %d", nfreelist, max_ncpus);
    338 		nfreelist = max_ncpus;
    339 	}
    340 	if (nfreelist & (nfreelist - 1)) {
    341 		/* round up nfreelist to the next power of two. */
    342 		nfreelist = 1 << (highbit(nfreelist));
    343 	}
    344 
    345 	/*
    346 	 * Get the number of virtual colors - must be a power of 2.
    347 	 */
    348 	if (a->shmsize)
    349 		smd_ncolor = a->shmsize >> MAXBSHIFT;
    350 	else
    351 		smd_ncolor = 1;
    352 	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
    353 	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
    354 	smd_colormsk = smd_ncolor - 1;
    355 	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
    356 	smd_freemsk = smd_nfree - 1;
    357 
    358 	/*
    359 	 * Allocate and initialize the freelist headers.
    360 	 * Note that sm_freeq[1] starts out as the release queue. This
    361 	 * is known when the smap structures are initialized below.
    362 	 */
    363 	smd_free = smd->smd_free =
    364 	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
    365 	for (i = 0; i < smd_nfree; i++) {
    366 		sm = &smd->smd_free[i];
    367 		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
    368 		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
    369 		sm->sm_allocq = &sm->sm_freeq[0];
    370 		sm->sm_releq = &sm->sm_freeq[1];
    371 	}
    372 
    373 	/*
    374 	 * Allocate and initialize the smap hash chain headers.
    375 	 * Compute hash size rounding down to the next power of two.
    376 	 */
    377 	npages = MAP_PAGES(seg);
    378 	smd->smd_npages = npages;
    379 	hashsz = npages / SMAP_HASHAVELEN;
    380 	hashsz = 1 << (highbit(hashsz)-1);
    381 	smd_hashmsk = hashsz - 1;
    382 	smd_hash = smd->smd_hash =
    383 	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
    384 #ifdef SEGMAP_HASHSTATS
    385 	smd_hash_len =
    386 	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
    387 #endif
    388 	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
    389 		shashp->sh_hash_list = NULL;
    390 		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
    391 	}
    392 
    393 	/*
    394 	 * Allocate and initialize the smap structures.
    395 	 * Link all slots onto the appropriate freelist.
    396 	 * The smap array is large enough to affect boot time
    397 	 * on large systems, so use memory prefetching and only
    398 	 * go through the array 1 time. Inline a optimized version
    399 	 * of segmap_smapadd to add structures to freelists with
    400 	 * knowledge that no locks are needed here.
    401 	 */
    402 	smd_smap = smd->smd_sm =
    403 	    kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
    404 
    405 	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
    406 	    smp >= smd->smd_sm; smp--) {
    407 		struct smap *smpfreelist;
    408 		struct sm_freeq *releq;
    409 
    410 		prefetch_smap_w((char *)smp);
    411 
    412 		smp->sm_vp = NULL;
    413 		smp->sm_hash = NULL;
    414 		smp->sm_off = 0;
    415 		smp->sm_bitmap = 0;
    416 		smp->sm_refcnt = 0;
    417 		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
    418 		smp->sm_free_ndx = SMP2SMF_NDX(smp);
    419 
    420 		sm = SMP2SMF(smp);
    421 		releq = sm->sm_releq;
    422 
    423 		smpfreelist = releq->smq_free;
    424 		if (smpfreelist == 0) {
    425 			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
    426 		} else {
    427 			smp->sm_next = smpfreelist;
    428 			smp->sm_prev = smpfreelist->sm_prev;
    429 			smpfreelist->sm_prev = smp;
    430 			smp->sm_prev->sm_next = smp;
    431 			releq->smq_free = smp->sm_next;
    432 		}
    433 
    434 		/*
    435 		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
    436 		 */
    437 		smp->sm_flags = 0;
    438 
    439 #ifdef	SEGKPM_SUPPORT
    440 		/*
    441 		 * Due to the fragile prefetch loop no
    442 		 * separate function is used here.
    443 		 */
    444 		smp->sm_kpme_next = NULL;
    445 		smp->sm_kpme_prev = NULL;
    446 		smp->sm_kpme_page = NULL;
    447 #endif
    448 	}
    449 
    450 	/*
    451 	 * Allocate the per color indices that distribute allocation
    452 	 * requests over the free lists. Each cpu will have a private
    453 	 * rotor index to spread the allocations even across the available
    454 	 * smap freelists. Init the scpu_last_smap field to the first
    455 	 * smap element so there is no need to check for NULL.
    456 	 */
    457 	smd_cpu =
    458 	    kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
    459 	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
    460 		int j;
    461 		for (j = 0; j < smd_ncolor; j++)
    462 			scpu->scpu.scpu_free_ndx[j] = j;
    463 		scpu->scpu.scpu_last_smap = smd_smap;
    464 	}
    465 
    466 	vpm_init();
    467 
    468 #ifdef DEBUG
    469 	/*
    470 	 * Keep track of which colors are used more often.
    471 	 */
    472 	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
    473 #endif /* DEBUG */
    474 
    475 	return (0);
    476 }
    477 
    478 static void
    479 segmap_free(seg)
    480 	struct seg *seg;
    481 {
    482 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
    483 }
    484 
    485 /*
    486  * Do a F_SOFTUNLOCK call over the range requested.
    487  * The range must have already been F_SOFTLOCK'ed.
    488  */
    489 static void
    490 segmap_unlock(
    491 	struct hat *hat,
    492 	struct seg *seg,
    493 	caddr_t addr,
    494 	size_t len,
    495 	enum seg_rw rw,
    496 	struct smap *smp)
    497 {
    498 	page_t *pp;
    499 	caddr_t adr;
    500 	u_offset_t off;
    501 	struct vnode *vp;
    502 	kmutex_t *smtx;
    503 
    504 	ASSERT(smp->sm_refcnt > 0);
    505 
    506 #ifdef lint
    507 	seg = seg;
    508 #endif
    509 
    510 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
    511 
    512 		/*
    513 		 * We're called only from segmap_fault and this was a
    514 		 * NOP in case of a kpm based smap, so dangerous things
    515 		 * must have happened in the meantime. Pages are prefaulted
    516 		 * and locked in segmap_getmapflt and they will not be
    517 		 * unlocked until segmap_release.
    518 		 */
    519 		panic("segmap_unlock: called with kpm addr %p", (void *)addr);
    520 		/*NOTREACHED*/
    521 	}
    522 
    523 	vp = smp->sm_vp;
    524 	off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
    525 
    526 	hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
    527 	for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
    528 		ushort_t bitmask;
    529 
    530 		/*
    531 		 * Use page_find() instead of page_lookup() to
    532 		 * find the page since we know that it has
    533 		 * "shared" lock.
    534 		 */
    535 		pp = page_find(vp, off);
    536 		if (pp == NULL) {
    537 			panic("segmap_unlock: page not found");
    538 			/*NOTREACHED*/
    539 		}
    540 
    541 		if (rw == S_WRITE) {
    542 			hat_setrefmod(pp);
    543 		} else if (rw != S_OTHER) {
    544 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
    545 			"segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
    546 			hat_setref(pp);
    547 		}
    548 
    549 		/*
    550 		 * Clear bitmap, if the bit corresponding to "off" is set,
    551 		 * since the page and translation are being unlocked.
    552 		 */
    553 		bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
    554 
    555 		/*
    556 		 * Large Files: Following assertion is to verify
    557 		 * the correctness of the cast to (int) above.
    558 		 */
    559 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
    560 		smtx = SMAPMTX(smp);
    561 		mutex_enter(smtx);
    562 		if (smp->sm_bitmap & bitmask) {
    563 			smp->sm_bitmap &= ~bitmask;
    564 		}
    565 		mutex_exit(smtx);
    566 
    567 		page_unlock(pp);
    568 	}
    569 }
    570 
/* Upper bound on pages per MAXBSIZE chunk; sizes the pl[] array below. */
#define	MAXPPB	(MAXBSIZE/4096)	/* assumes minimum page size of 4k */

/*
 * This routine is called via a machine specific fault handling
 * routine.  It is also called by software routines wishing to
 * lock or unlock a range of addresses.
 *
 * Note that this routine expects a page-aligned "addr".
 *
 * hat  - hat to load translations into (asserted to be kas.a_hat on the
 *        non-kpm path).
 * seg  - the segmap segment.
 * addr - start of the range; addr + len must stay within one MAXBSIZE
 *        chunk, otherwise the routine panics.
 * type - F_SOFTLOCK, F_SOFTUNLOCK or another fault type.
 * rw   - access type; selects the ref/mod handling.
 *
 * Returns 0 on success, FC_MAKE_ERR(EIO) when the smap slot has no
 * vnode, or FC_MAKE_ERR(err) when VOP_GETPAGE fails.
 */
faultcode_t
segmap_fault(
	struct hat *hat,
	struct seg *seg,
	caddr_t addr,
	size_t len,
	enum fault_type type,
	enum seg_rw rw)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
	struct smap *smp;
	page_t *pp, **ppp;
	struct vnode *vp;
	u_offset_t off;
	page_t *pl[MAXPPB + 1];
	uint_t prot;
	u_offset_t addroff;
	caddr_t adr;
	int err;
	u_offset_t sm_off;
	int hat_flag;

	if (segmap_kpm && IS_KPM_ADDR(addr)) {
		int newpage;
		kmutex_t *smtx;

		/*
		 * Pages are successfully prefaulted and locked in
		 * segmap_getmapflt and can't be unlocked until
		 * segmap_release. No hat mappings have to be locked
		 * and they also can't be unlocked as long as the
		 * caller owns an active kpm addr.
		 */
#ifndef DEBUG
		/* Non-DEBUG: only F_SOFTUNLOCK needs any work for kpm. */
		if (type != F_SOFTUNLOCK)
			return (0);
#endif

		/*
		 * NOTE(review): get_smap_kpm presumably returns with the
		 * smap mutex held, since every path below releases smtx
		 * without acquiring it here -- confirm.
		 */
		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
			panic("segmap_fault: smap not found "
			    "for addr %p", (void *)addr);
			/*NOTREACHED*/
		}

		smtx = SMAPMTX(smp);
#ifdef	DEBUG
		/* DEBUG: complain about an unexpected SM_KPM_NEWPAGE flag. */
		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
		if (newpage) {
			cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
			    (void *)smp);
		}

		if (type != F_SOFTUNLOCK) {
			mutex_exit(smtx);
			return (0);
		}
#endif
		mutex_exit(smtx);
		vp = smp->sm_vp;
		sm_off = smp->sm_off;

		if (vp == NULL)
			return (FC_MAKE_ERR(EIO));

		ASSERT(smp->sm_refcnt > 0);

		addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
		if (addroff + len > MAXBSIZE)
			panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
			    (void *)(addr + len));

		off = sm_off + addroff;

		pp = page_find(vp, off);

		if (pp == NULL)
			panic("segmap_fault: softunlock page not found");

		/*
		 * Set ref bit also here in case of S_OTHER to avoid the
		 * overhead of supporting other cases than F_SOFTUNLOCK
		 * with segkpm. We can do this because the underlying
		 * pages are locked anyway.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
			    "segmap_fault:pp %p vp %p offset %llx",
			    pp, vp, off);
			hat_setref(pp);
		}

		return (0);
	}

	/* Non-kpm path: count the fault on this cpu and find the slot. */
	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
	smp = GET_SMAP(seg, addr);
	vp = smp->sm_vp;
	sm_off = smp->sm_off;

	if (vp == NULL)
		return (FC_MAKE_ERR(EIO));

	ASSERT(smp->sm_refcnt > 0);

	addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
	if (addroff + len > MAXBSIZE) {
		panic("segmap_fault: endaddr %p "
		    "exceeds MAXBSIZE chunk", (void *)(addr + len));
		/*NOTREACHED*/
	}
	off = sm_off + addroff;

	/*
	 * First handle the easy stuff
	 */
	if (type == F_SOFTUNLOCK) {
		segmap_unlock(hat, seg, addr, len, rw, smp);
		return (0);
	}

	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
	    "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
	err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
	    seg, addr, rw, CRED(), NULL);

	if (err)
		return (FC_MAKE_ERR(err));

	/* Never grant more than the segment's own protections. */
	prot &= smd->smd_prot;

	/*
	 * Handle all pages returned in the pl[] array.
	 * This loop is coded on the assumption that if
	 * there was no error from the VOP_GETPAGE routine,
	 * that the page list returned will contain all the
	 * needed pages for the vp from [off..off + len].
	 */
	ppp = pl;
	while ((pp = *ppp++) != NULL) {
		u_offset_t poff;
		ASSERT(pp->p_vnode == vp);
		hat_flag = HAT_LOAD;

		/*
		 * Verify that the pages returned are within the range
		 * of this segmap region.  Note that it is theoretically
		 * possible for pages outside this range to be returned,
		 * but it is not very likely.  If we cannot use the
		 * page here, just release it and go on to the next one.
		 */
		if (pp->p_offset < sm_off ||
		    pp->p_offset >= sm_off + MAXBSIZE) {
			(void) page_release(pp, 1);
			continue;
		}

		ASSERT(hat == kas.a_hat);
		poff = pp->p_offset;
		adr = addr + (poff - off);
		/*
		 * Only pages inside the requested range get the ref bit,
		 * and for F_SOFTLOCK a locked translation; other pages of
		 * the chunk are loaded with a plain HAT_LOAD.
		 */
		if (adr >= addr && adr < addr + len) {
			hat_setref(pp);
			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
			    "segmap_fault:pp %p vp %p offset %llx",
			    pp, vp, poff);
			if (type == F_SOFTLOCK)
				hat_flag = HAT_LOAD_LOCK;
		}

		/*
		 * Deal with VMODSORT pages here. If we know this is a write
		 * do the setmod now and allow write protection.
		 * As long as it's modified or not S_OTHER, remove write
		 * protection. With S_OTHER it's up to the FS to deal with this.
		 *
		 * Note that clearing PROT_WRITE in prot also applies to
		 * the pages handled in later iterations of this loop.
		 */
		if (IS_VMODSORT(vp)) {
			if (rw == S_WRITE)
				hat_setmod(pp);
			else if (rw != S_OTHER && !hat_ismod(pp))
				prot &= ~PROT_WRITE;
		}

		hat_memload(hat, adr, pp, prot, hat_flag);
		/* Soft-locked pages keep their page lock until unlock. */
		if (hat_flag != HAT_LOAD_LOCK)
			page_unlock(pp);
	}
	return (0);
}
    769 
    770 /*
    771  * This routine is used to start I/O on pages asynchronously.
    772  */
    773 static faultcode_t
    774 segmap_faulta(struct seg *seg, caddr_t addr)
    775 {
    776 	struct smap *smp;
    777 	struct vnode *vp;
    778 	u_offset_t off;
    779 	int err;
    780 
    781 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
    782 		int	newpage;
    783 		kmutex_t *smtx;
    784 
    785 		/*
    786 		 * Pages are successfully prefaulted and locked in
    787 		 * segmap_getmapflt and can't be unlocked until
    788 		 * segmap_release. No hat mappings have to be locked
    789 		 * and they also can't be unlocked as long as the
    790 		 * caller owns an active kpm addr.
    791 		 */
    792 #ifdef	DEBUG
    793 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
    794 			panic("segmap_faulta: smap not found "
    795 			    "for addr %p", (void *)addr);
    796 			/*NOTREACHED*/
    797 		}
    798 
    799 		smtx = SMAPMTX(smp);
    800 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
    801 		mutex_exit(smtx);
    802 		if (newpage)
    803 			cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
    804 			    (void *)smp);
    805 #endif
    806 		return (0);
    807 	}
    808 
    809 	segmapcnt.smp_faulta.value.ul++;
    810 	smp = GET_SMAP(seg, addr);
    811 
    812 	ASSERT(smp->sm_refcnt > 0);
    813 
    814 	vp = smp->sm_vp;
    815 	off = smp->sm_off;
    816 
    817 	if (vp == NULL) {
    818 		cmn_err(CE_WARN, "segmap_faulta - no vp");
    819 		return (FC_MAKE_ERR(EIO));
    820 	}
    821 
    822 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
    823 	    "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
    824 
    825 	err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
    826 	    & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
    827 	    seg, addr, S_READ, CRED(), NULL);
    828 
    829 	if (err)
    830 		return (FC_MAKE_ERR(err));
    831 	return (0);
    832 }
    833 
    834 /*ARGSUSED*/
    835 static int
    836 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
    837 {
    838 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    839 
    840 	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
    841 
    842 	/*
    843 	 * Need not acquire the segment lock since
    844 	 * "smd_prot" is a read-only field.
    845 	 */
    846 	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
    847 }
    848 
    849 static int
    850 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
    851 {
    852 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    853 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
    854 
    855 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    856 
    857 	if (pgno != 0) {
    858 		do {
    859 			protv[--pgno] = smd->smd_prot;
    860 		} while (pgno != 0);
    861 	}
    862 	return (0);
    863 }
    864 
    865 static u_offset_t
    866 segmap_getoffset(struct seg *seg, caddr_t addr)
    867 {
    868 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    869 
    870 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
    871 
    872 	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
    873 }
    874 
/*
 * seg_ops gettype entry point: segmap mappings are always reported as
 * MAP_SHARED, independent of addr.  The caller is expected to hold the
 * address space lock as reader.
 */
/*ARGSUSED*/
static int
segmap_gettype(struct seg *seg, caddr_t addr)
{
	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

	return (MAP_SHARED);
}
    883 
    884 /*ARGSUSED*/
    885 static int
    886 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
    887 {
    888 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    889 
    890 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
    891 
    892 	/* XXX - This doesn't make any sense */
    893 	*vpp = smd->smd_sm->sm_vp;
    894 	return (0);
    895 }
    896 
/*
 * Check to see if it makes sense to do kluster/read ahead to
 * addr + delta relative to the mapping at addr.  We assume here
 * that delta is a signed PAGESIZE'd multiple (which can be negative).
 *
 * For segmap we always "approve" of this action from our standpoint,
 * which is expressed by unconditionally returning 0; none of the
 * arguments are examined.
 */
/*ARGSUSED*/
static int
segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
	return (0);
}
    910 
    911 static void
    912 segmap_badop()
    913 {
    914 	panic("segmap_badop");
    915 	/*NOTREACHED*/
    916 }
    917 
/*
 * Special private segmap operations
 */

/*
 * Add smap to the appropriate free list.
 *
 * The caller must hold the smap mutex and the slot must be unreferenced
 * (sm_refcnt == 0, otherwise the routine panics).  The slot is appended
 * to the release queue of the freelist selected by sm_free_ndx, and a
 * waiter is signalled when the queue was empty before the insertion.
 */
static void
segmap_smapadd(struct smap *smp)
{
	struct smfree *sm;
	struct smap *smpfreelist;
	struct sm_freeq *releq;

	ASSERT(MUTEX_HELD(SMAPMTX(smp)));

	if (smp->sm_refcnt != 0) {
		panic("segmap_smapadd");
		/*NOTREACHED*/
	}

	sm = &smd_free[smp->sm_free_ndx];
	/*
	 * Add to the tail of the release queue
	 * Note that sm_releq and sm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = sm->sm_releq;
	/* Record in sm_flags which of the two queues the slot goes on. */
	if (releq == &sm->sm_freeq[0])
		smp->sm_flags |= SM_QNDX_ZERO;
	else
		smp->sm_flags &= ~SM_QNDX_ZERO;
	mutex_enter(&releq->smq_mtx);
	smpfreelist = releq->smq_free;
	if (smpfreelist == 0) {
		int want;

		/* Queue was empty: smp becomes a one element ring. */
		releq->smq_free = smp->sm_next = smp->sm_prev = smp;
		/*
		 * Both queue mutexes held to set sm_want;
		 * snapshot the value before dropping releq mutex.
		 * If sm_want appears after the releq mutex is dropped,
		 * then the smap just freed is already gone.
		 */
		want = sm->sm_want;
		mutex_exit(&releq->smq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex
		 * then recheck after obtaining sm_freeq[0] mutex as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&sm->sm_freeq[0].smq_mtx);
			if (sm->sm_want)
				cv_signal(&sm->sm_free_cv);
			mutex_exit(&sm->sm_freeq[0].smq_mtx);
		}
	} else {
		/*
		 * Link smp in just before the head of the circular list,
		 * i.e. at the tail; the head pointer is left untouched.
		 */
		smp->sm_next = smpfreelist;
		smp->sm_prev = smpfreelist->sm_prev;
		smpfreelist->sm_prev = smp;
		smp->sm_prev->sm_next = smp;
		mutex_exit(&releq->smq_mtx);
	}
}
    985 
    986 
    987 static struct smap *
    988 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
    989 {
    990 	struct smap **hpp;
    991 	struct smap *tmp;
    992 	kmutex_t *hmtx;
    993 
    994 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
    995 	ASSERT(smp->sm_vp == NULL);
    996 	ASSERT(smp->sm_hash == NULL);
    997 	ASSERT(smp->sm_prev == NULL);
    998 	ASSERT(smp->sm_next == NULL);
    999 	ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
   1000 
   1001 	hmtx = SHASHMTX(hashid);
   1002 
   1003 	mutex_enter(hmtx);
   1004 	/*
   1005 	 * First we need to verify that no one has created a smp
   1006 	 * with (vp,off) as its tag before we us.
   1007 	 */
   1008 	for (tmp = smd_hash[hashid].sh_hash_list;
   1009 	    tmp != NULL; tmp = tmp->sm_hash)
   1010 		if (tmp->sm_vp == vp && tmp->sm_off == off)
   1011 			break;
   1012 
   1013 	if (tmp == NULL) {
   1014 		/*
   1015 		 * No one created one yet.
   1016 		 *
   1017 		 * Funniness here - we don't increment the ref count on the
   1018 		 * vnode * even though we have another pointer to it here.
   1019 		 * The reason for this is that we don't want the fact that
   1020 		 * a seg_map entry somewhere refers to a vnode to prevent the
   1021 		 * vnode * itself from going away.  This is because this
   1022 		 * reference to the vnode is a "soft one".  In the case where
   1023 		 * a mapping is being used by a rdwr [or directory routine?]
   1024 		 * there already has to be a non-zero ref count on the vnode.
   1025 		 * In the case where the vp has been freed and the the smap
   1026 		 * structure is on the free list, there are no pages in memory
   1027 		 * that can refer to the vnode.  Thus even if we reuse the same
   1028 		 * vnode/smap structure for a vnode which has the same
   1029 		 * address but represents a different object, we are ok.
   1030 		 */
   1031 		smp->sm_vp = vp;
   1032 		smp->sm_off = off;
   1033 
   1034 		hpp = &smd_hash[hashid].sh_hash_list;
   1035 		smp->sm_hash = *hpp;
   1036 		*hpp = smp;
   1037 #ifdef SEGMAP_HASHSTATS
   1038 		smd_hash_len[hashid]++;
   1039 #endif
   1040 	}
   1041 	mutex_exit(hmtx);
   1042 
   1043 	return (tmp);
   1044 }
   1045 
   1046 static void
   1047 segmap_hashout(struct smap *smp)
   1048 {
   1049 	struct smap **hpp, *hp;
   1050 	struct vnode *vp;
   1051 	kmutex_t *mtx;
   1052 	int hashid;
   1053 	u_offset_t off;
   1054 
   1055 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
   1056 
   1057 	vp = smp->sm_vp;
   1058 	off = smp->sm_off;
   1059 
   1060 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
   1061 	mtx = SHASHMTX(hashid);
   1062 	mutex_enter(mtx);
   1063 
   1064 	hpp = &smd_hash[hashid].sh_hash_list;
   1065 	for (;;) {
   1066 		hp = *hpp;
   1067 		if (hp == NULL) {
   1068 			panic("segmap_hashout");
   1069 			/*NOTREACHED*/
   1070 		}
   1071 		if (hp == smp)
   1072 			break;
   1073 		hpp = &hp->sm_hash;
   1074 	}
   1075 
   1076 	*hpp = smp->sm_hash;
   1077 	smp->sm_hash = NULL;
   1078 #ifdef SEGMAP_HASHSTATS
   1079 	smd_hash_len[hashid]--;
   1080 #endif
   1081 	mutex_exit(mtx);
   1082 
   1083 	smp->sm_vp = NULL;
   1084 	smp->sm_off = (u_offset_t)0;
   1085 
   1086 }
   1087 
   1088 /*
   1089  * Attempt to free unmodified, unmapped, and non locked segmap
   1090  * pages.
   1091  */
   1092 void
   1093 segmap_pagefree(struct vnode *vp, u_offset_t off)
   1094 {
   1095 	u_offset_t pgoff;
   1096 	page_t  *pp;
   1097 
   1098 	for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
   1099 
   1100 		if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
   1101 			continue;
   1102 
   1103 		switch (page_release(pp, 1)) {
   1104 		case PGREL_NOTREL:
   1105 			segmapcnt.smp_free_notfree.value.ul++;
   1106 			break;
   1107 		case PGREL_MOD:
   1108 			segmapcnt.smp_free_dirty.value.ul++;
   1109 			break;
   1110 		case PGREL_CLEAN:
   1111 			segmapcnt.smp_free.value.ul++;
   1112 			break;
   1113 		}
   1114 	}
   1115 }
   1116 
   1117 /*
   1118  * Locks held on entry: smap lock
   1119  * Locks held on exit : smap lock.
   1120  */
   1121 
   1122 static void
   1123 grab_smp(struct smap *smp, page_t *pp)
   1124 {
   1125 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
   1126 	ASSERT(smp->sm_refcnt == 0);
   1127 
   1128 	if (smp->sm_vp != (struct vnode *)NULL) {
   1129 		struct vnode	*vp = smp->sm_vp;
   1130 		u_offset_t 	off = smp->sm_off;
   1131 		/*
   1132 		 * Destroy old vnode association and
   1133 		 * unload any hardware translations to
   1134 		 * the old object.
   1135 		 */
   1136 		smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
   1137 		segmap_hashout(smp);
   1138 
   1139 		/*
   1140 		 * This node is off freelist and hashlist,
   1141 		 * so there is no reason to drop/reacquire sm_mtx
   1142 		 * across calls to hat_unload.
   1143 		 */
   1144 		if (segmap_kpm) {
   1145 			caddr_t vaddr;
   1146 			int hat_unload_needed = 0;
   1147 
   1148 			/*
   1149 			 * unload kpm mapping
   1150 			 */
   1151 			if (pp != NULL) {
   1152 				vaddr = hat_kpm_page2va(pp, 1);
   1153 				hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
   1154 				page_unlock(pp);
   1155 			}
   1156 
   1157 			/*
   1158 			 * Check if we have (also) the rare case of a
   1159 			 * non kpm mapping.
   1160 			 */
   1161 			if (smp->sm_flags & SM_NOTKPM_RELEASED) {
   1162 				hat_unload_needed = 1;
   1163 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
   1164 			}
   1165 
   1166 			if (hat_unload_needed) {
   1167 				hat_unload(kas.a_hat, segkmap->s_base +
   1168 				    ((smp - smd_smap) * MAXBSIZE),
   1169 				    MAXBSIZE, HAT_UNLOAD);
   1170 			}
   1171 
   1172 		} else {
   1173 			ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
   1174 			smp->sm_flags &= ~SM_NOTKPM_RELEASED;
   1175 			hat_unload(kas.a_hat, segkmap->s_base +
   1176 			    ((smp - smd_smap) * MAXBSIZE),
   1177 			    MAXBSIZE, HAT_UNLOAD);
   1178 		}
   1179 		segmap_pagefree(vp, off);
   1180 	}
   1181 }
   1182 
   /*
    * Take an smap off a free list and return it with its smap mutex held
    * (see the ASSERTs just before the return).  There is no failure
    * return: every path either returns an smap or loops back to
    * retry_queue, sleeping on sm_free_cv if necessary.
    *
    * free_ndx selects the starting entry of smd_free[].  The allocation
    * queue of that entry is scanned for an smap whose mutex can be taken
    * with mutex_tryenter() and, when segmap_kpm is set and the smap still
    * has a vnode and a kpme page, whose page can be locked SE_SHARED with
    * page_trylock().  The chosen smap is unlinked from the queue and passed
    * to grab_smp() together with the locked page (or NULL).
    *
    * When the allocation queue is empty, or every smap on it was skipped:
    *  - if sm_allocq changed in the meantime, retry;
    *  - if the release queue mutex cannot be taken without blocking, drop
    *    the allocation queue mutex, wait for the holder and retry;
    *  - if the release queue has entries, swap the two queues and retry
    *    (after a delay(hz >> 2) if a page lock attempt failed earlier);
    *  - otherwise step to smd_free[(free_ndx + smd_ncolor) & smd_freemsk];
    *    once that wraps around to the starting index, bump sm_want and
    *    cv_wait() on sm_free_cv.
    */
   1183 static struct smap *
   1184 get_free_smp(int free_ndx)
   1185 {
   1186 	struct smfree *sm;
   1187 	kmutex_t *smtx;
   1188 	struct smap *smp, *first;
   1189 	struct sm_freeq *allocq, *releq;
   1190 	struct kpme *kpme;
   1191 	page_t *pp = NULL;
   1192 	int end_ndx, page_locked = 0;
   1193 
        	/* end_ndx remembers where the search started to detect wrap-around */
   1194 	end_ndx = free_ndx;
   1195 	sm = &smd_free[free_ndx];
   1196 
   1197 retry_queue:
   1198 	allocq = sm->sm_allocq;
   1199 	mutex_enter(&allocq->smq_mtx);
   1200 
   1201 	if ((smp = allocq->smq_free) == NULL) {
   1202 
   1203 skip_queue:
   1204 		/*
   1205 		 * The alloc list is empty or this queue is being skipped;
   1206 		 * first see if the allocq toggled.
   1207 		 */
   1208 		if (sm->sm_allocq != allocq) {
   1209 			/* queue changed */
   1210 			mutex_exit(&allocq->smq_mtx);
   1211 			goto retry_queue;
   1212 		}
   1213 		releq = sm->sm_releq;
   1214 		if (!mutex_tryenter(&releq->smq_mtx)) {
   1215 			/* cannot get releq; a free smp may be there now */
   1216 			mutex_exit(&allocq->smq_mtx);
   1217 
   1218 			/*
   1219 			 * This loop could spin forever if this thread has
   1220 			 * higher priority than the thread that is holding
   1221 			 * releq->smq_mtx. In order to force the other thread
   1222 			 * to run, we'll lock/unlock the mutex which is safe
   1223 			 * since we just unlocked the allocq mutex.
   1224 			 */
   1225 			mutex_enter(&releq->smq_mtx);
   1226 			mutex_exit(&releq->smq_mtx);
   1227 			goto retry_queue;
   1228 		}
        		/* both the allocq and the releq mutex are held from here on */
   1229 		if (releq->smq_free == NULL) {
   1230 			/*
   1231 			 * This freelist is empty.
   1232 			 * This should not happen unless clients
   1233 			 * are failing to release the segmap
   1234 			 * window after accessing the data.
   1235 			 * Before resorting to sleeping, try
   1236 			 * the next list of the same color.
   1237 			 */
   1238 			free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
   1239 			if (free_ndx != end_ndx) {
   1240 				mutex_exit(&releq->smq_mtx);
   1241 				mutex_exit(&allocq->smq_mtx);
   1242 				sm = &smd_free[free_ndx];
   1243 				goto retry_queue;
   1244 			}
   1245 			/*
   1246 			 * Tried all freelists of the same color once,
   1247 			 * wait on this list and hope something gets freed.
   1248 			 */
        			/*
        			 * NOTE(review): allocq and releq are presumably
        			 * sm_freeq[0] and sm_freeq[1] in some order, so
        			 * the sm_freeq[1] mutex is dropped here and the
        			 * sm_freeq[0] mutex is the one handed to
        			 * cv_wait() - confirm against the smfree setup.
        			 */
   1249 			segmapcnt.smp_get_nofree.value.ul++;
   1250 			sm->sm_want++;
   1251 			mutex_exit(&sm->sm_freeq[1].smq_mtx);
   1252 			cv_wait(&sm->sm_free_cv,
   1253 			    &sm->sm_freeq[0].smq_mtx);
   1254 			sm->sm_want--;
   1255 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
   1256 			sm = &smd_free[free_ndx];
   1257 			goto retry_queue;
   1258 		} else {
   1259 			/*
   1260 			 * Something on the rele queue; flip the alloc
   1261 			 * and rele queues and retry.
   1262 			 */
   1263 			sm->sm_allocq = releq;
   1264 			sm->sm_releq = allocq;
   1265 			mutex_exit(&allocq->smq_mtx);
   1266 			mutex_exit(&releq->smq_mtx);
        			/*
        			 * page_locked records that a page_trylock()
        			 * failed during an earlier scan; pause before
        			 * scanning again in that case.
        			 */
   1267 			if (page_locked) {
   1268 				delay(hz >> 2);
   1269 				page_locked = 0;
   1270 			}
   1271 			goto retry_queue;
   1272 		}
   1273 	} else {
   1274 		/*
   1275 		 * Fastpath the case we get the smap mutex
   1276 		 * on the first try.
   1277 		 */
   1278 		first = smp;
   1279 next_smap:
   1280 		smtx = SMAPMTX(smp);
   1281 		if (!mutex_tryenter(smtx)) {
   1282 			/*
   1283 			 * Another thread is trying to reclaim this slot.
   1284 			 * Skip to the next queue or smap.
   1285 			 */
   1286 			if ((smp = smp->sm_next) == first) {
   1287 				goto skip_queue;
   1288 			} else {
   1289 				goto next_smap;
   1290 			}
   1291 		} else {
   1292 			/*
   1293 			 * if kpme exists, get shared lock on the page
   1294 			 */
   1295 			if (segmap_kpm && smp->sm_vp != NULL) {
   1296 
   1297 				kpme = GET_KPME(smp);
   1298 				pp = kpme->kpe_page;
   1299 
   1300 				if (pp != NULL) {
   1301 					if (!page_trylock(pp, SE_SHARED)) {
        						/*
        						 * Page is busy: give up on
        						 * this smap, advance first
        						 * and only then drop its
        						 * mutex.
        						 */
   1302 						smp = smp->sm_next;
   1303 						mutex_exit(smtx);
   1304 						page_locked = 1;
   1305 
   1306 						pp = NULL;
   1307 
   1308 						if (smp == first) {
   1309 							goto skip_queue;
   1310 						} else {
   1311 							goto next_smap;
   1312 						}
   1313 					} else {
        						/*
        						 * Recheck now that the page
        						 * is locked: the kpme page
        						 * may have gone away.
        						 */
   1314 						if (kpme->kpe_page == NULL) {
   1315 							page_unlock(pp);
   1316 							pp = NULL;
   1317 						}
   1318 					}
   1319 				}
   1320 			}
   1321 
   1322 			/*
   1323 			 * At this point, we've selected smp.  Remove smp
   1324 			 * from its freelist.  If smp is the first one in
   1325 			 * the freelist, update the head of the freelist.
   1326 			 */
   1327 			if (first == smp) {
   1328 				ASSERT(first == allocq->smq_free);
   1329 				allocq->smq_free = smp->sm_next;
   1330 			}
   1331 
   1332 			/*
   1333 			 * if the head of the freelist still points to smp,
   1334 			 * then there are no more free smaps in that list.
   1335 			 */
   1336 			if (allocq->smq_free == smp)
   1337 				/*
   1338 				 * Took the last one
   1339 				 */
   1340 				allocq->smq_free = NULL;
   1341 			else {
   1342 				smp->sm_prev->sm_next = smp->sm_next;
   1343 				smp->sm_next->sm_prev = smp->sm_prev;
   1344 			}
   1345 			mutex_exit(&allocq->smq_mtx);
   1346 			smp->sm_prev = smp->sm_next = NULL;
   1347 
   1348 			/*
   1349 			 * if pp != NULL, pp must have been locked;
   1350 			 * grab_smp() unlocks pp.
   1351 			 */
   1352 			ASSERT((pp == NULL) || PAGE_LOCKED(pp));
   1353 			grab_smp(smp, pp);
   1354 			/* return smp locked. */
   1355 			ASSERT(SMAPMTX(smp) == smtx);
   1356 			ASSERT(MUTEX_HELD(smtx));
   1357 			return (smp);
   1358 		}
   1359 	}
   1360 }
   1361 
   1362 /*
   1363  * Special public segmap operations
   1364  */
   1365 
   1366 /*
   1367  * Create pages (without using VOP_GETPAGE) and load up translations to them.
   1368  * If softlock is TRUE, then set things up so that it looks like a call
   1369  * to segmap_fault with F_SOFTLOCK.
   1370  *
   1371  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
   1372  *
   1373  * All fields in the generic segment (struct seg) are considered to be
   1374  * read-only for "segmap" even though the kernel address space (kas) may
   1375  * not be locked, hence no lock is needed to access them.
   1376  */
   1377 int
   1378 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
   1379 {
   1380 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
   1381 	page_t *pp;
   1382 	u_offset_t off;
   1383 	struct smap *smp;
   1384 	struct vnode *vp;
   1385 	caddr_t eaddr;
   1386 	int newpage = 0;
   1387 	uint_t prot;
   1388 	kmutex_t *smtx;
   1389 	int hat_flag;
   1390 
   1391 	ASSERT(seg->s_as == &kas);
   1392 
   1393 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
   1394 		/*
   1395 		 * Pages are successfully prefaulted and locked in
   1396 		 * segmap_getmapflt and can't be unlocked until
   1397 		 * segmap_release. The SM_KPM_NEWPAGE flag is set
   1398 		 * in segmap_pagecreate_kpm when new pages are created.
   1399 		 * and it is returned as "newpage" indication here.
   1400 		 */
   1401 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
   1402 			panic("segmap_pagecreate: smap not found "
   1403 			    "for addr %p", (void *)addr);
   1404 			/*NOTREACHED*/
   1405 		}
   1406 
   1407 		smtx = SMAPMTX(smp);
   1408 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
   1409 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
   1410 		mutex_exit(smtx);
   1411 
   1412 		return (newpage);
   1413 	}
   1414 
   1415 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
   1416 
   1417 	eaddr = addr + len;
   1418 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
   1419 
   1420 	smp = GET_SMAP(seg, addr);
   1421 
   1422 	/*
   1423 	 * We don't grab smp mutex here since we assume the smp
   1424 	 * has a refcnt set already which prevents the slot from
   1425 	 * changing its id.
   1426 	 */
   1427 	ASSERT(smp->sm_refcnt > 0);
   1428 
   1429 	vp = smp->sm_vp;
   1430 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
   1431 	prot = smd->smd_prot;
   1432 
   1433 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
   1434 		hat_flag = HAT_LOAD;
   1435 		pp = page_lookup(vp, off, SE_SHARED);
   1436 		if (pp == NULL) {
   1437 			ushort_t bitindex;
   1438 
   1439 			if ((pp = page_create_va(vp, off,
   1440 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
   1441 				panic("segmap_pagecreate: page_create failed");
   1442 				/*NOTREACHED*/
   1443 			}
   1444 			newpage = 1;
   1445 			page_io_unlock(pp);
   1446 
   1447 			/*
   1448 			 * Since pages created here do not contain valid
   1449 			 * data until the caller writes into them, the
   1450 			 * "exclusive" lock will not be dropped to prevent
   1451 			 * other users from accessing the page.  We also
   1452 			 * have to lock the translation to prevent a fault
   1453 			 * from occurring when the virtual address mapped by
   1454 			 * this page is written into.  This is necessary to
   1455 			 * avoid a deadlock since we haven't dropped the
   1456 			 * "exclusive" lock.
   1457 			 */
   1458 			bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
   1459 
   1460 			/*
   1461 			 * Large Files: The following assertion is to
   1462 			 * verify the cast above.
   1463 			 */
   1464 			ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
   1465 			smtx = SMAPMTX(smp);
   1466 			mutex_enter(smtx);
   1467 			smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
   1468 			mutex_exit(smtx);
   1469 
   1470 			hat_flag = HAT_LOAD_LOCK;
   1471 		} else if (softlock) {
   1472 			hat_flag = HAT_LOAD_LOCK;
   1473 		}
   1474 
   1475 		if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
   1476 			hat_setmod(pp);
   1477 
   1478 		hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
   1479 
   1480 		if (hat_flag != HAT_LOAD_LOCK)
   1481 			page_unlock(pp);
   1482 
   1483 		TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
   1484 		    "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
   1485 		    seg, addr, pp, vp, off);
   1486 	}
   1487 
   1488 	return (newpage);
   1489 }
   1490 
   1491 void
   1492 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
   1493 {
   1494 	struct smap	*smp;
   1495 	ushort_t	bitmask;
   1496 	page_t		*pp;
   1497 	struct	vnode	*vp;
   1498 	u_offset_t	off;
   1499 	caddr_t		eaddr;
   1500 	kmutex_t	*smtx;
   1501 
   1502 	ASSERT(seg->s_as == &kas);
   1503 
   1504 	eaddr = addr + len;
   1505 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
   1506 
   1507 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
   1508 		/*
   1509 		 * Pages are successfully prefaulted and locked in
   1510 		 * segmap_getmapflt and can't be unlocked until
   1511 		 * segmap_release, so no pages or hat mappings have
   1512 		 * to be unlocked at this point.
   1513 		 */
   1514 #ifdef DEBUG
   1515 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
   1516 			panic("segmap_pageunlock: smap not found "
   1517 			    "for addr %p", (void *)addr);
   1518 			/*NOTREACHED*/
   1519 		}
   1520 
   1521 		ASSERT(smp->sm_refcnt > 0);
   1522 		mutex_exit(SMAPMTX(smp));
   1523 #endif
   1524 		return;
   1525 	}
   1526 
   1527 	smp = GET_SMAP(seg, addr);
   1528 	smtx = SMAPMTX(smp);
   1529 
   1530 	ASSERT(smp->sm_refcnt > 0);
   1531 
   1532 	vp = smp->sm_vp;
   1533 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
   1534 
   1535 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
   1536 		bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
   1537 
   1538 		/*
   1539 		 * Large Files: Following assertion is to verify
   1540 		 * the correctness of the cast to (int) above.
   1541 		 */
   1542 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
   1543 
   1544 		/*
   1545 		 * If the bit corresponding to "off" is set,
   1546 		 * clear this bit in the bitmap, unlock translations,
   1547 		 * and release the "exclusive" lock on the page.
   1548 		 */
   1549 		if (smp->sm_bitmap & bitmask) {
   1550 			mutex_enter(smtx);
   1551 			smp->sm_bitmap &= ~bitmask;
   1552 			mutex_exit(smtx);
   1553 
   1554 			hat_unlock(kas.a_hat, addr, PAGESIZE);
   1555 
   1556 			/*
   1557 			 * Use page_find() instead of page_lookup() to
   1558 			 * find the page since we know that it has
   1559 			 * "exclusive" lock.
   1560 			 */
   1561 			pp = page_find(vp, off);
   1562 			if (pp == NULL) {
   1563 				panic("segmap_pageunlock: page not found");
   1564 				/*NOTREACHED*/
   1565 			}
   1566 			if (rw == S_WRITE) {
   1567 				hat_setrefmod(pp);
   1568 			} else if (rw != S_OTHER) {
   1569 				hat_setref(pp);
   1570 			}
   1571 
   1572 			page_unlock(pp);
   1573 		}
   1574 	}
   1575 }
   1576 
   1577 caddr_t
   1578 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
   1579 {
   1580 	return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
   1581 }
   1582 
   1583 /*
   1584  * This is the magic virtual address that offset 0 of an ELF
   1585  * file gets mapped to in user space. This is used to pick
   1586  * the vac color on the freelist.
         *
         * NOTE(review): the color computation in segmap_getmapflt() below
         * derives the color from the file offset alone
         * ((baseoff >> MAXBSHIFT) & smd_colormsk) and does not reference
         * this constant - confirm whether it is still used elsewhere.
   1587  */
   1588 #define	ELF_OFFZERO_VA	(0x10000)
   1589 /*
   1590  * segmap_getmapflt allocates a MAXBSIZE big slot to map the vnode vp
   1591  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
   1592  * The return address is  always MAXBSIZE aligned.
   1593  *
   1594  * If forcefault is nonzero and the MMU translations haven't yet been created,
   1595  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
         *
         * Parameters:
         *	seg		must be segkmap in the kernel address space (asserted)
         *	vp		vnode to map; for a VBLK vnode the vnode returned by
         *			common_specvp() is used instead
         *	off, len	range to map; it must not extend past the MAXBSIZE
         *			window containing off, otherwise the routine panics
         *	forcefault	0, or a mode value; SM_PAGECREATE and SM_LOCKPROTO
         *			are the values tested here
         *	rw		access type (S_READ, S_WRITE, S_OTHER, ...)
         *
         * An smap tagged (vp, baseoff) is looked up in the hash and referenced,
         * or a free one is obtained with get_free_smp() and hashed in.  The
         * address returned is either a kpm address for the page (segmap_kpm set
         * and the request qualifies) or the address of the smap's slot in the
         * segkmap range.  There is no error return; errors are left to
         * segmap_fault().
   1596  */
   1597 caddr_t
   1598 segmap_getmapflt(
   1599 	struct seg *seg,
   1600 	struct vnode *vp,
   1601 	u_offset_t off,
   1602 	size_t len,
   1603 	int forcefault,
   1604 	enum seg_rw rw)
   1605 {
   1606 	struct smap *smp, *nsmp;
   1607 	extern struct vnode *common_specvp();
   1608 	caddr_t baseaddr;			/* MAXBSIZE aligned */
   1609 	u_offset_t baseoff;
   1610 	int newslot;
   1611 	caddr_t vaddr;
   1612 	int color, hashid;
   1613 	kmutex_t *hashmtx, *smapmtx;
   1614 	struct smfree *sm;
   1615 	page_t	*pp;
   1616 	struct kpme *kpme;
   1617 	uint_t	prot;
   1618 	caddr_t base;
   1619 	page_t	*pl[MAXPPB + 1];
   1620 	int	error;
   1621 	int	is_kpm = 1;
   1622 
   1623 	ASSERT(seg->s_as == &kas);
   1624 	ASSERT(seg == segkmap);
   1625 
        	/* baseoff is off rounded down to the start of its MAXBSIZE window */
   1626 	baseoff = off & (offset_t)MAXBMASK;
   1627 	if (off + len > baseoff + MAXBSIZE) {
   1628 		panic("segmap_getmap bad len");
   1629 		/*NOTREACHED*/
   1630 	}
   1631 
   1632 	/*
   1633 	 * If this is a block device we have to be sure to use the
   1634 	 * "common" block device vnode for the mapping.
   1635 	 */
   1636 	if (vp->v_type == VBLK)
   1637 		vp = common_specvp(vp);
   1638 
   1639 	smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
   1640 
        	/*
        	 * The kpm path is not used when segmap_kpm is off, nor for
        	 * SM_PAGECREATE requests that are not writes.
        	 */
   1641 	if (segmap_kpm == 0 ||
   1642 	    (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
   1643 		is_kpm = 0;
   1644 	}
   1645 
   1646 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
   1647 	hashmtx = SHASHMTX(hashid);
   1648 
   1649 retry_hash:
   1650 	mutex_enter(hashmtx);
   1651 	for (smp = smd_hash[hashid].sh_hash_list;
   1652 	    smp != NULL; smp = smp->sm_hash)
   1653 		if (smp->sm_vp == vp && smp->sm_off == baseoff)
   1654 			break;
   1655 	mutex_exit(hashmtx);
   1656 
   1657 vrfy_smp:
   1658 	if (smp != NULL) {
   1659 
   1660 		ASSERT(vp->v_count != 0);
   1661 
   1662 		/*
   1663 		 * Get smap lock and recheck its tag. The hash lock
   1664 		 * is dropped since the hash is based on (vp, off)
   1665 		 * and (vp, off) won't change when we have smap mtx.
   1666 		 */
   1667 		smapmtx = SMAPMTX(smp);
   1668 		mutex_enter(smapmtx);
   1669 		if (smp->sm_vp != vp || smp->sm_off != baseoff) {
   1670 			mutex_exit(smapmtx);
   1671 			goto retry_hash;
   1672 		}
   1673 
   1674 		if (smp->sm_refcnt == 0) {
   1675 
   1676 			smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
   1677 
   1678 			/*
   1679 			 * Could still be on the free list. However, this
   1680 			 * could also be an smp that is transitioning from
   1681 			 * the free list when we have too much contention
   1682 			 * for the smapmtx's. In this case, we have an
   1683 			 * unlocked smp that is not on the free list any
   1684 			 * longer, but still has a 0 refcnt.  The only way
   1685 			 * to be sure is to check the freelist pointers.
   1686 			 * Since we now have the smapmtx, we are guaranteed
   1687 			 * that the (vp, off) won't change, so we are safe
   1688 			 * to reclaim it.  get_free_smp() knows that this
   1689 			 * can happen, and it will check the refcnt.
   1690 			 */
   1691 
   1692 			if ((smp->sm_next != NULL)) {
   1693 				struct sm_freeq *freeq;
   1694 
   1695 				ASSERT(smp->sm_prev != NULL);
   1696 				sm = &smd_free[smp->sm_free_ndx];
   1697 
        				/*
        				 * SM_QNDX_ZERO tells which of the two
        				 * free queues the smap was put on.
        				 */
   1698 				if (smp->sm_flags & SM_QNDX_ZERO)
   1699 					freeq = &sm->sm_freeq[0];
   1700 				else
   1701 					freeq = &sm->sm_freeq[1];
   1702 
   1703 				mutex_enter(&freeq->smq_mtx);
   1704 				if (freeq->smq_free != smp) {
   1705 					/*
   1706 					 * fastpath normal case
   1707 					 */
   1708 					smp->sm_prev->sm_next = smp->sm_next;
   1709 					smp->sm_next->sm_prev = smp->sm_prev;
   1710 				} else if (smp == smp->sm_next) {
   1711 					/*
   1712 					 * Taking the last smap on freelist
   1713 					 */
   1714 					freeq->smq_free = NULL;
   1715 				} else {
   1716 					/*
   1717 					 * Reclaiming 1st smap on list
   1718 					 */
   1719 					freeq->smq_free = smp->sm_next;
   1720 					smp->sm_prev->sm_next = smp->sm_next;
   1721 					smp->sm_next->sm_prev = smp->sm_prev;
   1722 				}
   1723 				mutex_exit(&freeq->smq_mtx);
   1724 				smp->sm_prev = smp->sm_next = NULL;
   1725 			} else {
   1726 				ASSERT(smp->sm_prev == NULL);
   1727 				segmapcnt.smp_stolen.value.ul++;
   1728 			}
   1729 
   1730 		} else {
   1731 			segmapcnt.smp_get_use.value.ul++;
   1732 		}
   1733 		smp->sm_refcnt++;		/* another user */
   1734 
   1735 		/*
   1736 		 * We don't invoke segmap_fault via TLB miss, so we set ref
   1737 		 * and mod bits in advance. For S_OTHER  we set them in
   1738 		 * segmap_fault F_SOFTUNLOCK.
   1739 		 */
   1740 		if (is_kpm) {
   1741 			if (rw == S_WRITE) {
   1742 				smp->sm_flags |= SM_WRITE_DATA;
   1743 			} else if (rw == S_READ) {
   1744 				smp->sm_flags |= SM_READ_DATA;
   1745 			}
   1746 		}
   1747 		mutex_exit(smapmtx);
   1748 
   1749 		newslot = 0;
   1750 	} else {
   1751 
   1752 		uint32_t free_ndx, *free_ndxp;
   1753 		union segmap_cpu *scpu;
   1754 
   1755 		/*
   1756 		 * On a PAC machine or a machine with anti-alias
   1757 		 * hardware, smd_colormsk will be zero.
   1758 		 *
   1759 		 * On a VAC machine- pick color by offset in the file
   1760 		 * so we won't get VAC conflicts on elf files.
   1761 		 * On data files, color does not matter but we
   1762 		 * don't know what kind of file it is so we always
   1763 		 * pick color by offset. This causes color
   1764 		 * corresponding to file offset zero to be used more
   1765 		 * heavily.
   1766 		 */
   1767 		color = (baseoff >> MAXBSHIFT) & smd_colormsk;
   1768 		scpu = smd_cpu+CPU->cpu_seqid;
   1769 		free_ndxp = &scpu->scpu.scpu_free_ndx[color];
        		/*
        		 * Advance this CPU's per-color index by smd_ncolor so that
        		 * successive requests start on different free lists.
        		 */
   1770 		free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
   1771 #ifdef DEBUG
   1772 		colors_used[free_ndx]++;
   1773 #endif /* DEBUG */
   1774 
   1775 		/*
   1776 		 * Get a locked smp slot from the free list.
   1777 		 */
   1778 		smp = get_free_smp(free_ndx);
   1779 		smapmtx = SMAPMTX(smp);
   1780 
   1781 		ASSERT(smp->sm_vp == NULL);
   1782 
   1783 		if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
   1784 			/*
   1785 			 * Failed to hashin, there exists one now.
   1786 			 * Return the smp we just allocated.
   1787 			 */
   1788 			segmap_smapadd(smp);
   1789 			mutex_exit(smapmtx);
   1790 
   1791 			smp = nsmp;
   1792 			goto vrfy_smp;
   1793 		}
   1794 		smp->sm_refcnt++;		/* another user */
   1795 
   1796 		/*
   1797 		 * We don't invoke segmap_fault via TLB miss, so we set ref
   1798 		 * and mod bits in advance. For S_OTHER  we set them in
   1799 		 * segmap_fault F_SOFTUNLOCK.
   1800 		 */
   1801 		if (is_kpm) {
   1802 			if (rw == S_WRITE) {
   1803 				smp->sm_flags |= SM_WRITE_DATA;
   1804 			} else if (rw == S_READ) {
   1805 				smp->sm_flags |= SM_READ_DATA;
   1806 			}
   1807 		}
   1808 		mutex_exit(smapmtx);
   1809 
   1810 		newslot = 1;
   1811 	}
   1812 
   1813 	if (!is_kpm)
   1814 		goto use_segmap_range;
   1815 
   1816 	/*
   1817 	 * Use segkpm
   1818 	 */
   1819 	/* Lint directive required until 6746211 is fixed */
   1820 	/*CONSTCOND*/
   1821 	ASSERT(PAGESIZE == MAXBSIZE);
   1822 
   1823 	/*
   1824 	 * remember the last smp faulted on this cpu.
   1825 	 */
   1826 	(smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
   1827 
   1828 	if (forcefault == SM_PAGECREATE) {
   1829 		baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
   1830 		return (baseaddr);
   1831 	}
   1832 
        	/*
        	 * An existing slot that already has a kpme page may be usable
        	 * without calling VOP_GETPAGE; any "break" below falls through
        	 * to the VOP_GETPAGE path.
        	 */
   1833 	if (newslot == 0 &&
   1834 	    (pp = GET_KPME(smp)->kpe_page) != NULL) {
   1835 
   1836 		/* fastpath */
   1837 		switch (rw) {
   1838 		case S_READ:
   1839 		case S_WRITE:
   1840 			if (page_trylock(pp, SE_SHARED)) {
        				/*
        				 * The page was read without a lock; make
        				 * sure it still is the page for
        				 * (vp, baseoff).
        				 */
   1841 				if (PP_ISFREE(pp) ||
   1842 				    !(pp->p_vnode == vp &&
   1843 				    pp->p_offset == baseoff)) {
   1844 					page_unlock(pp);
   1845 					pp = page_lookup(vp, baseoff,
   1846 					    SE_SHARED);
   1847 				}
   1848 			} else {
   1849 				pp = page_lookup(vp, baseoff, SE_SHARED);
   1850 			}
   1851 
   1852 			if (pp == NULL) {
   1853 				ASSERT(GET_KPME(smp)->kpe_page == NULL);
   1854 				break;
   1855 			}
   1856 
        			/*
        			 * For a write, the fastpath is only taken when the
        			 * page already has both P_MOD and P_REF set.
        			 */
   1857 			if (rw == S_WRITE &&
   1858 			    hat_page_getattr(pp, P_MOD | P_REF) !=
   1859 			    (P_MOD | P_REF)) {
   1860 				page_unlock(pp);
   1861 				break;
   1862 			}
   1863 
   1864 			/*
   1865 			 * We have the p_selock as reader, grab_smp
   1866 			 * can't hit us, we have bumped the smap
   1867 			 * refcnt and hat_pageunload needs the
   1868 			 * p_selock exclusive.
   1869 			 */
   1870 			kpme = GET_KPME(smp);
   1871 			if (kpme->kpe_page == pp) {
   1872 				baseaddr = hat_kpm_page2va(pp, 0);
   1873 			} else if (kpme->kpe_page == NULL) {
   1874 				baseaddr = hat_kpm_mapin(pp, kpme);
   1875 			} else {
   1876 				panic("segmap_getmapflt: stale "
   1877 				    "kpme page, kpme %p", (void *)kpme);
   1878 				/*NOTREACHED*/
   1879 			}
   1880 
   1881 			/*
   1882 			 * We don't invoke segmap_fault via TLB miss,
   1883 			 * so we set ref and mod bits in advance.
   1884 			 * For S_OTHER we set them in segmap_fault
   1885 			 * F_SOFTUNLOCK.
   1886 			 */
   1887 			if (rw == S_READ && !hat_isref(pp))
   1888 				hat_setref(pp);
   1889 
   1890 			return (baseaddr);
   1891 		default:
   1892 			break;
   1893 		}
   1894 	}
   1895 
   1896 	base = segkpm_create_va(baseoff);
   1897 	error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
   1898 	    seg, base, rw, CRED(), NULL);
   1899 
   1900 	pp = pl[0];
   1901 	if (error || pp == NULL) {
   1902 		/*
   1903 		 * Use segmap address slot and let segmap_fault deal
   1904 		 * with the error cases. There is no error return
   1905 		 * possible here.
   1906 		 */
   1907 		goto use_segmap_range;
   1908 	}
   1909 
   1910 	ASSERT(pl[1] == NULL);
   1911 
   1912 	/*
   1913 	 * When prot is not returned w/ PROT_ALL the returned pages
   1914 	 * are not backed by fs blocks. For most of the segmap users
   1915 	 * this is no problem, they don't write to the pages in the
   1916 	 * same request and therefore don't rely on a following
   1917 	 * trap driven segmap_fault. With SM_LOCKPROTO users it
   1918 	 * is more secure to use segkmap addresses to allow
   1919 	 * protection segmap_fault's.
   1920 	 */
   1921 	if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
   1922 		/*
   1923 		 * Use segmap address slot and let segmap_fault
   1924 		 * do the error return.
   1925 		 */
   1926 		ASSERT(rw != S_WRITE);
   1927 		ASSERT(PAGE_LOCKED(pp));
   1928 		page_unlock(pp);
        		/* with forcefault cleared, no prefault is done below */
   1929 		forcefault = 0;
   1930 		goto use_segmap_range;
   1931 	}
   1932 
   1933 	/*
   1934 	 * We have the p_selock as reader, grab_smp can't hit us, we
   1935 	 * have bumped the smap refcnt and hat_pageunload needs the
   1936 	 * p_selock exclusive.
   1937 	 */
   1938 	kpme = GET_KPME(smp);
   1939 	if (kpme->kpe_page == pp) {
   1940 		baseaddr = hat_kpm_page2va(pp, 0);
   1941 	} else if (kpme->kpe_page == NULL) {
   1942 		baseaddr = hat_kpm_mapin(pp, kpme);
   1943 	} else {
   1944 		panic("segmap_getmapflt: stale kpme page after "
   1945 		    "VOP_GETPAGE, kpme %p", (void *)kpme);
   1946 		/*NOTREACHED*/
   1947 	}
   1948 
   1949 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
   1950 
   1951 	return (baseaddr);
   1952 
   1953 
   1954 use_segmap_range:
        	/* the slot address follows from the smap's index in smd_smap[] */
   1955 	baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
   1956 	TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
   1957 	    "segmap_getmap:seg %p addr %p vp %p offset %llx",
   1958 	    seg, baseaddr, vp, baseoff);
   1959 
   1960 	/*
   1961 	 * Prefault the translations
   1962 	 */
   1963 	vaddr = baseaddr + (off - baseoff);
   1964 	if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
   1965 
   1966 		caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
   1967 		    (uintptr_t)PAGEMASK);
   1968 
        		/*
        		 * Fault in whole pages: from the page containing vaddr up
        		 * to the end of the page containing vaddr + len - 1.
        		 */
   1969 		(void) segmap_fault(kas.a_hat, seg, pgaddr,
   1970 		    (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
   1971 		    F_INVAL, rw);
   1972 	}
   1973 
   1974 	return (baseaddr);
   1975 }
   1976 
   1977 int
   1978 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
   1979 {
   1980 	struct smap	*smp;
   1981 	int 		error;
   1982 	int		bflags = 0;
   1983 	struct vnode	*vp;
   1984 	u_offset_t	offset;
   1985 	kmutex_t	*smtx;
   1986 	int		is_kpm = 0;
   1987 	page_t		*pp;
   1988 
   1989 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
   1990 
   1991 		if (((uintptr_t)addr & MAXBOFFSET) != 0) {
   1992 			panic("segmap_release: addr %p not "
   1993 			    "MAXBSIZE aligned", (void *)addr);
   1994 			/*NOTREACHED*/
   1995 		}
   1996 
   1997 		if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
   1998 			panic("segmap_release: smap not found "
   1999 			    "for addr %p", (void *)addr);
   2000 			/*NOTREACHED*/
   2001 		}
   2002 
   2003 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
   2004 		    "segmap_relmap:seg %p addr %p smp %p",
   2005 		    seg, addr, smp);
   2006 
   2007 		smtx = SMAPMTX(smp);
   2008 
   2009 		/*
   2010 		 * For compatibility reasons segmap_pagecreate_kpm sets this
   2011 		 * flag to allow a following segmap_pagecreate to return
   2012 		 * this as "newpage" flag. When segmap_pagecreate is not
   2013 		 * called at all we clear it now.
   2014 		 */
   2015 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
   2016 		is_kpm = 1;
   2017 		if (smp->sm_flags & SM_WRITE_DATA) {
   2018 			hat_setrefmod(pp);
   2019 		} else if (smp->sm_flags & SM_READ_DATA) {
   2020 			hat_setref(pp);
   2021 		}
   2022 	} else {
   2023 		if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
   2024 		    ((uintptr_t)addr & MAXBOFFSET) != 0) {
   2025 			panic("segmap_release: bad addr %p", (void *)addr);
   2026 			/*NOTREACHED*/
   2027 		}
   2028 		smp = GET_SMAP(seg, addr);
   2029 
   2030 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
   2031 		    "segmap_relmap:seg %p addr %p smp %p",
   2032 		    seg, addr, smp);
   2033 
   2034 		smtx = SMAPMTX(smp);
   2035 		mutex_enter(smtx);
   2036 		smp->sm_flags |= SM_NOTKPM_RELEASED;
   2037 	}
   2038 
   2039 	ASSERT(smp->sm_refcnt > 0);
   2040 
   2041 	/*
   2042 	 * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
   2043 	 * are set.
   2044 	 */
   2045 	if ((flags & ~SM_DONTNEED) != 0) {
   2046 		if (flags & SM_WRITE)
   2047 			segmapcnt.smp_rel_write.value.ul++;
   2048 		if (flags & SM_ASYNC) {
   2049 			bflags |= B_ASYNC;
   2050 			segmapcnt.smp_rel_async.value.ul++;
   2051 		}
   2052 		if (flags & SM_INVAL) {
   2053 			bflags |= B_INVAL;
   2054 			segmapcnt.smp_rel_abort.value.ul++;
   2055 		}
   2056 		if (flags & SM_DESTROY) {
   2057 			bflags |= (B_INVAL|B_TRUNC);
   2058 			segmapcnt.smp_rel_abort.value.ul++;
   2059 		}
   2060 		if (smp->sm_refcnt == 1) {
   2061 			/*
   2062 			 * We only bother doing the FREE and DONTNEED flags
   2063 			 * if no one else is still referencing this mapping.
   2064 			 */
   2065 			if (flags & SM_FREE) {
   2066 				bflags |= B_FREE;
   2067 				segmapcnt.smp_rel_free.value.ul++;
   2068 			}
   2069 			if (flags & SM_DONTNEED) {
   2070 				bflags |= B_DONTNEED;
   2071 				segmapcnt.smp_rel_dontneed.value.ul++;
   2072 			}
   2073 		}
   2074 	} else {
   2075 		smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
   2076 	}
   2077 
   2078 	vp = smp->sm_vp;
   2079 	offset = smp->sm_off;
   2080 
   2081 	if (--smp->sm_refcnt == 0) {
   2082 
   2083 		smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
   2084 
   2085 		if (flags & (SM_INVAL|SM_DESTROY)) {
   2086 			segmap_hashout(smp);	/* remove map info */
   2087 			if (is_kpm) {
   2088 				hat_kpm_mapout(pp, GET_KPME(smp), addr);
   2089 				if (smp->sm_flags & SM_NOTKPM_RELEASED) {
   2090 					smp->sm_flags &= ~SM_NOTKPM_RELEASED;
   2091 					hat_unload(kas.a_hat, segkmap->s_base +
   2092 					    ((smp - smd_smap) * MAXBSIZE),
   2093 					    MAXBSIZE, HAT_UNLOAD);
   2094 				}
   2095 
   2096 			} else {
   2097 				if (segmap_kpm)
   2098 					segkpm_mapout_validkpme(GET_KPME(smp));
   2099 
   2100 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
   2101 				hat_unload(kas.a_hat, addr, MAXBSIZE,
   2102 				    HAT_UNLOAD);
   2103 			}
   2104 		}
   2105 		segmap_smapadd(smp);	/* add to free list */
   2106 	}
   2107 
   2108 	mutex_exit(smtx);
   2109 
   2110 	if (is_kpm)
   2111 		page_unlock(pp);
   2112 	/*
   2113 	 * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
   2114 	 * are set.
   2115 	 */
   2116 	if ((flags & ~SM_DONTNEED) != 0) {
   2117 		error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
   2118 		    bflags, CRED(), NULL);
   2119 	} else {
   2120 		error = 0;
   2121 	}
   2122 
   2123 	return (error);
   2124 }
   2125 
   2126 /*
   2127  * Dump the pages belonging to this segmap segment.
   2128  */
   2129 static void
   2130 segmap_dump(struct seg *seg)
   2131 {
   2132 	struct segmap_data *smd;
   2133 	struct smap *smp, *smp_end;
   2134 	page_t *pp;
   2135 	pfn_t pfn;
   2136 	u_offset_t off;
   2137 	caddr_t addr;
   2138 
   2139 	smd = (struct segmap_data *)seg->s_data;
   2140 	addr = seg->s_base;
   2141 	for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
   2142 	    smp < smp_end; smp++) {
   2143 
   2144 		if (smp->sm_refcnt) {
   2145 			for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
   2146 				int we_own_it = 0;
   2147 
   2148 				/*
   2149 				 * If pp == NULL, the page either does
   2150 				 * not exist or is exclusively locked.
   2151 				 * So determine if it exists before
   2152 				 * searching for it.
   2153 				 */
   2154 				if ((pp = page_lookup_nowait(smp->sm_vp,
   2155 				    smp->sm_off + off, SE_SHARED)))
   2156 					we_own_it = 1;
   2157 				else
   2158 					pp = page_exists(smp->sm_vp,
   2159 					    smp->sm_off + off);
   2160 
   2161 				if (pp) {
   2162 					pfn = page_pptonum(pp);
   2163 					dump_addpage(seg->s_as,
   2164 					    addr + off, pfn);
   2165 					if (we_own_it)
   2166 						page_unlock(pp);
   2167 				}
   2168 				dump_timeleft = dump_timeout;
   2169 			}
   2170 		}
   2171 		addr += MAXBSIZE;
   2172 	}
   2173 }
   2174 
   2175 /*ARGSUSED*/
   2176 static int
   2177 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
   2178     struct page ***ppp, enum lock_type type, enum seg_rw rw)
   2179 {
   2180 	return (ENOTSUP);
   2181 }
   2182 
   2183 static int
   2184 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
   2185 {
   2186 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
   2187 
   2188 	memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
   2189 	memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
   2190 	return (0);
   2191 }
   2192 
   2193 /*ARGSUSED*/
   2194 static lgrp_mem_policy_info_t *
   2195 segmap_getpolicy(struct seg *seg, caddr_t addr)
   2196 {
   2197 	return (NULL);
   2198 }
   2199 
   2200 /*ARGSUSED*/
   2201 static int
   2202 segmap_capable(struct seg *seg, segcapability_t capability)
   2203 {
   2204 	return (0);
   2205 }
   2206 
   2207 
   2208 #ifdef	SEGKPM_SUPPORT
   2209 
   2210 /*
   2211  * segkpm support routines
   2212  */
   2213 
   2214 static caddr_t
   2215 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
   2216 	struct smap *smp, enum seg_rw rw)
   2217 {
   2218 	caddr_t	base;
   2219 	page_t	*pp;
   2220 	int	newpage = 0;
   2221 	struct kpme	*kpme;
   2222 
   2223 	ASSERT(smp->sm_refcnt > 0);
   2224 
   2225 	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
   2226 		kmutex_t *smtx;
   2227 
   2228 		base = segkpm_create_va(off);
   2229 
   2230 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
   2231 		    seg, base)) == NULL) {
   2232 			panic("segmap_pagecreate_kpm: "
   2233 			    "page_create failed");
   2234 			/*NOTREACHED*/
   2235 		}
   2236 
   2237 		newpage = 1;
   2238 		page_io_unlock(pp);
   2239 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
   2240 
   2241 		/*
   2242 		 * Mark this here until the following segmap_pagecreate
   2243 		 * or segmap_release.
   2244 		 */
   2245 		smtx = SMAPMTX(smp);
   2246 		mutex_enter(smtx);
   2247 		smp->sm_flags |= SM_KPM_NEWPAGE;
   2248 		mutex_exit(smtx);
   2249 	}
   2250 
   2251 	kpme = GET_KPME(smp);
   2252 	if (!newpage && kpme->kpe_page == pp)
   2253 		base = hat_kpm_page2va(pp, 0);
   2254 	else
   2255 		base = hat_kpm_mapin(pp, kpme);
   2256 
   2257 	/*
   2258 	 * FS code may decide not to call segmap_pagecreate and we
   2259 	 * don't invoke segmap_fault via TLB miss, so we have to set
   2260 	 * ref and mod bits in advance.
   2261 	 */
   2262 	if (rw == S_WRITE) {
   2263 		hat_setrefmod(pp);
   2264 	} else {
   2265 		ASSERT(rw == S_READ);
   2266 		hat_setref(pp);
   2267 	}
   2268 
   2269 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
   2270 
   2271 	return (base);
   2272 }
   2273 
   2274 /*
   2275  * Find the smap structure corresponding to the
   2276  * KPM addr and return it locked.
   2277  */
   2278 struct smap *
   2279 get_smap_kpm(caddr_t addr, page_t **ppp)
   2280 {
   2281 	struct smap	*smp;
   2282 	struct vnode	*vp;
   2283 	u_offset_t	offset;
   2284 	caddr_t		baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
   2285 	int		hashid;
   2286 	kmutex_t	*hashmtx;
   2287 	page_t		*pp;
   2288 	union segmap_cpu *scpu;
   2289 
   2290 	pp = hat_kpm_vaddr2page(baseaddr);
   2291 
   2292 	ASSERT(pp && !PP_ISFREE(pp));
   2293 	ASSERT(PAGE_LOCKED(pp));
   2294 	ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
   2295 
   2296 	vp = pp->p_vnode;
   2297 	offset = pp->p_offset;
   2298 	ASSERT(vp != NULL);
   2299 
   2300 	/*
   2301 	 * Assume the last smap used on this cpu is the one needed.
   2302 	 */
   2303 	scpu = smd_cpu+CPU->cpu_seqid;
   2304 	smp = scpu->scpu.scpu_last_smap;
   2305 	mutex_enter(&smp->sm_mtx);
   2306 	if (smp->sm_vp == vp && smp->sm_off == offset) {
   2307 		ASSERT(smp->sm_refcnt > 0);
   2308 	} else {
   2309 		/*
   2310 		 * Assumption wrong, find the smap on the hash chain.
   2311 		 */
   2312 		mutex_exit(&smp->sm_mtx);
   2313 		SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
   2314 		hashmtx = SHASHMTX(hashid);
   2315 
   2316 		mutex_enter(hashmtx);
   2317 		smp = smd_hash[hashid].sh_hash_list;
   2318 		for (; smp != NULL; smp = smp->sm_hash) {
   2319 			if (smp->sm_vp == vp && smp->sm_off == offset)
   2320 				break;
   2321 		}
   2322 		mutex_exit(hashmtx);
   2323 		if (smp) {
   2324 			mutex_enter(&smp->sm_mtx);
   2325 			ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
   2326 		}
   2327 	}
   2328 
   2329 	if (ppp)
   2330 		*ppp = smp ? pp : NULL;
   2331 
   2332 	return (smp);
   2333 }
   2334 
   2335 #else	/* SEGKPM_SUPPORT */
   2336 
   2337 /* segkpm stubs */
   2338 
   2339 /*ARGSUSED*/
   2340 static caddr_t
   2341 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
   2342 	struct smap *smp, enum seg_rw rw)
   2343 {
   2344 	return (NULL);
   2345 }
   2346 
   2347 /*ARGSUSED*/
   2348 struct smap *
   2349 get_smap_kpm(caddr_t addr, page_t **ppp)
   2350 {
   2351 	return (NULL);
   2352 }
   2353 
   2354 #endif	/* SEGKPM_SUPPORT */
   2355