Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
     27 /*	All Rights Reserved   */
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
/*
 * segkp is a segment driver that administers the allocation and deallocation
 * of pageable, variable-size chunks of kernel virtual address space. Each
 * allocated resource is page-aligned.
 *
 * The user may specify whether the resource should be initialized to 0,
 * include a redzone, or be locked in memory.
 */
     42 
     43 #include <sys/types.h>
     44 #include <sys/t_lock.h>
     45 #include <sys/thread.h>
     46 #include <sys/param.h>
     47 #include <sys/errno.h>
     48 #include <sys/sysmacros.h>
     49 #include <sys/systm.h>
     50 #include <sys/buf.h>
     51 #include <sys/mman.h>
     52 #include <sys/vnode.h>
     53 #include <sys/cmn_err.h>
     54 #include <sys/swap.h>
     55 #include <sys/tuneable.h>
     56 #include <sys/kmem.h>
     57 #include <sys/vmem.h>
     58 #include <sys/cred.h>
     59 #include <sys/dumphdr.h>
     60 #include <sys/debug.h>
     61 #include <sys/vtrace.h>
     62 #include <sys/stack.h>
     63 #include <sys/atomic.h>
     64 #include <sys/archsystm.h>
     65 #include <sys/lgrp.h>
     66 
     67 #include <vm/as.h>
     68 #include <vm/seg.h>
     69 #include <vm/seg_kp.h>
     70 #include <vm/seg_kmem.h>
     71 #include <vm/anon.h>
     72 #include <vm/page.h>
     73 #include <vm/hat.h>
     74 #include <sys/bitmap.h>
     75 
     76 /*
     77  * Private seg op routines
     78  */
     79 static void	segkp_badop(void);
     80 static void	segkp_dump(struct seg *seg);
     81 static int	segkp_checkprot(struct seg *seg, caddr_t addr, size_t len,
     82 			uint_t prot);
     83 static int	segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
     84 static int	segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
     85 			struct page ***page, enum lock_type type,
     86 			enum seg_rw rw);
     87 static void	segkp_insert(struct seg *seg, struct segkp_data *kpd);
     88 static void	segkp_delete(struct seg *seg, struct segkp_data *kpd);
     89 static caddr_t	segkp_get_internal(struct seg *seg, size_t len, uint_t flags,
     90 			struct segkp_data **tkpd, struct anon_map *amp);
     91 static void	segkp_release_internal(struct seg *seg,
     92 			struct segkp_data *kpd, size_t len);
     93 static int	segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr,
     94 			size_t len, struct segkp_data *kpd, uint_t flags);
     95 static int	segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr,
     96 			size_t len, struct segkp_data *kpd, uint_t flags);
     97 static struct	segkp_data *segkp_find(struct seg *seg, caddr_t vaddr);
     98 static int	segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
     99 static lgrp_mem_policy_info_t	*segkp_getpolicy(struct seg *seg,
    100     caddr_t addr);
    101 static int	segkp_capable(struct seg *seg, segcapability_t capability);
    102 
    103 /*
    104  * Lock used to protect the hash table(s) and caches.
    105  */
    106 static kmutex_t	segkp_lock;
    107 
    108 /*
    109  * The segkp caches
    110  */
    111 static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE];
    112 
    113 #define	SEGKP_BADOP(t)	(t(*)())segkp_badop
    114 
    115 /*
    116  * When there are fewer than red_minavail bytes left on the stack,
    117  * segkp_map_red() will map in the redzone (if called).  5000 seems
    118  * to work reasonably well...
    119  */
    120 long		red_minavail = 5000;
    121 
    122 /*
    123  * will be set to 1 for 32 bit x86 systems only, in startup.c
    124  */
    125 int	segkp_fromheap = 0;
    126 ulong_t *segkp_bitmap;
    127 
    128 /*
    129  * If segkp_map_red() is called with the redzone already mapped and
    130  * with less than RED_DEEP_THRESHOLD bytes available on the stack,
    131  * then the stack situation has become quite serious;  if much more stack
    132  * is consumed, we have the potential of scrogging the next thread/LWP
    133  * structure.  To help debug the "can't happen" panics which may
    134  * result from this condition, we record hrestime and the calling thread
    135  * in red_deep_hires and red_deep_thread respectively.
    136  */
    137 #define	RED_DEEP_THRESHOLD	2000
    138 
    139 hrtime_t	red_deep_hires;
    140 kthread_t	*red_deep_thread;
    141 
    142 uint32_t	red_nmapped;
    143 uint32_t	red_closest = UINT_MAX;
    144 uint32_t	red_ndoubles;
    145 
    146 pgcnt_t anon_segkp_pages_locked;	/* See vm/anon.h */
    147 pgcnt_t anon_segkp_pages_resv;		/* anon reserved by seg_kp */
    148 
    149 static struct	seg_ops segkp_ops = {
    150 	SEGKP_BADOP(int),		/* dup */
    151 	SEGKP_BADOP(int),		/* unmap */
    152 	SEGKP_BADOP(void),		/* free */
    153 	segkp_fault,
    154 	SEGKP_BADOP(faultcode_t),	/* faulta */
    155 	SEGKP_BADOP(int),		/* setprot */
    156 	segkp_checkprot,
    157 	segkp_kluster,
    158 	SEGKP_BADOP(size_t),		/* swapout */
    159 	SEGKP_BADOP(int),		/* sync */
    160 	SEGKP_BADOP(size_t),		/* incore */
    161 	SEGKP_BADOP(int),		/* lockop */
    162 	SEGKP_BADOP(int),		/* getprot */
    163 	SEGKP_BADOP(u_offset_t),		/* getoffset */
    164 	SEGKP_BADOP(int),		/* gettype */
    165 	SEGKP_BADOP(int),		/* getvp */
    166 	SEGKP_BADOP(int),		/* advise */
    167 	segkp_dump,			/* dump */
    168 	segkp_pagelock,			/* pagelock */
    169 	SEGKP_BADOP(int),		/* setpgsz */
    170 	segkp_getmemid,			/* getmemid */
    171 	segkp_getpolicy,		/* getpolicy */
    172 	segkp_capable,			/* capable */
    173 };
    174 
    175 
    176 static void
    177 segkp_badop(void)
    178 {
    179 	panic("segkp_badop");
    180 	/*NOTREACHED*/
    181 }
    182 
    183 static void segkpinit_mem_config(struct seg *);
    184 
    185 static uint32_t segkp_indel;
    186 
    187 /*
    188  * Allocate the segment specific private data struct and fill it in
    189  * with the per kp segment mutex, anon ptr. array and hash table.
    190  */
    191 int
    192 segkp_create(struct seg *seg)
    193 {
    194 	struct segkp_segdata *kpsd;
    195 	size_t	np;
    196 
    197 	ASSERT(seg != NULL && seg->s_as == &kas);
    198 	ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock));
    199 
    200 	if (seg->s_size & PAGEOFFSET) {
    201 		panic("Bad segkp size");
    202 		/*NOTREACHED*/
    203 	}
    204 
    205 	kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP);
    206 
    207 	/*
    208 	 * Allocate the virtual memory for segkp and initialize it
    209 	 */
    210 	if (segkp_fromheap) {
    211 		np = btop(kvseg.s_size);
    212 		segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP);
    213 		kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE,
    214 		    vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP);
    215 	} else {
    216 		segkp_bitmap = NULL;
    217 		np = btop(seg->s_size);
    218 		kpsd->kpsd_arena = vmem_create("segkp", seg->s_base,
    219 		    seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE,
    220 		    VM_SLEEP);
    221 	}
    222 
    223 	kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE);
    224 
    225 	kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *),
    226 	    KM_SLEEP);
    227 	seg->s_data = (void *)kpsd;
    228 	seg->s_ops = &segkp_ops;
    229 	segkpinit_mem_config(seg);
    230 	return (0);
    231 }
    232 
    233 
    234 /*
    235  * Find a free 'freelist' and initialize it with the appropriate attributes
    236  */
    237 void *
    238 segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags)
    239 {
    240 	int i;
    241 
    242 	if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED))
    243 		return ((void *)-1);
    244 
    245 	mutex_enter(&segkp_lock);
    246 	for (i = 0; i < SEGKP_MAX_CACHE; i++) {
    247 		if (segkp_cache[i].kpf_inuse)
    248 			continue;
    249 		segkp_cache[i].kpf_inuse = 1;
    250 		segkp_cache[i].kpf_max = maxsize;
    251 		segkp_cache[i].kpf_flags = flags;
    252 		segkp_cache[i].kpf_seg = seg;
    253 		segkp_cache[i].kpf_len = len;
    254 		mutex_exit(&segkp_lock);
    255 		return ((void *)(uintptr_t)i);
    256 	}
    257 	mutex_exit(&segkp_lock);
    258 	return ((void *)-1);
    259 }
    260 
    261 /*
    262  * Free all the cache resources.
    263  */
    264 void
    265 segkp_cache_free(void)
    266 {
    267 	struct segkp_data *kpd;
    268 	struct seg *seg;
    269 	int i;
    270 
    271 	mutex_enter(&segkp_lock);
    272 	for (i = 0; i < SEGKP_MAX_CACHE; i++) {
    273 		if (!segkp_cache[i].kpf_inuse)
    274 			continue;
    275 		/*
    276 		 * Disconnect the freelist and process each element
    277 		 */
    278 		kpd = segkp_cache[i].kpf_list;
    279 		seg = segkp_cache[i].kpf_seg;
    280 		segkp_cache[i].kpf_list = NULL;
    281 		segkp_cache[i].kpf_count = 0;
    282 		mutex_exit(&segkp_lock);
    283 
    284 		while (kpd != NULL) {
    285 			struct segkp_data *next;
    286 
    287 			next = kpd->kp_next;
    288 			segkp_release_internal(seg, kpd, kpd->kp_len);
    289 			kpd = next;
    290 		}
    291 		mutex_enter(&segkp_lock);
    292 	}
    293 	mutex_exit(&segkp_lock);
    294 }
    295 
    296 /*
    297  * There are 2 entries into segkp_get_internal. The first includes a cookie
    298  * used to access a pool of cached segkp resources. The second does not
    299  * use the cache.
    300  */
    301 caddr_t
    302 segkp_get(struct seg *seg, size_t len, uint_t flags)
    303 {
    304 	struct segkp_data *kpd = NULL;
    305 
    306 	if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
    307 		kpd->kp_cookie = -1;
    308 		return (stom(kpd->kp_base, flags));
    309 	}
    310 	return (NULL);
    311 }
    312 
    313 /*
    314  * Return a 'cached' segkp address
    315  */
    316 caddr_t
    317 segkp_cache_get(void *cookie)
    318 {
    319 	struct segkp_cache *freelist = NULL;
    320 	struct segkp_data *kpd = NULL;
    321 	int index = (int)(uintptr_t)cookie;
    322 	struct seg *seg;
    323 	size_t len;
    324 	uint_t flags;
    325 
    326 	if (index < 0 || index >= SEGKP_MAX_CACHE)
    327 		return (NULL);
    328 	freelist = &segkp_cache[index];
    329 
    330 	mutex_enter(&segkp_lock);
    331 	seg = freelist->kpf_seg;
    332 	flags = freelist->kpf_flags;
    333 	if (freelist->kpf_list != NULL) {
    334 		kpd = freelist->kpf_list;
    335 		freelist->kpf_list = kpd->kp_next;
    336 		freelist->kpf_count--;
    337 		mutex_exit(&segkp_lock);
    338 		kpd->kp_next = NULL;
    339 		segkp_insert(seg, kpd);
    340 		return (stom(kpd->kp_base, flags));
    341 	}
    342 	len = freelist->kpf_len;
    343 	mutex_exit(&segkp_lock);
    344 	if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
    345 		kpd->kp_cookie = index;
    346 		return (stom(kpd->kp_base, flags));
    347 	}
    348 	return (NULL);
    349 }
    350 
    351 caddr_t
    352 segkp_get_withanonmap(
    353 	struct seg *seg,
    354 	size_t len,
    355 	uint_t flags,
    356 	struct anon_map *amp)
    357 {
    358 	struct segkp_data *kpd = NULL;
    359 
    360 	ASSERT(amp != NULL);
    361 	flags |= KPD_HASAMP;
    362 	if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) {
    363 		kpd->kp_cookie = -1;
    364 		return (stom(kpd->kp_base, flags));
    365 	}
    366 	return (NULL);
    367 }
    368 
    369 /*
    370  * This does the real work of segkp allocation.
    371  * Return to client base addr. len must be page-aligned. A null value is
    372  * returned if there are no more vm resources (e.g. pages, swap). The len
    373  * and base recorded in the private data structure include the redzone
    374  * and the redzone length (if applicable). If the user requests a redzone
    375  * either the first or last page is left unmapped depending whether stacks
    376  * grow to low or high memory.
    377  *
    378  * The client may also specify a no-wait flag. If that is set then the
    379  * request will choose a non-blocking path when requesting resources.
    380  * The default is make the client wait.
    381  */
    382 static caddr_t
    383 segkp_get_internal(
    384 	struct seg *seg,
    385 	size_t len,
    386 	uint_t flags,
    387 	struct segkp_data **tkpd,
    388 	struct anon_map *amp)
    389 {
    390 	struct segkp_segdata	*kpsd = (struct segkp_segdata *)seg->s_data;
    391 	struct segkp_data	*kpd;
    392 	caddr_t vbase = NULL;	/* always first virtual, may not be mapped */
    393 	pgcnt_t np = 0;		/* number of pages in the resource */
    394 	pgcnt_t segkpindex;
    395 	long i;
    396 	caddr_t va;
    397 	pgcnt_t pages = 0;
    398 	ulong_t anon_idx = 0;
    399 	int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
    400 	caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base;
    401 
    402 	if (len & PAGEOFFSET) {
    403 		panic("segkp_get: len is not page-aligned");
    404 		/*NOTREACHED*/
    405 	}
    406 
    407 	ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL));
    408 
    409 	/* Only allow KPD_NO_ANON if we are going to lock it down */
    410 	if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON)
    411 		return (NULL);
    412 
    413 	if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL)
    414 		return (NULL);
    415 	/*
    416 	 * Fix up the len to reflect the REDZONE if applicable
    417 	 */
    418 	if (flags & KPD_HASREDZONE)
    419 		len += PAGESIZE;
    420 	np = btop(len);
    421 
    422 	vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT);
    423 	if (vbase == NULL) {
    424 		kmem_free(kpd, sizeof (struct segkp_data));
    425 		return (NULL);
    426 	}
    427 
    428 	/* If locking, reserve physical memory */
    429 	if (flags & KPD_LOCKED) {
    430 		pages = btop(SEGKP_MAPLEN(len, flags));
    431 		if (page_resv(pages, kmflag) == 0) {
    432 			vmem_free(SEGKP_VMEM(seg), vbase, len);
    433 			kmem_free(kpd, sizeof (struct segkp_data));
    434 			return (NULL);
    435 		}
    436 		if ((flags & KPD_NO_ANON) == 0)
    437 			atomic_add_long(&anon_segkp_pages_locked, pages);
    438 	}
    439 
    440 	/*
    441 	 * Reserve sufficient swap space for this vm resource.  We'll
    442 	 * actually allocate it in the loop below, but reserving it
    443 	 * here allows us to back out more gracefully than if we
    444 	 * had an allocation failure in the body of the loop.
    445 	 *
    446 	 * Note that we don't need swap space for the red zone page.
    447 	 */
    448 	if (amp != NULL) {
    449 		/*
    450 		 * The swap reservation has been done, if required, and the
    451 		 * anon_hdr is separate.
    452 		 */
    453 		anon_idx = 0;
    454 		kpd->kp_anon_idx = anon_idx;
    455 		kpd->kp_anon = amp->ahp;
    456 
    457 		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
    458 		    kpd, vbase, len, flags, 1);
    459 
    460 	} else if ((flags & KPD_NO_ANON) == 0) {
    461 		if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
    462 			if (flags & KPD_LOCKED) {
    463 				atomic_add_long(&anon_segkp_pages_locked,
    464 				    -pages);
    465 				page_unresv(pages);
    466 			}
    467 			vmem_free(SEGKP_VMEM(seg), vbase, len);
    468 			kmem_free(kpd, sizeof (struct segkp_data));
    469 			return (NULL);
    470 		}
    471 		atomic_add_long(&anon_segkp_pages_resv,
    472 		    btop(SEGKP_MAPLEN(len, flags)));
    473 		anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
    474 		kpd->kp_anon_idx = anon_idx;
    475 		kpd->kp_anon = kpsd->kpsd_anon;
    476 
    477 		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
    478 		    kpd, vbase, len, flags, 1);
    479 	} else {
    480 		kpd->kp_anon = NULL;
    481 		kpd->kp_anon_idx = 0;
    482 	}
    483 
    484 	/*
    485 	 * Allocate page and anon resources for the virtual address range
    486 	 * except the redzone
    487 	 */
    488 	if (segkp_fromheap)
    489 		segkpindex = btop((uintptr_t)(vbase - kvseg.s_base));
    490 	for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) {
    491 		page_t		*pl[2];
    492 		struct vnode	*vp;
    493 		anoff_t		off;
    494 		int		err;
    495 		page_t		*pp = NULL;
    496 
    497 		/*
    498 		 * Mark this page to be a segkp page in the bitmap.
    499 		 */
    500 		if (segkp_fromheap) {
    501 			BT_ATOMIC_SET(segkp_bitmap, segkpindex);
    502 			segkpindex++;
    503 		}
    504 
    505 		/*
    506 		 * If this page is the red zone page, we don't need swap
    507 		 * space for it.  Note that we skip over the code that
    508 		 * establishes MMU mappings, so that the page remains
    509 		 * invalid.
    510 		 */
    511 		if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i)
    512 			continue;
    513 
    514 		if (kpd->kp_anon != NULL) {
    515 			struct anon *ap;
    516 
    517 			ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i)
    518 			    == NULL);
    519 			/*
    520 			 * Determine the "vp" and "off" of the anon slot.
    521 			 */
    522 			ap = anon_alloc(NULL, 0);
    523 			if (amp != NULL)
    524 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    525 			(void) anon_set_ptr(kpd->kp_anon, anon_idx + i,
    526 			    ap, ANON_SLEEP);
    527 			if (amp != NULL)
    528 				ANON_LOCK_EXIT(&amp->a_rwlock);
    529 			swap_xlate(ap, &vp, &off);
    530 
    531 			/*
    532 			 * Create a page with the specified identity.  The
    533 			 * page is returned with the "shared" lock held.
    534 			 */
    535 			err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
    536 			    NULL, pl, PAGESIZE, seg, va, S_CREATE,
    537 			    kcred, NULL);
    538 			if (err) {
    539 				/*
    540 				 * XXX - This should not fail.
    541 				 */
    542 				panic("segkp_get: no pages");
    543 				/*NOTREACHED*/
    544 			}
    545 			pp = pl[0];
    546 		} else {
    547 			ASSERT(page_exists(&kvp,
    548 			    (u_offset_t)(uintptr_t)va) == NULL);
    549 
    550 			if ((pp = page_create_va(&kvp,
    551 			    (u_offset_t)(uintptr_t)va, PAGESIZE,
    552 			    (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL |
    553 			    PG_NORELOC, seg, va)) == NULL) {
    554 				/*
    555 				 * Legitimize resource; then destroy it.
    556 				 * Easier than trying to unwind here.
    557 				 */
    558 				kpd->kp_flags = flags;
    559 				kpd->kp_base = vbase;
    560 				kpd->kp_len = len;
    561 				segkp_release_internal(seg, kpd, va - vbase);
    562 				return (NULL);
    563 			}
    564 			page_io_unlock(pp);
    565 		}
    566 
    567 		if (flags & KPD_ZERO)
    568 			pagezero(pp, 0, PAGESIZE);
    569 
    570 		/*
    571 		 * Load and lock an MMU translation for the page.
    572 		 */
    573 		hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE),
    574 		    ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD));
    575 
    576 		/*
    577 		 * Now, release lock on the page.
    578 		 */
    579 		if (flags & KPD_LOCKED)
    580 			page_downgrade(pp);
    581 		else
    582 			page_unlock(pp);
    583 	}
    584 
    585 	kpd->kp_flags = flags;
    586 	kpd->kp_base = vbase;
    587 	kpd->kp_len = len;
    588 	segkp_insert(seg, kpd);
    589 	*tkpd = kpd;
    590 	return (stom(kpd->kp_base, flags));
    591 }
    592 
    593 /*
    594  * Release the resource to cache if the pool(designate by the cookie)
    595  * has less than the maximum allowable. If inserted in cache,
    596  * segkp_delete insures element is taken off of active list.
    597  */
    598 void
    599 segkp_release(struct seg *seg, caddr_t vaddr)
    600 {
    601 	struct segkp_cache *freelist;
    602 	struct segkp_data *kpd = NULL;
    603 
    604 	if ((kpd = segkp_find(seg, vaddr)) == NULL) {
    605 		panic("segkp_release: null kpd");
    606 		/*NOTREACHED*/
    607 	}
    608 
    609 	if (kpd->kp_cookie != -1) {
    610 		freelist = &segkp_cache[kpd->kp_cookie];
    611 		mutex_enter(&segkp_lock);
    612 		if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) {
    613 			segkp_delete(seg, kpd);
    614 			kpd->kp_next = freelist->kpf_list;
    615 			freelist->kpf_list = kpd;
    616 			freelist->kpf_count++;
    617 			mutex_exit(&segkp_lock);
    618 			return;
    619 		} else {
    620 			mutex_exit(&segkp_lock);
    621 			kpd->kp_cookie = -1;
    622 		}
    623 	}
    624 	segkp_release_internal(seg, kpd, kpd->kp_len);
    625 }
    626 
    627 /*
    628  * Free the entire resource. segkp_unlock gets called with the start of the
    629  * mapped portion of the resource. The length is the size of the mapped
    630  * portion
    631  */
    632 static void
    633 segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
    634 {
    635 	caddr_t		va;
    636 	long		i;
    637 	long		redzone;
    638 	size_t		np;
    639 	page_t		*pp;
    640 	struct vnode 	*vp;
    641 	anoff_t		off;
    642 	struct anon	*ap;
    643 	pgcnt_t		segkpindex;
    644 
    645 	ASSERT(kpd != NULL);
    646 	ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1);
    647 	np = btop(len);
    648 
    649 	/* Remove from active hash list */
    650 	if (kpd->kp_cookie == -1) {
    651 		mutex_enter(&segkp_lock);
    652 		segkp_delete(seg, kpd);
    653 		mutex_exit(&segkp_lock);
    654 	}
    655 
    656 	/*
    657 	 * Precompute redzone page index.
    658 	 */
    659 	redzone = -1;
    660 	if (kpd->kp_flags & KPD_HASREDZONE)
    661 		redzone = KPD_REDZONE(kpd);
    662 
    663 
    664 	va = kpd->kp_base;
    665 
    666 	hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT),
    667 	    ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
    668 	/*
    669 	 * Free up those anon resources that are quiescent.
    670 	 */
    671 	if (segkp_fromheap)
    672 		segkpindex = btop((uintptr_t)(va - kvseg.s_base));
    673 	for (i = 0; i < np; i++, va += PAGESIZE) {
    674 
    675 		/*
    676 		 * Clear the bit for this page from the bitmap.
    677 		 */
    678 		if (segkp_fromheap) {
    679 			BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex);
    680 			segkpindex++;
    681 		}
    682 
    683 		if (i == redzone)
    684 			continue;
    685 		if (kpd->kp_anon) {
    686 			/*
    687 			 * Free up anon resources and destroy the
    688 			 * associated pages.
    689 			 *
    690 			 * Release the lock if there is one. Have to get the
    691 			 * page to do this, unfortunately.
    692 			 */
    693 			if (kpd->kp_flags & KPD_LOCKED) {
    694 				ap = anon_get_ptr(kpd->kp_anon,
    695 				    kpd->kp_anon_idx + i);
    696 				swap_xlate(ap, &vp, &off);
    697 				/* Find the shared-locked page. */
    698 				pp = page_find(vp, (u_offset_t)off);
    699 				if (pp == NULL) {
    700 					panic("segkp_release: "
    701 					    "kp_anon: no page to unlock ");
    702 					/*NOTREACHED*/
    703 				}
    704 				page_unlock(pp);
    705 			}
    706 			if ((kpd->kp_flags & KPD_HASAMP) == 0) {
    707 				anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
    708 				    PAGESIZE);
    709 				anon_unresv_zone(PAGESIZE, NULL);
    710 				atomic_add_long(&anon_segkp_pages_resv,
    711 				    -1);
    712 			}
    713 			TRACE_5(TR_FAC_VM,
    714 			    TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
    715 			    kpd, va, PAGESIZE, 0, 0);
    716 		} else {
    717 			if (kpd->kp_flags & KPD_LOCKED) {
    718 				pp = page_find(&kvp, (u_offset_t)(uintptr_t)va);
    719 				if (pp == NULL) {
    720 					panic("segkp_release: "
    721 					    "no page to unlock");
    722 					/*NOTREACHED*/
    723 				}
    724 				/*
    725 				 * We should just upgrade the lock here
    726 				 * but there is no upgrade that waits.
    727 				 */
    728 				page_unlock(pp);
    729 			}
    730 			pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va,
    731 			    SE_EXCL);
    732 			if (pp != NULL)
    733 				page_destroy(pp, 0);
    734 		}
    735 	}
    736 
    737 	/* If locked, release physical memory reservation */
    738 	if (kpd->kp_flags & KPD_LOCKED) {
    739 		pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
    740 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
    741 			atomic_add_long(&anon_segkp_pages_locked, -pages);
    742 		page_unresv(pages);
    743 	}
    744 
    745 	vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len);
    746 	kmem_free(kpd, sizeof (struct segkp_data));
    747 }
    748 
    749 /*
    750  * segkp_map_red() will check the current frame pointer against the
    751  * stack base.  If the amount of stack remaining is questionable
    752  * (less than red_minavail), then segkp_map_red() will map in the redzone
    753  * and return 1.  Otherwise, it will return 0.  segkp_map_red() can
    754  * _only_ be called when:
    755  *
    756  *   - it is safe to sleep on page_create_va().
    757  *   - the caller is non-swappable.
    758  *
    759  * It is up to the caller to remember whether segkp_map_red() successfully
    760  * mapped the redzone, and, if so, to call segkp_unmap_red() at a later
    761  * time.  Note that the caller must _remain_ non-swappable until after
    762  * calling segkp_unmap_red().
    763  *
    764  * Currently, this routine is only called from pagefault() (which necessarily
    765  * satisfies the above conditions).
    766  */
    767 #if defined(STACK_GROWTH_DOWN)
    768 int
    769 segkp_map_red(void)
    770 {
    771 	uintptr_t fp = STACK_BIAS + (uintptr_t)getfp();
    772 #ifndef _LP64
    773 	caddr_t stkbase;
    774 #endif
    775 
    776 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
    777 
    778 	/*
    779 	 * Optimize for the common case where we simply return.
    780 	 */
    781 	if ((curthread->t_red_pp == NULL) &&
    782 	    (fp - (uintptr_t)curthread->t_stkbase >= red_minavail))
    783 		return (0);
    784 
    785 #if defined(_LP64)
    786 	/*
    787 	 * XXX	We probably need something better than this.
    788 	 */
    789 	panic("kernel stack overflow");
    790 	/*NOTREACHED*/
    791 #else /* _LP64 */
    792 	if (curthread->t_red_pp == NULL) {
    793 		page_t *red_pp;
    794 		struct seg kseg;
    795 
    796 		caddr_t red_va = (caddr_t)
    797 		    (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) -
    798 		    PAGESIZE);
    799 
    800 		ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) ==
    801 		    NULL);
    802 
    803 		/*
    804 		 * Allocate the physical for the red page.
    805 		 */
    806 		/*
    807 		 * No PG_NORELOC here to avoid waits. Unlikely to get
    808 		 * a relocate happening in the short time the page exists
    809 		 * and it will be OK anyway.
    810 		 */
    811 
    812 		kseg.s_as = &kas;
    813 		red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va,
    814 		    PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va);
    815 		ASSERT(red_pp != NULL);
    816 
    817 		/*
    818 		 * So we now have a page to jam into the redzone...
    819 		 */
    820 		page_io_unlock(red_pp);
    821 
    822 		hat_memload(kas.a_hat, red_va, red_pp,
    823 		    (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK);
    824 		page_downgrade(red_pp);
    825 
    826 		/*
    827 		 * The page is left SE_SHARED locked so we can hold on to
    828 		 * the page_t pointer.
    829 		 */
    830 		curthread->t_red_pp = red_pp;
    831 
    832 		atomic_add_32(&red_nmapped, 1);
    833 		while (fp - (uintptr_t)curthread->t_stkbase < red_closest) {
    834 			(void) cas32(&red_closest, red_closest,
    835 			    (uint32_t)(fp - (uintptr_t)curthread->t_stkbase));
    836 		}
    837 		return (1);
    838 	}
    839 
    840 	stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase &
    841 	    (uintptr_t)PAGEMASK) - PAGESIZE);
    842 
    843 	atomic_add_32(&red_ndoubles, 1);
    844 
    845 	if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) {
    846 		/*
    847 		 * Oh boy.  We're already deep within the mapped-in
    848 		 * redzone page, and the caller is trying to prepare
    849 		 * for a deep stack run.  We're running without a
    850 		 * redzone right now:  if the caller plows off the
    851 		 * end of the stack, it'll plow another thread or
    852 		 * LWP structure.  That situation could result in
    853 		 * a very hard-to-debug panic, so, in the spirit of
    854 		 * recording the name of one's killer in one's own
    855 		 * blood, we're going to record hrestime and the calling
    856 		 * thread.
    857 		 */
    858 		red_deep_hires = hrestime.tv_nsec;
    859 		red_deep_thread = curthread;
    860 	}
    861 
    862 	/*
    863 	 * If this is a DEBUG kernel, and we've run too deep for comfort, toss.
    864 	 */
    865 	ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD);
    866 	return (0);
    867 #endif /* _LP64 */
    868 }
    869 
    870 void
    871 segkp_unmap_red(void)
    872 {
    873 	page_t *pp;
    874 	caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase &
    875 	    (uintptr_t)PAGEMASK) - PAGESIZE);
    876 
    877 	ASSERT(curthread->t_red_pp != NULL);
    878 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
    879 
    880 	/*
    881 	 * Because we locked the mapping down, we can't simply rely
    882 	 * on page_destroy() to clean everything up;  we need to call
    883 	 * hat_unload() to explicitly unlock the mapping resources.
    884 	 */
    885 	hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK);
    886 
    887 	pp = curthread->t_red_pp;
    888 
    889 	ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va));
    890 
    891 	/*
    892 	 * Need to upgrade the SE_SHARED lock to SE_EXCL.
    893 	 */
    894 	if (!page_tryupgrade(pp)) {
    895 		/*
    896 		 * As there is now wait for upgrade, release the
    897 		 * SE_SHARED lock and wait for SE_EXCL.
    898 		 */
    899 		page_unlock(pp);
    900 		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL);
    901 		/* pp may be NULL here, hence the test below */
    902 	}
    903 
    904 	/*
    905 	 * Destroy the page, with dontfree set to zero (i.e. free it).
    906 	 */
    907 	if (pp != NULL)
    908 		page_destroy(pp, 0);
    909 	curthread->t_red_pp = NULL;
    910 }
    911 #else
    912 #error Red stacks only supported with downwards stack growth.
    913 #endif
    914 
    915 /*
    916  * Handle a fault on an address corresponding to one of the
    917  * resources in the segkp segment.
    918  */
    919 faultcode_t
    920 segkp_fault(
    921 	struct hat	*hat,
    922 	struct seg	*seg,
    923 	caddr_t		vaddr,
    924 	size_t		len,
    925 	enum fault_type	type,
    926 	enum seg_rw rw)
    927 {
    928 	struct segkp_data	*kpd = NULL;
    929 	int			err;
    930 
    931 	ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock));
    932 
    933 	/*
    934 	 * Sanity checks.
    935 	 */
    936 	if (type == F_PROT) {
    937 		panic("segkp_fault: unexpected F_PROT fault");
    938 		/*NOTREACHED*/
    939 	}
    940 
    941 	if ((kpd = segkp_find(seg, vaddr)) == NULL)
    942 		return (FC_NOMAP);
    943 
    944 	mutex_enter(&kpd->kp_lock);
    945 
    946 	if (type == F_SOFTLOCK) {
    947 		ASSERT(!(kpd->kp_flags & KPD_LOCKED));
    948 		/*
    949 		 * The F_SOFTLOCK case has more stringent
    950 		 * range requirements: the given range must exactly coincide
    951 		 * with the resource's mapped portion. Note reference to
    952 		 * redzone is handled since vaddr would not equal base
    953 		 */
    954 		if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
    955 		    len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
    956 			mutex_exit(&kpd->kp_lock);
    957 			return (FC_MAKE_ERR(EFAULT));
    958 		}
    959 
    960 		if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) {
    961 			mutex_exit(&kpd->kp_lock);
    962 			return (FC_MAKE_ERR(err));
    963 		}
    964 		kpd->kp_flags |= KPD_LOCKED;
    965 		mutex_exit(&kpd->kp_lock);
    966 		return (0);
    967 	}
    968 
    969 	if (type == F_INVAL) {
    970 		ASSERT(!(kpd->kp_flags & KPD_NO_ANON));
    971 
    972 		/*
    973 		 * Check if we touched the redzone. Somewhat optimistic
    974 		 * here if we are touching the redzone of our own stack
    975 		 * since we wouldn't have a stack to get this far...
    976 		 */
    977 		if ((kpd->kp_flags & KPD_HASREDZONE) &&
    978 		    btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd))
    979 			panic("segkp_fault: accessing redzone");
    980 
    981 		/*
    982 		 * This fault may occur while the page is being F_SOFTLOCK'ed.
    983 		 * Return since a 2nd segkp_load is unnecessary and also would
    984 		 * result in the page being locked twice and eventually
    985 		 * hang the thread_reaper thread.
    986 		 */
    987 		if (kpd->kp_flags & KPD_LOCKED) {
    988 			mutex_exit(&kpd->kp_lock);
    989 			return (0);
    990 		}
    991 
    992 		err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags);
    993 		mutex_exit(&kpd->kp_lock);
    994 		return (err ? FC_MAKE_ERR(err) : 0);
    995 	}
    996 
    997 	if (type == F_SOFTUNLOCK) {
    998 		uint_t	flags;
    999 
   1000 		/*
   1001 		 * Make sure the addr is LOCKED and it has anon backing
   1002 		 * before unlocking
   1003 		 */
   1004 		if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) {
   1005 			panic("segkp_fault: bad unlock");
   1006 			/*NOTREACHED*/
   1007 		}
   1008 
   1009 		if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
   1010 		    len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
   1011 			panic("segkp_fault: bad range");
   1012 			/*NOTREACHED*/
   1013 		}
   1014 
   1015 		if (rw == S_WRITE)
   1016 			flags = kpd->kp_flags | KPD_WRITEDIRTY;
   1017 		else
   1018 			flags = kpd->kp_flags;
   1019 		err = segkp_unlock(hat, seg, vaddr, len, kpd, flags);
   1020 		kpd->kp_flags &= ~KPD_LOCKED;
   1021 		mutex_exit(&kpd->kp_lock);
   1022 		return (err ? FC_MAKE_ERR(err) : 0);
   1023 	}
   1024 	mutex_exit(&kpd->kp_lock);
   1025 	panic("segkp_fault: bogus fault type: %d\n", type);
   1026 	/*NOTREACHED*/
   1027 }
   1028 
   1029 /*
   1030  * Check that the given protections suffice over the range specified by
   1031  * vaddr and len.  For this segment type, the only issue is whether or
   1032  * not the range lies completely within the mapped part of an allocated
   1033  * resource.
   1034  */
   1035 /* ARGSUSED */
   1036 static int
   1037 segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot)
   1038 {
   1039 	struct segkp_data *kpd = NULL;
   1040 	caddr_t mbase;
   1041 	size_t mlen;
   1042 
   1043 	if ((kpd = segkp_find(seg, vaddr)) == NULL)
   1044 		return (EACCES);
   1045 
   1046 	mutex_enter(&kpd->kp_lock);
   1047 	mbase = stom(kpd->kp_base, kpd->kp_flags);
   1048 	mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags);
   1049 	if (len > mlen || vaddr < mbase ||
   1050 	    ((vaddr + len) > (mbase + mlen))) {
   1051 		mutex_exit(&kpd->kp_lock);
   1052 		return (EACCES);
   1053 	}
   1054 	mutex_exit(&kpd->kp_lock);
   1055 	return (0);
   1056 }
   1057 
   1058 
   1059 /*
   1060  * Check to see if it makes sense to do kluster/read ahead to
   1061  * addr + delta relative to the mapping at addr.  We assume here
   1062  * that delta is a signed PAGESIZE'd multiple (which can be negative).
   1063  *
   1064  * For seg_u we always "approve" of this action from our standpoint.
   1065  */
   1066 /*ARGSUSED*/
   1067 static int
   1068 segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
   1069 {
   1070 	return (0);
   1071 }
   1072 
   1073 /*
   1074  * Load and possibly lock intra-slot resources in the range given by
   1075  * vaddr and len.
   1076  */
   1077 static int
   1078 segkp_load(
   1079 	struct hat *hat,
   1080 	struct seg *seg,
   1081 	caddr_t vaddr,
   1082 	size_t len,
   1083 	struct segkp_data *kpd,
   1084 	uint_t flags)
   1085 {
   1086 	caddr_t va;
   1087 	caddr_t vlim;
   1088 	ulong_t i;
   1089 	uint_t lock;
   1090 
   1091 	ASSERT(MUTEX_HELD(&kpd->kp_lock));
   1092 
   1093 	len = P2ROUNDUP(len, PAGESIZE);
   1094 
   1095 	/* If locking, reserve physical memory */
   1096 	if (flags & KPD_LOCKED) {
   1097 		pgcnt_t pages = btop(len);
   1098 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
   1099 			atomic_add_long(&anon_segkp_pages_locked, pages);
   1100 		(void) page_resv(pages, KM_SLEEP);
   1101 	}
   1102 
   1103 	/*
   1104 	 * Loop through the pages in the given range.
   1105 	 */
   1106 	va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
   1107 	vaddr = va;
   1108 	vlim = va + len;
   1109 	lock = flags & KPD_LOCKED;
   1110 	i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
   1111 	for (; va < vlim; va += PAGESIZE, i++) {
   1112 		page_t		*pl[2];	/* second element NULL terminator */
   1113 		struct vnode    *vp;
   1114 		anoff_t		off;
   1115 		int		err;
   1116 		struct anon	*ap;
   1117 
   1118 		/*
   1119 		 * Summon the page.  If it's not resident, arrange
   1120 		 * for synchronous i/o to pull it in.
   1121 		 */
   1122 		ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
   1123 		swap_xlate(ap, &vp, &off);
   1124 
   1125 		/*
   1126 		 * The returned page list will have exactly one entry,
   1127 		 * which is returned to us already kept.
   1128 		 */
   1129 		err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL,
   1130 		    pl, PAGESIZE, seg, va, S_READ, kcred, NULL);
   1131 
   1132 		if (err) {
   1133 			/*
   1134 			 * Back out of what we've done so far.
   1135 			 */
   1136 			(void) segkp_unlock(hat, seg, vaddr,
   1137 			    (va - vaddr), kpd, flags);
   1138 			return (err);
   1139 		}
   1140 
   1141 		/*
   1142 		 * Load an MMU translation for the page.
   1143 		 */
   1144 		hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE),
   1145 		    lock ? HAT_LOAD_LOCK : HAT_LOAD);
   1146 
   1147 		if (!lock) {
   1148 			/*
   1149 			 * Now, release "shared" lock on the page.
   1150 			 */
   1151 			page_unlock(pl[0]);
   1152 		}
   1153 	}
   1154 	return (0);
   1155 }
   1156 
   1157 /*
   1158  * At the very least unload the mmu-translations and unlock the range if locked
   1159  * Can be called with the following flag value KPD_WRITEDIRTY which specifies
   1160  * any dirty pages should be written to disk.
   1161  */
   1162 static int
   1163 segkp_unlock(
   1164 	struct hat *hat,
   1165 	struct seg *seg,
   1166 	caddr_t vaddr,
   1167 	size_t len,
   1168 	struct segkp_data *kpd,
   1169 	uint_t flags)
   1170 {
   1171 	caddr_t va;
   1172 	caddr_t vlim;
   1173 	ulong_t i;
   1174 	struct page *pp;
   1175 	struct vnode *vp;
   1176 	anoff_t off;
   1177 	struct anon *ap;
   1178 
   1179 #ifdef lint
   1180 	seg = seg;
   1181 #endif /* lint */
   1182 
   1183 	ASSERT(MUTEX_HELD(&kpd->kp_lock));
   1184 
   1185 	/*
   1186 	 * Loop through the pages in the given range. It is assumed
   1187 	 * segkp_unlock is called with page aligned base
   1188 	 */
   1189 	va = vaddr;
   1190 	vlim = va + len;
   1191 	i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
   1192 	hat_unload(hat, va, len,
   1193 	    ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
   1194 	for (; va < vlim; va += PAGESIZE, i++) {
   1195 		/*
   1196 		 * Find the page associated with this part of the
   1197 		 * slot, tracking it down through its associated swap
   1198 		 * space.
   1199 		 */
   1200 		ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
   1201 		swap_xlate(ap, &vp, &off);
   1202 
   1203 		if (flags & KPD_LOCKED) {
   1204 			if ((pp = page_find(vp, off)) == NULL) {
   1205 				if (flags & KPD_LOCKED) {
   1206 					panic("segkp_softunlock: missing page");
   1207 					/*NOTREACHED*/
   1208 				}
   1209 			}
   1210 		} else {
   1211 			/*
   1212 			 * Nothing to do if the slot is not locked and the
   1213 			 * page doesn't exist.
   1214 			 */
   1215 			if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL)
   1216 				continue;
   1217 		}
   1218 
   1219 		/*
   1220 		 * If the page doesn't have any translations, is
   1221 		 * dirty and not being shared, then push it out
   1222 		 * asynchronously and avoid waiting for the
   1223 		 * pageout daemon to do it for us.
   1224 		 *
   1225 		 * XXX - Do we really need to get the "exclusive"
   1226 		 * lock via an upgrade?
   1227 		 */
   1228 		if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) &&
   1229 		    hat_ismod(pp) && page_tryupgrade(pp)) {
   1230 			/*
   1231 			 * Hold the vnode before releasing the page lock to
   1232 			 * prevent it from being freed and re-used by some
   1233 			 * other thread.
   1234 			 */
   1235 			VN_HOLD(vp);
   1236 			page_unlock(pp);
   1237 
   1238 			/*
   1239 			 * Want most powerful credentials we can get so
   1240 			 * use kcred.
   1241 			 */
   1242 			(void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
   1243 			    B_ASYNC | B_FREE, kcred, NULL);
   1244 			VN_RELE(vp);
   1245 		} else {
   1246 			page_unlock(pp);
   1247 		}
   1248 	}
   1249 
   1250 	/* If unlocking, release physical memory */
   1251 	if (flags & KPD_LOCKED) {
   1252 		pgcnt_t pages = btopr(len);
   1253 		if ((kpd->kp_flags & KPD_NO_ANON) == 0)
   1254 			atomic_add_long(&anon_segkp_pages_locked, -pages);
   1255 		page_unresv(pages);
   1256 	}
   1257 	return (0);
   1258 }
   1259 
   1260 /*
   1261  * Insert the kpd in the hash table.
   1262  */
   1263 static void
   1264 segkp_insert(struct seg *seg, struct segkp_data *kpd)
   1265 {
   1266 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
   1267 	int index;
   1268 
   1269 	/*
   1270 	 * Insert the kpd based on the address that will be returned
   1271 	 * via segkp_release.
   1272 	 */
   1273 	index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
   1274 	mutex_enter(&segkp_lock);
   1275 	kpd->kp_next = kpsd->kpsd_hash[index];
   1276 	kpsd->kpsd_hash[index] = kpd;
   1277 	mutex_exit(&segkp_lock);
   1278 }
   1279 
   1280 /*
   1281  * Remove kpd from the hash table.
   1282  */
   1283 static void
   1284 segkp_delete(struct seg *seg, struct segkp_data *kpd)
   1285 {
   1286 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
   1287 	struct segkp_data **kpp;
   1288 	int index;
   1289 
   1290 	ASSERT(MUTEX_HELD(&segkp_lock));
   1291 
   1292 	index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
   1293 	for (kpp = &kpsd->kpsd_hash[index];
   1294 	    *kpp != NULL; kpp = &((*kpp)->kp_next)) {
   1295 		if (*kpp == kpd) {
   1296 			*kpp = kpd->kp_next;
   1297 			return;
   1298 		}
   1299 	}
   1300 	panic("segkp_delete: unable to find element to delete");
   1301 	/*NOTREACHED*/
   1302 }
   1303 
   1304 /*
   1305  * Find the kpd associated with a vaddr.
   1306  *
   1307  * Most of the callers of segkp_find will pass the vaddr that
   1308  * hashes to the desired index, but there are cases where
   1309  * this is not true in which case we have to (potentially) scan
   1310  * the whole table looking for it. This should be very rare
   1311  * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the
   1312  * middle of the segkp_data region).
   1313  */
   1314 static struct segkp_data *
   1315 segkp_find(struct seg *seg, caddr_t vaddr)
   1316 {
   1317 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
   1318 	struct segkp_data *kpd;
   1319 	int	i;
   1320 	int	stop;
   1321 
   1322 	i = stop = SEGKP_HASH(vaddr);
   1323 	mutex_enter(&segkp_lock);
   1324 	do {
   1325 		for (kpd = kpsd->kpsd_hash[i]; kpd != NULL;
   1326 		    kpd = kpd->kp_next) {
   1327 			if (vaddr >= kpd->kp_base &&
   1328 			    vaddr < kpd->kp_base + kpd->kp_len) {
   1329 				mutex_exit(&segkp_lock);
   1330 				return (kpd);
   1331 			}
   1332 		}
   1333 		if (--i < 0)
   1334 			i = SEGKP_HASHSZ - 1;	/* Wrap */
   1335 	} while (i != stop);
   1336 	mutex_exit(&segkp_lock);
   1337 	return (NULL);		/* Not found */
   1338 }
   1339 
   1340 /*
   1341  * returns size of swappable area.
   1342  */
   1343 size_t
   1344 swapsize(caddr_t v)
   1345 {
   1346 	struct segkp_data *kpd;
   1347 
   1348 	if ((kpd = segkp_find(segkp, v)) != NULL)
   1349 		return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
   1350 	else
   1351 		return (NULL);
   1352 }
   1353 
   1354 /*
   1355  * Dump out all the active segkp pages
   1356  */
   1357 static void
   1358 segkp_dump(struct seg *seg)
   1359 {
   1360 	int i;
   1361 	struct segkp_data *kpd;
   1362 	struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
   1363 
   1364 	for (i = 0; i < SEGKP_HASHSZ; i++) {
   1365 		for (kpd = kpsd->kpsd_hash[i];
   1366 		    kpd != NULL; kpd = kpd->kp_next) {
   1367 			pfn_t pfn;
   1368 			caddr_t addr;
   1369 			caddr_t eaddr;
   1370 
   1371 			addr = kpd->kp_base;
   1372 			eaddr = addr + kpd->kp_len;
   1373 			while (addr < eaddr) {
   1374 				ASSERT(seg->s_as == &kas);
   1375 				pfn = hat_getpfnum(seg->s_as->a_hat, addr);
   1376 				if (pfn != PFN_INVALID)
   1377 					dump_addpage(seg->s_as, addr, pfn);
   1378 				addr += PAGESIZE;
   1379 				dump_timeleft = dump_timeout;
   1380 			}
   1381 		}
   1382 	}
   1383 }
   1384 
   1385 /*ARGSUSED*/
   1386 static int
   1387 segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
   1388     struct page ***ppp, enum lock_type type, enum seg_rw rw)
   1389 {
   1390 	return (ENOTSUP);
   1391 }
   1392 
   1393 /*ARGSUSED*/
   1394 static int
   1395 segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
   1396 {
   1397 	return (ENODEV);
   1398 }
   1399 
   1400 /*ARGSUSED*/
   1401 static lgrp_mem_policy_info_t	*
   1402 segkp_getpolicy(struct seg *seg, caddr_t addr)
   1403 {
   1404 	return (NULL);
   1405 }
   1406 
   1407 /*ARGSUSED*/
   1408 static int
   1409 segkp_capable(struct seg *seg, segcapability_t capability)
   1410 {
   1411 	return (0);
   1412 }
   1413 
   1414 #include <sys/mem_config.h>
   1415 
   1416 /*ARGSUSED*/
   1417 static void
   1418 segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages)
   1419 {}
   1420 
   1421 /*
   1422  * During memory delete, turn off caches so that pages are not held.
   1423  * A better solution may be to unlock the pages while they are
   1424  * in the cache so that they may be collected naturally.
   1425  */
   1426 
   1427 /*ARGSUSED*/
   1428 static int
   1429 segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages)
   1430 {
   1431 	atomic_add_32(&segkp_indel, 1);
   1432 	segkp_cache_free();
   1433 	return (0);
   1434 }
   1435 
   1436 /*ARGSUSED*/
   1437 static void
   1438 segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
   1439 {
   1440 	atomic_add_32(&segkp_indel, -1);
   1441 }
   1442 
   1443 static kphysm_setup_vector_t segkp_mem_config_vec = {
   1444 	KPHYSM_SETUP_VECTOR_VERSION,
   1445 	segkp_mem_config_post_add,
   1446 	segkp_mem_config_pre_del,
   1447 	segkp_mem_config_post_del,
   1448 };
   1449 
   1450 static void
   1451 segkpinit_mem_config(struct seg *seg)
   1452 {
   1453 	int ret;
   1454 
   1455 	ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg);
   1456 	ASSERT(ret == 0);
   1457 }
   1458