Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 /*
     40  * VM - shared or copy-on-write from a vnode/anonymous memory.
     41  */
     42 
     43 #include <sys/types.h>
     44 #include <sys/param.h>
     45 #include <sys/t_lock.h>
     46 #include <sys/errno.h>
     47 #include <sys/systm.h>
     48 #include <sys/mman.h>
     49 #include <sys/debug.h>
     50 #include <sys/cred.h>
     51 #include <sys/vmsystm.h>
     52 #include <sys/tuneable.h>
     53 #include <sys/bitmap.h>
     54 #include <sys/swap.h>
     55 #include <sys/kmem.h>
     56 #include <sys/sysmacros.h>
     57 #include <sys/vtrace.h>
     58 #include <sys/cmn_err.h>
     59 #include <sys/callb.h>
     60 #include <sys/vm.h>
     61 #include <sys/dumphdr.h>
     62 #include <sys/lgrp.h>
     63 
     64 #include <vm/hat.h>
     65 #include <vm/as.h>
     66 #include <vm/seg.h>
     67 #include <vm/seg_vn.h>
     68 #include <vm/pvn.h>
     69 #include <vm/anon.h>
     70 #include <vm/page.h>
     71 #include <vm/vpage.h>
     72 #include <sys/proc.h>
     73 #include <sys/task.h>
     74 #include <sys/project.h>
     75 #include <sys/zone.h>
     76 #include <sys/shm_impl.h>
     77 /*
     78  * Private seg op routines.
     79  */
     80 static int	segvn_dup(struct seg *seg, struct seg *newseg);
     81 static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
     82 static void	segvn_free(struct seg *seg);
     83 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
     84 		    caddr_t addr, size_t len, enum fault_type type,
     85 		    enum seg_rw rw);
     86 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
     87 static int	segvn_setprot(struct seg *seg, caddr_t addr,
     88 		    size_t len, uint_t prot);
     89 static int	segvn_checkprot(struct seg *seg, caddr_t addr,
     90 		    size_t len, uint_t prot);
     91 static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
     92 static size_t	segvn_swapout(struct seg *seg);
     93 static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
     94 		    int attr, uint_t flags);
     95 static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
     96 		    char *vec);
     97 static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
     98 		    int attr, int op, ulong_t *lockmap, size_t pos);
     99 static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
    100 		    uint_t *protv);
    101 static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
    102 static int	segvn_gettype(struct seg *seg, caddr_t addr);
    103 static int	segvn_getvp(struct seg *seg, caddr_t addr,
    104 		    struct vnode **vpp);
    105 static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
    106 		    uint_t behav);
    107 static void	segvn_dump(struct seg *seg);
    108 static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
    109 		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
    110 static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
    111 		    uint_t szc);
    112 static int	segvn_getmemid(struct seg *seg, caddr_t addr,
    113 		    memid_t *memidp);
    114 static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
    115 static int	segvn_capable(struct seg *seg, segcapability_t capable);
    116 
    117 struct	seg_ops segvn_ops = {
        	/*
        	 * Operations vector for segvn segments; segvn_create() installs
        	 * it as seg->s_ops.  The initializer is positional, so the
        	 * entries below must stay in the member order of struct seg_ops.
        	 * Each entry is one of the private segvn_* routines declared
        	 * above.
        	 */
    118 	segvn_dup,
    119 	segvn_unmap,
    120 	segvn_free,
    121 	segvn_fault,
    122 	segvn_faulta,
    123 	segvn_setprot,
    124 	segvn_checkprot,
    125 	segvn_kluster,
    126 	segvn_swapout,
    127 	segvn_sync,
    128 	segvn_incore,
    129 	segvn_lockop,
    130 	segvn_getprot,
    131 	segvn_getoffset,
    132 	segvn_gettype,
    133 	segvn_getvp,
    134 	segvn_advise,
    135 	segvn_dump,
    136 	segvn_pagelock,
    137 	segvn_setpagesize,
    138 	segvn_getmemid,
    139 	segvn_getpolicy,
    140 	segvn_capable,
    141 };
    142 
    143 /*
    144  * Common zfod structures, provided as a shorthand for others to use.
         *
         * Three argument sets are built with SEGVN_ZFOD_ARGS():
         *	zfod_segvn_crargs	PROT_ZFOD and PROT_ALL unchanged
         *	kzfod_segvn_crargs	PROT_USER masked out of both arguments
         *	stack_noexec_crargs	PROT_EXEC masked out of the first argument
         *				only
         * NOTE(review): the two macro arguments are presumably (initial prot,
         * maximum prot) -- confirm against the SEGVN_ZFOD_ARGS definition.
         *
         * The *_argsp globals below export the addresses of these structures as
         * caddr_t.  stack_exec_argsp points at the same structure as zfod_argsp.
    145  */
    146 static segvn_crargs_t zfod_segvn_crargs =
    147 	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
    148 static segvn_crargs_t kzfod_segvn_crargs =
    149 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
    150 	PROT_ALL & ~PROT_USER);
    151 static segvn_crargs_t stack_noexec_crargs =
    152 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
    153 
    154 caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
    155 caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
    156 caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
    157 caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
    158 
    159 #define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */
    160 
        /*
         * Size limit consulted by segvn_create() when merging a new segment into
         * the preceding one: besides its other conditions, the merge is only
         * attempted if the combined size is <= this value or the preceding
         * segment has no amp.
         */
    161 size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */
    162 
        /*
         * Pagelock combining tunables.  segvn_init() resets
         * segvn_pglock_comb_balign to 64K unless it is a power of two that is at
         * least PAGESIZE, then derives
         *	segvn_pglock_comb_bshift = highbit(balign) - 1
         *	segvn_pglock_comb_palign = btop(balign)
         * NOTE(review): the consumers of these values are outside this view.
         */
    163 size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
    164 size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
    165 uint_t	segvn_pglock_comb_bshift;
    166 size_t	segvn_pglock_comb_palign;
    167 
    168 static int	segvn_concat(struct seg *, struct seg *, int);
    169 static int	segvn_extend_prev(struct seg *, struct seg *,
    170 		    struct segvn_crargs *, size_t);
    171 static int	segvn_extend_next(struct seg *, struct seg *,
    172 		    struct segvn_crargs *, size_t);
    173 static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
    174 static void	segvn_pagelist_rele(page_t **);
    175 static void	segvn_setvnode_mpss(vnode_t *);
    176 static void	segvn_relocate_pages(page_t **, page_t *);
    177 static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
    178 static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    179     uint_t, page_t **, page_t **, uint_t *, int *);
    180 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    181     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
    182 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    183     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
    184 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    185     u_offset_t, struct vpage *, page_t **, uint_t,
    186     enum fault_type, enum seg_rw, int);
    187 static void	segvn_vpage(struct seg *);
    188 static size_t	segvn_count_swap_by_vpages(struct seg *);
    189 
    190 static void segvn_purge(struct seg *seg);
    191 static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
    192     enum seg_rw, int);
    193 static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
    194     enum seg_rw, int);
    195 
    196 static int sameprot(struct seg *, caddr_t, size_t);
    197 
    198 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
    199 static int segvn_clrszc(struct seg *);
    200 static struct seg *segvn_split_seg(struct seg *, caddr_t);
    201 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    202     ulong_t, uint_t);
    203 
    204 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    205     size_t, void *, u_offset_t);
    206 
    207 static struct kmem_cache *segvn_cache;	/* struct segvn_data buffers */
        /*
         * Array of caches created by segvn_init() for szc 1..segvn_maxpgszc;
         * entry szc hands out buffers of page_get_pagecnt(szc) page_t pointers.
         * It is only allocated when segvn_maxpgszc is non-zero.
         */
    208 static struct kmem_cache **segvn_szc_cache;
    209 
        /*
         * Arrays of event counters, compiled in only when VM_STATS is defined.
         * NOTE(review): the code that updates them is outside this view; by name
         * each array belongs to one routine (fill_vp_pages, fault vnode pages,
         * and so on) -- confirm against the users.
         */
    210 #ifdef VM_STATS
    211 static struct segvnvmstats_str {
    212 	ulong_t	fill_vp_pages[31];
    213 	ulong_t fltvnpages[49];
    214 	ulong_t	fullszcpages[10];
    215 	ulong_t	relocatepages[3];
    216 	ulong_t	fltanpages[17];
    217 	ulong_t pagelock[2];
    218 	ulong_t	demoterange[3];
    219 } segvnvmstats;
    220 #endif /* VM_STATS */
    221 
        /*
         * Demotion modes.  NOTE(review): presumably the flag argument of
         * segvn_demote_range() -- confirm against its callers.
         */
    222 #define	SDR_RANGE	1		/* demote entire range */
    223 #define	SDR_END		2		/* demote non aligned ends only */
    224 
    225 #define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
    226 		if ((len) != 0) { 		      	      		      \
    227 			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
    228 			ASSERT(lpgaddr >= (seg)->s_base);	      	      \
    229 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
    230 			    (len)), pgsz);				      \
    231 			ASSERT(lpgeaddr > lpgaddr);		      	      \
    232 			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
    233 		} else {					      	      \
    234 			lpgeaddr = lpgaddr = (addr);	      		      \
    235 		}							      \
    236 	}
    237 
    238 /*ARGSUSED*/
    239 static int
    240 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
    241 {
    242 	struct segvn_data *svd = buf;
    243 
    244 	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
    245 	mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
    246 	svd->svn_trnext = svd->svn_trprev = NULL;
    247 	return (0);
    248 }
    249 
    250 /*ARGSUSED1*/
    251 static void
    252 segvn_cache_destructor(void *buf, void *cdrarg)
    253 {
    254 	struct segvn_data *svd = buf;
    255 
    256 	rw_destroy(&svd->lock);
    257 	mutex_destroy(&svd->segfree_syncmtx);
    258 }
    259 
    260 /*ARGSUSED*/
    261 static int
    262 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
    263 {
    264 	bzero(buf, sizeof (svntr_t));
    265 	return (0);
    266 }
    267 
    268 /*
    269  * Patching this variable to non-zero allows the system to run with
    270  * stacks marked as "not executable".  It's a bit of a kludge, but is
    271  * provided as a tweakable for platforms that export those ABIs
    272  * (e.g. sparc V8) that have executable stacks enabled by default.
    273  * There are also some restrictions for platforms that don't actually
    274  * implement 'noexec' protections.
    275  *
    276  * Once enabled, the system is (therefore) unable to provide a fully
    277  * ABI-compliant execution environment, though practically speaking,
    278  * most everything works.  The exceptions are generally some interpreters
    279  * and debuggers that create executable code on the stack and jump
    280  * into it (without explicitly mprotecting the address range to include
    281  * PROT_EXEC).
    282  *
    283  * One important class of applications that are disabled are those
    284  * that have been transformed into malicious agents using one of the
    285  * numerous "buffer overflow" attacks.  See 4007890.
    286  */
    287 int noexec_user_stack = 0;
    288 int noexec_user_stack_log = 1;
    289 
        /*
         * Large page tunables.  While segvn_lpg_disable is non-zero,
         * segvn_create() forces the requested page size code to 0; segvn_init()
         * sets it when the platform reports a single page size.
         * segvn_maxpgszc caps the page size code: if large pages are enabled,
         * segvn_init() replaces a value of 0, or one above the largest available
         * szc, with that largest szc.
         */
    290 int segvn_lpg_disable = 0;
    291 uint_t segvn_maxpgszc = 0;
    292 
        /*
         * Event counters.  NOTE(review): the increments are outside this view; by
         * name they count clrszc calls/failures, setpagesize rejections and MPSS
         * vnode fault error paths -- confirm against the users.
         */
    293 ulong_t segvn_vmpss_clrszc_cnt;
    294 ulong_t segvn_vmpss_clrszc_err;
    295 ulong_t segvn_fltvnpages_clrszc_cnt;
    296 ulong_t segvn_fltvnpages_clrszc_err;
    297 ulong_t segvn_setpgsz_align_err;
    298 ulong_t segvn_setpgsz_anon_align_err;
    299 ulong_t segvn_setpgsz_getattr_err;
    300 ulong_t segvn_setpgsz_eof_err;
    301 ulong_t segvn_faultvnmpss_align_err1;
    302 ulong_t segvn_faultvnmpss_align_err2;
    303 ulong_t segvn_faultvnmpss_align_err3;
    304 ulong_t segvn_faultvnmpss_align_err4;
    305 ulong_t segvn_faultvnmpss_align_err5;
    306 ulong_t	segvn_vmpss_pageio_deadlk_err;
    307 
        /*
         * Non-zero asks for hat shared regions.  segvn_init() clears it when
         * hat_supported(HAT_SHARED_REGIONS, NULL) fails, and if it is still set
         * afterwards forces segvn_disable_textrepl to 1.  segvn_create() consults
         * it when deciding whether a private read/exec text mapping may use a
         * region.
         */
    308 int segvn_use_regions = 1;
    309 
    310 /*
    311  * Segvn supports text replication optimization for NUMA platforms. Text
    312  * replicas are represented by anon maps (amp). There's one amp per text file
    313  * region per lgroup. A process chooses the amp for each of its text mappings
    314  * based on the lgroup assignment of its main thread (t_tid = 1). All
    315  * processes that want a replica on a particular lgroup for the same text file
    316  * mapping share the same amp. amp's are looked up in svntr_hashtab hash table
    317  * with vp,off,size,szc used as a key. Text replication segments are read only
    318  * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
    319  * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
    320  * pages. Replication amp is assigned to a segment when it gets its first
    321  * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread
    322  * rechecks periodically if the process still maps an amp local to the main
    323  * thread. If not async thread forces process to remap to an amp in the new
    324  * home lgroup of the main thread. Current text replication implementation
    325  * only provides the benefit to workloads that do most of their work in the
    326  * main thread of a process or all the threads of a process run in the same
    327  * lgroup. To extend text replication benefit to different types of
    328  * multithreaded workloads further work would be needed in the hat layer to
    329  * allow the same virtual address in the same hat to simultaneously map
    330  * different physical addresses (i.e. page table replication would be needed
    331  * for x86).
    332  *
    333  * amp pages are used instead of vnode pages as long as segment has a very
    334  * simple life cycle.  It's created via segvn_create(), handles S_EXEC
    335  * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
    336  * happens such as protection is changed, real COW fault happens, pagesize is
    337  * changed, MC_LOCK is requested or segment is partially unmapped we turn off
    338  * text replication by converting the segment back to vnode only segment
    339  * (unmap segment's address range and set svd->amp to NULL).
    340  *
    341  * The original file can be changed after amp is inserted into
    342  * svntr_hashtab. Processes that are launched after the file is already
    343  * changed can't use the replicas created prior to the file change. To
    344  * implement this functionality hash entries are timestamped. Replicas can
    345  * only be used if current file modification time is the same as the timestamp
    346  * saved when hash entry was created. However just timestamps alone are not
    347  * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We
    348  * deal with file changes via MAP_SHARED mappings differently. When writable
    349  * MAP_SHARED mappings are created to vnodes marked as executable we mark all
    350  * existing replicas for this vnode as not usable for future text
    351  * mappings. And we don't create new replicas for files that currently have
    352  * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
    353  * true).
    354  */
    355 
    356 #define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
    357 size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
        /*
         * The factor above is a divisor: when segvn_init() sets up text
         * replication it computes segvn_textrepl_max_bytes as
         * ptob(physmem) / segvn_textrepl_max_bytes_factor.
         */
    358 
        /*
         * Text replication state.  segvn_init() allocates the hash table
         * (svntr_hashtab_sz buckets, one mutex each), the svntr_t cache, NCPU
         * svntr_stats_t slots and the semaphore, and starts
         * segvn_trasync_thread, only on _LP64 kernels when lgrp_optimizations()
         * is true, textrepl_size_thresh is not (size_t)-1 and
         * segvn_disable_textrepl is 0.  Otherwise svntr_hashtab stays NULL, and
         * segvn_create() then never selects text replication for a segment.
         */
    359 static ulong_t			svntr_hashtab_sz = 512;
    360 static svntr_bucket_t		*svntr_hashtab = NULL;
    361 static struct kmem_cache	*svntr_cache;
    362 static svntr_stats_t		*segvn_textrepl_stats;
    363 static ksema_t 			segvn_trasync_sem;
    364 
        /*
         * Tunables.  With the defaults below (segvn_disable_textrepl = 1 and
         * textrepl_size_thresh = (size_t)-1) segvn_init() skips the setup
         * described above.  segvn_create() compares the segment size against
         * textrepl_size_thresh when deciding whether a text mapping qualifies.
         * NOTE(review): the users of the update interval/time variables are
         * outside this view.
         */
    365 int				segvn_disable_textrepl = 1;
    366 size_t				textrepl_size_thresh = (size_t)-1;
    367 size_t				segvn_textrepl_bytes = 0;
    368 size_t				segvn_textrepl_max_bytes = 0;
    369 clock_t				segvn_update_textrepl_interval = 0;
    370 int				segvn_update_tr_time = 10;
    371 int				segvn_disable_textrepl_update = 0;
    372 
    373 static void segvn_textrepl(struct seg *);
    374 static void segvn_textunrepl(struct seg *, int);
    375 static void segvn_inval_trcache(vnode_t *);
    376 static void segvn_trasync_thread(void);
    377 static void segvn_trupdate_wakeup(void *);
    378 static void segvn_trupdate(void);
    379 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
    380     ulong_t);
    381 
    382 /*
    383  * Initialize segvn data structures
    384  */
    385 void
    386 segvn_init(void)
    387 {
    388 	uint_t maxszc;
    389 	uint_t szc;
    390 	size_t pgsz;
    391 
    392 	segvn_cache = kmem_cache_create("segvn_cache",
    393 	    sizeof (struct segvn_data), 0,
    394 	    segvn_cache_constructor, segvn_cache_destructor, NULL,
    395 	    NULL, NULL, 0);
    396 
    397 	if (segvn_lpg_disable == 0) {
    398 		szc = maxszc = page_num_pagesizes() - 1;
    399 		if (szc == 0) {
    400 			segvn_lpg_disable = 1;
    401 		}
    402 		if (page_get_pagesize(0) != PAGESIZE) {
    403 			panic("segvn_init: bad szc 0");
    404 			/*NOTREACHED*/
    405 		}
    406 		while (szc != 0) {
    407 			pgsz = page_get_pagesize(szc);
    408 			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
    409 				panic("segvn_init: bad szc %d", szc);
    410 				/*NOTREACHED*/
    411 			}
    412 			szc--;
    413 		}
    414 		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
    415 			segvn_maxpgszc = maxszc;
    416 	}
    417 
    418 	if (segvn_maxpgszc) {
    419 		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
    420 		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
    421 		    KM_SLEEP);
    422 	}
    423 
    424 	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
    425 		char	str[32];
    426 
    427 		(void) sprintf(str, "segvn_szc_cache%d", szc);
    428 		segvn_szc_cache[szc] = kmem_cache_create(str,
    429 		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
    430 		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
    431 	}
    432 
    433 
    434 	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
    435 		segvn_use_regions = 0;
    436 
    437 	/*
    438 	 * For now shared regions and text replication segvn support
    439 	 * are mutually exclusive. This is acceptable because
    440 	 * currently significant benefit from text replication was
    441 	 * only observed on AMD64 NUMA platforms (due to relatively
    442 	 * small L2$ size) and currently we don't support shared
    443 	 * regions on x86.
    444 	 */
    445 	if (segvn_use_regions && !segvn_disable_textrepl) {
    446 		segvn_disable_textrepl = 1;
    447 	}
    448 
    449 #if defined(_LP64)
    450 	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
    451 	    !segvn_disable_textrepl) {
    452 		ulong_t i;
    453 		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
    454 
    455 		svntr_cache = kmem_cache_create("svntr_cache",
    456 		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
    457 		    NULL, NULL, NULL, 0);
    458 		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
    459 		for (i = 0; i < svntr_hashtab_sz; i++) {
    460 			mutex_init(&svntr_hashtab[i].tr_lock,  NULL,
    461 			    MUTEX_DEFAULT, NULL);
    462 		}
    463 		segvn_textrepl_max_bytes = ptob(physmem) /
    464 		    segvn_textrepl_max_bytes_factor;
    465 		segvn_textrepl_stats = kmem_zalloc(NCPU *
    466 		    sizeof (svntr_stats_t), KM_SLEEP);
    467 		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
    468 		(void) thread_create(NULL, 0, segvn_trasync_thread,
    469 		    NULL, 0, &p0, TS_RUN, minclsyspri);
    470 	}
    471 #endif
    472 
    473 	if (!ISP2(segvn_pglock_comb_balign) ||
    474 	    segvn_pglock_comb_balign < PAGESIZE) {
    475 		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
    476 	}
    477 	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
    478 	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
    479 }
    480 
    481 #define	SEGVN_PAGEIO	((void *)0x1)
    482 #define	SEGVN_NOPAGEIO	((void *)0x2)
    483 
    484 static void
    485 segvn_setvnode_mpss(vnode_t *vp)
    486 {
    487 	int err;
    488 
    489 	ASSERT(vp->v_mpssdata == NULL ||
    490 	    vp->v_mpssdata == SEGVN_PAGEIO ||
    491 	    vp->v_mpssdata == SEGVN_NOPAGEIO);
    492 
    493 	if (vp->v_mpssdata == NULL) {
    494 		if (vn_vmpss_usepageio(vp)) {
    495 			err = VOP_PAGEIO(vp, (page_t *)NULL,
    496 			    (u_offset_t)0, 0, 0, CRED(), NULL);
    497 		} else {
    498 			err = ENOSYS;
    499 		}
    500 		/*
    501 		 * set v_mpssdata just once per vnode life
    502 		 * so that it never changes.
    503 		 */
    504 		mutex_enter(&vp->v_lock);
    505 		if (vp->v_mpssdata == NULL) {
    506 			if (err == EINVAL) {
    507 				vp->v_mpssdata = SEGVN_PAGEIO;
    508 			} else {
    509 				vp->v_mpssdata = SEGVN_NOPAGEIO;
    510 			}
    511 		}
    512 		mutex_exit(&vp->v_lock);
    513 	}
    514 }
    515 
    516 int
    517 segvn_create(struct seg *seg, void *argsp)
    518 {
    519 	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
    520 	struct segvn_data *svd;
    521 	size_t swresv = 0;
    522 	struct cred *cred;
    523 	struct anon_map *amp;
    524 	int error = 0;
    525 	size_t pgsz;
    526 	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
    527 	int use_rgn = 0;
    528 	int trok = 0;
    529 
    530 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    531 
    532 	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
    533 		panic("segvn_create type");
    534 		/*NOTREACHED*/
    535 	}
    536 
    537 	/*
    538 	 * Check arguments.  If a shared anon structure is given then
    539 	 * it is illegal to also specify a vp.
    540 	 */
    541 	if (a->amp != NULL && a->vp != NULL) {
    542 		panic("segvn_create anon_map");
    543 		/*NOTREACHED*/
    544 	}
    545 
    546 	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
    547 	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
    548 	    segvn_use_regions) {
    549 		use_rgn = 1;
    550 	}
    551 
    552 	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
    553 	if (a->type == MAP_SHARED)
    554 		a->flags &= ~MAP_NORESERVE;
    555 
    556 	if (a->szc != 0) {
    557 		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
    558 		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
    559 		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
    560 			a->szc = 0;
    561 		} else {
    562 			if (a->szc > segvn_maxpgszc)
    563 				a->szc = segvn_maxpgszc;
    564 			pgsz = page_get_pagesize(a->szc);
    565 			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
    566 			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
    567 				a->szc = 0;
    568 			} else if (a->vp != NULL) {
    569 				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
    570 					/*
    571 					 * paranoid check.
    572 					 * hat_page_demote() is not supported
    573 					 * on swapfs pages.
    574 					 */
    575 					a->szc = 0;
    576 				} else if (map_addr_vacalign_check(seg->s_base,
    577 				    a->offset & PAGEMASK)) {
    578 					a->szc = 0;
    579 				}
    580 			} else if (a->amp != NULL) {
    581 				pgcnt_t anum = btopr(a->offset);
    582 				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
    583 				if (!IS_P2ALIGNED(anum, pgcnt)) {
    584 					a->szc = 0;
    585 				}
    586 			}
    587 		}
    588 	}
    589 
    590 	/*
    591 	 * If segment may need private pages, reserve them now.
    592 	 */
    593 	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
    594 	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
    595 		if (anon_resv_zone(seg->s_size,
    596 		    seg->s_as->a_proc->p_zone) == 0)
    597 			return (EAGAIN);
    598 		swresv = seg->s_size;
    599 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
    600 		    seg, swresv, 1);
    601 	}
    602 
    603 	/*
    604 	 * Reserve any mapping structures that may be required.
    605 	 *
    606 	 * Don't do it for segments that may use regions. It's currently a
    607 	 * noop in the hat implementations anyway.
    608 	 */
    609 	if (!use_rgn) {
    610 		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
    611 	}
    612 
    613 	if (a->cred) {
    614 		cred = a->cred;
    615 		crhold(cred);
    616 	} else {
    617 		crhold(cred = CRED());
    618 	}
    619 
    620 	/* Inform the vnode of the new mapping */
    621 	if (a->vp != NULL) {
    622 		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
    623 		    seg->s_as, seg->s_base, seg->s_size, a->prot,
    624 		    a->maxprot, a->type, cred, NULL);
    625 		if (error) {
    626 			if (swresv != 0) {
    627 				anon_unresv_zone(swresv,
    628 				    seg->s_as->a_proc->p_zone);
    629 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
    630 				    "anon proc:%p %lu %u", seg, swresv, 0);
    631 			}
    632 			crfree(cred);
    633 			if (!use_rgn) {
    634 				hat_unload(seg->s_as->a_hat, seg->s_base,
    635 				    seg->s_size, HAT_UNLOAD_UNMAP);
    636 			}
    637 			return (error);
    638 		}
    639 		/*
    640 		 * svntr_hashtab will be NULL if we support shared regions.
    641 		 */
    642 		trok = ((a->flags & MAP_TEXT) &&
    643 		    (seg->s_size > textrepl_size_thresh ||
    644 		    (a->flags & _MAP_TEXTREPL)) &&
    645 		    lgrp_optimizations() && svntr_hashtab != NULL &&
    646 		    a->type == MAP_PRIVATE && swresv == 0 &&
    647 		    !(a->flags & MAP_NORESERVE) &&
    648 		    seg->s_as != &kas && a->vp->v_type == VREG);
    649 
    650 		ASSERT(!trok || !use_rgn);
    651 	}
    652 
    653 	/*
    654 	 * If more than one segment in the address space, and they're adjacent
    655 	 * virtually, try to concatenate them.  Don't concatenate if an
    656 	 * explicit anon_map structure was supplied (e.g., SystemV shared
    657 	 * memory) or if we'll use text replication for this segment.
    658 	 */
    659 	if (a->amp == NULL && !use_rgn && !trok) {
    660 		struct seg *pseg, *nseg;
    661 		struct segvn_data *psvd, *nsvd;
    662 		lgrp_mem_policy_t ppolicy, npolicy;
    663 		uint_t	lgrp_mem_policy_flags = 0;
    664 		extern lgrp_mem_policy_t lgrp_mem_default_policy;
    665 
    666 		/*
    667 		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
    668 		 * extending stack/heap segments.
    669 		 */
    670 		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
    671 		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
    672 			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
    673 		} else {
    674 			/*
    675 			 * Get policy when not extending it from another segment
    676 			 */
    677 			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
    678 		}
    679 
    680 		/*
    681 		 * First, try to concatenate the previous and new segments
    682 		 */
    683 		pseg = AS_SEGPREV(seg->s_as, seg);
    684 		if (pseg != NULL &&
    685 		    pseg->s_base + pseg->s_size == seg->s_base &&
    686 		    pseg->s_ops == &segvn_ops) {
    687 			/*
    688 			 * Get memory allocation policy from previous segment.
    689 			 * When extension is specified (e.g. for heap) apply
    690 			 * this policy to the new segment regardless of the
    691 			 * outcome of segment concatenation.  Extension occurs
    692 			 * for non-default policy otherwise default policy is
    693 			 * used and is based on extended segment size.
    694 			 */
    695 			psvd = (struct segvn_data *)pseg->s_data;
    696 			ppolicy = psvd->policy_info.mem_policy;
    697 			if (lgrp_mem_policy_flags ==
    698 			    LGRP_MP_FLAG_EXTEND_UP) {
    699 				if (ppolicy != lgrp_mem_default_policy) {
    700 					mpolicy = ppolicy;
    701 				} else {
    702 					mpolicy = lgrp_mem_policy_default(
    703 					    pseg->s_size + seg->s_size,
    704 					    a->type);
    705 				}
    706 			}
    707 
    708 			if (mpolicy == ppolicy &&
    709 			    (pseg->s_size + seg->s_size <=
    710 			    segvn_comb_thrshld || psvd->amp == NULL) &&
    711 			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
    712 				/*
    713 				 * success! now try to concatenate
    714 				 * with following seg
    715 				 */
    716 				crfree(cred);
    717 				nseg = AS_SEGNEXT(pseg->s_as, pseg);
    718 				if (nseg != NULL &&
    719 				    nseg != pseg &&
    720 				    nseg->s_ops == &segvn_ops &&
    721 				    pseg->s_base + pseg->s_size ==
    722 				    nseg->s_base)
    723 					(void) segvn_concat(pseg, nseg, 0);
    724 				ASSERT(pseg->s_szc == 0 ||
    725 				    (a->szc == pseg->s_szc &&
    726 				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
    727 				    IS_P2ALIGNED(pseg->s_size, pgsz)));
    728 				return (0);
    729 			}
    730 		}
    731 
    732 		/*
    733 		 * Failed, so try to concatenate with following seg
    734 		 */
    735 		nseg = AS_SEGNEXT(seg->s_as, seg);
    736 		if (nseg != NULL &&
    737 		    seg->s_base + seg->s_size == nseg->s_base &&
    738 		    nseg->s_ops == &segvn_ops) {
    739 			/*
    740 			 * Get memory allocation policy from next segment.
    741 			 * When extension is specified (e.g. for stack) apply
    742 			 * this policy to the new segment regardless of the
    743 			 * outcome of segment concatenation.  Extension occurs
    744 			 * for non-default policy otherwise default policy is
    745 			 * used and is based on extended segment size.
    746 			 */
    747 			nsvd = (struct segvn_data *)nseg->s_data;
    748 			npolicy = nsvd->policy_info.mem_policy;
    749 			if (lgrp_mem_policy_flags ==
    750 			    LGRP_MP_FLAG_EXTEND_DOWN) {
    751 				if (npolicy != lgrp_mem_default_policy) {
    752 					mpolicy = npolicy;
    753 				} else {
    754 					mpolicy = lgrp_mem_policy_default(
    755 					    nseg->s_size + seg->s_size,
    756 					    a->type);
    757 				}
    758 			}
    759 
    760 			if (mpolicy == npolicy &&
    761 			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
    762 				crfree(cred);
    763 				ASSERT(nseg->s_szc == 0 ||
    764 				    (a->szc == nseg->s_szc &&
    765 				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
    766 				    IS_P2ALIGNED(nseg->s_size, pgsz)));
    767 				return (0);
    768 			}
    769 		}
    770 	}
    771 
    772 	if (a->vp != NULL) {
    773 		VN_HOLD(a->vp);
    774 		if (a->type == MAP_SHARED)
    775 			lgrp_shm_policy_init(NULL, a->vp);
    776 	}
    777 	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
    778 
    779 	seg->s_ops = &segvn_ops;
    780 	seg->s_data = (void *)svd;
    781 	seg->s_szc = a->szc;
    782 
    783 	svd->seg = seg;
    784 	svd->vp = a->vp;
    785 	/*
    786 	 * Anonymous mappings have no backing file so the offset is meaningless.
    787 	 */
    788 	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
    789 	svd->prot = a->prot;
    790 	svd->maxprot = a->maxprot;
    791 	svd->pageprot = 0;
    792 	svd->type = a->type;
    793 	svd->vpage = NULL;
    794 	svd->cred = cred;
    795 	svd->advice = MADV_NORMAL;
    796 	svd->pageadvice = 0;
    797 	svd->flags = (ushort_t)a->flags;
    798 	svd->softlockcnt = 0;
    799 	svd->softlockcnt_sbase = 0;
    800 	svd->softlockcnt_send = 0;
    801 	svd->rcookie = HAT_INVALID_REGION_COOKIE;
    802 	svd->pageswap = 0;
    803 
    804 	if (a->szc != 0 && a->vp != NULL) {
    805 		segvn_setvnode_mpss(a->vp);
    806 	}
    807 	if (svd->type == MAP_SHARED && svd->vp != NULL &&
    808 	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
    809 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
    810 		segvn_inval_trcache(svd->vp);
    811 	}
    812 
    813 	amp = a->amp;
    814 	if ((svd->amp = amp) == NULL) {
    815 		svd->anon_index = 0;
    816 		if (svd->type == MAP_SHARED) {
    817 			svd->swresv = 0;
    818 			/*
    819 			 * Shared mappings to a vp need no other setup.
    820 			 * If we have a shared mapping to an anon_map object
    821 			 * which hasn't been allocated yet,  allocate the
    822 			 * struct now so that it will be properly shared
    823 			 * by remembering the swap reservation there.
    824 			 */
    825 			if (a->vp == NULL) {
    826 				svd->amp = anonmap_alloc(seg->s_size, swresv,
    827 				    ANON_SLEEP);
    828 				svd->amp->a_szc = seg->s_szc;
    829 			}
    830 		} else {
    831 			/*
    832 			 * Private mapping (with or without a vp).
    833 			 * Allocate anon_map when needed.
    834 			 */
    835 			svd->swresv = swresv;
    836 		}
    837 	} else {
    838 		pgcnt_t anon_num;
    839 
    840 		/*
    841 		 * Mapping to an existing anon_map structure without a vp.
    842 		 * For now we will insure that the segment size isn't larger
    843 		 * than the size - offset gives us.  Later on we may wish to
    844 		 * have the anon array dynamically allocated itself so that
    845 		 * we don't always have to allocate all the anon pointer slots.
    846 		 * This of course involves adding extra code to check that we
    847 		 * aren't trying to use an anon pointer slot beyond the end
    848 		 * of the currently allocated anon array.
    849 		 */
    850 		if ((amp->size - a->offset) < seg->s_size) {
    851 			panic("segvn_create anon_map size");
    852 			/*NOTREACHED*/
    853 		}
    854 
    855 		anon_num = btopr(a->offset);
    856 
    857 		if (a->type == MAP_SHARED) {
    858 			/*
    859 			 * SHARED mapping to a given anon_map.
    860 			 */
    861 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    862 			amp->refcnt++;
    863 			if (a->szc > amp->a_szc) {
    864 				amp->a_szc = a->szc;
    865 			}
    866 			ANON_LOCK_EXIT(&amp->a_rwlock);
    867 			svd->anon_index = anon_num;
    868 			svd->swresv = 0;
    869 		} else {
    870 			/*
    871 			 * PRIVATE mapping to a given anon_map.
    872 			 * Make sure that all the needed anon
    873 			 * structures are created (so that we will
    874 			 * share the underlying pages if nothing
    875 			 * is written by this mapping) and then
    876 			 * duplicate the anon array as is done
    877 			 * when a privately mapped segment is dup'ed.
    878 			 */
    879 			struct anon *ap;
    880 			caddr_t addr;
    881 			caddr_t eaddr;
    882 			ulong_t	anon_idx;
    883 			int hat_flag = HAT_LOAD;
    884 
    885 			if (svd->flags & MAP_TEXT) {
    886 				hat_flag |= HAT_LOAD_TEXT;
    887 			}
    888 
    889 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
    890 			svd->amp->a_szc = seg->s_szc;
    891 			svd->anon_index = 0;
    892 			svd->swresv = swresv;
    893 
    894 			/*
    895 			 * Prevent 2 threads from allocating anon
    896 			 * slots simultaneously.
    897 			 */
    898 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    899 			eaddr = seg->s_base + seg->s_size;
    900 
    901 			for (anon_idx = anon_num, addr = seg->s_base;
    902 			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
    903 				page_t *pp;
    904 
    905 				if ((ap = anon_get_ptr(amp->ahp,
    906 				    anon_idx)) != NULL)
    907 					continue;
    908 
    909 				/*
    910 				 * Allocate the anon struct now.
    911 				 * Might as well load up translation
    912 				 * to the page while we're at it...
    913 				 */
    914 				pp = anon_zero(seg, addr, &ap, cred);
    915 				if (ap == NULL || pp == NULL) {
    916 					panic("segvn_create anon_zero");
    917 					/*NOTREACHED*/
    918 				}
    919 
    920 				/*
    921 				 * Re-acquire the anon_map lock and
    922 				 * initialize the anon array entry.
    923 				 */
    924 				ASSERT(anon_get_ptr(amp->ahp,
    925 				    anon_idx) == NULL);
    926 				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
    927 				    ANON_SLEEP);
    928 
    929 				ASSERT(seg->s_szc == 0);
    930 				ASSERT(!IS_VMODSORT(pp->p_vnode));
    931 
    932 				ASSERT(use_rgn == 0);
    933 				hat_memload(seg->s_as->a_hat, addr, pp,
    934 				    svd->prot & ~PROT_WRITE, hat_flag);
    935 
    936 				page_unlock(pp);
    937 			}
    938 			ASSERT(seg->s_szc == 0);
    939 			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
    940 			    0, seg->s_size);
    941 			ANON_LOCK_EXIT(&amp->a_rwlock);
    942 		}
    943 	}
    944 
    945 	/*
    946 	 * Set default memory allocation policy for segment
    947 	 *
    948 	 * Always set policy for private memory at least for initialization
    949 	 * even if this is a shared memory segment
    950 	 */
    951 	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
    952 
    953 	if (svd->type == MAP_SHARED)
    954 		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
    955 		    svd->vp, svd->offset, seg->s_size);
    956 
    957 	if (use_rgn) {
    958 		ASSERT(!trok);
    959 		ASSERT(svd->amp == NULL);
    960 		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
    961 		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
    962 		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
    963 		    HAT_REGION_TEXT);
    964 	}
    965 
    966 	ASSERT(!trok || !(svd->prot & PROT_WRITE));
    967 	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
    968 
    969 	return (0);
    970 }
    971 
    972 /*
    973  * Concatenate two existing segments, if possible.
    974  * Return 0 on success, -1 if two segments are not compatible
    975  * or -2 on memory allocation failure.
    976  * If amp_cat == 1 then try and concat segments with anon maps
    977  */
    978 static int
    979 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
    980 {
    981 	struct segvn_data *svd1 = seg1->s_data;
    982 	struct segvn_data *svd2 = seg2->s_data;
    983 	struct anon_map *amp1 = svd1->amp;
    984 	struct anon_map *amp2 = svd2->amp;
    985 	struct vpage *vpage1 = svd1->vpage;
    986 	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
    987 	size_t size, nvpsize;
    988 	pgcnt_t npages1, npages2;
    989 
    990 	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
    991 	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
    992 	ASSERT(seg1->s_ops == seg2->s_ops);
    993 
    994 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
    995 	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
    996 		return (-1);
    997 	}
    998 
    999 	/* both segments exist, try to merge them */
   1000 #define	incompat(x)	(svd1->x != svd2->x)
   1001 	if (incompat(vp) || incompat(maxprot) ||
   1002 	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
   1003 	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
   1004 	    incompat(type) || incompat(cred) || incompat(flags) ||
   1005 	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
   1006 	    (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
   1007 		return (-1);
   1008 #undef incompat
   1009 
   1010 	/*
   1011 	 * vp == NULL implies zfod, offset doesn't matter
   1012 	 */
   1013 	if (svd1->vp != NULL &&
   1014 	    svd1->offset + seg1->s_size != svd2->offset) {
   1015 		return (-1);
   1016 	}
   1017 
   1018 	/*
   1019 	 * Don't concatenate if either segment uses text replication.
   1020 	 */
   1021 	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
   1022 		return (-1);
   1023 	}
   1024 
   1025 	/*
   1026 	 * Fail early if we're not supposed to concatenate
   1027 	 * segments with non NULL amp.
   1028 	 */
   1029 	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
   1030 		return (-1);
   1031 	}
   1032 
   1033 	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
   1034 		if (amp1 != amp2) {
   1035 			return (-1);
   1036 		}
   1037 		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
   1038 		    svd2->anon_index) {
   1039 			return (-1);
   1040 		}
   1041 		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
   1042 	}
   1043 
   1044 	/*
   1045 	 * If either seg has vpages, create a new merged vpage array.
   1046 	 */
   1047 	if (vpage1 != NULL || vpage2 != NULL) {
   1048 		struct vpage *vp, *evp;
   1049 
   1050 		npages1 = seg_pages(seg1);
   1051 		npages2 = seg_pages(seg2);
   1052 		nvpsize = vpgtob(npages1 + npages2);
   1053 
   1054 		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
   1055 			return (-2);
   1056 		}
   1057 
   1058 		if (vpage1 != NULL) {
   1059 			bcopy(vpage1, nvpage, vpgtob(npages1));
   1060 		} else {
   1061 			evp = nvpage + npages1;
   1062 			for (vp = nvpage; vp < evp; vp++) {
   1063 				VPP_SETPROT(vp, svd1->prot);
   1064 				VPP_SETADVICE(vp, svd1->advice);
   1065 			}
   1066 		}
   1067 
   1068 		if (vpage2 != NULL) {
   1069 			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
   1070 		} else {
   1071 			evp = nvpage + npages1 + npages2;
   1072 			for (vp = nvpage + npages1; vp < evp; vp++) {
   1073 				VPP_SETPROT(vp, svd2->prot);
   1074 				VPP_SETADVICE(vp, svd2->advice);
   1075 			}
   1076 		}
   1077 
   1078 		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
   1079 			ASSERT(svd1->swresv == seg1->s_size);
   1080 			ASSERT(!(svd1->flags & MAP_NORESERVE));
   1081 			ASSERT(!(svd2->flags & MAP_NORESERVE));
   1082 			evp = nvpage + npages1;
   1083 			for (vp = nvpage; vp < evp; vp++) {
   1084 				VPP_SETSWAPRES(vp);
   1085 			}
   1086 		}
   1087 
   1088 		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
   1089 			ASSERT(svd2->swresv == seg2->s_size);
   1090 			ASSERT(!(svd1->flags & MAP_NORESERVE));
   1091 			ASSERT(!(svd2->flags & MAP_NORESERVE));
   1092 			vp = nvpage + npages1;
   1093 			evp = vp + npages2;
   1094 			for (; vp < evp; vp++) {
   1095 				VPP_SETSWAPRES(vp);
   1096 			}
   1097 		}
   1098 	}
   1099 	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
   1100 	    (svd1->pageswap == 0 && svd2->pageswap == 0));
   1101 
   1102 	/*
   1103 	 * If either segment has private pages, create a new merged anon
   1104 	 * array. If mergeing shared anon segments just decrement anon map's
   1105 	 * refcnt.
   1106 	 */
   1107 	if (amp1 != NULL && svd1->type == MAP_SHARED) {
   1108 		ASSERT(amp1 == amp2 && svd1->vp == NULL);
   1109 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
   1110 		ASSERT(amp1->refcnt >= 2);
   1111 		amp1->refcnt--;
   1112 		ANON_LOCK_EXIT(&amp1->a_rwlock);
   1113 		svd2->amp = NULL;
   1114 	} else if (amp1 != NULL || amp2 != NULL) {
   1115 		struct anon_hdr *nahp;
   1116 		struct anon_map *namp = NULL;
   1117 		size_t asize;
   1118 
   1119 		ASSERT(svd1->type == MAP_PRIVATE);
   1120 
   1121 		asize = seg1->s_size + seg2->s_size;
   1122 		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
   1123 			if (nvpage != NULL) {
   1124 				kmem_free(nvpage, nvpsize);
   1125 			}
   1126 			return (-2);
   1127 		}
   1128 		if (amp1 != NULL) {
   1129 			/*
   1130 			 * XXX anon rwlock is not really needed because
   1131 			 * this is a private segment and we are writers.
   1132 			 */
   1133 			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
   1134 			ASSERT(amp1->refcnt == 1);
   1135 			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
   1136 			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
   1137 				anon_release(nahp, btop(asize));
   1138 				ANON_LOCK_EXIT(&amp1->a_rwlock);
   1139 				if (nvpage != NULL) {
   1140 					kmem_free(nvpage, nvpsize);
   1141 				}
   1142 				return (-2);
   1143 			}
   1144 		}
   1145 		if (amp2 != NULL) {
   1146 			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
   1147 			ASSERT(amp2->refcnt == 1);
   1148 			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
   1149 			    nahp, btop(seg1->s_size), btop(seg2->s_size),
   1150 			    ANON_NOSLEEP)) {
   1151 				anon_release(nahp, btop(asize));
   1152 				ANON_LOCK_EXIT(&amp2->a_rwlock);
   1153 				if (amp1 != NULL) {
   1154 					ANON_LOCK_EXIT(&amp1->a_rwlock);
   1155 				}
   1156 				if (nvpage != NULL) {
   1157 					kmem_free(nvpage, nvpsize);
   1158 				}
   1159 				return (-2);
   1160 			}
   1161 		}
   1162 		if (amp1 != NULL) {
   1163 			namp = amp1;
   1164 			anon_release(amp1->ahp, btop(amp1->size));
   1165 		}
   1166 		if (amp2 != NULL) {
   1167 			if (namp == NULL) {
   1168 				ASSERT(amp1 == NULL);
   1169 				namp = amp2;
   1170 				anon_release(amp2->ahp, btop(amp2->size));
   1171 			} else {
   1172 				amp2->refcnt--;
   1173 				ANON_LOCK_EXIT(&amp2->a_rwlock);
   1174 				anonmap_free(amp2);
   1175 			}
   1176 			svd2->amp = NULL; /* needed for seg_free */
   1177 		}
   1178 		namp->ahp = nahp;
   1179 		namp->size = asize;
   1180 		svd1->amp = namp;
   1181 		svd1->anon_index = 0;
   1182 		ANON_LOCK_EXIT(&namp->a_rwlock);
   1183 	}
   1184 	/*
   1185 	 * Now free the old vpage structures.
   1186 	 */
   1187 	if (nvpage != NULL) {
   1188 		if (vpage1 != NULL) {
   1189 			kmem_free(vpage1, vpgtob(npages1));
   1190 		}
   1191 		if (vpage2 != NULL) {
   1192 			svd2->vpage = NULL;
   1193 			kmem_free(vpage2, vpgtob(npages2));
   1194 		}
   1195 		if (svd2->pageprot) {
   1196 			svd1->pageprot = 1;
   1197 		}
   1198 		if (svd2->pageadvice) {
   1199 			svd1->pageadvice = 1;
   1200 		}
   1201 		if (svd2->pageswap) {
   1202 			svd1->pageswap = 1;
   1203 		}
   1204 		svd1->vpage = nvpage;
   1205 	}
   1206 
   1207 	/* all looks ok, merge segments */
   1208 	svd1->swresv += svd2->swresv;
   1209 	svd2->swresv = 0;  /* so seg_free doesn't release swap space */
   1210 	size = seg2->s_size;
   1211 	seg_free(seg2);
   1212 	seg1->s_size += size;
   1213 	return (0);
   1214 }
   1215 
   1216 /*
   1217  * Extend the previous segment (seg1) to include the
   1218  * new segment (seg2 + a), if possible.
   1219  * Return 0 on success.
   1220  */
   1221 static int
   1222 segvn_extend_prev(seg1, seg2, a, swresv)
   1223 	struct seg *seg1, *seg2;
   1224 	struct segvn_crargs *a;
   1225 	size_t swresv;
   1226 {
   1227 	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
   1228 	size_t size;
   1229 	struct anon_map *amp1;
   1230 	struct vpage *new_vpage;
   1231 
   1232 	/*
   1233 	 * We don't need any segment level locks for "segvn" data
   1234 	 * since the address space is "write" locked.
   1235 	 */
   1236 	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
   1237 
   1238 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
   1239 		return (-1);
   1240 	}
   1241 
   1242 	/* second segment is new, try to extend first */
   1243 	/* XXX - should also check cred */
   1244 	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
   1245 	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
   1246 	    svd1->type != a->type || svd1->flags != a->flags ||
   1247 	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
   1248 		return (-1);
   1249 
   1250 	/* vp == NULL implies zfod, offset doesn't matter */
   1251 	if (svd1->vp != NULL &&
   1252 	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
   1253 		return (-1);
   1254 
   1255 	if (svd1->tr_state != SEGVN_TR_OFF) {
   1256 		return (-1);
   1257 	}
   1258 
   1259 	amp1 = svd1->amp;
   1260 	if (amp1) {
   1261 		pgcnt_t newpgs;
   1262 
   1263 		/*
   1264 		 * Segment has private pages, can data structures
   1265 		 * be expanded?
   1266 		 *
   1267 		 * Acquire the anon_map lock to prevent it from changing,
   1268 		 * if it is shared.  This ensures that the anon_map
   1269 		 * will not change while a thread which has a read/write
   1270 		 * lock on an address space references it.
   1271 		 * XXX - Don't need the anon_map lock at all if "refcnt"
   1272 		 * is 1.
   1273 		 *
   1274 		 * Can't grow a MAP_SHARED segment with an anonmap because
   1275 		 * there may be existing anon slots where we want to extend
   1276 		 * the segment and we wouldn't know what to do with them
   1277 		 * (e.g., for tmpfs right thing is to just leave them there,
   1278 		 * for /dev/zero they should be cleared out).
   1279 		 */
   1280 		if (svd1->type == MAP_SHARED)
   1281 			return (-1);
   1282 
   1283 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
   1284 		if (amp1->refcnt > 1) {
   1285 			ANON_LOCK_EXIT(&amp1->a_rwlock);
   1286 			return (-1);
   1287 		}
   1288 		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
   1289 		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
   1290 
   1291 		if (newpgs == 0) {
   1292 			ANON_LOCK_EXIT(&amp1->a_rwlock);
   1293 			return (-1);
   1294 		}
   1295 		amp1->size = ptob(newpgs);
   1296 		ANON_LOCK_EXIT(&amp1->a_rwlock);
   1297 	}
   1298 	if (svd1->vpage != NULL) {
   1299 		struct vpage *vp, *evp;
   1300 		new_vpage =
   1301 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
   1302 			KM_NOSLEEP);
   1303 		if (new_vpage == NULL)
   1304 			return (-1);
   1305 		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
   1306 		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
   1307 		svd1->vpage = new_vpage;
   1308 
   1309 		vp = new_vpage + seg_pages(seg1);
   1310 		evp = vp + seg_pages(seg2);
   1311 		for (; vp < evp; vp++)
   1312 			VPP_SETPROT(vp, a->prot);
   1313 		if (svd1->pageswap && swresv) {
   1314 			ASSERT(!(svd1->flags & MAP_NORESERVE));
   1315 			ASSERT(swresv == seg2->s_size);
   1316 			vp = new_vpage + seg_pages(seg1);
   1317 			for (; vp < evp; vp++) {
   1318 				VPP_SETSWAPRES(vp);
   1319 			}
   1320 		}
   1321 	}
   1322 	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
   1323 	size = seg2->s_size;
   1324 	seg_free(seg2);
   1325 	seg1->s_size += size;
   1326 	svd1->swresv += swresv;
   1327 	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
   1328 	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
   1329 	    (svd1->vp->v_flag & VVMEXEC)) {
   1330 		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
   1331 		segvn_inval_trcache(svd1->vp);
   1332 	}
   1333 	return (0);
   1334 }
   1335 
   1336 /*
   1337  * Extend the next segment (seg2) to include the
   1338  * new segment (seg1 + a), if possible.
   1339  * Return 0 on success.
   1340  */
   1341 static int
   1342 segvn_extend_next(
   1343 	struct seg *seg1,
   1344 	struct seg *seg2,
   1345 	struct segvn_crargs *a,
   1346 	size_t swresv)
   1347 {
   1348 	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
   1349 	size_t size;
   1350 	struct anon_map *amp2;
   1351 	struct vpage *new_vpage;
   1352 
   1353 	/*
   1354 	 * We don't need any segment level locks for "segvn" data
   1355 	 * since the address space is "write" locked.
   1356 	 */
   1357 	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
   1358 
   1359 	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
   1360 		return (-1);
   1361 	}
   1362 
   1363 	/* first segment is new, try to extend second */
   1364 	/* XXX - should also check cred */
   1365 	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
   1366 	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
   1367 	    svd2->type != a->type || svd2->flags != a->flags ||
   1368 	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
   1369 		return (-1);
   1370 	/* vp == NULL implies zfod, offset doesn't matter */
   1371 	if (svd2->vp != NULL &&
   1372 	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
   1373 		return (-1);
   1374 
   1375 	if (svd2->tr_state != SEGVN_TR_OFF) {
   1376 		return (-1);
   1377 	}
   1378 
   1379 	amp2 = svd2->amp;
   1380 	if (amp2) {
   1381 		pgcnt_t newpgs;
   1382 
   1383 		/*
   1384 		 * Segment has private pages, can data structures
   1385 		 * be expanded?
   1386 		 *
   1387 		 * Acquire the anon_map lock to prevent it from changing,
   1388 		 * if it is shared.  This ensures that the anon_map
   1389 		 * will not change while a thread which has a read/write
   1390 		 * lock on an address space references it.
   1391 		 *
   1392 		 * XXX - Don't need the anon_map lock at all if "refcnt"
   1393 		 * is 1.
   1394 		 */
   1395 		if (svd2->type == MAP_SHARED)
   1396 			return (-1);
   1397 
   1398 		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
   1399 		if (amp2->refcnt > 1) {
   1400 			ANON_LOCK_EXIT(&amp2->a_rwlock);
   1401 			return (-1);
   1402 		}
   1403 		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
   1404 		    btop(seg2->s_size), btop(seg1->s_size),
   1405 		    ANON_NOSLEEP | ANON_GROWDOWN);
   1406 
   1407 		if (newpgs == 0) {
   1408 			ANON_LOCK_EXIT(&amp2->a_rwlock);
   1409 			return (-1);
   1410 		}
   1411 		amp2->size = ptob(newpgs);
   1412 		ANON_LOCK_EXIT(&amp2->a_rwlock);
   1413 	}
   1414 	if (svd2->vpage != NULL) {
   1415 		struct vpage *vp, *evp;
   1416 		new_vpage =
   1417 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
   1418 		    KM_NOSLEEP);
   1419 		if (new_vpage == NULL) {
   1420 			/* Not merging segments so adjust anon_index back */
   1421 			if (amp2)
   1422 				svd2->anon_index += seg_pages(seg1);
   1423 			return (-1);
   1424 		}
   1425 		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
   1426 		    vpgtob(seg_pages(seg2)));
   1427 		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
   1428 		svd2->vpage = new_vpage;
   1429 
   1430 		vp = new_vpage;
   1431 		evp = vp + seg_pages(seg1);
   1432 		for (; vp < evp; vp++)
   1433 			VPP_SETPROT(vp, a->prot);
   1434 		if (svd2->pageswap && swresv) {
   1435 			ASSERT(!(svd2->flags & MAP_NORESERVE));
   1436 			ASSERT(swresv == seg1->s_size);
   1437 			vp = new_vpage;
   1438 			for (; vp < evp; vp++) {
   1439 				VPP_SETSWAPRES(vp);
   1440 			}
   1441 		}
   1442 	}
   1443 	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
   1444 	size = seg1->s_size;
   1445 	seg_free(seg1);
   1446 	seg2->s_size += size;
   1447 	seg2->s_base -= size;
   1448 	svd2->offset -= size;
   1449 	svd2->swresv += swresv;
   1450 	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
   1451 	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
   1452 	    (svd2->vp->v_flag & VVMEXEC)) {
   1453 		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
   1454 		segvn_inval_trcache(svd2->vp);
   1455 	}
   1456 	return (0);
   1457 }
   1458 
   1459 static int
   1460 segvn_dup(struct seg *seg, struct seg *newseg)
   1461 {
   1462 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   1463 	struct segvn_data *newsvd;
   1464 	pgcnt_t npages = seg_pages(seg);
   1465 	int error = 0;
   1466 	uint_t prot;
   1467 	size_t len;
   1468 	struct anon_map *amp;
   1469 
   1470 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   1471 	ASSERT(newseg->s_as->a_proc->p_parent == curproc);
   1472 
   1473 	/*
   1474 	 * If segment has anon reserved, reserve more for the new seg.
   1475 	 * For a MAP_NORESERVE segment swresv will be a count of all the
   1476 	 * allocated anon slots; thus we reserve for the child as many slots
   1477 	 * as the parent has allocated. This semantic prevents the child or
   1478 	 * parent from dieing during a copy-on-write fault caused by trying
   1479 	 * to write a shared pre-existing anon page.
   1480 	 */
   1481 	if ((len = svd->swresv) != 0) {
   1482 		if (anon_resv(svd->swresv) == 0)
   1483 			return (ENOMEM);
   1484 
   1485 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
   1486 		    seg, len, 0);
   1487 	}
   1488 
   1489 	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
   1490 
   1491 	newseg->s_ops = &segvn_ops;
   1492 	newseg->s_data = (void *)newsvd;
   1493 	newseg->s_szc = seg->s_szc;
   1494 
   1495 	newsvd->seg = newseg;
   1496 	if ((newsvd->vp = svd->vp) != NULL) {
   1497 		VN_HOLD(svd->vp);
   1498 		if (svd->type == MAP_SHARED)
   1499 			lgrp_shm_policy_init(NULL, svd->vp);
   1500 	}
   1501 	newsvd->offset = svd->offset;
   1502 	newsvd->prot = svd->prot;
   1503 	newsvd->maxprot = svd->maxprot;
   1504 	newsvd->pageprot = svd->pageprot;
   1505 	newsvd->type = svd->type;
   1506 	newsvd->cred = svd->cred;
   1507 	crhold(newsvd->cred);
   1508 	newsvd->advice = svd->advice;
   1509 	newsvd->pageadvice = svd->pageadvice;
   1510 	newsvd->swresv = svd->swresv;
   1511 	newsvd->pageswap = svd->pageswap;
   1512 	newsvd->flags = svd->flags;
   1513 	newsvd->softlockcnt = 0;
   1514 	newsvd->softlockcnt_sbase = 0;
   1515 	newsvd->softlockcnt_send = 0;
   1516 	newsvd->policy_info = svd->policy_info;
   1517 	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
   1518 
   1519 	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
   1520 		/*
   1521 		 * Not attaching to a shared anon object.
   1522 		 */
   1523 		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
   1524 		    svd->tr_state == SEGVN_TR_OFF);
   1525 		if (svd->tr_state == SEGVN_TR_ON) {
   1526 			ASSERT(newsvd->vp != NULL && amp != NULL);
   1527 			newsvd->tr_state = SEGVN_TR_INIT;
   1528 		} else {
   1529 			newsvd->tr_state = svd->tr_state;
   1530 		}
   1531 		newsvd->amp = NULL;
   1532 		newsvd->anon_index = 0;
   1533 	} else {
   1534 		/* regions for now are only used on pure vnode segments */
   1535 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   1536 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   1537 		newsvd->tr_state = SEGVN_TR_OFF;
   1538 		if (svd->type == MAP_SHARED) {
   1539 			newsvd->amp = amp;
   1540 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   1541 			amp->refcnt++;
   1542 			ANON_LOCK_EXIT(&amp->a_rwlock);
   1543 			newsvd->anon_index = svd->anon_index;
   1544 		} else {
   1545 			int reclaim = 1;
   1546 
   1547 			/*
   1548 			 * Allocate and initialize new anon_map structure.
   1549 			 */
   1550 			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
   1551 			    ANON_SLEEP);
   1552 			newsvd->amp->a_szc = newseg->s_szc;
   1553 			newsvd->anon_index = 0;
   1554 
   1555 			/*
   1556 			 * We don't have to acquire the anon_map lock
   1557 			 * for the new segment (since it belongs to an
   1558 			 * address space that is still not associated
   1559 			 * with any process), or the segment in the old
   1560 			 * address space (since all threads in it
   1561 			 * are stopped while duplicating the address space).
   1562 			 */
   1563 
   1564 			/*
   1565 			 * The goal of the following code is to make sure that
   1566 			 * softlocked pages do not end up as copy on write
   1567 			 * pages.  This would cause problems where one
   1568 			 * thread writes to a page that is COW and a different
   1569 			 * thread in the same process has softlocked it.  The
   1570 			 * softlock lock would move away from this process
   1571 			 * because the write would cause this process to get
   1572 			 * a copy (without the softlock).
   1573 			 *
   1574 			 * The strategy here is to just break the
   1575 			 * sharing on pages that could possibly be
   1576 			 * softlocked.
   1577 			 */
   1578 retry:
   1579 			if (svd->softlockcnt) {
   1580 				struct anon *ap, *newap;
   1581 				size_t i;
   1582 				uint_t vpprot;
   1583 				page_t *anon_pl[1+1], *pp;
   1584 				caddr_t addr;
   1585 				ulong_t old_idx = svd->anon_index;
   1586 				ulong_t new_idx = 0;
   1587 
   1588 				/*
   1589 				 * The softlock count might be non zero
   1590 				 * because some pages are still stuck in the
   1591 				 * cache for lazy reclaim. Flush the cache
   1592 				 * now. This should drop the count to zero.
   1593 				 * [or there is really I/O going on to these
   1594 				 * pages]. Note, we have the writers lock so
   1595 				 * nothing gets inserted during the flush.
   1596 				 */
   1597 				if (reclaim == 1) {
   1598 					segvn_purge(seg);
   1599 					reclaim = 0;
   1600 					goto retry;
   1601 				}
   1602 				i = btopr(seg->s_size);
   1603 				addr = seg->s_base;
   1604 				/*
   1605 				 * XXX break cow sharing using PAGESIZE
   1606 				 * pages. They will be relocated into larger
   1607 				 * pages at fault time.
   1608 				 */
   1609 				while (i-- > 0) {
   1610 					if (ap = anon_get_ptr(amp->ahp,
   1611 					    old_idx)) {
   1612 						error = anon_getpage(&ap,
   1613 						    &vpprot, anon_pl, PAGESIZE,
   1614 						    seg, addr, S_READ,
   1615 						    svd->cred);
   1616 						if (error) {
   1617 							newsvd->vpage = NULL;
   1618 							goto out;
   1619 						}
   1620 						/*
   1621 						 * prot need not be computed
   1622 						 * below 'cause anon_private is
   1623 						 * going to ignore it anyway
   1624 						 * as child doesn't inherit
   1625 						 * pagelock from parent.
   1626 						 */
   1627 						prot = svd->pageprot ?
   1628 						    VPP_PROT(
   1629 						    &svd->vpage[
   1630 						    seg_page(seg, addr)])
   1631 						    : svd->prot;
   1632 						pp = anon_private(&newap,
   1633 						    newseg, addr, prot,
   1634 						    anon_pl[0],	0,
   1635 						    newsvd->cred);
   1636 						if (pp == NULL) {
   1637 							/* no mem abort */
   1638 							newsvd->vpage = NULL;
   1639 							error = ENOMEM;
   1640 							goto out;
   1641 						}
   1642 						(void) anon_set_ptr(
   1643 						    newsvd->amp->ahp, new_idx,
   1644 						    newap, ANON_SLEEP);
   1645 						page_unlock(pp);
   1646 					}
   1647 					addr += PAGESIZE;
   1648 					old_idx++;
   1649 					new_idx++;
   1650 				}
   1651 			} else {	/* common case */
   1652 				if (seg->s_szc != 0) {
   1653 					/*
   1654 					 * If at least one of anon slots of a
   1655 					 * large page exists then make sure
   1656 					 * all anon slots of a large page
   1657 					 * exist to avoid partial cow sharing
   1658 					 * of a large page in the future.
   1659 					 */
   1660 					anon_dup_fill_holes(amp->ahp,
   1661 					    svd->anon_index, newsvd->amp->ahp,
   1662 					    0, seg->s_size, seg->s_szc,
   1663 					    svd->vp != NULL);
   1664 				} else {
   1665 					anon_dup(amp->ahp, svd->anon_index,
   1666 					    newsvd->amp->ahp, 0, seg->s_size);
   1667 				}
   1668 
   1669 				hat_clrattr(seg->s_as->a_hat, seg->s_base,
   1670 				    seg->s_size, PROT_WRITE);
   1671 			}
   1672 		}
   1673 	}
   1674 	/*
   1675 	 * If necessary, create a vpage structure for the new segment.
   1676 	 * Do not copy any page lock indications.
   1677 	 */
   1678 	if (svd->vpage != NULL) {
   1679 		uint_t i;
   1680 		struct vpage *ovp = svd->vpage;
   1681 		struct vpage *nvp;
   1682 
   1683 		nvp = newsvd->vpage =
   1684 		    kmem_alloc(vpgtob(npages), KM_SLEEP);
   1685 		for (i = 0; i < npages; i++) {
   1686 			*nvp = *ovp++;
   1687 			VPP_CLRPPLOCK(nvp++);
   1688 		}
   1689 	} else
   1690 		newsvd->vpage = NULL;
   1691 
   1692 	/* Inform the vnode of the new mapping */
   1693 	if (newsvd->vp != NULL) {
   1694 		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
   1695 		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
   1696 		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
   1697 	}
   1698 out:
   1699 	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   1700 		ASSERT(newsvd->amp == NULL);
   1701 		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
   1702 		newsvd->rcookie = svd->rcookie;
   1703 		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
   1704 	}
   1705 	return (error);
   1706 }
   1707 
   1708 
   1709 /*
   1710  * callback function to invoke free_vp_pages() for only those pages actually
   1711  * processed by the HAT when a shared region is destroyed.
   1712  */
   1713 extern int free_pages;
   1714 
   1715 static void
   1716 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
   1717     size_t r_size, void *r_obj, u_offset_t r_objoff)
   1718 {
   1719 	u_offset_t off;
   1720 	size_t len;
   1721 	vnode_t *vp = (vnode_t *)r_obj;
   1722 
   1723 	ASSERT(eaddr > saddr);
   1724 	ASSERT(saddr >= r_saddr);
   1725 	ASSERT(saddr < r_saddr + r_size);
   1726 	ASSERT(eaddr > r_saddr);
   1727 	ASSERT(eaddr <= r_saddr + r_size);
   1728 	ASSERT(vp != NULL);
   1729 
   1730 	if (!free_pages) {
   1731 		return;
   1732 	}
   1733 
   1734 	len = eaddr - saddr;
   1735 	off = (saddr - r_saddr) + r_objoff;
   1736 	free_vp_pages(vp, off, len);
   1737 }
   1738 
   1739 /*
   1740  * callback function used by segvn_unmap to invoke free_vp_pages() for only
   1741  * those pages actually processed by the HAT
   1742  */
   1743 static void
   1744 segvn_hat_unload_callback(hat_callback_t *cb)
   1745 {
   1746 	struct seg		*seg = cb->hcb_data;
   1747 	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
   1748 	size_t			len;
   1749 	u_offset_t		off;
   1750 
   1751 	ASSERT(svd->vp != NULL);
   1752 	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
   1753 	ASSERT(cb->hcb_start_addr >= seg->s_base);
   1754 
   1755 	len = cb->hcb_end_addr - cb->hcb_start_addr;
   1756 	off = cb->hcb_start_addr - seg->s_base;
   1757 	free_vp_pages(svd->vp, svd->offset + off, len);
   1758 }
   1759 
   1760 /*
   1761  * This function determines the number of bytes of swap reserved by
   1762  * a segment for which per-page accounting is present. It is used to
   1763  * calculate the correct value of a segvn_data's swresv.
   1764  */
   1765 static size_t
   1766 segvn_count_swap_by_vpages(struct seg *seg)
   1767 {
   1768 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   1769 	struct vpage *vp, *evp;
   1770 	size_t nswappages = 0;
   1771 
   1772 	ASSERT(svd->pageswap);
   1773 	ASSERT(svd->vpage != NULL);
   1774 
   1775 	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
   1776 
   1777 	for (vp = svd->vpage; vp < evp; vp++) {
   1778 		if (VPP_ISSWAPRES(vp))
   1779 			nswappages++;
   1780 	}
   1781 
   1782 	return (nswappages << PAGESHIFT);
   1783 }
   1784 
        /*
         * Unmap the range [addr, addr + len) from a segvn segment.
         *
         * The range must be page aligned and lie entirely within seg; anything
         * else panics.  Four shapes of request are handled:
         *   - the whole segment: the segment is released with seg_free();
         *   - the beginning:     s_base moves up and s_size shrinks (the vnode
         *                        offset and anon_index advance as applicable);
         *   - the end:           s_size shrinks;
         *   - the middle:        seg keeps the low part and a new segment (nseg)
         *                        is allocated for the high part.
         * In the partial cases the vpage array, the anon slots and the swap
         * reservation are trimmed or split to match.
         *
         * Returns 0 on success; EAGAIN if pages are still SOFTLOCKed through
         * this mapping or VOP_DELMAP() reports EAGAIN; IE_RETRY after a large
         * page segment has been successfully demoted because addr/len are not
         * aligned to its page size (or the segvn_demote_range() error if the
         * demotion fails).
         */
   1785 static int
   1786 segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
   1787 {
   1788 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   1789 	struct segvn_data *nsvd;
   1790 	struct seg *nseg;
   1791 	struct anon_map *amp;
   1792 	pgcnt_t	opages;		/* old segment size in pages */
   1793 	pgcnt_t	npages;		/* new segment size in pages */
   1794 	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
   1795 	hat_callback_t callback;	/* used for free_vp_pages() */
   1796 	hat_callback_t *cbp = NULL;
   1797 	caddr_t nbase;
   1798 	size_t nsize;
   1799 	size_t oswresv;
   1800 	int reclaim = 1;
   1801 
   1802 	/*
   1803 	 * We don't need any segment level locks for "segvn" data
   1804 	 * since the address space is "write" locked.
   1805 	 */
   1806 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   1807 
   1808 	/*
   1809 	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
   1810 	 * softlockcnt is protected from change by the as write lock.
   1811 	 */
   1812 retry:
   1813 	if (svd->softlockcnt > 0) {
   1814 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   1815 
   1816 		/*
   1817 		 * If this is shared segment non 0 softlockcnt
   1818 		 * means locked pages are still in use.
   1819 		 */
   1820 		if (svd->type == MAP_SHARED) {
   1821 			return (EAGAIN);
   1822 		}
   1823 
   1824 		/*
   1825 		 * since we do have the writers lock nobody can fill
   1826 		 * the cache during the purge. The flush either succeeds
   1827 		 * or we still have pending I/Os.
   1828 		 */
        		/* "reclaim" limits us to a single purge-and-retry pass. */
   1829 		if (reclaim == 1) {
   1830 			segvn_purge(seg);
   1831 			reclaim = 0;
   1832 			goto retry;
   1833 		}
   1834 		return (EAGAIN);
   1835 	}
   1836 
   1837 	/*
   1838 	 * Check for bad sizes
   1839 	 */
   1840 	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
   1841 	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
   1842 		panic("segvn_unmap");
   1843 		/*NOTREACHED*/
   1844 	}
   1845 
        	/*
        	 * A large page segment can only be unmapped in units of its own
        	 * page size.  For a misaligned request, detach any shared region
        	 * or text replication state, demote the range and, if that
        	 * succeeds, return IE_RETRY.
        	 */
   1846 	if (seg->s_szc != 0) {
   1847 		size_t pgsz = page_get_pagesize(seg->s_szc);
   1848 		int err;
   1849 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
   1850 			ASSERT(seg->s_base != addr || seg->s_size != len);
   1851 			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   1852 				ASSERT(svd->amp == NULL);
   1853 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
   1854 				hat_leave_region(seg->s_as->a_hat,
   1855 				    svd->rcookie, HAT_REGION_TEXT);
   1856 				svd->rcookie = HAT_INVALID_REGION_COOKIE;
   1857 				/*
   1858 				 * could pass a flag to segvn_demote_range()
   1859 				 * below to tell it not to do any unloads but
   1860 				 * this case is rare enough to not bother for
   1861 				 * now.
   1862 				 */
   1863 			} else if (svd->tr_state == SEGVN_TR_INIT) {
   1864 				svd->tr_state = SEGVN_TR_OFF;
   1865 			} else if (svd->tr_state == SEGVN_TR_ON) {
   1866 				ASSERT(svd->amp != NULL);
   1867 				segvn_textunrepl(seg, 1);
   1868 				ASSERT(svd->amp == NULL);
   1869 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
   1870 			}
   1871 			VM_STAT_ADD(segvnvmstats.demoterange[0]);
   1872 			err = segvn_demote_range(seg, addr, len, SDR_END, 0);
   1873 			if (err == 0) {
   1874 				return (IE_RETRY);
   1875 			}
   1876 			return (err);
   1877 		}
   1878 	}
   1879 
   1880 	/* Inform the vnode of the unmapping. */
   1881 	if (svd->vp) {
   1882 		int error;
   1883 
   1884 		error = VOP_DELMAP(svd->vp,
   1885 		    (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
   1886 		    seg->s_as, addr, len, svd->prot, svd->maxprot,
   1887 		    svd->type, svd->cred, NULL);
   1888 
        		/* Only EAGAIN aborts the unmap; other errors are ignored. */
   1889 		if (error == EAGAIN)
   1890 			return (error);
   1891 	}
   1892 
   1893 	/*
   1894 	 * Remove any page locks set through this mapping.
   1895 	 * If text replication is not off no page locks could have been
   1896 	 * established via this mapping.
   1897 	 */
   1898 	if (svd->tr_state == SEGVN_TR_OFF) {
   1899 		(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
   1900 	}
   1901 
   1902 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   1903 		ASSERT(svd->amp == NULL);
   1904 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   1905 		ASSERT(svd->type == MAP_PRIVATE);
   1906 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
   1907 		    HAT_REGION_TEXT);
   1908 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
   1909 	} else if (svd->tr_state == SEGVN_TR_ON) {
   1910 		ASSERT(svd->amp != NULL);
   1911 		ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
   1912 		segvn_textunrepl(seg, 1);
   1913 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
   1914 	} else {
   1915 		if (svd->tr_state != SEGVN_TR_OFF) {
   1916 			ASSERT(svd->tr_state == SEGVN_TR_INIT);
   1917 			svd->tr_state = SEGVN_TR_OFF;
   1918 		}
   1919 		/*
   1920 		 * Unload any hardware translations in the range to be taken
   1921 		 * out. Use a callback to invoke free_vp_pages() effectively.
   1922 		 */
        		/* cbp stays NULL (no callback) unless both conditions hold. */
   1923 		if (svd->vp != NULL && free_pages != 0) {
   1924 			callback.hcb_data = seg;
   1925 			callback.hcb_function = segvn_hat_unload_callback;
   1926 			cbp = &callback;
   1927 		}
   1928 		hat_unload_callback(seg->s_as->a_hat, addr, len,
   1929 		    HAT_UNLOAD_UNMAP, cbp);
   1930 
   1931 		if (svd->type == MAP_SHARED && svd->vp != NULL &&
   1932 		    (svd->vp->v_flag & VVMEXEC) &&
   1933 		    ((svd->prot & PROT_WRITE) || svd->pageprot)) {
   1934 			segvn_inval_trcache(svd->vp);
   1935 		}
   1936 	}
   1937 
   1938 	/*
   1939 	 * Check for entire segment
   1940 	 */
   1941 	if (addr == seg->s_base && len == seg->s_size) {
   1942 		seg_free(seg);
   1943 		return (0);
   1944 	}
   1945 
   1946 	opages = seg_pages(seg);
   1947 	dpages = btop(len);
   1948 	npages = opages - dpages;
   1949 	amp = svd->amp;
   1950 	ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
   1951 
   1952 	/*
   1953 	 * Check for beginning of segment
   1954 	 */
   1955 	if (addr == seg->s_base) {
   1956 		if (svd->vpage != NULL) {
   1957 			size_t nbytes;
   1958 			struct vpage *ovpage;
   1959 
   1960 			ovpage = svd->vpage;	/* keep pointer to vpage */
   1961 
   1962 			nbytes = vpgtob(npages);
   1963 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
        			/* keep the entries that follow the unmapped pages */
   1964 			bcopy(&ovpage[dpages], svd->vpage, nbytes);
   1965 
   1966 			/* free up old vpage */
   1967 			kmem_free(ovpage, vpgtob(opages));
   1968 		}
   1969 		if (amp != NULL) {
   1970 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   1971 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
   1972 				/*
   1973 				 * Shared anon map is no longer in use. Before
   1974 				 * freeing its pages purge all entries from
   1975 				 * pcache that belong to this amp.
   1976 				 */
   1977 				if (svd->type == MAP_SHARED) {
   1978 					ASSERT(amp->refcnt == 1);
   1979 					ASSERT(svd->softlockcnt == 0);
   1980 					anonmap_purge(amp);
   1981 				}
   1982 				/*
   1983 				 * Free up now unused parts of anon_map array.
   1984 				 */
   1985 				if (amp->a_szc == seg->s_szc) {
   1986 					if (seg->s_szc != 0) {
   1987 						anon_free_pages(amp->ahp,
   1988 						    svd->anon_index, len,
   1989 						    seg->s_szc);
   1990 					} else {
   1991 						anon_free(amp->ahp,
   1992 						    svd->anon_index,
   1993 						    len);
   1994 					}
   1995 				} else {
   1996 					ASSERT(svd->type == MAP_SHARED);
   1997 					ASSERT(amp->a_szc > seg->s_szc);
   1998 					anon_shmap_free_pages(amp,
   1999 					    svd->anon_index, len);
   2000 				}
   2001 
   2002 				/*
   2003 				 * Unreserve swap space for the
   2004 				 * unmapped chunk of this segment in
   2005 				 * case it's MAP_SHARED
   2006 				 */
   2007 				if (svd->type == MAP_SHARED) {
   2008 					anon_unresv_zone(len,
   2009 					    seg->s_as->a_proc->p_zone);
   2010 					amp->swresv -= len;
   2011 				}
   2012 			}
   2013 			ANON_LOCK_EXIT(&amp->a_rwlock);
        			/* the segment now starts dpages further into the anon array */
   2014 			svd->anon_index += dpages;
   2015 		}
   2016 		if (svd->vp != NULL)
   2017 			svd->offset += len;
   2018 
   2019 		seg->s_base += len;
   2020 		seg->s_size -= len;
   2021 
   2022 		if (svd->swresv) {
   2023 			if (svd->flags & MAP_NORESERVE) {
   2024 				ASSERT(amp);
   2025 				oswresv = svd->swresv;
   2026 
   2027 				svd->swresv = ptob(anon_pages(amp->ahp,
   2028 				    svd->anon_index, npages));
   2029 				anon_unresv_zone(oswresv - svd->swresv,
   2030 				    seg->s_as->a_proc->p_zone);
   2031 				if (SEG_IS_PARTIAL_RESV(seg))
   2032 					seg->s_as->a_resvsize -= oswresv -
   2033 					    svd->swresv;
   2034 			} else {
   2035 				size_t unlen;
   2036 
   2037 				if (svd->pageswap) {
   2038 					oswresv = svd->swresv;
   2039 					svd->swresv =
   2040 					    segvn_count_swap_by_vpages(seg);
   2041 					ASSERT(oswresv >= svd->swresv);
   2042 					unlen = oswresv - svd->swresv;
   2043 				} else {
   2044 					svd->swresv -= len;
   2045 					ASSERT(svd->swresv == seg->s_size);
   2046 					unlen = len;
   2047 				}
   2048 				anon_unresv_zone(unlen,
   2049 				    seg->s_as->a_proc->p_zone);
   2050 			}
   2051 			TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
   2052 			    seg, len, 0);
   2053 		}
   2054 
   2055 		return (0);
   2056 	}
   2057 
   2058 	/*
   2059 	 * Check for end of segment
   2060 	 */
   2061 	if (addr + len == seg->s_base + seg->s_size) {
   2062 		if (svd->vpage != NULL) {
   2063 			size_t nbytes;
   2064 			struct vpage *ovpage;
   2065 
   2066 			ovpage = svd->vpage;	/* keep pointer to vpage */
   2067 
   2068 			nbytes = vpgtob(npages);
   2069 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
        			/* keep the entries that precede the unmapped pages */
   2070 			bcopy(ovpage, svd->vpage, nbytes);
   2071 
   2072 			/* free up old vpage */
   2073 			kmem_free(ovpage, vpgtob(opages));
   2074 
   2075 		}
   2076 		if (amp != NULL) {
   2077 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   2078 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
   2079 				/*
   2080 				 * Free up now unused parts of anon_map array.
   2081 				 */
   2082 				ulong_t an_idx = svd->anon_index + npages;
   2083 
   2084 				/*
   2085 				 * Shared anon map is no longer in use. Before
   2086 				 * freeing its pages purge all entries from
   2087 				 * pcache that belong to this amp.
   2088 				 */
   2089 				if (svd->type == MAP_SHARED) {
   2090 					ASSERT(amp->refcnt == 1);
   2091 					ASSERT(svd->softlockcnt == 0);
   2092 					anonmap_purge(amp);
   2093 				}
   2094 
   2095 				if (amp->a_szc == seg->s_szc) {
   2096 					if (seg->s_szc != 0) {
   2097 						anon_free_pages(amp->ahp,
   2098 						    an_idx, len,
   2099 						    seg->s_szc);
   2100 					} else {
   2101 						anon_free(amp->ahp, an_idx,
   2102 						    len);
   2103 					}
   2104 				} else {
   2105 					ASSERT(svd->type == MAP_SHARED);
   2106 					ASSERT(amp->a_szc > seg->s_szc);
   2107 					anon_shmap_free_pages(amp,
   2108 					    an_idx, len);
   2109 				}
   2110 
   2111 				/*
   2112 				 * Unreserve swap space for the
   2113 				 * unmapped chunk of this segment in
   2114 				 * case it's MAP_SHARED
   2115 				 */
   2116 				if (svd->type == MAP_SHARED) {
   2117 					anon_unresv_zone(len,
   2118 					    seg->s_as->a_proc->p_zone);
   2119 					amp->swresv -= len;
   2120 				}
   2121 			}
   2122 			ANON_LOCK_EXIT(&amp->a_rwlock);
   2123 		}
   2124 
   2125 		seg->s_size -= len;
   2126 
   2127 		if (svd->swresv) {
   2128 			if (svd->flags & MAP_NORESERVE) {
   2129 				ASSERT(amp);
   2130 				oswresv = svd->swresv;
   2131 				svd->swresv = ptob(anon_pages(amp->ahp,
   2132 				    svd->anon_index, npages));
   2133 				anon_unresv_zone(oswresv - svd->swresv,
   2134 				    seg->s_as->a_proc->p_zone);
   2135 				if (SEG_IS_PARTIAL_RESV(seg))
   2136 					seg->s_as->a_resvsize -= oswresv -
   2137 					    svd->swresv;
   2138 			} else {
   2139 				size_t unlen;
   2140 
   2141 				if (svd->pageswap) {
   2142 					oswresv = svd->swresv;
   2143 					svd->swresv =
   2144 					    segvn_count_swap_by_vpages(seg);
   2145 					ASSERT(oswresv >= svd->swresv);
   2146 					unlen = oswresv - svd->swresv;
   2147 				} else {
   2148 					svd->swresv -= len;
   2149 					ASSERT(svd->swresv == seg->s_size);
   2150 					unlen = len;
   2151 				}
   2152 				anon_unresv_zone(unlen,
   2153 				    seg->s_as->a_proc->p_zone);
   2154 			}
   2155 			TRACE_3(TR_FAC_VM, TR_ANON_PROC,
   2156 			    "anon proc:%p %lu %u", seg, len, 0);
   2157 		}
   2158 
   2159 		return (0);
   2160 	}
   2161 
   2162 	/*
   2163 	 * The section to go is in the middle of the segment,
   2164 	 * have to make it into two segments.  nseg is made for
   2165 	 * the high end while seg is cut down at the low end.
   2166 	 */
   2167 	nbase = addr + len;				/* new seg base */
   2168 	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
   2169 	seg->s_size = addr - seg->s_base;		/* shrink old seg */
   2170 	nseg = seg_alloc(seg->s_as, nbase, nsize);
   2171 	if (nseg == NULL) {
   2172 		panic("segvn_unmap seg_alloc");
   2173 		/*NOTREACHED*/
   2174 	}
   2175 	nseg->s_ops = seg->s_ops;
   2176 	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
   2177 	nseg->s_data = (void *)nsvd;
   2178 	nseg->s_szc = seg->s_szc;
        	/*
        	 * Start from a copy of the old private data, then override the
        	 * fields that must differ for the new segment.
        	 */
   2179 	*nsvd = *svd;
   2180 	nsvd->seg = nseg;
   2181 	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
   2182 	nsvd->swresv = 0;
   2183 	nsvd->softlockcnt = 0;
   2184 	nsvd->softlockcnt_sbase = 0;
   2185 	nsvd->softlockcnt_send = 0;
   2186 	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
   2187 
        	/* The new segment takes its own holds on the vnode and the cred. */
   2188 	if (svd->vp != NULL) {
   2189 		VN_HOLD(nsvd->vp);
   2190 		if (nsvd->type == MAP_SHARED)
   2191 			lgrp_shm_policy_init(NULL, nsvd->vp);
   2192 	}
   2193 	crhold(svd->cred);
   2194 
   2195 	if (svd->vpage == NULL) {
   2196 		nsvd->vpage = NULL;
   2197 	} else {
   2198 		/* need to split vpage into two arrays */
   2199 		size_t nbytes;
   2200 		struct vpage *ovpage;
   2201 
   2202 		ovpage = svd->vpage;		/* keep pointer to vpage */
   2203 
   2204 		npages = seg_pages(seg);	/* seg has shrunk */
   2205 		nbytes = vpgtob(npages);
   2206 		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
   2207 
   2208 		bcopy(ovpage, svd->vpage, nbytes);
   2209 
   2210 		npages = seg_pages(nseg);
   2211 		nbytes = vpgtob(npages);
   2212 		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
   2213 
        		/*
        		 * opages is still the page count of the original segment, so
        		 * opages - npages is the index of nseg's first page in the
        		 * old array.
        		 */
   2214 		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
   2215 
   2216 		/* free up old vpage */
   2217 		kmem_free(ovpage, vpgtob(opages));
   2218 	}
   2219 
   2220 	if (amp == NULL) {
   2221 		nsvd->amp = NULL;
   2222 		nsvd->anon_index = 0;
   2223 	} else {
   2224 		/*
   2225 		 * Need to create a new anon map for the new segment.
   2226 		 * We'll also allocate a new smaller array for the old
   2227 		 * smaller segment to save space.
   2228 		 */
        		/*
        		 * opages is reused here: it becomes the page index of addr
        		 * relative to seg->s_base, i.e. the size in pages of the
        		 * shrunken low segment.
        		 */
   2229 		opages = btop((uintptr_t)(addr - seg->s_base));
   2230 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   2231 		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
   2232 			/*
   2233 			 * Free up now unused parts of anon_map array.
   2234 			 */
   2235 			ulong_t an_idx = svd->anon_index + opages;
   2236 
   2237 			/*
   2238 			 * Shared anon map is no longer in use. Before
   2239 			 * freeing its pages purge all entries from
   2240 			 * pcache that belong to this amp.
   2241 			 */
   2242 			if (svd->type == MAP_SHARED) {
   2243 				ASSERT(amp->refcnt == 1);
   2244 				ASSERT(svd->softlockcnt == 0);
   2245 				anonmap_purge(amp);
   2246 			}
   2247 
   2248 			if (amp->a_szc == seg->s_szc) {
   2249 				if (seg->s_szc != 0) {
   2250 					anon_free_pages(amp->ahp, an_idx, len,
   2251 					    seg->s_szc);
   2252 				} else {
   2253 					anon_free(amp->ahp, an_idx,
   2254 					    len);
   2255 				}
   2256 			} else {
   2257 				ASSERT(svd->type == MAP_SHARED);
   2258 				ASSERT(amp->a_szc > seg->s_szc);
   2259 				anon_shmap_free_pages(amp, an_idx, len);
   2260 			}
   2261 
   2262 			/*
   2263 			 * Unreserve swap space for the
   2264 			 * unmapped chunk of this segment in
   2265 			 * case it's MAP_SHARED
   2266 			 */
   2267 			if (svd->type == MAP_SHARED) {
   2268 				anon_unresv_zone(len,
   2269 				    seg->s_as->a_proc->p_zone);
   2270 				amp->swresv -= len;
   2271 			}
   2272 		}
   2273 		nsvd->anon_index = svd->anon_index +
   2274 		    btop((uintptr_t)(nseg->s_base - seg->s_base));
   2275 		if (svd->type == MAP_SHARED) {
        			/* shared: both segments reference the same anon map */
   2276 			amp->refcnt++;
   2277 			nsvd->amp = amp;
   2278 		} else {
   2279 			struct anon_map *namp;
   2280 			struct anon_hdr *nahp;
   2281 
        			/*
        			 * Private: the old anon map keeps the low part in a
        			 * freshly created header (nahp), the high part is
        			 * copied into a new anon map (namp) for nseg, and the
        			 * old header is released.  Both segments then index
        			 * their anon arrays from 0.
        			 */
   2282 			ASSERT(svd->type == MAP_PRIVATE);
   2283 			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
   2284 			namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
   2285 			namp->a_szc = seg->s_szc;
   2286 			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
   2287 			    0, btop(seg->s_size), ANON_SLEEP);
   2288 			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
   2289 			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
   2290 			anon_release(amp->ahp, btop(amp->size));
   2291 			svd->anon_index = 0;
   2292 			nsvd->anon_index = 0;
   2293 			amp->ahp = nahp;
   2294 			amp->size = seg->s_size;
   2295 			nsvd->amp = namp;
   2296 		}
   2297 		ANON_LOCK_EXIT(&amp->a_rwlock);
   2298 	}
        	/*
        	 * Divide the old swap reservation between the two segments and
        	 * give back whatever is no longer covered by either of them.
        	 */
   2299 	if (svd->swresv) {
   2300 		if (svd->flags & MAP_NORESERVE) {
   2301 			ASSERT(amp);
   2302 			oswresv = svd->swresv;
   2303 			svd->swresv = ptob(anon_pages(amp->ahp,
   2304 			    svd->anon_index, btop(seg->s_size)));
   2305 			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
   2306 			    nsvd->anon_index, btop(nseg->s_size)));
   2307 			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
   2308 			anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
   2309 			    seg->s_as->a_proc->p_zone);
   2310 			if (SEG_IS_PARTIAL_RESV(seg))
   2311 				seg->s_as->a_resvsize -= oswresv -
   2312 				    (svd->swresv + nsvd->swresv);
   2313 		} else {
   2314 			size_t unlen;
   2315 
   2316 			if (svd->pageswap) {
   2317 				oswresv = svd->swresv;
   2318 				svd->swresv = segvn_count_swap_by_vpages(seg);
   2319 				nsvd->swresv = segvn_count_swap_by_vpages(nseg);
   2320 				ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
   2321 				unlen = oswresv - (svd->swresv + nsvd->swresv);
   2322 			} else {
   2323 				if (seg->s_size + nseg->s_size + len !=
   2324 				    svd->swresv) {
   2325 					panic("segvn_unmap: cannot split "
   2326 					    "swap reservation");
   2327 					/*NOTREACHED*/
   2328 				}
   2329 				svd->swresv = seg->s_size;
   2330 				nsvd->swresv = nseg->s_size;
   2331 				unlen = len;
   2332 			}
   2333 			anon_unresv_zone(unlen,
   2334 			    seg->s_as->a_proc->p_zone);
   2335 		}
   2336 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
   2337 		    seg, len, 0);
   2338 	}
   2339 
   2340 	return (0);			/* I'm glad that's all over with! */
   2341 }
   2342 
        /*
         * Tear down the segvn private data of a segment.
         *
         * Unlocks any pages locked through the mapping, frees the vpage array,
         * drops this segment's reference on the anon map (freeing anon slots
         * where appropriate and, on the last reference, the anon map itself),
         * releases the swap reservation, drops the vnode and credential holds
         * and finally returns the segvn_data to segvn_cache.
         *
         * Asserts that the address space is write locked, that text replication
         * is off and that no region cookie is attached to the segment.
         */
   2343 static void
   2344 segvn_free(struct seg *seg)
   2345 {
   2346 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   2347 	pgcnt_t npages = seg_pages(seg);
   2348 	struct anon_map *amp;
   2349 	size_t len;
   2350 
   2351 	/*
   2352 	 * We don't need any segment level locks for "segvn" data
   2353 	 * since the address space is "write" locked.
   2354 	 */
   2355 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   2356 	ASSERT(svd->tr_state == SEGVN_TR_OFF);
   2357 
   2358 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   2359 
   2360 	/*
   2361 	 * Be sure to unlock pages. XXX Why do things get free'ed instead
   2362 	 * of unmapped? XXX
   2363 	 */
   2364 	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
   2365 	    0, MC_UNLOCK, NULL, 0);
   2366 
   2367 	/*
   2368 	 * Deallocate the vpage and anon pointers if necessary and possible.
   2369 	 */
   2370 	if (svd->vpage != NULL) {
   2371 		kmem_free(svd->vpage, vpgtob(npages));
   2372 		svd->vpage = NULL;
   2373 	}
   2374 	if ((amp = svd->amp) != NULL) {
   2375 		/*
   2376 		 * If there are no more references to this anon_map
   2377 		 * structure, then deallocate the structure after freeing
   2378 		 * up all the anon slot pointers that we can.
   2379 		 */
   2380 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   2381 		ASSERT(amp->a_szc >= seg->s_szc);
   2382 		if (--amp->refcnt == 0) {
   2383 			if (svd->type == MAP_PRIVATE) {
   2384 				/*
   2385 				 * Private - we only need to anon_free
   2386 				 * the part that this segment refers to.
   2387 				 */
   2388 				if (seg->s_szc != 0) {
   2389 					anon_free_pages(amp->ahp,
   2390 					    svd->anon_index, seg->s_size,
   2391 					    seg->s_szc);
   2392 				} else {
   2393 					anon_free(amp->ahp, svd->anon_index,
   2394 					    seg->s_size);
   2395 				}
   2396 			} else {
   2397 
   2398 				/*
   2399 				 * Shared anon map is no longer in use. Before
   2400 				 * freeing its pages purge all entries from
   2401 				 * pcache that belong to this amp.
   2402 				 */
   2403 				ASSERT(svd->softlockcnt == 0);
   2404 				anonmap_purge(amp);
   2405 
   2406 				/*
   2407 				 * Shared - anon_free the entire
   2408 				 * anon_map's worth of stuff and
   2409 				 * release any swap reservation.
   2410 				 */
   2411 				if (amp->a_szc != 0) {
   2412 					anon_shmap_free_pages(amp, 0,
   2413 					    amp->size);
   2414 				} else {
   2415 					anon_free(amp->ahp, 0, amp->size);
   2416 				}
   2417 				if ((len = amp->swresv) != 0) {
   2418 					anon_unresv_zone(len,
   2419 					    seg->s_as->a_proc->p_zone);
   2420 					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
   2421 					    "anon proc:%p %lu %u", seg, len, 0);
   2422 				}
   2423 			}
        			/*
        			 * Last reference: drop the lock first, then free the
        			 * anon map structure itself.
        			 */
   2424 			svd->amp = NULL;
   2425 			ANON_LOCK_EXIT(&amp->a_rwlock);
   2426 			anonmap_free(amp);
   2427 		} else if (svd->type == MAP_PRIVATE) {
   2428 			/*
   2429 			 * We had a private mapping which still has
   2430 			 * a held anon_map so just free up all the
   2431 			 * anon slot pointers that we were using.
   2432 			 */
   2433 			if (seg->s_szc != 0) {
   2434 				anon_free_pages(amp->ahp, svd->anon_index,
   2435 				    seg->s_size, seg->s_szc);
   2436 			} else {
   2437 				anon_free(amp->ahp, svd->anon_index,
   2438 				    seg->s_size);
   2439 			}
   2440 			ANON_LOCK_EXIT(&amp->a_rwlock);
   2441 		} else {
        			/* shared map still referenced elsewhere: nothing to free */
   2442 			ANON_LOCK_EXIT(&amp->a_rwlock);
   2443 		}
   2444 	}
   2445 
   2446 	/*
   2447 	 * Release swap reservation.
   2448 	 */
   2449 	if ((len = svd->swresv) != 0) {
   2450 		anon_unresv_zone(svd->swresv,
   2451 		    seg->s_as->a_proc->p_zone);
   2452 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
   2453 		    seg, len, 0);
   2454 		if (SEG_IS_PARTIAL_RESV(seg))
   2455 			seg->s_as->a_resvsize -= svd->swresv;
   2456 		svd->swresv = 0;
   2457 	}
   2458 	/*
   2459 	 * Release claim on vnode, credentials, and finally free the
   2460 	 * private data.
   2461 	 */
   2462 	if (svd->vp != NULL) {
   2463 		if (svd->type == MAP_SHARED)
   2464 			lgrp_shm_policy_fini(NULL, svd->vp);
   2465 		VN_RELE(svd->vp);
   2466 		svd->vp = NULL;
   2467 	}
   2468 	crfree(svd->cred);
   2469 	svd->pageprot = 0;
   2470 	svd->pageadvice = 0;
   2471 	svd->pageswap = 0;
   2472 	svd->cred = NULL;
   2473 
   2474 	/*
   2475 	 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
   2476 	 * still working with this segment without holding as lock (in case
   2477 	 * it's called by pcache async thread).
   2478 	 */
   2479 	ASSERT(svd->softlockcnt == 0);
        	/* The enter/exit pair is used purely as a barrier; nothing is done under it. */
   2480 	mutex_enter(&svd->segfree_syncmtx);
   2481 	mutex_exit(&svd->segfree_syncmtx);
   2482 
   2483 	seg->s_data = NULL;
   2484 	kmem_cache_free(segvn_cache, svd);
   2485 }
   2486 
   2487 /*
   2488  * Do a F_SOFTUNLOCK call over the range requested.  The range must have
   2489  * already been F_SOFTLOCK'ed.
   2490  * Caller must always match addr and len of a softunlock with a previous
   2491  * softlock with exactly the same addr and len.
         *
         * For each page in the range the backing page is looked up (through
         * the anon slot if one exists, otherwise through the segment's vnode),
         * its ref/mod state is updated according to rw (S_WRITE marks it
         * referenced and modified, anything other than S_OTHER marks it
         * referenced) and the page lock is dropped.  A missing page panics.
         * softlockcnt is then reduced by the number of pages in the range and,
         * if it reaches zero, waiting unmappers are woken.
   2492  */
   2493 static void
   2494 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
   2495 {
   2496 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   2497 	page_t *pp;
   2498 	caddr_t adr;
   2499 	struct vnode *vp;
   2500 	u_offset_t offset;
   2501 	ulong_t anon_index;
   2502 	struct anon_map *amp;
   2503 	struct anon *ap = NULL;
   2504 
   2505 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   2506 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
   2507 
        	/* anon_index is only set, and only read below, when amp != NULL */
   2508 	if ((amp = svd->amp) != NULL)
   2509 		anon_index = svd->anon_index + seg_page(seg, addr);
   2510 
        	/* Release the HAT-level locks on the translations first. */
   2511 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   2512 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   2513 		hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
   2514 	} else {
   2515 		hat_unlock(seg->s_as->a_hat, addr, len);
   2516 	}
   2517 	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
        		/* Work out which <vp, offset> names the page at adr. */
   2518 		if (amp != NULL) {
   2519 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   2520 			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
   2521 			    != NULL) {
   2522 				swap_xlate(ap, &vp, &offset);
   2523 			} else {
   2524 				vp = svd->vp;
   2525 				offset = svd->offset +
   2526 				    (uintptr_t)(adr - seg->s_base);
   2527 			}
   2528 			ANON_LOCK_EXIT(&amp->a_rwlock);
   2529 		} else {
   2530 			vp = svd->vp;
   2531 			offset = svd->offset +
   2532 			    (uintptr_t)(adr - seg->s_base);
   2533 		}
   2534 
   2535 		/*
   2536 		 * Use page_find() instead of page_lookup() to
   2537 		 * find the page since we know that it is locked.
   2538 		 */
   2539 		pp = page_find(vp, offset);
   2540 		if (pp == NULL) {
   2541 			panic(
   2542 			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
   2543 			    (void *)adr, (void *)ap, (void *)vp, offset);
   2544 			/*NOTREACHED*/
   2545 		}
   2546 
        		/*
        		 * Record the access on the page, and in the address space's
        		 * statistics when a_vbits is set.
        		 */
   2547 		if (rw == S_WRITE) {
   2548 			hat_setrefmod(pp);
   2549 			if (seg->s_as->a_vbits)
   2550 				hat_setstat(seg->s_as, adr, PAGESIZE,
   2551 				    P_REF | P_MOD);
   2552 		} else if (rw != S_OTHER) {
   2553 			hat_setref(pp);
   2554 			if (seg->s_as->a_vbits)
   2555 				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
   2556 		}
   2557 		TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
   2558 		    "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
   2559 		page_unlock(pp);
   2560 	}
   2561 	ASSERT(svd->softlockcnt >= btop(len));
        	/* Drop the count by btop(len) pages; a result of 0 means none remain. */
   2562 	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
   2563 		/*
   2564 		 * All SOFTLOCKS are gone. Wakeup any waiting
   2565 		 * unmappers so they can try again to unmap.
   2566 		 * Check for waiters first without the mutex
   2567 		 * held so we don't always grab the mutex on
   2568 		 * softunlocks.
   2569 		 */
   2570 		if (AS_ISUNMAPWAIT(seg->s_as)) {
   2571 			mutex_enter(&seg->s_as->a_contents);
        			/* re-check under the mutex before clearing the flag */
   2572 			if (AS_ISUNMAPWAIT(seg->s_as)) {
   2573 				AS_CLRUNMAPWAIT(seg->s_as);
   2574 				cv_broadcast(&seg->s_as->a_cv);
   2575 			}
   2576 			mutex_exit(&seg->s_as->a_contents);
   2577 		}
   2578 	}
   2579 }
   2580 
   2581 #define	PAGE_HANDLED	((page_t *)-1)
   2582 
   2583 /*
   2584  * Release all the pages in the NULL terminated ppp list
   2585  * which haven't already been converted to PAGE_HANDLED.
   2586  */
   2587 static void
   2588 segvn_pagelist_rele(page_t **ppp)
   2589 {
   2590 	for (; *ppp != NULL; ppp++) {
   2591 		if (*ppp != PAGE_HANDLED)
   2592 			page_unlock(*ppp);
   2593 	}
   2594 }
   2595 
        /*
         * NOTE(review): enabled (1) by default.  Presumably a tunable read by
         * segvn_faultpage() when deciding whether a copy-on-write fault may
         * steal the original page (see its "steal" local) -- its use is not
         * visible here; confirm against the rest of that function.
         */
   2596 static int stealcow = 1;
   2597 
   2598 /*
   2599  * Workaround for viking chip bug.  See bug id 1220902.
   2600  * To fix this down in pagefault() would require importing so
   2601  * much as and segvn code as to be unmaintainable.
         * The workaround is disabled (0) by default.
   2602  */
   2603 int enable_mbit_wa = 0;
   2604 
   2605 /*
   2606  * Handles all the dirty work of getting the right
   2607  * anonymous pages and loading up the translations.
   2608  * This routine is called only from segvn_fault()
   2609  * when looping over the range of addresses requested.
   2610  *
   2611  * The basic algorithm here is:
   2612  * 	If this is an anon_zero case
   2613  *		Call anon_zero to allocate page
   2614  *		Load up translation
   2615  *		Return
   2616  *	endif
   2617  *	If this is an anon page
   2618  *		Use anon_getpage to get the page
   2619  *	else
   2620  *		Find page in pl[] list passed in
   2621  *	endif
   2622  *	If not a cow
   2623  *		Load up the translation to the page
   2624  *		return
   2625  *	endif
   2626  *	Call anon_private to handle cow
   2627  *	Load up (writable) translation to new page
         *
         * Returns 0 on success.  Returns FC_PROT if per page protections
         * do not permit the access type; that check is made before the
         * softlock count is bumped and before any lock is taken here.
         * All other failures return FC_MAKE_ERR(err) through the "out"
         * label, which drops the anon array lock if it was taken and
         * undoes the softlockcnt increment for F_SOFTLOCK faults.
         *
         * For non F_SOFTLOCK faults the mapped page is unlocked before
         * returning; for F_SOFTLOCK (HAT_LOAD_LOCK) it is left locked.
   2628  */
   2629 static faultcode_t
   2630 segvn_faultpage(
   2631 	struct hat *hat,		/* the hat to use for mapping */
   2632 	struct seg *seg,		/* seg_vn of interest */
   2633 	caddr_t addr,			/* address in as */
   2634 	u_offset_t off,			/* offset in vp */
   2635 	struct vpage *vpage,		/* pointer to vpage for vp, off */
   2636 	page_t *pl[],			/* object source page pointer */
   2637 	uint_t vpprot,			/* access allowed to object pages */
   2638 	enum fault_type type,		/* type of fault */
   2639 	enum seg_rw rw,			/* type of access at fault */
   2640 	int brkcow)			/* we may need to break cow */
   2641 {
   2642 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   2643 	page_t *pp, **ppp;
   2644 	uint_t pageflags = 0;
   2645 	page_t *anon_pl[1 + 1];
   2646 	page_t *opp = NULL;		/* original page */
   2647 	uint_t prot;
   2648 	int err;
   2649 	int cow;
   2650 	int claim;
   2651 	int steal = 0;
   2652 	ulong_t anon_index;
   2653 	struct anon *ap, *oldap;
   2654 	struct anon_map *amp;
   2655 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
   2656 	int anon_lock = 0;
   2657 	anon_sync_obj_t cookie;
   2658 
   2659 	if (svd->flags & MAP_TEXT) {
   2660 		hat_flag |= HAT_LOAD_TEXT;
   2661 	}
   2662 
   2663 	ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
   2664 	ASSERT(seg->s_szc == 0);
   2665 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
   2666 
   2667 	/*
   2668 	 * Initialize protection value for this page.
   2669 	 * If we have per page protection values check it now.
   2670 	 */
   2671 	if (svd->pageprot) {
   2672 		uint_t protchk;
   2673 
   2674 		switch (rw) {
   2675 		case S_READ:
   2676 			protchk = PROT_READ;
   2677 			break;
   2678 		case S_WRITE:
   2679 			protchk = PROT_WRITE;
   2680 			break;
   2681 		case S_EXEC:
   2682 			protchk = PROT_EXEC;
   2683 			break;
   2684 		case S_OTHER:
   2685 		default:
   2686 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
   2687 			break;
   2688 		}
   2689 
   2690 		prot = VPP_PROT(vpage);
   2691 		if ((prot & protchk) == 0)
   2692 			return (FC_PROT);	/* illegal access type */
   2693 	} else {
   2694 		prot = svd->prot;
   2695 	}
   2696 
        	/*
        	 * Account for the page we are about to soft lock.  Every
        	 * failure from here on goes through "out", which takes
        	 * this count back.
        	 */
   2697 	if (type == F_SOFTLOCK) {
   2698 		atomic_add_long((ulong_t *)&svd->softlockcnt, 1);
   2699 	}
   2700 
   2701 	/*
   2702 	 * Always acquire the anon array lock to prevent 2 threads from
   2703 	 * allocating separate anon slots for the same "addr".
   2704 	 */
   2705 
   2706 	if ((amp = svd->amp) != NULL) {
   2707 		ASSERT(RW_READ_HELD(&amp->a_rwlock));
   2708 		anon_index = svd->anon_index + seg_page(seg, addr);
   2709 		anon_array_enter(amp, anon_index, &cookie);
   2710 		anon_lock = 1;
   2711 	}
   2712 
        	/*
        	 * Zero-fill case: no backing vnode and no anon slot yet
        	 * for this index.
        	 */
   2713 	if (svd->vp == NULL && amp != NULL) {
   2714 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
   2715 			/*
   2716 			 * Allocate a (normally) writable anonymous page of
   2717 			 * zeroes. If no advance reservations, reserve now.
   2718 			 */
   2719 			if (svd->flags & MAP_NORESERVE) {
   2720 				if (anon_resv_zone(ptob(1),
   2721 				    seg->s_as->a_proc->p_zone)) {
   2722 					atomic_add_long(&svd->swresv, ptob(1));
   2723 					atomic_add_long(&seg->s_as->a_resvsize,
   2724 					    ptob(1));
   2725 				} else {
   2726 					err = ENOMEM;
   2727 					goto out;
   2728 				}
   2729 			}
   2730 			if ((pp = anon_zero(seg, addr, &ap,
   2731 			    svd->cred)) == NULL) {
   2732 				err = ENOMEM;
   2733 				goto out;	/* out of swap space */
   2734 			}
   2735 			/*
   2736 			 * Initialize the anon array entry.  The anon
   2737 			 * array lock taken above is still held here.
   2738 			 */
   2739 			(void) anon_set_ptr(amp->ahp, anon_index, ap,
   2740 			    ANON_SLEEP);
   2741 
   2742 			ASSERT(pp->p_szc == 0);
   2743 
   2744 			/*
   2745 			 * Handle pages that have been marked for migration
   2746 			 */
   2747 			if (lgrp_optimizations())
   2748 				page_migrate(seg, addr, &pp, 1);
   2749 
   2750 			if (enable_mbit_wa) {
   2751 				if (rw == S_WRITE)
   2752 					hat_setmod(pp);
   2753 				else if (!hat_ismod(pp))
   2754 					prot &= ~PROT_WRITE;
   2755 			}
   2756 			/*
   2757 			 * If AS_PAGLCK is set in a_flags (via memcntl(2)
   2758 			 * with MC_LOCKAS, MCL_FUTURE) and this is a
   2759 			 * MAP_NORESERVE segment, we may need to
   2760 			 * permanently lock the page as it is being faulted
   2761 			 * for the first time. The following text applies
   2762 			 * only to MAP_NORESERVE segments:
   2763 			 *
   2764 			 * As per memcntl(2), if this segment was created
   2765 			 * after MCL_FUTURE was applied (a "future"
   2766 			 * segment), its pages must be locked.  If this
   2767 			 * segment existed at MCL_FUTURE application (a
   2768 			 * "past" segment), the interface is unclear.
   2769 			 *
   2770 			 * We decide to lock only if vpage is present:
   2771 			 *
   2772 			 * - "future" segments will have a vpage array (see
   2773 			 *    as_map), and so will be locked as required
   2774 			 *
   2775 			 * - "past" segments may not have a vpage array,
   2776 			 *    depending on whether events (such as
   2777 			 *    mprotect) have occurred. Locking if vpage
   2778 			 *    exists will preserve legacy behavior.  Not
   2779 			 *    locking if vpage is absent, will not break
   2780 			 *    the interface or legacy behavior.  Note that
   2781 			 *    allocating vpage here if it's absent requires
   2782 			 *    upgrading the segvn reader lock, the cost of
   2783 			 *    which does not seem worthwhile.
   2784 			 *
   2785 			 * Usually testing and setting VPP_ISPPLOCK and
   2786 			 * VPP_SETPPLOCK requires holding the segvn lock as
   2787 			 * writer, but in this case all readers are
   2788 			 * serializing on the anon array lock.
   2789 			 */
   2790 			if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
   2791 			    (svd->flags & MAP_NORESERVE) &&
   2792 			    !VPP_ISPPLOCK(vpage)) {
   2793 				proc_t *p = seg->s_as->a_proc;
   2794 				ASSERT(svd->type == MAP_PRIVATE);
   2795 				mutex_enter(&p->p_lock);
        				/*
        				 * Charge the locked memory rctl first
        				 * and give the charge back if the page
        				 * itself cannot be locked.
        				 */
   2796 				if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
   2797 				    1) == 0) {
   2798 					claim = VPP_PROT(vpage) & PROT_WRITE;
   2799 					if (page_pp_lock(pp, claim, 0)) {
   2800 						VPP_SETPPLOCK(vpage);
   2801 					} else {
   2802 						rctl_decr_locked_mem(p, NULL,
   2803 						    PAGESIZE, 1);
   2804 					}
   2805 				}
   2806 				mutex_exit(&p->p_lock);
   2807 			}
   2808 
   2809 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   2810 			hat_memload(hat, addr, pp, prot, hat_flag);
   2811 
   2812 			if (!(hat_flag & HAT_LOAD_LOCK))
   2813 				page_unlock(pp);
   2814 
   2815 			anon_array_exit(&cookie);
   2816 			return (0);
   2817 		}
   2818 	}
   2819 
   2820 	/*
   2821 	 * Obtain the page structure via anon_getpage() if it is
   2822 	 * a private copy of an object (the result of a previous
   2823 	 * copy-on-write).
   2824 	 */
   2825 	if (amp != NULL) {
   2826 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
   2827 			err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
   2828 			    seg, addr, rw, svd->cred);
   2829 			if (err)
   2830 				goto out;
   2831 
   2832 			if (svd->type == MAP_SHARED) {
   2833 				/*
   2834 				 * If this is a shared mapping to an
   2835 				 * anon_map, then ignore the write
   2836 				 * permissions returned by anon_getpage().
   2837 				 * They apply to the private mappings
   2838 				 * of this anon_map.
   2839 				 */
   2840 				vpprot |= PROT_WRITE;
   2841 			}
   2842 			opp = anon_pl[0];
   2843 		}
   2844 	}
   2845 
   2846 	/*
   2847 	 * Search the pl[] list passed in if it is from the
   2848 	 * original object (i.e., not a private copy).
   2849 	 */
   2850 	if (opp == NULL) {
   2851 		/*
   2852 		 * Find original page.  We must be bringing it in
   2853 		 * from the list in pl[].
   2854 		 */
   2855 		for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
   2856 			if (opp == PAGE_HANDLED)
   2857 				continue;
   2858 			ASSERT(opp->p_vnode == svd->vp); /* XXX */
   2859 			if (opp->p_offset == off)
   2860 				break;
   2861 		}
   2862 		if (opp == NULL) {
   2863 			panic("segvn_faultpage not found");
   2864 			/*NOTREACHED*/
   2865 		}
        		/*
        		 * Mark the slot consumed so that a later
        		 * segvn_pagelist_rele() on pl[] skips this page.
        		 */
   2866 		*ppp = PAGE_HANDLED;
   2867 
   2868 	}
   2869 
   2870 	ASSERT(PAGE_LOCKED(opp));
   2871 
   2872 	TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
   2873 	    "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0);
   2874 
   2875 	/*
   2876 	 * The fault is treated as a copy-on-write fault if a
   2877 	 * write occurs on a private segment and the object
   2878 	 * page (i.e., mapping) is write protected.  We assume
   2879 	 * that fatal protection checks have already been made.
   2880 	 */
   2881 
   2882 	if (brkcow) {
   2883 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   2884 		cow = !(vpprot & PROT_WRITE);
   2885 	} else if (svd->tr_state == SEGVN_TR_ON) {
   2886 		/*
   2887 		 * If we are doing text replication COW on first touch.
   2888 		 */
   2889 		ASSERT(amp != NULL);
   2890 		ASSERT(svd->vp != NULL);
   2891 		ASSERT(rw != S_WRITE);
   2892 		cow = (ap == NULL);
   2893 	} else {
   2894 		cow = 0;
   2895 	}
   2896 
   2897 	/*
   2898 	 * If not a copy-on-write case load the translation
   2899 	 * and return.
   2900 	 */
   2901 	if (cow == 0) {
   2902 
   2903 		/*
   2904 		 * Handle pages that have been marked for migration
   2905 		 */
   2906 		if (lgrp_optimizations())
   2907 			page_migrate(seg, addr, &opp, 1);
   2908 
   2909 		if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
   2910 			if (rw == S_WRITE)
   2911 				hat_setmod(opp);
   2912 			else if (rw != S_OTHER && !hat_ismod(opp))
   2913 				prot &= ~PROT_WRITE;
   2914 		}
   2915 
   2916 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
   2917 		    (!svd->pageprot && svd->prot == (prot & vpprot)));
   2918 		ASSERT(amp == NULL ||
   2919 		    svd->rcookie == HAT_INVALID_REGION_COOKIE);
   2920 		hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
   2921 		    svd->rcookie);
   2922 
   2923 		if (!(hat_flag & HAT_LOAD_LOCK))
   2924 			page_unlock(opp);
   2925 
   2926 		if (anon_lock) {
   2927 			anon_array_exit(&cookie);
   2928 		}
   2929 		return (0);
   2930 	}
   2931 
   2932 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   2933 
   2934 	hat_setref(opp);
   2935 
   2936 	ASSERT(amp != NULL && anon_lock);
   2937 
   2938 	/*
   2939 	 * Steal the page only if it isn't a private page
   2940 	 * since stealing a private page is not worth the effort.
   2941 	 */
   2942 	if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
   2943 		steal = 1;
   2944 
   2945 	/*
   2946 	 * Steal the original page if the following conditions are true:
   2947 	 *
   2948 	 * We are low on memory, the page is not private, page is not large,
   2949 	 * not shared, not modified, not `locked' or if we have it `locked'
   2950 	 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
   2951 	 * that the page is not shared) and if it doesn't have any
   2952 	 * translations. page_struct_lock isn't needed to look at p_cowcnt
   2953 	 * and p_lckcnt because we first get exclusive lock on page.
   2954 	 */
   2955 	(void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
   2956 
   2957 	if (stealcow && freemem < minfree && steal && opp->p_szc == 0 &&
   2958 	    page_tryupgrade(opp) && !hat_ismod(opp) &&
   2959 	    ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) ||
   2960 	    (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 &&
   2961 	    vpage != NULL && VPP_ISPPLOCK(vpage)))) {
   2962 		/*
   2963 		 * Check if this page has other translations
   2964 		 * after unloading our translation.
   2965 		 */
   2966 		if (hat_page_is_mapped(opp)) {
   2967 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   2968 			hat_unload(seg->s_as->a_hat, addr, PAGESIZE,
   2969 			    HAT_UNLOAD);
   2970 		}
   2971 
   2972 		/*
   2973 		 * hat_unload() might sync back someone else's recent
   2974 		 * modification, so check again.
   2975 		 */
   2976 		if (!hat_ismod(opp) && !hat_page_is_mapped(opp))
   2977 			pageflags |= STEAL_PAGE;
   2978 	}
   2979 
   2980 	/*
   2981 	 * If we have a vpage pointer, see if it indicates that we have
   2982 	 * ``locked'' the page we map -- if so, tell anon_private to
   2983 	 * transfer the locking resource to the new page.
   2984 	 *
   2985 	 * See Statement at the beginning of segvn_lockop regarding
   2986 	 * the way lockcnts/cowcnts are handled during COW.
   2987 	 *
   2988 	 */
   2989 	if (vpage != NULL && VPP_ISPPLOCK(vpage))
   2990 		pageflags |= LOCK_PAGE;
   2991 
   2992 	/*
   2993 	 * Allocate a private page and perform the copy.
   2994 	 * For MAP_NORESERVE reserve swap space now, unless this
   2995 	 * is a cow fault on an existing anon page in which case
   2996 	 * MAP_NORESERVE will have made advance reservations.
   2997 	 */
   2998 	if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) {
   2999 		if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) {
   3000 			atomic_add_long(&svd->swresv, ptob(1));
   3001 			atomic_add_long(&seg->s_as->a_resvsize, ptob(1));
   3002 		} else {
   3003 			page_unlock(opp);
   3004 			err = ENOMEM;
   3005 			goto out;
   3006 		}
   3007 	}
        	/*
        	 * Remember the slot we are copying away from; anon_private()
        	 * is handed &ap and ap is what gets stored in the anon array
        	 * below.
        	 */
   3008 	oldap = ap;
   3009 	pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
   3010 	if (pp == NULL) {
        		/*
        		 * NOTE(review): opp is not unlocked on this path,
        		 * unlike the reservation failure above; presumably
        		 * anon_private() releases it on failure -- confirm.
        		 */
   3011 		err = ENOMEM;	/* out of swap space */
   3012 		goto out;
   3013 	}
   3014 
   3015 	/*
   3016 	 * If we copied away from an anonymous page, then
   3017 	 * we are one step closer to freeing up an anon slot.
   3018 	 *
   3019 	 * NOTE:  The original anon slot must be released while
   3020 	 * holding the "anon_map" lock.  This is necessary to prevent
   3021 	 * other threads from obtaining a pointer to the anon slot
   3022 	 * which may be freed if its "refcnt" is 1.
   3023 	 */
   3024 	if (oldap != NULL)
   3025 		anon_decref(oldap);
   3026 
   3027 	(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
   3028 
   3029 	/*
   3030 	 * Handle pages that have been marked for migration
   3031 	 */
   3032 	if (lgrp_optimizations())
   3033 		page_migrate(seg, addr, &pp, 1);
   3034 
   3035 	ASSERT(pp->p_szc == 0);
   3036 
   3037 	ASSERT(!IS_VMODSORT(pp->p_vnode));
   3038 	if (enable_mbit_wa) {
   3039 		if (rw == S_WRITE)
   3040 			hat_setmod(pp);
   3041 		else if (!hat_ismod(pp))
   3042 			prot &= ~PROT_WRITE;
   3043 	}
   3044 
   3045 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   3046 	hat_memload(hat, addr, pp, prot, hat_flag);
   3047 
   3048 	if (!(hat_flag & HAT_LOAD_LOCK))
   3049 		page_unlock(pp);
   3050 
   3051 	ASSERT(anon_lock);
   3052 	anon_array_exit(&cookie);
   3053 	return (0);
        	/*
        	 * Common failure exit: drop the anon array lock if we took
        	 * it, take back the softlock count bumped at the top for
        	 * F_SOFTLOCK, and hand err back as a fault code.
        	 */
   3054 out:
   3055 	if (anon_lock)
   3056 		anon_array_exit(&cookie);
   3057 
   3058 	if (type == F_SOFTLOCK) {
   3059 		atomic_add_long((ulong_t *)&svd->softlockcnt, -1);
   3060 	}
   3061 	return (FC_MAKE_ERR(err));
   3062 }
   3063 
   3064 /*
   3065  * relocate a bunch of smaller targ pages into one large repl page. all targ
   3066  * pages must be complete pages smaller than replacement pages.
   3067  * it's assumed that no page's szc can change since they are all PAGESIZE or
   3068  * complete large pages locked SHARED.
   3069  */
   3070 static void
   3071 segvn_relocate_pages(page_t **targ, page_t *replacement)
   3072 {
   3073 	page_t *pp;
   3074 	pgcnt_t repl_npgs, curnpgs;
   3075 	pgcnt_t i;
   3076 	uint_t repl_szc = replacement->p_szc;
   3077 	page_t *first_repl = replacement;
   3078 	page_t *repl;
   3079 	spgcnt_t npgs;
   3080 
   3081 	VM_STAT_ADD(segvnvmstats.relocatepages[0]);
   3082 
   3083 	ASSERT(repl_szc != 0);
   3084 	npgs = repl_npgs = page_get_pagecnt(repl_szc);
   3085 
   3086 	i = 0;
   3087 	while (repl_npgs) {
   3088 		spgcnt_t nreloc;
   3089 		int err;
   3090 		ASSERT(replacement != NULL);
   3091 		pp = targ[i];
   3092 		ASSERT(pp->p_szc < repl_szc);
   3093 		ASSERT(PAGE_EXCL(pp));
   3094 		ASSERT(!PP_ISFREE(pp));
   3095 		curnpgs = page_get_pagecnt(pp->p_szc);
   3096 		if (curnpgs == 1) {
   3097 			VM_STAT_ADD(segvnvmstats.relocatepages[1]);
   3098 			repl = replacement;
   3099 			page_sub(&replacement, repl);
   3100 			ASSERT(PAGE_EXCL(repl));
   3101 			ASSERT(!PP_ISFREE(repl));
   3102 			ASSERT(repl->p_szc == repl_szc);
   3103 		} else {
   3104 			page_t *repl_savepp;
   3105 			int j;
   3106 			VM_STAT_ADD(segvnvmstats.relocatepages[2]);
   3107 			repl_savepp = replacement;
   3108 			for (j = 0; j < curnpgs; j++) {
   3109 				repl = replacement;
   3110 				page_sub(&replacement, repl);
   3111 				ASSERT(PAGE_EXCL(repl));
   3112 				ASSERT(!PP_ISFREE(repl));
   3113 				ASSERT(repl->p_szc == repl_szc);
   3114 				ASSERT(page_pptonum(targ[i + j]) ==
   3115 				    page_pptonum(targ[i]) + j);
   3116 			}
   3117 			repl = repl_savepp;
   3118 			ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs));
   3119 		}
   3120 		err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL);
   3121 		if (err || nreloc != curnpgs) {
   3122 			panic("segvn_relocate_pages: "
   3123 			    "page_relocate failed err=%d curnpgs=%ld "
   3124 			    "nreloc=%ld", err, curnpgs, nreloc);
   3125 		}
   3126 		ASSERT(curnpgs <= repl_npgs);
   3127 		repl_npgs -= curnpgs;
   3128 		i += curnpgs;
   3129 	}
   3130 	ASSERT(replacement == NULL);
   3131 
   3132 	repl = first_repl;
   3133 	repl_npgs = npgs;
   3134 	for (i = 0; i < repl_npgs; i++) {
   3135 		ASSERT(PAGE_EXCL(repl));
   3136 		ASSERT(!PP_ISFREE(repl));
   3137 		targ[i] = repl;
   3138 		page_downgrade(targ[i]);
   3139 		repl++;
   3140 	}
   3141 }
   3142 
   3143 /*
   3144  * Check if all pages in ppa array are complete smaller than szc pages and
   3145  * their roots will still be aligned relative to their current size if the
   3146  * entire ppa array is relocated into one szc page. If these conditions are
   3147  * not met return 0.
   3148  *
   3149  * If all pages are properly aligned attempt to upgrade their locks
   3150  * to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
   3151  * upgrdfail was set to 0 by caller.
   3152  *
   3153  * Return 1 if all pages are aligned and locked exclusively.
   3154  *
   3155  * If all pages in ppa array happen to be physically contiguous to make one
   3156  * szc page and all exclusive locks are successfully obtained promote the page
   3157  * size to szc and set *pszc to szc. Return 1 with pages locked shared.
   3158  */
   3159 static int
   3160 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
   3161 {
   3162 	page_t *pp;
   3163 	pfn_t pfn;
   3164 	pgcnt_t totnpgs = page_get_pagecnt(szc);
   3165 	pfn_t first_pfn;
   3166 	int contig = 1;
   3167 	pgcnt_t i;
   3168 	pgcnt_t j;
   3169 	uint_t curszc;
   3170 	pgcnt_t curnpgs;
   3171 	int root = 0;
   3172 
   3173 	ASSERT(szc > 0);
   3174 
   3175 	VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
   3176 
   3177 	for (i = 0; i < totnpgs; i++) {
   3178 		pp = ppa[i];
   3179 		ASSERT(PAGE_SHARED(pp));
   3180 		ASSERT(!PP_ISFREE(pp));
   3181 		pfn = page_pptonum(pp);
   3182 		if (i == 0) {
   3183 			if (!IS_P2ALIGNED(pfn, totnpgs)) {
   3184 				contig = 0;
   3185 			} else {
   3186 				first_pfn = pfn;
   3187 			}
   3188 		} else if (contig && pfn != first_pfn + i) {
   3189 			contig = 0;
   3190 		}
   3191 		if (pp->p_szc == 0) {
   3192 			if (root) {
   3193 				VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
   3194 				return (0);
   3195 			}
   3196 		} else if (!root) {
   3197 			if ((curszc = pp->p_szc) >= szc) {
   3198 				VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
   3199 				return (0);
   3200 			}
   3201 			if (curszc == 0) {
   3202 				/*
   3203 				 * p_szc changed means we don't have all pages
   3204 				 * locked. return failure.
   3205 				 */
   3206 				VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
   3207 				return (0);
   3208 			}
   3209 			curnpgs = page_get_pagecnt(curszc);
   3210 			if (!IS_P2ALIGNED(pfn, curnpgs) ||
   3211 			    !IS_P2ALIGNED(i, curnpgs)) {
   3212 				VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
   3213 				return (0);
   3214 			}
   3215 			root = 1;
   3216 		} else {
   3217 			ASSERT(i > 0);
   3218 			VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
   3219 			if (pp->p_szc != curszc) {
   3220 				VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
   3221 				return (0);
   3222 			}
   3223 			if (pfn - 1 != page_pptonum(ppa[i - 1])) {
   3224 				panic("segvn_full_szcpages: "
   3225 				    "large page not physically contiguous");
   3226 			}
   3227 			if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
   3228 				root = 0;
   3229 			}
   3230 		}
   3231 	}
   3232 
   3233 	for (i = 0; i < totnpgs; i++) {
   3234 		ASSERT(ppa[i]->p_szc < szc);
   3235 		if (!page_tryupgrade(ppa[i])) {
   3236 			for (j = 0; j < i; j++) {
   3237 				page_downgrade(ppa[j]);
   3238 			}
   3239 			*pszc = ppa[i]->p_szc;
   3240 			*upgrdfail = 1;
   3241 			VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
   3242 			return (0);
   3243 		}
   3244 	}
   3245 
   3246 	/*
   3247 	 * When a page is put a free cachelist its szc is set to 0.  if file
   3248 	 * system reclaimed pages from cachelist targ pages will be physically
   3249 	 * contiguous with 0 p_szc.  in this case just upgrade szc of targ
   3250 	 * pages without any relocations.
   3251 	 * To avoid any hat issues with previous small mappings
   3252 	 * hat_pageunload() the target pages first.
   3253 	 */
   3254 	if (contig) {
   3255 		VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
   3256 		for (i = 0; i < totnpgs; i++) {
   3257 			(void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
   3258 		}
   3259 		for (i = 0; i < totnpgs; i++) {
   3260 			ppa[i]->p_szc = szc;
   3261 		}
   3262 		for (i = 0; i < totnpgs; i++) {
   3263 			ASSERT(PAGE_EXCL(ppa[i]));
   3264 			page_downgrade(ppa[i]);
   3265 		}
   3266 		if (pszc != NULL) {
   3267 			*pszc = szc;
   3268 		}
   3269 	}
   3270 	VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
   3271 	return (1);
   3272 }
   3273 
   3274 /*
   3275  * Create physically contiguous pages for [vp, off] - [vp, off +
   3276  * page_size(szc)) range and for private segment return them in ppa array.
   3277  * Pages are created either via IO or relocations.
   3278  *
   3279  * Return 1 on success and 0 on failure.
   3280  *
   3281  * If physically contiguous pages already exist for this range return 1 without
   3282  * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
   3283  * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
   3284  */
   3285 
   3286 static int
   3287 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
   3288     uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
   3289     int *downsize)
   3290 
   3291 {
   3292 	page_t *pplist = *ppplist;
   3293 	size_t pgsz = page_get_pagesize(szc);
   3294 	pgcnt_t pages = btop(pgsz);
   3295 	ulong_t start_off = off;
   3296 	u_offset_t eoff = off + pgsz;
   3297 	spgcnt_t nreloc;
   3298 	u_offset_t io_off = off;
   3299 	size_t io_len;
   3300 	page_t *io_pplist = NULL;
   3301 	page_t *done_pplist = NULL;
   3302 	pgcnt_t pgidx = 0;
   3303 	page_t *pp;
   3304 	page_t *newpp;
   3305 	page_t *targpp;
   3306 	int io_err = 0;
   3307 	int i;
   3308 	pfn_t pfn;
   3309 	ulong_t ppages;
   3310 	page_t *targ_pplist = NULL;
   3311 	page_t *repl_pplist = NULL;
   3312 	page_t *tmp_pplist;
   3313 	int nios = 0;
   3314 	uint_t pszc;
   3315 	struct vattr va;
   3316 
   3317 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);
   3318 
   3319 	ASSERT(szc != 0);
   3320 	ASSERT(pplist->p_szc == szc);
   3321 
   3322 	/*
   3323 	 * downsize will be set to 1 only if we fail to lock pages. this will
   3324 	 * allow subsequent faults to try to relocate the page again. If we
   3325 	 * fail due to misalignment don't downsize and let the caller map the
   3326 	 * whole region with small mappings to avoid more faults into the area
   3327 	 * where we can't get large pages anyway.
   3328 	 */
   3329 	*downsize = 0;
   3330 
   3331 	while (off < eoff) {
   3332 		newpp = pplist;
   3333 		ASSERT(newpp != NULL);
   3334 		ASSERT(PAGE_EXCL(newpp));
   3335 		ASSERT(!PP_ISFREE(newpp));
   3336 		/*
   3337 		 * we pass NULL for nrelocp to page_lookup_create()
   3338 		 * so that it doesn't relocate. We relocate here
   3339 		 * later only after we make sure we can lock all
   3340 		 * pages in the range we handle and they are all
   3341 		 * aligned.
   3342 		 */
   3343 		pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0);
   3344 		ASSERT(pp != NULL);
   3345 		ASSERT(!PP_ISFREE(pp));
   3346 		ASSERT(pp->p_vnode == vp);
   3347 		ASSERT(pp->p_offset == off);
   3348 		if (pp == newpp) {
   3349 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
   3350 			page_sub(&pplist, pp);
   3351 			ASSERT(PAGE_EXCL(pp));
   3352 			ASSERT(page_iolock_assert(pp));
   3353 			page_list_concat(&io_pplist, &pp);
   3354 			off += PAGESIZE;
   3355 			continue;
   3356 		}
   3357 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
   3358 		pfn = page_pptonum(pp);
   3359 		pszc = pp->p_szc;
   3360 		if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
   3361 		    IS_P2ALIGNED(pfn, pages)) {
   3362 			ASSERT(repl_pplist == NULL);
   3363 			ASSERT(done_pplist == NULL);
   3364 			ASSERT(pplist == *ppplist);
   3365 			page_unlock(pp);
   3366 			page_free_replacement_page(pplist);
   3367 			page_create_putback(pages);
   3368 			*ppplist = NULL;
   3369 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
   3370 			return (1);
   3371 		}
   3372 		if (pszc >= szc) {
   3373 			page_unlock(pp);
   3374 			segvn_faultvnmpss_align_err1++;
   3375 			goto out;
   3376 		}
   3377 		ppages = page_get_pagecnt(pszc);
   3378 		if (!IS_P2ALIGNED(pfn, ppages)) {
   3379 			ASSERT(pszc > 0);
   3380 			/*
   3381 			 * sizing down to pszc won't help.
   3382 			 */
   3383 			page_unlock(pp);
   3384 			segvn_faultvnmpss_align_err2++;
   3385 			goto out;
   3386 		}
   3387 		pfn = page_pptonum(newpp);
   3388 		if (!IS_P2ALIGNED(pfn, ppages)) {
   3389 			ASSERT(pszc > 0);
   3390 			/*
   3391 			 * sizing down to pszc won't help.
   3392 			 */
   3393 			page_unlock(pp);
   3394 			segvn_faultvnmpss_align_err3++;
   3395 			goto out;
   3396 		}
   3397 		if (!PAGE_EXCL(pp)) {
   3398 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
   3399 			page_unlock(pp);
   3400 			*downsize = 1;
   3401 			*ret_pszc = pp->p_szc;
   3402 			goto out;
   3403 		}
   3404 		targpp = pp;
   3405 		if (io_pplist != NULL) {
   3406 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
   3407 			io_len = off - io_off;
   3408 			/*
   3409 			 * Some file systems like NFS don't check EOF
   3410 			 * conditions in VOP_PAGEIO(). Check it here
   3411 			 * now that pages are locked SE_EXCL. Any file
   3412 			 * truncation will wait until the pages are
   3413 			 * unlocked so no need to worry that file will
   3414 			 * be truncated after we check its size here.
   3415 			 * XXX fix NFS to remove this check.
   3416 			 */
   3417 			va.va_mask = AT_SIZE;
   3418 			if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) {
   3419 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
   3420 				page_unlock(targpp);
   3421 				goto out;
   3422 			}
   3423 			if (btopr(va.va_size) < btopr(io_off + io_len)) {
   3424 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
   3425 				*downsize = 1;
   3426 				*ret_pszc = 0;
   3427 				page_unlock(targpp);
   3428 				goto out;
   3429 			}
   3430 			io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
   3431 				B_READ, svd->cred, NULL);
   3432 			if (io_err) {
   3433 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
   3434 				page_unlock(targpp);
   3435 				if (io_err == EDEADLK) {
   3436 					segvn_vmpss_pageio_deadlk_err++;
   3437 				}
   3438 				goto out;
   3439 			}
   3440 			nios++;
   3441 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
   3442 			while (io_pplist != NULL) {
   3443 				pp = io_pplist;
   3444 				page_sub(&io_pplist, pp);
   3445 				ASSERT(page_iolock_assert(pp));
   3446 				page_io_unlock(pp);
   3447 				pgidx = (pp->p_offset - start_off) >>
   3448 				    PAGESHIFT;
   3449 				ASSERT(pgidx < pages);
   3450 				ppa[pgidx] = pp;
   3451 				page_list_concat(&done_pplist, &pp);
   3452 			}
   3453 		}
   3454 		pp = targpp;
   3455 		ASSERT(PAGE_EXCL(pp));
   3456 		ASSERT(pp->p_szc <= pszc);
   3457 		if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
   3458 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
   3459 			page_unlock(pp);
   3460 			*downsize = 1;
   3461 			*ret_pszc = pp->p_szc;
   3462 			goto out;
   3463 		}
   3464 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
   3465 		/*
   3466 		 * page szc could have changed before the entire group was
   3467 		 * locked. reread page szc.
   3468 		 */
   3469 		pszc = pp->p_szc;
   3470 		ppages = page_get_pagecnt(pszc);
   3471 
   3472 		/* link just the roots */
   3473 		page_list_concat(&targ_pplist, &pp);
   3474 		page_sub(&pplist, newpp);
   3475 		page_list_concat(&repl_pplist, &newpp);
   3476 		off += PAGESIZE;
   3477 		while (--ppages != 0) {
   3478 			newpp = pplist;
   3479 			page_sub(&pplist, newpp);
   3480 			off += PAGESIZE;
   3481 		}
   3482 		io_off = off;
   3483 	}
   3484 	if (io_pplist != NULL) {
   3485 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
   3486 		io_len = eoff - io_off;
   3487 		va.va_mask = AT_SIZE;
   3488 		if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) {
   3489 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
   3490 			goto out;
   3491 		}
   3492 		if (btopr(va.va_size) < btopr(io_off + io_len)) {
   3493 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
   3494 			*downsize = 1;
   3495 			*ret_pszc = 0;
   3496 			goto out;
   3497 		}
   3498 		io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
   3499 		    B_READ, svd->cred, NULL);
   3500 		if (io_err) {
   3501 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
   3502 			if (io_err == EDEADLK) {
   3503 				segvn_vmpss_pageio_deadlk_err++;
   3504 			}
   3505 			goto out;
   3506 		}
   3507 		nios++;
   3508 		while (io_pplist != NULL) {
   3509 			pp = io_pplist;
   3510 			page_sub(&io_pplist, pp);
   3511 			ASSERT(page_iolock_assert(pp));
   3512 			page_io_unlock(pp);
   3513 			pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
   3514 			ASSERT(pgidx < pages);
   3515 			ppa[pgidx] = pp;
   3516 		}
   3517 	}
   3518 	/*
   3519 	 * we're now bound to succeed or panic.
   3520 	 * remove pages from done_pplist. it's not needed anymore.
   3521 	 */
   3522 	while (done_pplist != NULL) {
   3523 		pp = done_pplist;
   3524 		page_sub(&done_pplist, pp);
   3525 	}
   3526 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
   3527 	ASSERT(pplist == NULL);
   3528 	*ppplist = NULL;
   3529 	while (targ_pplist != NULL) {
   3530 		int ret;
   3531 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
   3532 		ASSERT(repl_pplist);
   3533 		pp = targ_pplist;
   3534 		page_sub(&targ_pplist, pp);
   3535 		pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
   3536 		newpp = repl_pplist;
   3537 		page_sub(&repl_pplist, newpp);
   3538 #ifdef DEBUG
   3539 		pfn = page_pptonum(pp);
   3540 		pszc = pp->p_szc;
   3541 		ppages = page_get_pagecnt(pszc);
   3542 		ASSERT(IS_P2ALIGNED(pfn, ppages));
   3543 		pfn = page_pptonum(newpp);
   3544 		ASSERT(IS_P2ALIGNED(pfn, ppages));
   3545 		ASSERT(P2PHASE(pfn, pages) == pgidx);
   3546 #endif
   3547 		nreloc = 0;
   3548 		ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
   3549 		if (ret != 0 || nreloc == 0) {
   3550 			panic("segvn_fill_vp_pages: "
   3551 			    "page_relocate failed");
   3552 		}
   3553 		pp = newpp;
   3554 		while (nreloc-- != 0) {
   3555 			ASSERT(PAGE_EXCL(pp));
   3556 			ASSERT(pp->p_vnode == vp);
   3557 			ASSERT(pgidx ==
   3558 			    ((pp->p_offset - start_off) >> PAGESHIFT));
   3559 			ppa[pgidx++] = pp;
   3560 			pp++;
   3561 		}
   3562 	}
   3563 
   3564 	if (svd->type == MAP_PRIVATE) {
   3565 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
   3566 		for (i = 0; i < pages; i++) {
   3567 			ASSERT(ppa[i] != NULL);
   3568 			ASSERT(PAGE_EXCL(ppa[i]));
   3569 			ASSERT(ppa[i]->p_vnode == vp);
   3570 			ASSERT(ppa[i]->p_offset ==
   3571 			    start_off + (i << PAGESHIFT));
   3572 			page_downgrade(ppa[i]);
   3573 		}
   3574 		ppa[pages] = NULL;
   3575 	} else {
   3576 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
   3577 		/*
   3578 		 * the caller will still call VOP_GETPAGE() for shared segments
   3579 		 * to check FS write permissions. For private segments we map
   3580 		 * file read only anyway.  so no VOP_GETPAGE is needed.
   3581 		 */
   3582 		for (i = 0; i < pages; i++) {
   3583 			ASSERT(ppa[i] != NULL);
   3584 			ASSERT(PAGE_EXCL(ppa[i]));
   3585 			ASSERT(ppa[i]->p_vnode == vp);
   3586 			ASSERT(ppa[i]->p_offset ==
   3587 			    start_off + (i << PAGESHIFT));
   3588 			page_unlock(ppa[i]);
   3589 		}
   3590 		ppa[0] = NULL;
   3591 	}
   3592 
   3593 	return (1);
   3594 out:
   3595 	/*
   3596 	 * Do the cleanup. Unlock target pages we didn't relocate. They are
   3597 	 * linked on targ_pplist by root pages. reassemble unused replacement
   3598 	 * and io pages back to pplist.
   3599 	 */
   3600 	if (io_pplist != NULL) {
   3601 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
   3602 		pp = io_pplist;
   3603 		do {
   3604 			ASSERT(pp->p_vnode == vp);
   3605 			ASSERT(pp->p_offset == io_off);
   3606 			ASSERT(page_iolock_assert(pp));
   3607 			page_io_unlock(pp);
   3608 			page_hashout(pp, NULL);
   3609 			io_off += PAGESIZE;
   3610 		} while ((pp = pp->p_next) != io_pplist);
   3611 		page_list_concat(&io_pplist, &pplist);
   3612 		pplist = io_pplist;
   3613 	}
   3614 	tmp_pplist = NULL;
   3615 	while (targ_pplist != NULL) {
   3616 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
   3617 		pp = targ_pplist;
   3618 		ASSERT(PAGE_EXCL(pp));
   3619 		page_sub(&targ_pplist, pp);
   3620 
   3621 		pszc = pp->p_szc;
   3622 		ppages = page_get_pagecnt(pszc);
   3623 		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
   3624 
   3625 		if (pszc != 0) {
   3626 			group_page_unlock(pp);
   3627 		}
   3628 		page_unlock(pp);
   3629 
   3630 		pp = repl_pplist;
   3631 		ASSERT(pp != NULL);
   3632 		ASSERT(PAGE_EXCL(pp));
   3633 		ASSERT(pp->p_szc == szc);
   3634 		page_sub(&repl_pplist, pp);
   3635 
   3636 		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
   3637 
   3638 		/* relink replacement page */
   3639 		page_list_concat(&tmp_pplist, &pp);
   3640 		while (--ppages != 0) {
   3641 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
   3642 			pp++;
   3643 			ASSERT(PAGE_EXCL(pp));
   3644 			ASSERT(pp->p_szc == szc);
   3645 			page_list_concat(&tmp_pplist, &pp);
   3646 		}
   3647 	}
   3648 	if (tmp_pplist != NULL) {
   3649 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
   3650 		page_list_concat(&tmp_pplist, &pplist);
   3651 		pplist = tmp_pplist;
   3652 	}
   3653 	/*
   3654 	 * at this point all pages are either on done_pplist or
   3655 	 * pplist. They can't be all on done_pplist otherwise
   3656 	 * we'd've been done.
   3657 	 */
   3658 	ASSERT(pplist != NULL);
   3659 	if (nios != 0) {
   3660 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
   3661 		pp = pplist;
   3662 		do {
   3663 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
   3664 			ASSERT(pp->p_szc == szc);
   3665 			ASSERT(PAGE_EXCL(pp));
   3666 			ASSERT(pp->p_vnode != vp);
   3667 			pp->p_szc = 0;
   3668 		} while ((pp = pp->p_next) != pplist);
   3669 
   3670 		pp = done_pplist;
   3671 		do {
   3672 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
   3673 			ASSERT(pp->p_szc == szc);
   3674 			ASSERT(PAGE_EXCL(pp));
   3675 			ASSERT(pp->p_vnode == vp);
   3676 			pp->p_szc = 0;
   3677 		} while ((pp = pp->p_next) != done_pplist);
   3678 
   3679 		while (pplist != NULL) {
   3680 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
   3681 			pp = pplist;
   3682 			page_sub(&pplist, pp);
   3683 			page_free(pp, 0);
   3684 		}
   3685 
   3686 		while (done_pplist != NULL) {
   3687 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
   3688 			pp = done_pplist;
   3689 			page_sub(&done_pplist, pp);
   3690 			page_unlock(pp);
   3691 		}
   3692 		*ppplist = NULL;
   3693 		return (0);
   3694 	}
   3695 	ASSERT(pplist == *ppplist);
   3696 	if (io_err) {
   3697 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
   3698 		/*
   3699 		 * don't downsize on io error.
   3700 		 * see if vop_getpage succeeds.
   3701 		 * pplist may still be used in this case
   3702 		 * for relocations.
   3703 		 */
   3704 		return (0);
   3705 	}
   3706 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
   3707 	page_free_replacement_page(pplist);
   3708 	page_create_putback(pages);
   3709 	*ppplist = NULL;
   3710 	return (0);
   3711 }
   3712 
/*
 * Global knob, zero by default.  In segvn_fault_vnodepages() its value is
 * handed unchanged to anon_map_privatepages() on the cow-break / text
 * replication (brkcow || tron) path.
 * NOTE(review): the name suggests "allow any page size" for the private
 * anon pages -- confirm against anon_map_privatepages().
 */
int segvn_anypgsz = 0;
   3714 
   3715 #define	SEGVN_RESTORE_SOFTLOCK_VP(type, pages) 				\
   3716 		if ((type) == F_SOFTLOCK) {				\
   3717 			atomic_add_long((ulong_t *)&(svd)->softlockcnt, \
   3718 			    -(pages));					\
   3719 		}
   3720 
   3721 #define	SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot)		\
   3722 		if (IS_VMODSORT((ppa)[0]->p_vnode)) {			\
   3723 			if ((rw) == S_WRITE) {				\
   3724 				for (i = 0; i < (pages); i++) {		\
   3725 					ASSERT((ppa)[i]->p_vnode ==	\
   3726 					    (ppa)[0]->p_vnode);		\
   3727 					hat_setmod((ppa)[i]);		\
   3728 				}					\
   3729 			} else if ((rw) != S_OTHER &&			\
   3730 			    ((prot) & (vpprot) & PROT_WRITE)) {		\
   3731 				for (i = 0; i < (pages); i++) {		\
   3732 					ASSERT((ppa)[i]->p_vnode ==	\
   3733 					    (ppa)[0]->p_vnode);		\
   3734 					if (!hat_ismod((ppa)[i])) {	\
   3735 						prot &= ~PROT_WRITE;	\
   3736 						break;			\
   3737 					}				\
   3738 				}					\
   3739 			}						\
   3740 		}
   3741 
   3742 #ifdef  VM_STATS
   3743 
   3744 #define	SEGVN_VMSTAT_FLTVNPAGES(idx)					\
   3745 		VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);
   3746 
   3747 #else /* VM_STATS */
   3748 
   3749 #define	SEGVN_VMSTAT_FLTVNPAGES(idx)
   3750 
   3751 #endif
   3752 
   3753 static faultcode_t
   3754 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
   3755     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
   3756     caddr_t eaddr, int brkcow)
   3757 {
   3758 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   3759 	struct anon_map *amp = svd->amp;
   3760 	uchar_t segtype = svd->type;
   3761 	uint_t szc = seg->s_szc;
   3762 	size_t pgsz = page_get_pagesize(szc);
   3763 	size_t maxpgsz = pgsz;
   3764 	pgcnt_t pages = btop(pgsz);
   3765 	pgcnt_t maxpages = pages;
   3766 	size_t ppasize = (pages + 1) * sizeof (page_t *);
   3767 	caddr_t a = lpgaddr;
   3768 	caddr_t	maxlpgeaddr = lpgeaddr;
   3769 	u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
   3770 	ulong_t aindx = svd->anon_index + seg_page(seg, a);
   3771 	struct vpage *vpage = (svd->vpage != NULL) ?
   3772 	    &svd->vpage[seg_page(seg, a)] : NULL;
   3773 	vnode_t *vp = svd->vp;
   3774 	page_t **ppa;
   3775 	uint_t	pszc;
   3776 	size_t	ppgsz;
   3777 	pgcnt_t	ppages;
   3778 	faultcode_t err = 0;
   3779 	int ierr;
   3780 	int vop_size_err = 0;
   3781 	uint_t protchk, prot, vpprot;
   3782 	ulong_t i;
   3783 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
   3784 	anon_sync_obj_t an_cookie;
   3785 	enum seg_rw arw;
   3786 	int alloc_failed = 0;
   3787 	int adjszc_chk;
   3788 	struct vattr va;
   3789 	int xhat = 0;
   3790 	page_t *pplist;
   3791 	pfn_t pfn;
   3792 	int physcontig;
   3793 	int upgrdfail;
   3794 	int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
   3795 	int tron = (svd->tr_state == SEGVN_TR_ON);
   3796 
   3797 	ASSERT(szc != 0);
   3798 	ASSERT(vp != NULL);
   3799 	ASSERT(brkcow == 0 || amp != NULL);
   3800 	ASSERT(tron == 0 || amp != NULL);
   3801 	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
   3802 	ASSERT(!(svd->flags & MAP_NORESERVE));
   3803 	ASSERT(type != F_SOFTUNLOCK);
   3804 	ASSERT(IS_P2ALIGNED(a, maxpgsz));
   3805 	ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
   3806 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
   3807 	ASSERT(seg->s_szc < NBBY * sizeof (int));
   3808 	ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz);
   3809 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
   3810 
   3811 	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]);
   3812 	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]);
   3813 
   3814 	if (svd->flags & MAP_TEXT) {
   3815 		hat_flag |= HAT_LOAD_TEXT;
   3816 	}
   3817 
   3818 	if (svd->pageprot) {
   3819 		switch (rw) {
   3820 		case S_READ:
   3821 			protchk = PROT_READ;
   3822 			break;
   3823 		case S_WRITE:
   3824 			protchk = PROT_WRITE;
   3825 			break;
   3826 		case S_EXEC:
   3827 			protchk = PROT_EXEC;
   3828 			break;
   3829 		case S_OTHER:
   3830 		default:
   3831 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
   3832 			break;
   3833 		}
   3834 	} else {
   3835 		prot = svd->prot;
   3836 		/* caller has already done segment level protection check. */
   3837 	}
   3838 
   3839 	if (seg->s_as->a_hat != hat) {
   3840 		xhat = 1;
   3841 	}
   3842 
   3843 	if (rw == S_WRITE && segtype == MAP_PRIVATE) {
   3844 		SEGVN_VMSTAT_FLTVNPAGES(2);
   3845 		arw = S_READ;
   3846 	} else {
   3847 		arw = rw;
   3848 	}
   3849 
   3850 	ppa = kmem_alloc(ppasize, KM_SLEEP);
   3851 
   3852 	VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
   3853 
   3854 	for (;;) {
   3855 		adjszc_chk = 0;
   3856 		for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
   3857 			if (adjszc_chk) {
   3858 				while (szc < seg->s_szc) {
   3859 					uintptr_t e;
   3860 					uint_t tszc;
   3861 					tszc = segvn_anypgsz_vnode ? szc + 1 :
   3862 					    seg->s_szc;
   3863 					ppgsz = page_get_pagesize(tszc);
   3864 					if (!IS_P2ALIGNED(a, ppgsz) ||
   3865 					    ((alloc_failed >> tszc) & 0x1)) {
   3866 						break;
   3867 					}
   3868 					SEGVN_VMSTAT_FLTVNPAGES(4);
   3869 					szc = tszc;
   3870 					pgsz = ppgsz;
   3871 					pages = btop(pgsz);
   3872 					e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
   3873 					lpgeaddr = (caddr_t)e;
   3874 				}
   3875 			}
   3876 
   3877 		again:
   3878 			if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
   3879 				ASSERT(IS_P2ALIGNED(aindx, maxpages));
   3880 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   3881 				anon_array_enter(amp, aindx, &an_cookie);
   3882 				if (anon_get_ptr(amp->ahp, aindx) != NULL) {
   3883 					SEGVN_VMSTAT_FLTVNPAGES(5);
   3884 					ASSERT(anon_pages(amp->ahp, aindx,
   3885 					    maxpages) == maxpages);
   3886 					anon_array_exit(&an_cookie);
   3887 					ANON_LOCK_EXIT(&amp->a_rwlock);
   3888 					err = segvn_fault_anonpages(hat, seg,
   3889 					    a, a + maxpgsz, type, rw,
   3890 					    MAX(a, addr),
   3891 					    MIN(a + maxpgsz, eaddr), brkcow);
   3892 					if (err != 0) {
   3893 						SEGVN_VMSTAT_FLTVNPAGES(6);
   3894 						goto out;
   3895 					}
   3896 					if (szc < seg->s_szc) {
   3897 						szc = seg->s_szc;
   3898 						pgsz = maxpgsz;
   3899 						pages = maxpages;
   3900 						lpgeaddr = maxlpgeaddr;
   3901 					}
   3902 					goto next;
   3903 				} else {
   3904 					ASSERT(anon_pages(amp->ahp, aindx,
   3905 					    maxpages) == 0);
   3906 					SEGVN_VMSTAT_FLTVNPAGES(7);
   3907 					anon_array_exit(&an_cookie);
   3908 					ANON_LOCK_EXIT(&amp->a_rwlock);
   3909 				}
   3910 			}
   3911 			ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
   3912 			ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz));
   3913 
   3914 			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
   3915 				ASSERT(vpage != NULL);
   3916 				prot = VPP_PROT(vpage);
   3917 				ASSERT(sameprot(seg, a, maxpgsz));
   3918 				if ((prot & protchk) == 0) {
   3919 					SEGVN_VMSTAT_FLTVNPAGES(8);
   3920 					err = FC_PROT;
   3921 					goto out;
   3922 				}
   3923 			}
   3924 			if (type == F_SOFTLOCK) {
   3925 				atomic_add_long((ulong_t *)&svd->softlockcnt,
   3926 				    pages);
   3927 			}
   3928 
   3929 			pplist = NULL;
   3930 			physcontig = 0;
   3931 			ppa[0] = NULL;
   3932 			if (!brkcow && !tron && szc &&
   3933 			    !page_exists_physcontig(vp, off, szc,
   3934 			    segtype == MAP_PRIVATE ? ppa : NULL)) {
   3935 				SEGVN_VMSTAT_FLTVNPAGES(9);
   3936 				if (page_alloc_pages(vp, seg, a, &pplist, NULL,
   3937 				    szc, 0, 0) && type != F_SOFTLOCK) {
   3938 					SEGVN_VMSTAT_FLTVNPAGES(10);
   3939 					pszc = 0;
   3940 					ierr = -1;
   3941 					alloc_failed |= (1 << szc);
   3942 					break;
   3943 				}
   3944 				if (pplist != NULL &&
   3945 				    vp->v_mpssdata == SEGVN_PAGEIO) {
   3946 					int downsize;
   3947 					SEGVN_VMSTAT_FLTVNPAGES(11);
   3948 					physcontig = segvn_fill_vp_pages(svd,
   3949 					    vp, off, szc, ppa, &pplist,
   3950 					    &pszc, &downsize);
   3951 					ASSERT(!physcontig || pplist == NULL);
   3952 					if (!physcontig && downsize &&
   3953 					    type != F_SOFTLOCK) {
   3954 						ASSERT(pplist == NULL);
   3955 						SEGVN_VMSTAT_FLTVNPAGES(12);
   3956 						ierr = -1;
   3957 						break;
   3958 					}
   3959 					ASSERT(!physcontig ||
   3960 					    segtype == MAP_PRIVATE ||
   3961 					    ppa[0] == NULL);
   3962 					if (physcontig && ppa[0] == NULL) {
   3963 						physcontig = 0;
   3964 					}
   3965 				}
   3966 			} else if (!brkcow && !tron && szc && ppa[0] != NULL) {
   3967 				SEGVN_VMSTAT_FLTVNPAGES(13);
   3968 				ASSERT(segtype == MAP_PRIVATE);
   3969 				physcontig = 1;
   3970 			}
   3971 
   3972 			if (!physcontig) {
   3973 				SEGVN_VMSTAT_FLTVNPAGES(14);
   3974 				ppa[0] = NULL;
   3975 				ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz,
   3976 				    &vpprot, ppa, pgsz, seg, a, arw,
   3977 				    svd->cred, NULL);
   3978 #ifdef DEBUG
   3979 				if (ierr == 0) {
   3980 					for (i = 0; i < pages; i++) {
   3981 						ASSERT(PAGE_LOCKED(ppa[i]));
   3982 						ASSERT(!PP_ISFREE(ppa[i]));
   3983 						ASSERT(ppa[i]->p_vnode == vp);
   3984 						ASSERT(ppa[i]->p_offset ==
   3985 						    off + (i << PAGESHIFT));
   3986 					}
   3987 				}
   3988 #endif /* DEBUG */
   3989 				if (segtype == MAP_PRIVATE) {
   3990 					SEGVN_VMSTAT_FLTVNPAGES(15);
   3991 					vpprot &= ~PROT_WRITE;
   3992 				}
   3993 			} else {
   3994 				ASSERT(segtype == MAP_PRIVATE);
   3995 				SEGVN_VMSTAT_FLTVNPAGES(16);
   3996 				vpprot = PROT_ALL & ~PROT_WRITE;
   3997 				ierr = 0;
   3998 			}
   3999 
   4000 			if (ierr != 0) {
   4001 				SEGVN_VMSTAT_FLTVNPAGES(17);
   4002 				if (pplist != NULL) {
   4003 					SEGVN_VMSTAT_FLTVNPAGES(18);
   4004 					page_free_replacement_page(pplist);
   4005 					page_create_putback(pages);
   4006 				}
   4007 				SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
   4008 				if (a + pgsz <= eaddr) {
   4009 					SEGVN_VMSTAT_FLTVNPAGES(19);
   4010 					err = FC_MAKE_ERR(ierr);
   4011 					goto out;
   4012 				}
   4013 				va.va_mask = AT_SIZE;
   4014 				if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) {
   4015 					SEGVN_VMSTAT_FLTVNPAGES(20);
   4016 					err = FC_MAKE_ERR(EIO);
   4017 					goto out;
   4018 				}
   4019 				if (btopr(va.va_size) >= btopr(off + pgsz)) {
   4020 					SEGVN_VMSTAT_FLTVNPAGES(21);
   4021 					err = FC_MAKE_ERR(ierr);
   4022 					goto out;
   4023 				}
   4024 				if (btopr(va.va_size) <
   4025 				    btopr(off + (eaddr - a))) {
   4026 					SEGVN_VMSTAT_FLTVNPAGES(22);
   4027 					err = FC_MAKE_ERR(ierr);
   4028 					goto out;
   4029 				}
   4030 				if (brkcow || tron || type == F_SOFTLOCK) {
   4031 					/* can't reduce map area */
   4032 					SEGVN_VMSTAT_FLTVNPAGES(23);
   4033 					vop_size_err = 1;
   4034 					goto out;
   4035 				}
   4036 				SEGVN_VMSTAT_FLTVNPAGES(24);
   4037 				ASSERT(szc != 0);
   4038 				pszc = 0;
   4039 				ierr = -1;
   4040 				break;
   4041 			}
   4042 
   4043 			if (amp != NULL) {
   4044 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   4045 				anon_array_enter(amp, aindx, &an_cookie);
   4046 			}
   4047 			if (amp != NULL &&
   4048 			    anon_get_ptr(amp->ahp, aindx) != NULL) {
   4049 				ulong_t taindx = P2ALIGN(aindx, maxpages);
   4050 
   4051 				SEGVN_VMSTAT_FLTVNPAGES(25);
   4052 				ASSERT(anon_pages(amp->ahp, taindx,
   4053 				    maxpages) == maxpages);
   4054 				for (i = 0; i < pages; i++) {
   4055 					page_unlock(ppa[i]);
   4056 				}
   4057 				anon_array_exit(&an_cookie);
   4058 				ANON_LOCK_EXIT(&amp->a_rwlock);
   4059 				if (pplist != NULL) {
   4060 					page_free_replacement_page(pplist);
   4061 					page_create_putback(pages);
   4062 				}
   4063 				SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
   4064 				if (szc < seg->s_szc) {
   4065 					SEGVN_VMSTAT_FLTVNPAGES(26);
   4066 					/*
   4067 					 * For private segments SOFTLOCK
   4068 					 * either always breaks cow (any rw
   4069 					 * type except S_READ_NOCOW) or
   4070 					 * address space is locked as writer
   4071 					 * (S_READ_NOCOW case) and anon slots
   4072 					 * can't show up on second check.
   4073 					 * Therefore if we are here for
   4074 					 * SOFTLOCK case it must be a cow
   4075 					 * break but cow break never reduces
   4076 					 * szc. text replication (tron) in
   4077 					 * this case works as cow break.
   4078 					 * Thus the assert below.
   4079 					 */
   4080 					ASSERT(!brkcow && !tron &&
   4081 					    type != F_SOFTLOCK);
   4082 					pszc = seg->s_szc;
   4083 					ierr = -2;
   4084 					break;
   4085 				}
   4086 				ASSERT(IS_P2ALIGNED(a, maxpgsz));
   4087 				goto again;
   4088 			}
   4089 #ifdef DEBUG
   4090 			if (amp != NULL) {
   4091 				ulong_t taindx = P2ALIGN(aindx, maxpages);
   4092 				ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
   4093 			}
   4094 #endif /* DEBUG */
   4095 
   4096 			if (brkcow || tron) {
   4097 				ASSERT(amp != NULL);
   4098 				ASSERT(pplist == NULL);
   4099 				ASSERT(szc == seg->s_szc);
   4100 				ASSERT(IS_P2ALIGNED(a, maxpgsz));
   4101 				ASSERT(IS_P2ALIGNED(aindx, maxpages));
   4102 				SEGVN_VMSTAT_FLTVNPAGES(27);
   4103 				ierr = anon_map_privatepages(amp, aindx, szc,
   4104 				    seg, a, prot, ppa, vpage, segvn_anypgsz,
   4105 				    tron ? PG_LOCAL : 0, svd->cred);
   4106 				if (ierr != 0) {
   4107 					SEGVN_VMSTAT_FLTVNPAGES(28);
   4108 					anon_array_exit(&an_cookie);
   4109 					ANON_LOCK_EXIT(&amp->a_rwlock);
   4110 					SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
   4111 					err = FC_MAKE_ERR(ierr);
   4112 					goto out;
   4113 				}
   4114 
   4115 				ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
   4116 				/*
   4117 				 * p_szc can't be changed for locked
   4118 				 * swapfs pages.
   4119 				 */
   4120 				ASSERT(svd->rcookie ==
   4121 				    HAT_INVALID_REGION_COOKIE);
   4122 				hat_memload_array(hat, a, pgsz, ppa, prot,
   4123 				    hat_flag);
   4124 
   4125 				if (!(hat_flag & HAT_LOAD_LOCK)) {
   4126 					SEGVN_VMSTAT_FLTVNPAGES(29);
   4127 					for (i = 0; i < pages; i++) {
   4128 						page_unlock(ppa[i]);
   4129 					}
   4130 				}
   4131 				anon_array_exit(&an_cookie);
   4132 				ANON_LOCK_EXIT(&amp->a_rwlock);
   4133 				goto next;
   4134 			}
   4135 
   4136 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
   4137 			    (!svd->pageprot && svd->prot == (prot & vpprot)));
   4138 
   4139 			pfn = page_pptonum(ppa[0]);
   4140 			/*
   4141 			 * hat_page_demote() needs an SE_EXCL lock on one of
   4142 			 * constituent page_t's and it decreases root's p_szc
   4143 			 * last. This means if root's p_szc is equal szc and
   4144 			 * all its constituent pages are locked
   4145 			 * hat_page_demote() that could have changed p_szc to
   4146 			 * szc is already done and no new have page_demote()
   4147 			 * can start for this large page.
   4148 			 */
   4149 
   4150 			/*
   4151 			 * we need to make sure same mapping size is used for
   4152 			 * the same address range if there's a possibility the
   4153 			 * adddress is already mapped because hat layer panics
   4154 			 * when translation is loaded for the range already
   4155 			 * mapped with a different page size.  We achieve it
   4156 			 * by always using largest page size possible subject
   4157 			 * to the constraints of page size, segment page size
   4158 			 * and page alignment.  Since mappings are invalidated
   4159 			 * when those constraints change and make it
   4160 			 * impossible to use previously used mapping size no
   4161 			 * mapping size conflicts should happen.
   4162 			 */
   4163 
   4164 		chkszc:
   4165 			if ((pszc = ppa[0]->p_szc) == szc &&
   4166 			    IS_P2ALIGNED(pfn, pages)) {
   4167 
   4168 				SEGVN_VMSTAT_FLTVNPAGES(30);
   4169 #ifdef DEBUG
   4170 				for (i = 0; i < pages; i++) {
   4171 					ASSERT(PAGE_LOCKED(ppa[i]));
   4172 					ASSERT(!PP_ISFREE(ppa[i]));
   4173 					ASSERT(page_pptonum(ppa[i]) ==
   4174 					    pfn + i);
   4175 					ASSERT(ppa[i]->p_szc == szc);
   4176 					ASSERT(ppa[i]->p_vnode == vp);
   4177 					ASSERT(ppa[i]->p_offset ==
   4178 					    off + (i << PAGESHIFT));
   4179 				}
   4180 #endif /* DEBUG */
   4181 				/*
   4182 				 * All pages are of szc we need and they are
   4183 				 * all locked so they can't change szc. load
   4184 				 * translations.
   4185 				 *
   4186 				 * if page got promoted since last check
   4187 				 * we don't need pplist.
   4188 				 */
   4189 				if (pplist != NULL) {
   4190 					page_free_replacement_page(pplist);
   4191 					page_create_putback(pages);
   4192 				}
   4193 				if (PP_ISMIGRATE(ppa[0])) {
   4194 					page_migrate(seg, a, ppa, pages);
   4195 				}
   4196 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
   4197 				    prot, vpprot);
   4198 				if (!xhat) {
   4199 					hat_memload_array_region(hat, a, pgsz,
   4200 					    ppa, prot & vpprot, hat_flag,
   4201 					    svd->rcookie);
   4202 				} else {
   4203 					/*
   4204 					 * avoid large xhat mappings to FS
   4205 					 * pages so that hat_page_demote()
   4206 					 * doesn't need to check for xhat
   4207 					 * large mappings.
   4208 					 * Don't use regions with xhats.
   4209 					 */
   4210 					for (i = 0; i < pages; i++) {
   4211 						hat_memload(hat,
   4212 						    a + (i << PAGESHIFT),
   4213 						    ppa[i], prot & vpprot,
   4214 						    hat_flag);
   4215 					}
   4216 				}
   4217 
   4218 				if (!(hat_flag & HAT_LOAD_LOCK)) {
   4219 					for (i = 0; i < pages; i++) {
   4220 						page_unlock(ppa[i]);
   4221 					}
   4222 				}
   4223 				if (amp != NULL) {
   4224 					anon_array_exit(&an_cookie);
   4225 					ANON_LOCK_EXIT(&amp->a_rwlock);
   4226 				}
   4227 				goto next;
   4228 			}
   4229 
   4230 			/*
   4231 			 * See if upsize is possible.
   4232 			 */
   4233 			if (pszc > szc && szc < seg->s_szc &&
   4234 			    (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
   4235 				pgcnt_t aphase;
   4236 				uint_t pszc1 = MIN(pszc, seg->s_szc);
   4237 				ppgsz = page_get_pagesize(pszc1);
   4238 				ppages = btop(ppgsz);
   4239 				aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
   4240 
   4241 				ASSERT(type != F_SOFTLOCK);
   4242 
   4243 				SEGVN_VMSTAT_FLTVNPAGES(31);
   4244 				if (aphase != P2PHASE(pfn, ppages)) {
   4245 					segvn_faultvnmpss_align_err4++;
   4246 				} else {
   4247 					SEGVN_VMSTAT_FLTVNPAGES(32);
   4248 					if (pplist != NULL) {
   4249 						page_t *pl = pplist;
   4250 						page_free_replacement_page(pl);
   4251 						page_create_putback(pages);
   4252 					}
   4253 					for (i = 0; i < pages; i++) {
   4254 						page_unlock(ppa[i]);
   4255 					}
   4256 					if (amp != NULL) {
   4257 						anon_array_exit(&an_cookie);
   4258 						ANON_LOCK_EXIT(&amp->a_rwlock);
   4259 					}
   4260 					pszc = pszc1;
   4261 					ierr = -2;
   4262 					break;
   4263 				}
   4264 			}
   4265 
   4266 			/*
   4267 			 * check if we should use smallest mapping size.
   4268 			 */
   4269 			upgrdfail = 0;
   4270 			if (szc == 0 || xhat ||
   4271 			    (pszc >= szc &&
   4272 			    !IS_P2ALIGNED(pfn, pages)) ||
   4273 			    (pszc < szc &&
   4274 			    !segvn_full_szcpages(ppa, szc, &upgrdfail,
   4275 			    &pszc))) {
   4276 
   4277 				if (upgrdfail && type != F_SOFTLOCK) {
   4278 					/*
   4279 					 * segvn_full_szcpages failed to lock
   4280 					 * all pages EXCL. Size down.
   4281 					 */
   4282 					ASSERT(pszc < szc);
   4283 
   4284 					SEGVN_VMSTAT_FLTVNPAGES(33);
   4285 
   4286 					if (pplist != NULL) {
   4287 						page_t *pl = pplist;
   4288 						page_free_replacement_page(pl);
   4289 						page_create_putback(pages);
   4290 					}
   4291 
   4292 					for (i = 0; i < pages; i++) {
   4293 						page_unlock(ppa[i]);
   4294 					}
   4295 					if (amp != NULL) {
   4296 						anon_array_exit(&an_cookie);
   4297 						ANON_LOCK_EXIT(&amp->a_rwlock);
   4298 					}
   4299 					ierr = -1;
   4300 					break;
   4301 				}
   4302 				if (szc != 0 && !xhat && !upgrdfail) {
   4303 					segvn_faultvnmpss_align_err5++;
   4304 				}
   4305 				SEGVN_VMSTAT_FLTVNPAGES(34);
   4306 				if (pplist != NULL) {
   4307 					page_free_replacement_page(pplist);
   4308 					page_create_putback(pages);
   4309 				}
   4310 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
   4311 				    prot, vpprot);
   4312 				if (upgrdfail && segvn_anypgsz_vnode) {
   4313 					/* SOFTLOCK case */
   4314 					hat_memload_array_region(hat, a, pgsz,
   4315 					    ppa, prot & vpprot, hat_flag,
   4316 					    svd->rcookie);
   4317 				} else {
   4318 					for (i = 0; i < pages; i++) {
   4319 						hat_memload_region(hat,
   4320 						    a + (i << PAGESHIFT),
   4321 						    ppa[i], prot & vpprot,
   4322 						    hat_flag, svd->rcookie);
   4323 					}
   4324 				}
   4325 				if (!(hat_flag & HAT_LOAD_LOCK)) {
   4326 					for (i = 0; i < pages; i++) {
   4327 						page_unlock(ppa[i]);
   4328 					}
   4329 				}
   4330 				if (amp != NULL) {
   4331 					anon_array_exit(&an_cookie);
   4332 					ANON_LOCK_EXIT(&amp->a_rwlock);
   4333 				}
   4334 				goto next;
   4335 			}
   4336 
   4337 			if (pszc == szc) {
   4338 				/*
   4339 				 * segvn_full_szcpages() upgraded pages szc.
   4340 				 */
   4341 				ASSERT(pszc == ppa[0]->p_szc);
   4342 				ASSERT(IS_P2ALIGNED(pfn, pages));
   4343 				goto chkszc;
   4344 			}
   4345 
   4346 			if (pszc > szc) {
   4347 				kmutex_t *szcmtx;
   4348 				SEGVN_VMSTAT_FLTVNPAGES(35);
   4349 				/*
   4350 				 * p_szc of ppa[0] can change since we haven't
   4351 				 * locked all constituent pages. Call
   4352 				 * page_lock_szc() to prevent szc changes.
   4353 				 * This should be a rare case that happens when
   4354 				 * multiple segments use a different page size
   4355 				 * to map the same file offsets.
   4356 				 */
   4357 				szcmtx = page_szc_lock(ppa[0]);
   4358 				pszc = ppa[0]->p_szc;
   4359 				ASSERT(szcmtx != NULL || pszc == 0);
   4360 				ASSERT(ppa[0]->p_szc <= pszc);
   4361 				if (pszc <= szc) {
   4362 					SEGVN_VMSTAT_FLTVNPAGES(36);
   4363 					if (szcmtx != NULL) {
   4364 						mutex_exit(szcmtx);
   4365 					}
   4366 					goto chkszc;
   4367 				}
   4368 				if (pplist != NULL) {
   4369 					/*
   4370 					 * page got promoted since last check.
   4371 					 * we don't need preaalocated large
   4372 					 * page.
   4373 					 */
   4374 					SEGVN_VMSTAT_FLTVNPAGES(37);
   4375 					page_free_replacement_page(pplist);
   4376 					page_create_putback(pages);
   4377 				}
   4378 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
   4379 				    prot, vpprot);
   4380 				hat_memload_array_region(hat, a, pgsz, ppa,
   4381 				    prot & vpprot, hat_flag, svd->rcookie);
   4382 				mutex_exit(szcmtx);
   4383 				if (!(hat_flag & HAT_LOAD_LOCK)) {
   4384 					for (i = 0; i < pages; i++) {
   4385 						page_unlock(ppa[i]);
   4386 					}
   4387 				}
   4388 				if (amp != NULL) {
   4389 					anon_array_exit(&an_cookie);
   4390 					ANON_LOCK_EXIT(&amp->a_rwlock);
   4391 				}
   4392 				goto next;
   4393 			}
   4394 
   4395 			/*
   4396 			 * if page got demoted since last check
   4397 			 * we could have not allocated larger page.
   4398 			 * allocate now.
   4399 			 */
   4400 			if (pplist == NULL &&
   4401 			    page_alloc_pages(vp, seg, a, &pplist, NULL,
   4402 			    szc, 0, 0) && type != F_SOFTLOCK) {
   4403 				SEGVN_VMSTAT_FLTVNPAGES(38);
   4404 				for (i = 0; i < pages; i++) {
   4405 					page_unlock(ppa[i]);
   4406 				}
   4407 				if (amp != NULL) {
   4408 					anon_array_exit(&an_cookie);
   4409 					ANON_LOCK_EXIT(&amp->a_rwlock);
   4410 				}
   4411 				ierr = -1;
   4412 				alloc_failed |= (1 << szc);
   4413 				break;
   4414 			}
   4415 
   4416 			SEGVN_VMSTAT_FLTVNPAGES(39);
   4417 
   4418 			if (pplist != NULL) {
   4419 				segvn_relocate_pages(ppa, pplist);
   4420 #ifdef DEBUG
   4421 			} else {
   4422 				ASSERT(type == F_SOFTLOCK);
   4423 				SEGVN_VMSTAT_FLTVNPAGES(40);
   4424 #endif /* DEBUG */
   4425 			}
   4426 
   4427 			SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
   4428 
   4429 			if (pplist == NULL && segvn_anypgsz_vnode == 0) {
   4430 				ASSERT(type == F_SOFTLOCK);
   4431 				for (i = 0; i < pages; i++) {
   4432 					ASSERT(ppa[i]->p_szc < szc);
   4433 					hat_memload_region(hat,
   4434 					    a + (i << PAGESHIFT),
   4435 					    ppa[i], prot & vpprot, hat_flag,
   4436 					    svd->rcookie);
   4437 				}
   4438 			} else {
   4439 				ASSERT(pplist != NULL || type == F_SOFTLOCK);
   4440 				hat_memload_array_region(hat, a, pgsz, ppa,
   4441 				    prot & vpprot, hat_flag, svd->rcookie);
   4442 			}
   4443 			if (!(hat_flag & HAT_LOAD_LOCK)) {
   4444 				for (i = 0; i < pages; i++) {
   4445 					ASSERT(PAGE_SHARED(ppa[i]));
   4446 					page_unlock(ppa[i]);
   4447 				}
   4448 			}
   4449 			if (amp != NULL) {
   4450 				anon_array_exit(&an_cookie);
   4451 				ANON_LOCK_EXIT(&amp->a_rwlock);
   4452 			}
   4453 
   4454 		next:
   4455 			if (vpage != NULL) {
   4456 				vpage += pages;
   4457 			}
   4458 			adjszc_chk = 1;
   4459 		}
   4460 		if (a == lpgeaddr)
   4461 			break;
   4462 		ASSERT(a < lpgeaddr);
   4463 
   4464 		ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
   4465 
   4466 		/*
   4467 		 * ierr == -1 means we failed to map with a large page.
   4468 		 * (either due to allocation/relocation failures or
   4469 		 * misalignment with other mappings to this file.
   4470 		 *
   4471 		 * ierr == -2 means some other thread allocated a large page
   4472 		 * after we gave up tp map with a large page.  retry with
   4473 		 * larger mapping.
   4474 		 */
   4475 		ASSERT(ierr == -1 || ierr == -2);
   4476 		ASSERT(ierr == -2 || szc != 0);
   4477 		ASSERT(ierr == -1 || szc < seg->s_szc);
   4478 		if (ierr == -2) {
   4479 			SEGVN_VMSTAT_FLTVNPAGES(41);
   4480 			ASSERT(pszc > szc && pszc <= seg->s_szc);
   4481 			szc = pszc;
   4482 		} else if (segvn_anypgsz_vnode) {
   4483 			SEGVN_VMSTAT_FLTVNPAGES(42);
   4484 			szc--;
   4485 		} else {
   4486 			SEGVN_VMSTAT_FLTVNPAGES(43);
   4487 			ASSERT(pszc < szc);
   4488 			/*
   4489 			 * other process created pszc large page.
   4490 			 * but we still have to drop to 0 szc.
   4491 			 */
   4492 			szc = 0;
   4493 		}
   4494 
   4495 		pgsz = page_get_pagesize(szc);
   4496 		pages = btop(pgsz);
   4497 		if (ierr == -2) {
   4498 			/*
   4499 			 * Size up case. Note lpgaddr may only be needed for
   4500 			 * softlock case so we don't adjust it here.
   4501 			 */
   4502 			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
   4503 			ASSERT(a >= lpgaddr);
   4504 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
   4505 			off = svd->offset + (uintptr_t)(a - seg->s_base);
   4506 			aindx = svd->anon_index + seg_page(seg, a);
   4507 			vpage = (svd->vpage != NULL) ?
   4508 			    &svd->vpage[seg_page(seg, a)] : NULL;
   4509 		} else {
   4510 			/*
   4511 			 * Size down case. Note lpgaddr may only be needed for
   4512 			 * softlock case so we don't adjust it here.
   4513 			 */
   4514 			ASSERT(IS_P2ALIGNED(a, pgsz));
   4515 			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
   4516 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
   4517 			ASSERT(a < lpgeaddr);
   4518 			if (a < addr) {
   4519 				SEGVN_VMSTAT_FLTVNPAGES(44);
   4520 				/*
   4521 				 * The beginning of the large page region can
   4522 				 * be pulled to the right to make a smaller
   4523 				 * region. We haven't yet faulted a single
   4524 				 * page.
   4525 				 */
   4526 				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
   4527 				ASSERT(a >= lpgaddr);
   4528 				off = svd->offset +
   4529 				    (uintptr_t)(a - seg->s_base);
   4530 				aindx = svd->anon_index + seg_page(seg, a);
   4531 				vpage = (svd->vpage != NULL) ?
   4532 				    &svd->vpage[seg_page(seg, a)] : NULL;
   4533 			}
   4534 		}
   4535 	}
   4536 out:
   4537 	kmem_free(ppa, ppasize);
   4538 	if (!err && !vop_size_err) {
   4539 		SEGVN_VMSTAT_FLTVNPAGES(45);
   4540 		return (0);
   4541 	}
   4542 	if (type == F_SOFTLOCK && a > lpgaddr) {
   4543 		SEGVN_VMSTAT_FLTVNPAGES(46);
   4544 		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
   4545 	}
   4546 	if (!vop_size_err) {
   4547 		SEGVN_VMSTAT_FLTVNPAGES(47);
   4548 		return (err);
   4549 	}
   4550 	ASSERT(brkcow || tron || type == F_SOFTLOCK);
   4551 	/*
   4552 	 * Large page end is mapped beyond the end of file and it's a cow
   4553 	 * fault (can be a text replication induced cow) or softlock so we can't
   4554 	 * reduce the map area.  For now just demote the segment. This should
   4555 	 * really only happen if the end of the file changed after the mapping
   4556 	 * was established since when large page segments are created we make
   4557 	 * sure they don't extend beyond the end of the file.
   4558 	 */
   4559 	SEGVN_VMSTAT_FLTVNPAGES(48);
   4560 
   4561 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4562 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   4563 	err = 0;
   4564 	if (seg->s_szc != 0) {
   4565 		segvn_fltvnpages_clrszc_cnt++;
   4566 		ASSERT(svd->softlockcnt == 0);
   4567 		err = segvn_clrszc(seg);
   4568 		if (err != 0) {
   4569 			segvn_fltvnpages_clrszc_err++;
   4570 		}
   4571 	}
   4572 	ASSERT(err || seg->s_szc == 0);
   4573 	SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock);
   4574 	/* segvn_fault will do its job as if szc had been zero to begin with */
   4575 	return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err));
   4576 }
   4577 
   4578 /*
   4579  * This routine will attempt to fault in the large page region [lpgaddr, lpgeaddr) with the segment's page size;
   4580  * it will use smaller pages if that fails.  Returns 0, FC_PROT, or FC_MAKE_ERR() of an anon_map_getpages() error.
   4581  * It should only be called for pure anonymous segments.
   4582  */
   4583 static faultcode_t
   4584 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
   4585     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
   4586     caddr_t eaddr, int brkcow)
   4587 {
   4588 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   4589 	struct anon_map *amp = svd->amp;
   4590 	uchar_t segtype = svd->type;
   4591 	uint_t szc = seg->s_szc;
   4592 	size_t pgsz = page_get_pagesize(szc);	/* current mapping size; resized by the loop below */
   4593 	size_t maxpgsz = pgsz;	/* page size for the initial seg->s_szc; never modified */
   4594 	pgcnt_t pages = btop(pgsz);
   4595 	uint_t ppaszc = szc;	/* initial szc; indexes the cache ppa is allocated from and freed to */
   4596 	caddr_t a = lpgaddr;
   4597 	ulong_t aindx = svd->anon_index + seg_page(seg, a);
   4598 	struct vpage *vpage = (svd->vpage != NULL) ?
   4599 	    &svd->vpage[seg_page(seg, a)] : NULL;
   4600 	page_t **ppa;
   4601 	uint_t	ppa_szc;
   4602 	faultcode_t err;
   4603 	int ierr;
   4604 	uint_t protchk, prot, vpprot;
   4605 	ulong_t i;
   4606 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
   4607 	anon_sync_obj_t cookie;
   4608 	int adjszc_chk;
   4609 	int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
   4610 
   4611 	ASSERT(szc != 0);
   4612 	ASSERT(amp != NULL);
   4613 	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
   4614 	ASSERT(!(svd->flags & MAP_NORESERVE));
   4615 	ASSERT(type != F_SOFTUNLOCK);
   4616 	ASSERT(IS_P2ALIGNED(a, maxpgsz));
   4617 	ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
   4618 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
   4619 
   4620 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
   4621 
   4622 	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
   4623 	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);
   4624 
   4625 	if (svd->flags & MAP_TEXT) {
   4626 		hat_flag |= HAT_LOAD_TEXT;
   4627 	}
   4628 
   4629 	if (svd->pageprot) {	/* per-page protections: pick the bit to test against each vpage below */
   4630 		switch (rw) {
   4631 		case S_READ:
   4632 			protchk = PROT_READ;
   4633 			break;
   4634 		case S_WRITE:
   4635 			protchk = PROT_WRITE;
   4636 			break;
   4637 		case S_EXEC:
   4638 			protchk = PROT_EXEC;
   4639 			break;
   4640 		case S_OTHER:
   4641 		default:
   4642 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
   4643 			break;
   4644 		}
   4645 		VM_STAT_ADD(segvnvmstats.fltanpages[2]);
   4646 	} else {
   4647 		prot = svd->prot;
   4648 		/* caller has already done segment level protection check. */
   4649 	}
   4650 
   4651 	ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP);	/* freed on both exit paths */
   4652 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   4653 	for (;;) {	/* one pass per page size choice; left via break (done) or goto error */
   4654 		adjszc_chk = 0;
   4655 		for (; a < lpgeaddr; a += pgsz, aindx += pages) {
   4656 			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {	/* reload prot at each maxpgsz boundary */
   4657 				VM_STAT_ADD(segvnvmstats.fltanpages[3]);
   4658 				ASSERT(vpage != NULL);
   4659 				prot = VPP_PROT(vpage);
   4660 				ASSERT(sameprot(seg, a, maxpgsz));
   4661 				if ((prot & protchk) == 0) {
   4662 					err = FC_PROT;
   4663 					goto error;
   4664 				}
   4665 			}
   4666 			if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
   4667 			    pgsz < maxpgsz) {	/* mapped a chunk this pass and realigned: go back to the segment's page size */
   4668 				ASSERT(a > lpgaddr);
   4669 				szc = seg->s_szc;
   4670 				pgsz = maxpgsz;
   4671 				pages = btop(pgsz);
   4672 				ASSERT(IS_P2ALIGNED(aindx, pages));
   4673 				lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
   4674 				    pgsz);
   4675 			}
   4676 			if (type == F_SOFTLOCK) {	/* count the pages up front; undone below if getpages fails */
   4677 				atomic_add_long((ulong_t *)&svd->softlockcnt,
   4678 				    pages);
   4679 			}
   4680 			anon_array_enter(amp, aindx, &cookie);
   4681 			ppa_szc = (uint_t)-1;	/* sentinel; tested after the inner loop when resizing */
   4682 			ierr = anon_map_getpages(amp, aindx, szc, seg, a,
   4683 			    prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
   4684 			    segvn_anypgsz, pgflags, svd->cred);
   4685 			if (ierr != 0) {
   4686 				anon_array_exit(&cookie);
   4687 				VM_STAT_ADD(segvnvmstats.fltanpages[4]);
   4688 				if (type == F_SOFTLOCK) {
   4689 					atomic_add_long(
   4690 					    (ulong_t *)&svd->softlockcnt,
   4691 					    -pages);
   4692 				}
   4693 				if (ierr > 0) {	/* positive ierr is returned to the caller as a fault error */
   4694 					VM_STAT_ADD(segvnvmstats.fltanpages[6]);
   4695 					err = FC_MAKE_ERR(ierr);
   4696 					goto error;
   4697 				}
   4698 				break;	/* negative ierr: adjust the page size after the inner loop */
   4699 			}
   4700 
   4701 			ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
   4702 
   4703 			ASSERT(segtype == MAP_SHARED ||
   4704 			    ppa[0]->p_szc <= szc);
   4705 			ASSERT(segtype == MAP_PRIVATE ||
   4706 			    ppa[0]->p_szc >= szc);
   4707 
   4708 			/*
   4709 			 * Handle pages that have been marked for migration
   4710 			 */
   4711 			if (lgrp_optimizations())
   4712 				page_migrate(seg, a, ppa, pages);
   4713 
   4714 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   4715 
   4716 			if (segtype == MAP_SHARED) {	/* keep vpprot from masking PROT_WRITE out of prot below */
   4717 				vpprot |= PROT_WRITE;
   4718 			}
   4719 
   4720 			hat_memload_array(hat, a, pgsz, ppa,
   4721 			    prot & vpprot, hat_flag);
   4722 
   4723 			if (hat_flag & HAT_LOAD_LOCK) {	/* softlock: the pages are not unlocked here */
   4724 				VM_STAT_ADD(segvnvmstats.fltanpages[7]);
   4725 			} else {
   4726 				VM_STAT_ADD(segvnvmstats.fltanpages[8]);
   4727 				for (i = 0; i < pages; i++)
   4728 					page_unlock(ppa[i]);
   4729 			}
   4730 			if (vpage != NULL)
   4731 				vpage += pages;
   4732 
   4733 			anon_array_exit(&cookie);
   4734 			adjszc_chk = 1;
   4735 		}
   4736 		if (a == lpgeaddr)	/* inner loop ran to the end of the range: done */
   4737 			break;
   4738 		ASSERT(a < lpgeaddr);
   4739 		/*
   4740 		 * ierr == -1 means we failed to allocate a large page.
   4741 		 * so do a size down operation.
   4742 		 *
   4743 		 * ierr == -2 means some other process that privately shares
   4744 		 * pages with this process has allocated a larger page and we
   4745 		 * need to retry with larger pages. So do a size up
   4746 		 * operation. This relies on the fact that large pages are
   4747 		 * never partially shared i.e. if we share any constituent
   4748 		 * page of a large page with another process we must share the
   4749 		 * entire large page. Note this cannot happen for SOFTLOCK
   4750 		 * case, unless current address (a) is at the beginning of the
   4751 		 * next page size boundary because the other process couldn't
   4752 		 * have relocated locked pages.
   4753 		 */
   4754 		ASSERT(ierr == -1 || ierr == -2);
   4755 
   4756 		if (segvn_anypgsz) {
   4757 			ASSERT(ierr == -2 || szc != 0);
   4758 			ASSERT(ierr == -1 || szc < seg->s_szc);
   4759 			szc = (ierr == -1) ? szc - 1 : szc + 1;
   4760 		} else {
   4761 			/*
   4762 			 * For non COW faults and segvn_anypgsz == 0
   4763 			 * we need to be careful not to loop forever
   4764 			 * if existing page is found with szc other
   4765 			 * than 0 or seg->s_szc. This could be due
   4766 			 * to page relocations on behalf of DR or
   4767 			 * more likely large page creation. For this
   4768 			 * case simply re-size to existing page's szc
   4769 			 * if returned by anon_map_getpages().
   4770 			 */
   4771 			if (ppa_szc == (uint_t)-1) {	/* still the sentinel set before the call */
   4772 				szc = (ierr == -1) ? 0 : seg->s_szc;
   4773 			} else {
   4774 				ASSERT(ppa_szc <= seg->s_szc);
   4775 				ASSERT(ierr == -2 || ppa_szc < szc);
   4776 				ASSERT(ierr == -1 || ppa_szc > szc);
   4777 				szc = ppa_szc;
   4778 			}
   4779 		}
   4780 
   4781 		pgsz = page_get_pagesize(szc);
   4782 		pages = btop(pgsz);
   4783 		ASSERT(type != F_SOFTLOCK || ierr == -1 ||
   4784 		    (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
   4785 		if (type == F_SOFTLOCK) {
   4786 			/*
   4787 			 * For softlocks we cannot reduce the fault area
   4788 			 * (calculated based on the largest page size for this
   4789 			 * segment) for size down and a is already next
   4790 			 * page size aligned as asserted above for size
   4791 			 * ups. Therefore just continue in case of softlock.
   4792 			 */
   4793 			VM_STAT_ADD(segvnvmstats.fltanpages[9]);
   4794 			continue; /* keep lint happy */
   4795 		} else if (ierr == -2) {
   4796 
   4797 			/*
   4798 			 * Size up case. Note lpgaddr may only be needed for
   4799 			 * softlock case so we don't adjust it here.
   4800 			 */
   4801 			VM_STAT_ADD(segvnvmstats.fltanpages[10]);
   4802 			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
   4803 			ASSERT(a >= lpgaddr);
   4804 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
   4805 			aindx = svd->anon_index + seg_page(seg, a);
   4806 			vpage = (svd->vpage != NULL) ?
   4807 			    &svd->vpage[seg_page(seg, a)] : NULL;
   4808 		} else {
   4809 			/*
   4810 			 * Size down case. Note lpgaddr may only be needed for
   4811 			 * softlock case so we don't adjust it here.
   4812 			 */
   4813 			VM_STAT_ADD(segvnvmstats.fltanpages[11]);
   4814 			ASSERT(IS_P2ALIGNED(a, pgsz));
   4815 			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
   4816 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
   4817 			ASSERT(a < lpgeaddr);
   4818 			if (a < addr) {
   4819 				/*
   4820 				 * The beginning of the large page region can
   4821 				 * be pulled to the right to make a smaller
   4822 				 * region. We haven't yet faulted a single
   4823 				 * page.
   4824 				 */
   4825 				VM_STAT_ADD(segvnvmstats.fltanpages[12]);
   4826 				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
   4827 				ASSERT(a >= lpgaddr);
   4828 				aindx = svd->anon_index + seg_page(seg, a);
   4829 				vpage = (svd->vpage != NULL) ?
   4830 				    &svd->vpage[seg_page(seg, a)] : NULL;
   4831 			}
   4832 		}
   4833 	}
   4834 	VM_STAT_ADD(segvnvmstats.fltanpages[13]);
   4835 	ANON_LOCK_EXIT(&amp->a_rwlock);
   4836 	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
   4837 	return (0);
   4838 error:	/* failure exit: drop the anon lock, free ppa, and for softlock undo [lpgaddr, a) */
   4839 	VM_STAT_ADD(segvnvmstats.fltanpages[14]);
   4840 	ANON_LOCK_EXIT(&amp->a_rwlock);
   4841 	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
   4842 	if (type == F_SOFTLOCK && a > lpgaddr) {
   4843 		VM_STAT_ADD(segvnvmstats.fltanpages[15]);
   4844 		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
   4845 	}
   4846 	return (err);
   4847 }
   4848 
   4849 int fltadvice = 1;	/* nonzero lets segvn_fault() free behind pages for MADV_SEQUENTIAL (sequential access) advice */
   4850 
   4851 /*
   4852  * This routine is called via a machine specific fault handling routine.
   4853  * It is also called by software routines wishing to lock or unlock
   4854  * a range of addresses.
   4855  *
   4856  * Here is the basic algorithm:
   4857  *	If unlocking
   4858  *		Call segvn_softunlock
   4859  *		Return
   4860  *	endif
   4861  *	Checking and set up work
   4862  *	If we will need some non-anonymous pages
   4863  *		Call VOP_GETPAGE over the range of non-anonymous pages
   4864  *	endif
   4865  *	Loop over all addresses requested
   4866  *		Call segvn_faultpage passing in page list
   4867  *		    to load up translations and handle anonymous pages
   4868  *	endloop
   4869  *	Load up translation to any additional pages in page list not
   4870  *	    already handled that fit into this segment
   4871  */
   4872 static faultcode_t
   4873 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
   4874     enum fault_type type, enum seg_rw rw)
   4875 {
   4876 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   4877 	page_t **plp, **ppp, *pp;
   4878 	u_offset_t off;
   4879 	caddr_t a;
   4880 	struct vpage *vpage;
   4881 	uint_t vpprot, prot;
   4882 	int err;
   4883 	page_t *pl[PVN_GETPAGE_NUM + 1];
   4884 	size_t plsz, pl_alloc_sz;
   4885 	size_t page;
   4886 	ulong_t anon_index;
   4887 	struct anon_map *amp;
   4888 	int dogetpage = 0;
   4889 	caddr_t	lpgaddr, lpgeaddr;
   4890 	size_t pgsz;
   4891 	anon_sync_obj_t cookie;
   4892 	int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
   4893 
   4894 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   4895 	ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
   4896 
   4897 	/*
   4898 	 * First handle the easy stuff
   4899 	 */
   4900 	if (type == F_SOFTUNLOCK) {
   4901 		if (rw == S_READ_NOCOW) {
   4902 			rw = S_READ;
   4903 			ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   4904 		}
   4905 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   4906 		pgsz = (seg->s_szc == 0) ? PAGESIZE :
   4907 		    page_get_pagesize(seg->s_szc);
   4908 		VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
   4909 		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
   4910 		segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
   4911 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4912 		return (0);
   4913 	}
   4914 
   4915 	ASSERT(svd->tr_state == SEGVN_TR_OFF ||
   4916 	    !HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
   4917 	if (brkcow == 0) {
   4918 		if (svd->tr_state == SEGVN_TR_INIT) {
   4919 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   4920 			if (svd->tr_state == SEGVN_TR_INIT) {
   4921 				ASSERT(svd->vp != NULL && svd->amp == NULL);
   4922 				ASSERT(svd->flags & MAP_TEXT);
   4923 				ASSERT(svd->type == MAP_PRIVATE);
   4924 				segvn_textrepl(seg);
   4925 				ASSERT(svd->tr_state != SEGVN_TR_INIT);
   4926 				ASSERT(svd->tr_state != SEGVN_TR_ON ||
   4927 				    svd->amp != NULL);
   4928 			}
   4929 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4930 		}
   4931 	} else if (svd->tr_state != SEGVN_TR_OFF) {
   4932 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   4933 
   4934 		if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) {
   4935 			ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
   4936 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4937 			return (FC_PROT);
   4938 		}
   4939 
   4940 		if (svd->tr_state == SEGVN_TR_ON) {
   4941 			ASSERT(svd->vp != NULL && svd->amp != NULL);
   4942 			segvn_textunrepl(seg, 0);
   4943 			ASSERT(svd->amp == NULL &&
   4944 			    svd->tr_state == SEGVN_TR_OFF);
   4945 		} else if (svd->tr_state != SEGVN_TR_OFF) {
   4946 			svd->tr_state = SEGVN_TR_OFF;
   4947 		}
   4948 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
   4949 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4950 	}
   4951 
   4952 top:
   4953 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   4954 
   4955 	/*
   4956 	 * If we have the same protections for the entire segment,
   4957 	 * insure that the access being attempted is legitimate.
   4958 	 */
   4959 
   4960 	if (svd->pageprot == 0) {
   4961 		uint_t protchk;
   4962 
   4963 		switch (rw) {
   4964 		case S_READ:
   4965 		case S_READ_NOCOW:
   4966 			protchk = PROT_READ;
   4967 			break;
   4968 		case S_WRITE:
   4969 			protchk = PROT_WRITE;
   4970 			break;
   4971 		case S_EXEC:
   4972 			protchk = PROT_EXEC;
   4973 			break;
   4974 		case S_OTHER:
   4975 		default:
   4976 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
   4977 			break;
   4978 		}
   4979 
   4980 		if ((svd->prot & protchk) == 0) {
   4981 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4982 			return (FC_PROT);	/* illegal access type */
   4983 		}
   4984 	}
   4985 
   4986 	if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   4987 		/* this must be SOFTLOCK S_READ fault */
   4988 		ASSERT(svd->amp == NULL);
   4989 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   4990 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   4991 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   4992 		if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   4993 			/*
   4994 			 * this must be the first ever non S_READ_NOCOW
   4995 			 * softlock for this segment.
   4996 			 */
   4997 			ASSERT(svd->softlockcnt == 0);
   4998 			hat_leave_region(seg->s_as->a_hat, svd->rcookie,
   4999 			    HAT_REGION_TEXT);
   5000 			svd->rcookie = HAT_INVALID_REGION_COOKIE;
   5001 		}
   5002 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5003 		goto top;
   5004 	}
   5005 
   5006 	/*
   5007 	 * We can't allow the long term use of softlocks for vmpss segments,
   5008 	 * because in some file truncation cases we should be able to demote
   5009 	 * the segment, which requires that there are no softlocks.  The
   5010 	 * only case where it's ok to allow a SOFTLOCK fault against a vmpss
   5011 	 * segment is S_READ_NOCOW, where the caller holds the address space
   5012 	 * locked as writer and calls softunlock before dropping the as lock.
   5013 	 * S_READ_NOCOW is used by /proc to read memory from another user.
   5014 	 *
   5015 	 * Another deadlock between SOFTLOCK and file truncation can happen
   5016 	 * because segvn_fault_vnodepages() calls the FS one pagesize at
   5017 	 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages()
   5018 	 * can cause a deadlock because the first set of page_t's remain
   5019 	 * locked SE_SHARED.  To avoid this, we demote segments on a first
   5020 	 * SOFTLOCK if they have a length greater than the segment's
   5021 	 * page size.
   5022 	 *
   5023 	 * So for now, we only avoid demoting a segment on a SOFTLOCK when
   5024 	 * the access type is S_READ_NOCOW and the fault length is less than
   5025 	 * or equal to the segment's page size. While this is quite restrictive,
   5026 	 * it should be the most common case of SOFTLOCK against a vmpss
   5027 	 * segment.
   5028 	 *
   5029 	 * For S_READ_NOCOW, it's safe not to do a copy on write because the
   5030 	 * caller makes sure no COW will be caused by another thread for a
   5031 	 * softlocked page.
   5032 	 */
   5033 	if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) {
   5034 		int demote = 0;
   5035 
   5036 		if (rw != S_READ_NOCOW) {
   5037 			demote = 1;
   5038 		}
   5039 		if (!demote && len > PAGESIZE) {
   5040 			pgsz = page_get_pagesize(seg->s_szc);
   5041 			CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr,
   5042 			    lpgeaddr);
   5043 			if (lpgeaddr - lpgaddr > pgsz) {
   5044 				demote = 1;
   5045 			}
   5046 		}
   5047 
   5048 		ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   5049 
   5050 		if (demote) {
   5051 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5052 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   5053 			if (seg->s_szc != 0) {
   5054 				segvn_vmpss_clrszc_cnt++;
   5055 				ASSERT(svd->softlockcnt == 0);
   5056 				err = segvn_clrszc(seg);
   5057 				if (err) {
   5058 					segvn_vmpss_clrszc_err++;
   5059 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5060 					return (FC_MAKE_ERR(err));
   5061 				}
   5062 			}
   5063 			ASSERT(seg->s_szc == 0);
   5064 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5065 			goto top;
   5066 		}
   5067 	}
   5068 
   5069 	/*
   5070 	 * Check to see if we need to allocate an anon_map structure.
   5071 	 */
   5072 	if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
   5073 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   5074 		/*
   5075 		 * Drop the "read" lock on the segment and acquire
   5076 		 * the "write" version since we have to allocate the
   5077 		 * anon_map.
   5078 		 */
   5079 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5080 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   5081 
   5082 		if (svd->amp == NULL) {
   5083 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
   5084 			svd->amp->a_szc = seg->s_szc;
   5085 		}
   5086 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5087 
   5088 		/*
   5089 		 * Start all over again since segment protections
   5090 		 * may have changed after we dropped the "read" lock.
   5091 		 */
   5092 		goto top;
   5093 	}
   5094 
   5095 	/*
   5096 	 * S_READ_NOCOW vs S_READ distinction was
   5097 	 * only needed for the code above. After
   5098 	 * that we treat it as S_READ.
   5099 	 */
   5100 	if (rw == S_READ_NOCOW) {
   5101 		ASSERT(type == F_SOFTLOCK);
   5102 		ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   5103 		rw = S_READ;
   5104 	}
   5105 
   5106 	amp = svd->amp;
   5107 
   5108 	/*
   5109 	 * MADV_SEQUENTIAL work is ignored for large page segments.
   5110 	 */
   5111 	if (seg->s_szc != 0) {
   5112 		pgsz = page_get_pagesize(seg->s_szc);
   5113 		ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
   5114 		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
   5115 		if (svd->vp == NULL) {
   5116 			err = segvn_fault_anonpages(hat, seg, lpgaddr,
   5117 			    lpgeaddr, type, rw, addr, addr + len, brkcow);
   5118 		} else {
   5119 			err = segvn_fault_vnodepages(hat, seg, lpgaddr,
   5120 			    lpgeaddr, type, rw, addr, addr + len, brkcow);
   5121 			if (err == IE_RETRY) {
   5122 				ASSERT(seg->s_szc == 0);
   5123 				ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
   5124 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5125 				goto top;
   5126 			}
   5127 		}
   5128 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5129 		return (err);
   5130 	}
   5131 
   5132 	page = seg_page(seg, addr);
   5133 	if (amp != NULL) {
   5134 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   5135 		anon_index = svd->anon_index + page;
   5136 
   5137 		if (type == F_PROT && rw == S_READ &&
   5138 		    svd->tr_state == SEGVN_TR_OFF &&
   5139 		    svd->type == MAP_PRIVATE && svd->pageprot == 0) {
   5140 			size_t index = anon_index;
   5141 			struct anon *ap;
   5142 
   5143 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   5144 			/*
   5145 			 * The fast path could apply to S_WRITE also, except
   5146 			 * that the protection fault could be caused by lazy
   5147 			 * tlb flush when ro->rw. In this case, the pte is
   5148 			 * RW already. But RO in the other cpu's tlb causes
   5149 			 * the fault. Since hat_chgprot won't do anything if
   5150 			 * pte doesn't change, we may end up faulting
   5151 			 * indefinitely until the RO tlb entry gets replaced.
   5152 			 */
   5153 			for (a = addr; a < addr + len; a += PAGESIZE, index++) {
   5154 				anon_array_enter(amp, index, &cookie);
   5155 				ap = anon_get_ptr(amp->ahp, index);
   5156 				anon_array_exit(&cookie);
   5157 				if ((ap == NULL) || (ap->an_refcnt != 1)) {
   5158 					ANON_LOCK_EXIT(&amp->a_rwlock);
   5159 					goto slow;
   5160 				}
   5161 			}
   5162 			hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
   5163 			ANON_LOCK_EXIT(&amp->a_rwlock);
   5164 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5165 			return (0);
   5166 		}
   5167 	}
   5168 slow:
   5169 
   5170 	if (svd->vpage == NULL)
   5171 		vpage = NULL;
   5172 	else
   5173 		vpage = &svd->vpage[page];
   5174 
   5175 	off = svd->offset + (uintptr_t)(addr - seg->s_base);
   5176 
   5177 	/*
   5178 	 * If MADV_SEQUENTIAL has been set for the particular page we
   5179 	 * are faulting on, free behind all pages in the segment and put
   5180 	 * them on the free list.
   5181 	 */
   5182 
   5183 	if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) {
   5184 		struct vpage *vpp;
   5185 		ulong_t fanon_index;
   5186 		size_t fpage;
   5187 		u_offset_t pgoff, fpgoff;
   5188 		struct vnode *fvp;
   5189 		struct anon *fap = NULL;
   5190 
   5191 		if (svd->advice == MADV_SEQUENTIAL ||
   5192 		    (svd->pageadvice &&
   5193 		    VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) {
   5194 			pgoff = off - PAGESIZE;
   5195 			fpage = page - 1;
   5196 			if (vpage != NULL)
   5197 				vpp = &svd->vpage[fpage];
   5198 			if (amp != NULL)
   5199 				fanon_index = svd->anon_index + fpage;
   5200 
   5201 			while (pgoff > svd->offset) {
   5202 				if (svd->advice != MADV_SEQUENTIAL &&
   5203 				    (!svd->pageadvice || (vpage &&
   5204 				    VPP_ADVICE(vpp) != MADV_SEQUENTIAL)))
   5205 					break;
   5206 
   5207 				/*
   5208 				 * If this is an anon page, we must find the
   5209 				 * correct <vp, offset> for it
   5210 				 */
   5211 				fap = NULL;
   5212 				if (amp != NULL) {
   5213 					ANON_LOCK_ENTER(&amp->a_rwlock,
   5214 					    RW_READER);
   5215 					anon_array_enter(amp, fanon_index,
   5216 					    &cookie);
   5217 					fap = anon_get_ptr(amp->ahp,
   5218 					    fanon_index);
   5219 					if (fap != NULL) {
   5220 						swap_xlate(fap, &fvp, &fpgoff);
   5221 					} else {
   5222 						fpgoff = pgoff;
   5223 						fvp = svd->vp;
   5224 					}
   5225 					anon_array_exit(&cookie);
   5226 					ANON_LOCK_EXIT(&amp->a_rwlock);
   5227 				} else {
   5228 					fpgoff = pgoff;
   5229 					fvp = svd->vp;
   5230 				}
   5231 				if (fvp == NULL)
   5232 					break;	/* XXX */
   5233 				/*
   5234 				 * Skip pages that are free or have an
   5235 				 * "exclusive" lock.
   5236 				 */
   5237 				pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED);
   5238 				if (pp == NULL)
   5239 					break;
   5240 				/*
   5241 				 * We don't need the page_struct_lock to test
   5242 				 * as this is only advisory; even if we
   5243 				 * acquire it someone might race in and lock
   5244 				 * the page after we unlock and before the
   5245 				 * PUTPAGE, then VOP_PUTPAGE will do nothing.
   5246 				 */
   5247 				if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
   5248 					/*
   5249 					 * Hold the vnode before releasing
   5250 					 * the page lock to prevent it from
   5251 					 * being freed and re-used by some
   5252 					 * other thread.
   5253 					 */
   5254 					VN_HOLD(fvp);
   5255 					page_unlock(pp);
   5256 					/*
   5257 					 * We should build a page list
   5258 					 * to kluster putpages XXX
   5259 					 */
   5260 					(void) VOP_PUTPAGE(fvp,
   5261 					    (offset_t)fpgoff, PAGESIZE,
   5262 					    (B_DONTNEED|B_FREE|B_ASYNC),
   5263 					    svd->cred, NULL);
   5264 					VN_RELE(fvp);
   5265 				} else {
   5266 					/*
   5267 					 * XXX - Should the loop terminate if
   5268 					 * the page is `locked'?
   5269 					 */
   5270 					page_unlock(pp);
   5271 				}
   5272 				--vpp;
   5273 				--fanon_index;
   5274 				pgoff -= PAGESIZE;
   5275 			}
   5276 		}
   5277 	}
   5278 
   5279 	plp = pl;
   5280 	*plp = NULL;
   5281 	pl_alloc_sz = 0;
   5282 
   5283 	/*
   5284 	 * See if we need to call VOP_GETPAGE for
   5285 	 * *any* of the range being faulted on.
   5286 	 * We can skip all of this work if there
   5287 	 * was no original vnode.
   5288 	 */
   5289 	if (svd->vp != NULL) {
   5290 		u_offset_t vp_off;
   5291 		size_t vp_len;
   5292 		struct anon *ap;
   5293 		vnode_t *vp;
   5294 
   5295 		vp_off = off;
   5296 		vp_len = len;
   5297 
   5298 		if (amp == NULL)
   5299 			dogetpage = 1;
   5300 		else {
   5301 			/*
   5302 			 * Only acquire reader lock to prevent amp->ahp
   5303 			 * from being changed.  It's ok to miss pages,
   5304 			 * hence we don't do anon_array_enter
   5305 			 */
   5306 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   5307 			ap = anon_get_ptr(amp->ahp, anon_index);
   5308 
   5309 			if (len <= PAGESIZE)
   5310 				/* inline non_anon() */
   5311 				dogetpage = (ap == NULL);
   5312 			else
   5313 				dogetpage = non_anon(amp->ahp, anon_index,
   5314 				    &vp_off, &vp_len);
   5315 			ANON_LOCK_EXIT(&amp->a_rwlock);
   5316 		}
   5317 
   5318 		if (dogetpage) {
   5319 			enum seg_rw arw;
   5320 			struct as *as = seg->s_as;
   5321 
   5322 			if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
   5323 				/*
   5324 				 * Page list won't fit in local array,
   5325 				 * allocate one of the needed size.
   5326 				 */
   5327 				pl_alloc_sz =
   5328 				    (btop(len) + 1) * sizeof (page_t *);
   5329 				plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
   5330 				plp[0] = NULL;
   5331 				plsz = len;
   5332 			} else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
   5333 			    svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
   5334 			    (((size_t)(addr + PAGESIZE) <
   5335 			    (size_t)(seg->s_base + seg->s_size)) &&
   5336 			    hat_probe(as->a_hat, addr + PAGESIZE))) {
   5337 				/*
   5338 				 * Ask VOP_GETPAGE to return the exact number
   5339 				 * of pages if
   5340 				 * (a) this is a COW fault, or
   5341 				 * (b) this is a software fault, or
   5342 				 * (c) next page is already mapped.
   5343 				 */
   5344 				plsz = len;
   5345 			} else {
   5346 				/*
   5347 				 * Ask VOP_GETPAGE to return adjacent pages
   5348 				 * within the segment.
   5349 				 */
   5350 				plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
   5351 				    ((seg->s_base + seg->s_size) - addr));
   5352 				ASSERT((addr + plsz) <=
   5353 				    (seg->s_base + seg->s_size));
   5354 			}
   5355 
   5356 			/*
   5357 			 * Need to get some non-anonymous pages.
   5358 			 * We need to make only one call to GETPAGE to do
   5359 			 * this to prevent certain deadlocking conditions
   5360 			 * when we are doing locking.  In this case
   5361 			 * non_anon() should have picked up the smallest
   5362 			 * range which includes all the non-anonymous
   5363 			 * pages in the requested range.  We have to
   5364 			 * be careful regarding which rw flag to pass in
   5365 			 * because on a private mapping, the underlying
   5366 			 * object is never allowed to be written.
   5367 			 */
   5368 			if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
   5369 				arw = S_READ;
   5370 			} else {
   5371 				arw = rw;
   5372 			}
   5373 			vp = svd->vp;
   5374 			TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
   5375 			    "segvn_getpage:seg %p addr %p vp %p",
   5376 			    seg, addr, vp);
   5377 			err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len,
   5378 			    &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
   5379 			    svd->cred, NULL);
   5380 			if (err) {
   5381 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5382 				segvn_pagelist_rele(plp);
   5383 				if (pl_alloc_sz)
   5384 					kmem_free(plp, pl_alloc_sz);
   5385 				return (FC_MAKE_ERR(err));
   5386 			}
   5387 			if (svd->type == MAP_PRIVATE)
   5388 				vpprot &= ~PROT_WRITE;
   5389 		}
   5390 	}
   5391 
   5392 	/*
   5393 	 * N.B. at this time the plp array has all the needed non-anon
   5394 	 * pages in addition to (possibly) having some adjacent pages.
   5395 	 */
   5396 
   5397 	/*
   5398 	 * Always acquire the anon_array_lock to prevent
   5399 	 * 2 threads from allocating separate anon slots for
   5400 	 * the same "addr".
   5401 	 *
   5402 	 * If this is a copy-on-write fault and we don't already
   5403 	 * have the anon_array_lock, acquire it to prevent the
   5404 	 * fault routine from handling multiple copy-on-write faults
   5405 	 * on the same "addr" in the same address space.
   5406 	 *
   5407 	 * Only one thread should deal with the fault since after
   5408 	 * it is handled, the other threads can acquire a translation
   5409 	 * to the newly created private page.  This prevents two or
   5410 	 * more threads from creating different private pages for the
   5411 	 * same fault.
   5412 	 *
   5413 	 * We grab "serialization" lock here if this is a MAP_PRIVATE segment
   5414 	 * to prevent deadlock between this thread and another thread
   5415 	 * which has soft-locked this page and wants to acquire serial_lock.
   5416 	 * ( bug 4026339 )
   5417 	 *
   5418 	 * The fix for bug 4026339 becomes unnecessary when using the
   5419 	 * locking scheme with per amp rwlock and a global set of hash
   5420 	 * lock, anon_array_lock.  If we steal a vnode page when low
   5421 	 * on memory and upgrade the page lock through page_rename,
   5422 	 * then the page is PAGE_HANDLED, nothing needs to be done
   5423 	 * for this page after returning from segvn_faultpage.
   5424 	 *
   5425 	 * But really, the page lock should be downgraded after
   5426 	 * the stolen page is page_rename'd.
   5427 	 */
   5428 
   5429 	if (amp != NULL)
   5430 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   5431 
   5432 	/*
   5433 	 * Ok, now loop over the address range and handle faults
   5434 	 */
   5435 	for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
   5436 		err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
   5437 		    type, rw, brkcow);
   5438 		if (err) {
   5439 			if (amp != NULL)
   5440 				ANON_LOCK_EXIT(&amp->a_rwlock);
   5441 			if (type == F_SOFTLOCK && a > addr) {
   5442 				segvn_softunlock(seg, addr, (a - addr),
   5443 				    S_OTHER);
   5444 			}
   5445 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5446 			segvn_pagelist_rele(plp);
   5447 			if (pl_alloc_sz)
   5448 				kmem_free(plp, pl_alloc_sz);
   5449 			return (err);
   5450 		}
   5451 		if (vpage) {
   5452 			vpage++;
   5453 		} else if (svd->vpage) {
   5454 			page = seg_page(seg, addr);
   5455 			vpage = &svd->vpage[++page];
   5456 		}
   5457 	}
   5458 
   5459 	/* Didn't get pages from the underlying fs so we're done */
   5460 	if (!dogetpage)
   5461 		goto done;
   5462 
   5463 	/*
   5464 	 * Now handle any other pages in the list returned.
   5465 	 * If the page can be used, load up the translations now.
   5466 	 * Note that the for loop will only be entered if "plp"
   5467 	 * is pointing to a non-NULL page pointer which means that
   5468 	 * VOP_GETPAGE() was called and vpprot has been initialized.
   5469 	 */
   5470 	if (svd->pageprot == 0)
   5471 		prot = svd->prot & vpprot;
   5472 
   5473 
   5474 	/*
   5475 	 * Large Files: diff should be unsigned value because we started
   5476 	 * supporting > 2GB segment sizes from 2.5.1 and when a
   5477 	 * large file of size > 2GB gets mapped to address space
   5478 	 * the diff value can be > 2GB.
   5479 	 */
   5480 
   5481 	for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
   5482 		size_t diff;
   5483 		struct anon *ap;
   5484 		int anon_index;
   5485 		anon_sync_obj_t cookie;
   5486 		int hat_flag = HAT_LOAD_ADV;
   5487 
   5488 		if (svd->flags & MAP_TEXT) {
   5489 			hat_flag |= HAT_LOAD_TEXT;
   5490 		}
   5491 
   5492 		if (pp == PAGE_HANDLED)
   5493 			continue;
   5494 
   5495 		if (svd->tr_state != SEGVN_TR_ON &&
   5496 		    pp->p_offset >=  svd->offset &&
   5497 		    pp->p_offset < svd->offset + seg->s_size) {
   5498 
   5499 			diff = pp->p_offset - svd->offset;
   5500 
   5501 			/*
   5502 			 * Large Files: Following is the assertion
   5503 			 * validating the above cast.
   5504 			 */
   5505 			ASSERT(svd->vp == pp->p_vnode);
   5506 
   5507 			page = btop(diff);
   5508 			if (svd->pageprot)
   5509 				prot = VPP_PROT(&svd->vpage[page]) & vpprot;
   5510 
   5511 			/*
   5512 			 * Prevent other threads in the address space from
   5513 			 * creating private pages (i.e., allocating anon slots)
   5514 			 * while we are in the process of loading translations
   5515 			 * to additional pages returned by the underlying
   5516 			 * object.
   5517 			 */
   5518 			if (amp != NULL) {
   5519 				anon_index = svd->anon_index + page;
   5520 				anon_array_enter(amp, anon_index, &cookie);
   5521 				ap = anon_get_ptr(amp->ahp, anon_index);
   5522 			}
   5523 			if ((amp == NULL) || (ap == NULL)) {
   5524 				if (IS_VMODSORT(pp->p_vnode) ||
   5525 				    enable_mbit_wa) {
   5526 					if (rw == S_WRITE)
   5527 						hat_setmod(pp);
   5528 					else if (rw != S_OTHER &&
   5529 					    !hat_ismod(pp))
   5530 						prot &= ~PROT_WRITE;
   5531 				}
   5532 				/*
   5533 				 * Skip mapping read ahead pages marked
   5534 				 * for migration, so they will get migrated
   5535 				 * properly on fault
   5536 				 */
   5537 				ASSERT(amp == NULL ||
   5538 				    svd->rcookie == HAT_INVALID_REGION_COOKIE);
   5539 				if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
   5540 					hat_memload_region(hat,
   5541 					    seg->s_base + diff,
   5542 					    pp, prot, hat_flag,
   5543 					    svd->rcookie);
   5544 				}
   5545 			}
   5546 			if (amp != NULL)
   5547 				anon_array_exit(&cookie);
   5548 		}
   5549 		page_unlock(pp);
   5550 	}
   5551 done:
   5552 	if (amp != NULL)
   5553 		ANON_LOCK_EXIT(&amp->a_rwlock);
   5554 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5555 	if (pl_alloc_sz)
   5556 		kmem_free(plp, pl_alloc_sz);
   5557 	return (0);
   5558 }
   5559 
   5560 /*
   5561  * Start an asynchronous read of the page that backs "addr".  Both getpage
   5562  * calls below are made without a page list, so nothing is handed back to
   5563  * this routine.  XXX only PAGESIZE pages are created here; at fault time
   5564  * they will be relocated into larger pages.
   5565  */
   5566 static faultcode_t
   5567 segvn_faulta(struct seg *seg, caddr_t addr)
   5568 {
   5569 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   5570 	struct anon_map *amp;
   5571 	struct anon *ap;
   5572 	vnode_t *vp;
   5573 	offset_t voff;
   5574 	int err = 0;
   5575 
   5576 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   5577 
   5578 	/* Every path below leaves through "out", which drops this lock. */
   5579 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   5580 
   5581 	amp = svd->amp;
   5582 	if (amp != NULL) {
   5583 		/*
   5584 		 * Hold the anon map as reader so amp->ahp cannot change.
   5585 		 * The request is only advisory and missing a page is ok,
   5586 		 * hence no anon_array_enter() on the slot.
   5587 		 */
   5588 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   5589 		ap = anon_get_ptr(amp->ahp,
   5590 		    svd->anon_index + seg_page(seg, addr));
   5591 		if (ap != NULL) {
   5592 			/* An anon slot exists: read the page from it. */
   5593 			err = anon_getpage(&ap, NULL, NULL, 0, seg, addr,
   5594 			    S_READ, svd->cred);
   5595 			ANON_LOCK_EXIT(&amp->a_rwlock);
   5596 			goto out;
   5597 		}
   5598 		ANON_LOCK_EXIT(&amp->a_rwlock);
   5599 	}
   5600 
   5601 	vp = svd->vp;
   5602 	if (vp == NULL)
   5603 		goto out;		/* zfod page - do nothing now */
   5604 
   5605 	/* No anon slot: ask the vnode for the page at the matching offset. */
   5606 	voff = (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base));
   5607 	TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
   5608 	    "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
   5609 	err = VOP_GETPAGE(vp, voff, PAGESIZE, NULL, NULL, 0, seg, addr,
   5610 	    S_OTHER, svd->cred, NULL);
   5611 
   5612 out:
   5613 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5614 	if (err != 0)
   5615 		return (FC_MAKE_ERR(err));
   5616 	return (0);
   5617 }
   5618 
        /*
         * Change the protections on [addr, addr + len) of "seg" to "prot".
         *
         * The caller must hold the address space lock (asserted below).  The
         * segment lock is taken here as writer and is dropped again on every
         * return path.
         *
         * Returns:
         *	0		done, including the cases where nothing had to change
         *	EACCES		"prot" is not a subset of svd->maxprot
         *	EAGAIN		softlocked pages are still outstanding
         *	IE_RETRY	a large-page segment was demoted, or the AS lock is
         *			only held as reader and demotion is needed
         *	IE_NOMEM	swap could not be reserved, a page claim failed, or
         *			the demotion returned ENOMEM
         *	other		any other error returned by segvn_demote_range()
         *
         * Note that several failure returns happen after state has already
         * been modified (region left, vpage array created, some per-page
         * protections updated); nothing visible here rolls that back.
         */
   5619 static int
   5620 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
   5621 {
   5622 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   5623 	struct vpage *cvp, *svp, *evp;
   5624 	struct vnode *vp;
   5625 	size_t pgsz;
   5626 	pgcnt_t pgcnt;
   5627 	anon_sync_obj_t cookie;
        	/* set once a path below has already dealt with the translations */
   5628 	int unload_done = 0;
   5629 
   5630 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   5631 
   5632 	if ((svd->maxprot & prot) != prot)
   5633 		return (EACCES);			/* violated maxprot */
   5634 
   5635 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
   5636 
   5637 	/* return if prot is the same */
   5638 	if (!svd->pageprot && svd->prot == prot) {
   5639 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5640 		return (0);
   5641 	}
   5642 
   5643 	/*
   5644 	 * Since we change protections we first have to flush the cache.
   5645 	 * This makes sure all the pagelock calls have to recheck
   5646 	 * protections.
   5647 	 */
   5648 	if (svd->softlockcnt > 0) {
   5649 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   5650 
   5651 		/*
   5652 		 * If this is shared segment non 0 softlockcnt
   5653 		 * means locked pages are still in use.
   5654 		 */
   5655 		if (svd->type == MAP_SHARED) {
   5656 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5657 			return (EAGAIN);
   5658 		}
   5659 
   5660 		/*
   5661 		 * Since we do have the segvn writers lock nobody can fill
   5662 		 * the cache with entries belonging to this seg during
   5663 		 * the purge. The flush either succeeds or we still have
   5664 		 * pending I/Os.
   5665 		 */
   5666 		segvn_purge(seg);
   5667 		if (svd->softlockcnt > 0) {
   5668 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5669 			return (EAGAIN);
   5670 		}
   5671 	}
   5672 
        	/*
        	 * Leave the HAT region, or tear down text replication via
        	 * segvn_textunrepl(), before the protections change.  Both of
        	 * those paths set unload_done, which makes the code at the end
        	 * of this function skip its own hat_unload()/hat_chgattr() call
        	 * (presumably because the translations are already gone by then).
        	 */
   5673 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   5674 		ASSERT(svd->amp == NULL);
   5675 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   5676 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
   5677 		    HAT_REGION_TEXT);
   5678 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
   5679 		unload_done = 1;
   5680 	} else if (svd->tr_state == SEGVN_TR_INIT) {
   5681 		svd->tr_state = SEGVN_TR_OFF;
   5682 	} else if (svd->tr_state == SEGVN_TR_ON) {
   5683 		ASSERT(svd->amp != NULL);
   5684 		segvn_textunrepl(seg, 0);
   5685 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
   5686 		unload_done = 1;
   5687 	}
   5688 
        	/*
        	 * Write access is being granted on a shared mapping of a vnode
        	 * flagged VVMEXEC: call segvn_inval_trcache() for that vnode.
        	 */
   5689 	if ((prot & PROT_WRITE) && svd->type == MAP_SHARED &&
   5690 	    svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) {
   5691 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
   5692 		segvn_inval_trcache(svd->vp);
   5693 	}
        	/*
        	 * Large-page segment: both addr and len must be aligned to the
        	 * segment's page size.  If they are not, the segment is demoted
        	 * around the range and the caller is told to retry.  pgsz and
        	 * pgcnt are only assigned here, so later uses are guarded by
        	 * seg->s_szc != 0.
        	 */
   5694 	if (seg->s_szc != 0) {
   5695 		int err;
   5696 		pgsz = page_get_pagesize(seg->s_szc);
   5697 		pgcnt = pgsz >> PAGESHIFT;
   5698 		ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   5699 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
   5700 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5701 			ASSERT(seg->s_base != addr || seg->s_size != len);
   5702 			/*
   5703 			 * If we are holding the as lock as a reader then
   5704 			 * we need to return IE_RETRY and let the as
   5705 			 * layer drop and re-acquire the lock as a writer.
   5706 			 */
   5707 			if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock))
   5708 				return (IE_RETRY);
   5709 			VM_STAT_ADD(segvnvmstats.demoterange[1]);
        			/*
        			 * Private or vnode-backed segments are demoted with
        			 * a szcvec of 0; otherwise the vector comes from
        			 * map_pgszcvec().
        			 */
   5710 			if (svd->type == MAP_PRIVATE || svd->vp != NULL) {
   5711 				err = segvn_demote_range(seg, addr, len,
   5712 				    SDR_END, 0);
   5713 			} else {
   5714 				uint_t szcvec = map_pgszcvec(seg->s_base,
   5715 				    pgsz, (uintptr_t)seg->s_base,
   5716 				    (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0);
   5717 				err = segvn_demote_range(seg, addr, len,
   5718 				    SDR_END, szcvec);
   5719 			}
        			/* A successful demotion still means "retry". */
   5720 			if (err == 0)
   5721 				return (IE_RETRY);
   5722 			if (err == ENOMEM)
   5723 				return (IE_NOMEM);
   5724 			return (err);
   5725 		}
   5726 	}
   5727 
   5728 
   5729 	/*
   5730 	 * If it's a private mapping and we're making it writable then we
   5731 	 * may have to reserve the additional swap space now. If we are
   5732 	 * making writable only a part of the segment then we use its vpage
   5733 	 * array to keep a record of the pages for which we have reserved
   5734 	 * swap. In this case we set the pageswap field in the segment's
   5735 	 * segvn structure to record this.
   5736 	 *
   5737 	 * If it's a private mapping to a file (i.e., vp != NULL) and we're
   5738 	 * removing write permission on the entire segment and we haven't
   5739 	 * modified any pages, we can release the swap space.
   5740 	 */
   5741 	if (svd->type == MAP_PRIVATE) {
   5742 		if (prot & PROT_WRITE) {
        			/*
        			 * Nothing to reserve for MAP_NORESERVE, or when
        			 * swresv is already set and is not being tracked
        			 * per page (pageswap == 0).
        			 */
   5743 			if (!(svd->flags & MAP_NORESERVE) &&
   5744 			    !(svd->swresv && svd->pageswap == 0)) {
   5745 				size_t sz = 0;
   5746 
   5747 				/*
   5748 				 * Start by determining how much swap
   5749 				 * space is required.
   5750 				 */
   5751 				if (addr == seg->s_base &&
   5752 				    len == seg->s_size &&
   5753 				    svd->pageswap == 0) {
   5754 					/* The whole segment */
   5755 					sz = seg->s_size;
   5756 				} else {
   5757 					/*
   5758 					 * Make sure that the vpage array
   5759 					 * exists, and make a note of the
   5760 					 * range of elements corresponding
   5761 					 * to len.
   5762 					 */
   5763 					segvn_vpage(seg);
   5764 					svp = &svd->vpage[seg_page(seg, addr)];
   5765 					evp = &svd->vpage[seg_page(seg,
   5766 					    addr + len)];
   5767 
   5768 					if (svd->pageswap == 0) {
   5769 						/*
   5770 						 * This is the first time we've
   5771 						 * asked for a part of this
   5772 						 * segment, so we need to
   5773 						 * reserve everything we've
   5774 						 * been asked for.
   5775 						 */
   5776 						sz = len;
   5777 					} else {
   5778 						/*
   5779 						 * We have to count the number
   5780 						 * of pages required.
   5781 						 */
   5782 						for (cvp = svp;  cvp < evp;
   5783 						    cvp++) {
   5784 							if (!VPP_ISSWAPRES(cvp))
   5785 								sz++;
   5786 						}
        						/* page count -> bytes */
   5787 						sz <<= PAGESHIFT;
   5788 					}
   5789 				}
   5790 
   5791 				/* Try to reserve the necessary swap. */
   5792 				if (anon_resv_zone(sz,
   5793 				    seg->s_as->a_proc->p_zone) == 0) {
   5794 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5795 					return (IE_NOMEM);
   5796 				}
   5797 
   5798 				/*
   5799 				 * Make a note of how much swap space
   5800 				 * we've reserved.
   5801 				 */
        				/*
        				 * svp/evp are only assigned on the partial
        				 * path above; the else branch below relies on
        				 * that path having been taken (see the ASSERT
        				 * on svd->vpage).
        				 */
   5802 				if (svd->pageswap == 0 && sz == seg->s_size) {
   5803 					svd->swresv = sz;
   5804 				} else {
   5805 					ASSERT(svd->vpage != NULL);
   5806 					svd->swresv += sz;
   5807 					svd->pageswap = 1;
   5808 					for (cvp = svp; cvp < evp; cvp++) {
   5809 						if (!VPP_ISSWAPRES(cvp))
   5810 							VPP_SETSWAPRES(cvp);
   5811 					}
   5812 				}
   5813 			}
   5814 		} else {
   5815 			/*
   5816 			 * Swap space is released only if this segment
   5817 			 * does not map anonymous memory, since read faults
   5818 			 * on such segments still need an anon slot to read
   5819 			 * in the data.
   5820 			 */
   5821 			if (svd->swresv != 0 && svd->vp != NULL &&
   5822 			    svd->amp == NULL && addr == seg->s_base &&
   5823 			    len == seg->s_size && svd->pageprot == 0) {
   5824 				ASSERT(svd->pageswap == 0);
   5825 				anon_unresv_zone(svd->swresv,
   5826 				    seg->s_as->a_proc->p_zone);
   5827 				svd->swresv = 0;
   5828 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
   5829 				    "anon proc:%p %lu %u", seg, 0, 0);
   5830 			}
   5831 		}
   5832 	}
   5833 
        	/*
        	 * Record the new protections.  Three cases:
        	 *  - whole segment and no vpage array: just update svd->prot;
        	 *  - MAP_PRIVATE: per-page update, adjusting page claims;
        	 *  - otherwise: plain per-page update.
        	 */
   5834 	if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) {
   5835 		if (svd->prot == prot) {
   5836 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5837 			return (0);			/* all done */
   5838 		}
   5839 		svd->prot = (uchar_t)prot;
   5840 	} else if (svd->type == MAP_PRIVATE) {
   5841 		struct anon *ap = NULL;
   5842 		page_t *pp;
   5843 		u_offset_t offset, off;
   5844 		struct anon_map *amp;
   5845 		ulong_t anon_idx = 0;
   5846 
   5847 		/*
   5848 		 * A vpage structure exists or else the change does not
   5849 		 * involve the entire segment.  Establish a vpage structure
   5850 		 * if none is there.  Then, for each page in the range,
   5851 		 * adjust its individual permissions.  Note that write-
   5852 		 * enabling a MAP_PRIVATE page can affect the claims for
   5853 		 * locked down memory.  Overcommitting memory terminates
   5854 		 * the operation.
   5855 		 */
   5856 		segvn_vpage(seg);
   5857 		svd->pageprot = 1;
   5858 		if ((amp = svd->amp) != NULL) {
   5859 			anon_idx = svd->anon_index + seg_page(seg, addr);
   5860 			ASSERT(seg->s_szc == 0 ||
   5861 			    IS_P2ALIGNED(anon_idx, pgcnt));
   5862 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   5863 		}
   5864 
   5865 		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
   5866 		evp = &svd->vpage[seg_page(seg, addr + len)];
   5867 
   5868 		/*
   5869 		 * See Statement at the beginning of segvn_lockop regarding
   5870 		 * the way cowcnts and lckcnts are handled.
   5871 		 */
   5872 		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
   5873 
   5874 			if (seg->s_szc != 0) {
        				/*
        				 * Large pages: segvn_claim_pages() is only
        				 * called when anon_idx is pgcnt aligned,
        				 * i.e. once per large page.  A failure ends
        				 * the loop with svp != evp.
        				 */
   5875 				if (amp != NULL) {
   5876 					anon_array_enter(amp, anon_idx,
   5877 					    &cookie);
   5878 				}
   5879 				if (IS_P2ALIGNED(anon_idx, pgcnt) &&
   5880 				    !segvn_claim_pages(seg, svp, offset,
   5881 				    anon_idx, prot)) {
   5882 					if (amp != NULL) {
   5883 						anon_array_exit(&cookie);
   5884 					}
   5885 					break;
   5886 				}
   5887 				if (amp != NULL) {
   5888 					anon_array_exit(&cookie);
   5889 				}
   5890 				anon_idx++;
   5891 			} else {
   5892 				if (amp != NULL) {
   5893 					anon_array_enter(amp, anon_idx,
   5894 					    &cookie);
   5895 					ap = anon_get_ptr(amp->ahp, anon_idx++);
   5896 				}
   5897 
        				/*
        				 * Only pages marked VPP_ISPPLOCK whose
        				 * protection actually changes need their
        				 * claim adjusted.
        				 */
   5898 				if (VPP_ISPPLOCK(svp) &&
   5899 				    VPP_PROT(svp) != prot) {
   5900 
        					/*
        					 * Identify the page: the vnode page
        					 * when there is no anon slot, else
        					 * whatever swap_xlate() reports for
        					 * the slot.
        					 */
   5901 					if (amp == NULL || ap == NULL) {
   5902 						vp = svd->vp;
   5903 						off = offset;
   5904 					} else
   5905 						swap_xlate(ap, &vp, &off);
   5906 					if (amp != NULL)
   5907 						anon_array_exit(&cookie);
   5908 
   5909 					if ((pp = page_lookup(vp, off,
   5910 					    SE_SHARED)) == NULL) {
   5911 						panic("segvn_setprot: no page");
   5912 						/*NOTREACHED*/
   5913 					}
   5914 					ASSERT(seg->s_szc == 0);
        					/*
        					 * Write permission is being toggled:
        					 * add or drop a claim on the page.
        					 * Either call failing ends the loop.
        					 */
   5915 					if ((VPP_PROT(svp) ^ prot) &
   5916 					    PROT_WRITE) {
   5917 						if (prot & PROT_WRITE) {
   5918 							if (!page_addclaim(
   5919 							    pp)) {
   5920 								page_unlock(pp);
   5921 								break;
   5922 							}
   5923 						} else {
   5924 							if (!page_subclaim(
   5925 							    pp)) {
   5926 								page_unlock(pp);
   5927 								break;
   5928 							}
   5929 						}
   5930 					}
   5931 					page_unlock(pp);
   5932 				} else if (amp != NULL)
   5933 					anon_array_exit(&cookie);
   5934 			}
   5935 			VPP_SETPROT(svp, prot);
   5936 			offset += PAGESIZE;
   5937 		}
   5938 		if (amp != NULL)
   5939 			ANON_LOCK_EXIT(&amp->a_rwlock);
   5940 
   5941 		/*
   5942 		 * Did we terminate prematurely?  If so, simply unload
   5943 		 * the translations to the things we've updated so far.
   5944 		 */
   5945 		if (svp != evp) {
   5946 			if (unload_done) {
   5947 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5948 				return (IE_NOMEM);
   5949 			}
        			/* shrink len to the pages whose vpage was updated */
   5950 			len = (svp - &svd->vpage[seg_page(seg, addr)]) *
   5951 			    PAGESIZE;
   5952 			ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz));
   5953 			if (len != 0)
   5954 				hat_unload(seg->s_as->a_hat, addr,
   5955 				    len, HAT_UNLOAD);
   5956 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5957 			return (IE_NOMEM);
   5958 		}
   5959 	} else {
        		/* Not private: no claims to adjust, just set each page. */
   5960 		segvn_vpage(seg);
   5961 		svd->pageprot = 1;
   5962 		evp = &svd->vpage[seg_page(seg, addr + len)];
   5963 		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
   5964 			VPP_SETPROT(svp, prot);
   5965 		}
   5966 	}
   5967 
        	/* Region/text-replication teardown above already ran; skip HAT work. */
   5968 	if (unload_done) {
   5969 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5970 		return (0);
   5971 	}
   5972 
   5973 	if (((prot & PROT_WRITE) != 0 &&
   5974 	    (svd->vp != NULL || svd->type == MAP_PRIVATE)) ||
   5975 	    (prot & ~PROT_USER) == PROT_NONE) {
   5976 		/*
   5977 		 * Either private or shared data with write access (in
   5978 		 * which case we need to throw out all former translations
   5979 		 * so that we get the right translations set up on fault
   5980 		 * and we don't allow write access to any copy-on-write pages
   5981 		 * that might be around or to prevent write access to pages
   5982 		 * representing holes in a file), or we don't have permission
   5983 		 * to access the memory at all (in which case we have to
   5984 		 * unload any current translations that might exist).
   5985 		 */
   5986 		hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
   5987 	} else {
   5988 		/*
   5989 		 * A shared mapping or a private mapping in which write
   5990 		 * protection is going to be denied - just change all the
   5991 		 * protections over the range of addresses in question.
   5992 		 * segvn does not support any other attributes other
   5993 		 * than prot so we can use hat_chgattr.
   5994 		 */
   5995 		hat_chgattr(seg->s_as->a_hat, addr, len, prot);
   5996 	}
   5997 
   5998 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   5999 
   6000 	return (0);
   6001 }
   6002 
   6003 /*
   6004  * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
   6005  * to determine if the seg is capable of mapping the requested szc.
         *
         * The caller must hold the address space lock as writer (asserted below)
         * and [addr, addr + len) must lie within the segment.
         *
         * On full success seg->s_szc is set to szc (and amp->a_szc is updated when
         * an anon map exists).  Several paths only prepare the segment (demoting
         * a range, splitting the segment, or concatenating it with the next one)
         * and then return IE_RETRY so that the caller tries again.
         *
         * Returns:
         *	0		s_szc already equals szc, large pages are disabled
         *			(segvn_lpg_disable), or the new size was installed
         *	EINVAL		alignment, flags, vnode type, protections or file
         *			size do not allow the requested page size
         *	EAGAIN		softlocked pages are still outstanding
         *	ENOMEM		the segment end is unaligned and there is no directly
         *			adjacent next segment to concatenate with
         *	IE_RETRY	segment layout was changed; retry
         *	IE_NOMEM	a helper ran out of memory
         *	other		errors passed through from segvn_demote_range() or
         *			segvn_clrszc()
   6006  */
   6007 static int
   6008 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
   6009 {
   6010 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6011 	struct segvn_data *nsvd;
   6012 	struct anon_map *amp = svd->amp;
   6013 	struct seg *nseg;
   6014 	caddr_t eaddr = addr + len, a;
   6015 	size_t pgsz = page_get_pagesize(szc);
   6016 	pgcnt_t pgcnt = page_get_pagecnt(szc);
   6017 	int err;
        	/* vnode offset corresponding to addr */
   6018 	u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
   6019 
   6020 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
   6021 	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
   6022 
   6023 	if (seg->s_szc == szc || segvn_lpg_disable != 0) {
   6024 		return (0);
   6025 	}
   6026 
   6027 	/*
   6028 	 * addr should always be pgsz aligned but eaddr may be misaligned if
   6029 	 * it's at the end of the segment.
   6030 	 *
   6031 	 * XXX we should assert this condition since as_setpagesize() logic
   6032 	 * guarantees it.
   6033 	 */
   6034 	if (!IS_P2ALIGNED(addr, pgsz) ||
   6035 	    (!IS_P2ALIGNED(eaddr, pgsz) &&
   6036 	    eaddr != seg->s_base + seg->s_size)) {
   6037 
   6038 		segvn_setpgsz_align_err++;
   6039 		return (EINVAL);
   6040 	}
   6041 
        	/*
        	 * For a shared anon map the anon index of addr must itself be
        	 * aligned to the new page count; the realignment done further
        	 * down is only performed for private segments.
        	 */
   6042 	if (amp != NULL && svd->type == MAP_SHARED) {
   6043 		ulong_t an_idx = svd->anon_index + seg_page(seg, addr);
   6044 		if (!IS_P2ALIGNED(an_idx, pgcnt)) {
   6045 
   6046 			segvn_setpgsz_anon_align_err++;
   6047 			return (EINVAL);
   6048 		}
   6049 	}
   6050 
   6051 	if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
   6052 	    szc > segvn_maxpgszc) {
   6053 		return (EINVAL);
   6054 	}
   6055 
   6056 	/* paranoid check */
   6057 	if (svd->vp != NULL &&
   6058 	    (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
   6059 		return (EINVAL);
   6060 	}
   6061 
   6062 	if (seg->s_szc == 0 && svd->vp != NULL &&
   6063 	    map_addr_vacalign_check(addr, off)) {
   6064 		return (EINVAL);
   6065 	}
   6066 
   6067 	/*
   6068 	 * Check that protections are the same within new page
   6069 	 * size boundaries.
   6070 	 */
   6071 	if (svd->pageprot) {
   6072 		for (a = addr; a < eaddr; a += pgsz) {
        			/* the last chunk may be shorter than pgsz */
   6073 			if ((a + pgsz) > eaddr) {
   6074 				if (!sameprot(seg, a, eaddr - a)) {
   6075 					return (EINVAL);
   6076 				}
   6077 			} else {
   6078 				if (!sameprot(seg, a, pgsz)) {
   6079 					return (EINVAL);
   6080 				}
   6081 			}
   6082 		}
   6083 	}
   6084 
   6085 	/*
   6086 	 * Since we are changing page size we first have to flush
   6087 	 * the cache. This makes sure all the pagelock calls have
   6088 	 * to recheck protections.
   6089 	 */
   6090 	if (svd->softlockcnt > 0) {
   6091 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   6092 
   6093 		/*
   6094 		 * If this is shared segment non 0 softlockcnt
   6095 		 * means locked pages are still in use.
   6096 		 */
   6097 		if (svd->type == MAP_SHARED) {
   6098 			return (EAGAIN);
   6099 		}
   6100 
   6101 		/*
   6102 		 * Since we do have the segvn writers lock nobody can fill
   6103 		 * the cache with entries belonging to this seg during
   6104 		 * the purge. The flush either succeeds or we still have
   6105 		 * pending I/Os.
        		 *
        		 * NOTE(review): this function does not take svd->lock
        		 * itself; presumably the AS writer lock asserted above
        		 * provides the exclusion here - confirm.
   6106 		 */
   6107 		segvn_purge(seg);
   6108 		if (svd->softlockcnt > 0) {
   6109 			return (EAGAIN);
   6110 		}
   6111 	}
   6112 
        	/*
        	 * Leave the HAT region or tear down text replication first.
        	 * After segvn_textunrepl() the segment no longer has an anon
        	 * map (asserted), so the cached amp is cleared as well.
        	 */
   6113 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   6114 		ASSERT(svd->amp == NULL);
   6115 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   6116 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
   6117 		    HAT_REGION_TEXT);
   6118 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
   6119 	} else if (svd->tr_state == SEGVN_TR_INIT) {
   6120 		svd->tr_state = SEGVN_TR_OFF;
   6121 	} else if (svd->tr_state == SEGVN_TR_ON) {
   6122 		ASSERT(svd->amp != NULL);
   6123 		segvn_textunrepl(seg, 1);
   6124 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
   6125 		amp = NULL;
   6126 	}
   6127 
   6128 	/*
   6129 	 * Operation for sub range of existing segment.
        	 *
        	 * Lowering the page size demotes just the range; otherwise the
        	 * segment is split at addr and/or eaddr so that a later call
        	 * sees a segment that exactly covers the range.  Every path in
        	 * this block returns without changing s_szc to szc itself.
   6130 	 */
   6131 	if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
   6132 		if (szc < seg->s_szc) {
   6133 			VM_STAT_ADD(segvnvmstats.demoterange[2]);
   6134 			err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
   6135 			if (err == 0) {
   6136 				return (IE_RETRY);
   6137 			}
   6138 			if (err == ENOMEM) {
   6139 				return (IE_NOMEM);
   6140 			}
   6141 			return (err);
   6142 		}
   6143 		if (addr != seg->s_base) {
   6144 			nseg = segvn_split_seg(seg, addr);
   6145 			if (eaddr != (nseg->s_base + nseg->s_size)) {
   6146 				/* eaddr is szc aligned */
   6147 				(void) segvn_split_seg(nseg, eaddr);
   6148 			}
   6149 			return (IE_RETRY);
   6150 		}
   6151 		if (eaddr != (seg->s_base + seg->s_size)) {
   6152 			/* eaddr is szc aligned */
   6153 			(void) segvn_split_seg(seg, eaddr);
   6154 		}
   6155 		return (IE_RETRY);
   6156 	}
   6157 
   6158 	/*
   6159 	 * Break any low level sharing and reset seg->s_szc to 0.
   6160 	 */
   6161 	if ((err = segvn_clrszc(seg)) != 0) {
   6162 		if (err == ENOMEM) {
   6163 			err = IE_NOMEM;
   6164 		}
   6165 		return (err);
   6166 	}
   6167 	ASSERT(seg->s_szc == 0);
   6168 
   6169 	/*
   6170 	 * If the end of the current segment is not pgsz aligned
   6171 	 * then attempt to concatenate with the next segment.
   6172 	 */
   6173 	if (!IS_P2ALIGNED(eaddr, pgsz)) {
   6174 		nseg = AS_SEGNEXT(seg->s_as, seg);
        		/* there must be a next segment starting exactly at eaddr */
   6175 		if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
   6176 			return (ENOMEM);
   6177 		}
   6178 		if (nseg->s_ops != &segvn_ops) {
   6179 			return (EINVAL);
   6180 		}
   6181 		nsvd = (struct segvn_data *)nseg->s_data;
   6182 		if (nsvd->softlockcnt > 0) {
   6183 			/*
   6184 			 * If this is shared segment non 0 softlockcnt
   6185 			 * means locked pages are still in use.
   6186 			 */
   6187 			if (nsvd->type == MAP_SHARED) {
   6188 				return (EAGAIN);
   6189 			}
   6190 			segvn_purge(nseg);
   6191 			if (nsvd->softlockcnt > 0) {
   6192 				return (EAGAIN);
   6193 			}
   6194 		}
   6195 		err = segvn_clrszc(nseg);
   6196 		if (err == ENOMEM) {
   6197 			err = IE_NOMEM;
   6198 		}
   6199 		if (err != 0) {
   6200 			return (err);
   6201 		}
   6202 		ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
        		/*
        		 * segvn_concat() result: -1 is reported as EINVAL and -2
        		 * as IE_NOMEM; any other value leads to a retry.
        		 */
   6203 		err = segvn_concat(seg, nseg, 1);
   6204 		if (err == -1) {
   6205 			return (EINVAL);
   6206 		}
   6207 		if (err == -2) {
   6208 			return (IE_NOMEM);
   6209 		}
   6210 		return (IE_RETRY);
   6211 	}
   6212 
   6213 	/*
   6214 	 * May need to re-align anon array to
   6215 	 * new szc.
        	 *
        	 * The segment's slots are copied to index 0 of a freshly created
        	 * anon header, which then replaces amp->ahp; anon_index becomes 0.
        	 * Both allocations use ANON_NOSLEEP and fail with IE_NOMEM.
   6216 	 */
   6217 	if (amp != NULL) {
   6218 		if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
   6219 			struct anon_hdr *nahp;
   6220 
   6221 			ASSERT(svd->type == MAP_PRIVATE);
   6222 
   6223 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   6224 			ASSERT(amp->refcnt == 1);
   6225 			nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
   6226 			if (nahp == NULL) {
   6227 				ANON_LOCK_EXIT(&amp->a_rwlock);
   6228 				return (IE_NOMEM);
   6229 			}
   6230 			if (anon_copy_ptr(amp->ahp, svd->anon_index,
   6231 			    nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
   6232 				anon_release(nahp, btop(amp->size));
   6233 				ANON_LOCK_EXIT(&amp->a_rwlock);
   6234 				return (IE_NOMEM);
   6235 			}
   6236 			anon_release(amp->ahp, btop(amp->size));
   6237 			amp->ahp = nahp;
   6238 			svd->anon_index = 0;
   6239 			ANON_LOCK_EXIT(&amp->a_rwlock);
   6240 		}
   6241 	}
        	/*
        	 * Vnode-backed segment going to a large page size: the file must
        	 * extend at least to the last page the segment maps.
        	 */
   6242 	if (svd->vp != NULL && szc != 0) {
   6243 		struct vattr va;
   6244 		u_offset_t eoffpage = svd->offset;
   6245 		va.va_mask = AT_SIZE;
   6246 		eoffpage += seg->s_size;
   6247 		eoffpage = btopr(eoffpage);
   6248 		if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) {
   6249 			segvn_setpgsz_getattr_err++;
   6250 			return (EINVAL);
   6251 		}
   6252 		if (btopr(va.va_size) < eoffpage) {
   6253 			segvn_setpgsz_eof_err++;
   6254 			return (EINVAL);
   6255 		}
   6256 		if (amp != NULL) {
   6257 			/*
   6258 			 * anon_fill_cow_holes() may call VOP_GETPAGE().
   6259 			 * don't take anon map lock here to avoid holding it
   6260 			 * across VOP_GETPAGE() calls that may call back into
   6261 			 * segvn for clustering checks. We don't really need
   6262 			 * anon map lock here since it's a private segment and
   6263 			 * we hold as level lock as writers.
        			 *
        			 * Any failure is reported as EINVAL; the actual
        			 * error value is not passed on.
   6264 			 */
   6265 			if ((err = anon_fill_cow_holes(seg, seg->s_base,
   6266 			    amp->ahp, svd->anon_index, svd->vp, svd->offset,
   6267 			    seg->s_size, szc, svd->prot, svd->vpage,
   6268 			    svd->cred)) != 0) {
   6269 				return (EINVAL);
   6270 			}
   6271 		}
   6272 		segvn_setvnode_mpss(svd->vp);
   6273 	}
   6274 
        	/*
        	 * Record the page size on the anon map: a private map takes szc
        	 * as is, a shared map is only ever raised.
        	 */
   6275 	if (amp != NULL) {
   6276 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   6277 		if (svd->type == MAP_PRIVATE) {
   6278 			amp->a_szc = szc;
   6279 		} else if (szc > amp->a_szc) {
   6280 			amp->a_szc = szc;
   6281 		}
   6282 		ANON_LOCK_EXIT(&amp->a_rwlock);
   6283 	}
   6284 
   6285 	seg->s_szc = szc;
   6286 
   6287 	return (0);
   6288 }
   6289 
   6290 static int
   6291 segvn_clrszc(struct seg *seg)
   6292 {
   6293 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6294 	struct anon_map *amp = svd->amp;
   6295 	size_t pgsz;
   6296 	pgcnt_t pages;
   6297 	int err = 0;
   6298 	caddr_t a = seg->s_base;
   6299 	caddr_t ea = a + seg->s_size;
   6300 	ulong_t an_idx = svd->anon_index;
   6301 	vnode_t *vp = svd->vp;
   6302 	struct vpage *vpage = svd->vpage;
   6303 	page_t *anon_pl[1 + 1], *pp;
   6304 	struct anon *ap, *oldap;
   6305 	uint_t prot = svd->prot, vpprot;
   6306 	int pageflag = 0;
   6307 
   6308 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
   6309 	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
   6310 	ASSERT(svd->softlockcnt == 0);
   6311 
   6312 	if (vp == NULL && amp == NULL) {
   6313 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
   6314 		seg->s_szc = 0;
   6315 		return (0);
   6316 	}
   6317 
   6318 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
   6319 		ASSERT(svd->amp == NULL);
   6320 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
   6321 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
   6322 		    HAT_REGION_TEXT);
   6323 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
   6324 	} else if (svd->tr_state == SEGVN_TR_ON) {
   6325 		ASSERT(svd->amp != NULL);
   6326 		segvn_textunrepl(seg, 1);
   6327 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
   6328 		amp = NULL;
   6329 	} else {
   6330 		if (svd->tr_state != SEGVN_TR_OFF) {
   6331 			ASSERT(svd->tr_state == SEGVN_TR_INIT);
   6332 			svd->tr_state = SEGVN_TR_OFF;
   6333 		}
   6334 
   6335 		/*
   6336 		 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
   6337 		 * unload argument is 0 when we are freeing the segment
   6338 		 * and unload was already done.
   6339 		 */
   6340 		hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
   6341 		    HAT_UNLOAD_UNMAP);
   6342 	}
   6343 
   6344 	if (amp == NULL || svd->type == MAP_SHARED) {
   6345 		seg->s_szc = 0;
   6346 		return (0);
   6347 	}
   6348 
   6349 	pgsz = page_get_pagesize(seg->s_szc);
   6350 	pages = btop(pgsz);
   6351 
   6352 	/*
   6353 	 * XXX anon rwlock is not really needed because this is a
   6354 	 * private segment and we are writers.
   6355 	 */
   6356 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   6357 
   6358 	for (; a < ea; a += pgsz, an_idx += pages) {
   6359 		if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
   6360 			ASSERT(vpage != NULL || svd->pageprot == 0);
   6361 			if (vpage != NULL) {
   6362 				ASSERT(sameprot(seg, a, pgsz));
   6363 				prot = VPP_PROT(vpage);
   6364 				pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
   6365 			}
   6366 			if (seg->s_szc != 0) {
   6367 				ASSERT(vp == NULL || anon_pages(amp->ahp,
   6368 				    an_idx, pages) == pages);
   6369 				if ((err = anon_map_demotepages(amp, an_idx,
   6370 				    seg, a, prot, vpage, svd->cred)) != 0) {
   6371 					goto out;
   6372 				}
   6373 			} else {
   6374 				if (oldap->an_refcnt == 1) {
   6375 					continue;
   6376 				}
   6377 				if ((err = anon_getpage(&oldap, &vpprot,
   6378 				    anon_pl, PAGESIZE, seg, a, S_READ,
   6379 				    svd->cred))) {
   6380 					goto out;
   6381 				}
   6382 				if ((pp = anon_private(&ap, seg, a, prot,
   6383 				    anon_pl[0], pageflag, svd->cred)) == NULL) {
   6384 					err = ENOMEM;
   6385 					goto out;
   6386 				}
   6387 				anon_decref(oldap);
   6388 				(void) anon_set_ptr(amp->ahp, an_idx, ap,
   6389 				    ANON_SLEEP);
   6390 				page_unlock(pp);
   6391 			}
   6392 		}
   6393 		vpage = (vpage == NULL) ? NULL : vpage + pages;
   6394 	}
   6395 
   6396 	amp->a_szc = 0;
   6397 	seg->s_szc = 0;
   6398 out:
   6399 	ANON_LOCK_EXIT(&amp->a_rwlock);
   6400 	return (err);
   6401 }
   6402 
   6403 static int
   6404 segvn_claim_pages(
   6405 	struct seg *seg,
   6406 	struct vpage *svp,
   6407 	u_offset_t off,
   6408 	ulong_t anon_idx,
   6409 	uint_t prot)
   6410 {
   6411 	pgcnt_t	pgcnt = page_get_pagecnt(seg->s_szc);
   6412 	size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
   6413 	page_t	**ppa;
   6414 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6415 	struct anon_map *amp = svd->amp;
   6416 	struct vpage *evp = svp + pgcnt;
   6417 	caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
   6418 	    + seg->s_base;
   6419 	struct anon *ap;
   6420 	struct vnode *vp = svd->vp;
   6421 	page_t *pp;
   6422 	pgcnt_t pg_idx, i;
   6423 	int err = 0;
   6424 	anoff_t aoff;
   6425 	int anon = (amp != NULL) ? 1 : 0;
   6426 
   6427 	ASSERT(svd->type == MAP_PRIVATE);
   6428 	ASSERT(svd->vpage != NULL);
   6429 	ASSERT(seg->s_szc != 0);
   6430 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   6431 	ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
   6432 	ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));
   6433 
   6434 	if (VPP_PROT(svp) == prot)
   6435 		return (1);
   6436 	if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
   6437 		return (1);
   6438 
   6439 	ppa = kmem_alloc(ppasize, KM_SLEEP);
   6440 	if (anon && vp != NULL) {
   6441 		if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
   6442 			anon = 0;
   6443 			ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
   6444 		}
   6445 		ASSERT(!anon ||
   6446 		    anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
   6447 	}
   6448 
   6449 	for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
   6450 		if (!VPP_ISPPLOCK(svp))
   6451 			continue;
   6452 		if (anon) {
   6453 			ap = anon_get_ptr(amp->ahp, anon_idx);
   6454 			if (ap == NULL) {
   6455 				panic("segvn_claim_pages: no anon slot");
   6456 			}
   6457 			swap_xlate(ap, &vp, &aoff);
   6458 			off = (u_offset_t)aoff;
   6459 		}
   6460 		ASSERT(vp != NULL);
   6461 		if ((pp = page_lookup(vp,
   6462 		    (u_offset_t)off, SE_SHARED)) == NULL) {
   6463 			panic("segvn_claim_pages: no page");
   6464 		}
   6465 		ppa[pg_idx++] = pp;
   6466 		off += PAGESIZE;
   6467 	}
   6468 
   6469 	if (ppa[0] == NULL) {
   6470 		kmem_free(ppa, ppasize);
   6471 		return (1);
   6472 	}
   6473 
   6474 	ASSERT(pg_idx <= pgcnt);
   6475 	ppa[pg_idx] = NULL;
   6476 
   6477 	if (prot & PROT_WRITE)
   6478 		err = page_addclaim_pages(ppa);
   6479 	else
   6480 		err = page_subclaim_pages(ppa);
   6481 
   6482 	for (i = 0; i < pg_idx; i++) {
   6483 		ASSERT(ppa[i] != NULL);
   6484 		page_unlock(ppa[i]);
   6485 	}
   6486 
   6487 	kmem_free(ppa, ppasize);
   6488 	return (err);
   6489 }
   6490 
/*
 * Split seg at addr into two segments.
 *
 * Returns right (upper address) segment if split occurred.
 * If the address is equal to the beginning or end of its segment it returns
 * the current segment.
 *
 * On a split, seg is shrunk to cover [s_base, addr) and a new segment is
 * allocated for [addr, old end).  The new segment's segvn_data starts as
 * a structure copy of the original and is then fixed up: vnode hold and
 * offset, per-page vpage array, anon map / anon index and swap
 * reservation are each divided between the two halves.
 *
 * The caller must hold the address space lock as writer; the segment
 * must have text replication off, no HAT region cookie and no
 * outstanding softlocks (all asserted below).
 */
static struct seg *
segvn_split_seg(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct seg *nseg;
	size_t nsize;
	struct segvn_data *nsvd;

	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(addr >= seg->s_base);
	ASSERT(addr <= seg->s_base + seg->s_size);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	/* Splitting at either edge is a no-op. */
	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
		return (seg);

	/*
	 * Shrink the original segment first, then allocate the new one
	 * over the address range that was just given up.
	 */
	nsize = seg->s_base + seg->s_size - addr;
	seg->s_size = addr - seg->s_base;
	nseg = seg_alloc(seg->s_as, addr, nsize);
	ASSERT(nseg != NULL);
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	/* Start from a full copy; shared pointers are fixed up below. */
	*nsvd = *svd;
	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
	nsvd->seg = nseg;
	/* The copied lock is not usable; give the new segment its own. */
	rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);

	if (nsvd->vp != NULL) {
		/* The new segment references the same vnode: take a hold. */
		VN_HOLD(nsvd->vp);
		nsvd->offset = svd->offset +
		    (uintptr_t)(nseg->s_base - seg->s_base);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	} else {
		/*
		 * The offset for an anonymous segment has no significance in
		 * terms of an offset into a file. If we were to use the above
		 * calculation instead, the structures read out of
		 * /proc/<pid>/xmap would be more difficult to decipher since
		 * it would be unclear whether two seemingly contiguous
		 * prxmap_t structures represented different segments or a
		 * single segment that had been split up into multiple prxmap_t
		 * structures (e.g. if some part of the segment had not yet
		 * been faulted in).
		 */
		nsvd->offset = 0;
	}

	ASSERT(svd->softlockcnt == 0);
	ASSERT(svd->softlockcnt_sbase == 0);
	ASSERT(svd->softlockcnt_send == 0);
	/*
	 * The structure copy duplicated the cred pointer, so take an
	 * additional hold on it (presumably one per segment).
	 */
	crhold(svd->cred);

	if (svd->vpage != NULL) {
		/*
		 * Divide the per-page array: each half gets a freshly
		 * allocated array of its own size, and the original
		 * (sized for both halves) is freed.
		 */
		size_t bytes = vpgtob(seg_pages(seg));
		size_t nbytes = vpgtob(seg_pages(nseg));
		struct vpage *ovpage = svd->vpage;

		svd->vpage = kmem_alloc(bytes, KM_SLEEP);
		bcopy(ovpage, svd->vpage, bytes);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
		bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
		kmem_free(ovpage, bytes + nbytes);
	}
	if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
		/*
		 * Private anon map: the old segment keeps its anon_map but
		 * gets a new, smaller anon header; the new segment gets a
		 * brand new anon_map.  Anon pointers are copied into both
		 * and the old header is released.  Both halves restart at
		 * anon index 0.
		 */
		struct anon_map *oamp = svd->amp, *namp;
		struct anon_hdr *nahp;

		ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
		ASSERT(oamp->refcnt == 1);
		nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
		(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
		    nahp, 0, btop(seg->s_size), ANON_SLEEP);

		namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
		namp->a_szc = nseg->s_szc;
		(void) anon_copy_ptr(oamp->ahp,
		    svd->anon_index + btop(seg->s_size),
		    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
		anon_release(oamp->ahp, btop(oamp->size));
		oamp->ahp = nahp;
		oamp->size = seg->s_size;
		svd->anon_index = 0;
		nsvd->amp = namp;
		nsvd->anon_index = 0;
		ANON_LOCK_EXIT(&oamp->a_rwlock);
	} else if (svd->amp != NULL) {
		/*
		 * Shared anon map: both segments keep using the same
		 * anon_map.  The new segment starts further into it and
		 * the map's reference count is bumped.
		 */
		pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
		ASSERT(svd->amp == nsvd->amp);
		ASSERT(seg->s_szc <= svd->amp->a_szc);
		nsvd->anon_index = svd->anon_index + seg_pages(seg);
		ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
		ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
		svd->amp->refcnt++;
		ANON_LOCK_EXIT(&svd->amp->a_rwlock);
	}

	/*
	 * Split the amount of swap reserved.
	 */
	if (svd->swresv) {
		/*
		 * For MAP_NORESERVE, only allocate swap reserve for pages
		 * being used.  Other segments get enough to cover whole
		 * segment.
		 */
		if (svd->flags & MAP_NORESERVE) {
			size_t	oswresv;

			ASSERT(svd->amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(svd->amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
		} else {
			if (svd->pageswap) {
				/*
				 * nsvd->swresv still holds the pre-split
				 * total from the structure copy; the new
				 * segment keeps whatever the recounted left
				 * half does not account for.
				 */
				svd->swresv = segvn_count_swap_by_vpages(seg);
				ASSERT(nsvd->swresv >= svd->swresv);
				nsvd->swresv -= svd->swresv;
			} else {
				ASSERT(svd->swresv == seg->s_size +
				    nseg->s_size);
				svd->swresv = seg->s_size;
				nsvd->swresv = nseg->s_size;
			}
		}
	}

	return (nseg);
}
   6632 
/*
 * called on memory operations (unmap, setprot, setpagesize) for a subset
 * of a large page segment to either demote the memory range (SDR_RANGE)
 * or the ends (SDR_END) by addr/len.
 *
 * The segment is split with segvn_split_seg() so that the large pages
 * to be demoted end up in one or two separate segments (badseg1 and,
 * for two non-adjacent ends, badseg2), and segvn_clrszc() is applied to
 * those.  When szcvec names smaller page sizes that may still be used
 * (only asserted valid for SDR_END on MAP_SHARED segments), the demoted
 * pieces are re-marked with the largest such size and demoted again,
 * recursively, wherever addr/eaddr is not aligned to it.
 *
 * returns 0 on success. returns errno, including ENOMEM, on failure.
 */
static int
segvn_demote_range(
	struct seg *seg,
	caddr_t addr,
	size_t len,
	int flag,
	uint_t szcvec)
{
	caddr_t eaddr = addr + len;
	caddr_t lpgaddr, lpgeaddr;
	struct seg *nseg;
	struct seg *badseg1 = NULL;
	struct seg *badseg2 = NULL;
	size_t pgsz;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	int err;
	uint_t szc = seg->s_szc;
	uint_t tszcvec;

	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);
	ASSERT(szc != 0);
	pgsz = page_get_pagesize(szc);
	ASSERT(seg->s_base != addr || seg->s_size != len);
	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
	ASSERT(svd->softlockcnt == 0);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
	ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));

	/*
	 * lpgaddr/lpgeaddr are presumably the large-page-aligned bounds
	 * enclosing [addr, eaddr) -- see CALC_LPG_REGION.
	 */
	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
	ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
	if (flag == SDR_RANGE) {
		/* demote entire range */
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgaddr);
		ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
	} else if (addr != lpgaddr) {
		/* SDR_END with an unaligned start address. */
		ASSERT(flag == SDR_END);
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
		    eaddr < lpgaddr + 2 * pgsz) {
			/*
			 * The unaligned end falls in the large page right
			 * after the one holding the start: demote both
			 * adjacent large pages as a single segment.
			 */
			(void) segvn_split_seg(nseg, lpgeaddr);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == 2 * pgsz);
		} else {
			/* Isolate the large page holding the start. */
			nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == pgsz);
			if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
				/*
				 * The end is unaligned too and lies in a
				 * later large page: isolate that one as
				 * badseg2.
				 */
				ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
				nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
				badseg2 = nseg;
				(void) segvn_split_seg(nseg, lpgeaddr);
				ASSERT(badseg2->s_base == lpgeaddr - pgsz);
				ASSERT(badseg2->s_size == pgsz);
			}
		}
	} else {
		/*
		 * SDR_END with an aligned start: only the large page
		 * holding the end needs to be isolated.
		 */
		ASSERT(flag == SDR_END);
		ASSERT(eaddr < lpgeaddr);
		badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgeaddr - pgsz);
		ASSERT(badseg1->s_size == pgsz);
	}

	ASSERT(badseg1 != NULL);
	ASSERT(badseg1->s_szc == szc);
	ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
	    badseg1->s_size == 2 * pgsz);
	ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
	ASSERT(badseg1->s_size == pgsz ||
	    sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
	if (err = segvn_clrszc(badseg1)) {
		return (err);
	}
	ASSERT(badseg1->s_szc == 0);

	/*
	 * If szcvec still has page sizes below szc set (P2PHASE is
	 * expected to keep only the bits under 1 << szc -- per its
	 * sysmacros definition), re-mark badseg1 with the largest of them
	 * and, if the affected range is not aligned to that size, demote
	 * its ends again recursively.
	 */
	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		caddr_t ta = MAX(addr, badseg1->s_base);
		caddr_t te;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);

		/* Clamp the range of interest to badseg1. */
		if (eaddr > badseg1->s_base + badseg1->s_size) {
			te = badseg1->s_base + badseg1->s_size;
		} else {
			te = eaddr;
		}

		ASSERT(ta <= te);
		badseg1->s_szc = tszc;
		if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
			if (badseg2 != NULL) {
				/* badseg2 still needs handling afterwards. */
				err = segvn_demote_range(badseg1, ta, te - ta,
				    SDR_END, tszcvec);
				if (err != 0) {
					return (err);
				}
			} else {
				return (segvn_demote_range(badseg1, ta,
				    te - ta, SDR_END, tszcvec));
			}
		}
	}

	if (badseg2 == NULL)
		return (0);
	ASSERT(badseg2->s_szc == szc);
	ASSERT(badseg2->s_size == pgsz);
	ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
	if (err = segvn_clrszc(badseg2)) {
		return (err);
	}
	ASSERT(badseg2->s_szc == 0);

	/* Same smaller-page-size treatment for the second bad segment. */
	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);
		ASSERT(badseg2->s_base > addr);
		ASSERT(eaddr > badseg2->s_base);
		ASSERT(eaddr < badseg2->s_base + badseg2->s_size);

		badseg2->s_szc = tszc;
		if (!IS_P2ALIGNED(eaddr, tpgsz)) {
			return (segvn_demote_range(badseg2, badseg2->s_base,
			    eaddr - badseg2->s_base, SDR_END, tszcvec));
		}
	}

	return (0);
}
   6781 
   6782 static int
   6783 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
   6784 {
   6785 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6786 	struct vpage *vp, *evp;
   6787 
   6788 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6789 
   6790 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   6791 	/*
   6792 	 * If segment protection can be used, simply check against them.
   6793 	 */
   6794 	if (svd->pageprot == 0) {
   6795 		int err;
   6796 
   6797 		err = ((svd->prot & prot) != prot) ? EACCES : 0;
   6798 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   6799 		return (err);
   6800 	}
   6801 
   6802 	/*
   6803 	 * Have to check down to the vpage level.
   6804 	 */
   6805 	evp = &svd->vpage[seg_page(seg, addr + len)];
   6806 	for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
   6807 		if ((VPP_PROT(vp) & prot) != prot) {
   6808 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   6809 			return (EACCES);
   6810 		}
   6811 	}
   6812 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   6813 	return (0);
   6814 }
   6815 
   6816 static int
   6817 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
   6818 {
   6819 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6820 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
   6821 
   6822 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6823 
   6824 	if (pgno != 0) {
   6825 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   6826 		if (svd->pageprot == 0) {
   6827 			do {
   6828 				protv[--pgno] = svd->prot;
   6829 			} while (pgno != 0);
   6830 		} else {
   6831 			size_t pgoff = seg_page(seg, addr);
   6832 
   6833 			do {
   6834 				pgno--;
   6835 				protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
   6836 			} while (pgno != 0);
   6837 		}
   6838 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   6839 	}
   6840 	return (0);
   6841 }
   6842 
   6843 static u_offset_t
   6844 segvn_getoffset(struct seg *seg, caddr_t addr)
   6845 {
   6846 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6847 
   6848 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6849 
   6850 	return (svd->offset + (uintptr_t)(addr - seg->s_base));
   6851 }
   6852 
   6853 /*ARGSUSED*/
   6854 static int
   6855 segvn_gettype(struct seg *seg, caddr_t addr)
   6856 {
   6857 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6858 
   6859 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6860 
   6861 	return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT |
   6862 	    MAP_INITDATA)));
   6863 }
   6864 
   6865 /*ARGSUSED*/
   6866 static int
   6867 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
   6868 {
   6869 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6870 
   6871 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6872 
   6873 	*vpp = svd->vp;
   6874 	return (0);
   6875 }
   6876 
   6877 /*
   6878  * Check to see if it makes sense to do kluster/read ahead to
   6879  * addr + delta relative to the mapping at addr.  We assume here
   6880  * that delta is a signed PAGESIZE'd multiple (which can be negative).
   6881  *
   6882  * For segvn, we currently "approve" of the action if we are
   6883  * still in the segment and it maps from the same vp/off,
   6884  * or if the advice stored in segvn_data or vpages allows it.
   6885  * Currently, klustering is not allowed only if MADV_RANDOM is set.
   6886  */
   6887 static int
   6888 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
   6889 {
   6890 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   6891 	struct anon *oap, *ap;
   6892 	ssize_t pd;
   6893 	size_t page;
   6894 	struct vnode *vp1, *vp2;
   6895 	u_offset_t off1, off2;
   6896 	struct anon_map *amp;
   6897 
   6898 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   6899 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
   6900 	    SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
   6901 
   6902 	if (addr + delta < seg->s_base ||
   6903 	    addr + delta >= (seg->s_base + seg->s_size))
   6904 		return (-1);		/* exceeded segment bounds */
   6905 
   6906 	pd = delta / (ssize_t)PAGESIZE;	/* divide to preserve sign bit */
   6907 	page = seg_page(seg, addr);
   6908 
   6909 	/*
   6910 	 * Check to see if either of the pages addr or addr + delta
   6911 	 * have advice set that prevents klustering (if MADV_RANDOM advice
   6912 	 * is set for entire segment, or MADV_SEQUENTIAL is set and delta
   6913 	 * is negative).
   6914 	 */
   6915 	if (svd->advice == MADV_RANDOM ||
   6916 	    svd->advice == MADV_SEQUENTIAL && delta < 0)
   6917 		return (-1);
   6918 	else if (svd->pageadvice && svd->vpage) {
   6919 		struct vpage *bvpp, *evpp;
   6920 
   6921 		bvpp = &svd->vpage[page];
   6922 		evpp = &svd->vpage[page + pd];
   6923 		if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
   6924 		    VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
   6925 			return (-1);
   6926 		if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
   6927 		    VPP_ADVICE(evpp) == MADV_RANDOM)
   6928 			return (-1);
   6929 	}
   6930 
   6931 	if (svd->type == MAP_SHARED)
   6932 		return (0);		/* shared mapping - all ok */
   6933 
   6934 	if ((amp = svd->amp) == NULL)
   6935 		return (0);		/* off original vnode */
   6936 
   6937 	page += svd->anon_index;
   6938 
   6939 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   6940 
   6941 	oap = anon_get_ptr(amp->ahp, page);
   6942 	ap = anon_get_ptr(amp->ahp, page + pd);
   6943 
   6944 	ANON_LOCK_EXIT(&amp->a_rwlock);
   6945 
   6946 	if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
   6947 		return (-1);		/* one with and one without an anon */
   6948 	}
   6949 
   6950 	if (oap == NULL) {		/* implies that ap == NULL */
   6951 		return (0);		/* off original vnode */
   6952 	}
   6953 
   6954 	/*
   6955 	 * Now we know we have two anon pointers - check to
   6956 	 * see if they happen to be properly allocated.
   6957 	 */
   6958 
   6959 	/*
   6960 	 * XXX We cheat here and don't lock the anon slots. We can't because
   6961 	 * we may have been called from the anon layer which might already
   6962 	 * have locked them. We are holding a refcnt on the slots so they
   6963 	 * can't disappear. The worst that will happen is we'll get the wrong
   6964 	 * names (vp, off) for the slots and make a poor klustering decision.
   6965 	 */
   6966 	swap_xlate(ap, &vp1, &off1);
   6967 	swap_xlate(oap, &vp2, &off2);
   6968 
   6969 
   6970 	if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
   6971 		return (-1);
   6972 	return (0);
   6973 }
   6974 
/*
 * Swap the pages of seg out to secondary storage, returning the
 * number of bytes of storage freed.
 *
 * The basic idea is first to unload all translations and then to call
 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
 * swap device.  Pages to which other segments have mappings will remain
 * mapped and won't be swapped.  Our caller (as_swapout) has already
 * performed the unloading step.
 *
 * The value returned is intended to correlate well with the process's
 * memory requirements.  However, there are some caveats:
 * 1)	When given a shared segment as argument, this routine will
 *	only succeed in swapping out pages for the last sharer of the
 *	segment.  (Previous callers will only have decremented mapping
 *	reference counts.)
 * 2)	We assume that the hat layer maintains a large enough translation
 *	cache to capture process reference patterns.
 *
 * The caller must hold the address space lock; the segment lock is held
 * as reader for the whole scan.  Pages that cannot be locked without
 * waiting are simply skipped.
 */
static size_t
segvn_swapout(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp;
	pgcnt_t pgcnt = 0;	/* pages credited as swapped out */
	pgcnt_t npages;
	pgcnt_t page;
	ulong_t anon_index;	/* only set and read when amp != NULL */

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	/*
	 * Find pages unmapped by our caller and force them
	 * out to the virtual swap device.
	 */
	if ((amp = svd->amp) != NULL)
		anon_index = svd->anon_index;
	npages = seg->s_size >> PAGESHIFT;
	for (page = 0; page < npages; page++) {
		page_t *pp;
		struct anon *ap;
		struct vnode *vp;
		u_offset_t off;
		anon_sync_obj_t cookie;

		/*
		 * Obtain <vp, off> pair for the page, then look it up.
		 *
		 * Note that this code is willing to consider regular
		 * pages as well as anon pages.  Is this appropriate here?
		 */
		ap = NULL;
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			/*
			 * Don't block on a busy anon slot; just skip
			 * this page.
			 */
			if (anon_array_try_enter(amp, anon_index + page,
			    &cookie)) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				continue;
			}
			ap = anon_get_ptr(amp->ahp, anon_index + page);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
			} else {
				/* No anon slot: fall back to the vnode. */
				vp = svd->vp;
				off = svd->offset + ptob(page);
			}
			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			vp = svd->vp;
			off = svd->offset + ptob(page);
		}
		if (vp == NULL) {		/* untouched zfod page */
			ASSERT(ap == NULL);
			continue;
		}

		/* Non-blocking lookup; a missing or busy page is skipped. */
		pp = page_lookup_nowait(vp, off, SE_SHARED);
		if (pp == NULL)
			continue;


		/*
		 * Examine the page to see whether it can be tossed out,
		 * keeping track of how many we've found.
		 */
		if (!page_tryupgrade(pp)) {
			/*
			 * If the page has an i/o lock and no mappings,
			 * it's very likely that the page is being
			 * written out as a result of klustering.
			 * Assume this is so and take credit for it here.
			 */
			if (!page_io_trylock(pp)) {
				if (!hat_page_is_mapped(pp))
					pgcnt++;
			} else {
				page_io_unlock(pp);
			}
			page_unlock(pp);
			continue;
		}
		ASSERT(!page_iolock_assert(pp));


		/*
		 * Skip if page is locked or has mappings.
		 * We don't need the page_struct_lock to look at lckcnt
		 * and cowcnt because the page is exclusive locked.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
		    hat_page_is_mapped(pp)) {
			page_unlock(pp);
			continue;
		}

		/*
		 * dispose skips large pages so try to demote first.
		 */
		if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
			page_unlock(pp);
			/*
			 * XXX should skip the remaining page_t's of this
			 * large page.
			 */
			continue;
		}

		ASSERT(pp->p_szc == 0);

		/*
		 * No longer mapped -- we can toss it out.  How
		 * we do so depends on whether or not it's dirty.
		 */
		if (hat_ismod(pp) && pp->p_vnode) {
			/*
			 * We must clean the page before it can be
			 * freed.  Setting B_FREE will cause pvn_done
			 * to free the page when the i/o completes.
			 * XXX:	This also causes it to be accounted
			 *	as a pageout instead of a swap: need
			 *	B_SWAPOUT bit to use instead of B_FREE.
			 *
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);
			page_unlock(pp);

			/*
			 * Queue all i/o requests for the pageout thread
			 * to avoid saturating the pageout devices.
			 * If the request could not be queued, drop the
			 * hold taken above ourselves.
			 */
			if (!queue_io_request(vp, off))
				VN_RELE(vp);
		} else {
			/*
			 * The page was clean, free it.
			 *
			 * XXX:	Can we ever encounter modified pages
			 *	with no associated vnode here?
			 */
			ASSERT(pp->p_vnode != NULL);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, 0, kcred);
		}

		/*
		 * Credit now even if i/o is in progress.
		 */
		pgcnt++;
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

	/*
	 * Wakeup pageout to initiate i/o on all queued requests.
	 */
	cv_signal_pageout();
	return (ptob(pgcnt));
}
   7157 
   7158 /*
   7159  * Synchronize primary storage cache with real object in virtual memory.
   7160  *
   7161  * XXX - Anonymous pages should not be sync'ed out at all.
   7162  */
   7163 static int
   7164 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
   7165 {
   7166 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   7167 	struct vpage *vpp;
   7168 	page_t *pp;
   7169 	u_offset_t offset;
   7170 	struct vnode *vp;
   7171 	u_offset_t off;
   7172 	caddr_t eaddr;
   7173 	int bflags;
   7174 	int err = 0;
   7175 	int segtype;
   7176 	int pageprot;
   7177 	int prot;
   7178 	ulong_t anon_index;
   7179 	struct anon_map *amp;
   7180 	struct anon *ap;
   7181 	anon_sync_obj_t cookie;
   7182 
   7183 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   7184 
   7185 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   7186 
   7187 	if (svd->softlockcnt > 0) {
   7188 		/*
   7189 		 * If this is shared segment non 0 softlockcnt
   7190 		 * means locked pages are still in use.
   7191 		 */
   7192 		if (svd->type == MAP_SHARED) {
   7193 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7194 			return (EAGAIN);
   7195 		}
   7196 
   7197 		/*
   7198 		 * flush all pages from seg cache
   7199 		 * otherwise we may deadlock in swap_putpage
   7200 		 * for B_INVAL page (4175402).
   7201 		 *
   7202 		 * Even if we grab segvn WRITER's lock
   7203 		 * here, there might be another thread which could've
   7204 		 * successfully performed lookup/insert just before
   7205 		 * we acquired the lock here.  So, grabbing either
   7206 		 * lock here is of not much use.  Until we devise
   7207 		 * a strategy at upper layers to solve the
   7208 		 * synchronization issues completely, we expect
   7209 		 * applications to handle this appropriately.
   7210 		 */
   7211 		segvn_purge(seg);
   7212 		if (svd->softlockcnt > 0) {
   7213 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7214 			return (EAGAIN);
   7215 		}
   7216 	} else if (svd->type == MAP_SHARED && svd->amp != NULL &&
   7217 	    svd->amp->a_softlockcnt > 0) {
   7218 		/*
   7219 		 * Try to purge this amp's entries from pcache. It will
   7220 		 * succeed only if other segments that share the amp have no
   7221 		 * outstanding softlock's.
   7222 		 */
   7223 		segvn_purge(seg);
   7224 		if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) {
   7225 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7226 			return (EAGAIN);
   7227 		}
   7228 	}
   7229 
   7230 	vpp = svd->vpage;
   7231 	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
   7232 	bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
   7233 	    ((flags & MS_INVALIDATE) ? B_INVAL : 0);
   7234 
   7235 	if (attr) {
   7236 		pageprot = attr & ~(SHARED|PRIVATE);
   7237 		segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;
   7238 
   7239 		/*
   7240 		 * We are done if the segment types don't match
   7241 		 * or if we have segment level protections and
   7242 		 * they don't match.
   7243 		 */
   7244 		if (svd->type != segtype) {
   7245 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7246 			return (0);
   7247 		}
   7248 		if (vpp == NULL) {
   7249 			if (svd->prot != pageprot) {
   7250 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7251 				return (0);
   7252 			}
   7253 			prot = svd->prot;
   7254 		} else
   7255 			vpp = &svd->vpage[seg_page(seg, addr)];
   7256 
   7257 	} else if (svd->vp && svd->amp == NULL &&
   7258 	    (flags & MS_INVALIDATE) == 0) {
   7259 
   7260 		/*
   7261 		 * No attributes, no anonymous pages and MS_INVALIDATE flag
   7262 		 * is not on, just use one big request.
   7263 		 */
   7264 		err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
   7265 		    bflags, svd->cred, NULL);
   7266 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7267 		return (err);
   7268 	}
   7269 
   7270 	if ((amp = svd->amp) != NULL)
   7271 		anon_index = svd->anon_index + seg_page(seg, addr);
   7272 
   7273 	for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
   7274 		ap = NULL;
   7275 		if (amp != NULL) {
   7276 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   7277 			anon_array_enter(amp, anon_index, &cookie);
   7278 			ap = anon_get_ptr(amp->ahp, anon_index++);
   7279 			if (ap != NULL) {
   7280 				swap_xlate(ap, &vp, &off);
   7281 			} else {
   7282 				vp = svd->vp;
   7283 				off = offset;
   7284 			}
   7285 			anon_array_exit(&cookie);
   7286 			ANON_LOCK_EXIT(&amp->a_rwlock);
   7287 		} else {
   7288 			vp = svd->vp;
   7289 			off = offset;
   7290 		}
   7291 		offset += PAGESIZE;
   7292 
   7293 		if (vp == NULL)		/* untouched zfod page */
   7294 			continue;
   7295 
   7296 		if (attr) {
   7297 			if (vpp) {
   7298 				prot = VPP_PROT(vpp);
   7299 				vpp++;
   7300 			}
   7301 			if (prot != pageprot) {
   7302 				continue;
   7303 			}
   7304 		}
   7305 
   7306 		/*
   7307 		 * See if any of these pages are locked --  if so, then we
   7308 		 * will have to truncate an invalidate request at the first
   7309 		 * locked one. We don't need the page_struct_lock to test
   7310 		 * as this is only advisory; even if we acquire it someone
   7311 		 * might race in and lock the page after we unlock and before
   7312 		 * we do the PUTPAGE, then PUTPAGE simply does nothing.
   7313 		 */
   7314 		if (flags & MS_INVALIDATE) {
   7315 			if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
   7316 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
   7317 					page_unlock(pp);
   7318 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7319 					return (EBUSY);
   7320 				}
   7321 				if (ap != NULL && pp->p_szc != 0 &&
   7322 				    page_tryupgrade(pp)) {
   7323 					if (pp->p_lckcnt == 0 &&
   7324 					    pp->p_cowcnt == 0) {
   7325 						/*
   7326 						 * swapfs VN_DISPOSE() won't
   7327 						 * invalidate large pages.
   7328 						 * Attempt to demote.
   7329 						 * XXX can't help it if it
   7330 						 * fails. But for swapfs
   7331 						 * pages it is no big deal.
   7332 						 */
   7333 						(void) page_try_demote_pages(
   7334 						    pp);
   7335 					}
   7336 				}
   7337 				page_unlock(pp);
   7338 			}
   7339 		} else if (svd->type == MAP_SHARED && amp != NULL) {
   7340 			/*
   7341 			 * Avoid writing out to disk ISM's large pages
   7342 			 * because segspt_free_pages() relies on NULL an_pvp
   7343 			 * of anon slots of such pages.
   7344 			 */
   7345 
   7346 			ASSERT(svd->vp == NULL);
   7347 			/*
   7348 			 * swapfs uses page_lookup_nowait if not freeing or
   7349 			 * invalidating and skips a page if
   7350 			 * page_lookup_nowait returns NULL.
   7351 			 */
   7352 			pp = page_lookup_nowait(vp, off, SE_SHARED);
   7353 			if (pp == NULL) {
   7354 				continue;
   7355 			}
   7356 			if (pp->p_szc != 0) {
   7357 				page_unlock(pp);
   7358 				continue;
   7359 			}
   7360 
   7361 			/*
   7362 			 * Note ISM pages are created large so (vp, off)'s
   7363 			 * page cannot suddenly become large after we unlock
   7364 			 * pp.
   7365 			 */
   7366 			page_unlock(pp);
   7367 		}
   7368 		/*
   7369 		 * XXX - Should ultimately try to kluster
   7370 		 * calls to VOP_PUTPAGE() for performance.
   7371 		 */
   7372 		VN_HOLD(vp);
   7373 		err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
   7374 		    (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)),
   7375 		    svd->cred, NULL);
   7376 
   7377 		VN_RELE(vp);
   7378 		if (err)
   7379 			break;
   7380 	}
   7381 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7382 	return (err);
   7383 }
   7384 
   7385 /*
   7386  * Determine if we have data corresponding to pages in the
   7387  * primary storage virtual memory cache (i.e., "in core").
   7388  */
   7389 static size_t
   7390 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
   7391 {
   7392 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
   7393 	struct vnode *vp, *avp;
   7394 	u_offset_t offset, aoffset;
   7395 	size_t p, ep;
   7396 	int ret;
   7397 	struct vpage *vpp;
   7398 	page_t *pp;
   7399 	uint_t start;
   7400 	struct anon_map *amp;		/* XXX - for locknest */
   7401 	struct anon *ap;
   7402 	uint_t attr;
   7403 	anon_sync_obj_t cookie;
   7404 
   7405 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
   7406 
   7407 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
   7408 	if (svd->amp == NULL && svd->vp == NULL) {
   7409 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
   7410 		bzero(vec, btopr(len));
   7411 		return (len);	/* no anonymous pages created yet */
   7412 	}
   7413 
   7414 	p = seg_page(seg, addr);
   7415 	ep = seg_page(seg, addr + len);
   7416 	start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;
   7417 
   7418 	amp = svd->amp;
   7419 	for (; p < ep; p++, addr += PAGESIZE) {
   7420 		vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
   7421 		ret = start;
   7422 		ap = NULL;
   7423 		avp = NULL;
   7424 		/* Grab the vnode/offset for the anon slot */
   7425 		if (amp != NULL) {
   7426 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
   7427 			anon_array_enter(amp, svd->anon_index + p, &cookie);
   7428 			ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
   7429 			if (ap != NULL) {
   7430 				swap_xlate(ap, &avp, &aoffset);
   7431 			}
   7432 			anon_array_exit(&cookie);
   7433 			ANON_LOCK_EXIT(&amp->a_rwlock);
   7434 		}
   7435 		if ((avp != NULL) && page_exists(avp, aoffset)) {
   7436 			/* A page exists for the anon slot */
   7437 			ret |= SEG_PAGE_INCORE;
   7438 
   7439 			/*
   7440 			 * If page is mapped and writable
   7441 			 */
   7442 			attr = (uint_t)0;
   7443 			if ((hat_getattr(seg->s_as->a_hat, addr,
   7444 			    &attr) != -1) && (attr & PROT_WRITE)) {
   7445 				ret |= SEG_PAGE_ANON;
   7446 			}
   7447 			/*
   7448 			 * Don't get page_struct lock for lckcnt and cowcnt,
   7449 			 * since this is purely advisory.
   7450 			 */
   7451 			if ((pp = page_lookup_nowait(avp, aoffset,
   7452 			    SE_SHARED)) != NULL) {
   7453 				if (pp->p_lckcnt)
   7454 					ret |= SEG_PAGE_SOFTLOCK;
   7455 				if (pp->p_cowcnt)
   7456 					ret |= SEG_PAGE_HASCOW;
   7457 				page_unlock(pp);
   7458 			}
   7459 		}
   7460 
   7461 		/* Gather vnode statistics */
   7462 		vp = svd->vp;
   7463 		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
   7464 
   7465 		if (vp != NULL) {
   7466 			/*
   7467 			 * Try to obtain a "shared" lock on the page
   7468 			 * without blocking.  If this fails, determine
   7469 			 * if the page is in memory.
   7470 			 */
   7471 			pp = page_lookup_nowait(vp, offset, SE_SHARED);
   7472 			if ((pp == NULL) && (page_exists(vp, offset))) {
   7473 				/* Page is incore, and is named */
   7474 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
   7475 			}
   7476 			/*
   7477 			 * Don't get page_struct lock for lckcnt and cowcnt,
   7478 			 * since this is purely advisory.
   7479 			 */
   7480 			if (pp != NULL) {
   7481 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
   7482 				if (pp->p_lckcnt)
   7483 					ret |= SEG_PAGE_SOFTLOCK;
   7484 				if (pp->p_cowcnt)
   7485 					ret |= SEG_PAGE_HASCOW;
   7486 				page_unlock(pp);
   7487 			}
   7488 		}
   7489 
   7490 		/* Gather virtual page information */
   7491 		if (vpp) {
   7492 			if (VPP_ISPPLOCK(vpp))
   7493 				ret