Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved  	*/
     29 
     30 #include <sys/types.h>
     31 #include <sys/inttypes.h>
     32 #include <sys/param.h>
     33 #include <sys/sysmacros.h>
     34 #include <sys/systm.h>
     35 #include <sys/signal.h>
     36 #include <sys/user.h>
     37 #include <sys/errno.h>
     38 #include <sys/var.h>
     39 #include <sys/proc.h>
     40 #include <sys/tuneable.h>
     41 #include <sys/debug.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/cred.h>
     44 #include <sys/vnode.h>
     45 #include <sys/vfs.h>
     46 #include <sys/vm.h>
     47 #include <sys/file.h>
     48 #include <sys/mman.h>
     49 #include <sys/vmparam.h>
     50 #include <sys/fcntl.h>
     51 #include <sys/lwpchan_impl.h>
     52 #include <sys/nbmlock.h>
     53 
     54 #include <vm/hat.h>
     55 #include <vm/as.h>
     56 #include <vm/seg.h>
     57 #include <vm/seg_dev.h>
     58 #include <vm/seg_vn.h>
     59 
     60 int use_brk_lpg = 1;
     61 int use_stk_lpg = 1;
     62 
     63 static int brk_lpg(caddr_t nva);
     64 static int grow_lpg(caddr_t sp);
     65 
     66 int
     67 brk(caddr_t nva)
     68 {
     69 	int error;
     70 	proc_t *p = curproc;
     71 
     72 	/*
     73 	 * Serialize brk operations on an address space.
     74 	 * This also serves as the lock protecting p_brksize
     75 	 * and p_brkpageszc.
     76 	 */
     77 	as_rangelock(p->p_as);
     78 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
     79 		error = brk_lpg(nva);
     80 	} else {
     81 		error = brk_internal(nva, p->p_brkpageszc);
     82 	}
     83 	as_rangeunlock(p->p_as);
     84 	return ((error != 0 ? set_errno(error) : 0));
     85 }
     86 
     87 /*
     88  * Algorithm: call arch-specific map_pgsz to get best page size to use,
     89  * then call brk_internal().
     90  * Returns 0 on success.
     91  */
     92 static int
     93 brk_lpg(caddr_t nva)
     94 {
     95 	struct proc *p = curproc;
     96 	size_t pgsz, len;
     97 	caddr_t addr, brkend;
     98 	caddr_t bssbase = p->p_bssbase;
     99 	caddr_t brkbase = p->p_brkbase;
    100 	int oszc, szc;
    101 	int err;
    102 
    103 	oszc = p->p_brkpageszc;
    104 
    105 	/*
    106 	 * If p_brkbase has not yet been set, the first call
    107 	 * to brk_internal() will initialize it.
    108 	 */
    109 	if (brkbase == 0) {
    110 		return (brk_internal(nva, oszc));
    111 	}
    112 
    113 	len = nva - bssbase;
    114 
    115 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
    116 	szc = page_szc(pgsz);
    117 
    118 	/*
    119 	 * Covers two cases:
    120 	 * 1. page_szc() returns -1 for invalid page size, so we want to
    121 	 * ignore it in that case.
    122 	 * 2. By design we never decrease page size, as it is more stable.
    123 	 */
    124 	if (szc <= oszc) {
    125 		err = brk_internal(nva, oszc);
    126 		/* If failed, back off to base page size. */
    127 		if (err != 0 && oszc != 0) {
    128 			err = brk_internal(nva, 0);
    129 		}
    130 		return (err);
    131 	}
    132 
    133 	err = brk_internal(nva, szc);
    134 	/* If using szc failed, map with base page size and return. */
    135 	if (err != 0) {
    136 		if (szc != 0) {
    137 			err = brk_internal(nva, 0);
    138 		}
    139 		return (err);
    140 	}
    141 
    142 	/*
    143 	 * Round up brk base to a large page boundary and remap
    144 	 * anything in the segment already faulted in beyond that
    145 	 * point.
    146 	 */
    147 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
    148 	brkend = brkbase + p->p_brksize;
    149 	len = brkend - addr;
    150 	/* Check that len is not negative. Update page size code for heap. */
    151 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
    152 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
    153 		p->p_brkpageszc = szc;
    154 	}
    155 
    156 	ASSERT(err == 0);
    157 	return (err);		/* should always be 0 */
    158 }
    159 
    160 /*
    161  * Returns 0 on success.
    162  */
    163 int
    164 brk_internal(caddr_t nva, uint_t brkszc)
    165 {
    166 	caddr_t ova;			/* current break address */
    167 	size_t size;
    168 	int	error;
    169 	struct proc *p = curproc;
    170 	struct as *as = p->p_as;
    171 	size_t pgsz;
    172 	uint_t szc;
    173 	rctl_qty_t as_rctl;
    174 
    175 	/*
    176 	 * extend heap to brkszc alignment but use current p->p_brkpageszc
    177 	 * for the newly created segment. This allows the new extension
    178 	 * segment to be concatenated successfully with the existing brk
    179 	 * segment.
    180 	 */
    181 	if ((szc = brkszc) != 0) {
    182 		pgsz = page_get_pagesize(szc);
    183 		ASSERT(pgsz > PAGESIZE);
    184 	} else {
    185 		pgsz = PAGESIZE;
    186 	}
    187 
    188 	mutex_enter(&p->p_lock);
    189 	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
    190 	    p->p_rctls, p);
    191 	mutex_exit(&p->p_lock);
    192 
    193 	/*
    194 	 * If p_brkbase has not yet been set, the first call
    195 	 * to brk() will initialize it.
    196 	 */
    197 	if (p->p_brkbase == 0)
    198 		p->p_brkbase = nva;
    199 
    200 	/*
    201 	 * Before multiple page size support existed p_brksize was the value
    202 	 * not rounded to the pagesize (i.e. it stored the exact user request
    203 	 * for heap size). If pgsz is greater than PAGESIZE calculate the
    204 	 * heap size as the real new heap size by rounding it up to pgsz.
    205 	 * This is useful since we may want to know where the heap ends
    206 	 * without knowing heap pagesize (e.g. some old code) and also if
    207 	 * heap pagesize changes we can update p_brkpageszc but delay adding
    208 	 * new mapping yet still know from p_brksize where the heap really
    209 	 * ends. The user requested heap end is stored in libc variable.
    210 	 */
    211 	if (pgsz > PAGESIZE) {
    212 		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
    213 		size = tnva - p->p_brkbase;
    214 		if (tnva < p->p_brkbase || (size > p->p_brksize &&
    215 		    size > (size_t)as_rctl)) {
    216 			szc = 0;
    217 			pgsz = PAGESIZE;
    218 			size = nva - p->p_brkbase;
    219 		}
    220 	} else {
    221 		size = nva - p->p_brkbase;
    222 	}
    223 
    224 	/*
    225 	 * use PAGESIZE to roundup ova because we want to know the real value
    226 	 * of the current heap end in case p_brkpageszc changes since the last
    227 	 * p_brksize was computed.
    228 	 */
    229 	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
    230 	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
    231 	    PAGESIZE);
    232 
    233 	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
    234 	    size > as_rctl)) {
    235 		mutex_enter(&p->p_lock);
    236 		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
    237 		    RCA_SAFE);
    238 		mutex_exit(&p->p_lock);
    239 		return (ENOMEM);
    240 	}
    241 
    242 	if (nva > ova) {
    243 		struct segvn_crargs crargs =
    244 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
    245 
    246 		if (!(p->p_datprot & PROT_EXEC)) {
    247 			crargs.prot &= ~PROT_EXEC;
    248 		}
    249 
    250 		/*
    251 		 * Add new zfod mapping to extend UNIX data segment
    252 		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
    253 		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
    254 		 * page sizes if ova is not aligned to szc's pgsz.
    255 		 */
    256 		if (szc > 0) {
    257 			caddr_t rbss;
    258 
    259 			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
    260 			    pgsz);
    261 			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
    262 				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
    263 				    AS_MAP_NO_LPOOB;
    264 			} else if (ova == rbss) {
    265 				crargs.szc = szc;
    266 			} else {
    267 				crargs.szc = AS_MAP_HEAP;
    268 			}
    269 		} else {
    270 			crargs.szc = AS_MAP_NO_LPOOB;
    271 		}
    272 		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
    273 		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
    274 		    &crargs);
    275 		if (error) {
    276 			return (error);
    277 		}
    278 
    279 	} else if (nva < ova) {
    280 		/*
    281 		 * Release mapping to shrink UNIX data segment.
    282 		 */
    283 		(void) as_unmap(as, nva, (size_t)(ova - nva));
    284 	}
    285 	p->p_brksize = size;
    286 	return (0);
    287 }
    288 
    289 /*
    290  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
    291  * This routine assumes that the stack grows downward.
    292  */
    293 int
    294 grow(caddr_t sp)
    295 {
    296 	struct proc *p = curproc;
    297 	struct as *as = p->p_as;
    298 	size_t oldsize = p->p_stksize;
    299 	size_t newsize;
    300 	int err;
    301 
    302 	/*
    303 	 * Serialize grow operations on an address space.
    304 	 * This also serves as the lock protecting p_stksize
    305 	 * and p_stkpageszc.
    306 	 */
    307 	as_rangelock(as);
    308 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
    309 		err = grow_lpg(sp);
    310 	} else {
    311 		err = grow_internal(sp, p->p_stkpageszc);
    312 	}
    313 	as_rangeunlock(as);
    314 
    315 	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
    316 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
    317 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
    318 		/*
    319 		 * Set up translations so the process doesn't have to fault in
    320 		 * the stack pages we just gave it.
    321 		 */
    322 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
    323 		    newsize - oldsize, F_INVAL, S_WRITE);
    324 	}
    325 	return ((err == 0 ? 1 : 0));
    326 }
    327 
    328 /*
    329  * Algorithm: call arch-specific map_pgsz to get best page size to use,
    330  * then call grow_internal().
    331  * Returns 0 on success.
    332  */
    333 static int
    334 grow_lpg(caddr_t sp)
    335 {
    336 	struct proc *p = curproc;
    337 	size_t pgsz;
    338 	size_t len, newsize;
    339 	caddr_t addr, saddr;
    340 	caddr_t growend;
    341 	int oszc, szc;
    342 	int err;
    343 
    344 	newsize = p->p_usrstack - sp;
    345 
    346 	oszc = p->p_stkpageszc;
    347 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
    348 	szc = page_szc(pgsz);
    349 
    350 	/*
    351 	 * Covers two cases:
    352 	 * 1. page_szc() returns -1 for invalid page size, so we want to
    353 	 * ignore it in that case.
    354 	 * 2. By design we never decrease page size, as it is more stable.
    355 	 * This shouldn't happen as the stack never shrinks.
    356 	 */
    357 	if (szc <= oszc) {
    358 		err = grow_internal(sp, oszc);
    359 		/* failed, fall back to base page size */
    360 		if (err != 0 && oszc != 0) {
    361 			err = grow_internal(sp, 0);
    362 		}
    363 		return (err);
    364 	}
    365 
    366 	/*
    367 	 * We've grown sufficiently to switch to a new page size.
    368 	 * So we are going to remap the whole segment with the new page size.
    369 	 */
    370 	err = grow_internal(sp, szc);
    371 	/* The grow with szc failed, so fall back to base page size. */
    372 	if (err != 0) {
    373 		if (szc != 0) {
    374 			err = grow_internal(sp, 0);
    375 		}
    376 		return (err);
    377 	}
    378 
    379 	/*
    380 	 * Round up stack pointer to a large page boundary and remap
    381 	 * any pgsz pages in the segment already faulted in beyond that
    382 	 * point.
    383 	 */
    384 	saddr = p->p_usrstack - p->p_stksize;
    385 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
    386 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
    387 	len = growend - addr;
    388 	/* Check that len is not negative. Update page size code for stack. */
    389 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
    390 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
    391 		p->p_stkpageszc = szc;
    392 	}
    393 
    394 	ASSERT(err == 0);
    395 	return (err);		/* should always be 0 */
    396 }
    397 
    398 /*
    399  * This routine assumes that the stack grows downward.
    400  * Returns 0 on success, errno on failure.
    401  */
    402 int
    403 grow_internal(caddr_t sp, uint_t growszc)
    404 {
    405 	struct proc *p = curproc;
    406 	size_t newsize;
    407 	size_t oldsize;
    408 	int    error;
    409 	size_t pgsz;
    410 	uint_t szc;
    411 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
    412 
    413 	ASSERT(sp < p->p_usrstack);
    414 	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
    415 
    416 	/*
    417 	 * grow to growszc alignment but use current p->p_stkpageszc for
    418 	 * the segvn_crargs szc passed to segvn_create. For memcntl to
    419 	 * increase the szc, this allows the new extension segment to be
    420 	 * concatenated successfully with the existing stack segment.
    421 	 */
    422 	if ((szc = growszc) != 0) {
    423 		pgsz = page_get_pagesize(szc);
    424 		ASSERT(pgsz > PAGESIZE);
    425 		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
    426 		if (newsize > (size_t)p->p_stk_ctl) {
    427 			szc = 0;
    428 			pgsz = PAGESIZE;
    429 			newsize = p->p_usrstack - sp;
    430 		}
    431 	} else {
    432 		pgsz = PAGESIZE;
    433 		newsize = p->p_usrstack - sp;
    434 	}
    435 
    436 	if (newsize > (size_t)p->p_stk_ctl) {
    437 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
    438 		    RCA_UNSAFE_ALL);
    439 
    440 		return (ENOMEM);
    441 	}
    442 
    443 	oldsize = p->p_stksize;
    444 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
    445 
    446 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
    447 		return (0);
    448 	}
    449 
    450 	if (!(p->p_stkprot & PROT_EXEC)) {
    451 		crargs.prot &= ~PROT_EXEC;
    452 	}
    453 	/*
    454 	 * extend stack with the proposed new growszc, which is different
    455 	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
    456 	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
    457 	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
    458 	 * if not aligned to szc's pgsz.
    459 	 */
    460 	if (szc > 0) {
    461 		caddr_t oldsp = p->p_usrstack - oldsize;
    462 		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
    463 		    pgsz);
    464 
    465 		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
    466 			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
    467 			    AS_MAP_NO_LPOOB;
    468 		} else if (oldsp == austk) {
    469 			crargs.szc = szc;
    470 		} else {
    471 			crargs.szc = AS_MAP_STACK;
    472 		}
    473 	} else {
    474 		crargs.szc = AS_MAP_NO_LPOOB;
    475 	}
    476 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
    477 
    478 	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
    479 	    segvn_create, &crargs)) != 0) {
    480 		if (error == EAGAIN) {
    481 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
    482 			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
    483 		}
    484 		return (error);
    485 	}
    486 	p->p_stksize = newsize;
    487 	return (0);
    488 }
    489 
    490 /*
    491  * Find address for user to map.
    492  * If MAP_FIXED is not specified, we can pick any address we want, but we will
    493  * first try the value in *addrp if it is non-NULL.  Thus this is implementing
    494  * a way to try and get a preferred address.
    495  */
    496 int
    497 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    498     int vacalign, uint_t flags)
    499 {
    500 	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
    501 	size_t lenp = len;
    502 
    503 	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
    504 	if (flags & MAP_FIXED) {
    505 		(void) as_unmap(as, *addrp, len);
    506 		return (0);
    507 	} else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
    508 	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
    509 		/* User supplied address was available */
    510 		*addrp = basep;
    511 	} else {
    512 		/*
    513 		 * No user supplied address or the address supplied was not
    514 		 * available.
    515 		 */
    516 		map_addr(addrp, len, off, vacalign, flags);
    517 	}
    518 	if (*addrp == NULL)
    519 		return (ENOMEM);
    520 	return (0);
    521 }
    522 
    523 
    524 /*
    525  * Used for MAP_ANON - fast way to get anonymous pages
    526  */
    527 static int
    528 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    529     offset_t pos)
    530 {
    531 	struct segvn_crargs vn_a;
    532 	int error;
    533 
    534 	if (((PROT_ALL & uprot) != uprot))
    535 		return (EACCES);
    536 
    537 	if ((flags & MAP_FIXED) != 0) {
    538 		caddr_t userlimit;
    539 
    540 		/*
    541 		 * Use the user address.  First verify that
    542 		 * the address to be used is page aligned.
    543 		 * Then make some simple bounds checks.
    544 		 */
    545 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
    546 			return (EINVAL);
    547 
    548 		userlimit = flags & _MAP_LOW32 ?
    549 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
    550 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
    551 		case RANGE_OKAY:
    552 			break;
    553 		case RANGE_BADPROT:
    554 			return (ENOTSUP);
    555 		case RANGE_BADADDR:
    556 		default:
    557 			return (ENOMEM);
    558 		}
    559 	}
    560 	/*
    561 	 * No need to worry about vac alignment for anonymous
    562 	 * pages since this is a "clone" object that doesn't
    563 	 * yet exist.
    564 	 */
    565 	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
    566 	if (error != 0) {
    567 		return (error);
    568 	}
    569 
    570 	/*
    571 	 * Use the seg_vn segment driver; passing in the NULL amp
    572 	 * gives the desired "cloning" effect.
    573 	 */
    574 	vn_a.vp = NULL;
    575 	vn_a.offset = 0;
    576 	vn_a.type = flags & MAP_TYPE;
    577 	vn_a.prot = uprot;
    578 	vn_a.maxprot = PROT_ALL;
    579 	vn_a.flags = flags & ~MAP_TYPE;
    580 	vn_a.cred = CRED();
    581 	vn_a.amp = NULL;
    582 	vn_a.szc = 0;
    583 	vn_a.lgrp_mem_policy_flags = 0;
    584 
    585 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
    586 }
    587 
    588 static int
    589 smmap_common(caddr_t *addrp, size_t len,
    590     int prot, int flags, struct file *fp, offset_t pos)
    591 {
    592 	struct vnode *vp;
    593 	struct as *as = curproc->p_as;
    594 	uint_t uprot, maxprot, type;
    595 	int error;
    596 	int in_crit = 0;
    597 
    598 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
    599 	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
    600 	    MAP_TEXT | MAP_INITDATA)) != 0) {
    601 		/* | MAP_RENAME */	/* not implemented, let user know */
    602 		return (EINVAL);
    603 	}
    604 
    605 	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
    606 		return (EINVAL);
    607 	}
    608 
    609 	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
    610 		return (EINVAL);
    611 	}
    612 
    613 #if defined(__sparc)
    614 	/*
    615 	 * See if this is an "old mmap call".  If so, remember this
    616 	 * fact and convert the flags value given to mmap to indicate
    617 	 * the specified address in the system call must be used.
    618 	 * _MAP_NEW is turned set by all new uses of mmap.
    619 	 */
    620 	if ((flags & _MAP_NEW) == 0)
    621 		flags |= MAP_FIXED;
    622 #endif
    623 	flags &= ~_MAP_NEW;
    624 
    625 	type = flags & MAP_TYPE;
    626 	if (type != MAP_PRIVATE && type != MAP_SHARED)
    627 		return (EINVAL);
    628 
    629 
    630 	if (flags & MAP_ALIGN) {
    631 
    632 		if (flags & MAP_FIXED)
    633 			return (EINVAL);
    634 
    635 		/* alignment needs to be a power of 2 >= page size */
    636 		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
    637 		    !ISP2((uintptr_t)*addrp))
    638 			return (EINVAL);
    639 	}
    640 	/*
    641 	 * Check for bad lengths and file position.
    642 	 * We let the VOP_MAP routine check for negative lengths
    643 	 * since on some vnode types this might be appropriate.
    644 	 */
    645 	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
    646 		return (EINVAL);
    647 
    648 	maxprot = PROT_ALL;		/* start out allowing all accesses */
    649 	uprot = prot | PROT_USER;
    650 
    651 	if (fp == NULL) {
    652 		ASSERT(flags & MAP_ANON);
    653 		/* discard lwpchan mappings, like munmap() */
    654 		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
    655 			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
    656 		as_rangelock(as);
    657 		error = zmap(as, addrp, len, uprot, flags, pos);
    658 		as_rangeunlock(as);
    659 		/*
    660 		 * Tell machine specific code that lwp has mapped shared memory
    661 		 */
    662 		if (error == 0 && (flags & MAP_SHARED)) {
    663 			/* EMPTY */
    664 			LWP_MMODEL_SHARED_AS(*addrp, len);
    665 		}
    666 		return (error);
    667 	} else if ((flags & MAP_ANON) != 0)
    668 		return (EINVAL);
    669 
    670 	vp = fp->f_vnode;
    671 
    672 	/* Can't execute code from "noexec" mounted filesystem. */
    673 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
    674 		maxprot &= ~PROT_EXEC;
    675 
    676 	/*
    677 	 * These checks were added as part of large files.
    678 	 *
    679 	 * Return ENXIO if the initial position is negative; return EOVERFLOW
    680 	 * if (offset + len) would overflow the maximum allowed offset for the
    681 	 * type of file descriptor being used.
    682 	 */
    683 	if (vp->v_type == VREG) {
    684 		if (pos < 0)
    685 			return (ENXIO);
    686 		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
    687 			return (EOVERFLOW);
    688 	}
    689 
    690 	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
    691 		/* no write access allowed */
    692 		maxprot &= ~PROT_WRITE;
    693 	}
    694 
    695 	/*
    696 	 * XXX - Do we also adjust maxprot based on protections
    697 	 * of the vnode?  E.g. if no execute permission is given
    698 	 * on the vnode for the current user, maxprot probably
    699 	 * should disallow PROT_EXEC also?  This is different
    700 	 * from the write access as this would be a per vnode
    701 	 * test as opposed to a per fd test for writability.
    702 	 */
    703 
    704 	/*
    705 	 * Verify that the specified protections are not greater than
    706 	 * the maximum allowable protections.  Also test to make sure
    707 	 * that the file descriptor does allows for read access since
    708 	 * "write only" mappings are hard to do since normally we do
    709 	 * the read from the file before the page can be written.
    710 	 */
    711 	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
    712 		return (EACCES);
    713 
    714 	/*
    715 	 * If the user specified an address, do some simple checks here
    716 	 */
    717 	if ((flags & MAP_FIXED) != 0) {
    718 		caddr_t userlimit;
    719 
    720 		/*
    721 		 * Use the user address.  First verify that
    722 		 * the address to be used is page aligned.
    723 		 * Then make some simple bounds checks.
    724 		 */
    725 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
    726 			return (EINVAL);
    727 
    728 		userlimit = flags & _MAP_LOW32 ?
    729 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
    730 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
    731 		case RANGE_OKAY:
    732 			break;
    733 		case RANGE_BADPROT:
    734 			return (ENOTSUP);
    735 		case RANGE_BADADDR:
    736 		default:
    737 			return (ENOMEM);
    738 		}
    739 	}
    740 
    741 	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
    742 	    nbl_need_check(vp)) {
    743 		int svmand;
    744 		nbl_op_t nop;
    745 
    746 		nbl_start_crit(vp, RW_READER);
    747 		in_crit = 1;
    748 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    749 		if (error != 0)
    750 			goto done;
    751 		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
    752 			if (prot & (PROT_READ | PROT_EXEC)) {
    753 				nop = NBL_READWRITE;
    754 			} else {
    755 				nop = NBL_WRITE;
    756 			}
    757 		} else {
    758 			nop = NBL_READ;
    759 		}
    760 		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
    761 			error = EACCES;
    762 			goto done;
    763 		}
    764 	}
    765 
    766 	/* discard lwpchan mappings, like munmap() */
    767 	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
    768 		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
    769 
    770 	/*
    771 	 * Ok, now let the vnode map routine do its thing to set things up.
    772 	 */
    773 	error = VOP_MAP(vp, pos, as,
    774 	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
    775 
    776 	if (error == 0) {
    777 		/*
    778 		 * Tell machine specific code that lwp has mapped shared memory
    779 		 */
    780 		if (flags & MAP_SHARED) {
    781 			/* EMPTY */
    782 			LWP_MMODEL_SHARED_AS(*addrp, len);
    783 		}
    784 		if (vp->v_type == VREG &&
    785 		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
    786 			/*
    787 			 * Mark this as an executable vnode
    788 			 */
    789 			mutex_enter(&vp->v_lock);
    790 			vp->v_flag |= VVMEXEC;
    791 			mutex_exit(&vp->v_lock);
    792 		}
    793 	}
    794 
    795 done:
    796 	if (in_crit)
    797 		nbl_end_crit(vp);
    798 	return (error);
    799 }
    800 
    801 #ifdef _LP64
    802 /*
    803  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
    804  *
    805  * The "large file" mmap routine mmap64(2) is also mapped to this routine
    806  * by the 64-bit version of libc.
    807  *
    808  * Eventually, this should be the only version, and have smmap_common()
    809  * folded back into it again.  Some day.
    810  */
    811 caddr_t
    812 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
    813 {
    814 	struct file *fp;
    815 	int error;
    816 
    817 	if (flags & _MAP_LOW32)
    818 		error = EINVAL;
    819 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    820 		error = smmap_common(&addr, len, prot, flags,
    821 		    NULL, (offset_t)pos);
    822 	else if ((fp = getf(fd)) != NULL) {
    823 		error = smmap_common(&addr, len, prot, flags,
    824 		    fp, (offset_t)pos);
    825 		releasef(fd);
    826 	} else
    827 		error = EBADF;
    828 
    829 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
    830 }
    831 #endif	/* _LP64 */
    832 
    833 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    834 
    835 /*
    836  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
    837  */
    838 caddr_t
    839 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
    840 {
    841 	struct file *fp;
    842 	int error;
    843 	caddr_t a = (caddr_t)(uintptr_t)addr;
    844 
    845 	if (flags & _MAP_LOW32)
    846 		error = EINVAL;
    847 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    848 		error = smmap_common(&a, (size_t)len, prot,
    849 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
    850 	else if ((fp = getf(fd)) != NULL) {
    851 		error = smmap_common(&a, (size_t)len, prot,
    852 		    flags | _MAP_LOW32, fp, (offset_t)pos);
    853 		releasef(fd);
    854 	} else
    855 		error = EBADF;
    856 
    857 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
    858 
    859 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
    860 }
    861 
    862 /*
    863  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
    864  *
    865  * Now things really get ugly because we can't use the C-style
    866  * calling convention for more than 6 args, and 64-bit parameter
    867  * passing on 32-bit systems is less than clean.
    868  */
    869 
    870 struct mmaplf32a {
    871 	caddr_t addr;
    872 	size_t len;
    873 #ifdef _LP64
    874 	/*
    875 	 * 32-bit contents, 64-bit cells
    876 	 */
    877 	uint64_t prot;
    878 	uint64_t flags;
    879 	uint64_t fd;
    880 	uint64_t offhi;
    881 	uint64_t offlo;
    882 #else
    883 	/*
    884 	 * 32-bit contents, 32-bit cells
    885 	 */
    886 	uint32_t prot;
    887 	uint32_t flags;
    888 	uint32_t fd;
    889 	uint32_t offhi;
    890 	uint32_t offlo;
    891 #endif
    892 };
    893 
    894 int
    895 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
    896 {
    897 	struct file *fp;
    898 	int error;
    899 	caddr_t a = uap->addr;
    900 	int flags = (int)uap->flags;
    901 	int fd = (int)uap->fd;
    902 #ifdef _BIG_ENDIAN
    903 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
    904 #else
    905 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
    906 #endif
    907 
    908 	if (flags & _MAP_LOW32)
    909 		error = EINVAL;
    910 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    911 		error = smmap_common(&a, uap->len, (int)uap->prot,
    912 		    flags | _MAP_LOW32, NULL, off);
    913 	else if ((fp = getf(fd)) != NULL) {
    914 		error = smmap_common(&a, uap->len, (int)uap->prot,
    915 		    flags | _MAP_LOW32, fp, off);
    916 		releasef(fd);
    917 	} else
    918 		error = EBADF;
    919 
    920 	if (error == 0)
    921 		rvp->r_val1 = (uintptr_t)a;
    922 	return (error);
    923 }
    924 
    925 #endif	/* _SYSCALL32_IMPL || _ILP32 */
    926 
    927 int
    928 munmap(caddr_t addr, size_t len)
    929 {
    930 	struct proc *p = curproc;
    931 	struct as *as = p->p_as;
    932 
    933 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
    934 		return (set_errno(EINVAL));
    935 
    936 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
    937 		return (set_errno(EINVAL));
    938 
    939 	/*
    940 	 * Discard lwpchan mappings.
    941 	 */
    942 	if (p->p_lcp != NULL)
    943 		lwpchan_delete_mapping(p, addr, addr + len);
    944 	if (as_unmap(as, addr, len) != 0)
    945 		return (set_errno(EINVAL));
    946 
    947 	return (0);
    948 }
    949 
    950 int
    951 mprotect(caddr_t addr, size_t len, int prot)
    952 {
    953 	struct as *as = curproc->p_as;
    954 	uint_t uprot = prot | PROT_USER;
    955 	int error;
    956 
    957 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
    958 		return (set_errno(EINVAL));
    959 
    960 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
    961 	case RANGE_OKAY:
    962 		break;
    963 	case RANGE_BADPROT:
    964 		return (set_errno(ENOTSUP));
    965 	case RANGE_BADADDR:
    966 	default:
    967 		return (set_errno(ENOMEM));
    968 	}
    969 
    970 	error = as_setprot(as, addr, len, uprot);
    971 	if (error)
    972 		return (set_errno(error));
    973 	return (0);
    974 }
    975 
    976 #define	MC_CACHE	128			/* internal result buffer */
    977 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
    978 
    979 int
    980 mincore(caddr_t addr, size_t len, char *vecp)
    981 {
    982 	struct as *as = curproc->p_as;
    983 	caddr_t ea;			/* end address of loop */
    984 	size_t rl;			/* inner result length */
    985 	char vec[MC_CACHE];		/* local vector cache */
    986 	int error;
    987 	model_t model;
    988 	long	llen;
    989 
    990 	model = get_udatamodel();
    991 	/*
    992 	 * Validate form of address parameters.
    993 	 */
    994 	if (model == DATAMODEL_NATIVE) {
    995 		llen = (long)len;
    996 	} else {
    997 		llen = (int32_t)(size32_t)len;
    998 	}
    999 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
   1000 		return (set_errno(EINVAL));
   1001 
   1002 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
   1003 		return (set_errno(ENOMEM));
   1004 
   1005 	/*
   1006 	 * Loop over subranges of interval [addr : addr + len), recovering
   1007 	 * results internally and then copying them out to caller.  Subrange
   1008 	 * is based on the size of MC_CACHE, defined above.
   1009 	 */
   1010 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
   1011 		error = as_incore(as, addr,
   1012 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
   1013 		if (rl != 0) {
   1014 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
   1015 			if (copyout(vec, vecp, rl) != 0)
   1016 				return (set_errno(EFAULT));
   1017 			vecp += rl;
   1018 		}
   1019 		if (error != 0)
   1020 			return (set_errno(ENOMEM));
   1021 	}
   1022 	return (0);
   1023 }
   1024