Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/sysmacros.h>
     28 #include <sys/kmem.h>
     29 #include <sys/param.h>
     30 #include <sys/systm.h>
     31 #include <sys/errno.h>
     32 #include <sys/mman.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/cred.h>
     35 #include <sys/vmsystm.h>
     36 #include <sys/machsystm.h>
     37 #include <sys/debug.h>
     38 #include <vm/as.h>
     39 #include <vm/seg.h>
     40 #include <sys/vmparam.h>
     41 #include <sys/vfs.h>
     42 #include <sys/elf.h>
     43 #include <sys/machelf.h>
     44 #include <sys/corectl.h>
     45 #include <sys/exec.h>
     46 #include <sys/exechdr.h>
     47 #include <sys/autoconf.h>
     48 #include <sys/mem.h>
     49 #include <vm/seg_dev.h>
     50 #include <sys/vmparam.h>
     51 #include <sys/mmapobj.h>
     52 #include <sys/atomic.h>
     53 
     54 /*
     55  * Theory statement:
     56  *
     57  * The main driving force behind mmapobj is to interpret and map ELF files
     58  * inside of the kernel instead of having the linker be responsible for this.
     59  *
     60  * mmapobj also supports the AOUT 4.x binary format as well as flat files in
     61  * a read only manner.
     62  *
     63  * When interpreting and mapping an ELF file, mmapobj will map each PT_LOAD
     64  * or PT_SUNWBSS segment according to the ELF standard.  Refer to the "Linker
     65  * and Libraries Guide" for more information about the standard and mapping
     66  * rules.
     67  *
     68  * Having mmapobj interpret and map objects will allow the kernel to make the
     69  * best decision for where to place the mappings for said objects.  Thus, we
     70  * can make optimizations inside of the kernel for specific platforms or
     71  * cache mapping information to make mapping objects faster.
     72  *
     73  * The lib_va_hash will be one such optimization.  For each ELF object that
     74  * mmapobj is asked to interpret, we will attempt to cache the information
     75  * about the PT_LOAD and PT_SUNWBSS sections to speed up future mappings of
     76  * the same objects.  We will cache up to LIBVA_CACHED_SEGS (see below) program
     77  * headers which should cover a majority of the libraries out there without
     78  * wasting space.  In order to make sure that the cached information is valid,
     79  * we check the passed in vnode's mtime and ctime to make sure the vnode
     80  * has not been modified since the last time we used it.
     81  *
     82  * In addition, the lib_va_hash may contain a preferred starting VA for the
     83  * object which can be useful for platforms which support a shared context.
     84  * This will increase the likelihood that library text can be shared among
     85  * many different processes.  We limit the reserved VA space for 32 bit objects
     86  * in order to minimize fragmenting the process's address space.
     87  *
     88  * In addition to the above, the mmapobj interface allows for padding to be
     89  * requested before the first mapping and after the last mapping created.
     90  * When padding is requested, no additional optimizations will be made for
     91  * that request.
     92  */
     93 
     94 /*
     95  * Threshold to prevent allocating too much kernel memory to read in the
     96  * program headers for an object.  If it requires more than below,
     97  * we will use a KM_NOSLEEP allocation to allocate memory to hold all of the
     98  * program headers which could possibly fail.  If less memory than below is
     99  * needed, then we use a KM_SLEEP allocation and are willing to wait for the
    100  * memory if we need to.
    101  */
    102 size_t mmapobj_alloc_threshold = 65536;
    103 
/*
 * Debug stats for test coverage.
 *
 * On DEBUG builds, mobj_stats holds one counter per code path of interest
 * and MOBJ_STAT_ADD(name) bumps mobj_stats.mobjs_<name>.  The increment is
 * a plain post-increment with no locking or atomics, so concurrent updates
 * may be lost (presumably acceptable for coverage counters).
 *
 * On non-DEBUG builds the structure is not compiled in and MOBJ_STAT_ADD()
 * expands to nothing, so a statement of the form "MOBJ_STAT_ADD(x);"
 * collapses to an empty statement.
 */
#ifdef DEBUG
struct mobj_stats {
	uint_t	mobjs_unmap_called;
	uint_t	mobjs_remap_devnull;
	uint_t	mobjs_lookup_start;
	uint_t	mobjs_alloc_start;
	uint_t	mobjs_alloc_vmem;
	uint_t	mobjs_add_collision;
	uint_t	mobjs_get_addr;
	uint_t	mobjs_map_flat_no_padding;
	uint_t	mobjs_map_flat_padding;
	uint_t	mobjs_map_ptload_text;
	uint_t	mobjs_map_ptload_initdata;
	uint_t	mobjs_map_ptload_preread;
	uint_t	mobjs_map_ptload_unaligned_text;
	uint_t	mobjs_map_ptload_unaligned_map_fail;
	uint_t	mobjs_map_ptload_unaligned_read_fail;
	uint_t	mobjs_zfoddiff;
	uint_t	mobjs_zfoddiff_nowrite;
	uint_t	mobjs_zfodextra;
	uint_t	mobjs_ptload_failed;
	uint_t	mobjs_map_elf_no_holes;
	uint_t	mobjs_unmap_hole;
	uint_t	mobjs_nomem_header;
	uint_t	mobjs_inval_header;
	uint_t	mobjs_overlap_header;
	uint_t	mobjs_np2_align;
	uint_t	mobjs_np2_align_overflow;
	uint_t	mobjs_exec_padding;
	uint_t	mobjs_exec_addr_mapped;
	uint_t	mobjs_exec_addr_devnull;
	uint_t	mobjs_exec_addr_in_use;
	uint_t	mobjs_lvp_found;
	uint_t	mobjs_no_loadable_yet;
	uint_t	mobjs_nothing_to_map;
	uint_t	mobjs_e2big;
	uint_t	mobjs_dyn_pad_align;
	uint_t	mobjs_dyn_pad_noalign;
	uint_t	mobjs_alloc_start_fail;
	uint_t	mobjs_lvp_nocache;
	uint_t	mobjs_extra_padding;
	uint_t	mobjs_lvp_not_needed;
	uint_t	mobjs_no_mem_map_sz;
	uint_t	mobjs_check_exec_failed;
	uint_t	mobjs_lvp_used;
	uint_t	mobjs_wrong_model;
	uint_t	mobjs_noexec_fs;
	uint_t	mobjs_e2big_et_rel;
	uint_t	mobjs_et_rel_mapped;
	uint_t	mobjs_unknown_elf_type;
	uint_t	mobjs_phent32_too_small;
	uint_t	mobjs_phent64_too_small;
	uint_t	mobjs_inval_elf_class;
	uint_t	mobjs_too_many_phdrs;
	uint_t	mobjs_no_phsize;
	uint_t	mobjs_phsize_large;
	uint_t	mobjs_phsize_xtralarge;
	uint_t	mobjs_fast_wrong_model;
	uint_t	mobjs_fast_e2big;
	uint_t	mobjs_fast;
	uint_t	mobjs_fast_success;
	uint_t	mobjs_fast_not_now;
	uint_t	mobjs_small_file;
	uint_t	mobjs_read_error;
	uint_t	mobjs_unsupported;
	uint_t	mobjs_flat_e2big;
	uint_t	mobjs_phent_align32;
	uint_t	mobjs_phent_align64;
	uint_t	mobjs_lib_va_find_hit;
	uint_t	mobjs_lib_va_find_delay_delete;
	uint_t	mobjs_lib_va_find_delete;
	uint_t	mobjs_lib_va_add_delay_delete;
	uint_t	mobjs_lib_va_add_delete;
	uint_t	mobjs_lib_va_create_failure;
	uint_t	mobjs_min_align;
	/* Counters for the AOUT paths, which are only built on sparc. */
#if defined(__sparc)
	uint_t	mobjs_aout_uzero_fault;
	uint_t	mobjs_aout_64bit_try;
	uint_t	mobjs_aout_noexec;
	uint_t	mobjs_aout_e2big;
	uint_t	mobjs_aout_lib;
	uint_t	mobjs_aout_fixed;
	uint_t	mobjs_aout_zfoddiff;
	uint_t	mobjs_aout_map_bss;
	uint_t	mobjs_aout_bss_fail;
	uint_t	mobjs_aout_nlist;
	uint_t	mobjs_aout_addr_in_use;
#endif
} mobj_stats;

/* Token-paste the short stat name onto the mobjs_ field prefix. */
#define	MOBJ_STAT_ADD(stat)		((mobj_stats.mobjs_##stat)++)
#else
#define	MOBJ_STAT_ADD(stat)
#endif
    199 
    200 /*
    201  * Check if addr is at or above the address space reserved for the stack.
    202  * The stack is at the top of the address space for all sparc processes
    203  * and 64 bit x86 processes.  For 32 bit x86, the stack is not at the top
    204  * of the address space and thus this check wil always return false for
    205  * 32 bit x86 processes.
    206  */
    207 #if defined(__sparc)
    208 #define	OVERLAPS_STACK(addr, p)						\
    209 	(addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))
    210 #elif defined(__amd64)
    211 #define	OVERLAPS_STACK(addr, p)						\
    212 	((p->p_model == DATAMODEL_LP64) &&				\
    213 	(addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK))))
    214 #elif defined(__i386)
    215 #define	OVERLAPS_STACK(addr, p)	0
    216 #endif
    217 
/* lv_flags values - bitmap */
#define	LV_ELF32	0x1		/* 32 bit ELF file */
#define	LV_ELF64	0x2		/* 64 bit ELF file */
#define	LV_DEL		0x4		/* delete when lv_refcnt hits zero */

/*
 * One cached entry in lib_va_hash, describing a single file identified by
 * (lv_fsid, lv_nodeid) as of the recorded ctime/mtime.
 *
 * Note: lv_num_segs will denote how many segments this file has and will
 * only be set after the lv_mps array has been filled out.
 * lv_mps can only be valid if lv_num_segs is non-zero.
 *
 * lv_base_va may be NULL, in which case no address range was reserved in
 * a lib_va arena for this entry.  lv_refcnt and lv_flags are modified
 * while holding the hash bucket mutex for the entry's nodeid.
 */
struct lib_va {
	struct lib_va		*lv_next;	/* next entry on hash chain */
	caddr_t			lv_base_va;	/* start va for library */
	ssize_t			lv_len;		/* total va span of library */
	size_t			lv_align;	/* minimum alignment */
	uint64_t		lv_nodeid;	/* filesystem node id */
	uint64_t		lv_fsid;	/* filesystem id */
	timestruc_t		lv_ctime;	/* last time file was changed */
	timestruc_t		lv_mtime;	/* or modified */
	mmapobj_result_t	lv_mps[LIBVA_CACHED_SEGS]; /* cached pheaders */
	int			lv_num_segs;	/* # segs for this file */
	int			lv_flags;	/* LV_* bitmap, see above */
	uint_t			lv_refcnt;	/* number of holds on struct */
};
    242 
/*
 * Geometry of the lib_va hash table.  LIB_VA_SIZE is the number of hash
 * buckets and must be a power of 2 so that LIB_VA_MASK can be used to
 * reduce a hash value to a bucket index.  Buckets share mutexes: each
 * mutex covers (1 << LIB_VA_MUTEX_SHIFT) consecutive buckets.
 */
#define	LIB_VA_SIZE	1024
#define	LIB_VA_MASK	(LIB_VA_SIZE - 1)
#define	LIB_VA_MUTEX_SHIFT	3

/* A power of 2 has exactly one bit set, so x & (x - 1) must be zero. */
#if (LIB_VA_SIZE & (LIB_VA_SIZE - 1))
#error	"LIB_VA_SIZE is not a power of 2"
#endif

/* Bucket heads, and the mutexes that protect the chains hanging off them. */
static struct lib_va *lib_va_hash[LIB_VA_SIZE];
static kmutex_t lib_va_hash_mutex[LIB_VA_SIZE >> LIB_VA_MUTEX_SHIFT];
    253 
    254 #define	LIB_VA_HASH_MUTEX(index)					\
    255 	(&lib_va_hash_mutex[index >> LIB_VA_MUTEX_SHIFT])
    256 
    257 #define	LIB_VA_HASH(nodeid)						\
    258 	(((nodeid) ^ ((nodeid) << 7) ^ ((nodeid) << 13)) & LIB_VA_MASK)
    259 
    260 #define	LIB_VA_MATCH_ID(arg1, arg2)					\
    261 	((arg1)->lv_nodeid == (arg2)->va_nodeid &&			\
    262 	(arg1)->lv_fsid == (arg2)->va_fsid)
    263 
    264 #define	LIB_VA_MATCH_TIME(arg1, arg2)					\
    265 	((arg1)->lv_ctime.tv_sec == (arg2)->va_ctime.tv_sec &&		\
    266 	(arg1)->lv_mtime.tv_sec == (arg2)->va_mtime.tv_sec &&		\
    267 	(arg1)->lv_ctime.tv_nsec == (arg2)->va_ctime.tv_nsec &&		\
    268 	(arg1)->lv_mtime.tv_nsec == (arg2)->va_mtime.tv_nsec)
    269 
    270 #define	LIB_VA_MATCH(arg1, arg2)					\
    271 	(LIB_VA_MATCH_ID(arg1, arg2) && LIB_VA_MATCH_TIME(arg1, arg2))
    272 
/*
 * lib_va will be used for optimized allocation of address ranges for
 * libraries, such that subsequent mappings of the same library will attempt
 * to use the same VA as previous mappings of that library.
 * In order to map libraries at the same VA in many processes, we need to carve
 * out our own address space for them which is unique across many processes.
 * We use different arenas for 32 bit and 64 bit libraries.
 *
 * Since the 32 bit address space is relatively small, we limit the number of
 * libraries which try to use consistent virtual addresses to lib_threshold.
 * For 64 bit libraries there is no such limit since the address space is large.
 *
 * Both arenas start out NULL and are created on first use.
 */
static vmem_t *lib_va_32_arena;
static vmem_t *lib_va_64_arena;
uint_t lib_threshold = 20;	/* modifiable via /etc/system */

/*
 * Serializes the first-use creation of the two arenas above.
 * NOTE(review): "no need to initialize" presumably relies on a zero-filled
 * static kmutex_t being a valid default mutex -- confirm.
 */
static kmutex_t lib_va_init_mutex;	/* no need to initialize */

/*
 * Number of 32 bit and 64 bit libraries in lib_va hash.
 * Only entries that hold a reserved base VA are counted; the counters are
 * updated with atomic_add_32() when such an entry is added or freed.
 */
static uint_t libs_mapped_32 = 0;
static uint_t libs_mapped_64 = 0;
    296 
    297 /*
    298  * Free up the resources associated with lvp as well as lvp itself.
    299  * We also decrement the number of libraries mapped via a lib_va
    300  * cached virtual address.
    301  */
    302 void
    303 lib_va_free(struct lib_va *lvp)
    304 {
    305 	int is_64bit = lvp->lv_flags & LV_ELF64;
    306 	ASSERT(lvp->lv_refcnt == 0);
    307 
    308 	if (lvp->lv_base_va != NULL) {
    309 		vmem_xfree(is_64bit ? lib_va_64_arena : lib_va_32_arena,
    310 		    lvp->lv_base_va, lvp->lv_len);
    311 		if (is_64bit) {
    312 			atomic_add_32(&libs_mapped_64, -1);
    313 		} else {
    314 			atomic_add_32(&libs_mapped_32, -1);
    315 		}
    316 	}
    317 	kmem_free(lvp, sizeof (struct lib_va));
    318 }
    319 
    320 /*
    321  * See if the file associated with the vap passed in is in the lib_va hash.
    322  * If it is and the file has not been modified since last use, then
    323  * return a pointer to that data.  Otherwise, return NULL if the file has
    324  * changed or the file was not found in the hash.
    325  */
    326 static struct lib_va *
    327 lib_va_find(vattr_t *vap)
    328 {
    329 	struct lib_va *lvp;
    330 	struct lib_va *del = NULL;
    331 	struct lib_va **tmp;
    332 	uint_t index;
    333 	index = LIB_VA_HASH(vap->va_nodeid);
    334 
    335 	mutex_enter(LIB_VA_HASH_MUTEX(index));
    336 	tmp = &lib_va_hash[index];
    337 	while (*tmp != NULL) {
    338 		lvp = *tmp;
    339 		if (LIB_VA_MATCH_ID(lvp, vap)) {
    340 			if (LIB_VA_MATCH_TIME(lvp, vap)) {
    341 				ASSERT((lvp->lv_flags & LV_DEL) == 0);
    342 				lvp->lv_refcnt++;
    343 				MOBJ_STAT_ADD(lib_va_find_hit);
    344 			} else {
    345 				/*
    346 				 * file was updated since last use.
    347 				 * need to remove it from list.
    348 				 */
    349 				del = lvp;
    350 				*tmp = del->lv_next;
    351 				del->lv_next = NULL;
    352 				/*
    353 				 * If we can't delete it now, mark it for later
    354 				 */
    355 				if (del->lv_refcnt) {
    356 					MOBJ_STAT_ADD(lib_va_find_delay_delete);
    357 					del->lv_flags |= LV_DEL;
    358 					del = NULL;
    359 				}
    360 				lvp = NULL;
    361 			}
    362 			mutex_exit(LIB_VA_HASH_MUTEX(index));
    363 			if (del) {
    364 				ASSERT(del->lv_refcnt == 0);
    365 				MOBJ_STAT_ADD(lib_va_find_delete);
    366 				lib_va_free(del);
    367 			}
    368 			return (lvp);
    369 		}
    370 		tmp = &lvp->lv_next;
    371 	}
    372 	mutex_exit(LIB_VA_HASH_MUTEX(index));
    373 	return (NULL);
    374 }
    375 
    376 /*
    377  * Add a new entry to the lib_va hash.
    378  * Search the hash while holding the appropriate mutex to make sure that the
    379  * data is not already in the cache.  If we find data that is in the cache
    380  * already and has not been modified since last use, we return NULL.  If it
    381  * has been modified since last use, we will remove that entry from
    382  * the hash and it will be deleted once it's reference count reaches zero.
    383  * If there is no current entry in the hash we will add the new entry and
    384  * return it to the caller who is responsible for calling lib_va_release to
    385  * drop their reference count on it.
    386  *
    387  * lv_num_segs will be set to zero since the caller needs to add that
    388  * information to the data structure.
    389  */
    390 static struct lib_va *
    391 lib_va_add_hash(caddr_t base_va, ssize_t len, size_t align, vattr_t *vap)
    392 {
    393 	struct lib_va *lvp;
    394 	uint_t index;
    395 	model_t model;
    396 	struct lib_va **tmp;
    397 	struct lib_va *del = NULL;
    398 
    399 	model = get_udatamodel();
    400 	index = LIB_VA_HASH(vap->va_nodeid);
    401 
    402 	lvp = kmem_alloc(sizeof (struct lib_va), KM_SLEEP);
    403 
    404 	mutex_enter(LIB_VA_HASH_MUTEX(index));
    405 
    406 	/*
    407 	 * Make sure not adding same data a second time.
    408 	 * The hash chains should be relatively short and adding
    409 	 * is a relatively rare event, so it's worth the check.
    410 	 */
    411 	tmp = &lib_va_hash[index];
    412 	while (*tmp != NULL) {
    413 		if (LIB_VA_MATCH_ID(*tmp, vap)) {
    414 			if (LIB_VA_MATCH_TIME(*tmp, vap)) {
    415 				mutex_exit(LIB_VA_HASH_MUTEX(index));
    416 				kmem_free(lvp, sizeof (struct lib_va));
    417 				return (NULL);
    418 			}
    419 
    420 			/*
    421 			 * We have the same nodeid and fsid but the file has
    422 			 * been modified since we last saw it.
    423 			 * Need to remove the old node and add this new
    424 			 * one.
    425 			 * Could probably use a callback mechanism to make
    426 			 * this cleaner.
    427 			 */
    428 			ASSERT(del == NULL);
    429 			del = *tmp;
    430 			*tmp = del->lv_next;
    431 			del->lv_next = NULL;
    432 
    433 			/*
    434 			 * Check to see if we can free it.  If lv_refcnt
    435 			 * is greater than zero, than some other thread
    436 			 * has a reference to the one we want to delete
    437 			 * and we can not delete it.  All of this is done
    438 			 * under the lib_va_hash_mutex lock so it is atomic.
    439 			 */
    440 			if (del->lv_refcnt) {
    441 				MOBJ_STAT_ADD(lib_va_add_delay_delete);
    442 				del->lv_flags |= LV_DEL;
    443 				del = NULL;
    444 			}
    445 			/* tmp is already advanced */
    446 			continue;
    447 		}
    448 		tmp = &((*tmp)->lv_next);
    449 	}
    450 
    451 	lvp->lv_base_va = base_va;
    452 	lvp->lv_len = len;
    453 	lvp->lv_align = align;
    454 	lvp->lv_nodeid = vap->va_nodeid;
    455 	lvp->lv_fsid = vap->va_fsid;
    456 	lvp->lv_ctime.tv_sec = vap->va_ctime.tv_sec;
    457 	lvp->lv_ctime.tv_nsec = vap->va_ctime.tv_nsec;
    458 	lvp->lv_mtime.tv_sec = vap->va_mtime.tv_sec;
    459 	lvp->lv_mtime.tv_nsec = vap->va_mtime.tv_nsec;
    460 	lvp->lv_next = NULL;
    461 	lvp->lv_refcnt = 1;
    462 
    463 	/* Caller responsible for filling this and lv_mps out */
    464 	lvp->lv_num_segs = 0;
    465 
    466 	if (model == DATAMODEL_LP64) {
    467 		lvp->lv_flags = LV_ELF64;
    468 	} else {
    469 		ASSERT(model == DATAMODEL_ILP32);
    470 		lvp->lv_flags = LV_ELF32;
    471 	}
    472 
    473 	if (base_va != NULL) {
    474 		if (model == DATAMODEL_LP64) {
    475 			atomic_add_32(&libs_mapped_64, 1);
    476 		} else {
    477 			ASSERT(model == DATAMODEL_ILP32);
    478 			atomic_add_32(&libs_mapped_32, 1);
    479 		}
    480 	}
    481 	ASSERT(*tmp == NULL);
    482 	*tmp = lvp;
    483 	mutex_exit(LIB_VA_HASH_MUTEX(index));
    484 	if (del) {
    485 		ASSERT(del->lv_refcnt == 0);
    486 		MOBJ_STAT_ADD(lib_va_add_delete);
    487 		lib_va_free(del);
    488 	}
    489 	return (lvp);
    490 }
    491 
    492 /*
    493  * Release the hold on lvp which was acquired by lib_va_find or lib_va_add_hash.
    494  * In addition, if this is the last hold and lvp is marked for deletion,
    495  * free up it's reserved address space and free the structure.
    496  */
    497 static void
    498 lib_va_release(struct lib_va *lvp)
    499 {
    500 	uint_t index;
    501 	int to_del = 0;
    502 
    503 	ASSERT(lvp->lv_refcnt > 0);
    504 
    505 	index = LIB_VA_HASH(lvp->lv_nodeid);
    506 	mutex_enter(LIB_VA_HASH_MUTEX(index));
    507 	if (--lvp->lv_refcnt == 0 && (lvp->lv_flags & LV_DEL)) {
    508 		to_del = 1;
    509 	}
    510 	mutex_exit(LIB_VA_HASH_MUTEX(index));
    511 	if (to_del) {
    512 		ASSERT(lvp->lv_next == 0);
    513 		lib_va_free(lvp);
    514 	}
    515 }
    516 
    517 /*
    518  * Dummy function for mapping through /dev/null
    519  * Normally I would have used mmmmap in common/io/mem.c
    520  * but that is a static function, and for /dev/null, it
    521  * just returns -1.
    522  */
    523 /* ARGSUSED */
    524 static int
    525 mmapobj_dummy(dev_t dev, off_t off, int prot)
    526 {
    527 	return (-1);
    528 }
    529 
    530 /*
    531  * Called when an error occurred which requires mmapobj to return failure.
    532  * All mapped objects will be unmapped and /dev/null mappings will be
    533  * reclaimed if necessary.
    534  * num_mapped is the number of elements of mrp which have been mapped, and
    535  * num_segs is the total number of elements in mrp.
    536  * For e_type ET_EXEC, we need to unmap all of the elements in mrp since
    537  * we had already made reservations for them.
    538  * If num_mapped equals num_segs, then we know that we had fully mapped
    539  * the file and only need to clean up the segments described.
    540  * If they are not equal, then for ET_DYN we will unmap the range from the
    541  * end of the last mapped segment to the end of the last segment in mrp
    542  * since we would have made a reservation for that memory earlier.
    543  * If e_type is passed in as zero, num_mapped must equal num_segs.
    544  */
    545 void
    546 mmapobj_unmap(mmapobj_result_t *mrp, int num_mapped, int num_segs,
    547     ushort_t e_type)
    548 {
    549 	int i;
    550 	struct as *as = curproc->p_as;
    551 	caddr_t addr;
    552 	size_t size;
    553 
    554 	if (e_type == ET_EXEC) {
    555 		num_mapped = num_segs;
    556 	}
    557 #ifdef DEBUG
    558 	if (e_type == 0) {
    559 		ASSERT(num_mapped == num_segs);
    560 	}
    561 #endif
    562 
    563 	MOBJ_STAT_ADD(unmap_called);
    564 	for (i = 0; i < num_mapped; i++) {
    565 
    566 		/*
    567 		 * If we are going to have to create a mapping we need to
    568 		 * make sure that no one else will use the address we
    569 		 * need to remap between the time it is unmapped and
    570 		 * mapped below.
    571 		 */
    572 		if (mrp[i].mr_flags & MR_RESV) {
    573 			as_rangelock(as);
    574 		}
    575 		/* Always need to unmap what we mapped */
    576 		(void) as_unmap(as, mrp[i].mr_addr, mrp[i].mr_msize);
    577 
    578 		/* Need to reclaim /dev/null reservation from earlier */
    579 		if (mrp[i].mr_flags & MR_RESV) {
    580 			struct segdev_crargs dev_a;
    581 
    582 			ASSERT(e_type != ET_DYN);
    583 			/*
    584 			 * Use seg_dev segment driver for /dev/null mapping.
    585 			 */
    586 			dev_a.mapfunc = mmapobj_dummy;
    587 			dev_a.dev = makedevice(mm_major, M_NULL);
    588 			dev_a.offset = 0;
    589 			dev_a.type = 0;		/* neither PRIVATE nor SHARED */
    590 			dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
    591 			dev_a.hat_attr = 0;
    592 			dev_a.hat_flags = 0;
    593 
    594 			(void) as_map(as, mrp[i].mr_addr, mrp[i].mr_msize,
    595 			    segdev_create, &dev_a);
    596 			MOBJ_STAT_ADD(remap_devnull);
    597 			as_rangeunlock(as);
    598 		}
    599 	}
    600 
    601 	if (num_mapped != num_segs) {
    602 		ASSERT(e_type == ET_DYN);
    603 		/* Need to unmap any reservation made after last mapped seg */
    604 		if (num_mapped == 0) {
    605 			addr = mrp[0].mr_addr;
    606 		} else {
    607 			addr = mrp[num_mapped - 1].mr_addr +
    608 			    mrp[num_mapped - 1].mr_msize;
    609 		}
    610 		size = (size_t)mrp[num_segs - 1].mr_addr +
    611 		    mrp[num_segs - 1].mr_msize - (size_t)addr;
    612 		(void) as_unmap(as, addr, size);
    613 
    614 		/*
    615 		 * Now we need to unmap the holes between mapped segs.
    616 		 * Note that we have not mapped all of the segments and thus
    617 		 * the holes between segments would not have been unmapped
    618 		 * yet.  If num_mapped == num_segs, then all of the holes
    619 		 * between segments would have already been unmapped.
    620 		 */
    621 
    622 		for (i = 1; i < num_mapped; i++) {
    623 			addr = mrp[i - 1].mr_addr + mrp[i - 1].mr_msize;
    624 			size = mrp[i].mr_addr - addr;
    625 			(void) as_unmap(as, addr, size);
    626 		}
    627 	}
    628 }
    629 
    630 /*
    631  * We need to add the start address into mrp so that the unmap function
    632  * has absolute addresses to use.
    633  */
    634 static void
    635 mmapobj_unmap_exec(mmapobj_result_t *mrp, int num_mapped, caddr_t start_addr)
    636 {
    637 	int i;
    638 
    639 	for (i = 0; i < num_mapped; i++) {
    640 		mrp[i].mr_addr += (size_t)start_addr;
    641 	}
    642 	mmapobj_unmap(mrp, num_mapped, num_mapped, ET_EXEC);
    643 }
    644 
    645 static caddr_t
    646 mmapobj_lookup_start_addr(struct lib_va *lvp)
    647 {
    648 	proc_t *p = curproc;
    649 	struct as *as = p->p_as;
    650 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
    651 	int error;
    652 	uint_t ma_flags = _MAP_LOW32;
    653 	caddr_t base = NULL;
    654 	size_t len;
    655 	size_t align;
    656 
    657 	ASSERT(lvp != NULL);
    658 	MOBJ_STAT_ADD(lookup_start);
    659 
    660 	as_rangelock(as);
    661 
    662 	base = lvp->lv_base_va;
    663 	len = lvp->lv_len;
    664 
    665 	/*
    666 	 * If we don't have an expected base address, or the one that we want
    667 	 * to use is not available or acceptable, go get an acceptable
    668 	 * address range.
    669 	 */
    670 	if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) ||
    671 	    valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) !=
    672 	    RANGE_OKAY || OVERLAPS_STACK(base + len, p)) {
    673 		if (lvp->lv_flags & LV_ELF64) {
    674 			ma_flags = 0;
    675 		}
    676 
    677 		align = lvp->lv_align;
    678 		if (align > 1) {
    679 			ma_flags |= MAP_ALIGN;
    680 		}
    681 
    682 		base = (caddr_t)align;
    683 		map_addr(&base, len, 0, 1, ma_flags);
    684 	}
    685 
    686 	/*
    687 	 * Need to reserve the address space we're going to use.
    688 	 * Don't reserve swap space since we'll be mapping over this.
    689 	 */
    690 	if (base != NULL) {
    691 		crargs.flags |= MAP_NORESERVE;
    692 		error = as_map(as, base, len, segvn_create, &crargs);
    693 		if (error) {
    694 			base = NULL;
    695 		}
    696 	}
    697 
    698 	as_rangeunlock(as);
    699 	return (base);
    700 }
    701 
    702 /*
    703  * Get the starting address for a given file to be mapped and return it
    704  * to the caller.  If we're using lib_va and we need to allocate an address,
    705  * we will attempt to allocate it from the global reserved pool such that the
    706  * same address can be used in the future for this file.  If we can't use the
    707  * reserved address then we just get one that will fit in our address space.
    708  *
    709  * Returns the starting virtual address for the range to be mapped or NULL
    710  * if an error is encountered. If we successfully insert the requested info
    711  * into the lib_va hash, then *lvpp will be set to point to this lib_va
    712  * structure.  The structure will have a hold on it and thus lib_va_release
    713  * needs to be called on it by the caller.  This function will not fill out
    714  * lv_mps or lv_num_segs since it does not have enough information to do so.
    715  * The caller is responsible for doing this making sure that any modifications
    716  * to lv_mps are visible before setting lv_num_segs.
    717  */
    718 static caddr_t
    719 mmapobj_alloc_start_addr(struct lib_va **lvpp, size_t len, int use_lib_va,
    720     size_t align, vattr_t *vap)
    721 {
    722 	proc_t *p = curproc;
    723 	struct as *as = p->p_as;
    724 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
    725 	int error;
    726 	model_t model;
    727 	uint_t ma_flags = _MAP_LOW32;
    728 	caddr_t base = NULL;
    729 	vmem_t *model_vmem;
    730 	size_t lib_va_start;
    731 	size_t lib_va_end;
    732 	size_t lib_va_len;
    733 
    734 	ASSERT(lvpp != NULL);
    735 
    736 	MOBJ_STAT_ADD(alloc_start);
    737 	model = get_udatamodel();
    738 
    739 	if (model == DATAMODEL_LP64) {
    740 		ma_flags = 0;
    741 		model_vmem = lib_va_64_arena;
    742 	} else {
    743 		ASSERT(model == DATAMODEL_ILP32);
    744 		model_vmem = lib_va_32_arena;
    745 	}
    746 
    747 	if (align > 1) {
    748 		ma_flags |= MAP_ALIGN;
    749 	}
    750 	if (use_lib_va) {
    751 		/*
    752 		 * The first time through, we need to setup the lib_va arenas.
    753 		 * We call map_addr to find a suitable range of memory to map
    754 		 * the given library, and we will set the highest address
    755 		 * in our vmem arena to the end of this address range.
    756 		 * We allow up to half of the address space to be used
    757 		 * for lib_va addresses but we do not prevent any allocations
    758 		 * in this range from other allocation paths.
    759 		 */
    760 		if (lib_va_64_arena == NULL && model == DATAMODEL_LP64) {
    761 			mutex_enter(&lib_va_init_mutex);
    762 			if (lib_va_64_arena == NULL) {
    763 				base = (caddr_t)align;
    764 				as_rangelock(as);
    765 				map_addr(&base, len, 0, 1, ma_flags);
    766 				as_rangeunlock(as);
    767 				if (base == NULL) {
    768 					mutex_exit(&lib_va_init_mutex);
    769 					MOBJ_STAT_ADD(lib_va_create_failure);
    770 					goto nolibva;
    771 				}
    772 				lib_va_end = (size_t)base + len;
    773 				lib_va_len = lib_va_end >> 1;
    774 				lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE);
    775 				lib_va_start = lib_va_end - lib_va_len;
    776 
    777 				/*
    778 				 * Need to make sure we avoid the address hole.
    779 				 * We know lib_va_end is valid but we need to
    780 				 * make sure lib_va_start is as well.
    781 				 */
    782 				if ((lib_va_end > (size_t)hole_end) &&
    783 				    (lib_va_start < (size_t)hole_end)) {
    784 					lib_va_start = P2ROUNDUP(
    785 					    (size_t)hole_end, PAGESIZE);
    786 					lib_va_len = lib_va_end - lib_va_start;
    787 				}
    788 				lib_va_64_arena = vmem_create("lib_va_64",
    789 				    (void *)lib_va_start, lib_va_len, PAGESIZE,
    790 				    NULL, NULL, NULL, 0,
    791 				    VM_NOSLEEP | VMC_IDENTIFIER);
    792 				if (lib_va_64_arena == NULL) {
    793 					mutex_exit(&lib_va_init_mutex);
    794 					goto nolibva;
    795 				}
    796 			}
    797 			model_vmem = lib_va_64_arena;
    798 			mutex_exit(&lib_va_init_mutex);
    799 		} else if (lib_va_32_arena == NULL &&
    800 		    model == DATAMODEL_ILP32) {
    801 			mutex_enter(&lib_va_init_mutex);
    802 			if (lib_va_32_arena == NULL) {
    803 				base = (caddr_t)align;
    804 				as_rangelock(as);
    805 				map_addr(&base, len, 0, 1, ma_flags);
    806 				as_rangeunlock(as);
    807 				if (base == NULL) {
    808 					mutex_exit(&lib_va_init_mutex);
    809 					MOBJ_STAT_ADD(lib_va_create_failure);
    810 					goto nolibva;
    811 				}
    812 				lib_va_end = (size_t)base + len;
    813 				lib_va_len = lib_va_end >> 1;
    814 				lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE);
    815 				lib_va_start = lib_va_end - lib_va_len;
    816 				lib_va_32_arena = vmem_create("lib_va_32",
    817 				    (void *)lib_va_start, lib_va_len, PAGESIZE,
    818 				    NULL, NULL, NULL, 0,
    819 				    VM_NOSLEEP | VMC_IDENTIFIER);
    820 				if (lib_va_32_arena == NULL) {
    821 					mutex_exit(&lib_va_init_mutex);
    822 					goto nolibva;
    823 				}
    824 			}
    825 			model_vmem = lib_va_32_arena;
    826 			mutex_exit(&lib_va_init_mutex);
    827 		}
    828 
    829 		if (model == DATAMODEL_LP64 || libs_mapped_32 < lib_threshold) {
    830 			base = vmem_xalloc(model_vmem, len, align, 0, 0, NULL,
    831 			    NULL, VM_NOSLEEP | VM_ENDALLOC);
    832 			MOBJ_STAT_ADD(alloc_vmem);
    833 		}
    834 
    835 		/*
    836 		 * Even if the address fails to fit in our address space,
    837 		 * or we can't use a reserved address,
    838 		 * we should still save it off in lib_va_hash.
    839 		 */
    840 		*lvpp = lib_va_add_hash(base, len, align, vap);
    841 
    842 		/*
    843 		 * Check for collision on insertion and free up our VA space.
    844 		 * This is expected to be rare, so we'll just reset base to
    845 		 * NULL instead of looking it up in the lib_va hash.
    846 		 */
    847 		if (*lvpp == NULL) {
    848 			if (base != NULL) {
    849 				vmem_xfree(model_vmem, base, len);
    850 				base = NULL;
    851 				MOBJ_STAT_ADD(add_collision);
    852 			}
    853 		}
    854 	}
    855 
    856 nolibva:
    857 	as_rangelock(as);
    858 
    859 	/*
    860 	 * If we don't have an expected base address, or the one that we want
    861 	 * to use is not available or acceptable, go get an acceptable
    862 	 * address range.
    863 	 */
    864 	if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) ||
    865 	    valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) !=
    866 	    RANGE_OKAY || OVERLAPS_STACK(base + len, p)) {
    867 		MOBJ_STAT_ADD(get_addr);
    868 		base = (caddr_t)align;
    869 		map_addr(&base, len, 0, 1, ma_flags);
    870 	}
    871 
    872 	/*
    873 	 * Need to reserve the address space we're going to use.
    874 	 * Don't reserve swap space since we'll be mapping over this.
    875 	 */
    876 	if (base != NULL) {
    877 		/* Don't reserve swap space since we'll be mapping over this */
    878 		crargs.flags |= MAP_NORESERVE;
    879 		error = as_map(as, base, len, segvn_create, &crargs);
    880 		if (error) {
    881 			base = NULL;
    882 		}
    883 	}
    884 
    885 	as_rangeunlock(as);
    886 	return (base);
    887 }
    888 
    889 /*
    890  * Map the file associated with vp into the address space as a single
    891  * read only private mapping.
    892  * Returns 0 for success, and non-zero for failure to map the file.
    893  */
    894 static int
    895 mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding,
    896     cred_t *fcred)
    897 {
    898 	int error = 0;
    899 	struct as *as = curproc->p_as;
    900 	caddr_t addr = NULL;
    901 	caddr_t start_addr;
    902 	size_t len;
    903 	size_t pad_len;
    904 	int prot = PROT_USER | PROT_READ;
    905 	uint_t ma_flags = _MAP_LOW32;
    906 	vattr_t vattr;
    907 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
    908 
    909 	if (get_udatamodel() == DATAMODEL_LP64) {
    910 		ma_flags = 0;
    911 	}
    912 
    913 	vattr.va_mask = AT_SIZE;
    914 	error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
    915 	if (error) {
    916 		return (error);
    917 	}
    918 
    919 	len = vattr.va_size;
    920 
    921 	ma_flags |= MAP_PRIVATE;
    922 	if (padding == 0) {
    923 		MOBJ_STAT_ADD(map_flat_no_padding);
    924 		error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL,
    925 		    ma_flags, fcred, NULL);
    926 		if (error == 0) {
    927 			mrp[0].mr_addr = addr;
    928 			mrp[0].mr_msize = len;
    929 			mrp[0].mr_fsize = len;
    930 			mrp[0].mr_offset = 0;
    931 			mrp[0].mr_prot = prot;
    932 			mrp[0].mr_flags = 0;
    933 		}
    934 		return (error);
    935 	}
    936 
    937 	/* padding was requested so there's more work to be done */
    938 	MOBJ_STAT_ADD(map_flat_padding);
    939 
    940 	/* No need to reserve swap space now since it will be reserved later */
    941 	crargs.flags |= MAP_NORESERVE;
    942 
    943 	/* Need to setup padding which can only be in PAGESIZE increments. */
    944 	ASSERT((padding & PAGEOFFSET) == 0);
    945 	pad_len = len + (2 * padding);
    946 
    947 	as_rangelock(as);
    948 	map_addr(&addr, pad_len, 0, 1, ma_flags);
    949 	error = as_map(as, addr, pad_len, segvn_create, &crargs);
    950 	as_rangeunlock(as);
    951 	if (error) {
    952 		return (error);
    953 	}
    954 	start_addr = addr;
    955 	addr += padding;
    956 	ma_flags |= MAP_FIXED;
    957 	error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, ma_flags,
    958 	    fcred, NULL);
    959 	if (error == 0) {
    960 		mrp[0].mr_addr = start_addr;
    961 		mrp[0].mr_msize = padding;
    962 		mrp[0].mr_fsize = 0;
    963 		mrp[0].mr_offset = 0;
    964 		mrp[0].mr_prot = 0;
    965 		mrp[0].mr_flags = MR_PADDING;
    966 
    967 		mrp[1].mr_addr = addr;
    968 		mrp[1].mr_msize = len;
    969 		mrp[1].mr_fsize = len;
    970 		mrp[1].mr_offset = 0;
    971 		mrp[1].mr_prot = prot;
    972 		mrp[1].mr_flags = 0;
    973 
    974 		mrp[2].mr_addr = addr + P2ROUNDUP(len, PAGESIZE);
    975 		mrp[2].mr_msize = padding;
    976 		mrp[2].mr_fsize = 0;
    977 		mrp[2].mr_offset = 0;
    978 		mrp[2].mr_prot = 0;
    979 		mrp[2].mr_flags = MR_PADDING;
    980 	} else {
    981 		/* Need to cleanup the as_map from earlier */
    982 		(void) as_unmap(as, start_addr, pad_len);
    983 	}
    984 	return (error);
    985 }
    986 
    987 /*
    988  * Map a PT_LOAD or PT_SUNWBSS section of an executable file into the user's
    989  * address space.
    990  * vp - vnode to be mapped in
    991  * addr - start address
    992  * len - length of vp to be mapped
    993  * zfodlen - length of zero filled memory after len above
    994  * offset - offset into file where mapping should start
    995  * prot - protections for this mapping
    996  * fcred - credentials for the file associated with vp at open time.
    997  */
    998 static int
    999 mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
   1000     off_t offset, int prot, cred_t *fcred)
   1001 {
   1002 	int error = 0;
   1003 	caddr_t zfodbase, oldaddr;
   1004 	size_t oldlen;
   1005 	size_t end;
   1006 	size_t zfoddiff;
   1007 	label_t ljb;
   1008 	struct as *as = curproc->p_as;
   1009 	model_t model;
   1010 	int full_page;
   1011 
   1012 	/*
   1013 	 * See if addr and offset are aligned such that we can map in
   1014 	 * full pages instead of partial pages.
   1015 	 */
   1016 	full_page = (((uintptr_t)addr & PAGEOFFSET) ==
   1017 	    ((uintptr_t)offset & PAGEOFFSET));
   1018 
   1019 	model = get_udatamodel();
   1020 
   1021 	oldaddr = addr;
   1022 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
   1023 	if (len) {
   1024 		spgcnt_t availm, npages;
   1025 		int preread;
   1026 		uint_t mflag = MAP_PRIVATE | MAP_FIXED;
   1027 
   1028 		if (model == DATAMODEL_ILP32) {
   1029 			mflag |= _MAP_LOW32;
   1030 		}
   1031 		/* We may need to map in extra bytes */
   1032 		oldlen = len;
   1033 		len += ((size_t)oldaddr & PAGEOFFSET);
   1034 
   1035 		if (full_page) {
   1036 			offset = (off_t)((uintptr_t)offset & PAGEMASK);
   1037 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
   1038 				mflag |= MAP_TEXT;
   1039 				MOBJ_STAT_ADD(map_ptload_text);
   1040 			} else {
   1041 				mflag |= MAP_INITDATA;
   1042 				MOBJ_STAT_ADD(map_ptload_initdata);
   1043 			}
   1044 
   1045 			/*
   1046 			 * maxprot is passed as PROT_ALL so that mdb can
   1047 			 * write to this segment.
   1048 			 */
   1049 			if (error = VOP_MAP(vp, (offset_t)offset, as, &addr,
   1050 			    len, prot, PROT_ALL, mflag, fcred, NULL)) {
   1051 				return (error);
   1052 			}
   1053 
   1054 			/*
   1055 			 * If the segment can fit and is relatively small, then
   1056 			 * we prefault the entire segment in.  This is based
   1057 			 * on the model that says the best working set of a
   1058 			 * small program is all of its pages.
   1059 			 * We only do this if freemem will not drop below
   1060 			 * lotsfree since we don't want to induce paging.
   1061 			 */
   1062 			npages = (spgcnt_t)btopr(len);
   1063 			availm = freemem - lotsfree;
   1064 			preread = (npages < availm && len < PGTHRESH) ? 1 : 0;
   1065 
   1066 			/*
   1067 			 * If we aren't prefaulting the segment,
   1068 			 * increment "deficit", if necessary to ensure
   1069 			 * that pages will become available when this
   1070 			 * process starts executing.
   1071 			 */
   1072 			if (preread == 0 && npages > availm &&
   1073 			    deficit < lotsfree) {
   1074 				deficit += MIN((pgcnt_t)(npages - availm),
   1075 				    lotsfree - deficit);
   1076 			}
   1077 
   1078 			if (preread) {
   1079 				(void) as_faulta(as, addr, len);
   1080 				MOBJ_STAT_ADD(map_ptload_preread);
   1081 			}
   1082 		} else {
   1083 			/*
   1084 			 * addr and offset were not aligned such that we could
   1085 			 * use VOP_MAP, thus we need to as_map the memory we
   1086 			 * need and then read the data in from disk.
   1087 			 * This code path is a corner case which should never
   1088 			 * be taken, but hand crafted binaries could trigger
   1089 			 * this logic and it needs to work correctly.
   1090 			 */
   1091 			MOBJ_STAT_ADD(map_ptload_unaligned_text);
   1092 			as_rangelock(as);
   1093 			(void) as_unmap(as, addr, len);
   1094 
   1095 			/*
   1096 			 * We use zfod_argsp because we need to be able to
   1097 			 * write to the mapping and then we'll change the
   1098 			 * protections later if they are incorrect.
   1099 			 */
   1100 			error = as_map(as, addr, len, segvn_create, zfod_argsp);
   1101 			as_rangeunlock(as);
   1102 			if (error) {
   1103 				MOBJ_STAT_ADD(map_ptload_unaligned_map_fail);
   1104 				return (error);
   1105 			}
   1106 
   1107 			/* Now read in the data from disk */
   1108 			error = vn_rdwr(UIO_READ, vp, oldaddr, oldlen, offset,
   1109 			    UIO_USERSPACE, 0, (rlim64_t)0, fcred, NULL);
   1110 			if (error) {
   1111 				MOBJ_STAT_ADD(map_ptload_unaligned_read_fail);
   1112 				return (error);
   1113 			}
   1114 
   1115 			/*
   1116 			 * Now set protections.
   1117 			 */
   1118 			if (prot != PROT_ZFOD) {
   1119 				(void) as_setprot(as, addr, len, prot);
   1120 			}
   1121 		}
   1122 	}
   1123 
   1124 	if (zfodlen) {
   1125 		end = (size_t)addr + len;
   1126 		zfodbase = (caddr_t)P2ROUNDUP(end, PAGESIZE);
   1127 		zfoddiff = (uintptr_t)zfodbase - end;
   1128 		if (zfoddiff) {
   1129 			MOBJ_STAT_ADD(zfoddiff);
   1130 			if ((prot & PROT_WRITE) == 0) {
   1131 				(void) as_setprot(as, (caddr_t)end,
   1132 				    zfoddiff, prot | PROT_WRITE);
   1133 				MOBJ_STAT_ADD(zfoddiff_nowrite);
   1134 			}
   1135 			if (on_fault(&ljb)) {
   1136 				no_fault();
   1137 				if ((prot & PROT_WRITE) == 0) {
   1138 					(void) as_setprot(as, (caddr_t)end,
   1139 					    zfoddiff, prot);
   1140 				}
   1141 				return (EFAULT);
   1142 			}
   1143 			uzero((void *)end, zfoddiff);
   1144 			no_fault();
   1145 
   1146 			/*
   1147 			 * Remove write protection to return to original state
   1148 			 */
   1149 			if ((prot & PROT_WRITE) == 0) {
   1150 				(void) as_setprot(as, (caddr_t)end,
   1151 				    zfoddiff, prot);
   1152 			}
   1153 		}
   1154 		if (zfodlen > zfoddiff) {
   1155 			struct segvn_crargs crargs =
   1156 			    SEGVN_ZFOD_ARGS(prot, PROT_ALL);
   1157 
   1158 			MOBJ_STAT_ADD(zfodextra);
   1159 			zfodlen -= zfoddiff;
   1160 			crargs.szc = AS_MAP_NO_LPOOB;
   1161 
   1162 
   1163 			as_rangelock(as);
   1164 			(void) as_unmap(as, (caddr_t)zfodbase, zfodlen);
   1165 			error = as_map(as, (caddr_t)zfodbase,
   1166 			    zfodlen, segvn_create, &crargs);
   1167 			as_rangeunlock(as);
   1168 			if (error) {
   1169 				return (error);
   1170 			}
   1171 		}
   1172 	}
   1173 	return (0);
   1174 }
   1175 
   1176 /*
   1177  * Map the ELF file represented by vp into the users address space.  The
   1178  * first mapping will start at start_addr and there will be num_elements
   1179  * mappings.  The mappings are described by the data in mrp which may be
   1180  * modified upon returning from this function.
   1181  * Returns 0 for success or errno for failure.
   1182  */
   1183 static int
   1184 mmapobj_map_elf(struct vnode *vp, caddr_t start_addr, mmapobj_result_t *mrp,
   1185     int num_elements, cred_t *fcred, ushort_t e_type)
   1186 {
   1187 	int i;
   1188 	int ret;
   1189 	caddr_t lo;
   1190 	caddr_t hi;
   1191 	struct as *as = curproc->p_as;
   1192 
   1193 	for (i = 0; i < num_elements; i++) {
   1194 		caddr_t addr;
   1195 		size_t p_memsz;
   1196 		size_t p_filesz;
   1197 		size_t zfodlen;
   1198 		offset_t p_offset;
   1199 		size_t dif;
   1200 		int prot;
   1201 
   1202 		/* Always need to adjust mr_addr */
   1203 		addr = start_addr + (size_t)(mrp[i].mr_addr);
   1204 		mrp[i].mr_addr =
   1205 		    (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
   1206 
   1207 		/* Padding has already been mapped */
   1208 		if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) {
   1209 			continue;
   1210 		}
   1211 		p_memsz = mrp[i].mr_msize;
   1212 		p_filesz = mrp[i].mr_fsize;
   1213 		zfodlen = p_memsz - p_filesz;
   1214 		p_offset = mrp[i].mr_offset;
   1215 		dif = (uintptr_t)(addr) & PAGEOFFSET;
   1216 		prot = mrp[i].mr_prot | PROT_USER;
   1217 		ret = mmapobj_map_ptload(vp, addr, p_filesz, zfodlen,
   1218 		    p_offset, prot, fcred);
   1219 		if (ret != 0) {
   1220 			MOBJ_STAT_ADD(ptload_failed);
   1221 			mmapobj_unmap(mrp, i, num_elements, e_type);
   1222 			return (ret);
   1223 		}
   1224 
   1225 		/* Need to cleanup mrp to reflect the actual values used */
   1226 		mrp[i].mr_msize += dif;
   1227 		mrp[i].mr_offset = (size_t)addr & PAGEOFFSET;
   1228 	}
   1229 
   1230 	/* Also need to unmap any holes created above */
   1231 	if (num_elements == 1) {
   1232 		MOBJ_STAT_ADD(map_elf_no_holes);
   1233 		return (0);
   1234 	}
   1235 	if (e_type == ET_EXEC) {
   1236 		return (0);
   1237 	}
   1238 
   1239 	as_rangelock(as);
   1240 	lo = start_addr;
   1241 	hi = mrp[0].mr_addr;
   1242 
   1243 	/* Remove holes made by the rest of the segments */
   1244 	for (i = 0; i < num_elements - 1; i++) {
   1245 		lo = (caddr_t)P2ROUNDUP((size_t)(mrp[i].mr_addr) +
   1246 		    mrp[i].mr_msize, PAGESIZE);
   1247 		hi = mrp[i + 1].mr_addr;
   1248 		if (lo < hi) {
   1249 			/*
   1250 			 * If as_unmap fails we just use up a bit of extra
   1251 			 * space
   1252 			 */
   1253 			(void) as_unmap(as, (caddr_t)lo,
   1254 			    (size_t)hi - (size_t)lo);
   1255 			MOBJ_STAT_ADD(unmap_hole);
   1256 		}
   1257 	}
   1258 	as_rangeunlock(as);
   1259 
   1260 	return (0);
   1261 }
   1262 
   1263 /* Ugly hack to get STRUCT_* macros to work below */
   1264 struct myphdr {
   1265 	Phdr		x;	/* native version */
   1266 };
   1267 
   1268 struct myphdr32 {
   1269 	Elf32_Phdr	x;
   1270 };
   1271 
   1272 /*
   1273  * Calculate and return the number of loadable segments in the ELF Phdr
   1274  * represented by phdrbase as well as the len of the total mapping and
   1275  * the max alignment that is needed for a given segment.  On success,
   1276  * 0 is returned, and *len, *loadable and *align have been filled out.
   1277  * On failure, errno will be returned, which in this case is ENOTSUP
   1278  * if we were passed an ELF file with overlapping segments.
   1279  */
   1280 static int
   1281 calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
   1282     int *loadable, size_t *align)
   1283 {
   1284 	int i;
   1285 	int hsize;
   1286 	model_t model;
   1287 	ushort_t e_type = ehdrp->e_type;	/* same offset 32 and 64 bit */
   1288 	uint_t p_type;
   1289 	offset_t p_offset;
   1290 	size_t p_memsz;
   1291 	size_t p_align;
   1292 	caddr_t vaddr;
   1293 	int num_segs = 0;
   1294 	caddr_t start_addr = NULL;
   1295 	caddr_t p_end = NULL;
   1296 	size_t max_align = 0;
   1297 	size_t min_align = PAGESIZE;	/* needed for vmem_xalloc */
   1298 	STRUCT_HANDLE(myphdr, mph);
   1299 #if defined(__sparc)
   1300 	extern int vac_size;
   1301 
   1302 	/*
   1303 	 * Want to prevent aliasing by making the start address at least be
   1304 	 * aligned to vac_size.
   1305 	 */
   1306 	min_align = MAX(PAGESIZE, vac_size);
   1307 #endif
   1308 
   1309 	model = get_udatamodel();
   1310 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
   1311 
   1312 	/* hsize alignment should have been checked before calling this func */
   1313 	if (model == DATAMODEL_LP64) {
   1314 		hsize = ehdrp->e_phentsize;
   1315 		if (hsize & 7) {
   1316 			return (ENOTSUP);
   1317 		}
   1318 	} else {
   1319 		ASSERT(model == DATAMODEL_ILP32);
   1320 		hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize;
   1321 		if (hsize & 3) {
   1322 			return (ENOTSUP);
   1323 		}
   1324 	}
   1325 
   1326 	/*
   1327 	 * Determine the span of all loadable segments and calculate the
   1328 	 * number of loadable segments.
   1329 	 */
   1330 	for (i = 0; i < nphdrs; i++) {
   1331 		p_type = STRUCT_FGET(mph, x.p_type);
   1332 		if (p_type == PT_LOAD || p_type == PT_SUNWBSS) {
   1333 			vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr);
   1334 			p_memsz = STRUCT_FGET(mph, x.p_memsz);
   1335 
   1336 			/*
   1337 			 * Skip this header if it requests no memory to be
   1338 			 * mapped.
   1339 			 */
   1340 			if (p_memsz == 0) {
   1341 				STRUCT_SET_HANDLE(mph, model,
   1342 				    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
   1343 				    hsize));
   1344 				MOBJ_STAT_ADD(nomem_header);
   1345 				continue;
   1346 			}
   1347 			if (num_segs++ == 0) {
   1348 				/*
   1349 				 * The p_vaddr of the first PT_LOAD segment
   1350 				 * must either be NULL or within the first
   1351 				 * page in order to be interpreted.
   1352 				 * Otherwise, its an invalid file.
   1353 				 */
   1354 				if (e_type == ET_DYN &&
   1355 				    ((caddr_t)((uintptr_t)vaddr &
   1356 				    (uintptr_t)PAGEMASK) != NULL)) {
   1357 					MOBJ_STAT_ADD(inval_header);
   1358 					return (ENOTSUP);
   1359 				}
   1360 				start_addr = vaddr;
   1361 				/*
   1362 				 * For the first segment, we need to map from
   1363 				 * the beginning of the file, so we will
   1364 				 * adjust the size of the mapping to include
   1365 				 * this memory.
   1366 				 */
   1367 				p_offset = STRUCT_FGET(mph, x.p_offset);
   1368 			} else {
   1369 				p_offset = 0;
   1370 			}
   1371 			/*
   1372 			 * Check to make sure that this mapping wouldn't
   1373 			 * overlap a previous mapping.
   1374 			 */
   1375 			if (vaddr < p_end) {
   1376 				MOBJ_STAT_ADD(overlap_header);
   1377 				return (ENOTSUP);
   1378 			}
   1379 
   1380 			p_end = vaddr + p_memsz + p_offset;
   1381 			p_end = (caddr_t)P2ROUNDUP((size_t)p_end, PAGESIZE);
   1382 
   1383 			p_align = STRUCT_FGET(mph, x.p_align);
   1384 			if (p_align > 1 && p_align > max_align) {
   1385 				max_align = p_align;
   1386 				if (max_align < min_align) {
   1387 					max_align = min_align;
   1388 					MOBJ_STAT_ADD(min_align);
   1389 				}
   1390 			}
   1391 		}
   1392 		STRUCT_SET_HANDLE(mph, model,
   1393 		    (struct myphdr *)((size_t)STRUCT_BUF(mph) + hsize));
   1394 	}
   1395 
   1396 	/*
   1397 	 * The alignment should be a power of 2, if it isn't we forgive it
   1398 	 * and round up.  On overflow, we'll set the alignment to max_align
   1399 	 * rounded down to the nearest power of 2.
   1400 	 */
   1401 	if (max_align > 0 && !ISP2(max_align)) {
   1402 		MOBJ_STAT_ADD(np2_align);
   1403 		*align = 2 * (1L << (highbit(max_align) - 1));
   1404 		if (*align < max_align ||
   1405 		    (*align > UINT_MAX && model == DATAMODEL_ILP32)) {
   1406 			MOBJ_STAT_ADD(np2_align_overflow);
   1407 			*align = 1L << (highbit(max_align) - 1);
   1408 		}
   1409 	} else {
   1410 		*align = max_align;
   1411 	}
   1412 
   1413 	ASSERT(*align >= PAGESIZE || *align == 0);
   1414 
   1415 	*loadable = num_segs;
   1416 	*len = p_end - start_addr;
   1417 	return (0);
   1418 }
   1419 
   1420 /*
   1421  * Check the address space to see if the virtual addresses to be used are
   1422  * available.  If they are not, return errno for failure.  On success, 0
   1423  * will be returned, and the virtual addresses for each mmapobj_result_t
   1424  * will be reserved.  Note that a reservation could have earlier been made
   1425  * for a given segment via a /dev/null mapping.  If that is the case, then
   1426  * we can use that VA space for our mappings.
   1427  * Note: this function will only be used for ET_EXEC binaries.
   1428  */
   1429 int
   1430 check_exec_addrs(int loadable, mmapobj_result_t *mrp, caddr_t start_addr)
   1431 {
   1432 	int i;
   1433 	struct as *as = curproc->p_as;
   1434 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
   1435 	int ret;
   1436 	caddr_t myaddr;
   1437 	size_t mylen;
   1438 	struct seg *seg;
   1439 
   1440 	/* No need to reserve swap space now since it will be reserved later */
   1441 	crargs.flags |= MAP_NORESERVE;
   1442 	as_rangelock(as);
   1443 	for (i = 0; i < loadable; i++) {
   1444 
   1445 		myaddr = start_addr + (size_t)mrp[i].mr_addr;
   1446 		mylen = mrp[i].mr_msize;
   1447 
   1448 		/* See if there is a hole in the as for this range */
   1449 		if (as_gap(as, mylen, &myaddr, &mylen, 0, NULL) == 0) {
   1450 			ASSERT(myaddr == start_addr + (size_t)mrp[i].mr_addr);
   1451 			ASSERT(mylen == mrp[i].mr_msize);
   1452 
   1453 #ifdef DEBUG
   1454 			if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) {
   1455 				MOBJ_STAT_ADD(exec_padding);
   1456 			}
   1457 #endif
   1458 			ret = as_map(as, myaddr, mylen, segvn_create, &crargs);
   1459 			if (ret) {
   1460 				as_rangeunlock(as);
   1461 				mmapobj_unmap_exec(mrp, i, start_addr);
   1462 				return (ret);
   1463 			}
   1464 		} else {
   1465 			/*
   1466 			 * There is a mapping that exists in the range
   1467 			 * so check to see if it was a "reservation"
   1468 			 * from /dev/null.  The mapping is from
   1469 			 * /dev/null if the mapping comes from
   1470 			 * segdev and the type is neither MAP_SHARED
   1471 			 * nor MAP_PRIVATE.
   1472 			 */
   1473 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
   1474 			seg = as_findseg(as, myaddr, 0);
   1475 			MOBJ_STAT_ADD(exec_addr_mapped);
   1476 			if (seg && seg->s_ops == &segdev_ops &&
   1477 			    ((SEGOP_GETTYPE(seg, myaddr) &
   1478 			    (MAP_SHARED | MAP_PRIVATE)) == 0) &&
   1479 			    myaddr >= seg->s_base &&
   1480 			    myaddr + mylen <=
   1481 			    seg->s_base + seg->s_size) {
   1482 				MOBJ_STAT_ADD(exec_addr_devnull);
   1483 				AS_LOCK_EXIT(as, &as->a_lock);
   1484 				(void) as_unmap(as, myaddr, mylen);
   1485 				ret = as_map(as, myaddr, mylen, segvn_create,
   1486 				    &crargs);
   1487 				mrp[i].mr_flags |= MR_RESV;
   1488 				if (ret) {
   1489 					as_rangeunlock(as);
   1490 					/* Need to remap what we unmapped */
   1491 					mmapobj_unmap_exec(mrp, i + 1,
   1492 					    start_addr);
   1493 					return (ret);
   1494 				}
   1495 			} else {
   1496 				AS_LOCK_EXIT(as, &as->a_lock);
   1497 				as_rangeunlock(as);
   1498 				mmapobj_unmap_exec(mrp, i, start_addr);
   1499 				MOBJ_STAT_ADD(exec_addr_in_use);
   1500 				return (EADDRINUSE);
   1501 			}
   1502 		}
   1503 	}
   1504 	as_rangeunlock(as);
   1505 	return (0);
   1506 }
   1507 
   1508 /*
   1509  * Walk through the ELF program headers and extract all useful information
   1510  * for PT_LOAD and PT_SUNWBSS segments into mrp.
   1511  * Return 0 on success or error on failure.
   1512  */
   1513 static int
   1514 process_phdr(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, mmapobj_result_t *mrp,
   1515     vnode_t *vp, uint_t *num_mapped, size_t padding, cred_t *fcred)
   1516 {
   1517 	int i;
   1518 	caddr_t start_addr = NULL;
   1519 	caddr_t vaddr;
   1520 	size_t len = 0;
   1521 	size_t lib_len = 0;
   1522 	int ret;
   1523 	int prot;
   1524 	struct lib_va *lvp = NULL;
   1525 	vattr_t vattr;
   1526 	struct as *as = curproc->p_as;
   1527 	int error;
   1528 	int loadable = 0;
   1529 	int current = 0;
   1530 	int use_lib_va = 1;
   1531 	size_t align = 0;
   1532 	size_t add_pad = 0;
   1533 	int hdr_seen = 0;
   1534 	ushort_t e_type = ehdrp->e_type;	/* same offset 32 and 64 bit */
   1535 	uint_t p_type;
   1536 	offset_t p_offset;
   1537 	size_t p_memsz;
   1538 	size_t p_filesz;
   1539 	uint_t p_flags;
   1540 	int hsize;
   1541 	model_t model;
   1542 	STRUCT_HANDLE(myphdr, mph);
   1543 
   1544 	model = get_udatamodel();
   1545 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
   1546 
   1547 	/*
   1548 	 * Need to make sure that hsize is aligned properly.
   1549 	 * For 32bit processes, 4 byte alignment is required.
   1550 	 * For 64bit processes, 8 byte alignment is required.
   1551 	 * If the alignment isn't correct, we need to return failure
   1552 	 * since it could cause an alignment error panic while walking
   1553 	 * the phdr array.
   1554 	 */
   1555 	if (model == DATAMODEL_LP64) {
   1556 		hsize = ehdrp->e_phentsize;
   1557 		if (hsize & 7) {
   1558 			MOBJ_STAT_ADD(phent_align64);
   1559 			return (ENOTSUP);
   1560 		}
   1561 	} else {
   1562 		ASSERT(model == DATAMODEL_ILP32);
   1563 		hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize;
   1564 		if (hsize & 3) {
   1565 			MOBJ_STAT_ADD(phent_align32);
   1566 			return (ENOTSUP);
   1567 		}
   1568 	}
   1569 
   1570 	if (padding != 0) {
   1571 		use_lib_va = 0;
   1572 	}
   1573 	if (e_type == ET_DYN) {
   1574 		vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME;
   1575 		error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
   1576 		if (error) {
   1577 			return (error);
   1578 		}
   1579 		/* Check to see if we already have a description for this lib */
   1580 		lvp = lib_va_find(&vattr);
   1581 
   1582 		if (lvp != NULL) {
   1583 			MOBJ_STAT_ADD(lvp_found);
   1584 			if (use_lib_va) {
   1585 				start_addr = mmapobj_lookup_start_addr(lvp);
   1586 				if (start_addr == NULL) {
   1587 					lib_va_release(lvp);
   1588 					return (ENOMEM);
   1589 				}
   1590 			}
   1591 
   1592 			/*
   1593 			 * loadable may be zero if the original allocator
   1594 			 * of lvp hasn't finished setting it up but the rest
   1595 			 * of the fields will be accurate.
   1596 			 */
   1597 			loadable = lvp->lv_num_segs;
   1598 			len = lvp->lv_len;
   1599 			align = lvp->lv_align;
   1600 		}
   1601 	}
   1602 
   1603 	/*
   1604 	 * Determine the span of all loadable segments and calculate the
   1605 	 * number of loadable segments, the total len spanned by the mappings
   1606 	 * and the max alignment, if we didn't get them above.
   1607 	 */
   1608 	if (loadable == 0) {
   1609 		MOBJ_STAT_ADD(no_loadable_yet);
   1610 		ret = calc_loadable(ehdrp, phdrbase, nphdrs, &len,
   1611 		    &loadable, &align);
   1612 		if (ret != 0) {
   1613 			/*
   1614 			 * Since it'd be an invalid file, we shouldn't have
   1615 			 * cached it previously.
   1616 			 */
   1617 			ASSERT(lvp == NULL);
   1618 			return (ret);
   1619 		}
   1620 #ifdef DEBUG
   1621 		if (lvp) {
   1622 			ASSERT(len == lvp->lv_len);
   1623 			ASSERT(align == lvp->lv_align);
   1624 		}
   1625 #endif
   1626 	}
   1627 
   1628 	/* Make sure there's something to map. */
   1629 	if (len == 0 || loadable == 0) {
   1630 		/*
   1631 		 * Since it'd be an invalid file, we shouldn't have
   1632 		 * cached it previously.
   1633 		 */
   1634 		ASSERT(lvp == NULL);
   1635 		MOBJ_STAT_ADD(nothing_to_map);
   1636 		return (ENOTSUP);
   1637 	}
   1638 
   1639 	lib_len = len;
   1640 	if (padding != 0) {
   1641 		loadable += 2;
   1642 	}
   1643 	if (loadable > *num_mapped) {
   1644 		*num_mapped = loadable;
   1645 		/* cleanup previous reservation */
   1646 		if (start_addr) {
   1647 			(void) as_unmap(as, start_addr, lib_len);
   1648 		}
   1649 		MOBJ_STAT_ADD(e2big);
   1650 		if (lvp) {
   1651 			lib_va_release(lvp);
   1652 		}
   1653 		return (E2BIG);
   1654 	}
   1655 
   1656 	/*
   1657 	 * We now know the size of the object to map and now we need to
   1658 	 * get the start address to map it at.  It's possible we already
   1659 	 * have it if we found all the info we need in the lib_va cache.
   1660 	 */
   1661 	if (e_type == ET_DYN && start_addr == NULL) {
   1662 		/*
   1663 		 * Need to make sure padding does not throw off
   1664 		 * required alignment.  We can only specify an
   1665 		 * alignment for the starting address to be mapped,
   1666 		 * so we round padding up to the alignment and map
   1667 		 * from there and then throw out the extra later.
   1668 		 */
   1669 		if (padding != 0) {
   1670 			if (align > 1) {
   1671 				add_pad = P2ROUNDUP(padding, align);
   1672 				len += add_pad;
   1673 				MOBJ_STAT_ADD(dyn_pad_align);
   1674 			} else {
   1675 				MOBJ_STAT_ADD(dyn_pad_noalign);
   1676 				len += padding;	/* at beginning */
   1677 			}
   1678 			len += padding;	/* at end of mapping */
   1679 		}
   1680 		/*
   1681 		 * At this point, if lvp is non-NULL, then above we
   1682 		 * already found it in the cache but did not get
   1683 		 * the start address since we were not going to use lib_va.
   1684 		 * Since we know that lib_va will not be used, it's safe
   1685 		 * to call mmapobj_alloc_start_addr and know that lvp
   1686 		 * will not be modified.
   1687 		 */
   1688 		ASSERT(lvp ? use_lib_va == 0 : 1);
   1689 		start_addr = mmapobj_alloc_start_addr(&lvp, len,
   1690 		    use_lib_va, align, &vattr);
   1691 		if (start_addr == NULL) {
   1692 			if (lvp) {
   1693 				lib_va_release(lvp);
   1694 			}
   1695 			MOBJ_STAT_ADD(alloc_start_fail);
   1696 			return (ENOMEM);
   1697 		}
   1698 		/*
   1699 		 * If we can't cache it, no need to hang on to it.
   1700 		 * Setting lv_num_segs to non-zero will make that
   1701 		 * field active and since there are too many segments
   1702 		 * to cache, all future users will not try to use lv_mps.
   1703 		 */
   1704 		if (lvp != NULL && loadable > LIBVA_CACHED_SEGS && use_lib_va) {
   1705 			lvp->lv_num_segs = loadable;
   1706 			lib_va_release(lvp);
   1707 			lvp = NULL;
   1708 			MOBJ_STAT_ADD(lvp_nocache);
   1709 		}
   1710 		/*
   1711 		 * Free the beginning of the mapping if the padding
   1712 		 * was not aligned correctly.
   1713 		 */
   1714 		if (padding != 0 && add_pad != padding) {
   1715 			(void) as_unmap(as, start_addr,
   1716 			    add_pad - padding);
   1717 			start_addr += (add_pad - padding);
   1718 			MOBJ_STAT_ADD(extra_padding);
   1719 		}
   1720 	}
   1721 
   1722 	/*
   1723 	 * At this point, we have reserved the virtual address space
   1724 	 * for our mappings.  Now we need to start filling out the mrp
   1725 	 * array to describe all of the individual mappings we are going
   1726 	 * to return.
   1727 	 * For ET_EXEC there has been no memory reservation since we are
   1728 	 * using fixed addresses.  While filling in the mrp array below,
   1729 	 * we will have the first segment biased to start at addr 0
   1730 	 * and the rest will be biased by this same amount.  Thus if there
   1731 	 * is padding, the first padding will start at addr 0, and the next
   1732 	 * segment will start at the value of padding.
   1733 	 */
   1734 
   1735 	/* We'll fill out padding later, so start filling in mrp at index 1 */
   1736 	if (padding != 0) {
   1737 		current = 1;
   1738 	}
   1739 
   1740 	/* If we have no more need for lvp let it go now */
   1741 	if (lvp != NULL && use_lib_va == 0) {
   1742 		lib_va_release(lvp);
   1743 		MOBJ_STAT_ADD(lvp_not_needed);
   1744 		lvp = NULL;
   1745 	}
   1746 
   1747 	/* Now fill out the mrp structs from the program headers */
   1748 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
   1749 	for (i = 0; i < nphdrs; i++) {
   1750 		p_type = STRUCT_FGET(mph, x.p_type);
   1751 		if (p_type == PT_LOAD || p_type == PT_SUNWBSS) {
   1752 			vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr);
   1753 			p_memsz = STRUCT_FGET(mph, x.p_memsz);
   1754 			p_filesz = STRUCT_FGET(mph, x.p_filesz);
   1755 			p_offset = STRUCT_FGET(mph, x.p_offset);
   1756 			p_flags = STRUCT_FGET(mph, x.p_flags);
   1757 
   1758 			/*
   1759 			 * Skip this header if it requests no memory to be
   1760 			 * mapped.
   1761 			 */
   1762 			if (p_memsz == 0) {
   1763 				STRUCT_SET_HANDLE(mph, model,
   1764 				    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
   1765 				    hsize));
   1766 				MOBJ_STAT_ADD(no_mem_map_sz);
   1767 				continue;
   1768 			}
   1769 
   1770 			prot = 0;
   1771 			if (p_flags & PF_R)
   1772 				prot |= PROT_READ;
   1773 			if (p_flags & PF_W)
   1774 				prot |= PROT_WRITE;
   1775 			if (p_flags & PF_X)
   1776 				prot |= PROT_EXEC;
   1777 
   1778 			ASSERT(current < loadable);
   1779 			mrp[current].mr_msize = p_memsz;
   1780 			mrp[current].mr_fsize = p_filesz;
   1781 			mrp[current].mr_offset = p_offset;
   1782 			mrp[current].mr_prot = prot;
   1783 
   1784 			if (hdr_seen == 0 && p_filesz != 0) {
   1785 				mrp[current].mr_flags = MR_HDR_ELF;
   1786 				/*
   1787 				 * We modify mr_offset because we
   1788 				 * need to map the ELF header as well, and if
   1789 				 * we didn't then the header could be left out
   1790 				 * of the mapping that we will create later.
   1791 				 * Since we're removing the offset, we need to
   1792 				 * account for that in the other fields as well
   1793 				 * since we will be mapping the memory from 0
   1794 				 * to p_offset.
   1795 				 */
   1796 				if (e_type == ET_DYN) {
   1797 					mrp[current].mr_offset = 0;
   1798 					mrp[current].mr_msize += p_offset;
   1799 					mrp[current].mr_fsize += p_offset;
   1800 				} else {
   1801 					ASSERT(e_type == ET_EXEC);
   1802 					/*
   1803 					 * Save off the start addr which will be
   1804 					 * our bias for the rest of the
   1805 					 * ET_EXEC mappings.
   1806 					 */
   1807 					start_addr = vaddr - padding;
   1808 				}
   1809 				mrp[current].mr_addr = (caddr_t)padding;
   1810 				hdr_seen = 1;
   1811 			} else {
   1812 				if (e_type == ET_EXEC) {
   1813 					/* bias mr_addr */
   1814 					mrp[current].mr_addr =
   1815 					    vaddr - (size_t)start_addr;
   1816 				} else {
   1817 					mrp[current].mr_addr = vaddr + padding;
   1818 				}
   1819 				mrp[current].mr_flags = 0;
   1820 			}
   1821 			current++;
   1822 		}
   1823 
   1824 		/* Move to next phdr */
   1825 		STRUCT_SET_HANDLE(mph, model,
   1826 		    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
   1827 		    hsize));
   1828 	}
   1829 
   1830 	/* Now fill out the padding segments */
   1831 	if (padding != 0) {
   1832 		mrp[0].mr_addr = NULL;
   1833 		mrp[0].mr_msize = padding;
   1834 		mrp[0].mr_fsize = 0;
   1835 		mrp[0].mr_offset = 0;
   1836 		mrp[0].mr_prot = 0;
   1837 		mrp[0].mr_flags = MR_PADDING;
   1838 
   1839 		/* Setup padding for the last segment */
   1840 		ASSERT(current == loadable - 1);
   1841 		mrp[current].mr_addr = (caddr_t)lib_len + padding;
   1842 		mrp[current].mr_msize = padding;
   1843 		mrp[current].mr_fsize = 0;
   1844 		mrp[current].mr_offset = 0;
   1845 		mrp[current].mr_prot = 0;
   1846 		mrp[current].mr_flags = MR_PADDING;
   1847 	}
   1848 
   1849 	/*
   1850 	 * Need to make sure address ranges desired are not in use or
   1851 	 * are previously allocated reservations from /dev/null.  For
   1852 	 * ET_DYN, we already made sure our address range was free.
   1853 	 */
   1854 	if (e_type == ET_EXEC) {
   1855 		ret = check_exec_addrs(loadable, mrp, start_addr);
   1856 		if (ret != 0) {
   1857 			ASSERT(lvp == NULL);
   1858 			MOBJ_STAT_ADD(check_exec_failed);
   1859 			return (ret);
   1860 		}
   1861 	}
   1862 
   1863 	/* Finish up our business with lvp. */
   1864 	if (lvp) {
   1865 		ASSERT(e_type == ET_DYN);
   1866 		if (lvp->lv_num_segs == 0 && loadable <= LIBVA_CACHED_SEGS) {
   1867 			bcopy(mrp, lvp->lv_mps,
   1868 			    loadable * sizeof (mmapobj_result_t));
   1869 			membar_producer();
   1870 		}
		/*
		 * Setting lv_num_segs to a non-zero value indicates that
		 * lv_mps is now valid and can be used by other threads.
		 * So, the above stores need to finish before lv_num_segs
		 * is updated. lv_mps is only valid if lv_num_segs is
		 * non-zero and no greater than LIBVA_CACHED_SEGS.
		 */
   1878 		lvp->lv_num_segs = loadable;
   1879 		lib_va_release(lvp);
   1880 		MOBJ_STAT_ADD(lvp_used);
   1881 	}
   1882 
   1883 	/* Now that we have mrp completely filled out go map it */
   1884 	ret = mmapobj_map_elf(vp, start_addr, mrp, loadable, fcred, e_type);
   1885 	if (ret == 0) {
   1886 		*num_mapped = loadable;
   1887 	}
   1888 
   1889 	return (ret);
   1890 }
   1891 
   1892 /*
   1893  * Take the ELF file passed in, and do the work of mapping it.
   1894  * num_mapped in - # elements in user buffer
   1895  * num_mapped out - # sections mapped and length of mrp array if
   1896  *			no errors.
   1897  */
   1898 static int
   1899 doelfwork(Ehdr *ehdrp, vnode_t *vp, mmapobj_result_t *mrp,
   1900     uint_t *num_mapped, size_t padding, cred_t *fcred)
   1901 {
   1902 	int error;
   1903 	offset_t phoff;
   1904 	int nphdrs;
   1905 	unsigned char ei_class;
   1906 	unsigned short phentsize;
   1907 	ssize_t phsizep;
   1908 	caddr_t phbasep;
   1909 	int to_map;
   1910 	model_t model;
   1911 
   1912 	ei_class = ehdrp->e_ident[EI_CLASS];
   1913 	model = get_udatamodel();
   1914 	if ((model == DATAMODEL_ILP32 && ei_class == ELFCLASS64) ||
   1915 	    (model == DATAMODEL_LP64 && ei_class == ELFCLASS32)) {
   1916 		MOBJ_STAT_ADD(wrong_model);
   1917 		return (ENOTSUP);
   1918 	}
   1919 
   1920 	/* Can't execute code from "noexec" mounted filesystem. */
   1921 	if (ehdrp->e_type == ET_EXEC &&
   1922 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) {
   1923 		MOBJ_STAT_ADD(noexec_fs);
   1924 		return (EACCES);
   1925 	}
   1926 
   1927 	/*
   1928 	 * Relocatable and core files are mapped as a single flat file
   1929 	 * since no interpretation is done on them by mmapobj.
   1930 	 */
   1931 	if (ehdrp->e_type == ET_REL || ehdrp->e_type == ET_CORE) {
   1932 		to_map = padding ? 3 : 1;
   1933 		if (*num_mapped < to_map) {
   1934 			*num_mapped = to_map;
   1935 			MOBJ_STAT_ADD(e2big_et_rel);
   1936 			return (E2BIG);
   1937 		}
   1938 		error = mmapobj_map_flat(vp, mrp, padding, fcred);
   1939 		if (error == 0) {
   1940 			*num_mapped = to_map;
   1941 			mrp[padding ? 1 : 0].mr_flags = MR_HDR_ELF;
   1942 			MOBJ_STAT_ADD(et_rel_mapped);
   1943 		}
   1944 		return (error);
   1945 	}
   1946 
   1947 	/* Check for an unknown ELF type */
   1948 	if (ehdrp->e_type != ET_EXEC && ehdrp->e_type != ET_DYN) {
   1949 		MOBJ_STAT_ADD(unknown_elf_type);
   1950 		return (ENOTSUP);
   1951 	}
   1952 
   1953 	if (ei_class == ELFCLASS32) {
   1954 		Elf32_Ehdr *e32hdr = (Elf32_Ehdr *)ehdrp;
   1955 		ASSERT(model == DATAMODEL_ILP32);
   1956 		nphdrs = e32hdr->e_phnum;
   1957 		phentsize = e32hdr->e_phentsize;
   1958 		if (phentsize < sizeof (Elf32_Phdr)) {
   1959 			MOBJ_STAT_ADD(phent32_too_small);
   1960 			return (ENOTSUP);
   1961 		}
   1962 		phoff = e32hdr->e_phoff;
   1963 	} else if (ei_class == ELFCLASS64) {
   1964 		Elf64_Ehdr *e64hdr = (Elf64_Ehdr *)ehdrp;
   1965 		ASSERT(model == DATAMODEL_LP64);
   1966 		nphdrs = e64hdr->e_phnum;
   1967 		phentsize = e64hdr->e_phentsize;
   1968 		if (phentsize < sizeof (Elf64_Phdr)) {
   1969 			MOBJ_STAT_ADD(phent64_too_small);
   1970 			return (ENOTSUP);
   1971 		}
   1972 		phoff = e64hdr->e_phoff;
   1973 	} else {
   1974 		/* fallthrough case for an invalid ELF class */
   1975 		MOBJ_STAT_ADD(inval_elf_class);
   1976 		return (ENOTSUP);
   1977 	}
   1978 
   1979 	/*
   1980 	 * nphdrs should only have this value for core files which are handled
   1981 	 * above as a single mapping.  If other file types ever use this
   1982 	 * sentinel, then we'll add the support needed to handle this here.
   1983 	 */
   1984 	if (nphdrs == PN_XNUM) {
   1985 		MOBJ_STAT_ADD(too_many_phdrs);
   1986 		return (ENOTSUP);
   1987 	}
   1988 
   1989 	phsizep = nphdrs * phentsize;
   1990 
   1991 	if (phsizep == 0) {
   1992 		MOBJ_STAT_ADD(no_phsize);
   1993 		return (ENOTSUP);
   1994 	}
   1995 
   1996 	/* Make sure we only wait for memory if it's a reasonable request */
   1997 	if (phsizep > mmapobj_alloc_threshold) {
   1998 		MOBJ_STAT_ADD(phsize_large);
   1999 		if ((phbasep = kmem_alloc(phsizep, KM_NOSLEEP)) == NULL) {
   2000 			MOBJ_STAT_ADD(phsize_xtralarge);
   2001 			return (ENOMEM);
   2002 		}
   2003 	} else {
   2004 		phbasep = kmem_alloc(phsizep, KM_SLEEP);
   2005 	}
   2006 
   2007 	if ((error = vn_rdwr(UIO_READ, vp, phbasep, phsizep,
   2008 	    (offset_t)phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
   2009 	    fcred, NULL)) != 0) {
   2010 		kmem_free(phbasep, phsizep);
   2011 		return (error);
   2012 	}
   2013 
   2014 	/* Now process the phdr's */
   2015 	error = process_phdr(ehdrp, phbasep, nphdrs, mrp, vp, num_mapped,
   2016 	    padding, fcred);
   2017 	kmem_free(phbasep, phsizep);
   2018 	return (error);
   2019 }
   2020 
   2021 #if defined(__sparc)
   2022 /*
   2023  * Hack to support 64 bit kernels running AOUT 4.x programs.
   2024  * This is the sizeof (struct nlist) for a 32 bit kernel.
   2025  * Since AOUT programs are 32 bit only, they will never use the 64 bit
   2026  * sizeof (struct nlist) and thus creating a #define is the simplest
   2027  * way around this since this is a format which is not being updated.
   2028  * This will be used in the place of sizeof (struct nlist) below.
   2029  */
   2030 #define	NLIST_SIZE	(0xC)
   2031 
   2032 static int
   2033 doaoutwork(vnode_t *vp, mmapobj_result_t *mrp,
   2034     uint_t *num_mapped, struct exec *hdr, cred_t *fcred)
   2035 {
   2036 	int error;
   2037 	size_t size;
   2038 	size_t osize;
   2039 	size_t nsize;	/* nlist size */
   2040 	size_t msize;
   2041 	size_t zfoddiff;
   2042 	caddr_t addr;
   2043 	caddr_t start_addr;
   2044 	struct as *as = curproc->p_as;
   2045 	int prot = PROT_USER | PROT_READ | PROT_EXEC;
   2046 	uint_t mflag = MAP_PRIVATE | _MAP_LOW32;
   2047 	offset_t off = 0;
   2048 	int segnum = 0;
   2049 	uint_t to_map;
   2050 	int is_library = 0;
   2051 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
   2052 
   2053 	/* Only 32bit apps supported by this file format */
   2054 	if (get_udatamodel() != DATAMODEL_ILP32) {
   2055 		MOBJ_STAT_ADD(aout_64bit_try);
   2056 		return (ENOTSUP);
   2057 	}
   2058 
   2059 	/* Check to see if this is a library */
   2060 	if (hdr->a_magic == ZMAGIC && hdr->a_entry < PAGESIZE) {
   2061 		is_library = 1;
   2062 	}
   2063 
   2064 	/* Can't execute code from "noexec" mounted filesystem. */
   2065 	if (((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) && (is_library == 0)) {
   2066 		MOBJ_STAT_ADD(aout_noexec);
   2067 		return (EACCES);
   2068 	}
   2069 
   2070 	/*
   2071 	 * There are 2 ways to calculate the mapped size of executable:
   2072 	 * 1) rounded text size + data size + bss size.
   2073 	 * 2) starting offset for text + text size + data size + text relocation
   2074 	 *    size + data relocation size + room for nlist data structure.
   2075 	 *
   2076 	 * The larger of the two sizes will be used to map this binary.
   2077 	 */
   2078 	osize = P2ROUNDUP(hdr->a_text, PAGESIZE) + hdr->a_data + hdr->a_bss;
   2079 
   2080 	off = hdr->a_magic == ZMAGIC ? 0 : sizeof (struct exec);
   2081 
   2082 	nsize = off + hdr->a_text + hdr->a_data + hdr->a_trsize +
   2083 	    hdr->a_drsize + NLIST_SIZE;
   2084 
   2085 	size = MAX(osize, nsize);
   2086 	if (size != nsize) {
   2087 		nsize = 0;
   2088 	}
   2089 
   2090 	/*
   2091 	 * 1 seg for text and 1 seg for initialized data.
   2092 	 * 1 seg for bss (if can't fit in leftover space of init data)
   2093 	 * 1 seg for nlist if needed.
   2094 	 */
   2095 	to_map = 2 + (nsize ? 1 : 0) +
   2096 	    (hdr->a_bss > PAGESIZE - P2PHASE(hdr->a_data, PAGESIZE) ? 1 : 0);
   2097 	if (*num_mapped < to_map) {
   2098 		*num_mapped = to_map;
   2099 		MOBJ_STAT_ADD(aout_e2big);
   2100 		return (E2BIG);
   2101 	}
   2102 
   2103 	/* Reserve address space for the whole mapping */
   2104 	if (is_library) {
   2105 		/* We'll let VOP_MAP below pick our address for us */
   2106 		addr = NULL;
   2107 		MOBJ_STAT_ADD(aout_lib);
   2108 	} else {
   2109 		/*
   2110 		 * default start address for fixed binaries from AOUT 4.x
   2111 		 * standard.
   2112 		 */
   2113 		MOBJ_STAT_ADD(aout_fixed);
   2114 		mflag |= MAP_FIXED;
   2115 		addr = (caddr_t)0x2000;
   2116 		as_rangelock(as);
   2117 		if (as_gap(as, size, &addr, &size, 0, NULL) != 0) {
   2118 			as_rangeunlock(as);
   2119 			MOBJ_STAT_ADD(aout_addr_in_use);
   2120 			return (EADDRINUSE);
   2121 		}
   2122 		crargs.flags |= MAP_NORESERVE;
   2123 		error = as_map(as, addr, size, segvn_create, &crargs);
   2124 		ASSERT(addr == (caddr_t)0x2000);
   2125 		as_rangeunlock(as);
   2126 	}
   2127 
   2128 	start_addr = addr;
   2129 	osize = size;
   2130 
   2131 	/*
   2132 	 * Map as large as we need, backed by file, this will be text, and
   2133 	 * possibly the nlist segment.  We map over this mapping for bss and
   2134 	 * initialized data segments.
   2135 	 */
   2136 	error = VOP_MAP(vp, off, as, &addr, size, prot, PROT_ALL,
   2137 	    mflag, fcred, NULL);
   2138 	if (error) {
   2139 		if (!is_library) {
   2140 			(void) as_unmap(as, start_addr, osize);
   2141 		}
   2142 		return (error);
   2143 	}
   2144 
   2145 	/* pickup the value of start_addr and osize for libraries */
   2146 	start_addr = addr;
   2147 	osize = size;
   2148 
   2149 	/*
   2150 	 * We have our initial reservation/allocation so we need to use fixed
   2151 	 * addresses from now on.
   2152 	 */
   2153 	mflag |= MAP_FIXED;
   2154 
   2155 	mrp[0].mr_addr = addr;
   2156 	mrp[0].mr_msize = hdr->a_text;
   2157 	mrp[0].mr_fsize = hdr->a_text;
   2158 	mrp[0].mr_offset = 0;
   2159 	mrp[0].mr_prot = PROT_READ | PROT_EXEC;
   2160 	mrp[0].mr_flags = MR_HDR_AOUT;
   2161 
   2162 
   2163 	/*
   2164 	 * Map initialized data. We are mapping over a portion of the
   2165 	 * previous mapping which will be unmapped in VOP_MAP below.
   2166 	 */
   2167 	off = P2ROUNDUP((offset_t)(hdr->a_text), PAGESIZE);
   2168 	msize = off;
   2169 	addr += off;
   2170 	size = hdr->a_data;
   2171 	error = VOP_MAP(vp, off, as, &addr, size, PROT_ALL, PROT_ALL,
   2172 	    mflag, fcred, NULL);
   2173 	if (error) {
   2174 		(void) as_unmap(as, start_addr, osize);
   2175 		return (error);
   2176 	}
   2177 	msize += size;
   2178 	mrp[1].mr_addr = addr;
   2179 	mrp[1].mr_msize = size;
   2180 	mrp[1].mr_fsize = size;
   2181 	mrp[1].mr_offset = 0;
   2182 	mrp[1].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
   2183 	mrp[1].mr_flags = 0;
   2184 
   2185 	/* Need to zero out remainder of page */
   2186 	addr += hdr->a_data;
   2187 	zfoddiff = P2PHASE((size_t)addr, PAGESIZE);
   2188 	if (zfoddiff) {
   2189 		label_t ljb;
   2190 
   2191 		MOBJ_STAT_ADD(aout_zfoddiff);
   2192 		zfoddiff = PAGESIZE - zfoddiff;
   2193 		if (on_fault(&ljb)) {
   2194 			no_fault();
   2195 			MOBJ_STAT_ADD(aout_uzero_fault);
   2196 			(void) as_unmap(as, start_addr, osize);
   2197 			return (EFAULT);
   2198 		}
   2199 		uzero(addr, zfoddiff);
   2200 		no_fault();
   2201 	}
   2202 	msize += zfoddiff;
   2203 	segnum = 2;
   2204 
   2205 	/* Map bss */
   2206 	if (hdr->a_bss > zfoddiff) {
   2207 		struct segvn_crargs crargs =
   2208 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
   2209 		MOBJ_STAT_ADD(aout_map_bss);
   2210 		addr += zfoddiff;
   2211 		size = hdr->a_bss - zfoddiff;
   2212 		as_rangelock(as);
   2213 		(void) as_unmap(as, addr, size);
   2214 		error = as_map(as, addr, size, segvn_create, &crargs);
   2215 		as_rangeunlock(as);
   2216 		msize += size;
   2217 
   2218 		if (error) {
   2219 			MOBJ_STAT_ADD(aout_bss_fail);
   2220 			(void) as_unmap(as, start_addr, osize);
   2221 			return (error);
   2222 		}
   2223 		mrp[2].mr_addr = addr;
   2224 		mrp[2].mr_msize = size;
   2225 		mrp[2].mr_fsize = 0;
   2226 		mrp[2].mr_offset = 0;
   2227 		mrp[2].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
   2228 		mrp[2].mr_flags = 0;
   2229 
   2230 		addr += size;
   2231 		segnum = 3;
   2232 	}
   2233 
   2234 	/*
   2235 	 * If we have extra bits left over, we need to include that in how
   2236 	 * much we mapped to make sure the nlist logic is correct
   2237 	 */
   2238 	msize = P2ROUNDUP(msize, PAGESIZE);
   2239 
   2240 	if (nsize && msize < nsize) {
   2241 		MOBJ_STAT_ADD(aout_nlist);
   2242 		mrp[segnum].mr_addr = addr;
   2243 		mrp[segnum].mr_msize = nsize - msize;
   2244 		mrp[segnum].mr_fsize = 0;
   2245 		mrp[segnum].mr_offset = 0;
   2246 		mrp[segnum].mr_prot = PROT_READ | PROT_EXEC;
   2247 		mrp[segnum].mr_flags = 0;
   2248 	}
   2249 
   2250 	*num_mapped = to_map;
   2251 	return (0);
   2252 }
   2253 #endif
   2254 
   2255 /*
   2256  * These are the two types of files that we can interpret and we want to read
   2257  * in enough info to cover both types when looking at the initial header.
   2258  */
   2259 #define	MAX_HEADER_SIZE	(MAX(sizeof (Ehdr), sizeof (struct exec)))
   2260 
   2261 /*
   2262  * Map vp passed in in an interpreted manner.  ELF and AOUT files will be
   2263  * interpreted and mapped appropriately for execution.
   2264  * num_mapped in - # elements in mrp
   2265  * num_mapped out - # sections mapped and length of mrp array if
   2266  *		    no errors or E2BIG returned.
   2267  *
   2268  * Returns 0 on success, errno value on failure.
   2269  */
   2270 static int
   2271 mmapobj_map_interpret(vnode_t *vp, mmapobj_result_t *mrp,
   2272     uint_t *num_mapped, size_t padding, cred_t *fcred)
   2273 {
   2274 	int error = 0;
   2275 	vattr_t vattr;
   2276 	struct lib_va *lvp;
   2277 	caddr_t start_addr;
   2278 	model_t model;
   2279 
   2280 	/*
   2281 	 * header has to be aligned to the native size of ulong_t in order
   2282 	 * to avoid an unaligned access when dereferencing the header as
   2283 	 * a ulong_t.  Thus we allocate our array on the stack of type
   2284 	 * ulong_t and then have header, which we dereference later as a char
   2285 	 * array point at lheader.
   2286 	 */
   2287 	ulong_t lheader[(MAX_HEADER_SIZE / (sizeof (ulong_t))) + 1];
   2288 	caddr_t header = (caddr_t)&lheader;
   2289 
   2290 	vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME | AT_SIZE;
   2291 	error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
   2292 	if (error) {
   2293 		return (error);
   2294 	}
   2295 
   2296 	/*
   2297 	 * Check lib_va to see if we already have a full description
   2298 	 * for this library.  This is the fast path and only used for
   2299 	 * ET_DYN ELF files (dynamic libraries).
   2300 	 */
   2301 	if (padding == 0 && (lvp = lib_va_find(&vattr)) != NULL) {
   2302 		int num_segs;
   2303 
   2304 		model = get_udatamodel();
   2305 		if ((model == DATAMODEL_ILP32 &&
   2306 		    lvp->lv_flags & LV_ELF64) ||
   2307 		    (model == DATAMODEL_LP64 &&
   2308 		    lvp->lv_flags & LV_ELF32)) {
   2309 			lib_va_release(lvp);
   2310 			MOBJ_STAT_ADD(fast_wrong_model);
   2311 			return (ENOTSUP);
   2312 		}
   2313 		num_segs = lvp->lv_num_segs;
   2314 		if (*num_mapped < num_segs) {
   2315 			*num_mapped = num_segs;
   2316 			lib_va_release(lvp);
   2317 			MOBJ_STAT_ADD(fast_e2big);
   2318 			return (E2BIG);
   2319 		}
   2320 
   2321 		/*
   2322 		 * Check to see if we have all the mappable program headers
   2323 		 * cached.
   2324 		 */
   2325 		if (num_segs <= LIBVA_CACHED_SEGS && num_segs != 0) {
   2326 			MOBJ_STAT_ADD(fast);
   2327 			start_addr = mmapobj_lookup_start_addr(lvp);
   2328 			if (start_addr == NULL) {
   2329 				lib_va_release(lvp);
   2330 				return (ENOMEM);
   2331 			}
   2332 
   2333 			bcopy(lvp->lv_mps, mrp,
   2334 			    num_segs * sizeof (mmapobj_result_t));
   2335 
   2336 			error = mmapobj_map_elf(vp, start_addr, mrp,
   2337 			    num_segs, fcred, ET_DYN);
   2338 
   2339 			lib_va_release(lvp);
   2340 			if (error == 0) {
   2341 				*num_mapped = num_segs;
   2342 				MOBJ_STAT_ADD(fast_success);
   2343 			}
   2344 			return (error);
   2345 		}
   2346 		MOBJ_STAT_ADD(fast_not_now);
   2347 
   2348 		/* Release it for now since we'll look it up below */
   2349 		lib_va_release(lvp);
   2350 	}
   2351 
   2352 	/*
   2353 	 * Time to see if this is a file we can interpret.  If it's smaller
   2354 	 * than this, then we can't interpret it.
   2355 	 */
   2356 	if (vattr.va_size < MAX_HEADER_SIZE) {
   2357 		MOBJ_STAT_ADD(small_file);
   2358 		return (ENOTSUP);
   2359 	}
   2360 
   2361 	if ((error = vn_rdwr(UIO_READ, vp, header, MAX_HEADER_SIZE, 0,
   2362 	    UIO_SYSSPACE, 0, (rlim64_t)0, fcred, NULL)) != 0) {
   2363 		MOBJ_STAT_ADD(read_error);
   2364 		return (error);
   2365 	}
   2366 
   2367 	/* Verify file type */
   2368 	if (header[EI_MAG0] == ELFMAG0 && header[EI_MAG1] == ELFMAG1 &&
   2369 	    header[EI_MAG2] == ELFMAG2 && header[EI_MAG3] == ELFMAG3) {
   2370 		return (doelfwork((Ehdr *)lheader, vp, mrp, num_mapped,
   2371 		    padding, fcred));
   2372 	}
   2373 
   2374 #if defined(__sparc)
   2375 	/* On sparc, check for 4.X AOUT format */
   2376 	switch (((struct exec *)header)->a_magic) {
   2377 	case OMAGIC:
   2378 	case ZMAGIC:
   2379 	case NMAGIC:
   2380 		return (doaoutwork(vp, mrp, num_mapped,
   2381 		    (struct exec *)lheader, fcred));
   2382 	}
   2383 #endif
   2384 
   2385 	/* Unsupported type */
   2386 	MOBJ_STAT_ADD(unsupported);
   2387 	return (ENOTSUP);
   2388 }
   2389 
   2390 /*
   2391  * Given a vnode, map it as either a flat file or interpret it and map
   2392  * it according to the rules of the file type.
   2393  * *num_mapped will contain the size of the mmapobj_result_t array passed in.
   2394  * If padding is non-zero, the mappings will be padded by that amount
   2395  * rounded up to the nearest pagesize.
   2396  * If the mapping is successful, *num_mapped will contain the number of
   2397  * distinct mappings created, and mrp will point to the array of
   2398  * mmapobj_result_t's which describe these mappings.
   2399  *
   2400  * On error, -1 is returned and errno is set appropriately.
   2401  * A special error case will set errno to E2BIG when there are more than
   2402  * *num_mapped mappings to be created and *num_mapped will be set to the
   2403  * number of mappings needed.
   2404  */
   2405 int
   2406 mmapobj(vnode_t *vp, uint_t flags, mmapobj_result_t *mrp,
   2407     uint_t *num_mapped, size_t padding, cred_t *fcred)
   2408 {
   2409 	int to_map;
   2410 	int error = 0;
   2411 
   2412 	ASSERT((padding & PAGEOFFSET) == 0);
   2413 	ASSERT((flags & ~MMOBJ_ALL_FLAGS) == 0);
   2414 	ASSERT(num_mapped != NULL);
   2415 	ASSERT((flags & MMOBJ_PADDING) ? padding != 0 : padding == 0);
   2416 
   2417 	if ((flags & MMOBJ_INTERPRET) == 0) {
   2418 		to_map = padding ? 3 : 1;
   2419 		if (*num_mapped < to_map) {
   2420 			*num_mapped = to_map;
   2421 			MOBJ_STAT_ADD(flat_e2big);
   2422 			return (E2BIG);
   2423 		}
   2424 		error = mmapobj_map_flat(vp, mrp, padding, fcred);
   2425 
   2426 		if (error) {
   2427 			return (error);
   2428 		}
   2429 		*num_mapped = to_map;
   2430 		return (0);
   2431 	}
   2432 
   2433 	error = mmapobj_map_interpret(vp, mrp, num_mapped, padding, fcred);
   2434 	return (error);
   2435 }
   2436