Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/kmem.h>
     28 #include <sys/errno.h>
     29 #include <sys/thread.h>
     30 #include <sys/systm.h>
     31 #include <sys/syscall.h>
     32 #include <sys/proc.h>
     33 #include <sys/modctl.h>
     34 #include <sys/cmn_err.h>
     35 #include <sys/model.h>
     36 #include <sys/exec.h>
     37 #include <sys/lx_impl.h>
     38 #include <sys/machbrand.h>
     39 #include <sys/lx_syscalls.h>
     40 #include <sys/lx_pid.h>
     41 #include <sys/lx_futex.h>
     42 #include <sys/lx_brand.h>
     43 #include <sys/termios.h>
     44 #include <sys/sunddi.h>
     45 #include <sys/ddi.h>
     46 #include <sys/vnode.h>
     47 #include <sys/pathname.h>
     48 #include <sys/auxv.h>
     49 #include <sys/priv.h>
     50 #include <sys/regset.h>
     51 #include <sys/privregs.h>
     52 #include <sys/archsystm.h>
     53 #include <sys/zone.h>
     54 #include <sys/brand.h>
     55 
     56 int	lx_debug = 0;
     57 
     58 void	lx_init_brand_data(zone_t *);
     59 void	lx_free_brand_data(zone_t *);
     60 void	lx_setbrand(proc_t *);
     61 int	lx_getattr(zone_t *, int, void *, size_t *);
     62 int	lx_setattr(zone_t *, int, void *, size_t);
     63 int	lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
     64 		uintptr_t, uintptr_t, uintptr_t);
     65 int	lx_get_kern_version(void);
     66 void	lx_set_kern_version(zone_t *, int);
     67 void	lx_copy_procdata(proc_t *, proc_t *);
     68 
     69 extern void lx_setrval(klwp_t *, int, int);
     70 extern void lx_proc_exit(proc_t *, klwp_t *);
     71 extern void lx_exec();
     72 extern int lx_initlwp(klwp_t *);
     73 extern void lx_forklwp(klwp_t *, klwp_t *);
     74 extern void lx_exitlwp(klwp_t *);
     75 extern void lx_freelwp(klwp_t *);
     76 extern greg_t lx_fixsegreg(greg_t, model_t);
     77 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
     78 
     79 int lx_systrace_brand_enabled;
     80 
     81 lx_systrace_f *lx_systrace_entry_ptr;
     82 lx_systrace_f *lx_systrace_return_ptr;
     83 
     84 static int lx_systrace_enabled;
     85 
     86 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
     87     struct intpdata *idata, int level, long *execsz, int setid,
     88     caddr_t exec_file, struct cred *cred, int brand_action);
     89 
     90 /* lx brand */
     91 struct brand_ops lx_brops = {
     92 	lx_init_brand_data,
     93 	lx_free_brand_data,
     94 	lx_brandsys,
     95 	lx_setbrand,
     96 	lx_getattr,
     97 	lx_setattr,
     98 	lx_copy_procdata,
     99 	lx_proc_exit,
    100 	lx_exec,
    101 	lx_setrval,
    102 	lx_initlwp,
    103 	lx_forklwp,
    104 	lx_freelwp,
    105 	lx_exitlwp,
    106 	lx_elfexec
    107 };
    108 
    109 struct brand_mach_ops lx_mops = {
    110 	NULL,
    111 	lx_brand_int80_callback,
    112 	NULL,
    113 	NULL,
    114 	NULL,
    115 	lx_fixsegreg,
    116 };
    117 
    118 struct brand lx_brand = {
    119 	BRAND_VER_1,
    120 	"lx",
    121 	&lx_brops,
    122 	&lx_mops
    123 };
    124 
    125 static struct modlbrand modlbrand = {
    126 	&mod_brandops, "lx brand", &lx_brand
    127 };
    128 
    129 static struct modlinkage modlinkage = {
    130 	MODREV_1, (void *)&modlbrand, NULL
    131 };
    132 
    133 void
    134 lx_proc_exit(proc_t *p, klwp_t *lwp)
    135 {
    136 	zone_t *z = p->p_zone;
    137 
    138 	ASSERT(p->p_brand != NULL);
    139 	ASSERT(p->p_brand_data != NULL);
    140 
    141 	/*
    142 	 * If init is dying and we aren't explicitly shutting down the zone
    143 	 * or the system, then Solaris is about to restart init.  The Linux
    144 	 * init is not designed to handle a restart, which it interprets as
    145 	 * a reboot.  To give it a sane environment in which to run, we
    146 	 * reboot the zone.
    147 	 */
    148 	if (p->p_pid == z->zone_proc_initpid) {
    149 		if (z->zone_boot_err == 0 &&
    150 		    z->zone_restart_init &&
    151 		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
    152 		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN)
    153 			(void) zone_kadmin(A_REBOOT, 0, NULL, CRED());
    154 	}
    155 	lx_exitlwp(lwp);
    156 	kmem_free(p->p_brand_data, sizeof (struct lx_proc_data));
    157 	p->p_brand_data = NULL;
    158 }
    159 
    160 void
    161 lx_setbrand(proc_t *p)
    162 {
    163 	kthread_t *t = p->p_tlist;
    164 	int err;
    165 
    166 	ASSERT(p->p_brand_data == NULL);
    167 	ASSERT(ttolxlwp(curthread) == NULL);
    168 
    169 	p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP);
    170 
    171 	/*
    172 	 * This routine can only be called for single-threaded processes.
    173 	 * Since lx_initlwp() can only fail if we run out of PIDs for
    174 	 * multithreaded processes, we know that this can never fail.
    175 	 */
    176 	err = lx_initlwp(t->t_lwp);
    177 	ASSERT(err == 0);
    178 }
    179 
    180 /* ARGSUSED */
    181 int
    182 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
    183 {
    184 	boolean_t val;
    185 	int num;
    186 
    187 	if (attr == LX_ATTR_RESTART_INIT) {
    188 		if (bufsize > sizeof (boolean_t))
    189 			return (ERANGE);
    190 		if (copyin(buf, &val, sizeof (val)) != 0)
    191 			return (EFAULT);
    192 		if (val != B_TRUE && val != B_FALSE)
    193 			return (EINVAL);
    194 		zone->zone_restart_init = val;
    195 		return (0);
    196 	} else if (attr == LX_KERN_VERSION_NUM) {
    197 		if (bufsize > sizeof (int))
    198 			return (ERANGE);
    199 		if (copyin(buf, &num, sizeof (num)) != 0)
    200 			return (EFAULT);
    201 		lx_set_kern_version(zone, num);
    202 		return (0);
    203 	}
    204 	return (EINVAL);
    205 }
    206 
    207 /* ARGSUSED */
    208 int
    209 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
    210 {
    211 	int num;
    212 	if (attr == LX_ATTR_RESTART_INIT) {
    213 		if (*bufsize < sizeof (boolean_t))
    214 			return (ERANGE);
    215 		if (copyout(&zone->zone_restart_init, buf,
    216 		    sizeof (boolean_t)) != 0)
    217 			return (EFAULT);
    218 		*bufsize = sizeof (boolean_t);
    219 		return (0);
    220 	} else if (attr == LX_KERN_VERSION_NUM) {
    221 		if (*bufsize < sizeof (int))
    222 			return (ERANGE);
    223 		num = lx_get_kern_version();
    224 		if (copyout(&num, buf, sizeof (int)) != 0)
    225 			return (EFAULT);
    226 		*bufsize = sizeof (int);
    227 		return (0);
    228 	}
    229 	return (-EINVAL);
    230 }
    231 
    232 /*
    233  * Enable ptrace system call tracing for the given LWP. This is done by
    234  * both setting the flag in that LWP's brand data (in the kernel) and setting
    235  * the process-wide trace flag (in the brand library of the traced process).
    236  */
    237 static int
    238 lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set)
    239 {
    240 	proc_t *p;
    241 	kthread_t *t;
    242 	klwp_t *lwp;
    243 	lx_proc_data_t *lpdp;
    244 	lx_lwp_data_t *lldp;
    245 	uintptr_t addr;
    246 	int ret, flag = 1;
    247 
    248 	if ((p = sprlock(pid)) == NULL)
    249 		return (ESRCH);
    250 
    251 	if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) {
    252 		sprunlock(p);
    253 		return (EPERM);
    254 	}
    255 
    256 	if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) {
    257 		sprunlock(p);
    258 		return (ESRCH);
    259 	}
    260 
    261 	if ((lpdp = p->p_brand_data) == NULL ||
    262 	    (lldp = lwp->lwp_brand) == NULL) {
    263 		sprunlock(p);
    264 		return (ESRCH);
    265 	}
    266 
    267 	if (set) {
    268 		/*
    269 		 * Enable the ptrace flag for this LWP and this process. Note
    270 		 * that we will turn off the LWP's ptrace flag, but we don't
    271 		 * turn off the process's ptrace flag.
    272 		 */
    273 		lldp->br_ptrace = 1;
    274 		lpdp->l_ptrace = 1;
    275 
    276 		addr = lpdp->l_traceflag;
    277 
    278 		mutex_exit(&p->p_lock);
    279 
    280 		/*
    281 		 * This can fail only in some rare corner cases where the
    282 		 * process is exiting or we're completely out of memory. In
    283 		 * these cases, it's sufficient to return an error to the ptrace
    284 		 * consumer and leave the process-wide flag set.
    285 		 */
    286 		ret = uwrite(p, &flag, sizeof (flag), addr);
    287 
    288 		mutex_enter(&p->p_lock);
    289 
    290 		/*
    291 		 * If we couldn't set the trace flag, unset the LWP's ptrace
    292 		 * flag as there ptrace consumer won't expect this LWP to stop.
    293 		 */
    294 		if (ret != 0)
    295 			lldp->br_ptrace = 0;
    296 	} else {
    297 		lldp->br_ptrace = 0;
    298 		ret = 0;
    299 	}
    300 
    301 	sprunlock(p);
    302 
    303 	if (ret != 0)
    304 		ret = EIO;
    305 
    306 	return (ret);
    307 }
    308 
    309 static void
    310 lx_ptrace_fire(void)
    311 {
    312 	kthread_t *t = curthread;
    313 	klwp_t *lwp = ttolwp(t);
    314 	lx_lwp_data_t *lldp = lwp->lwp_brand;
    315 
    316 	/*
    317 	 * The ptrace flag only applies until the next event is encountered
    318 	 * for the given LWP. If it's set, turn off the flag and poke the
    319 	 * controlling process by raising a signal.
    320 	 */
    321 	if (lldp->br_ptrace) {
    322 		lldp->br_ptrace = 0;
    323 		tsignal(t, SIGTRAP);
    324 	}
    325 }
    326 
    327 void
    328 lx_brand_systrace_enable(void)
    329 {
    330 	extern void lx_brand_int80_enable(void);
    331 
    332 	ASSERT(!lx_systrace_enabled);
    333 
    334 	lx_brand_int80_enable();
    335 
    336 	lx_systrace_enabled = 1;
    337 }
    338 
    339 void
    340 lx_brand_systrace_disable(void)
    341 {
    342 	extern void lx_brand_int80_disable(void);
    343 
    344 	ASSERT(lx_systrace_enabled);
    345 
    346 	lx_brand_int80_disable();
    347 
    348 	lx_systrace_enabled = 0;
    349 }
    350 
    351 void
    352 lx_init_brand_data(zone_t *zone)
    353 {
    354 	lx_zone_data_t *data;
    355 	ASSERT(zone->zone_brand == &lx_brand);
    356 	ASSERT(zone->zone_brand_data == NULL);
    357 	data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
    358 	/*
    359 	 * Set the default lxzd_kernel_version to LX_KERN_2_4.
    360 	 * This can be changed by a call to setattr() during zone boot.
    361 	 */
    362 	data->lxzd_kernel_version = LX_KERN_2_4;
    363 	data->lxzd_max_syscall = LX_NSYSCALLS_2_4;
    364 	zone->zone_brand_data = data;
    365 }
    366 
    367 void
    368 lx_free_brand_data(zone_t *zone)
    369 {
    370 	kmem_free(zone->zone_brand_data, sizeof (lx_zone_data_t));
    371 }
    372 
    373 /*
    374  * Get the addresses of the user-space system call handler and attach it to
    375  * the proc structure. Returning 0 indicates success; the value returned
    376  * by the system call is the value stored in rval. Returning a non-zero
    377  * value indicates a failure; the value returned is used to set errno, -1
    378  * is returned from the syscall and the contents of rval are ignored. To
    379  * set errno and have the syscall return a value other than -1 we can
    380  * manually set errno and rval and return 0.
    381  */
    382 int
    383 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
    384     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
    385 {
    386 	kthread_t *t = curthread;
    387 	proc_t *p = ttoproc(t);
    388 	lx_proc_data_t *pd;
    389 	int linux_call;
    390 	struct termios *termios;
    391 	uint_t termios_len;
    392 	int error;
    393 	lx_brand_registration_t reg;
    394 
    395 	/*
    396 	 * There is one operation that is suppored for non-branded
    397 	 * process.  B_EXEC_BRAND.  This is the equilivant of an
    398 	 * exec call, but the new process that is created will be
    399 	 * a branded process.
    400 	 */
    401 	if (cmd == B_EXEC_BRAND) {
    402 		ASSERT(p->p_zone != NULL);
    403 		ASSERT(p->p_zone->zone_brand == &lx_brand);
    404 		return (exec_common(
    405 		    (char *)arg1, (const char **)arg2, (const char **)arg3,
    406 		    EBA_BRAND));
    407 	}
    408 
    409 	/* For all other operations this must be a branded process. */
    410 	if (p->p_brand == NULL)
    411 		return (set_errno(ENOSYS));
    412 
    413 	ASSERT(p->p_brand == &lx_brand);
    414 	ASSERT(p->p_brand_data != NULL);
    415 
    416 	switch (cmd) {
    417 	case B_REGISTER:
    418 		if (p->p_model == DATAMODEL_NATIVE) {
    419 			if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
    420 				lx_print("Failed to copyin brand registration "
    421 				    "at 0x%p\n", (void *)arg1);
    422 				return (EFAULT);
    423 			}
    424 #ifdef _LP64
    425 		} else {
    426 			lx_brand_registration32_t reg32;
    427 
    428 			if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
    429 				lx_print("Failed to copyin brand registration "
    430 				    "at 0x%p\n", (void *)arg1);
    431 				return (EFAULT);
    432 			}
    433 
    434 			reg.lxbr_version = (uint_t)reg32.lxbr_version;
    435 			reg.lxbr_handler =
    436 			    (void *)(uintptr_t)reg32.lxbr_handler;
    437 			reg.lxbr_tracehandler =
    438 			    (void *)(uintptr_t)reg32.lxbr_tracehandler;
    439 			reg.lxbr_traceflag =
    440 			    (void *)(uintptr_t)reg32.lxbr_traceflag;
    441 #endif
    442 		}
    443 
    444 		if (reg.lxbr_version != LX_VERSION_1) {
    445 			lx_print("Invalid brand library version (%u)\n",
    446 			    reg.lxbr_version);
    447 			return (EINVAL);
    448 		}
    449 
    450 		lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
    451 		    (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
    452 		pd = p->p_brand_data;
    453 		pd->l_handler = (uintptr_t)reg.lxbr_handler;
    454 		pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler;
    455 		pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag;
    456 		*rval = 0;
    457 		return (0);
    458 	case B_TTYMODES:
    459 		/* This is necessary for emulating TCGETS ioctls. */
    460 		if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
    461 		    DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
    462 		    &termios_len) != DDI_SUCCESS)
    463 			return (EIO);
    464 
    465 		ASSERT(termios_len == sizeof (*termios));
    466 
    467 		if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
    468 			ddi_prop_free(termios);
    469 			return (EFAULT);
    470 		}
    471 
    472 		ddi_prop_free(termios);
    473 		*rval = 0;
    474 		return (0);
    475 
    476 	case B_ELFDATA:
    477 		pd = curproc->p_brand_data;
    478 		if (copyout(&pd->l_elf_data, (void *)arg1,
    479 		    sizeof (lx_elf_data_t)) != 0) {
    480 			(void) set_errno(EFAULT);
    481 			return (*rval = -1);
    482 		}
    483 		*rval = 0;
    484 		return (0);
    485 
    486 	case B_EXEC_NATIVE:
    487 		error = exec_common(
    488 		    (char *)arg1, (const char **)arg2, (const char **)arg3,
    489 		    EBA_NATIVE);
    490 		if (error) {
    491 			(void) set_errno(error);
    492 			return (*rval = -1);
    493 		}
    494 		return (*rval = 0);
    495 
    496 	case B_LPID_TO_SPAIR:
    497 		/*
    498 		 * Given a Linux pid as arg1, return the Solaris pid in arg2 and
    499 		 * the Solaris LWP in arg3.  We also translate pid 1 (which is
    500 		 * hardcoded in many applications) to the zone's init process.
    501 		 */
    502 		{
    503 			pid_t s_pid;
    504 			id_t s_tid;
    505 
    506 			if ((pid_t)arg1 == 1) {
    507 				s_pid = p->p_zone->zone_proc_initpid;
    508 				/* handle the dead/missing init(1M) case */
    509 				if (s_pid == -1)
    510 					s_pid = 1;
    511 				s_tid = 1;
    512 			} else if (lx_lpid_to_spair((pid_t)arg1, &s_pid,
    513 			    &s_tid) < 0)
    514 				return (ESRCH);
    515 
    516 			if (copyout(&s_pid, (void *)arg2,
    517 			    sizeof (s_pid)) != 0 ||
    518 			    copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0)
    519 				return (EFAULT);
    520 
    521 			*rval = 0;
    522 			return (0);
    523 		}
    524 
    525 	case B_PTRACE_SYSCALL:
    526 		*rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2,
    527 		    (int)arg3);
    528 		return (0);
    529 
    530 	case B_SYSENTRY:
    531 		if (lx_systrace_enabled) {
    532 			uint32_t args[6];
    533 
    534 			ASSERT(lx_systrace_entry_ptr != NULL);
    535 
    536 			if (copyin((void *)arg2, args, sizeof (args)) != 0)
    537 				return (EFAULT);
    538 
    539 			(*lx_systrace_entry_ptr)(arg1, args[0], args[1],
    540 			    args[2], args[3], args[4], args[5]);
    541 		}
    542 
    543 		lx_ptrace_fire();
    544 
    545 		pd = p->p_brand_data;
    546 
    547 		/*
    548 		 * If neither DTrace not ptrace are interested in tracing
    549 		 * this process any more, turn off the trace flag.
    550 		 */
    551 		if (!lx_systrace_enabled && !pd->l_ptrace)
    552 			(void) suword32((void *)pd->l_traceflag, 0);
    553 
    554 		*rval = 0;
    555 		return (0);
    556 
    557 	case B_SYSRETURN:
    558 		if (lx_systrace_enabled) {
    559 			ASSERT(lx_systrace_return_ptr != NULL);
    560 
    561 			(*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0);
    562 		}
    563 
    564 		lx_ptrace_fire();
    565 
    566 		pd = p->p_brand_data;
    567 
    568 		/*
    569 		 * If neither DTrace not ptrace are interested in tracing
    570 		 * this process any more, turn off the trace flag.
    571 		 */
    572 		if (!lx_systrace_enabled && !pd->l_ptrace)
    573 			(void) suword32((void *)pd->l_traceflag, 0);
    574 
    575 		*rval = 0;
    576 		return (0);
    577 
    578 	case B_SET_AFFINITY_MASK:
    579 	case B_GET_AFFINITY_MASK:
    580 		/*
    581 		 * Retrieve or store the CPU affinity mask for the
    582 		 * requested linux pid.
    583 		 *
    584 		 * arg1 is a linux PID (0 means curthread).
    585 		 * arg2 is the size of the given mask.
    586 		 * arg3 is the address of the affinity mask.
    587 		 */
    588 		return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
    589 
    590 	default:
    591 		linux_call = cmd - B_EMULATE_SYSCALL;
    592 		/*
    593 		 * Only checking against highest syscall number for all kernel
    594 		 * versions, since check for specific kernel version is done
    595 		 * in userland prior to this call, and duplicating logic would
    596 		 * be redundant.
    597 		 */
    598 		if (linux_call >= 0 && linux_call < LX_NSYSCALLS) {
    599 			*rval = lx_emulate_syscall(linux_call, arg1, arg2,
    600 			    arg3, arg4, arg5, arg6);
    601 			return (0);
    602 		}
    603 	}
    604 
    605 	return (EINVAL);
    606 }
    607 
    608 int
    609 lx_get_zone_kern_version(zone_t *zone)
    610 {
    611 	return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
    612 }
    613 
    614 int
    615 lx_get_kern_version()
    616 {
    617 	return (lx_get_zone_kern_version(curzone));
    618 }
    619 
    620 void
    621 lx_set_kern_version(zone_t *zone, int vers)
    622 {
    623 	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
    624 
    625 	lxzd->lxzd_kernel_version = vers;
    626 	if (vers == LX_KERN_2_6)
    627 		lxzd->lxzd_max_syscall = LX_NSYSCALLS_2_6;
    628 }
    629 
    630 /*
    631  * Copy the per-process brand data from a parent proc to a child.
    632  */
    633 void
    634 lx_copy_procdata(proc_t *child, proc_t *parent)
    635 {
    636 	lx_proc_data_t *cpd, *ppd;
    637 
    638 	ppd = parent->p_brand_data;
    639 
    640 	ASSERT(ppd != NULL);
    641 
    642 	cpd = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP);
    643 	*cpd = *ppd;
    644 
    645 	child->p_brand_data = cpd;
    646 }
    647 
    648 /*
    649  * Currently, only 32-bit branded ELF executables are supported.
    650  */
    651 #if defined(_LP64)
    652 #define	elfexec			elf32exec
    653 #define	mapexec_brand		mapexec32_brand
    654 #endif /* _LP64 */
    655 
    656 /*
    657  * Exec routine called by elfexec() to load 32-bit Linux binaries.
    658  */
    659 static int
    660 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
    661     struct intpdata *idata, int level, long *execsz, int setid,
    662     caddr_t exec_file, struct cred *cred, int brand_action)
    663 {
    664 	int		error;
    665 	vnode_t		*nvp;
    666 	auxv32_t	phdr_auxv32[3] = {
    667 	    { AT_SUN_BRAND_LX_PHDR, 0 },
    668 	    { AT_SUN_BRAND_AUX2, 0 },
    669 	    { AT_SUN_BRAND_AUX3, 0 }
    670 	};
    671 	Elf32_Ehdr	ehdr;
    672 	Elf32_Addr	uphdr_vaddr;
    673 	intptr_t	voffset;
    674 	int		interp;
    675 	int		i;
    676 	struct execenv	env;
    677 	struct user	*up = PTOU(ttoproc(curthread));
    678 	lx_elf_data_t	*edp =
    679 	    &((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data;
    680 
    681 	ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
    682 	ASSERT(ttoproc(curthread)->p_brand_data != NULL);
    683 
    684 	/*
    685 	 * Set the brandname and library name for the new process so that
    686 	 * elfexec() puts them onto the stack.
    687 	 */
    688 	args->brandname = LX_BRANDNAME;
    689 	args->emulator = LX_LIB_PATH;
    690 
    691 	/*
    692 	 * We will exec the brand library, and map in the linux linker and the
    693 	 * linux executable.
    694 	 */
    695 	if (error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP,
    696 	    &nvp)) {
    697 		uprintf("%s: not found.", LX_LIB);
    698 		return (error);
    699 	}
    700 
    701 	if (error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid,
    702 	    exec_file, cred, brand_action)) {
    703 		VN_RELE(nvp);
    704 		return (error);
    705 	}
    706 	VN_RELE(nvp);
    707 
    708 	bzero(&env, sizeof (env));
    709 
    710 	if (error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset,
    711 	    exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase,
    712 	    &env.ex_brksize, NULL))
    713 		return (error);
    714 
    715 	/*
    716 	 * Save off the important properties of the lx executable. The brand
    717 	 * library will ask us for this data later, when it is ready to set
    718 	 * things up for the lx executable.
    719 	 */
    720 	edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
    721 	    voffset + uphdr_vaddr;
    722 	edp->ed_entry = voffset + ehdr.e_entry;
    723 	edp->ed_phent = ehdr.e_phentsize;
    724 	edp->ed_phnum = ehdr.e_phnum;
    725 
    726 	if (interp) {
    727 		if (ehdr.e_type == ET_DYN) {
    728 			/*
    729 			 * This is a shared object executable, so we need to
    730 			 * pick a reasonable place to put the heap. Just don't
    731 			 * use the first page.
    732 			 */
    733 			env.ex_brkbase = (caddr_t)PAGESIZE;
    734 			env.ex_bssbase = (caddr_t)PAGESIZE;
    735 		}
    736 
    737 		/*
    738 		 * If the program needs an interpreter (most do), map it in and
    739 		 * store relevant information about it in the aux vector, where
    740 		 * the brand library can find it.
    741 		 */
    742 		if (error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP,
    743 		    &nvp)) {
    744 			uprintf("%s: not found.", LX_LINKER);
    745 			return (error);
    746 		}
    747 		if (error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr,
    748 		    &voffset, exec_file, &interp, NULL, NULL, NULL, NULL)) {
    749 			VN_RELE(nvp);
    750 			return (error);
    751 		}
    752 		VN_RELE(nvp);
    753 
    754 		/*
    755 		 * Now that we know the base address of the brand's linker,
    756 		 * place it in the aux vector.
    757 		 */
    758 		edp->ed_base = voffset;
    759 		edp->ed_ldentry = voffset + ehdr.e_entry;
    760 	} else {
    761 		/*
    762 		 * This program has no interpreter. The lx brand library will
    763 		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
    764 		 * so in this case, put the entry point of the main executable
    765 		 * there.
    766 		 */
    767 		if (ehdr.e_type == ET_EXEC) {
    768 			/*
    769 			 * An executable with no interpreter, this must be a
    770 			 * statically linked executable, which means we loaded
    771 			 * it at the address specified in the elf header, in
    772 			 * which case the e_entry field of the elf header is an
    773 			 * absolute address.
    774 			 */
    775 			edp->ed_ldentry = ehdr.e_entry;
    776 			edp->ed_entry = ehdr.e_entry;
    777 		} else {
    778 			/*
    779 			 * A shared object with no interpreter, we use the
    780 			 * calculated address from above.
    781 			 */
    782 			edp->ed_ldentry = edp->ed_entry;
    783 
    784 			/*
    785 			 * In all situations except an ET_DYN elf object with no
    786 			 * interpreter, we want to leave the brk and base
    787 			 * values set by mapexec_brand alone. Normally when
    788 			 * running ET_DYN objects on Solaris (most likely
    789 			 * /lib/ld.so.1) the kernel sets brk and base to 0 since
    790 			 * it doesn't know where to put the heap, and later the
    791 			 * linker will call brk() to initialize the heap in:
    792 			 *	usr/src/cmd/sgs/rtld/common/setup.c:setup()
    793 			 * after it has determined where to put it.  (This
    794 			 * decision is made after the linker loads and inspects
    795 			 * elf properties of the target executable being run.)
    796 			 *
    797 			 * So for ET_DYN Linux executables, we also don't know
    798 			 * where the heap should go, so we'll set the brk and
    799 			 * base to 0.  But in this case the Solaris linker will
    800 			 * not initialize the heap, so when the Linux linker
    801 			 * starts running there is no heap allocated.  This
    802 			 * seems to be ok on Linux 2.4 based systems because the
    803 			 * Linux linker/libc fall back to using mmap() to
    804 			 * allocate memory. But on 2.6 systems, running
    805 			 * applications by specifying them as command line
    806 			 * arguments to the linker results in segfaults for an
    807 			 * as yet undetermined reason (which seems to indicatej
    808 			 * that a more permanent fix for heap initalization in
    809 			 * these cases may be necessary).
    810 			 */
    811 			if (ehdr.e_type == ET_DYN) {
    812 				env.ex_bssbase = (caddr_t)0;
    813 				env.ex_brkbase = (caddr_t)0;
    814 				env.ex_brksize = 0;
    815 			}
    816 		}
    817 
    818 	}
    819 
    820 	env.ex_vp = vp;
    821 	setexecenv(&env);
    822 
    823 	/*
    824 	 * We don't need to copy this stuff out. It is only used by our
    825 	 * tools to locate the lx linker's debug section. But we should at
    826 	 * least try to keep /proc's view of the aux vector consistent with
    827 	 * what's on the process stack.
    828 	 */
    829 	phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
    830 
    831 	/*
    832 	 * Linux 2.6 programs such as ps will print an error message if the
    833 	 * following aux entry is missing
    834 	 */
    835 	if (lx_get_kern_version() >= LX_KERN_2_6) {
    836 		phdr_auxv32[1].a_type = AT_CLKTCK;
    837 		phdr_auxv32[1].a_un.a_val = hz;
    838 	}
    839 
    840 	if (copyout(&phdr_auxv32, args->auxp_brand,
    841 	    sizeof (phdr_auxv32)) == -1)
    842 		return (EFAULT);
    843 
    844 	/*
    845 	 * /proc uses the AT_ENTRY aux vector entry to deduce
    846 	 * the location of the executable in the address space. The user
    847 	 * structure contains a copy of the aux vector that needs to have those
    848 	 * entries patched with the values of the real lx executable (they
    849 	 * currently contain the values from the lx brand library that was
    850 	 * elfexec'd, above).
    851 	 *
    852 	 * For live processes, AT_BASE is used to locate the linker segment,
    853 	 * which /proc and friends will later use to find Solaris symbols
    854 	 * (such as rtld_db_preinit). However, for core files, /proc uses
    855 	 * AT_ENTRY to find the right segment to label as the executable.
    856 	 * So we set AT_ENTRY to be the entry point of the linux executable,
    857 	 * but leave AT_BASE to be the address of the Solaris linker.
    858 	 */
    859 	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
    860 		if (up->u_auxv[i].a_type == AT_ENTRY)
    861 			up->u_auxv[i].a_un.a_val = edp->ed_entry;
    862 		if (up->u_auxv[i].a_type == AT_SUN_BRAND_LX_PHDR)
    863 			up->u_auxv[i].a_un.a_val = edp->ed_phdr;
    864 	}
    865 
    866 	return (0);
    867 }
    868 
    869 int
    870 _init(void)
    871 {
    872 	int err = 0;
    873 
    874 	/* pid/tid conversion hash tables */
    875 	lx_pid_init();
    876 
    877 	/* for lx_futex() */
    878 	lx_futex_init();
    879 
    880 	err = mod_install(&modlinkage);
    881 	if (err != 0) {
    882 		cmn_err(CE_WARN, "Couldn't install lx brand module");
    883 
    884 		/*
    885 		 * This looks drastic, but it should never happen.  These
    886 		 * two data structures should be completely free-able until
    887 		 * they are used by Linux processes.  Since the brand
    888 		 * wasn't loaded there should be no Linux processes, and
    889 		 * thus no way for these data structures to be modified.
    890 		 */
    891 		lx_pid_fini();
    892 		if (lx_futex_fini())
    893 			panic("lx brand module cannot be loaded or unloaded.");
    894 	}
    895 	return (err);
    896 }
    897 
    898 int
    899 _info(struct modinfo *modinfop)
    900 {
    901 	return (mod_info(&modlinkage, modinfop));
    902 }
    903 
    904 int
    905 _fini(void)
    906 {
    907 	int err;
    908 	int futex_done = 0;
    909 
    910 	/*
    911 	 * If there are any zones using this brand, we can't allow it to be
    912 	 * unloaded.
    913 	 */
    914 	if (brand_zone_count(&lx_brand))
    915 		return (EBUSY);
    916 
    917 	lx_pid_fini();
    918 
    919 	if ((err = lx_futex_fini()) != 0)
    920 		goto done;
    921 	futex_done = 1;
    922 
    923 	err = mod_remove(&modlinkage);
    924 
    925 done:
    926 	if (err) {
    927 		/*
    928 		 * If we can't unload the module, then we have to get it
    929 		 * back into a sane state.
    930 		 */
    931 		lx_pid_init();
    932 
    933 		if (futex_done)
    934 			lx_futex_init();
    935 
    936 	}
    937 
    938 	return (err);
    939 }
    940