1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/kmem.h> 28 #include <sys/errno.h> 29 #include <sys/thread.h> 30 #include <sys/systm.h> 31 #include <sys/syscall.h> 32 #include <sys/proc.h> 33 #include <sys/modctl.h> 34 #include <sys/cmn_err.h> 35 #include <sys/model.h> 36 #include <sys/exec.h> 37 #include <sys/lx_impl.h> 38 #include <sys/machbrand.h> 39 #include <sys/lx_syscalls.h> 40 #include <sys/lx_pid.h> 41 #include <sys/lx_futex.h> 42 #include <sys/lx_brand.h> 43 #include <sys/termios.h> 44 #include <sys/sunddi.h> 45 #include <sys/ddi.h> 46 #include <sys/vnode.h> 47 #include <sys/pathname.h> 48 #include <sys/auxv.h> 49 #include <sys/priv.h> 50 #include <sys/regset.h> 51 #include <sys/privregs.h> 52 #include <sys/archsystm.h> 53 #include <sys/zone.h> 54 #include <sys/brand.h> 55 56 int lx_debug = 0; 57 58 void lx_init_brand_data(zone_t *); 59 void lx_free_brand_data(zone_t *); 60 void lx_setbrand(proc_t *); 61 int lx_getattr(zone_t *, int, void *, size_t *); 62 int lx_setattr(zone_t *, int, void *, size_t); 63 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 64 uintptr_t, uintptr_t, uintptr_t); 65 int lx_get_kern_version(void); 66 void lx_set_kern_version(zone_t *, int); 67 void lx_copy_procdata(proc_t *, proc_t *); 68 69 extern void lx_setrval(klwp_t *, int, int); 70 extern void lx_proc_exit(proc_t *, klwp_t *); 71 extern void lx_exec(); 72 extern int lx_initlwp(klwp_t *); 73 extern void lx_forklwp(klwp_t *, klwp_t *); 74 extern void lx_exitlwp(klwp_t *); 75 extern void lx_freelwp(klwp_t *); 76 extern greg_t lx_fixsegreg(greg_t, model_t); 77 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); 78 79 int lx_systrace_brand_enabled; 80 81 lx_systrace_f *lx_systrace_entry_ptr; 82 lx_systrace_f *lx_systrace_return_ptr; 83 84 static int lx_systrace_enabled; 85 86 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 87 struct intpdata *idata, int level, long *execsz, int setid, 88 caddr_t exec_file, struct cred *cred, int brand_action); 89 90 /* lx brand */ 91 struct brand_ops lx_brops = { 92 lx_init_brand_data, 93 lx_free_brand_data, 94 lx_brandsys, 95 lx_setbrand, 96 lx_getattr, 97 lx_setattr, 98 lx_copy_procdata, 99 lx_proc_exit, 100 lx_exec, 101 lx_setrval, 102 lx_initlwp, 103 lx_forklwp, 104 lx_freelwp, 105 lx_exitlwp, 106 lx_elfexec 107 }; 108 109 struct brand_mach_ops lx_mops = { 110 NULL, 111 lx_brand_int80_callback, 112 NULL, 113 NULL, 114 NULL, 115 lx_fixsegreg, 116 }; 117 118 struct brand lx_brand = { 119 BRAND_VER_1, 120 "lx", 121 &lx_brops, 122 &lx_mops 123 }; 124 125 static struct modlbrand modlbrand = { 126 &mod_brandops, "lx brand", &lx_brand 127 }; 128 129 static struct modlinkage modlinkage = { 130 MODREV_1, (void *)&modlbrand, NULL 131 }; 132 133 void 134 lx_proc_exit(proc_t *p, klwp_t *lwp) 135 { 136 zone_t *z = p->p_zone; 137 138 ASSERT(p->p_brand != NULL); 139 ASSERT(p->p_brand_data != NULL); 140 141 /* 142 * If init is dying and we aren't explicitly shutting down the zone 143 * or the system, then Solaris is about to restart init. The Linux 144 * init is not designed to handle a restart, which it interprets as 145 * a reboot. To give it a sane environment in which to run, we 146 * reboot the zone. 147 */ 148 if (p->p_pid == z->zone_proc_initpid) { 149 if (z->zone_boot_err == 0 && 150 z->zone_restart_init && 151 zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && 152 zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) 153 (void) zone_kadmin(A_REBOOT, 0, NULL, CRED()); 154 } 155 lx_exitlwp(lwp); 156 kmem_free(p->p_brand_data, sizeof (struct lx_proc_data)); 157 p->p_brand_data = NULL; 158 } 159 160 void 161 lx_setbrand(proc_t *p) 162 { 163 kthread_t *t = p->p_tlist; 164 int err; 165 166 ASSERT(p->p_brand_data == NULL); 167 ASSERT(ttolxlwp(curthread) == NULL); 168 169 p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP); 170 171 /* 172 * This routine can only be called for single-threaded processes. 173 * Since lx_initlwp() can only fail if we run out of PIDs for 174 * multithreaded processes, we know that this can never fail. 175 */ 176 err = lx_initlwp(t->t_lwp); 177 ASSERT(err == 0); 178 } 179 180 /* ARGSUSED */ 181 int 182 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 183 { 184 boolean_t val; 185 int num; 186 187 if (attr == LX_ATTR_RESTART_INIT) { 188 if (bufsize > sizeof (boolean_t)) 189 return (ERANGE); 190 if (copyin(buf, &val, sizeof (val)) != 0) 191 return (EFAULT); 192 if (val != B_TRUE && val != B_FALSE) 193 return (EINVAL); 194 zone->zone_restart_init = val; 195 return (0); 196 } else if (attr == LX_KERN_VERSION_NUM) { 197 if (bufsize > sizeof (int)) 198 return (ERANGE); 199 if (copyin(buf, &num, sizeof (num)) != 0) 200 return (EFAULT); 201 lx_set_kern_version(zone, num); 202 return (0); 203 } 204 return (EINVAL); 205 } 206 207 /* ARGSUSED */ 208 int 209 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 210 { 211 int num; 212 if (attr == LX_ATTR_RESTART_INIT) { 213 if (*bufsize < sizeof (boolean_t)) 214 return (ERANGE); 215 if (copyout(&zone->zone_restart_init, buf, 216 sizeof (boolean_t)) != 0) 217 return (EFAULT); 218 *bufsize = sizeof (boolean_t); 219 return (0); 220 } else if (attr == LX_KERN_VERSION_NUM) { 221 if (*bufsize < sizeof (int)) 222 return (ERANGE); 223 num = lx_get_kern_version(); 224 if (copyout(&num, buf, sizeof (int)) != 0) 225 return (EFAULT); 226 *bufsize = sizeof (int); 227 return (0); 228 } 229 return (-EINVAL); 230 } 231 232 /* 233 * Enable ptrace system call tracing for the given LWP. This is done by 234 * both setting the flag in that LWP's brand data (in the kernel) and setting 235 * the process-wide trace flag (in the brand library of the traced process). 236 */ 237 static int 238 lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set) 239 { 240 proc_t *p; 241 kthread_t *t; 242 klwp_t *lwp; 243 lx_proc_data_t *lpdp; 244 lx_lwp_data_t *lldp; 245 uintptr_t addr; 246 int ret, flag = 1; 247 248 if ((p = sprlock(pid)) == NULL) 249 return (ESRCH); 250 251 if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) { 252 sprunlock(p); 253 return (EPERM); 254 } 255 256 if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) { 257 sprunlock(p); 258 return (ESRCH); 259 } 260 261 if ((lpdp = p->p_brand_data) == NULL || 262 (lldp = lwp->lwp_brand) == NULL) { 263 sprunlock(p); 264 return (ESRCH); 265 } 266 267 if (set) { 268 /* 269 * Enable the ptrace flag for this LWP and this process. Note 270 * that we will turn off the LWP's ptrace flag, but we don't 271 * turn off the process's ptrace flag. 272 */ 273 lldp->br_ptrace = 1; 274 lpdp->l_ptrace = 1; 275 276 addr = lpdp->l_traceflag; 277 278 mutex_exit(&p->p_lock); 279 280 /* 281 * This can fail only in some rare corner cases where the 282 * process is exiting or we're completely out of memory. In 283 * these cases, it's sufficient to return an error to the ptrace 284 * consumer and leave the process-wide flag set. 285 */ 286 ret = uwrite(p, &flag, sizeof (flag), addr); 287 288 mutex_enter(&p->p_lock); 289 290 /* 291 * If we couldn't set the trace flag, unset the LWP's ptrace 292 * flag as there ptrace consumer won't expect this LWP to stop. 293 */ 294 if (ret != 0) 295 lldp->br_ptrace = 0; 296 } else { 297 lldp->br_ptrace = 0; 298 ret = 0; 299 } 300 301 sprunlock(p); 302 303 if (ret != 0) 304 ret = EIO; 305 306 return (ret); 307 } 308 309 static void 310 lx_ptrace_fire(void) 311 { 312 kthread_t *t = curthread; 313 klwp_t *lwp = ttolwp(t); 314 lx_lwp_data_t *lldp = lwp->lwp_brand; 315 316 /* 317 * The ptrace flag only applies until the next event is encountered 318 * for the given LWP. If it's set, turn off the flag and poke the 319 * controlling process by raising a signal. 320 */ 321 if (lldp->br_ptrace) { 322 lldp->br_ptrace = 0; 323 tsignal(t, SIGTRAP); 324 } 325 } 326 327 void 328 lx_brand_systrace_enable(void) 329 { 330 extern void lx_brand_int80_enable(void); 331 332 ASSERT(!lx_systrace_enabled); 333 334 lx_brand_int80_enable(); 335 336 lx_systrace_enabled = 1; 337 } 338 339 void 340 lx_brand_systrace_disable(void) 341 { 342 extern void lx_brand_int80_disable(void); 343 344 ASSERT(lx_systrace_enabled); 345 346 lx_brand_int80_disable(); 347 348 lx_systrace_enabled = 0; 349 } 350 351 void 352 lx_init_brand_data(zone_t *zone) 353 { 354 lx_zone_data_t *data; 355 ASSERT(zone->zone_brand == &lx_brand); 356 ASSERT(zone->zone_brand_data == NULL); 357 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); 358 /* 359 * Set the default lxzd_kernel_version to LX_KERN_2_4. 360 * This can be changed by a call to setattr() during zone boot. 361 */ 362 data->lxzd_kernel_version = LX_KERN_2_4; 363 data->lxzd_max_syscall = LX_NSYSCALLS_2_4; 364 zone->zone_brand_data = data; 365 } 366 367 void 368 lx_free_brand_data(zone_t *zone) 369 { 370 kmem_free(zone->zone_brand_data, sizeof (lx_zone_data_t)); 371 } 372 373 /* 374 * Get the addresses of the user-space system call handler and attach it to 375 * the proc structure. Returning 0 indicates success; the value returned 376 * by the system call is the value stored in rval. Returning a non-zero 377 * value indicates a failure; the value returned is used to set errno, -1 378 * is returned from the syscall and the contents of rval are ignored. To 379 * set errno and have the syscall return a value other than -1 we can 380 * manually set errno and rval and return 0. 381 */ 382 int 383 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 384 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) 385 { 386 kthread_t *t = curthread; 387 proc_t *p = ttoproc(t); 388 lx_proc_data_t *pd; 389 int linux_call; 390 struct termios *termios; 391 uint_t termios_len; 392 int error; 393 lx_brand_registration_t reg; 394 395 /* 396 * There is one operation that is suppored for non-branded 397 * process. B_EXEC_BRAND. This is the equilivant of an 398 * exec call, but the new process that is created will be 399 * a branded process. 400 */ 401 if (cmd == B_EXEC_BRAND) { 402 ASSERT(p->p_zone != NULL); 403 ASSERT(p->p_zone->zone_brand == &lx_brand); 404 return (exec_common( 405 (char *)arg1, (const char **)arg2, (const char **)arg3, 406 EBA_BRAND)); 407 } 408 409 /* For all other operations this must be a branded process. */ 410 if (p->p_brand == NULL) 411 return (set_errno(ENOSYS)); 412 413 ASSERT(p->p_brand == &lx_brand); 414 ASSERT(p->p_brand_data != NULL); 415 416 switch (cmd) { 417 case B_REGISTER: 418 if (p->p_model == DATAMODEL_NATIVE) { 419 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { 420 lx_print("Failed to copyin brand registration " 421 "at 0x%p\n", (void *)arg1); 422 return (EFAULT); 423 } 424 #ifdef _LP64 425 } else { 426 lx_brand_registration32_t reg32; 427 428 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { 429 lx_print("Failed to copyin brand registration " 430 "at 0x%p\n", (void *)arg1); 431 return (EFAULT); 432 } 433 434 reg.lxbr_version = (uint_t)reg32.lxbr_version; 435 reg.lxbr_handler = 436 (void *)(uintptr_t)reg32.lxbr_handler; 437 reg.lxbr_tracehandler = 438 (void *)(uintptr_t)reg32.lxbr_tracehandler; 439 reg.lxbr_traceflag = 440 (void *)(uintptr_t)reg32.lxbr_traceflag; 441 #endif 442 } 443 444 if (reg.lxbr_version != LX_VERSION_1) { 445 lx_print("Invalid brand library version (%u)\n", 446 reg.lxbr_version); 447 return (EINVAL); 448 } 449 450 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", 451 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); 452 pd = p->p_brand_data; 453 pd->l_handler = (uintptr_t)reg.lxbr_handler; 454 pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler; 455 pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag; 456 *rval = 0; 457 return (0); 458 case B_TTYMODES: 459 /* This is necessary for emulating TCGETS ioctls. */ 460 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), 461 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, 462 &termios_len) != DDI_SUCCESS) 463 return (EIO); 464 465 ASSERT(termios_len == sizeof (*termios)); 466 467 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { 468 ddi_prop_free(termios); 469 return (EFAULT); 470 } 471 472 ddi_prop_free(termios); 473 *rval = 0; 474 return (0); 475 476 case B_ELFDATA: 477 pd = curproc->p_brand_data; 478 if (copyout(&pd->l_elf_data, (void *)arg1, 479 sizeof (lx_elf_data_t)) != 0) { 480 (void) set_errno(EFAULT); 481 return (*rval = -1); 482 } 483 *rval = 0; 484 return (0); 485 486 case B_EXEC_NATIVE: 487 error = exec_common( 488 (char *)arg1, (const char **)arg2, (const char **)arg3, 489 EBA_NATIVE); 490 if (error) { 491 (void) set_errno(error); 492 return (*rval = -1); 493 } 494 return (*rval = 0); 495 496 case B_LPID_TO_SPAIR: 497 /* 498 * Given a Linux pid as arg1, return the Solaris pid in arg2 and 499 * the Solaris LWP in arg3. We also translate pid 1 (which is 500 * hardcoded in many applications) to the zone's init process. 501 */ 502 { 503 pid_t s_pid; 504 id_t s_tid; 505 506 if ((pid_t)arg1 == 1) { 507 s_pid = p->p_zone->zone_proc_initpid; 508 /* handle the dead/missing init(1M) case */ 509 if (s_pid == -1) 510 s_pid = 1; 511 s_tid = 1; 512 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, 513 &s_tid) < 0) 514 return (ESRCH); 515 516 if (copyout(&s_pid, (void *)arg2, 517 sizeof (s_pid)) != 0 || 518 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) 519 return (EFAULT); 520 521 *rval = 0; 522 return (0); 523 } 524 525 case B_PTRACE_SYSCALL: 526 *rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2, 527 (int)arg3); 528 return (0); 529 530 case B_SYSENTRY: 531 if (lx_systrace_enabled) { 532 uint32_t args[6]; 533 534 ASSERT(lx_systrace_entry_ptr != NULL); 535 536 if (copyin((void *)arg2, args, sizeof (args)) != 0) 537 return (EFAULT); 538 539 (*lx_systrace_entry_ptr)(arg1, args[0], args[1], 540 args[2], args[3], args[4], args[5]); 541 } 542 543 lx_ptrace_fire(); 544 545 pd = p->p_brand_data; 546 547 /* 548 * If neither DTrace not ptrace are interested in tracing 549 * this process any more, turn off the trace flag. 550 */ 551 if (!lx_systrace_enabled && !pd->l_ptrace) 552 (void) suword32((void *)pd->l_traceflag, 0); 553 554 *rval = 0; 555 return (0); 556 557 case B_SYSRETURN: 558 if (lx_systrace_enabled) { 559 ASSERT(lx_systrace_return_ptr != NULL); 560 561 (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0); 562 } 563 564 lx_ptrace_fire(); 565 566 pd = p->p_brand_data; 567 568 /* 569 * If neither DTrace not ptrace are interested in tracing 570 * this process any more, turn off the trace flag. 571 */ 572 if (!lx_systrace_enabled && !pd->l_ptrace) 573 (void) suword32((void *)pd->l_traceflag, 0); 574 575 *rval = 0; 576 return (0); 577 578 case B_SET_AFFINITY_MASK: 579 case B_GET_AFFINITY_MASK: 580 /* 581 * Retrieve or store the CPU affinity mask for the 582 * requested linux pid. 583 * 584 * arg1 is a linux PID (0 means curthread). 585 * arg2 is the size of the given mask. 586 * arg3 is the address of the affinity mask. 587 */ 588 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); 589 590 default: 591 linux_call = cmd - B_EMULATE_SYSCALL; 592 /* 593 * Only checking against highest syscall number for all kernel 594 * versions, since check for specific kernel version is done 595 * in userland prior to this call, and duplicating logic would 596 * be redundant. 597 */ 598 if (linux_call >= 0 && linux_call < LX_NSYSCALLS) { 599 *rval = lx_emulate_syscall(linux_call, arg1, arg2, 600 arg3, arg4, arg5, arg6); 601 return (0); 602 } 603 } 604 605 return (EINVAL); 606 } 607 608 int 609 lx_get_zone_kern_version(zone_t *zone) 610 { 611 return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version); 612 } 613 614 int 615 lx_get_kern_version() 616 { 617 return (lx_get_zone_kern_version(curzone)); 618 } 619 620 void 621 lx_set_kern_version(zone_t *zone, int vers) 622 { 623 lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; 624 625 lxzd->lxzd_kernel_version = vers; 626 if (vers == LX_KERN_2_6) 627 lxzd->lxzd_max_syscall = LX_NSYSCALLS_2_6; 628 } 629 630 /* 631 * Copy the per-process brand data from a parent proc to a child. 632 */ 633 void 634 lx_copy_procdata(proc_t *child, proc_t *parent) 635 { 636 lx_proc_data_t *cpd, *ppd; 637 638 ppd = parent->p_brand_data; 639 640 ASSERT(ppd != NULL); 641 642 cpd = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP); 643 *cpd = *ppd; 644 645 child->p_brand_data = cpd; 646 } 647 648 /* 649 * Currently, only 32-bit branded ELF executables are supported. 650 */ 651 #if defined(_LP64) 652 #define elfexec elf32exec 653 #define mapexec_brand mapexec32_brand 654 #endif /* _LP64 */ 655 656 /* 657 * Exec routine called by elfexec() to load 32-bit Linux binaries. 658 */ 659 static int 660 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 661 struct intpdata *idata, int level, long *execsz, int setid, 662 caddr_t exec_file, struct cred *cred, int brand_action) 663 { 664 int error; 665 vnode_t *nvp; 666 auxv32_t phdr_auxv32[3] = { 667 { AT_SUN_BRAND_LX_PHDR, 0 }, 668 { AT_SUN_BRAND_AUX2, 0 }, 669 { AT_SUN_BRAND_AUX3, 0 } 670 }; 671 Elf32_Ehdr ehdr; 672 Elf32_Addr uphdr_vaddr; 673 intptr_t voffset; 674 int interp; 675 int i; 676 struct execenv env; 677 struct user *up = PTOU(ttoproc(curthread)); 678 lx_elf_data_t *edp = 679 &((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data; 680 681 ASSERT(ttoproc(curthread)->p_brand == &lx_brand); 682 ASSERT(ttoproc(curthread)->p_brand_data != NULL); 683 684 /* 685 * Set the brandname and library name for the new process so that 686 * elfexec() puts them onto the stack. 687 */ 688 args->brandname = LX_BRANDNAME; 689 args->emulator = LX_LIB_PATH; 690 691 /* 692 * We will exec the brand library, and map in the linux linker and the 693 * linux executable. 694 */ 695 if (error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP, 696 &nvp)) { 697 uprintf("%s: not found.", LX_LIB); 698 return (error); 699 } 700 701 if (error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid, 702 exec_file, cred, brand_action)) { 703 VN_RELE(nvp); 704 return (error); 705 } 706 VN_RELE(nvp); 707 708 bzero(&env, sizeof (env)); 709 710 if (error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, 711 exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase, 712 &env.ex_brksize, NULL)) 713 return (error); 714 715 /* 716 * Save off the important properties of the lx executable. The brand 717 * library will ask us for this data later, when it is ready to set 718 * things up for the lx executable. 719 */ 720 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : 721 voffset + uphdr_vaddr; 722 edp->ed_entry = voffset + ehdr.e_entry; 723 edp->ed_phent = ehdr.e_phentsize; 724 edp->ed_phnum = ehdr.e_phnum; 725 726 if (interp) { 727 if (ehdr.e_type == ET_DYN) { 728 /* 729 * This is a shared object executable, so we need to 730 * pick a reasonable place to put the heap. Just don't 731 * use the first page. 732 */ 733 env.ex_brkbase = (caddr_t)PAGESIZE; 734 env.ex_bssbase = (caddr_t)PAGESIZE; 735 } 736 737 /* 738 * If the program needs an interpreter (most do), map it in and 739 * store relevant information about it in the aux vector, where 740 * the brand library can find it. 741 */ 742 if (error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP, 743 &nvp)) { 744 uprintf("%s: not found.", LX_LINKER); 745 return (error); 746 } 747 if (error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, 748 &voffset, exec_file, &interp, NULL, NULL, NULL, NULL)) { 749 VN_RELE(nvp); 750 return (error); 751 } 752 VN_RELE(nvp); 753 754 /* 755 * Now that we know the base address of the brand's linker, 756 * place it in the aux vector. 757 */ 758 edp->ed_base = voffset; 759 edp->ed_ldentry = voffset + ehdr.e_entry; 760 } else { 761 /* 762 * This program has no interpreter. The lx brand library will 763 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, 764 * so in this case, put the entry point of the main executable 765 * there. 766 */ 767 if (ehdr.e_type == ET_EXEC) { 768 /* 769 * An executable with no interpreter, this must be a 770 * statically linked executable, which means we loaded 771 * it at the address specified in the elf header, in 772 * which case the e_entry field of the elf header is an 773 * absolute address. 774 */ 775 edp->ed_ldentry = ehdr.e_entry; 776 edp->ed_entry = ehdr.e_entry; 777 } else { 778 /* 779 * A shared object with no interpreter, we use the 780 * calculated address from above. 781 */ 782 edp->ed_ldentry = edp->ed_entry; 783 784 /* 785 * In all situations except an ET_DYN elf object with no 786 * interpreter, we want to leave the brk and base 787 * values set by mapexec_brand alone. Normally when 788 * running ET_DYN objects on Solaris (most likely 789 * /lib/ld.so.1) the kernel sets brk and base to 0 since 790 * it doesn't know where to put the heap, and later the 791 * linker will call brk() to initialize the heap in: 792 * usr/src/cmd/sgs/rtld/common/setup.c:setup() 793 * after it has determined where to put it. (This 794 * decision is made after the linker loads and inspects 795 * elf properties of the target executable being run.) 796 * 797 * So for ET_DYN Linux executables, we also don't know 798 * where the heap should go, so we'll set the brk and 799 * base to 0. But in this case the Solaris linker will 800 * not initialize the heap, so when the Linux linker 801 * starts running there is no heap allocated. This 802 * seems to be ok on Linux 2.4 based systems because the 803 * Linux linker/libc fall back to using mmap() to 804 * allocate memory. But on 2.6 systems, running 805 * applications by specifying them as command line 806 * arguments to the linker results in segfaults for an 807 * as yet undetermined reason (which seems to indicatej 808 * that a more permanent fix for heap initalization in 809 * these cases may be necessary). 810 */ 811 if (ehdr.e_type == ET_DYN) { 812 env.ex_bssbase = (caddr_t)0; 813 env.ex_brkbase = (caddr_t)0; 814 env.ex_brksize = 0; 815 } 816 } 817 818 } 819 820 env.ex_vp = vp; 821 setexecenv(&env); 822 823 /* 824 * We don't need to copy this stuff out. It is only used by our 825 * tools to locate the lx linker's debug section. But we should at 826 * least try to keep /proc's view of the aux vector consistent with 827 * what's on the process stack. 828 */ 829 phdr_auxv32[0].a_un.a_val = edp->ed_phdr; 830 831 /* 832 * Linux 2.6 programs such as ps will print an error message if the 833 * following aux entry is missing 834 */ 835 if (lx_get_kern_version() >= LX_KERN_2_6) { 836 phdr_auxv32[1].a_type = AT_CLKTCK; 837 phdr_auxv32[1].a_un.a_val = hz; 838 } 839 840 if (copyout(&phdr_auxv32, args->auxp_brand, 841 sizeof (phdr_auxv32)) == -1) 842 return (EFAULT); 843 844 /* 845 * /proc uses the AT_ENTRY aux vector entry to deduce 846 * the location of the executable in the address space. The user 847 * structure contains a copy of the aux vector that needs to have those 848 * entries patched with the values of the real lx executable (they 849 * currently contain the values from the lx brand library that was 850 * elfexec'd, above). 851 * 852 * For live processes, AT_BASE is used to locate the linker segment, 853 * which /proc and friends will later use to find Solaris symbols 854 * (such as rtld_db_preinit). However, for core files, /proc uses 855 * AT_ENTRY to find the right segment to label as the executable. 856 * So we set AT_ENTRY to be the entry point of the linux executable, 857 * but leave AT_BASE to be the address of the Solaris linker. 858 */ 859 for (i = 0; i < __KERN_NAUXV_IMPL; i++) { 860 if (up->u_auxv[i].a_type == AT_ENTRY) 861 up->u_auxv[i].a_un.a_val = edp->ed_entry; 862 if (up->u_auxv[i].a_type == AT_SUN_BRAND_LX_PHDR) 863 up->u_auxv[i].a_un.a_val = edp->ed_phdr; 864 } 865 866 return (0); 867 } 868 869 int 870 _init(void) 871 { 872 int err = 0; 873 874 /* pid/tid conversion hash tables */ 875 lx_pid_init(); 876 877 /* for lx_futex() */ 878 lx_futex_init(); 879 880 err = mod_install(&modlinkage); 881 if (err != 0) { 882 cmn_err(CE_WARN, "Couldn't install lx brand module"); 883 884 /* 885 * This looks drastic, but it should never happen. These 886 * two data structures should be completely free-able until 887 * they are used by Linux processes. Since the brand 888 * wasn't loaded there should be no Linux processes, and 889 * thus no way for these data structures to be modified. 890 */ 891 lx_pid_fini(); 892 if (lx_futex_fini()) 893 panic("lx brand module cannot be loaded or unloaded."); 894 } 895 return (err); 896 } 897 898 int 899 _info(struct modinfo *modinfop) 900 { 901 return (mod_info(&modlinkage, modinfop)); 902 } 903 904 int 905 _fini(void) 906 { 907 int err; 908 int futex_done = 0; 909 910 /* 911 * If there are any zones using this brand, we can't allow it to be 912 * unloaded. 913 */ 914 if (brand_zone_count(&lx_brand)) 915 return (EBUSY); 916 917 lx_pid_fini(); 918 919 if ((err = lx_futex_fini()) != 0) 920 goto done; 921 futex_done = 1; 922 923 err = mod_remove(&modlinkage); 924 925 done: 926 if (err) { 927 /* 928 * If we can't unload the module, then we have to get it 929 * back into a sane state. 930 */ 931 lx_pid_init(); 932 933 if (futex_done) 934 lx_futex_init(); 935 936 } 937 938 return (err); 939 } 940