1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "@(#)shm_mmap_ops.c 1.16 09/05/26 SMI" 28 29 /* 30 * shared memory operations. They support the attaching and 31 * detaching of shared memory, to reduce the overall address space 32 * usage on a 32 bit system. All this extra work is a waste of 33 * time on a 64 bit system at present, unless someone has decreased 34 * some of the limits. 35 * 36 * Still to do: 37 * The fd should not be held in the mmap_shm structure, but 38 * in a seperate structure with a reference count. When 39 * the reference count drops to 0 the fd can be closed. 40 * 41 * Currently it is possible, to leak file descriptors, but 42 * this will only happen during error paths, most if not all 43 * of which are fatal so it won't impact the program. 44 */ 45 #ifndef _LARGEFILE64_SOURCE 46 #define _LARGEFILE64_SOURCE 47 #endif 48 #include <unistd.h> 49 #include <stdlib.h> 50 #include <sys/types.h> 51 #include <sys/stat.h> 52 #include <fcntl.h> 53 #include <sys/mman.h> 54 #include <errno.h> 55 #include <string.h> 56 #include <assert.h> 57 #include <limits.h> 58 #include <stdio.h> 59 #include "shm_ops.h" 60 #include <diskomizer/log.h> 61 #include <diskomizer/stack_trace.h> 62 #include "signal_catch.h" 63 #include "args.h" 64 #include "list.h" 65 #include "utils.h" 66 67 struct list_head { 68 struct mmap_shm *head; 69 struct mmap_shm *tail; 70 }; 71 struct mmap_shm { 72 struct mmap_shm *next; 73 struct mmap_shm *prev; 74 shm_flags flags; 75 int fd; 76 off_t off; 77 int ref; 78 int users; 79 void *addr; 80 ulong_t len; 81 }; 82 struct frag { 83 struct mmap_shm *base; 84 off_t off; 85 }; 86 struct free_frags { 87 struct free_frags *next; 88 struct free_frags *prev; 89 struct frag *frag; 90 ulong_t len; 91 }; 92 93 static struct free_frags free_frag; 94 static struct list_head free_list, attach_list; 95 static int hasfailed; 96 static struct fd { 97 int fd; 98 off_t trunc; 99 off_t off; 100 } fd = { 101 -1, 102 0, 103 0 104 }; 105 static void * 106 my_alloc(size_t nelem, size_t elsize) 107 { 108 void *res; 109 long sbrkval; 110 struct mmap_shm *q; 111 112 res = calloc(nelem, elsize); 113 if (res == NULL && (sbrkval = (long)sbrk(0)) != -1) { 114 sbrkval += (nelem * elsize) + 8192; 115 for (q = free_list.tail; q != NULL && res == NULL; 116 q = q->prev) { 117 if (q->addr != NULL && q->ref == 0 && 118 (ulong_t)q->addr <= sbrkval) { 119 if (munmap(q->addr, q->len) == -1) { 120 pperror("%d, munmap(%lx, %lx)", 121 __LINE__, (ulong_t)q->addr, q->len); 122 } else { 123 q->addr = NULL; 124 } 125 res = calloc(nelem, elsize); 126 } 127 } 128 } 129 for (q = free_list.tail; q != NULL && res == NULL; q = q->prev) { 130 if (q->addr != NULL && q->ref == 0) { 131 if (munmap(q->addr, q->len) == -1) { 132 pperror("%d, munmap(%lx, %lx)", __LINE__, 133 (ulong_t)q->addr, q->len); 134 } else { 135 res = calloc(nelem, elsize); 136 if (res == NULL) 137 q->addr = NULL; 138 else 139 q->addr = NULL; 140 } 141 } 142 } 143 return (res); 144 } 145 146 static int 147 create_temp(void) 148 { 149 char xtempfile[] = "mmap_memory_file_XXXXXX"; 150 char *tempfile, *tmp; 151 int fd; 152 153 tempfile = my_alloc(1, strlen(xtempfile) + 154 strlen(opts.mmap_file_directory) + 2); 155 if (tempfile == NULL) { 156 return (-1); 157 } 158 (void) sprintf(tempfile, "%s/%s", opts.mmap_file_directory, 159 xtempfile); 160 161 tmp = mktemp(tempfile); 162 if ((fd = get_high_fd(open(tmp, O_RDWR|O_CREAT|O_EXCL, 163 S_IRUSR|S_IWUSR|S_ISVTX))) == -1) { 164 int saveerrno = errno; 165 free(tempfile); 166 errno = saveerrno; 167 } else { 168 (void) unlink(tmp); 169 free(tempfile); 170 } 171 return (fd); 172 } 173 void * 174 mmap_init(long a, long b, shm_flags flags) 175 { 176 struct mmap_shm *x; 177 struct mmap_shm *q; 178 int prot = PROT_READ | PROT_WRITE | 179 (flags.execute == 1 ? PROT_EXEC : 0); 180 int use_dev_zero = !(flags.allow_detach || 181 opts.expert_always_mmap_from_file); 182 183 if ((x = (struct mmap_shm *)my_alloc(1, sizeof (struct mmap_shm))) == 184 NULL) { 185 return (NULL); 186 } 187 188 assert(a); 189 assert(b); 190 x->len = a * b; 191 assert(x->len != 0); 192 x->flags = flags; 193 x->flags.write = 1; 194 195 if (!use_dev_zero) { 196 int pagesz = sysconf(_SC_PAGESIZE); 197 do { 198 if (fd.fd == -1) { 199 if ((x->fd = create_temp()) == -1) { 200 int saveerrno = errno; 201 free(x); 202 errno = saveerrno; 203 return (NULL); 204 } 205 fd.fd = x->fd; 206 fd.trunc = fd.off = 0; 207 } else { 208 x->fd = fd.fd; 209 } 210 if (fd.off + x->len > fd.trunc) { 211 off_t old_trunc = fd.trunc; 212 while (fd.off + x->len > fd.trunc) 213 fd.trunc += 100 * 1024 * pagesz; 214 if (ftruncate(x->fd, fd.trunc) == -1) { 215 int e; 216 fd.trunc = old_trunc + x->len + 217 (pagesz - (x->len % pagesz)); 218 /* 219 * Try to just get what is needed now. 220 * It may be enough! 221 */ 222 if (ftruncate(x->fd, fd.trunc) != -1) { 223 break; 224 } 225 e = errno; 226 /* 227 * Shrink the file back to match 228 * what is actually being used. 229 */ 230 (void) ftruncate(x->fd, fd.off); 231 errno = e; 232 233 if (errno != EFBIG && errno != EINVAL) { 234 fd.trunc = old_trunc; 235 pperror("ftruncate(%d, %llX)", 236 x->fd, (long long)fd.trunc); 237 return (NULL); 238 } 239 fd.fd = -1; 240 } 241 } 242 } while (fd.fd == -1); 243 x->off = fd.off; 244 fd.off = fd.off + x->len + (pagesz - (x->len % pagesz)); 245 } else { 246 if ((x->fd = open("/dev/zero", O_RDWR, 0600)) == -1) { 247 int saveerrno = errno; 248 free(x); 249 errno = saveerrno; 250 return (NULL); 251 } 252 x->off = 0; 253 } 254 255 if (opts.fast_start && flags.allow_detach) { 256 x->addr = NULL; 257 x->ref = 0; 258 LIST_ADD(&free_list, x); 259 return (x); 260 } 261 x->addr = mmap(0, x->len, prot, MAP_SHARED, x->fd, x->off); 262 x->ref = 0; 263 264 for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED; 265 q = q->prev) { 266 hasfailed = 1; 267 if (x->len != q->len) 268 continue; 269 if (q->addr != NULL && q->ref == 0) { 270 if (munmap(q->addr, q->len) == -1) { 271 pperror("munmap(%lx, %lx)", 272 (ulong_t)q->addr, q->len); 273 } else { 274 q->addr = NULL; 275 } 276 x->addr = mmap(0, x->len, prot, 277 MAP_SHARED, x->fd, x->off); 278 } 279 } 280 for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED; 281 q = q->prev) { 282 if (q->addr != NULL && q->ref == 0) { 283 if (munmap(q->addr, q->len) == -1) { 284 pperror("munmap(%lx, %lx)", 285 (ulong_t)q->addr, q->len); 286 } else { 287 q->addr = NULL; 288 } 289 x->addr = mmap(0, x->len, prot, 290 MAP_SHARED, x->fd, x->off); 291 } 292 } 293 294 if (x->addr == MAP_FAILED) { 295 (void) close(x->fd); 296 free(x); 297 return (NULL); 298 } 299 if (!use_dev_zero) { 300 /* 301 * Be ready to take SIGBUS if there is no enough free 302 * memory to allocate this page and then be able to 303 * report a sensible failure message. 304 */ 305 void *sig = expect_signal(SIGBUS, "memset", x->addr, a *b); 306 (void) memset(x->addr, NULL, a * b); 307 cancel_expected_signal(SIGBUS, sig); 308 if (x->flags.always_detach || !flags.leave_attached) { 309 if (munmap(x->addr, x->len) == -1) 310 pperror("munmap(%lx, %lx)", 311 (ulong_t)x->addr, x->len); 312 else 313 x->addr = NULL; 314 } 315 } else { 316 if (close(x->fd) != -1) 317 x->fd = -1; 318 } 319 LIST_ADD(&free_list, x); 320 assert(x->addr != MAP_FAILED); 321 return (x); 322 } 323 void * 324 mmap_attach(void * arg) 325 { 326 struct mmap_shm *x = (struct mmap_shm *)arg; 327 328 if (x->addr == NULL) { 329 struct mmap_shm *q; 330 void *hint = NULL; /* may get passed in as an arg in future */ 331 int prot = PROT_READ | 332 (x->flags.write ? PROT_WRITE : 0) | 333 (x->flags.execute ? PROT_EXEC : 0); 334 x->addr = mmap(hint, x->len, prot, MAP_SHARED, 335 x->fd, x->off); 336 for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED; 337 q = q->prev) { 338 hasfailed = 1; 339 if (x->len != q->len) 340 continue; 341 if (q != x && q->addr != NULL) { 342 if (munmap(q->addr, q->len) == -1) { 343 pperror("munmap(%lx, %lx)", 344 (ulong_t)q->addr, q->len); 345 } else { 346 hint = q->addr; 347 q->addr = NULL; 348 } 349 x->addr = mmap(hint, x->len, prot, 350 MAP_SHARED, x->fd, x->off); 351 } 352 } 353 for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED; 354 q = q->prev) { 355 if (q != x && q->addr != NULL) { 356 if (munmap(q->addr, q->len) == -1) { 357 pperror("munmap(%lx, %lx)", 358 (ulong_t)q->addr, q->len); 359 } else { 360 hint = q->addr; 361 q->addr = NULL; 362 } 363 x->addr = mmap(hint, x->len, prot, 364 MAP_SHARED, x->fd, x->off); 365 } 366 } 367 if (x->addr == MAP_FAILED) { 368 x->addr = NULL; 369 } 370 } 371 if (x->addr != NULL) { 372 if (x->ref++ == 0) { 373 LIST_REMOVE(&free_list, x); 374 LIST_ADD(&attach_list, x); 375 } 376 } 377 assert(x->addr != MAP_FAILED); 378 return (x->addr); 379 } 380 shm_det_status 381 mmap_detach(void *arg) 382 { 383 struct mmap_shm *x = (struct mmap_shm *)arg; 384 385 if (x->flags.allow_detach) { 386 if (--(x->ref) == 0) { 387 LIST_REMOVE(&attach_list, x); 388 LIST_ADD(&free_list, x); 389 if (x->flags.always_detach) { 390 if (munmap(x->addr, x->len) != -1) { 391 x->addr = NULL; 392 return (SHM_DETACH_DONE); 393 } else { 394 return (SHM_DETACH_ERR); 395 } 396 } 397 } 398 } 399 return (SHM_DETACH_OK); 400 } 401 void 402 mmap_destroy(void *arg) 403 { 404 struct mmap_shm *x = (struct mmap_shm *)arg; 405 406 if (--(x->users) != 0) 407 return; 408 if (x->addr) { 409 if (munmap(x->addr, x->len) == -1) 410 x->addr = NULL; 411 } 412 /* 413 * 414 * if (x->fd) { 415 * (void) close(x->fd); 416 * } 417 */ 418 free(x); 419 } 420 ulong_t 421 mmap_max_size() 422 { 423 return (opts.expert_mmap_max_size); 424 } 425 /*ARGSUSED*/ 426 const char * 427 mmap_name(void *x) 428 { 429 static char name[] = "mmap_no_frag"; 430 return (&name[0]); 431 } 432 static int 433 mmap_is_short_of_mem(void) 434 { 435 return (hasfailed); 436 } 437 /*ARGSUSED*/ 438 static void 439 mmap_complete(void * handle) 440 { 441 if (fd.off != fd.trunc && fd.fd != -1) { 442 if (ftruncate(fd.fd, fd.off) == -1) { 443 pperror("ftruncate(%d, %llX)", fd.fd, 444 (long long)fd.trunc); 445 } else { 446 plog(LOG_DEBUG, "ftruncate(%d, %llX)\n", 447 fd.fd, (long long) fd.trunc); 448 } 449 } 450 } 451 452 void * 453 frag_init(long a, long b, shm_flags flags) 454 { 455 struct frag *f; 456 struct free_frags *ff; 457 long len = a * b; 458 int pagesz = sysconf(_SC_PAGESIZE); 459 460 len = ROUND_UP(len, sizeof (long long)); 461 462 if (pagesz > len) { 463 for (ff = &free_frag; ff != NULL && ff->frag != NULL; 464 ff = ff->next) { 465 if (len <= ff->len && memcmp(&flags, 466 &ff->frag->base->flags, sizeof (flags))) { 467 /* Bingo */ 468 if (ff->len - len < sizeof (off64_t)) { 469 if (ff->prev != NULL) { 470 ff->prev->next = ff->next; 471 } 472 if (ff->next != NULL) { 473 ff->next->prev = ff->prev; 474 } 475 f = ff->frag; 476 if (ff == &free_frag) { 477 ff->frag = NULL; 478 ff->prev = NULL; 479 ff->next = NULL; 480 } else { 481 free(ff); 482 } 483 } else { 484 if ((f = my_alloc(1, 485 sizeof (struct frag))) == NULL) { 486 return (NULL); 487 } 488 *f = *ff->frag; 489 ff->len -= len; 490 ff->frag->off += len; 491 } 492 f->base->users++; 493 return (f); 494 } 495 } 496 /* nothing suitable */ 497 if ((f = my_alloc(1, sizeof (struct frag))) == NULL) { 498 return (NULL); 499 } 500 if ((f->base = mmap_init(1, pagesz, flags)) == NULL) { 501 free(f); 502 return (NULL); 503 } 504 f->off = 0; 505 if ((pagesz - len) >= sizeof (long long)) { 506 if (free_frag.frag == NULL) { 507 ff = &free_frag; 508 if ((ff->frag = my_alloc(1, 509 sizeof (struct frag))) == NULL) { 510 return (NULL); 511 } 512 } else if ((ff = my_alloc(1, 513 sizeof (struct free_frags))) == NULL) { 514 return (NULL); 515 } else { 516 if ((ff->frag = my_alloc(1, 517 sizeof (struct frag))) == NULL) { 518 free(ff); 519 return (NULL); 520 } 521 ff->next = free_frag.next; 522 if (free_frag.next != NULL) { 523 free_frag.next->prev = ff; 524 } 525 ff->prev = &free_frag; 526 free_frag.next = ff; 527 } 528 ff->frag->base = f->base; 529 ff->frag->off = len; 530 ff->len = pagesz - len; 531 } 532 } else { 533 if ((f = my_alloc(1, sizeof (struct frag))) == NULL) { 534 return (NULL); 535 } 536 if ((f->base = mmap_init(1, len, flags)) == NULL) { 537 free(f); 538 return (NULL); 539 } 540 f->off = 0; 541 } 542 f->base->users++; 543 return (f); 544 } 545 546 static void * 547 frag_attach(void *arg) 548 { 549 struct frag *f = (struct frag *)arg; 550 char *addr; 551 552 if ((addr = mmap_attach(f->base)) == NULL) { 553 return (NULL); 554 } else { 555 return ((void *)(addr + f->off)); 556 } 557 } 558 559 static shm_det_status 560 frag_detach(void *arg) 561 { 562 struct frag *f = (struct frag *)arg; 563 return (mmap_detach(f->base)); 564 } 565 566 /*ARGSUSED*/ 567 static const char * 568 frag_name(void *arg) 569 { 570 static char name[] = "mmap"; 571 return (&name[0]); 572 } 573 574 static void 575 frag_destroy(void *arg) 576 { 577 struct frag *f = (struct frag *)arg; 578 mmap_destroy(f->base); 579 } 580 581 static void 582 mmap_fini(void) 583 { 584 } 585 586 587 struct shm_ops shm_mmap_no_frag_ops = { 588 mmap_name, 589 mmap_name, 590 mmap_init, 591 mmap_attach, 592 mmap_detach, 593 mmap_destroy, 594 mmap_max_size, 595 mmap_is_short_of_mem, 596 mmap_complete, 597 mmap_fini 598 /* mmap_garbage_collect */ 599 }; 600 struct shm_ops shm_mmap_ops = { 601 frag_name, 602 frag_name, 603 frag_init, 604 frag_attach, 605 frag_detach, 606 frag_destroy, 607 mmap_max_size, 608 mmap_is_short_of_mem, 609 mmap_complete, 610 mmap_fini 611 /* mmap_garbage_collect */ 612 }; 613