Home | History | Annotate | Download | only in diskomizer
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)shm_mmap_ops.c	1.16	09/05/26 SMI"
     28 
     29 /*
     30  * shared memory operations.  They support the attaching and
     31  * detaching of shared memory, to reduce the overall address space
     32  * usage on a 32 bit system. All this extra work is a waste of
     33  * time on a 64 bit system at present, unless someone has decreased
     34  * some of the limits.
     35  *
     36  * Still to do:
     37  *	The fd should not be held in the mmap_shm structure, but
     38  *	in a separate structure with a reference count.  When
     39  *	the reference count drops to 0 the fd can be closed.
     40  *
     41  *	Currently it is possible to leak file descriptors, but
     42  *	this will only happen on error paths, most if not all
     43  *	of which are fatal, so it won't impact the program.
     44  */
     45 #ifndef _LARGEFILE64_SOURCE
     46 #define	_LARGEFILE64_SOURCE
     47 #endif
     48 #include <unistd.h>
     49 #include <stdlib.h>
     50 #include <sys/types.h>
     51 #include <sys/stat.h>
     52 #include <fcntl.h>
     53 #include <sys/mman.h>
     54 #include <errno.h>
     55 #include <string.h>
     56 #include <assert.h>
     57 #include <limits.h>
     58 #include <stdio.h>
     59 #include "shm_ops.h"
     60 #include <diskomizer/log.h>
     61 #include <diskomizer/stack_trace.h>
     62 #include "signal_catch.h"
     63 #include "args.h"
     64 #include "list.h"
     65 #include "utils.h"
     66 
/*
 * Head and tail pointers of a doubly linked list of mmap_shm segments.
 * Two instances are defined in this file: free_list and attach_list.
 */
struct list_head {
	struct mmap_shm *head;
	struct mmap_shm *tail;
};
/*
 * One mmap() backed shared memory segment.  A pointer to this structure
 * is the handle returned by mmap_init().
 */
struct mmap_shm {
	struct mmap_shm *next;	/* list linkage, see struct list_head */
	struct mmap_shm *prev;
	shm_flags flags;	/* flags given to mmap_init(), write forced on */
	int fd;			/* descriptor passed to mmap(); set to -1 */
				/* once a /dev/zero descriptor is closed */
	off_t off;		/* offset of the segment within fd */
	int ref;		/* incremented by mmap_attach(), */
				/* decremented by mmap_detach() */
	int users;		/* incremented by frag_init(), */
				/* decremented by mmap_destroy() */
	void *addr;		/* current mapping, NULL while not mapped */
	ulong_t len;		/* length handed to mmap() and munmap() */
};
/*
 * A piece of a segment: frag_attach() returns the address of the
 * attached base segment plus off.
 */
struct frag {
	struct mmap_shm *base;	/* segment this fragment lives in */
	off_t off;		/* byte offset of the fragment within base */
};
/*
 * Node of the list of unused space left over in segments created by
 * frag_init().
 */
struct free_frags {
	struct free_frags *next;
	struct free_frags *prev;
	struct frag *frag;	/* where the unused space starts */
	ulong_t len;		/* number of unused bytes from frag->off */
};
     92 
/*
 * free_frag is the statically allocated first node of the free fragment
 * list; frag_init() treats a NULL free_frag.frag as "list is empty".
 */
static struct free_frags free_frag;
/*
 * mmap_init() puts every new segment on free_list.  mmap_attach() moves a
 * segment to attach_list when its ref count leaves zero and mmap_detach()
 * moves it back when the count returns to zero.
 */
static struct list_head free_list, attach_list;
/* Set to 1 once an mmap() call has failed; see mmap_is_short_of_mem(). */
static int hasfailed;
/*
 * State of the temporary file that backs file based segments.
 */
static struct fd {
	int fd;		/* descriptor of the backing file, -1 if none yet */
	off_t trunc;	/* size the file was last extended to by ftruncate() */
	off_t off;	/* offset at which the next segment will be placed */
} fd = {
	-1,
	0,
	0
};
    105 static void *
    106 my_alloc(size_t nelem, size_t elsize)
    107 {
    108 	void *res;
    109 	long sbrkval;
    110 	struct mmap_shm *q;
    111 
    112 	res = calloc(nelem, elsize);
    113 	if (res == NULL && (sbrkval = (long)sbrk(0))  != -1) {
    114 		sbrkval += (nelem * elsize) + 8192;
    115 		for (q = free_list.tail; q != NULL && res == NULL;
    116 		    q = q->prev) {
    117 			if (q->addr != NULL && q->ref == 0 &&
    118 			    (ulong_t)q->addr <= sbrkval) {
    119 				if (munmap(q->addr, q->len) == -1) {
    120 					pperror("%d, munmap(%lx, %lx)",
    121 					    __LINE__, (ulong_t)q->addr, q->len);
    122 				} else {
    123 					q->addr = NULL;
    124 				}
    125 				res = calloc(nelem, elsize);
    126 			}
    127 		}
    128 	}
    129 	for (q = free_list.tail; q != NULL && res == NULL; q = q->prev) {
    130 		if (q->addr != NULL && q->ref == 0) {
    131 			if (munmap(q->addr, q->len) == -1) {
    132 				pperror("%d, munmap(%lx, %lx)", __LINE__,
    133 				    (ulong_t)q->addr, q->len);
    134 			} else {
    135 				res = calloc(nelem, elsize);
    136 				if (res == NULL)
    137 					q->addr = NULL;
    138 				else
    139 					q->addr = NULL;
    140 			}
    141 		}
    142 	}
    143 	return (res);
    144 }
    145 
    146 static int
    147 create_temp(void)
    148 {
    149 	char xtempfile[] = "mmap_memory_file_XXXXXX";
    150 	char *tempfile, *tmp;
    151 	int fd;
    152 
    153 	tempfile = my_alloc(1, strlen(xtempfile) +
    154 	    strlen(opts.mmap_file_directory) + 2);
    155 	if (tempfile == NULL) {
    156 		return (-1);
    157 	}
    158 	(void) sprintf(tempfile, "%s/%s", opts.mmap_file_directory,
    159 	    xtempfile);
    160 
    161 	tmp = mktemp(tempfile);
    162 	if ((fd = get_high_fd(open(tmp, O_RDWR|O_CREAT|O_EXCL,
    163 	    S_IRUSR|S_IWUSR|S_ISVTX))) == -1) {
    164 		int saveerrno = errno;
    165 		free(tempfile);
    166 		errno = saveerrno;
    167 	} else {
    168 		(void) unlink(tmp);
    169 		free(tempfile);
    170 	}
    171 	return (fd);
    172 }
    173 void *
    174 mmap_init(long a, long b, shm_flags flags)
    175 {
    176 	struct mmap_shm *x;
    177 	struct mmap_shm *q;
    178 	int prot = PROT_READ | PROT_WRITE |
    179 	    (flags.execute == 1 ? PROT_EXEC : 0);
    180 	int use_dev_zero = !(flags.allow_detach ||
    181 	    opts.expert_always_mmap_from_file);
    182 
    183 	if ((x = (struct mmap_shm *)my_alloc(1, sizeof (struct mmap_shm))) ==
    184 	    NULL) {
    185 		return (NULL);
    186 	}
    187 
    188 	assert(a);
    189 	assert(b);
    190 	x->len = a * b;
    191 	assert(x->len != 0);
    192 	x->flags = flags;
    193 	x->flags.write = 1;
    194 
    195 	if (!use_dev_zero) {
    196 		int pagesz = sysconf(_SC_PAGESIZE);
    197 		do {
    198 			if (fd.fd == -1) {
    199 				if ((x->fd = create_temp()) == -1) {
    200 					int saveerrno = errno;
    201 					free(x);
    202 					errno = saveerrno;
    203 					return (NULL);
    204 				}
    205 				fd.fd = x->fd;
    206 				fd.trunc =  fd.off = 0;
    207 			} else {
    208 				x->fd = fd.fd;
    209 			}
    210 			if (fd.off + x->len > fd.trunc) {
    211 				off_t old_trunc = fd.trunc;
    212 				while (fd.off + x->len > fd.trunc)
    213 					fd.trunc += 100 * 1024 * pagesz;
    214 				if (ftruncate(x->fd, fd.trunc) == -1) {
    215 					int e;
    216 					fd.trunc = old_trunc + x->len +
    217 					    (pagesz - (x->len % pagesz));
    218 					/*
    219 					 * Try to just get what is needed now.
    220 					 * It may be enough!
    221 					 */
    222 					if (ftruncate(x->fd, fd.trunc) != -1) {
    223 						break;
    224 					}
    225 					e = errno;
    226 					/*
    227 					 * Shrink the file back to match
    228 					 * what is actually being used.
    229 					 */
    230 					(void) ftruncate(x->fd, fd.off);
    231 					errno = e;
    232 
    233 					if (errno != EFBIG && errno != EINVAL) {
    234 						fd.trunc = old_trunc;
    235 						pperror("ftruncate(%d, %llX)",
    236 						    x->fd, (long long)fd.trunc);
    237 						return (NULL);
    238 					}
    239 					fd.fd = -1;
    240 				}
    241 			}
    242 		} while (fd.fd == -1);
    243 		x->off = fd.off;
    244 		fd.off = fd.off + x->len + (pagesz - (x->len % pagesz));
    245 	} else {
    246 		if ((x->fd = open("/dev/zero", O_RDWR, 0600)) == -1) {
    247 			int saveerrno = errno;
    248 			free(x);
    249 			errno = saveerrno;
    250 			return (NULL);
    251 		}
    252 		x->off = 0;
    253 	}
    254 
    255 	if (opts.fast_start && flags.allow_detach) {
    256 		x->addr = NULL;
    257 		x->ref = 0;
    258 		LIST_ADD(&free_list, x);
    259 		return (x);
    260 	}
    261 	x->addr = mmap(0, x->len, prot, MAP_SHARED, x->fd, x->off);
    262 	x->ref = 0;
    263 
    264 	for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED;
    265 	    q = q->prev) {
    266 		hasfailed = 1;
    267 		if (x->len != q->len)
    268 			continue;
    269 		if (q->addr != NULL && q->ref == 0) {
    270 			if (munmap(q->addr, q->len) == -1) {
    271 				pperror("munmap(%lx, %lx)",
    272 				    (ulong_t)q->addr, q->len);
    273 			} else {
    274 				q->addr = NULL;
    275 			}
    276 			x->addr = mmap(0, x->len, prot,
    277 			    MAP_SHARED, x->fd, x->off);
    278 		}
    279 	}
    280 	for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED;
    281 	    q = q->prev) {
    282 		if (q->addr != NULL && q->ref == 0) {
    283 			if (munmap(q->addr, q->len) == -1) {
    284 				pperror("munmap(%lx, %lx)",
    285 				    (ulong_t)q->addr, q->len);
    286 			} else {
    287 				q->addr = NULL;
    288 			}
    289 			x->addr = mmap(0, x->len, prot,
    290 			    MAP_SHARED, x->fd, x->off);
    291 		}
    292 	}
    293 
    294 	if (x->addr == MAP_FAILED) {
    295 		(void) close(x->fd);
    296 		free(x);
    297 		return (NULL);
    298 	}
    299 	if (!use_dev_zero) {
    300 		/*
    301 		 * Be ready to take SIGBUS if there is no enough free
    302 		 * memory to allocate this page and then be able to
    303 		 * report a sensible failure message.
    304 		 */
    305 		void *sig = expect_signal(SIGBUS, "memset", x->addr, a *b);
    306 		(void) memset(x->addr, NULL, a * b);
    307 		cancel_expected_signal(SIGBUS, sig);
    308 		if (x->flags.always_detach || !flags.leave_attached) {
    309 			if (munmap(x->addr, x->len) == -1)
    310 				pperror("munmap(%lx, %lx)",
    311 				    (ulong_t)x->addr, x->len);
    312 			else
    313 				x->addr = NULL;
    314 		}
    315 	} else {
    316 		if (close(x->fd) != -1)
    317 			x->fd = -1;
    318 	}
    319 	LIST_ADD(&free_list, x);
    320 	assert(x->addr != MAP_FAILED);
    321 	return (x);
    322 }
    323 void *
    324 mmap_attach(void * arg)
    325 {
    326 	struct mmap_shm *x = (struct mmap_shm *)arg;
    327 
    328 	if (x->addr == NULL) {
    329 		struct mmap_shm *q;
    330 		void *hint = NULL; /* may get passed in as an arg in future */
    331 		int prot =  PROT_READ |
    332 		    (x->flags.write ? PROT_WRITE : 0) |
    333 		    (x->flags.execute ? PROT_EXEC : 0);
    334 		x->addr = mmap(hint, x->len, prot, MAP_SHARED,
    335 		    x->fd, x->off);
    336 		for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED;
    337 		    q = q->prev) {
    338 			hasfailed = 1;
    339 			if (x->len != q->len)
    340 				continue;
    341 			if (q != x && q->addr != NULL) {
    342 				if (munmap(q->addr, q->len) == -1) {
    343 					pperror("munmap(%lx, %lx)",
    344 					    (ulong_t)q->addr, q->len);
    345 				} else {
    346 					hint = q->addr;
    347 					q->addr = NULL;
    348 				}
    349 				x->addr = mmap(hint, x->len, prot,
    350 				    MAP_SHARED, x->fd, x->off);
    351 			}
    352 		}
    353 		for (q = free_list.tail; q != NULL && x->addr == MAP_FAILED;
    354 		    q = q->prev) {
    355 			if (q != x && q->addr != NULL) {
    356 				if (munmap(q->addr, q->len) == -1) {
    357 					pperror("munmap(%lx, %lx)",
    358 					    (ulong_t)q->addr, q->len);
    359 				} else {
    360 					hint = q->addr;
    361 					q->addr = NULL;
    362 				}
    363 				x->addr = mmap(hint, x->len, prot,
    364 				    MAP_SHARED, x->fd, x->off);
    365 			}
    366 		}
    367 		if (x->addr == MAP_FAILED) {
    368 			x->addr = NULL;
    369 		}
    370 	}
    371 	if (x->addr != NULL) {
    372 		if (x->ref++ == 0) {
    373 			LIST_REMOVE(&free_list, x);
    374 			LIST_ADD(&attach_list, x);
    375 		}
    376 	}
    377 	assert(x->addr != MAP_FAILED);
    378 	return (x->addr);
    379 }
    380 shm_det_status
    381 mmap_detach(void *arg)
    382 {
    383 	struct mmap_shm *x = (struct mmap_shm *)arg;
    384 
    385 	if (x->flags.allow_detach) {
    386 		if (--(x->ref) == 0) {
    387 			LIST_REMOVE(&attach_list, x);
    388 			LIST_ADD(&free_list, x);
    389 			if (x->flags.always_detach) {
    390 				if (munmap(x->addr, x->len) != -1) {
    391 					x->addr = NULL;
    392 					return (SHM_DETACH_DONE);
    393 				} else {
    394 					return (SHM_DETACH_ERR);
    395 				}
    396 			}
    397 		}
    398 	}
    399 	return (SHM_DETACH_OK);
    400 }
    401 void
    402 mmap_destroy(void *arg)
    403 {
    404 	struct mmap_shm *x = (struct mmap_shm *)arg;
    405 
    406 	if (--(x->users) != 0)
    407 		return;
    408 	if (x->addr) {
    409 		if (munmap(x->addr, x->len) == -1)
    410 			x->addr = NULL;
    411 	}
    412 /*
    413  *
    414  *	if (x->fd) {
    415  *		(void) close(x->fd);
    416  *	}
    417  */
    418 	free(x);
    419 }
    420 ulong_t
    421 mmap_max_size()
    422 {
    423 	return (opts.expert_mmap_max_size);
    424 }
    425 /*ARGSUSED*/
    426 const char *
    427 mmap_name(void *x)
    428 {
    429 	static char name[] = "mmap_no_frag";
    430 	return (&name[0]);
    431 }
    432 static int
    433 mmap_is_short_of_mem(void)
    434 {
    435 	return (hasfailed);
    436 }
    437 /*ARGSUSED*/
    438 static void
    439 mmap_complete(void * handle)
    440 {
    441 	if (fd.off != fd.trunc && fd.fd != -1) {
    442 		if (ftruncate(fd.fd, fd.off) == -1) {
    443 			pperror("ftruncate(%d, %llX)", fd.fd,
    444 			    (long long)fd.trunc);
    445 		} else {
    446 			plog(LOG_DEBUG, "ftruncate(%d, %llX)\n",
    447 			    fd.fd, (long long) fd.trunc);
    448 		}
    449 	}
    450 }
    451 
    452 void *
    453 frag_init(long a, long b, shm_flags flags)
    454 {
    455 	struct frag *f;
    456 	struct free_frags *ff;
    457 	long len = a * b;
    458 	int pagesz = sysconf(_SC_PAGESIZE);
    459 
    460 	len = ROUND_UP(len, sizeof (long long));
    461 
    462 	if (pagesz > len) {
    463 		for (ff = &free_frag; ff != NULL && ff->frag != NULL;
    464 		    ff = ff->next) {
    465 			if (len <= ff->len && memcmp(&flags,
    466 			    &ff->frag->base->flags, sizeof (flags))) {
    467 				/* Bingo */
    468 				if (ff->len - len < sizeof (off64_t)) {
    469 					if (ff->prev != NULL) {
    470 						ff->prev->next = ff->next;
    471 					}
    472 					if (ff->next != NULL) {
    473 						ff->next->prev = ff->prev;
    474 					}
    475 					f = ff->frag;
    476 					if (ff == &free_frag) {
    477 						ff->frag = NULL;
    478 						ff->prev = NULL;
    479 						ff->next = NULL;
    480 					} else {
    481 						free(ff);
    482 					}
    483 				} else {
    484 					if ((f = my_alloc(1,
    485 					    sizeof (struct frag))) == NULL) {
    486 						return (NULL);
    487 					}
    488 					*f = *ff->frag;
    489 					ff->len -= len;
    490 					ff->frag->off += len;
    491 				}
    492 				f->base->users++;
    493 				return (f);
    494 			}
    495 		}
    496 		/* nothing suitable */
    497 		if ((f = my_alloc(1, sizeof (struct frag))) == NULL) {
    498 			return (NULL);
    499 		}
    500 		if ((f->base = mmap_init(1, pagesz, flags)) == NULL) {
    501 			free(f);
    502 			return (NULL);
    503 		}
    504 		f->off = 0;
    505 		if ((pagesz - len) >= sizeof (long long)) {
    506 			if (free_frag.frag == NULL) {
    507 				ff = &free_frag;
    508 				if ((ff->frag = my_alloc(1,
    509 					sizeof (struct frag))) == NULL) {
    510 					return (NULL);
    511 				}
    512 			} else if ((ff = my_alloc(1,
    513 					sizeof (struct free_frags))) == NULL) {
    514 				return (NULL);
    515 			} else {
    516 				if ((ff->frag = my_alloc(1,
    517 					sizeof (struct frag))) == NULL) {
    518 					free(ff);
    519 					return (NULL);
    520 				}
    521 				ff->next = free_frag.next;
    522 				if (free_frag.next != NULL) {
    523 					free_frag.next->prev = ff;
    524 				}
    525 				ff->prev = &free_frag;
    526 				free_frag.next = ff;
    527 			}
    528 			ff->frag->base = f->base;
    529 			ff->frag->off = len;
    530 			ff->len = pagesz - len;
    531 		}
    532 	} else {
    533 		if ((f = my_alloc(1, sizeof (struct frag))) == NULL) {
    534 			return (NULL);
    535 		}
    536 		if ((f->base = mmap_init(1, len, flags)) == NULL) {
    537 			free(f);
    538 			return (NULL);
    539 		}
    540 		f->off = 0;
    541 	}
    542 	f->base->users++;
    543 	return (f);
    544 }
    545 
    546 static void *
    547 frag_attach(void *arg)
    548 {
    549 	struct frag *f = (struct frag *)arg;
    550 	char *addr;
    551 
    552 	if ((addr = mmap_attach(f->base)) == NULL) {
    553 		return (NULL);
    554 	} else {
    555 		return ((void *)(addr + f->off));
    556 	}
    557 }
    558 
    559 static shm_det_status
    560 frag_detach(void *arg)
    561 {
    562 	struct frag *f = (struct frag *)arg;
    563 	return (mmap_detach(f->base));
    564 }
    565 
    566 /*ARGSUSED*/
    567 static const char *
    568 frag_name(void *arg)
    569 {
    570 	static char name[] = "mmap";
    571 	return (&name[0]);
    572 }
    573 
    574 static void
    575 frag_destroy(void *arg)
    576 {
    577 	struct frag *f = (struct frag *)arg;
    578 	mmap_destroy(f->base);
    579 }
    580 
    581 static void
    582 mmap_fini(void)
    583 {
    584 }
    585 
    586 
/*
 * Operations table for mmap segments without fragment sharing: every
 * segment is created directly by mmap_init().
 *
 * The initializer is positional, so the order must match the members of
 * struct shm_ops, which is declared outside this file.  mmap_name fills
 * the first two slots.
 */
struct shm_ops shm_mmap_no_frag_ops = {
	mmap_name,
	mmap_name,
	mmap_init,
	mmap_attach,
	mmap_detach,
	mmap_destroy,
	mmap_max_size,
	mmap_is_short_of_mem,
	mmap_complete,
	mmap_fini
	/* mmap_garbage_collect */
};
/*
 * Operations table for mmap segments with fragment sharing: handles are
 * struct frag pointers created by frag_init(), which lets several small
 * segments share one page.  The size, short of memory, complete and fini
 * entries are the same functions as in shm_mmap_no_frag_ops.
 *
 * The initializer is positional, so the order must match the members of
 * struct shm_ops, which is declared outside this file.  frag_name fills
 * the first two slots.
 */
struct shm_ops shm_mmap_ops = {
	frag_name,
	frag_name,
	frag_init,
	frag_attach,
	frag_detach,
	frag_destroy,
	mmap_max_size,
	mmap_is_short_of_mem,
	mmap_complete,
	mmap_fini
	/* mmap_garbage_collect */
};
    613