Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	All Rights Reserved */
     29 
     30 #include <sys/types.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/errno.h>
     35 #include <sys/signal.h>
     36 #include <sys/cred.h>
     37 #include <sys/user.h>
     38 #include <sys/conf.h>
     39 #include <sys/vfs.h>
     40 #include <sys/vnode.h>
     41 #include <sys/pathname.h>
     42 #include <sys/file.h>
     43 #include <sys/proc.h>
     44 #include <sys/var.h>
     45 #include <sys/cpuvar.h>
     46 #include <sys/open.h>
     47 #include <sys/cmn_err.h>
     48 #include <sys/priocntl.h>
     49 #include <sys/procset.h>
     50 #include <sys/prsystm.h>
     51 #include <sys/debug.h>
     52 #include <sys/kmem.h>
     53 #include <sys/atomic.h>
     54 #include <sys/fcntl.h>
     55 #include <sys/poll.h>
     56 #include <sys/rctl.h>
     57 #include <sys/port_impl.h>
     58 
     59 #include <c2/audit.h>
     60 #include <sys/nbmlock.h>
     61 
#ifdef DEBUG

/*
 * Debug-only statistics about the per-thread active-fd arrays.
 * MAXFD() keeps a running maximum in afd_maxfd using a plain (non-atomic)
 * read and store; COUNT() bumps the named counter with atomic_add_32().
 */
static uint32_t afd_maxfd;	/* # of entries in maximum allocated array */
static uint32_t afd_alloc;	/* count of kmem_alloc()s */
static uint32_t afd_free;	/* count of kmem_free()s */
static uint32_t afd_wait;	/* count of waits on non-zero ref count */
#define	MAXFD(x)	(afd_maxfd = ((afd_maxfd >= (x))? afd_maxfd : (x)))
#define	COUNT(x)	atomic_add_32(&x, 1)

#else	/* DEBUG */

/* Without DEBUG both macros expand to nothing, so call sites cost nothing. */
#define	MAXFD(x)
#define	COUNT(x)

#endif	/* DEBUG */
     77 
/*
 * NOTE(review): presumably the kmem cache that file structures come from;
 * it is not initialized or used in this part of the file -- confirm.
 */
kmem_cache_t *file_cache;
/* Forward declaration; the definition is not in this part of the file. */
static int vpsetattr(vnode_t *, vattr_t *, int);

/*
 * Forward declaration; closeandsetf() calls this with the uf_portfd
 * pointer it removed from the slot being closed.
 */
static void port_close_fd(portfd_t *);
     82 
     83 /*
     84  * File descriptor allocation.
     85  *
     86  * fd_find(fip, minfd) finds the first available descriptor >= minfd.
     87  * The most common case is open(2), in which minfd = 0, but we must also
     88  * support fcntl(fd, F_DUPFD, minfd).
     89  *
     90  * The algorithm is as follows: we keep all file descriptors in an infix
     91  * binary tree in which each node records the number of descriptors
     92  * allocated in its right subtree, including itself.  Starting at minfd,
     93  * we ascend the tree until we find a non-fully allocated right subtree.
     94  * We then descend that subtree in a binary search for the smallest fd.
     95  * Finally, we ascend the tree again to increment the allocation count
     96  * of every subtree containing the newly-allocated fd.  Freeing an fd
     97  * requires only the last step: we ascend the tree to decrement allocation
     98  * counts.  Each of these three steps (ascent to find non-full subtree,
     99  * descent to find lowest fd, ascent to update allocation counts) is
    100  * O(log n), thus the algorithm as a whole is O(log n).
    101  *
    102  * We don't implement the fd tree using the customary left/right/parent
    103  * pointers, but instead take advantage of the glorious mathematics of
    104  * full infix binary trees.  For reference, here's an illustration of the
    105  * logical structure of such a tree, rooted at 4 (binary 100), covering
    106  * the range 1-7 (binary 001-111).  Our canonical trees do not include
    107  * fd 0; we'll deal with that later.
    108  *
    109  *	      100
    110  *	     /	 \
    111  *	    /	  \
    112  *	  010	  110
    113  *	  / \	  / \
    114  *	001 011 101 111
    115  *
    116  * We make the following observations, all of which are easily proven by
    117  * induction on the depth of the tree:
    118  *
 * (T1) The bit position of the least-significant set bit (LSB) of any node,
 *      counting from zero, is equal to its level in the tree.  In our
 *      example, nodes 001, 011, 101 and 111 are at level 0; nodes 010 and
 *      110 are at level 1; and node 100 is at level 2.
    122  *
    123  * (T2) The child size (CSIZE) of node N -- that is, the total number of
    124  *	right-branch descendants in a child of node N, including itself -- is
    125  *	given by clearing all but the least significant bit of N.  This
    126  *	follows immediately from (T1).  Applying this rule to our example, we
    127  *	see that CSIZE(100) = 100, CSIZE(x10) = 10, and CSIZE(xx1) = 1.
    128  *
    129  * (T3) The nearest left ancestor (LPARENT) of node N -- that is, the nearest
    130  *	ancestor containing node N in its right child -- is given by clearing
    131  *	the LSB of N.  For example, LPARENT(111) = 110 and LPARENT(110) = 100.
    132  *	Clearing the LSB of nodes 001, 010 or 100 yields zero, reflecting
    133  *	the fact that these are leftmost nodes.  Note that this algorithm
    134  *	automatically skips generations as necessary.  For example, the parent
    135  *      of node 101 is 110, which is a *right* ancestor (not what we want);
    136  *      but its grandparent is 100, which is a left ancestor. Clearing the LSB
    137  *      of 101 gets us to 100 directly, skipping right past the uninteresting
    138  *      generation (110).
    139  *
    140  *      Note that since LPARENT clears the LSB, whereas CSIZE clears all *but*
    141  *	the LSB, we can express LPARENT() nicely in terms of CSIZE():
    142  *
    143  *	LPARENT(N) = N - CSIZE(N)
    144  *
    145  * (T4) The nearest right ancestor (RPARENT) of node N is given by:
    146  *
    147  *	RPARENT(N) = N + CSIZE(N)
    148  *
    149  * (T5) For every interior node, the children differ from their parent by
    150  *	CSIZE(parent) / 2.  In our example, CSIZE(100) / 2 = 2 = 10 binary,
    151  *      and indeed, the children of 100 are 100 +/- 10 = 010 and 110.
    152  *
    153  * Next, we'll need a few two's-complement math tricks.  Suppose a number,
    154  * N, has the following form:
    155  *
    156  *		N = xxxx10...0
    157  *
    158  * That is, the binary representation of N consists of some string of bits,
    159  * then a 1, then all zeroes.  This amounts to nothing more than saying that
    160  * N has a least-significant bit, which is true for any N != 0.  If we look
    161  * at N and N - 1 together, we see that we can combine them in useful ways:
    162  *
    163  *		  N = xxxx10...0
    164  *	      N - 1 = xxxx01...1
    165  *	------------------------
    166  *	N & (N - 1) = xxxx000000
    167  *	N | (N - 1) = xxxx111111
    168  *	N ^ (N - 1) =     111111
    169  *
    170  * In particular, this suggests several easy ways to clear all but the LSB,
    171  * which by (T2) is exactly what we need to determine CSIZE(N) = 10...0.
    172  * We'll opt for this formulation:
    173  *
    174  *	(C1) CSIZE(N) = (N - 1) ^ (N | (N - 1))
    175  *
    176  * Similarly, we have an easy way to determine LPARENT(N), which requires
    177  * that we clear the LSB of N:
    178  *
    179  *	(L1) LPARENT(N) = N & (N - 1)
    180  *
    181  * We note in the above relations that (N | (N - 1)) - N = CSIZE(N) - 1.
    182  * When combined with (T4), this yields an easy way to compute RPARENT(N):
    183  *
    184  *	(R1) RPARENT(N) = (N | (N - 1)) + 1
    185  *
    186  * Finally, to accommodate fd 0 we must adjust all of our results by +/-1 to
    187  * move the fd range from [1, 2^n) to [0, 2^n - 1).  This is straightforward,
    188  * so there's no need to belabor the algebra; the revised relations become:
    189  *
    190  *	(C1a) CSIZE(N) = N ^ (N | (N + 1))
    191  *
    192  *	(L1a) LPARENT(N) = (N & (N + 1)) - 1
    193  *
    194  *	(R1a) RPARENT(N) = N | (N + 1)
    195  *
    196  * This completes the mathematical framework.  We now have all the tools
    197  * we need to implement fd_find() and fd_reserve().
    198  *
    199  * fd_find(fip, minfd) finds the smallest available file descriptor >= minfd.
    200  * It does not actually allocate the descriptor; that's done by fd_reserve().
    201  * fd_find() proceeds in two steps:
    202  *
    203  * (1) Find the leftmost subtree that contains a descriptor >= minfd.
    204  *     We start at the right subtree rooted at minfd.  If this subtree is
    205  *     not full -- if fip->fi_list[minfd].uf_alloc != CSIZE(minfd) -- then
    206  *     step 1 is done.  Otherwise, we know that all fds in this subtree
    207  *     are taken, so we ascend to RPARENT(minfd) using (R1a).  We repeat
    208  *     this process until we either find a candidate subtree or exceed
    209  *     fip->fi_nfiles.  We use (C1a) to compute CSIZE().
    210  *
    211  * (2) Find the smallest fd in the subtree discovered by step 1.
    212  *     Starting at the root of this subtree, we descend to find the
    213  *     smallest available fd.  Since the left children have the smaller
    214  *     fds, we will descend rightward only when the left child is full.
    215  *
    216  *     We begin by comparing the number of allocated fds in the root
    217  *     to the number of allocated fds in its right child; if they differ
    218  *     by exactly CSIZE(child), we know the left subtree is full, so we
    219  *     descend right; that is, the right child becomes the search root.
    220  *     Otherwise we leave the root alone and start following the right
    221  *     child's left children.  As fortune would have it, this is very
    222  *     simple computationally: by (T5), the right child of fd is just
    223  *     fd + size, where size = CSIZE(fd) / 2.  Applying (T5) again,
    224  *     we find that the right child's left child is fd + size - (size / 2) =
    225  *     fd + (size / 2); *its* left child is fd + (size / 2) - (size / 4) =
    226  *     fd + (size / 4), and so on.  In general, fd's right child's
    227  *     leftmost nth descendant is fd + (size >> n).  Thus, to follow
    228  *     the right child's left descendants, we just halve the size in
    229  *     each iteration of the search.
    230  *
    231  *     When we descend leftward, we must keep track of the number of fds
    232  *     that were allocated in all the right subtrees we rejected, so we
    233  *     know how many of the root fd's allocations are in the remaining
    234  *     (as yet unexplored) leftmost part of its right subtree.  When we
    235  *     encounter a fully-allocated left child -- that is, when we find
    236  *     that fip->fi_list[fd].uf_alloc == ralloc + size -- we descend right
    237  *     (as described earlier), resetting ralloc to zero.
    238  *
    239  * fd_reserve(fip, fd, incr) either allocates or frees fd, depending
    240  * on whether incr is 1 or -1.  Starting at fd, fd_reserve() ascends
    241  * the leftmost ancestors (see (T3)) and updates the allocation counts.
    242  * At each step we use (L1a) to compute LPARENT(), the next left ancestor.
    243  *
    244  * flist_minsize() finds the minimal tree that still covers all
    245  * used fds; as long as the allocation count of a root node is zero, we
    246  * don't need that node or its right subtree.
    247  *
    248  * flist_nalloc() counts the number of allocated fds in the tree, by starting
    249  * at the top of the tree and summing the right-subtree allocation counts as
    250  * it descends leftwards.
    251  *
    252  * Note: we assume that flist_grow() will keep fip->fi_nfiles of the form
    253  * 2^n - 1.  This ensures that the fd trees are always full, which saves
    254  * quite a bit of boundary checking.
    255  */
    256 static int
    257 fd_find(uf_info_t *fip, int minfd)
    258 {
    259 	int size, ralloc, fd;
    260 
    261 	ASSERT(MUTEX_HELD(&fip->fi_lock));
    262 	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);
    263 
    264 	for (fd = minfd; (uint_t)fd < fip->fi_nfiles; fd |= fd + 1) {
    265 		size = fd ^ (fd | (fd + 1));
    266 		if (fip->fi_list[fd].uf_alloc == size)
    267 			continue;
    268 		for (ralloc = 0, size >>= 1; size != 0; size >>= 1) {
    269 			ralloc += fip->fi_list[fd + size].uf_alloc;
    270 			if (fip->fi_list[fd].uf_alloc == ralloc + size) {
    271 				fd += size;
    272 				ralloc = 0;
    273 			}
    274 		}
    275 		return (fd);
    276 	}
    277 	return (-1);
    278 }
    279 
    280 static void
    281 fd_reserve(uf_info_t *fip, int fd, int incr)
    282 {
    283 	int pfd;
    284 	uf_entry_t *ufp = &fip->fi_list[fd];
    285 
    286 	ASSERT((uint_t)fd < fip->fi_nfiles);
    287 	ASSERT((ufp->uf_busy == 0 && incr == 1) ||
    288 	    (ufp->uf_busy == 1 && incr == -1));
    289 	ASSERT(MUTEX_HELD(&ufp->uf_lock));
    290 	ASSERT(MUTEX_HELD(&fip->fi_lock));
    291 
    292 	for (pfd = fd; pfd >= 0; pfd = (pfd & (pfd + 1)) - 1)
    293 		fip->fi_list[pfd].uf_alloc += incr;
    294 
    295 	ufp->uf_busy += incr;
    296 }
    297 
    298 static int
    299 flist_minsize(uf_info_t *fip)
    300 {
    301 	int fd;
    302 
    303 	/*
    304 	 * We'd like to ASSERT(MUTEX_HELD(&fip->fi_lock)), but we're called
    305 	 * by flist_fork(), which relies on other mechanisms for mutual
    306 	 * exclusion.
    307 	 */
    308 	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);
    309 
    310 	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
    311 		if (fip->fi_list[fd >> 1].uf_alloc != 0)
    312 			break;
    313 
    314 	return (fd);
    315 }
    316 
    317 static int
    318 flist_nalloc(uf_info_t *fip)
    319 {
    320 	int fd;
    321 	int nalloc = 0;
    322 
    323 	ASSERT(MUTEX_HELD(&fip->fi_lock));
    324 	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);
    325 
    326 	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
    327 		nalloc += fip->fi_list[fd >> 1].uf_alloc;
    328 
    329 	return (nalloc);
    330 }
    331 
    332 /*
    333  * Increase size of the fi_list array to accommodate at least maxfd.
    334  * We keep the size of the form 2^n - 1 for benefit of fd_find().
    335  */
    336 static void
    337 flist_grow(int maxfd)
    338 {
    339 	uf_info_t *fip = P_FINFO(curproc);
    340 	int newcnt, oldcnt;
    341 	uf_entry_t *src, *dst, *newlist, *oldlist, *newend, *oldend;
    342 	uf_rlist_t *urp;
    343 
    344 	for (newcnt = 1; newcnt <= maxfd; newcnt = (newcnt << 1) | 1)
    345 		continue;
    346 
    347 	newlist = kmem_zalloc(newcnt * sizeof (uf_entry_t), KM_SLEEP);
    348 
    349 	mutex_enter(&fip->fi_lock);
    350 	oldcnt = fip->fi_nfiles;
    351 	if (newcnt <= oldcnt) {
    352 		mutex_exit(&fip->fi_lock);
    353 		kmem_free(newlist, newcnt * sizeof (uf_entry_t));
    354 		return;
    355 	}
    356 	ASSERT((newcnt & (newcnt + 1)) == 0);
    357 	oldlist = fip->fi_list;
    358 	oldend = oldlist + oldcnt;
    359 	newend = newlist + oldcnt;	/* no need to lock beyond old end */
    360 
    361 	/*
    362 	 * fi_list and fi_nfiles cannot change while any uf_lock is held,
    363 	 * so we must grab all the old locks *and* the new locks up to oldcnt.
    364 	 * (Locks beyond the end of oldcnt aren't visible until we store
    365 	 * the new fi_nfiles, which is the last thing we do before dropping
    366 	 * all the locks, so there's no need to acquire these locks).
    367 	 * Holding the new locks is necessary because when fi_list changes
    368 	 * to point to the new list, fi_nfiles won't have been stored yet.
    369 	 * If we *didn't* hold the new locks, someone doing a UF_ENTER()
    370 	 * could see the new fi_list, grab the new uf_lock, and then see
    371 	 * fi_nfiles change while the lock is held -- in violation of
    372 	 * UF_ENTER() semantics.
    373 	 */
    374 	for (src = oldlist; src < oldend; src++)
    375 		mutex_enter(&src->uf_lock);
    376 
    377 	for (dst = newlist; dst < newend; dst++)
    378 		mutex_enter(&dst->uf_lock);
    379 
    380 	for (src = oldlist, dst = newlist; src < oldend; src++, dst++) {
    381 		dst->uf_file = src->uf_file;
    382 		dst->uf_fpollinfo = src->uf_fpollinfo;
    383 		dst->uf_refcnt = src->uf_refcnt;
    384 		dst->uf_alloc = src->uf_alloc;
    385 		dst->uf_flag = src->uf_flag;
    386 		dst->uf_busy = src->uf_busy;
    387 		dst->uf_portfd = src->uf_portfd;
    388 	}
    389 
    390 	/*
    391 	 * As soon as we store the new flist, future locking operations
    392 	 * will use it.  Therefore, we must ensure that all the state
    393 	 * we've just established reaches global visibility before the
    394 	 * new flist does.
    395 	 */
    396 	membar_producer();
    397 	fip->fi_list = newlist;
    398 
    399 	/*
    400 	 * Routines like getf() make an optimistic check on the validity
    401 	 * of the supplied file descriptor: if it's less than the current
    402 	 * value of fi_nfiles -- examined without any locks -- then it's
    403 	 * safe to attempt a UF_ENTER() on that fd (which is a valid
    404 	 * assumption because fi_nfiles only increases).  Therefore, it
    405 	 * is critical that the new value of fi_nfiles not reach global
    406 	 * visibility until after the new fi_list: if it happened the
    407 	 * other way around, getf() could see the new fi_nfiles and attempt
    408 	 * a UF_ENTER() on the old fi_list, which would write beyond its
    409 	 * end if the fd exceeded the old fi_nfiles.
    410 	 */
    411 	membar_producer();
    412 	fip->fi_nfiles = newcnt;
    413 
    414 	/*
    415 	 * The new state is consistent now, so we can drop all the locks.
    416 	 */
    417 	for (dst = newlist; dst < newend; dst++)
    418 		mutex_exit(&dst->uf_lock);
    419 
    420 	for (src = oldlist; src < oldend; src++) {
    421 		/*
    422 		 * If any threads are blocked on the old cvs, wake them.
    423 		 * This will force them to wake up, discover that fi_list
    424 		 * has changed, and go back to sleep on the new cvs.
    425 		 */
    426 		cv_broadcast(&src->uf_wanted_cv);
    427 		cv_broadcast(&src->uf_closing_cv);
    428 		mutex_exit(&src->uf_lock);
    429 	}
    430 
    431 	mutex_exit(&fip->fi_lock);
    432 
    433 	/*
    434 	 * Retire the old flist.  We can't actually kmem_free() it now
    435 	 * because someone may still have a pointer to it.  Instead,
    436 	 * we link it onto a list of retired flists.  The new flist
    437 	 * is at least double the size of the previous flist, so the
    438 	 * total size of all retired flists will be less than the size
    439 	 * of the current one (to prove, consider the sum of a geometric
    440 	 * series in powers of 2).  exit() frees the retired flists.
    441 	 */
    442 	urp = kmem_zalloc(sizeof (uf_rlist_t), KM_SLEEP);
    443 	urp->ur_list = oldlist;
    444 	urp->ur_nfiles = oldcnt;
    445 
    446 	mutex_enter(&fip->fi_lock);
    447 	urp->ur_next = fip->fi_rlist;
    448 	fip->fi_rlist = urp;
    449 	mutex_exit(&fip->fi_lock);
    450 }
    451 
    452 /*
    453  * Utility functions for keeping track of the active file descriptors.
    454  */
    455 void
    456 clear_stale_fd()		/* called from post_syscall() */
    457 {
    458 	afd_t *afd = &curthread->t_activefd;
    459 	int i;
    460 
    461 	/* uninitialized is ok here, a_nfd is then zero */
    462 	for (i = 0; i < afd->a_nfd; i++) {
    463 		/* assert that this should not be necessary */
    464 		ASSERT(afd->a_fd[i] == -1);
    465 		afd->a_fd[i] = -1;
    466 	}
    467 	afd->a_stale = 0;
    468 }
    469 
    470 void
    471 free_afd(afd_t *afd)		/* called below and from thread_free() */
    472 {
    473 	int i;
    474 
    475 	/* free the buffer if it was kmem_alloc()ed */
    476 	if (afd->a_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
    477 		COUNT(afd_free);
    478 		kmem_free(afd->a_fd, afd->a_nfd * sizeof (afd->a_fd[0]));
    479 	}
    480 
    481 	/* (re)initialize the structure */
    482 	afd->a_fd = &afd->a_buf[0];
    483 	afd->a_nfd = sizeof (afd->a_buf) / sizeof (afd->a_buf[0]);
    484 	afd->a_stale = 0;
    485 	for (i = 0; i < afd->a_nfd; i++)
    486 		afd->a_fd[i] = -1;
    487 }
    488 
    489 static void
    490 set_active_fd(int fd)
    491 {
    492 	afd_t *afd = &curthread->t_activefd;
    493 	int i;
    494 	int *old_fd;
    495 	int old_nfd;
    496 	int *new_fd;
    497 	int new_nfd;
    498 
    499 	if (afd->a_nfd == 0) {	/* first time initialization */
    500 		ASSERT(fd == -1);
    501 		mutex_enter(&afd->a_fdlock);
    502 		free_afd(afd);
    503 		mutex_exit(&afd->a_fdlock);
    504 	}
    505 
    506 	/* insert fd into vacant slot, if any */
    507 	for (i = 0; i < afd->a_nfd; i++) {
    508 		if (afd->a_fd[i] == -1) {
    509 			afd->a_fd[i] = fd;
    510 			return;
    511 		}
    512 	}
    513 
    514 	/*
    515 	 * Reallocate the a_fd[] array to add one more slot.
    516 	 */
    517 	ASSERT(fd == -1);
    518 	old_nfd = afd->a_nfd;
    519 	old_fd = afd->a_fd;
    520 	new_nfd = old_nfd + 1;
    521 	new_fd = kmem_alloc(new_nfd * sizeof (afd->a_fd[0]), KM_SLEEP);
    522 	MAXFD(new_nfd);
    523 	COUNT(afd_alloc);
    524 
    525 	mutex_enter(&afd->a_fdlock);
    526 	afd->a_fd = new_fd;
    527 	afd->a_nfd = new_nfd;
    528 	for (i = 0; i < old_nfd; i++)
    529 		afd->a_fd[i] = old_fd[i];
    530 	afd->a_fd[i] = fd;
    531 	mutex_exit(&afd->a_fdlock);
    532 
    533 	if (old_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
    534 		COUNT(afd_free);
    535 		kmem_free(old_fd, old_nfd * sizeof (afd->a_fd[0]));
    536 	}
    537 }
    538 
    539 void
    540 clear_active_fd(int fd)		/* called below and from aio.c */
    541 {
    542 	afd_t *afd = &curthread->t_activefd;
    543 	int i;
    544 
    545 	for (i = 0; i < afd->a_nfd; i++) {
    546 		if (afd->a_fd[i] == fd) {
    547 			afd->a_fd[i] = -1;
    548 			break;
    549 		}
    550 	}
    551 	ASSERT(i < afd->a_nfd);		/* not found is not ok */
    552 }
    553 
    554 /*
    555  * Does this thread have this fd active?
    556  */
    557 static int
    558 is_active_fd(kthread_t *t, int fd)
    559 {
    560 	afd_t *afd = &t->t_activefd;
    561 	int i;
    562 
    563 	ASSERT(t != curthread);
    564 	mutex_enter(&afd->a_fdlock);
    565 	/* uninitialized is ok here, a_nfd is then zero */
    566 	for (i = 0; i < afd->a_nfd; i++) {
    567 		if (afd->a_fd[i] == fd) {
    568 			mutex_exit(&afd->a_fdlock);
    569 			return (1);
    570 		}
    571 	}
    572 	mutex_exit(&afd->a_fdlock);
    573 	return (0);
    574 }
    575 
    576 /*
    577  * Convert a user supplied file descriptor into a pointer to a file
    578  * structure.  Only task is to check range of the descriptor (soft
    579  * resource limit was enforced at open time and shouldn't be checked
    580  * here).
    581  */
    582 file_t *
    583 getf(int fd)
    584 {
    585 	uf_info_t *fip = P_FINFO(curproc);
    586 	uf_entry_t *ufp;
    587 	file_t *fp;
    588 
    589 	if ((uint_t)fd >= fip->fi_nfiles)
    590 		return (NULL);
    591 
    592 	/*
    593 	 * Reserve a slot in the active fd array now so we can call
    594 	 * set_active_fd(fd) for real below, while still inside UF_ENTER().
    595 	 */
    596 	set_active_fd(-1);
    597 
    598 	UF_ENTER(ufp, fip, fd);
    599 
    600 	if ((fp = ufp->uf_file) == NULL) {
    601 		UF_EXIT(ufp);
    602 
    603 		if (fd == fip->fi_badfd && fip->fi_action > 0)
    604 			tsignal(curthread, fip->fi_action);
    605 
    606 		return (NULL);
    607 	}
    608 	ufp->uf_refcnt++;
    609 
    610 	/*
    611 	 * archive per file audit data
    612 	 */
    613 	if (audit_active)
    614 		(void) audit_getf(fd);
    615 
    616 	set_active_fd(fd);	/* record the active file descriptor */
    617 
    618 	UF_EXIT(ufp);
    619 
    620 	return (fp);
    621 }
    622 
    623 /*
    624  * Close whatever file currently occupies the file descriptor slot
    625  * and install the new file, usually NULL, in the file descriptor slot.
    626  * The close must complete before we release the file descriptor slot.
    627  * If newfp != NULL we only return an error if we can't allocate the
    628  * slot so the caller knows that it needs to free the filep;
    629  * in the other cases we return the error number from closef().
    630  */
int
closeandsetf(int fd, file_t *newfp)
{
	proc_t *p = curproc;
	uf_info_t *fip = P_FINFO(p);
	uf_entry_t *ufp;
	file_t *fp;
	fpollinfo_t *fpip;
	portfd_t *pfd;
	int error;

	/*
	 * fd lies beyond the current table.  That is an error for a plain
	 * close; when installing a file, grow the table so the slot exists.
	 */
	if ((uint_t)fd >= fip->fi_nfiles) {
		if (newfp == NULL)
			return (EBADF);
		flist_grow(fd);
	}

	if (newfp != NULL) {
		/*
		 * If ufp is reserved but has no file pointer, it's in the
		 * transition between ufalloc() and setf().  We must wait
		 * for this transition to complete before assigning the
		 * new non-NULL file pointer.
		 */
		mutex_enter(&fip->fi_lock);
		/*
		 * Installing onto fi_badfd is refused; signal fi_action is
		 * posted to the calling thread if one is configured.
		 */
		if (fd == fip->fi_badfd) {
			mutex_exit(&fip->fi_lock);
			if (fip->fi_action > 0)
				tsignal(curthread, fip->fi_action);
			return (EBADF);
		}
		UF_ENTER(ufp, fip, fd);
		while (ufp->uf_busy && ufp->uf_file == NULL) {
			/*
			 * fi_lock is released before sleeping.  Afterwards
			 * uf_lock is dropped as well so that both can be
			 * retaken in fi_lock -> uf_lock order.
			 * NOTE(review): the meaning/units of the 250 passed
			 * to cv_wait_stop() are not visible here -- confirm.
			 */
			mutex_exit(&fip->fi_lock);
			cv_wait_stop(&ufp->uf_wanted_cv, &ufp->uf_lock, 250);
			UF_EXIT(ufp);
			mutex_enter(&fip->fi_lock);
			UF_ENTER(ufp, fip, fd);
		}
		if ((fp = ufp->uf_file) == NULL) {
			/*
			 * The slot is free: reserve it, install newfp and
			 * return.  There is nothing to close.
			 */
			ASSERT(ufp->uf_fpollinfo == NULL);
			ASSERT(ufp->uf_flag == 0);
			fd_reserve(fip, fd, 1);
			ufp->uf_file = newfp;
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (0);
		}
		/* A file is open on fd; continue below with uf_lock held. */
		mutex_exit(&fip->fi_lock);
	} else {
		/* Plain close: an empty slot is EBADF. */
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL) {
			UF_EXIT(ufp);
			return (EBADF);
		}
	}

	/*
	 * archive per file audit data
	 */
	if (audit_active)
		(void) audit_getf(fd);
	ASSERT(ufp->uf_busy);
	/*
	 * Detach the file from the slot.  uf_busy is left set, so the
	 * descriptor itself stays reserved while the old file is closed.
	 */
	ufp->uf_file = NULL;
	ufp->uf_flag = 0;

	/*
	 * If the file descriptor reference count is non-zero, then
	 * some other lwp in the process is performing system call
	 * activity on the file.  To avoid blocking here for a long
	 * time (the other lwp might be in a long term sleep in its
	 * system call), we scan all other lwps in the process to
	 * find the ones with this fd as one of their active fds,
	 * set their a_stale flag, and set them running if they
	 * are in an interruptible sleep so they will emerge from
	 * their system calls immediately.  post_syscall() will
	 * test the a_stale flag and set errno to EBADF.
	 */
	ASSERT(ufp->uf_refcnt == 0 || p->p_lwpcnt > 1);
	if (ufp->uf_refcnt > 0) {
		kthread_t *t;

		/*
		 * We call sprlock_proc(p) to ensure that the thread
		 * list will not change while we are scanning it.
		 * To do this, we must drop ufp->uf_lock and then
		 * reacquire it (so we are not holding both p->p_lock
		 * and ufp->uf_lock at the same time).  ufp->uf_lock
		 * must be held for is_active_fd() to be correct
		 * (set_active_fd() is called while holding ufp->uf_lock).
		 *
		 * This is a convoluted dance, but it is better than
		 * the old brute-force method of stopping every thread
		 * in the process by calling holdlwps(SHOLDFORK1).
		 */

		UF_EXIT(ufp);
		COUNT(afd_wait);

		mutex_enter(&p->p_lock);
		sprlock_proc(p);
		mutex_exit(&p->p_lock);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);

		/* Recheck: uf_refcnt may have changed while unlocked. */
		if (ufp->uf_refcnt > 0) {
			/* walk the t_forw ring of the other threads */
			for (t = curthread->t_forw;
			    t != curthread;
			    t = t->t_forw) {
				if (is_active_fd(t, fd)) {
					thread_lock(t);
					t->t_activefd.a_stale = 1;
					t->t_post_sys = 1;
					if (ISWAKEABLE(t))
						setrun_locked(t);
					thread_unlock(t);
				}
			}
		}

		UF_EXIT(ufp);

		/*
		 * NOTE(review): p_lock is taken here and never explicitly
		 * released in this function; presumably sprunlock() drops
		 * it -- confirm.
		 */
		mutex_enter(&p->p_lock);
		sprunlock(p);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);
	}

	/*
	 * Wait for other lwps to stop using this file descriptor.
	 */
	while (ufp->uf_refcnt > 0) {
		cv_wait_stop(&ufp->uf_closing_cv, &ufp->uf_lock, 250);
		/*
		 * cv_wait_stop() drops ufp->uf_lock, so the file list
		 * can change.  Drop the lock on our (possibly) stale
		 * ufp and let UF_ENTER() find and lock the current ufp.
		 */
		UF_EXIT(ufp);
		UF_ENTER(ufp, fip, fd);
	}

#ifdef DEBUG
	/*
	 * catch a watchfd on device's pollhead list but not on fpollinfo list
	 */
	if (ufp->uf_fpollinfo != NULL)
		checkwfdlist(fp->f_vnode, ufp->uf_fpollinfo);
#endif	/* DEBUG */

	/*
	 * We may need to cleanup some cached poll states in t_pollstate
	 * before the fd can be reused. It is important that we don't
	 * access a stale thread structure. We will do the cleanup in two
	 * phases to avoid deadlock and holding uf_lock for too long.
	 * In phase 1, hold the uf_lock and call pollblockexit() to set
	 * state in t_pollstate struct so that a thread does not exit on
	 * us. In phase 2, we drop the uf_lock and call pollcacheclean().
	 */
	/* Take the port and poll state out of the slot while it is locked. */
	pfd = ufp->uf_portfd;
	ufp->uf_portfd = NULL;
	fpip = ufp->uf_fpollinfo;
	ufp->uf_fpollinfo = NULL;
	if (fpip != NULL)
		pollblockexit(fpip);
	UF_EXIT(ufp);
	if (fpip != NULL)
		pollcacheclean(fpip, fd);
	if (pfd)
		port_close_fd(pfd);

	/*
	 * Keep the file descriptor entry reserved across the closef().
	 */
	error = closef(fp);

	/* Hand the slot to newfp, which is NULL for a plain close. */
	setf(fd, newfp);

	/* Only return closef() error when closing is all we do */
	return (newfp == NULL ? error : 0);
}
    814 
    815 /*
    816  * Decrement uf_refcnt; wakeup anyone waiting to close the file.
    817  */
    818 void
    819 releasef(int fd)
    820 {
    821 	uf_info_t *fip = P_FINFO(curproc);
    822 	uf_entry_t *ufp;
    823 
    824 	UF_ENTER(ufp, fip, fd);
    825 	ASSERT(ufp->uf_refcnt > 0);
    826 	clear_active_fd(fd);	/* clear the active file descriptor */
    827 	if (--ufp->uf_refcnt == 0)
    828 		cv_broadcast(&ufp->uf_closing_cv);
    829 	UF_EXIT(ufp);
    830 }
    831 
    832 /*
    833  * Identical to releasef() but can be called from another process.
    834  */
    835 void
    836 areleasef(int fd, uf_info_t *fip)
    837 {
    838 	uf_entry_t *ufp;
    839 
    840 	UF_ENTER(ufp, fip, fd);
    841 	ASSERT(ufp->uf_refcnt > 0);
    842 	if (--ufp->uf_refcnt == 0)
    843 		cv_broadcast(&ufp->uf_closing_cv);
    844 	UF_EXIT(ufp);
    845 }
    846 
    847 /*
    848  * Duplicate all file descriptors across a fork.
    849  */
    850 void
    851 flist_fork(uf_info_t *pfip, uf_info_t *cfip)
    852 {
    853 	int fd, nfiles;
    854 	uf_entry_t *pufp, *cufp;
    855 
    856 	mutex_init(&cfip->fi_lock, NULL, MUTEX_DEFAULT, NULL);
    857 	cfip->fi_rlist = NULL;
    858 
    859 	/*
    860 	 * We don't need to hold fi_lock because all other lwp's in the
    861 	 * parent have been held.
    862 	 */
    863 	cfip->fi_nfiles = nfiles = flist_minsize(pfip);
    864 
    865 	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
    866 
    867 	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
    868 	    fd++, pufp++, cufp++) {
    869 		cufp->uf_file = pufp->uf_file;
    870 		cufp->uf_alloc = pufp->uf_alloc;
    871 		cufp->uf_flag = pufp->uf_flag;
    872 		cufp->uf_busy = pufp->uf_busy;
    873 		if (pufp->uf_file == NULL) {
    874 			ASSERT(pufp->uf_flag == 0);
    875 			if (pufp->uf_busy) {
    876 				/*
    877 				 * Grab locks to appease ASSERTs in fd_reserve
    878 				 */
    879 				mutex_enter(&cfip->fi_lock);
    880 				mutex_enter(&cufp->uf_lock);
    881 				fd_reserve(cfip, fd, -1);
    882 				mutex_exit(&cufp->uf_lock);
    883 				mutex_exit(&cfip->fi_lock);
    884 			}
    885 		}
    886 	}
    887 }
    888 
    889 /*
    890  * Close all open file descriptors for the current process.
    891  * This is only called from exit(), which is single-threaded,
    892  * so we don't need any locking.
    893  */
    894 void
    895 closeall(uf_info_t *fip)
    896 {
    897 	int fd;
    898 	file_t *fp;
    899 	uf_entry_t *ufp;
    900 
    901 	ufp = fip->fi_list;
    902 	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
    903 		if ((fp = ufp->uf_file) != NULL) {
    904 			ufp->uf_file = NULL;
    905 			if (ufp->uf_portfd != NULL) {
    906 				portfd_t *pfd;
    907 				/* remove event port association */
    908 				pfd = ufp->uf_portfd;
    909 				ufp->uf_portfd = NULL;
    910 				port_close_fd(pfd);
    911 			}
    912 			ASSERT(ufp->uf_fpollinfo == NULL);
    913 			(void) closef(fp);
    914 		}
    915 	}
    916 
    917 	kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
    918 	fip->fi_list = NULL;
    919 	fip->fi_nfiles = 0;
    920 	while (fip->fi_rlist != NULL) {
    921 		uf_rlist_t *urp = fip->fi_rlist;
    922 		fip->fi_rlist = urp->ur_next;
    923 		kmem_free(urp->ur_list, urp->ur_nfiles * sizeof (uf_entry_t));
    924 		kmem_free(urp, sizeof (uf_rlist_t));
    925 	}
    926 }
    927 
    928 /*
    929  * Internal form of close.  Decrement reference count on file
    930  * structure.  Decrement reference count on the vnode following
    931  * removal of the referencing file structure.
    932  */
    933 int
    934 closef(file_t *fp)
    935 {
    936 	vnode_t *vp;
    937 	int error;
    938 	int count;
    939 	int flag;
    940 	offset_t offset;
    941 
    942 	/*
    943 	 * audit close of file (may be exit)
    944 	 */
    945 	if (audit_active)
    946 		audit_closef(fp);
    947 	ASSERT(MUTEX_NOT_HELD(&P_FINFO(curproc)->fi_lock));
    948 
    949 	mutex_enter(&fp->f_tlock);
    950 
    951 	ASSERT(fp->f_count > 0);
    952 
    953 	count = fp->f_count--;
    954 	flag = fp->f_flag;
    955 	offset = fp->f_offset;
    956 
    957 	vp = fp->f_vnode;
    958 
    959 	error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);
    960 
    961 	if (count > 1) {
    962 		mutex_exit(&fp->f_tlock);
    963 		return (error);
    964 	}
    965 	ASSERT(fp->f_count == 0);
    966 	mutex_exit(&fp->f_tlock);
    967 
    968 	VN_RELE(vp);
    969 	/*
    970 	 * deallocate resources to audit_data
    971 	 */
    972 	if (audit_active)
    973 		audit_unfalloc(fp);
    974 	crfree(fp->f_cred);
    975 	kmem_cache_free(file_cache, fp);
    976 	return (error);
    977 }
    978 
    979 /*
    980  * This is a combination of ufalloc() and setf().
    981  */
    982 int
    983 ufalloc_file(int start, file_t *fp)
    984 {
    985 	proc_t *p = curproc;
    986 	uf_info_t *fip = P_FINFO(p);
    987 	int filelimit;
    988 	uf_entry_t *ufp;
    989 	int nfiles;
    990 	int fd;
    991 
    992 	/*
    993 	 * Assertion is to convince the correctness of the following
    994 	 * assignment for filelimit after casting to int.
    995 	 */
    996 	ASSERT(p->p_fno_ctl <= INT_MAX);
    997 	filelimit = (int)p->p_fno_ctl;
    998 
    999 	for (;;) {
   1000 		mutex_enter(&fip->fi_lock);
   1001 		fd = fd_find(fip, start);
   1002 		if (fd >= 0 && fd == fip->fi_badfd) {
   1003 			start = fd + 1;
   1004 			mutex_exit(&fip->fi_lock);
   1005 			continue;
   1006 		}
   1007 		if ((uint_t)fd < filelimit)
   1008 			break;
   1009 		if (fd >= filelimit) {
   1010 			mutex_exit(&fip->fi_lock);
   1011 			mutex_enter(&p->p_lock);
   1012 			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
   1013 			    p->p_rctls, p, RCA_SAFE);
   1014 			mutex_exit(&p->p_lock);
   1015 			return (-1);
   1016 		}
   1017 		/* fd_find() returned -1 */
   1018 		nfiles = fip->fi_nfiles;
   1019 		mutex_exit(&fip->fi_lock);
   1020 		flist_grow(MAX(start, nfiles));
   1021 	}
   1022 
   1023 	UF_ENTER(ufp, fip, fd);
   1024 	fd_reserve(fip, fd, 1);
   1025 	ASSERT(ufp->uf_file == NULL);
   1026 	ufp->uf_file = fp;
   1027 	UF_EXIT(ufp);
   1028 	mutex_exit(&fip->fi_lock);
   1029 	return (fd);
   1030 }
   1031 
   1032 /*
   1033  * Allocate a user file descriptor greater than or equal to "start".
   1034  */
   1035 int
   1036 ufalloc(int start)
   1037 {
   1038 	return (ufalloc_file(start, NULL));
   1039 }
   1040 
   1041 /*
   1042  * Check that a future allocation of count fds on proc p has a good
   1043  * chance of succeeding.  If not, do rctl processing as if we'd failed
   1044  * the allocation.
   1045  *
   1046  * Our caller must guarantee that p cannot disappear underneath us.
   1047  */
   1048 int
   1049 ufcanalloc(proc_t *p, uint_t count)
   1050 {
   1051 	uf_info_t *fip = P_FINFO(p);
   1052 	int filelimit;
   1053 	int current;
   1054 
   1055 	if (count == 0)
   1056 		return (1);
   1057 
   1058 	ASSERT(p->p_fno_ctl <= INT_MAX);
   1059 	filelimit = (int)p->p_fno_ctl;
   1060 
   1061 	mutex_enter(&fip->fi_lock);
   1062 	current = flist_nalloc(fip);		/* # of in-use descriptors */
   1063 	mutex_exit(&fip->fi_lock);
   1064 
   1065 	/*
   1066 	 * If count is a positive integer, the worst that can happen is
   1067 	 * an overflow to a negative value, which is caught by the >= 0 check.
   1068 	 */
   1069 	current += count;
   1070 	if (count <= INT_MAX && current >= 0 && current <= filelimit)
   1071 		return (1);
   1072 
   1073 	mutex_enter(&p->p_lock);
   1074 	(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
   1075 	    p->p_rctls, p, RCA_SAFE);
   1076 	mutex_exit(&p->p_lock);
   1077 	return (0);
   1078 }
   1079 
   1080 /*
   1081  * Allocate a user file descriptor and a file structure.
   1082  * Initialize the descriptor to point at the file structure.
   1083  * If fdp is NULL, the user file descriptor will not be allocated.
   1084  */
   1085 int
   1086 falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp)
   1087 {
   1088 	file_t *fp;
   1089 	int fd;
   1090 
   1091 	if (fdp) {
   1092 		if ((fd = ufalloc(0)) == -1)
   1093 			return (EMFILE);
   1094 	}
   1095 	fp = kmem_cache_alloc(file_cache, KM_SLEEP);
   1096 	/*
   1097 	 * Note: falloc returns the fp locked
   1098 	 */
   1099 	mutex_enter(&fp->f_tlock);
   1100 	fp->f_count = 1;
   1101 	fp->f_flag = (ushort_t)flag;
   1102 	fp->f_vnode = vp;
   1103 	fp->f_offset = 0;
   1104 	fp->f_audit_data = 0;
   1105 	crhold(fp->f_cred = CRED());
   1106 	/*
   1107 	 * allocate resources to audit_data
   1108 	 */
   1109 	if (audit_active)
   1110 		audit_falloc(fp);
   1111 	*fpp = fp;
   1112 	if (fdp)
   1113 		*fdp = fd;
   1114 	return (0);
   1115 }
   1116 
   1117 /*ARGSUSED*/
   1118 static int
   1119 file_cache_constructor(void *buf, void *cdrarg, int kmflags)
   1120 {
   1121 	file_t *fp = buf;
   1122 
   1123 	mutex_init(&fp->f_tlock, NULL, MUTEX_DEFAULT, NULL);
   1124 	return (0);
   1125 }
   1126 
   1127 /*ARGSUSED*/
   1128 static void
   1129 file_cache_destructor(void *buf, void *cdrarg)
   1130 {
   1131 	file_t *fp = buf;
   1132 
   1133 	mutex_destroy(&fp->f_tlock);
   1134 }
   1135 
   1136 void
   1137 finit()
   1138 {
   1139 	file_cache = kmem_cache_create("file_cache", sizeof (file_t), 0,
   1140 	    file_cache_constructor, file_cache_destructor, NULL, NULL, NULL, 0);
   1141 }
   1142 
   1143 void
   1144 unfalloc(file_t *fp)
   1145 {
   1146 	ASSERT(MUTEX_HELD(&fp->f_tlock));
   1147 	if (--fp->f_count <= 0) {
   1148 		/*
   1149 		 * deallocate resources to audit_data
   1150 		 */
   1151 		if (audit_active)
   1152 			audit_unfalloc(fp);
   1153 		crfree(fp->f_cred);
   1154 		mutex_exit(&fp->f_tlock);
   1155 		kmem_cache_free(file_cache, fp);
   1156 	} else
   1157 		mutex_exit(&fp->f_tlock);
   1158 }
   1159 
   1160 /*
   1161  * Given a file descriptor, set the user's
   1162  * file pointer to the given parameter.
   1163  */
   1164 void
   1165 setf(int fd, file_t *fp)
   1166 {
   1167 	uf_info_t *fip = P_FINFO(curproc);
   1168 	uf_entry_t *ufp;
   1169 
   1170 	if (audit_active)
   1171 		audit_setf(fp, fd);
   1172 
   1173 	if (fp == NULL) {
   1174 		mutex_enter(&fip->fi_lock);
   1175 		UF_ENTER(ufp, fip, fd);
   1176 		fd_reserve(fip, fd, -1);
   1177 		mutex_exit(&fip->fi_lock);
   1178 	} else {
   1179 		UF_ENTER(ufp, fip, fd);
   1180 		ASSERT(ufp->uf_busy);
   1181 	}
   1182 	ASSERT(ufp->uf_fpollinfo == NULL);
   1183 	ASSERT(ufp->uf_flag == 0);
   1184 	ufp->uf_file = fp;
   1185 	cv_broadcast(&ufp->uf_wanted_cv);
   1186 	UF_EXIT(ufp);
   1187 }
   1188 
   1189 /*
   1190  * Given a file descriptor, return the file table flags, plus,
   1191  * if this is a socket in asynchronous mode, the FASYNC flag.
   1192  * getf() may or may not have been called before calling f_getfl().
   1193  */
   1194 int
   1195 f_getfl(int fd, int *flagp)
   1196 {
   1197 	uf_info_t *fip = P_FINFO(curproc);
   1198 	uf_entry_t *ufp;
   1199 	file_t *fp;
   1200 	int error;
   1201 
   1202 	if ((uint_t)fd >= fip->fi_nfiles)
   1203 		error = EBADF;
   1204 	else {
   1205 		UF_ENTER(ufp, fip, fd);
   1206 		if ((fp = ufp->uf_file) == NULL)
   1207 			error = EBADF;
   1208 		else {
   1209 			vnode_t *vp = fp->f_vnode;
   1210 			int flag = fp->f_flag;
   1211 
   1212 			/*
   1213 			 * BSD fcntl() FASYNC compatibility.
   1214 			 */
   1215 			if (vp->v_type == VSOCK)
   1216 				flag |= sock_getfasync(vp);
   1217 			*flagp = flag;
   1218 			error = 0;
   1219 		}
   1220 		UF_EXIT(ufp);
   1221 	}
   1222 
   1223 	return (error);
   1224 }
   1225 
   1226 /*
   1227  * Given a file descriptor, return the user's file flags.
   1228  * Force the FD_CLOEXEC flag for writable self-open /proc files.
   1229  * getf() may or may not have been called before calling f_getfd_error().
   1230  */
   1231 int
   1232 f_getfd_error(int fd, int *flagp)
   1233 {
   1234 	uf_info_t *fip = P_FINFO(curproc);
   1235 	uf_entry_t *ufp;
   1236 	file_t *fp;
   1237 	int flag;
   1238 	int error;
   1239 
   1240 	if ((uint_t)fd >= fip->fi_nfiles)
   1241 		error = EBADF;
   1242 	else {
   1243 		UF_ENTER(ufp, fip, fd);
   1244 		if ((fp = ufp->uf_file) == NULL)
   1245 			error = EBADF;
   1246 		else {
   1247 			flag = ufp->uf_flag;
   1248 			if ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode))
   1249 				flag |= FD_CLOEXEC;
   1250 			*flagp = flag;
   1251 			error = 0;
   1252 		}
   1253 		UF_EXIT(ufp);
   1254 	}
   1255 
   1256 	return (error);
   1257 }
   1258 
   /*
    * Return the user's file descriptor flags for fd, narrowed to a char.
    * Any error from f_getfd_error() is ignored, in which case 0 is
    * returned.
    *
    * getf() must have been called before calling f_getfd().
    */
   char
   f_getfd(int fd)
   {
   	int fdflags = 0;

   	(void) f_getfd_error(fd, &fdflags);
   	return ((char)fdflags);
   }
   1269 
   1270 /*
   1271  * Given a file descriptor and file flags, set the user's file flags.
   1272  * At present, the only valid flag is FD_CLOEXEC.
   1273  * getf() may or may not have been called before calling f_setfd_error().
   1274  */
   1275 int
   1276 f_setfd_error(int fd, int flags)
   1277 {
   1278 	uf_info_t *fip = P_FINFO(curproc);
   1279 	uf_entry_t *ufp;
   1280 	int error;
   1281 
   1282 	if ((uint_t)fd >= fip->fi_nfiles)
   1283 		error = EBADF;
   1284 	else {
   1285 		UF_ENTER(ufp, fip, fd);
   1286 		if (ufp->uf_file == NULL)
   1287 			error = EBADF;
   1288 		else {
   1289 			ufp->uf_flag = flags & FD_CLOEXEC;
   1290 			error = 0;
   1291 		}
   1292 		UF_EXIT(ufp);
   1293 	}
   1294 	return (error);
   1295 }
   1296 
   /*
    * Set the user's file descriptor flags for fd, ignoring any error
    * reported by f_setfd_error().
    */
   void
   f_setfd(int fd, char flags)
   {
   	int fdflags = flags;

   	(void) f_setfd_error(fd, fdflags);
   }
   1302 
   1303 #define	BADFD_MIN	3
   1304 #define	BADFD_MAX	255
   1305 
   1306 /*
   1307  * Attempt to allocate a file descriptor which is bad and which
   1308  * is "poison" to the application.  It cannot be closed (except
   1309  * on exec), allocated for a different use, etc.
   1310  */
   1311 int
   1312 f_badfd(int start, int *fdp, int action)
   1313 {
   1314 	int fdr;
   1315 	int badfd;
   1316 	uf_info_t *fip = P_FINFO(curproc);
   1317 
   1318 #ifdef _LP64
   1319 	/* No restrictions on 64 bit _file */
   1320 	if (get_udatamodel() != DATAMODEL_ILP32)
   1321 		return (EINVAL);
   1322 #endif
   1323 
   1324 	if (start > BADFD_MAX || start < BADFD_MIN)
   1325 		return (EINVAL);
   1326 
   1327 	if (action >= NSIG || action < 0)
   1328 		return (EINVAL);
   1329 
   1330 	mutex_enter(&fip->fi_lock);
   1331 	badfd = fip->fi_badfd;
   1332 	mutex_exit(&fip->fi_lock);
   1333 
   1334 	if (badfd != -1)
   1335 		return (EAGAIN);
   1336 
   1337 	fdr = ufalloc(start);
   1338 
   1339 	if (fdr > BADFD_MAX) {
   1340 		setf(fdr, NULL);
   1341 		return (EMFILE);
   1342 	}
   1343 	if (fdr < 0)
   1344 		return (EMFILE);
   1345 
   1346 	mutex_enter(&fip->fi_lock);
   1347 	if (fip->fi_badfd != -1) {
   1348 		/* Lost race */
   1349 		mutex_exit(&fip->fi_lock);
   1350 		setf(fdr, NULL);
   1351 		return (EAGAIN);
   1352 	}
   1353 	fip->fi_action = action;
   1354 	fip->fi_badfd = fdr;
   1355 	mutex_exit(&fip->fi_lock);
   1356 	setf(fdr, NULL);
   1357 
   1358 	*fdp = fdr;
   1359 
   1360 	return (0);
   1361 }
   1362 
   1363 /*
   1364  * Allocate a file descriptor and assign it to the vnode "*vpp",
   1365  * performing the usual open protocol upon it and returning the
   1366  * file descriptor allocated.  It is the responsibility of the
   1367  * caller to dispose of "*vpp" if any error occurs.
   1368  */
   1369 int
   1370 fassign(vnode_t **vpp, int mode, int *fdp)
   1371 {
   1372 	file_t *fp;
   1373 	int error;
   1374 	int fd;
   1375 
   1376 	if (error = falloc((vnode_t *)NULL, mode, &fp, &fd))
   1377 		return (error);
   1378 	if (error = VOP_OPEN(vpp, mode, fp->f_cred, NULL)) {
   1379 		setf(fd, NULL);
   1380 		unfalloc(fp);
   1381 		return (error);
   1382 	}
   1383 	fp->f_vnode = *vpp;
   1384 	mutex_exit(&fp->f_tlock);
   1385 	/*
   1386 	 * Fill in the slot falloc reserved.
   1387 	 */
   1388 	setf(fd, fp);
   1389 	*fdp = fd;
   1390 	return (0);
   1391 }
   1392 
   1393 /*
   1394  * When a process forks it must increment the f_count of all file pointers
   1395  * since there is a new process pointing at them.  fcnt_add(fip, 1) does this.
   1396  * Since we are called when there is only 1 active lwp we don't need to
   1397  * hold fi_lock or any uf_lock.  If the fork fails, fork_fail() calls
   1398  * fcnt_add(fip, -1) to restore the counts.
   1399  */
   1400 void
   1401 fcnt_add(uf_info_t *fip, int incr)
   1402 {
   1403 	int i;
   1404 	uf_entry_t *ufp;
   1405 	file_t *fp;
   1406 
   1407 	ufp = fip->fi_list;
   1408 	for (i = 0; i < fip->fi_nfiles; i++, ufp++) {
   1409 		if ((fp = ufp->uf_file) != NULL) {
   1410 			mutex_enter(&fp->f_tlock);
   1411 			ASSERT((incr == 1 && fp->f_count >= 1) ||
   1412 			    (incr == -1 && fp->f_count >= 2));
   1413 			fp->f_count += incr;
   1414 			mutex_exit(&fp->f_tlock);
   1415 		}
   1416 	}
   1417 }
   1418 
   1419 /*
   1420  * This is called from exec to close all fd's that have the FD_CLOEXEC flag
   1421  * set and also to close all self-open for write /proc file descriptors.
   1422  */
   1423 void
   1424 close_exec(uf_info_t *fip)
   1425 {
   1426 	int fd;
   1427 	file_t *fp;
   1428 	fpollinfo_t *fpip;
   1429 	uf_entry_t *ufp;
   1430 	portfd_t *pfd;
   1431 
   1432 	ufp = fip->fi_list;
   1433 	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
   1434 		if ((fp = ufp->uf_file) != NULL &&
   1435 		    ((ufp->uf_flag & FD_CLOEXEC) ||
   1436 		    ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)))) {
   1437 			fpip = ufp->uf_fpollinfo;
   1438 			mutex_enter(&fip->fi_lock);
   1439 			mutex_enter(&ufp->uf_lock);
   1440 			fd_reserve(fip, fd, -1);
   1441 			mutex_exit(&fip->fi_lock);
   1442 			ufp->uf_file = NULL;
   1443 			ufp->uf_fpollinfo = NULL;
   1444 			ufp->uf_flag = 0;
   1445 			/*
   1446 			 * We may need to cleanup some cached poll states
   1447 			 * in t_pollstate before the fd can be reused. It
   1448 			 * is important that we don't access a stale thread
   1449 			 * structure. We will do the cleanup in two
   1450 			 * phases to avoid deadlock and holding uf_lock for
   1451 			 * too long. In phase 1, hold the uf_lock and call
   1452 			 * pollblockexit() to set state in t_pollstate struct
   1453 			 * so that a thread does not exit on us. In phase 2,
   1454 			 * we drop the uf_lock and call pollcacheclean().
   1455 			 */
   1456 			pfd = ufp->uf_portfd;
   1457 			ufp->uf_portfd = NULL;
   1458 			if (fpip != NULL)
   1459 				pollblockexit(fpip);
   1460 			mutex_exit(&ufp->uf_lock);
   1461 			if (fpip != NULL)
   1462 				pollcacheclean(fpip, fd);
   1463 			if (pfd)
   1464 				port_close_fd(pfd);
   1465 			(void) closef(fp);
   1466 		}
   1467 	}
   1468 
   1469 	/* Reset bad fd */
   1470 	fip->fi_badfd = -1;
   1471 	fip->fi_action = -1;
   1472 }
   1473 
   1474 /*
   1475  * Common routine for modifying attributes of named files.
   1476  */
   1477 int
   1478 namesetattr(char *fnamep, enum symfollow followlink, vattr_t *vap, int flags)
   1479 {
   1480 	vnode_t *vp;
   1481 	int error = 0;
   1482 
   1483 	if (error = lookupname(fnamep, UIO_USERSPACE, followlink, NULLVPP, &vp))
   1484 		return (set_errno(error));
   1485 	if (error = vpsetattr(vp, vap, flags))
   1486 		(void) set_errno(error);
   1487 	VN_RELE(vp);
   1488 	return (error);
   1489 }
   1490 
   1491 /*
   1492  * Common routine for modifying attributes of files referenced
   1493  * by descriptor.
   1494  */
   1495 int
   1496 fdsetattr(int fd, vattr_t *vap)
   1497 {
   1498 	file_t *fp;
   1499 	vnode_t *vp;
   1500 	int error = 0;
   1501 
   1502 	if ((fp = getf(fd)) != NULL) {
   1503 		vp = fp->f_vnode;
   1504 		if (error = vpsetattr(vp, vap, 0)) {
   1505 			(void) set_errno(error);
   1506 		}
   1507 		releasef(fd);
   1508 	} else
   1509 		error = set_errno(EBADF);
   1510 	return (error);
   1511 }
   1512 
   1513 /*
   1514  * Common routine to set the attributes for the given vnode.
   1515  * If the vnode is a file and the filesize is being manipulated,
   1516  * this makes sure that there are no conflicting non-blocking
   1517  * mandatory locks in that region.
   1518  */
   1519 static int
   1520 vpsetattr(vnode_t *vp, vattr_t *vap, int flags)
   1521 {
   1522 	int error = 0;
   1523 	int in_crit = 0;
   1524 	u_offset_t	begin;
   1525 	vattr_t	vattr;
   1526 	ssize_t	length;
   1527 
   1528 	if (vn_is_readonly(vp)) {
   1529 		error = EROFS;
   1530 	}
   1531 	if (!error && (vap->va_mask & AT_SIZE) &&
   1532 	    nbl_need_check(vp)) {
   1533 		nbl_start_crit(vp, RW_READER);
   1534 		in_crit = 1;
   1535 		vattr.va_mask = AT_SIZE;
   1536 		if (!(error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
   1537 			begin = vap->va_size > vattr.va_size ?
   1538 			    vattr.va_size : vap->va_size;
   1539 			length = vattr.va_size > vap->va_size ?
   1540 			    vattr.va_size - vap->va_size :
   1541 			    vap->va_size - vattr.va_size;
   1542 
   1543 			if (nbl_conflict(vp, NBL_WRITE, begin, length, 0,
   1544 			    NULL)) {
   1545 				error = EACCES;
   1546 			}
   1547 		}
   1548 	}
   1549 	if (!error)
   1550 		error = VOP_SETATTR(vp, vap, flags, CRED(), NULL);
   1551 
   1552 	if (in_crit)
   1553 		nbl_end_crit(vp);
   1554 
   1555 	return (error);
   1556 }
   1557 
   1558 /*
   1559  * Return true if the given vnode is referenced by any
   1560  * entry in the current process's file descriptor table.
   1561  */
   1562 int
   1563 fisopen(vnode_t *vp)
   1564 {
   1565 	int fd;
   1566 	file_t *fp;
   1567 	vnode_t *ovp;
   1568 	uf_info_t *fip = P_FINFO(curproc);
   1569 	uf_entry_t *ufp;
   1570 
   1571 	mutex_enter(&fip->fi_lock);
   1572 	for (fd = 0; fd < fip->fi_nfiles; fd++) {
   1573 		UF_ENTER(ufp, fip, fd);
   1574 		if ((fp = ufp->uf_file) != NULL &&
   1575 		    (ovp = fp->f_vnode) != NULL && VN_CMP(vp, ovp)) {
   1576 			UF_EXIT(ufp);
   1577 			mutex_exit(&fip->fi_lock);
   1578 			return (1);
   1579 		}
   1580 		UF_EXIT(ufp);
   1581 	}
   1582 	mutex_exit(&fip->fi_lock);
   1583 	return (0);
   1584 }
   1585 
   1586 /*
   1587  * Return zero if at least one file currently open (by curproc) shouldn't be
   1588  * allowed to change zones.
   1589  */
   1590 int
   1591 files_can_change_zones(void)
   1592 {
   1593 	int fd;
   1594 	file_t *fp;
   1595 	uf_info_t *fip = P_FINFO(curproc);
   1596 	uf_entry_t *ufp;
   1597 
   1598 	mutex_enter(&fip->fi_lock);
   1599 	for (fd = 0; fd < fip->fi_nfiles; fd++) {
   1600 		UF_ENTER(ufp, fip, fd);
   1601 		if ((fp = ufp->uf_file) != NULL &&
   1602 		    !vn_can_change_zones(fp->f_vnode)) {
   1603 			UF_EXIT(ufp);
   1604 			mutex_exit(&fip->fi_lock);
   1605 			return (0);
   1606 		}
   1607 		UF_EXIT(ufp);
   1608 	}
   1609 	mutex_exit(&fip->fi_lock);
   1610 	return (1);
   1611 }
   1612 
   1613 #ifdef DEBUG
   1614 
   1615 /*
   1616  * The following functions are only used in ASSERT()s elsewhere.
   1617  * They do not modify the state of the system.
   1618  */
   1619 
   1620 /*
   1621  * Return true (1) if the current thread is in the fpollinfo
   1622  * list for this file descriptor, else false (0).
   1623  */
   1624 static int
   1625 curthread_in_plist(uf_entry_t *ufp)
   1626 {
   1627 	fpollinfo_t *fpip;
   1628 
   1629 	ASSERT(MUTEX_HELD(&ufp->uf_lock));
   1630 	for (fpip = ufp->uf_fpollinfo; fpip; fpip = fpip->fp_next)
   1631 		if (fpip->fp_thread == curthread)
   1632 			return (1);
   1633 	return (0);
   1634 }
   1635 
   1636 /*
   1637  * Sanity check to make sure that after lwp_exit(),
   1638  * curthread does not appear on any fd's fpollinfo list.
   1639  */
   1640 void
   1641 checkfpollinfo(void)
   1642 {
   1643 	int fd;
   1644 	uf_info_t *fip = P_FINFO(curproc);
   1645 	uf_entry_t *ufp;
   1646 
   1647 	mutex_enter(&fip->fi_lock);
   1648 	for (fd = 0; fd < fip->fi_nfiles; fd++) {
   1649 		UF_ENTER(ufp, fip, fd);
   1650 		ASSERT(!curthread_in_plist(ufp));
   1651 		UF_EXIT(ufp);
   1652 	}
   1653 	mutex_exit(&fip->fi_lock);
   1654 }
   1655 
   1656 /*
   1657  * Return true (1) if the current thread is in the fpollinfo
   1658  * list for this file descriptor, else false (0).
   1659  * This is the same as curthread_in_plist(),
   1660  * but is called w/o holding uf_lock.
   1661  */
   1662 int
   1663 infpollinfo(int fd)
   1664 {
   1665 	uf_info_t *fip = P_FINFO(curproc);
   1666 	uf_entry_t *ufp;
   1667 	int rc;
   1668 
   1669 	UF_ENTER(ufp, fip, fd);
   1670 	rc = curthread_in_plist(ufp);
   1671 	UF_EXIT(ufp);
   1672 	return (rc);
   1673 }
   1674 
   1675 #endif	/* DEBUG */
   1676 
   1677 /*
   1678  * Add the curthread to fpollinfo list, meaning this fd is currently in the
   1679  * thread's poll cache. Each lwp polling this file descriptor should call
   1680  * this routine once.
   1681  */
   1682 void
   1683 addfpollinfo(int fd)
   1684 {
   1685 	struct uf_entry *ufp;
   1686 	fpollinfo_t *fpip;
   1687 	uf_info_t *fip = P_FINFO(curproc);
   1688 
   1689 	fpip = kmem_zalloc(sizeof (fpollinfo_t), KM_SLEEP);
   1690 	fpip->fp_thread = curthread;
   1691 	UF_ENTER(ufp, fip, fd);
   1692 	/*
   1693 	 * Assert we are not already on the list, that is, that
   1694 	 * this lwp did not call addfpollinfo twice for the same fd.
   1695 	 */
   1696 	ASSERT(!curthread_in_plist(ufp));
   1697 	/*
   1698 	 * addfpollinfo is always done inside the getf/releasef pair.
   1699 	 */
   1700 	ASSERT(ufp->uf_refcnt >= 1);
   1701 	fpip->fp_next = ufp->uf_fpollinfo;
   1702 	ufp->uf_fpollinfo = fpip;
   1703 	UF_EXIT(ufp);
   1704 }
   1705 
   1706 /*
   1707  * delete curthread from fpollinfo list.
   1708  */
   1709 /*ARGSUSED*/
   1710 void
   1711 delfpollinfo(int fd)
   1712 {
   1713 	struct uf_entry *ufp;
   1714 	struct fpollinfo *fpip;
   1715 	struct fpollinfo **fpipp;
   1716 	uf_info_t *fip = P_FINFO(curproc);
   1717 
   1718 	UF_ENTER(ufp, fip, fd);
   1719 	if (ufp->uf_fpollinfo == NULL) {
   1720 		UF_EXIT(ufp);
   1721 		return;
   1722 	}
   1723 	ASSERT(ufp->uf_busy);
   1724 	/*
   1725 	 * Find and delete curthread from the list.
   1726 	 */
   1727 	fpipp = &ufp->uf_fpollinfo;
   1728 	while ((fpip = *fpipp)->fp_thread != curthread)
   1729 		fpipp = &fpip->fp_next;
   1730 	*fpipp = fpip->fp_next;
   1731 	kmem_free(fpip, sizeof (fpollinfo_t));
   1732 	/*
   1733 	 * Assert that we are not still on the list, that is, that
   1734 	 * this lwp did not call addfpollinfo twice for the same fd.
   1735 	 */
   1736 	ASSERT(!curthread_in_plist(ufp));
   1737 	UF_EXIT(ufp);
   1738 }
   1739 
   1740 /*
   1741  * fd is associated with a port. pfd is a pointer to the fd entry in the
   1742  * cache of the port.
   1743  */
   1744 
   1745 void
   1746 addfd_port(int fd, portfd_t *pfd)
   1747 {
   1748 	struct uf_entry *ufp;
   1749 	uf_info_t *fip = P_FINFO(curproc);
   1750 
   1751 	UF_ENTER(ufp, fip, fd);
   1752 	/*
   1753 	 * addfd_port is always done inside the getf/releasef pair.
   1754 	 */
   1755 	ASSERT(ufp->uf_refcnt >= 1);
   1756 	if (ufp->uf_portfd == NULL) {
   1757 		/* first entry */
   1758 		ufp->uf_portfd = pfd;
   1759 		pfd->pfd_next = NULL;
   1760 	} else {
   1761 		pfd->pfd_next = ufp->uf_portfd;
   1762 		ufp->uf_portfd = pfd;
   1763 		pfd->pfd_next->pfd_prev = pfd;
   1764 	}
   1765 	UF_EXIT(ufp);
   1766 }
   1767 
   1768 void
   1769 delfd_port(int fd, portfd_t *pfd)
   1770 {
   1771 	struct uf_entry *ufp;
   1772 	uf_info_t *fip = P_FINFO(curproc);
   1773 
   1774 	UF_ENTER(ufp, fip, fd);
   1775 	/*
   1776 	 * delfd_port is always done inside the getf/releasef pair.
   1777 	 */
   1778 	ASSERT(ufp->uf_refcnt >= 1);
   1779 	if (ufp->uf_portfd == pfd) {
   1780 		/* remove first entry */
   1781 		ufp->uf_portfd = pfd->pfd_next;
   1782 	} else {
   1783 		pfd->pfd_prev->pfd_next = pfd->pfd_next;
   1784 		if (pfd->pfd_next != NULL)
   1785 			pfd->pfd_next->pfd_prev = pfd->pfd_prev;
   1786 	}
   1787 	UF_EXIT(ufp);
   1788 }
   1789 
   1790 static void
   1791 port_close_fd(portfd_t *pfd)
   1792 {
   1793 	portfd_t	*pfdn;
   1794 
   1795 	/*
   1796 	 * At this point, no other thread should access
   1797 	 * the portfd_t list for this fd. The uf_file, uf_portfd
   1798 	 * pointers in the uf_entry_t struct for this fd would
   1799 	 * be set to NULL.
   1800 	 */
   1801 	for (; pfd != NULL; pfd = pfdn) {
   1802 		pfdn = pfd->pfd_next;
   1803 		port_close_pfd(pfd);
   1804 	}
   1805 }
   1806