Home | History | Annotate | Download | only in sockfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/param.h>
     29 #include <sys/signal.h>
     30 #include <sys/cmn_err.h>
     31 
     32 #include <sys/stropts.h>
     33 #include <sys/socket.h>
     34 #include <sys/socketvar.h>
     35 #include <sys/sockio.h>
     36 #include <sys/strsubr.h>
     37 #include <sys/strsun.h>
     38 #include <sys/atomic.h>
     39 #include <sys/tihdr.h>
     40 
     41 #include <fs/sockfs/sockcommon.h>
     42 #include <fs/sockfs/socktpi.h>
     43 #include <fs/sockfs/sodirect.h>
     44 #include <sys/ddi.h>
     45 #include <inet/ip.h>
     46 #include <sys/time.h>
     47 #include <sys/cmn_err.h>
     48 
     49 #ifdef SOCK_TEST
     50 extern int do_useracc;
     51 extern clock_t sock_test_timelimit;
     52 #endif /* SOCK_TEST */
     53 
     54 #define	MBLK_PULL_LEN 64
     55 uint32_t so_mblk_pull_len = MBLK_PULL_LEN;
     56 
     57 #ifdef DEBUG
     58 boolean_t so_debug_length = B_FALSE;
     59 static boolean_t so_check_length(sonode_t *so);
     60 #endif
     61 
     62 int
     63 so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
     64 {
     65 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
     66 	ASSERT(nso->so_acceptq_next == NULL);
     67 
     68 	*so->so_acceptq_tail = nso;
     69 	so->so_acceptq_tail = &nso->so_acceptq_next;
     70 	so->so_acceptq_len++;
     71 
     72 	/* Wakeup a single consumer */
     73 	cv_signal(&so->so_acceptq_cv);
     74 
     75 	return (so->so_acceptq_len);
     76 }
     77 
     78 /*
     79  * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
     80  *
     81  * Enqueue an incoming connection on a listening socket.
     82  *
     83  * Arguments:
     84  *   so	  - listening socket
     85  *   nso  - new connection
     86  *
     87  * Returns:
     88  *   Number of queued connections, including the new connection
     89  */
     90 int
     91 so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
     92 {
     93 	int conns;
     94 
     95 	mutex_enter(&so->so_acceptq_lock);
     96 	conns = so_acceptq_enqueue_locked(so, nso);
     97 	mutex_exit(&so->so_acceptq_lock);
     98 
     99 	return (conns);
    100 }
    101 
    102 static int
    103 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    104     struct sonode **nsop)
    105 {
    106 	struct sonode *nso = NULL;
    107 
    108 	*nsop = NULL;
    109 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
    110 	while ((nso = so->so_acceptq_head) == NULL) {
    111 		/*
    112 		 * No need to check so_error here, because it is not
    113 		 * possible for a listening socket to be reset or otherwise
    114 		 * disconnected.
    115 		 *
    116 		 * So now we just need check if it's ok to wait.
    117 		 */
    118 		if (dontblock)
    119 			return (EWOULDBLOCK);
    120 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
    121 			return (EINTR);
    122 
    123 		if (cv_wait_sig_swap(&so->so_acceptq_cv,
    124 		    &so->so_acceptq_lock) == 0)
    125 			return (EINTR);
    126 	}
    127 
    128 	ASSERT(nso != NULL);
    129 	so->so_acceptq_head = nso->so_acceptq_next;
    130 	nso->so_acceptq_next = NULL;
    131 
    132 	if (so->so_acceptq_head == NULL) {
    133 		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
    134 		so->so_acceptq_tail = &so->so_acceptq_head;
    135 	}
    136 	ASSERT(so->so_acceptq_len > 0);
    137 	--so->so_acceptq_len;
    138 
    139 	*nsop = nso;
    140 
    141 	return (0);
    142 }
    143 
    144 /*
    145  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
    146  *
    147  * Pulls a connection off of the accept queue.
    148  *
    149  * Arguments:
    150  *   so	       - listening socket
    151  *   dontblock - indicate whether it's ok to sleep if there are no
    152  *		 connections on the queue
    153  *   nsop      - Value-return argument
    154  *
    155  * Return values:
    156  *   0 when a connection is successfully dequeued, in which case nsop
    157  *   is set to point to the new connection. Upon failure a non-zero
    158  *   value is returned, and the value of nsop is set to NULL.
    159  *
    160  * Note:
    161  *   so_acceptq_dequeue() may return prematurly if the socket is falling
    162  *   back to TPI.
    163  */
    164 int
    165 so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    166     struct sonode **nsop)
    167 {
    168 	int error;
    169 
    170 	mutex_enter(&so->so_acceptq_lock);
    171 	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
    172 	mutex_exit(&so->so_acceptq_lock);
    173 
    174 	return (error);
    175 }
    176 
    177 /*
    178  * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
    179  *
    180  * Removes all pending connections from a listening socket, and
    181  * frees the associated resources.
    182  *
    183  * Arguments
    184  *   so	     - listening socket
    185  *   doclose - make a close downcall for each socket on the accept queue
    186  *             (Note, only SCTP and SDP sockets rely on this)
    187  *
    188  * Return values:
    189  *   None.
    190  *
    191  * Note:
    192  *   The caller has to ensure that no calls to so_acceptq_enqueue() or
    193  *   so_acceptq_dequeue() occur while the accept queue is being flushed.
    194  *   So either the socket needs to be in a state where no operations
    195  *   would come in, or so_lock needs to be obtained.
    196  */
    197 void
    198 so_acceptq_flush(struct sonode *so, boolean_t doclose)
    199 {
    200 	struct sonode *nso;
    201 
    202 	while ((nso = so->so_acceptq_head) != NULL) {
    203 		so->so_acceptq_head = nso->so_acceptq_next;
    204 		nso->so_acceptq_next = NULL;
    205 
    206 		if (doclose) {
    207 			(void) socket_close(nso, 0, CRED());
    208 		} else {
    209 			/*
    210 			 * Since the socket is on the accept queue, there can
    211 			 * only be one reference. We drop the reference and
    212 			 * just blow off the socket.
    213 			 */
    214 			ASSERT(nso->so_count == 1);
    215 			nso->so_count--;
    216 		}
    217 		socket_destroy(nso);
    218 	}
    219 
    220 	so->so_acceptq_head = NULL;
    221 	so->so_acceptq_tail = &so->so_acceptq_head;
    222 	so->so_acceptq_len = 0;
    223 }
    224 
    225 int
    226 so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    227     sock_connid_t id)
    228 {
    229 	ASSERT(MUTEX_HELD(&so->so_lock));
    230 
    231 	/*
    232 	 * The protocol has notified us that a connection attempt is being
    233 	 * made, so before we wait for a notification to arrive we must
    234 	 * clear out any errors associated with earlier connection attempts.
    235 	 */
    236 	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
    237 		so->so_error = 0;
    238 
    239 	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
    240 		if (nonblock)
    241 			return (EINPROGRESS);
    242 
    243 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
    244 			return (EINTR);
    245 
    246 		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
    247 			return (EINTR);
    248 	}
    249 
    250 	if (so->so_error != 0)
    251 		return (sogeterr(so, B_TRUE));
    252 	/*
    253 	 * Under normal circumstances, so_error should contain an error
    254 	 * in case the connect failed. However, it is possible for another
    255 	 * thread to come in a consume the error, so generate a sensible
    256 	 * error in that case.
    257 	 */
    258 	if ((so->so_state & SS_ISCONNECTED) == 0)
    259 		return (ECONNREFUSED);
    260 
    261 	return (0);
    262 }
    263 
    264 /*
    265  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
    266  *    sock_connid_t id)
    267  *
    268  * Wait until the socket is connected or an error has occured.
    269  *
    270  * Arguments:
    271  *   so	      - socket
    272  *   nonblock - indicate whether it's ok to sleep if the connection has
    273  *		not yet been established
    274  *   gen      - generation number that was returned by the protocol
    275  *		when the operation was started
    276  *
    277  * Returns:
    278  *   0 if the connection attempt was successful, or an error indicating why
    279  *   the connection attempt failed.
    280  */
    281 int
    282 so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
    283 {
    284 	int error;
    285 
    286 	mutex_enter(&so->so_lock);
    287 	error = so_wait_connected_locked(so, nonblock, id);
    288 	mutex_exit(&so->so_lock);
    289 
    290 	return (error);
    291 }
    292 
    293 int
    294 so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
    295 {
    296 	int error;
    297 
    298 	ASSERT(MUTEX_HELD(&so->so_lock));
    299 	while (so->so_snd_qfull) {
    300 		if (so->so_state & SS_CANTSENDMORE)
    301 			return (EPIPE);
    302 		if (dontblock)
    303 			return (EWOULDBLOCK);
    304 
    305 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
    306 			return (EINTR);
    307 
    308 		if (so->so_sndtimeo == 0) {
    309 			/*
    310 			 * Zero means disable timeout.
    311 			 */
    312 			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
    313 		} else {
    314 			error = cv_reltimedwait_sig(&so->so_snd_cv,
    315 			    &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
    316 		}
    317 		if (error == 0)
    318 			return (EINTR);
    319 		else if (error == -1)
    320 			return (EAGAIN);
    321 	}
    322 	return (0);
    323 }
    324 
    325 /*
    326  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
    327  *
    328  * Wait for the transport to notify us about send buffers becoming
    329  * available.
    330  */
    331 int
    332 so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
    333 {
    334 	int error = 0;
    335 
    336 	mutex_enter(&so->so_lock);
    337 	if (so->so_snd_qfull) {
    338 		so->so_snd_wakeup = B_TRUE;
    339 		error = so_snd_wait_qnotfull_locked(so, dontblock);
    340 		so->so_snd_wakeup = B_FALSE;
    341 	}
    342 	mutex_exit(&so->so_lock);
    343 
    344 	return (error);
    345 }
    346 
    347 void
    348 so_snd_qfull(struct sonode *so)
    349 {
    350 	mutex_enter(&so->so_lock);
    351 	so->so_snd_qfull = B_TRUE;
    352 	mutex_exit(&so->so_lock);
    353 }
    354 
    355 void
    356 so_snd_qnotfull(struct sonode *so)
    357 {
    358 	mutex_enter(&so->so_lock);
    359 	so->so_snd_qfull = B_FALSE;
    360 	/* wake up everyone waiting for buffers */
    361 	cv_broadcast(&so->so_snd_cv);
    362 	mutex_exit(&so->so_lock);
    363 }
    364 
    365 /*
    366  * Change the process/process group to which SIGIO is sent.
    367  */
    368 int
    369 socket_chgpgrp(struct sonode *so, pid_t pid)
    370 {
    371 	int error;
    372 
    373 	ASSERT(MUTEX_HELD(&so->so_lock));
    374 	if (pid != 0) {
    375 		/*
    376 		 * Permissions check by sending signal 0.
    377 		 * Note that when kill fails it does a
    378 		 * set_errno causing the system call to fail.
    379 		 */
    380 		error = kill(pid, 0);
    381 		if (error != 0) {
    382 			return (error);
    383 		}
    384 	}
    385 	so->so_pgrp = pid;
    386 	return (0);
    387 }
    388 
    389 
    390 /*
    391  * Generate a SIGIO, for 'writable' events include siginfo structure,
    392  * for read events just send the signal.
    393  */
    394 /*ARGSUSED*/
    395 static void
    396 socket_sigproc(proc_t *proc, int event)
    397 {
    398 	k_siginfo_t info;
    399 
    400 	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
    401 
    402 	if (event & SOCKETSIG_WRITE) {
    403 		info.si_signo = SIGPOLL;
    404 		info.si_code = POLL_OUT;
    405 		info.si_errno = 0;
    406 		info.si_fd = 0;
    407 		info.si_band = 0;
    408 		sigaddq(proc, NULL, &info, KM_NOSLEEP);
    409 	}
    410 	if (event & SOCKETSIG_READ) {
    411 		sigtoproc(proc, NULL, SIGPOLL);
    412 	}
    413 	if (event & SOCKETSIG_URG) {
    414 		sigtoproc(proc, NULL, SIGURG);
    415 	}
    416 }
    417 
    418 void
    419 socket_sendsig(struct sonode *so, int event)
    420 {
    421 	proc_t *proc;
    422 
    423 	ASSERT(MUTEX_HELD(&so->so_lock));
    424 
    425 	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
    426 	    event != SOCKETSIG_URG)) {
    427 		return;
    428 	}
    429 
    430 	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
    431 
    432 	if (so->so_pgrp > 0) {
    433 		/*
    434 		 * XXX This unfortunately still generates
    435 		 * a signal when a fd is closed but
    436 		 * the proc is active.
    437 		 */
    438 		mutex_enter(&pidlock);
    439 		proc = prfind(so->so_pgrp);
    440 		if (proc == NULL) {
    441 			mutex_exit(&pidlock);
    442 			return;
    443 		}
    444 		mutex_enter(&proc->p_lock);
    445 		mutex_exit(&pidlock);
    446 		socket_sigproc(proc, event);
    447 		mutex_exit(&proc->p_lock);
    448 	} else {
    449 		/*
    450 		 * Send to process group. Hold pidlock across
    451 		 * calls to socket_sigproc().
    452 		 */
    453 		pid_t pgrp = -so->so_pgrp;
    454 
    455 		mutex_enter(&pidlock);
    456 		proc = pgfind(pgrp);
    457 		while (proc != NULL) {
    458 			mutex_enter(&proc->p_lock);
    459 			socket_sigproc(proc, event);
    460 			mutex_exit(&proc->p_lock);
    461 			proc = proc->p_pglink;
    462 		}
    463 		mutex_exit(&pidlock);
    464 	}
    465 }
    466 
    467 #define	MIN(a, b) ((a) < (b) ? (a) : (b))
    468 /* Copy userdata into a new mblk_t */
    469 mblk_t *
    470 socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    471     size_t tail_len, int *errorp)
    472 {
    473 	mblk_t	*head = NULL, **tail = &head;
    474 
    475 	ASSERT(iosize == INFPSZ || iosize > 0);
    476 
    477 	if (iosize == INFPSZ || iosize > uiop->uio_resid)
    478 		iosize = uiop->uio_resid;
    479 
    480 	if (maxblk == INFPSZ)
    481 		maxblk = iosize;
    482 
    483 	/* Nothing to do in these cases, so we're done */
    484 	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
    485 		goto done;
    486 
    487 	/*
    488 	 * We will enter the loop below if iosize is 0; it will allocate an
    489 	 * empty message block and call uiomove(9F) which will just return.
    490 	 * We could avoid that with an extra check but would only slow
    491 	 * down the much more likely case where iosize is larger than 0.
    492 	 */
    493 	do {
    494 		ssize_t blocksize;
    495 		mblk_t	*mp;
    496 
    497 		blocksize = MIN(iosize, maxblk);
    498 		ASSERT(blocksize >= 0);
    499 		mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
    500 		if (mp == NULL) {
    501 			*errorp = ENOMEM;
    502 			return (head);
    503 		}
    504 		mp->b_rptr += wroff;
    505 		mp->b_wptr = mp->b_rptr + blocksize;
    506 
    507 		*tail = mp;
    508 		tail = &mp->b_cont;
    509 
    510 		/* uiomove(9F) either returns 0 or EFAULT */
    511 		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
    512 		    UIO_WRITE, uiop)) != 0) {
    513 			ASSERT(*errorp != ENOMEM);
    514 			freemsg(head);
    515 			return (NULL);
    516 		}
    517 
    518 		iosize -= blocksize;
    519 	} while (iosize > 0);
    520 
    521 done:
    522 	*errorp = 0;
    523 	return (head);
    524 }
    525 
    526 mblk_t *
    527 socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
    528 {
    529 	int error;
    530 	ptrdiff_t n;
    531 	mblk_t *nmp;
    532 
    533 	ASSERT(mp->b_wptr >= mp->b_rptr);
    534 
    535 	/*
    536 	 * max_read is the offset of the oobmark and read can not go pass
    537 	 * the oobmark.
    538 	 */
    539 	if (max_read == INFPSZ || max_read > uiop->uio_resid)
    540 		max_read = uiop->uio_resid;
    541 
    542 	do {
    543 		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
    544 			ASSERT(n > 0);
    545 
    546 			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
    547 			if (error != 0) {
    548 				freemsg(mp);
    549 				*errorp = error;
    550 				return (NULL);
    551 			}
    552 		}
    553 
    554 		mp->b_rptr += n;
    555 		max_read -= n;
    556 		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
    557 			/*
    558 			 * get rid of zero length mblks
    559 			 */
    560 			nmp = mp;
    561 			mp = mp->b_cont;
    562 			freeb(nmp);
    563 		}
    564 	} while (mp != NULL && max_read > 0);
    565 
    566 	*errorp = 0;
    567 	return (mp);
    568 }
    569 
    570 static void
    571 so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
    572 {
    573 	ASSERT(last_tail != NULL);
    574 	mp->b_next = so->so_rcv_q_head;
    575 	mp->b_prev = last_tail;
    576 	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
    577 
    578 	if (so->so_rcv_q_head == NULL) {
    579 		ASSERT(so->so_rcv_q_last_head == NULL);
    580 		so->so_rcv_q_last_head = mp;
    581 #ifdef DEBUG
    582 	} else {
    583 		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
    584 #endif
    585 	}
    586 	so->so_rcv_q_head = mp;
    587 
    588 #ifdef DEBUG
    589 	if (so_debug_length) {
    590 		mutex_enter(&so->so_lock);
    591 		ASSERT(so_check_length(so));
    592 		mutex_exit(&so->so_lock);
    593 	}
    594 #endif
    595 }
    596 
    597 /*
    598  * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
    599  * can be processed by so_dequeue_msg().
    600  */
    601 void
    602 so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
    603 {
    604 	ASSERT(mp_head->b_prev != NULL);
    605 	if (so->so_rcv_q_head  == NULL) {
    606 		so->so_rcv_q_head = mp_head;
    607 		so->so_rcv_q_last_head = mp_last_head;
    608 		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
    609 	} else {
    610 		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
    611 		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
    612 
    613 		if (mp_head->b_next == NULL &&
    614 		    DB_TYPE(mp_head) == M_DATA &&
    615 		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
    616 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
    617 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
    618 			mp_head->b_prev = NULL;
    619 		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
    620 			/*
    621 			 * Append to last_head if more than one mblks, and both
    622 			 * mp_head and last_head are I/OAT mblks.
    623 			 */
    624 			ASSERT(mp_head->b_next != NULL);
    625 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
    626 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
    627 			mp_head->b_prev = NULL;
    628 
    629 			so->so_rcv_q_last_head->b_next = mp_head->b_next;
    630 			mp_head->b_next = NULL;
    631 			so->so_rcv_q_last_head = mp_last_head;
    632 		} else {
    633 #ifdef DEBUG
    634 			{
    635 				mblk_t *tmp_mblk;
    636 				tmp_mblk = mp_head;
    637 				while (tmp_mblk != NULL) {
    638 					ASSERT(tmp_mblk->b_prev != NULL);
    639 					tmp_mblk = tmp_mblk->b_next;
    640 				}
    641 			}
    642 #endif
    643 			so->so_rcv_q_last_head->b_next = mp_head;
    644 			so->so_rcv_q_last_head = mp_last_head;
    645 		}
    646 	}
    647 }
    648 
    649 /*
    650  * Check flow control on a given sonode.  Must have so_lock held, and
    651  * this function will release the hold.
    652  */
    653 
    654 static void
    655 so_check_flow_control(struct sonode *so)
    656 {
    657 	ASSERT(MUTEX_HELD(&so->so_lock));
    658 
    659 	if (so->so_flowctrld && so->so_rcv_queued < so->so_rcvlowat) {
    660 		so->so_flowctrld = B_FALSE;
    661 		mutex_exit(&so->so_lock);
    662 		/*
    663 		 * Open up flow control. SCTP does not have any downcalls, and
    664 		 * it will clr flow ctrl in sosctp_recvmsg().
    665 		 */
    666 		if (so->so_downcalls != NULL &&
    667 		    so->so_downcalls->sd_clr_flowctrl != NULL) {
    668 			(*so->so_downcalls->sd_clr_flowctrl)
    669 			    (so->so_proto_handle);
    670 		}
    671 	} else {
    672 		mutex_exit(&so->so_lock);
    673 	}
    674 }
    675 
    676 int
    677 so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    678     rval_t *rvalp, int flags)
    679 {
    680 	mblk_t	*mp, *nmp;
    681 	mblk_t	*savemp, *savemptail;
    682 	mblk_t	*new_msg_head;
    683 	mblk_t	*new_msg_last_head;
    684 	mblk_t	*last_tail;
    685 	boolean_t partial_read;
    686 	boolean_t reset_atmark = B_FALSE;
    687 	int more = 0;
    688 	int error;
    689 	ssize_t oobmark;
    690 	sodirect_t *sodp = so->so_direct;
    691 
    692 	partial_read = B_FALSE;
    693 	*mctlp = NULL;
    694 again:
    695 	mutex_enter(&so->so_lock);
    696 again1:
    697 #ifdef DEBUG
    698 	if (so_debug_length) {
    699 		ASSERT(so_check_length(so));
    700 	}
    701 #endif
    702 	if (so->so_state & SS_RCVATMARK) {
    703 		/* Check whether the caller is OK to read past the mark */
    704 		if (flags & MSG_NOMARK) {
    705 			mutex_exit(&so->so_lock);
    706 			return (EWOULDBLOCK);
    707 		}
    708 		reset_atmark = B_TRUE;
    709 	}
    710 	/*
    711 	 * First move messages from the dump area to processing area
    712 	 */
    713 	if (sodp != NULL) {
    714 		if (sodp->sod_enabled) {
    715 			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
    716 				/* nothing to uioamove */
    717 				sodp = NULL;
    718 			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
    719 				sodp->sod_uioa.uioa_state &= UIOA_CLR;
    720 				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
    721 				/*
    722 				 * try to uioamove() the data that
    723 				 * has already queued.
    724 				 */
    725 				sod_uioa_so_init(so, sodp, uiop);
    726 			}
    727 		} else {
    728 			sodp = NULL;
    729 		}
    730 	}
    731 	new_msg_head = so->so_rcv_head;
    732 	new_msg_last_head = so->so_rcv_last_head;
    733 	so->so_rcv_head = NULL;
    734 	so->so_rcv_last_head = NULL;
    735 	oobmark = so->so_oobmark;
    736 	/*
    737 	 * We can release the lock as there can only be one reader
    738 	 */
    739 	mutex_exit(&so->so_lock);
    740 
    741 	if (new_msg_head != NULL) {
    742 		so_process_new_message(so, new_msg_head, new_msg_last_head);
    743 	}
    744 	savemp = savemptail = NULL;
    745 	rvalp->r_val1 = 0;
    746 	error = 0;
    747 	mp = so->so_rcv_q_head;
    748 
    749 	if (mp != NULL &&
    750 	    (so->so_rcv_timer_tid == 0 ||
    751 	    so->so_rcv_queued >= so->so_rcv_thresh)) {
    752 		partial_read = B_FALSE;
    753 
    754 		if (flags & MSG_PEEK) {
    755 			if ((nmp = dupmsg(mp)) == NULL &&
    756 			    (nmp = copymsg(mp)) == NULL) {
    757 				size_t size = msgsize(mp);
    758 
    759 				error = strwaitbuf(size, BPRI_HI);
    760 				if (error) {
    761 					return (error);
    762 				}
    763 				goto again;
    764 			}
    765 			mp = nmp;
    766 		} else {
    767 			ASSERT(mp->b_prev != NULL);
    768 			last_tail = mp->b_prev;
    769 			mp->b_prev = NULL;
    770 			so->so_rcv_q_head = mp->b_next;
    771 			if (so->so_rcv_q_head == NULL) {
    772 				so->so_rcv_q_last_head = NULL;
    773 			}
    774 			mp->b_next = NULL;
    775 		}
    776 
    777 		ASSERT(mctlp != NULL);
    778 		/*
    779 		 * First process PROTO or PCPROTO blocks, if any.
    780 		 */
    781 		if (DB_TYPE(mp) != M_DATA) {
    782 			*mctlp = mp;
    783 			savemp = mp;
    784 			savemptail = mp;
    785 			ASSERT(DB_TYPE(mp) == M_PROTO ||
    786 			    DB_TYPE(mp) == M_PCPROTO);
    787 			while (mp->b_cont != NULL &&
    788 			    DB_TYPE(mp->b_cont) != M_DATA) {
    789 				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
    790 				    DB_TYPE(mp->b_cont) == M_PCPROTO);
    791 				mp = mp->b_cont;
    792 				savemptail = mp;
    793 			}
    794 			mp = savemptail->b_cont;
    795 			savemptail->b_cont = NULL;
    796 		}
    797 
    798 		ASSERT(DB_TYPE(mp) == M_DATA);
    799 		/*
    800 		 * Now process DATA blocks, if any. Note that for sodirect
    801 		 * enabled socket, uio_resid can be 0.
    802 		 */
    803 		if (uiop->uio_resid >= 0) {
    804 			ssize_t copied = 0;
    805 
    806 			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
    807 				mutex_enter(&so->so_lock);
    808 				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
    809 				copied = sod_uioa_mblk(so, mp);
    810 				if (copied > 0)
    811 					partial_read = B_TRUE;
    812 				mutex_exit(&so->so_lock);
    813 				/* mark this mblk as processed */
    814 				mp = NULL;
    815 			} else {
    816 				ssize_t oldresid = uiop->uio_resid;
    817 
    818 				if (MBLKL(mp) < so_mblk_pull_len) {
    819 					if (pullupmsg(mp, -1) == 1) {
    820 						last_tail = mp;
    821 					}
    822 				}
    823 				/*
    824 				 * Can not read beyond the oobmark
    825 				 */
    826 				mp = socopyoutuio(mp, uiop,
    827 				    oobmark == 0 ? INFPSZ : oobmark, &error);
    828 				if (error != 0) {
    829 					freemsg(*mctlp);
    830 					*mctlp = NULL;
    831 					more = 0;
    832 					goto done;
    833 				}
    834 				ASSERT(oldresid >= uiop->uio_resid);
    835 				copied = oldresid - uiop->uio_resid;
    836 				if (oldresid > uiop->uio_resid)
    837 					partial_read = B_TRUE;
    838 			}
    839 			ASSERT(copied >= 0);
    840 			if (copied > 0 && !(flags & MSG_PEEK)) {
    841 				mutex_enter(&so->so_lock);
    842 				so->so_rcv_queued -= copied;
    843 				ASSERT(so->so_oobmark >= 0);
    844 				if (so->so_oobmark > 0) {
    845 					so->so_oobmark -= copied;
    846 					ASSERT(so->so_oobmark >= 0);
    847 					if (so->so_oobmark == 0) {
    848 						ASSERT(so->so_state &
    849 						    SS_OOBPEND);
    850 						so->so_oobmark = 0;
    851 						so->so_state |= SS_RCVATMARK;
    852 					}
    853 				}
    854 				/*
    855 				 * so_check_flow_control() will drop
    856 				 * so->so_lock.
    857 				 */
    858 				so_check_flow_control(so);
    859 			}
    860 		}
    861 		if (mp != NULL) { /* more data blocks in msg */
    862 			more |= MOREDATA;
    863 			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
    864 				if (flags & MSG_PEEK) {
    865 					freemsg(mp);
    866 				} else {
    867 					unsigned int msize = msgdsize(mp);
    868 
    869 					freemsg(mp);
    870 					mutex_enter(&so->so_lock);
    871 					so->so_rcv_queued -= msize;
    872 					/*
    873 					 * so_check_flow_control() will drop
    874 					 * so->so_lock.
    875 					 */
    876 					so_check_flow_control(so);
    877 				}
    878 			} else if (partial_read && !somsghasdata(mp)) {
    879 				/*
    880 				 * Avoid queuing a zero-length tail part of
    881 				 * a message. partial_read == 1 indicates that
    882 				 * we read some of the message.
    883 				 */
    884 				freemsg(mp);
    885 				more &= ~MOREDATA;
    886 			} else {
    887 				if (savemp != NULL &&
    888 				    (flags & MSG_DUPCTRL)) {
    889 					mblk_t *nmp;
    890 					/*
    891 					 * There should only be non data mblks
    892 					 */
    893 					ASSERT(DB_TYPE(savemp) != M_DATA &&
    894 					    DB_TYPE(savemptail) != M_DATA);
    895 try_again:
    896 					if ((nmp = dupmsg(savemp)) == NULL &&
    897 					    (nmp = copymsg(savemp)) == NULL) {
    898 
    899 						size_t size = msgsize(savemp);
    900 
    901 						error = strwaitbuf(size,
    902 						    BPRI_HI);
    903 						if (error != 0) {
    904 							/*
    905 							 * In case we
    906 							 * cannot copy
    907 							 * control data
    908 							 * free the remaining
    909 							 * data.
    910 							 */
    911 							freemsg(mp);
    912 							goto done;
    913 						}
    914 						goto try_again;
    915 					}
    916 
    917 					ASSERT(nmp != NULL);
    918 					ASSERT(DB_TYPE(nmp) != M_DATA);
    919 					savemptail->b_cont = mp;
    920 					*mctlp = nmp;
    921 					mp = savemp;
    922 				}
    923 				/*
    924 				 * putback mp
    925 				 */
    926 				so_prepend_msg(so, mp, last_tail);
    927 			}
    928 		}
    929 
    930 		/* fast check so_rcv_head if there is more data */
    931 		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
    932 		    *mctlp == NULL && uiop->uio_resid > 0 &&
    933 		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
    934 			goto again;
    935 		}
    936 	} else if (!partial_read) {
    937 		mutex_enter(&so->so_lock);
    938 		if (so->so_error != 0) {
    939 			error = sogeterr(so, !(flags & MSG_PEEK));
    940 			mutex_exit(&so->so_lock);
    941 			return (error);
    942 		}
    943 		/*
    944 		 * No pending data. Return right away for nonblocking
    945 		 * socket, otherwise sleep waiting for data.
    946 		 */
    947 		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
    948 			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
    949 			    (flags & MSG_DONTWAIT)) {
    950 				error = EWOULDBLOCK;
    951 			} else {
    952 				if (so->so_state & (SS_CLOSING |
    953 				    SS_FALLBACK_PENDING)) {
    954 					mutex_exit(&so->so_lock);
    955 					error = EINTR;
    956 					goto done;
    957 				}
    958 
    959 				if (so->so_rcv_head != NULL) {
    960 					goto again1;
    961 				}
    962 				so->so_rcv_wakeup = B_TRUE;
    963 				so->so_rcv_wanted = uiop->uio_resid;
    964 				if (so->so_rcvtimeo == 0) {
    965 					/*
    966 					 * Zero means disable timeout.
    967 					 */
    968 					error = cv_wait_sig(&so->so_rcv_cv,
    969 					    &so->so_lock);
    970 				} else {
    971 					error = cv_reltimedwait_sig(
    972 					    &so->so_rcv_cv, &so->so_lock,
    973 					    so->so_rcvtimeo, TR_CLOCK_TICK);
    974 				}
    975 				so->so_rcv_wakeup = B_FALSE;
    976 				so->so_rcv_wanted = 0;
    977 
    978 				if (error == 0) {
    979 					error = EINTR;
    980 				} else if (error == -1) {
    981 					error = EAGAIN;
    982 				} else {
    983 					goto again1;
    984 				}
    985 			}
    986 		}
    987 		mutex_exit(&so->so_lock);
    988 	}
    989 	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
    990 		/*
    991 		 * We are passed the mark, update state
    992 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
    993 		 * The draft Posix socket spec states that the mark should
    994 		 * not be cleared when peeking. We follow the latter.
    995 		 */
    996 		mutex_enter(&so->so_lock);
    997 		ASSERT(so_verify_oobstate(so));
    998 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
    999 		freemsg(so->so_oobmsg);
   1000 		so->so_oobmsg = NULL;
   1001 		ASSERT(so_verify_oobstate(so));
   1002 		mutex_exit(&so->so_lock);
   1003 	}
   1004 	ASSERT(so->so_rcv_wakeup == B_FALSE);
   1005 done:
   1006 	if (sodp != NULL) {
   1007 		mutex_enter(&so->so_lock);
   1008 		if (sodp->sod_enabled &&
   1009 		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
   1010 			SOD_UIOAFINI(sodp);
   1011 			if (sodp->sod_uioa.uioa_mbytes > 0) {
   1012 				ASSERT(so->so_rcv_q_head != NULL ||
   1013 				    so->so_rcv_head != NULL);
   1014 				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
   1015 				if (error == EWOULDBLOCK)
   1016 					error = 0;
   1017 			}
   1018 		}
   1019 		mutex_exit(&so->so_lock);
   1020 	}
   1021 #ifdef DEBUG
   1022 	if (so_debug_length) {
   1023 		mutex_enter(&so->so_lock);
   1024 		ASSERT(so_check_length(so));
   1025 		mutex_exit(&so->so_lock);
   1026 	}
   1027 #endif
   1028 	rvalp->r_val1 = more;
   1029 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
   1030 	return (error);
   1031 }
   1032 
   1033 /*
   1034  * Enqueue data from the protocol on the socket's rcv queue.
   1035  *
   1036  * We try to hook new M_DATA mblks onto an existing chain, however,
   1037  * that cannot be done if the existing chain has already been
   1038  * processed by I/OAT. Non-M_DATA mblks are just linked together via
   1039  * b_next. In all cases the b_prev of the enqueued mblk is set to
   1040  * point to the last mblk in its b_cont chain.
   1041  */
   1042 void
   1043 so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
   1044 {
   1045 	ASSERT(MUTEX_HELD(&so->so_lock));
   1046 
   1047 #ifdef DEBUG
   1048 	if (so_debug_length) {
   1049 		ASSERT(so_check_length(so));
   1050 	}
   1051 #endif
   1052 	so->so_rcv_queued += msg_size;
   1053 
   1054 	if (so->so_rcv_head == NULL) {
   1055 		ASSERT(so->so_rcv_last_head == NULL);
   1056 		so->so_rcv_head = mp;
   1057 		so->so_rcv_last_head = mp;
   1058 	} else if ((DB_TYPE(mp) == M_DATA &&
   1059 	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
   1060 	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
   1061 	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
   1062 		/* Added to the end */
   1063 		ASSERT(so->so_rcv_last_head != NULL);
   1064 		ASSERT(so->so_rcv_last_head->b_prev != NULL);
   1065 		so->so_rcv_last_head->b_prev->b_cont = mp;
   1066 	} else {
   1067 		/* Start a new end */
   1068 		so->so_rcv_last_head->b_next = mp;
   1069 		so->so_rcv_last_head = mp;
   1070 	}
   1071 	while (mp->b_cont != NULL)
   1072 		mp = mp->b_cont;
   1073 
   1074 	so->so_rcv_last_head->b_prev = mp;
   1075 #ifdef DEBUG
   1076 	if (so_debug_length) {
   1077 		ASSERT(so_check_length(so));
   1078 	}
   1079 #endif
   1080 }
   1081 
   1082 /*
   1083  * Return B_TRUE if there is data in the message, B_FALSE otherwise.
   1084  */
   1085 boolean_t
   1086 somsghasdata(mblk_t *mp)
   1087 {
   1088 	for (; mp; mp = mp->b_cont)
   1089 		if (mp->b_datap->db_type == M_DATA) {
   1090 			ASSERT(mp->b_wptr >= mp->b_rptr);
   1091 			if (mp->b_wptr > mp->b_rptr)
   1092 				return (B_TRUE);
   1093 		}
   1094 	return (B_FALSE);
   1095 }
   1096 
   1097 /*
   1098  * Flush the read side of sockfs.
   1099  *
   1100  * The caller must be sure that a reader is not already active when the
   1101  * buffer is being flushed.
   1102  */
   1103 void
   1104 so_rcv_flush(struct sonode *so)
   1105 {
   1106 	mblk_t  *mp;
   1107 
   1108 	ASSERT(MUTEX_HELD(&so->so_lock));
   1109 
   1110 	if (so->so_oobmsg != NULL) {
   1111 		freemsg(so->so_oobmsg);
   1112 		so->so_oobmsg = NULL;
   1113 		so->so_oobmark = 0;
   1114 		so->so_state &=
   1115 		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
   1116 	}
   1117 
   1118 	/*
   1119 	 * Free messages sitting in the send and recv queue
   1120 	 */
   1121 	while (so->so_rcv_q_head != NULL) {
   1122 		mp = so->so_rcv_q_head;
   1123 		so->so_rcv_q_head = mp->b_next;
   1124 		mp->b_next = mp->b_prev = NULL;
   1125 		freemsg(mp);
   1126 	}
   1127 	while (so->so_rcv_head != NULL) {
   1128 		mp = so->so_rcv_head;
   1129 		so->so_rcv_head = mp->b_next;
   1130 		mp->b_next = mp->b_prev = NULL;
   1131 		freemsg(mp);
   1132 	}
   1133 	so->so_rcv_queued = 0;
   1134 	so->so_rcv_q_head = NULL;
   1135 	so->so_rcv_q_last_head = NULL;
   1136 	so->so_rcv_head = NULL;
   1137 	so->so_rcv_last_head = NULL;
   1138 }
   1139 
   1140 /*
   1141  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
   1142  */
   1143 int
   1144 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
   1145     boolean_t oob_inline)
   1146 {
   1147 	mblk_t		*mp, *nmp;
   1148 	int		error;
   1149 
   1150 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
   1151 	    flags));
   1152 
   1153 	if (msg != NULL) {
   1154 		/*
   1155 		 * There is never any oob data with addresses or control since
   1156 		 * the T_EXDATA_IND does not carry any options.
   1157 		 */
   1158 		msg->msg_controllen = 0;
   1159 		msg->msg_namelen = 0;
   1160 		msg->msg_flags = 0;
   1161 	}
   1162 
   1163 	mutex_enter(&so->so_lock);
   1164 	ASSERT(so_verify_oobstate(so));
   1165 	if (oob_inline ||
   1166 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
   1167 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
   1168 		mutex_exit(&so->so_lock);
   1169 		return (EINVAL);
   1170 	}
   1171 	if (!(so->so_state & SS_HAVEOOBDATA)) {
   1172 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
   1173 		mutex_exit(&so->so_lock);
   1174 		return (EWOULDBLOCK);
   1175 	}
   1176 	ASSERT(so->so_oobmsg != NULL);
   1177 	mp = so->so_oobmsg;
   1178 	if (flags & MSG_PEEK) {
   1179 		/*
   1180 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
   1181 		 * Instead we revert to the consolidation private
   1182 		 * allocb_wait plus bcopy.
   1183 		 */
   1184 		mblk_t *mp1;
   1185 
   1186 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
   1187 		ASSERT(mp1);
   1188 
   1189 		while (mp != NULL) {
   1190 			ssize_t size;
   1191 
   1192 			size = MBLKL(mp);
   1193 			bcopy(mp->b_rptr, mp1->b_wptr, size);
   1194 			mp1->b_wptr += size;
   1195 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
   1196 			mp = mp->b_cont;
   1197 		}
   1198 		mp = mp1;
   1199 	} else {
   1200 		/*
   1201 		 * Update the state indicating that the data has been consumed.
   1202 		 * Keep SS_OOBPEND set until data is consumed past the mark.
   1203 		 */
   1204 		so->so_oobmsg = NULL;
   1205 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
   1206 	}
   1207 	ASSERT(so_verify_oobstate(so));
   1208 	mutex_exit(&so->so_lock);
   1209 
   1210 	error = 0;
   1211 	nmp = mp;
   1212 	while (nmp != NULL && uiop->uio_resid > 0) {
   1213 		ssize_t n = MBLKL(nmp);
   1214 
   1215 		n = MIN(n, uiop->uio_resid);
   1216 		if (n > 0)
   1217 			error = uiomove(nmp->b_rptr, n,
   1218 			    UIO_READ, uiop);
   1219 		if (error)
   1220 			break;
   1221 		nmp = nmp->b_cont;
   1222 	}
   1223 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
   1224 	freemsg(mp);
   1225 	return (error);
   1226 }
   1227 
   1228 /*
   1229  * Allocate and initializ sonode
   1230  */
   1231 /* ARGSUSED */
   1232 struct sonode *
   1233 socket_sonode_create(struct sockparams *sp, int family, int type,
   1234     int protocol, int version, int sflags, int *errorp, struct cred *cr)
   1235 {
   1236 	sonode_t *so;
   1237 	int	kmflags;
   1238 
   1239 	/*
   1240 	 * Choose the right set of sonodeops based on the upcall and
   1241 	 * down call version that the protocol has provided
   1242 	 */
   1243 	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
   1244 	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
   1245 		/*
   1246 		 * mismatch
   1247 		 */
   1248 #ifdef DEBUG
   1249 		cmn_err(CE_CONT, "protocol and socket module version mismatch");
   1250 #endif
   1251 		*errorp = EINVAL;
   1252 		return (NULL);
   1253 	}
   1254 
   1255 	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
   1256 
   1257 	so = kmem_cache_alloc(socket_cache, kmflags);
   1258 	if (so == NULL) {
   1259 		*errorp = ENOMEM;
   1260 		return (NULL);
   1261 	}
   1262 
   1263 	sonode_init(so, sp, family, type, protocol, &so_sonodeops);
   1264 
   1265 	if (version == SOV_DEFAULT)
   1266 		version = so_default_version;
   1267 
   1268 	so->so_version = (short)version;
   1269 
   1270 	/*
   1271 	 * set the default values to be INFPSZ
   1272 	 * if a protocol desires it can change the value later
   1273 	 */
   1274 	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
   1275 	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
   1276 	so->so_proto_props.sopp_maxpsz = INFPSZ;
   1277 	so->so_proto_props.sopp_maxblk = INFPSZ;
   1278 
   1279 	return (so);
   1280 }
   1281 
   1282 int
   1283 socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
   1284 {
   1285 	int error = 0;
   1286 
   1287 	if (pso != NULL) {
   1288 		/*
   1289 		 * We have a passive open, so inherit basic state from
   1290 		 * the parent (listener).
   1291 		 *
   1292 		 * No need to grab the new sonode's lock, since there is no
   1293 		 * one that can have a reference to it.
   1294 		 */
   1295 		mutex_enter(&pso->so_lock);
   1296 
   1297 		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
   1298 		so->so_pgrp = pso->so_pgrp;
   1299 		so->so_rcvtimeo = pso->so_rcvtimeo;
   1300 		so->so_sndtimeo = pso->so_sndtimeo;
   1301 		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
   1302 		/*
   1303 		 * Make note of the socket level options. TCP and IP level
   1304 		 * options are already inherited. We could do all this after
   1305 		 * accept is successful but doing it here simplifies code and
   1306 		 * no harm done for error case.
   1307 		 */
   1308 		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
   1309 		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
   1310 		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
   1311 		so->so_proto_props = pso->so_proto_props;
   1312 		so->so_mode = pso->so_mode;
   1313 		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
   1314 
   1315 		mutex_exit(&pso->so_lock);
   1316 	} else {
   1317 		struct sockparams *sp = so->so_sockparams;
   1318 		sock_upcalls_t *upcalls_to_use;
   1319 
   1320 		/*
   1321 		 * Based on the version number select the right upcalls to
   1322 		 * pass down. Currently we only have one version so choose
   1323 		 * default
   1324 		 */
   1325 		upcalls_to_use = &so_upcalls;
   1326 
   1327 		/* active open, so create a lower handle */
   1328 		so->so_proto_handle =
   1329 		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
   1330 		    so->so_type, so->so_protocol, &so->so_downcalls,
   1331 		    &so->so_mode, &error, flags, cr);
   1332 
   1333 		if (so->so_proto_handle == NULL) {
   1334 			ASSERT(error != 0);
   1335 			/*
   1336 			 * To be safe; if a lower handle cannot be created, and
   1337 			 * the proto does not give a reason why, assume there
   1338 			 * was a lack of memory.
   1339 			 */
   1340 			return ((error == 0) ? ENOMEM : error);
   1341 		}
   1342 		ASSERT(so->so_downcalls != NULL);
   1343 		ASSERT(so->so_downcalls->sd_send != NULL ||
   1344 		    so->so_downcalls->sd_send_uio != NULL);
   1345 		if (so->so_downcalls->sd_recv_uio != NULL) {
   1346 			ASSERT(so->so_downcalls->sd_poll != NULL);
   1347 			so->so_pollev |= SO_POLLEV_ALWAYS;
   1348 		}
   1349 
   1350 		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
   1351 		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
   1352 
   1353 		/* Wildcard */
   1354 
   1355 		/*
   1356 		 * FIXME No need for this, the protocol can deal with it in
   1357 		 * sd_create(). Should update ICMP.
   1358 		 */
   1359 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
   1360 			int protocol = so->so_protocol;
   1361 			int error;
   1362 			/*
   1363 			 * Issue SO_PROTOTYPE setsockopt.
   1364 			 */
   1365 			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
   1366 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
   1367 			if (error) {
   1368 				(void) (*so->so_downcalls->sd_close)
   1369 				    (so->so_proto_handle, 0, cr);
   1370 
   1371 				mutex_enter(&so->so_lock);
   1372 				so_rcv_flush(so);
   1373 				mutex_exit(&so->so_lock);
   1374 				/*
   1375 				 * Setsockopt often fails with ENOPROTOOPT but
   1376 				 * socket() should fail with
   1377 				 * EPROTONOSUPPORT/EPROTOTYPE.
   1378 				 */
   1379 				return (EPROTONOSUPPORT);
   1380 			}
   1381 		}
   1382 	}
   1383 
   1384 	if (uioasync.enabled)
   1385 		sod_sock_init(so);
   1386 
   1387 	return (0);
   1388 }
   1389 
   1390 /*
   1391  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
   1392  *         struct cred *cr, int32_t *rvalp)
   1393  *
   1394  * Handle ioctls that manipulate basic socket state; non-blocking,
   1395  * async, etc.
   1396  *
   1397  * Returns:
   1398  *   < 0  - ioctl was not handle
   1399  *  >= 0  - ioctl was handled, if > 0, then it is an errno
   1400  *
   1401  * Notes:
   1402  *   Assumes the standard receive buffer is used to obtain info for
   1403  *   NREAD.
   1404  */
   1405 /* ARGSUSED */
   1406 int
   1407 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
   1408     struct cred *cr, int32_t *rvalp)
   1409 {
   1410 	switch (cmd) {
   1411 	case SIOCSQPTR:
   1412 		/*
   1413 		 * SIOCSQPTR is valid only when helper stream is created
   1414 		 * by the protocol.
   1415 		 */
   1416 
   1417 		return (EOPNOTSUPP);
   1418 	case FIONBIO: {
   1419 		int32_t value;
   1420 
   1421 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
   1422 		    (mode & (int)FKIOCTL)))
   1423 			return (EFAULT);
   1424 
   1425 		mutex_enter(&so->so_lock);
   1426 		if (value) {
   1427 			so->so_state |= SS_NDELAY;
   1428 		} else {
   1429 			so->so_state &= ~SS_NDELAY;
   1430 		}
   1431 		mutex_exit(&so->so_lock);
   1432 		return (0);
   1433 	}
   1434 	case FIOASYNC: {
   1435 		int32_t value;
   1436 
   1437 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
   1438 		    (mode & (int)FKIOCTL)))
   1439 			return (EFAULT);
   1440 
   1441 		mutex_enter(&so->so_lock);
   1442 
   1443 		if (value) {
   1444 			/* Turn on SIGIO */
   1445 			so->so_state |= SS_ASYNC;
   1446 		} else {
   1447 			/* Turn off SIGIO */
   1448 			so->so_state &= ~SS_ASYNC;
   1449 		}
   1450 		mutex_exit(&so->so_lock);
   1451 
   1452 		return (0);
   1453 	}
   1454 
   1455 	case SIOCSPGRP:
   1456 	case FIOSETOWN: {
   1457 		int error;
   1458 		pid_t pid;
   1459 
   1460 		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
   1461 		    (mode & (int)FKIOCTL)))
   1462 			return (EFAULT);
   1463 
   1464 		mutex_enter(&so->so_lock);
   1465 		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
   1466 		mutex_exit(&so->so_lock);
   1467 		return (error);
   1468 	}
   1469 	case SIOCGPGRP:
   1470 	case FIOGETOWN:
   1471 		if (so_copyout(&so->so_pgrp, (void *)arg,
   1472 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
   1473 			return (EFAULT);
   1474 
   1475 		return (0);
   1476 	case SIOCATMARK: {
   1477 		int retval;
   1478 
   1479 		/*
   1480 		 * Only protocols that support urgent data can handle ATMARK.
   1481 		 */
   1482 		if ((so->so_mode & SM_EXDATA) == 0)
   1483 			return (EINVAL);
   1484 
   1485 		/*
   1486 		 * If the protocol is maintaining its own buffer, then the
   1487 		 * request must be passed down.
   1488 		 */
   1489 		if (so->so_downcalls->sd_recv_uio != NULL)
   1490 			return (-1);
   1491 
   1492 		retval = (so->so_state & SS_RCVATMARK) != 0;
   1493 
   1494 		if (so_copyout(&retval, (void *)arg, sizeof (int),
   1495 		    (mode & (int)FKIOCTL))) {
   1496 			return (EFAULT);
   1497 		}
   1498 		return (0);
   1499 	}
   1500 
   1501 	case FIONREAD: {
   1502 		int retval;
   1503 
   1504 		/*
   1505 		 * If the protocol is maintaining its own buffer, then the
   1506 		 * request must be passed down.
   1507 		 */
   1508 		if (so->so_downcalls->sd_recv_uio != NULL)
   1509 			return (-1);
   1510 
   1511 		retval = MIN(so->so_rcv_queued, INT_MAX);
   1512 
   1513 		if (so_copyout(&retval, (void *)arg,
   1514 		    sizeof (retval), (mode & (int)FKIOCTL))) {
   1515 			return (EFAULT);
   1516 		}
   1517 		return (0);
   1518 	}
   1519 
   1520 	case _I_GETPEERCRED: {
   1521 		int error = 0;
   1522 
   1523 		if ((mode & FKIOCTL) == 0)
   1524 			return (EINVAL);
   1525 
   1526 		mutex_enter(&so->so_lock);
   1527 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
   1528 			error = ENOTSUP;
   1529 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
   1530 			error = ENOTCONN;
   1531 		} else if (so->so_peercred != NULL) {
   1532 			k_peercred_t *kp = (k_peercred_t *)arg;
   1533 			kp->pc_cr = so->so_peercred;
   1534 			kp->pc_cpid = so->so_cpid;
   1535 			crhold(so->so_peercred);
   1536 		} else {
   1537 			error = EINVAL;
   1538 		}
   1539 		mutex_exit(&so->so_lock);
   1540 		return (error);
   1541 	}
   1542 	default:
   1543 		return (-1);
   1544 	}
   1545 }
   1546 
   1547 /*
   1548  * Handle the I_NREAD STREAM ioctl.
   1549  */
   1550 static int
   1551 so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
   1552 {
   1553 	size_t size = 0;
   1554 	int retval;
   1555 	int count = 0;
   1556 	mblk_t *mp;
   1557 	clock_t wakeup = drv_usectohz(10);
   1558 
   1559 	if (so->so_downcalls == NULL ||
   1560 	    so->so_downcalls->sd_recv_uio != NULL)
   1561 		return (EINVAL);
   1562 
   1563 	mutex_enter(&so->so_lock);
   1564 	/* Wait for reader to get out of the way. */
   1565 	while (so->so_flag & SOREADLOCKED) {
   1566 		/*
   1567 		 * If reader is waiting for data, then there should be nothing
   1568 		 * on the rcv queue.
   1569 		 */
   1570 		if (so->so_rcv_wakeup)
   1571 			goto out;
   1572 
   1573 		so->so_flag |= SOWANT;
   1574 		/* Do a timed sleep, in case the reader goes to sleep. */
   1575 		(void) cv_reltimedwait(&so->so_state_cv, &so->so_lock, wakeup,
   1576 		    TR_CLOCK_TICK);
   1577 	}
   1578 
   1579 	/*
   1580 	 * Since we are holding so_lock no new reader will come in, and the
   1581 	 * protocol will not be able to enqueue data. So it's safe to walk
   1582 	 * both rcv queues.
   1583 	 */
   1584 	mp = so->so_rcv_q_head;
   1585 	if (mp != NULL) {
   1586 		size = msgdsize(so->so_rcv_q_head);
   1587 		for (; mp != NULL; mp = mp->b_next)
   1588 			count++;
   1589 	} else {
   1590 		/*
   1591 		 * In case the processing list was empty, get the size of the
   1592 		 * next msg in line.
   1593 		 */
   1594 		size = msgdsize(so->so_rcv_head);
   1595 	}
   1596 
   1597 	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
   1598 		count++;
   1599 out:
   1600 	mutex_exit(&so->so_lock);
   1601 
   1602 	/*
   1603 	 * Drop down from size_t to the "int" required by the
   1604 	 * interface.  Cap at INT_MAX.
   1605 	 */
   1606 	retval = MIN(size, INT_MAX);
   1607 	if (so_copyout(&retval, (void *)arg, sizeof (retval),
   1608 	    (mode & (int)FKIOCTL))) {
   1609 		return (EFAULT);
   1610 	} else {
   1611 		*rvalp = count;
   1612 		return (0);
   1613 	}
   1614 }
   1615 
   1616 /*
   1617  * Process STREAM ioctls.
   1618  *
   1619  * Returns:
   1620  *   < 0  - ioctl was not handle
   1621  *  >= 0  - ioctl was handled, if > 0, then it is an errno
   1622  */
   1623 int
   1624 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
   1625     struct cred *cr, int32_t *rvalp)
   1626 {
   1627 	int retval;
   1628 
   1629 	/* Only STREAM iotcls are handled here */
   1630 	if ((cmd & 0xffffff00U) != STR)
   1631 		return (-1);
   1632 
   1633 	switch (cmd) {
   1634 	case I_CANPUT:
   1635 		/*
   1636 		 * We return an error for I_CANPUT so that isastream(3C) will
   1637 		 * not report the socket as being a STREAM.
   1638 		 */
   1639 		return (EOPNOTSUPP);
   1640 	case I_NREAD:
   1641 		/* Avoid doing a fallback for I_NREAD. */
   1642 		return (so_strioc_nread(so, arg, mode, rvalp));
   1643 	case I_LOOK:
   1644 		/* Avoid doing a fallback for I_LOOK. */
   1645 		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
   1646 		    (mode & (int)FKIOCTL))) {
   1647 			return (EFAULT);
   1648 		}
   1649 		return (0);
   1650 	default:
   1651 		break;
   1652 	}
   1653 
   1654 	/*
   1655 	 * Try to fall back to TPI, and if successful, reissue the ioctl.
   1656 	 */
   1657 	if ((retval = so_tpi_fallback(so, cr)) == 0) {
   1658 		/* Reissue the ioctl */
   1659 		ASSERT(so->so_rcv_q_head == NULL);
   1660 		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
   1661 	} else {
   1662 		return (retval);
   1663 	}
   1664 }
   1665 
   1666 /*
   1667  * This is called for all socket types to verify that the buffer size is large
   1668  * enough for the option, and if we can, handle the request as well. Most
   1669  * options will be forwarded to the protocol.
   1670  */
   1671 int
   1672 socket_getopt_common(struct sonode *so, int level, int option_name,
   1673     void *optval, socklen_t *optlenp, int flags)
   1674 {
   1675 	if (level != SOL_SOCKET)
   1676 		return (-1);
   1677 
   1678 	switch (option_name) {
   1679 	case SO_ERROR:
   1680 	case SO_DOMAIN:
   1681 	case SO_TYPE:
   1682 	case SO_ACCEPTCONN: {
   1683 		int32_t value;
   1684 		socklen_t optlen = *optlenp;
   1685 
   1686 		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
   1687 			return (EINVAL);
   1688 		}
   1689 
   1690 		switch (option_name) {
   1691 		case SO_ERROR:
   1692 			mutex_enter(&so->so_lock);
   1693 			value = sogeterr(so, B_TRUE);
   1694 			mutex_exit(&so->so_lock);
   1695 			break;
   1696 		case SO_DOMAIN:
   1697 			value = so->so_family;
   1698 			break;
   1699 		case SO_TYPE:
   1700 			value = so->so_type;
   1701 			break;
   1702 		case SO_ACCEPTCONN:
   1703 			if (so->so_state & SS_ACCEPTCONN)
   1704 				value = SO_ACCEPTCONN;
   1705 			else
   1706 				value = 0;
   1707 			break;
   1708 		}
   1709 
   1710 		bcopy(&value, optval, sizeof (value));
   1711 		*optlenp = sizeof (value);
   1712 
   1713 		return (0);
   1714 	}
   1715 	case SO_SNDTIMEO:
   1716 	case SO_RCVTIMEO: {
   1717 		clock_t value;
   1718 		socklen_t optlen = *optlenp;
   1719 
   1720 		if (get_udatamodel() == DATAMODEL_NONE ||
   1721 		    get_udatamodel() == DATAMODEL_NATIVE) {
   1722 			if (optlen < sizeof (struct timeval))
   1723 				return (EINVAL);
   1724 		} else {
   1725 			if (optlen < sizeof (struct timeval32))
   1726 				return (EINVAL);
   1727 		}
   1728 		if (option_name == SO_RCVTIMEO)
   1729 			value = drv_hztousec(so->so_rcvtimeo);
   1730 		else
   1731 			value = drv_hztousec(so->so_sndtimeo);
   1732 
   1733 		if (get_udatamodel() == DATAMODEL_NONE ||
   1734 		    get_udatamodel() == DATAMODEL_NATIVE) {
   1735 			((struct timeval *)(optval))->tv_sec =
   1736 			    value / (1000 * 1000);
   1737 			((struct timeval *)(optval))->tv_usec =
   1738 			    value % (1000 * 1000);
   1739 			*optlenp = sizeof (struct timeval);
   1740 		} else {
   1741 			((struct timeval32 *)(optval))->tv_sec =
   1742 			    value / (1000 * 1000);
   1743 			((struct timeval32 *)(optval))->tv_usec =
   1744 			    value % (1000 * 1000);
   1745 			*optlenp = sizeof (struct timeval32);
   1746 		}
   1747 		return (0);
   1748 	}
   1749 	case SO_DEBUG:
   1750 	case SO_REUSEADDR:
   1751 	case SO_KEEPALIVE:
   1752 	case SO_DONTROUTE:
   1753 	case SO_BROADCAST:
   1754 	case SO_USELOOPBACK:
   1755 	case SO_OOBINLINE:
   1756 	case SO_SNDBUF:
   1757 #ifdef notyet
   1758 	case SO_SNDLOWAT:
   1759 	case SO_RCVLOWAT:
   1760 #endif /* notyet */
   1761 	case SO_DGRAM_ERRIND: {
   1762 		socklen_t optlen = *optlenp;
   1763 
   1764 		if (optlen < (t_uscalar_t)sizeof (int32_t))
   1765 			return (EINVAL);
   1766 		break;
   1767 	}
   1768 	case SO_RCVBUF: {
   1769 		socklen_t optlen = *optlenp;
   1770 
   1771 		if (optlen < (t_uscalar_t)sizeof (int32_t))
   1772 			return (EINVAL);
   1773 
   1774 		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
   1775 			/*
   1776 			 * XXX If SO_RCVBUF has been set and this is an
   1777 			 * XPG 4.2 application then do not ask the transport
   1778 			 * since the transport might adjust the value and not
   1779 			 * return exactly what was set by the application.
   1780 			 * For non-XPG 4.2 application we return the value
   1781 			 * that the transport is actually using.
   1782 			 */
   1783 			*(int32_t *)optval = so->so_xpg_rcvbuf;
   1784 			*optlenp = sizeof (so->so_xpg_rcvbuf);
   1785 			return (0);
   1786 		}
   1787 		/*
   1788 		 * If the option has not been set then get a default
   1789 		 * value from the transport.
   1790 		 */
   1791 		break;
   1792 	}
   1793 	case SO_LINGER: {
   1794 		socklen_t optlen = *optlenp;
   1795 
   1796 		if (optlen < (t_uscalar_t)sizeof (struct linger))
   1797 			return (EINVAL);
   1798 		break;
   1799 	}
   1800 	case SO_SND_BUFINFO: {
   1801 		socklen_t optlen = *optlenp;
   1802 
   1803 		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
   1804 			return (EINVAL);
   1805 		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
   1806 		    (so->so_proto_props).sopp_wroff;
   1807 		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
   1808 		    (so->so_proto_props).sopp_maxblk;
   1809 		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
   1810 		    (so->so_proto_props).sopp_maxpsz;
   1811 		((struct so_snd_bufinfo *)(optval))->sbi_tail =
   1812 		    (so->so_proto_props).sopp_tail;
   1813 		*optlenp = sizeof (struct so_snd_bufinfo);
   1814 		return (0);
   1815 	}
   1816 	default:
   1817 		break;
   1818 	}
   1819 
   1820 	/* Unknown Option */
   1821 	return (-1);
   1822 }
   1823 
   1824 void
   1825 socket_sonode_destroy(struct sonode *so)
   1826 {
   1827 	sonode_fini(so);
   1828 	kmem_cache_free(socket_cache, so);
   1829 }
   1830 
   1831 int
   1832 so_zcopy_wait(struct sonode *so)
   1833 {
   1834 	int error = 0;
   1835 
   1836 	mutex_enter(&so->so_lock);
   1837 	while (!(so->so_copyflag & STZCNOTIFY)) {
   1838 		if (so->so_state & SS_CLOSING) {
   1839 			mutex_exit(&so->so_lock);
   1840 			return (EINTR);
   1841 		}
   1842 		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
   1843 			error = EINTR;
   1844 			break;
   1845 		}
   1846 	}
   1847 	so->so_copyflag &= ~STZCNOTIFY;
   1848 	mutex_exit(&so->so_lock);
   1849 	return (error);
   1850 }
   1851 
   1852 void
   1853 so_timer_callback(void *arg)
   1854 {
   1855 	struct sonode *so = (struct sonode *)arg;
   1856 
   1857 	mutex_enter(&so->so_lock);
   1858 
   1859 	so->so_rcv_timer_tid = 0;
   1860 	if (so->so_rcv_queued > 0) {
   1861 		so_notify_data(so, so->so_rcv_queued);
   1862 	} else {
   1863 		mutex_exit(&so->so_lock);
   1864 	}
   1865 }
   1866 
   1867 #ifdef DEBUG
   1868 /*
   1869  * Verify that the length stored in so_rcv_queued and the length of data blocks
   1870  * queued is same.
   1871  */
   1872 static boolean_t
   1873 so_check_length(sonode_t *so)
   1874 {
   1875 	mblk_t *mp = so->so_rcv_q_head;
   1876 	int len = 0;
   1877 
   1878 	ASSERT(MUTEX_HELD(&so->so_lock));
   1879 
   1880 	if (mp != NULL) {
   1881 		len = msgdsize(mp);
   1882 		while ((mp = mp->b_next) != NULL)
   1883 			len += msgdsize(mp);
   1884 	}
   1885 	mp = so->so_rcv_head;
   1886 	if (mp != NULL) {
   1887 		len += msgdsize(mp);
   1888 		while ((mp = mp->b_next) != NULL)
   1889 			len += msgdsize(mp);
   1890 	}
   1891 	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
   1892 }
   1893 #endif
   1894 
   1895 int
   1896 so_get_mod_version(struct sockparams *sp)
   1897 {
   1898 	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
   1899 	return (sp->sp_smod_info->smod_version);
   1900 }
   1901 
   1902 /*
   1903  * so_start_fallback()
   1904  *
   1905  * Block new socket operations from coming in, and wait for active operations
   1906  * to complete. Threads that are sleeping will be woken up so they can get
   1907  * out of the way.
   1908  *
   1909  * The caller must be a reader on so_fallback_rwlock.
   1910  */
   1911 static boolean_t
   1912 so_start_fallback(struct sonode *so)
   1913 {
   1914 	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
   1915 
   1916 	mutex_enter(&so->so_lock);
   1917 	if (so->so_state & SS_FALLBACK_PENDING) {
   1918 		mutex_exit(&so->so_lock);
   1919 		return (B_FALSE);
   1920 	}
   1921 	so->so_state |= SS_FALLBACK_PENDING;
   1922 	/*
   1923 	 * Poke all threads that might be sleeping. Any operation that comes
   1924 	 * in after the cv_broadcast will observe the fallback pending flag
   1925 	 * which cause the call to return where it would normally sleep.
   1926 	 */
   1927 	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
   1928 	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
   1929 	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
   1930 	mutex_enter(&so->so_acceptq_lock);
   1931 	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
   1932 	mutex_exit(&so->so_acceptq_lock);
   1933 	mutex_exit(&so->so_lock);
   1934 
   1935 	/*
   1936 	 * The main reason for the rw_tryupgrade call is to provide
   1937 	 * observability during the fallback process. We want to
   1938 	 * be able to see if there are pending operations.
   1939 	 */
   1940 	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
   1941 		/*
   1942 		 * It is safe to drop and reaquire the fallback lock, because
   1943 		 * we are guaranteed that another fallback cannot take place.
   1944 		 */
   1945 		rw_exit(&so->so_fallback_rwlock);
   1946 		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
   1947 		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
   1948 		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
   1949 	}
   1950 
   1951 	return (B_TRUE);
   1952 }
   1953 
   1954 /*
   1955  * so_end_fallback()
   1956  *
   1957  * Allow socket opertions back in.
   1958  *
   1959  * The caller must be a writer on so_fallback_rwlock.
   1960  */
   1961 static void
   1962 so_end_fallback(struct sonode *so)
   1963 {
   1964 	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
   1965 
   1966 	mutex_enter(&so->so_lock);
   1967 	so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
   1968 	mutex_exit(&so->so_lock);
   1969 
   1970 	rw_downgrade(&so->so_fallback_rwlock);
   1971 }
   1972 
   1973 /*
   1974  * so_quiesced_cb()
   1975  *
   1976  * Callback passed to the protocol during fallback. It is called once
   1977  * the endpoint is quiescent.
   1978  *
   1979  * No requests from the user, no notifications from the protocol, so it
   1980  * is safe to synchronize the state. Data can also be moved without
   1981  * risk for reordering.
   1982  *
   1983  * We do not need to hold so_lock, since there can be only one thread
   1984  * operating on the sonode.
   1985  */
   1986 static void
   1987 so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
   1988     struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
   1989     struct sockaddr *faddr, socklen_t faddrlen, short opts)
   1990 {
   1991 	struct sonode *so = (struct sonode *)sock_handle;
   1992 	boolean_t atmark;
   1993 
   1994 	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
   1995 
   1996 	/*
   1997 	 * Some protocols do not quiece the data path during fallback. Once
   1998 	 * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
   1999 	 * fail and the protocol is responsible for saving the data for later
   2000 	 * delivery (i.e., once the fallback has completed).
   2001 	 */
   2002 	mutex_enter(&so->so_lock);
   2003 	so->so_state |= SS_FALLBACK_DRAIN;
   2004 	SOCKET_TIMER_CANCEL(so);
   2005 	mutex_exit(&so->so_lock);
   2006 
   2007 	if (so->so_rcv_head != NULL) {
   2008 		if (so->so_rcv_q_last_head == NULL)
   2009 			so->so_rcv_q_head = so->so_rcv_head;
   2010 		else
   2011 			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
   2012 		so->so_rcv_q_last_head = so->so_rcv_last_head;
   2013 	}
   2014 
   2015 	atmark = (so->so_state & SS_RCVATMARK) != 0;
   2016 	/*
   2017 	 * Clear any OOB state having to do with pending data. The TPI
   2018 	 * code path will set the appropriate oob state when we move the
   2019 	 * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
   2020 	 * data has already been consumed.
   2021 	 */
   2022 	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
   2023 
   2024 	ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
   2025 
   2026 	/*
   2027 	 * Move data to the STREAM head.
   2028 	 */
   2029 	while (so->so_rcv_q_head != NULL) {
   2030 		mblk_t *mp = so->so_rcv_q_head;
   2031 		size_t mlen = msgdsize(mp);
   2032 
   2033 		so->so_rcv_q_head = mp->b_next;
   2034 		mp->b_next = NULL;
   2035 		mp->b_prev = NULL;
   2036 
   2037 		/*
   2038 		 * Send T_EXDATA_IND if we are at the oob mark.
   2039 		 */
   2040 		if (atmark) {
   2041 			struct T_exdata_ind *tei;
   2042 			mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;
   2043 
   2044 			SOTOTPI(so)->sti_exdata_mp = NULL;
   2045 			ASSERT(mp1 != NULL);
   2046 			mp1->b_datap->db_type = M_PROTO;
   2047 			tei = (struct T_exdata_ind *)mp1->b_rptr;
   2048 			tei->PRIM_type = T_EXDATA_IND;
   2049 			tei->MORE_flag = 0;
   2050 			mp1->b_wptr = (uchar_t *)&tei[1];
   2051 
   2052 			if (IS_SO_OOB_INLINE(so)) {
   2053 				mp1->b_cont = mp;
   2054 			} else {
   2055 				ASSERT(so->so_oobmsg != NULL);
   2056 				mp1->b_cont = so->so_oobmsg;
   2057 				so->so_oobmsg = NULL;
   2058 
   2059 				/* process current mp next time around */
   2060 				mp->b_next = so->so_rcv_q_head;
   2061 				so->so_rcv_q_head = mp;
   2062 				mlen = 0;
   2063 			}
   2064 			mp = mp1;
   2065 
   2066 			/* we have consumed the oob mark */
   2067 			atmark = B_FALSE;
   2068 		} else if (so->so_oobmark > 0) {
   2069 			/*
   2070 			 * Check if the OOB mark is within the current
   2071 			 * mblk chain. In that case we have to split it up.
   2072 			 */
   2073 			if (so->so_oobmark < mlen) {
   2074 				mblk_t *urg_mp = mp;
   2075 
   2076 				atmark = B_TRUE;
   2077 				mp = NULL;
   2078 				mlen = so->so_oobmark;
   2079 
   2080 				/*
   2081 				 * It is assumed that the OOB mark does
   2082 				 * not land within a mblk.
   2083 				 */
   2084 				do {
   2085 					so->so_oobmark -= MBLKL(urg_mp);
   2086 					mp = urg_mp;
   2087 					urg_mp = urg_mp->b_cont;
   2088 				} while (so->so_oobmark > 0);
   2089 				mp->b_cont = NULL;
   2090 				if (urg_mp != NULL) {
   2091 					urg_mp->b_next = so->so_rcv_q_head;
   2092 					so->so_rcv_q_head = urg_mp;
   2093 				}
   2094 			} else {
   2095 				so->so_oobmark -= mlen;
   2096 				if (so->so_oobmark == 0)
   2097 					atmark = B_TRUE;
   2098 			}
   2099 		}
   2100 
   2101 		/*
   2102 		 * Queue data on the STREAM head.
   2103 		 */
   2104 		so->so_rcv_queued -= mlen;
   2105 		putnext(q, mp);
   2106 	}
   2107 	so->so_rcv_head = NULL;
   2108 	so->so_rcv_last_head = NULL;
   2109 	so->so_rcv_q_head = NULL;
   2110 	so->so_rcv_q_last_head = NULL;
   2111 
   2112 	/*
   2113 	 * Check if the oob byte is at the end of the data stream, or if the
   2114 	 * oob byte has not yet arrived. In the latter case we have to send a
   2115 	 * SIGURG and a mark indicator to the STREAM head. The mark indicator
   2116 	 * is needed to guarantee correct behavior for SIOCATMARK. See block
   2117 	 * comment in socktpi.h for more details.
   2118 	 */
   2119 	if (atmark || so->so_oobmark > 0) {
   2120 		mblk_t *mp;
   2121 
   2122 		if (atmark && so->so_oobmsg != NULL) {
   2123 			struct T_exdata_ind *tei;
   2124 
   2125 			mp = SOTOTPI(so)->sti_exdata_mp;
   2126 			SOTOTPI(so)->sti_exdata_mp = NULL;
   2127 			ASSERT(mp != NULL);
   2128 			mp->b_datap->db_type = M_PROTO;
   2129 			tei = (struct T_exdata_ind *)mp->b_rptr;
   2130 			tei->PRIM_type = T_EXDATA_IND;
   2131 			tei->MORE_flag = 0;
   2132 			mp->b_wptr = (uchar_t *)&tei[1];
   2133 
   2134 			mp->b_cont = so->so_oobmsg;
   2135 			so->so_oobmsg = NULL;
   2136 
   2137 			putnext(q, mp);
   2138 		} else {
   2139 			/* Send up the signal */
   2140 			mp = SOTOTPI(so)->sti_exdata_mp;
   2141 			SOTOTPI(so)->sti_exdata_mp = NULL;
   2142 			ASSERT(mp != NULL);
   2143 			DB_TYPE(mp) = M_PCSIG;
   2144 			*mp->b_wptr++ = (uchar_t)SIGURG;
   2145 			putnext(q, mp);
   2146 
   2147 			/* Send up the mark indicator */
   2148 			mp = SOTOTPI(so)->sti_urgmark_mp;
   2149 			SOTOTPI(so)->sti_urgmark_mp = NULL;
   2150 			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
   2151 			putnext(q, mp);
   2152 
   2153 			so->so_oobmark = 0;
   2154 		}
   2155 	}
   2156 
   2157 	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
   2158 		freeb(SOTOTPI(so)->sti_exdata_mp);
   2159 		SOTOTPI(so)->sti_exdata_mp = NULL;
   2160 	}
   2161 
   2162 	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
   2163 		freeb(SOTOTPI(so)->sti_urgmark_mp);
   2164 		SOTOTPI(so)->sti_urgmark_mp = NULL;
   2165 	}
   2166 
   2167 	ASSERT(so->so_oobmark == 0);
   2168 	ASSERT(so->so_rcv_queued == 0);
   2169 }
   2170 
   2171 #ifdef DEBUG
   2172 /*
   2173  * Do an integrity check of the sonode. This should be done if a
   2174  * fallback fails after sonode has initially been converted to use
   2175  * TPI and subsequently have to be reverted.
   2176  *
   2177  * Failure to pass the integrity check will panic the system.
   2178  */
   2179 void
   2180 so_integrity_check(struct sonode *cur, struct sonode *orig)
   2181 {
   2182 	VERIFY(cur->so_vnode == orig->so_vnode);
   2183 	VERIFY(cur->so_ops == orig->so_ops);
   2184 	/*
   2185 	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
   2186 	 * The other state flags might be affected by a notification from the
   2187 	 * protocol.
   2188 	 */
   2189 #define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
   2190 	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
   2191 	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
   2192 	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
   2193 	    (orig->so_state & CHECK_STATE));
   2194 	VERIFY(cur->so_mode == orig->so_mode);
   2195 	VERIFY(cur->so_flag == orig->so_flag);
   2196 	VERIFY(cur->so_count == orig->so_count);
   2197 	/* Cannot VERIFY so_proto_connid; proto can update it */
   2198 	VERIFY(cur->so_sockparams == orig->so_sockparams);
   2199 	/* an error might have been recorded, but it can not be lost */
   2200 	VERIFY(cur->so_error != 0 || orig->so_error == 0);
   2201 	VERIFY(cur->so_family == orig->so_family);
   2202 	VERIFY(cur->so_type == orig->so_type);
   2203 	VERIFY(cur->so_protocol == orig->so_protocol);
   2204 	VERIFY(cur->so_version == orig->so_version);
   2205 	/* New conns might have arrived, but none should have been lost */
   2206 	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
   2207 	VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
   2208 	VERIFY(cur->so_backlog == orig->so_backlog);
   2209 	/* New OOB migth have arrived, but mark should not have been lost */
   2210 	VERIFY(cur->so_oobmark >= orig->so_oobmark);
   2211 	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
   2212 	VERIFY(cur->so_pgrp == orig->so_pgrp);
   2213 	VERIFY(cur->so_peercred == orig->so_peercred);
   2214 	VERIFY(cur->so_cpid == orig->so_cpid);
   2215 	VERIFY(cur->so_zoneid == orig->so_zoneid);
   2216 	/* New data migth have arrived, but none should have been lost */
   2217 	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
   2218 	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
   2219 	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
   2220 	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
   2221 	VERIFY(cur->so_downcalls == orig->so_downcalls);
   2222 	/* Cannot VERIFY so_proto_props; they can be updated by proto */
   2223 }
   2224 #endif
   2225 
   2226 /*
   2227  * so_tpi_fallback()
   2228  *
   2229  * This is the fallback initation routine; things start here.
   2230  *
   2231  * Basic strategy:
   2232  *   o Block new socket operations from coming in
   2233  *   o Allocate/initate info needed by TPI
   2234  *   o Quiesce the connection, at which point we sync
   2235  *     state and move data
   2236  *   o Change operations (sonodeops) associated with the socket
   2237  *   o Unblock threads waiting for the fallback to finish
   2238  */
   2239 int
   2240 so_tpi_fallback(struct sonode *so, struct cred *cr)
   2241 {
   2242 	int error;
   2243 	queue_t *q;
   2244 	struct sockparams *sp;
   2245 	struct sockparams *newsp = NULL;
   2246 	so_proto_fallback_func_t fbfunc;
   2247 	boolean_t direct;
   2248 	struct sonode *nso;
   2249 #ifdef DEBUG
   2250 	struct sonode origso;
   2251 #endif
   2252 	error = 0;
   2253 	sp = so->so_sockparams;
   2254 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
   2255 
   2256 	/*
   2257 	 * Fallback can only happen if there is a device associated
   2258 	 * with the sonode, and the socket module has a fallback function.
   2259 	 */
   2260 	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
   2261 		return (EINVAL);
   2262 
   2263 	/*
   2264 	 * Initiate fallback; upon success we know that no new requests
   2265 	 * will come in from the user.
   2266 	 */
   2267 	if (!so_start_fallback(so))
   2268 		return (EAGAIN);
   2269 #ifdef DEBUG
   2270 	/*
   2271 	 * Make a copy of the sonode in case we need to make an integrity
   2272 	 * check later on.
   2273 	 */
   2274 	bcopy(so, &origso, sizeof (*so));
   2275 #endif
   2276 
   2277 	sp->sp_stats.sps_nfallback.value.ui64++;
   2278 
   2279 	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
   2280 	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
   2281 	    KM_SLEEP, &error);
   2282 	if (error != 0)
   2283 		goto out;
   2284 
   2285 	if (so->so_direct != NULL) {
   2286 		sodirect_t *sodp = so->so_direct;
   2287 		mutex_enter(&so->so_lock);
   2288 
   2289 		so->so_direct->sod_enabled = B_FALSE;
   2290 		so->so_state &= ~SS_SODIRECT;
   2291 		ASSERT(sodp->sod_uioafh == NULL);
   2292 		mutex_exit(&so->so_lock);
   2293 	}
   2294 
   2295 	/* Turn sonode into a TPI socket */
   2296 	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
   2297 	if (error != 0)
   2298 		goto out;
   2299 
   2300 
   2301 	/*
   2302 	 * Now tell the protocol to start using TPI. so_quiesced_cb be
   2303 	 * called once it's safe to synchronize state.
   2304 	 */
   2305 	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
   2306 	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
   2307 	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
   2308 
   2309 	if (error != 0) {
   2310 		/* protocol was unable to do a fallback, revert the sonode */
   2311 		sotpi_revert_sonode(so, cr);
   2312 		goto out;
   2313 	}
   2314 
   2315 	/*
   2316 	 * Walk the accept queue and notify the proto that they should
   2317 	 * fall back to TPI. The protocol will send up the T_CONN_IND.
   2318 	 */
   2319 	nso = so->so_acceptq_head;
   2320 	while (nso != NULL) {
   2321 		int rval;
   2322 
   2323 		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
   2324 		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
   2325 		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
   2326 		if (rval != 0) {
   2327 			zcmn_err(getzoneid(), CE_WARN,
   2328 			    "Failed to convert socket in accept queue to TPI. "
   2329 			    "Pid = %d\n", curproc->p_pid);
   2330 		}
   2331 		nso = nso->so_acceptq_next;
   2332 	}
   2333 
   2334 	/*
   2335 	 * Now flush the acceptq, this will destroy all sockets. They will
   2336 	 * be recreated in sotpi_accept().
   2337 	 */
   2338 	so_acceptq_flush(so, B_FALSE);
   2339 
   2340 	mutex_enter(&so->so_lock);
   2341 	so->so_state |= SS_FALLBACK_COMP;
   2342 	mutex_exit(&so->so_lock);
   2343 
   2344 	/*
   2345 	 * Swap the sonode ops. Socket opertations that come in once this
   2346 	 * is done will proceed without blocking.
   2347 	 */
   2348 	so->so_ops = &sotpi_sonodeops;
   2349 
   2350 	/*
   2351 	 * Wake up any threads stuck in poll. This is needed since the poll
   2352 	 * head changes when the fallback happens (moves from the sonode to
   2353 	 * the STREAMS head).
   2354 	 */
   2355 	pollwakeup(&so->so_poll_list, POLLERR);
   2356 out:
   2357 	so_end_fallback(so);
   2358 
   2359 	if (error != 0) {
   2360 #ifdef DEBUG
   2361 		so_integrity_check(so, &origso);
   2362 #endif
   2363 		zcmn_err(getzoneid(), CE_WARN,
   2364 		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
   2365 		    error, curproc->p_pid);
   2366 		if (newsp != NULL)
   2367 			SOCKPARAMS_DEC_REF(newsp);
   2368 	}
   2369 
   2370 	return (error);
   2371 }
   2372