Home | History | Annotate | Download | only in sockfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/param.h>
     29 #include <sys/systm.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/debug.h>
     32 #include <sys/cmn_err.h>
     33 
     34 #include <sys/stropts.h>
     35 #include <sys/socket.h>
     36 #include <sys/socketvar.h>
     37 
     38 #define	_SUN_TPI_VERSION	2
     39 #include <sys/tihdr.h>
     40 #include <sys/sockio.h>
     41 #include <sys/kmem_impl.h>
     42 
     43 #include <sys/strsubr.h>
     44 #include <sys/strsun.h>
     45 #include <sys/ddi.h>
     46 #include <netinet/in.h>
     47 #include <inet/ip.h>
     48 
     49 #include <fs/sockfs/sockcommon.h>
     50 
     51 #include <sys/socket_proto.h>
     52 
     53 #include <fs/sockfs/socktpi_impl.h>
     54 #include <fs/sockfs/sodirect.h>
     55 #include <sys/tihdr.h>
     56 #include <fs/sockfs/nl7c.h>
     57 #include <inet/kssl/ksslapi.h>
     58 
     59 
     60 extern int xnet_skip_checks;
     61 extern int xnet_check_print;
     62 
     63 static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
     64 
     65 
     66 /*ARGSUSED*/
     67 int
     68 so_accept_notsupp(struct sonode *lso, int fflag,
     69     struct cred *cr, struct sonode **nsop)
     70 {
     71 	return (EOPNOTSUPP);
     72 }
     73 
     74 /*ARGSUSED*/
     75 int
     76 so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr)
     77 {
     78 	return (EOPNOTSUPP);
     79 }
     80 
     81 /*ARGSUSED*/
     82 int
     83 so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa,
     84     socklen_t *len, struct cred *cr)
     85 {
     86 	return (EOPNOTSUPP);
     87 }
     88 
     89 /*ARGSUSED*/
     90 int
     91 so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
     92     socklen_t *addrlen, boolean_t accept, struct cred *cr)
     93 {
     94 	return (EOPNOTSUPP);
     95 }
     96 
     97 /*ARGSUSED*/
     98 int
     99 so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr)
    100 {
    101 	return (EOPNOTSUPP);
    102 }
    103 
    104 /*ARGSUSED*/
    105 int
    106 so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
    107     struct cred *cr, mblk_t **mpp)
    108 {
    109 	return (EOPNOTSUPP);
    110 }
    111 
    112 /*
    113  * Generic Socket Ops
    114  */
    115 
    116 /* ARGSUSED */
    117 int
    118 so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
    119 {
    120 	return (socket_init_common(so, pso, flags, cr));
    121 }
    122 
    123 int
    124 so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    125     int flags, struct cred *cr)
    126 {
    127 	int error;
    128 
    129 	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
    130 
    131 	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
    132 
    133 	/* X/Open requires this check */
    134 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
    135 		if (xnet_check_print) {
    136 			printf("sockfs: X/Open bind state check "
    137 			    "caused EINVAL\n");
    138 		}
    139 		error = EINVAL;
    140 		goto done;
    141 	}
    142 
    143 	/*
    144 	 * a bind to a NULL address is interpreted as unbind. So just
    145 	 * do the downcall.
    146 	 */
    147 	if (name == NULL)
    148 		goto dobind;
    149 
    150 	switch (so->so_family) {
    151 	case AF_INET:
    152 		if ((size_t)namelen != sizeof (sin_t)) {
    153 			error = name->sa_family != so->so_family ?
    154 			    EAFNOSUPPORT : EINVAL;
    155 			eprintsoline(so, error);
    156 			goto done;
    157 		}
    158 
    159 		if ((flags & _SOBIND_XPG4_2) &&
    160 		    (name->sa_family != so->so_family)) {
    161 			/*
    162 			 * This check has to be made for X/Open
    163 			 * sockets however application failures have
    164 			 * been observed when it is applied to
    165 			 * all sockets.
    166 			 */
    167 			error = EAFNOSUPPORT;
    168 			eprintsoline(so, error);
    169 			goto done;
    170 		}
    171 		/*
    172 		 * Force a zero sa_family to match so_family.
    173 		 *
    174 		 * Some programs like inetd(1M) don't set the
    175 		 * family field. Other programs leave
    176 		 * sin_family set to garbage - SunOS 4.X does
    177 		 * not check the family field on a bind.
    178 		 * We use the family field that
    179 		 * was passed in to the socket() call.
    180 		 */
    181 		name->sa_family = so->so_family;
    182 		break;
    183 
    184 	case AF_INET6: {
    185 #ifdef DEBUG
    186 		sin6_t *sin6 = (sin6_t *)name;
    187 #endif
    188 		if ((size_t)namelen != sizeof (sin6_t)) {
    189 			error = name->sa_family != so->so_family ?
    190 			    EAFNOSUPPORT : EINVAL;
    191 			eprintsoline(so, error);
    192 			goto done;
    193 		}
    194 
    195 		if (name->sa_family != so->so_family) {
    196 			/*
    197 			 * With IPv6 we require the family to match
    198 			 * unlike in IPv4.
    199 			 */
    200 			error = EAFNOSUPPORT;
    201 			eprintsoline(so, error);
    202 			goto done;
    203 		}
    204 #ifdef DEBUG
    205 		/*
    206 		 * Verify that apps don't forget to clear
    207 		 * sin6_scope_id etc
    208 		 */
    209 		if (sin6->sin6_scope_id != 0 &&
    210 		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
    211 			zcmn_err(getzoneid(), CE_WARN,
    212 			    "bind with uninitialized sin6_scope_id "
    213 			    "(%d) on socket. Pid = %d\n",
    214 			    (int)sin6->sin6_scope_id,
    215 			    (int)curproc->p_pid);
    216 		}
    217 		if (sin6->__sin6_src_id != 0) {
    218 			zcmn_err(getzoneid(), CE_WARN,
    219 			    "bind with uninitialized __sin6_src_id "
    220 			    "(%d) on socket. Pid = %d\n",
    221 			    (int)sin6->__sin6_src_id,
    222 			    (int)curproc->p_pid);
    223 		}
    224 #endif /* DEBUG */
    225 
    226 		break;
    227 	}
    228 	default:
    229 		/* Just pass the request to the protocol */
    230 		goto dobind;
    231 	}
    232 
    233 	/*
    234 	 * First we check if either NCA or KSSL has been enabled for
    235 	 * the requested address, and if so, we fall back to TPI.
    236 	 * If neither of those two services are enabled, then we just
    237 	 * pass the request to the protocol.
    238 	 *
    239 	 * Note that KSSL can only be enabled on a socket if NCA is NOT
    240 	 * enabled for that socket, hence the else-statement below.
    241 	 */
    242 	if (nl7c_enabled && ((so->so_family == AF_INET ||
    243 	    so->so_family == AF_INET6) &&
    244 	    nl7c_lookup_addr(name, namelen) != NULL)) {
    245 		/*
    246 		 * NL7C is not supported in non-global zones,
    247 		 * we enforce this restriction here.
    248 		 */
    249 		if (so->so_zoneid == GLOBAL_ZONEID) {
    250 			/* NCA should be used, so fall back to TPI */
    251 			error = so_tpi_fallback(so, cr);
    252 			SO_UNBLOCK_FALLBACK(so);
    253 			if (error)
    254 				return (error);
    255 			else
    256 				return (SOP_BIND(so, name, namelen, flags, cr));
    257 		}
    258 	} else if (so->so_type == SOCK_STREAM) {
    259 		/* Check if KSSL has been configured for this address */
    260 		kssl_ent_t ent;
    261 		kssl_endpt_type_t type;
    262 		struct T_bind_req bind_req;
    263 		mblk_t *mp;
    264 
    265 		/*
    266 		 * TODO: Check with KSSL team if we could add a function call
    267 		 * that only queries whether KSSL is enabled for the given
    268 		 * address.
    269 		 */
    270 		bind_req.PRIM_type = T_BIND_REQ;
    271 		bind_req.ADDR_length = namelen;
    272 		bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
    273 		mp = soallocproto2(&bind_req, sizeof (bind_req),
    274 		    name, namelen, 0, _ALLOC_SLEEP, cr);
    275 
    276 		type = kssl_check_proxy(mp, so, &ent);
    277 		freemsg(mp);
    278 
    279 		if (type != KSSL_NO_PROXY) {
    280 			/*
    281 			 * KSSL has been configured for this address, so
    282 			 * we must fall back to TPI.
    283 			 */
    284 			kssl_release_ent(ent, so, type);
    285 			error = so_tpi_fallback(so, cr);
    286 			SO_UNBLOCK_FALLBACK(so);
    287 			if (error)
    288 				return (error);
    289 			else
    290 				return (SOP_BIND(so, name, namelen, flags, cr));
    291 		}
    292 	}
    293 
    294 dobind:
    295 	error = (*so->so_downcalls->sd_bind)
    296 	    (so->so_proto_handle, name, namelen, cr);
    297 done:
    298 	SO_UNBLOCK_FALLBACK(so);
    299 
    300 	return (error);
    301 }
    302 
    303 int
    304 so_listen(struct sonode *so, int backlog, struct cred *cr)
    305 {
    306 	int	error = 0;
    307 
    308 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
    309 	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
    310 
    311 	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
    312 	    cr);
    313 
    314 	SO_UNBLOCK_FALLBACK(so);
    315 
    316 	return (error);
    317 }
    318 
    319 
    320 int
    321 so_connect(struct sonode *so, const struct sockaddr *name,
    322     socklen_t namelen, int fflag, int flags, struct cred *cr)
    323 {
    324 	int error = 0;
    325 	sock_connid_t id;
    326 
    327 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
    328 	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
    329 
    330 	/*
    331 	 * If there is a pending error, return error
    332 	 * This can happen if a non blocking operation caused an error.
    333 	 */
    334 
    335 	if (so->so_error != 0) {
    336 		mutex_enter(&so->so_lock);
    337 		error = sogeterr(so, B_TRUE);
    338 		mutex_exit(&so->so_lock);
    339 		if (error != 0)
    340 			goto done;
    341 	}
    342 
    343 	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
    344 	    name, namelen, &id, cr);
    345 
    346 	if (error == EINPROGRESS)
    347 		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
    348 
    349 done:
    350 	SO_UNBLOCK_FALLBACK(so);
    351 	return (error);
    352 }
    353 
    354 /*ARGSUSED*/
    355 int
    356 so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
    357 {
    358 	int error = 0;
    359 	struct sonode *nso;
    360 
    361 	*nsop = NULL;
    362 
    363 	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
    364 	if ((so->so_state & SS_ACCEPTCONN) == 0) {
    365 		SO_UNBLOCK_FALLBACK(so);
    366 		return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ?
    367 		    EOPNOTSUPP : EINVAL);
    368 	}
    369 
    370 	if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)),
    371 	    &nso)) == 0) {
    372 		ASSERT(nso != NULL);
    373 
    374 		/* finish the accept */
    375 		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
    376 		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
    377 		if (error != 0) {
    378 			(void) socket_close(nso, 0, cr);
    379 			socket_destroy(nso);
    380 		} else {
    381 			*nsop = nso;
    382 		}
    383 	}
    384 
    385 	SO_UNBLOCK_FALLBACK(so);
    386 	return (error);
    387 }
    388 
    389 int
    390 so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    391     struct cred *cr)
    392 {
    393 	int error, flags;
    394 	boolean_t dontblock;
    395 	ssize_t orig_resid;
    396 	mblk_t  *mp;
    397 
    398 	SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
    399 
    400 	flags = msg->msg_flags;
    401 	error = 0;
    402 	dontblock = (flags & MSG_DONTWAIT) ||
    403 	    (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
    404 
    405 	if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
    406 		/*
    407 		 * Old way of passing fd's is not supported
    408 		 */
    409 		SO_UNBLOCK_FALLBACK(so);
    410 		return (EOPNOTSUPP);
    411 	}
    412 
    413 	if ((so->so_mode & SM_ATOMIC) &&
    414 	    uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
    415 	    so->so_proto_props.sopp_maxpsz != -1) {
    416 		SO_UNBLOCK_FALLBACK(so);
    417 		return (EMSGSIZE);
    418 	}
    419 
    420 	/*
    421 	 * For atomic sends we will only do one iteration.
    422 	 */
    423 	do {
    424 		if (so->so_state & SS_CANTSENDMORE) {
    425 			error = EPIPE;
    426 			break;
    427 		}
    428 
    429 		if (so->so_error != 0) {
    430 			mutex_enter(&so->so_lock);
    431 			error = sogeterr(so, B_TRUE);
    432 			mutex_exit(&so->so_lock);
    433 			if (error != 0)
    434 				break;
    435 		}
    436 
    437 		/*
    438 		 * Send down OOB messages even if the send path is being
    439 		 * flow controlled (assuming the protocol supports OOB data).
    440 		 */
    441 		if (flags & MSG_OOB) {
    442 			if ((so->so_mode & SM_EXDATA) == 0) {
    443 				error = EOPNOTSUPP;
    444 				break;
    445 			}
    446 		} else if (so->so_snd_qfull) {
    447 			/*
    448 			 * Need to wait until the protocol is ready to receive
    449 			 * more data for transmission.
    450 			 */
    451 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
    452 				break;
    453 		}
    454 
    455 		/*
    456 		 * Time to send data to the protocol. We either copy the
    457 		 * data into mblks or pass the uio directly to the protocol.
    458 		 * We decide what to do based on the available down calls.
    459 		 */
    460 		if (so->so_downcalls->sd_send_uio != NULL) {
    461 			error = (*so->so_downcalls->sd_send_uio)
    462 			    (so->so_proto_handle, uiop, msg, cr);
    463 			if (error != 0)
    464 				break;
    465 		} else {
    466 			/* save the resid in case of failure */
    467 			orig_resid = uiop->uio_resid;
    468 
    469 			if ((mp = socopyinuio(uiop,
    470 			    so->so_proto_props.sopp_maxpsz,
    471 			    so->so_proto_props.sopp_wroff,
    472 			    so->so_proto_props.sopp_maxblk,
    473 			    so->so_proto_props.sopp_tail, &error)) == NULL) {
    474 				break;
    475 			}
    476 			ASSERT(uiop->uio_resid >= 0);
    477 
    478 			error = (*so->so_downcalls->sd_send)
    479 			    (so->so_proto_handle, mp, msg, cr);
    480 			if (error != 0) {
    481 				/*
    482 				 * The send failed. We do not have to free the
    483 				 * mblks, because that is the protocol's
    484 				 * responsibility. However, uio_resid must
    485 				 * remain accurate, so adjust that here.
    486 				 */
    487 				uiop->uio_resid = orig_resid;
    488 					break;
    489 			}
    490 		}
    491 	} while (uiop->uio_resid > 0);
    492 
    493 	SO_UNBLOCK_FALLBACK(so);
    494 
    495 	return (error);
    496 }
    497 
    498 int
    499 so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    500     struct cred *cr, mblk_t **mpp)
    501 {
    502 	int error;
    503 	boolean_t dontblock;
    504 	size_t size;
    505 	mblk_t *mp = *mpp;
    506 
    507 	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
    508 
    509 	error = 0;
    510 	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
    511 	    (fflag & (FNONBLOCK|FNDELAY));
    512 	size = msgdsize(mp);
    513 
    514 	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
    515 	    so->so_downcalls->sd_send == NULL) {
    516 		SO_UNBLOCK_FALLBACK(so);
    517 		return (EOPNOTSUPP);
    518 	}
    519 
    520 	if ((so->so_mode & SM_ATOMIC) &&
    521 	    size > so->so_proto_props.sopp_maxpsz &&
    522 	    so->so_proto_props.sopp_maxpsz != -1) {
    523 		SO_UNBLOCK_FALLBACK(so);
    524 		return (EMSGSIZE);
    525 	}
    526 
    527 	while (mp != NULL) {
    528 		mblk_t *nmp, *last_mblk;
    529 		size_t mlen;
    530 
    531 		if (so->so_state & SS_CANTSENDMORE) {
    532 			error = EPIPE;
    533 			break;
    534 		}
    535 		if (so->so_error != 0) {
    536 			mutex_enter(&so->so_lock);
    537 			error = sogeterr(so, B_TRUE);
    538 			mutex_exit(&so->so_lock);
    539 			if (error != 0)
    540 				break;
    541 		}
    542 		if (so->so_snd_qfull) {
    543 			/*
    544 			 * Need to wait until the protocol is ready to receive
    545 			 * more data for transmission.
    546 			 */
    547 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
    548 				break;
    549 		}
    550 
    551 		/*
    552 		 * We only allow so_maxpsz of data to be sent down to
    553 		 * the protocol at time.
    554 		 */
    555 		mlen = MBLKL(mp);
    556 		nmp = mp->b_cont;
    557 		last_mblk = mp;
    558 		while (nmp != NULL) {
    559 			mlen += MBLKL(nmp);
    560 			if (mlen > so->so_proto_props.sopp_maxpsz) {
    561 				last_mblk->b_cont = NULL;
    562 				break;
    563 			}
    564 			last_mblk = nmp;
    565 			nmp = nmp->b_cont;
    566 		}
    567 
    568 		error = (*so->so_downcalls->sd_send)
    569 		    (so->so_proto_handle, mp, msg, cr);
    570 		if (error != 0) {
    571 			/*
    572 			 * The send failed. The protocol will free the mblks
    573 			 * that were sent down. Let the caller deal with the
    574 			 * rest.
    575 			 */
    576 			*mpp = nmp;
    577 			break;
    578 		}
    579 
    580 		*mpp = mp = nmp;
    581 	}
    582 
    583 	SO_UNBLOCK_FALLBACK(so);
    584 
    585 	return (error);
    586 }
    587 
    588 int
    589 so_shutdown(struct sonode *so, int how, struct cred *cr)
    590 {
    591 	int error;
    592 
    593 	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
    594 
    595 	/*
    596 	 * SunOS 4.X has no check for datagram sockets.
    597 	 * 5.X checks that it is connected (ENOTCONN)
    598 	 * X/Open requires that we check the connected state.
    599 	 */
    600 	if (!(so->so_state & SS_ISCONNECTED)) {
    601 		if (!xnet_skip_checks) {
    602 			error = ENOTCONN;
    603 			if (xnet_check_print) {
    604 				printf("sockfs: X/Open shutdown check "
    605 				    "caused ENOTCONN\n");
    606 			}
    607 		}
    608 		goto done;
    609 	}
    610 
    611 	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
    612 	    how, cr));
    613 
    614 	/*
    615 	 * Protocol agreed to shutdown. We need to flush the
    616 	 * receive buffer if the receive side is being shutdown.
    617 	 */
    618 	if (error == 0 && how != SHUT_WR) {
    619 		mutex_enter(&so->so_lock);
    620 		/* wait for active reader to finish */
    621 		(void) so_lock_read(so, 0);
    622 
    623 		so_rcv_flush(so);
    624 
    625 		so_unlock_read(so);
    626 		mutex_exit(&so->so_lock);
    627 	}
    628 
    629 done:
    630 	SO_UNBLOCK_FALLBACK(so);
    631 	return (error);
    632 }
    633 
    634 int
    635 so_getsockname(struct sonode *so, struct sockaddr *addr,
    636     socklen_t *addrlen, struct cred *cr)
    637 {
    638 	int error;
    639 
    640 	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
    641 
    642 	error = (*so->so_downcalls->sd_getsockname)
    643 	    (so->so_proto_handle, addr, addrlen, cr);
    644 
    645 	SO_UNBLOCK_FALLBACK(so);
    646 	return (error);
    647 }
    648 
    649 int
    650 so_getpeername(struct sonode *so, struct sockaddr *addr,
    651     socklen_t *addrlen, boolean_t accept, struct cred *cr)
    652 {
    653 	int error;
    654 
    655 	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
    656 
    657 	if (accept) {
    658 		error = (*so->so_downcalls->sd_getpeername)
    659 		    (so->so_proto_handle, addr, addrlen, cr);
    660 	} else if (!(so->so_state & SS_ISCONNECTED)) {
    661 		error = ENOTCONN;
    662 	} else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
    663 		/* Added this check for X/Open */
    664 		error = EINVAL;
    665 		if (xnet_check_print) {
    666 			printf("sockfs: X/Open getpeername check => EINVAL\n");
    667 		}
    668 	} else {
    669 		error = (*so->so_downcalls->sd_getpeername)
    670 		    (so->so_proto_handle, addr, addrlen, cr);
    671 	}
    672 
    673 	SO_UNBLOCK_FALLBACK(so);
    674 	return (error);
    675 }
    676 
    677 int
    678 so_getsockopt(struct sonode *so, int level, int option_name,
    679     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
    680 {
    681 	int error = 0;
    682 
    683 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
    684 	SO_BLOCK_FALLBACK(so,
    685 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
    686 
    687 	error = socket_getopt_common(so, level, option_name, optval, optlenp,
    688 	    flags);
    689 	if (error < 0) {
    690 		error = (*so->so_downcalls->sd_getsockopt)
    691 		    (so->so_proto_handle, level, option_name, optval, optlenp,
    692 		    cr);
    693 		if (error ==  ENOPROTOOPT) {
    694 			if (level == SOL_SOCKET) {
    695 				/*
    696 				 * If a protocol does not support a particular
    697 				 * socket option, set can fail (not allowed)
    698 				 * but get can not fail. This is the previous
    699 				 * sockfs bahvior.
    700 				 */
    701 				switch (option_name) {
    702 				case SO_LINGER:
    703 					if (*optlenp < (t_uscalar_t)
    704 					    sizeof (struct linger)) {
    705 						error = EINVAL;
    706 						break;
    707 					}
    708 					error = 0;
    709 					bzero(optval, sizeof (struct linger));
    710 					*optlenp = sizeof (struct linger);
    711 					break;
    712 				case SO_RCVTIMEO:
    713 				case SO_SNDTIMEO:
    714 					if (*optlenp < (t_uscalar_t)
    715 					    sizeof (struct timeval)) {
    716 						error = EINVAL;
    717 						break;
    718 					}
    719 					error = 0;
    720 					bzero(optval, sizeof (struct timeval));
    721 					*optlenp = sizeof (struct timeval);
    722 					break;
    723 				case SO_SND_BUFINFO:
    724 					if (*optlenp < (t_uscalar_t)
    725 					    sizeof (struct so_snd_bufinfo)) {
    726 						error = EINVAL;
    727 						break;
    728 					}
    729 					error = 0;
    730 					bzero(optval,
    731 					    sizeof (struct so_snd_bufinfo));
    732 					*optlenp =
    733 					    sizeof (struct so_snd_bufinfo);
    734 					break;
    735 				case SO_DEBUG:
    736 				case SO_REUSEADDR:
    737 				case SO_KEEPALIVE:
    738 				case SO_DONTROUTE:
    739 				case SO_BROADCAST:
    740 				case SO_USELOOPBACK:
    741 				case SO_OOBINLINE:
    742 				case SO_DGRAM_ERRIND:
    743 				case SO_SNDBUF:
    744 				case SO_RCVBUF:
    745 					error = 0;
    746 					*((int32_t *)optval) = 0;
    747 					*optlenp = sizeof (int32_t);
    748 					break;
    749 				default:
    750 					break;
    751 				}
    752 			}
    753 		}
    754 	}
    755 
    756 	SO_UNBLOCK_FALLBACK(so);
    757 	return (error);
    758 }
    759 
    760 int
    761 so_setsockopt(struct sonode *so, int level, int option_name,
    762     const void *optval, socklen_t optlen, struct cred *cr)
    763 {
    764 	int error = 0;
    765 	struct timeval tl;
    766 	const void *opt = optval;
    767 
    768 	SO_BLOCK_FALLBACK(so,
    769 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
    770 
    771 	/* X/Open requires this check */
    772 	if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) {
    773 		SO_UNBLOCK_FALLBACK(so);
    774 		if (xnet_check_print)
    775 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
    776 		return (EINVAL);
    777 	}
    778 
    779 	if (level == SOL_SOCKET) {
    780 		switch (option_name) {
    781 		case SO_RCVTIMEO:
    782 		case SO_SNDTIMEO: {
    783 			/*
    784 			 * We pass down these two options to protocol in order
    785 			 * to support some third part protocols which need to
    786 			 * know them. For those protocols which don't care
    787 			 * these two options, simply return 0.
    788 			 */
    789 			clock_t t_usec;
    790 
    791 			if (get_udatamodel() == DATAMODEL_NONE ||
    792 			    get_udatamodel() == DATAMODEL_NATIVE) {
    793 				if (optlen != sizeof (struct timeval)) {
    794 					error = EINVAL;
    795 					goto done;
    796 				}
    797 				bcopy((struct timeval *)optval, &tl,
    798 				    sizeof (struct timeval));
    799 			} else {
    800 				if (optlen != sizeof (struct timeval32)) {
    801 					error = EINVAL;
    802 					goto done;
    803 				}
    804 				TIMEVAL32_TO_TIMEVAL(&tl,
    805 				    (struct timeval32 *)optval);
    806 			}
    807 			opt = &tl;
    808 			optlen = sizeof (tl);
    809 			t_usec = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
    810 			mutex_enter(&so->so_lock);
    811 			if (option_name == SO_RCVTIMEO)
    812 				so->so_rcvtimeo = drv_usectohz(t_usec);
    813 			else
    814 				so->so_sndtimeo = drv_usectohz(t_usec);
    815 			mutex_exit(&so->so_lock);
    816 			break;
    817 		}
    818 		case SO_RCVBUF:
    819 			/*
    820 			 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
    821 			 * sockfs since the transport might adjust the value
    822 			 * and not return exactly what was set by the
    823 			 * application.
    824 			 */
    825 			so->so_xpg_rcvbuf = *(int32_t *)optval;
    826 			break;
    827 		}
    828 	}
    829 	error = (*so->so_downcalls->sd_setsockopt)
    830 	    (so->so_proto_handle, level, option_name, opt, optlen, cr);
    831 done:
    832 	SO_UNBLOCK_FALLBACK(so);
    833 	return (error);
    834 }
    835 
    836 int
    837 so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    838     struct cred *cr, int32_t *rvalp)
    839 {
    840 	int error = 0;
    841 
    842 	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
    843 
    844 	/*
    845 	 * If there is a pending error, return error
    846 	 * This can happen if a non blocking operation caused an error.
    847 	 */
    848 	if (so->so_error != 0) {
    849 		mutex_enter(&so->so_lock);
    850 		error = sogeterr(so, B_TRUE);
    851 		mutex_exit(&so->so_lock);
    852 		if (error != 0)
    853 			goto done;
    854 	}
    855 
    856 	/*
    857 	 * calling strioc can result in the socket falling back to TPI,
    858 	 * if that is supported.
    859 	 */
    860 	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
    861 	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
    862 		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
    863 		    cmd, arg, mode, rvalp, cr);
    864 	}
    865 
    866 done:
    867 	SO_UNBLOCK_FALLBACK(so);
    868 
    869 	return (error);
    870 }
    871 
    872 int
    873 so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    874     struct pollhead **phpp)
    875 {
    876 	int state = so->so_state;
    877 	*reventsp = 0;
    878 
    879 	/*
    880 	 * In sockets the errors are represented as input/output events
    881 	 */
    882 	if (so->so_error != 0 &&
    883 	    ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) {
    884 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
    885 		return (0);
    886 	}
    887 
    888 	/*
    889 	 * If the socket is in a state where it can send data
    890 	 * turn on POLLWRBAND and POLLOUT events.
    891 	 */
    892 	if ((so->so_mode & SM_CONNREQUIRED) == 0 || (state & SS_ISCONNECTED)) {
    893 		/*
    894 		 * out of band data is allowed even if the connection
    895 		 * is flow controlled
    896 		 */
    897 		*reventsp |= POLLWRBAND & events;
    898 		if (!so->so_snd_qfull) {
    899 			/*
    900 			 * As long as there is buffer to send data
    901 			 * turn on POLLOUT events
    902 			 */
    903 			*reventsp |= POLLOUT & events;
    904 		}
    905 	}
    906 
    907 	/*
    908 	 * Turn on POLLIN whenever there is data on the receive queue,
    909 	 * or the socket is in a state where no more data will be received.
    910 	 * Also, if the socket is accepting connections, flip the bit if
    911 	 * there is something on the queue.
    912 	 *
    913 	 * We do an initial check for events without holding locks. However,
    914 	 * if there are no event available, then we redo the check for POLLIN
    915 	 * events under the lock.
    916 	 */
    917 
    918 	/* Pending connections */
    919 	if (so->so_acceptq_len > 0)
    920 		*reventsp |= (POLLIN|POLLRDNORM) & events;
    921 
    922 	/* Data */
    923 	/* so_downcalls is null for sctp */
    924 	if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
    925 		*reventsp |= (*so->so_downcalls->sd_poll)
    926 		    (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
    927 		    CRED()) & events;
    928 		ASSERT((*reventsp & ~events) == 0);
    929 		/* do not recheck events */
    930 		events &= ~SO_PROTO_POLLEV;
    931 	} else {
    932 		if (SO_HAVE_DATA(so))
    933 			*reventsp |= (POLLIN|POLLRDNORM) & events;
    934 
    935 		/* Urgent data */
    936 		if ((state & SS_OOBPEND) != 0) {
    937 			*reventsp |= (POLLRDBAND | POLLPRI) & events;
    938 		}
    939 	}
    940 
    941 	if (!*reventsp && !anyyet) {
    942 		/* Check for read events again, but this time under lock */
    943 		if (events & (POLLIN|POLLRDNORM)) {
    944 			mutex_enter(&so->so_lock);
    945 			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
    946 				mutex_exit(&so->so_lock);
    947 				*reventsp |= (POLLIN|POLLRDNORM) & events;
    948 				return (0);
    949 			} else {
    950 				so->so_pollev |= SO_POLLEV_IN;
    951 				mutex_exit(&so->so_lock);
    952 			}
    953 		}
    954 		*phpp = &so->so_poll_list;
    955 	}
    956 	return (0);
    957 }
    958 
    959 /*
    960  * Generic Upcalls
    961  */
    962 void
    963 so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
    964     cred_t *peer_cred, pid_t peer_cpid)
    965 {
    966 	struct sonode *so = (struct sonode *)sock_handle;
    967 
    968 	mutex_enter(&so->so_lock);
    969 	ASSERT(so->so_proto_handle != NULL);
    970 
    971 	if (peer_cred != NULL) {
    972 		if (so->so_peercred != NULL)
    973 			crfree(so->so_peercred);
    974 		crhold(peer_cred);
    975 		so->so_peercred = peer_cred;
    976 		so->so_cpid = peer_cpid;
    977 	}
    978 
    979 	so->so_proto_connid = id;
    980 	soisconnected(so);
    981 	/*
    982 	 * Wake ones who're waiting for conn to become established.
    983 	 */
    984 	so_notify_connected(so);
    985 }
    986 
    987 int
    988 so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
    989 {
    990 	struct sonode *so = (struct sonode *)sock_handle;
    991 
    992 	mutex_enter(&so->so_lock);
    993 
    994 	so->so_proto_connid = id;
    995 	soisdisconnected(so, error);
    996 	so_notify_disconnected(so, error);
    997 
    998 	return (0);
    999 }
   1000 
   1001 void
   1002 so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
   1003     uintptr_t arg)
   1004 {
   1005 	struct sonode *so = (struct sonode *)sock_handle;
   1006 
   1007 	switch (action) {
   1008 	case SOCK_OPCTL_SHUT_SEND:
   1009 		mutex_enter(&so->so_lock);
   1010 		socantsendmore(so);
   1011 		so_notify_disconnecting(so);
   1012 		break;
   1013 	case SOCK_OPCTL_SHUT_RECV: {
   1014 		mutex_enter(&so->so_lock);
   1015 		socantrcvmore(so);
   1016 		so_notify_eof(so);
   1017 		break;
   1018 	}
   1019 	case SOCK_OPCTL_ENAB_ACCEPT:
   1020 		mutex_enter(&so->so_lock);
   1021 		so->so_state |= SS_ACCEPTCONN;
   1022 		so->so_backlog = (unsigned int)arg;
   1023 		mutex_exit(&so->so_lock);
   1024 		break;
   1025 	default:
   1026 		ASSERT(0);
   1027 		break;
   1028 	}
   1029 }
   1030 
   1031 void
   1032 so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
   1033 {
   1034 	struct sonode *so = (struct sonode *)sock_handle;
   1035 
   1036 	if (qfull) {
   1037 		so_snd_qfull(so);
   1038 	} else {
   1039 		so_snd_qnotfull(so);
   1040 		mutex_enter(&so->so_lock);
   1041 		so_notify_writable(so);
   1042 	}
   1043 }
   1044 
   1045 sock_upper_handle_t
   1046 so_newconn(sock_upper_handle_t parenthandle,
   1047     sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
   1048     struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
   1049 {
   1050 	struct sonode	*so = (struct sonode *)parenthandle;
   1051 	struct sonode	*nso;
   1052 	int error;
   1053 
   1054 	ASSERT(proto_handle != NULL);
   1055 
   1056 	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
   1057 	    so->so_acceptq_len >= so->so_backlog)
   1058 		return (NULL);
   1059 
   1060 	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
   1061 	    &error);
   1062 	if (nso == NULL)
   1063 		return (NULL);
   1064 
   1065 	if (peer_cred != NULL) {
   1066 		crhold(peer_cred);
   1067 		nso->so_peercred = peer_cred;
   1068 		nso->so_cpid = peer_cpid;
   1069 	}
   1070 
   1071 	/*
   1072 	 * The new socket (nso), proto_handle and sock_upcallsp are all
   1073 	 * valid at this point. But as soon as nso is placed in the accept
   1074 	 * queue that can no longer be assumed (since an accept() thread may
   1075 	 * pull it off the queue and close the socket).
   1076 	 */
   1077 	*sock_upcallsp = &so_upcalls;
   1078 
   1079 	(void) so_acceptq_enqueue(so, nso);
   1080 
   1081 	mutex_enter(&so->so_lock);
   1082 	so_notify_newconn(so);
   1083 
   1084 	return ((sock_upper_handle_t)nso);
   1085 }
   1086 
   1087 void
   1088 so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
   1089 {
   1090 	struct sonode *so;
   1091 
   1092 	so = (struct sonode *)sock_handle;
   1093 
   1094 	mutex_enter(&so->so_lock);
   1095 
   1096 	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
   1097 		so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
   1098 	if (soppp->sopp_flags & SOCKOPT_WROFF)
   1099 		so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
   1100 	if (soppp->sopp_flags & SOCKOPT_TAIL)
   1101 		so->so_proto_props.sopp_tail = soppp->sopp_tail;
   1102 	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
   1103 		so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat;
   1104 	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
   1105 		so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat;
   1106 	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
   1107 		so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz;
   1108 	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
   1109 		so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz;
   1110 	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
   1111 		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
   1112 			so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE;
   1113 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE;
   1114 		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
   1115 			so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE;
   1116 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE;
   1117 		}
   1118 
   1119 		if (soppp->sopp_zcopyflag & COPYCACHED) {
   1120 			so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED;
   1121 		}
   1122 	}
   1123 	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
   1124 		so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline;
   1125 	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
   1126 		so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer;
   1127 	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
   1128 		so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh;
   1129 	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
   1130 		so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen;
   1131 	if (soppp->sopp_flags & SOCKOPT_LOOPBACK)
   1132 		so->so_proto_props.sopp_loopback = soppp->sopp_loopback;
   1133 
   1134 	mutex_exit(&so->so_lock);
   1135 
   1136 #ifdef DEBUG
   1137 	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
   1138 	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
   1139 	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
   1140 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ |
   1141 	    SOCKOPT_LOOPBACK);
   1142 	ASSERT(soppp->sopp_flags == 0);
   1143 #endif
   1144 }
   1145 
   1146 /* ARGSUSED */
   1147 ssize_t
   1148 so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
   1149     size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
   1150 {
   1151 	struct sonode *so = (struct sonode *)sock_handle;
   1152 	boolean_t force_push = B_TRUE;
   1153 	int space_left;
   1154 	sodirect_t *sodp = so->so_direct;
   1155 
   1156 	ASSERT(errorp != NULL);
   1157 	*errorp = 0;
   1158 	if (mp == NULL) {
   1159 		if (msg_size > 0) {
   1160 			ASSERT(so->so_downcalls->sd_recv_uio != NULL);
   1161 			mutex_enter(&so->so_lock);
   1162 			/* the notify functions will drop the lock */
   1163 			if (flags & MSG_OOB)
   1164 				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
   1165 			else
   1166 				so_notify_data(so, msg_size);
   1167 			return (0);
   1168 		}
   1169 		/*
   1170 		 * recv space check
   1171 		 */
   1172 		mutex_enter(&so->so_lock);
   1173 		space_left = so->so_rcvbuf - so->so_rcv_queued;
   1174 		if (space_left <= 0) {
   1175 			so->so_flowctrld = B_TRUE;
   1176 			*errorp = ENOSPC;
   1177 			space_left = -1;
   1178 		}
   1179 		goto done_unlock;
   1180 	}
   1181 
   1182 	ASSERT(mp->b_next == NULL);
   1183 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
   1184 	ASSERT(msg_size == msgdsize(mp));
   1185 
   1186 	if (flags & MSG_OOB) {
   1187 		so_queue_oob(sock_handle, mp, msg_size);
   1188 		return (0);
   1189 	}
   1190 
   1191 	if (force_pushp != NULL)
   1192 		force_push = *force_pushp;
   1193 
   1194 	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
   1195 		/* The read pointer is not aligned correctly for TPI */
   1196 		zcmn_err(getzoneid(), CE_WARN,
   1197 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
   1198 		    (void *)mp->b_rptr);
   1199 		freemsg(mp);
   1200 		mutex_enter(&so->so_lock);
   1201 		if (sodp != NULL)
   1202 			SOD_UIOAFINI(sodp);
   1203 		mutex_exit(&so->so_lock);
   1204 
   1205 		return (so->so_rcvbuf - so->so_rcv_queued);
   1206 	}
   1207 
   1208 	mutex_enter(&so->so_lock);
   1209 	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
   1210 		if (sodp != NULL)
   1211 			SOD_DISABLE(sodp);
   1212 		mutex_exit(&so->so_lock);
   1213 		*errorp = EOPNOTSUPP;
   1214 		return (-1);
   1215 	}
   1216 	if (so->so_state & SS_CANTRCVMORE) {
   1217 		freemsg(mp);
   1218 		if (sodp != NULL)
   1219 			SOD_DISABLE(sodp);
   1220 		mutex_exit(&so->so_lock);
   1221 		return (0);
   1222 	}
   1223 
   1224 	/* process the mblk via I/OAT if capable */
   1225 	if (sodp != NULL && sodp->sod_enabled) {
   1226 		if (DB_TYPE(mp) == M_DATA) {
   1227 			sod_uioa_mblk_init(sodp, mp, msg_size);
   1228 		} else {
   1229 			SOD_UIOAFINI(sodp);
   1230 		}
   1231 	}
   1232 
   1233 	if (mp->b_next == NULL) {
   1234 		so_enqueue_msg(so, mp, msg_size);
   1235 	} else {
   1236 		do {
   1237 			mblk_t *nmp;
   1238 
   1239 			if ((nmp = mp->b_next) != NULL) {
   1240 				mp->b_next = NULL;
   1241 			}
   1242 			so_enqueue_msg(so, mp, msgdsize(mp));
   1243 			mp = nmp;
   1244 		} while (mp != NULL);
   1245 	}
   1246 
   1247 	space_left = so->so_rcvbuf - so->so_rcv_queued;
   1248 	if (space_left <= 0) {
   1249 		so->so_flowctrld = B_TRUE;
   1250 		*errorp = ENOSPC;
   1251 		space_left = -1;
   1252 	}
   1253 
   1254 	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
   1255 	    so->so_rcv_queued >= so->so_rcv_wanted) {
   1256 		SOCKET_TIMER_CANCEL(so);
   1257 		/*
   1258 		 * so_notify_data will release the lock
   1259 		 */
   1260 		so_notify_data(so, so->so_rcv_queued);
   1261 
   1262 		if (force_pushp != NULL)
   1263 			*force_pushp = B_TRUE;
   1264 		goto done;
   1265 	} else if (so->so_rcv_timer_tid == 0) {
   1266 		/* Make sure the recv push timer is running */
   1267 		SOCKET_TIMER_START(so);
   1268 	}
   1269 
   1270 done_unlock:
   1271 	mutex_exit(&so->so_lock);
   1272 done:
   1273 	return (space_left);
   1274 }
   1275 
   1276 /*
   1277  * Set the offset of where the oob data is relative to the bytes in
   1278  * queued. Also generate SIGURG
   1279  */
   1280 void
   1281 so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
   1282 {
   1283 	struct sonode *so;
   1284 
   1285 	ASSERT(offset >= 0);
   1286 	so = (struct sonode *)sock_handle;
   1287 	mutex_enter(&so->so_lock);
   1288 	if (so->so_direct != NULL)
   1289 		SOD_UIOAFINI(so->so_direct);
   1290 
   1291 	/*
   1292 	 * New urgent data on the way so forget about any old
   1293 	 * urgent data.
   1294 	 */
   1295 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
   1296 
   1297 	/*
   1298 	 * Record that urgent data is pending.
   1299 	 */
   1300 	so->so_state |= SS_OOBPEND;
   1301 
   1302 	if (so->so_oobmsg != NULL) {
   1303 		dprintso(so, 1, ("sock: discarding old oob\n"));
   1304 		freemsg(so->so_oobmsg);
   1305 		so->so_oobmsg = NULL;
   1306 	}
   1307 
   1308 	/*
   1309 	 * set the offset where the urgent byte is
   1310 	 */
   1311 	so->so_oobmark = so->so_rcv_queued + offset;
   1312 	if (so->so_oobmark == 0)
   1313 		so->so_state |= SS_RCVATMARK;
   1314 	else
   1315 		so->so_state &= ~SS_RCVATMARK;
   1316 
   1317 	so_notify_oobsig(so);
   1318 }
   1319 
   1320 /*
   1321  * Queue the OOB byte
   1322  */
   1323 static void
   1324 so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
   1325 {
   1326 	struct sonode *so;
   1327 
   1328 	so = (struct sonode *)sock_handle;
   1329 	mutex_enter(&so->so_lock);
   1330 	if (so->so_direct != NULL)
   1331 		SOD_UIOAFINI(so->so_direct);
   1332 
   1333 	ASSERT(mp != NULL);
   1334 	if (!IS_SO_OOB_INLINE(so)) {
   1335 		so->so_oobmsg = mp;
   1336 		so->so_state |= SS_HAVEOOBDATA;
   1337 	} else {
   1338 		so_enqueue_msg(so, mp, len);
   1339 	}
   1340 
   1341 	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
   1342 }
   1343 
   1344 int
   1345 so_close(struct sonode *so, int flag, struct cred *cr)
   1346 {
   1347 	int error;
   1348 
   1349 	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
   1350 
   1351 	/*
   1352 	 * At this point there will be no more upcalls from the protocol
   1353 	 */
   1354 	mutex_enter(&so->so_lock);
   1355 
   1356 	ASSERT(so_verify_oobstate(so));
   1357 
   1358 	so_rcv_flush(so);
   1359 	mutex_exit(&so->so_lock);
   1360 
   1361 	return (error);
   1362 }
   1363 
   1364 void
   1365 so_zcopy_notify(sock_upper_handle_t sock_handle)
   1366 {
   1367 	struct sonode *so = (struct sonode *)sock_handle;
   1368 
   1369 	mutex_enter(&so->so_lock);
   1370 	so->so_copyflag |= STZCNOTIFY;
   1371 	cv_broadcast(&so->so_copy_cv);
   1372 	mutex_exit(&so->so_lock);
   1373 }
   1374 
   1375 void
   1376 so_set_error(sock_upper_handle_t sock_handle, int error)
   1377 {
   1378 	struct sonode *so = (struct sonode *)sock_handle;
   1379 
   1380 	mutex_enter(&so->so_lock);
   1381 
   1382 	soseterror(so, error);
   1383 
   1384 	so_notify_error(so);
   1385 }
   1386 
   1387 /*
   1388  * so_recvmsg - read data from the socket
   1389  *
   1390  * There are two ways of obtaining data; either we ask the protocol to
   1391  * copy directly into the supplied buffer, or we copy data from the
   1392  * sonode's receive queue. The decision which one to use depends on
   1393  * whether the protocol has a sd_recv_uio down call.
   1394  */
   1395 int
   1396 so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
   1397     struct cred *cr)
   1398 {
   1399 	rval_t 		rval;
   1400 	int 		flags = 0;
   1401 	t_uscalar_t	controllen, namelen;
   1402 	int 		error = 0;
   1403 	int ret;
   1404 	mblk_t		*mctlp = NULL;
   1405 	union T_primitives *tpr;
   1406 	void		*control;
   1407 	ssize_t		saved_resid;
   1408 	struct uio	*suiop;
   1409 
   1410 	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));
   1411 
   1412 	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
   1413 	    (so->so_mode & SM_CONNREQUIRED)) {
   1414 		SO_UNBLOCK_FALLBACK(so);
   1415 		return (ENOTCONN);
   1416 	}
   1417 
   1418 	if (msg->msg_flags & MSG_PEEK)
   1419 		msg->msg_flags &= ~MSG_WAITALL;
   1420 
   1421 	if (so->so_mode & SM_ATOMIC)
   1422 		msg->msg_flags |= MSG_TRUNC;
   1423 
   1424 	if (msg->msg_flags & MSG_OOB) {
   1425 		if ((so->so_mode & SM_EXDATA) == 0) {
   1426 			error = EOPNOTSUPP;
   1427 		} else if (so->so_downcalls->sd_recv_uio != NULL) {
   1428 			error = (*so->so_downcalls->sd_recv_uio)
   1429 			    (so->so_proto_handle, uiop, msg, cr);
   1430 		} else {
   1431 			error = sorecvoob(so, msg, uiop, msg->msg_flags,
   1432 			    IS_SO_OOB_INLINE(so));
   1433 		}
   1434 		SO_UNBLOCK_FALLBACK(so);
   1435 		return (error);
   1436 	}
   1437 
   1438 	/*
   1439 	 * If the protocol has the recv down call, then pass the request
   1440 	 * down.
   1441 	 */
   1442 	if (so->so_downcalls->sd_recv_uio != NULL) {
   1443 		error = (*so->so_downcalls->sd_recv_uio)
   1444 		    (so->so_proto_handle, uiop, msg, cr);
   1445 		SO_UNBLOCK_FALLBACK(so);
   1446 		return (error);
   1447 	}
   1448 
   1449 	/*
   1450 	 * Reading data from the socket buffer
   1451 	 */
   1452 	flags = msg->msg_flags;
   1453 	msg->msg_flags = 0;
   1454 
   1455 	/*
   1456 	 * Set msg_controllen and msg_namelen to zero here to make it
   1457 	 * simpler in the cases that no control or name is returned.
   1458 	 */
   1459 	controllen = msg->msg_controllen;
   1460 	namelen = msg->msg_namelen;
   1461 	msg->msg_controllen = 0;
   1462 	msg->msg_namelen = 0;
   1463 
   1464 	mutex_enter(&so->so_lock);
   1465 	/* Set SOREADLOCKED */
   1466 	error = so_lock_read_intr(so,
   1467 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
   1468 	mutex_exit(&so->so_lock);
   1469 	if (error) {
   1470 		SO_UNBLOCK_FALLBACK(so);
   1471 		return (error);
   1472 	}
   1473 
   1474 	suiop = sod_rcv_init(so, flags, &uiop);
   1475 retry:
   1476 	saved_resid = uiop->uio_resid;
   1477 	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
   1478 	if (error != 0) {
   1479 		goto out;
   1480 	}
   1481 	/*
   1482 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
   1483 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
   1484 	 */
   1485 	ASSERT(!(rval.r_val1 & MORECTL));
   1486 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
   1487 		msg->msg_flags |= MSG_TRUNC;
   1488 	if (mctlp == NULL) {
   1489 		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));
   1490 
   1491 		mutex_enter(&so->so_lock);
   1492 		/* Set MSG_EOR based on MOREDATA */
   1493 		if (!(rval.r_val1 & MOREDATA)) {
   1494 			if (so->so_state & SS_SAVEDEOR) {
   1495 				msg->msg_flags |= MSG_EOR;
   1496 				so->so_state &= ~SS_SAVEDEOR;
   1497 			}
   1498 		}
   1499 		/*
   1500 		 * If some data was received (i.e. not EOF) and the
   1501 		 * read/recv* has not been satisfied wait for some more.
   1502 		 */
   1503 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   1504 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   1505 			mutex_exit(&so->so_lock);
   1506 			flags |= MSG_NOMARK;
   1507 			goto retry;
   1508 		}
   1509 
   1510 		goto out_locked;
   1511 	}
   1512 	/* so_queue_msg has already verified length and alignment */
   1513 	tpr = (union T_primitives *)mctlp->b_rptr;
   1514 	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
   1515 	switch (tpr->type) {
   1516 	case T_DATA_IND: {
   1517 		/*
   1518 		 * Set msg_flags to MSG_EOR based on
   1519 		 * MORE_flag and MOREDATA.
   1520 		 */
   1521 		mutex_enter(&so->so_lock);
   1522 		so->so_state &= ~SS_SAVEDEOR;
   1523 		if (!(tpr->data_ind.MORE_flag & 1)) {
   1524 			if (!(rval.r_val1 & MOREDATA))
   1525 				msg->msg_flags |= MSG_EOR;
   1526 			else
   1527 				so->so_state |= SS_SAVEDEOR;
   1528 		}
   1529 		freemsg(mctlp);
   1530 		/*
   1531 		 * If some data was received (i.e. not EOF) and the
   1532 		 * read/recv* has not been satisfied wait for some more.
   1533 		 */
   1534 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   1535 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   1536 			mutex_exit(&so->so_lock);
   1537 			flags |= MSG_NOMARK;
   1538 			goto retry;
   1539 		}
   1540 		goto out_locked;
   1541 	}
   1542 	case T_UNITDATA_IND: {
   1543 		void *addr;
   1544 		t_uscalar_t addrlen;
   1545 		void *abuf;
   1546 		t_uscalar_t optlen;
   1547 		void *opt;
   1548 
   1549 		if (namelen != 0) {
   1550 			/* Caller wants source address */
   1551 			addrlen = tpr->unitdata_ind.SRC_length;
   1552 			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
   1553 			    addrlen, 1);
   1554 			if (addr == NULL) {
   1555 				freemsg(mctlp);
   1556 				error = EPROTO;
   1557 				eprintsoline(so, error);
   1558 				goto out;
   1559 			}
   1560 			ASSERT(so->so_family != AF_UNIX);
   1561 		}
   1562 		optlen = tpr->unitdata_ind.OPT_length;
   1563 		if (optlen != 0) {
   1564 			t_uscalar_t ncontrollen;
   1565 
   1566 			/*
   1567 			 * Extract any source address option.
   1568 			 * Determine how large cmsg buffer is needed.
   1569 			 */
   1570 			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
   1571 			    optlen, __TPI_ALIGN_SIZE);
   1572 
   1573 			if (opt == NULL) {
   1574 				freemsg(mctlp);
   1575 				error = EPROTO;
   1576 				eprintsoline(so, error);
   1577 				goto out;
   1578 			}
   1579 			if (so->so_family == AF_UNIX)
   1580 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
   1581 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
   1582 			    !(flags & MSG_XPG4_2));
   1583 			if (controllen != 0)
   1584 				controllen = ncontrollen;
   1585 			else if (ncontrollen != 0)
   1586 				msg->msg_flags |= MSG_CTRUNC;
   1587 		} else {
   1588 			controllen = 0;
   1589 		}
   1590 
   1591 		if (namelen != 0) {
   1592 			/*
   1593 			 * Return address to caller.
   1594 			 * Caller handles truncation if length
   1595 			 * exceeds msg_namelen.
   1596 			 * NOTE: AF_UNIX NUL termination is ensured by
   1597 			 * the sender's copyin_name().
   1598 			 */
   1599 			abuf = kmem_alloc(addrlen, KM_SLEEP);
   1600 
   1601 			bcopy(addr, abuf, addrlen);
   1602 			msg->msg_name = abuf;
   1603 			msg->msg_namelen = addrlen;
   1604 		}
   1605 
   1606 		if (controllen != 0) {
   1607 			/*
   1608 			 * Return control msg to caller.
   1609 			 * Caller handles truncation if length
   1610 			 * exceeds msg_controllen.
   1611 			 */
   1612 			control = kmem_zalloc(controllen, KM_SLEEP);
   1613 
   1614 			error = so_opt2cmsg(mctlp, opt, optlen,
   1615 			    !(flags & MSG_XPG4_2), control, controllen);
   1616 			if (error) {
   1617 				freemsg(mctlp);
   1618 				if (msg->msg_namelen != 0)
   1619 					kmem_free(msg->msg_name,
   1620 					    msg->msg_namelen);
   1621 				kmem_free(control, controllen);
   1622 				eprintsoline(so, error);
   1623 				goto out;
   1624 			}
   1625 			msg->msg_control = control;
   1626 			msg->msg_controllen = controllen;
   1627 		}
   1628 
   1629 		freemsg(mctlp);
   1630 		goto out;
   1631 	}
   1632 	case T_OPTDATA_IND: {
   1633 		struct T_optdata_req *tdr;
   1634 		void *opt;
   1635 		t_uscalar_t optlen;
   1636 
   1637 		tdr = (struct T_optdata_req *)mctlp->b_rptr;
   1638 		optlen = tdr->OPT_length;
   1639 		if (optlen != 0) {
   1640 			t_uscalar_t ncontrollen;
   1641 			/*
   1642 			 * Determine how large cmsg buffer is needed.
   1643 			 */
   1644 			opt = sogetoff(mctlp,
   1645 			    tpr->optdata_ind.OPT_offset, optlen,
   1646 			    __TPI_ALIGN_SIZE);
   1647 
   1648 			if (opt == NULL) {
   1649 				freemsg(mctlp);
   1650 				error = EPROTO;
   1651 				eprintsoline(so, error);
   1652 				goto out;
   1653 			}
   1654 
   1655 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
   1656 			    !(flags & MSG_XPG4_2));
   1657 			if (controllen != 0)
   1658 				controllen = ncontrollen;
   1659 			else if (ncontrollen != 0)
   1660 				msg->msg_flags |= MSG_CTRUNC;
   1661 		} else {
   1662 			controllen = 0;
   1663 		}
   1664 
   1665 		if (controllen != 0) {
   1666 			/*
   1667 			 * Return control msg to caller.
   1668 			 * Caller handles truncation if length
   1669 			 * exceeds msg_controllen.
   1670 			 */
   1671 			control = kmem_zalloc(controllen, KM_SLEEP);
   1672 
   1673 			error = so_opt2cmsg(mctlp, opt, optlen,
   1674 			    !(flags & MSG_XPG4_2), control, controllen);
   1675 			if (error) {
   1676 				freemsg(mctlp);
   1677 				kmem_free(control, controllen);
   1678 				eprintsoline(so, error);
   1679 				goto out;
   1680 			}
   1681 			msg->msg_control = control;
   1682 			msg->msg_controllen = controllen;
   1683 		}
   1684 
   1685 		/*
   1686 		 * Set msg_flags to MSG_EOR based on
   1687 		 * DATA_flag and MOREDATA.
   1688 		 */
   1689 		mutex_enter(&so->so_lock);
   1690 		so->so_state &= ~SS_SAVEDEOR;
   1691 		if (!(tpr->data_ind.MORE_flag & 1)) {
   1692 			if (!(rval.r_val1 & MOREDATA))
   1693 				msg->msg_flags |= MSG_EOR;
   1694 			else
   1695 				so->so_state |= SS_SAVEDEOR;
   1696 		}
   1697 		freemsg(mctlp);
   1698 		/*
   1699 		 * If some data was received (i.e. not EOF) and the
   1700 		 * read/recv* has not been satisfied wait for some more.
   1701 		 * Not possible to wait if control info was received.
   1702 		 */
   1703 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   1704 		    controllen == 0 &&
   1705 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   1706 			mutex_exit(&so->so_lock);
   1707 			flags |= MSG_NOMARK;
   1708 			goto retry;
   1709 		}
   1710 		goto out_locked;
   1711 	}
   1712 	default:
   1713 		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
   1714 		    tpr->type);
   1715 		freemsg(mctlp);
   1716 		error = EPROTO;
   1717 		ASSERT(0);
   1718 	}
   1719 out:
   1720 	mutex_enter(&so->so_lock);
   1721 out_locked:
   1722 	ret = sod_rcv_done(so, suiop, uiop);
   1723 	if (ret != 0 && error == 0)
   1724 		error = ret;
   1725 
   1726 	so_unlock_read(so);	/* Clear SOREADLOCKED */
   1727 	mutex_exit(&so->so_lock);
   1728 
   1729 	SO_UNBLOCK_FALLBACK(so);
   1730 
   1731 	return (error);
   1732 }
   1733 
   1734 sonodeops_t so_sonodeops = {
   1735 	so_init,		/* sop_init	*/
   1736 	so_accept,		/* sop_accept   */
   1737 	so_bind,		/* sop_bind	*/
   1738 	so_listen,		/* sop_listen   */
   1739 	so_connect,		/* sop_connect  */
   1740 	so_recvmsg,		/* sop_recvmsg  */
   1741 	so_sendmsg,		/* sop_sendmsg  */
   1742 	so_sendmblk,		/* sop_sendmblk */
   1743 	so_getpeername,		/* sop_getpeername */
   1744 	so_getsockname,		/* sop_getsockname */
   1745 	so_shutdown,		/* sop_shutdown */
   1746 	so_getsockopt,		/* sop_getsockopt */
   1747 	so_setsockopt,		/* sop_setsockopt */
   1748 	so_ioctl,		/* sop_ioctl    */
   1749 	so_poll,		/* sop_poll	*/
   1750 	so_close,		/* sop_close */
   1751 };
   1752 
   1753 sock_upcalls_t so_upcalls = {
   1754 	so_newconn,
   1755 	so_connected,
   1756 	so_disconnected,
   1757 	so_opctl,
   1758 	so_queue_msg,
   1759 	so_set_prop,
   1760 	so_txq_full,
   1761 	so_signal_oob,
   1762 	so_zcopy_notify,
   1763 	so_set_error
   1764 };
   1765