Home | History | Annotate | Download | only in sockfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/t_lock.h>
     29 #include <sys/param.h>
     30 #include <sys/systm.h>
     31 #include <sys/buf.h>
     32 #include <sys/conf.h>
     33 #include <sys/cred.h>
     34 #include <sys/kmem.h>
     35 #include <sys/kmem_impl.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/vfs.h>
     38 #include <sys/vnode.h>
     39 #include <sys/debug.h>
     40 #include <sys/errno.h>
     41 #include <sys/time.h>
     42 #include <sys/file.h>
     43 #include <sys/open.h>
     44 #include <sys/user.h>
     45 #include <sys/termios.h>
     46 #include <sys/stream.h>
     47 #include <sys/strsubr.h>
     48 #include <sys/strsun.h>
     49 #include <sys/suntpi.h>
     50 #include <sys/ddi.h>
     51 #include <sys/esunddi.h>
     52 #include <sys/flock.h>
     53 #include <sys/modctl.h>
     54 #include <sys/vtrace.h>
     55 #include <sys/cmn_err.h>
     56 #include <sys/pathname.h>
     57 
     58 #include <sys/socket.h>
     59 #include <sys/socketvar.h>
     60 #include <sys/sockio.h>
     61 #include <netinet/in.h>
     62 #include <sys/un.h>
     63 #include <sys/strsun.h>
     64 
     65 #include <sys/tiuser.h>
     66 #define	_SUN_TPI_VERSION	2
     67 #include <sys/tihdr.h>
     68 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
     69 
     70 #include <c2/audit.h>
     71 
     72 #include <inet/common.h>
     73 #include <inet/ip.h>
     74 #include <inet/ip6.h>
     75 #include <inet/tcp.h>
     76 #include <inet/udp_impl.h>
     77 
     78 #include <sys/zone.h>
     79 
     80 #include <fs/sockfs/nl7c.h>
     81 #include <fs/sockfs/nl7curi.h>
     82 
     83 #include <inet/kssl/ksslapi.h>
     84 
     85 #include <fs/sockfs/sockcommon.h>
     86 #include <fs/sockfs/socktpi.h>
     87 #include <fs/sockfs/socktpi_impl.h>
     88 
     89 /*
     90  * Possible failures when memory can't be allocated. The documented behavior:
     91  *
     92  * 		5.5:			4.X:		XNET:
     93  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
     94  *							EINTR
     95  *	(4.X does not document EINTR but returns it)
     96  * bind:	ENOSR			-		ENOBUFS/ENOSR
     97  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
     98  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
     99  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
    100  *	(4.X getpeername and getsockname do not fail in practice)
    101  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
    102  * listen:	-			-		ENOBUFS
    103  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
    104  *							EINTR
    105  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
    106  *							EINTR
    107  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
    108  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
    109  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
    110  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
    111  *
    112  * Resolution. When allocation fails:
    113  *	recv: return EINTR
    114  *	send: return EINTR
    115  *	connect, accept: EINTR
    116  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
    117  *	socket, socketpair: ENOBUFS
    118  *	getpeername, getsockname: sleep
    119  *	getsockopt, setsockopt: sleep
    120  */
    121 
    122 #ifdef SOCK_TEST
    123 /*
    124  * Variables that make sockfs do something other than the standard TPI
    125  * for the AF_INET transports.
    126  *
    127  * solisten_tpi_tcp:
    128  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
     129  *	the transport is already bound. This is needed to avoid losing the
    130  *	port number should listen() do a T_UNBIND_REQ followed by a
    131  *	O_T_BIND_REQ.
    132  *
    133  * soconnect_tpi_udp:
    134  *	UDP and ICMP can handle a T_CONN_REQ.
    135  *	This is needed to make the sequence of connect(), getsockname()
    136  *	return the local IP address used to send packets to the connected to
    137  *	destination.
    138  *
    139  * soconnect_tpi_tcp:
    140  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
    141  *	Set this to non-zero to send TPI conformant messages to TCP in this
    142  *	respect. This is a performance optimization.
    143  *
    144  * soaccept_tpi_tcp:
    145  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
    146  *	This is a performance optimization that has been picked up in XTI.
    147  *
    148  * soaccept_tpi_multioptions:
    149  *	When inheriting SOL_SOCKET options from the listener to the accepting
    150  *	socket send them as a single message for AF_INET{,6}.
    151  */
    152 int solisten_tpi_tcp = 0;
    153 int soconnect_tpi_udp = 0;
    154 int soconnect_tpi_tcp = 0;
    155 int soaccept_tpi_tcp = 0;
    156 int soaccept_tpi_multioptions = 1;
    157 #else /* SOCK_TEST */
    158 #define	soconnect_tpi_tcp	0
    159 #define	soconnect_tpi_udp	0
    160 #define	solisten_tpi_tcp	0
    161 #define	soaccept_tpi_tcp	0
    162 #define	soaccept_tpi_multioptions	1
    163 #endif /* SOCK_TEST */
    164 
    165 #ifdef SOCK_TEST
    166 extern int do_useracc;
    167 extern clock_t sock_test_timelimit;
    168 #endif /* SOCK_TEST */
    169 
    170 /*
    171  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
    172  * applications working. Turn on this flag to disable these checks.
    173  */
    174 int xnet_skip_checks = 0;
    175 int xnet_check_print = 0;
    176 int xnet_truncate_print = 0;
    177 
    178 static void sotpi_destroy(struct sonode *);
    179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
    180     int, int *, cred_t *cr);
    181 
    182 static boolean_t	sotpi_info_create(struct sonode *, int);
    183 static void		sotpi_info_init(struct sonode *);
    184 static void 		sotpi_info_fini(struct sonode *);
    185 static void 		sotpi_info_destroy(struct sonode *);
    186 
    187 /*
    188  * Do direct function call to the transport layer below; this would
    189  * also allow the transport to utilize read-side synchronous stream
    190  * interface if necessary.  This is a /etc/system tunable that must
    191  * not be modified on a running system.  By default this is enabled
    192  * for performance reasons and may be disabled for debugging purposes.
    193  */
    194 boolean_t socktpi_direct = B_TRUE;
    195 
    196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
    197 
    198 extern	void sigintr(k_sigset_t *, int);
    199 extern	void sigunintr(k_sigset_t *);
    200 
    201 /* Sockets acting as an in-kernel SSL proxy */
    202 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
    203 		    strsigset_t *, strsigset_t *, strpollset_t *);
    204 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
    205 		    strsigset_t *, strsigset_t *, strpollset_t *);
    206 
    207 static int	sotpi_unbind(struct sonode *, int);
    208 
    209 /* TPI sockfs sonode operations */
    210 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
    211 		    int);
    212 static int	sotpi_accept(struct sonode *, int, struct cred *,
    213 		    struct sonode **);
    214 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
    215 		    int, struct cred *);
    216 static int	sotpi_listen(struct sonode *, int, struct cred *);
    217 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
    218 		    socklen_t, int, int, struct cred *);
    219 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
    220 		    struct uio *, struct cred *);
    221 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
    222 		    struct uio *, struct cred *);
    223 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
    224 		    struct cred *, mblk_t **);
    225 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
    226 		    struct uio *, void *, t_uscalar_t, int);
    227 static int	sodgram_direct(struct sonode *, struct sockaddr *,
    228 		    socklen_t, struct uio *, int);
    229 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
    230 		    socklen_t *, boolean_t, struct cred *);
    231 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
    232 		    socklen_t *, struct cred *);
    233 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
    234 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
    235 		    socklen_t *, int, struct cred *);
    236 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
    237 		    socklen_t, struct cred *);
    238 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
    239 		    int32_t *);
    240 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
    241 		    struct cred *, int32_t *);
    242 static int 	sotpi_poll(struct sonode *, short, int, short *,
    243 		    struct pollhead **);
    244 static int 	sotpi_close(struct sonode *, int, struct cred *);
    245 
    246 static int	i_sotpi_info_constructor(sotpi_info_t *);
    247 static void 	i_sotpi_info_destructor(sotpi_info_t *);
    248 
/*
 * Operations vector for TPI sockets.  sotpi_create() hands its address to
 * sonode_init(), and sotpi_destroy() asserts that so_ops still points here.
 *
 * The initializers are positional, so their order has to follow the member
 * order of sonodeops_t; the trailing comment on each entry names the slot
 * it is meant to fill.
 */
sonodeops_t sotpi_sonodeops = {
	sotpi_init,		/* sop_init		*/
	sotpi_accept,		/* sop_accept		*/
	sotpi_bind,		/* sop_bind		*/
	sotpi_listen,		/* sop_listen		*/
	sotpi_connect,		/* sop_connect		*/
	sotpi_recvmsg,		/* sop_recvmsg		*/
	sotpi_sendmsg,		/* sop_sendmsg		*/
	sotpi_sendmblk,		/* sop_sendmblk		*/
	sotpi_getpeername,	/* sop_getpeername	*/
	sotpi_getsockname,	/* sop_getsockname	*/
	sotpi_shutdown,		/* sop_shutdown		*/
	sotpi_getsockopt,	/* sop_getsockopt	*/
	sotpi_setsockopt,	/* sop_setsockopt	*/
	sotpi_ioctl,		/* sop_ioctl		*/
	sotpi_poll,		/* sop_poll		*/
	sotpi_close,		/* sop_close		*/
};
    267 
    268 /*
    269  * Return a TPI socket vnode.
    270  *
    271  * Note that sockets assume that the driver will clone (either itself
    272  * or by using the clone driver) i.e. a socket() call will always
    273  * result in a new vnode being created.
    274  */
    275 
    276 /*
    277  * Common create code for socket and accept. If tso is set the values
    278  * from that node is used instead of issuing a T_INFO_REQ.
    279  */
    280 
    281 /* ARGSUSED */
    282 static struct sonode *
    283 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
    284     int version, int sflags, int *errorp, cred_t *cr)
    285 {
    286 	struct sonode	*so;
    287 	kmem_cache_t 	*cp;
    288 	int		sfamily = family;
    289 
    290 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
    291 
    292 	if (family == AF_NCA) {
    293 		/*
    294 		 * The request is for an NCA socket so for NL7C use the
    295 		 * INET domain instead and mark NL7C_AF_NCA below.
    296 		 */
    297 		family = AF_INET;
    298 		/*
    299 		 * NL7C is not supported in the non-global zone,
    300 		 * we enforce this restriction here.
    301 		 */
    302 		if (getzoneid() != GLOBAL_ZONEID) {
    303 			*errorp = ENOTSUP;
    304 			return (NULL);
    305 		}
    306 	}
    307 
    308 	/*
    309 	 * to be compatible with old tpi socket implementation ignore
    310 	 * sleep flag (sflags) passed in
    311 	 */
    312 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
    313 	so = kmem_cache_alloc(cp, KM_SLEEP);
    314 	if (so == NULL) {
    315 		*errorp = ENOMEM;
    316 		return (NULL);
    317 	}
    318 
    319 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
    320 	sotpi_info_init(so);
    321 
    322 	if (sfamily == AF_NCA) {
    323 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
    324 	}
    325 
    326 	if (version == SOV_DEFAULT)
    327 		version = so_default_version;
    328 
    329 	so->so_version = (short)version;
    330 	*errorp = 0;
    331 
    332 	return (so);
    333 }
    334 
    335 static void
    336 sotpi_destroy(struct sonode *so)
    337 {
    338 	kmem_cache_t *cp;
    339 	struct sockparams *origsp;
    340 
    341 	/*
    342 	 * If there is a new dealloc function (ie. smod_destroy_func),
    343 	 * then it should check the correctness of the ops.
    344 	 */
    345 
    346 	ASSERT(so->so_ops == &sotpi_sonodeops);
    347 
    348 	origsp = SOTOTPI(so)->sti_orig_sp;
    349 
    350 	sotpi_info_fini(so);
    351 
    352 	if (so->so_state & SS_FALLBACK_COMP) {
    353 		/*
    354 		 * A fallback happend, which means that a sotpi_info_t struct
    355 		 * was allocated (as opposed to being allocated from the TPI
    356 		 * sonode cache. Therefore we explicitly free the struct
    357 		 * here.
    358 		 */
    359 		sotpi_info_destroy(so);
    360 		ASSERT(origsp != NULL);
    361 
    362 		origsp->sp_smod_info->smod_sock_destroy_func(so);
    363 		SOCKPARAMS_DEC_REF(origsp);
    364 	} else {
    365 		sonode_fini(so);
    366 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
    367 		    socktpi_cache;
    368 		kmem_cache_free(cp, so);
    369 	}
    370 }
    371 
    372 /* ARGSUSED1 */
    373 int
    374 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
    375 {
    376 	major_t maj;
    377 	dev_t newdev;
    378 	struct vnode *vp;
    379 	int error = 0;
    380 	struct stdata *stp;
    381 
    382 	sotpi_info_t *sti = SOTOTPI(so);
    383 
    384 	dprint(1, ("sotpi_init()\n"));
    385 
    386 	/*
    387 	 * over write the sleep flag passed in but that is ok
    388 	 * as tpi socket does not honor sleep flag.
    389 	 */
    390 	flags |= FREAD|FWRITE;
    391 
    392 	/*
    393 	 * Record in so_flag that it is a clone.
    394 	 */
    395 	if (getmajor(sti->sti_dev) == clone_major)
    396 		so->so_flag |= SOCLONE;
    397 
    398 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
    399 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
    400 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
    401 	    so->so_protocol == IPPROTO_IP)) {
    402 		/* Tell tcp or udp that it's talking to sockets */
    403 		flags |= SO_SOCKSTR;
    404 
    405 		/*
    406 		 * Here we indicate to socktpi_open() our attempt to
    407 		 * make direct calls between sockfs and transport.
    408 		 * The final decision is left to socktpi_open().
    409 		 */
    410 		sti->sti_direct = 1;
    411 
    412 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
    413 		if (so->so_type == SOCK_STREAM && tso != NULL) {
    414 			if (SOTOTPI(tso)->sti_direct) {
    415 				/*
    416 				 * Inherit sti_direct from listener and pass
    417 				 * SO_ACCEPTOR open flag to tcp, indicating
    418 				 * that this is an accept fast-path instance.
    419 				 */
    420 				flags |= SO_ACCEPTOR;
    421 			} else {
    422 				/*
    423 				 * sti_direct is not set on listener, meaning
    424 				 * that the listener has been converted from
    425 				 * a socket to a stream.  Ensure that the
    426 				 * acceptor inherits these settings.
    427 				 */
    428 				sti->sti_direct = 0;
    429 				flags &= ~SO_SOCKSTR;
    430 			}
    431 		}
    432 	}
    433 
    434 	/*
    435 	 * Tell local transport that it is talking to sockets.
    436 	 */
    437 	if (so->so_family == AF_UNIX) {
    438 		flags |= SO_SOCKSTR;
    439 	}
    440 
    441 	vp = SOTOV(so);
    442 	newdev = vp->v_rdev;
    443 	maj = getmajor(newdev);
    444 	ASSERT(STREAMSTAB(maj));
    445 
    446 	error = stropen(vp, &newdev, flags, cr);
    447 
    448 	stp = vp->v_stream;
    449 	if (error == 0) {
    450 		if (so->so_flag & SOCLONE)
    451 			ASSERT(newdev != vp->v_rdev);
    452 		mutex_enter(&so->so_lock);
    453 		sti->sti_dev = newdev;
    454 		vp->v_rdev = newdev;
    455 		mutex_exit(&so->so_lock);
    456 
    457 		if (stp->sd_flag & STRISTTY) {
    458 			/*
    459 			 * this is a post SVR4 tty driver - a socket can not
    460 			 * be a controlling terminal. Fail the open.
    461 			 */
    462 			(void) sotpi_close(so, flags, cr);
    463 			return (ENOTTY);	/* XXX */
    464 		}
    465 
    466 		ASSERT(stp->sd_wrq != NULL);
    467 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
    468 
    469 		/*
    470 		 * If caller is interested in doing direct function call
    471 		 * interface to/from transport module, probe the module
    472 		 * directly beneath the streamhead to see if it qualifies.
    473 		 *
    474 		 * We turn off the direct interface when qualifications fail.
    475 		 * In the acceptor case, we simply turn off the sti_direct
    476 		 * flag on the socket. We do the fallback after the accept
    477 		 * has completed, before the new socket is returned to the
    478 		 * application.
    479 		 */
    480 		if (sti->sti_direct) {
    481 			queue_t *tq = stp->sd_wrq->q_next;
    482 
    483 			/*
    484 			 * sti_direct is currently supported and tested
    485 			 * only for tcp/udp; this is the main reason to
    486 			 * have the following assertions.
    487 			 */
    488 			ASSERT(so->so_family == AF_INET ||
    489 			    so->so_family == AF_INET6);
    490 			ASSERT(so->so_protocol == IPPROTO_UDP ||
    491 			    so->so_protocol == IPPROTO_TCP ||
    492 			    so->so_protocol == IPPROTO_IP);
    493 			ASSERT(so->so_type == SOCK_DGRAM ||
    494 			    so->so_type == SOCK_STREAM);
    495 
    496 			/*
    497 			 * Abort direct call interface if the module directly
    498 			 * underneath the stream head is not defined with the
    499 			 * _D_DIRECT flag.  This could happen in the tcp or
    500 			 * udp case, when some other module is autopushed
    501 			 * above it, or for some reasons the expected module
    502 			 * isn't purely D_MP (which is the main requirement).
    503 			 */
    504 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
    505 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
    506 				int rval;
    507 
    508 				/* Continue on without direct calls */
    509 				sti->sti_direct = 0;
    510 
    511 				/*
    512 				 * Cannot issue ioctl on fallback socket since
    513 				 * there is no conn associated with the queue.
    514 				 * The fallback downcall will notify the proto
    515 				 * of the change.
    516 				 */
    517 				if (!(flags & SO_ACCEPTOR) &&
    518 				    !(flags & SO_FALLBACK)) {
    519 					if ((error = strioctl(vp,
    520 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
    521 					    cr, &rval)) != 0) {
    522 						(void) sotpi_close(so, flags,
    523 						    cr);
    524 						return (error);
    525 					}
    526 				}
    527 			}
    528 		}
    529 
    530 		if (flags & SO_FALLBACK) {
    531 			/*
    532 			 * The stream created does not have a conn.
    533 			 * do stream set up after conn has been assigned
    534 			 */
    535 			return (error);
    536 		}
    537 		if (error = so_strinit(so, tso)) {
    538 			(void) sotpi_close(so, flags, cr);
    539 			return (error);
    540 		}
    541 
    542 		/* Wildcard */
    543 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
    544 			int protocol = so->so_protocol;
    545 			/*
    546 			 * Issue SO_PROTOTYPE setsockopt.
    547 			 */
    548 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
    549 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
    550 			if (error != 0) {
    551 				(void) sotpi_close(so, flags, cr);
    552 				/*
    553 				 * Setsockopt often fails with ENOPROTOOPT but
    554 				 * socket() should fail with
    555 				 * EPROTONOSUPPORT/EPROTOTYPE.
    556 				 */
    557 				return (EPROTONOSUPPORT);
    558 			}
    559 		}
    560 
    561 	} else {
    562 		/*
    563 		 * While the same socket can not be reopened (unlike specfs)
    564 		 * the stream head sets STREOPENFAIL when the autopush fails.
    565 		 */
    566 		if ((stp != NULL) &&
    567 		    (stp->sd_flag & STREOPENFAIL)) {
    568 			/*
    569 			 * Open failed part way through.
    570 			 */
    571 			mutex_enter(&stp->sd_lock);
    572 			stp->sd_flag &= ~STREOPENFAIL;
    573 			mutex_exit(&stp->sd_lock);
    574 			(void) sotpi_close(so, flags, cr);
    575 			return (error);
    576 			/*NOTREACHED*/
    577 		}
    578 		ASSERT(stp == NULL);
    579 	}
    580 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
    581 	    "sockfs open:maj %d vp %p so %p error %d",
    582 	    maj, vp, so, error);
    583 	return (error);
    584 }
    585 
    586 /*
    587  * Bind the socket to an unspecified address in sockfs only.
    588  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
    589  * required in all cases.
    590  */
    591 static void
    592 so_automatic_bind(struct sonode *so)
    593 {
    594 	sotpi_info_t *sti = SOTOTPI(so);
    595 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
    596 
    597 	ASSERT(MUTEX_HELD(&so->so_lock));
    598 	ASSERT(!(so->so_state & SS_ISBOUND));
    599 	ASSERT(sti->sti_unbind_mp);
    600 
    601 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
    602 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
    603 	sti->sti_laddr_sa->sa_family = so->so_family;
    604 	so->so_state |= SS_ISBOUND;
    605 }
    606 
    607 
    608 /*
    609  * bind the socket.
    610  *
    611  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
    612  * are passed in we allow rebinding. Note that for backwards compatibility
    613  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
    614  * Thus the rebinding code is currently not executed.
    615  *
    616  * The constraints for rebinding are:
    617  * - it is a SOCK_DGRAM, or
    618  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
    619  *   and no listen() has been done.
    620  * This rebinding code was added based on some language in the XNET book
     621  * about not returning EINVAL if the protocol allows rebinding. However,
    622  * this language is not present in the Posix socket draft. Thus maybe the
    623  * rebinding logic should be deleted from the source.
    624  *
    625  * A null "name" can be used to unbind the socket if:
    626  * - it is a SOCK_DGRAM, or
    627  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
    628  *   and no listen() has been done.
    629  */
    630 /* ARGSUSED */
    631 static int
    632 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
    633     socklen_t namelen, int backlog, int flags, struct cred *cr)
    634 {
    635 	struct T_bind_req	bind_req;
    636 	struct T_bind_ack	*bind_ack;
    637 	int			error = 0;
    638 	mblk_t			*mp;
    639 	void			*addr;
    640 	t_uscalar_t		addrlen;
    641 	int			unbind_on_err = 1;
    642 	boolean_t		clear_acceptconn_on_err = B_FALSE;
    643 	boolean_t		restore_backlog_on_err = B_FALSE;
    644 	int			save_so_backlog;
    645 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
    646 	boolean_t		tcp_udp_xport;
    647 	void			*nl7c = NULL;
    648 	sotpi_info_t		*sti = SOTOTPI(so);
    649 
    650 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
    651 	    (void *)so, (void *)name, namelen, backlog, flags,
    652 	    pr_state(so->so_state, so->so_mode)));
    653 
    654 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
    655 
    656 	if (!(flags & _SOBIND_LOCK_HELD)) {
    657 		mutex_enter(&so->so_lock);
    658 		so_lock_single(so);	/* Set SOLOCKED */
    659 	} else {
    660 		ASSERT(MUTEX_HELD(&so->so_lock));
    661 		ASSERT(so->so_flag & SOLOCKED);
    662 	}
    663 
    664 	/*
    665 	 * Make sure that there is a preallocated unbind_req message
    666 	 * before binding. This message allocated when the socket is
    667 	 * created  but it might be have been consumed.
    668 	 */
    669 	if (sti->sti_unbind_mp == NULL) {
    670 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
    671 		/* NOTE: holding so_lock while sleeping */
    672 		sti->sti_unbind_mp =
    673 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
    674 		    cr);
    675 	}
    676 
    677 	if (flags & _SOBIND_REBIND) {
    678 		/*
    679 		 * Called from solisten after doing an sotpi_unbind() or
    680 		 * potentially without the unbind (latter for AF_INET{,6}).
    681 		 */
    682 		ASSERT(name == NULL && namelen == 0);
    683 
    684 		if (so->so_family == AF_UNIX) {
    685 			ASSERT(sti->sti_ux_bound_vp);
    686 			addr = &sti->sti_ux_laddr;
    687 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
    688 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
    689 			    "addr 0x%p, vp %p\n",
    690 			    addrlen,
    691 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
    692 			    (void *)sti->sti_ux_bound_vp));
    693 		} else {
    694 			addr = sti->sti_laddr_sa;
    695 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
    696 		}
    697 	} else if (flags & _SOBIND_UNSPEC) {
    698 		ASSERT(name == NULL && namelen == 0);
    699 
    700 		/*
    701 		 * The caller checked SS_ISBOUND but not necessarily
    702 		 * under so_lock
    703 		 */
    704 		if (so->so_state & SS_ISBOUND) {
    705 			/* No error */
    706 			goto done;
    707 		}
    708 
    709 		/* Set an initial local address */
    710 		switch (so->so_family) {
    711 		case AF_UNIX:
    712 			/*
    713 			 * Use an address with same size as struct sockaddr
    714 			 * just like BSD.
    715 			 */
    716 			sti->sti_laddr_len =
    717 			    (socklen_t)sizeof (struct sockaddr);
    718 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
    719 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
    720 			sti->sti_laddr_sa->sa_family = so->so_family;
    721 
    722 			/*
    723 			 * Pass down an address with the implicit bind
    724 			 * magic number and the rest all zeros.
    725 			 * The transport will return a unique address.
    726 			 */
    727 			sti->sti_ux_laddr.soua_vp = NULL;
    728 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
    729 			addr = &sti->sti_ux_laddr;
    730 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
    731 			break;
    732 
    733 		case AF_INET:
    734 		case AF_INET6:
    735 			/*
    736 			 * An unspecified bind in TPI has a NULL address.
    737 			 * Set the address in sockfs to have the sa_family.
    738 			 */
    739 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
    740 			    (socklen_t)sizeof (sin_t) :
    741 			    (socklen_t)sizeof (sin6_t);
    742 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
    743 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
    744 			sti->sti_laddr_sa->sa_family = so->so_family;
    745 			addr = NULL;
    746 			addrlen = 0;
    747 			break;
    748 
    749 		default:
    750 			/*
    751 			 * An unspecified bind in TPI has a NULL address.
    752 			 * Set the address in sockfs to be zero length.
    753 			 *
    754 			 * Can not assume there is a sa_family for all
    755 			 * protocol families. For example, AF_X25 does not
    756 			 * have a family field.
    757 			 */
    758 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
    759 			sti->sti_laddr_len = 0;	/* XXX correct? */
    760 			addr = NULL;
    761 			addrlen = 0;
    762 			break;
    763 		}
    764 
    765 	} else {
    766 		if (so->so_state & SS_ISBOUND) {
    767 			/*
    768 			 * If it is ok to rebind the socket, first unbind
    769 			 * with the transport. A rebind to the NULL address
    770 			 * is interpreted as an unbind.
    771 			 * Note that a bind to NULL in BSD does unbind the
    772 			 * socket but it fails with EINVAL.
    773 			 * Note that regular sockets set SOV_SOCKBSD i.e.
    774 			 * _SOBIND_SOCKBSD gets set here hence no type of
    775 			 * socket does currently allow rebinding.
    776 			 *
    777 			 * If the name is NULL just do an unbind.
    778 			 */
    779 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
    780 			    name != NULL) {
    781 				error = EINVAL;
    782 				unbind_on_err = 0;
    783 				eprintsoline(so, error);
    784 				goto done;
    785 			}
    786 			if ((so->so_mode & SM_CONNREQUIRED) &&
    787 			    (so->so_state & SS_CANTREBIND)) {
    788 				error = EINVAL;
    789 				unbind_on_err = 0;
    790 				eprintsoline(so, error);
    791 				goto done;
    792 			}
    793 			error = sotpi_unbind(so, 0);
    794 			if (error) {
    795 				eprintsoline(so, error);
    796 				goto done;
    797 			}
    798 			ASSERT(!(so->so_state & SS_ISBOUND));
    799 			if (name == NULL) {
    800 				so->so_state &=
    801 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
    802 				goto done;
    803 			}
    804 		}
    805 
    806 		/* X/Open requires this check */
    807 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
    808 			if (xnet_check_print) {
    809 				printf("sockfs: X/Open bind state check "
    810 				    "caused EINVAL\n");
    811 			}
    812 			error = EINVAL;
    813 			goto done;
    814 		}
    815 
    816 		switch (so->so_family) {
    817 		case AF_UNIX:
    818 			/*
    819 			 * All AF_UNIX addresses are nul terminated
    820 			 * when copied (copyin_name) in so the minimum
    821 			 * length is 3 bytes.
    822 			 */
    823 			if (name == NULL ||
    824 			    (ssize_t)namelen <= sizeof (short) + 1) {
    825 				error = EISDIR;
    826 				eprintsoline(so, error);
    827 				goto done;
    828 			}
    829 			/*
    830 			 * Verify so_family matches the bound family.
    831 			 * BSD does not check this for AF_UNIX resulting
    832 			 * in funny mknods.
    833 			 */
    834 			if (name->sa_family != so->so_family) {
    835 				error = EAFNOSUPPORT;
    836 				goto done;
    837 			}
    838 			break;
    839 		case AF_INET:
    840 			if (name == NULL) {
    841 				error = EINVAL;
    842 				eprintsoline(so, error);
    843 				goto done;
    844 			}
    845 			if ((size_t)namelen != sizeof (sin_t)) {
    846 				error = name->sa_family != so->so_family ?
    847 				    EAFNOSUPPORT : EINVAL;
    848 				eprintsoline(so, error);
    849 				goto done;
    850 			}
    851 			if ((flags & _SOBIND_XPG4_2) &&
    852 			    (name->sa_family != so->so_family)) {
    853 				/*
    854 				 * This check has to be made for X/Open
    855 				 * sockets however application failures have
    856 				 * been observed when it is applied to
    857 				 * all sockets.
    858 				 */
    859 				error = EAFNOSUPPORT;
    860 				eprintsoline(so, error);
    861 				goto done;
    862 			}
    863 			/*
    864 			 * Force a zero sa_family to match so_family.
    865 			 *
    866 			 * Some programs like inetd(1M) don't set the
    867 			 * family field. Other programs leave
    868 			 * sin_family set to garbage - SunOS 4.X does
    869 			 * not check the family field on a bind.
    870 			 * We use the family field that
    871 			 * was passed in to the socket() call.
    872 			 */
    873 			name->sa_family = so->so_family;
    874 			break;
    875 
    876 		case AF_INET6: {
    877 #ifdef DEBUG
    878 			sin6_t *sin6 = (sin6_t *)name;
    879 #endif /* DEBUG */
    880 
    881 			if (name == NULL) {
    882 				error = EINVAL;
    883 				eprintsoline(so, error);
    884 				goto done;
    885 			}
    886 			if ((size_t)namelen != sizeof (sin6_t)) {
    887 				error = name->sa_family != so->so_family ?
    888 				    EAFNOSUPPORT : EINVAL;
    889 				eprintsoline(so, error);
    890 				goto done;
    891 			}
    892 			if (name->sa_family != so->so_family) {
    893 				/*
    894 				 * With IPv6 we require the family to match
    895 				 * unlike in IPv4.
    896 				 */
    897 				error = EAFNOSUPPORT;
    898 				eprintsoline(so, error);
    899 				goto done;
    900 			}
    901 #ifdef DEBUG
    902 			/*
    903 			 * Verify that apps don't forget to clear
    904 			 * sin6_scope_id etc
    905 			 */
    906 			if (sin6->sin6_scope_id != 0 &&
    907 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
    908 				zcmn_err(getzoneid(), CE_WARN,
    909 				    "bind with uninitialized sin6_scope_id "
    910 				    "(%d) on socket. Pid = %d\n",
    911 				    (int)sin6->sin6_scope_id,
    912 				    (int)curproc->p_pid);
    913 			}
    914 			if (sin6->__sin6_src_id != 0) {
    915 				zcmn_err(getzoneid(), CE_WARN,
    916 				    "bind with uninitialized __sin6_src_id "
    917 				    "(%d) on socket. Pid = %d\n",
    918 				    (int)sin6->__sin6_src_id,
    919 				    (int)curproc->p_pid);
    920 			}
    921 #endif /* DEBUG */
    922 			break;
    923 		}
    924 		default:
    925 			/*
    926 			 * Don't do any length or sa_family check to allow
    927 			 * non-sockaddr style addresses.
    928 			 */
    929 			if (name == NULL) {
    930 				error = EINVAL;
    931 				eprintsoline(so, error);
    932 				goto done;
    933 			}
    934 			break;
    935 		}
    936 
    937 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
    938 			error = ENAMETOOLONG;
    939 			eprintsoline(so, error);
    940 			goto done;
    941 		}
    942 		/*
    943 		 * Save local address.
    944 		 */
    945 		sti->sti_laddr_len = (socklen_t)namelen;
    946 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
    947 		bcopy(name, sti->sti_laddr_sa, namelen);
    948 
    949 		addr = sti->sti_laddr_sa;
    950 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
    951 		switch (so->so_family) {
    952 		case AF_INET6:
    953 		case AF_INET:
    954 			break;
    955 		case AF_UNIX: {
    956 			struct sockaddr_un *soun =
    957 			    (struct sockaddr_un *)sti->sti_laddr_sa;
    958 			struct vnode *vp, *rvp;
    959 			struct vattr vattr;
    960 
    961 			ASSERT(sti->sti_ux_bound_vp == NULL);
    962 			/*
    963 			 * Create vnode for the specified path name.
    964 			 * Keep vnode held with a reference in sti_ux_bound_vp.
    965 			 * Use the vnode pointer as the address used in the
    966 			 * bind with the transport.
    967 			 *
    968 			 * Use the same mode as in BSD. In particular this does
    969 			 * not observe the umask.
    970 			 */
    971 			/* MAXPATHLEN + soun_family + nul termination */
    972 			if (sti->sti_laddr_len >
    973 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
    974 				error = ENAMETOOLONG;
    975 				eprintsoline(so, error);
    976 				goto done;
    977 			}
    978 			vattr.va_type = VSOCK;
    979 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
    980 			vattr.va_mask = AT_TYPE|AT_MODE;
    981 			/* NOTE: holding so_lock */
    982 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
    983 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
    984 			if (error) {
    985 				if (error == EEXIST)
    986 					error = EADDRINUSE;
    987 				eprintsoline(so, error);
    988 				goto done;
    989 			}
    990 			/*
    991 			 * Establish pointer from the underlying filesystem
    992 			 * vnode to the socket node.
    993 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
    994 			 * cross-linkage between the underlying filesystem
    995 			 * node and the socket node.
    996 			 */
    997 
    998 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
    999 				VN_HOLD(rvp);
   1000 				VN_RELE(vp);
   1001 				vp = rvp;
   1002 			}
   1003 
   1004 			ASSERT(SOTOV(so)->v_stream);
   1005 			mutex_enter(&vp->v_lock);
   1006 			vp->v_stream = SOTOV(so)->v_stream;
   1007 			sti->sti_ux_bound_vp = vp;
   1008 			mutex_exit(&vp->v_lock);
   1009 
   1010 			/*
   1011 			 * Use the vnode pointer value as a unique address
   1012 			 * (together with the magic number to avoid conflicts
   1013 			 * with implicit binds) in the transport provider.
   1014 			 */
   1015 			sti->sti_ux_laddr.soua_vp =
   1016 			    (void *)sti->sti_ux_bound_vp;
   1017 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
   1018 			addr = &sti->sti_ux_laddr;
   1019 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
   1020 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
   1021 			    addrlen,
   1022 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
   1023 			break;
   1024 		}
   1025 		} /* end switch (so->so_family) */
   1026 	}
   1027 
   1028 	/*
   1029 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
   1030 	 * the transport can start passing up T_CONN_IND messages
   1031 	 * as soon as it receives the bind req and strsock_proto()
   1032 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
   1033 	 */
   1034 	if (flags & _SOBIND_LISTEN) {
   1035 		if ((so->so_state & SS_ACCEPTCONN) == 0)
   1036 			clear_acceptconn_on_err = B_TRUE;
   1037 		save_so_backlog = so->so_backlog;
   1038 		restore_backlog_on_err = B_TRUE;
   1039 		so->so_state |= SS_ACCEPTCONN;
   1040 		so->so_backlog = backlog;
   1041 	}
   1042 
   1043 	/*
   1044 	 * If NL7C addr(s) have been configured check for addr/port match,
   1045 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
   1046 	 *
   1047 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
   1048 	 * family sockets only. If match mark as such.
   1049 	 */
   1050 	if (nl7c_enabled && ((addr != NULL &&
   1051 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
   1052 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
   1053 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
   1054 		/*
   1055 		 * NL7C is not supported in non-global zones,
   1056 		 * we enforce this restriction here.
   1057 		 */
   1058 		if (so->so_zoneid == GLOBAL_ZONEID) {
   1059 			/* An NL7C socket, mark it */
   1060 			sti->sti_nl7c_flags |= NL7C_ENABLED;
   1061 			if (nl7c == NULL) {
   1062 				/*
   1063 				 * Was an AF_NCA bind() so add it to the
   1064 				 * addr list for reporting purposes.
   1065 				 */
   1066 				nl7c = nl7c_add_addr(addr, addrlen);
   1067 			}
   1068 		} else
   1069 			nl7c = NULL;
   1070 	}
   1071 
   1072 	/*
   1073 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
   1074 	 * for other transports we will send in a O_T_BIND_REQ.
   1075 	 */
   1076 	if (tcp_udp_xport &&
   1077 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
   1078 		PRIM_type = T_BIND_REQ;
   1079 
   1080 	bind_req.PRIM_type = PRIM_type;
   1081 	bind_req.ADDR_length = addrlen;
   1082 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
   1083 	bind_req.CONIND_number = backlog;
   1084 	/* NOTE: holding so_lock while sleeping */
   1085 	mp = soallocproto2(&bind_req, sizeof (bind_req),
   1086 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
   1087 	sti->sti_laddr_valid = 0;
   1088 
   1089 	/* Done using sti_laddr_sa - can drop the lock */
   1090 	mutex_exit(&so->so_lock);
   1091 
   1092 	/*
   1093 	 * Intercept the bind_req message here to check if this <address/port>
   1094 	 * was configured as an SSL proxy server, or if another endpoint was
   1095 	 * already configured to act as a proxy for us.
   1096 	 *
   1097 	 * Note, only if NL7C not enabled for this socket.
   1098 	 */
   1099 	if (nl7c == NULL &&
   1100 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
   1101 	    so->so_type == SOCK_STREAM) {
   1102 
   1103 		if (sti->sti_kssl_ent != NULL) {
   1104 			kssl_release_ent(sti->sti_kssl_ent, so,
   1105 			    sti->sti_kssl_type);
   1106 			sti->sti_kssl_ent = NULL;
   1107 		}
   1108 
   1109 		sti->sti_kssl_type = kssl_check_proxy(mp, so,
   1110 		    &sti->sti_kssl_ent);
   1111 		switch (sti->sti_kssl_type) {
   1112 		case KSSL_NO_PROXY:
   1113 			break;
   1114 
   1115 		case KSSL_HAS_PROXY:
   1116 			mutex_enter(&so->so_lock);
   1117 			goto skip_transport;
   1118 
   1119 		case KSSL_IS_PROXY:
   1120 			break;
   1121 		}
   1122 	}
   1123 
   1124 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   1125 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
   1126 	if (error) {
   1127 		eprintsoline(so, error);
   1128 		mutex_enter(&so->so_lock);
   1129 		goto done;
   1130 	}
   1131 
   1132 	mutex_enter(&so->so_lock);
   1133 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
   1134 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
   1135 	if (error) {
   1136 		eprintsoline(so, error);
   1137 		goto done;
   1138 	}
   1139 skip_transport:
   1140 	ASSERT(mp);
   1141 	/*
   1142 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
   1143 	 * strsock_proto while the lock was dropped above, the bind
   1144 	 * is allowed to complete.
   1145 	 */
   1146 
   1147 	/* Mark as bound. This will be undone if we detect errors below. */
   1148 	if (flags & _SOBIND_NOXLATE) {
   1149 		ASSERT(so->so_family == AF_UNIX);
   1150 		sti->sti_faddr_noxlate = 1;
   1151 	}
   1152 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
   1153 	so->so_state |= SS_ISBOUND;
   1154 	ASSERT(sti->sti_unbind_mp);
   1155 
   1156 	/* note that we've already set SS_ACCEPTCONN above */
   1157 
   1158 	/*
   1159 	 * Recompute addrlen - an unspecied bind sent down an
   1160 	 * address of length zero but we expect the appropriate length
   1161 	 * in return.
   1162 	 */
   1163 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
   1164 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
   1165 
   1166 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
   1167 	/*
   1168 	 * The alignment restriction is really too strict but
   1169 	 * we want enough alignment to inspect the fields of
   1170 	 * a sockaddr_in.
   1171 	 */
   1172 	addr = sogetoff(mp, bind_ack->ADDR_offset,
   1173 	    bind_ack->ADDR_length,
   1174 	    __TPI_ALIGN_SIZE);
   1175 	if (addr == NULL) {
   1176 		freemsg(mp);
   1177 		error = EPROTO;
   1178 		eprintsoline(so, error);
   1179 		goto done;
   1180 	}
   1181 	if (!(flags & _SOBIND_UNSPEC)) {
   1182 		/*
   1183 		 * Verify that the transport didn't return something we
   1184 		 * did not want e.g. an address other than what we asked for.
   1185 		 *
   1186 		 * NOTE: These checks would go away if/when we switch to
   1187 		 * using the new TPI (in which the transport would fail
   1188 		 * the request instead of assigning a different address).
   1189 		 *
   1190 		 * NOTE2: For protocols that we don't know (i.e. any
   1191 		 * other than AF_INET6, AF_INET and AF_UNIX), we
   1192 		 * cannot know if the transport should be expected to
   1193 		 * return the same address as that requested.
   1194 		 *
   1195 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
   1196 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
   1197 		 *
   1198 		 * For example, in the case of netatalk it may be
   1199 		 * inappropriate for the transport to return the
   1200 		 * requested address (as it may have allocated a local
   1201 		 * port number in behaviour similar to that of an
   1202 		 * AF_INET bind request with a port number of zero).
   1203 		 *
   1204 		 * Given the definition of O_T_BIND_REQ, where the
   1205 		 * transport may bind to an address other than the
   1206 		 * requested address, it's not possible to determine
   1207 		 * whether a returned address that differs from the
   1208 		 * requested address is a reason to fail (because the
   1209 		 * requested address was not available) or succeed
   1210 		 * (because the transport allocated an appropriate
   1211 		 * address and/or port).
   1212 		 *
   1213 		 * sockfs currently requires that the transport return
   1214 		 * the requested address in the T_BIND_ACK, unless
   1215 		 * there is code here to allow for any discrepancy.
   1216 		 * Such code exists for AF_INET and AF_INET6.
   1217 		 *
   1218 		 * Netatalk chooses to return the requested address
   1219 		 * rather than the (correct) allocated address.  This
   1220 		 * means that netatalk violates the TPI specification
   1221 		 * (and would not function correctly if used from a
   1222 		 * TLI application), but it does mean that it works
   1223 		 * with sockfs.
   1224 		 *
   1225 		 * As noted above, using the newer XTI bind primitive
   1226 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
   1227 		 * allow sockfs to be more sure about whether or not
   1228 		 * the bind request had succeeded (as transports are
   1229 		 * not permitted to bind to a different address than
   1230 		 * that requested - they must return failure).
   1231 		 * Unfortunately, support for T_BIND_REQ may not be
   1232 		 * present in all transport implementations (netatalk,
   1233 		 * for example, doesn't have it), making the
   1234 		 * transition difficult.
   1235 		 */
   1236 		if (bind_ack->ADDR_length != addrlen) {
   1237 			/* Assumes that the requested address was in use */
   1238 			freemsg(mp);
   1239 			error = EADDRINUSE;
   1240 			eprintsoline(so, error);
   1241 			goto done;
   1242 		}
   1243 
   1244 		switch (so->so_family) {
   1245 		case AF_INET6:
   1246 		case AF_INET: {
   1247 			sin_t *rname, *aname;
   1248 
   1249 			rname = (sin_t *)addr;
   1250 			aname = (sin_t *)sti->sti_laddr_sa;
   1251 
   1252 			/*
   1253 			 * Take advantage of the alignment
   1254 			 * of sin_port and sin6_port which fall
   1255 			 * in the same place in their data structures.
   1256 			 * Just use sin_port for either address family.
   1257 			 *
   1258 			 * This may become a problem if (heaven forbid)
   1259 			 * there's a separate ipv6port_reserved... :-P
   1260 			 *
   1261 			 * Binding to port 0 has the semantics of letting
   1262 			 * the transport bind to any port.
   1263 			 *
   1264 			 * If the transport is TCP or UDP since we had sent
   1265 			 * a T_BIND_REQ we would not get a port other than
   1266 			 * what we asked for.
   1267 			 */
   1268 			if (tcp_udp_xport) {
   1269 				/*
   1270 				 * Pick up the new port number if we bound to
   1271 				 * port 0.
   1272 				 */
   1273 				if (aname->sin_port == 0)
   1274 					aname->sin_port = rname->sin_port;
   1275 				sti->sti_laddr_valid = 1;
   1276 				break;
   1277 			}
   1278 			if (aname->sin_port != 0 &&
   1279 			    aname->sin_port != rname->sin_port) {
   1280 				freemsg(mp);
   1281 				error = EADDRINUSE;
   1282 				eprintsoline(so, error);
   1283 				goto done;
   1284 			}
   1285 			/*
   1286 			 * Pick up the new port number if we bound to port 0.
   1287 			 */
   1288 			aname->sin_port = rname->sin_port;
   1289 
   1290 			/*
   1291 			 * Unfortunately, addresses aren't _quite_ the same.
   1292 			 */
   1293 			if (so->so_family == AF_INET) {
   1294 				if (aname->sin_addr.s_addr !=
   1295 				    rname->sin_addr.s_addr) {
   1296 					freemsg(mp);
   1297 					error = EADDRNOTAVAIL;
   1298 					eprintsoline(so, error);
   1299 					goto done;
   1300 				}
   1301 			} else {
   1302 				sin6_t *rname6 = (sin6_t *)rname;
   1303 				sin6_t *aname6 = (sin6_t *)aname;
   1304 
   1305 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
   1306 				    &rname6->sin6_addr)) {
   1307 					freemsg(mp);
   1308 					error = EADDRNOTAVAIL;
   1309 					eprintsoline(so, error);
   1310 					goto done;
   1311 				}
   1312 			}
   1313 			break;
   1314 		}
   1315 		case AF_UNIX:
   1316 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
   1317 				freemsg(mp);
   1318 				error = EADDRINUSE;
   1319 				eprintsoline(so, error);
   1320 				eprintso(so,
   1321 				    ("addrlen %d, addr 0x%x, vp %p\n",
   1322 				    addrlen, *((int *)addr),
   1323 				    (void *)sti->sti_ux_bound_vp));
   1324 				goto done;
   1325 			}
   1326 			sti->sti_laddr_valid = 1;
   1327 			break;
   1328 		default:
   1329 			/*
   1330 			 * NOTE: This assumes that addresses can be
   1331 			 * byte-compared for equivalence.
   1332 			 */
   1333 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
   1334 				freemsg(mp);
   1335 				error = EADDRINUSE;
   1336 				eprintsoline(so, error);
   1337 				goto done;
   1338 			}
   1339 			/*
   1340 			 * Don't mark sti_laddr_valid, as we cannot be
   1341 			 * sure that the returned address is the real
   1342 			 * bound address when talking to an unknown
   1343 			 * transport.
   1344 			 */
   1345 			break;
   1346 		}
   1347 	} else {
   1348 		/*
   1349 		 * Save for returned address for getsockname.
   1350 		 * Needed for unspecific bind unless transport supports
   1351 		 * the TI_GETMYNAME ioctl.
   1352 		 * Do this for AF_INET{,6} even though they do, as
   1353 		 * caching info here is much better performance than
   1354 		 * a TPI/STREAMS trip to the transport for getsockname.
   1355 		 * Any which can't for some reason _must_ _not_ set
   1356 		 * sti_laddr_valid here for the caching version of
   1357 		 * getsockname to not break;
   1358 		 */
   1359 		switch (so->so_family) {
   1360 		case AF_UNIX:
   1361 			/*
   1362 			 * Record the address bound with the transport
   1363 			 * for use by socketpair.
   1364 			 */
   1365 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
   1366 			sti->sti_laddr_valid = 1;
   1367 			break;
   1368 		case AF_INET:
   1369 		case AF_INET6:
   1370 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
   1371 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
   1372 			sti->sti_laddr_valid = 1;
   1373 			break;
   1374 		default:
   1375 			/*
   1376 			 * Don't mark sti_laddr_valid, as we cannot be
   1377 			 * sure that the returned address is the real
   1378 			 * bound address when talking to an unknown
   1379 			 * transport.
   1380 			 */
   1381 			break;
   1382 		}
   1383 	}
   1384 
   1385 	if (nl7c != NULL) {
   1386 		/* Register listen()er sonode pointer with NL7C */
   1387 		nl7c_listener_addr(nl7c, so);
   1388 	}
   1389 
   1390 	freemsg(mp);
   1391 
   1392 done:
   1393 	if (error) {
   1394 		/* reset state & backlog to values held on entry */
   1395 		if (clear_acceptconn_on_err == B_TRUE)
   1396 			so->so_state &= ~SS_ACCEPTCONN;
   1397 		if (restore_backlog_on_err == B_TRUE)
   1398 			so->so_backlog = save_so_backlog;
   1399 
   1400 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
   1401 			int err;
   1402 
   1403 			err = sotpi_unbind(so, 0);
   1404 			/* LINTED - statement has no consequent: if */
   1405 			if (err) {
   1406 				eprintsoline(so, error);
   1407 			} else {
   1408 				ASSERT(!(so->so_state & SS_ISBOUND));
   1409 			}
   1410 		}
   1411 	}
   1412 	if (!(flags & _SOBIND_LOCK_HELD)) {
   1413 		so_unlock_single(so, SOLOCKED);
   1414 		mutex_exit(&so->so_lock);
   1415 	} else {
   1416 		ASSERT(MUTEX_HELD(&so->so_lock));
   1417 		ASSERT(so->so_flag & SOLOCKED);
   1418 	}
   1419 	return (error);
   1420 }
   1421 
   1422 /* bind the socket */
   1423 static int
   1424 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
   1425     int flags, struct cred *cr)
   1426 {
   1427 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
   1428 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
   1429 
   1430 	flags &= ~_SOBIND_SOCKETPAIR;
   1431 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
   1432 }
   1433 
   1434 /*
   1435  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
   1436  * address, or when listen needs to unbind and bind.
   1437  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
   1438  * so that a sobind can pick them up.
   1439  */
   1440 static int
   1441 sotpi_unbind(struct sonode *so, int flags)
   1442 {
   1443 	struct T_unbind_req	unbind_req;
   1444 	int			error = 0;
   1445 	mblk_t			*mp;
   1446 	sotpi_info_t		*sti = SOTOTPI(so);
   1447 
   1448 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
   1449 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
   1450 
   1451 	ASSERT(MUTEX_HELD(&so->so_lock));
   1452 	ASSERT(so->so_flag & SOLOCKED);
   1453 
   1454 	if (!(so->so_state & SS_ISBOUND)) {
   1455 		error = EINVAL;
   1456 		eprintsoline(so, error);
   1457 		goto done;
   1458 	}
   1459 
   1460 	mutex_exit(&so->so_lock);
   1461 
   1462 	/*
   1463 	 * Flush the read and write side (except stream head read queue)
   1464 	 * and send down T_UNBIND_REQ.
   1465 	 */
   1466 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
   1467 
   1468 	unbind_req.PRIM_type = T_UNBIND_REQ;
   1469 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
   1470 	    0, _ALLOC_SLEEP, CRED());
   1471 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   1472 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
   1473 	mutex_enter(&so->so_lock);
   1474 	if (error) {
   1475 		eprintsoline(so, error);
   1476 		goto done;
   1477 	}
   1478 
   1479 	error = sowaitokack(so, T_UNBIND_REQ);
   1480 	if (error) {
   1481 		eprintsoline(so, error);
   1482 		goto done;
   1483 	}
   1484 
   1485 	/*
   1486 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
   1487 	 * strsock_proto while the lock was dropped above, the unbind
   1488 	 * is allowed to complete.
   1489 	 */
   1490 	if (!(flags & _SOUNBIND_REBIND)) {
   1491 		/*
   1492 		 * Clear out bound address.
   1493 		 */
   1494 		vnode_t *vp;
   1495 
   1496 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
   1497 
   1498 			/* Undo any SSL proxy setup */
   1499 			if ((so->so_family == AF_INET ||
   1500 			    so->so_family == AF_INET6) &&
   1501 			    (so->so_type == SOCK_STREAM) &&
   1502 			    (sti->sti_kssl_ent != NULL)) {
   1503 				kssl_release_ent(sti->sti_kssl_ent, so,
   1504 				    sti->sti_kssl_type);
   1505 				sti->sti_kssl_ent = NULL;
   1506 				sti->sti_kssl_type = KSSL_NO_PROXY;
   1507 			}
   1508 			sti->sti_ux_bound_vp = NULL;
   1509 			vn_rele_stream(vp);
   1510 		}
   1511 		/* Clear out address */
   1512 		sti->sti_laddr_len = 0;
   1513 	}
   1514 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
   1515 	sti->sti_laddr_valid = 0;
   1516 
   1517 done:
   1518 
   1519 	/* If the caller held the lock don't release it here */
   1520 	ASSERT(MUTEX_HELD(&so->so_lock));
   1521 	ASSERT(so->so_flag & SOLOCKED);
   1522 
   1523 	return (error);
   1524 }
   1525 
   1526 /*
   1527  * listen on the socket.
   1528  * For TPI conforming transports this has to first unbind with the transport
   1529  * and then bind again using the new backlog.
   1530  */
   1531 /* ARGSUSED */
   1532 int
   1533 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
   1534 {
   1535 	int		error = 0;
   1536 	sotpi_info_t	*sti = SOTOTPI(so);
   1537 
   1538 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
   1539 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
   1540 
   1541 	if (sti->sti_serv_type == T_CLTS)
   1542 		return (EOPNOTSUPP);
   1543 
   1544 	/*
   1545 	 * If the socket is ready to accept connections already, then
   1546 	 * return without doing anything.  This avoids a problem where
   1547 	 * a second listen() call fails if a connection is pending and
   1548 	 * leaves the socket unbound. Only when we are not unbinding
   1549 	 * with the transport can we safely increase the backlog.
   1550 	 */
   1551 	if (so->so_state & SS_ACCEPTCONN &&
   1552 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
   1553 	    /*CONSTCOND*/
   1554 	    !solisten_tpi_tcp))
   1555 		return (0);
   1556 
   1557 	if (so->so_state & SS_ISCONNECTED)
   1558 		return (EINVAL);
   1559 
   1560 	mutex_enter(&so->so_lock);
   1561 	so_lock_single(so);	/* Set SOLOCKED */
   1562 
   1563 	/*
   1564 	 * If the listen doesn't change the backlog we do nothing.
   1565 	 * This avoids an EPROTO error from the transport.
   1566 	 */
   1567 	if ((so->so_state & SS_ACCEPTCONN) &&
   1568 	    so->so_backlog == backlog)
   1569 		goto done;
   1570 
   1571 	if (!(so->so_state & SS_ISBOUND)) {
   1572 		/*
   1573 		 * Must have been explicitly bound in the UNIX domain.
   1574 		 */
   1575 		if (so->so_family == AF_UNIX) {
   1576 			error = EINVAL;
   1577 			goto done;
   1578 		}
   1579 		error = sotpi_bindlisten(so, NULL, 0, backlog,
   1580 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
   1581 	} else if (backlog > 0) {
   1582 		/*
   1583 		 * AF_INET{,6} hack to avoid losing the port.
   1584 		 * Assumes that all AF_INET{,6} transports can handle a
   1585 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
   1586 		 * has already bound thus it is possible to avoid the unbind.
   1587 		 */
   1588 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
   1589 		    /*CONSTCOND*/
   1590 		    !solisten_tpi_tcp)) {
   1591 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
   1592 			if (error)
   1593 				goto done;
   1594 		}
   1595 		error = sotpi_bindlisten(so, NULL, 0, backlog,
   1596 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
   1597 	} else {
   1598 		so->so_state |= SS_ACCEPTCONN;
   1599 		so->so_backlog = backlog;
   1600 	}
   1601 	if (error)
   1602 		goto done;
   1603 	ASSERT(so->so_state & SS_ACCEPTCONN);
   1604 done:
   1605 	so_unlock_single(so, SOLOCKED);
   1606 	mutex_exit(&so->so_lock);
   1607 	return (error);
   1608 }
   1609 
   1610 /*
   1611  * Disconnect either a specified seqno or all (-1).
   1612  * The former is used on listening sockets only.
   1613  *
   1614  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
   1615  * the current use of sodisconnect(seqno == -1) is only for shutdown
   1616  * so there is no point (and potentially incorrect) to unbind.
   1617  */
   1618 static int
   1619 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
   1620 {
   1621 	struct T_discon_req	discon_req;
   1622 	int			error = 0;
   1623 	mblk_t			*mp;
   1624 
   1625 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
   1626 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
   1627 
   1628 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
   1629 		mutex_enter(&so->so_lock);
   1630 		so_lock_single(so);	/* Set SOLOCKED */
   1631 	} else {
   1632 		ASSERT(MUTEX_HELD(&so->so_lock));
   1633 		ASSERT(so->so_flag & SOLOCKED);
   1634 	}
   1635 
   1636 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
   1637 		error = EINVAL;
   1638 		eprintsoline(so, error);
   1639 		goto done;
   1640 	}
   1641 
   1642 	mutex_exit(&so->so_lock);
   1643 	/*
   1644 	 * Flush the write side (unless this is a listener)
   1645 	 * and then send down a T_DISCON_REQ.
   1646 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
   1647 	 * and other messages.)
   1648 	 */
   1649 	if (!(so->so_state & SS_ACCEPTCONN))
   1650 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
   1651 
   1652 	discon_req.PRIM_type = T_DISCON_REQ;
   1653 	discon_req.SEQ_number = seqno;
   1654 	mp = soallocproto1(&discon_req, sizeof (discon_req),
   1655 	    0, _ALLOC_SLEEP, CRED());
   1656 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   1657 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
   1658 	mutex_enter(&so->so_lock);
   1659 	if (error) {
   1660 		eprintsoline(so, error);
   1661 		goto done;
   1662 	}
   1663 
   1664 	error = sowaitokack(so, T_DISCON_REQ);
   1665 	if (error) {
   1666 		eprintsoline(so, error);
   1667 		goto done;
   1668 	}
   1669 	/*
   1670 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
   1671 	 * strsock_proto while the lock was dropped above, the disconnect
   1672 	 * is allowed to complete. However, it is not possible to
   1673 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
   1674 	 */
   1675 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
   1676 	SOTOTPI(so)->sti_laddr_valid = 0;
   1677 	SOTOTPI(so)->sti_faddr_valid = 0;
   1678 done:
   1679 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
   1680 		so_unlock_single(so, SOLOCKED);
   1681 		mutex_exit(&so->so_lock);
   1682 	} else {
   1683 		/* If the caller held the lock don't release it here */
   1684 		ASSERT(MUTEX_HELD(&so->so_lock));
   1685 		ASSERT(so->so_flag & SOLOCKED);
   1686 	}
   1687 	return (error);
   1688 }
   1689 
   1690 /* ARGSUSED */
   1691 int
   1692 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
   1693     struct sonode **nsop)
   1694 {
   1695 	struct T_conn_ind	*conn_ind;
   1696 	struct T_conn_res	*conn_res;
   1697 	int			error = 0;
   1698 	mblk_t			*mp, *ctxmp, *ack_mp;
   1699 	struct sonode		*nso;
   1700 	vnode_t			*nvp;
   1701 	void			*src;
   1702 	t_uscalar_t		srclen;
   1703 	void			*opt;
   1704 	t_uscalar_t		optlen;
   1705 	t_scalar_t		PRIM_type;
   1706 	t_scalar_t		SEQ_number;
   1707 	size_t			sinlen;
   1708 	sotpi_info_t		*sti = SOTOTPI(so);
   1709 	sotpi_info_t		*nsti;
   1710 
   1711 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
   1712 	    (void *)so, fflag, (void *)nsop,
   1713 	    pr_state(so->so_state, so->so_mode)));
   1714 
   1715 	/*
   1716 	 * Defer single-threading the accepting socket until
   1717 	 * the T_CONN_IND has been received and parsed and the
   1718 	 * new sonode has been opened.
   1719 	 */
   1720 
   1721 	/* Check that we are not already connected */
   1722 	if ((so->so_state & SS_ACCEPTCONN) == 0)
   1723 		goto conn_bad;
   1724 again:
   1725 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
   1726 		goto e_bad;
   1727 
   1728 	ASSERT(mp != NULL);
   1729 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
   1730 	ctxmp = mp->b_cont;
   1731 
   1732 	/*
   1733 	 * Save SEQ_number for error paths.
   1734 	 */
   1735 	SEQ_number = conn_ind->SEQ_number;
   1736 
   1737 	srclen = conn_ind->SRC_length;
   1738 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
   1739 	if (src == NULL) {
   1740 		error = EPROTO;
   1741 		freemsg(mp);
   1742 		eprintsoline(so, error);
   1743 		goto disconnect_unlocked;
   1744 	}
   1745 	optlen = conn_ind->OPT_length;
   1746 	switch (so->so_family) {
   1747 	case AF_INET:
   1748 	case AF_INET6:
   1749 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
   1750 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
   1751 			    &opt, conn_ind->OPT_length);
   1752 		} else {
   1753 			/*
   1754 			 * The transport (in this case TCP) hasn't sent up
   1755 			 * a pointer to an instance for the accept fast-path.
   1756 			 * Disable fast-path completely because the call to
   1757 			 * sotpi_create() below would otherwise create an
   1758 			 * incomplete TCP instance, which would lead to
   1759 			 * problems when sockfs sends a normal T_CONN_RES
   1760 			 * message down the new stream.
   1761 			 */
   1762 			if (sti->sti_direct) {
   1763 				int rval;
   1764 				/*
   1765 				 * For consistency we inform tcp to disable
   1766 				 * direct interface on the listener, though
   1767 				 * we can certainly live without doing this
   1768 				 * because no data will ever travel upstream
   1769 				 * on the listening socket.
   1770 				 */
   1771 				sti->sti_direct = 0;
   1772 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
   1773 				    0, 0, K_TO_K, cr, &rval);
   1774 			}
   1775 			opt = NULL;
   1776 			optlen = 0;
   1777 		}
   1778 		break;
   1779 	case AF_UNIX:
   1780 	default:
   1781 		if (optlen != 0) {
   1782 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
   1783 			    __TPI_ALIGN_SIZE);
   1784 			if (opt == NULL) {
   1785 				error = EPROTO;
   1786 				freemsg(mp);
   1787 				eprintsoline(so, error);
   1788 				goto disconnect_unlocked;
   1789 			}
   1790 		}
   1791 		if (so->so_family == AF_UNIX) {
   1792 			if (!sti->sti_faddr_noxlate) {
   1793 				src = NULL;
   1794 				srclen = 0;
   1795 			}
   1796 			/* Extract src address from options */
   1797 			if (optlen != 0)
   1798 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
   1799 		}
   1800 		break;
   1801 	}
   1802 
   1803 	/*
   1804 	 * Create the new socket.
   1805 	 */
   1806 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
   1807 	if (nso == NULL) {
   1808 		ASSERT(error != 0);
   1809 		/*
   1810 		 * Accept can not fail with ENOBUFS. sotpi_create
   1811 		 * sleeps waiting for memory until a signal is caught
   1812 		 * so return EINTR.
   1813 		 */
   1814 		freemsg(mp);
   1815 		if (error == ENOBUFS)
   1816 			error = EINTR;
   1817 		goto e_disc_unl;
   1818 	}
   1819 	nvp = SOTOV(nso);
   1820 	nsti = SOTOTPI(nso);
   1821 
   1822 	/*
   1823 	 * If the transport sent up an SSL connection context, then attach
   1824 	 * it to the new socket, and set the (sd_wputdatafunc)() and
   1825 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
   1826 	 * SSL records.
   1827 	 */
   1828 	if (ctxmp != NULL) {
   1829 		/*
   1830 		 * This kssl_ctx_t is already held for us by the transport.
   1831 		 * So, we don't need to do a kssl_hold_ctx() here.
   1832 		 */
   1833 		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
   1834 		freemsg(ctxmp);
   1835 		mp->b_cont = NULL;
   1836 		strsetrwputdatahooks(nvp, strsock_kssl_input,
   1837 		    strsock_kssl_output);
   1838 	}
   1839 #ifdef DEBUG
   1840 	/*
   1841 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
   1842 	 * it's inherited early to allow debugging of the accept code itself.
   1843 	 */
   1844 	nso->so_options |= so->so_options & SO_DEBUG;
   1845 #endif /* DEBUG */
   1846 
   1847 	/*
   1848 	 * Save the SRC address from the T_CONN_IND
   1849 	 * for getpeername to work on AF_UNIX and on transports that do not
   1850 	 * support TI_GETPEERNAME.
   1851 	 *
   1852 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
   1853 	 * copyin_name().
   1854 	 */
   1855 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
   1856 		error = EINVAL;
   1857 		freemsg(mp);
   1858 		eprintsoline(so, error);
   1859 		goto disconnect_vp_unlocked;
   1860 	}
   1861 	nsti->sti_faddr_len = (socklen_t)srclen;
   1862 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
   1863 	bcopy(src, nsti->sti_faddr_sa, srclen);
   1864 	nsti->sti_faddr_valid = 1;
   1865 
   1866 	/*
   1867 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
   1868 	 */
   1869 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
   1870 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
   1871 		cred_t	*cr;
   1872 		pid_t	cpid;
   1873 
   1874 		cr = msg_getcred(mp, &cpid);
   1875 		if (cr != NULL) {
   1876 			crhold(cr);
   1877 			nso->so_peercred = cr;
   1878 			nso->so_cpid = cpid;
   1879 		}
   1880 		freemsg(mp);
   1881 
   1882 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
   1883 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
   1884 		if (mp == NULL) {
   1885 			/*
   1886 			 * Accept can not fail with ENOBUFS.
   1887 			 * A signal was caught so return EINTR.
   1888 			 */
   1889 			error = EINTR;
   1890 			eprintsoline(so, error);
   1891 			goto disconnect_vp_unlocked;
   1892 		}
   1893 		conn_res = (struct T_conn_res *)mp->b_rptr;
   1894 	} else {
   1895 		/*
   1896 		 * For efficiency reasons we use msg_extractcred; no crhold
   1897 		 * needed since db_credp is cleared (i.e., we move the cred
   1898 		 * from the message to so_peercred).
   1899 		 */
   1900 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
   1901 
   1902 		mp->b_rptr = DB_BASE(mp);
   1903 		conn_res = (struct T_conn_res *)mp->b_rptr;
   1904 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
   1905 
   1906 		mblk_setcred(mp, cr, curproc->p_pid);
   1907 	}
   1908 
   1909 	/*
   1910 	 * New socket must be bound at least in sockfs and, except for AF_INET,
   1911 	 * (or AF_INET6) it also has to be bound in the transport provider.
   1912 	 * We set the local address in the sonode from the T_OK_ACK of the
   1913 	 * T_CONN_RES. For this reason the address we bind to here isn't
   1914 	 * important.
   1915 	 */
   1916 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
   1917 	    /*CONSTCOND*/
   1918 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
   1919 		/*
   1920 		 * Optimization for AF_INET{,6} transports
   1921 		 * that can handle a T_CONN_RES without being bound.
   1922 		 */
   1923 		mutex_enter(&nso->so_lock);
   1924 		so_automatic_bind(nso);
   1925 		mutex_exit(&nso->so_lock);
   1926 	} else {
   1927 		/* Perform NULL bind with the transport provider. */
   1928 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
   1929 		    cr)) != 0) {
   1930 			ASSERT(error != ENOBUFS);
   1931 			freemsg(mp);
   1932 			eprintsoline(nso, error);
   1933 			goto disconnect_vp_unlocked;
   1934 		}
   1935 	}
   1936 
   1937 	/*
   1938 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
   1939 	 * so that any data arriving on the new socket will cause the
   1940 	 * appropriate signals to be delivered for the new socket.
   1941 	 *
   1942 	 * No other thread (except strsock_proto and strsock_misc)
   1943 	 * can access the new socket thus we relax the locking.
   1944 	 */
   1945 	nso->so_pgrp = so->so_pgrp;
   1946 	nso->so_state |= so->so_state & SS_ASYNC;
   1947 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
   1948 
   1949 	if (nso->so_pgrp != 0) {
   1950 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
   1951 			eprintsoline(nso, error);
   1952 			error = 0;
   1953 			nso->so_pgrp = 0;
   1954 		}
   1955 	}
   1956 
   1957 	/*
   1958 	 * Make note of the socket level options. TCP and IP level options
   1959 	 * are already inherited. We could do all this after accept is
   1960 	 * successful but doing it here simplifies code and no harm done
   1961 	 * for error case.
   1962 	 */
   1963 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
   1964 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
   1965 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
   1966 	nso->so_sndbuf = so->so_sndbuf;
   1967 	nso->so_rcvbuf = so->so_rcvbuf;
   1968 	if (nso->so_options & SO_LINGER)
   1969 		nso->so_linger = so->so_linger;
   1970 
   1971 	/*
   1972 	 * Note that the following sti_direct code path should be
   1973 	 * removed once we are confident that the direct sockets
   1974 	 * do not result in any degradation.
   1975 	 */
   1976 	if (sti->sti_direct) {
   1977 
   1978 		ASSERT(opt != NULL);
   1979 
   1980 		conn_res->OPT_length = optlen;
   1981 		conn_res->OPT_offset = MBLKL(mp);
   1982 		bcopy(&opt, mp->b_wptr, optlen);
   1983 		mp->b_wptr += optlen;
   1984 		conn_res->PRIM_type = T_CONN_RES;
   1985 		conn_res->ACCEPTOR_id = 0;
   1986 		PRIM_type = T_CONN_RES;
   1987 
   1988 		/* Send down the T_CONN_RES on acceptor STREAM */
   1989 		error = kstrputmsg(SOTOV(nso), mp, NULL,
   1990 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
   1991 		if (error) {
   1992 			mutex_enter(&so->so_lock);
   1993 			so_lock_single(so);
   1994 			eprintsoline(so, error);
   1995 			goto disconnect_vp;
   1996 		}
   1997 		mutex_enter(&nso->so_lock);
   1998 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
   1999 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
   2000 		if (error) {
   2001 			mutex_exit(&nso->so_lock);
   2002 			mutex_enter(&so->so_lock);
   2003 			so_lock_single(so);
   2004 			eprintsoline(so, error);
   2005 			goto disconnect_vp;
   2006 		}
   2007 		if (nso->so_family == AF_INET) {
   2008 			sin_t *sin;
   2009 
   2010 			sin = (sin_t *)(ack_mp->b_rptr +
   2011 			    sizeof (struct T_ok_ack));
   2012 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
   2013 			nsti->sti_laddr_len = sizeof (sin_t);
   2014 		} else {
   2015 			sin6_t *sin6;
   2016 
   2017 			sin6 = (sin6_t *)(ack_mp->b_rptr +
   2018 			    sizeof (struct T_ok_ack));
   2019 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
   2020 			nsti->sti_laddr_len = sizeof (sin6_t);
   2021 		}
   2022 		freemsg(ack_mp);
   2023 
   2024 		nso->so_state |= SS_ISCONNECTED;
   2025 		nso->so_proto_handle = (sock_lower_handle_t)opt;
   2026 		nsti->sti_laddr_valid = 1;
   2027 
   2028 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
   2029 			/*
   2030 			 * A NL7C marked listen()er so the new socket
   2031 			 * inherits the listen()er's NL7C state, except
   2032 			 * for NL7C_POLLIN.
   2033 			 *
   2034 			 * Only call NL7C to process the new socket if
   2035 			 * the listen socket allows blocking i/o.
   2036 			 */
   2037 			nsti->sti_nl7c_flags =
   2038 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
   2039 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
   2040 				/*
   2041 				 * Nonblocking accept() just make it
   2042 				 * persist to defer processing to the
   2043 				 * read-side syscall (e.g. read).
   2044 				 */
   2045 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
   2046 			} else if (nl7c_process(nso, B_FALSE)) {
   2047 				/*
   2048 				 * NL7C has completed processing on the
   2049 				 * socket, close the socket and back to
   2050 				 * the top to await the next T_CONN_IND.
   2051 				 */
   2052 				mutex_exit(&nso->so_lock);
   2053 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
   2054 				    cr, NULL);
   2055 				VN_RELE(nvp);
   2056 				goto again;
   2057 			}
   2058 			/* Pass the new socket out */
   2059 		}
   2060 
   2061 		mutex_exit(&nso->so_lock);
   2062 
   2063 		/*
   2064 		 * It's possible, through the use of autopush for example,
   2065 		 * that the acceptor stream may not support sti_direct
   2066 		 * semantics. If the new socket does not support sti_direct
   2067 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
   2068 		 * as we would in the I_PUSH case.
   2069 		 */
   2070 		if (nsti->sti_direct == 0) {
   2071 			int	rval;
   2072 
   2073 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
   2074 			    0, 0, K_TO_K, cr, &rval)) != 0) {
   2075 				mutex_enter(&so->so_lock);
   2076 				so_lock_single(so);
   2077 				eprintsoline(so, error);
   2078 				goto disconnect_vp;
   2079 			}
   2080 		}
   2081 
   2082 		/*
   2083 		 * Pass out new socket.
   2084 		 */
   2085 		if (nsop != NULL)
   2086 			*nsop = nso;
   2087 
   2088 		return (0);
   2089 	}
   2090 
   2091 	/*
   2092 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
   2093 	 * which don't support the FireEngine accept fast-path. It is also
   2094 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
   2095 	 * again. Neither sockfs nor TCP attempt to find out if some other
   2096 	 * random module has been inserted in between (in which case we
   2097 	 * should follow TLI accept behaviour). We blindly assume the worst
   2098 	 * case and revert back to old behaviour i.e. TCP will not send us
   2099 	 * any option (eager) and the accept should happen on the listener
   2100 	 * queue. Any queued T_conn_ind have already got their options removed
   2101 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
   2102 	 */
   2103 	/*
   2104 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
   2105 	 */
   2106 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
   2107 #ifdef	_ILP32
   2108 		queue_t	*q;
   2109 
   2110 		/*
   2111 		 * Find read queue in driver
   2112 		 * Can safely do this since we "own" nso/nvp.
   2113 		 */
   2114 		q = strvp2wq(nvp)->q_next;
   2115 		while (SAMESTR(q))
   2116 			q = q->q_next;
   2117 		q = RD(q);
   2118 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
   2119 #else
   2120 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
   2121 #endif	/* _ILP32 */
   2122 		conn_res->PRIM_type = O_T_CONN_RES;
   2123 		PRIM_type = O_T_CONN_RES;
   2124 	} else {
   2125 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
   2126 		conn_res->PRIM_type = T_CONN_RES;
   2127 		PRIM_type = T_CONN_RES;
   2128 	}
   2129 	conn_res->SEQ_number = SEQ_number;
   2130 	conn_res->OPT_length = 0;
   2131 	conn_res->OPT_offset = 0;
   2132 
   2133 	mutex_enter(&so->so_lock);
   2134 	so_lock_single(so);	/* Set SOLOCKED */
   2135 	mutex_exit(&so->so_lock);
   2136 
   2137 	error = kstrputmsg(SOTOV(so), mp, NULL,
   2138 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
   2139 	mutex_enter(&so->so_lock);
   2140 	if (error) {
   2141 		eprintsoline(so, error);
   2142 		goto disconnect_vp;
   2143 	}
   2144 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
   2145 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
   2146 	if (error) {
   2147 		eprintsoline(so, error);
   2148 		goto disconnect_vp;
   2149 	}
   2150 	/*
   2151 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
   2152 	 * that to set the local address. If this is not present
   2153 	 * then we zero out the address and don't set the
   2154 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
   2155 	 * the pathname from the listening socket.
   2156 	 */
   2157 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
   2158 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
   2159 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
   2160 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
   2161 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
   2162 		nsti->sti_laddr_len = sinlen;
   2163 		nsti->sti_laddr_valid = 1;
   2164 	} else if (nso->so_family == AF_UNIX) {
   2165 		ASSERT(so->so_family == AF_UNIX);
   2166 		nsti->sti_laddr_len = sti->sti_laddr_len;
   2167 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
   2168 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
   2169 		    nsti->sti_laddr_len);
   2170 		nsti->sti_laddr_valid = 1;
   2171 	} else {
   2172 		nsti->sti_laddr_len = sti->sti_laddr_len;
   2173 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
   2174 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
   2175 		nsti->sti_laddr_sa->sa_family = nso->so_family;
   2176 	}
   2177 	freemsg(ack_mp);
   2178 
   2179 	so_unlock_single(so, SOLOCKED);
   2180 	mutex_exit(&so->so_lock);
   2181 
   2182 	nso->so_state |= SS_ISCONNECTED;
   2183 
   2184 	/*
   2185 	 * Pass out new socket.
   2186 	 */
   2187 	if (nsop != NULL)
   2188 		*nsop = nso;
   2189 
   2190 	return (0);
   2191 
   2192 
   2193 eproto_disc_unl:
   2194 	error = EPROTO;
   2195 e_disc_unl:
   2196 	eprintsoline(so, error);
   2197 	goto disconnect_unlocked;
   2198 
   2199 pr_disc_vp_unl:
   2200 	eprintsoline(so, error);
   2201 disconnect_vp_unlocked:
   2202 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
   2203 	VN_RELE(nvp);
   2204 disconnect_unlocked:
   2205 	(void) sodisconnect(so, SEQ_number, 0);
   2206 	return (error);
   2207 
   2208 pr_disc_vp:
   2209 	eprintsoline(so, error);
   2210 disconnect_vp:
   2211 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
   2212 	so_unlock_single(so, SOLOCKED);
   2213 	mutex_exit(&so->so_lock);
   2214 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
   2215 	VN_RELE(nvp);
   2216 	return (error);
   2217 
   2218 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
   2219 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
   2220 	    ? EOPNOTSUPP : EINVAL;
   2221 e_bad:
   2222 	eprintsoline(so, error);
   2223 	return (error);
   2224 }
   2225 
   2226 /*
   2227  * connect a socket.
   2228  *
   2229  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
   2230  * unconnect (by specifying a null address).
         *
         * Parameters:
         *	name, namelen	Peer address. A name whose family is AF_UNSPEC
         *			is treated like a null address. A null address
         *			unconnects a connected (or connecting)
         *			connection-less socket; a null address that
         *			reaches the address checks yields EINVAL.
         *	fflag		Passed to sowaitconnected(); forced to 0 on the
         *			datagram path that sends down a T_CONN_REQ.
         *	flags		_SOCONNECT_NOXLATE and _SOCONNECT_XPG4_2 are
         *			read here; _SOCONNECT_DID_BIND is set internally
         *			when this call had to bind the socket.
         *	cr		Credentials handed to the allocation, bind and
         *			setsockopt helpers.
         *
         * Returns 0 on success or an errno value.
         *
         * Locking: so_lock is entered and SOLOCKED is set near the top. Both are
         * dropped and re-taken around the sotpi_setsockopt() calls, so_lock alone
         * is dropped around kstrputmsg(), and SOLOCKED is released before waiting
         * in sowaitconnected(). Everything is released at done: before returning.
   2231  */
   2232 int
   2233 sotpi_connect(struct sonode *so,
   2234 	const struct sockaddr *name,
   2235 	socklen_t namelen,
   2236 	int fflag,
   2237 	int flags,
   2238 	struct cred *cr)
   2239 {
   2240 	struct T_conn_req	conn_req;
   2241 	int			error = 0;
   2242 	mblk_t			*mp;
   2243 	void			*src;
   2244 	socklen_t		srclen;
   2245 	void			*addr;
   2246 	socklen_t		addrlen;
   2247 	boolean_t		need_unlock;
   2248 	sotpi_info_t		*sti = SOTOTPI(so);
   2249 
   2250 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
   2251 	    (void *)so, (void *)name, namelen, fflag, flags,
   2252 	    pr_state(so->so_state, so->so_mode)));
   2253 
   2254 	/*
   2255 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
   2256 	 * avoid sleeping for memory with SOLOCKED held.
   2257 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
   2258 	 * + sizeof (struct T_opthdr).
   2259 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
   2260 	 * exceed sti_faddr_maxlen).
   2261 	 */
   2262 	mp = soallocproto(sizeof (struct T_conn_req) +
   2263 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
   2264 	    cr);
   2265 	if (mp == NULL) {
   2266 		/*
   2267 		 * Connect can not fail with ENOBUFS. A signal was
   2268 		 * caught so return EINTR.
   2269 		 */
   2270 		error = EINTR;
   2271 		eprintsoline(so, error);
   2272 		return (error);
   2273 	}
   2274 
   2275 	mutex_enter(&so->so_lock);
   2276 	/*
   2277 	 * Make sure there is a preallocated T_unbind_req message
   2278 	 * before any binding. This message is allocated when the
   2279 	 * socket is created. Since another thread can consume
   2280 	 * so_unbind_mp by the time we return from so_lock_single(),
   2281 	 * we should check the availability of so_unbind_mp after
   2282 	 * we return from so_lock_single().
   2283 	 */
   2284 
   2285 	so_lock_single(so);	/* Set SOLOCKED */
        	/*
        	 * need_unlock records whether SOLOCKED is held when control
        	 * reaches done:, so that it is released exactly once there.
        	 */
   2286 	need_unlock = B_TRUE;
   2287 
   2288 	if (sti->sti_unbind_mp == NULL) {
   2289 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
   2290 		/* NOTE: holding so_lock while sleeping */
   2291 		sti->sti_unbind_mp =
   2292 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
   2293 		if (sti->sti_unbind_mp == NULL) {
   2294 			error = EINTR;
   2295 			goto done;
   2296 		}
   2297 	}
   2298 
   2299 	/*
   2300 	 * Can't have done a listen before connecting.
   2301 	 */
   2302 	if (so->so_state & SS_ACCEPTCONN) {
   2303 		error = EOPNOTSUPP;
   2304 		goto done;
   2305 	}
   2306 
   2307 	/*
   2308 	 * Must be bound with the transport
   2309 	 */
   2310 	if (!(so->so_state & SS_ISBOUND)) {
   2311 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
   2312 		    /*CONSTCOND*/
   2313 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
   2314 			/*
   2315 			 * Optimization for AF_INET{,6} transports
   2316 			 * that can handle a T_CONN_REQ without being bound.
   2317 			 */
   2318 			so_automatic_bind(so);
   2319 		} else {
   2320 			error = sotpi_bind(so, NULL, 0,
   2321 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
   2322 			if (error)
   2323 				goto done;
   2324 		}
   2325 		ASSERT(so->so_state & SS_ISBOUND);
        		/*
        		 * Remember that this call performed the bind so that a
        		 * fatal error can undo it with sotpi_unbind() at done:.
        		 */
   2326 		flags |= _SOCONNECT_DID_BIND;
   2327 	}
   2328 
   2329 	/*
   2330 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
   2331 	 * connect to a null address. This is the portable method to
   2332 	 * unconnect a socket.
        	 *
        	 * NOTE(review): name is dereferenced whenever namelen is at least
        	 * sizeof (sa_family_t); this assumes callers never pass a NULL
        	 * name together with a non-zero namelen -- confirm.
   2333 	 */
   2334 	if ((namelen >= sizeof (sa_family_t)) &&
   2335 	    (name->sa_family == AF_UNSPEC)) {
   2336 		name = NULL;
   2337 		namelen = 0;
   2338 	}
   2339 
   2340 	/*
   2341 	 * Check that we are not already connected.
   2342 	 * A connection-oriented socket cannot be reconnected.
   2343 	 * A connected connection-less socket can be
   2344 	 * - connected to a different address by a subsequent connect
   2345 	 * - "unconnected" by a connect to the NULL address
   2346 	 */
   2347 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
   2348 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
   2349 		if (so->so_mode & SM_CONNREQUIRED) {
   2350 			/* Connection-oriented socket */
   2351 			error = so->so_state & SS_ISCONNECTED ?
   2352 			    EISCONN : EALREADY;
   2353 			goto done;
   2354 		}
   2355 		/* Connection-less socket */
   2356 		if (name == NULL) {
   2357 			/*
   2358 			 * Remove the connected state and clear SO_DGRAM_ERRIND
   2359 			 * since it was set when the socket was connected.
   2360 			 * If this is UDP also send down a T_DISCON_REQ.
   2361 			 */
   2362 			int val;
   2363 
   2364 			if ((so->so_family == AF_INET ||
   2365 			    so->so_family == AF_INET6) &&
   2366 			    (so->so_type == SOCK_DGRAM ||
   2367 			    so->so_type == SOCK_RAW) &&
   2368 			    /*CONSTCOND*/
   2369 			    !soconnect_tpi_udp) {
   2370 				/* XXX What about implicitly unbinding here? */
   2371 				error = sodisconnect(so, -1,
   2372 				    _SODISCONNECT_LOCK_HELD);
   2373 			} else {
   2374 				so->so_state &=
   2375 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
   2376 				sti->sti_faddr_valid = 0;
   2377 				sti->sti_faddr_len = 0;
   2378 			}
   2379 
   2380 			/* Remove SOLOCKED since setsockopt will grab it */
   2381 			so_unlock_single(so, SOLOCKED);
   2382 			mutex_exit(&so->so_lock);
   2383 
   2384 			val = 0;
   2385 			(void) sotpi_setsockopt(so, SOL_SOCKET,
   2386 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
   2387 			    cr);
   2388 
        			/*
        			 * Re-take the lock and SOLOCKED: done: expects
        			 * both to be held (need_unlock is still B_TRUE).
        			 */
   2389 			mutex_enter(&so->so_lock);
   2390 			so_lock_single(so);	/* Set SOLOCKED */
   2391 			goto done;
   2392 		}
   2393 	}
   2394 	ASSERT(so->so_state & SS_ISBOUND);
   2395 
   2396 	if (name == NULL || namelen == 0) {
   2397 		error = EINVAL;
   2398 		goto done;
   2399 	}
   2400 	/*
   2401 	 * Mark the socket if sti_faddr_sa represents the transport level
   2402 	 * address.
   2403 	 */
   2404 	if (flags & _SOCONNECT_NOXLATE) {
   2405 		struct sockaddr_ux	*soaddr_ux;
   2406 
   2407 		ASSERT(so->so_family == AF_UNIX);
   2408 		if (namelen != sizeof (struct sockaddr_ux)) {
   2409 			error = EINVAL;
   2410 			goto done;
   2411 		}
   2412 		soaddr_ux = (struct sockaddr_ux *)name;
   2413 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
   2414 		namelen = sizeof (soaddr_ux->sou_addr);
        		/*
        		 * NOTE(review): the flag is set before the address has
        		 * been verified and nothing in this function clears it
        		 * again on the error paths below -- confirm intended.
        		 */
   2415 		sti->sti_faddr_noxlate = 1;
   2416 	}
   2417 
   2418 	/*
   2419 	 * Length and family checks.
   2420 	 */
   2421 	error = so_addr_verify(so, name, namelen);
   2422 	if (error)
   2423 		goto bad;
   2424 
   2425 	/*
   2426 	 * Save foreign address. Needed for AF_UNIX as well as
   2427 	 * transport providers that do not support TI_GETPEERNAME.
   2428 	 * Also used for cached foreign address for TCP and UDP.
   2429 	 */
   2430 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
   2431 		error = EINVAL;
   2432 		goto done;
   2433 	}
   2434 	sti->sti_faddr_len = (socklen_t)namelen;
   2435 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
   2436 	bcopy(name, sti->sti_faddr_sa, namelen);
   2437 	sti->sti_faddr_valid = 1;
   2438 
        	/*
        	 * Select the destination address (addr/addrlen) placed in the
        	 * T_CONN_REQ and the optional source address (src/srclen) that is
        	 * sent as an SO_SRCADDR option; srclen == 0 means no option.
        	 */
   2439 	if (so->so_family == AF_UNIX) {
   2440 		if (sti->sti_faddr_noxlate) {
   2441 			/*
   2442 			 * Already have a transport internal address. Do not
   2443 			 * pass any (transport internal) source address.
   2444 			 */
   2445 			addr = sti->sti_faddr_sa;
   2446 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
   2447 			src = NULL;
   2448 			srclen = 0;
   2449 		} else {
   2450 			/*
   2451 			 * Pass the sockaddr_un source address as an option
   2452 			 * and translate the remote address.
   2453 			 * Holding so_lock thus sti_laddr_sa can not change.
   2454 			 */
   2455 			src = sti->sti_laddr_sa;
   2456 			srclen = (t_uscalar_t)sti->sti_laddr_len;
   2457 			dprintso(so, 1,
   2458 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
   2459 			    srclen, src));
   2460 			error = so_ux_addr_xlate(so,
   2461 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
   2462 			    (flags & _SOCONNECT_XPG4_2),
   2463 			    &addr, &addrlen);
   2464 			if (error)
   2465 				goto bad;
   2466 		}
   2467 	} else {
   2468 		addr = sti->sti_faddr_sa;
   2469 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
   2470 		src = NULL;
   2471 		srclen = 0;
   2472 	}
   2473 	/*
   2474 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
   2475 	 * option which asks the transport provider to send T_UDERR_IND
   2476 	 * messages. These T_UDERR_IND messages are used to return connected
   2477 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
   2478 	 *
   2479 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
   2480 	 * we send down a T_CONN_REQ. This is needed to let the
   2481 	 * transport assign a local address that is consistent with
   2482 	 * the remote address. Applications depend on a getsockname()
   2483 	 * after a connect() to retrieve the "source" IP address for
   2484 	 * the connected socket.  Invalidate the cached local address
   2485 	 * to force getsockname() to enquire of the transport.
   2486 	 */
   2487 	if (!(so->so_mode & SM_CONNREQUIRED)) {
   2488 		/*
   2489 		 * Datagram socket.
   2490 		 */
   2491 		int32_t val;
   2492 
        		/* Drop SOLOCKED and so_lock around the setsockopt call. */
   2493 		so_unlock_single(so, SOLOCKED);
   2494 		mutex_exit(&so->so_lock);
   2495 
   2496 		val = 1;
   2497 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
   2498 		    &val, (t_uscalar_t)sizeof (val), cr);
   2499 
   2500 		mutex_enter(&so->so_lock);
   2501 		so_lock_single(so);	/* Set SOLOCKED */
        		/*
        		 * Anything other than an AF_INET{,6} SOCK_DGRAM/SOCK_RAW
        		 * socket (or any socket when soconnect_tpi_udp is set) is
        		 * marked connected here without sending a T_CONN_REQ.
        		 */
   2502 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
   2503 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
   2504 		    soconnect_tpi_udp) {
   2505 			soisconnected(so);
   2506 			goto done;
   2507 		}
   2508 		/*
   2509 		 * Send down T_CONN_REQ etc.
   2510 		 * Clear fflag to avoid returning EWOULDBLOCK.
   2511 		 */
   2512 		fflag = 0;
   2513 		ASSERT(so->so_family != AF_UNIX);
   2514 		sti->sti_laddr_valid = 0;
   2515 	} else if (sti->sti_laddr_len != 0) {
   2516 		/*
   2517 		 * If the local address or port was "any" then it may be
   2518 		 * changed by the transport as a result of the
   2519 		 * connect.  Invalidate the cached version if we have one.
   2520 		 */
   2521 		switch (so->so_family) {
   2522 		case AF_INET:
   2523 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
   2524 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
   2525 			    INADDR_ANY ||
   2526 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
   2527 				sti->sti_laddr_valid = 0;
   2528 			break;
   2529 
   2530 		case AF_INET6:
   2531 			ASSERT(sti->sti_laddr_len ==
   2532 			    (socklen_t)sizeof (sin6_t));
   2533 			if (IN6_IS_ADDR_UNSPECIFIED(
   2534 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
   2535 			    IN6_IS_ADDR_V4MAPPED_ANY(
   2536 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
   2537 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
   2538 				sti->sti_laddr_valid = 0;
   2539 			break;
   2540 
   2541 		default:
   2542 			break;
   2543 		}
   2544 	}
   2545 
   2546 	/*
   2547 	 * Check for failure of an earlier call
   2548 	 */
   2549 	if (so->so_error != 0)
   2550 		goto so_bad;
   2551 
   2552 	/*
   2553 	 * Send down T_CONN_REQ. Message was allocated above.
   2554 	 */
   2555 	conn_req.PRIM_type = T_CONN_REQ;
   2556 	conn_req.DEST_length = addrlen;
   2557 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
   2558 	if (srclen == 0) {
   2559 		conn_req.OPT_length = 0;
   2560 		conn_req.OPT_offset = 0;
   2561 		soappendmsg(mp, &conn_req, sizeof (conn_req));
   2562 		soappendmsg(mp, addr, addrlen);
   2563 	} else {
   2564 		/*
   2565 		 * There is a AF_UNIX sockaddr_un to include as a source
   2566 		 * address option.
   2567 		 */
   2568 		struct T_opthdr toh;
   2569 
   2570 		toh.level = SOL_SOCKET;
   2571 		toh.name = SO_SRCADDR;
   2572 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
   2573 		toh.status = 0;
   2574 		conn_req.OPT_length =
   2575 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
   2576 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
   2577 		    _TPI_ALIGN_TOPT(addrlen));
   2578 
        		/*
        		 * Message layout: T_conn_req, destination address padded
        		 * to _TPI_ALIGN_TOPT, T_opthdr, source address padded to
        		 * _TPI_ALIGN_TOPT. The padding is added by advancing
        		 * b_wptr past the bytes just appended.
        		 */
   2579 		soappendmsg(mp, &conn_req, sizeof (conn_req));
   2580 		soappendmsg(mp, addr, addrlen);
   2581 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
   2582 		soappendmsg(mp, &toh, sizeof (toh));
   2583 		soappendmsg(mp, src, srclen);
   2584 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
   2585 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   2586 	}
   2587 	/*
   2588 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
   2589 	 * in order to have the right state when the T_CONN_CON shows up.
   2590 	 */
   2591 	soisconnecting(so);
   2592 	mutex_exit(&so->so_lock);
   2593 
   2594 	if (audit_active)
   2595 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
   2596 
   2597 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   2598 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
        	/*
        	 * Forget the message so that the freemsg() at done: does not touch
        	 * it again; presumably kstrputmsg() takes ownership of mp whether
        	 * or not it succeeds -- TODO confirm.
        	 */
   2599 	mp = NULL;
   2600 	mutex_enter(&so->so_lock);
   2601 	if (error != 0)
   2602 		goto bad;
   2603 
   2604 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
   2605 		goto bad;
   2606 
   2607 	/* Allow other threads to access the socket */
   2608 	so_unlock_single(so, SOLOCKED);
   2609 	need_unlock = B_FALSE;
   2610 
   2611 	/*
   2612 	 * Wait until we get a T_CONN_CON or an error
        	 *
        	 * On error SOLOCKED is taken again because the fatal-error case
        	 * at done: asserts need_unlock and may call sotpi_unbind().
   2613 	 */
   2614 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
   2615 		so_lock_single(so);	/* Set SOLOCKED */
   2616 		need_unlock = B_TRUE;
   2617 	}
   2618 
        	/*
        	 * Common exit. so_lock is held on every path that reaches here;
        	 * SOLOCKED is held iff need_unlock is B_TRUE. mp is non-NULL only if
        	 * the T_CONN_REQ was never handed to kstrputmsg().
   	 */
   2619 done:
   2620 	freemsg(mp);
   2621 	switch (error) {
   2622 	case EINPROGRESS:
   2623 	case EALREADY:
   2624 	case EISCONN:
   2625 	case EINTR:
   2626 		/* Non-fatal errors */
   2627 		sti->sti_laddr_valid = 0;
   2628 		/* FALLTHRU */
   2629 	case 0:
   2630 		break;
   2631 	default:
   2632 		ASSERT(need_unlock);
   2633 		/*
   2634 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
   2635 		 * and invalidate local-address cache
   2636 		 */
   2637 		so->so_state &= ~SS_ISCONNECTING;
   2638 		sti->sti_laddr_valid = 0;
   2639 		/* A discon_ind might have already unbound us */
   2640 		if ((flags & _SOCONNECT_DID_BIND) &&
   2641 		    (so->so_state & SS_ISBOUND)) {
   2642 			int err;
   2643 
        			/*
        			 * Undo the bind performed by this call. A failure
        			 * is only logged; the original error is returned.
        			 */
   2644 			err = sotpi_unbind(so, 0);
   2645 			/* LINTED - statement has no conseq */
   2646 			if (err) {
   2647 				eprintsoline(so, err);
   2648 			}
   2649 		}
   2650 		break;
   2651 	}
   2652 	if (need_unlock)
   2653 		so_unlock_single(so, SOLOCKED);
   2654 	mutex_exit(&so->so_lock);
   2655 	return (error);
   2656 
        	/*
        	 * Error entry points: so_bad fetches the pending socket error with
        	 * sogeterr(); bad logs the error. Both continue at done:.
        	 */
   2657 so_bad:	error = sogeterr(so, B_TRUE);
   2658 bad:	eprintsoline(so, error);
   2659 	goto done;
   2660 }
   2661 
   2662 /* ARGSUSED */
   2663 int
   2664 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
   2665 {
   2666 	struct T_ordrel_req	ordrel_req;
   2667 	mblk_t			*mp;
   2668 	uint_t			old_state, state_change;
   2669 	int			error = 0;
   2670 	sotpi_info_t		*sti = SOTOTPI(so);
   2671 
   2672 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
   2673 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
   2674 
   2675 	mutex_enter(&so->so_lock);
   2676 	so_lock_single(so);	/* Set SOLOCKED */
   2677 
   2678 	/*
   2679 	 * SunOS 4.X has no check for datagram sockets.
   2680 	 * 5.X checks that it is connected (ENOTCONN)
   2681 	 * X/Open requires that we check the connected state.
   2682 	 */
   2683 	if (!(so->so_state & SS_ISCONNECTED)) {
   2684 		if (!xnet_skip_checks) {
   2685 			error = ENOTCONN;
   2686 			if (xnet_check_print) {
   2687 				printf("sockfs: X/Open shutdown check "
   2688 				    "caused ENOTCONN\n");
   2689 			}
   2690 		}
   2691 		goto done;
   2692 	}
   2693 	/*
   2694 	 * Record the current state and then perform any state changes.
   2695 	 * Then use the difference between the old and new states to
   2696 	 * determine which messages need to be sent.
   2697 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
   2698 	 * duplicate calls to shutdown().
   2699 	 */
   2700 	old_state = so->so_state;
   2701 
   2702 	switch (how) {
   2703 	case 0:
   2704 		socantrcvmore(so);
   2705 		break;
   2706 	case 1:
   2707 		socantsendmore(so);
   2708 		break;
   2709 	case 2:
   2710 		socantsendmore(so);
   2711 		socantrcvmore(so);
   2712 		break;
   2713 	default:
   2714 		error = EINVAL;
   2715 		goto done;
   2716 	}
   2717 
   2718 	/*
   2719 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
   2720 	 */
   2721 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
   2722 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
   2723 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
   2724 
   2725 	switch (state_change) {
   2726 	case 0:
   2727 		dprintso(so, 1,
   2728 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
   2729 		    so->so_state));
   2730 		goto done;
   2731 
   2732 	case SS_CANTRCVMORE:
   2733 		mutex_exit(&so->so_lock);
   2734 		strseteof(SOTOV(so), 1);
   2735 		/*
   2736 		 * strseteof takes care of read side wakeups,
   2737 		 * pollwakeups, and signals.
   2738 		 */
   2739 		/*
   2740 		 * Get the read lock before flushing data to avoid problems
   2741 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
   2742 		 */
   2743 		mutex_enter(&so->so_lock);
   2744 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
   2745 		mutex_exit(&so->so_lock);
   2746 
   2747 		/* Flush read side queue */
   2748 		strflushrq(SOTOV(so), FLUSHALL);
   2749 
   2750 		mutex_enter(&so->so_lock);
   2751 		so_unlock_read(so);		/* Clear SOREADLOCKED */
   2752 		break;
   2753 
   2754 	case SS_CANTSENDMORE:
   2755 		mutex_exit(&so->so_lock);
   2756 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
   2757 		mutex_enter(&so->so_lock);
   2758 		break;
   2759 
   2760 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
   2761 		mutex_exit(&so->so_lock);
   2762 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
   2763 		strseteof(SOTOV(so), 1);
   2764 		/*
   2765 		 * strseteof takes care of read side wakeups,
   2766 		 * pollwakeups, and signals.
   2767 		 */
   2768 		/*
   2769 		 * Get the read lock before flushing data to avoid problems
   2770 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
   2771 		 */
   2772 		mutex_enter(&so->so_lock);
   2773 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
   2774 		mutex_exit(&so->so_lock);
   2775 
   2776 		/* Flush read side queue */
   2777 		strflushrq(SOTOV(so), FLUSHALL);
   2778 
   2779 		mutex_enter(&so->so_lock);
   2780 		so_unlock_read(so);		/* Clear SOREADLOCKED */
   2781 		break;
   2782 	}
   2783 
   2784 	ASSERT(MUTEX_HELD(&so->so_lock));
   2785 
   2786 	/*
   2787 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
   2788 	 * was set due to this call and the new state has both of them set:
   2789 	 *	Send the AF_UNIX close indication
   2790 	 *	For T_COTS send a discon_ind
   2791 	 *
   2792 	 * If cantsend was set due to this call:
   2793 	 *	For T_COTSORD send an ordrel_ind
   2794 	 *
   2795 	 * Note that for T_CLTS there is no message sent here.
   2796 	 */
   2797 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
   2798 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
   2799 		/*
   2800 		 * For SunOS 4.X compatibility we tell the other end
   2801 		 * that we are unable to receive at this point.
   2802 		 */
   2803 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
   2804 			so_unix_close(so);
   2805 
   2806 		if (sti->sti_serv_type == T_COTS)
   2807 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
   2808 	}
   2809 	if ((state_change & SS_CANTSENDMORE) &&
   2810 	    (sti->sti_serv_type == T_COTS_ORD)) {
   2811 		/* Send an orderly release */
   2812 		ordrel_req.PRIM_type = T_ORDREL_REQ;
   2813 
   2814 		mutex_exit(&so->so_lock);
   2815 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
   2816 		    0, _ALLOC_SLEEP, cr);
   2817 		/*
   2818 		 * Send down the T_ORDREL_REQ even if there is flow control.
   2819 		 * This prevents shutdown from blocking.
   2820 		 * Note that there is no T_OK_ACK for ordrel_req.
   2821 		 */
   2822 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   2823 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
   2824 		mutex_enter(&so->so_lock);
   2825 		if (error) {
   2826 			eprintsoline(so, error);
   2827 			goto done;
   2828 		}
   2829 	}
   2830 
   2831 done:
   2832 	so_unlock_single(so, SOLOCKED);
   2833 	mutex_exit(&so->so_lock);
   2834 	return (error);
   2835 }
   2836 
   2837 /*
   2838  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
   2839  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
   2840  * that we have closed.
   2841  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
   2842  * T_UNITDATA_REQ containing the same option.
   2843  *
   2844  * For SOCK_DGRAM half-connections (somebody connected to this end
   2845  * but this end is not connect) we don't know where to send any
   2846  * SO_UNIX_CLOSE.
   2847  *
   2848  * We have to ignore stream head errors just in case there has been
   2849  * a shutdown(output).
   2850  * Ignore any flow control to try to get the message more quickly to the peer.
   2851  * While locally ignoring flow control solves the problem when there
   2852  * is only the loopback transport on the stream it would not provide
   2853  * the correct AF_UNIX socket semantics when one or more modules have
   2854  * been pushed.
   2855  */
   2856 void
   2857 so_unix_close(struct sonode *so)
   2858 {
   2859 	int		error;
   2860 	struct T_opthdr	toh;
   2861 	mblk_t		*mp;
   2862 	sotpi_info_t	*sti = SOTOTPI(so);
   2863 
   2864 	ASSERT(MUTEX_HELD(&so->so_lock));
   2865 
   2866 	ASSERT(so->so_family == AF_UNIX);
   2867 
   2868 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
   2869 	    (SS_ISCONNECTED|SS_ISBOUND))
   2870 		return;
   2871 
   2872 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
   2873 	    (void *)so, pr_state(so->so_state, so->so_mode)));
   2874 
   2875 	toh.level = SOL_SOCKET;
   2876 	toh.name = SO_UNIX_CLOSE;
   2877 
   2878 	/* zero length + header */
   2879 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
   2880 	toh.status = 0;
   2881 
   2882 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
   2883 		struct T_optdata_req tdr;
   2884 
   2885 		tdr.PRIM_type = T_OPTDATA_REQ;
   2886 		tdr.DATA_flag = 0;
   2887 
   2888 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
   2889 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
   2890 
   2891 		/* NOTE: holding so_lock while sleeping */
   2892 		mp = soallocproto2(&tdr, sizeof (tdr),
   2893 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
   2894 	} else {
   2895 		struct T_unitdata_req	tudr;
   2896 		void			*addr;
   2897 		socklen_t		addrlen;
   2898 		void			*src;
   2899 		socklen_t		srclen;
   2900 		struct T_opthdr		toh2;
   2901 		t_scalar_t		size;
   2902 
   2903 		/* Connecteded DGRAM socket */
   2904 
   2905 		/*
   2906 		 * For AF_UNIX the destination address is translated to
   2907 		 * an internal name and the source address is passed as
   2908 		 * an option.
   2909 		 */
   2910 		/*
   2911 		 * Length and family checks.
   2912 		 */
   2913 		error = so_addr_verify(so, sti->sti_faddr_sa,
   2914 		    (t_uscalar_t)sti->sti_faddr_len);
   2915 		if (error) {
   2916 			eprintsoline(so, error);
   2917 			return;
   2918 		}
   2919 		if (sti->sti_faddr_noxlate) {
   2920 			/*
   2921 			 * Already have a transport internal address. Do not
   2922 			 * pass any (transport internal) source address.
   2923 			 */
   2924 			addr = sti->sti_faddr_sa;
   2925 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
   2926 			src = NULL;
   2927 			srclen = 0;
   2928 		} else {
   2929 			/*
   2930 			 * Pass the sockaddr_un source address as an option
   2931 			 * and translate the remote address.
   2932 			 * Holding so_lock thus sti_laddr_sa can not change.
   2933 			 */
   2934 			src = sti->sti_laddr_sa;
   2935 			srclen = (socklen_t)sti->sti_laddr_len;
   2936 			dprintso(so, 1,
   2937 			    ("so_ux_close: srclen %d, src %p\n",
   2938 			    srclen, src));
   2939 			error = so_ux_addr_xlate(so,
   2940 			    sti->sti_faddr_sa,
   2941 			    (socklen_t)sti->sti_faddr_len, 0,
   2942 			    &addr, &addrlen);
   2943 			if (error) {
   2944 				eprintsoline(so, error);
   2945 				return;
   2946 			}
   2947 		}
   2948 		tudr.PRIM_type = T_UNITDATA_REQ;
   2949 		tudr.DEST_length = addrlen;
   2950 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
   2951 		if (srclen == 0) {
   2952 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
   2953 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
   2954 			    _TPI_ALIGN_TOPT(addrlen));
   2955 
   2956 			size = tudr.OPT_offset + tudr.OPT_length;
   2957 			/* NOTE: holding so_lock while sleeping */
   2958 			mp = soallocproto2(&tudr, sizeof (tudr),
   2959 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
   2960 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
   2961 			soappendmsg(mp, &toh, sizeof (toh));
   2962 		} else {
   2963 			/*
   2964 			 * There is a AF_UNIX sockaddr_un to include as a
   2965 			 * source address option.
   2966 			 */
   2967 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
   2968 			    _TPI_ALIGN_TOPT(srclen));
   2969 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
   2970 			    _TPI_ALIGN_TOPT(addrlen));
   2971 
   2972 			toh2.level = SOL_SOCKET;
   2973 			toh2.name = SO_SRCADDR;
   2974 			toh2.len = (t_uscalar_t)(srclen +
   2975 			    sizeof (struct T_opthdr));
   2976 			toh2.status = 0;
   2977 
   2978 			size = tudr.OPT_offset + tudr.OPT_length;
   2979 
   2980 			/* NOTE: holding so_lock while sleeping */
   2981 			mp = soallocproto2(&tudr, sizeof (tudr),
   2982 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
   2983 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
   2984 			soappendmsg(mp, &toh, sizeof (toh));
   2985 			soappendmsg(mp, &toh2, sizeof (toh2));
   2986 			soappendmsg(mp, src, srclen);
   2987 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
   2988 		}
   2989 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   2990 	}
   2991 	mutex_exit(&so->so_lock);
   2992 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   2993 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
   2994 	mutex_enter(&so->so_lock);
   2995 }
   2996 
   2997 /*
   2998  * Called by sotpi_recvmsg when reading a non-zero amount of data.
   2999  * In addition, the caller typically verifies that there is some
   3000  * potential state to clear by checking
   3001  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
   3002  * before calling this routine.
   3003  * Note that such a check can be made without holding so_lock since
   3004  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
   3005  * decrements sti_oobsigcnt.
   3006  *
   3007  * When data is read *after* the point that all pending
   3008  * oob data has been consumed the oob indication is cleared.
   3009  *
   3010  * This logic keeps select/poll returning POLLRDBAND and
   3011  * SIOCATMARK returning true until we have read past
   3012  * the mark.
   3013  */
   3014 static void
   3015 sorecv_update_oobstate(struct sonode *so)
   3016 {
   3017 	sotpi_info_t *sti = SOTOTPI(so);
   3018 
   3019 	mutex_enter(&so->so_lock);
   3020 	ASSERT(so_verify_oobstate(so));
   3021 	dprintso(so, 1,
   3022 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
   3023 	    sti->sti_oobsigcnt,
   3024 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
   3025 	if (sti->sti_oobsigcnt == 0) {
   3026 		/* No more pending oob indications */
   3027 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
   3028 		freemsg(so->so_oobmsg);
   3029 		so->so_oobmsg = NULL;
   3030 	}
   3031 	ASSERT(so_verify_oobstate(so));
   3032 	mutex_exit(&so->so_lock);
   3033 }
   3034 
   3035 /*
   3036  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
   3037  */
   3038 static int
   3039 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
   3040 {
   3041 	sotpi_info_t *sti = SOTOTPI(so);
   3042 	int	error = 0;
   3043 	mblk_t *tmp = NULL;
   3044 	mblk_t *pmp = NULL;
   3045 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
   3046 
   3047 	ASSERT(nmp != NULL);
   3048 
   3049 	while (nmp != NULL && uiop->uio_resid > 0) {
   3050 		ssize_t n;
   3051 
   3052 		if (DB_TYPE(nmp) == M_DATA) {
   3053 			/*
   3054 			 * We have some data, uiomove up to resid bytes.
   3055 			 */
   3056 			n = MIN(MBLKL(nmp), uiop->uio_resid);
   3057 			if (n > 0)
   3058 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
   3059 			nmp->b_rptr += n;
   3060 			if (nmp->b_rptr == nmp->b_wptr) {
   3061 				pmp = nmp;
   3062 				nmp = nmp->b_cont;
   3063 			}
   3064 			if (error)
   3065 				break;
   3066 		} else {
   3067 			/*
   3068 			 * We only handle data, save for caller to handle.
   3069 			 */
   3070 			if (pmp != NULL) {
   3071 				pmp->b_cont = nmp->b_cont;
   3072 			}
   3073 			nmp->b_cont = NULL;
   3074 			if (*rmp == NULL) {
   3075 				*rmp = nmp;
   3076 			} else {
   3077 				tmp->b_cont = nmp;
   3078 			}
   3079 			nmp = nmp->b_cont;
   3080 			tmp = nmp;
   3081 		}
   3082 	}
   3083 	if (pmp != NULL) {
   3084 		/* Free any mblk_t(s) which we have consumed */
   3085 		pmp->b_cont = NULL;
   3086 		freemsg(sti->sti_nl7c_rcv_mp);
   3087 	}
   3088 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
   3089 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
   3090 		if (error == 0) {
   3091 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
   3092 
   3093 			error = p->r_v.r_v2;
   3094 			p->r_v.r_v2 = 0;
   3095 		}
   3096 		rp->r_vals = sti->sti_nl7c_rcv_rval;
   3097 		sti->sti_nl7c_rcv_rval = 0;
   3098 	} else {
   3099 		/* More mblk_t(s) to process so no rval to return */
   3100 		rp->r_vals = 0;
   3101 	}
   3102 	return (error);
   3103 }
   3104 /*
   3105  * Receive the next message on the queue.
   3106  * If msg_controllen is non-zero when called the caller is interested in
   3107  * any received control info (options).
   3108  * If msg_namelen is non-zero when called the caller is interested in
   3109  * any received source address.
   3110  * The routine returns with msg_control and msg_name pointing to
   3111  * kmem_alloc'ed memory which the caller has to free.
   3112  */
   3113 /* ARGSUSED */
   3114 int
   3115 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
   3116     struct cred *cr)
   3117 {
   3118 	union T_primitives	*tpr;
   3119 	mblk_t			*mp;
   3120 	uchar_t			pri;
   3121 	int			pflag, opflag;
   3122 	void			*control;
   3123 	t_uscalar_t		controllen;
   3124 	t_uscalar_t		namelen;
   3125 	int			so_state = so->so_state; /* Snapshot */
   3126 	ssize_t			saved_resid;
   3127 	rval_t			rval;
   3128 	int			flags;
   3129 	clock_t			timout;
   3130 	int			error = 0;
   3131 	sotpi_info_t		*sti = SOTOTPI(so);
   3132 
   3133 	flags = msg->msg_flags;
   3134 	msg->msg_flags = 0;
   3135 
   3136 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
   3137 	    (void *)so, (void *)msg, flags,
   3138 	    pr_state(so->so_state, so->so_mode), so->so_error));
   3139 
   3140 	if (so->so_version == SOV_STREAM) {
   3141 		so_update_attrs(so, SOACC);
   3142 		/* The imaginary "sockmod" has been popped - act as a stream */
   3143 		return (strread(SOTOV(so), uiop, cr));
   3144 	}
   3145 
   3146 	/*
   3147 	 * If we are not connected because we have never been connected
   3148 	 * we return ENOTCONN. If we have been connected (but are no longer
   3149 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
   3150 	 * the EOF.
   3151 	 *
   3152 	 * An alternative would be to post an ENOTCONN error in stream head
   3153 	 * (read+write) and clear it when we're connected. However, that error
   3154 	 * would cause incorrect poll/select behavior!
   3155 	 */
   3156 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
   3157 	    (so->so_mode & SM_CONNREQUIRED)) {
   3158 		return (ENOTCONN);
   3159 	}
   3160 
   3161 	/*
   3162 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
   3163 	 * after checking that the read queue is empty) and returns zero.
   3164 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
   3165 	 * is zero.
   3166 	 */
   3167 
   3168 	if (flags & MSG_OOB) {
   3169 		/* Check that the transport supports OOB */
   3170 		if (!(so->so_mode & SM_EXDATA))
   3171 			return (EOPNOTSUPP);
   3172 		so_update_attrs(so, SOACC);
   3173 		return (sorecvoob(so, msg, uiop, flags,
   3174 		    (so->so_options & SO_OOBINLINE)));
   3175 	}
   3176 
   3177 	so_update_attrs(so, SOACC);
   3178 
   3179 	/*
   3180 	 * Set msg_controllen and msg_namelen to zero here to make it
   3181 	 * simpler in the cases that no control or name is returned.
   3182 	 */
   3183 	controllen = msg->msg_controllen;
   3184 	namelen = msg->msg_namelen;
   3185 	msg->msg_controllen = 0;
   3186 	msg->msg_namelen = 0;
   3187 
   3188 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
   3189 	    namelen, controllen));
   3190 
   3191 	mutex_enter(&so->so_lock);
   3192 	/*
   3193 	 * If an NL7C enabled socket and not waiting for write data.
   3194 	 */
   3195 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
   3196 	    NL7C_ENABLED) {
   3197 		if (sti->sti_nl7c_uri) {
   3198 			/* Close uri processing for a previous request */
   3199 			nl7c_close(so);
   3200 		}
   3201 		if ((so_state & SS_CANTRCVMORE) &&
   3202 		    sti->sti_nl7c_rcv_mp == NULL) {
   3203 			/* Nothing to process, EOF */
   3204 			mutex_exit(&so->so_lock);
   3205 			return (0);
   3206 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
   3207 			/* Persistent NL7C socket, try to process request */
   3208 			boolean_t ret;
   3209 
   3210 			ret = nl7c_process(so,
   3211 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
   3212 			rval.r_vals = sti->sti_nl7c_rcv_rval;
   3213 			error = rval.r_v.r_v2;
   3214 			if (error) {
   3215 				/* Error of some sort, return it */
   3216 				mutex_exit(&so->so_lock);
   3217 				return (error);
   3218 			}
   3219 			if (sti->sti_nl7c_flags &&
   3220 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
   3221 				/*
   3222 				 * Still an NL7C socket and no data
   3223 				 * to pass up to the caller.
   3224 				 */
   3225 				mutex_exit(&so->so_lock);
   3226 				if (ret) {
   3227 					/* EOF */
   3228 					return (0);
   3229 				} else {
   3230 					/* Need more data */
   3231 					return (EAGAIN);
   3232 				}
   3233 			}
   3234 		} else {
   3235 			/*
   3236 			 * Not persistent so no further NL7C processing.
   3237 			 */
   3238 			sti->sti_nl7c_flags = 0;
   3239 		}
   3240 	}
   3241 	/*
   3242 	 * Only one reader is allowed at any given time. This is needed
   3243 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
   3244 	 *
   3245 	 * This is slightly different that BSD behavior in that it fails with
   3246 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
   3247 	 * is single-threaded using sblock(), which is dropped while waiting
   3248 	 * for data to appear. The difference shows up e.g. if one
   3249 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
   3250 	 * does use nonblocking io and different threads are reading each
   3251 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
   3252 	 * in this case as long as the read queue doesn't get empty.
   3253 	 * In this implementation the thread using nonblocking io can
   3254 	 * get an EWOULDBLOCK error due to the blocking thread executing
   3255 	 * e.g. in the uiomove in kstrgetmsg.
   3256 	 * This difference is not believed to be significant.
   3257 	 */
   3258 	/* Set SOREADLOCKED */
   3259 	error = so_lock_read_intr(so,
   3260 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
   3261 	mutex_exit(&so->so_lock);
   3262 	if (error)
   3263 		return (error);
   3264 
   3265 	/*
   3266 	 * Tell kstrgetmsg to not inspect the stream head errors until all
   3267 	 * queued data has been consumed.
   3268 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
   3269 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
   3270 	 *
   3271 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
   3272 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
   3273 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
   3274 	 */
   3275 	pflag = MSG_ANY | MSG_DELAYERROR;
   3276 	if (flags & MSG_PEEK) {
   3277 		pflag |= MSG_IPEEK;
   3278 		flags &= ~MSG_WAITALL;
   3279 	}
   3280 	if (so->so_mode & SM_ATOMIC)
   3281 		pflag |= MSG_DISCARDTAIL;
   3282 
   3283 	if (flags & MSG_DONTWAIT)
   3284 		timout = 0;
   3285 	else
   3286 		timout = -1;
   3287 	opflag = pflag;
   3288 retry:
   3289 	saved_resid = uiop->uio_resid;
   3290 	pri = 0;
   3291 	mp = NULL;
   3292 	if (sti->sti_nl7c_rcv_mp != NULL) {
   3293 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
   3294 		error = nl7c_sorecv(so, &mp, uiop, &rval);
   3295 	} else {
   3296 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
   3297 		    timout, &rval);
   3298 	}
   3299 	if (error != 0) {
   3300 		/* kstrgetmsg returns ETIME when timeout expires */
   3301 		if (error == ETIME)
   3302 			error = EWOULDBLOCK;
   3303 		goto out;
   3304 	}
   3305 	/*
   3306 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
   3307 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
   3308 	 */
   3309 	ASSERT(!(rval.r_val1 & MORECTL));
   3310 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
   3311 		msg->msg_flags |= MSG_TRUNC;
   3312 
   3313 	if (mp == NULL) {
   3314 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
   3315 		/*
   3316 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
   3317 		 * The draft Posix socket spec states that the mark should
   3318 		 * not be cleared when peeking. We follow the latter.
   3319 		 */
   3320 		if ((so->so_state &
   3321 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
   3322 		    (uiop->uio_resid != saved_resid) &&
   3323 		    !(flags & MSG_PEEK)) {
   3324 			sorecv_update_oobstate(so);
   3325 		}
   3326 
   3327 		mutex_enter(&so->so_lock);
   3328 		/* Set MSG_EOR based on MOREDATA */
   3329 		if (!(rval.r_val1 & MOREDATA)) {
   3330 			if (so->so_state & SS_SAVEDEOR) {
   3331 				msg->msg_flags |= MSG_EOR;
   3332 				so->so_state &= ~SS_SAVEDEOR;
   3333 			}
   3334 		}
   3335 		/*
   3336 		 * If some data was received (i.e. not EOF) and the
   3337 		 * read/recv* has not been satisfied wait for some more.
   3338 		 */
   3339 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   3340 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   3341 			mutex_exit(&so->so_lock);
   3342 			pflag = opflag | MSG_NOMARK;
   3343 			goto retry;
   3344 		}
   3345 		goto out_locked;
   3346 	}
   3347 
   3348 	/* strsock_proto has already verified length and alignment */
   3349 	tpr = (union T_primitives *)mp->b_rptr;
   3350 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
   3351 
   3352 	switch (tpr->type) {
   3353 	case T_DATA_IND: {
   3354 		if ((so->so_state &
   3355 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
   3356 		    (uiop->uio_resid != saved_resid) &&
   3357 		    !(flags & MSG_PEEK)) {
   3358 			sorecv_update_oobstate(so);
   3359 		}
   3360 
   3361 		/*
   3362 		 * Set msg_flags to MSG_EOR based on
   3363 		 * MORE_flag and MOREDATA.
   3364 		 */
   3365 		mutex_enter(&so->so_lock);
   3366 		so->so_state &= ~SS_SAVEDEOR;
   3367 		if (!(tpr->data_ind.MORE_flag & 1)) {
   3368 			if (!(rval.r_val1 & MOREDATA))
   3369 				msg->msg_flags |= MSG_EOR;
   3370 			else
   3371 				so->so_state |= SS_SAVEDEOR;
   3372 		}
   3373 		freemsg(mp);
   3374 		/*
   3375 		 * If some data was received (i.e. not EOF) and the
   3376 		 * read/recv* has not been satisfied wait for some more.
   3377 		 */
   3378 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   3379 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   3380 			mutex_exit(&so->so_lock);
   3381 			pflag = opflag | MSG_NOMARK;
   3382 			goto retry;
   3383 		}
   3384 		goto out_locked;
   3385 	}
   3386 	case T_UNITDATA_IND: {
   3387 		void *addr;
   3388 		t_uscalar_t addrlen;
   3389 		void *abuf;
   3390 		t_uscalar_t optlen;
   3391 		void *opt;
   3392 
   3393 		if ((so->so_state &
   3394 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
   3395 		    (uiop->uio_resid != saved_resid) &&
   3396 		    !(flags & MSG_PEEK)) {
   3397 			sorecv_update_oobstate(so);
   3398 		}
   3399 
   3400 		if (namelen != 0) {
   3401 			/* Caller wants source address */
   3402 			addrlen = tpr->unitdata_ind.SRC_length;
   3403 			addr = sogetoff(mp,
   3404 			    tpr->unitdata_ind.SRC_offset,
   3405 			    addrlen, 1);
   3406 			if (addr == NULL) {
   3407 				freemsg(mp);
   3408 				error = EPROTO;
   3409 				eprintsoline(so, error);
   3410 				goto out;
   3411 			}
   3412 			if (so->so_family == AF_UNIX) {
   3413 				/*
   3414 				 * Can not use the transport level address.
   3415 				 * If there is a SO_SRCADDR option carrying
   3416 				 * the socket level address it will be
   3417 				 * extracted below.
   3418 				 */
   3419 				addr = NULL;
   3420 				addrlen = 0;
   3421 			}
   3422 		}
   3423 		optlen = tpr->unitdata_ind.OPT_length;
   3424 		if (optlen != 0) {
   3425 			t_uscalar_t ncontrollen;
   3426 
   3427 			/*
   3428 			 * Extract any source address option.
   3429 			 * Determine how large cmsg buffer is needed.
   3430 			 */
   3431 			opt = sogetoff(mp,
   3432 			    tpr->unitdata_ind.OPT_offset,
   3433 			    optlen, __TPI_ALIGN_SIZE);
   3434 
   3435 			if (opt == NULL) {
   3436 				freemsg(mp);
   3437 				error = EPROTO;
   3438 				eprintsoline(so, error);
   3439 				goto out;
   3440 			}
   3441 			if (so->so_family == AF_UNIX)
   3442 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
   3443 			ncontrollen = so_cmsglen(mp, opt, optlen,
   3444 			    !(flags & MSG_XPG4_2));
   3445 			if (controllen != 0)
   3446 				controllen = ncontrollen;
   3447 			else if (ncontrollen != 0)
   3448 				msg->msg_flags |= MSG_CTRUNC;
   3449 		} else {
   3450 			controllen = 0;
   3451 		}
   3452 
   3453 		if (namelen != 0) {
   3454 			/*
   3455 			 * Return address to caller.
   3456 			 * Caller handles truncation if length
   3457 			 * exceeds msg_namelen.
   3458 			 * NOTE: AF_UNIX NUL termination is ensured by
   3459 			 * the sender's copyin_name().
   3460 			 */
   3461 			abuf = kmem_alloc(addrlen, KM_SLEEP);
   3462 
   3463 			bcopy(addr, abuf, addrlen);
   3464 			msg->msg_name = abuf;
   3465 			msg->msg_namelen = addrlen;
   3466 		}
   3467 
   3468 		if (controllen != 0) {
   3469 			/*
   3470 			 * Return control msg to caller.
   3471 			 * Caller handles truncation if length
   3472 			 * exceeds msg_controllen.
   3473 			 */
   3474 			control = kmem_zalloc(controllen, KM_SLEEP);
   3475 
   3476 			error = so_opt2cmsg(mp, opt, optlen,
   3477 			    !(flags & MSG_XPG4_2),
   3478 			    control, controllen);
   3479 			if (error) {
   3480 				freemsg(mp);
   3481 				if (msg->msg_namelen != 0)
   3482 					kmem_free(msg->msg_name,
   3483 					    msg->msg_namelen);
   3484 				kmem_free(control, controllen);
   3485 				eprintsoline(so, error);
   3486 				goto out;
   3487 			}
   3488 			msg->msg_control = control;
   3489 			msg->msg_controllen = controllen;
   3490 		}
   3491 
   3492 		freemsg(mp);
   3493 		goto out;
   3494 	}
   3495 	case T_OPTDATA_IND: {
   3496 		struct T_optdata_req *tdr;
   3497 		void *opt;
   3498 		t_uscalar_t optlen;
   3499 
   3500 		if ((so->so_state &
   3501 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
   3502 		    (uiop->uio_resid != saved_resid) &&
   3503 		    !(flags & MSG_PEEK)) {
   3504 			sorecv_update_oobstate(so);
   3505 		}
   3506 
   3507 		tdr = (struct T_optdata_req *)mp->b_rptr;
   3508 		optlen = tdr->OPT_length;
   3509 		if (optlen != 0) {
   3510 			t_uscalar_t ncontrollen;
   3511 			/*
   3512 			 * Determine how large cmsg buffer is needed.
   3513 			 */
   3514 			opt = sogetoff(mp,
   3515 			    tpr->optdata_ind.OPT_offset,
   3516 			    optlen, __TPI_ALIGN_SIZE);
   3517 
   3518 			if (opt == NULL) {
   3519 				freemsg(mp);
   3520 				error = EPROTO;
   3521 				eprintsoline(so, error);
   3522 				goto out;
   3523 			}
   3524 
   3525 			ncontrollen = so_cmsglen(mp, opt, optlen,
   3526 			    !(flags & MSG_XPG4_2));
   3527 			if (controllen != 0)
   3528 				controllen = ncontrollen;
   3529 			else if (ncontrollen != 0)
   3530 				msg->msg_flags |= MSG_CTRUNC;
   3531 		} else {
   3532 			controllen = 0;
   3533 		}
   3534 
   3535 		if (controllen != 0) {
   3536 			/*
   3537 			 * Return control msg to caller.
   3538 			 * Caller handles truncation if length
   3539 			 * exceeds msg_controllen.
   3540 			 */
   3541 			control = kmem_zalloc(controllen, KM_SLEEP);
   3542 
   3543 			error = so_opt2cmsg(mp, opt, optlen,
   3544 			    !(flags & MSG_XPG4_2),
   3545 			    control, controllen);
   3546 			if (error) {
   3547 				freemsg(mp);
   3548 				kmem_free(control, controllen);
   3549 				eprintsoline(so, error);
   3550 				goto out;
   3551 			}
   3552 			msg->msg_control = control;
   3553 			msg->msg_controllen = controllen;
   3554 		}
   3555 
   3556 		/*
   3557 		 * Set msg_flags to MSG_EOR based on
   3558 		 * DATA_flag and MOREDATA.
   3559 		 */
   3560 		mutex_enter(&so->so_lock);
   3561 		so->so_state &= ~SS_SAVEDEOR;
   3562 		if (!(tpr->data_ind.MORE_flag & 1)) {
   3563 			if (!(rval.r_val1 & MOREDATA))
   3564 				msg->msg_flags |= MSG_EOR;
   3565 			else
   3566 				so->so_state |= SS_SAVEDEOR;
   3567 		}
   3568 		freemsg(mp);
   3569 		/*
   3570 		 * If some data was received (i.e. not EOF) and the
   3571 		 * read/recv* has not been satisfied wait for some more.
   3572 		 * Not possible to wait if control info was received.
   3573 		 */
   3574 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
   3575 		    controllen == 0 &&
   3576 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
   3577 			mutex_exit(&so->so_lock);
   3578 			pflag = opflag | MSG_NOMARK;
   3579 			goto retry;
   3580 		}
   3581 		goto out_locked;
   3582 	}
   3583 	case T_EXDATA_IND: {
   3584 		dprintso(so, 1,
   3585 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
   3586 		    "state %s\n",
   3587 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
   3588 		    saved_resid - uiop->uio_resid,
   3589 		    pr_state(so->so_state, so->so_mode)));
   3590 		/*
   3591 		 * kstrgetmsg handles MSGMARK so there is nothing to
   3592 		 * inspect in the T_EXDATA_IND.
   3593 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
   3594 		 * as a separate message with no M_DATA component. Furthermore,
   3595 		 * the stream head does not consolidate M_DATA messages onto
   3596 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
   3597 		 * remains a message by itself. This is needed since MSGMARK
   3598 		 * marks both the whole message as well as the last byte
   3599 		 * of the message.
   3600 		 */
   3601 		freemsg(mp);
   3602 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
   3603 		if (flags & MSG_PEEK) {
   3604 			/*
   3605 			 * Even though we are peeking we consume the
   3606 			 * T_EXDATA_IND thereby moving the mark information
   3607 			 * to SS_RCVATMARK. Then the oob code below will
   3608 			 * retry the peeking kstrgetmsg.
   3609 			 * Note that the stream head read queue is
   3610 			 * never flushed without holding SOREADLOCKED
   3611 			 * thus the T_EXDATA_IND can not disappear
   3612 			 * underneath us.
   3613 			 */
   3614 			dprintso(so, 1,
   3615 			    ("sotpi_recvmsg: consume EXDATA_IND "
   3616 			    "counts %d/%d state %s\n",
   3617 			    sti->sti_oobsigcnt,
   3618 			    sti->sti_oobcnt,
   3619 			    pr_state(so->so_state, so->so_mode)));
   3620 
   3621 			pflag = MSG_ANY | MSG_DELAYERROR;
   3622 			if (so->so_mode & SM_ATOMIC)
   3623 				pflag |= MSG_DISCARDTAIL;
   3624 
   3625 			pri = 0;
   3626 			mp = NULL;
   3627 
   3628 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
   3629 			    &pri, &pflag, (clock_t)-1, &rval);
   3630 			ASSERT(uiop->uio_resid == saved_resid);
   3631 
   3632 			if (error) {
   3633 #ifdef SOCK_DEBUG
   3634 				if (error != EWOULDBLOCK && error != EINTR) {
   3635 					eprintsoline(so, error);
   3636 				}
   3637 #endif /* SOCK_DEBUG */
   3638 				goto out;
   3639 			}
   3640 			ASSERT(mp);
   3641 			tpr = (union T_primitives *)mp->b_rptr;
   3642 			ASSERT(tpr->type == T_EXDATA_IND);
   3643 			freemsg(mp);
   3644 		} /* end "if (flags & MSG_PEEK)" */
   3645 
   3646 		/*
   3647 		 * Decrement the number of queued and pending oob.
   3648 		 *
   3649 		 * SS_RCVATMARK is cleared when we read past a mark.
   3650 		 * SS_HAVEOOBDATA is cleared when we've read past the
   3651 		 * last mark.
   3652 		 * SS_OOBPEND is cleared if we've read past the last
   3653 		 * mark and no (new) SIGURG has been posted.
   3654 		 */
   3655 		mutex_enter(&so->so_lock);
   3656 		ASSERT(so_verify_oobstate(so));
   3657 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
   3658 		ASSERT(sti->sti_oobsigcnt > 0);
   3659 		sti->sti_oobsigcnt--;
   3660 		ASSERT(sti->sti_oobcnt > 0);
   3661 		sti->sti_oobcnt--;
   3662 		/*
   3663 		 * Since the T_EXDATA_IND has been removed from the stream
   3664 		 * head, but we have not read data past the mark,
   3665 		 * sockfs needs to track that the socket is still at the mark.
   3666 		 *
   3667 		 * Since no data was received call kstrgetmsg again to wait
   3668 		 * for data.
   3669 		 */
   3670 		so->so_state |= SS_RCVATMARK;
   3671 		mutex_exit(&so->so_lock);
   3672 		dprintso(so, 1,
   3673 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
   3674 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
   3675 		    pr_state(so->so_state, so->so_mode)));
   3676 		pflag = opflag;
   3677 		goto retry;
   3678 	}
   3679 	default:
   3680 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
   3681 		    (void *)so, tpr->type, (void *)mp);
   3682 		ASSERT(0);
   3683 		freemsg(mp);
   3684 		error = EPROTO;
   3685 		eprintsoline(so, error);
   3686 		goto out;
   3687 	}
   3688 	/* NOTREACHED */
   3689 out:
   3690 	mutex_enter(&so->so_lock);
   3691 out_locked:
   3692 	so_unlock_read(so);	/* Clear SOREADLOCKED */
   3693 	mutex_exit(&so->so_lock);
   3694 	return (error);
   3695 }
   3696 
   3697 /*
   3698  * Sending data with options on a datagram socket.
   3699  * Assumes caller has verified that SS_ISBOUND etc. are set.
   3700  */
   3701 static int
   3702 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
   3703     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
   3704 {
   3705 	struct T_unitdata_req	tudr;
   3706 	mblk_t			*mp;
   3707 	int			error;
   3708 	void			*addr;
   3709 	socklen_t		addrlen;
   3710 	void			*src;
   3711 	socklen_t		srclen;
   3712 	ssize_t			len;
   3713 	int			size;
   3714 	struct T_opthdr		toh;
   3715 	struct fdbuf		*fdbuf;
   3716 	t_uscalar_t		optlen;
   3717 	void			*fds;
   3718 	int			fdlen;
   3719 	sotpi_info_t		*sti = SOTOTPI(so);
   3720 
   3721 	ASSERT(name && namelen);
   3722 	ASSERT(control && controllen);
   3723 
   3724 	len = uiop->uio_resid;
   3725 	if (len > (ssize_t)sti->sti_tidu_size) {
   3726 		return (EMSGSIZE);
   3727 	}
   3728 
   3729 	/*
   3730 	 * For AF_UNIX the destination address is translated to an internal
   3731 	 * name and the source address is passed as an option.
   3732 	 * Also, file descriptors are passed as file pointers in an
   3733 	 * option.
   3734 	 */
   3735 
   3736 	/*
   3737 	 * Length and family checks.
   3738 	 */
   3739 	error = so_addr_verify(so, name, namelen);
   3740 	if (error) {
   3741 		eprintsoline(so, error);
   3742 		return (error);
   3743 	}
   3744 	if (so->so_family == AF_UNIX) {
   3745 		if (sti->sti_faddr_noxlate) {
   3746 			/*
   3747 			 * Already have a transport internal address. Do not
   3748 			 * pass any (transport internal) source address.
   3749 			 */
   3750 			addr = name;
   3751 			addrlen = namelen;
   3752 			src = NULL;
   3753 			srclen = 0;
   3754 		} else {
   3755 			/*
   3756 			 * Pass the sockaddr_un source address as an option
   3757 			 * and translate the remote address.
   3758 			 *
   3759 			 * Note that this code does not prevent sti_laddr_sa
   3760 			 * from changing while it is being used. Thus
   3761 			 * if an unbind+bind occurs concurrently with this
   3762 			 * send the peer might see a partially new and a
   3763 			 * partially old "from" address.
   3764 			 */
   3765 			src = sti->sti_laddr_sa;
   3766 			srclen = (t_uscalar_t)sti->sti_laddr_len;
   3767 			dprintso(so, 1,
   3768 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
   3769 			    srclen, src));
   3770 			error = so_ux_addr_xlate(so, name, namelen,
   3771 			    (flags & MSG_XPG4_2),
   3772 			    &addr, &addrlen);
   3773 			if (error) {
   3774 				eprintsoline(so, error);
   3775 				return (error);
   3776 			}
   3777 		}
   3778 	} else {
   3779 		addr = name;
   3780 		addrlen = namelen;
   3781 		src = NULL;
   3782 		srclen = 0;
   3783 	}
   3784 	optlen = so_optlen(control, controllen,
   3785 	    !(flags & MSG_XPG4_2));
   3786 	tudr.PRIM_type = T_UNITDATA_REQ;
   3787 	tudr.DEST_length = addrlen;
   3788 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
   3789 	if (srclen != 0)
   3790 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
   3791 		    _TPI_ALIGN_TOPT(srclen));
   3792 	else
   3793 		tudr.OPT_length = optlen;
   3794 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
   3795 	    _TPI_ALIGN_TOPT(addrlen));
   3796 
   3797 	size = tudr.OPT_offset + tudr.OPT_length;
   3798 
   3799 	/*
   3800 	 * File descriptors only when SM_FDPASSING set.
   3801 	 */
   3802 	error = so_getfdopt(control, controllen,
   3803 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
   3804 	if (error)
   3805 		return (error);
   3806 	if (fdlen != -1) {
   3807 		if (!(so->so_mode & SM_FDPASSING))
   3808 			return (EOPNOTSUPP);
   3809 
   3810 		error = fdbuf_create(fds, fdlen, &fdbuf);
   3811 		if (error)
   3812 			return (error);
   3813 		mp = fdbuf_allocmsg(size, fdbuf);
   3814 	} else {
   3815 		mp = soallocproto(size, _ALLOC_INTR, CRED());
   3816 		if (mp == NULL) {
   3817 			/*
   3818 			 * Caught a signal waiting for memory.
   3819 			 * Let send* return EINTR.
   3820 			 */
   3821 			return (EINTR);
   3822 		}
   3823 	}
   3824 	soappendmsg(mp, &tudr, sizeof (tudr));
   3825 	soappendmsg(mp, addr, addrlen);
   3826 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
   3827 
   3828 	if (fdlen != -1) {
   3829 		ASSERT(fdbuf != NULL);
   3830 		toh.level = SOL_SOCKET;
   3831 		toh.name = SO_FILEP;
   3832 		toh.len = fdbuf->fd_size +
   3833 		    (t_uscalar_t)sizeof (struct T_opthdr);
   3834 		toh.status = 0;
   3835 		soappendmsg(mp, &toh, sizeof (toh));
   3836 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
   3837 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
   3838 	}
   3839 	if (srclen != 0) {
   3840 		/*
   3841 		 * There is a AF_UNIX sockaddr_un to include as a source
   3842 		 * address option.
   3843 		 */
   3844 		toh.level = SOL_SOCKET;
   3845 		toh.name = SO_SRCADDR;
   3846 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
   3847 		toh.status = 0;
   3848 		soappendmsg(mp, &toh, sizeof (toh));
   3849 		soappendmsg(mp, src, srclen);
   3850 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
   3851 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
   3852 	}
   3853 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   3854 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
   3855 	/* At most 3 bytes left in the message */
   3856 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
   3857 	ASSERT(MBLKL(mp) <= (ssize_t)size);
   3858 
   3859 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   3860 	if (audit_active)
   3861 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
   3862 
   3863 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
   3864 #ifdef SOCK_DEBUG
   3865 	if (error) {
   3866 		eprintsoline(so, error);
   3867 	}
   3868 #endif /* SOCK_DEBUG */
   3869 	return (error);
   3870 }
   3871 
   3872 /*
   3873  * Sending data with options on a connected stream socket.
   3874  * Assumes caller has verified that SS_ISCONNECTED is set.
   3875  */
   3876 static int
   3877 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
   3878     t_uscalar_t controllen, int flags)
   3879 {
   3880 	struct T_optdata_req	tdr;
   3881 	mblk_t			*mp;
   3882 	int			error;
   3883 	ssize_t			iosize;
   3884 	int			size;
   3885 	struct fdbuf		*fdbuf;
   3886 	t_uscalar_t		optlen;
   3887 	void			*fds;
   3888 	int			fdlen;
   3889 	struct T_opthdr		toh;
   3890 	sotpi_info_t		*sti = SOTOTPI(so);
   3891 
   3892 	dprintso(so, 1,
   3893 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
   3894 
   3895 	/*
   3896 	 * Has to be bound and connected. However, since no locks are
   3897 	 * held the state could have changed after sotpi_sendmsg checked it
   3898 	 * thus it is not possible to ASSERT on the state.
   3899 	 */
   3900 
   3901 	/* Options on connection-oriented only when SM_OPTDATA set. */
   3902 	if (!(so->so_mode & SM_OPTDATA))
   3903 		return (EOPNOTSUPP);
   3904 
   3905 	do {
   3906 		/*
   3907 		 * Set the MORE flag if uio_resid does not fit in this
   3908 		 * message or if the caller passed in "more".
   3909 		 * Error for transports with zero tidu_size.
   3910 		 */
   3911 		tdr.PRIM_type = T_OPTDATA_REQ;
   3912 		iosize = sti->sti_tidu_size;
   3913 		if (iosize <= 0)
   3914 			return (EMSGSIZE);
   3915 		if (uiop->uio_resid > iosize) {
   3916 			tdr.DATA_flag = 1;
   3917 		} else {
   3918 			if (more)
   3919 				tdr.DATA_flag = 1;
   3920 			else
   3921 				tdr.DATA_flag = 0;
   3922 			iosize = uiop->uio_resid;
   3923 		}
   3924 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
   3925 		    tdr.DATA_flag, iosize));
   3926 
   3927 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
   3928 		tdr.OPT_length = optlen;
   3929 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
   3930 
   3931 		size = (int)sizeof (tdr) + optlen;
   3932 		/*
   3933 		 * File descriptors only when SM_FDPASSING set.
   3934 		 */
   3935 		error = so_getfdopt(control, controllen,
   3936 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
   3937 		if (error)
   3938 			return (error);
   3939 		if (fdlen != -1) {
   3940 			if (!(so->so_mode & SM_FDPASSING))
   3941 				return (EOPNOTSUPP);
   3942 
   3943 			error = fdbuf_create(fds, fdlen, &fdbuf);
   3944 			if (error)
   3945 				return (error);
   3946 			mp = fdbuf_allocmsg(size, fdbuf);
   3947 		} else {
   3948 			mp = soallocproto(size, _ALLOC_INTR, CRED());
   3949 			if (mp == NULL) {
   3950 				/*
   3951 				 * Caught a signal waiting for memory.
   3952 				 * Let send* return EINTR.
   3953 				 */
   3954 				return (EINTR);
   3955 			}
   3956 		}
   3957 		soappendmsg(mp, &tdr, sizeof (tdr));
   3958 
   3959 		if (fdlen != -1) {
   3960 			ASSERT(fdbuf != NULL);
   3961 			toh.level = SOL_SOCKET;
   3962 			toh.name = SO_FILEP;
   3963 			toh.len = fdbuf->fd_size +
   3964 			    (t_uscalar_t)sizeof (struct T_opthdr);
   3965 			toh.status = 0;
   3966 			soappendmsg(mp, &toh, sizeof (toh));
   3967 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
   3968 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
   3969 		}
   3970 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
   3971 		/* At most 3 bytes left in the message */
   3972 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
   3973 		ASSERT(MBLKL(mp) <= (ssize_t)size);
   3974 
   3975 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   3976 
   3977 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
   3978 		    0, MSG_BAND, 0);
   3979 		if (error) {
   3980 			eprintsoline(so, error);
   3981 			return (error);
   3982 		}
   3983 		control = NULL;
   3984 		if (uiop->uio_resid > 0) {
   3985 			/*
   3986 			 * Recheck for fatal errors. Fail write even though
   3987 			 * some data have been written. This is consistent
   3988 			 * with strwrite semantics and BSD sockets semantics.
   3989 			 */
   3990 			if (so->so_state & SS_CANTSENDMORE) {
   3991 				eprintsoline(so, error);
   3992 				return (EPIPE);
   3993 			}
   3994 			if (so->so_error != 0) {
   3995 				mutex_enter(&so->so_lock);
   3996 				error = sogeterr(so, B_TRUE);
   3997 				mutex_exit(&so->so_lock);
   3998 				if (error != 0) {
   3999 					eprintsoline(so, error);
   4000 					return (error);
   4001 				}
   4002 			}
   4003 		}
   4004 	} while (uiop->uio_resid > 0);
   4005 	return (0);
   4006 }
   4007 
   4008 /*
   4009  * Sending data on a datagram socket.
   4010  * Assumes caller has verified that SS_ISBOUND etc. are set.
   4011  *
   4012  * For AF_UNIX the destination address is translated to an internal
   4013  * name and the source address is passed as an option.
   4014  */
   4015 int
   4016 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
   4017     struct uio *uiop, int flags)
   4018 {
   4019 	struct T_unitdata_req	tudr;
   4020 	mblk_t			*mp;
   4021 	int			error;
   4022 	void			*addr;
   4023 	socklen_t		addrlen;
   4024 	void			*src;
   4025 	socklen_t		srclen;
   4026 	ssize_t			len;
   4027 	sotpi_info_t		*sti = SOTOTPI(so);
   4028 
   4029 	ASSERT(name != NULL && namelen != 0);
   4030 
   4031 	len = uiop->uio_resid;
   4032 	if (len > sti->sti_tidu_size) {
   4033 		error = EMSGSIZE;
   4034 		goto done;
   4035 	}
   4036 
   4037 	/* Length and family checks */
   4038 	error = so_addr_verify(so, name, namelen);
   4039 	if (error != 0)
   4040 		goto done;
   4041 
   4042 	if (sti->sti_direct)
   4043 		return (sodgram_direct(so, name, namelen, uiop, flags));
   4044 
   4045 	if (so->so_family == AF_UNIX) {
   4046 		if (sti->sti_faddr_noxlate) {
   4047 			/*
   4048 			 * Already have a transport internal address. Do not
   4049 			 * pass any (transport internal) source address.
   4050 			 */
   4051 			addr = name;
   4052 			addrlen = namelen;
   4053 			src = NULL;
   4054 			srclen = 0;
   4055 		} else {
   4056 			/*
   4057 			 * Pass the sockaddr_un source address as an option
   4058 			 * and translate the remote address.
   4059 			 *
   4060 			 * Note that this code does not prevent sti_laddr_sa
   4061 			 * from changing while it is being used. Thus
   4062 			 * if an unbind+bind occurs concurrently with this
   4063 			 * send the peer might see a partially new and a
   4064 			 * partially old "from" address.
   4065 			 */
   4066 			src = sti->sti_laddr_sa;
   4067 			srclen = (socklen_t)sti->sti_laddr_len;
   4068 			dprintso(so, 1,
   4069 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
   4070 			    srclen, src));
   4071 			error = so_ux_addr_xlate(so, name, namelen,
   4072 			    (flags & MSG_XPG4_2),
   4073 			    &addr, &addrlen);
   4074 			if (error) {
   4075 				eprintsoline(so, error);
   4076 				goto done;
   4077 			}
   4078 		}
   4079 	} else {
   4080 		addr = name;
   4081 		addrlen = namelen;
   4082 		src = NULL;
   4083 		srclen = 0;
   4084 	}
   4085 	tudr.PRIM_type = T_UNITDATA_REQ;
   4086 	tudr.DEST_length = addrlen;
   4087 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
   4088 	if (srclen == 0) {
   4089 		tudr.OPT_length = 0;
   4090 		tudr.OPT_offset = 0;
   4091 
   4092 		mp = soallocproto2(&tudr, sizeof (tudr),
   4093 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
   4094 		if (mp == NULL) {
   4095 			/*
   4096 			 * Caught a signal waiting for memory.
   4097 			 * Let send* return EINTR.
   4098 			 */
   4099 			error = EINTR;
   4100 			goto done;
   4101 		}
   4102 	} else {
   4103 		/*
   4104 		 * There is a AF_UNIX sockaddr_un to include as a source
   4105 		 * address option.
   4106 		 */
   4107 		struct T_opthdr toh;
   4108 		ssize_t size;
   4109 
   4110 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
   4111 		    _TPI_ALIGN_TOPT(srclen));
   4112 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
   4113 		    _TPI_ALIGN_TOPT(addrlen));
   4114 
   4115 		toh.level = SOL_SOCKET;
   4116 		toh.name = SO_SRCADDR;
   4117 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
   4118 		toh.status = 0;
   4119 
   4120 		size = tudr.OPT_offset + tudr.OPT_length;
   4121 		mp = soallocproto2(&tudr, sizeof (tudr),
   4122 		    addr, addrlen, size, _ALLOC_INTR, CRED());
   4123 		if (mp == NULL) {
   4124 			/*
   4125 			 * Caught a signal waiting for memory.
   4126 			 * Let send* return EINTR.
   4127 			 */
   4128 			error = EINTR;
   4129 			goto done;
   4130 		}
   4131 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
   4132 		soappendmsg(mp, &toh, sizeof (toh));
   4133 		soappendmsg(mp, src, srclen);
   4134 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
   4135 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   4136 	}
   4137 
   4138 	if (audit_active)
   4139 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
   4140 
   4141 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
   4142 done:
   4143 #ifdef SOCK_DEBUG
   4144 	if (error) {
   4145 		eprintsoline(so, error);
   4146 	}
   4147 #endif /* SOCK_DEBUG */
   4148 	return (error);
   4149 }
   4150 
   4151 /*
   4152  * Sending data on a connected stream socket.
   4153  * Assumes caller has verified that SS_ISCONNECTED is set.
   4154  */
   4155 int
   4156 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
   4157     int sflag)
   4158 {
   4159 	struct T_data_req	tdr;
   4160 	mblk_t			*mp;
   4161 	int			error;
   4162 	ssize_t			iosize;
   4163 	sotpi_info_t		*sti = SOTOTPI(so);
   4164 
   4165 	dprintso(so, 1,
   4166 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
   4167 	    (void *)so, uiop->uio_resid, prim, sflag));
   4168 
   4169 	/*
   4170 	 * Has to be bound and connected. However, since no locks are
   4171 	 * held the state could have changed after sotpi_sendmsg checked it
   4172 	 * thus it is not possible to ASSERT on the state.
   4173 	 */
   4174 
   4175 	do {
   4176 		/*
   4177 		 * Set the MORE flag if uio_resid does not fit in this
   4178 		 * message or if the caller passed in "more".
   4179 		 * Error for transports with zero tidu_size.
   4180 		 */
   4181 		tdr.PRIM_type = prim;
   4182 		iosize = sti->sti_tidu_size;
   4183 		if (iosize <= 0)
   4184 			return (EMSGSIZE);
   4185 		if (uiop->uio_resid > iosize) {
   4186 			tdr.MORE_flag = 1;
   4187 		} else {
   4188 			if (more)
   4189 				tdr.MORE_flag = 1;
   4190 			else
   4191 				tdr.MORE_flag = 0;
   4192 			iosize = uiop->uio_resid;
   4193 		}
   4194 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
   4195 		    prim, tdr.MORE_flag, iosize));
   4196 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
   4197 		if (mp == NULL) {
   4198 			/*
   4199 			 * Caught a signal waiting for memory.
   4200 			 * Let send* return EINTR.
   4201 			 */
   4202 			return (EINTR);
   4203 		}
   4204 
   4205 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
   4206 		    0, sflag | MSG_BAND, 0);
   4207 		if (error) {
   4208 			eprintsoline(so, error);
   4209 			return (error);
   4210 		}
   4211 		if (uiop->uio_resid > 0) {
   4212 			/*
   4213 			 * Recheck for fatal errors. Fail write even though
   4214 			 * some data have been written. This is consistent
   4215 			 * with strwrite semantics and BSD sockets semantics.
   4216 			 */
   4217 			if (so->so_state & SS_CANTSENDMORE) {
   4218 				eprintsoline(so, error);
   4219 				return (EPIPE);
   4220 			}
   4221 			if (so->so_error != 0) {
   4222 				mutex_enter(&so->so_lock);
   4223 				error = sogeterr(so, B_TRUE);
   4224 				mutex_exit(&so->so_lock);
   4225 				if (error != 0) {
   4226 					eprintsoline(so, error);
   4227 					return (error);
   4228 				}
   4229 			}
   4230 		}
   4231 	} while (uiop->uio_resid > 0);
   4232 	return (0);
   4233 }
   4234 
   4235 /*
   4236  * Check the state for errors and call the appropriate send function.
   4237  *
   4238  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
   4239  * this function issues a setsockopt to toggle SO_DONTROUTE before and
   4240  * after sending the message.
   4241  */
   4242 static int
   4243 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
   4244     struct cred *cr)
   4245 {
   4246 	int		so_state;
   4247 	int		so_mode;
   4248 	int		error;
   4249 	struct sockaddr *name;
   4250 	t_uscalar_t	namelen;
   4251 	int		dontroute;
   4252 	int		flags;
   4253 	sotpi_info_t	*sti = SOTOTPI(so);
   4254 
   4255 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
   4256 	    (void *)so, (void *)msg, msg->msg_flags,
   4257 	    pr_state(so->so_state, so->so_mode), so->so_error));
   4258 
   4259 	if (so->so_version == SOV_STREAM) {
   4260 		/* The imaginary "sockmod" has been popped - act as a stream */
   4261 		so_update_attrs(so, SOMOD);
   4262 		return (strwrite(SOTOV(so), uiop, cr));
   4263 	}
   4264 
   4265 	mutex_enter(&so->so_lock);
   4266 	so_state = so->so_state;
   4267 
   4268 	if (so_state & SS_CANTSENDMORE) {
   4269 		mutex_exit(&so->so_lock);
   4270 		return (EPIPE);
   4271 	}
   4272 
   4273 	if (so->so_error != 0) {
   4274 		error = sogeterr(so, B_TRUE);
   4275 		if (error != 0) {
   4276 			mutex_exit(&so->so_lock);
   4277 			return (error);
   4278 		}
   4279 	}
   4280 
   4281 	name = (struct sockaddr *)msg->msg_name;
   4282 	namelen = msg->msg_namelen;
   4283 
   4284 	so_mode = so->so_mode;
   4285 
   4286 	if (name == NULL) {
   4287 		if (!(so_state & SS_ISCONNECTED)) {
   4288 			mutex_exit(&so->so_lock);
   4289 			if (so_mode & SM_CONNREQUIRED)
   4290 				return (ENOTCONN);
   4291 			else
   4292 				return (EDESTADDRREQ);
   4293 		}
   4294 		if (so_mode & SM_CONNREQUIRED) {
   4295 			name = NULL;
   4296 			namelen = 0;
   4297 		} else {
   4298 			/*
   4299 			 * Note that this code does not prevent sti_faddr_sa
   4300 			 * from changing while it is being used. Thus
   4301 			 * if an "unconnect"+connect occurs concurrently with
   4302 			 * this send the datagram might be delivered to a
   4303 			 * garbaled address.
   4304 			 */
   4305 			ASSERT(sti->sti_faddr_sa);
   4306 			name = sti->sti_faddr_sa;
   4307 			namelen = (t_uscalar_t)sti->sti_faddr_len;
   4308 		}
   4309 	} else {
   4310 		if (!(so_state & SS_ISCONNECTED) &&
   4311 		    (so_mode & SM_CONNREQUIRED)) {
   4312 			/* Required but not connected */
   4313 			mutex_exit(&so->so_lock);
   4314 			return (ENOTCONN);
   4315 		}
   4316 		/*
   4317 		 * Ignore the address on connection-oriented sockets.
   4318 		 * Just like BSD this code does not generate an error for
   4319 		 * TCP (a CONNREQUIRED socket) when sending to an address
   4320 		 * passed in with sendto/sendmsg. Instead the data is
   4321 		 * delivered on the connection as if no address had been
   4322 		 * supplied.
   4323 		 */
   4324 		if ((so_state & SS_ISCONNECTED) &&
   4325 		    !(so_mode & SM_CONNREQUIRED)) {
   4326 			mutex_exit(&so->so_lock);
   4327 			return (EISCONN);
   4328 		}
   4329 		if (!(so_state & SS_ISBOUND)) {
   4330 			so_lock_single(so);	/* Set SOLOCKED */
   4331 			error = sotpi_bind(so, NULL, 0,
   4332 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
   4333 			so_unlock_single(so, SOLOCKED);
   4334 			if (error) {
   4335 				mutex_exit(&so->so_lock);
   4336 				eprintsoline(so, error);
   4337 				return (error);
   4338 			}
   4339 		}
   4340 		/*
   4341 		 * Handle delayed datagram errors. These are only queued
   4342 		 * when the application sets SO_DGRAM_ERRIND.
   4343 		 * Return the error if we are sending to the address
   4344 		 * that was returned in the last T_UDERROR_IND.
   4345 		 * If sending to some other address discard the delayed
   4346 		 * error indication.
   4347 		 */
   4348 		if (sti->sti_delayed_error) {
   4349 			struct T_uderror_ind	*tudi;
   4350 			void			*addr;
   4351 			t_uscalar_t		addrlen;
   4352 			boolean_t		match = B_FALSE;
   4353 
   4354 			ASSERT(sti->sti_eaddr_mp);
   4355 			error = sti->sti_delayed_error;
   4356 			sti->sti_delayed_error = 0;
   4357 			tudi =
   4358 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
   4359 			addrlen = tudi->DEST_length;
   4360 			addr = sogetoff(sti->sti_eaddr_mp,
   4361 			    tudi->DEST_offset, addrlen, 1);
   4362 			ASSERT(addr);	/* Checked by strsock_proto */
   4363 			switch (so->so_family) {
   4364 			case AF_INET: {
   4365 				/* Compare just IP address and port */
   4366 				sin_t *sin1 = (sin_t *)name;
   4367 				sin_t *sin2 = (sin_t *)addr;
   4368 
   4369 				if (addrlen == sizeof (sin_t) &&
   4370 				    namelen == addrlen &&
   4371 				    sin1->sin_port == sin2->sin_port &&
   4372 				    sin1->sin_addr.s_addr ==
   4373 				    sin2->sin_addr.s_addr)
   4374 					match = B_TRUE;
   4375 				break;
   4376 			}
   4377 			case AF_INET6: {
   4378 				/* Compare just IP address and port. Not flow */
   4379 				sin6_t *sin1 = (sin6_t *)name;
   4380 				sin6_t *sin2 = (sin6_t *)addr;
   4381 
   4382 				if (addrlen == sizeof (sin6_t) &&
   4383 				    namelen == addrlen &&
   4384 				    sin1->sin6_port == sin2->sin6_port &&
   4385 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
   4386 				    &sin2->sin6_addr))
   4387 					match = B_TRUE;
   4388 				break;
   4389 			}
   4390 			case AF_UNIX:
   4391 			default:
   4392 				if (namelen == addrlen &&
   4393 				    bcmp(name, addr, namelen) == 0)
   4394 					match = B_TRUE;
   4395 			}
   4396 			if (match) {
   4397 				freemsg(sti->sti_eaddr_mp);
   4398 				sti->sti_eaddr_mp = NULL;
   4399 				mutex_exit(&so->so_lock);
   4400 #ifdef DEBUG
   4401 				dprintso(so, 0,
   4402 				    ("sockfs delayed error %d for %s\n",
   4403 				    error,
   4404 				    pr_addr(so->so_family, name, namelen)));
   4405 #endif /* DEBUG */
   4406 				return (error);
   4407 			}
   4408 			freemsg(sti->sti_eaddr_mp);
   4409 			sti->sti_eaddr_mp = NULL;
   4410 		}
   4411 	}
   4412 	mutex_exit(&so->so_lock);
   4413 
   4414 	flags = msg->msg_flags;
   4415 	dontroute = 0;
   4416 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
   4417 		uint32_t	val;
   4418 
   4419 		val = 1;
   4420 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
   4421 		    &val, (t_uscalar_t)sizeof (val), cr);
   4422 		if (error)
   4423 			return (error);
   4424 		dontroute = 1;
   4425 	}
   4426 
   4427 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
   4428 		error = EOPNOTSUPP;
   4429 		goto done;
   4430 	}
   4431 	if (msg->msg_controllen != 0) {
   4432 		if (!(so_mode & SM_CONNREQUIRED)) {
   4433 			so_update_attrs(so, SOMOD);
   4434 			error = sosend_dgramcmsg(so, name, namelen, uiop,
   4435 			    msg->msg_control, msg->msg_controllen, flags);
   4436 		} else {
   4437 			if (flags & MSG_OOB) {
   4438 				/* Can't generate T_EXDATA_REQ with options */
   4439 				error = EOPNOTSUPP;
   4440 				goto done;
   4441 			}
   4442 			so_update_attrs(so, SOMOD);
   4443 			error = sosend_svccmsg(so, uiop,
   4444 			    !(flags & MSG_EOR),
   4445 			    msg->msg_control, msg->msg_controllen,
   4446 			    flags);
   4447 		}
   4448 		goto done;
   4449 	}
   4450 
   4451 	so_update_attrs(so, SOMOD);
   4452 	if (!(so_mode & SM_CONNREQUIRED)) {
   4453 		/*
   4454 		 * If there is no SO_DONTROUTE to turn off return immediately
   4455 		 * from send_dgram. This can allow tail-call optimizations.
   4456 		 */
   4457 		if (!dontroute) {
   4458 			return (sosend_dgram(so, name, namelen, uiop, flags));
   4459 		}
   4460 		error = sosend_dgram(so, name, namelen, uiop, flags);
   4461 	} else {
   4462 		t_scalar_t prim;
   4463 		int sflag;
   4464 
   4465 		/* Ignore msg_name in the connected state */
   4466 		if (flags & MSG_OOB) {
   4467 			prim = T_EXDATA_REQ;
   4468 			/*
   4469 			 * Send down T_EXDATA_REQ even if there is flow
   4470 			 * control for data.
   4471 			 */
   4472 			sflag = MSG_IGNFLOW;
   4473 		} else {
   4474 			if (so_mode & SM_BYTESTREAM) {
   4475 				/* Byte stream transport - use write */
   4476 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
   4477 
   4478 				/* Send M_DATA messages */
   4479 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
   4480 				    (error = nl7c_data(so, uiop)) >= 0) {
   4481 					/* NL7C consumed the data */
   4482 					return (error);
   4483 				}
   4484 				/*
   4485 				 * If there is no SO_DONTROUTE to turn off,
   4486 				 * sti_direct is on, and there is no flow
   4487 				 * control, we can take the fast path.
   4488 				 */
   4489 				if (!dontroute && sti->sti_direct != 0 &&
   4490 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
   4491 					return (sostream_direct(so, uiop,
   4492 					    NULL, cr));
   4493 				}
   4494 				error = strwrite(SOTOV(so), uiop, cr);
   4495 				goto done;
   4496 			}
   4497 			prim = T_DATA_REQ;
   4498 			sflag = 0;
   4499 		}
   4500 		/*
   4501 		 * If there is no SO_DONTROUTE to turn off return immediately
   4502 		 * from sosend_svc. This can allow tail-call optimizations.
   4503 		 */
   4504 		if (!dontroute)
   4505 			return (sosend_svc(so, uiop, prim,
   4506 			    !(flags & MSG_EOR), sflag));
   4507 		error = sosend_svc(so, uiop, prim,
   4508 		    !(flags & MSG_EOR), sflag);
   4509 	}
   4510 	ASSERT(dontroute);
   4511 done:
   4512 	if (dontroute) {
   4513 		uint32_t	val;
   4514 
   4515 		val = 0;
   4516 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
   4517 		    &val, (t_uscalar_t)sizeof (val), cr);
   4518 	}
   4519 	return (error);
   4520 }
   4521 
   4522 /*
   4523  * kstrwritemp() has very similar semantics as that of strwrite().
   4524  * The main difference is it obtains mblks from the caller and also
   4525  * does not do any copy as done in strwrite() from user buffers to
   4526  * kernel buffers.
   4527  *
   4528  * Currently, this routine is used by sendfile to send data allocated
   4529  * within the kernel without any copying. This interface does not use the
   4530  * synchronous stream interface as synch. stream interface implies
   4531  * copying.
   4532  */
   4533 int
   4534 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
   4535 {
   4536 	struct stdata *stp;
   4537 	struct queue *wqp;
   4538 	mblk_t *newmp;
   4539 	char waitflag;
   4540 	int tempmode;
   4541 	int error = 0;
   4542 	int done = 0;
   4543 	struct sonode *so;
   4544 	boolean_t direct;
   4545 
   4546 	ASSERT(vp->v_stream);
   4547 	stp = vp->v_stream;
   4548 
   4549 	so = VTOSO(vp);
   4550 	direct = _SOTOTPI(so)->sti_direct;
   4551 
   4552 	/*
   4553 	 * This is the sockfs direct fast path. canputnext() need
   4554 	 * not be accurate so we don't grab the sd_lock here. If
   4555 	 * we get flow-controlled, we grab sd_lock just before the
   4556 	 * do..while loop below to emulate what strwrite() does.
   4557 	 */
   4558 	wqp = stp->sd_wrq;
   4559 	if (canputnext(wqp) && direct &&
   4560 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
   4561 		return (sostream_direct(so, NULL, mp, CRED()));
   4562 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
   4563 		/* Fast check of flags before acquiring the lock */
   4564 		mutex_enter(&stp->sd_lock);
   4565 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
   4566 		mutex_exit(&stp->sd_lock);
   4567 		if (error != 0) {
   4568 			if (!(stp->sd_flag & STPLEX) &&
   4569 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
   4570 				error = EPIPE;
   4571 			}
   4572 			return (error);
   4573 		}
   4574 	}
   4575 
   4576 	waitflag = WRITEWAIT;
   4577 	if (stp->sd_flag & OLDNDELAY)
   4578 		tempmode = fmode & ~FNDELAY;
   4579 	else
   4580 		tempmode = fmode;
   4581 
   4582 	mutex_enter(&stp->sd_lock);
   4583 	do {
   4584 		if (canputnext(wqp)) {
   4585 			mutex_exit(&stp->sd_lock);
   4586 			if (stp->sd_wputdatafunc != NULL) {
   4587 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
   4588 				    NULL, NULL, NULL);
   4589 				if (newmp == NULL) {
   4590 					/* The caller will free mp */
   4591 					return (ECOMM);
   4592 				}
   4593 				mp = newmp;
   4594 			}
   4595 			putnext(wqp, mp);
   4596 			return (0);
   4597 		}
   4598 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
   4599 		    &done);
   4600 	} while (error == 0 && !done);
   4601 
   4602 	mutex_exit(&stp->sd_lock);
   4603 	/*
   4604 	 * EAGAIN tells the application to try again. ENOMEM
   4605 	 * is returned only if the memory allocation size
   4606 	 * exceeds the physical limits of the system. ENOMEM
   4607 	 * can't be true here.
   4608 	 */
   4609 	if (error == ENOMEM)
   4610 		error = EAGAIN;
   4611 	return (error);
   4612 }
   4613 
   4614 /* ARGSUSED */
   4615 static int
   4616 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
   4617     struct cred *cr, mblk_t **mpp)
   4618 {
   4619 	int error;
   4620 
   4621 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
   4622 		return (EAFNOSUPPORT);
   4623 
   4624 	if (so->so_state & SS_CANTSENDMORE)
   4625 		return (EPIPE);
   4626 
   4627 	if (so->so_type != SOCK_STREAM)
   4628 		return (EOPNOTSUPP);
   4629 
   4630 	if ((so->so_state & SS_ISCONNECTED) == 0)
   4631 		return (ENOTCONN);
   4632 
   4633 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
   4634 	if (error == 0)
   4635 		*mpp = NULL;
   4636 	return (error);
   4637 }
   4638 
   4639 /*
   4640  * Sending data on a datagram socket.
   4641  * Assumes caller has verified that SS_ISBOUND etc. are set.
   4642  */
   4643 /* ARGSUSED */
   4644 static int
   4645 sodgram_direct(struct sonode *so, struct sockaddr *name,
   4646     socklen_t namelen, struct uio *uiop, int flags)
   4647 {
   4648 	struct T_unitdata_req	tudr;
   4649 	mblk_t			*mp = NULL;
   4650 	int			error = 0;
   4651 	void			*addr;
   4652 	socklen_t		addrlen;
   4653 	ssize_t			len;
   4654 	struct stdata		*stp = SOTOV(so)->v_stream;
   4655 	int			so_state;
   4656 	queue_t			*udp_wq;
   4657 	boolean_t		connected;
   4658 	mblk_t			*mpdata = NULL;
   4659 	sotpi_info_t		*sti = SOTOTPI(so);
   4660 
   4661 	ASSERT(name != NULL && namelen != 0);
   4662 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
   4663 	ASSERT(!(so->so_mode & SM_EXDATA));
   4664 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
   4665 	ASSERT(SOTOV(so)->v_type == VSOCK);
   4666 
   4667 	/* Caller checked for proper length */
   4668 	len = uiop->uio_resid;
   4669 	ASSERT(len <= sti->sti_tidu_size);
   4670 
   4671 	/* Length and family checks have been done by caller */
   4672 	ASSERT(name->sa_family == so->so_family);
   4673 	ASSERT(so->so_family == AF_INET ||
   4674 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
   4675 	ASSERT(so->so_family == AF_INET6 ||
   4676 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
   4677 
   4678 	addr = name;
   4679 	addrlen = namelen;
   4680 
   4681 	if (stp->sd_sidp != NULL &&
   4682 	    (error = straccess(stp, JCWRITE)) != 0)
   4683 		goto done;
   4684 
   4685 	so_state = so->so_state;
   4686 
   4687 	connected = so_state & SS_ISCONNECTED;
   4688 	if (!connected) {
   4689 		tudr.PRIM_type = T_UNITDATA_REQ;
   4690 		tudr.DEST_length = addrlen;
   4691 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
   4692 		tudr.OPT_length = 0;
   4693 		tudr.OPT_offset = 0;
   4694 
   4695 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
   4696 		    _ALLOC_INTR, CRED());
   4697 		if (mp == NULL) {
   4698 			/*
   4699 			 * Caught a signal waiting for memory.
   4700 			 * Let send* return EINTR.
   4701 			 */
   4702 			error = EINTR;
   4703 			goto done;
   4704 		}
   4705 	}
   4706 
   4707 	/*
   4708 	 * For UDP we don't break up the copyin into smaller pieces
   4709 	 * as in the TCP case.  That means if ENOMEM is returned by
   4710 	 * mcopyinuio() then the uio vector has not been modified at
   4711 	 * all and we fallback to either strwrite() or kstrputmsg()
   4712 	 * below.  Note also that we never generate priority messages
   4713 	 * from here.
   4714 	 */
   4715 	udp_wq = stp->sd_wrq->q_next;
   4716 	if (canput(udp_wq) &&
   4717 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
   4718 		ASSERT(DB_TYPE(mpdata) == M_DATA);
   4719 		ASSERT(uiop->uio_resid == 0);
   4720 		if (!connected)
   4721 			linkb(mp, mpdata);
   4722 		else
   4723 			mp = mpdata;
   4724 		if (audit_active)
   4725 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
   4726 
   4727 		udp_wput(udp_wq, mp);
   4728 		return (0);
   4729 	}
   4730 
   4731 	ASSERT(mpdata == NULL);
   4732 	if (error != 0 && error != ENOMEM) {
   4733 		freemsg(mp);
   4734 		return (error);
   4735 	}
   4736 
   4737 	/*
   4738 	 * For connected, let strwrite() handle the blocking case.
   4739 	 * Otherwise we fall thru and use kstrputmsg().
   4740 	 */
   4741 	if (connected)
   4742 		return (strwrite(SOTOV(so), uiop, CRED()));
   4743 
   4744 	if (audit_active)
   4745 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
   4746 
   4747 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
   4748 done:
   4749 #ifdef SOCK_DEBUG
   4750 	if (error != 0) {
   4751 		eprintsoline(so, error);
   4752 	}
   4753 #endif /* SOCK_DEBUG */
   4754 	return (error);
   4755 }
   4756 
   4757 int
   4758 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
   4759 {
   4760 	struct stdata *stp = SOTOV(so)->v_stream;
   4761 	ssize_t iosize, rmax, maxblk;
   4762 	queue_t *tcp_wq = stp->sd_wrq->q_next;
   4763 	mblk_t *newmp;
   4764 	int error = 0, wflag = 0;
   4765 
   4766 	ASSERT(so->so_mode & SM_BYTESTREAM);
   4767 	ASSERT(SOTOV(so)->v_type == VSOCK);
   4768 
   4769 	if (stp->sd_sidp != NULL &&
   4770 	    (error = straccess(stp, JCWRITE)) != 0)
   4771 		return (error);
   4772 
   4773 	if (uiop == NULL) {
   4774 		/*
   4775 		 * kstrwritemp() should have checked sd_flag and
   4776 		 * flow-control before coming here.  If we end up
   4777 		 * here it means that we can simply pass down the
   4778 		 * data to tcp.
   4779 		 */
   4780 		ASSERT(mp != NULL);
   4781 		if (stp->sd_wputdatafunc != NULL) {
   4782 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
   4783 			    NULL, NULL, NULL);
   4784 			if (newmp == NULL) {
   4785 				/* The caller will free mp */
   4786 				return (ECOMM);
   4787 			}
   4788 			mp = newmp;
   4789 		}
   4790 		tcp_wput(tcp_wq, mp);
   4791 		return (0);
   4792 	}
   4793 
   4794 	/* Fallback to strwrite() to do proper error handling */
   4795 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
   4796 		return (strwrite(SOTOV(so), uiop, cr));
   4797 
   4798 	rmax = stp->sd_qn_maxpsz;
   4799 	ASSERT(rmax >= 0 || rmax == INFPSZ);
   4800 	if (rmax == 0 || uiop->uio_resid <= 0)
   4801 		return (0);
   4802 
   4803 	if (rmax == INFPSZ)
   4804 		rmax = uiop->uio_resid;
   4805 
   4806 	maxblk = stp->sd_maxblk;
   4807 
   4808 	for (;;) {
   4809 		iosize = MIN(uiop->uio_resid, rmax);
   4810 
   4811 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
   4812 		if (mp == NULL) {
   4813 			/*
   4814 			 * Fallback to strwrite() for ENOMEM; if this
   4815 			 * is our first time in this routine and the uio
   4816 			 * vector has not been modified, we will end up
   4817 			 * calling strwrite() without any flag set.
   4818 			 */
   4819 			if (error == ENOMEM)
   4820 				goto slow_send;
   4821 			else
   4822 				return (error);
   4823 		}
   4824 		ASSERT(uiop->uio_resid >= 0);
   4825 		/*
   4826 		 * If mp is non-NULL and ENOMEM is set, it means that
   4827 		 * mcopyinuio() was able to break down some of the user
   4828 		 * data into one or more mblks.  Send the partial data
   4829 		 * to tcp and let the rest be handled in strwrite().
   4830 		 */
   4831 		ASSERT(error == 0 || error == ENOMEM);
   4832 		if (stp->sd_wputdatafunc != NULL) {
   4833 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
   4834 			    NULL, NULL, NULL);
   4835 			if (newmp == NULL) {
   4836 				/* The caller will free mp */
   4837 				return (ECOMM);
   4838 			}
   4839 			mp = newmp;
   4840 		}
   4841 		tcp_wput(tcp_wq, mp);
   4842 
   4843 		wflag |= NOINTR;
   4844 
   4845 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
   4846 			ASSERT(error == 0);
   4847 			break;
   4848 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
   4849 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
   4850 slow_send:
   4851 			/*
   4852 			 * We were able to send down partial data using
   4853 			 * the direct call interface, but are now relying
   4854 			 * on strwrite() to handle the non-fastpath cases.
   4855 			 * If the socket is blocking we will sleep in
   4856 			 * strwaitq() until write is permitted, otherwise,
   4857 			 * we will need to return the amount of bytes
   4858 			 * written so far back to the app.  This is the
   4859 			 * reason why we pass NOINTR flag to strwrite()
   4860 			 * for non-blocking socket, because we don't want
   4861 			 * to return EAGAIN when portion of the user data
   4862 			 * has actually been sent down.
   4863 			 */
   4864 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
   4865 		}
   4866 	}
   4867 	return (0);
   4868 }
   4869 
   4870 /*
   4871  * Update sti_faddr by asking the transport (unless AF_UNIX).
   4872  */
   4873 /* ARGSUSED */
   4874 int
   4875 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
   4876     boolean_t accept, struct cred *cr)
   4877 {
   4878 	struct strbuf	strbuf;
   4879 	int		error = 0, res;
   4880 	void		*addr;
   4881 	t_uscalar_t	addrlen;
   4882 	k_sigset_t	smask;
   4883 	sotpi_info_t	*sti = SOTOTPI(so);
   4884 
   4885 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
   4886 	    (void *)so, pr_state(so->so_state, so->so_mode)));
   4887 
   4888 	ASSERT(*namelen > 0);
   4889 	mutex_enter(&so->so_lock);
   4890 	so_lock_single(so);	/* Set SOLOCKED */
   4891 
   4892 	if (accept) {
   4893 		bcopy(sti->sti_faddr_sa, name,
   4894 		    MIN(*namelen, sti->sti_faddr_len));
   4895 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
   4896 		goto done;
   4897 	}
   4898 
   4899 	if (!(so->so_state & SS_ISCONNECTED)) {
   4900 		error = ENOTCONN;
   4901 		goto done;
   4902 	}
   4903 	/* Added this check for X/Open */
   4904 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
   4905 		error = EINVAL;
   4906 		if (xnet_check_print) {
   4907 			printf("sockfs: X/Open getpeername check => EINVAL\n");
   4908 		}
   4909 		goto done;
   4910 	}
   4911 
   4912 	if (sti->sti_faddr_valid) {
   4913 		bcopy(sti->sti_faddr_sa, name,
   4914 		    MIN(*namelen, sti->sti_faddr_len));
   4915 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
   4916 		goto done;
   4917 	}
   4918 
   4919 #ifdef DEBUG
   4920 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
   4921 	    pr_addr(so->so_family, sti->sti_faddr_sa,
   4922 	    (t_uscalar_t)sti->sti_faddr_len)));
   4923 #endif /* DEBUG */
   4924 
   4925 	if (so->so_family == AF_UNIX) {
   4926 		/* Transport has different name space - return local info */
   4927 		if (sti->sti_faddr_noxlate)
   4928 			*namelen = 0;
   4929 		error = 0;
   4930 		goto done;
   4931 	}
   4932 
   4933 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
   4934 
   4935 	ASSERT(sti->sti_faddr_sa);
   4936 	/* Allocate local buffer to use with ioctl */
   4937 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
   4938 	mutex_exit(&so->so_lock);
   4939 	addr = kmem_alloc(addrlen, KM_SLEEP);
   4940 
   4941 	/*
   4942 	 * Issue TI_GETPEERNAME with signals masked.
   4943 	 * Put the result in sti_faddr_sa so that getpeername works after
   4944 	 * a shutdown(output).
   4945 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
   4946 	 * back to the socket.
   4947 	 */
   4948 	strbuf.buf = addr;
   4949 	strbuf.maxlen = addrlen;
   4950 	strbuf.len = 0;
   4951 
   4952 	sigintr(&smask, 0);
   4953 	res = 0;
   4954 	ASSERT(cr);
   4955 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
   4956 	    0, K_TO_K, cr, &res);
   4957 	sigunintr(&smask);
   4958 
   4959 	mutex_enter(&so->so_lock);
   4960 	/*
   4961 	 * If there is an error record the error in so_error put don't fail
   4962 	 * the getpeername. Instead fallback on the recorded
   4963 	 * sti->sti_faddr_sa.
   4964 	 */
   4965 	if (error) {
   4966 		/*
   4967 		 * Various stream head errors can be returned to the ioctl.
   4968 		 * However, it is impossible to determine which ones of
   4969 		 * these are really socket level errors that were incorrectly
   4970 		 * consumed by the ioctl. Thus this code silently ignores the
   4971 		 * error - to code explicitly does not reinstate the error
   4972 		 * using soseterror().
   4973 		 * Experiments have shows that at least this set of
   4974 		 * errors are reported and should not be reinstated on the
   4975 		 * socket:
   4976 		 *	EINVAL	E.g. if an I_LINK was in effect when
   4977 		 *		getpeername was called.
   4978 		 *	EPIPE	The ioctl error semantics prefer the write
   4979 		 *		side error over the read side error.
   4980 		 *	ENOTCONN The transport just got disconnected but
   4981 		 *		sockfs had not yet seen the T_DISCON_IND
   4982 		 *		when issuing the ioctl.
   4983 		 */
   4984 		error = 0;
   4985 	} else if (res == 0 && strbuf.len > 0 &&
   4986 	    (so->so_state & SS_ISCONNECTED)) {
   4987 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
   4988 		sti->sti_faddr_len = (socklen_t)strbuf.len;
   4989 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
   4990 		sti->sti_faddr_valid = 1;
   4991 
   4992 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
   4993 		*namelen = sti->sti_faddr_len;
   4994 	}
   4995 	kmem_free(addr, addrlen);
   4996 #ifdef DEBUG
   4997 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
   4998 	    pr_addr(so->so_family, sti->sti_faddr_sa,
   4999 	    (t_uscalar_t)sti->sti_faddr_len)));
   5000 #endif /* DEBUG */
   5001 done:
   5002 	so_unlock_single(so, SOLOCKED);
   5003 	mutex_exit(&so->so_lock);
   5004 	return (error);
   5005 }
   5006 
   5007 /*
   5008  * Update sti_laddr by asking the transport (unless AF_UNIX).
   5009  */
   5010 int
   5011 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
   5012     struct cred *cr)
   5013 {
   5014 	struct strbuf	strbuf;
   5015 	int		error = 0, res;
   5016 	void		*addr;
   5017 	t_uscalar_t	addrlen;
   5018 	k_sigset_t	smask;
   5019 	sotpi_info_t	*sti = SOTOTPI(so);
   5020 
   5021 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
   5022 	    (void *)so, pr_state(so->so_state, so->so_mode)));
   5023 
   5024 	ASSERT(*namelen > 0);
   5025 	mutex_enter(&so->so_lock);
   5026 	so_lock_single(so);	/* Set SOLOCKED */
   5027 
   5028 #ifdef DEBUG
   5029 
   5030 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
   5031 	    pr_addr(so->so_family, sti->sti_laddr_sa,
   5032 	    (t_uscalar_t)sti->sti_laddr_len)));
   5033 #endif /* DEBUG */
   5034 	if (sti->sti_laddr_valid) {
   5035 		bcopy(sti->sti_laddr_sa, name,
   5036 		    MIN(*namelen, sti->sti_laddr_len));
   5037 		*namelen = sti->sti_laddr_len;
   5038 		goto done;
   5039 	}
   5040 
   5041 	if (so->so_family == AF_UNIX) {
   5042 		/* Transport has different name space - return local info */
   5043 		error = 0;
   5044 		*namelen = 0;
   5045 		goto done;
   5046 	}
   5047 	if (!(so->so_state & SS_ISBOUND)) {
   5048 		/* If not bound, then nothing to return. */
   5049 		error = 0;
   5050 		goto done;
   5051 	}
   5052 
   5053 	/* Allocate local buffer to use with ioctl */
   5054 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
   5055 	mutex_exit(&so->so_lock);
   5056 	addr = kmem_alloc(addrlen, KM_SLEEP);
   5057 
   5058 	/*
   5059 	 * Issue TI_GETMYNAME with signals masked.
   5060 	 * Put the result in sti_laddr_sa so that getsockname works after
   5061 	 * a shutdown(output).
   5062 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
   5063 	 * back to the socket.
   5064 	 */
   5065 	strbuf.buf = addr;
   5066 	strbuf.maxlen = addrlen;
   5067 	strbuf.len = 0;
   5068 
   5069 	sigintr(&smask, 0);
   5070 	res = 0;
   5071 	ASSERT(cr);
   5072 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
   5073 	    0, K_TO_K, cr, &res);
   5074 	sigunintr(&smask);
   5075 
   5076 	mutex_enter(&so->so_lock);
   5077 	/*
   5078 	 * If there is an error record the error in so_error put don't fail
   5079 	 * the getsockname. Instead fallback on the recorded
   5080 	 * sti->sti_laddr_sa.
   5081 	 */
   5082 	if (error) {
   5083 		/*
   5084 		 * Various stream head errors can be returned to the ioctl.
   5085 		 * However, it is impossible to determine which ones of
   5086 		 * these are really socket level errors that were incorrectly
   5087 		 * consumed by the ioctl. Thus this code silently ignores the
   5088 		 * error - to code explicitly does not reinstate the error
   5089 		 * using soseterror().
   5090 		 * Experiments have shows that at least this set of
   5091 		 * errors are reported and should not be reinstated on the
   5092 		 * socket:
   5093 		 *	EINVAL	E.g. if an I_LINK was in effect when
   5094 		 *		getsockname was called.
   5095 		 *	EPIPE	The ioctl error semantics prefer the write
   5096 		 *		side error over the read side error.
   5097 		 */
   5098 		error = 0;
   5099 	} else if (res == 0 && strbuf.len > 0 &&
   5100 	    (so->so_state & SS_ISBOUND)) {
   5101 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
   5102 		sti->sti_laddr_len = (socklen_t)strbuf.len;
   5103 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
   5104 		sti->sti_laddr_valid = 1;
   5105 
   5106 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
   5107 		*namelen = sti->sti_laddr_len;
   5108 	}
   5109 	kmem_free(addr, addrlen);
   5110 #ifdef DEBUG
   5111 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
   5112 	    pr_addr(so->so_family, sti->sti_laddr_sa,
   5113 	    (t_uscalar_t)sti->sti_laddr_len)));
   5114 #endif /* DEBUG */
   5115 done:
   5116 	so_unlock_single(so, SOLOCKED);
   5117 	mutex_exit(&so->so_lock);
   5118 	return (error);
   5119 }
   5120 
   5121 /*
   5122  * Get socket options. For SOL_SOCKET options some options are handled
   5123  * by the sockfs while others use the value recorded in the sonode as a
   5124  * fallback should the T_SVR4_OPTMGMT_REQ fail.
   5125  *
   5126  * On the return most *optlenp bytes are copied to optval.
   5127  */
   5128 /* ARGSUSED */
   5129 int
   5130 sotpi_getsockopt(struct sonode *so, int level, int option_name,
   5131 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
   5132 {
   5133 	struct T_optmgmt_req	optmgmt_req;
   5134 	struct T_optmgmt_ack	*optmgmt_ack;
   5135 	struct opthdr		oh;
   5136 	struct opthdr		*opt_res;
   5137 	mblk_t			*mp = NULL;
   5138 	int			error = 0;
   5139 	void			*option = NULL;	/* Set if fallback value */
   5140 	t_uscalar_t		maxlen = *optlenp;
   5141 	t_uscalar_t		len;
   5142 	uint32_t		value;
   5143 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
   5144 	struct timeval32	tmo_val32;
   5145 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
   5146 
   5147 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
   5148 	    (void *)so, level, option_name, optval, (void *)optlenp,
   5149 	    pr_state(so->so_state, so->so_mode)));
   5150 
   5151 	mutex_enter(&so->so_lock);
   5152 	so_lock_single(so);	/* Set SOLOCKED */
   5153 
   5154 	/*
   5155 	 * Check for SOL_SOCKET options.
   5156 	 * Certain SOL_SOCKET options are returned directly whereas
   5157 	 * others only provide a default (fallback) value should
   5158 	 * the T_SVR4_OPTMGMT_REQ fail.
   5159 	 */
   5160 	if (level == SOL_SOCKET) {
   5161 		/* Check parameters */
   5162 		switch (option_name) {
   5163 		case SO_TYPE:
   5164 		case SO_ERROR:
   5165 		case SO_DEBUG:
   5166 		case SO_ACCEPTCONN:
   5167 		case SO_REUSEADDR:
   5168 		case SO_KEEPALIVE:
   5169 		case SO_DONTROUTE:
   5170 		case SO_BROADCAST:
   5171 		case SO_USELOOPBACK:
   5172 		case SO_OOBINLINE:
   5173 		case SO_SNDBUF:
   5174 		case SO_RCVBUF:
   5175 #ifdef notyet
   5176 		case SO_SNDLOWAT:
   5177 		case SO_RCVLOWAT:
   5178 #endif /* notyet */
   5179 		case SO_DOMAIN:
   5180 		case SO_DGRAM_ERRIND:
   5181 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
   5182 				error = EINVAL;
   5183 				eprintsoline(so, error);
   5184 				goto done2;
   5185 			}
   5186 			break;
   5187 		case SO_RCVTIMEO:
   5188 		case SO_SNDTIMEO:
   5189 			if (get_udatamodel() == DATAMODEL_NONE ||
   5190 			    get_udatamodel() == DATAMODEL_NATIVE) {
   5191 				if (maxlen < sizeof (struct timeval)) {
   5192 					error = EINVAL;
   5193 					eprintsoline(so, error);
   5194 					goto done2;
   5195 				}
   5196 			} else {
   5197 				if (maxlen < sizeof (struct timeval32)) {
   5198 					error = EINVAL;
   5199 					eprintsoline(so, error);
   5200 					goto done2;
   5201 				}
   5202 
   5203 			}
   5204 			break;
   5205 		case SO_LINGER:
   5206 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
   5207 				error = EINVAL;
   5208 				eprintsoline(so, error);
   5209 				goto done2;
   5210 			}
   5211 			break;
   5212 		case SO_SND_BUFINFO:
   5213 			if (maxlen < (t_uscalar_t)
   5214 			    sizeof (struct so_snd_bufinfo)) {
   5215 				error = EINVAL;
   5216 				eprintsoline(so, error);
   5217 				goto done2;
   5218 			}
   5219 			break;
   5220 		}
   5221 
   5222 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
   5223 
   5224 		switch (option_name) {
   5225 		case SO_TYPE:
   5226 			value = so->so_type;
   5227 			option = &value;
   5228 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
   5229 
   5230 		case SO_ERROR:
   5231 			value = sogeterr(so, B_TRUE);
   5232 			option = &value;
   5233 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
   5234 
   5235 		case SO_ACCEPTCONN:
   5236 			if (so->so_state & SS_ACCEPTCONN)
   5237 				value = SO_ACCEPTCONN;
   5238 			else
   5239 				value = 0;
   5240 #ifdef DEBUG
   5241 			if (value) {
   5242 				dprintso(so, 1,
   5243 				    ("sotpi_getsockopt: 0x%x is set\n",
   5244 				    option_name));
   5245 			} else {
   5246 				dprintso(so, 1,
   5247 				    ("sotpi_getsockopt: 0x%x not set\n",
   5248 				    option_name));
   5249 			}
   5250 #endif /* DEBUG */
   5251 			option = &value;
   5252 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
   5253 
   5254 		case SO_DEBUG:
   5255 		case SO_REUSEADDR:
   5256 		case SO_KEEPALIVE:
   5257 		case SO_DONTROUTE:
   5258 		case SO_BROADCAST:
   5259 		case SO_USELOOPBACK:
   5260 		case SO_OOBINLINE:
   5261 		case SO_DGRAM_ERRIND:
   5262 			value = (so->so_options & option_name);
   5263 #ifdef DEBUG
   5264 			if (value) {
   5265 				dprintso(so, 1,
   5266 				    ("sotpi_getsockopt: 0x%x is set\n",
   5267 				    option_name));
   5268 			} else {
   5269 				dprintso(so, 1,
   5270 				    ("sotpi_getsockopt: 0x%x not set\n",
   5271 				    option_name));
   5272 			}
   5273 #endif /* DEBUG */
   5274 			option = &value;
   5275 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
   5276 
   5277 		/*
   5278 		 * The following options are only returned by sockfs when the
   5279 		 * T_SVR4_OPTMGMT_REQ fails.
   5280 		 */
   5281 		case SO_LINGER:
   5282 			option = &so->so_linger;
   5283 			len = (t_uscalar_t)sizeof (struct linger);
   5284 			break;
   5285 		case SO_SNDBUF: {
   5286 			ssize_t lvalue;
   5287 
   5288 			/*
   5289 			 * If the option has not been set then get a default
   5290 			 * value from the read queue. This value is
   5291 			 * returned if the transport fails
   5292 			 * the T_SVR4_OPTMGMT_REQ.
   5293 			 */
   5294 			lvalue = so->so_sndbuf;
   5295 			if (lvalue == 0) {
   5296 				mutex_exit(&so->so_lock);
   5297 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
   5298 				    QHIWAT, 0, &lvalue);
   5299 				mutex_enter(&so->so_lock);
   5300 				dprintso(so, 1,
   5301 				    ("got SO_SNDBUF %ld from q\n", lvalue));
   5302 			}
   5303 			value = (int)lvalue;
   5304 			option = &value;
   5305 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
   5306 			break;
   5307 		}
   5308 		case SO_RCVBUF: {
   5309 			ssize_t lvalue;
   5310 
   5311 			/*
   5312 			 * If the option has not been set then get a default
   5313 			 * value from the read queue. This value is
   5314 			 * returned if the transport fails
   5315 			 * the T_SVR4_OPTMGMT_REQ.
   5316 			 *
   5317 			 * XXX If SO_RCVBUF has been set and this is an
   5318 			 * XPG 4.2 application then do not ask the transport
   5319 			 * since the transport might adjust the value and not
   5320 			 * return exactly what was set by the application.
   5321 			 * For non-XPG 4.2 application we return the value
   5322 			 * that the transport is actually using.
   5323 			 */
   5324 			lvalue = so->so_rcvbuf;
   5325 			if (lvalue == 0) {
   5326 				mutex_exit(&so->so_lock);
   5327 				(void) strqget(RD(strvp2wq(SOTOV(so))),
   5328 				    QHIWAT, 0, &lvalue);
   5329 				mutex_enter(&so->so_lock);
   5330 				dprintso(so, 1,
   5331 				    ("got SO_RCVBUF %ld from q\n", lvalue));
   5332 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
   5333 				value = (int)lvalue;
   5334 				option = &value;
   5335 				goto copyout;	/* skip asking transport */
   5336 			}
   5337 			value = (int)lvalue;
   5338 			option = &value;
   5339 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
   5340 			break;
   5341 		}
   5342 		case SO_DOMAIN:
   5343 			value = so->so_family;
   5344 			option = &value;
   5345 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
   5346 
   5347 #ifdef notyet
   5348 		/*
   5349 		 * We do not implement the semantics of these options
   5350 		 * thus we shouldn't implement the options either.
   5351 		 */
   5352 		case SO_SNDLOWAT:
   5353 			value = so->so_sndlowat;
   5354 			option = &value;
   5355 			break;
   5356 		case SO_RCVLOWAT:
   5357 			value = so->so_rcvlowat;
   5358 			option = &value;
   5359 			break;
   5360 #endif /* notyet */
   5361 		case SO_SNDTIMEO:
   5362 		case SO_RCVTIMEO: {
   5363 			clock_t val;
   5364 
   5365 			if (option_name == SO_RCVTIMEO)
   5366 				val = drv_hztousec(so->so_rcvtimeo);
   5367 			else
   5368 				val = drv_hztousec(so->so_sndtimeo);
   5369 			tmo_val.tv_sec = val / (1000 * 1000);
   5370 			tmo_val.tv_usec = val % (1000 * 1000);
   5371 			if (get_udatamodel() == DATAMODEL_NONE ||
   5372 			    get_udatamodel() == DATAMODEL_NATIVE) {
   5373 				option = &tmo_val;
   5374 				len = sizeof (struct timeval);
   5375 			} else {
   5376 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
   5377 				option = &tmo_val32;
   5378 				len = sizeof (struct timeval32);
   5379 			}
   5380 			break;
   5381 		}
   5382 		case SO_SND_BUFINFO: {
   5383 			snd_bufinfo.sbi_wroff =
   5384 			    (so->so_proto_props).sopp_wroff;
   5385 			snd_bufinfo.sbi_maxblk =
   5386 			    (so->so_proto_props).sopp_maxblk;
   5387 			snd_bufinfo.sbi_maxpsz =
   5388 			    (so->so_proto_props).sopp_maxpsz;
   5389 			snd_bufinfo.sbi_tail =
   5390 			    (so->so_proto_props).sopp_tail;
   5391 			option = &snd_bufinfo;
   5392 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
   5393 			break;
   5394 		}
   5395 		}
   5396 	}
   5397 
   5398 	mutex_exit(&so->so_lock);
   5399 
   5400 	/* Send request */
   5401 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
   5402 	optmgmt_req.MGMT_flags = T_CHECK;
   5403 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
   5404 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
   5405 
   5406 	oh.level = level;
   5407 	oh.name = option_name;
   5408 	oh.len = maxlen;
   5409 
   5410 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
   5411 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
   5412 	/* Let option management work in the presence of data flow control */
   5413 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   5414 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
   5415 	mp = NULL;
   5416 	mutex_enter(&so->so_lock);
   5417 	if (error) {
   5418 		eprintsoline(so, error);
   5419 		goto done2;
   5420 	}
   5421 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
   5422 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
   5423 	if (error) {
   5424 		if (option != NULL) {
   5425 			/* We have a fallback value */
   5426 			error = 0;
   5427 			goto copyout;
   5428 		}
   5429 		eprintsoline(so, error);
   5430 		goto done2;
   5431 	}
   5432 	ASSERT(mp);
   5433 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
   5434 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
   5435 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
   5436 	if (opt_res == NULL) {
   5437 		if (option != NULL) {
   5438 			/* We have a fallback value */
   5439 			error = 0;
   5440 			goto copyout;
   5441 		}
   5442 		error = EPROTO;
   5443 		eprintsoline(so, error);
   5444 		goto done;
   5445 	}
   5446 	option = &opt_res[1];
   5447 
   5448 	/* check to ensure that the option is within bounds */
   5449 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
   5450 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
   5451 		if (option != NULL) {
   5452 			/* We have a fallback value */
   5453 			error = 0;
   5454 			goto copyout;
   5455 		}
   5456 		error = EPROTO;
   5457 		eprintsoline(so, error);
   5458 		goto done;
   5459 	}
   5460 
   5461 	len = opt_res->len;
   5462 
   5463 copyout: {
   5464 		t_uscalar_t size = MIN(len, maxlen);
   5465 		bcopy(option, optval, size);
   5466 		bcopy(&size, optlenp, sizeof (size));
   5467 	}
   5468 done:
   5469 	freemsg(mp);
   5470 done2:
   5471 	so_unlock_single(so, SOLOCKED);
   5472 	mutex_exit(&so->so_lock);
   5473 
   5474 	return (error);
   5475 }
   5476 
   5477 /*
   5478  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
   5479  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
   5480  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
   5481  * setsockopt has to work even if the transport does not support the option.
   5482  */
   5483 /* ARGSUSED */
   5484 int
   5485 sotpi_setsockopt(struct sonode *so, int level, int option_name,
   5486 	const void *optval, t_uscalar_t optlen, struct cred *cr)
   5487 {
   5488 	struct T_optmgmt_req	optmgmt_req;
   5489 	struct opthdr		oh;
   5490 	mblk_t			*mp;
   5491 	int			error = 0;
   5492 	boolean_t		handled = B_FALSE;
   5493 
   5494 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
   5495 	    (void *)so, level, option_name, optval, optlen,
   5496 	    pr_state(so->so_state, so->so_mode)));
   5497 
   5498 	/* X/Open requires this check */
   5499 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
   5500 		if (xnet_check_print)
   5501 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
   5502 		return (EINVAL);
   5503 	}
   5504 
   5505 	mutex_enter(&so->so_lock);
   5506 	so_lock_single(so);	/* Set SOLOCKED */
   5507 	mutex_exit(&so->so_lock);
   5508 
   5509 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
   5510 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
   5511 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
   5512 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
   5513 
   5514 	oh.level = level;
   5515 	oh.name = option_name;
   5516 	oh.len = optlen;
   5517 
   5518 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
   5519 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
   5520 	/* Let option management work in the presence of data flow control */
   5521 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
   5522 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
   5523 	mp = NULL;
   5524 	mutex_enter(&so->so_lock);
   5525 	if (error) {
   5526 		eprintsoline(so, error);
   5527 		goto done2;
   5528 	}
   5529 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
   5530 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
   5531 	if (error) {
   5532 		eprintsoline(so, error);
   5533 		goto done;
   5534 	}
   5535 	ASSERT(mp);
   5536 	/* No need to verify T_optmgmt_ack */
   5537 	freemsg(mp);
   5538 done:
   5539 	/*
   5540 	 * Check for SOL_SOCKET options and record their values.
   5541 	 * If we know about a SOL_SOCKET parameter and the transport
   5542 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
   5543 	 * EPROTO) we let the setsockopt succeed.
   5544 	 */
   5545 	if (level == SOL_SOCKET) {
   5546 		/* Check parameters */
   5547 		switch (option_name) {
   5548 		case SO_DEBUG:
   5549 		case SO_REUSEADDR:
   5550 		case SO_KEEPALIVE:
   5551 		case SO_DONTROUTE:
   5552 		case SO_BROADCAST:
   5553 		case SO_USELOOPBACK:
   5554 		case SO_OOBINLINE:
   5555 		case SO_SNDBUF:
   5556 		case SO_RCVBUF:
   5557 #ifdef notyet
   5558 		case SO_SNDLOWAT:
   5559 		case SO_RCVLOWAT:
   5560 #endif /* notyet */
   5561 		case SO_DGRAM_ERRIND:
   5562 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
   5563 				error = EINVAL;
   5564 				eprintsoline(so, error);
   5565 				goto done2;
   5566 			}
   5567 			ASSERT(optval);
   5568 			handled = B_TRUE;
   5569 			break;
   5570 		case SO_SNDTIMEO:
   5571 		case SO_RCVTIMEO:
   5572 			if (get_udatamodel() == DATAMODEL_NONE ||
   5573 			    get_udatamodel() == DATAMODEL_NATIVE) {
   5574 				if (optlen != sizeof (struct timeval)) {
   5575 					error = EINVAL;
   5576 					eprintsoline(so, error);
   5577 					goto done2;
   5578 				}
   5579 			} else {
   5580 				if (optlen != sizeof (struct timeval32)) {
   5581 					error = EINVAL;
   5582 					eprintsoline(so, error);
   5583 					goto done2;
   5584 				}
   5585 			}
   5586 			ASSERT(optval);
   5587 			handled = B_TRUE;
   5588 			break;
   5589 		case SO_LINGER:
   5590 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
   5591 				error = EINVAL;
   5592 				eprintsoline(so, error);
   5593 				goto done2;
   5594 			}
   5595 			ASSERT(optval);
   5596 			handled = B_TRUE;
   5597 			break;
   5598 		}
   5599 
   5600 #define	intvalue	(*(int32_t *)optval)
   5601 
   5602 		switch (option_name) {
   5603 		case SO_TYPE:
   5604 		case SO_ERROR:
   5605 		case SO_ACCEPTCONN:
   5606 			/* Can't be set */
   5607 			error = ENOPROTOOPT;
   5608 			goto done2;
   5609 		case SO_LINGER: {
   5610 			struct linger *l = (struct linger *)optval;
   5611 
   5612 			so->so_linger.l_linger = l->l_linger;
   5613 			if (l->l_onoff) {
   5614 				so->so_linger.l_onoff = SO_LINGER;
   5615 				so->so_options |= SO_LINGER;
   5616 			} else {
   5617 				so->so_linger.l_onoff = 0;
   5618 				so->so_options &= ~SO_LINGER;
   5619 			}
   5620 			break;
   5621 		}
   5622 
   5623 		case SO_DEBUG:
   5624 #ifdef SOCK_TEST
   5625 			if (intvalue & 2)
   5626 				sock_test_timelimit = 10 * hz;
   5627 			else
   5628 				sock_test_timelimit = 0;
   5629 
   5630 			if (intvalue & 4)
   5631 				do_useracc = 0;
   5632 			else
   5633 				do_useracc = 1;
   5634 #endif /* SOCK_TEST */
   5635 			/* FALLTHRU */
   5636 		case SO_REUSEADDR:
   5637 		case SO_KEEPALIVE:
   5638 		case SO_DONTROUTE:
   5639 		case SO_BROADCAST:
   5640 		case SO_USELOOPBACK:
   5641 		case SO_OOBINLINE:
   5642 		case SO_DGRAM_ERRIND:
   5643 			if (intvalue != 0) {
   5644 				dprintso(so, 1,
   5645 				    ("socket_setsockopt: setting 0x%x\n",
   5646 				    option_name));
   5647 				so->so_options |= option_name;
   5648 			} else {
   5649 				dprintso(so, 1,
   5650 				    ("socket_setsockopt: clearing 0x%x\n",
   5651 				    option_name));
   5652 				so->so_options &= ~option_name;
   5653 			}
   5654 			break;
   5655 		/*
   5656 		 * The following options are only returned by us when the
   5657 		 * transport layer fails.
   5658 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
   5659 		 * since the transport might adjust the value and not
   5660 		 * return exactly what was set by the application.
   5661 		 */
   5662 		case SO_SNDBUF:
   5663 			so->so_sndbuf = intvalue;
   5664 			break;
   5665 		case SO_RCVBUF:
   5666 			so->so_rcvbuf = intvalue;
   5667 			break;
   5668 		case SO_RCVPSH:
   5669 			so->so_rcv_timer_interval = intvalue;
   5670 			break;
   5671 #ifdef notyet
   5672 		/*
   5673 		 * We do not implement the semantics of these options
   5674 		 * thus we shouldn't implement the options either.
   5675 		 */
   5676 		case SO_SNDLOWAT:
   5677 			so->so_sndlowat = intvalue;
   5678 			break;
   5679 		case SO_RCVLOWAT:
   5680 			so->so_rcvlowat = intvalue;
   5681 			break;
   5682 #endif /* notyet */
   5683 		case SO_SNDTIMEO:
   5684 		case SO_RCVTIMEO: {
   5685 			struct timeval tl;
   5686 			clock_t val;
   5687 
   5688 			if (get_udatamodel() == DATAMODEL_NONE ||
   5689 			    get_udatamodel() == DATAMODEL_NATIVE)
   5690 				bcopy(&tl, (struct timeval *)optval,
   5691 				    sizeof (struct timeval));
   5692 			else
   5693 				TIMEVAL32_TO_TIMEVAL(&tl,
   5694 				    (struct timeval32 *)optval);
   5695 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
   5696 			if (option_name == SO_RCVTIMEO)
   5697 				so->so_rcvtimeo = drv_usectohz(val);
   5698 			else
   5699 				so->so_sndtimeo = drv_usectohz(val);
   5700 			break;
   5701 		}
   5702 		}
   5703 #undef	intvalue
   5704 
   5705 		if (error) {
   5706 			if ((error == ENOPROTOOPT || error == EPROTO ||
   5707 			    error == EINVAL) && handled) {
   5708 				dprintso(so, 1,
   5709 				    ("setsockopt: ignoring error %d for 0x%x\n",
   5710 				    error, option_name));
   5711 				error = 0;
   5712 			}
   5713 		}
   5714 	}
   5715 done2:
   5716 	so_unlock_single(so, SOLOCKED);
   5717 	mutex_exit(&so->so_lock);
   5718 	return (error);
   5719 }
   5720 
   5721 /*
   5722  * sotpi_close() is called when the last open reference goes away.
   5723  */
   5724 /* ARGSUSED */
   5725 int
   5726 sotpi_close(struct sonode *so, int flag, struct cred *cr)
   5727 {
   5728 	struct vnode *vp = SOTOV(so);
   5729 	dev_t dev;
   5730 	int error = 0;
   5731 	sotpi_info_t *sti = SOTOTPI(so);
   5732 
   5733 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
   5734 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
   5735 
   5736 	dev = sti->sti_dev;
   5737 
   5738 	ASSERT(STREAMSTAB(getmajor(dev)));
   5739 
   5740 	mutex_enter(&so->so_lock);
   5741 	so_lock_single(so);	/* Set SOLOCKED */
   5742 
   5743 	ASSERT(so_verify_oobstate(so));
   5744 
   5745 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
   5746 		sti->sti_nl7c_flags = 0;
   5747 		nl7c_close(so);
   5748 	}
   5749 
   5750 	if (vp->v_stream != NULL) {
   5751 		vnode_t *ux_vp;
   5752 
   5753 		if (so->so_family == AF_UNIX) {
   5754 			/* Could avoid this when CANTSENDMORE for !dgram */
   5755 			so_unix_close(so);
   5756 		}
   5757 
   5758 		mutex_exit(&so->so_lock);
   5759 		/*
   5760 		 * Disassemble the linkage from the AF_UNIX underlying file
   5761 		 * system vnode to this socket (by atomically clearing
   5762 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
   5763 		 * and frees the stream head.
   5764 		 */
   5765 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
   5766 			ASSERT(ux_vp->v_stream);
   5767 			sti->sti_ux_bound_vp = NULL;
   5768 			vn_rele_stream(ux_vp);
   5769 		}
   5770 		if (so->so_family == AF_INET || so->so_family == AF_INET6) {
   5771 			strsetrwputdatahooks(SOTOV(so), NULL, NULL);
   5772 			if (sti->sti_kssl_ent != NULL) {
   5773 				kssl_release_ent(sti->sti_kssl_ent, so,
   5774 				    sti->sti_kssl_type);
   5775 				sti->sti_kssl_ent = NULL;
   5776 			}
   5777 			if (sti->sti_kssl_ctx != NULL) {
   5778 				kssl_release_ctx(sti->sti_kssl_ctx);
   5779 				sti->sti_kssl_ctx = NULL;
   5780 			}
   5781 			sti->sti_kssl_type = KSSL_NO_PROXY;
   5782 		}
   5783 		error = strclose(vp, flag, cr);
   5784 		vp->v_stream = NULL;
   5785 		mutex_enter(&so->so_lock);
   5786 	}
   5787 
   5788 	/*
   5789 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
   5790 	 */
   5791 	so_flush_discon_ind(so);
   5792 
   5793 	so_unlock_single(so, SOLOCKED);
   5794 	mutex_exit(&so->so_lock);
   5795 
   5796 	/*
   5797 	 * Needed for STREAMs.
   5798 	 * Decrement the device driver's reference count for streams
   5799 	 * opened via the clone dip. The driver was held in clone_open().
   5800 	 * The absence of clone_close() forces this asymmetry.
   5801 	 */
   5802 	if (so->so_flag & SOCLONE)
   5803 		ddi_rele_driver(getmajor(dev));
   5804 
   5805 	return (error);
   5806 }
   5807 
   5808 static int
   5809 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
   5810     struct cred *cr, int32_t *rvalp)
   5811 {
   5812 	struct vnode *vp = SOTOV(so);
   5813 	sotpi_info_t *sti = SOTOTPI(so);
   5814 	int error = 0;
   5815 
   5816 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
   5817 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
   5818 
   5819 	switch (cmd) {
   5820 	case SIOCSQPTR:
   5821 		/*
   5822 		 * SIOCSQPTR is valid only when helper stream is created
   5823 		 * by the protocol.
   5824 		 */
   5825 	case _I_INSERT:
   5826 	case _I_REMOVE:
   5827 		/*
   5828 		 * Since there's no compelling reason to support these ioctls
   5829 		 * on sockets, and doing so would increase the complexity
   5830 		 * markedly, prevent it.
   5831 		 */
   5832 		return (EOPNOTSUPP);
   5833 
   5834 	case I_FIND:
   5835 	case I_LIST:
   5836 	case I_LOOK:
   5837 	case I_POP:
   5838 	case I_PUSH:
   5839 		/*
   5840 		 * To prevent races and inconsistencies between the actual
   5841 		 * state of the stream and the state according to the sonode,
   5842 		 * we serialize all operations which modify or operate on the
   5843 		 * list of modules on the socket's stream.
   5844 		 */
   5845 		mutex_enter(&sti->sti_plumb_lock);
   5846 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
   5847 		mutex_exit(&sti->sti_plumb_lock);
   5848 		return (error);
   5849 
   5850 	default:
   5851 		if (so->so_version != SOV_STREAM)
   5852 			break;
   5853 
   5854 		/*
   5855 		 * The imaginary "sockmod" has been popped; act as a stream.
   5856 		 */
   5857 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
   5858 	}
   5859 
   5860 	ASSERT(so->so_version != SOV_STREAM);
   5861 
   5862 	/*
   5863 	 * Process socket-specific ioctls.
   5864 	 */
   5865 	switch (cmd) {
   5866 	case FIONBIO: {
   5867 		int32_t value;
   5868 
   5869 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
   5870 		    (mode & (int)FKIOCTL)))
   5871 			return (EFAULT);
   5872 
   5873 		mutex_enter(&so->so_lock);
   5874 		if (value) {
   5875 			so->so_state |= SS_NDELAY;
   5876 		} else {
   5877 			so->so_state &= ~SS_NDELAY;
   5878 		}
   5879 		mutex_exit(&so->so_lock);
   5880 		return (0);
   5881 	}
   5882 
   5883 	case FIOASYNC: {
   5884 		int32_t value;
   5885 
   5886 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
   5887 		    (mode & (int)FKIOCTL)))
   5888 			return (EFAULT);
   5889 
   5890 		mutex_enter(&so->so_lock);
   5891 		/*
   5892 		 * SS_ASYNC flag not already set correctly?
   5893 		 * (!value != !(so->so_state & SS_ASYNC))
   5894 		 * but some engineers find that too hard to read.
   5895 		 */
   5896 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
   5897 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
   5898 			error = so_flip_async(so, vp, mode, cr);
   5899 		mutex_exit(&so->so_lock);
   5900 		return (error);
   5901 	}
   5902 
   5903 	case SIOCSPGRP:
   5904 	case FIOSETOWN: {
   5905 		pid_t pgrp;
   5906 
   5907 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
   5908 		    (mode & (int)FKIOCTL)))
   5909 			return (EFAULT);
   5910 
   5911 		mutex_enter(&so->so_lock);
   5912 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
   5913 		/* Any change? */
   5914 		if (pgrp != so->so_pgrp)
   5915 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
   5916 		mutex_exit(&so->so_lock);
   5917 		return (error);
   5918 	}
   5919 	case SIOCGPGRP:
   5920 	case FIOGETOWN:
   5921 		if (so_copyout(&so->so_pgrp, (void *)arg,
   5922 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
   5923 			return (EFAULT);
   5924 		return (0);
   5925 
   5926 	case SIOCATMARK: {
   5927 		int retval;
   5928 		uint_t so_state;
   5929 
   5930 		/*
   5931 		 * strwaitmark has a finite timeout after which it
   5932 		 * returns -1 if the mark state is undetermined.
   5933 		 * In order to avoid any race between the mark state
   5934 		 * in sockfs and the mark state in the stream head this
   5935 		 * routine loops until the mark state can be determined
   5936 		 * (or the urgent data indication has been removed by some
   5937 		 * other thread).
   5938 		 */
   5939 		do {
   5940 			mutex_enter(&so->so_lock);
   5941 			so_state = so->so_state;
   5942 			mutex_exit(&so->so_lock);
   5943 			if (so_state & SS_RCVATMARK) {
   5944 				retval = 1;
   5945 			} else if (!(so_state & SS_OOBPEND)) {
   5946 				/*
   5947 				 * No SIGURG has been generated -- there is no
   5948 				 * pending or present urgent data. Thus can't
   5949 				 * possibly be at the mark.
   5950 				 */
   5951 				retval = 0;
   5952 			} else {
   5953 				/*
   5954 				 * Have the stream head wait until there is
   5955 				 * either some messages on the read queue, or
   5956 				 * STRATMARK or STRNOTATMARK gets set. The
   5957 				 * STRNOTATMARK flag is used so that the
   5958 				 * transport can send up a MSGNOTMARKNEXT
   5959 				 * M_DATA to indicate that it is not
   5960 				 * at the mark and additional data is not about
   5961 				 * to be send upstream.
   5962 				 *
   5963 				 * If the mark state is undetermined this will
   5964 				 * return -1 and we will loop rechecking the
   5965 				 * socket state.
   5966 				 */
   5967 				retval = strwaitmark(vp);
   5968 			}
   5969 		} while (retval == -1);
   5970 
   5971 		if (so_copyout(&retval, (void *)arg, sizeof (int),
   5972 		    (mode & (int)FKIOCTL)))
   5973 			return (EFAULT);
   5974 		return (0);
   5975 	}
   5976 
   5977 	case I_FDINSERT:
   5978 	case I_SENDFD:
   5979 	case I_RECVFD:
   5980 	case I_ATMARK:
   5981 	case _SIOCSOCKFALLBACK:
   5982 		/*
   5983 		 * These ioctls do not apply to sockets. I_FDINSERT can be
   5984 		 * used to send M_PROTO messages without modifying the socket
   5985 		 * state. I_SENDFD/RECVFD should not be used for socket file
   5986 		 * descriptor passing since they assume a twisted stream.
   5987 		 * SIOCATMARK must be used instead of I_ATMARK.
   5988 		 *
   5989 		 * _SIOCSOCKFALLBACK from an application should never be
   5990 		 * processed.  It is only generated by socktpi_open() or
   5991 		 * in response to I_POP or I_PUSH.
   5992 		 */
   5993 #ifdef DEBUG
   5994 		zcmn_err(getzoneid(), CE_WARN,
   5995 		    "Unsupported STREAMS ioctl 0x%x on socket. "
   5996 		    "Pid = %d\n", cmd, curproc->p_pid);
   5997 #endif /* DEBUG */
   5998 		return (EOPNOTSUPP);
   5999 
   6000 	case _I_GETPEERCRED:
   6001 		if ((mode & FKIOCTL) == 0)
   6002 			return (EINVAL);
   6003 
   6004 		mutex_enter(&so->so_lock);
   6005 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
   6006 			error = ENOTSUP;
   6007 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
   6008 			error = ENOTCONN;
   6009 		} else if (so->so_peercred != NULL) {
   6010 			k_peercred_t *kp = (k_peercred_t *)arg;
   6011 			kp->pc_cr = so->so_peercred;
   6012 			kp->pc_cpid = so->so_cpid;
   6013 			crhold(so->so_peercred);
   6014 		} else {
   6015 			error = EINVAL;
   6016 		}
   6017 		mutex_exit(&so->so_lock);
   6018 		return (error);
   6019 
   6020 	default:
   6021 		/*
   6022 		 * Do the higher-order bits of the ioctl cmd indicate
   6023 		 * that it is an I_* streams ioctl?
   6024 		 */
   6025 		if ((cmd & 0xffffff00U) == STR &&
   6026 		    so->so_version == SOV_SOCKBSD) {
   6027 #ifdef DEBUG
   6028 			zcmn_err(getzoneid(), CE_WARN,
   6029 			    "Unsupported STREAMS ioctl 0x%x on socket. "
   6030 			    "Pid = %d\n", cmd, 	curproc->p_pid);
   6031 #endif /* DEBUG */
   6032 			return (EOPNOTSUPP);
   6033 		}
   6034 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
   6035 	}
   6036 }
   6037 
   6038 /*
   6039  * Handle plumbing-related ioctls.
   6040  */
   6041 static int
   6042 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
   6043     struct cred *cr, int32_t *rvalp)
   6044 {
   6045 	static const char sockmod_name[] = "sockmod";
   6046 	struct sonode	*so = VTOSO(vp);
   6047 	char		mname[FMNAMESZ + 1];
   6048 	int		error;
   6049 	sotpi_info_t	*sti = SOTOTPI(so);
   6050 
   6051 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
   6052 
   6053 	if (so->so_version == SOV_SOCKBSD)
   6054 		return (EOPNOTSUPP);
   6055 
   6056 	if (so->so_version == SOV_STREAM) {
   6057 		/*
   6058 		 * The imaginary "sockmod" has been popped - act as a stream.
   6059 		 * If this is a push of sockmod then change back to a socket.
   6060 		 */
   6061 		if (cmd == I_PUSH) {
   6062 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
   6063 			    (void *)arg, mname, sizeof (mname), NULL);
   6064 
   6065 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
   6066 				dprintso(so, 0, ("socktpi_ioctl: going to "
   6067 				    "socket version\n"));
   6068 				so_stream2sock(so);
   6069 				return (0);
   6070 			}
   6071 		}
   6072 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
   6073 	}
   6074 
   6075 	switch (cmd) {
   6076 	case I_PUSH:
   6077 		if (sti->sti_direct) {
   6078 			mutex_enter(&so->so_lock);
   6079 			so_lock_single(so);
   6080 			mutex_exit(&so->so_lock);
   6081 
   6082 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
   6083 			    cr, rvalp);
   6084 
   6085 			mutex_enter(&so->so_lock);
   6086 			if (error == 0)
   6087 				sti->sti_direct = 0;
   6088 			so_unlock_single(so, SOLOCKED);
   6089 			mutex_exit(&so->so_lock);
   6090 
   6091 			if (error != 0)
   6092 				return (error);
   6093 		}
   6094 
   6095 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
   6096 		if (error == 0)
   6097 			sti->sti_pushcnt++;
   6098 		return (error);
   6099 
   6100 	case I_POP:
   6101 		if (sti->sti_pushcnt == 0) {
   6102 			/* Emulate sockmod being popped */
   6103 			dprintso(so, 0,
   6104 			    ("socktpi_ioctl: going to STREAMS version\n"));
   6105 			return (so_sock2stream(so));
   6106 		}
   6107 
   6108 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
   6109 		if (error == 0)
   6110 			sti->sti_pushcnt--;
   6111 		return (error);
   6112 
   6113 	case I_LIST: {
   6114 		struct str_mlist *kmlistp, *umlistp;
   6115 		struct str_list	kstrlist;
   6116 		ssize_t		kstrlistsize;
   6117 		int		i, nmods;
   6118 
   6119 		STRUCT_DECL(str_list, ustrlist);
   6120 		STRUCT_INIT(ustrlist, mode);
   6121 
   6122 		if (arg == NULL) {
   6123 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
   6124 			if (error == 0)
   6125 				(*rvalp)++;	/* Add one for sockmod */
   6126 			return (error);
   6127 		}
   6128 
   6129 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
   6130 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
   6131 		if (error != 0)
   6132 			return (error);
   6133 
   6134 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
   6135 		if (nmods <= 0)
   6136 			return (EINVAL);
   6137 		/*
   6138 		 * Ceiling nmods at nstrpush to prevent someone from
   6139 		 * maliciously consuming lots of kernel memory.
   6140 		 */
   6141 		nmods = MIN(nmods, nstrpush);
   6142 
   6143 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
   6144 		kstrlist.sl_nmods = nmods;
   6145 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
   6146 
   6147 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
   6148 		    cr, rvalp);
   6149 		if (error != 0)
   6150 			goto done;
   6151 
   6152 		/*
   6153 		 * Considering the module list as a 0-based array of sl_nmods
   6154 		 * modules, sockmod should conceptually exist at slot
   6155 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
   6156 		 * of the module names after so_pushcnt over by one.  We know
   6157 		 * that there will be room to do this since we allocated
   6158 		 * sl_modlist with an additional slot.
   6159 		 */
   6160 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
   6161 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
   6162 
   6163 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
   6164 		kstrlist.sl_nmods++;
   6165 
   6166 		/*
   6167 		 * Copy all of the entries out to ustrlist.
   6168 		 */
   6169 		kmlistp = kstrlist.sl_modlist;
   6170 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
   6171 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
   6172 			error = so_copyout(kmlistp++, umlistp++,
   6173 			    sizeof (struct str_mlist), mode & FKIOCTL);
   6174 			if (error != 0)
   6175 				goto done;
   6176 		}
   6177 
   6178 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
   6179 		    mode & FKIOCTL);
   6180 		if (error == 0)
   6181 			*rvalp = 0;
   6182 	done:
   6183 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
   6184 		return (error);
   6185 	}
   6186 	case I_LOOK:
   6187 		if (sti->sti_pushcnt == 0) {
   6188 			return (so_copyout(sockmod_name, (void *)arg,
   6189 			    sizeof (sockmod_name), mode & FKIOCTL));
   6190 		}
   6191 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
   6192 
   6193 	case I_FIND:
   6194 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
   6195 		if (error && error != EINVAL)
   6196 			return (error);
   6197 
   6198 		/* if not found and string was sockmod return 1 */
   6199 		if (*rvalp == 0 || error == EINVAL) {
   6200 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
   6201 			    (void *)arg, mname, sizeof (mname), NULL);
   6202 			if (error == ENAMETOOLONG)
   6203 				error = EINVAL;
   6204 
   6205 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
   6206 				*rvalp = 1;
   6207 		}
   6208 		return (error);
   6209 
   6210 	default:
   6211 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
   6212 		break;
   6213 	}
   6214 
   6215 	return (0);
   6216 }
   6217 
   6218 /*
   6219  * Wrapper around the streams poll routine that implements socket poll
   6220  * semantics.
   6221  * The sockfs never calls pollwakeup itself - the stream head take care
   6222  * of all pollwakeups. Since sockfs never holds so_lock when calling the
   6223  * stream head there can never be a deadlock due to holding so_lock across
   6224  * pollwakeup and acquiring so_lock in this routine.
   6225  *
   6226  * However, since the performance of VOP_POLL is critical we avoid
   6227  * acquiring so_lock here. This is based on two assumptions:
   6228  *  - The poll implementation holds locks to serialize the VOP_POLL call
   6229  *    and a pollwakeup for the same pollhead. This ensures that should
   6230  *    e.g. so_state change during a socktpi_poll call the pollwakeup
   6231  *    (which strsock_* and strrput conspire to issue) is issued after
   6232  *    the state change. Thus the pollwakeup will block until VOP_POLL has
   6233  *    returned and then wake up poll and have it call VOP_POLL again.
   6234  *  - The reading of so_state without holding so_lock does not result in
   6235  *    stale data that is older than the latest state change that has dropped
   6236  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
   6237  *    memory barrier to force the data into the coherency domain.
   6238  */
   6239 static int
   6240 sotpi_poll(
   6241 	struct sonode	*so,
   6242 	short		events,
   6243 	int		anyyet,
   6244 	short		*reventsp,
   6245 	struct pollhead **phpp)
   6246 {
   6247 	short origevents = events;
   6248 	struct vnode *vp = SOTOV(so);
   6249 	int error;
   6250 	int so_state = so->so_state;	/* snapshot */
   6251 	sotpi_info_t *sti = SOTOTPI(so);
   6252 
   6253 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
   6254 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
   6255 
   6256 	ASSERT(vp->v_type == VSOCK);
   6257 	ASSERT(vp->v_stream != NULL);
   6258 
   6259 	if (so->so_version == SOV_STREAM) {
   6260 		/* The imaginary "sockmod" has been popped - act as a stream */
   6261 		return (strpoll(vp->v_stream, events, anyyet,
   6262 		    reventsp, phpp));
   6263 	}
   6264 
   6265 	if (!(so_state & SS_ISCONNECTED) &&
   6266 	    (so->so_mode & SM_CONNREQUIRED)) {
   6267 		/* Not connected yet - turn off write side events */
   6268 		events &= ~(POLLOUT|POLLWRBAND);
   6269 	}
   6270 	/*
   6271 	 * Check for errors without calling strpoll if the caller wants them.
   6272 	 * In sockets the errors are represented as input/output events
   6273 	 * and there is no need to ask the stream head for this information.
   6274 	 */
   6275 	if (so->so_error != 0 &&
   6276 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
   6277 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
   6278 		return (0);
   6279 	}
   6280 	/*
   6281 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
   6282 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
   6283 	 * will not trigger a POLLIN event with POLLRDDATA set.
   6284 	 * The handling of urgent data (causing POLLRDBAND) is done by
   6285 	 * inspecting SS_OOBPEND below.
   6286 	 */
   6287 	events |= POLLRDDATA;
   6288 
   6289 	/*
   6290 	 * After shutdown(output) a stream head write error is set.
   6291 	 * However, we should not return output events.
   6292 	 */
   6293 	events |= POLLNOERR;
   6294 	error = strpoll(vp->v_stream, events, anyyet,
   6295 	    reventsp, phpp);
   6296 	if (error)
   6297 		return (error);
   6298 
   6299 	ASSERT(!(*reventsp & POLLERR));
   6300 
   6301 	/*
   6302 	 * Notes on T_CONN_IND handling for sockets.
   6303 	 *
   6304 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
   6305 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
   6306 	 *
   6307 	 * Since the so_lock is not held, soqueueconnind() may have run
   6308 	 * and a T_CONN_IND may be waiting. We now check for any queued
   6309 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
   6310 	 * to ensure poll returns.
   6311 	 *
   6312 	 * However:
   6313 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
   6314 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
   6315 	 * the following actions will occur; taken together they ensure the
   6316 	 * syscall will return.
   6317 	 *
   6318 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
   6319 	 *    the accept() was run on a non-blocking socket sowaitconnind()
   6320 	 *    may have already returned EWOULDBLOCK, so not be waiting to
   6321 	 *    process the message. Additionally socktpi_poll() has probably
   6322 	 *    proceeded past the sti_conn_ind_head check below.
   6323 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
   6324 	 *    this thread,  however that could occur before poll_common()
   6325 	 *    has entered cv_wait.
   6326 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
   6327 	 *
   6328 	 * Before proceeding to cv_wait() in poll_common() for an event,
   6329 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
   6330 	 * and if set, re-calls strpoll() to ensure the late arriving
   6331 	 * T_CONN_IND is recognized, and pollsys() returns.
   6332 	 */
   6333 
   6334 	if (sti->sti_conn_ind_head != NULL)
   6335 		*reventsp |= (POLLIN|POLLRDNORM) & events;
   6336 
   6337 	if (so->so_state & SS_OOBPEND)
   6338 		*reventsp |= POLLRDBAND & events;
   6339 
   6340 	if (sti->sti_nl7c_rcv_mp != NULL) {
   6341 		*reventsp |= (POLLIN|POLLRDNORM) & events;
   6342 	}
   6343 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
   6344 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
   6345 		sti->sti_nl7c_flags |= NL7C_POLLIN;
   6346 	}
   6347 
   6348 	return (0);
   6349 }
   6350 
   6351 /*ARGSUSED*/
   6352 static int
   6353 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
   6354 {
   6355 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
   6356 	int error = 0;
   6357 
   6358 	error = sonode_constructor(buf, cdrarg, kmflags);
   6359 	if (error != 0)
   6360 		return (error);
   6361 
   6362 	error = i_sotpi_info_constructor(&st->st_info);
   6363 	if (error != 0)
   6364 		sonode_destructor(buf, cdrarg);
   6365 
   6366 	st->st_sonode.so_priv = &st->st_info;
   6367 
   6368 	return (error);
   6369 }
   6370 
   6371 /*ARGSUSED1*/
   6372 static void
   6373 socktpi_destructor(void *buf, void *cdrarg)
   6374 {
   6375 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
   6376 
   6377 	ASSERT(st->st_sonode.so_priv == &st->st_info);
   6378 	st->st_sonode.so_priv = NULL;
   6379 
   6380 	i_sotpi_info_destructor(&st->st_info);
   6381 	sonode_destructor(buf, cdrarg);
   6382 }
   6383 
   6384 static int
   6385 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
   6386 {
   6387 	int retval;
   6388 
   6389 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
   6390 		struct sonode *so = (struct sonode *)buf;
   6391 		sotpi_info_t *sti = SOTOTPI(so);
   6392 
   6393 		mutex_enter(&socklist.sl_lock);
   6394 
   6395 		sti->sti_next_so = socklist.sl_list;
   6396 		sti->sti_prev_so = NULL;
   6397 		if (sti->sti_next_so != NULL)
   6398 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
   6399 		socklist.sl_list = so;
   6400 
   6401 		mutex_exit(&socklist.sl_lock);
   6402 
   6403 	}
   6404 	return (retval);
   6405 }
   6406 
   6407 static void
   6408 socktpi_unix_destructor(void *buf, void *cdrarg)
   6409 {
   6410 	struct sonode	*so = (struct sonode *)buf;
   6411 	sotpi_info_t	*sti = SOTOTPI(so);
   6412 
   6413 	mutex_enter(&socklist.sl_lock);
   6414 
   6415 	if (sti->sti_next_so != NULL)
   6416 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
   6417 	if (sti->sti_prev_so != NULL)
   6418 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
   6419 	else
   6420 		socklist.sl_list = sti->sti_next_so;
   6421 
   6422 	mutex_exit(&socklist.sl_lock);
   6423 
   6424 	socktpi_destructor(buf, cdrarg);
   6425 }
   6426 
   6427 int
   6428 socktpi_init(void)
   6429 {
   6430 	/*
   6431 	 * Create sonode caches.  We create a special one for AF_UNIX so
   6432 	 * that we can track them for netstat(1m).
   6433 	 */
   6434 	socktpi_cache = kmem_cache_create("socktpi_cache",
   6435 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
   6436 	    socktpi_destructor, NULL, NULL, NULL, 0);
   6437 
   6438 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
   6439 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
   6440 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
   6441 
   6442 	return (0);
   6443 }
   6444 
   6445 /*
   6446  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
   6447  *
   6448  * Caller must still update state and mode using sotpi_update_state().
   6449  */
   6450 int
   6451 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
   6452     boolean_t *direct, queue_t **qp, struct cred *cr)
   6453 {
   6454 	sotpi_info_t *sti;
   6455 	struct sockparams *origsp = so->so_sockparams;
   6456 	sock_lower_handle_t handle = so->so_proto_handle;
   6457 	struct stdata *stp;
   6458 	struct vnode *vp;
   6459 	queue_t *q;
   6460 	int error = 0;
   6461 
   6462 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
   6463 	    SS_FALLBACK_PENDING);
   6464 	ASSERT(SOCK_IS_NONSTR(so));
   6465 
   6466 	*qp = NULL;
   6467 	*direct = B_FALSE;
   6468 	so->so_sockparams = newsp;
   6469 	/*
   6470 	 * Allocate and initalize fields required by TPI.
   6471 	 */
   6472 	(void) sotpi_info_create(so, KM_SLEEP);
   6473 	sotpi_info_init(so);
   6474 
   6475 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
   6476 		sotpi_info_fini(so);
   6477 		sotpi_info_destroy(so);
   6478 		return (error);
   6479 	}
   6480 	ASSERT(handle == so->so_proto_handle);
   6481 	sti = SOTOTPI(so);
   6482 	if (sti->sti_direct != 0)
   6483 		*direct = B_TRUE;
   6484 
   6485 	/*
   6486 	 * When it comes to urgent data we have two cases to deal with;
   6487 	 * (1) The oob byte has already arrived, or (2) the protocol has
   6488 	 * notified that oob data is pending, but it has not yet arrived.
   6489 	 *
   6490 	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
   6491 	 * in the byte stream the oob byte is. For (2) we have to send a
   6492 	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
   6493 	 * the oob byte will be the next byte from the protocol.
   6494 	 *
   6495 	 * So in the worst case we need two mblks, one for the signal, another
   6496 	 * for mark indication. In that case we use the exdata_mp for the sig.
   6497 	 */
   6498 	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
   6499 	    STR_NOSIG, NULL);
   6500 	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
   6501 
   6502 	/*
   6503 	 * Keep the original sp around so we can properly dispose of the
   6504 	 * sonode when the socket is being closed.
   6505 	 */
   6506 	sti->sti_orig_sp = origsp;
   6507 
   6508 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
   6509 	so_alloc_addr(so, so->so_max_addr_len);
   6510 
   6511 	/*
   6512 	 * If the application has done a SIOCSPGRP, make sure the
   6513 	 * STREAM head is aware. This needs to take place before
   6514 	 * the protocol start sending up messages. Otherwise we
   6515 	 * might miss to generate SIGPOLL.
   6516 	 *
   6517 	 * It is possible that the application will receive duplicate
   6518 	 * signals if some were already generated for either data or
   6519 	 * connection indications.
   6520 	 */
   6521 	if (so->so_pgrp != 0) {
   6522 		if (so_set_events(so, so->so_vnode, cr) != 0)
   6523 			so->so_pgrp = 0;
   6524 	}
   6525 
   6526 	/*
   6527 	 * Determine which queue to use.
   6528 	 */
   6529 	vp = SOTOV(so);
   6530 	stp = vp->v_stream;
   6531 	ASSERT(stp != NULL);
   6532 	q = stp->sd_wrq->q_next;
   6533 
   6534 	/*
   6535 	 * Skip any modules that may have been auto pushed when the device
   6536 	 * was opened
   6537 	 */
   6538 	while (q->q_next != NULL)
   6539 		q = q->q_next;
   6540 	*qp = _RD(q);
   6541 
   6542 	/* This is now a STREAMS sockets */
   6543 	so->so_not_str = B_FALSE;
   6544 
   6545 	return (error);
   6546 }
   6547 
   6548 /*
   6549  * Revert a TPI sonode. It is only allowed to revert the sonode during
   6550  * the fallback process.
   6551  */
   6552 void
   6553 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
   6554 {
   6555 	vnode_t *vp = SOTOV(so);
   6556 
   6557 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
   6558 	    SS_FALLBACK_PENDING);
   6559 	ASSERT(!SOCK_IS_NONSTR(so));
   6560 	ASSERT(vp->v_stream != NULL);
   6561 
   6562 	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
   6563 		freeb(SOTOTPI(so)->sti_exdata_mp);
   6564 		SOTOTPI(so)->sti_exdata_mp = NULL;
   6565 	}
   6566 
   6567 	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
   6568 		freeb(SOTOTPI(so)->sti_urgmark_mp);
   6569 		SOTOTPI(so)->sti_urgmark_mp = NULL;
   6570 	}
   6571 
   6572 	strclean(vp);
   6573 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
   6574 
   6575 	/*
   6576 	 * Restore the original sockparams. The caller is responsible for
   6577 	 * dropping the ref to the new sp.
   6578 	 */
   6579 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
   6580 
   6581 	sotpi_info_fini(so);
   6582 	sotpi_info_destroy(so);
   6583 
   6584 	/* This is no longer a STREAMS sockets */
   6585 	so->so_not_str = B_TRUE;
   6586 }
   6587 
   6588 void
   6589 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
   6590     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
   6591     socklen_t faddrlen, short opts)
   6592 {
   6593 	sotpi_info_t *sti = SOTOTPI(so);
   6594 
   6595 	so_proc_tcapability_ack(so, tcap);
   6596 
   6597 	so->so_options |= opts;
   6598 
   6599 	/*
   6600 	 * Determine whether the foreign and local address are valid
   6601 	 */
   6602 	if (laddrlen != 0) {
   6603 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
   6604 		sti->sti_laddr_len = laddrlen;
   6605 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
   6606 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
   6607 	}
   6608 
   6609 	if (faddrlen != 0) {
   6610 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
   6611 		sti->sti_faddr_len = faddrlen;
   6612 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
   6613 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
   6614 	}
   6615 
   6616 }
   6617 
   6618 /*
   6619  * Allocate enough space to cache the local and foreign addresses.
   6620  */
   6621 void
   6622 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
   6623 {
   6624 	sotpi_info_t *sti = SOTOTPI(so);
   6625 
   6626 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
   6627 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
   6628 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
   6629 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
   6630 	so->so_max_addr_len = sti->sti_laddr_maxlen;
   6631 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
   6632 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
   6633 	    + sti->sti_laddr_maxlen);
   6634 
   6635 	if (so->so_family == AF_UNIX) {
   6636 		/*
   6637 		 * Initialize AF_UNIX related fields.
   6638 		 */
   6639 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
   6640 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
   6641 	}
   6642 }
   6643 
   6644 
   6645 sotpi_info_t *
   6646 sotpi_sototpi(struct sonode *so)
   6647 {
   6648 	sotpi_info_t *sti;
   6649 
   6650 	ASSERT(so != NULL);
   6651 
   6652 	sti = (sotpi_info_t *)so->so_priv;
   6653 
   6654 	ASSERT(sti != NULL);
   6655 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
   6656 
   6657 	return (sti);
   6658 }
   6659 
   6660 static int
   6661 i_sotpi_info_constructor(sotpi_info_t *sti)
   6662 {
   6663 	sti->sti_magic		= SOTPI_INFO_MAGIC;
   6664 	sti->sti_ack_mp		= NULL;
   6665 	sti->sti_discon_ind_mp	= NULL;
   6666 	sti->sti_ux_bound_vp	= NULL;
   6667 	sti->sti_unbind_mp	= NULL;
   6668 
   6669 	sti->sti_conn_ind_head	= NULL;
   6670 	sti->sti_conn_ind_tail	= NULL;
   6671 
   6672 	sti->sti_laddr_sa	= NULL;
   6673 	sti->sti_faddr_sa	= NULL;
   6674 
   6675 	sti->sti_nl7c_flags	= 0;
   6676 	sti->sti_nl7c_uri	= NULL;
   6677 	sti->sti_nl7c_rcv_mp	= NULL;
   6678 
   6679 	sti->sti_exdata_mp	= NULL;
   6680 	sti->sti_urgmark_mp	= NULL;
   6681 
   6682 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
   6683 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
   6684 
   6685 	return (0);
   6686 }
   6687 
   6688 static void
   6689 i_sotpi_info_destructor(sotpi_info_t *sti)
   6690 {
   6691 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
   6692 	ASSERT(sti->sti_ack_mp == NULL);
   6693 	ASSERT(sti->sti_discon_ind_mp == NULL);
   6694 	ASSERT(sti->sti_ux_bound_vp == NULL);
   6695 	ASSERT(sti->sti_unbind_mp == NULL);
   6696 
   6697 	ASSERT(sti->sti_conn_ind_head == NULL);
   6698 	ASSERT(sti->sti_conn_ind_tail == NULL);
   6699 
   6700 	ASSERT(sti->sti_laddr_sa == NULL);
   6701 	ASSERT(sti->sti_faddr_sa == NULL);
   6702 
   6703 	ASSERT(sti->sti_nl7c_flags == 0);
   6704 	ASSERT(sti->sti_nl7c_uri == NULL);
   6705 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
   6706 
   6707 	ASSERT(sti->sti_exdata_mp == NULL);
   6708 	ASSERT(sti->sti_urgmark_mp == NULL);
   6709 
   6710 	mutex_destroy(&sti->sti_plumb_lock);
   6711 	cv_destroy(&sti->sti_ack_cv);
   6712 }
   6713 
   6714 /*
   6715  * Creates and attaches TPI information to the given sonode
   6716  */
   6717 static boolean_t
   6718 sotpi_info_create(struct sonode *so, int kmflags)
   6719 {
   6720 	sotpi_info_t *sti;
   6721 
   6722 	ASSERT(so->so_priv == NULL);
   6723 
   6724 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
   6725 		return (B_FALSE);
   6726 
   6727 	if (i_sotpi_info_constructor(sti) != 0) {
   6728 		kmem_free(sti, sizeof (*sti));
   6729 		return (B_FALSE);
   6730 	}
   6731 
   6732 	so->so_priv = (void *)sti;
   6733 	return (B_TRUE);
   6734 }
   6735 
   6736 /*
   6737  * Initializes the TPI information.
   6738  */
   6739 static void
   6740 sotpi_info_init(struct sonode *so)
   6741 {
   6742 	struct vnode *vp = SOTOV(so);
   6743 	sotpi_info_t *sti = SOTOTPI(so);
   6744 	time_t now;
   6745 
   6746 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
   6747 	vp->v_rdev	= sti->sti_dev;
   6748 
   6749 	sti->sti_orig_sp = NULL;
   6750 
   6751 	sti->sti_pushcnt = 0;
   6752 
   6753 	now = gethrestime_sec();
   6754 	sti->sti_atime	= now;
   6755 	sti->sti_mtime	= now;
   6756 	sti->sti_ctime	= now;
   6757 
   6758 	sti->sti_eaddr_mp = NULL;
   6759 	sti->sti_delayed_error = 0;
   6760 
   6761 	sti->sti_provinfo = NULL;
   6762 
   6763 	sti->sti_oobcnt = 0;
   6764 	sti->sti_oobsigcnt = 0;
   6765 
   6766 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
   6767 
   6768 	sti->sti_laddr_sa	= 0;
   6769 	sti->sti_faddr_sa	= 0;
   6770 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
   6771 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
   6772 
   6773 	sti->sti_laddr_valid = 0;
   6774 	sti->sti_faddr_valid = 0;
   6775 	sti->sti_faddr_noxlate = 0;
   6776 
   6777 	sti->sti_direct = 0;
   6778 
   6779 	ASSERT(sti->sti_ack_mp == NULL);
   6780 	ASSERT(sti->sti_ux_bound_vp == NULL);
   6781 	ASSERT(sti->sti_unbind_mp == NULL);
   6782 
   6783 	ASSERT(sti->sti_conn_ind_head == NULL);
   6784 	ASSERT(sti->sti_conn_ind_tail == NULL);
   6785 
   6786 	/* Initialize the kernel SSL proxy fields */
   6787 	sti->sti_kssl_type = KSSL_NO_PROXY;
   6788 	sti->sti_kssl_ent = NULL;
   6789 	sti->sti_kssl_ctx = NULL;
   6790 }
   6791 
   6792 /*
   6793  * Given a sonode, grab the TPI info and free any data.
   6794  */
   6795 static void
   6796 sotpi_info_fini(struct sonode *so)
   6797 {
   6798 	sotpi_info_t *sti = SOTOTPI(so);
   6799 	mblk_t *mp;
   6800 
   6801 	ASSERT(sti->sti_discon_ind_mp == NULL);
   6802 
   6803 	if ((mp = sti->sti_conn_ind_head) != NULL) {
   6804 		mblk_t *mp1;
   6805 
   6806 		while (mp) {
   6807 			mp1 = mp->b_next;
   6808 			mp->b_next = NULL;
   6809 			freemsg(mp);
   6810 			mp = mp1;
   6811 		}
   6812 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
   6813 	}
   6814 
   6815 	/*
   6816 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
   6817 	 * indirect them.  It also uses so_count as a validity test.
   6818 	 */
   6819 	mutex_enter(&so->so_lock);
   6820 
   6821 	if (sti->sti_laddr_sa) {
   6822 		ASSERT((caddr_t)sti->sti_faddr_sa ==
   6823 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
   6824 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
   6825 		sti->sti_laddr_valid = 0;
   6826 		sti->sti_faddr_valid = 0;
   6827 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
   6828 		sti->sti_laddr_sa = NULL;
   6829 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
   6830 		sti->sti_faddr_sa = NULL;
   6831 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
   6832 	}
   6833 
   6834 	mutex_exit(&so->so_lock);
   6835 
   6836 	if ((mp = sti->sti_eaddr_mp) != NULL) {
   6837 		freemsg(mp);
   6838 		sti->sti_eaddr_mp = NULL;
   6839 		sti->sti_delayed_error = 0;
   6840 	}
   6841 
   6842 	if ((mp = sti->sti_ack_mp) != NULL) {
   6843 		freemsg(mp);
   6844 		sti->sti_ack_mp = NULL;
   6845 	}
   6846 
   6847 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
   6848 		sti->sti_nl7c_rcv_mp = NULL;
   6849 		freemsg(mp);
   6850 	}
   6851 	sti->sti_nl7c_rcv_rval = 0;
   6852 	if (sti->sti_nl7c_uri != NULL) {
   6853 		nl7c_urifree(so);
   6854 		/* urifree() cleared nl7c_uri */
   6855 	}
   6856 	if (sti->sti_nl7c_flags) {
   6857 		sti->sti_nl7c_flags = 0;
   6858 	}
   6859 
   6860 	ASSERT(sti->sti_ux_bound_vp == NULL);
   6861 	if ((mp = sti->sti_unbind_mp) != NULL) {
   6862 		freemsg(mp);
   6863 		sti->sti_unbind_mp = NULL;
   6864 	}
   6865 }
   6866 
   6867 /*
   6868  * Destroys the TPI information attached to a sonode.
   6869  */
   6870 static void
   6871 sotpi_info_destroy(struct sonode *so)
   6872 {
   6873 	sotpi_info_t *sti = SOTOTPI(so);
   6874 
   6875 	i_sotpi_info_destructor(sti);
   6876 	kmem_free(sti, sizeof (*sti));
   6877 
   6878 	so->so_priv = NULL;
   6879 }
   6880 
   6881 /*
   6882  * Create the global sotpi socket module entry. It will never be freed.
   6883  */
   6884 smod_info_t *
   6885 sotpi_smod_create(void)
   6886 {
   6887 	smod_info_t *smodp;
   6888 
   6889 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
   6890 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
   6891 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
   6892 	/*
   6893 	 * Initialize the smod_refcnt to 1 so it will never be freed.
   6894 	 */
   6895 	smodp->smod_refcnt = 1;
   6896 	smodp->smod_uc_version = SOCK_UC_VERSION;
   6897 	smodp->smod_dc_version = SOCK_DC_VERSION;
   6898 	smodp->smod_sock_create_func = &sotpi_create;
   6899 	smodp->smod_sock_destroy_func = &sotpi_destroy;
   6900 	return (smodp);
   6901 }
   6902