Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #include <sys/types.h>
     28 #include <sys/stream.h>
     29 #include <sys/stropts.h>
     30 #include <sys/strlog.h>
     31 #include <sys/strsun.h>
     32 #define	_SUN_TPI_VERSION 2
     33 #include <sys/tihdr.h>
     34 #include <sys/timod.h>
     35 #include <sys/ddi.h>
     36 #include <sys/sunddi.h>
     37 #include <sys/strsubr.h>
     38 #include <sys/suntpi.h>
     39 #include <sys/xti_inet.h>
     40 #include <sys/cmn_err.h>
     41 #include <sys/kmem.h>
     42 #include <sys/cred_impl.h>
     43 #include <sys/policy.h>
     44 #include <sys/priv.h>
     45 #include <sys/ucred.h>
     46 #include <sys/zone.h>
     47 
     48 #include <sys/sockio.h>
     49 #include <sys/socket.h>
     50 #include <sys/socketvar.h>
     51 #include <sys/vtrace.h>
     52 #include <sys/sdt.h>
     53 #include <sys/debug.h>
     54 #include <sys/isa_defs.h>
     55 #include <sys/random.h>
     56 #include <netinet/in.h>
     57 #include <netinet/ip6.h>
     58 #include <netinet/icmp6.h>
     59 #include <netinet/udp.h>
     60 
     61 #include <inet/common.h>
     62 #include <inet/ip.h>
     63 #include <inet/ip_impl.h>
     64 #include <inet/ipsec_impl.h>
     65 #include <inet/ip6.h>
     66 #include <inet/ip_ire.h>
     67 #include <inet/ip_if.h>
     68 #include <inet/ip_multi.h>
     69 #include <inet/ip_ndp.h>
     70 #include <inet/proto_set.h>
     71 #include <inet/mib2.h>
     72 #include <inet/nd.h>
     73 #include <inet/optcom.h>
     74 #include <inet/snmpcom.h>
     75 #include <inet/kstatcom.h>
     76 #include <inet/ipclassifier.h>
     77 
     78 #include <sys/tsol/label.h>
     79 #include <sys/tsol/tnet.h>
     80 
     81 #include <inet/rawip_impl.h>
     82 
     83 #include <sys/disp.h>
     84 
     85 /*
     86  * Synchronization notes:
     87  *
     88  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
     89  * conn_lock to protect the icmp_t.
     90  *
     91  * Plumbing notes:
     92  * ICMP is always a device driver. For compatibility with mibopen() code
     93  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
     94  * dummy module.
     95  */
     96 
/*
 * Forward declarations.
 *
 * Everything marked static is private to this file.  The few prototypes
 * without static (icmp_opt_set, icmp_opt_get, rawip_getsockname and
 * rawip_getpeername) have external linkage.
 */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Not static: option set/get entry points with external linkage */
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
		    int level, int name, uint_t inlen,
		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
		    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
		    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
		    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void	icmp_wput(queue_t *q, mblk_t *mp);
static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Not static: socket name lookups with external linkage */
int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
    170 
/*
 * Module information referenced by every qinit structure defined below.
 * NOTE(review): the positional fields are presumably module id, name,
 * min/max packet size and high/low water marks -- confirm against the
 * struct module_info definition in <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
    174 
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 * All of the qinit structures below share icmp_mod_info.
 */
/* First member of icmpinfov4: opens through icmp_openv4, closes via icmp_close */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

/* First member of icmpinfov6: identical except that it opens via icmp_openv6 */
static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/*
 * Second member of both streamtabs: icmp_wput and ip_wsrv are installed
 * (presumably as the put and service procedures -- confirm against
 * struct qinit).
 */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
    195 
/*
 * For AF_INET aka /dev/icmp.
 * Pairs the IPv4 open qinit with the shared icmpwinit.
 */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/*
 * For AF_INET6 aka /dev/icmp6.
 * Pairs the IPv6 open qinit with the shared icmpwinit.
 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
    205 
/*
 * Never explicitly initialized, so both are all-zero static objects.
 * icmp_tpi_bind copies them to start from a cleared sockaddr.
 */
static sin_t	sin_null;	/* Zero address for quick clears */
static sin6_t	sin6_null;	/* Zero address for quick clears */
    208 
/*
 * Default structure copied into T_INFO_ACK messages.
 * ADDR_size and OPT_size are left zero here; per the field comments they
 * are filled in elsewhere, as is CURRENT_state.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
    223 
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * The is_* macros below pick entries out of is_param_arr by index.
 * NOTE(review): is_param_arr is presumably the per-stack copy of this
 * table, so the row order here and the macro indices must be kept in
 * step -- confirm where the copy is made.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
	{ 0,	1,	0,	"icmp_pmtu_discovery" },
	{ 0,	1,	0,	"icmp_sendto_ignerr" },
};
/* Index N selects row N of the table above */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
#define	is_pmtu_discovery		is_param_arr[8].icmp_param_value
#define	is_sendto_ignerr		is_param_arr[9].icmp_param_value
    252 
/* Shorthand for a pointer to the TPI primitive union */
typedef union T_primitives *t_primp_t;
    254 
    255 /*
    256  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
    257  * passed to icmp_wput.
    258  * It calls IP to verify the local IP address, and calls IP to insert
    259  * the conn_t in the fanout table.
    260  * If everything is ok it then sends the T_BIND_ACK back up.
    261  */
    262 static void
    263 icmp_tpi_bind(queue_t *q, mblk_t *mp)
    264 {
    265 	int	error;
    266 	struct sockaddr *sa;
    267 	struct T_bind_req *tbr;
    268 	socklen_t	len;
    269 	sin_t	*sin;
    270 	sin6_t	*sin6;
    271 	icmp_t		*icmp;
    272 	conn_t	*connp = Q_TO_CONN(q);
    273 	mblk_t *mp1;
    274 	cred_t *cr;
    275 
    276 	/*
    277 	 * All Solaris components should pass a db_credp
    278 	 * for this TPI message, hence we ASSERT.
    279 	 * But in case there is some other M_PROTO that looks
    280 	 * like a TPI message sent by some other kernel
    281 	 * component, we check and return an error.
    282 	 */
    283 	cr = msg_getcred(mp, NULL);
    284 	ASSERT(cr != NULL);
    285 	if (cr == NULL) {
    286 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
    287 		return;
    288 	}
    289 
    290 	icmp = connp->conn_icmp;
    291 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
    292 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    293 		    "icmp_bind: bad req, len %u",
    294 		    (uint_t)(mp->b_wptr - mp->b_rptr));
    295 		icmp_err_ack(q, mp, TPROTO, 0);
    296 		return;
    297 	}
    298 
    299 	if (icmp->icmp_state != TS_UNBND) {
    300 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    301 		    "icmp_bind: bad state, %u", icmp->icmp_state);
    302 		icmp_err_ack(q, mp, TOUTSTATE, 0);
    303 		return;
    304 	}
    305 
    306 	/*
    307 	 * Reallocate the message to make sure we have enough room for an
    308 	 * address.
    309 	 */
    310 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
    311 	if (mp1 == NULL) {
    312 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    313 		return;
    314 	}
    315 	mp = mp1;
    316 
    317 	/* Reset the message type in preparation for shipping it back. */
    318 	DB_TYPE(mp) = M_PCPROTO;
    319 	tbr = (struct T_bind_req *)mp->b_rptr;
    320 	len = tbr->ADDR_length;
    321 	switch (len) {
    322 	case 0:	/* request for a generic port */
    323 		tbr->ADDR_offset = sizeof (struct T_bind_req);
    324 		if (connp->conn_family == AF_INET) {
    325 			tbr->ADDR_length = sizeof (sin_t);
    326 			sin = (sin_t *)&tbr[1];
    327 			*sin = sin_null;
    328 			sin->sin_family = AF_INET;
    329 			mp->b_wptr = (uchar_t *)&sin[1];
    330 			sa = (struct sockaddr *)sin;
    331 			len = sizeof (sin_t);
    332 		} else {
    333 			ASSERT(connp->conn_family == AF_INET6);
    334 			tbr->ADDR_length = sizeof (sin6_t);
    335 			sin6 = (sin6_t *)&tbr[1];
    336 			*sin6 = sin6_null;
    337 			sin6->sin6_family = AF_INET6;
    338 			mp->b_wptr = (uchar_t *)&sin6[1];
    339 			sa = (struct sockaddr *)sin6;
    340 			len = sizeof (sin6_t);
    341 		}
    342 		break;
    343 
    344 	case sizeof (sin_t):	/* Complete IPv4 address */
    345 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
    346 		    sizeof (sin_t));
    347 		break;
    348 
    349 	case sizeof (sin6_t):	/* Complete IPv6 address */
    350 		sa = (struct sockaddr *)mi_offset_param(mp,
    351 		    tbr->ADDR_offset, sizeof (sin6_t));
    352 		break;
    353 
    354 	default:
    355 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    356 		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
    357 		icmp_err_ack(q, mp, TBADADDR, 0);
    358 		return;
    359 	}
    360 
    361 	error = rawip_do_bind(connp, sa, len);
    362 	if (error != 0) {
    363 		if (error > 0) {
    364 			icmp_err_ack(q, mp, TSYSERR, error);
    365 		} else {
    366 			icmp_err_ack(q, mp, -error, 0);
    367 		}
    368 	} else {
    369 		tbr->PRIM_type = T_BIND_ACK;
    370 		qreply(q, mp);
    371 	}
    372 }
    373 
/*
 * Bind the raw IP endpoint connp to the local address in sa (len bytes).
 * Called by icmp_tpi_bind for the TPI path.
 *
 * len must be sizeof (sin_t) or sizeof (sin6_t) and the address family must
 * agree with conn_family.  A non-wildcard address is classified with
 * ip_laddr_verify_v4()/ip_laddr_verify_v6(); the wildcard keeps the default
 * IPVL_UNICAST_UP classification.
 *
 * Returns 0 on success, a positive errno value, or a negated TLI error
 * (-TOUTSTATE when the endpoint is not in TS_UNBND).  Failures that occur
 * after the conn_t has been modified go through late_error, which puts the
 * endpoint back into TS_UNBND with zeroed addresses.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
	sin_t		*sin;
	sin6_t		*sin6;
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	in_port_t	lport;		/* Network byte order */
	ipaddr_t	v4src;		/* Set if AF_INET */
	in6_addr_t	v6src;
	uint_t		scopeid = 0;
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* The address is dereferenced below, so it must exist and be aligned */
	if (sa == NULL || !OK_32PTR((char *)sa)) {
		return (EINVAL);
	}

	switch (len) {
	case sizeof (sin_t):    /* Complete IPv4 address */
		sin = (sin_t *)sa;
		if (sin->sin_family != AF_INET ||
		    connp->conn_family != AF_INET) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		v4src = sin->sin_addr.s_addr;
		/* From here on the address is carried in v6src */
		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
		if (v4src != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
			    B_TRUE);
		}
		lport = sin->sin_port;
		break;
	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		if (sin6->sin6_family != AF_INET6 ||
		    connp->conn_family != AF_INET6) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/* TSYSERR, EADDRNOTAVAIL */
			return (EADDRNOTAVAIL);
		}
		v6src = sin6->sin6_addr;
		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			/* sin6_scope_id is only honoured for link-scope addresses */
			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
				scopeid = sin6->sin6_scope_id;
			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
			    B_TRUE, scopeid);
		}
		lport = sin6->sin6_port;
		break;

	default:
		/* TBADADDR */
		return (EADDRNOTAVAIL);
	}

	/* Is the local address a valid unicast, multicast, or broadcast? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	/*
	 * The state must be TS_UNBND.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_state != TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		return (-TOUTSTATE);
	}

	/*
	 * Copy the source address into our icmp structure.  This address
	 * may still be zero; if so, ip will fill in the correct address
	 * each time an outbound packet is passed to it.
	 * If we are binding to a broadcast or multicast address then
	 * we just set the conn_bound_addr since we don't want to use
	 * that as the source address when sending.
	 */
	connp->conn_bound_addr_v6 = v6src;
	connp->conn_laddr_v6 = v6src;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
		connp->conn_ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	/* IPVL_BAD was rejected above, so these cases cover what remains */
	switch (laddr_type) {
	case IPVL_UNICAST_UP:
	case IPVL_UNICAST_DOWN:
		connp->conn_saddr_v6 = v6src;
		connp->conn_mcbc_bind = B_FALSE;
		break;
	case IPVL_MCAST:
	case IPVL_BCAST:
		/* ip_set_destination will pick a source address later */
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_mcbc_bind = B_TRUE;
		break;
	}

	/* Any errors after this point should use late_error */

	/*
	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
	 * with IPPROTO_TCP.
	 */
	connp->conn_lport = lport;
	connp->conn_fport = 0;

	if (connp->conn_family == AF_INET) {
		ASSERT(connp->conn_ipversion == IPV4_VERSION);
	} else {
		ASSERT(connp->conn_ipversion == IPV6_VERSION);
	}

	icmp->icmp_state = TS_IDLE;

	/*
	 * We create an initial header template here to make a subsequent
	 * sendto have a starting point. Since conn_last_dst is zero the
	 * first sendto will always follow the 'dst changed' code path.
	 * Note that we defer massaging options and the related checksum
	 * adjustment until we have a destination address.
	 */
	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	if (error != 0) {
		/* late_error takes conn_lock itself, so release it first */
		mutex_exit(&connp->conn_lock);
		goto late_error;
	}
	/* Just in case */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	/* conn_lock is not held across the fanout insertion */
	error = ip_laddr_fanout_insert(connp);
	if (error != 0)
		goto late_error;

	/* Bind succeeded */
	return (0);

late_error:
	/* Undo the bind: zero the addresses and port, return to TS_UNBND */
	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}
	icmp->icmp_state = TS_UNBND;
	connp->conn_v6lastdst = ipv6_all_zeros;
	connp->conn_lport = 0;

	/* Restore the header that was built above - different source address */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	return (error);
}
    543 
    544 /*
    545  * Tell IP to just bind to the protocol.
    546  */
    547 static void
    548 icmp_bind_proto(icmp_t *icmp)
    549 {
    550 	conn_t	*connp = icmp->icmp_connp;
    551 
    552 	mutex_enter(&connp->conn_lock);
    553 	connp->conn_saddr_v6 = ipv6_all_zeros;
    554 	connp->conn_laddr_v6 = ipv6_all_zeros;
    555 	connp->conn_faddr_v6 = ipv6_all_zeros;
    556 	connp->conn_v6lastdst = ipv6_all_zeros;
    557 	mutex_exit(&connp->conn_lock);
    558 
    559 	(void) ip_laddr_fanout_insert(connp);
    560 }
    561 
    562 /*
    563  * This routine handles each T_CONN_REQ message passed to icmp.  It
    564  * associates a default destination address with the stream.
    565  *
    566  * After various error checks are completed, icmp_connect() lays
    567  * the target address and port into the composite header template.
    568  * Then we ask IP for information, including a source address if we didn't
    569  * already have one. Finally we send up the T_OK_ACK reply message.
    570  */
    571 static void
    572 icmp_tpi_connect(queue_t *q, mblk_t *mp)
    573 {
    574 	conn_t	*connp = Q_TO_CONN(q);
    575 	struct T_conn_req	*tcr;
    576 	struct sockaddr *sa;
    577 	socklen_t len;
    578 	int error;
    579 	cred_t *cr;
    580 	pid_t pid;
    581 	/*
    582 	 * All Solaris components should pass a db_credp
    583 	 * for this TPI message, hence we ASSERT.
    584 	 * But in case there is some other M_PROTO that looks
    585 	 * like a TPI message sent by some other kernel
    586 	 * component, we check and return an error.
    587 	 */
    588 	cr = msg_getcred(mp, &pid);
    589 	ASSERT(cr != NULL);
    590 	if (cr == NULL) {
    591 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
    592 		return;
    593 	}
    594 
    595 	tcr = (struct T_conn_req *)mp->b_rptr;
    596 	/* Sanity checks */
    597 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
    598 		icmp_err_ack(q, mp, TPROTO, 0);
    599 		return;
    600 	}
    601 
    602 	if (tcr->OPT_length != 0) {
    603 		icmp_err_ack(q, mp, TBADOPT, 0);
    604 		return;
    605 	}
    606 
    607 	len = tcr->DEST_length;
    608 
    609 	switch (len) {
    610 	default:
    611 		icmp_err_ack(q, mp, TBADADDR, 0);
    612 		return;
    613 	case sizeof (sin_t):
    614 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
    615 		    sizeof (sin_t));
    616 		break;
    617 	case sizeof (sin6_t):
    618 		sa = (struct sockaddr *)mi_offset_param(mp,
    619 		    tcr->DEST_offset, sizeof (sin6_t));
    620 		break;
    621 	}
    622 
    623 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
    624 	if (error != 0) {
    625 		icmp_err_ack(q, mp, TSYSERR, error);
    626 		return;
    627 	}
    628 
    629 	error = rawip_do_connect(connp, sa, len, cr, pid);
    630 	if (error != 0) {
    631 		if (error < 0) {
    632 			icmp_err_ack(q, mp, -error, 0);
    633 		} else {
    634 			icmp_err_ack(q, mp, 0, error);
    635 		}
    636 	} else {
    637 		mblk_t *mp1;
    638 
    639 		/*
    640 		 * We have to send a connection confirmation to
    641 		 * keep TLI happy.
    642 		 */
    643 		if (connp->conn_family == AF_INET) {
    644 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
    645 			    sizeof (sin_t), NULL, 0);
    646 		} else {
    647 			ASSERT(connp->conn_family == AF_INET6);
    648 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
    649 			    sizeof (sin6_t), NULL, 0);
    650 		}
    651 		if (mp1 == NULL) {
    652 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    653 			return;
    654 		}
    655 
    656 		/*
    657 		 * Send ok_ack for T_CONN_REQ
    658 		 */
    659 		mp = mi_tpi_ok_ack_alloc(mp);
    660 		if (mp == NULL) {
    661 			/* Unable to reuse the T_CONN_REQ for the ack. */
    662 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
    663 			return;
    664 		}
    665 		putnext(connp->conn_rq, mp);
    666 		putnext(connp->conn_rq, mp1);
    667 	}
    668 }
    669 
    670 static int
    671 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
    672     cred_t *cr, pid_t pid)
    673 {
    674 	icmp_t		*icmp;
    675 	sin_t		*sin;
    676 	sin6_t		*sin6;
    677 	int		error;
    678 	uint16_t 	dstport;
    679 	ipaddr_t	v4dst;
    680 	in6_addr_t	v6dst;
    681 	uint32_t	flowinfo;
    682 	ip_xmit_attr_t	*ixa;
    683 	uint_t		scopeid = 0;
    684 	uint_t		srcid = 0;
    685 	in6_addr_t	v6src = connp->conn_saddr_v6;
    686 
    687 	icmp = connp->conn_icmp;
    688 
    689 	if (sa == NULL || !OK_32PTR((char *)sa)) {
    690 		return (EINVAL);
    691 	}
    692 
    693 	ASSERT(sa != NULL && len != 0);
    694 
    695 	/*
    696 	 * Determine packet type based on type of address passed in
    697 	 * the request should contain an IPv4 or IPv6 address.
    698 	 * Make sure that address family matches the type of
    699 	 * family of the address passed down.
    700 	 */
    701 	switch (len) {
    702 	case sizeof (sin_t):
    703 		sin = (sin_t *)sa;
    704 
    705 		v4dst = sin->sin_addr.s_addr;
    706 		dstport = sin->sin_port;
    707 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
    708 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
    709 		break;
    710 
    711 	case sizeof (sin6_t):
    712 		sin6 = (sin6_t *)sa;
    713 
    714 		/* No support for mapped addresses on raw sockets */
    715 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
    716 			return (EADDRNOTAVAIL);
    717 		}
    718 		v6dst = sin6->sin6_addr;
    719 		dstport = sin6->sin6_port;
    720 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
    721 		flowinfo = sin6->sin6_flowinfo;
    722 		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
    723 			scopeid = sin6->sin6_scope_id;
    724 		srcid = sin6->__sin6_src_id;
    725 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
    726 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
    727 			    connp->conn_netstack);
    728 		}
    729 		break;
    730 	}
    731 
    732 	/*
    733 	 * If there is a different thread using conn_ixa then we get a new
    734 	 * copy and cut the old one loose from conn_ixa. Otherwise we use
    735 	 * conn_ixa and prevent any other thread from using/changing it.
    736 	 * Once connect() is done other threads can use conn_ixa since the
    737 	 * refcnt will be back at one.
    738 	 */
    739 	ixa = conn_get_ixa(connp, B_TRUE);
    740 	if (ixa == NULL)
    741 		return (ENOMEM);
    742 
    743 	ASSERT(ixa->ixa_refcnt >= 2);
    744 	ASSERT(ixa == connp->conn_ixa);
    745 
    746 	mutex_enter(&connp->conn_lock);
    747 	/*
    748 	 * This icmp_t must have bound already before doing a connect.
    749 	 * Reject if a connect is in progress (we drop conn_lock during
    750 	 * rawip_do_connect).
    751 	 */
    752 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
    753 		mutex_exit(&connp->conn_lock);
    754 		ixa_refrele(ixa);
    755 		return (-TOUTSTATE);
    756 	}
    757 
    758 	if (icmp->icmp_state == TS_DATA_XFER) {
    759 		/* Already connected - clear out state */
    760 		if (connp->conn_mcbc_bind)
    761 			connp->conn_saddr_v6 = ipv6_all_zeros;
    762 		else
    763 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
    764 		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
    765 		connp->conn_faddr_v6 = ipv6_all_zeros;
    766 		icmp->icmp_state = TS_IDLE;
    767 	}
    768 
    769 	/*
    770 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
    771 	 * with IPPROTO_TCP.
    772 	 */
    773 	connp->conn_fport = dstport;
    774 	if (connp->conn_ipversion == IPV4_VERSION) {
    775 		/*
    776 		 * Interpret a zero destination to mean loopback.
    777 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
    778 		 * generate the T_CONN_CON.
    779 		 */
    780 		if (v4dst == INADDR_ANY) {
    781 			v4dst = htonl(INADDR_LOOPBACK);
    782 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
    783 			ASSERT(connp->conn_family == AF_INET);
    784 			sin->sin_addr.s_addr = v4dst;
    785 		}
    786 		connp->conn_faddr_v6 = v6dst;
    787 		connp->conn_flowinfo = 0;
    788 	} else {
    789 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
    790 		/*
    791 		 * Interpret a zero destination to mean loopback.
    792 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
    793 		 * generate the T_CONN_CON.
    794 		 */
    795 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
    796 			v6dst = ipv6_loopback;
    797 			sin6->sin6_addr = v6dst;
    798 		}
    799 		connp->conn_faddr_v6 = v6dst;
    800 		connp->conn_flowinfo = flowinfo;
    801 	}
    802 
    803 	ixa->ixa_cred = cr;
    804 	ixa->ixa_cpid = pid;
    805 	if (is_system_labeled()) {
    806 		/* We need to restart with a label based on the cred */
    807 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
    808 	}
    809 
    810 	if (scopeid != 0) {
    811 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
    812 		ixa->ixa_scopeid = scopeid;
    813 		connp->conn_incoming_ifindex = scopeid;
    814 	} else {
    815 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
    816 		connp->conn_incoming_ifindex = connp->conn_bound_if;
    817 	}
    818 
    819 	/*
    820 	 * conn_connect will drop conn_lock and reacquire it.
    821 	 * To prevent a send* from messing with this icmp_t while the lock
    822 	 * is dropped we set icmp_state and clear conn_v6lastdst.
    823 	 * That will make all send* fail with EISCONN.
    824 	 */
    825 	connp->conn_v6lastdst = ipv6_all_zeros;
    826 	icmp->icmp_state = TS_WCON_CREQ;
    827 
    828 	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
    829 	mutex_exit(&connp->conn_lock);
    830 	if (error != 0)
    831 		goto connect_failed;
    832 
    833 	/*
    834 	 * The addresses have been verified. Time to insert in
    835 	 * the correct fanout list.
    836 	 */
    837 	error = ipcl_conn_insert(connp);
    838 	if (error != 0)
    839 		goto connect_failed;
    840 
    841 	mutex_enter(&connp->conn_lock);
    842 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
    843 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
    844 	if (error != 0) {
    845 		mutex_exit(&connp->conn_lock);
    846 		goto connect_failed;
    847 	}
    848 
    849 	icmp->icmp_state = TS_DATA_XFER;
    850 	/* Record this as the "last" send even though we haven't sent any */
    851 	connp->conn_v6lastdst = connp->conn_faddr_v6;
    852 	connp->conn_lastipversion = connp->conn_ipversion;
    853 	connp->conn_lastdstport = connp->conn_fport;
    854 	connp->conn_lastflowinfo = connp->conn_flowinfo;
    855 	connp->conn_lastscopeid = scopeid;
    856 	connp->conn_lastsrcid = srcid;
    857 	/* Also remember a source to use together with lastdst */
    858 	connp->conn_v6lastsrc = v6src;
    859 	mutex_exit(&connp->conn_lock);
    860 
    861 	ixa_refrele(ixa);
    862 	return (0);
    863 
    864 connect_failed:
    865 	if (ixa != NULL)
    866 		ixa_refrele(ixa);
    867 	mutex_enter(&connp->conn_lock);
    868 	icmp->icmp_state = TS_IDLE;
    869 	/* In case the source address was set above */
    870 	if (connp->conn_mcbc_bind)
    871 		connp->conn_saddr_v6 = ipv6_all_zeros;
    872 	else
    873 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
    874 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
    875 	connp->conn_faddr_v6 = ipv6_all_zeros;
    876 	connp->conn_v6lastdst = ipv6_all_zeros;
    877 	connp->conn_flowinfo = 0;
    878 
    879 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
    880 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
    881 	mutex_exit(&connp->conn_lock);
    882 	return (error);
    883 }
    884 
    885 static void
    886 rawip_do_close(conn_t *connp)
    887 {
    888 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
    889 
    890 	ip_quiesce_conn(connp);
    891 
    892 	if (!IPCL_IS_NONSTR(connp)) {
    893 		qprocsoff(connp->conn_rq);
    894 	}
    895 
    896 	icmp_close_free(connp);
    897 
    898 	/*
    899 	 * Now we are truly single threaded on this stream, and can
    900 	 * delete the things hanging off the connp, and finally the connp.
    901 	 * We removed this connp from the fanout list, it cannot be
    902 	 * accessed thru the fanouts, and we already waited for the
    903 	 * conn_ref to drop to 0. We are already in close, so
    904 	 * there cannot be any other thread from the top. qprocsoff
    905 	 * has completed, and service has completed or won't run in
    906 	 * future.
    907 	 */
    908 	ASSERT(connp->conn_ref == 1);
    909 
    910 	if (!IPCL_IS_NONSTR(connp)) {
    911 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
    912 	} else {
    913 		ip_free_helper_stream(connp);
    914 	}
    915 
    916 	connp->conn_ref--;
    917 	ipcl_conn_destroy(connp);
    918 }
    919 
    920 static int
    921 icmp_close(queue_t *q, int flags)
    922 {
    923 	conn_t  *connp;
    924 
    925 	if (flags & SO_FALLBACK) {
    926 		/*
    927 		 * stream is being closed while in fallback
    928 		 * simply free the resources that were allocated
    929 		 */
    930 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
    931 		qprocsoff(q);
    932 		goto done;
    933 	}
    934 
    935 	connp = Q_TO_CONN(q);
    936 	(void) rawip_do_close(connp);
    937 done:
    938 	q->q_ptr = WR(q)->q_ptr = NULL;
    939 	return (0);
    940 }
    941 
    942 static void
    943 icmp_close_free(conn_t *connp)
    944 {
    945 	icmp_t *icmp = connp->conn_icmp;
    946 
    947 	if (icmp->icmp_filter != NULL) {
    948 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
    949 		icmp->icmp_filter = NULL;
    950 	}
    951 
    952 	/*
    953 	 * Clear any fields which the kmem_cache constructor clears.
    954 	 * Only icmp_connp needs to be preserved.
    955 	 * TBD: We should make this more efficient to avoid clearing
    956 	 * everything.
    957 	 */
    958 	ASSERT(icmp->icmp_connp == connp);
    959 	bzero(icmp, sizeof (icmp_t));
    960 	icmp->icmp_connp = connp;
    961 }
    962 
    963 /*
    964  * This routine handles each T_DISCON_REQ message passed to icmp
    965  * as an indicating that ICMP is no longer connected. This results
    966  * in telling IP to restore the binding to just the local address.
    967  */
    968 static int
    969 icmp_do_disconnect(conn_t *connp)
    970 {
    971 	icmp_t	*icmp = connp->conn_icmp;
    972 	int	error;
    973 
    974 	mutex_enter(&connp->conn_lock);
    975 	if (icmp->icmp_state != TS_DATA_XFER) {
    976 		mutex_exit(&connp->conn_lock);
    977 		return (-TOUTSTATE);
    978 	}
    979 	if (connp->conn_mcbc_bind)
    980 		connp->conn_saddr_v6 = ipv6_all_zeros;
    981 	else
    982 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
    983 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
    984 	connp->conn_faddr_v6 = ipv6_all_zeros;
    985 	icmp->icmp_state = TS_IDLE;
    986 
    987 	connp->conn_v6lastdst = ipv6_all_zeros;
    988 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
    989 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
    990 	mutex_exit(&connp->conn_lock);
    991 	if (error != 0)
    992 		return (error);
    993 
    994 	/*
    995 	 * Tell IP to remove the full binding and revert
    996 	 * to the local address binding.
    997 	 */
    998 	return (ip_laddr_fanout_insert(connp));
    999 }
   1000 
   1001 static void
   1002 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
   1003 {
   1004 	conn_t	*connp = Q_TO_CONN(q);
   1005 	int	error;
   1006 
   1007 	/*
   1008 	 * Allocate the largest primitive we need to send back
   1009 	 * T_error_ack is > than T_ok_ack
   1010 	 */
   1011 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
   1012 	if (mp == NULL) {
   1013 		/* Unable to reuse the T_DISCON_REQ for the ack. */
   1014 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
   1015 		return;
   1016 	}
   1017 
   1018 	error = icmp_do_disconnect(connp);
   1019 
   1020 	if (error != 0) {
   1021 		if (error > 0) {
   1022 			icmp_err_ack(q, mp, 0, error);
   1023 		} else {
   1024 			icmp_err_ack(q, mp, -error, 0);
   1025 		}
   1026 	} else {
   1027 		mp = mi_tpi_ok_ack_alloc(mp);
   1028 		ASSERT(mp != NULL);
   1029 		qreply(q, mp);
   1030 	}
   1031 }
   1032 
   1033 static int
   1034 icmp_disconnect(conn_t *connp)
   1035 {
   1036 	int	error;
   1037 
   1038 	connp->conn_dgram_errind = B_FALSE;
   1039 
   1040 	error = icmp_do_disconnect(connp);
   1041 
   1042 	if (error < 0)
   1043 		error = proto_tlitosyserr(-error);
   1044 	return (error);
   1045 }
   1046 
   1047 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
   1048 static void
   1049 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
   1050 {
   1051 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
   1052 		qreply(q, mp);
   1053 }
   1054 
   1055 /* Shorthand to generate and send TPI error acks to our client */
   1056 static void
   1057 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
   1058     t_scalar_t t_error, int sys_error)
   1059 {
   1060 	struct T_error_ack	*teackp;
   1061 
   1062 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
   1063 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
   1064 		teackp = (struct T_error_ack *)mp->b_rptr;
   1065 		teackp->ERROR_prim = primitive;
   1066 		teackp->TLI_error = t_error;
   1067 		teackp->UNIX_error = sys_error;
   1068 		qreply(q, mp);
   1069 	}
   1070 }
   1071 
   1072 /*
   1073  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
   1074  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
   1075  * Assumes that IP has pulled up everything up to and including the ICMP header.
   1076  */
   1077 /* ARGSUSED2 */
   1078 static void
   1079 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   1080 {
   1081 	conn_t		*connp = (conn_t *)arg1;
   1082 	icmp_t		*icmp = connp->conn_icmp;
   1083 	icmph_t		*icmph;
   1084 	ipha_t		*ipha;
   1085 	int		iph_hdr_length;
   1086 	sin_t		sin;
   1087 	mblk_t		*mp1;
   1088 	int		error = 0;
   1089 
   1090 	ipha = (ipha_t *)mp->b_rptr;
   1091 
   1092 	ASSERT(OK_32PTR(mp->b_rptr));
   1093 
   1094 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
   1095 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
   1096 		icmp_icmp_error_ipv6(connp, mp, ira);
   1097 		return;
   1098 	}
   1099 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
   1100 
   1101 	/* Skip past the outer IP and ICMP headers */
   1102 	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
   1103 	iph_hdr_length = ira->ira_ip_hdr_length;
   1104 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1105 	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
   1106 
   1107 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   1108 
   1109 	switch (icmph->icmph_type) {
   1110 	case ICMP_DEST_UNREACHABLE:
   1111 		switch (icmph->icmph_code) {
   1112 		case ICMP_FRAGMENTATION_NEEDED: {
   1113 			ipha_t		*ipha;
   1114 			ip_xmit_attr_t	*ixa;
   1115 			/*
   1116 			 * IP has already adjusted the path MTU.
   1117 			 * But we need to adjust DF for IPv4.
   1118 			 */
   1119 			if (connp->conn_ipversion != IPV4_VERSION)
   1120 				break;
   1121 
   1122 			ixa = conn_get_ixa(connp, B_FALSE);
   1123 			if (ixa == NULL || ixa->ixa_ire == NULL) {
   1124 				/*
   1125 				 * Some other thread holds conn_ixa. We will
   1126 				 * redo this on the next ICMP too big.
   1127 				 */
   1128 				if (ixa != NULL)
   1129 					ixa_refrele(ixa);
   1130 				break;
   1131 			}
   1132 			(void) ip_get_pmtu(ixa);
   1133 
   1134 			mutex_enter(&connp->conn_lock);
   1135 			ipha = (ipha_t *)connp->conn_ht_iphc;
   1136 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
   1137 				ipha->ipha_fragment_offset_and_flags |=
   1138 				    IPH_DF_HTONS;
   1139 			} else {
   1140 				ipha->ipha_fragment_offset_and_flags &=
   1141 				    ~IPH_DF_HTONS;
   1142 			}
   1143 			mutex_exit(&connp->conn_lock);
   1144 			ixa_refrele(ixa);
   1145 			break;
   1146 		}
   1147 		case ICMP_PORT_UNREACHABLE:
   1148 		case ICMP_PROTOCOL_UNREACHABLE:
   1149 			error = ECONNREFUSED;
   1150 			break;
   1151 		default:
   1152 			/* Transient errors */
   1153 			break;
   1154 		}
   1155 		break;
   1156 	default:
   1157 		/* Transient errors */
   1158 		break;
   1159 	}
   1160 	if (error == 0) {
   1161 		freemsg(mp);
   1162 		return;
   1163 	}
   1164 
   1165 	/*
   1166 	 * Deliver T_UDERROR_IND when the application has asked for it.
   1167 	 * The socket layer enables this automatically when connected.
   1168 	 */
   1169 	if (!connp->conn_dgram_errind) {
   1170 		freemsg(mp);
   1171 		return;
   1172 	}
   1173 
   1174 	sin = sin_null;
   1175 	sin.sin_family = AF_INET;
   1176 	sin.sin_addr.s_addr = ipha->ipha_dst;
   1177 
   1178 	if (IPCL_IS_NONSTR(connp)) {
   1179 		mutex_enter(&connp->conn_lock);
   1180 		if (icmp->icmp_state == TS_DATA_XFER) {
   1181 			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
   1182 				mutex_exit(&connp->conn_lock);
   1183 				(*connp->conn_upcalls->su_set_error)
   1184 				    (connp->conn_upper_handle, error);
   1185 				goto done;
   1186 			}
   1187 		} else {
   1188 			icmp->icmp_delayed_error = error;
   1189 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
   1190 		}
   1191 		mutex_exit(&connp->conn_lock);
   1192 	} else {
   1193 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
   1194 		    error);
   1195 		if (mp1 != NULL)
   1196 			putnext(connp->conn_rq, mp1);
   1197 	}
   1198 done:
   1199 	freemsg(mp);
   1200 }
   1201 
/*
 * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 *
 * For ICMP6_PACKET_TOO_BIG, when conn_ipv6_recvpathmtu is set, an empty
 * T_UNITDATA_IND carrying an IPV6_PATHMTU option is passed to
 * icmp_ulp_recv() instead.  The incoming message mp is freed on every path.
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length that IP recorded in ira */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/* Drop the message if the inner header chain cannot be parsed */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* Note: shadows the function-level sin6 inside this case */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		/* Layout: T_unitdata_ind, sin6_t source, T_opthdr, mtuinfo */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		/* Source address is the conn's foreign address */
		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		/* MTU info: inner packet's destination and the reported MTU */
		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	/* Nothing permanent to report */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is reported against the inner packet's destination */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				/* Drop the lock before the upcall */
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: remember the error and its address */
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}
   1371 
   1372 /*
   1373  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
   1374  * The local address is filled in if endpoint is bound. The remote address
   1375  * is filled in if remote address has been precified ("connected endpoint")
   1376  * (The concept of connected CLTS sockets is alien to published TPI
   1377  *  but we support it anyway).
   1378  */
   1379 static void
   1380 icmp_addr_req(queue_t *q, mblk_t *mp)
   1381 {
   1382 	struct sockaddr *sa;
   1383 	mblk_t	*ackmp;
   1384 	struct T_addr_ack *taa;
   1385 	icmp_t	*icmp = Q_TO_ICMP(q);
   1386 	conn_t	*connp = icmp->icmp_connp;
   1387 	uint_t	addrlen;
   1388 
   1389 	/* Make it large enough for worst case */
   1390 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
   1391 	    2 * sizeof (sin6_t), 1);
   1392 	if (ackmp == NULL) {
   1393 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
   1394 		return;
   1395 	}
   1396 	taa = (struct T_addr_ack *)ackmp->b_rptr;
   1397 
   1398 	bzero(taa, sizeof (struct T_addr_ack));
   1399 	ackmp->b_wptr = (uchar_t *)&taa[1];
   1400 
   1401 	taa->PRIM_type = T_ADDR_ACK;
   1402 	ackmp->b_datap->db_type = M_PCPROTO;
   1403 
   1404 	if (connp->conn_family == AF_INET)
   1405 		addrlen = sizeof (sin_t);
   1406 	else
   1407 		addrlen = sizeof (sin6_t);
   1408 
   1409 	mutex_enter(&connp->conn_lock);
   1410 	/*
   1411 	 * Note: Following code assumes 32 bit alignment of basic
   1412 	 * data structures like sin_t and struct T_addr_ack.
   1413 	 */
   1414 	if (icmp->icmp_state != TS_UNBND) {
   1415 		/*
   1416 		 * Fill in local address first
   1417 		 */
   1418 		taa->LOCADDR_offset = sizeof (*taa);
   1419 		taa->LOCADDR_length = addrlen;
   1420 		sa = (struct sockaddr *)&taa[1];
   1421 		(void) conn_getsockname(connp, sa, &addrlen);
   1422 		ackmp->b_wptr += addrlen;
   1423 	}
   1424 	if (icmp->icmp_state == TS_DATA_XFER) {
   1425 		/*
   1426 		 * connected, fill remote address too
   1427 		 */
   1428 		taa->REMADDR_length = addrlen;
   1429 		/* assumed 32-bit alignment */
   1430 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
   1431 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
   1432 		(void) conn_getpeername(connp, sa, &addrlen);
   1433 		ackmp->b_wptr += addrlen;
   1434 	}
   1435 	mutex_exit(&connp->conn_lock);
   1436 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
   1437 	qreply(q, ackmp);
   1438 }
   1439 
   1440 static void
   1441 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
   1442 {
   1443 	conn_t		*connp = icmp->icmp_connp;
   1444 
   1445 	*tap = icmp_g_t_info_ack;
   1446 
   1447 	if (connp->conn_family == AF_INET6)
   1448 		tap->ADDR_size = sizeof (sin6_t);
   1449 	else
   1450 		tap->ADDR_size = sizeof (sin_t);
   1451 	tap->CURRENT_state = icmp->icmp_state;
   1452 	tap->OPT_size = icmp_max_optsize;
   1453 }
   1454 
   1455 static void
   1456 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
   1457     t_uscalar_t cap_bits1)
   1458 {
   1459 	tcap->CAP_bits1 = 0;
   1460 
   1461 	if (cap_bits1 & TC1_INFO) {
   1462 		icmp_copy_info(&tcap->INFO_ack, icmp);
   1463 		tcap->CAP_bits1 |= TC1_INFO;
   1464 	}
   1465 }
   1466 
   1467 /*
   1468  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
   1469  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
   1470  * icmp_g_t_info_ack.  The current state of the stream is copied from
   1471  * icmp_state.
   1472  */
   1473 static void
   1474 icmp_capability_req(queue_t *q, mblk_t *mp)
   1475 {
   1476 	icmp_t			*icmp = Q_TO_ICMP(q);
   1477 	t_uscalar_t		cap_bits1;
   1478 	struct T_capability_ack	*tcap;
   1479 
   1480 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
   1481 
   1482 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
   1483 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
   1484 	if (!mp)
   1485 		return;
   1486 
   1487 	tcap = (struct T_capability_ack *)mp->b_rptr;
   1488 
   1489 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
   1490 
   1491 	qreply(q, mp);
   1492 }
   1493 
   1494 /*
   1495  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
   1496  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
   1497  * The current state of the stream is copied from icmp_state.
   1498  */
   1499 static void
   1500 icmp_info_req(queue_t *q, mblk_t *mp)
   1501 {
   1502 	icmp_t	*icmp = Q_TO_ICMP(q);
   1503 
   1504 	/* Create a T_INFO_ACK message. */
   1505 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
   1506 	    T_INFO_ACK);
   1507 	if (!mp)
   1508 		return;
   1509 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
   1510 	qreply(q, mp);
   1511 }
   1512 
   1513 static int
   1514 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   1515     int family)
   1516 {
   1517 	conn_t *connp;
   1518 	dev_t	conn_dev;
   1519 	int	error;
   1520 
   1521 	/* If the stream is already open, return immediately. */
   1522 	if (q->q_ptr != NULL)
   1523 		return (0);
   1524 
   1525 	if (sflag == MODOPEN)
   1526 		return (EINVAL);
   1527 
   1528 	/*
   1529 	 * Since ICMP is not used so heavily, allocating from the small
   1530 	 * arena should be sufficient.
   1531 	 */
   1532 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   1533 		return (EBUSY);
   1534 	}
   1535 
   1536 	if (flag & SO_FALLBACK) {
   1537 		/*
   1538 		 * Non streams socket needs a stream to fallback to
   1539 		 */
   1540 		RD(q)->q_ptr = (void *)conn_dev;
   1541 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
   1542 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
   1543 		qprocson(q);
   1544 		return (0);
   1545 	}
   1546 
   1547 	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
   1548 	if (connp == NULL) {
   1549 		ASSERT(error != 0);
   1550 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
   1551 		return (error);
   1552 	}
   1553 
   1554 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
   1555 	connp->conn_dev = conn_dev;
   1556 	connp->conn_minor_arena = ip_minor_arena_sa;
   1557 
   1558 	/*
   1559 	 * Initialize the icmp_t structure for this stream.
   1560 	 */
   1561 	q->q_ptr = connp;
   1562 	WR(q)->q_ptr = connp;
   1563 	connp->conn_rq = q;
   1564 	connp->conn_wq = WR(q);
   1565 
   1566 	WR(q)->q_hiwat = connp->conn_sndbuf;
   1567 	WR(q)->q_lowat = connp->conn_sndlowat;
   1568 
   1569 	qprocson(q);
   1570 
   1571 	/* Set the Stream head write offset. */
   1572 	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
   1573 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
   1574 
   1575 	mutex_enter(&connp->conn_lock);
   1576 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   1577 	mutex_exit(&connp->conn_lock);
   1578 
   1579 	icmp_bind_proto(connp->conn_icmp);
   1580 
   1581 	return (0);
   1582 }
   1583 
   1584 /* For /dev/icmp aka AF_INET open */
   1585 static int
   1586 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   1587 {
   1588 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
   1589 }
   1590 
   1591 /* For /dev/icmp6 aka AF_INET6 open */
   1592 static int
   1593 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   1594 {
   1595 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
   1596 }
   1597 
   1598 /*
   1599  * This is the open routine for icmp.  It allocates a icmp_t structure for
   1600  * the stream and, on the first open of the module, creates an ND table.
   1601  */
   1602 static conn_t *
   1603 rawip_do_open(int family, cred_t *credp, int *err, int flags)
   1604 {
   1605 	icmp_t	*icmp;
   1606 	conn_t *connp;
   1607 	zoneid_t zoneid;
   1608 	netstack_t *ns;
   1609 	icmp_stack_t *is;
   1610 	int len;
   1611 	boolean_t isv6 = B_FALSE;
   1612 
   1613 	*err = secpolicy_net_icmpaccess(credp);
   1614 	if (*err != 0)
   1615 		return (NULL);
   1616 
   1617 	if (family == AF_INET6)
   1618 		isv6 = B_TRUE;
   1619 
   1620 	ns = netstack_find_by_cred(credp);
   1621 	ASSERT(ns != NULL);
   1622 	is = ns->netstack_icmp;
   1623 	ASSERT(is != NULL);
   1624 
   1625 	/*
   1626 	 * For exclusive stacks we set the zoneid to zero
   1627 	 * to make ICMP operate as if in the global zone.
   1628 	 */
   1629 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
   1630 		zoneid = GLOBAL_ZONEID;
   1631 	else
   1632 		zoneid = crgetzoneid(credp);
   1633 
   1634 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
   1635 
   1636 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
   1637 	icmp = connp->conn_icmp;
   1638 
   1639 	/*
   1640 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
   1641 	 * done by netstack_find_by_cred()
   1642 	 */
   1643 	netstack_rele(ns);
   1644 
   1645 	/*
   1646 	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
   1647 	 * need to lock anything.
   1648 	 */
   1649 	ASSERT(connp->conn_proto == IPPROTO_ICMP);
   1650 	ASSERT(connp->conn_icmp == icmp);
   1651 	ASSERT(icmp->icmp_connp == connp);
   1652 
   1653 	/* Set the initial state of the stream and the privilege status. */
   1654 	icmp->icmp_state = TS_UNBND;
   1655 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
   1656 	if (isv6) {
   1657 		connp->conn_family = AF_INET6;
   1658 		connp->conn_ipversion = IPV6_VERSION;
   1659 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
   1660 		connp->conn_proto = IPPROTO_ICMPV6;
   1661 		/* May be changed by a SO_PROTOTYPE socket option. */
   1662 		connp->conn_proto = IPPROTO_ICMPV6;
   1663 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
   1664 		connp->conn_ixa->ixa_raw_cksum_offset = 2;
   1665 		connp->conn_default_ttl = is->is_ipv6_hoplimit;
   1666 		len = sizeof (ip6_t);
   1667 	} else {
   1668 		connp->conn_family = AF_INET;
   1669 		connp->conn_ipversion = IPV4_VERSION;
   1670 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
   1671 		/* May be changed by a SO_PROTOTYPE socket option. */
   1672 		connp->conn_proto = IPPROTO_ICMP;
   1673 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
   1674 		connp->conn_default_ttl = is->is_ipv4_ttl;
   1675 		len = sizeof (ipha_t);
   1676 	}
   1677 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
   1678 
   1679 	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   1680 
   1681 	/*
   1682 	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
   1683 	 * the checksum is provided in the pre-built packet. We clear
   1684 	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
   1685 	 * complete IP header and not to compute the transport checksum.
   1686 	 */
   1687 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
   1688 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   1689 	connp->conn_ixa->ixa_zoneid = zoneid;
   1690 
   1691 	connp->conn_zoneid = zoneid;
   1692 
   1693 	/*
   1694 	 * If the caller has the process-wide flag set, then default to MAC
   1695 	 * exempt mode.  This allows read-down to unlabeled hosts.
   1696 	 */
   1697 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   1698 		connp->conn_mac_mode = CONN_MAC_AWARE;
   1699 
   1700 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   1701 
   1702 	icmp->icmp_is = is;
   1703 
   1704 	connp->conn_rcvbuf = is->is_recv_hiwat;
   1705 	connp->conn_sndbuf = is->is_xmit_hiwat;
   1706 	connp->conn_sndlowat = is->is_xmit_lowat;
   1707 	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
   1708 
   1709 	connp->conn_wroff = len + is->is_wroff_extra;
   1710 	connp->conn_so_type = SOCK_RAW;
   1711 
   1712 	connp->conn_recv = icmp_input;
   1713 	connp->conn_recvicmp = icmp_icmp_input;
   1714 	crhold(credp);
   1715 	connp->conn_cred = credp;
   1716 	connp->conn_cpid = curproc->p_pid;
   1717 	connp->conn_open_time = ddi_get_lbolt64();
   1718 	/* Cache things in ixa without an extra refhold */
   1719 	connp->conn_ixa->ixa_cred = connp->conn_cred;
   1720 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
   1721 	if (is_system_labeled())
   1722 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
   1723 
   1724 	connp->conn_flow_cntrld = B_FALSE;
   1725 
   1726 	if (is->is_pmtu_discovery)
   1727 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
   1728 
   1729 	return (connp);
   1730 }
   1731 
/*
 * Which ICMP options OK to set through T_UNITDATA_REQ...
 *
 * Both arguments are ignored: every (level, name) pair is accepted and
 * B_TRUE is always returned.
 */
/* ARGSUSED */
static boolean_t
icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
{
	return (B_TRUE);
}
   1741 
   1742 /*
   1743  * This routine gets default values of certain options whose default
   1744  * values are maintained by protcol specific code
   1745  */
   1746 int
   1747 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
   1748 {
   1749 	icmp_t *icmp = Q_TO_ICMP(q);
   1750 	icmp_stack_t *is = icmp->icmp_is;
   1751 	int *i1 = (int *)ptr;
   1752 
   1753 	switch (level) {
   1754 	case IPPROTO_IP:
   1755 		switch (name) {
   1756 		case IP_MULTICAST_TTL:
   1757 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
   1758 			return (sizeof (uchar_t));
   1759 		case IP_MULTICAST_LOOP:
   1760 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
   1761 			return (sizeof (uchar_t));
   1762 		}
   1763 		break;
   1764 	case IPPROTO_IPV6:
   1765 		switch (name) {
   1766 		case IPV6_MULTICAST_HOPS:
   1767 			*i1 = IP_DEFAULT_MULTICAST_TTL;
   1768 			return (sizeof (int));
   1769 		case IPV6_MULTICAST_LOOP:
   1770 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
   1771 			return (sizeof (int));
   1772 		case IPV6_UNICAST_HOPS:
   1773 			*i1 = is->is_ipv6_hoplimit;
   1774 			return (sizeof (int));
   1775 		}
   1776 		break;
   1777 	case IPPROTO_ICMPV6:
   1778 		switch (name) {
   1779 		case ICMP6_FILTER:
   1780 			/* Make it look like "pass all" */
   1781 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
   1782 			return (sizeof (icmp6_filter_t));
   1783 		}
   1784 		break;
   1785 	}
   1786 	return (-1);
   1787 }
   1788 
   1789 /*
   1790  * This routine retrieves the current status of socket options.
   1791  * It returns the size of the option retrieved, or -1.
   1792  */
   1793 int
   1794 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
   1795 {
   1796 	icmp_t		*icmp = connp->conn_icmp;
   1797 	int		*i1 = (int *)ptr;
   1798 	conn_opt_arg_t	coas;
   1799 	int		retval;
   1800 
   1801 	coas.coa_connp = connp;
   1802 	coas.coa_ixa = connp->conn_ixa;
   1803 	coas.coa_ipp = &connp->conn_xmit_ipp;
   1804 	coas.coa_ancillary = B_FALSE;
   1805 	coas.coa_changed = 0;
   1806 
   1807 	/*
   1808 	 * We assume that the optcom framework has checked for the set
   1809 	 * of levels and names that are supported, hence we don't worry
   1810 	 * about rejecting based on that.
   1811 	 * First check for ICMP specific handling, then pass to common routine.
   1812 	 */
   1813 	switch (level) {
   1814 	case IPPROTO_IP:
   1815 		/*
   1816 		 * Only allow IPv4 option processing on IPv4 sockets.
   1817 		 */
   1818 		if (connp->conn_family != AF_INET)
   1819 			return (-1);
   1820 
   1821 		switch (name) {
   1822 		case IP_OPTIONS:
   1823 		case T_IP_OPTIONS:
   1824 			/* Options are passed up with each packet */
   1825 			return (0);
   1826 		case IP_HDRINCL:
   1827 			mutex_enter(&connp->conn_lock);
   1828 			*i1 = (int)icmp->icmp_hdrincl;
   1829 			mutex_exit(&connp->conn_lock);
   1830 			return (sizeof (int));
   1831 		}
   1832 		break;
   1833 
   1834 	case IPPROTO_IPV6:
   1835 		/*
   1836 		 * Only allow IPv6 option processing on native IPv6 sockets.
   1837 		 */
   1838 		if (connp->conn_family != AF_INET6)
   1839 			return (-1);
   1840 
   1841 		switch (name) {
   1842 		case IPV6_CHECKSUM:
   1843 			/*
   1844 			 * Return offset or -1 if no checksum offset.
   1845 			 * Does not apply to IPPROTO_ICMPV6
   1846 			 */
   1847 			if (connp->conn_proto == IPPROTO_ICMPV6)
   1848 				return (-1);
   1849 
   1850 			mutex_enter(&connp->conn_lock);
   1851 			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
   1852 				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
   1853 			else
   1854 				*i1 = -1;
   1855 			mutex_exit(&connp->conn_lock);
   1856 			return (sizeof (int));
   1857 		}
   1858 		break;
   1859 
   1860 	case IPPROTO_ICMPV6:
   1861 		/*
   1862 		 * Only allow IPv6 option processing on native IPv6 sockets.
   1863 		 */
   1864 		if (connp->conn_family != AF_INET6)
   1865 			return (-1);
   1866 
   1867 		if (connp->conn_proto != IPPROTO_ICMPV6)
   1868 			return (-1);
   1869 
   1870 		switch (name) {
   1871 		case ICMP6_FILTER:
   1872 			mutex_enter(&connp->conn_lock);
   1873 			if (icmp->icmp_filter == NULL) {
   1874 				/* Make it look like "pass all" */
   1875 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
   1876 			} else {
   1877 				(void) bcopy(icmp->icmp_filter, ptr,
   1878 				    sizeof (icmp6_filter_t));
   1879 			}
   1880 			mutex_exit(&connp->conn_lock);
   1881 			return (sizeof (icmp6_filter_t));
   1882 		}
   1883 	}
   1884 	mutex_enter(&connp->conn_lock);
   1885 	retval = conn_opt_get(&coas, level, name, ptr);
   1886 	mutex_exit(&connp->conn_lock);
   1887 	return (retval);
   1888 }
   1889 
   1890 /*
   1891  * This routine retrieves the current status of socket options.
   1892  * It returns the size of the option retrieved, or -1.
   1893  */
   1894 int
   1895 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
   1896 {
   1897 	conn_t		*connp = Q_TO_CONN(q);
   1898 	int 		err;
   1899 
   1900 	err = icmp_opt_get(connp, level, name, ptr);
   1901 	return (err);
   1902 }
   1903 
   1904 /*
   1905  * This routine sets socket options.
   1906  */
   1907 int
   1908 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
   1909     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
   1910 {
   1911 	conn_t		*connp = coa->coa_connp;
   1912 	ip_xmit_attr_t	*ixa = coa->coa_ixa;
   1913 	icmp_t		*icmp = connp->conn_icmp;
   1914 	icmp_stack_t	*is = icmp->icmp_is;
   1915 	int		*i1 = (int *)invalp;
   1916 	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
   1917 	int		error;
   1918 
   1919 	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
   1920 
   1921 	/*
   1922 	 * For fixed length options, no sanity check
   1923 	 * of passed in length is done. It is assumed *_optcom_req()
   1924 	 * routines do the right thing.
   1925 	 */
   1926 
   1927 	switch (level) {
   1928 	case SOL_SOCKET:
   1929 		switch (name) {
   1930 		case SO_PROTOTYPE:
   1931 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
   1932 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
   1933 			    secpolicy_net_rawaccess(cr) != 0) {
   1934 				return (EACCES);
   1935 			}
   1936 			if (checkonly)
   1937 				break;
   1938 
   1939 			mutex_enter(&connp->conn_lock);
   1940 			connp->conn_proto = *i1 & 0xFF;
   1941 			ixa->ixa_protocol = connp->conn_proto;
   1942 			if ((connp->conn_proto == IPPROTO_RAW ||
   1943 			    connp->conn_proto == IPPROTO_IGMP) &&
   1944 			    connp->conn_family == AF_INET) {
   1945 				icmp->icmp_hdrincl = 1;
   1946 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
   1947 			} else if (connp->conn_proto == IPPROTO_UDP ||
   1948 			    connp->conn_proto == IPPROTO_TCP ||
   1949 			    connp->conn_proto == IPPROTO_SCTP) {
   1950 				/* Used by test applications like psh */
   1951 				icmp->icmp_hdrincl = 0;
   1952 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
   1953 			} else {
   1954 				icmp->icmp_hdrincl = 0;
   1955 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
   1956 			}
   1957 
   1958 			if (connp->conn_family == AF_INET6 &&
   1959 			    connp->conn_proto == IPPROTO_ICMPV6) {
   1960 				/* Set offset for icmp6_cksum */
   1961 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
   1962 				ixa->ixa_raw_cksum_offset = 2;
   1963 			}
   1964 			if (icmp->icmp_filter != NULL &&
   1965 			    connp->conn_proto != IPPROTO_ICMPV6) {
   1966 				kmem_free(icmp->icmp_filter,
   1967 				    sizeof (icmp6_filter_t));
   1968 				icmp->icmp_filter = NULL;
   1969 			}
   1970 			mutex_exit(&connp->conn_lock);
   1971 
   1972 			coa->coa_changed |= COA_HEADER_CHANGED;
   1973 			/*
   1974 			 * For SCTP, we don't use icmp_bind_proto() for
   1975 			 * raw socket binding.
   1976 			 */
   1977 			if (connp->conn_proto == IPPROTO_SCTP)
   1978 				return (0);
   1979 
   1980 			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
   1981 			return (0);
   1982 
   1983 		case SO_SNDBUF:
   1984 			if (*i1 > is->is_max_buf) {
   1985 				return (ENOBUFS);
   1986 			}
   1987 			break;
   1988 		case SO_RCVBUF:
   1989 			if (*i1 > is->is_max_buf) {
   1990 				return (ENOBUFS);
   1991 			}
   1992 			break;
   1993 		}
   1994 		break;
   1995 
   1996 	case IPPROTO_IP:
   1997 		/*
   1998 		 * Only allow IPv4 option processing on IPv4 sockets.
   1999 		 */
   2000 		if (connp->conn_family != AF_INET)
   2001 			return (EINVAL);
   2002 
   2003 		switch (name) {
   2004 		case IP_HDRINCL:
   2005 			if (!checkonly) {
   2006 				mutex_enter(&connp->conn_lock);
   2007 				icmp->icmp_hdrincl = onoff;
   2008 				if (onoff)
   2009 					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
   2010 				else
   2011 					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
   2012 				mutex_exit(&connp->conn_lock);
   2013 			}
   2014 			break;
   2015 		}
   2016 		break;
   2017 
   2018 	case IPPROTO_IPV6:
   2019 		if (connp->conn_family != AF_INET6)
   2020 			return (EINVAL);
   2021 
   2022 		switch (name) {
   2023 		case IPV6_CHECKSUM:
   2024 			/*
   2025 			 * Integer offset into the user data of where the
   2026 			 * checksum is located.
   2027 			 * Offset of -1 disables option.
   2028 			 * Does not apply to IPPROTO_ICMPV6.
   2029 			 */
   2030 			if (connp->conn_proto == IPPROTO_ICMPV6 ||
   2031 			    coa->coa_ancillary) {
   2032 				return (EINVAL);
   2033 			}
   2034 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
   2035 				/* Negative or not 16 bit aligned offset */
   2036 				return (EINVAL);
   2037 			}
   2038 			if (checkonly)
   2039 				break;
   2040 
   2041 			mutex_enter(&connp->conn_lock);
   2042 			if (*i1 == -1) {
   2043 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
   2044 				ixa->ixa_raw_cksum_offset = 0;
   2045 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
   2046 			} else {
   2047 				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
   2048 				ixa->ixa_raw_cksum_offset = *i1;
   2049 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
   2050 			}
   2051 			mutex_exit(&connp->conn_lock);
   2052 			break;
   2053 		}
   2054 		break;
   2055 
   2056 	case IPPROTO_ICMPV6:
   2057 		/*
   2058 		 * Only allow IPv6 option processing on IPv6 sockets.
   2059 		 */
   2060 		if (connp->conn_family != AF_INET6)
   2061 			return (EINVAL);
   2062 		if (connp->conn_proto != IPPROTO_ICMPV6)
   2063 			return (EINVAL);
   2064 
   2065 		switch (name) {
   2066 		case ICMP6_FILTER:
   2067 			if (checkonly)
   2068 				break;
   2069 
   2070 			if ((inlen != 0) &&
   2071 			    (inlen != sizeof (icmp6_filter_t)))
   2072 				return (EINVAL);
   2073 
   2074 			mutex_enter(&connp->conn_lock);
   2075 			if (inlen == 0) {
   2076 				if (icmp->icmp_filter != NULL) {
   2077 					kmem_free(icmp->icmp_filter,
   2078 					    sizeof (icmp6_filter_t));
   2079 					icmp->icmp_filter = NULL;
   2080 				}
   2081 			} else {
   2082 				if (icmp->icmp_filter == NULL) {
   2083 					icmp->icmp_filter = kmem_alloc(
   2084 					    sizeof (icmp6_filter_t),
   2085 					    KM_NOSLEEP);
   2086 					if (icmp->icmp_filter == NULL) {
   2087 						mutex_exit(&connp->conn_lock);
   2088 						return (ENOBUFS);
   2089 					}
   2090 				}
   2091 				(void) bcopy(invalp, icmp->icmp_filter, inlen);
   2092 			}
   2093 			mutex_exit(&connp->conn_lock);
   2094 			break;
   2095 		}
   2096 		break;
   2097 	}
   2098 	error = conn_opt_set(coa, level, name, inlen, invalp,
   2099 	    checkonly, cr);
   2100 	return (error);
   2101 }
   2102 
   2103 /*
   2104  * This routine sets socket options.
   2105  */
   2106 int
   2107 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
   2108     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
   2109     void *thisdg_attrs, cred_t *cr)
   2110 {
   2111 	icmp_t		*icmp = connp->conn_icmp;
   2112 	int		err;
   2113 	conn_opt_arg_t	coas, *coa;
   2114 	boolean_t	checkonly;
   2115 	icmp_stack_t	*is = icmp->icmp_is;
   2116 
   2117 	switch (optset_context) {
   2118 	case SETFN_OPTCOM_CHECKONLY:
   2119 		checkonly = B_TRUE;
   2120 		/*
   2121 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
   2122 		 * inlen != 0 implies value supplied and
   2123 		 * 	we have to "pretend" to set it.
   2124 		 * inlen == 0 implies that there is no
   2125 		 * 	value part in T_CHECK request and just validation
   2126 		 * done elsewhere should be enough, we just return here.
   2127 		 */
   2128 		if (inlen == 0) {
   2129 			*outlenp = 0;
   2130 			return (0);
   2131 		}
   2132 		break;
   2133 	case SETFN_OPTCOM_NEGOTIATE:
   2134 		checkonly = B_FALSE;
   2135 		break;
   2136 	case SETFN_UD_NEGOTIATE:
   2137 	case SETFN_CONN_NEGOTIATE:
   2138 		checkonly = B_FALSE;
   2139 		/*
   2140 		 * Negotiating local and "association-related" options
   2141 		 * through T_UNITDATA_REQ.
   2142 		 *
   2143 		 * Following routine can filter out ones we do not
   2144 		 * want to be "set" this way.
   2145 		 */
   2146 		if (!icmp_opt_allow_udr_set(level, name)) {
   2147 			*outlenp = 0;
   2148 			return (EINVAL);
   2149 		}
   2150 		break;
   2151 	default:
   2152 		/*
   2153 		 * We should never get here
   2154 		 */
   2155 		*outlenp = 0;
   2156 		return (EINVAL);
   2157 	}
   2158 
   2159 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
   2160 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
   2161 
   2162 	if (thisdg_attrs != NULL) {
   2163 		/* Options from T_UNITDATA_REQ */
   2164 		coa = (conn_opt_arg_t *)thisdg_attrs;
   2165 		ASSERT(coa->coa_connp == connp);
   2166 		ASSERT(coa->coa_ixa != NULL);
   2167 		ASSERT(coa->coa_ipp != NULL);
   2168 		ASSERT(coa->coa_ancillary);
   2169 	} else {
   2170 		coa = &coas;
   2171 		coas.coa_connp = connp;
   2172 		/* Get a reference on conn_ixa to prevent concurrent mods */
   2173 		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
   2174 		if (coas.coa_ixa == NULL) {
   2175 			*outlenp = 0;
   2176 			return (ENOMEM);
   2177 		}
   2178 		coas.coa_ipp = &connp->conn_xmit_ipp;
   2179 		coas.coa_ancillary = B_FALSE;
   2180 		coas.coa_changed = 0;
   2181 	}
   2182 
   2183 	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
   2184 	    cr, checkonly);
   2185 	if (err != 0) {
   2186 errout:
   2187 		if (!coa->coa_ancillary)
   2188 			ixa_refrele(coa->coa_ixa);
   2189 		*outlenp = 0;
   2190 		return (err);
   2191 	}
   2192 
   2193 	/*
   2194 	 * Common case of OK return with outval same as inval.
   2195 	 */
   2196 	if (invalp != outvalp) {
   2197 		/* don't trust bcopy for identical src/dst */
   2198 		(void) bcopy(invalp, outvalp, inlen);
   2199 	}
   2200 	*outlenp = inlen;
   2201 
   2202 	/*
   2203 	 * If this was not ancillary data, then we rebuild the headers,
   2204 	 * update the IRE/NCE, and IPsec as needed.
   2205 	 * Since the label depends on the destination we go through
   2206 	 * ip_set_destination first.
   2207 	 */
   2208 	if (coa->coa_ancillary) {
   2209 		return (0);
   2210 	}
   2211 
   2212 	if (coa->coa_changed & COA_ROUTE_CHANGED) {
   2213 		in6_addr_t saddr, faddr, nexthop;
   2214 		in_port_t fport;
   2215 
   2216 		/*
   2217 		 * We clear lastdst to make sure we pick up the change
   2218 		 * next time sending.
   2219 		 * If we are connected we re-cache the information.
   2220 		 * We ignore errors to preserve BSD behavior.
   2221 		 * Note that we don't redo IPsec policy lookup here
   2222 		 * since the final destination (or source) didn't change.
   2223 		 */
   2224 		mutex_enter(&connp->conn_lock);
   2225 		connp->conn_v6lastdst = ipv6_all_zeros;
   2226 
   2227 		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
   2228 		    &connp->conn_faddr_v6, &nexthop);
   2229 		saddr = connp->conn_saddr_v6;
   2230 		faddr = connp->conn_faddr_v6;
   2231 		fport = connp->conn_fport;
   2232 		mutex_exit(&connp->conn_lock);
   2233 
   2234 		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
   2235 		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
   2236 			(void) ip_attr_connect(connp, coa->coa_ixa,
   2237 			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
   2238 			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
   2239 		}
   2240 	}
   2241 
   2242 	ixa_refrele(coa->coa_ixa);
   2243 
   2244 	if (coa->coa_changed & COA_HEADER_CHANGED) {
   2245 		/*
   2246 		 * Rebuild the header template if we are connected.
   2247 		 * Otherwise clear conn_v6lastdst so we rebuild the header
   2248 		 * in the data path.
   2249 		 */
   2250 		mutex_enter(&connp->conn_lock);
   2251 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
   2252 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
   2253 			err = icmp_build_hdr_template(connp,
   2254 			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
   2255 			    connp->conn_flowinfo);
   2256 			if (err != 0) {
   2257 				mutex_exit(&connp->conn_lock);
   2258 				return (err);
   2259 			}
   2260 		} else {
   2261 			connp->conn_v6lastdst = ipv6_all_zeros;
   2262 		}
   2263 		mutex_exit(&connp->conn_lock);
   2264 	}
   2265 	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
   2266 		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
   2267 		    connp->conn_rcvbuf);
   2268 	}
   2269 	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
   2270 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
   2271 	}
   2272 	if (coa->coa_changed & COA_WROFF_CHANGED) {
   2273 		/* Increase wroff if needed */
   2274 		uint_t wroff;
   2275 
   2276 		mutex_enter(&connp->conn_lock);
   2277 		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
   2278 		if (wroff > connp->conn_wroff) {
   2279 			connp->conn_wroff = wroff;
   2280 			mutex_exit(&connp->conn_lock);
   2281 			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
   2282 		} else {
   2283 			mutex_exit(&connp->conn_lock);
   2284 		}
   2285 	}
   2286 	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
   2287 		icmp_bind_proto(icmp);
   2288 	}
   2289 	return (err);
   2290 }
   2291 
   2292 /* This routine sets socket options. */
   2293 int
   2294 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
   2295     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
   2296     void *thisdg_attrs, cred_t *cr)
   2297 {
   2298 	conn_t	*connp = Q_TO_CONN(q);
   2299 	int error;
   2300 
   2301 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
   2302 	    outlenp, outvalp, thisdg_attrs, cr);
   2303 	return (error);
   2304 }
   2305 
   2306 /*
   2307  * Setup IP headers.
   2308  *
   2309  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
   2310  * but icmp_output_hdrincl restores ipha_protocol once we return.
   2311  */
   2312 mblk_t *
   2313 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
   2314     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
   2315     mblk_t *data_mp, int *errorp)
   2316 {
   2317 	mblk_t		*mp;
   2318 	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
   2319 	uint_t		data_len;
   2320 	uint32_t	cksum;
   2321 
   2322 	data_len = msgdsize(data_mp);
   2323 	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
   2324 	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
   2325 	if (mp == NULL) {
   2326 		ASSERT(*errorp != 0);
   2327 		return (NULL);
   2328 	}
   2329 
   2330 	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
   2331 
   2332 	/*
   2333 	 * If there was a routing option/header then conn_prepend_hdr
   2334 	 * has massaged it and placed the pseudo-header checksum difference
   2335 	 * in the cksum argument.
   2336 	 *
   2337 	 * Prepare for ICMPv6 checksum done in IP.
   2338 	 *
   2339 	 * We make it easy for IP to include our pseudo header
   2340 	 * by putting our length (and any routing header adjustment)
   2341 	 * in the ICMPv6 checksum field.
   2342 	 * The IP source, destination, and length have already been set by
   2343 	 * conn_prepend_hdr.
   2344 	 */
   2345 	cksum += data_len;
   2346 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   2347 	ASSERT(cksum < 0x10000);
   2348 
   2349 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
   2350 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
   2351 
   2352 		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
   2353 	} else {
   2354 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
   2355 		uint_t	cksum_offset = 0;
   2356 
   2357 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
   2358 
   2359 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
   2360 			if (connp->conn_proto == IPPROTO_ICMPV6) {
   2361 				cksum_offset = ixa->ixa_ip_hdr_length +
   2362 				    offsetof(icmp6_t, icmp6_cksum);
   2363 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
   2364 				cksum_offset = ixa->ixa_ip_hdr_length +
   2365 				    ixa->ixa_raw_cksum_offset;
   2366 			}
   2367 		}
   2368 		if (cksum_offset != 0) {
   2369 			uint16_t *ptr;
   2370 
   2371 			/* Make sure the checksum fits in the first mblk */
   2372 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
   2373 				mblk_t *mp1;
   2374 
   2375 				mp1 = msgpullup(mp,
   2376 				    cksum_offset + sizeof (short));
   2377 				freemsg(mp);
   2378 				if (mp1 == NULL) {
   2379 					*errorp = ENOMEM;
   2380 					return (NULL);
   2381 				}
   2382 				mp = mp1;
   2383 				ip6h = (ip6_t *)mp->b_rptr;
   2384 			}
   2385 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
   2386 			*ptr = htons(cksum);
   2387 		}
   2388 	}
   2389 
   2390 	/* Note that we don't try to update wroff due to ancillary data */
   2391 	return (mp);
   2392 }
   2393 
   2394 static int
   2395 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
   2396     const in6_addr_t *v6dst, uint32_t flowinfo)
   2397 {
   2398 	int		error;
   2399 
   2400 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   2401 	/*
   2402 	 * We clear lastdst to make sure we don't use the lastdst path
   2403 	 * next time sending since we might not have set v6dst yet.
   2404 	 */
   2405 	connp->conn_v6lastdst = ipv6_all_zeros;
   2406 
   2407 	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
   2408 	if (error != 0)
   2409 		return (error);
   2410 
   2411 	/*
   2412 	 * Any routing header/option has been massaged. The checksum difference
   2413 	 * is stored in conn_sum.
   2414 	 */
   2415 	return (0);
   2416 }
   2417 
   2418 /*
   2419  * This routine retrieves the value of an ND variable in a icmpparam_t
   2420  * structure.  It is called through nd_getset when a user reads the
   2421  * variable.
   2422  */
   2423 /* ARGSUSED */
   2424 static int
   2425 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
   2426 {
   2427 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
   2428 
   2429 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
   2430 	return (0);
   2431 }
   2432 
   2433 /*
   2434  * Walk through the param array specified registering each element with the
   2435  * named dispatch (ND) handler.
   2436  */
   2437 static boolean_t
   2438 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
   2439 {
   2440 	for (; cnt-- > 0; icmppa++) {
   2441 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
   2442 			if (!nd_load(ndp, icmppa->icmp_param_name,
   2443 			    icmp_param_get, icmp_param_set,
   2444 			    (caddr_t)icmppa)) {
   2445 				nd_free(ndp);
   2446 				return (B_FALSE);
   2447 			}
   2448 		}
   2449 	}
   2450 	return (B_TRUE);
   2451 }
   2452 
   2453 /* This routine sets an ND variable in a icmpparam_t structure. */
   2454 /* ARGSUSED */
   2455 static int
   2456 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
   2457 {
   2458 	long		new_value;
   2459 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
   2460 
   2461 	/*
   2462 	 * Fail the request if the new value does not lie within the
   2463 	 * required bounds.
   2464 	 */
   2465 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   2466 	    new_value < icmppa->icmp_param_min ||
   2467 	    new_value > icmppa->icmp_param_max) {
   2468 		return (EINVAL);
   2469 	}
   2470 	/* Set the new value */
   2471 	icmppa->icmp_param_value = new_value;
   2472 	return (0);
   2473 }
   2474 
   2475 static mblk_t *
   2476 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
   2477 {
   2478 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
   2479 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
   2480 		/*
   2481 		 * fallback has started but messages have not been moved yet
   2482 		 */
   2483 		if (icmp->icmp_fallback_queue_head == NULL) {
   2484 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
   2485 			icmp->icmp_fallback_queue_head = mp;
   2486 			icmp->icmp_fallback_queue_tail = mp;
   2487 		} else {
   2488 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
   2489 			icmp->icmp_fallback_queue_tail->b_next = mp;
   2490 			icmp->icmp_fallback_queue_tail = mp;
   2491 		}
   2492 		return (NULL);
   2493 	} else {
   2494 		/*
   2495 		 * Fallback completed, let the caller putnext() the mblk.
   2496 		 */
   2497 		return (mp);
   2498 	}
   2499 }
   2500 
   2501 /*
   2502  * Deliver data to ULP. In case we have a socket, and it's falling back to
   2503  * TPI, then we'll queue the mp for later processing.
   2504  */
   2505 static void
   2506 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
   2507 {
   2508 	if (IPCL_IS_NONSTR(connp)) {
   2509 		icmp_t *icmp = connp->conn_icmp;
   2510 		int error;
   2511 
   2512 		ASSERT(len == msgdsize(mp));
   2513 		if ((*connp->conn_upcalls->su_recv)
   2514 		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
   2515 			mutex_enter(&icmp->icmp_recv_lock);
   2516 			if (error == ENOSPC) {
   2517 				/*
   2518 				 * let's confirm while holding the lock
   2519 				 */
   2520 				if ((*connp->conn_upcalls->su_recv)
   2521 				    (connp->conn_upper_handle, NULL, 0, 0,
   2522 				    &error, NULL) < 0) {
   2523 					ASSERT(error == ENOSPC);
   2524 					if (error == ENOSPC) {
   2525 						connp->conn_flow_cntrld =
   2526 						    B_TRUE;
   2527 					}
   2528 				}
   2529 				mutex_exit(&icmp->icmp_recv_lock);
   2530 			} else {
   2531 				ASSERT(error == EOPNOTSUPP);
   2532 				mp = icmp_queue_fallback(icmp, mp);
   2533 				mutex_exit(&icmp->icmp_recv_lock);
   2534 				if (mp != NULL)
   2535 					putnext(connp->conn_rq, mp);
   2536 			}
   2537 		}
   2538 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
   2539 	} else {
   2540 		putnext(connp->conn_rq, mp);
   2541 	}
   2542 }
   2543 
   2544 /*
   2545  * This is the inbound data path.
   2546  * IP has already pulled up the IP headers and verified alignment
   2547  * etc.
   2548  */
   2549 /* ARGSUSED2 */
   2550 static void
   2551 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   2552 {
   2553 	conn_t			*connp = (conn_t *)arg1;
   2554 	struct T_unitdata_ind	*tudi;
   2555 	uchar_t			*rptr;		/* Pointer to IP header */
   2556 	int			ip_hdr_length;
   2557 	int			udi_size;	/* Size of T_unitdata_ind */
   2558 	int			pkt_len;
   2559 	icmp_t			*icmp;
   2560 	ip_pkt_t		ipps;
   2561 	ip6_t			*ip6h;
   2562 	mblk_t			*mp1;
   2563 	crb_t			recv_ancillary;
   2564 	icmp_stack_t		*is;
   2565 	sin_t			*sin;
   2566 	sin6_t			*sin6;
   2567 	ipha_t			*ipha;
   2568 
   2569 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
   2570 
   2571 	icmp = connp->conn_icmp;
   2572 	is = icmp->icmp_is;
   2573 	rptr = mp->b_rptr;
   2574 
   2575 	ASSERT(DB_TYPE(mp) == M_DATA);
   2576 	ASSERT(OK_32PTR(rptr));
   2577 	ASSERT(ira->ira_pktlen == msgdsize(mp));
   2578 	pkt_len = ira->ira_pktlen;
   2579 
   2580 	/*
   2581 	 * Get a snapshot of these and allow other threads to change
   2582 	 * them after that. We need the same recv_ancillary when determining
   2583 	 * the size as when adding the ancillary data items.
   2584 	 */
   2585 	mutex_enter(&connp->conn_lock);
   2586 	recv_ancillary = connp->conn_recv_ancillary;
   2587 	mutex_exit(&connp->conn_lock);
   2588 
   2589 	ip_hdr_length = ira->ira_ip_hdr_length;
   2590 	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */
   2591 
   2592 	/* Initialize regardless of IP version */
   2593 	ipps.ipp_fields = 0;
   2594 
   2595 	if (ira->ira_flags & IRAF_IS_IPV4) {
   2596 		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
   2597 		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
   2598 		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
   2599 
   2600 		ipha = (ipha_t *)mp->b_rptr;
   2601 		if (recv_ancillary.crb_all != 0)
   2602 			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
   2603 
   2604 		/*
   2605 		 * BSD for some reason adjusts ipha_length to exclude the
   2606 		 * IP header length. We do the same.
   2607 		 */
   2608 		if (is->is_bsd_compat) {
   2609 			ushort_t len;
   2610 
   2611 			len = ntohs(ipha->ipha_length);
   2612 			if (mp->b_datap->db_ref > 1) {
   2613 				/*
   2614 				 * Allocate a new IP header so that we can
   2615 				 * modify ipha_length.
   2616 				 */
   2617 				mblk_t	*mp1;
   2618 
   2619 				mp1 = allocb(ip_hdr_length, BPRI_MED);
   2620 				if (mp1 == NULL) {
   2621 					freemsg(mp);
   2622 					BUMP_MIB(&is->is_rawip_mib,
   2623 					    rawipInErrors);
   2624 					return;
   2625 				}
   2626 				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
   2627 				mp->b_rptr = rptr + ip_hdr_length;
   2628 				rptr = mp1->b_rptr;
   2629 				ipha = (ipha_t *)rptr;
   2630 				mp1->b_cont = mp;
   2631 				mp1->b_wptr = rptr + ip_hdr_length;
   2632 				mp = mp1;
   2633 			}
   2634 			len -= ip_hdr_length;
   2635 			ipha->ipha_length = htons(len);
   2636 		}
   2637 
   2638 		/*
   2639 		 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
   2640 		 * sockets. This is ensured by icmp_bind and the IP fanout code.
   2641 		 */
   2642 		ASSERT(connp->conn_family == AF_INET);
   2643 
   2644 		/*
   2645 		 * This is the inbound data path.  Packets are passed upstream
   2646 		 * as T_UNITDATA_IND messages with full IPv4 headers still
   2647 		 * attached.
   2648 		 */
   2649 
   2650 		/*
   2651 		 * Normally only send up the source address.
   2652 		 * If any ancillary data items are wanted we add those.
   2653 		 */
   2654 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
   2655 		if (recv_ancillary.crb_all != 0) {
   2656 			udi_size += conn_recvancillary_size(connp,
   2657 			    recv_ancillary, ira, mp, &ipps);
   2658 		}
   2659 
   2660 		/* Allocate a message block for the T_UNITDATA_IND structure. */
   2661 		mp1 = allocb(udi_size, BPRI_MED);
   2662 		if (mp1 == NULL) {
   2663 			freemsg(mp);
   2664 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
   2665 			return;
   2666 		}
   2667 		mp1->b_cont = mp;
   2668 		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
   2669 		mp1->b_datap->db_type = M_PROTO;
   2670 		mp1->b_wptr = (uchar_t *)tudi + udi_size;
   2671 		tudi->PRIM_type = T_UNITDATA_IND;
   2672 		tudi->SRC_length = sizeof (sin_t);
   2673 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
   2674 		sin = (sin_t *)&tudi[1];
   2675 		*sin = sin_null;
   2676 		sin->sin_family = AF_INET;
   2677 		sin->sin_addr.s_addr = ipha->ipha_src;
   2678 		*(uint32_t *)&sin->sin_zero[0] = 0;
   2679 		*(uint32_t *)&sin->sin_zero[4] = 0;
   2680 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
   2681 		    sizeof (sin_t);
   2682 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
   2683 		tudi->OPT_length = udi_size;
   2684 
   2685 		/*
   2686 		 * Add options if IP_RECVIF etc is set
   2687 		 */
   2688 		if (udi_size != 0) {
   2689 			conn_recvancillary_add(connp, recv_ancillary, ira,
   2690 			    &ipps, (uchar_t *)&sin[1], udi_size);
   2691 		}
   2692 		goto deliver;
   2693 	}
   2694 
   2695 	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
   2696 	/*
   2697 	 * IPv6 packets can only be received by applications
   2698 	 * that are prepared to receive IPv6 addresses.
   2699 	 * The IP fanout must ensure this.
   2700 	 */
   2701 	ASSERT(connp->conn_family == AF_INET6);
   2702 
   2703 	/*
   2704 	 * Handle IPv6 packets. We don't pass up the IP headers with the
   2705 	 * payload for IPv6.
   2706 	 */
   2707 
   2708 	ip6h = (ip6_t *)rptr;
   2709 	if (recv_ancillary.crb_all != 0) {
   2710 		/*
   2711 		 * Call on ip_find_hdr_v6 which gets individual lenghts of
   2712 		 * extension headers (and pointers to them).
   2713 		 */
   2714 		uint8_t		nexthdr;
   2715 
   2716 		/* We don't care about the length or nextheader. */
   2717 		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
   2718 
   2719 		/*
   2720 		 * We do not pass up hop-by-hop options or any other
   2721 		 * extension header as part of the packet. Applications
   2722 		 * that want to see them have to specify IPV6_RECV* socket
   2723 		 * options. And conn_recvancillary_size/add explicitly
   2724 		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
   2725 		 *
   2726 		 * If we had multilevel ICMP sockets, then we'd want to
   2727 		 * modify conn_recvancillary_size/add to
   2728 		 * allow the user to see the label.
   2729 		 */
   2730 	}
   2731 
   2732 	/*
   2733 	 * Check a filter for ICMPv6 types if needed.
   2734 	 * Verify raw checksums if needed.
   2735 	 */
   2736 	mutex_enter(&connp->conn_lock);
   2737 	if (icmp->icmp_filter != NULL) {
   2738 		int type;
   2739 
   2740 		/* Assumes that IP has done the pullupmsg */
   2741 		type = mp->b_rptr[ip_hdr_length];
   2742 
   2743 		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
   2744 		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
   2745 			mutex_exit(&connp->conn_lock);
   2746 			freemsg(mp);
   2747 			return;
   2748 		}
   2749 	}
   2750 	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
   2751 		/* Checksum */
   2752 		uint16_t	*up;
   2753 		uint32_t	sum;
   2754 		int		remlen;
   2755 
   2756 		up = (uint16_t *)&ip6h->ip6_src;
   2757 
   2758 		remlen = msgdsize(mp) - ip_hdr_length;
   2759 		sum = htons(connp->conn_proto + remlen)
   2760 		    + up[0] + up[1] + up[2] + up[3]
   2761 		    + up[4] + up[5] + up[6] + up[7]
   2762 		    + up[8] + up[9] + up[10] + up[11]
   2763 		    + up[12] + up[13] + up[14] + up[15];
   2764 		sum = (sum & 0xffff) + (sum >> 16);
   2765 		sum = IP_CSUM(mp, ip_hdr_length, sum);
   2766 		if (sum != 0) {
   2767 			/* IPv6 RAW checksum failed */
   2768 			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
   2769 			mutex_exit(&connp->conn_lock);
   2770 			freemsg(mp);
   2771 			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
   2772 			return;
   2773 		}
   2774 	}
   2775 	mutex_exit(&connp->conn_lock);
   2776 
   2777 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
   2778 
   2779 	if (recv_ancillary.crb_all != 0) {
   2780 		udi_size += conn_recvancillary_size(connp,
   2781 		    recv_ancillary, ira, mp, &ipps);
   2782 	}
   2783 
   2784 	mp1 = allocb(udi_size, BPRI_MED);
   2785 	if (mp1 == NULL) {
   2786 		freemsg(mp);
   2787 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
   2788 		return;
   2789 	}
   2790 	mp1->b_cont = mp;
   2791 	mp1->b_datap->db_type = M_PROTO;
   2792 	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
   2793 	mp1->b_wptr = (uchar_t *)tudi + udi_size;
   2794 	tudi->PRIM_type = T_UNITDATA_IND;
   2795 	tudi->SRC_length = sizeof (sin6_t);
   2796 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
   2797 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
   2798 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
   2799 	tudi->OPT_length = udi_size;
   2800 	sin6 = (sin6_t *)&tudi[1];
   2801 	*sin6 = sin6_null;
   2802 	sin6->sin6_port = 0;
   2803 	sin6->sin6_family = AF_INET6;
   2804 
   2805 	sin6->sin6_addr = ip6h->ip6_src;
   2806 	/* No sin6_flowinfo per API */
   2807 	sin6->sin6_flowinfo = 0;
   2808 	/* For link-scope pass up scope id */
   2809 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
   2810 		sin6->sin6_scope_id = ira->ira_ruifindex;
   2811 	else
   2812 		sin6->sin6_scope_id = 0;
   2813 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
   2814 	    IPCL_ZONEID(connp), is->is_netstack);
   2815 
   2816 	if (udi_size != 0) {
   2817 		conn_recvancillary_add(connp, recv_ancillary, ira,
   2818 		    &ipps, (uchar_t *)&sin6[1], udi_size);
   2819 	}
   2820 
   2821 	/* Skip all the IPv6 headers per API */
   2822 	mp->b_rptr += ip_hdr_length;
   2823 	pkt_len -= ip_hdr_length;
   2824 
   2825 deliver:
   2826 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
   2827 	icmp_ulp_recv(connp, mp1, pkt_len);
   2828 }
   2829 
   2830 /*
   2831  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
   2832  * information that can be changing beneath us.
   2833  */
   2834 mblk_t *
   2835 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
   2836 {
   2837 	mblk_t			*mpdata;
   2838 	struct opthdr		*optp;
   2839 	conn_t			*connp = Q_TO_CONN(q);
   2840 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
   2841 	mblk_t			*mp2ctl;
   2842 
   2843 	/*
   2844 	 * make a copy of the original message
   2845 	 */
   2846 	mp2ctl = copymsg(mpctl);
   2847 
   2848 	if (mpctl == NULL ||
   2849 	    (mpdata = mpctl->b_cont) == NULL) {
   2850 		freemsg(mpctl);
   2851 		freemsg(mp2ctl);
   2852 		return (0);
   2853 	}
   2854 
   2855 	/* fixed length structure for IPv4 and IPv6 counters */
   2856 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   2857 	optp->level = EXPER_RAWIP;
   2858 	optp->name = 0;
   2859 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
   2860 	    sizeof (is->is_rawip_mib));
   2861 	optp->len = msgdsize(mpdata);
   2862 	qreply(q, mpctl);
   2863 
   2864 	return (mp2ctl);
   2865 }
   2866 
   2867 /*
   2868  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
   2869  * TODO:  If this ever actually tries to set anything, it needs to be
   2870  * to do the appropriate locking.
   2871  */
   2872 /* ARGSUSED */
   2873 int
   2874 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
   2875     uchar_t *ptr, int len)
   2876 {
   2877 	switch (level) {
   2878 	case EXPER_RAWIP:
   2879 		return (0);
   2880 	default:
   2881 		return (1);
   2882 	}
   2883 }
   2884 
   2885 /*
   2886  * This routine creates a T_UDERROR_IND message and passes it upstream.
   2887  * The address and options are copied from the T_UNITDATA_REQ message
   2888  * passed in mp.  This message is freed.
   2889  */
   2890 static void
   2891 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
   2892 {
   2893 	struct T_unitdata_req *tudr;
   2894 	mblk_t	*mp1;
   2895 	uchar_t *destaddr;
   2896 	t_scalar_t destlen;
   2897 	uchar_t	*optaddr;
   2898 	t_scalar_t optlen;
   2899 
   2900 	if ((mp->b_wptr < mp->b_rptr) ||
   2901 	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
   2902 		goto done;
   2903 	}
   2904 	tudr = (struct T_unitdata_req *)mp->b_rptr;
   2905 	destaddr = mp->b_rptr + tudr->DEST_offset;
   2906 	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
   2907 	    destaddr + tudr->DEST_length < mp->b_rptr ||
   2908 	    destaddr + tudr->DEST_length > mp->b_wptr) {
   2909 		goto done;
   2910 	}
   2911 	optaddr = mp->b_rptr + tudr->OPT_offset;
   2912 	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
   2913 	    optaddr + tudr->OPT_length < mp->b_rptr ||
   2914 	    optaddr + tudr->OPT_length > mp->b_wptr) {
   2915 		goto done;
   2916 	}
   2917 	destlen = tudr->DEST_length;
   2918 	optlen = tudr->OPT_length;
   2919 
   2920 	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
   2921 	    (char *)optaddr, optlen, err);
   2922 	if (mp1 != NULL)
   2923 		qreply(q, mp1);
   2924 
   2925 done:
   2926 	freemsg(mp);
   2927 }
   2928 
   2929 static int
   2930 rawip_do_unbind(conn_t *connp)
   2931 {
   2932 	icmp_t	*icmp = connp->conn_icmp;
   2933 
   2934 	mutex_enter(&connp->conn_lock);
   2935 	/* If a bind has not been done, we can't unbind. */
   2936 	if (icmp->icmp_state == TS_UNBND) {
   2937 		mutex_exit(&connp->conn_lock);
   2938 		return (-TOUTSTATE);
   2939 	}
   2940 	connp->conn_saddr_v6 = ipv6_all_zeros;
   2941 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
   2942 	connp->conn_laddr_v6 = ipv6_all_zeros;
   2943 	connp->conn_mcbc_bind = B_FALSE;
   2944 	connp->conn_lport = 0;
   2945 	connp->conn_fport = 0;
   2946 	/* In case we were also connected */
   2947 	connp->conn_faddr_v6 = ipv6_all_zeros;
   2948 	connp->conn_v6lastdst = ipv6_all_zeros;
   2949 
   2950 	icmp->icmp_state = TS_UNBND;
   2951 
   2952 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
   2953 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
   2954 	mutex_exit(&connp->conn_lock);
   2955 
   2956 	ip_unbind(connp);
   2957 	return (0);
   2958 }
   2959 
   2960 /*
   2961  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
   2962  * After some error checking, the message is passed downstream to ip.
   2963  */
   2964 static void
   2965 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
   2966 {
   2967 	conn_t	*connp = Q_TO_CONN(q);
   2968 	int	error;
   2969 
   2970 	ASSERT(mp->b_cont == NULL);
   2971 	error = rawip_do_unbind(connp);
   2972 	if (error) {
   2973 		if (error < 0) {
   2974 			icmp_err_ack(q, mp, -error, 0);
   2975 		} else {
   2976 			icmp_err_ack(q, mp, 0, error);
   2977 		}
   2978 		return;
   2979 	}
   2980 
   2981 	/*
   2982 	 * Convert mp into a T_OK_ACK
   2983 	 */
   2984 
   2985 	mp = mi_tpi_ok_ack_alloc(mp);
   2986 
   2987 	/*
   2988 	 * should not happen in practice... T_OK_ACK is smaller than the
   2989 	 * original message.
   2990 	 */
   2991 	ASSERT(mp != NULL);
   2992 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
   2993 	qreply(q, mp);
   2994 }
   2995 
   2996 /*
   2997  * Process IPv4 packets that already include an IP header.
   2998  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
   2999  * IPPROTO_IGMP).
   3000  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
   3001  *
   3002  * The packet is assumed to have a base (20 byte) IP header followed
   3003  * by the upper-layer protocol. We include any IP_OPTIONS including a
   3004  * CIPSO label but otherwise preserve the base IP header.
   3005  */
   3006 static int
   3007 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
   3008 {
   3009 	icmp_t		*icmp = connp->conn_icmp;
   3010 	icmp_stack_t	*is = icmp->icmp_is;
   3011 	ipha_t		iphas;
   3012 	ipha_t		*ipha;
   3013 	int		ip_hdr_length;
   3014 	int		tp_hdr_len;
   3015 	ip_xmit_attr_t	*ixa;
   3016 	ip_pkt_t	*ipp;
   3017 	in6_addr_t	v6src;
   3018 	in6_addr_t	v6dst;
   3019 	in6_addr_t	v6nexthop;
   3020 	int		error;
   3021 	boolean_t	do_ipsec;
   3022 
   3023 	/*
   3024 	 * We need an exclusive copy of conn_ixa since the included IP
   3025 	 * header could have any destination.
   3026 	 * That copy has no pointers hence we
   3027 	 * need to set them up once we've parsed the ancillary data.
   3028 	 */
   3029 	ixa = conn_get_ixa_exclusive(connp);
   3030 	if (ixa == NULL) {
   3031 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3032 		freemsg(mp);
   3033 		return (ENOMEM);
   3034 	}
   3035 	ASSERT(cr != NULL);
   3036 	/*
   3037 	 * Caller has a reference on cr; from db_credp or because we
   3038 	 * are running in process context.
   3039 	 */
   3040 	ixa->ixa_cred = cr;
   3041 	ixa->ixa_cpid = pid;
   3042 	if (is_system_labeled()) {
   3043 		/* We need to restart with a label based on the cred */
   3044 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
   3045 	}
   3046 
   3047 	/* In case previous destination was multicast or multirt */
   3048 	ip_attr_newdst(ixa);
   3049 
   3050 	/* Get a copy of conn_xmit_ipp since the TX label might change it */
   3051 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
   3052 	if (ipp == NULL) {
   3053 		ixa_refrele(ixa);
   3054 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3055 		freemsg(mp);
   3056 		return (ENOMEM);
   3057 	}
   3058 	mutex_enter(&connp->conn_lock);
   3059 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
   3060 	mutex_exit(&connp->conn_lock);
   3061 	if (error != 0) {
   3062 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3063 		freemsg(mp);
   3064 		goto done;
   3065 	}
   3066 
   3067 	/* Sanity check length of packet */
   3068 	ipha = (ipha_t *)mp->b_rptr;
   3069 
   3070 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
   3071 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
   3072 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
   3073 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3074 			freemsg(mp);
   3075 			goto done;
   3076 		}
   3077 		ipha = (ipha_t *)mp->b_rptr;
   3078 	}
   3079 	ipha->ipha_version_and_hdr_length =
   3080 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
   3081 
   3082 	/*
   3083 	 * We set IXAF_DONTFRAG if the application set DF which makes
   3084 	 * IP not fragment.
   3085 	 */
   3086 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
   3087 	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
   3088 		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
   3089 	else
   3090 		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
   3091 
   3092 	/* Even for multicast and broadcast we honor the apps ttl */
   3093 	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
   3094 
   3095 	if (ipha->ipha_dst == INADDR_ANY)
   3096 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
   3097 
   3098 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
   3099 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
   3100 
   3101 	/* Defer IPsec if it might need to look at ICMP type/code */
   3102 	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
   3103 	ixa->ixa_flags |= IXAF_IS_IPV4;
   3104 
   3105 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
   3106 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
   3107 	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
   3108 	    (do_ipsec ? IPDF_IPSEC : 0));
   3109 	switch (error) {
   3110 	case 0:
   3111 		break;
   3112 	case EADDRNOTAVAIL:
   3113 		/*
   3114 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3115 		 * Don't have the application see that errno
   3116 		 */
   3117 		error = ENETUNREACH;
   3118 		goto failed;
   3119 	case ENETDOWN:
   3120 		/*
   3121 		 * Have !ipif_addr_ready address; drop packet silently
   3122 		 * until we can get applications to not send until we
   3123 		 * are ready.
   3124 		 */
   3125 		error = 0;
   3126 		goto failed;
   3127 	case EHOSTUNREACH:
   3128 	case ENETUNREACH:
   3129 		if (ixa->ixa_ire != NULL) {
   3130 			/*
   3131 			 * Let conn_ip_output/ire_send_noroute return
   3132 			 * the error and send any local ICMP error.
   3133 			 */
   3134 			error = 0;
   3135 			break;
   3136 		}
   3137 		/* FALLTHRU */
   3138 	default:
   3139 	failed:
   3140 		freemsg(mp);
   3141 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3142 		goto done;
   3143 	}
   3144 	if (ipha->ipha_src == INADDR_ANY)
   3145 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
   3146 
   3147 	/*
   3148 	 * We might be going to a different destination than last time,
   3149 	 * thus check that TX allows the communication and compute any
   3150 	 * needed label.
   3151 	 *
   3152 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
   3153 	 * don't have to worry about concurrent threads.
   3154 	 */
   3155 	if (is_system_labeled()) {
   3156 		/*
   3157 		 * Check whether Trusted Solaris policy allows communication
   3158 		 * with this host, and pretend that the destination is
   3159 		 * unreachable if not.
   3160 		 * Compute any needed label and place it in ipp_label_v4/v6.
   3161 		 *
   3162 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
   3163 		 * ipp_label_v4/v6 to form the packet.
   3164 		 *
   3165 		 * Tsol note: We have ipp structure local to this thread so
   3166 		 * no locking is needed.
   3167 		 */
   3168 		error = conn_update_label(connp, ixa, &v6dst, ipp);
   3169 		if (error != 0) {
   3170 			freemsg(mp);
   3171 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3172 			goto done;
   3173 		}
   3174 	}
   3175 
   3176 	/*
   3177 	 * Save away a copy of the IPv4 header the application passed down
   3178 	 * and then prepend an IPv4 header complete with any IP options
   3179 	 * including label.
   3180 	 * We need a struct copy since icmp_prepend_hdr will reuse the available
   3181 	 * space in the mblk.
   3182 	 */
   3183 	iphas = *ipha;
   3184 	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
   3185 
   3186 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
   3187 	if (mp == NULL) {
   3188 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3189 		ASSERT(error != 0);
   3190 		goto done;
   3191 	}
   3192 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
   3193 		error = EMSGSIZE;
   3194 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3195 		freemsg(mp);
   3196 		goto done;
   3197 	}
   3198 	/* Restore key parts of the header that the application passed down */
   3199 	ipha = (ipha_t *)mp->b_rptr;
   3200 	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
   3201 	ipha->ipha_ident = iphas.ipha_ident;
   3202 	ipha->ipha_fragment_offset_and_flags =
   3203 	    iphas.ipha_fragment_offset_and_flags;
   3204 	ipha->ipha_ttl = iphas.ipha_ttl;
   3205 	ipha->ipha_protocol = iphas.ipha_protocol;
   3206 	ipha->ipha_src = iphas.ipha_src;
   3207 	ipha->ipha_dst = iphas.ipha_dst;
   3208 
   3209 	ixa->ixa_protocol = ipha->ipha_protocol;
   3210 
   3211 	/*
   3212 	 * Make sure that the IP header plus any transport header that is
   3213 	 * checksumed by ip_output is in the first mblk. (ip_output assumes
   3214 	 * that at least the checksum field is in the first mblk.)
   3215 	 */
   3216 	switch (ipha->ipha_protocol) {
   3217 	case IPPROTO_UDP:
   3218 		tp_hdr_len = 8;
   3219 		break;
   3220 	case IPPROTO_TCP:
   3221 		tp_hdr_len = 20;
   3222 		break;
   3223 	default:
   3224 		tp_hdr_len = 0;
   3225 		break;
   3226 	}
   3227 	ip_hdr_length = IPH_HDR_LENGTH(ipha);
   3228 	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
   3229 		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
   3230 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3231 			if (mp->b_cont == NULL)
   3232 				error = EINVAL;
   3233 			else
   3234 				error = ENOMEM;
   3235 			freemsg(mp);
   3236 			goto done;
   3237 		}
   3238 	}
   3239 
   3240 	if (!do_ipsec) {
   3241 		/* Policy might differ for different ICMP type/code */
   3242 		if (ixa->ixa_ipsec_policy != NULL) {
   3243 			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
   3244 			ixa->ixa_ipsec_policy = NULL;
   3245 			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
   3246 		}
   3247 		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
   3248 		if (mp == NULL) {
   3249 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3250 			error = EHOSTUNREACH;	/* IPsec policy failure */
   3251 			goto done;
   3252 		}
   3253 	}
   3254 
   3255 	/* We're done.  Pass the packet to ip. */
   3256 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
   3257 
   3258 	error = conn_ip_output(mp, ixa);
   3259 	/* No rawipOutErrors if an error since IP increases its error counter */
   3260 	switch (error) {
   3261 	case 0:
   3262 		break;
   3263 	case EWOULDBLOCK:
   3264 		(void) ixa_check_drain_insert(connp, ixa);
   3265 		error = 0;
   3266 		break;
   3267 	case EADDRNOTAVAIL:
   3268 		/*
   3269 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3270 		 * Don't have the application see that errno
   3271 		 */
   3272 		error = ENETUNREACH;
   3273 		break;
   3274 	}
   3275 done:
   3276 	ixa_refrele(ixa);
   3277 	ip_pkt_free(ipp);
   3278 	kmem_free(ipp, sizeof (*ipp));
   3279 	return (error);
   3280 }
   3281 
   3282 static mblk_t *
   3283 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
   3284 {
   3285 	ipha_t	*ipha = NULL;
   3286 	ip6_t	*ip6h = NULL;
   3287 
   3288 	if (ixa->ixa_flags & IXAF_IS_IPV4)
   3289 		ipha = (ipha_t *)mp->b_rptr;
   3290 	else
   3291 		ip6h = (ip6_t *)mp->b_rptr;
   3292 
   3293 	if (ixa->ixa_ipsec_policy != NULL) {
   3294 		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
   3295 		ixa->ixa_ipsec_policy = NULL;
   3296 		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
   3297 	}
   3298 	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
   3299 }
   3300 
   3301 /*
   3302  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
   3303  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
   3304  * the TPI options, otherwise we take them from msg_control.
   3305  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
   3306  * Always consumes mp; never consumes tudr_mp.
   3307  */
   3308 static int
   3309 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
   3310     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
   3311 {
   3312 	icmp_t		*icmp = connp->conn_icmp;
   3313 	icmp_stack_t	*is = icmp->icmp_is;
   3314 	int		error;
   3315 	ip_xmit_attr_t	*ixa;
   3316 	ip_pkt_t	*ipp;
   3317 	in6_addr_t	v6src;
   3318 	in6_addr_t	v6dst;
   3319 	in6_addr_t	v6nexthop;
   3320 	in_port_t	dstport;
   3321 	uint32_t	flowinfo;
   3322 	uint_t		srcid;
   3323 	int		is_absreq_failure = 0;
   3324 	conn_opt_arg_t	coas, *coa;
   3325 
   3326 	ASSERT(tudr_mp != NULL || msg != NULL);
   3327 
   3328 	/*
   3329 	 * Get ixa before checking state to handle a disconnect race.
   3330 	 *
   3331 	 * We need an exclusive copy of conn_ixa since the ancillary data
   3332 	 * options might modify it. That copy has no pointers hence we
   3333 	 * need to set them up once we've parsed the ancillary data.
   3334 	 */
   3335 	ixa = conn_get_ixa_exclusive(connp);
   3336 	if (ixa == NULL) {
   3337 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3338 		freemsg(mp);
   3339 		return (ENOMEM);
   3340 	}
   3341 	ASSERT(cr != NULL);
   3342 	ixa->ixa_cred = cr;
   3343 	ixa->ixa_cpid = pid;
   3344 	if (is_system_labeled()) {
   3345 		/* We need to restart with a label based on the cred */
   3346 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
   3347 	}
   3348 
   3349 	/* In case previous destination was multicast or multirt */
   3350 	ip_attr_newdst(ixa);
   3351 
   3352 	/* Get a copy of conn_xmit_ipp since the options might change it */
   3353 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
   3354 	if (ipp == NULL) {
   3355 		ixa_refrele(ixa);
   3356 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3357 		freemsg(mp);
   3358 		return (ENOMEM);
   3359 	}
   3360 	mutex_enter(&connp->conn_lock);
   3361 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
   3362 	mutex_exit(&connp->conn_lock);
   3363 	if (error != 0) {
   3364 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3365 		freemsg(mp);
   3366 		goto done;
   3367 	}
   3368 
   3369 	/*
   3370 	 * Parse the options and update ixa and ipp as a result.
   3371 	 */
   3372 
   3373 	coa = &coas;
   3374 	coa->coa_connp = connp;
   3375 	coa->coa_ixa = ixa;
   3376 	coa->coa_ipp = ipp;
   3377 	coa->coa_ancillary = B_TRUE;
   3378 	coa->coa_changed = 0;
   3379 
   3380 	if (msg != NULL) {
   3381 		error = process_auxiliary_options(connp, msg->msg_control,
   3382 		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
   3383 	} else {
   3384 		struct T_unitdata_req *tudr;
   3385 
   3386 		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
   3387 		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
   3388 		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
   3389 		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
   3390 		    coa, &is_absreq_failure);
   3391 	}
   3392 	if (error != 0) {
   3393 		/*
   3394 		 * Note: No special action needed in this
   3395 		 * module for "is_absreq_failure"
   3396 		 */
   3397 		freemsg(mp);
   3398 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3399 		goto done;
   3400 	}
   3401 	ASSERT(is_absreq_failure == 0);
   3402 
   3403 	mutex_enter(&connp->conn_lock);
   3404 	/*
   3405 	 * If laddr is unspecified then we look at sin6_src_id.
   3406 	 * We will give precedence to a source address set with IPV6_PKTINFO
   3407 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
   3408 	 * want ip_attr_connect to select a source (since it can fail) when
   3409 	 * IPV6_PKTINFO is specified.
   3410 	 * If this doesn't result in a source address then we get a source
   3411 	 * from ip_attr_connect() below.
   3412 	 */
   3413 	v6src = connp->conn_saddr_v6;
   3414 	if (sin != NULL) {
   3415 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
   3416 		dstport = sin->sin_port;
   3417 		flowinfo = 0;
   3418 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   3419 		ixa->ixa_flags |= IXAF_IS_IPV4;
   3420 	} else if (sin6 != NULL) {
   3421 		v6dst = sin6->sin6_addr;
   3422 		dstport = sin6->sin6_port;
   3423 		flowinfo = sin6->sin6_flowinfo;
   3424 		srcid = sin6->__sin6_src_id;
   3425 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
   3426 			ixa->ixa_scopeid = sin6->sin6_scope_id;
   3427 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
   3428 		} else {
   3429 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   3430 		}
   3431 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
   3432 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
   3433 			    connp->conn_netstack);
   3434 		}
   3435 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
   3436 			ixa->ixa_flags |= IXAF_IS_IPV4;
   3437 		else
   3438 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
   3439 	} else {
   3440 		/* Connected case */
   3441 		v6dst = connp->conn_faddr_v6;
   3442 		flowinfo = connp->conn_flowinfo;
   3443 	}
   3444 	mutex_exit(&connp->conn_lock);
   3445 	/* Handle IPV6_PKTINFO setting source address. */
   3446 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
   3447 	    (ipp->ipp_fields & IPPF_ADDR)) {
   3448 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
   3449 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
   3450 				v6src = ipp->ipp_addr;
   3451 		} else {
   3452 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
   3453 				v6src = ipp->ipp_addr;
   3454 		}
   3455 	}
   3456 
   3457 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
   3458 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
   3459 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
   3460 
   3461 	switch (error) {
   3462 	case 0:
   3463 		break;
   3464 	case EADDRNOTAVAIL:
   3465 		/*
   3466 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3467 		 * Don't have the application see that errno
   3468 		 */
   3469 		error = ENETUNREACH;
   3470 		goto failed;
   3471 	case ENETDOWN:
   3472 		/*
   3473 		 * Have !ipif_addr_ready address; drop packet silently
   3474 		 * until we can get applications to not send until we
   3475 		 * are ready.
   3476 		 */
   3477 		error = 0;
   3478 		goto failed;
   3479 	case EHOSTUNREACH:
   3480 	case ENETUNREACH:
   3481 		if (ixa->ixa_ire != NULL) {
   3482 			/*
   3483 			 * Let conn_ip_output/ire_send_noroute return
   3484 			 * the error and send any local ICMP error.
   3485 			 */
   3486 			error = 0;
   3487 			break;
   3488 		}
   3489 		/* FALLTHRU */
   3490 	default:
   3491 	failed:
   3492 		freemsg(mp);
   3493 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3494 		goto done;
   3495 	}
   3496 
   3497 	/*
   3498 	 * We might be going to a different destination than last time,
   3499 	 * thus check that TX allows the communication and compute any
   3500 	 * needed label.
   3501 	 *
   3502 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
   3503 	 * don't have to worry about concurrent threads.
   3504 	 */
   3505 	if (is_system_labeled()) {
   3506 		/*
   3507 		 * Check whether Trusted Solaris policy allows communication
   3508 		 * with this host, and pretend that the destination is
   3509 		 * unreachable if not.
   3510 		 * Compute any needed label and place it in ipp_label_v4/v6.
   3511 		 *
   3512 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
   3513 		 * ipp_label_v4/v6 to form the packet.
   3514 		 *
   3515 		 * Tsol note: We have ipp structure local to this thread so
   3516 		 * no locking is needed.
   3517 		 */
   3518 		error = conn_update_label(connp, ixa, &v6dst, ipp);
   3519 		if (error != 0) {
   3520 			freemsg(mp);
   3521 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3522 			goto done;
   3523 		}
   3524 	}
   3525 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
   3526 	    &error);
   3527 	if (mp == NULL) {
   3528 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3529 		ASSERT(error != 0);
   3530 		goto done;
   3531 	}
   3532 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
   3533 		error = EMSGSIZE;
   3534 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3535 		freemsg(mp);
   3536 		goto done;
   3537 	}
   3538 
   3539 	/* Policy might differ for different ICMP type/code */
   3540 	mp = icmp_output_attach_policy(mp, connp, ixa);
   3541 	if (mp == NULL) {
   3542 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3543 		error = EHOSTUNREACH;	/* IPsec policy failure */
   3544 		goto done;
   3545 	}
   3546 
   3547 	/* We're done.  Pass the packet to ip. */
   3548 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
   3549 
   3550 	/* Allow source not assigned to the system? */
   3551 	ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
   3552 	error = conn_ip_output(mp, ixa);
   3553 	if (!connp->conn_unspec_src)
   3554 		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
   3555 	/* No rawipOutErrors if an error since IP increases its error counter */
   3556 	switch (error) {
   3557 	case 0:
   3558 		break;
   3559 	case EWOULDBLOCK:
   3560 		(void) ixa_check_drain_insert(connp, ixa);
   3561 		error = 0;
   3562 		break;
   3563 	case EADDRNOTAVAIL:
   3564 		/*
   3565 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3566 		 * Don't have the application see that errno
   3567 		 */
   3568 		error = ENETUNREACH;
   3569 		/* FALLTHRU */
   3570 	default:
   3571 		mutex_enter(&connp->conn_lock);
   3572 		/*
   3573 		 * Clear the source and v6lastdst so we call ip_attr_connect
   3574 		 * for the next packet and try to pick a better source.
   3575 		 */
   3576 		if (connp->conn_mcbc_bind)
   3577 			connp->conn_saddr_v6 = ipv6_all_zeros;
   3578 		else
   3579 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
   3580 		connp->conn_v6lastdst = ipv6_all_zeros;
   3581 		mutex_exit(&connp->conn_lock);
   3582 		break;
   3583 	}
   3584 done:
   3585 	ixa_refrele(ixa);
   3586 	ip_pkt_free(ipp);
   3587 	kmem_free(ipp, sizeof (*ipp));
   3588 	return (error);
   3589 }
   3590 
   3591 /*
   3592  * Handle sending an M_DATA for a connected socket.
   3593  * Handles both IPv4 and IPv6.
   3594  */
   3595 int
   3596 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
   3597 {
   3598 	icmp_t		*icmp = connp->conn_icmp;
   3599 	icmp_stack_t	*is = icmp->icmp_is;
   3600 	int		error;
   3601 	ip_xmit_attr_t	*ixa;
   3602 	boolean_t	do_ipsec;
   3603 
   3604 	/*
   3605 	 * If no other thread is using conn_ixa this just gets a reference to
   3606 	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
   3607 	 */
   3608 	ixa = conn_get_ixa(connp, B_FALSE);
   3609 	if (ixa == NULL) {
   3610 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3611 		freemsg(mp);
   3612 		return (ENOMEM);
   3613 	}
   3614 
   3615 	ASSERT(cr != NULL);
   3616 	ixa->ixa_cred = cr;
   3617 	ixa->ixa_cpid = pid;
   3618 
   3619 	/* Defer IPsec if it might need to look at ICMP type/code */
   3620 	switch (ixa->ixa_protocol) {
   3621 	case IPPROTO_ICMP:
   3622 	case IPPROTO_ICMPV6:
   3623 		do_ipsec = B_FALSE;
   3624 		break;
   3625 	default:
   3626 		do_ipsec = B_TRUE;
   3627 	}
   3628 
   3629 	mutex_enter(&connp->conn_lock);
   3630 	mp = icmp_prepend_header_template(connp, ixa, mp,
   3631 	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
   3632 
   3633 	if (mp == NULL) {
   3634 		ASSERT(error != 0);
   3635 		mutex_exit(&connp->conn_lock);
   3636 		ixa_refrele(ixa);
   3637 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3638 		freemsg(mp);
   3639 		return (error);
   3640 	}
   3641 
   3642 	if (!do_ipsec) {
   3643 		/* Policy might differ for different ICMP type/code */
   3644 		mp = icmp_output_attach_policy(mp, connp, ixa);
   3645 		if (mp == NULL) {
   3646 			mutex_exit(&connp->conn_lock);
   3647 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3648 			ixa_refrele(ixa);
   3649 			return (EHOSTUNREACH);	/* IPsec policy failure */
   3650 		}
   3651 	}
   3652 
   3653 	/*
   3654 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
   3655 	 * safe copy, then we need to fill in any pointers in it.
   3656 	 */
   3657 	if (ixa->ixa_ire == NULL) {
   3658 		in6_addr_t	faddr, saddr;
   3659 		in6_addr_t	nexthop;
   3660 		in_port_t	fport;
   3661 
   3662 		saddr = connp->conn_saddr_v6;
   3663 		faddr = connp->conn_faddr_v6;
   3664 		fport = connp->conn_fport;
   3665 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
   3666 		mutex_exit(&connp->conn_lock);
   3667 
   3668 		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
   3669 		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
   3670 		    (do_ipsec ? IPDF_IPSEC : 0));
   3671 		switch (error) {
   3672 		case 0:
   3673 			break;
   3674 		case EADDRNOTAVAIL:
   3675 			/*
   3676 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3677 			 * Don't have the application see that errno
   3678 			 */
   3679 			error = ENETUNREACH;
   3680 			goto failed;
   3681 		case ENETDOWN:
   3682 			/*
   3683 			 * Have !ipif_addr_ready address; drop packet silently
   3684 			 * until we can get applications to not send until we
   3685 			 * are ready.
   3686 			 */
   3687 			error = 0;
   3688 			goto failed;
   3689 		case EHOSTUNREACH:
   3690 		case ENETUNREACH:
   3691 			if (ixa->ixa_ire != NULL) {
   3692 				/*
   3693 				 * Let conn_ip_output/ire_send_noroute return
   3694 				 * the error and send any local ICMP error.
   3695 				 */
   3696 				error = 0;
   3697 				break;
   3698 			}
   3699 			/* FALLTHRU */
   3700 		default:
   3701 		failed:
   3702 			ixa_refrele(ixa);
   3703 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3704 			freemsg(mp);
   3705 			return (error);
   3706 		}
   3707 	} else {
   3708 		/* Done with conn_t */
   3709 		mutex_exit(&connp->conn_lock);
   3710 	}
   3711 
   3712 	/* We're done.  Pass the packet to ip. */
   3713 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
   3714 
   3715 	error = conn_ip_output(mp, ixa);
   3716 	/* No rawipOutErrors if an error since IP increases its error counter */
   3717 	switch (error) {
   3718 	case 0:
   3719 		break;
   3720 	case EWOULDBLOCK:
   3721 		(void) ixa_check_drain_insert(connp, ixa);
   3722 		error = 0;
   3723 		break;
   3724 	case EADDRNOTAVAIL:
   3725 		/*
   3726 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3727 		 * Don't have the application see that errno
   3728 		 */
   3729 		error = ENETUNREACH;
   3730 		break;
   3731 	}
   3732 	ixa_refrele(ixa);
   3733 	return (error);
   3734 }
   3735 
   3736 /*
   3737  * Handle sending an M_DATA to the last destination.
   3738  * Handles both IPv4 and IPv6.
   3739  *
   3740  * NOTE: The caller must hold conn_lock and we drop it here.
   3741  */
   3742 int
   3743 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
   3744     ip_xmit_attr_t *ixa)
   3745 {
   3746 	icmp_t		*icmp = connp->conn_icmp;
   3747 	icmp_stack_t	*is = icmp->icmp_is;
   3748 	int		error;
   3749 	boolean_t	do_ipsec;
   3750 
   3751 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   3752 	ASSERT(ixa != NULL);
   3753 
   3754 	ASSERT(cr != NULL);
   3755 	ixa->ixa_cred = cr;
   3756 	ixa->ixa_cpid = pid;
   3757 
   3758 	/* Defer IPsec if it might need to look at ICMP type/code */
   3759 	switch (ixa->ixa_protocol) {
   3760 	case IPPROTO_ICMP:
   3761 	case IPPROTO_ICMPV6:
   3762 		do_ipsec = B_FALSE;
   3763 		break;
   3764 	default:
   3765 		do_ipsec = B_TRUE;
   3766 	}
   3767 
   3768 
   3769 	mp = icmp_prepend_header_template(connp, ixa, mp,
   3770 	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
   3771 
   3772 	if (mp == NULL) {
   3773 		ASSERT(error != 0);
   3774 		mutex_exit(&connp->conn_lock);
   3775 		ixa_refrele(ixa);
   3776 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3777 		freemsg(mp);
   3778 		return (error);
   3779 	}
   3780 
   3781 	if (!do_ipsec) {
   3782 		/* Policy might differ for different ICMP type/code */
   3783 		mp = icmp_output_attach_policy(mp, connp, ixa);
   3784 		if (mp == NULL) {
   3785 			mutex_exit(&connp->conn_lock);
   3786 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3787 			ixa_refrele(ixa);
   3788 			return (EHOSTUNREACH);	/* IPsec policy failure */
   3789 		}
   3790 	}
   3791 
   3792 	/*
   3793 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
   3794 	 * safe copy, then we need to fill in any pointers in it.
   3795 	 */
   3796 	if (ixa->ixa_ire == NULL) {
   3797 		in6_addr_t	lastdst, lastsrc;
   3798 		in6_addr_t	nexthop;
   3799 		in_port_t	lastport;
   3800 
   3801 		lastsrc = connp->conn_v6lastsrc;
   3802 		lastdst = connp->conn_v6lastdst;
   3803 		lastport = connp->conn_lastdstport;
   3804 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
   3805 		mutex_exit(&connp->conn_lock);
   3806 
   3807 		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
   3808 		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
   3809 		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
   3810 		switch (error) {
   3811 		case 0:
   3812 			break;
   3813 		case EADDRNOTAVAIL:
   3814 			/*
   3815 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3816 			 * Don't have the application see that errno
   3817 			 */
   3818 			error = ENETUNREACH;
   3819 			goto failed;
   3820 		case ENETDOWN:
   3821 			/*
   3822 			 * Have !ipif_addr_ready address; drop packet silently
   3823 			 * until we can get applications to not send until we
   3824 			 * are ready.
   3825 			 */
   3826 			error = 0;
   3827 			goto failed;
   3828 		case EHOSTUNREACH:
   3829 		case ENETUNREACH:
   3830 			if (ixa->ixa_ire != NULL) {
   3831 				/*
   3832 				 * Let conn_ip_output/ire_send_noroute return
   3833 				 * the error and send any local ICMP error.
   3834 				 */
   3835 				error = 0;
   3836 				break;
   3837 			}
   3838 			/* FALLTHRU */
   3839 		default:
   3840 		failed:
   3841 			ixa_refrele(ixa);
   3842 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   3843 			freemsg(mp);
   3844 			return (error);
   3845 		}
   3846 	} else {
   3847 		/* Done with conn_t */
   3848 		mutex_exit(&connp->conn_lock);
   3849 	}
   3850 
   3851 	/* We're done.  Pass the packet to ip. */
   3852 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
   3853 	error = conn_ip_output(mp, ixa);
   3854 	/* No rawipOutErrors if an error since IP increases its error counter */
   3855 	switch (error) {
   3856 	case 0:
   3857 		break;
   3858 	case EWOULDBLOCK:
   3859 		(void) ixa_check_drain_insert(connp, ixa);
   3860 		error = 0;
   3861 		break;
   3862 	case EADDRNOTAVAIL:
   3863 		/*
   3864 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   3865 		 * Don't have the application see that errno
   3866 		 */
   3867 		error = ENETUNREACH;
   3868 		/* FALLTHRU */
   3869 	default:
   3870 		mutex_enter(&connp->conn_lock);
   3871 		/*
   3872 		 * Clear the source and v6lastdst so we call ip_attr_connect
   3873 		 * for the next packet and try to pick a better source.
   3874 		 */
   3875 		if (connp->conn_mcbc_bind)
   3876 			connp->conn_saddr_v6 = ipv6_all_zeros;
   3877 		else
   3878 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
   3879 		connp->conn_v6lastdst = ipv6_all_zeros;
   3880 		mutex_exit(&connp->conn_lock);
   3881 		break;
   3882 	}
   3883 	ixa_refrele(ixa);
   3884 	return (error);
   3885 }
   3886 
   3887 
   3888 /*
   3889  * Prepend the header template and then fill in the source and
   3890  * flowinfo. The caller needs to handle the destination address since
   3891  * it's setting is different if rthdr or source route.
   3892  *
   3893  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
   3894  * When it returns NULL it sets errorp.
   3895  */
   3896 static mblk_t *
   3897 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
   3898     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
   3899 {
   3900 	icmp_t		*icmp = connp->conn_icmp;
   3901 	icmp_stack_t	*is = icmp->icmp_is;
   3902 	uint_t		pktlen;
   3903 	uint_t		copylen;
   3904 	uint8_t		*iph;
   3905 	uint_t		ip_hdr_length;
   3906 	uint32_t	cksum;
   3907 	ip_pkt_t	*ipp;
   3908 
   3909 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   3910 
   3911 	/*
   3912 	 * Copy the header template.
   3913 	 */
   3914 	copylen = connp->conn_ht_iphc_len;
   3915 	pktlen = copylen + msgdsize(mp);
   3916 	if (pktlen > IP_MAXPACKET) {
   3917 		freemsg(mp);
   3918 		*errorp = EMSGSIZE;
   3919 		return (NULL);
   3920 	}
   3921 	ixa->ixa_pktlen = pktlen;
   3922 
   3923 	/* check/fix buffer config, setup pointers into it */
   3924 	iph = mp->b_rptr - copylen;
   3925 	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
   3926 		mblk_t *mp1;
   3927 
   3928 		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
   3929 		if (mp1 == NULL) {
   3930 			freemsg(mp);
   3931 			*errorp = ENOMEM;
   3932 			return (NULL);
   3933 		}
   3934 		mp1->b_wptr = DB_LIM(mp1);
   3935 		mp1->b_cont = mp;
   3936 		mp = mp1;
   3937 		iph = (mp->b_wptr - copylen);
   3938 	}
   3939 	mp->b_rptr = iph;
   3940 	bcopy(connp->conn_ht_iphc, iph, copylen);
   3941 	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
   3942 
   3943 	ixa->ixa_ip_hdr_length = ip_hdr_length;
   3944 
   3945 	/*
   3946 	 * Prepare for ICMPv6 checksum done in IP.
   3947 	 *
   3948 	 * icmp_build_hdr_template has already massaged any routing header
   3949 	 * and placed the result in conn_sum.
   3950 	 *
   3951 	 * We make it easy for IP to include our pseudo header
   3952 	 * by putting our length (and any routing header adjustment)
   3953 	 * in the ICMPv6 checksum field.
   3954 	 */
   3955 	cksum = pktlen - ip_hdr_length;
   3956 
   3957 	cksum += connp->conn_sum;
   3958 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   3959 	ASSERT(cksum < 0x10000);
   3960 
   3961 	ipp = &connp->conn_xmit_ipp;
   3962 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
   3963 		ipha_t	*ipha = (ipha_t *)iph;
   3964 
   3965 		ipha->ipha_length = htons((uint16_t)pktlen);
   3966 
   3967 		/* if IP_PKTINFO specified an addres it wins over bind() */
   3968 		if ((ipp->ipp_fields & IPPF_ADDR) &&
   3969 		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
   3970 			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
   3971 			ipha->ipha_src = ipp->ipp_addr_v4;
   3972 		} else {
   3973 			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
   3974 		}
   3975 	} else {
   3976 		ip6_t *ip6h = (ip6_t *)iph;
   3977 		uint_t	cksum_offset = 0;
   3978 
   3979 		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
   3980 
   3981 		/* if IP_PKTINFO specified an addres it wins over bind() */
   3982 		if ((ipp->ipp_fields & IPPF_ADDR) &&
   3983 		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
   3984 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
   3985 			ip6h->ip6_src = ipp->ipp_addr;
   3986 		} else {
   3987 			ip6h->ip6_src = *v6src;
   3988 		}
   3989 		ip6h->ip6_vcf =
   3990 		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
   3991 		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
   3992 		if (ipp->ipp_fields & IPPF_TCLASS) {
   3993 			/* Overrides the class part of flowinfo */
   3994 			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
   3995 			    ipp->ipp_tclass);
   3996 		}
   3997 
   3998 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
   3999 			if (connp->conn_proto == IPPROTO_ICMPV6) {
   4000 				cksum_offset = ixa->ixa_ip_hdr_length +
   4001 				    offsetof(icmp6_t, icmp6_cksum);
   4002 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
   4003 				cksum_offset = ixa->ixa_ip_hdr_length +
   4004 				    ixa->ixa_raw_cksum_offset;
   4005 			}
   4006 		}
   4007 		if (cksum_offset != 0) {
   4008 			uint16_t *ptr;
   4009 
   4010 			/* Make sure the checksum fits in the first mblk */
   4011 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
   4012 				mblk_t *mp1;
   4013 
   4014 				mp1 = msgpullup(mp,
   4015 				    cksum_offset + sizeof (short));
   4016 				freemsg(mp);
   4017 				if (mp1 == NULL) {
   4018 					*errorp = ENOMEM;
   4019 					return (NULL);
   4020 				}
   4021 				mp = mp1;
   4022 				iph = mp->b_rptr;
   4023 				ip6h = (ip6_t *)iph;
   4024 			}
   4025 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
   4026 			*ptr = htons(cksum);
   4027 		}
   4028 	}
   4029 
   4030 	return (mp);
   4031 }
   4032 
   4033 /*
   4034  * This routine handles all messages passed downstream.  It either
   4035  * consumes the message or passes it downstream; it never queues a
   4036  * a message.
   4037  */
   4038 void
   4039 icmp_wput(queue_t *q, mblk_t *mp)
   4040 {
   4041 	sin6_t		*sin6;
   4042 	sin_t		*sin = NULL;
   4043 	uint_t		srcid;
   4044 	conn_t		*connp = Q_TO_CONN(q);
   4045 	icmp_t		*icmp = connp->conn_icmp;
   4046 	int		error = 0;
   4047 	struct sockaddr	*addr = NULL;
   4048 	socklen_t	addrlen;
   4049 	icmp_stack_t	*is = icmp->icmp_is;
   4050 	struct T_unitdata_req *tudr;
   4051 	mblk_t		*data_mp;
   4052 	cred_t		*cr;
   4053 	pid_t		pid;
   4054 
   4055 	/*
   4056 	 * We directly handle several cases here: T_UNITDATA_REQ message
   4057 	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
   4058 	 * socket.
   4059 	 */
   4060 	switch (DB_TYPE(mp)) {
   4061 	case M_DATA:
   4062 		/* sockfs never sends down M_DATA */
   4063 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   4064 		freemsg(mp);
   4065 		return;
   4066 
   4067 	case M_PROTO:
   4068 	case M_PCPROTO:
   4069 		tudr = (struct T_unitdata_req *)mp->b_rptr;
   4070 		if (MBLKL(mp) < sizeof (*tudr) ||
   4071 		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
   4072 			icmp_wput_other(q, mp);
   4073 			return;
   4074 		}
   4075 		break;
   4076 
   4077 	default:
   4078 		icmp_wput_other(q, mp);
   4079 		return;
   4080 	}
   4081 
   4082 	/* Handle valid T_UNITDATA_REQ here */
   4083 	data_mp = mp->b_cont;
   4084 	if (data_mp == NULL) {
   4085 		error = EPROTO;
   4086 		goto ud_error2;
   4087 	}
   4088 	mp->b_cont = NULL;
   4089 
   4090 	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
   4091 		error = EADDRNOTAVAIL;
   4092 		goto ud_error2;
   4093 	}
   4094 
   4095 	/*
   4096 	 * All Solaris components should pass a db_credp
   4097 	 * for this message, hence we ASSERT.
   4098 	 * On production kernels we return an error to be robust against
   4099 	 * random streams modules sitting on top of us.
   4100 	 */
   4101 	cr = msg_getcred(mp, &pid);
   4102 	ASSERT(cr != NULL);
   4103 	if (cr == NULL) {
   4104 		error = EINVAL;
   4105 		goto ud_error2;
   4106 	}
   4107 
   4108 	/*
   4109 	 * If a port has not been bound to the stream, fail.
   4110 	 * This is not a problem when sockfs is directly
   4111 	 * above us, because it will ensure that the socket
   4112 	 * is first bound before allowing data to be sent.
   4113 	 */
   4114 	if (icmp->icmp_state == TS_UNBND) {
   4115 		error = EPROTO;
   4116 		goto ud_error2;
   4117 	}
   4118 	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
   4119 	addrlen = tudr->DEST_length;
   4120 
   4121 	switch (connp->conn_family) {
   4122 	case AF_INET6:
   4123 		sin6 = (sin6_t *)addr;
   4124 		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
   4125 		    (sin6->sin6_family != AF_INET6)) {
   4126 			error = EADDRNOTAVAIL;
   4127 			goto ud_error2;
   4128 		}
   4129 
   4130 		/* No support for mapped addresses on raw sockets */
   4131 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
   4132 			error = EADDRNOTAVAIL;
   4133 			goto ud_error2;
   4134 		}
   4135 		srcid = sin6->__sin6_src_id;
   4136 
   4137 		/*
   4138 		 * If the local address is a mapped address return
   4139 		 * an error.
   4140 		 * It would be possible to send an IPv6 packet but the
   4141 		 * response would never make it back to the application
   4142 		 * since it is bound to a mapped address.
   4143 		 */
   4144 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
   4145 			error = EADDRNOTAVAIL;
   4146 			goto ud_error2;
   4147 		}
   4148 
   4149 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
   4150 			sin6->sin6_addr = ipv6_loopback;
   4151 
   4152 		if (tudr->OPT_length != 0) {
   4153 			/*
   4154 			 * If we are connected then the destination needs to be
   4155 			 * the same as the connected one.
   4156 			 */
   4157 			if (icmp->icmp_state == TS_DATA_XFER &&
   4158 			    !conn_same_as_last_v6(connp, sin6)) {
   4159 				error = EISCONN;
   4160 				goto ud_error2;
   4161 			}
   4162 			error = icmp_output_ancillary(connp, NULL, sin6,
   4163 			    data_mp, mp, NULL, cr, pid);
   4164 		} else {
   4165 			ip_xmit_attr_t *ixa;
   4166 
   4167 			/*
   4168 			 * We have to allocate an ip_xmit_attr_t before we grab
   4169 			 * conn_lock and we need to hold conn_lock once we've
   4170 			 * checked conn_same_as_last_v6 to handle concurrent
   4171 			 * send* calls on a socket.
   4172 			 */
   4173 			ixa = conn_get_ixa(connp, B_FALSE);
   4174 			if (ixa == NULL) {
   4175 				error = ENOMEM;
   4176 				goto ud_error2;
   4177 			}
   4178 			mutex_enter(&connp->conn_lock);
   4179 
   4180 			if (conn_same_as_last_v6(connp, sin6) &&
   4181 			    connp->conn_lastsrcid == srcid &&
   4182 			    ipsec_outbound_policy_current(ixa)) {
   4183 				/* icmp_output_lastdst drops conn_lock */
   4184 				error = icmp_output_lastdst(connp, data_mp, cr,
   4185 				    pid, ixa);
   4186 			} else {
   4187 				/* icmp_output_newdst drops conn_lock */
   4188 				error = icmp_output_newdst(connp, data_mp, NULL,
   4189 				    sin6, cr, pid, ixa);
   4190 			}
   4191 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
   4192 		}
   4193 		if (error == 0) {
   4194 			freeb(mp);
   4195 			return;
   4196 		}
   4197 		break;
   4198 
   4199 	case AF_INET:
   4200 		sin = (sin_t *)addr;
   4201 		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
   4202 		    (sin->sin_family != AF_INET)) {
   4203 			error = EADDRNOTAVAIL;
   4204 			goto ud_error2;
   4205 		}
   4206 		if (sin->sin_addr.s_addr == INADDR_ANY)
   4207 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
   4208 
   4209 		/* Protocol 255 contains full IP headers */
   4210 		/* Read without holding lock */
   4211 		if (icmp->icmp_hdrincl) {
   4212 			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
   4213 				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
   4214 					error = EINVAL;
   4215 					goto ud_error2;
   4216 				}
   4217 			}
   4218 			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
   4219 			if (error == 0) {
   4220 				freeb(mp);
   4221 				return;
   4222 			}
   4223 			/* data_mp consumed above */
   4224 			data_mp = NULL;
   4225 			goto ud_error2;
   4226 		}
   4227 
   4228 		if (tudr->OPT_length != 0) {
   4229 			/*
   4230 			 * If we are connected then the destination needs to be
   4231 			 * the same as the connected one.
   4232 			 */
   4233 			if (icmp->icmp_state == TS_DATA_XFER &&
   4234 			    !conn_same_as_last_v4(connp, sin)) {
   4235 				error = EISCONN;
   4236 				goto ud_error2;
   4237 			}
   4238 			error = icmp_output_ancillary(connp, sin, NULL,
   4239 			    data_mp, mp, NULL, cr, pid);
   4240 		} else {
   4241 			ip_xmit_attr_t *ixa;
   4242 
   4243 			/*
   4244 			 * We have to allocate an ip_xmit_attr_t before we grab
   4245 			 * conn_lock and we need to hold conn_lock once we've
   4246 			 * checked conn_same_as_last_v4 to handle concurrent
   4247 			 * send* calls on a socket.
   4248 			 */
   4249 			ixa = conn_get_ixa(connp, B_FALSE);
   4250 			if (ixa == NULL) {
   4251 				error = ENOMEM;
   4252 				goto ud_error2;
   4253 			}
   4254 			mutex_enter(&connp->conn_lock);
   4255 
   4256 			if (conn_same_as_last_v4(connp, sin) &&
   4257 			    ipsec_outbound_policy_current(ixa)) {
   4258 				/* icmp_output_lastdst drops conn_lock */
   4259 				error = icmp_output_lastdst(connp, data_mp, cr,
   4260 				    pid, ixa);
   4261 			} else {
   4262 				/* icmp_output_newdst drops conn_lock */
   4263 				error = icmp_output_newdst(connp, data_mp, sin,
   4264 				    NULL, cr, pid, ixa);
   4265 			}
   4266 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
   4267 		}
   4268 		if (error == 0) {
   4269 			freeb(mp);
   4270 			return;
   4271 		}
   4272 		break;
   4273 	}
   4274 	ASSERT(mp != NULL);
   4275 	/* mp is freed by the following routine */
   4276 	icmp_ud_err(q, mp, (t_scalar_t)error);
   4277 	return;
   4278 
   4279 ud_error2:
   4280 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   4281 	freemsg(data_mp);
   4282 	ASSERT(mp != NULL);
   4283 	/* mp is freed by the following routine */
   4284 	icmp_ud_err(q, mp, (t_scalar_t)error);
   4285 }
   4286 
   4287 /*
   4288  * Handle the case of the IP address or flow label being different
   4289  * for both IPv4 and IPv6.
   4290  *
   4291  * NOTE: The caller must hold conn_lock and we drop it here.
   4292  */
   4293 static int
   4294 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
   4295     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
   4296 {
   4297 	icmp_t		*icmp = connp->conn_icmp;
   4298 	icmp_stack_t	*is = icmp->icmp_is;
   4299 	int		error;
   4300 	ip_xmit_attr_t	*oldixa;
   4301 	boolean_t	do_ipsec;
   4302 	uint_t		srcid;
   4303 	uint32_t	flowinfo;
   4304 	in6_addr_t	v6src;
   4305 	in6_addr_t	v6dst;
   4306 	in6_addr_t	v6nexthop;
   4307 	in_port_t	dstport;
   4308 
   4309 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   4310 	ASSERT(ixa != NULL);
   4311 
   4312 	/*
   4313 	 * We hold conn_lock across all the use and modifications of
   4314 	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
   4315 	 * stay consistent.
   4316 	 */
   4317 
   4318 	ASSERT(cr != NULL);
   4319 	ixa->ixa_cred = cr;
   4320 	ixa->ixa_cpid = pid;
   4321 	if (is_system_labeled()) {
   4322 		/* We need to restart with a label based on the cred */
   4323 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
   4324 	}
   4325 	/*
   4326 	 * If we are connected then the destination needs to be the
   4327 	 * same as the connected one, which is not the case here since we
   4328 	 * checked for that above.
   4329 	 */
   4330 	if (icmp->icmp_state == TS_DATA_XFER) {
   4331 		mutex_exit(&connp->conn_lock);
   4332 		error = EISCONN;
   4333 		goto ud_error;
   4334 	}
   4335 
   4336 	/* In case previous destination was multicast or multirt */
   4337 	ip_attr_newdst(ixa);
   4338 
   4339 	/*
   4340 	 * If laddr is unspecified then we look at sin6_src_id.
   4341 	 * We will give precedence to a source address set with IPV6_PKTINFO
   4342 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
   4343 	 * want ip_attr_connect to select a source (since it can fail) when
   4344 	 * IPV6_PKTINFO is specified.
   4345 	 * If this doesn't result in a source address then we get a source
   4346 	 * from ip_attr_connect() below.
   4347 	 */
   4348 	v6src = connp->conn_saddr_v6;
   4349 	if (sin != NULL) {
   4350 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
   4351 		dstport = sin->sin_port;
   4352 		flowinfo = 0;
   4353 		srcid = 0;
   4354 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   4355 		if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
   4356 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
   4357 			    connp->conn_netstack);
   4358 		}
   4359 		ixa->ixa_flags |= IXAF_IS_IPV4;
   4360 	} else {
   4361 		v6dst = sin6->sin6_addr;
   4362 		dstport = sin6->sin6_port;
   4363 		flowinfo = sin6->sin6_flowinfo;
   4364 		srcid = sin6->__sin6_src_id;
   4365 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
   4366 			ixa->ixa_scopeid = sin6->sin6_scope_id;
   4367 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
   4368 		} else {
   4369 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   4370 		}
   4371 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
   4372 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
   4373 			    connp->conn_netstack);
   4374 		}
   4375 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
   4376 			ixa->ixa_flags |= IXAF_IS_IPV4;
   4377 		else
   4378 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
   4379 	}
   4380 	/* Handle IPV6_PKTINFO setting source address. */
   4381 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
   4382 	    (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
   4383 		ip_pkt_t *ipp = &connp->conn_xmit_ipp;
   4384 
   4385 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
   4386 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
   4387 				v6src = ipp->ipp_addr;
   4388 		} else {
   4389 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
   4390 				v6src = ipp->ipp_addr;
   4391 		}
   4392 	}
   4393 
   4394 	/* Defer IPsec if it might need to look at ICMP type/code */
   4395 	switch (ixa->ixa_protocol) {
   4396 	case IPPROTO_ICMP:
   4397 	case IPPROTO_ICMPV6:
   4398 		do_ipsec = B_FALSE;
   4399 		break;
   4400 	default:
   4401 		do_ipsec = B_TRUE;
   4402 	}
   4403 
   4404 	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
   4405 	mutex_exit(&connp->conn_lock);
   4406 
   4407 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
   4408 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
   4409 	    (do_ipsec ? IPDF_IPSEC : 0));
   4410 	switch (error) {
   4411 	case 0:
   4412 		break;
   4413 	case EADDRNOTAVAIL:
   4414 		/*
   4415 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   4416 		 * Don't have the application see that errno
   4417 		 */
   4418 		error = ENETUNREACH;
   4419 		goto failed;
   4420 	case ENETDOWN:
   4421 		/*
   4422 		 * Have !ipif_addr_ready address; drop packet silently
   4423 		 * until we can get applications to not send until we
   4424 		 * are ready.
   4425 		 */
   4426 		error = 0;
   4427 		goto failed;
   4428 	case EHOSTUNREACH:
   4429 	case ENETUNREACH:
   4430 		if (ixa->ixa_ire != NULL) {
   4431 			/*
   4432 			 * Let conn_ip_output/ire_send_noroute return
   4433 			 * the error and send any local ICMP error.
   4434 			 */
   4435 			error = 0;
   4436 			break;
   4437 		}
   4438 		/* FALLTHRU */
   4439 	default:
   4440 	failed:
   4441 		goto ud_error;
   4442 	}
   4443 
   4444 	mutex_enter(&connp->conn_lock);
   4445 	/*
   4446 	 * While we dropped the lock some other thread might have connected
   4447 	 * this socket. If so we bail out with EISCONN to ensure that the
   4448 	 * connecting thread is the one that updates conn_ixa, conn_ht_*
   4449 	 * and conn_*last*.
   4450 	 */
   4451 	if (icmp->icmp_state == TS_DATA_XFER) {
   4452 		mutex_exit(&connp->conn_lock);
   4453 		error = EISCONN;
   4454 		goto ud_error;
   4455 	}
   4456 
   4457 	/*
   4458 	 * We need to rebuild the headers if
   4459 	 *  - we are labeling packets (could be different for different
   4460 	 *    destinations)
   4461 	 *  - we have a source route (or routing header) since we need to
   4462 	 *    massage that to get the pseudo-header checksum
   4463 	 *  - a socket option with COA_HEADER_CHANGED has been set which
   4464 	 *    set conn_v6lastdst to zero.
   4465 	 *
   4466 	 * Otherwise the prepend function will just update the src, dst,
   4467 	 * and flow label.
   4468 	 */
   4469 	if (is_system_labeled()) {
   4470 		/* TX MLP requires SCM_UCRED and don't have that here */
   4471 		if (connp->conn_mlp_type != mlptSingle) {
   4472 			mutex_exit(&connp->conn_lock);
   4473 			error = ECONNREFUSED;
   4474 			goto ud_error;
   4475 		}
   4476 		/*
   4477 		 * Check whether Trusted Solaris policy allows communication
   4478 		 * with this host, and pretend that the destination is
   4479 		 * unreachable if not.
   4480 		 * Compute any needed label and place it in ipp_label_v4/v6.
   4481 		 *
   4482 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
   4483 		 * ipp_label_v4/v6 to form the packet.
   4484 		 *
   4485 		 * Tsol note: Since we hold conn_lock we know no other
   4486 		 * thread manipulates conn_xmit_ipp.
   4487 		 */
   4488 		error = conn_update_label(connp, ixa, &v6dst,
   4489 		    &connp->conn_xmit_ipp);
   4490 		if (error != 0) {
   4491 			mutex_exit(&connp->conn_lock);
   4492 			goto ud_error;
   4493 		}
   4494 		/* Rebuild the header template */
   4495 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
   4496 		    flowinfo);
   4497 		if (error != 0) {
   4498 			mutex_exit(&connp->conn_lock);
   4499 			goto ud_error;
   4500 		}
   4501 	} else if (connp->conn_xmit_ipp.ipp_fields &
   4502 	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
   4503 	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
   4504 		/* Rebuild the header template */
   4505 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
   4506 		    flowinfo);
   4507 		if (error != 0) {
   4508 			mutex_exit(&connp->conn_lock);
   4509 			goto ud_error;
   4510 		}
   4511 	} else {
   4512 		/* Simply update the destination address if no source route */
   4513 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
   4514 			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;
   4515 
   4516 			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
   4517 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
   4518 				ipha->ipha_fragment_offset_and_flags |=
   4519 				    IPH_DF_HTONS;
   4520 			} else {
   4521 				ipha->ipha_fragment_offset_and_flags &=
   4522 				    ~IPH_DF_HTONS;
   4523 			}
   4524 		} else {
   4525 			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
   4526 			ip6h->ip6_dst = v6dst;
   4527 		}
   4528 	}
   4529 
   4530 	/*
   4531 	 * Remember the dst etc which corresponds to the built header
   4532 	 * template and conn_ixa.
   4533 	 */
   4534 	oldixa = conn_replace_ixa(connp, ixa);
   4535 	connp->conn_v6lastdst = v6dst;
   4536 	connp->conn_lastflowinfo = flowinfo;
   4537 	connp->conn_lastscopeid = ixa->ixa_scopeid;
   4538 	connp->conn_lastsrcid = srcid;
   4539 	/* Also remember a source to use together with lastdst */
   4540 	connp->conn_v6lastsrc = v6src;
   4541 
   4542 	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
   4543 	    flowinfo, &error);
   4544 
   4545 	/* Done with conn_t */
   4546 	mutex_exit(&connp->conn_lock);
   4547 	ixa_refrele(oldixa);
   4548 
   4549 	if (data_mp == NULL) {
   4550 		ASSERT(error != 0);
   4551 		goto ud_error;
   4552 	}
   4553 
   4554 	if (!do_ipsec) {
   4555 		/* Policy might differ for different ICMP type/code */
   4556 		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
   4557 		if (data_mp == NULL) {
   4558 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   4559 			error = EHOSTUNREACH;	/* IPsec policy failure */
   4560 			goto done;
   4561 		}
   4562 	}
   4563 
   4564 	/* We're done.  Pass the packet to ip. */
   4565 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
   4566 
   4567 	error = conn_ip_output(data_mp, ixa);
   4568 	/* No rawipOutErrors if an error since IP increases its error counter */
   4569 	switch (error) {
   4570 	case 0:
   4571 		break;
   4572 	case EWOULDBLOCK:
   4573 		(void) ixa_check_drain_insert(connp, ixa);
   4574 		error = 0;
   4575 		break;
   4576 	case EADDRNOTAVAIL:
   4577 		/*
   4578 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
   4579 		 * Don't have the application see that errno
   4580 		 */
   4581 		error = ENETUNREACH;
   4582 		/* FALLTHRU */
   4583 	default:
   4584 		mutex_enter(&connp->conn_lock);
   4585 		/*
   4586 		 * Clear the source and v6lastdst so we call ip_attr_connect
   4587 		 * for the next packet and try to pick a better source.
   4588 		 */
   4589 		if (connp->conn_mcbc_bind)
   4590 			connp->conn_saddr_v6 = ipv6_all_zeros;
   4591 		else
   4592 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
   4593 		connp->conn_v6lastdst = ipv6_all_zeros;
   4594 		mutex_exit(&connp->conn_lock);
   4595 		break;
   4596 	}
   4597 done:
   4598 	ixa_refrele(ixa);
   4599 	return (error);
   4600 
   4601 ud_error:
   4602 	if (ixa != NULL)
   4603 		ixa_refrele(ixa);
   4604 
   4605 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   4606 	freemsg(data_mp);
   4607 	return (error);
   4608 }
   4609 
   4610 /* ARGSUSED */
   4611 static void
   4612 icmp_wput_fallback(queue_t *q, mblk_t *mp)
   4613 {
   4614 #ifdef DEBUG
   4615 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
   4616 #endif
   4617 	freemsg(mp);
   4618 }
   4619 
   4620 static void
   4621 icmp_wput_other(queue_t *q, mblk_t *mp)
   4622 {
   4623 	uchar_t	*rptr = mp->b_rptr;
   4624 	struct iocblk *iocp;
   4625 	conn_t	*connp = Q_TO_CONN(q);
   4626 	icmp_t	*icmp = connp->conn_icmp;
   4627 	icmp_stack_t *is = icmp->icmp_is;
   4628 	cred_t *cr;
   4629 
   4630 	switch (mp->b_datap->db_type) {
   4631 	case M_PROTO:
   4632 	case M_PCPROTO:
   4633 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
   4634 			/*
   4635 			 * If the message does not contain a PRIM_type,
   4636 			 * throw it away.
   4637 			 */
   4638 			freemsg(mp);
   4639 			return;
   4640 		}
   4641 		switch (((t_primp_t)rptr)->type) {
   4642 		case T_ADDR_REQ:
   4643 			icmp_addr_req(q, mp);
   4644 			return;
   4645 		case O_T_BIND_REQ:
   4646 		case T_BIND_REQ:
   4647 			icmp_tpi_bind(q, mp);
   4648 			return;
   4649 		case T_CONN_REQ:
   4650 			icmp_tpi_connect(q, mp);
   4651 			return;
   4652 		case T_CAPABILITY_REQ:
   4653 			icmp_capability_req(q, mp);
   4654 			return;
   4655 		case T_INFO_REQ:
   4656 			icmp_info_req(q, mp);
   4657 			return;
   4658 		case T_UNITDATA_REQ:
   4659 			/*
   4660 			 * If a T_UNITDATA_REQ gets here, the address must
   4661 			 * be bad.  Valid T_UNITDATA_REQs are handled
   4662 			 * in icmp_wput.
   4663 			 */
   4664 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
   4665 			return;
   4666 		case T_UNBIND_REQ:
   4667 			icmp_tpi_unbind(q, mp);
   4668 			return;
   4669 		case T_SVR4_OPTMGMT_REQ:
   4670 			/*
   4671 			 * All Solaris components should pass a db_credp
   4672 			 * for this TPI message, hence we ASSERT.
   4673 			 * But in case there is some other M_PROTO that looks
   4674 			 * like a TPI message sent by some other kernel
   4675 			 * component, we check and return an error.
   4676 			 */
   4677 			cr = msg_getcred(mp, NULL);
   4678 			ASSERT(cr != NULL);
   4679 			if (cr == NULL) {
   4680 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
   4681 				return;
   4682 			}
   4683 
   4684 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
   4685 			    cr)) {
   4686 				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
   4687 			}
   4688 			return;
   4689 
   4690 		case T_OPTMGMT_REQ:
   4691 			/*
   4692 			 * All Solaris components should pass a db_credp
   4693 			 * for this TPI message, hence we ASSERT.
   4694 			 * But in case there is some other M_PROTO that looks
   4695 			 * like a TPI message sent by some other kernel
   4696 			 * component, we check and return an error.
   4697 			 */
   4698 			cr = msg_getcred(mp, NULL);
   4699 			ASSERT(cr != NULL);
   4700 			if (cr == NULL) {
   4701 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
   4702 				return;
   4703 			}
   4704 			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
   4705 			return;
   4706 
   4707 		case T_DISCON_REQ:
   4708 			icmp_tpi_disconnect(q, mp);
   4709 			return;
   4710 
   4711 		/* The following TPI message is not supported by icmp. */
   4712 		case O_T_CONN_RES:
   4713 		case T_CONN_RES:
   4714 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
   4715 			return;
   4716 
   4717 		/* The following 3 TPI requests are illegal for icmp. */
   4718 		case T_DATA_REQ:
   4719 		case T_EXDATA_REQ:
   4720 		case T_ORDREL_REQ:
   4721 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
   4722 			return;
   4723 		default:
   4724 			break;
   4725 		}
   4726 		break;
   4727 	case M_FLUSH:
   4728 		if (*rptr & FLUSHW)
   4729 			flushq(q, FLUSHDATA);
   4730 		break;
   4731 	case M_IOCTL:
   4732 		iocp = (struct iocblk *)mp->b_rptr;
   4733 		switch (iocp->ioc_cmd) {
   4734 		case TI_GETPEERNAME:
   4735 			if (icmp->icmp_state != TS_DATA_XFER) {
   4736 				/*
   4737 				 * If a default destination address has not
   4738 				 * been associated with the stream, then we
   4739 				 * don't know the peer's name.
   4740 				 */
   4741 				iocp->ioc_error = ENOTCONN;
   4742 				iocp->ioc_count = 0;
   4743 				mp->b_datap->db_type = M_IOCACK;
   4744 				qreply(q, mp);
   4745 				return;
   4746 			}
   4747 			/* FALLTHRU */
   4748 		case TI_GETMYNAME:
   4749 			/*
   4750 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
   4751 			 * need to copyin the user's strbuf structure.
   4752 			 * Processing will continue in the M_IOCDATA case
   4753 			 * below.
   4754 			 */
   4755 			mi_copyin(q, mp, NULL,
   4756 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
   4757 			return;
   4758 		case ND_SET:
   4759 			/* nd_getset performs the necessary checking */
   4760 		case ND_GET:
   4761 			if (nd_getset(q, is->is_nd, mp)) {
   4762 				qreply(q, mp);
   4763 				return;
   4764 			}
   4765 			break;
   4766 		default:
   4767 			break;
   4768 		}
   4769 		break;
   4770 	case M_IOCDATA:
   4771 		icmp_wput_iocdata(q, mp);
   4772 		return;
   4773 	default:
   4774 		/* Unrecognized messages are passed through without change. */
   4775 		break;
   4776 	}
   4777 	ip_wput_nondata(q, mp);
   4778 }
   4779 
   4780 /*
   4781  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
   4782  * messages.
   4783  */
   4784 static void
   4785 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
   4786 {
   4787 	mblk_t		*mp1;
   4788 	STRUCT_HANDLE(strbuf, sb);
   4789 	uint_t		addrlen;
   4790 	conn_t		*connp = Q_TO_CONN(q);
   4791 	icmp_t		*icmp = connp->conn_icmp;
   4792 
   4793 	/* Make sure it is one of ours. */
   4794 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
   4795 	case TI_GETMYNAME:
   4796 	case TI_GETPEERNAME:
   4797 		break;
   4798 	default:
   4799 		ip_wput_nondata(q, mp);
   4800 		return;
   4801 	}
   4802 
   4803 	switch (mi_copy_state(q, mp, &mp1)) {
   4804 	case -1:
   4805 		return;
   4806 	case MI_COPY_CASE(MI_COPY_IN, 1):
   4807 		break;
   4808 	case MI_COPY_CASE(MI_COPY_OUT, 1):
   4809 		/*
   4810 		 * The address has been copied out, so now
   4811 		 * copyout the strbuf.
   4812 		 */
   4813 		mi_copyout(q, mp);
   4814 		return;
   4815 	case MI_COPY_CASE(MI_COPY_OUT, 2):
   4816 		/*
   4817 		 * The address and strbuf have been copied out.
   4818 		 * We're done, so just acknowledge the original
   4819 		 * M_IOCTL.
   4820 		 */
   4821 		mi_copy_done(q, mp, 0);
   4822 		return;
   4823 	default:
   4824 		/*
   4825 		 * Something strange has happened, so acknowledge
   4826 		 * the original M_IOCTL with an EPROTO error.
   4827 		 */
   4828 		mi_copy_done(q, mp, EPROTO);
   4829 		return;
   4830 	}
   4831 
   4832 	/*
   4833 	 * Now we have the strbuf structure for TI_GETMYNAME
   4834 	 * and TI_GETPEERNAME.  Next we copyout the requested
   4835 	 * address and then we'll copyout the strbuf.
   4836 	 */
   4837 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
   4838 	    (void *)mp1->b_rptr);
   4839 
   4840 	if (connp->conn_family == AF_INET)
   4841 		addrlen = sizeof (sin_t);
   4842 	else
   4843 		addrlen = sizeof (sin6_t);
   4844 
   4845 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
   4846 		mi_copy_done(q, mp, EINVAL);
   4847 		return;
   4848 	}
   4849 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
   4850 	case TI_GETMYNAME:
   4851 		break;
   4852 	case TI_GETPEERNAME:
   4853 		if (icmp->icmp_state != TS_DATA_XFER) {
   4854 			mi_copy_done(q, mp, ENOTCONN);
   4855 			return;
   4856 		}
   4857 		break;
   4858 	default:
   4859 		mi_copy_done(q, mp, EPROTO);
   4860 		return;
   4861 	}
   4862 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
   4863 	if (!mp1)
   4864 		return;
   4865 
   4866 	STRUCT_FSET(sb, len, addrlen);
   4867 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
   4868 	case TI_GETMYNAME:
   4869 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
   4870 		    &addrlen);
   4871 		break;
   4872 	case TI_GETPEERNAME:
   4873 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
   4874 		    &addrlen);
   4875 		break;
   4876 	}
   4877 	mp1->b_wptr += addrlen;
   4878 	/* Copy out the address */
   4879 	mi_copyout(q, mp);
   4880 }
   4881 
   4882 void
   4883 icmp_ddi_g_init(void)
   4884 {
   4885 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
   4886 	    icmp_opt_obj.odb_opt_arr_cnt);
   4887 
   4888 	/*
   4889 	 * We want to be informed each time a stack is created or
   4890 	 * destroyed in the kernel, so we can maintain the
   4891 	 * set of icmp_stack_t's.
   4892 	 */
   4893 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
   4894 }
   4895 
   4896 void
   4897 icmp_ddi_g_destroy(void)
   4898 {
   4899 	netstack_unregister(NS_ICMP);
   4900 }
   4901 
   4902 #define	INET_NAME	"ip"
   4903 
   4904 /*
   4905  * Initialize the ICMP stack instance.
   4906  */
   4907 static void *
   4908 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
   4909 {
   4910 	icmp_stack_t	*is;
   4911 	icmpparam_t	*pa;
   4912 	int		error = 0;
   4913 	major_t		major;
   4914 
   4915 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
   4916 	is->is_netstack = ns;
   4917 
   4918 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
   4919 	is->is_param_arr = pa;
   4920 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
   4921 
   4922 	(void) icmp_param_register(&is->is_nd,
   4923 	    is->is_param_arr, A_CNT(icmp_param_arr));
   4924 	is->is_ksp = rawip_kstat_init(stackid);
   4925 
   4926 	major = mod_name_to_major(INET_NAME);
   4927 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
   4928 	ASSERT(error == 0);
   4929 	return (is);
   4930 }
   4931 
   4932 /*
   4933  * Free the ICMP stack instance.
   4934  */
   4935 static void
   4936 rawip_stack_fini(netstackid_t stackid, void *arg)
   4937 {
   4938 	icmp_stack_t *is = (icmp_stack_t *)arg;
   4939 
   4940 	nd_free(&is->is_nd);
   4941 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
   4942 	is->is_param_arr = NULL;
   4943 
   4944 	rawip_kstat_fini(stackid, is->is_ksp);
   4945 	is->is_ksp = NULL;
   4946 	ldi_ident_release(is->is_ldi_ident);
   4947 	kmem_free(is, sizeof (*is));
   4948 }
   4949 
   4950 static void *
   4951 rawip_kstat_init(netstackid_t stackid) {
   4952 	kstat_t	*ksp;
   4953 
   4954 	rawip_named_kstat_t template = {
   4955 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
   4956 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
   4957 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
   4958 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
   4959 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
   4960 	};
   4961 
   4962 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
   4963 					KSTAT_TYPE_NAMED,
   4964 					NUM_OF_FIELDS(rawip_named_kstat_t),
   4965 					0, stackid);
   4966 	if (ksp == NULL || ksp->ks_data == NULL)
   4967 		return (NULL);
   4968 
   4969 	bcopy(&template, ksp->ks_data, sizeof (template));
   4970 	ksp->ks_update = rawip_kstat_update;
   4971 	ksp->ks_private = (void *)(uintptr_t)stackid;
   4972 
   4973 	kstat_install(ksp);
   4974 	return (ksp);
   4975 }
   4976 
   4977 static void
   4978 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
   4979 {
   4980 	if (ksp != NULL) {
   4981 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
   4982 		kstat_delete_netstack(ksp, stackid);
   4983 	}
   4984 }
   4985 
   4986 static int
   4987 rawip_kstat_update(kstat_t *ksp, int rw)
   4988 {
   4989 	rawip_named_kstat_t *rawipkp;
   4990 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
   4991 	netstack_t	*ns;
   4992 	icmp_stack_t	*is;
   4993 
   4994 	if ((ksp == NULL) || (ksp->ks_data == NULL))
   4995 		return (EIO);
   4996 
   4997 	if (rw == KSTAT_WRITE)
   4998 		return (EACCES);
   4999 
   5000 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
   5001 
   5002 	ns = netstack_find_by_stackid(stackid);
   5003 	if (ns == NULL)
   5004 		return (-1);
   5005 	is = ns->netstack_icmp;
   5006 	if (is == NULL) {
   5007 		netstack_rele(ns);
   5008 		return (-1);
   5009 	}
   5010 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
   5011 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
   5012 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
   5013 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
   5014 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
   5015 	netstack_rele(ns);
   5016 	return (0);
   5017 }
   5018 
   5019 /* ARGSUSED */
   5020 int
   5021 rawip_accept(sock_lower_handle_t lproto_handle,
   5022     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
   5023     cred_t *cr)
   5024 {
   5025 	return (EOPNOTSUPP);
   5026 }
   5027 
   5028 /* ARGSUSED */
   5029 int
   5030 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
   5031     socklen_t len, cred_t *cr)
   5032 {
   5033 	conn_t  *connp = (conn_t *)proto_handle;
   5034 	int	error;
   5035 
   5036 	/* All Solaris components should pass a cred for this operation. */
   5037 	ASSERT(cr != NULL);
   5038 
   5039 	/* Binding to a NULL address really means unbind */
   5040 	if (sa == NULL)
   5041 		error = rawip_do_unbind(connp);
   5042 	else
   5043 		error = rawip_do_bind(connp, sa, len);
   5044 
   5045 	if (error < 0) {
   5046 		if (error == -TOUTSTATE)
   5047 			error = EINVAL;
   5048 		else
   5049 			error = proto_tlitosyserr(-error);
   5050 	}
   5051 	return (error);
   5052 }
   5053 
   5054 static int
   5055 rawip_implicit_bind(conn_t *connp)
   5056 {
   5057 	sin6_t sin6addr;
   5058 	sin_t *sin;
   5059 	sin6_t *sin6;
   5060 	socklen_t len;
   5061 	int error;
   5062 
   5063 	if (connp->conn_family == AF_INET) {
   5064 		len = sizeof (struct sockaddr_in);
   5065 		sin = (sin_t *)&sin6addr;
   5066 		*sin = sin_null;
   5067 		sin->sin_family = AF_INET;
   5068 		sin->sin_addr.s_addr = INADDR_ANY;
   5069 	} else {
   5070 		ASSERT(connp->conn_family == AF_INET6);
   5071 		len = sizeof (sin6_t);
   5072 		sin6 = (sin6_t *)&sin6addr;
   5073 		*sin6 = sin6_null;
   5074 		sin6->sin6_family = AF_INET6;
   5075 		V6_SET_ZERO(sin6->sin6_addr);
   5076 	}
   5077 
   5078 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
   5079 
   5080 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
   5081 }
   5082 
   5083 static int
   5084 rawip_unbind(conn_t *connp)
   5085 {
   5086 	int error;
   5087 
   5088 	error = rawip_do_unbind(connp);
   5089 	if (error < 0) {
   5090 		error = proto_tlitosyserr(-error);
   5091 	}
   5092 	return (error);
   5093 }
   5094 
   5095 /* ARGSUSED */
   5096 int
   5097 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
   5098 {
   5099 	return (EOPNOTSUPP);
   5100 }
   5101 
   5102 int
   5103 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
   5104     socklen_t len, sock_connid_t *id, cred_t *cr)
   5105 {
   5106 	conn_t	*connp = (conn_t *)proto_handle;
   5107 	icmp_t *icmp = connp->conn_icmp;
   5108 	int	error;
   5109 	boolean_t did_bind = B_FALSE;
   5110 	pid_t	pid = curproc->p_pid;
   5111 
   5112 	/* All Solaris components should pass a cred for this operation. */
   5113 	ASSERT(cr != NULL);
   5114 
   5115 	if (sa == NULL) {
   5116 		/*
   5117 		 * Disconnect
   5118 		 * Make sure we are connected
   5119 		 */
   5120 		if (icmp->icmp_state != TS_DATA_XFER)
   5121 			return (EINVAL);
   5122 
   5123 		error = icmp_disconnect(connp);
   5124 		return (error);
   5125 	}
   5126 
   5127 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
   5128 	if (error != 0)
   5129 		return (error);
   5130 
   5131 	/* do an implicit bind if necessary */
   5132 	if (icmp->icmp_state == TS_UNBND) {
   5133 		error = rawip_implicit_bind(connp);
   5134 		/*
   5135 		 * We could be racing with an actual bind, in which case
   5136 		 * we would see EPROTO. We cross our fingers and try
   5137 		 * to connect.
   5138 		 */
   5139 		if (!(error == 0 || error == EPROTO))
   5140 			return (error);
   5141 		did_bind = B_TRUE;
   5142 	}
   5143 
   5144 	/*
   5145 	 * set SO_DGRAM_ERRIND
   5146 	 */
   5147 	connp->conn_dgram_errind = B_TRUE;
   5148 
   5149 	error = rawip_do_connect(connp, sa, len, cr, pid);
   5150 	if (error != 0 && did_bind) {
   5151 		int unbind_err;
   5152 
   5153 		unbind_err = rawip_unbind(connp);
   5154 		ASSERT(unbind_err == 0);
   5155 	}
   5156 
   5157 	if (error == 0) {
   5158 		*id = 0;
   5159 		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
   5160 		    0, NULL, -1);
   5161 	} else if (error < 0) {
   5162 		error = proto_tlitosyserr(-error);
   5163 	}
   5164 	return (error);
   5165 }
   5166 
   5167 /* ARGSUSED2 */
   5168 int
   5169 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
   5170     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
   5171 {
   5172 	conn_t  *connp = (conn_t *)proto_handle;
   5173 	icmp_t	*icmp;
   5174 	struct T_capability_ack tca;
   5175 	struct sockaddr_in6 laddr, faddr;
   5176 	socklen_t laddrlen, faddrlen;
   5177 	short opts;
   5178 	struct stroptions *stropt;
   5179 	mblk_t *stropt_mp;
   5180 	int error;
   5181 
   5182 	icmp = connp->conn_icmp;
   5183 
   5184 	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
   5185 
   5186 	/*
   5187 	 * setup the fallback stream that was allocated
   5188 	 */
   5189 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
   5190 	connp->conn_minor_arena = WR(q)->q_ptr;
   5191 
   5192 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
   5193 
   5194 	WR(q)->q_qinfo = &icmpwinit;
   5195 
   5196 	connp->conn_rq = RD(q);
   5197 	connp->conn_wq = WR(q);
   5198 
   5199 	/* Notify stream head about options before sending up data */
   5200 	stropt_mp->b_datap->db_type = M_SETOPTS;
   5201 	stropt_mp->b_wptr += sizeof (*stropt);
   5202 	stropt = (struct stroptions *)stropt_mp->b_rptr;
   5203 	stropt->so_flags = SO_WROFF | SO_HIWAT;
   5204 	stropt->so_wroff = connp->conn_wroff;
   5205 	stropt->so_hiwat = connp->conn_rcvbuf;
   5206 	putnext(RD(q), stropt_mp);
   5207 
   5208 	/*
   5209 	 * free helper stream
   5210 	 */
   5211 	ip_free_helper_stream(connp);
   5212 
   5213 	/*
   5214 	 * Collect the information needed to sync with the sonode
   5215 	 */
   5216 	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
   5217 
   5218 	laddrlen = faddrlen = sizeof (sin6_t);
   5219 	(void) rawip_getsockname((sock_lower_handle_t)connp,
   5220 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
   5221 	error = rawip_getpeername((sock_lower_handle_t)connp,
   5222 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
   5223 	if (error != 0)
   5224 		faddrlen = 0;
   5225 	opts = 0;
   5226 	if (connp->conn_dgram_errind)
   5227 		opts |= SO_DGRAM_ERRIND;
   5228 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
   5229 		opts |= SO_DONTROUTE;
   5230 
   5231 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
   5232 	    (struct sockaddr *)&laddr, laddrlen,
   5233 	    (struct sockaddr *)&faddr, faddrlen, opts);
   5234 
   5235 	/*
   5236 	 * Attempts to send data up during fallback will result in it being
   5237 	 * queued in icmp_t. Now we push up any queued packets.
   5238 	 */
   5239 	mutex_enter(&icmp->icmp_recv_lock);
   5240 	while (icmp->icmp_fallback_queue_head != NULL) {
   5241 		mblk_t	*mp;
   5242 
   5243 		mp = icmp->icmp_fallback_queue_head;
   5244 		icmp->icmp_fallback_queue_head = mp->b_next;
   5245 		mp->b_next = NULL;
   5246 		mutex_exit(&icmp->icmp_recv_lock);
   5247 		putnext(RD(q), mp);
   5248 		mutex_enter(&icmp->icmp_recv_lock);
   5249 	}
   5250 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
   5251 
   5252 	/*
   5253 	 * No longer a streams less socket
   5254 	 */
   5255 	mutex_enter(&connp->conn_lock);
   5256 	connp->conn_flags &= ~IPCL_NONSTR;
   5257 	mutex_exit(&connp->conn_lock);
   5258 
   5259 	mutex_exit(&icmp->icmp_recv_lock);
   5260 
   5261 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
   5262 	    icmp->icmp_fallback_queue_tail == NULL);
   5263 
   5264 	ASSERT(connp->conn_ref >= 1);
   5265 
   5266 	return (0);
   5267 }
   5268 
   5269 /* ARGSUSED2 */
   5270 sock_lower_handle_t
   5271 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
   5272     uint_t *smodep, int *errorp, int flags, cred_t *credp)
   5273 {
   5274 	conn_t *connp;
   5275 
   5276 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
   5277 		*errorp = EPROTONOSUPPORT;
   5278 		return (NULL);
   5279 	}
   5280 
   5281 	connp = rawip_do_open(family, credp, errorp, flags);
   5282 	if (connp != NULL) {
   5283 		connp->conn_flags |= IPCL_NONSTR;
   5284 
   5285 		mutex_enter(&connp->conn_lock);
   5286 		connp->conn_state_flags &= ~CONN_INCIPIENT;
   5287 		mutex_exit(&connp->conn_lock);
   5288 		*sock_downcalls = &sock_rawip_downcalls;
   5289 		*smodep = SM_ATOMIC;
   5290 	} else {
   5291 		ASSERT(*errorp != 0);
   5292 	}
   5293 
   5294 	return ((sock_lower_handle_t)connp);
   5295 }
   5296 
   5297 /* ARGSUSED3 */
   5298 void
   5299 rawip_activate(sock_lower_handle_t proto_handle,
   5300     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
   5301     cred_t *cr)
   5302 {
   5303 	conn_t 			*connp = (conn_t *)proto_handle;
   5304 	struct sock_proto_props sopp;
   5305 
   5306 	/* All Solaris components should pass a cred for this operation. */
   5307 	ASSERT(cr != NULL);
   5308 
   5309 	connp->conn_upcalls = sock_upcalls;
   5310 	connp->conn_upper_handle = sock_handle;
   5311 
   5312 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
   5313 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
   5314 	sopp.sopp_wroff = connp->conn_wroff;
   5315 	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
   5316 	sopp.sopp_rxlowat = connp->conn_rcvlowat;
   5317 	sopp.sopp_maxblk = INFPSZ;
   5318 	sopp.sopp_maxpsz = IP_MAXPACKET;
   5319 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
   5320 	    icmp_mod_info.mi_minpsz;
   5321 
   5322 	(*connp->conn_upcalls->su_set_proto_props)
   5323 	    (connp->conn_upper_handle, &sopp);
   5324 
   5325 	icmp_bind_proto(connp->conn_icmp);
   5326 }
   5327 
   5328 /* ARGSUSED3 */
   5329 int
   5330 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
   5331     socklen_t *salenp, cred_t *cr)
   5332 {
   5333 	conn_t  *connp = (conn_t *)proto_handle;
   5334 	icmp_t  *icmp = connp->conn_icmp;
   5335 	int	error;
   5336 
   5337 	/* All Solaris components should pass a cred for this operation. */
   5338 	ASSERT(cr != NULL);
   5339 
   5340 	mutex_enter(&connp->conn_lock);
   5341 	if (icmp->icmp_state != TS_DATA_XFER)
   5342 		error = ENOTCONN;
   5343 	else
   5344 		error = conn_getpeername(connp, sa, salenp);
   5345 	mutex_exit(&connp->conn_lock);
   5346 	return (error);
   5347 }
   5348 
   5349 /* ARGSUSED3 */
   5350 int
   5351 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
   5352     socklen_t *salenp, cred_t *cr)
   5353 {
   5354 	conn_t  *connp = (conn_t *)proto_handle;
   5355 	int	error;
   5356 
   5357 	/* All Solaris components should pass a cred for this operation. */
   5358 	ASSERT(cr != NULL);
   5359 
   5360 	mutex_enter(&connp->conn_lock);
   5361 	error = conn_getsockname(connp, sa, salenp);
   5362 	mutex_exit(&connp->conn_lock);
   5363 	return (error);
   5364 }
   5365 
   5366 int
   5367 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
   5368     const void *optvalp, socklen_t optlen, cred_t *cr)
   5369 {
   5370 	conn_t	*connp = (conn_t *)proto_handle;
   5371 	int error;
   5372 
   5373 	/* All Solaris components should pass a cred for this operation. */
   5374 	ASSERT(cr != NULL);
   5375 
   5376 	error = proto_opt_check(level, option_name, optlen, NULL,
   5377 	    icmp_opt_obj.odb_opt_des_arr,
   5378 	    icmp_opt_obj.odb_opt_arr_cnt,
   5379 	    B_TRUE, B_FALSE, cr);
   5380 
   5381 	if (error != 0) {
   5382 		/*
   5383 		 * option not recognized
   5384 		 */
   5385 		if (error < 0) {
   5386 			error = proto_tlitosyserr(-error);
   5387 		}
   5388 		return (error);
   5389 	}
   5390 
   5391 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
   5392 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
   5393 	    (uchar_t *)optvalp, NULL, cr);
   5394 
   5395 	ASSERT(error >= 0);
   5396 
   5397 	return (error);
   5398 }
   5399 
   5400 int
   5401 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
   5402     void *optvalp, socklen_t *optlen, cred_t *cr)
   5403 {
   5404 	int		error;
   5405 	conn_t		*connp = (conn_t *)proto_handle;
   5406 	t_uscalar_t	max_optbuf_len;
   5407 	void		*optvalp_buf;
   5408 	int		len;
   5409 
   5410 	/* All Solaris components should pass a cred for this operation. */
   5411 	ASSERT(cr != NULL);
   5412 
   5413 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
   5414 	    icmp_opt_obj.odb_opt_des_arr,
   5415 	    icmp_opt_obj.odb_opt_arr_cnt,
   5416 	    B_FALSE, B_TRUE, cr);
   5417 
   5418 	if (error != 0) {
   5419 		if (error < 0) {
   5420 			error = proto_tlitosyserr(-error);
   5421 		}
   5422 		return (error);
   5423 	}
   5424 
   5425 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
   5426 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
   5427 	if (len == -1) {
   5428 		kmem_free(optvalp_buf, max_optbuf_len);
   5429 		return (EINVAL);
   5430 	}
   5431 
   5432 	/*
   5433 	 * update optlen and copy option value
   5434 	 */
   5435 	t_uscalar_t size = MIN(len, *optlen);
   5436 
   5437 	bcopy(optvalp_buf, optvalp, size);
   5438 	bcopy(&size, optlen, sizeof (size));
   5439 
   5440 	kmem_free(optvalp_buf, max_optbuf_len);
   5441 	return (0);
   5442 }
   5443 
   5444 /* ARGSUSED1 */
   5445 int
   5446 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
   5447 {
   5448 	conn_t	*connp = (conn_t *)proto_handle;
   5449 
   5450 	/* All Solaris components should pass a cred for this operation. */
   5451 	ASSERT(cr != NULL);
   5452 
   5453 	(void) rawip_do_close(connp);
   5454 	return (0);
   5455 }
   5456 
   5457 /* ARGSUSED2 */
   5458 int
   5459 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
   5460 {
   5461 	conn_t  *connp = (conn_t *)proto_handle;
   5462 
   5463 	/* All Solaris components should pass a cred for this operation. */
   5464 	ASSERT(cr != NULL);
   5465 
   5466 	/* shut down the send side */
   5467 	if (how != SHUT_RD)
   5468 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
   5469 		    SOCK_OPCTL_SHUT_SEND, 0);
   5470 	/* shut down the recv side */
   5471 	if (how != SHUT_WR)
   5472 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
   5473 		    SOCK_OPCTL_SHUT_RECV, 0);
   5474 	return (0);
   5475 }
   5476 
   5477 void
   5478 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
   5479 {
   5480 	conn_t  *connp = (conn_t *)proto_handle;
   5481 	icmp_t	*icmp = connp->conn_icmp;
   5482 
   5483 	mutex_enter(&icmp->icmp_recv_lock);
   5484 	connp->conn_flow_cntrld = B_FALSE;
   5485 	mutex_exit(&icmp->icmp_recv_lock);
   5486 }
   5487 
   5488 int
   5489 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
   5490     int mode, int32_t *rvalp, cred_t *cr)
   5491 {
   5492 	conn_t  	*connp = (conn_t *)proto_handle;
   5493 	int		error;
   5494 
   5495 	/* All Solaris components should pass a cred for this operation. */
   5496 	ASSERT(cr != NULL);
   5497 
   5498 	/*
   5499 	 * If we don't have a helper stream then create one.
   5500 	 * ip_create_helper_stream takes care of locking the conn_t,
   5501 	 * so this check for NULL is just a performance optimization.
   5502 	 */
   5503 	if (connp->conn_helper_info == NULL) {
   5504 		icmp_stack_t *is = connp->conn_icmp->icmp_is;
   5505 
   5506 		ASSERT(is->is_ldi_ident != NULL);
   5507 
   5508 		/*
   5509 		 * Create a helper stream for non-STREAMS socket.
   5510 		 */
   5511 		error = ip_create_helper_stream(connp, is->is_ldi_ident);
   5512 		if (error != 0) {
   5513 			ip0dbg(("rawip_ioctl: create of IP helper stream "
   5514 			    "failed %d\n", error));
   5515 			return (error);
   5516 		}
   5517 	}
   5518 
   5519 	switch (cmd) {
   5520 	case ND_SET:
   5521 	case ND_GET:
   5522 	case _SIOCSOCKFALLBACK:
   5523 	case TI_GETPEERNAME:
   5524 	case TI_GETMYNAME:
   5525 #ifdef DEBUG
   5526 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
   5527 		    " socket", cmd);
   5528 #endif
   5529 		error = EINVAL;
   5530 		break;
   5531 	default:
   5532 		/*
   5533 		 * Pass on to IP using helper stream
   5534 		 */
   5535 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
   5536 		    cmd, arg, mode, cr, rvalp);
   5537 		break;
   5538 	}
   5539 	return (error);
   5540 }
   5541 
   5542 int
   5543 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
   5544     cred_t *cr)
   5545 {
   5546 	sin6_t		*sin6;
   5547 	sin_t		*sin = NULL;
   5548 	uint_t		srcid;
   5549 	conn_t		*connp = (conn_t *)proto_handle;
   5550 	icmp_t		*icmp = connp->conn_icmp;
   5551 	int		error = 0;
   5552 	icmp_stack_t	*is = icmp->icmp_is;
   5553 	pid_t		pid = curproc->p_pid;
   5554 	ip_xmit_attr_t	*ixa;
   5555 
   5556 	ASSERT(DB_TYPE(mp) == M_DATA);
   5557 
   5558 	/* All Solaris components should pass a cred for this operation. */
   5559 	ASSERT(cr != NULL);
   5560 
   5561 	/* do an implicit bind if necessary */
   5562 	if (icmp->icmp_state == TS_UNBND) {
   5563 		error = rawip_implicit_bind(connp);
   5564 		/*
   5565 		 * We could be racing with an actual bind, in which case
   5566 		 * we would see EPROTO. We cross our fingers and try
   5567 		 * to connect.
   5568 		 */
   5569 		if (!(error == 0 || error == EPROTO)) {
   5570 			freemsg(mp);
   5571 			return (error);
   5572 		}
   5573 	}
   5574 
   5575 	/* Protocol 255 contains full IP headers */
   5576 	/* Read without holding lock */
   5577 	if (icmp->icmp_hdrincl) {
   5578 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
   5579 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
   5580 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
   5581 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5582 				freemsg(mp);
   5583 				return (EINVAL);
   5584 			}
   5585 		}
   5586 		error = icmp_output_hdrincl(connp, mp, cr, pid);
   5587 		if (is->is_sendto_ignerr)
   5588 			return (0);
   5589 		else
   5590 			return (error);
   5591 	}
   5592 
   5593 	/* Connected? */
   5594 	if (msg->msg_name == NULL) {
   5595 		if (icmp->icmp_state != TS_DATA_XFER) {
   5596 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5597 			return (EDESTADDRREQ);
   5598 		}
   5599 		if (msg->msg_controllen != 0) {
   5600 			error = icmp_output_ancillary(connp, NULL, NULL, mp,
   5601 			    NULL, msg, cr, pid);
   5602 		} else {
   5603 			error = icmp_output_connected(connp, mp, cr, pid);
   5604 		}
   5605 		if (is->is_sendto_ignerr)
   5606 			return (0);
   5607 		else
   5608 			return (error);
   5609 	}
   5610 	if (icmp->icmp_state == TS_DATA_XFER) {
   5611 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5612 		return (EISCONN);
   5613 	}
   5614 	error = proto_verify_ip_addr(connp->conn_family,
   5615 	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
   5616 	if (error != 0) {
   5617 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5618 		return (error);
   5619 	}
   5620 	switch (connp->conn_family) {
   5621 	case AF_INET6:
   5622 		sin6 = (sin6_t *)msg->msg_name;
   5623 
   5624 		/* No support for mapped addresses on raw sockets */
   5625 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
   5626 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5627 			return (EADDRNOTAVAIL);
   5628 		}
   5629 		srcid = sin6->__sin6_src_id;
   5630 
   5631 		/*
   5632 		 * If the local address is a mapped address return
   5633 		 * an error.
   5634 		 * It would be possible to send an IPv6 packet but the
   5635 		 * response would never make it back to the application
   5636 		 * since it is bound to a mapped address.
   5637 		 */
   5638 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
   5639 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5640 			return (EADDRNOTAVAIL);
   5641 		}
   5642 
   5643 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
   5644 			sin6->sin6_addr = ipv6_loopback;
   5645 
   5646 		/*
   5647 		 * We have to allocate an ip_xmit_attr_t before we grab
   5648 		 * conn_lock and we need to hold conn_lock once we've check
   5649 		 * conn_same_as_last_v6 to handle concurrent send* calls on a
   5650 		 * socket.
   5651 		 */
   5652 		if (msg->msg_controllen == 0) {
   5653 			ixa = conn_get_ixa(connp, B_FALSE);
   5654 			if (ixa == NULL) {
   5655 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5656 				return (ENOMEM);
   5657 			}
   5658 		} else {
   5659 			ixa = NULL;
   5660 		}
   5661 		mutex_enter(&connp->conn_lock);
   5662 		if (icmp->icmp_delayed_error != 0) {
   5663 			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
   5664 
   5665 			error = icmp->icmp_delayed_error;
   5666 			icmp->icmp_delayed_error = 0;
   5667 
   5668 			/* Compare IP address and family */
   5669 
   5670 			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
   5671 			    &sin2->sin6_addr) &&
   5672 			    sin6->sin6_family == sin2->sin6_family) {
   5673 				mutex_exit(&connp->conn_lock);
   5674 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5675 				if (ixa != NULL)
   5676 					ixa_refrele(ixa);
   5677 				return (error);
   5678 			}
   5679 		}
   5680 		if (msg->msg_controllen != 0) {
   5681 			mutex_exit(&connp->conn_lock);
   5682 			ASSERT(ixa == NULL);
   5683 			error = icmp_output_ancillary(connp, NULL, sin6, mp,
   5684 			    NULL, msg, cr, pid);
   5685 		} else if (conn_same_as_last_v6(connp, sin6) &&
   5686 		    connp->conn_lastsrcid == srcid &&
   5687 		    ipsec_outbound_policy_current(ixa)) {
   5688 			/* icmp_output_lastdst drops conn_lock */
   5689 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
   5690 		} else {
   5691 			/* icmp_output_newdst drops conn_lock */
   5692 			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
   5693 			    pid, ixa);
   5694 		}
   5695 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
   5696 		if (is->is_sendto_ignerr)
   5697 			return (0);
   5698 		else
   5699 			return (error);
   5700 	case AF_INET:
   5701 		sin = (sin_t *)msg->msg_name;
   5702 
   5703 		if (sin->sin_addr.s_addr == INADDR_ANY)
   5704 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
   5705 
   5706 		/*
   5707 		 * We have to allocate an ip_xmit_attr_t before we grab
   5708 		 * conn_lock and we need to hold conn_lock once we've check
   5709 		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
   5710 		 */
   5711 		if (msg->msg_controllen == 0) {
   5712 			ixa = conn_get_ixa(connp, B_FALSE);
   5713 			if (ixa == NULL) {
   5714 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5715 				return (ENOMEM);
   5716 			}
   5717 		} else {
   5718 			ixa = NULL;
   5719 		}
   5720 		mutex_enter(&connp->conn_lock);
   5721 		if (icmp->icmp_delayed_error != 0) {
   5722 			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
   5723 
   5724 			error = icmp->icmp_delayed_error;
   5725 			icmp->icmp_delayed_error = 0;
   5726 
   5727 			/* Compare IP address */
   5728 
   5729 			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
   5730 				mutex_exit(&connp->conn_lock);
   5731 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
   5732 				if (ixa != NULL)
   5733 					ixa_refrele(ixa);
   5734 				return (error);
   5735 			}
   5736 		}
   5737 
   5738 		if (msg->msg_controllen != 0) {
   5739 			mutex_exit(&connp->conn_lock);
   5740 			ASSERT(ixa == NULL);
   5741 			error = icmp_output_ancillary(connp, sin, NULL, mp,
   5742 			    NULL, msg, cr, pid);
   5743 		} else if (conn_same_as_last_v4(connp, sin) &&
   5744 		    ipsec_outbound_policy_current(ixa)) {
   5745 			/* icmp_output_lastdst drops conn_lock */
   5746 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
   5747 		} else {
   5748 			/* icmp_output_newdst drops conn_lock */
   5749 			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
   5750 			    pid, ixa);
   5751 		}
   5752 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
   5753 		if (is->is_sendto_ignerr)
   5754 			return (0);
   5755 		else
   5756 			return (error);
   5757 	default:
   5758 		return (EINVAL);
   5759 	}
   5760 }
   5761 
   5762 sock_downcalls_t sock_rawip_downcalls = {	/* table of rawip_* entry points; positional, so order must follow sock_downcalls_t's member order */
   5763 	rawip_activate,
   5764 	rawip_accept,
   5765 	rawip_bind,
   5766 	rawip_listen,
   5767 	rawip_connect,
   5768 	rawip_getpeername,
   5769 	rawip_getsockname,
   5770 	rawip_getsockopt,
   5771 	rawip_setsockopt,
   5772 	rawip_send,
   5773 	NULL,	/* slot left unset; NOTE(review): presumably an optional entry point - confirm which member against sock_downcalls_t */
   5774 	NULL,	/* slot left unset; same caveat as the slot above */
   5775 	NULL,	/* slot left unset; same caveat as the slot above */
   5776 	rawip_shutdown,
   5777 	rawip_clr_flowctrl,
   5778 	rawip_ioctl,
   5779 	rawip_close	/* final initializer in the table */
   5780 };
   5781