Home | History | Annotate | Download | only in inet
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #ifndef	_INET_IP_H
     29 #define	_INET_IP_H
     30 
     31 #ifdef	__cplusplus
     32 extern "C" {
     33 #endif
     34 
     35 #include <sys/isa_defs.h>
     36 #include <sys/types.h>
     37 #include <inet/mib2.h>
     38 #include <inet/nd.h>
     39 #include <sys/atomic.h>
     40 #include <net/if_dl.h>
     41 #include <net/if.h>
     42 #include <netinet/ip.h>
     43 #include <netinet/igmp.h>
     44 #include <sys/neti.h>
     45 #include <sys/hook.h>
     46 #include <sys/hook_event.h>
     47 #include <sys/hook_impl.h>
     48 #include <inet/ip_stack.h>
     49 
     50 #ifdef _KERNEL
     51 #include <netinet/ip6.h>
     52 #include <sys/avl.h>
     53 #include <sys/vmem.h>
     54 #include <sys/squeue.h>
     55 #include <net/route.h>
     56 #include <sys/systm.h>
     57 #include <sys/multidata.h>
     58 #include <net/radix.h>
     59 #include <sys/modhash.h>
     60 
     61 #ifdef DEBUG
     62 #define	CONN_DEBUG
     63 #endif
     64 
     65 #define	IP_DEBUG
     66 /*
     67  * The mt-streams(9F) flags for the IP module; put here so that other
     68  * "drivers" that are actually IP (e.g., ICMP, UDP) can use the same set
     69  * of flags.
     70  */
     71 #define	IP_DEVMTFLAGS D_MP
     72 #endif	/* _KERNEL */
     73 
     74 #define	IP_MOD_NAME	"ip"
     75 #define	IP_DEV_NAME	"/dev/ip"
     76 #define	IP6_DEV_NAME	"/dev/ip6"
     77 
     78 #define	UDP_MOD_NAME	"udp"
     79 #define	UDP_DEV_NAME	"/dev/udp"
     80 #define	UDP6_DEV_NAME	"/dev/udp6"
     81 
     82 #define	TCP_MOD_NAME	"tcp"
     83 #define	TCP_DEV_NAME	"/dev/tcp"
     84 #define	TCP6_DEV_NAME	"/dev/tcp6"
     85 
     86 #define	SCTP_MOD_NAME	"sctp"
     87 
     88 #ifndef	_IPADDR_T
     89 #define	_IPADDR_T
     90 typedef uint32_t ipaddr_t;
     91 #endif
     92 
     93 /* Number of bits in an address */
     94 #define	IP_ABITS		32
     95 #define	IPV6_ABITS		128
     96 
     97 #define	IP_HOST_MASK		(ipaddr_t)0xffffffffU
     98 
     99 #define	IP_CSUM(mp, off, sum)		(~ip_cksum(mp, off, sum) & 0xFFFF)
    100 #define	IP_CSUM_PARTIAL(mp, off, sum)	ip_cksum(mp, off, sum)
    101 #define	IP_BCSUM_PARTIAL(bp, len, sum)	bcksum(bp, len, sum)
    102 #define	IP_MD_CSUM(pd, off, sum)	(~ip_md_cksum(pd, off, sum) & 0xffff)
    103 #define	IP_MD_CSUM_PARTIAL(pd, off, sum) ip_md_cksum(pd, off, sum)
    104 
    105 /*
    106  * Flag to IP write side to indicate that the appln has sent in a pre-built
    107  * IP header. Stored in ipha_ident (which is otherwise zero).
    108  */
    109 #define	IP_HDR_INCLUDED			0xFFFF
    110 
    111 #define	ILL_FRAG_HASH_TBL_COUNT	((unsigned int)64)
    112 #define	ILL_FRAG_HASH_TBL_SIZE	(ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))
    113 
    114 #define	IPV4_ADDR_LEN			4
    115 #define	IP_ADDR_LEN			IPV4_ADDR_LEN
    116 #define	IP_ARP_PROTO_TYPE		0x0800
    117 
    118 #define	IPV4_VERSION			4
    119 #define	IP_VERSION			IPV4_VERSION
    120 #define	IP_SIMPLE_HDR_LENGTH_IN_WORDS	5
    121 #define	IP_SIMPLE_HDR_LENGTH		20
    122 #define	IP_MAX_HDR_LENGTH		60
    123 
    124 #define	IP_MAX_OPT_LENGTH (IP_MAX_HDR_LENGTH-IP_SIMPLE_HDR_LENGTH)
    125 
    126 #define	IP_MIN_MTU			(IP_MAX_HDR_LENGTH + 8)	/* 68 bytes */
    127 
    128 /*
    129  * XXX IP_MAXPACKET is defined in <netinet/ip.h> as well. At some point the
    130  * 2 files should be cleaned up to remove all redundant definitions.
    131  */
    132 #define	IP_MAXPACKET			65535
    133 #define	IP_SIMPLE_HDR_VERSION \
    134 	((IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS)
    135 
    136 #define	UDPH_SIZE			8
    137 
    138 /* Leave room for ip_newroute to tack on the src and target addresses */
    139 #define	OK_RESOLVER_MP(mp)						\
    140 	((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
    141 
    142 /*
    143  * Constants and type definitions to support IP IOCTL commands
    144  */
    145 #define	IP_IOCTL			(('i'<<8)|'p')
    146 #define	IP_IOC_IRE_DELETE		4
    147 #define	IP_IOC_IRE_DELETE_NO_REPLY	5
    148 #define	IP_IOC_IRE_ADVISE_NO_REPLY	6
    149 #define	IP_IOC_RTS_REQUEST		7
    150 
    151 /* Common definitions used by IP IOCTL data structures */
    152 typedef struct ipllcmd_s {
    153 	uint_t	ipllc_cmd;
    154 	uint_t	ipllc_name_offset;
    155 	uint_t	ipllc_name_length;
    156 } ipllc_t;
    157 
    158 /* IP IRE Change Command Structure. */
    159 typedef struct ipic_s {
    160 	ipllc_t	ipic_ipllc;
    161 	uint_t	ipic_ire_type;
    162 	uint_t	ipic_max_frag;
    163 	uint_t	ipic_addr_offset;
    164 	uint_t	ipic_addr_length;
    165 	uint_t	ipic_mask_offset;
    166 	uint_t	ipic_mask_length;
    167 	uint_t	ipic_src_addr_offset;
    168 	uint_t	ipic_src_addr_length;
    169 	uint_t	ipic_ll_hdr_offset;
    170 	uint_t	ipic_ll_hdr_length;
    171 	uint_t	ipic_gateway_addr_offset;
    172 	uint_t	ipic_gateway_addr_length;
    173 	clock_t	ipic_rtt;
    174 	uint32_t ipic_ssthresh;
    175 	clock_t	ipic_rtt_sd;
    176 	uchar_t ipic_ire_marks;
    177 } ipic_t;
    178 
    179 #define	ipic_cmd		ipic_ipllc.ipllc_cmd
    180 #define	ipic_ll_name_length	ipic_ipllc.ipllc_name_length
    181 #define	ipic_ll_name_offset	ipic_ipllc.ipllc_name_offset
    182 
    183 /* IP IRE Delete Command Structure. */
    184 typedef struct ipid_s {
    185 	ipllc_t	ipid_ipllc;
    186 	uint_t	ipid_ire_type;
    187 	uint_t	ipid_addr_offset;
    188 	uint_t	ipid_addr_length;
    189 	uint_t	ipid_mask_offset;
    190 	uint_t	ipid_mask_length;
    191 } ipid_t;
    192 
    193 #define	ipid_cmd		ipid_ipllc.ipllc_cmd
    194 
    195 #ifdef _KERNEL
    196 /*
    197  * Temporary state for ip options parser.
    198  */
    199 typedef struct ipoptp_s
    200 {
    201 	uint8_t		*ipoptp_next;	/* next option to look at */
    202 	uint8_t		*ipoptp_end;	/* end of options */
    203 	uint8_t		*ipoptp_cur;	/* start of current option */
    204 	uint8_t		ipoptp_len;	/* length of current option */
    205 	uint32_t	ipoptp_flags;
    206 } ipoptp_t;
    207 
    208 /*
    209  * Flag(s) for ipoptp_flags
    210  */
    211 #define	IPOPTP_ERROR	0x00000001
    212 #endif	/* _KERNEL */
    213 
    214 /* Controls forwarding of IP packets, set via ndd */
    215 #define	IP_FORWARD_NEVER	0
    216 #define	IP_FORWARD_ALWAYS	1
    217 
    218 #define	WE_ARE_FORWARDING(ipst)	((ipst)->ips_ip_g_forward == IP_FORWARD_ALWAYS)
    219 
    220 #define	IPH_HDR_LENGTH(ipha)						\
    221 	((int)(((ipha_t *)ipha)->ipha_version_and_hdr_length & 0xF) << 2)
    222 
    223 #define	IPH_HDR_VERSION(ipha)						\
    224 	((int)(((ipha_t *)ipha)->ipha_version_and_hdr_length) >> 4)
    225 
    226 #ifdef _KERNEL
    227 /*
    228  * IP reassembly macros.  We hide starting and ending offsets in b_next and
    229  * b_prev of messages on the reassembly queue.	The messages are chained using
    230  * b_cont.  These macros are used in ip_reassemble() so we don't have to see
    231  * the ugly casts and assignments.
    232  * Note that the offsets are <= 64k i.e. a uint_t is sufficient to represent
    233  * them.
    234  */
    235 #define	IP_REASS_START(mp)		((uint_t)(uintptr_t)((mp)->b_next))
    236 #define	IP_REASS_SET_START(mp, u)	\
    237 	((mp)->b_next = (mblk_t *)(uintptr_t)(u))
    238 #define	IP_REASS_END(mp)		((uint_t)(uintptr_t)((mp)->b_prev))
    239 #define	IP_REASS_SET_END(mp, u)		\
    240 	((mp)->b_prev = (mblk_t *)(uintptr_t)(u))
    241 
    242 #define	IP_REASS_COMPLETE	0x1
    243 #define	IP_REASS_PARTIAL	0x2
    244 #define	IP_REASS_FAILED		0x4
    245 
    246 /*
    247  * Test to determine whether this is a module instance of IP or a
    248  * driver instance of IP.
    249  */
    250 #define	CONN_Q(q)	(WR(q)->q_next == NULL)
    251 
    252 #define	Q_TO_CONN(q)	((conn_t *)(q)->q_ptr)
    253 #define	Q_TO_TCP(q)	(Q_TO_CONN((q))->conn_tcp)
    254 #define	Q_TO_UDP(q)	(Q_TO_CONN((q))->conn_udp)
    255 #define	Q_TO_ICMP(q)	(Q_TO_CONN((q))->conn_icmp)
    256 #define	Q_TO_RTS(q)	(Q_TO_CONN((q))->conn_rts)
    257 
    258 /*
    259  * The following two macros are used by IP to get the appropriate
    260  * wq and rq for a conn. If it is a TCP conn, then we need
    261  * tcp_wq/tcp_rq else, conn_wq/conn_rq. IP can use conn_wq and conn_rq
    262  * from a conn directly if it knows that the conn is not TCP.
    263  */
    264 #define	CONNP_TO_WQ(connp)	\
    265 	(IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq)
    266 
    267 #define	CONNP_TO_RQ(connp)	RD(CONNP_TO_WQ(connp))
    268 
    269 #define	GRAB_CONN_LOCK(q)	{				\
    270 	if (q != NULL && CONN_Q(q))				\
    271 		mutex_enter(&(Q_TO_CONN(q))->conn_lock);	\
    272 }
    273 
    274 #define	RELEASE_CONN_LOCK(q)	{				\
    275 	if (q != NULL && CONN_Q(q))				\
    276 		mutex_exit(&(Q_TO_CONN(q))->conn_lock);		\
    277 }
    278 
    279 /* "Congestion controlled" protocol */
    280 #define	IP_FLOW_CONTROLLED_ULP(p)   ((p) == IPPROTO_TCP || (p) == IPPROTO_SCTP)
    281 
    282 /*
    283  * Complete the pending operation. Usually an ioctl. Can also
    284  * be a bind or option management request that got enqueued
    285  * in an ipsq_t. Called on completion of the operation.
    286  */
    287 #define	CONN_OPER_PENDING_DONE(connp)	{			\
    288 	mutex_enter(&(connp)->conn_lock);			\
    289 	(connp)->conn_oper_pending_ill = NULL;			\
    290 	cv_broadcast(&(connp)->conn_refcv);			\
    291 	mutex_exit(&(connp)->conn_lock);			\
    292 	CONN_DEC_REF(connp);					\
    293 }
    294 
    295 /* Get the credential of an IP queue of unknown type */
    296 #define	GET_QUEUE_CRED(wq)						\
    297 	((wq)->q_next ? (((ill_t *)(wq)->q_ptr)->ill_credp) \
    298 	    : ((Q_TO_CONN((wq)))->conn_cred))
    299 
    300 /*
    301  * Flags for the various ip_fanout_* routines.
    302  */
    303 #define	IP_FF_SEND_ICMP		0x01	/* Send an ICMP error */
    304 #define	IP_FF_HDR_COMPLETE	0x02	/* Call ip_hdr_complete if error */
    305 #define	IP_FF_CKSUM		0x04	/* Recompute ipha_cksum if error */
    306 #define	IP_FF_RAWIP		0x08	/* Use rawip mib variable */
    307 #define	IP_FF_SRC_QUENCH	0x10	/* OK to send ICMP_SOURCE_QUENCH */
    308 #define	IP_FF_SYN_ADDIRE	0x20	/* Add IRE if TCP syn packet */
    309 #define	IP_FF_IPINFO		0x80	/* Used for both V4 and V6 */
    310 #define	IP_FF_SEND_SLLA		0x100	/* Send source link layer info ? */
    311 #define	IPV6_REACHABILITY_CONFIRMATION	0x200	/* Flags for ip_xmit_v6 */
    312 #define	IP_FF_NO_MCAST_LOOP	0x400	/* No multicasts for sending zone */
    313 
    314 /*
    315  * Following flags are used by IPQoS to determine if policy processing is
    316  * required.
    317  */
    318 #define	IP6_NO_IPPOLICY		0x800	/* Don't do IPQoS processing */
    319 #define	IP6_IN_LLMCAST		0x1000	/* Multicast */
    320 
    321 #define	IP_FF_LOOPBACK		0x2000	/* Loopback fanout */
    322 #define	IP_FF_SCTP_CSUM_ERR	0x4000	/* sctp pkt has failed chksum */
    323 
    324 #ifndef	IRE_DB_TYPE
    325 #define	IRE_DB_TYPE	M_SIG
    326 #endif
    327 
    328 #ifndef	IRE_DB_REQ_TYPE
    329 #define	IRE_DB_REQ_TYPE	M_PCSIG
    330 #endif
    331 
    332 #ifndef	IRE_ARPRESOLVE_TYPE
    333 #define	IRE_ARPRESOLVE_TYPE	M_EVENT
    334 #endif
    335 
    336 /*
    337  * Values for squeue switch:
    338  */
    339 
    340 #define	IP_SQUEUE_ENTER_NODRAIN	1
    341 #define	IP_SQUEUE_ENTER	2
    342 /*
    343  * This is part of the interface between Transport provider and
    344  * IP which can be used to set policy information. This is usually
    345  * accompanied by O_T_BIND_REQ/T_BIND_REQ. ip_bind assumes that
    346  * only IPSEC_POLICY_SET is there when it is found in the chain.
    347  * The information contained is a struct ipsec_req_t. On success
    348  * or failure, either the T_BIND_ACK or the T_ERROR_ACK is returned.
    349  * IPSEC_POLICY_SET is never returned.
    350  */
    351 #define	IPSEC_POLICY_SET	M_SETOPTS
    352 
    353 #define	IRE_IS_LOCAL(ire)	((ire != NULL) && \
    354 				((ire)->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
    355 
    356 #define	IRE_IS_TARGET(ire)	((ire != NULL) && \
    357 				((ire)->ire_type != IRE_BROADCAST))
    358 
    359 /* IP Fragmentation Reassembly Header */
    360 typedef struct ipf_s {
    361 	struct ipf_s	*ipf_hash_next;
    362 	struct ipf_s	**ipf_ptphn;	/* Pointer to previous hash next. */
    363 	uint32_t	ipf_ident;	/* Ident to match. */
    364 	uint8_t		ipf_protocol;	/* Protocol to match. */
    365 	uchar_t		ipf_last_frag_seen : 1;	/* Last fragment seen ? */
    366 	time_t		ipf_timestamp;	/* Reassembly start time. */
    367 	mblk_t		*ipf_mp;	/* mblk we live in. */
    368 	mblk_t		*ipf_tail_mp;	/* Frag queue tail pointer. */
    369 	int		ipf_hole_cnt;	/* Number of holes (hard-case). */
    370 	int		ipf_end;	/* Tail end offset (0 -> hard-case). */
    371 	uint_t		ipf_gen;	/* Frag queue generation */
    372 	size_t		ipf_count;	/* Count of bytes used by frag */
    373 	uint_t		ipf_nf_hdr_len; /* Length of nonfragmented header */
    374 	in6_addr_t	ipf_v6src;	/* IPv6 source address */
    375 	in6_addr_t	ipf_v6dst;	/* IPv6 dest address */
    376 	uint_t		ipf_prev_nexthdr_offset; /* Offset for nexthdr value */
    377 	uint8_t		ipf_ecn;	/* ECN info for the fragments */
    378 	uint8_t		ipf_num_dups;	/* Number of times dup frags recvd */
    379 	uint16_t	ipf_checksum_flags; /* Hardware checksum flags */
    380 	uint32_t	ipf_checksum;	/* Partial checksum of fragment data */
    381 } ipf_t;
    382 
    383 #define	ipf_src	V4_PART_OF_V6(ipf_v6src)
    384 #define	ipf_dst	V4_PART_OF_V6(ipf_v6dst)
    385 
    386 typedef enum {
    387 	IB_PKT = 0x01,
    388 	OB_PKT = 0x02
    389 } ip_pkt_t;
    390 
    391 #define	UPDATE_IB_PKT_COUNT(ire)\
    392 	{ \
    393 	(ire)->ire_ib_pkt_count++; \
    394 	if ((ire)->ire_ipif != NULL) { \
    395 		/* \
    396 		 * ib count for local/broadcast ires, fo count otherwise \
    397 		 */ \
    398 		if ((ire)->ire_type & (IRE_LOCAL|IRE_BROADCAST)) \
    399 			atomic_add_32(&(ire)->ire_ipif->ipif_ib_pkt_count, 1);\
    400 		else \
    401 			atomic_add_32(&(ire)->ire_ipif->ipif_fo_pkt_count, 1);\
    402 	} \
    403 	}
    404 
    405 #define	UPDATE_OB_PKT_COUNT(ire)\
    406 	{ \
    407 	(ire)->ire_ob_pkt_count++;\
    408 	if ((ire)->ire_ipif != NULL) { \
    409 		atomic_add_32(&(ire)->ire_ipif->ipif_ob_pkt_count, 1); \
    410 	} \
    411 	}
    412 
    413 #define	IP_RPUT_LOCAL(q, mp, ipha, ire, recv_ill) \
    414 { \
    415 	switch (ipha->ipha_protocol) { \
    416 		case IPPROTO_UDP: \
    417 			ip_udp_input(q, mp, ipha, ire, recv_ill); \
    418 			break; \
    419 		default: \
    420 			ip_proto_input(q, mp, ipha, ire, recv_ill, 0); \
    421 			break; \
    422 	} \
    423 }
    424 
    425 /*
    426  * NCE_EXPIRED is TRUE when we have a non-permanent nce that was
    427  * found to be REACHABLE more than ip_ire_arp_interval ms ago.
    428  * This macro is used to age existing nce_t entries. The
    429  * nce's will get cleaned up in the following circumstances:
    430  * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim
    431  *    when memory is low,
    432  * - ip_arp_news, when updates are received.
    433  * - if the nce is NCE_EXPIRED(), it will deleted, so that a new
    434  *   arp request will need to be triggered from an ND_INITIAL nce.
    435  *
    436  * Note that the nce state transition follows the pattern:
    437  *	ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE
    438  * after which the nce is deleted when it has expired.
    439  *
    440  * nce_last is the timestamp that indicates when the nce_res_mp in the
    441  * nce_t was last updated to a valid link-layer address.  nce_last gets
    442  * modified/updated :
    443  *  - when the nce is created
    444  *  - every time we get a sane arp response for the nce.
    445  */
    446 #define	NCE_EXPIRED(nce, ipst)	(nce->nce_last > 0 &&	\
    447 	    ((nce->nce_flags & NCE_F_PERMANENT) == 0) &&	\
    448 	    ((TICK_TO_MSEC(lbolt64) - nce->nce_last) > 		\
    449 		(ipst)->ips_ip_ire_arp_interval))
    450 
    451 #endif /* _KERNEL */
    452 
    453 /* ICMP types */
    454 #define	ICMP_ECHO_REPLY			0
    455 #define	ICMP_DEST_UNREACHABLE		3
    456 #define	ICMP_SOURCE_QUENCH		4
    457 #define	ICMP_REDIRECT			5
    458 #define	ICMP_ECHO_REQUEST		8
    459 #define	ICMP_ROUTER_ADVERTISEMENT	9
    460 #define	ICMP_ROUTER_SOLICITATION	10
    461 #define	ICMP_TIME_EXCEEDED		11
    462 #define	ICMP_PARAM_PROBLEM		12
    463 #define	ICMP_TIME_STAMP_REQUEST		13
    464 #define	ICMP_TIME_STAMP_REPLY		14
    465 #define	ICMP_INFO_REQUEST		15
    466 #define	ICMP_INFO_REPLY			16
    467 #define	ICMP_ADDRESS_MASK_REQUEST	17
    468 #define	ICMP_ADDRESS_MASK_REPLY		18
    469 
    470 /* ICMP_TIME_EXCEEDED codes */
    471 #define	ICMP_TTL_EXCEEDED		0
    472 #define	ICMP_REASSEMBLY_TIME_EXCEEDED	1
    473 
    474 /* ICMP_DEST_UNREACHABLE codes */
    475 #define	ICMP_NET_UNREACHABLE		0
    476 #define	ICMP_HOST_UNREACHABLE		1
    477 #define	ICMP_PROTOCOL_UNREACHABLE	2
    478 #define	ICMP_PORT_UNREACHABLE		3
    479 #define	ICMP_FRAGMENTATION_NEEDED	4
    480 #define	ICMP_SOURCE_ROUTE_FAILED	5
    481 #define	ICMP_DEST_NET_UNKNOWN		6
    482 #define	ICMP_DEST_HOST_UNKNOWN		7
    483 #define	ICMP_SRC_HOST_ISOLATED		8
    484 #define	ICMP_DEST_NET_UNREACH_ADMIN	9
    485 #define	ICMP_DEST_HOST_UNREACH_ADMIN	10
    486 #define	ICMP_DEST_NET_UNREACH_TOS	11
    487 #define	ICMP_DEST_HOST_UNREACH_TOS	12
    488 
    489 /* ICMP Header Structure */
    490 typedef struct icmph_s {
    491 	uint8_t		icmph_type;
    492 	uint8_t		icmph_code;
    493 	uint16_t	icmph_checksum;
    494 	union {
    495 		struct { /* ECHO request/response structure */
    496 			uint16_t	u_echo_ident;
    497 			uint16_t	u_echo_seqnum;
    498 		} u_echo;
    499 		struct { /* Destination unreachable structure */
    500 			uint16_t	u_du_zero;
    501 			uint16_t	u_du_mtu;
    502 		} u_du;
    503 		struct { /* Parameter problem structure */
    504 			uint8_t		u_pp_ptr;
    505 			uint8_t		u_pp_rsvd[3];
    506 		} u_pp;
    507 		struct { /* Redirect structure */
    508 			ipaddr_t	u_rd_gateway;
    509 		} u_rd;
    510 	} icmph_u;
    511 } icmph_t;
    512 
    513 #define	icmph_echo_ident	icmph_u.u_echo.u_echo_ident
    514 #define	icmph_echo_seqnum	icmph_u.u_echo.u_echo_seqnum
    515 #define	icmph_du_zero		icmph_u.u_du.u_du_zero
    516 #define	icmph_du_mtu		icmph_u.u_du.u_du_mtu
    517 #define	icmph_pp_ptr		icmph_u.u_pp.u_pp_ptr
    518 #define	icmph_rd_gateway	icmph_u.u_rd.u_rd_gateway
    519 
    520 #define	ICMPH_SIZE	8
    521 
    522 /*
    523  * Minimum length of transport layer header included in an ICMP error
    524  * message for it to be considered valid.
    525  */
    526 #define	ICMP_MIN_TP_HDR_LEN	8
    527 
    528 /* Aligned IP header: the fixed IPv4 header fields (IP_SIMPLE_HDR_LENGTH bytes) */
    529 typedef struct ipha_s {
    530 	uint8_t		ipha_version_and_hdr_length;	/* high nibble: version; low nibble: hdr len in words (IPH_HDR_*) */
    531 	uint8_t		ipha_type_of_service;	/* TOS byte; carries the IPH_ECN_* code points */
    532 	uint16_t	ipha_length;
    533 	uint16_t	ipha_ident;	/* IP_HDR_INCLUDED here flags an appln-built header */
    534 	uint16_t	ipha_fragment_offset_and_flags;	/* see IPH_DF, IPH_MF, IPH_OFFSET */
    535 	uint8_t		ipha_ttl;
    536 	uint8_t		ipha_protocol;	/* IP_RPUT_LOCAL dispatches on this */
    537 	uint16_t	ipha_hdr_checksum;
    538 	ipaddr_t	ipha_src;	/* source address */
    539 	ipaddr_t	ipha_dst;	/* destination address */
    540 } ipha_t;
    541 
    542 /*
    543  * IP Flags
    544  *
    545  * Some of these constant names are copied for the DTrace IP provider in
    546  * usr/src/lib/libdtrace/common/{ip.d.in, ip.sed.in}, which should be kept
    547  * in sync.
    548  */
    549 #define	IPH_DF		0x4000	/* Don't fragment */
    550 #define	IPH_MF		0x2000	/* More fragments to come */
    551 #define	IPH_OFFSET	0x1FFF	/* Where the offset lives */
    552 #define	IPH_FRAG_HDR	0x8000	/* IPv6 don't fragment bit */
    553 
    554 /* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. */
    555 #define	IPH_ECN_NECT	0x0	/* Not ECN-Capable Transport */
    556 #define	IPH_ECN_ECT1	0x1	/* ECN-Capable Transport, ECT(1) */
    557 #define	IPH_ECN_ECT0	0x2	/* ECN-Capable Transport, ECT(0) */
    558 #define	IPH_ECN_CE	0x3	/* ECN-Congestion Experienced (CE) */
    559 
    560 /* IP Mac info structure */
    561 typedef struct ip_m_s {
    562 	t_uscalar_t	ip_m_mac_type;	/* From <sys/dlpi.h> */
    563 	int		ip_m_type;	/* From <net/if_types.h> */
    564 	boolean_t	(*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *,
    565 			    uint32_t *, ipaddr_t *);
    566 	boolean_t	(*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *,
    567 			    uint32_t *, in6_addr_t *);
    568 	boolean_t	(*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *);
    569 } ip_m_t;
    570 
    571 /*
    572  * The following functions attempt to reduce the link layer dependency
    573  * of the IP stack. The current set of link specific operations are:
    574  * a. map from IPv4 class D (224.0/4) multicast address range to the link
    575  * layer multicast address range.
    576  * b. map from IPv6 multicast address range (ff00::/8) to the link
    577  * layer multicast address range.
    578  * c. derive the default IPv6 interface identifier from the link layer
    579  * address.
    580  */
    581 #define	MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \
    582 	(((ip_m)->ip_m_v4mapinfo != NULL) && \
    583 	(*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr))
    584 #define	MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \
    585 	(((ip_m)->ip_m_v6intfid != NULL) && \
    586 	(*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr))
    587 #define	MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \
    588 	(((ip_m)->ip_m_v6mapinfo != NULL) && \
    589 	(*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr))
    590 
    591 /* Router entry types */
    592 #define	IRE_BROADCAST		0x0001	/* Route entry for broadcast address */
    593 #define	IRE_DEFAULT		0x0002	/* Route entry for default gateway */
    594 #define	IRE_LOCAL		0x0004	/* Route entry for local address */
    595 #define	IRE_LOOPBACK		0x0008	/* Route entry for loopback address */
    596 #define	IRE_PREFIX		0x0010	/* Route entry for prefix routes */
    597 #define	IRE_CACHE		0x0020	/* Cached Route entry */
    598 #define	IRE_IF_NORESOLVER	0x0040	/* Route entry for local interface */
    599 					/* net without any address mapping. */
    600 #define	IRE_IF_RESOLVER		0x0080	/* Route entry for local interface */
    601 					/* net with resolver. */
    602 #define	IRE_HOST		0x0100	/* Host route entry */
    603 #define	IRE_HOST_REDIRECT	0x0200	/* only used for T_SVR4_OPTMGMT_REQ */
    604 
    605 #define	IRE_INTERFACE		(IRE_IF_NORESOLVER | IRE_IF_RESOLVER)
    606 #define	IRE_OFFSUBNET		(IRE_DEFAULT | IRE_PREFIX | IRE_HOST)
    607 #define	IRE_CACHETABLE		(IRE_CACHE | IRE_BROADCAST | IRE_LOCAL | \
    608 				IRE_LOOPBACK)
    609 #define	IRE_FORWARDTABLE	(IRE_INTERFACE | IRE_OFFSUBNET)
    610 
    611 /*
    612  * If an IRE is marked with IRE_MARK_CONDEMNED, the last walker of
    613  * the bucket should delete this IRE from this bucket.
    614  */
    615 #define	IRE_MARK_CONDEMNED	0x0001
    616 /*
    617  * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the
    618  * broadcast packets received on that interface. This is marked only
    619  * on broadcast ires. Employed by IPMP, where we have multiple NICs on the
    620  * same subnet receiving the same broadcast packet.
    621  */
    622 #define	IRE_MARK_NORECV		0x0002
    623 /*
    624  * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need
    625  * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP.
    626  */
    627 #define	IRE_MARK_HIDDEN		0x0004	/* Typically Used by in.mpathd */
    628 
    629 /*
    630  * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing
    631  * interface is specified by e.g. IP_PKTINFO.  The IRE is not added to the IRE
    632  * cache table.
    633  */
    634 #define	IRE_MARK_NOADD		0x0008	/* Mark not to add ire in cache */
    635 
    636 /*
    637  * IRE marked with IRE_MARK_TEMPORARY means that this IRE has been used
    638  * either for forwarding a packet or has not been used for sending
    639  * traffic on TCP connections terminated on this system.  In both
    640  * cases, this IRE is the first to go when IRE is being cleaned up.
    641  */
    642 #define	IRE_MARK_TEMPORARY	0x0010
    643 
    644 /*
    645  * IRE marked with IRE_MARK_USESRC_CHECK means that while adding an IRE with
    646  * this mark, additional atomic checks need to be performed. For example, by
    647  * the time an IRE_CACHE is created, sent up to ARP and then comes back to IP,
    648  * the usesrc grouping could have changed, in which case we want to fail adding
    649  * the IRE_CACHE entry.
    650  */
    651 #define	IRE_MARK_USESRC_CHECK	0x0020
    652 
    653 /*
    654  * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the
    655  * routing table lookup for the destination is bypassed and the packet is
    656  * sent directly to the specified nexthop. The associated IRE_CACHE entries
    657  * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up
    658  * in regular ire cache lookups.
    659  */
    660 #define	IRE_MARK_PRIVATE_ADDR	0x0040
    661 
    662 /*
    663  * When we send an ARP resolution query for the nexthop gateway's ire,
    664  * we use esballoc to create the ire_t in the AR_ENTRY_QUERY mblk
    665  * chain, and mark its ire_marks with IRE_MARK_UNCACHED. This flag
    666  * indicates that information from ARP has not been transferred to a
    667  * permanent IRE_CACHE entry. The flag is reset only when the
    668  * information is successfully transferred to an ire_cache entry (in
    669  * ire_add()). Attempting to free the AR_ENTRY_QUERY mblk chain prior
    670  * to ire_add (e.g., from arp, or from ip`ip_wput_nondata) will
    671  * require that the resources (incomplete ire_cache and/or nce) must
    672  * be cleaned up. The free callback routine (ire_freemblk()) checks
    673  * for IRE_MARK_UNCACHED to see if any resources that are pinned down
    674  * will need to be cleaned up or not.
    675  */
    676 
    677 #define	IRE_MARK_UNCACHED	0x0080
    678 
    679 /*
    680  * The comment below (and for other netstack_t references) refers
    681  * to the fact that we only do netstack_hold in particular cases,
    682  * such as the references from open streams (ill_t and conn_t's
    683  * pointers). Internally within IP we rely on IP's ability to cleanup e.g.
    684  * ire_t's when an ill goes away.
    685  */
    686 typedef struct ire_expire_arg_s {
    687 	int		iea_flush_flag;
    688 	ip_stack_t	*iea_ipst;	/* Does not have a netstack_hold */
    689 } ire_expire_arg_t;
    690 
    691 /* Flags with ire_expire routine */
    692 #define	FLUSH_ARP_TIME		0x0001	/* ARP info potentially stale timer */
    693 #define	FLUSH_REDIRECT_TIME	0x0002	/* Redirects potentially stale */
    694 #define	FLUSH_MTU_TIME		0x0004	/* Include path MTU per RFC 1191 */
    695 
    696 /* Arguments to ire_flush_cache() */
    697 #define	IRE_FLUSH_DELETE	0
    698 #define	IRE_FLUSH_ADD		1
    699 
    700 /*
    701  * Open/close synchronization flags.
    702  * These are kept in a separate field in the conn and the synchronization
    703  * depends on the atomic 32 bit access to that field.
    704  */
    705 #define	CONN_CLOSING		0x01	/* ip_close waiting for ip_wsrv */
    706 #define	CONN_IPSEC_LOAD_WAIT	0x02	/* waiting for load */
    707 #define	CONN_CONDEMNED		0x04	/* conn is closing, no more refs */
    708 #define	CONN_INCIPIENT		0x08	/* conn not yet visible, no refs */
    709 #define	CONN_QUIESCED		0x10	/* conn is now quiescent */
    710 
    711 /* Used to check connection state flags before caching the IRE */
    712 #define	CONN_CACHE_IRE(connp)	\
    713 	(!((connp)->conn_state_flags & (CONN_CLOSING|CONN_CONDEMNED)))
    714 
    715 /*
    716  * Parameter to ip_output giving the identity of the caller.
    717  * IP_WSRV means the packet was enqueued in the STREAMS queue
    718  * due to flow control and is now being reprocessed in the context of
    719  * the STREAMS service procedure, consequent to flow control relief.
    720  * IRE_SEND means the packet is being reprocessed consequent to an
    721  * ire cache creation and addition and this may or may not be happening
    722  * in the service procedure context. Anything other than the above 2
    723  * cases is identified as IP_WPUT. Most commonly this is the case of
    724  * packets coming down from the application.
    725  */
    726 #ifdef _KERNEL
    727 #define	IP_WSRV			1	/* Called from ip_wsrv */
    728 #define	IP_WPUT			2	/* Called from ip_wput */
    729 #define	IRE_SEND		3	/* Called from ire_send */
    730 
    731 /*
    732  * Extra structures needed for per-src-addr filtering (IGMPv3/MLDv2)
    733  */
    734 #define	MAX_FILTER_SIZE	64
    735 
    736 typedef struct slist_s {
    737 	int		sl_numsrc;
    738 	in6_addr_t	sl_addr[MAX_FILTER_SIZE];
    739 } slist_t;
    740 
    741 /*
    742  * Following struct is used to maintain retransmission state for
    743  * a multicast group.  One rtx_state_t struct is an in-line field
    744  * of the ilm_t struct; the slist_ts in the rtx_state_t struct are
    745  * alloc'd as needed.
    746  */
    747 typedef struct rtx_state_s {
    748 	uint_t		rtx_timer;	/* retrans timer */
    749 	int		rtx_cnt;	/* retrans count */
    750 	int		rtx_fmode_cnt;	/* retrans count for fmode change */
    751 	slist_t		*rtx_allow;
    752 	slist_t		*rtx_block;
    753 } rtx_state_t;
    754 
    755 /*
    756  * Used to construct list of multicast address records that will be
    757  * sent in a single listener report.
    758  */
    759 typedef struct mrec_s {
    760 	struct mrec_s	*mrec_next;
    761 	uint8_t		mrec_type;
    762 	uint8_t		mrec_auxlen;	/* currently unused */
    763 	in6_addr_t	mrec_group;
    764 	slist_t		mrec_srcs;
    765 } mrec_t;
    766 
    767 /* Group membership list per upper conn */
    768 /*
    769  * XXX add ilg info for ifaddr/ifindex.
    770  * XXX can we make ilg survive an ifconfig unplumb + plumb
    771  * by setting the ipif/ill to NULL and recover that later?
    772  *
    773  * ilg_ipif is used by IPv4 as multicast groups are joined using an interface
    774  * address (ipif).
    775  * ilg_ill is used by IPv6 as multicast groups are joined using an interface
    776  * index (phyint->phyint_ifindex).
    777  * ilg_ill is NULL for IPv4 and ilg_ipif is NULL for IPv6.
    778  *
    779  * ilg records the state of multicast memberships of a socket end point.
    780  * ilm records the state of multicast memberships with the driver and is
    781  * maintained per interface.
    782  *
    783  * Notes :
    784  *
    785  * 1) There is no direct link between a given ilg and ilm. If the
    786  *    application has joined a group G with ifindex I, we will have
    787  *    an ilg with ilg_v6group and ilg_ill. There will be a corresponding
    788  *    ilm with ilm_ill/ilm_v6addr recording the multicast membership.
    789  *    To delete the membership,
    790  *
    791  *		a) Search for ilg matching on G and I with ilg_v6group
    792  *		   and ilg_ill. Delete ilg.
    793  *		b) Search the corresponding ilm matching on G and I with
    794  *		   ilm_v6addr and ilm_ill. Delete ilm.
    795  *
    796  *    In IPv4, the only difference is, we look using ipifs instead of
    797  *    ills.
    798  *
    799  * 2) With IP multipathing, we want to keep receiving even after the
    800  *    interface has failed. We do this by moving multicast memberships
    801  *    to a new_ill within the group. This is achieved by sending
    802  *    DL_DISABMULTI_REQs on ilg_ill/ilm_ill and sending DL_ENABMULTI_REQs
    803  *    on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we
    804  *    need to be able to delete memberships which will still come down
    805  *    with the ifindex of the old ill which is what the application
    806  *    knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track
    807  *    of where we joined initially so that we can look up even after we
    808  *    moved the membership. It is also used for moving back the membership
    809  *    when the old ill has been repaired. This is done by looking up
    810  *    ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only
    811  *    ilms actually move from old ill to new ill. ilgs don't move (just
    812  *    the ilg_ill is changed when it moves) as it just records the state
    813  *    of the application that has joined a group G whereas ilm records
    814  *    the state joined with the driver. Thus when we send DL_XXXMULTI_REQs
    815  *    we also need to keep the ilm in the right ill.
    816  *
    817  *    In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move
    818  *    implicitly as we use only ipifs in IPv4. Thus, one can always lookup
    819  *    a given ilm/ilg even after it fails without the support of
    820  *    orig_ifindex. We move ilms still to record the driver state as
    821  *    mentioned above.
    822  */
    823 
    824 /*
    825  * The ilg_t and ilm_t members are protected by ipsq. They can be changed only
    826  * by a thread executing in the ipsq. In other words add/delete of a
    827  * multicast group has to execute in the ipsq.
    828  */
    829 #define	ILG_DELETED	0x1		/* bit value for ilg_flags */
    830 typedef struct ilg_s {
    831 	in6_addr_t	ilg_v6group;	/* Multicast group joined */
    832 	struct ipif_s	*ilg_ipif;	/* Logical interface we are member on (IPv4; NULL for IPv6) */
    833 	struct ill_s	*ilg_ill;	/* Used by IPv6; NULL for IPv4 */
    834 	int		ilg_orig_ifindex; /* Interface originally joined on */
    835 	uint_t		ilg_flags;	/* ILG_DELETED */
    836 	mcast_record_t	ilg_fmode;	/* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
    837 	slist_t		*ilg_filter;	/* presumably source filter list; confirm */
    838 } ilg_t;
    839 
    840 /*
    841  * Multicast address list entry for ill.
    842  * ilm_ipif is used by IPv4 as multicast groups are joined using ipif.
    843  * ilm_ill is used by IPv6 as multicast groups are joined using ill.
    844  * ilm_ill is NULL for IPv4 and ilm_ipif is NULL for IPv6.
    845  *
    846  * The "Does not have a netstack_hold" comment on ilm_ipst below (and on
    847  * other netstack_t references) reflects that we only do netstack_hold in
    848  * particular cases, such as the references from open streams (ill_t and
    849  * conn_t's pointers). Internally within IP we rely on IP's ability to
    850  * cleanup e.g. ire_t's when an ill goes away.
    851  */
    852 #define	ILM_DELETED	0x1		/* bit value for ilm_flags */
    853 typedef struct ilm_s {
    854 	in6_addr_t	ilm_v6addr;	/* Multicast group address */
    855 	int		ilm_refcnt;	/* presumably # of joins sharing this ilm; confirm */
    856 	uint_t		ilm_timer;	/* IGMP/MLD query resp timer, in msec */
    857 	struct ipif_s	*ilm_ipif;	/* Back pointer to ipif for IPv4 */
    858 	struct ilm_s	*ilm_next;	/* Linked list for each ill */
    859 	uint_t		ilm_state;	/* state of the membership */
    860 	struct ill_s	*ilm_ill;	/* Back pointer to ill for IPv6 */
    861 	int		ilm_orig_ifindex;  /* V6_MULTICAST_IF/ilm_ipif index */
    862 	uint_t		ilm_flags;	/* ILM_DELETED */
    863 	boolean_t	ilm_is_new;	/* new ilm */
    864 	boolean_t	ilm_notify_driver; /* Need to notify the driver */
    865 	zoneid_t	ilm_zoneid;
    866 	int		ilm_no_ilg_cnt;	/* number of joins w/ no ilg */
    867 	mcast_record_t	ilm_fmode;	/* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
    868 	slist_t		*ilm_filter;	/* source filter list */
    869 	slist_t		*ilm_pendsrcs;	/* relevant src addrs for pending req */
    870 	rtx_state_t	ilm_rtx;	/* SCR retransmission state */
    871 	ip_stack_t	*ilm_ipst;	/* Does not have a netstack_hold */
    872 } ilm_t;
    873 
    874 #define	ilm_addr	V4_PART_OF_V6(ilm_v6addr) /* field alias applied to ilm_v6addr */
    875 
    876 /*
    877  * Bump ill_ilm_walker_cnt under ill_lock (`ill' is evaluated more than once).
    878  * A lock rather than an atomic is needed: ilm_walker_cleanup must run when the
    879  * count hits zero and must block new walkers while unlinking ilm's.
    880  */
    881 #define	ILM_WALKER_HOLD(ill)	{		\
    882 	mutex_enter(&(ill)->ill_lock);		\
    883 	(ill)->ill_ilm_walker_cnt++;		\
    884 	mutex_exit(&(ill)->ill_lock);		\
    885 }
    886 
    887 /*
    888  * Drop a walker hold.  ilm_walker_cleanup releases ill_lock itself.
    889  */
    890 #define	ILM_WALKER_RELE(ill)	{		\
    891 	mutex_enter(&(ill)->ill_lock);		\
    892 	(ill)->ill_ilm_walker_cnt--;		\
    893 	if ((ill)->ill_ilm_walker_cnt != 0 || !(ill)->ill_ilm_cleanup_reqd) \
    894 		mutex_exit(&(ill)->ill_lock);	\
    895 	else					\
    896 		ilm_walker_cleanup(ill);	\
    897 }
    898 
    899 /*
    900  * Soft reference to an IPsec SA.
    901  *
    902  * On relative terms, conn's can be persistent (living as long as the
    903  * processes which create them), while SA's are ephemeral (dying when
    904  * they hit their time-based or byte-based lifetimes).
    905  *
    906  * We could hold a hard reference to an SA from an ipsec_latch_t,
    907  * but this would cause expired SA's to linger for a potentially
    908  * unbounded time.
    909  *
    910  * Instead, we remember the hash bucket (ipsr_bucket) and bucket generation
    911  * (ipsr_gen) in addition to the pointer (ipsr_sa).  The bucket generation
    912  * is incremented on each deletion.
    913  */
    914 typedef struct ipsa_ref_s
    915 {
    916 	struct ipsa_s	*ipsr_sa;	/* the SA pointer (soft reference) */
    917 	struct isaf_s	*ipsr_bucket;	/* remembered hash bucket */
    918 	uint64_t	ipsr_gen;	/* remembered bucket generation */
    919 } ipsa_ref_t;
    920 
    921 /*
    922  * IPsec "latching" state.
    923  *
    924  * In the presence of IPsec policy, fully-bound conn's bind a connection
    925  * to more than just the 5-tuple, but also a specific IPsec action and
    926  * identity-pair.
    927  *
    928  * As an optimization, we also cache soft references to IPsec SA's
    929  * here so that we can fast-path around most of the work needed for
    930  * outbound IPsec SA selection.
    931  *
    932  * Were it not for TCP's detached connections, this state would be
    933  * in-line in conn_t; instead, this is in a separate structure so it
    934  * can be handed off to TCP