1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #ifndef _INET_IP_H 29 #define _INET_IP_H 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 #include <sys/isa_defs.h> 36 #include <sys/types.h> 37 #include <inet/mib2.h> 38 #include <inet/nd.h> 39 #include <sys/atomic.h> 40 #include <net/if_dl.h> 41 #include <net/if.h> 42 #include <netinet/ip.h> 43 #include <netinet/igmp.h> 44 #include <sys/neti.h> 45 #include <sys/hook.h> 46 #include <sys/hook_event.h> 47 #include <sys/hook_impl.h> 48 #include <inet/ip_stack.h> 49 50 #ifdef _KERNEL 51 #include <netinet/ip6.h> 52 #include <sys/avl.h> 53 #include <sys/vmem.h> 54 #include <sys/squeue.h> 55 #include <net/route.h> 56 #include <sys/systm.h> 57 #include <sys/multidata.h> 58 #include <net/radix.h> 59 #include <sys/modhash.h> 60 61 #ifdef DEBUG 62 #define CONN_DEBUG 63 #endif 64 65 #define IP_DEBUG 66 /* 67 * The mt-streams(9F) flags for the IP module; put here so that other 68 * "drivers" that are actually IP (e.g., ICMP, UDP) can use the same set 69 * of flags. 
70 */ 71 #define IP_DEVMTFLAGS D_MP 72 #endif /* _KERNEL */ 73 74 #define IP_MOD_NAME "ip" 75 #define IP_DEV_NAME "/dev/ip" 76 #define IP6_DEV_NAME "/dev/ip6" 77 78 #define UDP_MOD_NAME "udp" 79 #define UDP_DEV_NAME "/dev/udp" 80 #define UDP6_DEV_NAME "/dev/udp6" 81 82 #define TCP_MOD_NAME "tcp" 83 #define TCP_DEV_NAME "/dev/tcp" 84 #define TCP6_DEV_NAME "/dev/tcp6" 85 86 #define SCTP_MOD_NAME "sctp" 87 88 #ifndef _IPADDR_T 89 #define _IPADDR_T 90 typedef uint32_t ipaddr_t; 91 #endif 92 93 /* Number of bits in an address */ 94 #define IP_ABITS 32 95 #define IPV6_ABITS 128 96 97 #define IP_HOST_MASK (ipaddr_t)0xffffffffU 98 99 #define IP_CSUM(mp, off, sum) (~ip_cksum(mp, off, sum) & 0xFFFF) 100 #define IP_CSUM_PARTIAL(mp, off, sum) ip_cksum(mp, off, sum) 101 #define IP_BCSUM_PARTIAL(bp, len, sum) bcksum(bp, len, sum) 102 #define IP_MD_CSUM(pd, off, sum) (~ip_md_cksum(pd, off, sum) & 0xffff) 103 #define IP_MD_CSUM_PARTIAL(pd, off, sum) ip_md_cksum(pd, off, sum) 104 105 /* 106 * Flag to IP write side to indicate that the appln has sent in a pre-built 107 * IP header. Stored in ipha_ident (which is otherwise zero). 108 */ 109 #define IP_HDR_INCLUDED 0xFFFF 110 111 #define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64) 112 #define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t)) 113 114 #define IPV4_ADDR_LEN 4 115 #define IP_ADDR_LEN IPV4_ADDR_LEN 116 #define IP_ARP_PROTO_TYPE 0x0800 117 118 #define IPV4_VERSION 4 119 #define IP_VERSION IPV4_VERSION 120 #define IP_SIMPLE_HDR_LENGTH_IN_WORDS 5 121 #define IP_SIMPLE_HDR_LENGTH 20 122 #define IP_MAX_HDR_LENGTH 60 123 124 #define IP_MAX_OPT_LENGTH (IP_MAX_HDR_LENGTH-IP_SIMPLE_HDR_LENGTH) 125 126 #define IP_MIN_MTU (IP_MAX_HDR_LENGTH + 8) /* 68 bytes */ 127 128 /* 129 * XXX IP_MAXPACKET is defined in <netinet/ip.h> as well. At some point the 130 * 2 files should be cleaned up to remove all redundant definitions. 
131 */ 132 #define IP_MAXPACKET 65535 133 #define IP_SIMPLE_HDR_VERSION \ 134 ((IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS) 135 136 #define UDPH_SIZE 8 137 138 /* Leave room for ip_newroute to tack on the src and target addresses */ 139 #define OK_RESOLVER_MP(mp) \ 140 ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) 141 142 /* 143 * Constants and type definitions to support IP IOCTL commands 144 */ 145 #define IP_IOCTL (('i'<<8)|'p') 146 #define IP_IOC_IRE_DELETE 4 147 #define IP_IOC_IRE_DELETE_NO_REPLY 5 148 #define IP_IOC_IRE_ADVISE_NO_REPLY 6 149 #define IP_IOC_RTS_REQUEST 7 150 151 /* Common definitions used by IP IOCTL data structures */ 152 typedef struct ipllcmd_s { 153 uint_t ipllc_cmd; 154 uint_t ipllc_name_offset; 155 uint_t ipllc_name_length; 156 } ipllc_t; 157 158 /* IP IRE Change Command Structure. */ 159 typedef struct ipic_s { 160 ipllc_t ipic_ipllc; 161 uint_t ipic_ire_type; 162 uint_t ipic_max_frag; 163 uint_t ipic_addr_offset; 164 uint_t ipic_addr_length; 165 uint_t ipic_mask_offset; 166 uint_t ipic_mask_length; 167 uint_t ipic_src_addr_offset; 168 uint_t ipic_src_addr_length; 169 uint_t ipic_ll_hdr_offset; 170 uint_t ipic_ll_hdr_length; 171 uint_t ipic_gateway_addr_offset; 172 uint_t ipic_gateway_addr_length; 173 clock_t ipic_rtt; 174 uint32_t ipic_ssthresh; 175 clock_t ipic_rtt_sd; 176 uchar_t ipic_ire_marks; 177 } ipic_t; 178 179 #define ipic_cmd ipic_ipllc.ipllc_cmd 180 #define ipic_ll_name_length ipic_ipllc.ipllc_name_length 181 #define ipic_ll_name_offset ipic_ipllc.ipllc_name_offset 182 183 /* IP IRE Delete Command Structure. */ 184 typedef struct ipid_s { 185 ipllc_t ipid_ipllc; 186 uint_t ipid_ire_type; 187 uint_t ipid_addr_offset; 188 uint_t ipid_addr_length; 189 uint_t ipid_mask_offset; 190 uint_t ipid_mask_length; 191 } ipid_t; 192 193 #define ipid_cmd ipid_ipllc.ipllc_cmd 194 195 #ifdef _KERNEL 196 /* 197 * Temporary state for ip options parser. 
198 */ 199 typedef struct ipoptp_s 200 { 201 uint8_t *ipoptp_next; /* next option to look at */ 202 uint8_t *ipoptp_end; /* end of options */ 203 uint8_t *ipoptp_cur; /* start of current option */ 204 uint8_t ipoptp_len; /* length of current option */ 205 uint32_t ipoptp_flags; 206 } ipoptp_t; 207 208 /* 209 * Flag(s) for ipoptp_flags 210 */ 211 #define IPOPTP_ERROR 0x00000001 212 #endif /* _KERNEL */ 213 214 /* Controls forwarding of IP packets, set via ndd */ 215 #define IP_FORWARD_NEVER 0 216 #define IP_FORWARD_ALWAYS 1 217 218 #define WE_ARE_FORWARDING(ipst) ((ipst)->ips_ip_g_forward == IP_FORWARD_ALWAYS) 219 220 #define IPH_HDR_LENGTH(ipha) \ 221 ((int)(((ipha_t *)ipha)->ipha_version_and_hdr_length & 0xF) << 2) 222 223 #define IPH_HDR_VERSION(ipha) \ 224 ((int)(((ipha_t *)ipha)->ipha_version_and_hdr_length) >> 4) 225 226 #ifdef _KERNEL 227 /* 228 * IP reassembly macros. We hide starting and ending offsets in b_next and 229 * b_prev of messages on the reassembly queue. The messages are chained using 230 * b_cont. These macros are used in ip_reassemble() so we don't have to see 231 * the ugly casts and assignments. 232 * Note that the offsets are <= 64k i.e. a uint_t is sufficient to represent 233 * them. 234 */ 235 #define IP_REASS_START(mp) ((uint_t)(uintptr_t)((mp)->b_next)) 236 #define IP_REASS_SET_START(mp, u) \ 237 ((mp)->b_next = (mblk_t *)(uintptr_t)(u)) 238 #define IP_REASS_END(mp) ((uint_t)(uintptr_t)((mp)->b_prev)) 239 #define IP_REASS_SET_END(mp, u) \ 240 ((mp)->b_prev = (mblk_t *)(uintptr_t)(u)) 241 242 #define IP_REASS_COMPLETE 0x1 243 #define IP_REASS_PARTIAL 0x2 244 #define IP_REASS_FAILED 0x4 245 246 /* 247 * Test to determine whether this is a module instance of IP or a 248 * driver instance of IP. 
249 */ 250 #define CONN_Q(q) (WR(q)->q_next == NULL) 251 252 #define Q_TO_CONN(q) ((conn_t *)(q)->q_ptr) 253 #define Q_TO_TCP(q) (Q_TO_CONN((q))->conn_tcp) 254 #define Q_TO_UDP(q) (Q_TO_CONN((q))->conn_udp) 255 #define Q_TO_ICMP(q) (Q_TO_CONN((q))->conn_icmp) 256 #define Q_TO_RTS(q) (Q_TO_CONN((q))->conn_rts) 257 258 /* 259 * The following two macros are used by IP to get the appropriate 260 * wq and rq for a conn. If it is a TCP conn, then we need 261 * tcp_wq/tcp_rq else, conn_wq/conn_rq. IP can use conn_wq and conn_rq 262 * from a conn directly if it knows that the conn is not TCP. 263 */ 264 #define CONNP_TO_WQ(connp) \ 265 (IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq) 266 267 #define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp)) 268 269 #define GRAB_CONN_LOCK(q) { \ 270 if (q != NULL && CONN_Q(q)) \ 271 mutex_enter(&(Q_TO_CONN(q))->conn_lock); \ 272 } 273 274 #define RELEASE_CONN_LOCK(q) { \ 275 if (q != NULL && CONN_Q(q)) \ 276 mutex_exit(&(Q_TO_CONN(q))->conn_lock); \ 277 } 278 279 /* "Congestion controlled" protocol */ 280 #define IP_FLOW_CONTROLLED_ULP(p) ((p) == IPPROTO_TCP || (p) == IPPROTO_SCTP) 281 282 /* 283 * Complete the pending operation. Usually an ioctl. Can also 284 * be a bind or option management request that got enqueued 285 * in an ipsq_t. Called on completion of the operation. 286 */ 287 #define CONN_OPER_PENDING_DONE(connp) { \ 288 mutex_enter(&(connp)->conn_lock); \ 289 (connp)->conn_oper_pending_ill = NULL; \ 290 cv_broadcast(&(connp)->conn_refcv); \ 291 mutex_exit(&(connp)->conn_lock); \ 292 CONN_DEC_REF(connp); \ 293 } 294 295 /* Get the credential of an IP queue of unknown type */ 296 #define GET_QUEUE_CRED(wq) \ 297 ((wq)->q_next ? (((ill_t *)(wq)->q_ptr)->ill_credp) \ 298 : ((Q_TO_CONN((wq)))->conn_cred)) 299 300 /* 301 * Flags for the various ip_fanout_* routines. 
302 */ 303 #define IP_FF_SEND_ICMP 0x01 /* Send an ICMP error */ 304 #define IP_FF_HDR_COMPLETE 0x02 /* Call ip_hdr_complete if error */ 305 #define IP_FF_CKSUM 0x04 /* Recompute ipha_cksum if error */ 306 #define IP_FF_RAWIP 0x08 /* Use rawip mib variable */ 307 #define IP_FF_SRC_QUENCH 0x10 /* OK to send ICMP_SOURCE_QUENCH */ 308 #define IP_FF_SYN_ADDIRE 0x20 /* Add IRE if TCP syn packet */ 309 #define IP_FF_IPINFO 0x80 /* Used for both V4 and V6 */ 310 #define IP_FF_SEND_SLLA 0x100 /* Send source link layer info ? */ 311 #define IPV6_REACHABILITY_CONFIRMATION 0x200 /* Flags for ip_xmit_v6 */ 312 #define IP_FF_NO_MCAST_LOOP 0x400 /* No multicasts for sending zone */ 313 314 /* 315 * Following flags are used by IPQoS to determine if policy processing is 316 * required. 317 */ 318 #define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */ 319 #define IP6_IN_LLMCAST 0x1000 /* Multicast */ 320 321 #define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */ 322 #define IP_FF_SCTP_CSUM_ERR 0x4000 /* sctp pkt has failed chksum */ 323 324 #ifndef IRE_DB_TYPE 325 #define IRE_DB_TYPE M_SIG 326 #endif 327 328 #ifndef IRE_DB_REQ_TYPE 329 #define IRE_DB_REQ_TYPE M_PCSIG 330 #endif 331 332 #ifndef IRE_ARPRESOLVE_TYPE 333 #define IRE_ARPRESOLVE_TYPE M_EVENT 334 #endif 335 336 /* 337 * Values for squeue switch: 338 */ 339 340 #define IP_SQUEUE_ENTER_NODRAIN 1 341 #define IP_SQUEUE_ENTER 2 342 /* 343 * This is part of the interface between Transport provider and 344 * IP which can be used to set policy information. This is usually 345 * accompanied with O_T_BIND_REQ/T_BIND_REQ.ip_bind assumes that 346 * only IPSEC_POLICY_SET is there when it is found in the chain. 347 * The information contained is an struct ipsec_req_t. On success 348 * or failure, either the T_BIND_ACK or the T_ERROR_ACK is returned. 349 * IPSEC_POLICY_SET is never returned. 
350 */ 351 #define IPSEC_POLICY_SET M_SETOPTS 352 353 #define IRE_IS_LOCAL(ire) ((ire != NULL) && \ 354 ((ire)->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) 355 356 #define IRE_IS_TARGET(ire) ((ire != NULL) && \ 357 ((ire)->ire_type != IRE_BROADCAST)) 358 359 /* IP Fragmentation Reassembly Header */ 360 typedef struct ipf_s { 361 struct ipf_s *ipf_hash_next; 362 struct ipf_s **ipf_ptphn; /* Pointer to previous hash next. */ 363 uint32_t ipf_ident; /* Ident to match. */ 364 uint8_t ipf_protocol; /* Protocol to match. */ 365 uchar_t ipf_last_frag_seen : 1; /* Last fragment seen ? */ 366 time_t ipf_timestamp; /* Reassembly start time. */ 367 mblk_t *ipf_mp; /* mblk we live in. */ 368 mblk_t *ipf_tail_mp; /* Frag queue tail pointer. */ 369 int ipf_hole_cnt; /* Number of holes (hard-case). */ 370 int ipf_end; /* Tail end offset (0 -> hard-case). */ 371 uint_t ipf_gen; /* Frag queue generation */ 372 size_t ipf_count; /* Count of bytes used by frag */ 373 uint_t ipf_nf_hdr_len; /* Length of nonfragmented header */ 374 in6_addr_t ipf_v6src; /* IPv6 source address */ 375 in6_addr_t ipf_v6dst; /* IPv6 dest address */ 376 uint_t ipf_prev_nexthdr_offset; /* Offset for nexthdr value */ 377 uint8_t ipf_ecn; /* ECN info for the fragments */ 378 uint8_t ipf_num_dups; /* Number of times dup frags recvd */ 379 uint16_t ipf_checksum_flags; /* Hardware checksum flags */ 380 uint32_t ipf_checksum; /* Partial checksum of fragment data */ 381 } ipf_t; 382 383 #define ipf_src V4_PART_OF_V6(ipf_v6src) 384 #define ipf_dst V4_PART_OF_V6(ipf_v6dst) 385 386 typedef enum { 387 IB_PKT = 0x01, 388 OB_PKT = 0x02 389 } ip_pkt_t; 390 391 #define UPDATE_IB_PKT_COUNT(ire)\ 392 { \ 393 (ire)->ire_ib_pkt_count++; \ 394 if ((ire)->ire_ipif != NULL) { \ 395 /* \ 396 * forwarding packet \ 397 */ \ 398 if ((ire)->ire_type & (IRE_LOCAL|IRE_BROADCAST)) \ 399 atomic_add_32(&(ire)->ire_ipif->ipif_ib_pkt_count, 1);\ 400 else \ 401 atomic_add_32(&(ire)->ire_ipif->ipif_fo_pkt_count, 1);\ 402 } \ 403 } 404 405 #define 
UPDATE_OB_PKT_COUNT(ire)\ 406 { \ 407 (ire)->ire_ob_pkt_count++;\ 408 if ((ire)->ire_ipif != NULL) { \ 409 atomic_add_32(&(ire)->ire_ipif->ipif_ob_pkt_count, 1); \ 410 } \ 411 } 412 413 #define IP_RPUT_LOCAL(q, mp, ipha, ire, recv_ill) \ 414 { \ 415 switch (ipha->ipha_protocol) { \ 416 case IPPROTO_UDP: \ 417 ip_udp_input(q, mp, ipha, ire, recv_ill); \ 418 break; \ 419 default: \ 420 ip_proto_input(q, mp, ipha, ire, recv_ill, 0); \ 421 break; \ 422 } \ 423 } 424 425 /* 426 * NCE_EXPIRED is TRUE when we have a non-permanent nce that was 427 * found to be REACHABLE more than ip_ire_arp_interval ms ago. 428 * This macro is used to age existing nce_t entries. The 429 * nce's will get cleaned up in the following circumstances: 430 * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim 431 * when memory is low, 432 * - ip_arp_news, when updates are received. 433 * - if the nce is NCE_EXPIRED(), it will deleted, so that a new 434 * arp request will need to be triggered from an ND_INITIAL nce. 435 * 436 * Note that the nce state transition follows the pattern: 437 * ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE 438 * after which the nce is deleted when it has expired. 439 * 440 * nce_last is the timestamp that indicates when the nce_res_mp in the 441 * nce_t was last updated to a valid link-layer address. nce_last gets 442 * modified/updated : 443 * - when the nce is created 444 * - every time we get a sane arp response for the nce. 
445 */ 446 #define NCE_EXPIRED(nce, ipst) (nce->nce_last > 0 && \ 447 ((nce->nce_flags & NCE_F_PERMANENT) == 0) && \ 448 ((TICK_TO_MSEC(lbolt64) - nce->nce_last) > \ 449 (ipst)->ips_ip_ire_arp_interval)) 450 451 #endif /* _KERNEL */ 452 453 /* ICMP types */ 454 #define ICMP_ECHO_REPLY 0 455 #define ICMP_DEST_UNREACHABLE 3 456 #define ICMP_SOURCE_QUENCH 4 457 #define ICMP_REDIRECT 5 458 #define ICMP_ECHO_REQUEST 8 459 #define ICMP_ROUTER_ADVERTISEMENT 9 460 #define ICMP_ROUTER_SOLICITATION 10 461 #define ICMP_TIME_EXCEEDED 11 462 #define ICMP_PARAM_PROBLEM 12 463 #define ICMP_TIME_STAMP_REQUEST 13 464 #define ICMP_TIME_STAMP_REPLY 14 465 #define ICMP_INFO_REQUEST 15 466 #define ICMP_INFO_REPLY 16 467 #define ICMP_ADDRESS_MASK_REQUEST 17 468 #define ICMP_ADDRESS_MASK_REPLY 18 469 470 /* ICMP_TIME_EXCEEDED codes */ 471 #define ICMP_TTL_EXCEEDED 0 472 #define ICMP_REASSEMBLY_TIME_EXCEEDED 1 473 474 /* ICMP_DEST_UNREACHABLE codes */ 475 #define ICMP_NET_UNREACHABLE 0 476 #define ICMP_HOST_UNREACHABLE 1 477 #define ICMP_PROTOCOL_UNREACHABLE 2 478 #define ICMP_PORT_UNREACHABLE 3 479 #define ICMP_FRAGMENTATION_NEEDED 4 480 #define ICMP_SOURCE_ROUTE_FAILED 5 481 #define ICMP_DEST_NET_UNKNOWN 6 482 #define ICMP_DEST_HOST_UNKNOWN 7 483 #define ICMP_SRC_HOST_ISOLATED 8 484 #define ICMP_DEST_NET_UNREACH_ADMIN 9 485 #define ICMP_DEST_HOST_UNREACH_ADMIN 10 486 #define ICMP_DEST_NET_UNREACH_TOS 11 487 #define ICMP_DEST_HOST_UNREACH_TOS 12 488 489 /* ICMP Header Structure */ 490 typedef struct icmph_s { 491 uint8_t icmph_type; 492 uint8_t icmph_code; 493 uint16_t icmph_checksum; 494 union { 495 struct { /* ECHO request/response structure */ 496 uint16_t u_echo_ident; 497 uint16_t u_echo_seqnum; 498 } u_echo; 499 struct { /* Destination unreachable structure */ 500 uint16_t u_du_zero; 501 uint16_t u_du_mtu; 502 } u_du; 503 struct { /* Parameter problem structure */ 504 uint8_t u_pp_ptr; 505 uint8_t u_pp_rsvd[3]; 506 } u_pp; 507 struct { /* Redirect structure */ 508 ipaddr_t 
u_rd_gateway; 509 } u_rd; 510 } icmph_u; 511 } icmph_t; 512 513 #define icmph_echo_ident icmph_u.u_echo.u_echo_ident 514 #define icmph_echo_seqnum icmph_u.u_echo.u_echo_seqnum 515 #define icmph_du_zero icmph_u.u_du.u_du_zero 516 #define icmph_du_mtu icmph_u.u_du.u_du_mtu 517 #define icmph_pp_ptr icmph_u.u_pp.u_pp_ptr 518 #define icmph_rd_gateway icmph_u.u_rd.u_rd_gateway 519 520 #define ICMPH_SIZE 8 521 522 /* 523 * Minimum length of transport layer header included in an ICMP error 524 * message for it to be considered valid. 525 */ 526 #define ICMP_MIN_TP_HDR_LEN 8 527 528 /* Aligned IP header */ 529 typedef struct ipha_s { 530 uint8_t ipha_version_and_hdr_length; 531 uint8_t ipha_type_of_service; 532 uint16_t ipha_length; 533 uint16_t ipha_ident; 534 uint16_t ipha_fragment_offset_and_flags; 535 uint8_t ipha_ttl; 536 uint8_t ipha_protocol; 537 uint16_t ipha_hdr_checksum; 538 ipaddr_t ipha_src; 539 ipaddr_t ipha_dst; 540 } ipha_t; 541 542 /* 543 * IP Flags 544 * 545 * Some of these constant names are copied for the DTrace IP provider in 546 * usr/src/lib/libdtrace/common/{ip.d.in, ip.sed.in}, which should be kept 547 * in sync. 548 */ 549 #define IPH_DF 0x4000 /* Don't fragment */ 550 #define IPH_MF 0x2000 /* More fragments to come */ 551 #define IPH_OFFSET 0x1FFF /* Where the offset lives */ 552 #define IPH_FRAG_HDR 0x8000 /* IPv6 don't fragment bit */ 553 554 /* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. 
*/ 555 #define IPH_ECN_NECT 0x0 /* Not ECN-Capable Transport */ 556 #define IPH_ECN_ECT1 0x1 /* ECN-Capable Transport, ECT(1) */ 557 #define IPH_ECN_ECT0 0x2 /* ECN-Capable Transport, ECT(0) */ 558 #define IPH_ECN_CE 0x3 /* ECN-Congestion Experienced (CE) */ 559 560 /* IP Mac info structure */ 561 typedef struct ip_m_s { 562 t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */ 563 int ip_m_type; /* From <net/if_types.h> */ 564 boolean_t (*ip_m_v4mapinfo)(uint_t, uint8_t *, uint8_t *, 565 uint32_t *, ipaddr_t *); 566 boolean_t (*ip_m_v6mapinfo)(uint_t, uint8_t *, uint8_t *, 567 uint32_t *, in6_addr_t *); 568 boolean_t (*ip_m_v6intfid)(uint_t, uint8_t *, in6_addr_t *); 569 } ip_m_t; 570 571 /* 572 * The following functions attempt to reduce the link layer dependency 573 * of the IP stack. The current set of link specific operations are: 574 * a. map from IPv4 class D (224.0/4) multicast address range to the link 575 * layer multicast address range. 576 * b. map from IPv6 multicast address range (ff00::/8) to the link 577 * layer multicast address range. 578 * c. derive the default IPv6 interface identifier from the link layer 579 * address. 
580 */ 581 #define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ 582 (((ip_m)->ip_m_v4mapinfo != NULL) && \ 583 (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) 584 #define MEDIA_V6INTFID(ip_m, plen, phys, v6ptr) \ 585 (((ip_m)->ip_m_v6intfid != NULL) && \ 586 (*(ip_m)->ip_m_v6intfid)(plen, phys, v6ptr)) 587 #define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ 588 (((ip_m)->ip_m_v6mapinfo != NULL) && \ 589 (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) 590 591 /* Router entry types */ 592 #define IRE_BROADCAST 0x0001 /* Route entry for broadcast address */ 593 #define IRE_DEFAULT 0x0002 /* Route entry for default gateway */ 594 #define IRE_LOCAL 0x0004 /* Route entry for local address */ 595 #define IRE_LOOPBACK 0x0008 /* Route entry for loopback address */ 596 #define IRE_PREFIX 0x0010 /* Route entry for prefix routes */ 597 #define IRE_CACHE 0x0020 /* Cached Route entry */ 598 #define IRE_IF_NORESOLVER 0x0040 /* Route entry for local interface */ 599 /* net without any address mapping. */ 600 #define IRE_IF_RESOLVER 0x0080 /* Route entry for local interface */ 601 /* net with resolver. */ 602 #define IRE_HOST 0x0100 /* Host route entry */ 603 #define IRE_HOST_REDIRECT 0x0200 /* only used for T_SVR4_OPTMGMT_REQ */ 604 605 #define IRE_INTERFACE (IRE_IF_NORESOLVER | IRE_IF_RESOLVER) 606 #define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST) 607 #define IRE_CACHETABLE (IRE_CACHE | IRE_BROADCAST | IRE_LOCAL | \ 608 IRE_LOOPBACK) 609 #define IRE_FORWARDTABLE (IRE_INTERFACE | IRE_OFFSUBNET) 610 611 /* 612 * If an IRE is marked with IRE_MARK_CONDEMNED, the last walker of 613 * the bucket should delete this IRE from this bucket. 614 */ 615 #define IRE_MARK_CONDEMNED 0x0001 616 /* 617 * If a broadcast IRE is marked with IRE_MARK_NORECV, ip_rput will drop the 618 * broadcast packets received on that interface. This is marked only 619 * on broadcast ires. 
Employed by IPMP, where we have multiple NICs on the 620 * same subnet receiving the same broadcast packet. 621 */ 622 #define IRE_MARK_NORECV 0x0002 623 /* 624 * IRE_CACHE marked this way won't be returned by ire_cache_lookup. Need 625 * to look specifically using MATCH_IRE_MARK_HIDDEN. Used by IPMP. 626 */ 627 #define IRE_MARK_HIDDEN 0x0004 /* Typically Used by in.mpathd */ 628 629 /* 630 * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing 631 * interface is specified by e.g. IP_PKTINFO. The IRE is not added to the IRE 632 * cache table. 633 */ 634 #define IRE_MARK_NOADD 0x0008 /* Mark not to add ire in cache */ 635 636 /* 637 * IRE marked with IRE_MARK_TEMPORARY means that this IRE has been used 638 * either for forwarding a packet or has not been used for sending 639 * traffic on TCP connections terminated on this system. In both 640 * cases, this IRE is the first to go when IRE is being cleaned up. 641 */ 642 #define IRE_MARK_TEMPORARY 0x0010 643 644 /* 645 * IRE marked with IRE_MARK_USESRC_CHECK means that while adding an IRE with 646 * this mark, additional atomic checks need to be performed. For eg: by the 647 * time an IRE_CACHE is created, sent up to ARP and then comes back to IP; the 648 * usesrc grouping could have changed in which case we want to fail adding 649 * the IRE_CACHE entry 650 */ 651 #define IRE_MARK_USESRC_CHECK 0x0020 652 653 /* 654 * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the 655 * routing table lookup for the destination is bypassed and the packet is 656 * sent directly to the specified nexthop. The associated IRE_CACHE entries 657 * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up 658 * in regular ire cache lookups. 
659 */ 660 #define IRE_MARK_PRIVATE_ADDR 0x0040 661 662 /* 663 * When we send an ARP resolution query for the nexthop gateway's ire, 664 * we use esballoc to create the ire_t in the AR_ENTRY_QUERY mblk 665 * chain, and mark its ire_marks with IRE_MARK_UNCACHED. This flag 666 * indicates that information from ARP has not been transferred to a 667 * permanent IRE_CACHE entry. The flag is reset only when the 668 * information is successfully transferred to an ire_cache entry (in 669 * ire_add()). Attempting to free the AR_ENTRY_QUERY mblk chain prior 670 * to ire_add (e.g., from arp, or from ip`ip_wput_nondata) will 671 * require that the resources (incomplete ire_cache and/or nce) must 672 * be cleaned up. The free callback routine (ire_freemblk()) checks 673 * for IRE_MARK_UNCACHED to see if any resources that are pinned down 674 * will need to be cleaned up or not. 675 */ 676 677 #define IRE_MARK_UNCACHED 0x0080 678 679 /* 680 * The comment below (and for other netstack_t references) refers 681 * to the fact that we only do netstack_hold in particular cases, 682 * such as the references from open streams (ill_t and conn_t's 683 * pointers). Internally within IP we rely on IP's ability to cleanup e.g. 684 * ire_t's when an ill goes away. 685 */ 686 typedef struct ire_expire_arg_s { 687 int iea_flush_flag; 688 ip_stack_t *iea_ipst; /* Does not have a netstack_hold */ 689 } ire_expire_arg_t; 690 691 /* Flags with ire_expire routine */ 692 #define FLUSH_ARP_TIME 0x0001 /* ARP info potentially stale timer */ 693 #define FLUSH_REDIRECT_TIME 0x0002 /* Redirects potentially stale */ 694 #define FLUSH_MTU_TIME 0x0004 /* Include path MTU per RFC 1191 */ 695 696 /* Arguments to ire_flush_cache() */ 697 #define IRE_FLUSH_DELETE 0 698 #define IRE_FLUSH_ADD 1 699 700 /* 701 * Open/close synchronization flags. 702 * These are kept in a separate field in the conn and the synchronization 703 * depends on the atomic 32 bit access to that field. 
704 */ 705 #define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */ 706 #define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */ 707 #define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */ 708 #define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */ 709 #define CONN_QUIESCED 0x10 /* conn is now quiescent */ 710 711 /* Used to check connection state flags before caching the IRE */ 712 #define CONN_CACHE_IRE(connp) \ 713 (!((connp)->conn_state_flags & (CONN_CLOSING|CONN_CONDEMNED))) 714 715 /* 716 * Parameter to ip_output giving the identity of the caller. 717 * IP_WSRV means the packet was enqueued in the STREAMS queue 718 * due to flow control and is now being reprocessed in the context of 719 * the STREAMS service procedure, consequent to flow control relief. 720 * IRE_SEND means the packet is being reprocessed consequent to an 721 * ire cache creation and addition and this may or may not be happening 722 * in the service procedure context. Anything other than the above 2 723 * cases is identified as IP_WPUT. Most commonly this is the case of 724 * packets coming down from the application. 725 */ 726 #ifdef _KERNEL 727 #define IP_WSRV 1 /* Called from ip_wsrv */ 728 #define IP_WPUT 2 /* Called from ip_wput */ 729 #define IRE_SEND 3 /* Called from ire_send */ 730 731 /* 732 * Extra structures need for per-src-addr filtering (IGMPv3/MLDv2) 733 */ 734 #define MAX_FILTER_SIZE 64 735 736 typedef struct slist_s { 737 int sl_numsrc; 738 in6_addr_t sl_addr[MAX_FILTER_SIZE]; 739 } slist_t; 740 741 /* 742 * Following struct is used to maintain retransmission state for 743 * a multicast group. One rtx_state_t struct is an in-line field 744 * of the ilm_t struct; the slist_ts in the rtx_state_t struct are 745 * alloc'd as needed. 
746 */ 747 typedef struct rtx_state_s { 748 uint_t rtx_timer; /* retrans timer */ 749 int rtx_cnt; /* retrans count */ 750 int rtx_fmode_cnt; /* retrans count for fmode change */ 751 slist_t *rtx_allow; 752 slist_t *rtx_block; 753 } rtx_state_t; 754 755 /* 756 * Used to construct list of multicast address records that will be 757 * sent in a single listener report. 758 */ 759 typedef struct mrec_s { 760 struct mrec_s *mrec_next; 761 uint8_t mrec_type; 762 uint8_t mrec_auxlen; /* currently unused */ 763 in6_addr_t mrec_group; 764 slist_t mrec_srcs; 765 } mrec_t; 766 767 /* Group membership list per upper conn */ 768 /* 769 * XXX add ilg info for ifaddr/ifindex. 770 * XXX can we make ilg survive an ifconfig unplumb + plumb 771 * by setting the ipif/ill to NULL and recover that later? 772 * 773 * ilg_ipif is used by IPv4 as multicast groups are joined using an interface 774 * address (ipif). 775 * ilg_ill is used by IPv6 as multicast groups are joined using an interface 776 * index (phyint->phyint_ifindex). 777 * ilg_ill is NULL for IPv4 and ilg_ipif is NULL for IPv6. 778 * 779 * ilg records the state of multicast memberships of a socket end point. 780 * ilm records the state of multicast memberships with the driver and is 781 * maintained per interface. 782 * 783 * Notes : 784 * 785 * 1) There is no direct link between a given ilg and ilm. If the 786 * application has joined a group G with ifindex I, we will have 787 * an ilg with ilg_v6group and ilg_ill. There will be a corresponding 788 * ilm with ilm_ill/ilm_v6addr recording the multicast membership. 789 * To delete the membership, 790 * 791 * a) Search for ilg matching on G and I with ilg_v6group 792 * and ilg_ill. Delete ilg_ill. 793 * b) Search the corresponding ilm matching on G and I with 794 * ilm_v6addr and ilm_ill. Delete ilm. 795 * 796 * In IPv4, the only difference is, we look using ipifs instead of 797 * ills. 
*
 * 2) With IP multipathing, we want to keep receiving even after the
 *    interface has failed. We do this by moving multicast memberships
 *    to a new_ill within the group. This is achieved by sending
 *    DL_DISABMULTI_REQs on ilg_ill/ilm_ill and sending DL_ENABMULTI_REQs
 *    on the new_ill and changing ilg_ill/ilm_ill to new_ill. But, we
 *    need to be able to delete memberships which will still come down
 *    with the ifindex of the old ill which is what the application
 *    knows of. Thus we store the ilm_/ilg_orig_ifindex to keep track
 *    of where we joined initially so that we can lookup even after we
 *    moved the membership. It is also used for moving back the membership
 *    when the old ill has been repaired. This is done by looking up
 *    ilms with ilm_orig_ifindex matching on the old ill's ifindex. Only
 *    ilms actually move from old ill to new ill. ilgs don't move (just
 *    the ilg_ill is changed when it moves) as it just records the state
 *    of the application that has joined a group G whereas ilm records
 *    the state joined with the driver. Thus when we send DL_XXXMULTI_REQs
 *    we also need to keep the ilm in the right ill.
 *
 *    In IPv4, as ipifs move from old ill to new_ill, ilgs and ilms move
 *    implicitly as we use only ipifs in IPv4. Thus, one can always lookup
 *    a given ilm/ilg even after it fails without the support of
 *    orig_ifindex. We move ilms still to record the driver state as
 *    mentioned above.
 */

/*
 * The ilg_t and ilm_t members are protected by ipsq. They can be changed only
 * by a thread executing in the ipsq. In other words add/delete of a
 * multicast group has to execute in the ipsq.
828 */ 829 #define ILG_DELETED 0x1 /* ilg_flags */ 830 typedef struct ilg_s { 831 in6_addr_t ilg_v6group; 832 struct ipif_s *ilg_ipif; /* Logical interface we are member on */ 833 struct ill_s *ilg_ill; /* Used by IPv6 */ 834 int ilg_orig_ifindex; /* Interface originally joined on */ 835 uint_t ilg_flags; 836 mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ 837 slist_t *ilg_filter; 838 } ilg_t; 839 840 /* 841 * Multicast address list entry for ill. 842 * ilm_ipif is used by IPv4 as multicast groups are joined using ipif. 843 * ilm_ill is used by IPv6 as multicast groups are joined using ill. 844 * ilm_ill is NULL for IPv4 and ilm_ipif is NULL for IPv6. 845 * 846 * The comment below (and for other netstack_t references) refers 847 * to the fact that we only do netstack_hold in particular cases, 848 * such as the references from open streams (ill_t and conn_t's 849 * pointers). Internally within IP we rely on IP's ability to cleanup e.g. 850 * ire_t's when an ill goes away. 
851 */ 852 #define ILM_DELETED 0x1 /* ilm_flags */ 853 typedef struct ilm_s { 854 in6_addr_t ilm_v6addr; 855 int ilm_refcnt; 856 uint_t ilm_timer; /* IGMP/MLD query resp timer, in msec */ 857 struct ipif_s *ilm_ipif; /* Back pointer to ipif for IPv4 */ 858 struct ilm_s *ilm_next; /* Linked list for each ill */ 859 uint_t ilm_state; /* state of the membership */ 860 struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ 861 int ilm_orig_ifindex; /* V6_MULTICAST_IF/ilm_ipif index */ 862 uint_t ilm_flags; 863 boolean_t ilm_is_new; /* new ilm */ 864 boolean_t ilm_notify_driver; /* Need to notify the driver */ 865 zoneid_t ilm_zoneid; 866 int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ 867 mcast_record_t ilm_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ 868 slist_t *ilm_filter; /* source filter list */ 869 slist_t *ilm_pendsrcs; /* relevant src addrs for pending req */ 870 rtx_state_t ilm_rtx; /* SCR retransmission state */ 871 ip_stack_t *ilm_ipst; /* Does not have a netstack_hold */ 872 } ilm_t; 873 874 #define ilm_addr V4_PART_OF_V6(ilm_v6addr) 875 876 /* 877 * ilm_walker_cleanup needs to execute when the ilm_walker_cnt goes down to 878 * zero. In addition it needs to block new walkers while it is unlinking ilm's 879 * from the list. Thus simple atomics for the ill_ilm_walker_cnt don't suffice. 880 */ 881 #define ILM_WALKER_HOLD(ill) { \ 882 mutex_enter(&(ill)->ill_lock); \ 883 ill->ill_ilm_walker_cnt++; \ 884 mutex_exit(&(ill)->ill_lock); \ 885 } 886 887 /* 888 * ilm_walker_cleanup releases ill_lock 889 */ 890 #define ILM_WALKER_RELE(ill) { \ 891 mutex_enter(&(ill)->ill_lock); \ 892 (ill)->ill_ilm_walker_cnt--; \ 893 if ((ill)->ill_ilm_walker_cnt == 0 && (ill)->ill_ilm_cleanup_reqd) \ 894 ilm_walker_cleanup(ill); \ 895 else \ 896 mutex_exit(&(ill)->ill_lock); \ 897 } 898 899 /* 900 * Soft reference to an IPsec SA. 
901 * 902 * On relative terms, conn's can be persistent (living as long as the 903 * processes which create them), while SA's are ephemeral (dying when 904 * they hit their time-based or byte-based lifetimes). 905 * 906 * We could hold a hard reference to an SA from an ipsec_latch_t, 907 * but this would cause expired SA's to linger for a potentially 908 * unbounded time. 909 * 910 * Instead, we remember the hash bucket number and bucket generation 911 * in addition to the pointer. The bucket generation is incremented on 912 * each deletion. 913 */ 914 typedef struct ipsa_ref_s 915 { 916 struct ipsa_s *ipsr_sa; 917 struct isaf_s *ipsr_bucket; 918 uint64_t ipsr_gen; 919 } ipsa_ref_t; 920 921 /* 922 * IPsec "latching" state. 923 * 924 * In the presence of IPsec policy, fully-bound conn's bind a connection 925 * to more than just the 5-tuple, but also a specific IPsec action and 926 * identity-pair. 927 * 928 * As an optimization, we also cache soft references to IPsec SA's 929 * here so that we can fast-path around most of the work needed for 930 * outbound IPsec SA selection. 931 * 932 * Were it not for TCP's detached connections, this state would be 933 * in-line in conn_t; instead, this is in a separate structure so it 934 * can be handed off to TCP