1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1392 ja97890 * Common Development and Distribution License (the "License"). 6 1392 ja97890 * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 3448 dh155122 22 0 stevel /* 23 8485 Peter * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 0 stevel * Use is subject to license terms. 25 0 stevel */ 26 0 stevel /* Copyright (c) 1990 Mentat Inc. 
*/ 27 0 stevel 28 0 stevel #include <sys/types.h> 29 0 stevel #include <sys/stream.h> 30 0 stevel #include <sys/dlpi.h> 31 0 stevel #include <sys/stropts.h> 32 0 stevel #include <sys/sysmacros.h> 33 0 stevel #include <sys/strsubr.h> 34 0 stevel #include <sys/strlog.h> 35 0 stevel #include <sys/strsun.h> 36 0 stevel #include <sys/zone.h> 37 0 stevel #define _SUN_TPI_VERSION 2 38 0 stevel #include <sys/tihdr.h> 39 0 stevel #include <sys/xti_inet.h> 40 0 stevel #include <sys/ddi.h> 41 11042 Erik #include <sys/suntpi.h> 42 0 stevel #include <sys/cmn_err.h> 43 0 stevel #include <sys/debug.h> 44 0 stevel #include <sys/kobj.h> 45 0 stevel #include <sys/modctl.h> 46 0 stevel #include <sys/atomic.h> 47 0 stevel #include <sys/policy.h> 48 1676 jpk #include <sys/priv.h> 49 8275 Eric #include <sys/taskq.h> 50 0 stevel 51 0 stevel #include <sys/systm.h> 52 0 stevel #include <sys/param.h> 53 0 stevel #include <sys/kmem.h> 54 2958 dr146992 #include <sys/sdt.h> 55 0 stevel #include <sys/socket.h> 56 0 stevel #include <sys/vtrace.h> 57 0 stevel #include <sys/isa_defs.h> 58 5868 dr146992 #include <sys/mac.h> 59 0 stevel #include <net/if.h> 60 0 stevel #include <net/if_arp.h> 61 0 stevel #include <net/route.h> 62 0 stevel #include <sys/sockio.h> 63 0 stevel #include <netinet/in.h> 64 0 stevel #include <net/if_dl.h> 65 0 stevel 66 0 stevel #include <inet/common.h> 67 0 stevel #include <inet/mi.h> 68 0 stevel #include <inet/mib2.h> 69 0 stevel #include <inet/nd.h> 70 0 stevel #include <inet/arp.h> 71 0 stevel #include <inet/snmpcom.h> 72 5240 nordmark #include <inet/optcom.h> 73 0 stevel #include <inet/kstatcom.h> 74 0 stevel 75 0 stevel #include <netinet/igmp_var.h> 76 0 stevel #include <netinet/ip6.h> 77 0 stevel #include <netinet/icmp6.h> 78 0 stevel #include <netinet/sctp.h> 79 0 stevel 80 0 stevel #include <inet/ip.h> 81 741 masputra #include <inet/ip_impl.h> 82 0 stevel #include <inet/ip6.h> 83 0 stevel #include <inet/ip6_asp.h> 84 0 stevel #include <inet/tcp.h> 85 741 masputra 
#include <inet/tcp_impl.h> 86 0 stevel #include <inet/ip_multi.h> 87 0 stevel #include <inet/ip_if.h> 88 0 stevel #include <inet/ip_ire.h> 89 2535 sangeeta #include <inet/ip_ftable.h> 90 0 stevel #include <inet/ip_rts.h> 91 0 stevel #include <inet/ip_ndp.h> 92 0 stevel #include <inet/ip_listutils.h> 93 0 stevel #include <netinet/igmp.h> 94 0 stevel #include <netinet/ip_mroute.h> 95 0 stevel #include <inet/ipp_common.h> 96 0 stevel 97 0 stevel #include <net/pfkeyv2.h> 98 0 stevel #include <inet/sadb.h> 99 0 stevel #include <inet/ipsec_impl.h> 100 10616 Sebastien #include <inet/iptun/iptun_impl.h> 101 0 stevel #include <inet/ipdrop.h> 102 2958 dr146992 #include <inet/ip_netinfo.h> 103 10946 Sangeeta #include <inet/ilb_ip.h> 104 0 stevel 105 0 stevel #include <sys/ethernet.h> 106 0 stevel #include <net/if_types.h> 107 0 stevel #include <sys/cpuvar.h> 108 0 stevel 109 0 stevel #include <ipp/ipp.h> 110 0 stevel #include <ipp/ipp_impl.h> 111 0 stevel #include <ipp/ipgpc/ipgpc.h> 112 0 stevel 113 0 stevel #include <sys/pattr.h> 114 0 stevel #include <inet/ipclassifier.h> 115 0 stevel #include <inet/sctp_ip.h> 116 2252 priyanka #include <inet/sctp/sctp_impl.h> 117 741 masputra #include <inet/udp_impl.h> 118 5240 nordmark #include <inet/rawip_impl.h> 119 5240 nordmark #include <inet/rts_impl.h> 120 1676 jpk 121 1676 jpk #include <sys/tsol/label.h> 122 1676 jpk #include <sys/tsol/tnet.h> 123 1676 jpk 124 8275 Eric #include <sys/squeue_impl.h> 125 11042 Erik #include <inet/ip_arp.h> 126 11110 Erik 127 11110 Erik #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 128 0 stevel 129 0 stevel /* 130 0 stevel * Values for squeue switch: 131 8275 Eric * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN 132 8275 Eric * IP_SQUEUE_ENTER: SQ_PROCESS 133 8275 Eric * IP_SQUEUE_FILL: SQ_FILL 134 0 stevel */ 135 11042 Erik int ip_squeue_enter = IP_SQUEUE_ENTER; /* Settable in /etc/system */ 136 3448 dh155122 137 8275 Eric int ip_squeue_flag; 138 0 stevel 139 3448 dh155122 /* 140 3448 dh155122 *
Settable in /etc/system 141 3448 dh155122 */ 142 0 stevel int ip_poll_normal_ms = 100; 143 0 stevel int ip_poll_normal_ticks = 0; 144 3233 yz147064 int ip_modclose_ackwait_ms = 3000; 145 5023 carlsonj 146 5023 carlsonj /* 147 5023 carlsonj * It would be nice to have these present only in DEBUG systems, but the 148 5023 carlsonj * current design of the global symbol checking logic requires them to be 149 5023 carlsonj * unconditionally present. 150 5023 carlsonj */ 151 5023 carlsonj uint_t ip_thread_data; /* TSD key for debug support */ 152 5023 carlsonj krwlock_t ip_thread_rwlock; 153 5023 carlsonj list_t ip_thread_list; 154 0 stevel 155 0 stevel /* 156 0 stevel * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. 157 0 stevel */ 158 0 stevel 159 0 stevel struct listptr_s { 160 0 stevel mblk_t *lp_head; /* pointer to the head of the list */ 161 0 stevel mblk_t *lp_tail; /* pointer to the tail of the list */ 162 0 stevel }; 163 0 stevel 164 0 stevel typedef struct listptr_s listptr_t; 165 0 stevel 166 0 stevel /* 167 1676 jpk * This is used by ip_snmp_get_mib2_ip_route_media and 168 1676 jpk * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. 169 1676 jpk */ 170 1676 jpk typedef struct iproutedata_s { 171 1676 jpk uint_t ird_idx; 172 8485 Peter uint_t ird_flags; /* IRD_* flags; see IRD_REPORT_ALL below */ 173 1676 jpk listptr_t ird_route; /* ipRouteEntryTable */ 174 1676 jpk listptr_t ird_netmedia; /* ipNetToMediaEntryTable */ 175 1676 jpk listptr_t ird_attrs; /* ipRouteAttributeTable */ 176 1676 jpk } iproutedata_t; 177 1676 jpk 178 11042 Erik /* ird_flags value: include ire_testhidden and IRE_IF_CLONE routes */ 179 11042 Erik #define IRD_REPORT_ALL 0x01 180 8485 Peter 181 1676 jpk /* 182 0 stevel * Cluster specific hooks. These should be NULL when booted as a non-cluster 183 0 stevel */ 184 0 stevel 185 0 stevel /* 186 0 stevel * Hook functions to enable cluster networking. 187 0 stevel * On non-clustered systems these vectors must always be NULL.
188 0 stevel * 189 0 stevel * Hook function to check whether the specified IP address is a shared IP address 190 0 stevel * in the cluster. 191 0 stevel * 192 0 stevel */ 193 8392 Huafeng int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, 194 8392 Huafeng sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL; 195 0 stevel 196 0 stevel /* 197 0 stevel * Hook function to generate a cluster-wide IP fragment identifier. 198 0 stevel */ 199 8392 Huafeng uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, 200 8392 Huafeng sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp, 201 8392 Huafeng void *args) = NULL; 202 7749 Thejaswini 203 7749 Thejaswini /* 204 7749 Thejaswini * Hook function to generate a cluster-wide SPI. 205 7749 Thejaswini */ 206 8392 Huafeng void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, 207 8392 Huafeng void *) = NULL; 208 7749 Thejaswini 209 7749 Thejaswini /* 210 7749 Thejaswini * Hook function to verify whether the SPI is already utilized. 211 7749 Thejaswini */ 212 7749 Thejaswini 213 8392 Huafeng int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL; 214 7749 Thejaswini 215 7749 Thejaswini /* 216 7749 Thejaswini * Hook function to delete the SPI from the cluster-wide repository. 217 7749 Thejaswini */ 218 7749 Thejaswini 219 8392 Huafeng void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL; 220 7749 Thejaswini 221 7749 Thejaswini /* 222 7749 Thejaswini * Hook function to inform the cluster when a packet is received on an IDLE SA. 223 7749 Thejaswini */ 224 7749 Thejaswini 225 8392 Huafeng void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, 226 8392 Huafeng in6_addr_t, in6_addr_t, void *) = NULL; 227 0 stevel 228 0 stevel /* 229 0 stevel * Synchronization notes: 230 0 stevel * 231 0 stevel * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any 232 0 stevel * MT level protection given by STREAMS.
IP uses a combination of its own 233 0 stevel * internal serialization mechanism and standard Solaris locking techniques. 234 8485 Peter * The internal serialization is per phyint. This is used to serialize 235 11042 Erik * plumbing operations, IPMP operations, most set ioctls, etc. 236 0 stevel * 237 0 stevel * Plumbing is a long sequence of operations involving message 238 0 stevel * exchanges between IP, ARP and device drivers. Many set ioctls are typically 239 0 stevel * involved in plumbing operations. A natural model is to serialize these 240 0 stevel * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in 241 0 stevel * parallel without any interference. But various set ioctls on hme0 are best 242 11042 Erik * serialized, along with IPMP operations and processing of DLPI control 243 11042 Erik * messages received from drivers on a per phyint basis. This serialization is 244 11042 Erik * provided by the ipsq_t and primitives operating on this. Details can 245 11042 Erik * be found in ip_if.c above the core primitives operating on ipsq_t. 246 0 stevel * 247 0 stevel * Lookups of an ipif or ill by a thread return a refheld ipif / ill. 248 0 stevel * Similarly, lookup of an ire by a thread also returns a refheld ire. 249 0 stevel * In addition ipif's and ill's referenced by the ire are also indirectly 250 11042 Erik * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld 251 8485 Peter * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the 252 0 stevel * address of an ipif has to go through the ipsq_t. This ensures that only 253 11042 Erik * one such exclusive operation proceeds at any time on the ipif. It then 254 11042 Erik * waits for all refcnts 255 0 stevel * associated with this ipif to come down to zero. The address is changed 256 0 stevel * only after the ipif has been quiesced. Then the ipif is brought up again. 257 0 stevel * More details are described in the comment above ip_sioctl_flags.
258 0 stevel * 259 0 stevel * Packet processing is based mostly on IREs and are fully multi-threaded 260 0 stevel * using standard Solaris MT techniques. 261 0 stevel * 262 0 stevel * There are explicit locks in IP to handle: 263 0 stevel * - The ip_g_head list maintained by mi_open_link() and friends. 264 0 stevel * 265 0 stevel * - The reassembly data structures (one lock per hash bucket) 266 0 stevel * 267 0 stevel * - conn_lock is meant to protect conn_t fields. The fields actually 268 0 stevel * protected by conn_lock are documented in the conn_t definition. 269 0 stevel * 270 0 stevel * - ire_lock to protect some of the fields of the ire, IRE tables 271 0 stevel * (one lock per hash bucket). Refer to ip_ire.c for details. 272 0 stevel * 273 11042 Erik * - ndp_g_lock and ncec_lock for protecting NCEs. 274 0 stevel * 275 0 stevel * - ill_lock protects fields of the ill and ipif. Details in ip.h 276 0 stevel * 277 0 stevel * - ill_g_lock: This is a global reader/writer lock. Protects the following 278 0 stevel * * The AVL tree based global multi list of all ills. 279 0 stevel * * The linked list of all ipifs of an ill 280 8485 Peter * * The <ipsq-xop> mapping 281 0 stevel * * <ill-phyint> association 282 0 stevel * Insertion/deletion of an ill in the system, insertion/deletion of an ipif 283 8485 Peter * into an ill, changing the <ipsq-xop> mapping of an ill, changing the 284 8485 Peter * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as 285 8485 Peter * writer for the actual duration of the insertion/deletion/change. 286 0 stevel * 287 0 stevel * - ill_lock: This is a per ill mutex. 288 8485 Peter * It protects some members of the ill_t struct; see ip.h for details. 289 0 stevel * It also protects the <ill-phyint> assoc. 290 0 stevel * It also protects the list of ipifs hanging off the ill. 291 0 stevel * 292 0 stevel * - ipsq_lock: This is a per ipsq_t mutex lock. 
293 8485 Peter * This protects some members of the ipsq_t struct; see ip.h for details. 294 8485 Peter * It also protects the <ipsq-ipxop> mapping 295 8485 Peter * 296 8485 Peter * - ipx_lock: This is a per ipxop_t mutex lock. 297 8485 Peter * This protects some members of the ipxop_t struct; see ip.h for details. 298 0 stevel * 299 0 stevel * - phyint_lock: This is a per phyint mutex lock. Protects just the 300 0 stevel * phyint_flags 301 0 stevel * 302 0 stevel * - ip_g_nd_lock: This is a global reader/writer lock. 303 0 stevel * Any call to nd_load to load a new parameter to the ND table must hold the 304 0 stevel * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock 305 0 stevel * as reader. 306 0 stevel * 307 0 stevel * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. 308 0 stevel * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the 309 0 stevel * uniqueness check also done atomically. 310 0 stevel * 311 0 stevel * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc 312 0 stevel * group list linked by ill_usesrc_grp_next. It also protects the 313 0 stevel * ill_usesrc_ifindex field. It is taken as a writer when a member of the 314 0 stevel * group is being added or deleted. This lock is taken as a reader when 315 0 stevel * walking the list/group(eg: to get the number of members in a usesrc group). 316 0 stevel * Note, it is only necessary to take this lock if the ill_usesrc_grp_next 317 0 stevel * field is changing state i.e from NULL to non-NULL or vice-versa. For 318 0 stevel * example, it is not necessary to take this lock in the initial portion 319 8485 Peter * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these 320 8485 Peter * operations are executed exclusively and that ensures that the "usesrc 321 8485 Peter * group state" cannot change. 
The "usesrc group state" change can happen 322 8485 Peter * only in the latter part of ip_sioctl_slifusesrc and in ill_delete. 323 8485 Peter * 324 8485 Peter * Changing <ill-phyint>, <ipsq-xop> associations: 325 0 stevel * 326 0 stevel * To change the <ill-phyint> association, the ill_g_lock must be held 327 0 stevel * as writer, and the ill_locks of both the v4 and v6 instances of the ill 328 0 stevel * must be held. 329 0 stevel * 330 8485 Peter * To change the <ipsq-xop> association, the ill_g_lock must be held as 331 8485 Peter * writer, the ipsq_lock must be held, and one must be writer on the ipsq. 332 8485 Peter * This is only done when ills are added to or removed from IPMP groups. 333 0 stevel * 334 0 stevel * To add or delete an ipif from the list of ipifs hanging off the ill, 335 0 stevel * ill_g_lock (writer) and ill_lock must be held and the thread must be 336 8485 Peter * a writer on the associated ipsq. 337 0 stevel * 338 0 stevel * To add or delete an ill to the system, the ill_g_lock must be held as 339 0 stevel * writer and the thread must be a writer on the associated ipsq. 340 0 stevel * 341 0 stevel * To add or delete an ilm to an ill, the ill_lock must be held and the thread 342 0 stevel * must be a writer on the associated ipsq. 343 0 stevel * 344 0 stevel * Lock hierarchy 345 0 stevel * 346 0 stevel * Some lock hierarchy scenarios are listed below.
347 0 stevel * 348 8485 Peter * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock 349 0 stevel * ill_g_lock -> ill_lock(s) -> phyint_lock 350 11042 Erik * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock 351 0 stevel * ill_g_lock -> ip_addr_avail_lock 352 0 stevel * conn_lock -> irb_lock -> ill_lock -> ire_lock 353 0 stevel * ill_g_lock -> ip_g_nd_lock 354 11042 Erik * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock 355 11042 Erik * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock 356 11042 Erik * arl_lock -> ill_lock 357 11042 Erik * ips_ire_dep_lock -> irb_lock 358 1676 jpk * 359 1676 jpk * When more than 1 ill lock is needed to be held, all ill lock addresses 360 1676 jpk * are sorted on address and locked starting from highest addressed lock 361 1676 jpk * downward. 362 1676 jpk * 363 11042 Erik * Multicast scenarios 364 11042 Erik * ips_ill_g_lock -> ill_mcast_lock 365 11042 Erik * conn_ilg_lock -> ips_ill_g_lock -> ill_lock 366 11042 Erik * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock 367 11042 Erik * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock 368 11042 Erik * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock 369 11042 Erik * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock 370 11042 Erik * 371 1676 jpk * IPsec scenarios 372 1676 jpk * 373 1676 jpk * ipsa_lock -> ill_g_lock -> ill_lock 374 0 stevel * ill_g_usesrc_lock -> ill_g_lock -> ill_lock 375 0 stevel * 376 1676 jpk * Trusted Solaris scenarios 377 1676 jpk * 378 1676 jpk * igsa_lock -> gcgrp_rwlock -> gcgrp_lock 379 1676 jpk * igsa_lock -> gcdb_lock 380 1676 jpk * gcgrp_rwlock -> ire_lock 381 1676 jpk * gcgrp_rwlock -> gcdb_lock 382 0 stevel * 383 8275 Eric * squeue(sq_lock), flow related (ft_lock, fe_lock) locking 384 8275 Eric * 385 8275 Eric * cpu_lock --> ill_lock --> sqset_lock --> sq_lock 386 8275 Eric * sq_lock -> conn_lock -> QLOCK(q) 387 8275 Eric * ill_lock -> ft_lock -> fe_lock 388 2535 sangeeta * 389 
2535 sangeeta * Routing/forwarding table locking notes: 390 2535 sangeeta * 391 2535 sangeeta * Lock acquisition order: Radix tree lock, irb_lock. 392 2535 sangeeta * Requirements: 393 2535 sangeeta * i. Walker must not hold any locks during the walker callback. 394 2535 sangeeta * ii Walker must not see a truncated tree during the walk because of any node 395 2535 sangeeta * deletion. 396 2535 sangeeta * iii Existing code assumes ire_bucket is valid if it is non-null and is used 397 2535 sangeeta * in many places in the code to walk the irb list. Thus even if all the 398 2535 sangeeta * ires in a bucket have been deleted, we still can't free the radix node 399 2535 sangeeta * until the ires have actually been inactive'd (freed). 400 2535 sangeeta * 401 2535 sangeeta * Tree traversal - Need to hold the global tree lock in read mode. 402 2535 sangeeta * Before dropping the global tree lock, need to either increment the ire_refcnt 403 2535 sangeeta * to ensure that the radix node can't be deleted. 404 2535 sangeeta * 405 2535 sangeeta * Tree add - Need to hold the global tree lock in write mode to add a 406 2535 sangeeta * radix node. To prevent the node from being deleted, increment the 407 2535 sangeeta * irb_refcnt, after the node is added to the tree. The ire itself is 408 2535 sangeeta * added later while holding the irb_lock, but not the tree lock. 409 2535 sangeeta * 410 2535 sangeeta * Tree delete - Need to hold the global tree lock and irb_lock in write mode. 411 2535 sangeeta * All associated ires must be inactive (i.e. freed), and irb_refcnt 412 2535 sangeeta * must be zero. 413 2535 sangeeta * 414 2535 sangeeta * Walker - Increment irb_refcnt before calling the walker callback. Hold the 415 2535 sangeeta * global tree lock (read mode) for traversal. 416 2535 sangeeta * 417 11042 Erik * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele 418 11042 Erik * hence we will acquire irb_lock while holding ips_ire_dep_lock. 
419 11042 Erik * 420 4987 danmcd * IPsec notes : 421 4987 danmcd * 422 11042 Erik * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes 423 11042 Erik * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the 424 11042 Erik * ip_xmit_attr_t has the 425 4987 danmcd * information used by the IPsec code for applying the right level of 426 11042 Erik * protection. The information initialized by IP in the ip_xmit_attr_t 427 0 stevel * is determined by the per-socket policy or global policy in the system. 428 11042 Erik * For inbound datagrams, the ip_recv_attr_t 429 11042 Erik * starts out with nothing in it. It gets filled 430 0 stevel * with the right information if it goes through the AH/ESP code, which 431 0 stevel * happens if the incoming packet is secure. The information initialized 432 11042 Erik * by AH/ESP is later used by IP (during fanouts to ULP) to see whether 433 0 stevel * the policy requirements needed by per-socket policy or global policy 434 0 stevel * are met or not. 435 0 stevel * 436 0 stevel * For fully connected sockets i.e. dst, src [addr, port] is known, 437 0 stevel * conn_policy_cached is set indicating that policy has been cached. 438 0 stevel * conn_in_enforce_policy may or may not be set depending on whether 439 0 stevel * there is a global policy match or per-socket policy match. 440 11042 Erik * Policy inheriting happens in ip_policy_set once the destination is known. 441 0 stevel * Once the right policy is set on the conn_t, policy cannot change for 442 0 stevel * this socket. This makes life simpler for TCP (UDP ?) where 443 0 stevel * re-transmissions go out with the same policy. For symmetry, policy 444 0 stevel * is cached for fully connected UDP sockets also. Thus if policy is cached, 445 0 stevel * it also implies that policy is latched i.e. policy cannot change 446 0 stevel * on these sockets.
As we have the right policy on the conn, we don't 447 0 stevel * have to lookup global policy for every outbound and inbound datagram, 448 0 stevel * which serves as an optimization. Note that a global policy change 449 0 stevel * does not affect fully connected sockets if they have policy. If fully 450 0 stevel * connected sockets did not have any policy associated with them, global 451 0 stevel * policy change may affect them. 452 0 stevel * 453 0 stevel * IP Flow control notes: 454 8833 Venu * --------------------- 455 8833 Venu * Non-TCP streams are flow controlled by IP. The way this is accomplished 456 8833 Venu * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When 457 8833 Venu * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into 458 8833 Venu * GLDv3. Otherwise packets are sent down to lower layers using STREAMS 459 8833 Venu * functions. 460 8833 Venu * 461 8833 Venu * Per Tx ring udp flow control: 462 8833 Venu * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in 463 8833 Venu * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true). 464 8833 Venu * 465 8833 Venu * The underlying link can expose multiple Tx rings to the GLDv3 mac layer. 466 8833 Venu * To achieve best performance, outgoing traffic needs to be fanned out among 467 8833 Venu * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send 468 8833 Venu * traffic out of the NIC and it takes a fanout hint. UDP connections pass 469 8833 Venu * the address of connp as fanout hint to mac_tx(). Under flow controlled 470 8833 Venu * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This 471 8833 Venu * cookie points to a specific Tx ring that is blocked. The cookie is used to 472 8833 Venu * hash into an idl_tx_list[] entry in idl_tx_list[] array. Each idl_tx_list_t 473 8833 Venu * points to drain_lists (idl_t's). These drain lists will store the blocked UDP 474 8833 Venu * connp's.
The drain list is not a single list but a configurable number of 475 8833 Venu * lists. 476 8833 Venu * 477 8833 Venu * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t 478 8833 Venu * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE 479 8833 Venu * which is equal to 128. This array in turn contains a pointer to idl_t[], 480 8833 Venu * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain 481 8833 Venu * list will point to the list of connp's that are flow controlled. 482 8833 Venu * 483 8833 Venu * --------------- ------- ------- ------- 484 8833 Venu * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> 485 8833 Venu * | --------------- ------- ------- ------- 486 8833 Venu * | --------------- ------- ------- ------- 487 8833 Venu * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> 488 8833 Venu * ---------------- | --------------- ------- ------- ------- 489 8833 Venu * |idl_tx_list[0]|->| --------------- ------- ------- ------- 490 8833 Venu * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|--> 491 8833 Venu * | --------------- ------- ------- ------- 492 8833 Venu * . . . . . 493 8833 Venu * | --------------- ------- ------- ------- 494 8833 Venu * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> 495 8833 Venu * --------------- ------- ------- ------- 496 8833 Venu * --------------- ------- ------- ------- 497 8833 Venu * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> 498 8833 Venu * | --------------- ------- ------- ------- 499 8833 Venu * | --------------- ------- ------- ------- 500 8833 Venu * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> 501 8833 Venu * |idl_tx_list[1]|->| --------------- ------- ------- ------- 502 8833 Venu * ---------------- | . . . . 
503 8833 Venu * | --------------- ------- ------- ------- 504 8833 Venu * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> 505 8833 Venu * --------------- ------- ------- ------- 506 8833 Venu * ..... 507 8833 Venu * ---------------- 508 8833 Venu * |idl_tx_list[n]|-> ... 509 8833 Venu * ---------------- 510 8833 Venu * 511 8833 Venu * When mac_tx() returns a cookie, the cookie is used to hash into a 512 8833 Venu * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is 513 8833 Venu * called passing idl_tx_list. The connp gets inserted in a drain list 514 8833 Venu * pointed to by idl_tx_list. conn_drain_list() asserts flow control for 515 11042 Erik * the sockets (non stream based) and sets QFULL condition on the conn_wq 516 11042 Erik * of streams sockets, or the su_txqfull for non-streams sockets. 517 8833 Venu * connp->conn_direct_blocked will be set to indicate the blocked 518 8833 Venu * condition. 519 8833 Venu * 520 8833 Venu * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved. 521 8833 Venu * A cookie is passed in the call to ill_flow_enable() that identifies the 522 8833 Venu * blocked Tx ring. This cookie is used to get to the idl_tx_list that 523 8833 Venu * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t 524 11042 Erik * and goes through each conn in the drain list and calls conn_idl_remove 525 11042 Erik * for the conn to clear the qfull condition for the conn, as well as to 526 11042 Erik * remove the conn from the idl list. In addition, streams based sockets 527 11042 Erik * will have the conn_wq enabled, causing ip_wsrv to run for the 528 8833 Venu * conn. ip_wsrv drains the queued messages, and removes the conn from the 529 11042 Erik * drain list, if all messages were drained. It also notifies the 530 11042 Erik * conn_upcalls for the conn to signal that flow-control has opened up. 
531 0 stevel * 532 0 stevel * In reality the drain list is not a single list, but a configurable number 533 11042 Erik * of lists. conn_walk_drain() in the IP module notifies the conn_upcalls for 534 11042 Erik * each conn in the list. conn_drain_insert and conn_drain_tail are the only 535 8833 Venu * functions that manipulate this drain list. conn_drain_insert is called 536 11042 Erik * from the protocol layer when conn_ip_output returns EWOULDBLOCK 537 11042 Erik * (as opposed to from ip_wsrv context for STREAMS 538 8833 Venu * case -- see below). The synchronization between drain insertion and flow 539 8833 Venu * control wakeup is handled by using idl_txl->txl_lock. 540 8833 Venu * 541 8833 Venu * Flow control using STREAMS: 542 8833 Venu * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism 543 8833 Venu * is used. On the send side, if the packet cannot be sent down to the 544 11042 Erik * driver by IP, because of a canput failure, ip_xmit drops the packet 545 11042 Erik * and returns EWOULDBLOCK to the caller, who may then invoke 546 11042 Erik * ixa_check_drain_insert to insert the conn on the 0'th drain list. 547 11042 Erik * When ip_wsrv runs on the ill_wq because flow control has been relieved, the 548 11042 Erik * blocked conns in the 0'th drain list are drained as with the 549 11042 Erik * non-STREAMS case. 550 11042 Erik * 551 11042 Erik * In both the STREAMS and non-STREAMS case, the sockfs upcall to set 552 11042 Erik * qfull is done when the conn is inserted into the drain list 553 11042 Erik * (conn_drain_insert()) and cleared when the conn is removed from the drain 554 11042 Erik * list (conn_idl_remove()). 555 0 stevel * 556 0 stevel * IPQOS notes: 557 0 stevel * 558 0 stevel * IPQoS Policies are applied to packets using IPPF (IP Policy framework) 559 0 stevel * and IPQoS modules.
IPPF includes hooks in IP at different control points 560 0 stevel * (callout positions) which direct packets to IPQoS modules for policy 561 0 stevel * processing. Policies, if present, are global. 562 0 stevel * 563 0 stevel * The callout positions are located in the following paths: 564 0 stevel * o local_in (packets destined for this host) 565 0 stevel * o local_out (packets originating from this host) 566 0 stevel * o fwd_in (packets forwarded by this m/c - inbound) 567 0 stevel * o fwd_out (packets forwarded by this m/c - outbound) 568 0 stevel * Hooks at these callout points can be enabled/disabled using the ndd variable 569 0 stevel * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). 570 0 stevel * By default all the callout positions are enabled. 571 0 stevel * 572 0 stevel * Outbound (local_out) 573 11042 Erik * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6. 574 0 stevel * 575 0 stevel * Inbound (local_in) 576 11042 Erik * Hooks are placed in ip_fanout_v4 and ip_fanout_v6. 577 0 stevel * 578 0 stevel * Forwarding (in and out) 579 11042 Erik * Hooks are placed in ire_recv_forward_v4/v6. 580 0 stevel * 581 0 stevel * IP Policy Framework processing (IPPF processing) 582 0 stevel * Policy processing for a packet is initiated by ip_process, which ascertains 583 0 stevel * that the classifier (ipgpc) is loaded and configured, failing which the 584 0 stevel * packet resumes normal processing in IP. If the classifier is present, the 585 0 stevel * packet is acted upon by one or more IPQoS modules (action instances), per 586 0 stevel * filters configured in ipgpc and resumes normal IP processing thereafter. 587 0 stevel * An action instance can drop a packet in the course of its processing. 588 0 stevel * 589 0 stevel * Zones notes: 590 0 stevel * 591 0 stevel * The partitioning rules for networking are as follows: 592 0 stevel * 1) Packets coming from a zone must have a source address belonging to that 593 0 stevel * zone.
594 0 stevel * 2) Packets coming from a zone can only be sent on a physical interface on 595 0 stevel * which the zone has an IP address. 596 0 stevel * 3) Between two zones on the same machine, packet delivery is only allowed if 597 0 stevel * there's a matching route for the destination and zone in the forwarding 598 0 stevel * table. 599 0 stevel * 4) The TCP and UDP port spaces are per-zone; that is, two processes in 600 0 stevel * different zones can bind to the same port with the wildcard address 601 0 stevel * (INADDR_ANY). 602 0 stevel * 603 0 stevel * The granularity of interface partitioning is at the logical interface level. 604 0 stevel * Therefore, every zone has its own IP addresses, and incoming packets can be 605 0 stevel * attributed to a zone unambiguously. A logical interface is placed into a zone 606 0 stevel * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t 607 0 stevel * structure. Rule (1) is implemented by modifying the source address selection 608 0 stevel * algorithm so that the list of eligible addresses is filtered based on the 609 0 stevel * sending process zone. 610 0 stevel * 611 0 stevel * The Internet Routing Entries (IREs) are either exclusive to a zone or shared 612 0 stevel * across all zones, depending on their type. 
Here is the break-up:
 *
 * IRE type				Shared/exclusive
 * --------				----------------
 * IRE_BROADCAST			Exclusive
 * IRE_DEFAULT (default routes)		Shared (*)
 * IRE_LOCAL				Exclusive (x)
 * IRE_LOOPBACK				Exclusive
 * IRE_PREFIX (net routes)		Shared (*)
 * IRE_IF_NORESOLVER (interface routes)	Exclusive
 * IRE_IF_RESOLVER (interface routes)	Exclusive
 * IRE_IF_CLONE (interface routes)	Exclusive
 * IRE_HOST (host routes)		Shared (*)
 *
 * (*) A zone can only use a default or off-subnet route if the gateway is
 * directly reachable from the zone, that is, if the gateway's address matches
 * one of the zone's logical interfaces.
 *
 * (x) IRE_LOCAL are handled a bit differently.
 * When ip_restrict_interzone_loopback is set (the default),
 * ire_route_recursive restricts loopback using an IRE_LOCAL
 * between zones to the case when L2 would have conceptually looped the packet
 * back, i.e. the loopback which is required since neither Ethernet drivers
 * nor Ethernet hardware loops them back. This is the case when the normal
 * routes (ignoring IREs with different zoneids) would send out the packet on
 * the same ill as the ill with which the IRE_LOCAL is associated.
 *
 * Multiple zones can share a common broadcast address; typically all zones
 * share the 255.255.255.255 address. Incoming as well as locally originated
 * broadcast packets must be dispatched to all the zones on the broadcast
 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
 * since some zones may not be on the 10.16.72/24 network.
To handle this, each 644 0 stevel * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are 645 0 stevel * sent to every zone that has an IRE_BROADCAST entry for the destination 646 11042 Erik * address on the input ill, see ip_input_broadcast(). 647 0 stevel * 648 0 stevel * Applications in different zones can join the same multicast group address. 649 11042 Erik * The same logic applies for multicast as for broadcast. ip_input_multicast 650 11042 Erik * dispatches packets to all zones that have members on the physical interface. 651 0 stevel */ 652 0 stevel 653 0 stevel /* 654 0 stevel * Squeue Fanout flags: 655 0 stevel * 0: No fanout. 656 0 stevel * 1: Fanout across all squeues 657 0 stevel */ 658 0 stevel boolean_t ip_squeue_fanout = 0; 659 0 stevel 660 0 stevel /* 661 0 stevel * Maximum dups allowed per packet. 662 0 stevel */ 663 0 stevel uint_t ip_max_frag_dups = 10; 664 0 stevel 665 8348 Eric /* RFC 1122 Conformance */ 666 0 stevel #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER 667 0 stevel 668 0 stevel #define ILL_MAX_NAMELEN LIFNAMSIZ 669 0 stevel 670 5240 nordmark static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, 671 5240 nordmark cred_t *credp, boolean_t isv6); 672 11042 Erik static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *); 673 11042 Erik 674 11042 Erik static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *); 675 11042 Erik static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *); 676 11042 Erik static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *, 677 11042 Erik ip_recv_attr_t *); 678 0 stevel static void icmp_options_update(ipha_t *); 679 11042 Erik static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *); 680 11042 Erik static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *); 681 11042 Erik static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *); 682 11042 Erik static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *, 683 11042 Erik ip_recv_attr_t 
*); 684 11042 Erik static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *); 685 11042 Erik static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *, 686 11042 Erik ip_recv_attr_t *); 687 11042 Erik 688 0 stevel mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); 689 0 stevel char *ip_dot_addr(ipaddr_t, char *); 690 0 stevel mblk_t *ip_carve_mp(mblk_t **, ssize_t); 691 0 stevel int ip_close(queue_t *, int); 692 0 stevel static char *ip_dot_saddr(uchar_t *, char *); 693 0 stevel static void ip_lrput(queue_t *, mblk_t *); 694 0 stevel ipaddr_t ip_net_mask(ipaddr_t); 695 0 stevel char *ip_nv_lookup(nv_t *, int); 696 0 stevel static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); 697 0 stevel static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); 698 3448 dh155122 static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, 699 3448 dh155122 ipndp_t *, size_t); 700 0 stevel static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 701 0 stevel void ip_rput(queue_t *, mblk_t *); 702 0 stevel static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, 703 0 stevel void *dummy_arg); 704 5240 nordmark int ip_snmp_get(queue_t *, mblk_t *, int); 705 3284 apersson static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, 706 3448 dh155122 mib2_ipIfStatsEntry_t *, ip_stack_t *); 707 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *, 708 3448 dh155122 ip_stack_t *); 709 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *); 710 3448 dh155122 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst); 711 3448 dh155122 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst); 712 3448 dh155122 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst); 713 3448 dh155122 static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst); 714 3448 dh155122 static mblk_t 
*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *, 715 3448 dh155122 ip_stack_t *ipst); 716 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *, 717 3448 dh155122 ip_stack_t *ipst); 718 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *, 719 3448 dh155122 ip_stack_t *ipst); 720 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *, 721 3448 dh155122 ip_stack_t *ipst); 722 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *, 723 3448 dh155122 ip_stack_t *ipst); 724 3448 dh155122 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *, 725 3448 dh155122 ip_stack_t *ipst); 726 3448 dh155122 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, 727 3448 dh155122 ip_stack_t *ipst); 728 3448 dh155122 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, 729 3448 dh155122 ip_stack_t *ipst); 730 8485 Peter static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int, 731 3448 dh155122 ip_stack_t *ipst); 732 8485 Peter static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, 733 3448 dh155122 ip_stack_t *ipst); 734 1676 jpk static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); 735 1676 jpk static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); 736 11042 Erik static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *); 737 11042 Erik static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *); 738 0 stevel int ip_snmp_set(queue_t *, int, int, uchar_t *, int); 739 11042 Erik 740 11042 Erik static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *, 741 8778 Erik mblk_t *); 742 3448 dh155122 743 3448 dh155122 static void conn_drain_init(ip_stack_t *); 744 3448 dh155122 static void conn_drain_fini(ip_stack_t *); 745 0 stevel static void conn_drain_tail(conn_t *connp, boolean_t closing); 746 0 stevel 747 8833 Venu static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *); 748 11042 Erik static void 
conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *); 749 3448 dh155122 750 3448 dh155122 static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); 751 3448 dh155122 static void ip_stack_shutdown(netstackid_t stackid, void *arg); 752 3448 dh155122 static void ip_stack_fini(netstackid_t stackid, void *arg); 753 0 stevel 754 0 stevel static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); 755 0 stevel 756 0 stevel static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, 757 11042 Erik const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *), 758 11042 Erik ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, 759 11042 Erik const in6_addr_t *); 760 0 stevel 761 0 stevel static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); 762 0 stevel static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, 763 0 stevel caddr_t, cred_t *); 764 0 stevel static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, 765 0 stevel caddr_t cp, cred_t *cr); 766 1184 krgopi static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, 767 4041 nordmark cred_t *); 768 8275 Eric static int ip_squeue_switch(int); 769 0 stevel 770 3448 dh155122 static void *ip_kstat_init(netstackid_t, ip_stack_t *); 771 3448 dh155122 static void ip_kstat_fini(netstackid_t, kstat_t *); 772 0 stevel static int ip_kstat_update(kstat_t *kp, int rw); 773 3448 dh155122 static void *icmp_kstat_init(netstackid_t); 774 3448 dh155122 static void icmp_kstat_fini(netstackid_t, kstat_t *); 775 0 stevel static int icmp_kstat_update(kstat_t *kp, int rw); 776 3448 dh155122 static void *ip_kstat2_init(netstackid_t, ip_stat_t *); 777 3448 dh155122 static void ip_kstat2_fini(netstackid_t, kstat_t *); 778 0 stevel 779 11042 Erik static void ipobs_init(ip_stack_t *); 780 11042 Erik static void ipobs_fini(ip_stack_t *); 781 11042 Erik 782 0 stevel ipaddr_t ip_g_all_ones = IP_HOST_MASK; 783 0 stevel 784 0 stevel /* How long, in seconds, we 
allow frags to hang around. */ 785 9213 Girish #define IP_FRAG_TIMEOUT 15 786 9213 Girish #define IPV6_FRAG_TIMEOUT 60 787 741 masputra 788 0 stevel static long ip_rput_pullups; 789 0 stevel int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ 790 0 stevel 791 5815 gt145670 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */ 792 5815 gt145670 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ 793 0 stevel 794 3448 dh155122 int ip_debug; 795 0 stevel 796 0 stevel /* 797 0 stevel * Multirouting/CGTP stuff 798 0 stevel */ 799 0 stevel int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ 800 0 stevel 801 0 stevel /* 802 0 stevel * Named Dispatch Parameter Table. 803 0 stevel * All of these are alterable, within the min/max values given, at run time. 804 0 stevel */ 805 0 stevel static ipparam_t lcl_param_arr[] = { 806 0 stevel /* min max value name */ 807 0 stevel { 0, 1, 0, "ip_respond_to_address_mask_broadcast"}, 808 0 stevel { 0, 1, 1, "ip_respond_to_echo_broadcast"}, 809 0 stevel { 0, 1, 1, "ip_respond_to_echo_multicast"}, 810 0 stevel { 0, 1, 0, "ip_respond_to_timestamp"}, 811 0 stevel { 0, 1, 0, "ip_respond_to_timestamp_broadcast"}, 812 0 stevel { 0, 1, 1, "ip_send_redirects"}, 813 0 stevel { 0, 1, 0, "ip_forward_directed_broadcasts"}, 814 0 stevel { 0, 10, 0, "ip_mrtdebug"}, 815 11042 Erik { 1, 8, 3, "ip_ire_reclaim_fraction" }, 816 11042 Erik { 1, 8, 3, "ip_nce_reclaim_fraction" }, 817 11042 Erik { 1, 8, 3, "ip_dce_reclaim_fraction" }, 818 0 stevel { 1, 255, 255, "ip_def_ttl" }, 819 0 stevel { 0, 1, 0, "ip_forward_src_routed"}, 820 0 stevel { 0, 256, 32, "ip_wroff_extra" }, 821 11042 Erik { 2, 999999999, 60*20, "ip_pathmtu_interval" }, /* In seconds */ 822 0 stevel { 8, 65536, 64, "ip_icmp_return_data_bytes" }, 823 0 stevel { 0, 1, 1, "ip_path_mtu_discovery" }, 824 11042 Erik { 68, 65535, 576, "ip_pmtu_min" }, 825 0 stevel { 0, 1, 0, "ip_ignore_redirect" }, 826 11042 Erik { 0, 1, 0, 
"ip_arp_icmp_error" }, 827 0 stevel { 1, 254, 1, "ip_broadcast_ttl" }, 828 0 stevel { 0, 99999, 100, "ip_icmp_err_interval" }, 829 0 stevel { 1, 99999, 10, "ip_icmp_err_burst" }, 830 0 stevel { 0, 999999999, 1000000, "ip_reass_queue_bytes" }, 831 0 stevel { 0, 1, 0, "ip_strict_dst_multihoming" }, 832 0 stevel { 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"}, 833 0 stevel { 0, 1, 0, "ipsec_override_persocket_policy" }, 834 0 stevel { 0, 1, 1, "icmp_accept_clear_messages" }, 835 0 stevel { 0, 1, 1, "igmp_accept_clear_messages" }, 836 0 stevel { 2, 999999999, ND_DELAY_FIRST_PROBE_TIME, 837 0 stevel "ip_ndp_delay_first_probe_time"}, 838 0 stevel { 1, 999999999, ND_MAX_UNICAST_SOLICIT, 839 0 stevel "ip_ndp_max_unicast_solicit"}, 840 0 stevel { 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" }, 841 0 stevel { 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" }, 842 0 stevel { 0, 1, 0, "ip6_forward_src_routed"}, 843 0 stevel { 0, 1, 1, "ip6_respond_to_echo_multicast"}, 844 0 stevel { 0, 1, 1, "ip6_send_redirects"}, 845 0 stevel { 0, 1, 0, "ip6_ignore_redirect" }, 846 0 stevel { 0, 1, 0, "ip6_strict_dst_multihoming" }, 847 0 stevel 848 11042 Erik { 0, 2, 2, "ip_src_check" }, 849 0 stevel 850 0 stevel { 0, 999999, 1000, "ipsec_policy_log_interval" }, 851 0 stevel 852 0 stevel { 0, 1, 1, "pim_accept_clear_messages" }, 853 0 stevel { 1000, 20000, 2000, "ip_ndp_unsolicit_interval" }, 854 0 stevel { 1, 20, 3, "ip_ndp_unsolicit_count" }, 855 0 stevel { 0, 1, 1, "ip6_ignore_home_address_opt" }, 856 0 stevel { 0, 15, 0, "ip_policy_mask" }, 857 11042 Erik { 0, 2, 2, "ip_ecmp_behavior" }, 858 0 stevel { 0, 255, 1, "ip_multirt_ttl" }, 859 11042 Erik { 0, 3600, 60, "ip_ire_badcnt_lifetime" }, /* In seconds */ 860 2546 carlsonj { 0, 999999, 60*60*24, "ip_max_temp_idle" }, 861 2546 carlsonj { 0, 1000, 1, "ip_max_temp_defend" }, 862 11042 Erik /* 863 11042 Erik * when a conflict of an active address is detected, 864 11042 Erik * defend up to ip_max_defend times, within any 865 11042 Erik * 
ip_defend_interval span. 866 11042 Erik */ 867 2546 carlsonj { 0, 1000, 3, "ip_max_defend" }, 868 2546 carlsonj { 0, 999999, 30, "ip_defend_interval" }, 869 2546 carlsonj { 0, 3600000, 300000, "ip_dup_recovery" }, 870 2733 nordmark { 0, 1, 1, "ip_restrict_interzone_loopback" }, 871 3115 yl150051 { 0, 1, 1, "ip_lso_outbound" }, 872 4783 udpa { IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" }, 873 4783 udpa { MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" }, 874 0 stevel #ifdef DEBUG 875 0 stevel { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, 876 3448 dh155122 #else 877 3448 dh155122 { 0, 0, 0, "" }, 878 0 stevel #endif 879 11042 Erik /* delay before sending first probe: */ 880 11042 Erik { 0, 20000, 1000, "arp_probe_delay" }, 881 11042 Erik { 0, 20000, 100, "arp_fastprobe_delay" }, 882 11042 Erik /* interval at which DAD probes are sent: */ 883 11042 Erik { 10, 20000, 1500, "arp_probe_interval" }, 884 11042 Erik { 10, 20000, 150, "arp_fastprobe_interval" }, 885 11042 Erik /* setting probe count to 0 will disable ARP probing for DAD. */ 886 11042 Erik { 0, 20, 3, "arp_probe_count" }, 887 11042 Erik { 0, 20, 3, "arp_fastprobe_count" }, 888 11042 Erik 889 11042 Erik { 0, 3600000, 15000, "ipv4_dad_announce_interval"}, 890 11042 Erik { 0, 3600000, 15000, "ipv6_dad_announce_interval"}, 891 11042 Erik /* 892 11042 Erik * Rate limiting parameters for DAD defense used in 893 11042 Erik * ill_defend_rate_limit(): 894 11042 Erik * defend_rate : pkts/hour permitted 895 11042 Erik * defend_interval : time that can elapse before we send out a 896 11042 Erik * DAD defense. 897 11042 Erik * defend_period: denominator for defend_rate (in seconds). 
898 11042 Erik */ 899 11042 Erik { 0, 3600000, 300000, "arp_defend_interval"}, 900 11042 Erik { 0, 20000, 100, "arp_defend_rate"}, 901 11042 Erik { 0, 3600000, 300000, "ndp_defend_interval"}, 902 11042 Erik { 0, 20000, 100, "ndp_defend_rate"}, 903 11042 Erik { 5, 86400, 3600, "arp_defend_period"}, 904 11042 Erik { 5, 86400, 3600, "ndp_defend_period"}, 905 11042 Erik { 0, 1, 1, "ipv4_icmp_return_pmtu" }, 906 11042 Erik { 0, 1, 1, "ipv6_icmp_return_pmtu" }, 907 11042 Erik /* 908 11042 Erik * publish count/interval values used to announce local addresses 909 11042 Erik * for IPv4, IPv6. 910 11042 Erik */ 911 11042 Erik { 1, 20, 5, "ip_arp_publish_count" }, 912 11042 Erik { 1000, 20000, 2000, "ip_arp_publish_interval" }, 913 0 stevel }; 914 0 stevel 915 3448 dh155122 /* 916 3448 dh155122 * Extended NDP table 917 3448 dh155122 * The addresses for the first two are filled in to be ips_ip_g_forward 918 3448 dh155122 * and ips_ipv6_forward at init time. 919 3448 dh155122 */ 920 0 stevel static ipndp_t lcl_ndp_arr[] = { 921 0 stevel /* getf setf data name */ 922 3448 dh155122 #define IPNDP_IP_FORWARDING_OFFSET 0 923 3448 dh155122 { ip_param_generic_get, ip_forward_set, NULL, 924 0 stevel "ip_forwarding" }, 925 3448 dh155122 #define IPNDP_IP6_FORWARDING_OFFSET 1 926 3448 dh155122 { ip_param_generic_get, ip_forward_set, NULL, 927 0 stevel "ip6_forwarding" }, 928 0 stevel { ip_param_generic_get, ip_input_proc_set, 929 0 stevel (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, 930 1184 krgopi { ip_param_generic_get, ip_int_set, 931 0 stevel (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, 932 9089 Vasumathi #define IPNDP_CGTP_FILTER_OFFSET 4 933 3448 dh155122 { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, 934 1184 krgopi "ip_cgtp_filter" }, 935 5401 nordmark { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, 936 5401 nordmark "ip_debug" }, 937 0 stevel }; 938 0 stevel 939 0 stevel /* 940 0 stevel * Table of IP ioctls encoding the various properties of the ioctl and 941 0 
stevel * indexed based on the last byte of the ioctl command. Occasionally there 942 0 stevel * is a clash, and there is more than 1 ioctl with the same last byte. 943 0 stevel * In such a case 1 ioctl is encoded in the ndx table and the remaining 944 0 stevel * ioctls are encoded in the misc table. An entry in the ndx table is 945 0 stevel * retrieved by indexing on the last byte of the ioctl command and comparing 946 0 stevel * the ioctl command with the value in the ndx table. In the event of a 947 0 stevel * mismatch the misc table is then searched sequentially for the desired 948 0 stevel * ioctl command. 949 0 stevel * 950 0 stevel * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func> 951 0 stevel */ 952 0 stevel ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { 953 0 stevel /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 954 0 stevel /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 955 0 stevel /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 956 0 stevel /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 957 0 stevel /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 958 0 stevel /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 959 0 stevel /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 960 0 stevel /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 961 0 stevel /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 962 0 stevel /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 963 0 stevel 964 0 stevel /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV, 965 0 stevel MISC_CMD, ip_siocaddrt, NULL }, 966 0 stevel /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV, 967 0 stevel MISC_CMD, ip_siocdelrt, NULL }, 968 0 stevel 969 0 stevel /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 970 0 stevel IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 971 8485 Peter /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD, 972 0 stevel IF_CMD, ip_sioctl_get_addr, NULL }, 973 0 stevel 974 0 stevel /* 014 */ { SIOCSIFDSTADDR, sizeof 
(struct ifreq), IPI_PRIV | IPI_WR, 975 0 stevel IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 976 0 stevel /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), 977 8485 Peter IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL }, 978 0 stevel 979 0 stevel /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), 980 8485 Peter IPI_PRIV | IPI_WR, 981 0 stevel IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 982 0 stevel /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), 983 8485 Peter IPI_MODOK | IPI_GET_CMD, 984 0 stevel IF_CMD, ip_sioctl_get_flags, NULL }, 985 0 stevel 986 0 stevel /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 987 0 stevel /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 988 0 stevel 989 0 stevel /* copyin size cannot be coded for SIOCGIFCONF */ 990 4972 meem /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD, 991 0 stevel MISC_CMD, ip_sioctl_get_ifconf, NULL }, 992 0 stevel 993 0 stevel /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 994 0 stevel IF_CMD, ip_sioctl_mtu, NULL }, 995 8485 Peter /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD, 996 0 stevel IF_CMD, ip_sioctl_get_mtu, NULL }, 997 0 stevel /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), 998 8485 Peter IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL }, 999 0 stevel /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1000 0 stevel IF_CMD, ip_sioctl_brdaddr, NULL }, 1001 0 stevel /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), 1002 8485 Peter IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL }, 1003 0 stevel /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, 1004 0 stevel IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1005 0 stevel /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), 1006 8485 Peter IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL }, 1007 0 stevel /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, 1008 0 stevel IF_CMD, ip_sioctl_metric, NULL }, 1009 0 stevel /* 029 */ { IPI_DONTCARE, 0, 0, 0, 
NULL, NULL }, 1010 0 stevel 1011 0 stevel /* See 166-168 below for extended SIOC*XARP ioctls */ 1012 8485 Peter /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, 1013 4972 meem ARP_CMD, ip_sioctl_arp, NULL }, 1014 8485 Peter /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD, 1015 4972 meem ARP_CMD, ip_sioctl_arp, NULL }, 1016 8485 Peter /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, 1017 4972 meem ARP_CMD, ip_sioctl_arp, NULL }, 1018 0 stevel 1019 0 stevel /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1020 0 stevel /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1021 0 stevel /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1022 0 stevel /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1023 0 stevel /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1024 0 stevel /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1025 0 stevel /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1026 0 stevel /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1027 0 stevel /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1028 0 stevel /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1029 0 stevel /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1030 0 stevel /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1031 0 stevel /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1032 0 stevel /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1033 0 stevel /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1034 0 stevel /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1035 0 stevel /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1036 0 stevel /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1037 0 stevel /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1038 0 stevel /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1039 0 stevel /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1040 0 stevel 1041 0 stevel /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK, 1042 0 stevel MISC_CMD, if_unitsel, if_unitsel_restart }, 1043 0 stevel 1044 0 stevel /* 
055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1045 0 stevel /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1046 0 stevel /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1047 0 stevel /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1048 0 stevel /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1049 0 stevel /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1050 0 stevel /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1051 0 stevel /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1052 0 stevel /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1053 0 stevel /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1054 0 stevel /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1055 0 stevel /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1056 0 stevel /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1057 0 stevel /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1058 0 stevel /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1059 0 stevel /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1060 0 stevel /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1061 0 stevel /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1062 0 stevel 1063 0 stevel /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq), 1064 0 stevel IPI_PRIV | IPI_WR | IPI_MODOK, 1065 0 stevel IF_CMD, ip_sioctl_sifname, NULL }, 1066 0 stevel 1067 0 stevel /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1068 0 stevel /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1069 0 stevel /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1070 0 stevel /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1071 0 stevel /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1072 0 stevel /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1073 0 stevel /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1074 0 stevel /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1075 0 stevel /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1076 0 stevel /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1077 0 stevel /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1078 
0 stevel /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1079 0 stevel /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1080 0 stevel 1081 8485 Peter /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD, 1082 0 stevel MISC_CMD, ip_sioctl_get_ifnum, NULL }, 1083 8485 Peter /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD, 1084 0 stevel IF_CMD, ip_sioctl_get_muxid, NULL }, 1085 0 stevel /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), 1086 8485 Peter IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL }, 1087 0 stevel 1088 0 stevel /* Both if and lif variants share same func */ 1089 8485 Peter /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD, 1090 0 stevel IF_CMD, ip_sioctl_get_lifindex, NULL }, 1091 0 stevel /* Both if and lif variants share same func */ 1092 0 stevel /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), 1093 8485 Peter IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL }, 1094 0 stevel 1095 0 stevel /* copyin size cannot be coded for SIOCGIFCONF */ 1096 4972 meem /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, 1097 0 stevel MISC_CMD, ip_sioctl_get_ifconf, NULL }, 1098 0 stevel /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1099 0 stevel /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1100 0 stevel /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1101 0 stevel /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1102 0 stevel /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1103 0 stevel /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1104 0 stevel /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1105 0 stevel /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1106 0 stevel /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1107 0 stevel /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1108 0 stevel /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1109 0 stevel /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1110 0 stevel /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1111 0 stevel /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1112 0 
stevel /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1113 0 stevel /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1114 0 stevel /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1115 0 stevel 1116 0 stevel /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), 1117 8485 Peter IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif, 1118 0 stevel ip_sioctl_removeif_restart }, 1119 0 stevel /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), 1120 8485 Peter IPI_GET_CMD | IPI_PRIV | IPI_WR, 1121 0 stevel LIF_CMD, ip_sioctl_addif, NULL }, 1122 0 stevel #define SIOCLIFADDR_NDX 112 1123 0 stevel /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1124 0 stevel LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, 1125 0 stevel /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), 1126 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL }, 1127 0 stevel /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1128 0 stevel LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, 1129 0 stevel /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), 1130 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL }, 1131 0 stevel /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), 1132 8485 Peter IPI_PRIV | IPI_WR, 1133 0 stevel LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, 1134 0 stevel /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), 1135 8485 Peter IPI_GET_CMD | IPI_MODOK, 1136 0 stevel LIF_CMD, ip_sioctl_get_flags, NULL }, 1137 0 stevel 1138 0 stevel /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1139 0 stevel /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1140 0 stevel 1141 4972 meem /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1142 0 stevel ip_sioctl_get_lifconf, NULL }, 1143 0 stevel /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1144 0 stevel LIF_CMD, ip_sioctl_mtu, NULL }, 1145 8485 Peter /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD, 1146 0 stevel LIF_CMD, ip_sioctl_get_mtu, NULL }, 1147 
0 stevel /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), 1148 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL }, 1149 0 stevel /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1150 0 stevel LIF_CMD, ip_sioctl_brdaddr, NULL }, 1151 0 stevel /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), 1152 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL }, 1153 0 stevel /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1154 0 stevel LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, 1155 0 stevel /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), 1156 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL }, 1157 0 stevel /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1158 0 stevel LIF_CMD, ip_sioctl_metric, NULL }, 1159 0 stevel /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), 1160 8485 Peter IPI_PRIV | IPI_WR | IPI_MODOK, 1161 0 stevel LIF_CMD, ip_sioctl_slifname, 1162 0 stevel ip_sioctl_slifname_restart }, 1163 0 stevel 1164 8485 Peter /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD, 1165 0 stevel MISC_CMD, ip_sioctl_get_lifnum, NULL }, 1166 0 stevel /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), 1167 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL }, 1168 0 stevel /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), 1169 8485 Peter IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL }, 1170 0 stevel /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), 1171 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 }, 1172 0 stevel /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), 1173 8485 Peter IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 }, 1174 0 stevel /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1175 0 stevel LIF_CMD, ip_sioctl_token, NULL }, 1176 0 stevel /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), 1177 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL }, 1178 0 stevel /* 137 */ { 
SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1179 0 stevel LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, 1180 0 stevel /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), 1181 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL }, 1182 0 stevel /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, 1183 0 stevel LIF_CMD, ip_sioctl_lnkinfo, NULL }, 1184 0 stevel 1185 0 stevel /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), 1186 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, 1187 0 stevel /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, 1188 0 stevel LIF_CMD, ip_siocdelndp_v6, NULL }, 1189 0 stevel /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, 1190 0 stevel LIF_CMD, ip_siocqueryndp_v6, NULL }, 1191 0 stevel /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV, 1192 0 stevel LIF_CMD, ip_siocsetndp_v6, NULL }, 1193 0 stevel /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1194 0 stevel MISC_CMD, ip_sioctl_tmyaddr, NULL }, 1195 0 stevel /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD, 1196 0 stevel MISC_CMD, ip_sioctl_tonlink, NULL }, 1197 0 stevel /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, 1198 0 stevel MISC_CMD, ip_sioctl_tmysite, NULL }, 1199 10616 Sebastien /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1200 10616 Sebastien /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1201 0 stevel /* IPSECioctls handled in ip_sioctl_copyin_setup itself */ 1202 0 stevel /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1203 0 stevel /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1204 0 stevel /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1205 0 stevel /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, 1206 0 stevel 1207 8485 Peter /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1208 8485 Peter 1209 8700 Peter /* 154 */ { SIOCGLIFBINDING, sizeof (struct 
lifreq), IPI_GET_CMD, 1210 8700 Peter LIF_CMD, ip_sioctl_get_binding, NULL }, 1211 0 stevel /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), 1212 8485 Peter IPI_PRIV | IPI_WR, 1213 0 stevel LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, 1214 0 stevel /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), 1215 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, 1216 8485 Peter /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), 1217 8485 Peter IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, 1218 0 stevel 1219 0 stevel /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ 1220 0 stevel /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1221 0 stevel /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1222 0 stevel /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1223 0 stevel 1224 8485 Peter /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1225 0 stevel 1226 0 stevel /* These are handled in ip_sioctl_copyin_setup itself */ 1227 0 stevel /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, 1228 0 stevel MISC_CMD, NULL, NULL }, 1229 0 stevel /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, 1230 0 stevel MISC_CMD, NULL, NULL }, 1231 0 stevel /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, 1232 0 stevel 1233 4972 meem /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, 1234 0 stevel ip_sioctl_get_lifconf, NULL }, 1235 0 stevel 1236 8485 Peter /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, 1237 4972 meem XARP_CMD, ip_sioctl_arp, NULL }, 1238 8485 Peter /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, 1239 4972 meem XARP_CMD, ip_sioctl_arp, NULL }, 1240 8485 Peter /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, 1241 4972 meem XARP_CMD, ip_sioctl_arp, NULL }, 1242 0 stevel 1243 0 stevel /* SIOCPOPSOCKFS is not handled by IP */ 1244 0 stevel /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, 1245 0 stevel 1246 0 stevel /* 170 */ { SIOCGLIFZONE, 
sizeof (struct lifreq), 1247 8485 Peter IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, 1248 0 stevel /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), 1249 8485 Peter IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, 1250 0 stevel ip_sioctl_slifzone_restart }, 1251 0 stevel /* 172-174 are SCTP ioctls and not handled by IP */ 1252 0 stevel /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1253 0 stevel /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1254 0 stevel /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1255 0 stevel /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq), 1256 0 stevel IPI_GET_CMD, LIF_CMD, 1257 0 stevel ip_sioctl_get_lifusesrc, 0 }, 1258 0 stevel /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), 1259 0 stevel IPI_PRIV | IPI_WR, 1260 0 stevel LIF_CMD, ip_sioctl_slifusesrc, 1261 0 stevel NULL }, 1262 0 stevel /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, 1263 0 stevel ip_sioctl_get_lifsrcof, NULL }, 1264 0 stevel /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, 1265 4972 meem MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1266 11042 Erik /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0, 1267 4972 meem MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1268 0 stevel /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, 1269 4972 meem MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1270 11042 Erik /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0, 1271 4972 meem MSFILT_CMD, ip_sioctl_msfilter, NULL }, 1272 8485 Peter /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, 1273 5381 meem /* SIOCSENABLESDP is handled by SDP */ 1274 5381 meem /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, 1275 8348 Eric /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, 1276 10946 Sangeeta /* 185 */ { IPI_DONTCARE /* SIOCGIFHWADDR */, 0, 0, 0, NULL, NULL }, 1277 10946 Sangeeta /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL }, 1278 10946 Sangeeta /* 187 */ { SIOCILB, 0, IPI_PRIV 
| IPI_GET_CMD, MISC_CMD, 1279 10946 Sangeeta ip_sioctl_ilb_cmd, NULL }, 1280 0 stevel }; 1281 0 stevel 1282 0 stevel int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1283 0 stevel 1284 0 stevel ip_ioctl_cmd_t ip_misc_ioctl_table[] = { 1285 11042 Erik { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, 1286 11042 Erik { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, 1287 11042 Erik { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, 1288 11042 Erik { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, 1289 11042 Erik { ND_GET, 0, 0, 0, NULL, NULL }, 1290 11042 Erik { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, 1291 0 stevel { IP_IOCTL, 0, 0, 0, NULL, NULL }, 1292 8485 Peter { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, 1293 0 stevel MISC_CMD, mrt_ioctl}, 1294 8485 Peter { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD, 1295 0 stevel MISC_CMD, mrt_ioctl}, 1296 8485 Peter { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD, 1297 0 stevel MISC_CMD, mrt_ioctl} 1298 0 stevel }; 1299 0 stevel 1300 0 stevel int ip_misc_ioctl_count = 1301 0 stevel sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t); 1302 0 stevel 1303 0 stevel int conn_drain_nthreads; /* Number of drainers reqd. 
*/ 1304 0 stevel /* Settable in /etc/system */ 1305 0 stevel /* Defined in ip_ire.c */ 1306 0 stevel extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt; 1307 0 stevel extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt; 1308 0 stevel extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio; 1309 0 stevel 1310 0 stevel static nv_t ire_nv_arr[] = { 1311 0 stevel { IRE_BROADCAST, "BROADCAST" }, 1312 0 stevel { IRE_LOCAL, "LOCAL" }, 1313 0 stevel { IRE_LOOPBACK, "LOOPBACK" }, 1314 0 stevel { IRE_DEFAULT, "DEFAULT" }, 1315 0 stevel { IRE_PREFIX, "PREFIX" }, 1316 0 stevel { IRE_IF_NORESOLVER, "IF_NORESOL" }, 1317 0 stevel { IRE_IF_RESOLVER, "IF_RESOLV" }, 1318 11042 Erik { IRE_IF_CLONE, "IF_CLONE" }, 1319 0 stevel { IRE_HOST, "HOST" }, 1320 11042 Erik { IRE_MULTICAST, "MULTICAST" }, 1321 11042 Erik { IRE_NOROUTE, "NOROUTE" }, 1322 0 stevel { 0 } 1323 0 stevel }; 1324 0 stevel 1325 0 stevel nv_t *ire_nv_tbl = ire_nv_arr; 1326 0 stevel 1327 0 stevel /* Simple ICMP IP Header Template */ 1328 0 stevel static ipha_t icmp_ipha = { 1329 0 stevel IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 1330 0 stevel }; 1331 0 stevel 1332 0 stevel struct module_info ip_mod_info = { 1333 8348 Eric IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT, 1334 8348 Eric IP_MOD_LOWAT 1335 0 stevel }; 1336 0 stevel 1337 2546 carlsonj /* 1338 2546 carlsonj * Duplicate static symbols within a module confuses mdb; so we avoid the 1339 2546 carlsonj * problem by making the symbols here distinct from those in udp.c. 1340 2546 carlsonj */ 1341 2546 carlsonj 1342 5240 nordmark /* 1343 5240 nordmark * Entry points for IP as a device and as a module. 1344 5240 nordmark * We have separate open functions for the /dev/ip and /dev/ip6 devices. 
1345 5240 nordmark */ 1346 5240 nordmark static struct qinit iprinitv4 = { 1347 5240 nordmark (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL, 1348 0 stevel &ip_mod_info 1349 0 stevel }; 1350 0 stevel 1351 5240 nordmark struct qinit iprinitv6 = { 1352 5240 nordmark (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL, 1353 0 stevel &ip_mod_info 1354 0 stevel }; 1355 0 stevel 1356 11042 Erik static struct qinit ipwinit = { 1357 11042 Erik (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL, 1358 0 stevel &ip_mod_info 1359 0 stevel }; 1360 0 stevel 1361 5240 nordmark static struct qinit iplrinit = { 1362 5240 nordmark (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL, 1363 5240 nordmark &ip_mod_info 1364 5240 nordmark }; 1365 5240 nordmark 1366 5240 nordmark static struct qinit iplwinit = { 1367 5240 nordmark (pfi_t)ip_lwput, NULL, NULL, NULL, NULL, 1368 5240 nordmark &ip_mod_info 1369 5240 nordmark }; 1370 5240 nordmark 1371 5240 nordmark /* For AF_INET aka /dev/ip */ 1372 5240 nordmark struct streamtab ipinfov4 = { 1373 11042 Erik &iprinitv4, &ipwinit, &iplrinit, &iplwinit 1374 5240 nordmark }; 1375 5240 nordmark 1376 5240 nordmark /* For AF_INET6 aka /dev/ip6 */ 1377 5240 nordmark struct streamtab ipinfov6 = { 1378 11042 Erik &iprinitv6, &ipwinit, &iplrinit, &iplwinit 1379 0 stevel }; 1380 0 stevel 1381 0 stevel #ifdef DEBUG 1382 11042 Erik boolean_t skip_sctp_cksum = B_FALSE; 1383 0 stevel #endif 1384 2733 nordmark 1385 2733 nordmark /* 1386 11042 Erik * Generate an ICMP fragmentation needed message. 1387 11042 Erik * When called from ip_output side a minimal ip_recv_attr_t needs to be 1388 11042 Erik * constructed by the caller. 
1389 11042 Erik */ 1390 11042 Erik void 1391 11042 Erik icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira) 1392 0 stevel { 1393 0 stevel icmph_t icmph; 1394 11042 Erik ip_stack_t *ipst = ira->ira_ill->ill_ipst; 1395 11042 Erik 1396 11042 Erik mp = icmp_pkt_err_ok(mp, ira); 1397 11042 Erik if (mp == NULL) 1398 11042 Erik return; 1399 0 stevel 1400 0 stevel bzero(&icmph, sizeof (icmph_t)); 1401 0 stevel icmph.icmph_type = ICMP_DEST_UNREACHABLE; 1402 0 stevel icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED; 1403 0 stevel icmph.icmph_du_mtu = htons((uint16_t)mtu); 1404 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded); 1405 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); 1406 11042 Erik 1407 11042 Erik icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); 1408 11042 Erik } 1409 11042 Erik 1410 11042 Erik /* 1411 11042 Erik * icmp_inbound_v4 deals with ICMP messages that are handled by IP. 1412 11042 Erik * If the ICMP message is consumed by IP, i.e., it should not be delivered 1413 11042 Erik * to any IPPROTO_ICMP raw sockets, then it returns NULL. 1414 11042 Erik * Likewise, if the ICMP error is misformed (too short, etc), then it 1415 11042 Erik * returns NULL. The caller uses this to determine whether or not to send 1416 11042 Erik * to raw sockets. 1417 11042 Erik * 1418 11042 Erik * All error messages are passed to the matching transport stream. 1419 11042 Erik * 1420 11042 Erik * The following cases are handled by icmp_inbound: 1421 0 stevel * 1) It needs to send a reply back and possibly delivering it 1422 0 stevel * to the "interested" upper clients. 1423 11042 Erik * 2) Return the mblk so that the caller can pass it to the RAW socket clients. 1424 0 stevel * 3) It needs to change some values in IP only. 1425 11042 Erik * 4) It needs to change some values in IP and upper layers e.g TCP 1426 11042 Erik * by delivering an error to the upper layers. 
 *
 * We handle the above four cases in the context of IPsec in the
 * following way:
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent a chosen
 *    plaintext attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    the upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_messages_in_clear is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *	There are two cases.
 *
 *	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though they
 *	   are *willing* to accept in *clear*. This is fine as our global
 *	   disposition to icmp messages asks us to reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_messages_in_clear is 1), and are not
 *	   able to deliver it to the ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DEST_UNREACHABLE messages with the following code
 *	   values:
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and the upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels. Currently, when we are unable to deliver
 *	     to the upper layer (due to policy failures) while IP has
 *	     adjusted dce_pmtu, the next outbound datagram would
 *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *	     will be with the right level of protection. Thus the right
 *	     value will be communicated even if we are not able to
 *	     communicate it when we first receive it from the wire. But
 *	     this assumes there would be at least one outbound datagram
 *	     after IP has adjusted its dce_pmtu value. To make things
 *	     simpler, we accept in clear after the validation of
 *	     AH/ESP headers.
 *
 *	   - Other ICMP ERRORS : We may not be able to deliver it to the
 *	     upper layer depending on the level of protection the upper
 *	     layer expects and the disposition in ipsec_inbound_accept_clear().
 *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
 *	     should be accepted in clear when the upper layer expects secure.
 *	     Thus the communication may get aborted by some bad ICMP
 *	     packets.
1487 11042 Erik */ 1488 11042 Erik mblk_t * 1489 11042 Erik icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira) 1490 11042 Erik { 1491 11042 Erik icmph_t *icmph; 1492 11042 Erik ipha_t *ipha; /* Outer header */ 1493 11042 Erik int ip_hdr_length; /* Outer header length */ 1494 0 stevel boolean_t interested; 1495 11042 Erik ipif_t *ipif; 1496 0 stevel uint32_t ts; 1497 11042 Erik uint32_t *tsp; 1498 11042 Erik timestruc_t now; 1499 11042 Erik ill_t *ill = ira->ira_ill; 1500 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 1501 11042 Erik zoneid_t zoneid = ira->ira_zoneid; 1502 11042 Erik int len_needed; 1503 11042 Erik mblk_t *mp_ret = NULL; 1504 0 stevel 1505 0 stevel ipha = (ipha_t *)mp->b_rptr; 1506 11042 Erik 1507 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); 1508 11042 Erik 1509 11042 Erik ip_hdr_length = ira->ira_ip_hdr_length; 1510 11042 Erik if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) { 1511 11042 Erik if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) { 1512 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); 1513 11042 Erik ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); 1514 11042 Erik freemsg(mp); 1515 11042 Erik return (NULL); 1516 11042 Erik } 1517 11042 Erik /* Last chance to get real. */ 1518 11042 Erik ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira); 1519 11042 Erik if (ipha == NULL) { 1520 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 1521 11042 Erik freemsg(mp); 1522 11042 Erik return (NULL); 1523 11042 Erik } 1524 11042 Erik } 1525 11042 Erik 1526 0 stevel /* The IP header will always be a multiple of four bytes */ 1527 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1528 11042 Erik ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type, 1529 0 stevel icmph->icmph_code)); 1530 11042 Erik 1531 11042 Erik /* 1532 11042 Erik * We will set "interested" to "true" if we should pass a copy to 1533 11042 Erik * the transport or if we handle the packet locally. 
1534 11042 Erik */ 1535 0 stevel interested = B_FALSE; 1536 0 stevel switch (icmph->icmph_type) { 1537 0 stevel case ICMP_ECHO_REPLY: 1538 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps); 1539 0 stevel break; 1540 0 stevel case ICMP_DEST_UNREACHABLE: 1541 0 stevel if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) 1542 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded); 1543 0 stevel interested = B_TRUE; /* Pass up to transport */ 1544 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs); 1545 0 stevel break; 1546 0 stevel case ICMP_SOURCE_QUENCH: 1547 0 stevel interested = B_TRUE; /* Pass up to transport */ 1548 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs); 1549 0 stevel break; 1550 0 stevel case ICMP_REDIRECT: 1551 3448 dh155122 if (!ipst->ips_ip_ignore_redirect) 1552 0 stevel interested = B_TRUE; 1553 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects); 1554 0 stevel break; 1555 0 stevel case ICMP_ECHO_REQUEST: 1556 0 stevel /* 1557 0 stevel * Whether to respond to echo requests that come in as IP 1558 0 stevel * broadcasts or as IP multicast is subject to debate 1559 0 stevel * (what isn't?). We aim to please, you pick it. 1560 0 stevel * Default is do it. 
1561 0 stevel */ 1562 11042 Erik if (ira->ira_flags & IRAF_MULTICAST) { 1563 11042 Erik /* multicast: respond based on tunable */ 1564 11042 Erik interested = ipst->ips_ip_g_resp_to_echo_mcast; 1565 11042 Erik } else if (ira->ira_flags & IRAF_BROADCAST) { 1566 11042 Erik /* broadcast: respond based on tunable */ 1567 11042 Erik interested = ipst->ips_ip_g_resp_to_echo_bcast; 1568 11042 Erik } else { 1569 0 stevel /* unicast: always respond */ 1570 0 stevel interested = B_TRUE; 1571 3448 dh155122 } 1572 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); 1573 11042 Erik if (!interested) { 1574 11042 Erik /* We never pass these to RAW sockets */ 1575 11042 Erik freemsg(mp); 1576 11042 Erik return (NULL); 1577 11042 Erik } 1578 11042 Erik 1579 11042 Erik /* Check db_ref to make sure we can modify the packet. */ 1580 11042 Erik if (mp->b_datap->db_ref > 1) { 1581 11042 Erik mblk_t *mp1; 1582 11042 Erik 1583 11042 Erik mp1 = copymsg(mp); 1584 11042 Erik freemsg(mp); 1585 11042 Erik if (!mp1) { 1586 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 1587 11042 Erik return (NULL); 1588 11042 Erik } 1589 11042 Erik mp = mp1; 1590 11042 Erik ipha = (ipha_t *)mp->b_rptr; 1591 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1592 11042 Erik } 1593 11042 Erik icmph->icmph_type = ICMP_ECHO_REPLY; 1594 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); 1595 11042 Erik icmp_send_reply_v4(mp, ipha, icmph, ira); 1596 11042 Erik return (NULL); 1597 11042 Erik 1598 0 stevel case ICMP_ROUTER_ADVERTISEMENT: 1599 0 stevel case ICMP_ROUTER_SOLICITATION: 1600 0 stevel break; 1601 0 stevel case ICMP_TIME_EXCEEDED: 1602 0 stevel interested = B_TRUE; /* Pass up to transport */ 1603 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds); 1604 0 stevel break; 1605 0 stevel case ICMP_PARAM_PROBLEM: 1606 0 stevel interested = B_TRUE; /* Pass up to transport */ 1607 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs); 1608 0 stevel break; 1609 0 stevel 
case ICMP_TIME_STAMP_REQUEST: 1610 0 stevel /* Response to Time Stamp Requests is local policy. */ 1611 11042 Erik if (ipst->ips_ip_g_resp_to_timestamp) { 1612 11042 Erik if (ira->ira_flags & IRAF_MULTIBROADCAST) 1613 11042 Erik interested = 1614 11042 Erik ipst->ips_ip_g_resp_to_timestamp_bcast; 1615 11042 Erik else 1616 11042 Erik interested = B_TRUE; 1617 11042 Erik } 1618 11042 Erik if (!interested) { 1619 11042 Erik /* We never pass these to RAW sockets */ 1620 11042 Erik freemsg(mp); 1621 11042 Erik return (NULL); 1622 11042 Erik } 1623 11042 Erik 1624 11042 Erik /* Make sure we have enough of the packet */ 1625 11042 Erik len_needed = ip_hdr_length + ICMPH_SIZE + 1626 11042 Erik 3 * sizeof (uint32_t); 1627 11042 Erik 1628 11042 Erik if (mp->b_wptr - mp->b_rptr < len_needed) { 1629 11042 Erik ipha = ip_pullup(mp, len_needed, ira); 1630 11042 Erik if (ipha == NULL) { 1631 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1632 11042 Erik ip_drop_input("ipIfStatsInDiscards - ip_pullup", 1633 11042 Erik mp, ill); 1634 11042 Erik freemsg(mp); 1635 11042 Erik return (NULL); 1636 11042 Erik } 1637 11042 Erik /* Refresh following the pullup. */ 1638 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1639 0 stevel } 1640 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); 1641 11042 Erik /* Check db_ref to make sure we can modify the packet. 
*/ 1642 11042 Erik if (mp->b_datap->db_ref > 1) { 1643 11042 Erik mblk_t *mp1; 1644 11042 Erik 1645 11042 Erik mp1 = copymsg(mp); 1646 11042 Erik freemsg(mp); 1647 11042 Erik if (!mp1) { 1648 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 1649 11042 Erik return (NULL); 1650 11042 Erik } 1651 11042 Erik mp = mp1; 1652 11042 Erik ipha = (ipha_t *)mp->b_rptr; 1653 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1654 11042 Erik } 1655 0 stevel icmph->icmph_type = ICMP_TIME_STAMP_REPLY; 1656 11042 Erik tsp = (uint32_t *)&icmph[1]; 1657 0 stevel tsp++; /* Skip past 'originate time' */ 1658 0 stevel /* Compute # of milliseconds since midnight */ 1659 0 stevel gethrestime(&now); 1660 0 stevel ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + 1661 0 stevel now.tv_nsec / (NANOSEC / MILLISEC); 1662 0 stevel *tsp++ = htonl(ts); /* Lay in 'receive time' */ 1663 0 stevel *tsp++ = htonl(ts); /* Lay in 'send time' */ 1664 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); 1665 11042 Erik icmp_send_reply_v4(mp, ipha, icmph, ira); 1666 11042 Erik return (NULL); 1667 11042 Erik 1668 11042 Erik case ICMP_TIME_STAMP_REPLY: 1669 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); 1670 11042 Erik break; 1671 11042 Erik case ICMP_INFO_REQUEST: 1672 11042 Erik /* Per RFC 1122 3.2.2.7, ignore this. 
*/ 1673 11042 Erik case ICMP_INFO_REPLY: 1674 11042 Erik break; 1675 11042 Erik case ICMP_ADDRESS_MASK_REQUEST: 1676 11042 Erik if (ira->ira_flags & IRAF_MULTIBROADCAST) { 1677 11042 Erik interested = 1678 11042 Erik ipst->ips_ip_respond_to_address_mask_broadcast; 1679 11042 Erik } else { 1680 11042 Erik interested = B_TRUE; 1681 11042 Erik } 1682 11042 Erik if (!interested) { 1683 11042 Erik /* We never pass these to RAW sockets */ 1684 11042 Erik freemsg(mp); 1685 11042 Erik return (NULL); 1686 11042 Erik } 1687 11042 Erik len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN; 1688 11042 Erik if (mp->b_wptr - mp->b_rptr < len_needed) { 1689 11042 Erik ipha = ip_pullup(mp, len_needed, ira); 1690 11042 Erik if (ipha == NULL) { 1691 11042 Erik BUMP_MIB(ill->ill_ip_mib, 1692 11042 Erik ipIfStatsInTruncatedPkts); 1693 11042 Erik ip_drop_input("ipIfStatsInTruncatedPkts", mp, 1694 11042 Erik ill); 1695 11042 Erik freemsg(mp); 1696 11042 Erik return (NULL); 1697 11042 Erik } 1698 11042 Erik /* Refresh following the pullup. */ 1699 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1700 11042 Erik } 1701 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); 1702 11042 Erik /* Check db_ref to make sure we can modify the packet. */ 1703 11042 Erik if (mp->b_datap->db_ref > 1) { 1704 11042 Erik mblk_t *mp1; 1705 11042 Erik 1706 11042 Erik mp1 = copymsg(mp); 1707 11042 Erik freemsg(mp); 1708 11042 Erik if (!mp1) { 1709 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); 1710 11042 Erik return (NULL); 1711 11042 Erik } 1712 11042 Erik mp = mp1; 1713 11042 Erik ipha = (ipha_t *)mp->b_rptr; 1714 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1715 11042 Erik } 1716 11042 Erik /* 1717 11042 Erik * Need the ipif with the mask be the same as the source 1718 11042 Erik * address of the mask reply. For unicast we have a specific 1719 11042 Erik * ipif. 
For multicast/broadcast we only handle onlink 1720 11042 Erik * senders, and use the source address to pick an ipif. 1721 11042 Erik */ 1722 11042 Erik ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst); 1723 11042 Erik if (ipif == NULL) { 1724 11042 Erik /* Broadcast or multicast */ 1725 11042 Erik ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); 1726 11042 Erik if (ipif == NULL) { 1727 11042 Erik freemsg(mp); 1728 11042 Erik return (NULL); 1729 11042 Erik } 1730 11042 Erik } 1731 11042 Erik icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 1732 11042 Erik bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 1733 11042 Erik ipif_refrele(ipif); 1734 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); 1735 11042 Erik icmp_send_reply_v4(mp, ipha, icmph, ira); 1736 11042 Erik return (NULL); 1737 11042 Erik 1738 11042 Erik case ICMP_ADDRESS_MASK_REPLY: 1739 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); 1740 11042 Erik break; 1741 11042 Erik default: 1742 11042 Erik interested = B_TRUE; /* Pass up to transport */ 1743 11042 Erik BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); 1744 11042 Erik break; 1745 11042 Erik } 1746 11042 Erik /* 1747 11042 Erik * See if there is an ICMP client to avoid an extra copymsg/freemsg 1748 11042 Erik * if there isn't one. 1749 11042 Erik */ 1750 11042 Erik if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) { 1751 11042 Erik /* If there is an ICMP client and we want one too, copy it. */ 1752 11042 Erik 1753 11042 Erik if (!interested) { 1754 11042 Erik /* Caller will deliver to RAW sockets */ 1755 11042 Erik return (mp); 1756 11042 Erik } 1757 11042 Erik mp_ret = copymsg(mp); 1758 11042 Erik if (mp_ret == NULL) { 1759 3284 apersson BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1760 11042 Erik ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); 1761 11042 Erik } 1762 11042 Erik } else if (!interested) { 1763 11042 Erik /* Neither we nor raw sockets are interested. 
Drop packet now */ 1764 11042 Erik freemsg(mp); 1765 11042 Erik return (NULL); 1766 11042 Erik } 1767 11042 Erik 1768 11042 Erik /* 1769 11042 Erik * ICMP error or redirect packet. Make sure we have enough of 1770 11042 Erik * the header and that db_ref == 1 since we might end up modifying 1771 11042 Erik * the packet. 1772 11042 Erik */ 1773 11042 Erik if (mp->b_cont != NULL) { 1774 11042 Erik if (ip_pullup(mp, -1, ira) == NULL) { 1775 3284 apersson BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1776 11042 Erik ip_drop_input("ipIfStatsInDiscards - ip_pullup", 1777 11042 Erik mp, ill); 1778 11042 Erik freemsg(mp); 1779 11042 Erik return (mp_ret); 1780 11042 Erik } 1781 11042 Erik } 1782 11042 Erik 1783 11042 Erik if (mp->b_datap->db_ref > 1) { 1784 11042 Erik mblk_t *mp1; 1785 11042 Erik 1786 11042 Erik mp1 = copymsg(mp); 1787 11042 Erik if (mp1 == NULL) { 1788 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1789 11042 Erik ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); 1790 11042 Erik freemsg(mp); 1791 11042 Erik return (mp_ret); 1792 11042 Erik } 1793 11042 Erik freemsg(mp); 1794 11042 Erik mp = mp1; 1795 11042 Erik } 1796 11042 Erik 1797 11042 Erik /* 1798 11042 Erik * In case mp has changed, verify the message before any further 1799 11042 Erik * processes. 
1800 11042 Erik */ 1801 11042 Erik ipha = (ipha_t *)mp->b_rptr; 1802 11042 Erik icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; 1803 11042 Erik if (!icmp_inbound_verify_v4(mp, icmph, ira)) { 1804 11042 Erik freemsg(mp); 1805 11042 Erik return (mp_ret); 1806 11042 Erik } 1807 11042 Erik 1808 11042 Erik switch (icmph->icmph_type) { 1809 11042 Erik case ICMP_REDIRECT: 1810 11042 Erik icmp_redirect_v4(mp, ipha, icmph, ira); 1811 11042 Erik break; 1812 11042 Erik case ICMP_DEST_UNREACHABLE: 1813 11042 Erik if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { 1814 11042 Erik /* Update DCE and adjust MTU is icmp header if needed */ 1815 11042 Erik icmp_inbound_too_big_v4(icmph, ira); 1816 11042 Erik } 1817 11042 Erik /* FALLTHRU */ 1818 11042 Erik default: 1819 11042 Erik icmp_inbound_error_fanout_v4(mp, icmph, ira); 1820 11042 Erik break; 1821 11042 Erik } 1822 11042 Erik return (mp_ret); 1823 11042 Erik } 1824 11042 Erik 1825 11042 Erik /* 1826 11042 Erik * Send an ICMP echo, timestamp or address mask reply. 1827 11042 Erik * The caller has already updated the payload part of the packet. 1828 11042 Erik * We handle the ICMP checksum, IP source address selection and feed 1829 11042 Erik * the packet into ip_output_simple. 1830 11042 Erik */ 1831 11042 Erik static void 1832 11042 Erik icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, 1833 11042 Erik ip_recv_attr_t *ira) 1834 11042 Erik { 1835 11042 Erik uint_t ip_hdr_length = ira->ira_ip_hdr_length; 1836 11042 Erik ill_t *ill = ira->ira_ill; 1837 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 1838 11042 Erik ip_xmit_attr_t ixas; 1839 11042 Erik 1840 0 stevel /* Send out an ICMP packet */ 1841 0 stevel icmph->icmph_checksum = 0; 1842 11042 Erik icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0); 1843 0 stevel /* Reset time to live. 
*/ 1844 3448 dh155122 ipha->ipha_ttl = ipst->ips_ip_def_ttl; 1845 0 stevel { 1846 0 stevel /* Swap source and destination addresses */ 1847 0 stevel ipaddr_t tmp; 1848 0 stevel 1849 0 stevel tmp = ipha->ipha_src; 1850 0 stevel ipha->ipha_src = ipha->ipha_dst; 1851 0 stevel ipha->ipha_dst = tmp; 1852 0 stevel } 1853 0 stevel ipha->ipha_ident = 0; 1854 0 stevel if (!IS_SIMPLE_IPH(ipha)) 1855 0 stevel icmp_options_update(ipha); 1856 0 stevel 1857 11042 Erik bzero(&ixas, sizeof (ixas)); 1858 11042 Erik ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 1859 11042 Erik ixas.ixa_zoneid = ira->ira_zoneid; 1860 11042 Erik ixas.ixa_cred = kcred; 1861 11042 Erik ixas.ixa_cpid = NOPID; 1862 11042 Erik ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ 1863 11042 Erik ixas.ixa_ifindex = 0; 1864 11042 Erik ixas.ixa_ipst = ipst; 1865 11042 Erik ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1866 11042 Erik 1867 11042 Erik if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { 1868 0 stevel /* 1869 0 stevel * This packet should go out the same way as it 1870 11042 Erik * came in i.e in clear, independent of the IPsec policy 1871 11042 Erik * for transmitting packets. 1872 11042 Erik */ 1873 11042 Erik ixas.ixa_flags |= IXAF_NO_IPSEC; 1874 11042 Erik } else { 1875 11042 Erik if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { 1876 3284 apersson BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1877 11042 Erik /* Note: mp already consumed and ip_drop_packet done */ 1878 11042 Erik return; 1879 11042 Erik } 1880 11042 Erik } 1881 11042 Erik if (ira->ira_flags & IRAF_MULTIBROADCAST) { 1882 11042 Erik /* 1883 11042 Erik * Not one or our addresses (IRE_LOCALs), thus we let 1884 11042 Erik * ip_output_simple pick the source. 1885 11042 Erik */ 1886 11042 Erik ipha->ipha_src = INADDR_ANY; 1887 11042 Erik ixas.ixa_flags |= IXAF_SET_SOURCE; 1888 11042 Erik } 1889 11042 Erik /* Should we send with DF and use dce_pmtu? 
*/ 1890 11042 Erik if (ipst->ips_ipv4_icmp_return_pmtu) { 1891 11042 Erik ixas.ixa_flags |= IXAF_PMTU_DISCOVERY; 1892 11042 Erik ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; 1893 11042 Erik } 1894 11042 Erik 1895 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); 1896 11042 Erik 1897 11042 Erik (void) ip_output_simple(mp, &ixas); 1898 11042 Erik ixa_cleanup(&ixas); 1899 11042 Erik } 1900 11042 Erik 1901 11042 Erik /* 1902 11042 Erik * Verify the ICMP messages for either for ICMP error or redirect packet. 1903 11042 Erik * The caller should have fully pulled up the message. If it's a redirect 1904 11042 Erik * packet, only basic checks on IP header will be done; otherwise, verify 1905 11042 Erik * the packet by looking at the included ULP header. 1906 11042 Erik * 1907 11042 Erik * Called before icmp_inbound_error_fanout_v4 is called. 1908 11042 Erik */ 1909 11042 Erik static boolean_t 1910 11042 Erik icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) 1911 11042 Erik { 1912 11042 Erik ill_t *ill = ira->ira_ill; 1913 11042 Erik int hdr_length; 1914 11042 Erik ip_stack_t *ipst = ira->ira_ill->ill_ipst; 1915 11042 Erik conn_t *connp; 1916 11042 Erik ipha_t *ipha; /* Inner IP header */ 1917 11042 Erik 1918 11042 Erik ipha = (ipha_t *)&icmph[1]; 1919 11042 Erik if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr) 1920 11042 Erik goto truncated; 1921 11042 Erik 1922 11042 Erik hdr_length = IPH_HDR_LENGTH(ipha); 1923 11042 Erik 1924 11042 Erik if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) 1925 11042 Erik goto discard_pkt; 1926 11042 Erik 1927 11042 Erik if (hdr_length < sizeof (ipha_t)) 1928 11042 Erik goto truncated; 1929 11042 Erik 1930 11042 Erik if ((uchar_t *)ipha + hdr_length > mp->b_wptr) 1931 11042 Erik goto truncated; 1932 11042 Erik 1933 11042 Erik /* 1934 11042 Erik * Stop here for ICMP_REDIRECT. 
1935 11042 Erik */ 1936 11042 Erik if (icmph->icmph_type == ICMP_REDIRECT) 1937 11042 Erik return (B_TRUE); 1938 11042 Erik 1939 11042 Erik /* 1940 11042 Erik * ICMP errors only. 1941 11042 Erik */ 1942 2252 priyanka switch (ipha->ipha_protocol) { 1943 11042 Erik case IPPROTO_UDP: 1944 11042 Erik /* 1945 11042 Erik * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 1946 11042 Erik * transport header. 1947 11042 Erik */ 1948 11042 Erik if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 1949 11042 Erik mp->b_wptr) 1950 11042 Erik goto truncated; 1951 11042 Erik break; 1952 11042 Erik case IPPROTO_TCP: { 1953 11042 Erik tcpha_t *tcpha; 1954 11042 Erik 1955 11042 Erik /* 1956 11042 Erik * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 1957 11042 Erik * transport header. 1958 11042 Erik */ 1959 11042 Erik if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 1960 11042 Erik mp->b_wptr) 1961 11042 Erik goto truncated; 1962 11042 Erik 1963 11042 Erik tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); 1964 11042 Erik connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, 1965 11042 Erik ipst); 1966 11042 Erik if (connp == NULL) 1967 11042 Erik goto discard_pkt; 1968 11042 Erik 1969 11042 Erik if ((connp->conn_verifyicmp != NULL) && 1970 11042 Erik !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) { 1971 11042 Erik CONN_DEC_REF(connp); 1972 11042 Erik goto discard_pkt; 1973 11042 Erik } 1974 2252 priyanka CONN_DEC_REF(connp); 1975 11042 Erik break; 1976 11042 Erik } 1977 11042 Erik case IPPROTO_SCTP: 1978 11042 Erik /* 1979 11042 Erik * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of 1980 11042 Erik * transport header. 
1981 11042 Erik */ 1982 11042 Erik if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > 1983 11042 Erik mp->b_wptr) 1984 11042 Erik goto truncated; 1985 11042 Erik break; 1986 11042 Erik case IPPROTO_ESP: 1987 11042 Erik case IPPROTO_AH: 1988 11042 Erik break; 1989 11042 Erik case IPPROTO_ENCAP: 1990 11042 Erik if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > 1991 11042 Erik mp->b_wptr) 1992 11042 Erik goto truncated; 1993 11042 Erik break; 1994 11042 Erik default: 1995 11042 Erik break; 1996 11042 Erik } 1997 11042 Erik 1998 11042 Erik return (B_TRUE); 1999 11042 Erik 2000 11042 Erik discard_pkt: 2001 11042 Erik /* Bogus ICMP error. */ 2002 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2003 11042 Erik return (B_FALSE); 2004 11042 Erik 2005 11042 Erik truncated: 2006 11042 Erik /* We pulled up everthing already. Must be truncated */ 2007 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); 2008 11042 Erik ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); 2009 11042 Erik return (B_FALSE); 2010 2252 priyanka } 2011 2252 priyanka 2012 0 stevel /* Table from RFC 1191 */ 2013 0 stevel static int icmp_frag_size_table[] = 2014 0 stevel { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; 2015 0 stevel 2016 0 stevel /* 2017 0 stevel * Process received ICMP Packet too big. 2018 11042 Erik * Just handles the DCE create/update, including using the above table of 2019 11042 Erik * PMTU guesses. The caller is responsible for validating the packet before 2020 11042 Erik * passing it in and also to fanout the ICMP error to any matching transport 2021 11042 Erik * conns. Assumes the message has been fully pulled up and verified. 2022 11042 Erik * 2023 11042 Erik * Before getting here, the caller has called icmp_inbound_verify_v4() 2024 11042 Erik * that should have verified with ULP to prevent undoing the changes we're 2025 11042 Erik * going to make to DCE. 
For example, TCP might have verified that the packet 2026 11042 Erik * which generated error is in the send window. 2027 11042 Erik * 2028 11042 Erik * In some cases modified this MTU in the ICMP header packet; the caller 2029 11042 Erik * should pass to the matching ULP after this returns. 2030 11042 Erik */ 2031 11042 Erik static void 2032 11042 Erik icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira) 2033 11042 Erik { 2034 11042 Erik dce_t *dce; 2035 11042 Erik int old_mtu; 2036 11042 Erik int mtu, orig_mtu; 2037 11042 Erik ipaddr_t dst; 2038 11042 Erik boolean_t disable_pmtud; 2039 11042 Erik ill_t *ill = ira->ira_ill; 2040 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 2041 11042 Erik uint_t hdr_length; 2042 11042 Erik ipha_t *ipha; 2043 11042 Erik 2044 11042 Erik /* Caller already pulled up everything. */ 2045 11042 Erik ipha = (ipha_t *)&icmph[1]; 2046 0 stevel ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && 2047 0 stevel icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); 2048 3284 apersson ASSERT(ill != NULL); 2049 0 stevel 2050 0 stevel hdr_length = IPH_HDR_LENGTH(ipha); 2051 0 stevel 2052 11042 Erik /* 2053 11042 Erik * We handle path MTU for source routed packets since the DCE 2054 11042 Erik * is looked up using the final destination. 
2055 11042 Erik */ 2056 11042 Erik dst = ip_get_dst(ipha); 2057 11042 Erik 2058 11042 Erik dce = dce_lookup_and_add_v4(dst, ipst); 2059 11042 Erik if (dce == NULL) { 2060 11042 Erik /* Couldn't add a unique one - ENOMEM */ 2061 11042 Erik ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n", 2062 11042 Erik ntohl(dst))); 2063 11042 Erik return; 2064 0 stevel } 2065 8106 Kacheong 2066 0 stevel /* Check for MTU discovery advice as described in RFC 1191 */ 2067 0 stevel mtu = ntohs(icmph->icmph_du_mtu); 2068 8106 Kacheong orig_mtu = mtu; 2069 8106 Kacheong disable_pmtud = B_FALSE; 2070 8106 Kacheong 2071 11042 Erik mutex_enter(&dce->dce_lock); 2072 11042 Erik if (dce->dce_flags & DCEF_PMTU) 2073 11042 Erik old_mtu = dce->dce_pmtu; 2074 11042 Erik else 2075 11042 Erik old_mtu = ill->ill_mtu; 2076 11042 Erik 2077 11042 Erik if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { 2078 11042 Erik uint32_t length; 2079 11042 Erik int i; 2080 11042 Erik 2081 11042 Erik /* 2082 11042 Erik * Use the table from RFC 1191 to figure out 2083 11042 Erik * the next "plateau" based on the length in 2084 11042 Erik * the original IP packet. 2085 11042 Erik */ 2086 11042 Erik length = ntohs(ipha->ipha_length); 2087 11042 Erik DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce, 2088 11042 Erik uint32_t, length); 2089 11042 Erik if (old_mtu <= length && 2090 11042 Erik old_mtu >= length - hdr_length) { 2091 11042 Erik /* 2092 11042 Erik * Handle broken BSD 4.2 systems that 2093 11042 Erik * return the wrong ipha_length in ICMP 2094 11042 Erik * errors. 2095 11042 Erik */ 2096 11042 Erik ip1dbg(("Wrong mtu: sent %d, dce %d\n", 2097 11042 Erik length, old_mtu)); 2098 11042 Erik length -= hdr_length; 2099 11042 Erik } 2100 11042 Erik for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { 2101 11042 Erik if (length > icmp_frag_size_table[i]) 2102 11042 Erik break; 2103 11042 Erik } 2104 11042 Erik if (i == A_CNT(icmp_frag_size_table)) { 2105 11042 Erik /* Smaller than IP_MIN_MTU! 
*/ 2106 11042 Erik ip1dbg(("Too big for packet size %d\n", 2107 11042 Erik length)); 2108 11042 Erik disable_pmtud = B_TRUE; 2109 11042 Erik mtu = ipst->ips_ip_pmtu_min; 2110 11042 Erik } else { 2111 11042 Erik mtu = icmp_frag_size_table[i]; 2112 11042 Erik ip1dbg(("Calculated mtu %d, packet size %d, " 2113 11042 Erik "before %d\n", mtu, length, old_mtu)); 2114 11042 Erik if (mtu < ipst->ips_ip_pmtu_min) { 2115 11042 Erik mtu = ipst->ips_ip_pmtu_min; 2116 8106 Kacheong disable_pmtud = B_TRUE; 2117 11042 Erik } 2118 11042 Erik } 2119 11042 Erik } 2120 11042 Erik if (disable_pmtud) 2121 11042 Erik dce->dce_flags |= DCEF_TOO_SMALL_PMTU; 2122 11042 Erik else 2123 11042 Erik dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; 2124 11042 Erik 2125 11042 Erik dce->dce_pmtu = MIN(old_mtu, mtu); 2126 11042 Erik /* Prepare to send the new max frag size for the ULP. */ 2127 11042 Erik icmph->icmph_du_zero = 0; 2128 11042 Erik icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu); 2129 11042 Erik DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *, 2130 11042 Erik dce, int, orig_mtu, int, mtu); 2131 11042 Erik 2132 11042 Erik /* We now have a PMTU for sure */ 2133 11042 Erik dce->dce_flags |= DCEF_PMTU; 2134 11066 rafael dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64()); 2135 11042 Erik mutex_exit(&dce->dce_lock); 2136 11042 Erik /* 2137 11042 Erik * After dropping the lock the new value is visible to everyone. 2138 11042 Erik * Then we bump the generation number so any cached values reinspect 2139 11042 Erik * the dce_t. 2140 11042 Erik */ 2141 11042 Erik dce_increment_generation(dce); 2142 11042 Erik dce_refrele(dce); 2143 11042 Erik } 2144 11042 Erik 2145 11042 Erik /* 2146 11042 Erik * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4 2147 0 stevel * calls this function. 
2148 0 stevel */ 2149 0 stevel static mblk_t * 2150 11042 Erik icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha) 2151 11042 Erik { 2152 0 stevel int length; 2153 0 stevel 2154 0 stevel ASSERT(mp->b_datap->db_type == M_DATA); 2155 0 stevel 2156 11042 Erik /* icmp_inbound_v4 has already pulled up the whole error packet */ 2157 11042 Erik ASSERT(mp->b_cont == NULL); 2158 11042 Erik 2159 11042 Erik /* 2160 11042 Erik * The length that we want to overlay is the inner header 2161 11042 Erik * and what follows it. 2162 11042 Erik */ 2163 11042 Erik length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr); 2164 11042 Erik 2165 11042 Erik /* 2166 11042 Erik * Overlay the inner header and whatever follows it over the 2167 0 stevel * outer header. 2168 0 stevel */ 2169 0 stevel bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); 2170 0 stevel 2171 11042 Erik /* Adjust for what we removed */ 2172 11042 Erik mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha; 2173 0 stevel return (mp); 2174 10616 Sebastien } 2175 10616 Sebastien 2176 10616 Sebastien /* 2177 0 stevel * Try to pass the ICMP message upstream in case the ULP cares. 2178 0 stevel * 2179 0 stevel * If the packet that caused the ICMP error is secure, we send 2180 0 stevel * it to AH/ESP to make sure that the attached packet has a 2181 0 stevel * valid association. ipha in the code below points to the 2182 0 stevel * IP header of the packet that caused the error. 2183 0 stevel * 2184 10616 Sebastien * For IPsec cases, we let the next-layer-up (which has access to 2185 10616 Sebastien * cached policy on the conn_t, or can query the SPD directly) 2186 10616 Sebastien * subtract out any IPsec overhead if they must. We therefore make no 2187 10616 Sebastien * adjustments here for IPsec overhead. 2188 0 stevel * 2189 0 stevel * IFN could have been generated locally or by some router. 
2190 0 stevel * 2191 11042 Erik * LOCAL : ire_send_wire (before calling ipsec_out_process) can call 2192 11042 Erik * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN. 2193 0 stevel * This happens because IP adjusted its value of MTU on an 2194 0 stevel * earlier IFN message and could not tell the upper layer, 2195 0 stevel * the new adjusted value of MTU e.g. Packet was encrypted 2196 0 stevel * or there was not enough information to fanout to upper 2197 11042 Erik * layers. Thus on the next outbound datagram, ire_send_wire 2198 4987 danmcd * generates the IFN, where IPsec processing has *not* been 2199 0 stevel * done. 2200 0 stevel * 2201 11042 Erik * Note that we retain ixa_fragsize across IPsec thus once 2202 11042 Erik * we have picking ixa_fragsize and entered ipsec_out_process we do 2203 11042 Erik * no change the fragsize even if the path MTU changes before 2204 11042 Erik * we reach ip_output_post_ipsec. 2205 11042 Erik * 2206 11042 Erik * In the local case, IRAF_LOOPBACK will be set indicating 2207 0 stevel * that IFN was generated locally. 2208 0 stevel * 2209 0 stevel * ROUTER : IFN could be secure or non-secure. 2210 0 stevel * 2211 0 stevel * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the 2212 0 stevel * packet in error has AH/ESP headers to validate the AH/ESP 2213 0 stevel * headers. AH/ESP will verify whether there is a valid SA or 2214 0 stevel * not and send it back. We will fanout again if we have more 2215 0 stevel * data in the packet. 2216 0 stevel * 2217 0 stevel * If the packet in error does not have AH/ESP, we handle it 2218 0 stevel * like any other case. 2219 0 stevel * 2220 11042 Erik * * NON_SECURE : If the packet in error has AH/ESP headers, we send it 2221 11042 Erik * up to AH/ESP for validation. AH/ESP will verify whether there is a 2222 0 stevel * valid SA or not and send it back. We will fanout again if 2223 0 stevel * we have more data in the packet. 
2224 0 stevel * 2225 0 stevel * If the packet in error does not have AH/ESP, we handle it 2226 0 stevel * like any other case. 2227 11042 Erik * 2228 11042 Erik * The caller must have called icmp_inbound_verify_v4. 2229 11042 Erik */ 2230 11042 Erik static void 2231 11042 Erik icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) 2232 11042 Erik { 2233 11042 Erik uint16_t *up; /* Pointer to ports in ULP header */ 2234 11042 Erik uint32_t ports; /* reversed ports for fanout */ 2235 11042 Erik ipha_t ripha; /* With reversed addresses */ 2236 11042 Erik ipha_t *ipha; /* Inner IP header */ 2237 11042 Erik uint_t hdr_length; /* Inner IP header length */ 2238 11042 Erik tcpha_t *tcpha; 2239 11042 Erik conn_t *connp; 2240 11042 Erik ill_t *ill = ira->ira_ill; 2241 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 2242 11042 Erik ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 2243 11042 Erik ill_t *rill = ira->ira_rill; 2244 11042 Erik 2245 11042 Erik /* Caller already pulled up everything. */ 2246 11042 Erik ipha = (ipha_t *)&icmph[1]; 2247 11042 Erik ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr); 2248 11042 Erik ASSERT(mp->b_cont == NULL); 2249 11042 Erik 2250 11042 Erik hdr_length = IPH_HDR_LENGTH(ipha); 2251 11042 Erik ira->ira_protocol = ipha->ipha_protocol; 2252 0 stevel 2253 10616 Sebastien /* 2254 10616 Sebastien * We need a separate IP header with the source and destination 2255 10616 Sebastien * addresses reversed to do fanout/classification because the ipha in 2256 10616 Sebastien * the ICMP error is in the form we sent it out. 
2257 10616 Sebastien */ 2258 10616 Sebastien ripha.ipha_src = ipha->ipha_dst; 2259 10616 Sebastien ripha.ipha_dst = ipha->ipha_src; 2260 10616 Sebastien ripha.ipha_protocol = ipha->ipha_protocol; 2261 10616 Sebastien ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length; 2262 10616 Sebastien 2263 11042 Erik ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n", 2264 10616 Sebastien ripha.ipha_protocol, ntohl(ipha->ipha_src), 2265 10616 Sebastien ntohl(ipha->ipha_dst), 2266 10616 Sebastien icmph->icmph_type, icmph->icmph_code)); 2267 10616 Sebastien 2268 0 stevel switch (ipha->ipha_protocol) { 2269 0 stevel case IPPROTO_UDP: 2270 0 stevel up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2271 0 stevel 2272 10616 Sebastien /* Attempt to find a client stream based on port. */ 2273 11042 Erik ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n", 2274 10616 Sebastien ntohs(up[0]), ntohs(up[1]))); 2275 0 stevel 2276 11042 Erik /* Note that we send error to all matches. */ 2277 11042 Erik ira->ira_flags |= IRAF_ICMP_ERROR; 2278 11042 Erik ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira); 2279 11042 Erik ira->ira_flags &= ~IRAF_ICMP_ERROR; 2280 0 stevel return; 2281 0 stevel 2282 0 stevel case IPPROTO_TCP: 2283 0 stevel /* 2284 0 stevel * Find a TCP client stream for this packet. 2285 0 stevel * Note that we do a reverse lookup since the header is 2286 0 stevel * in the form we sent it out. 
2287 0 stevel */ 2288 11042 Erik tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); 2289 11042 Erik connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, 2290 3448 dh155122 ipst); 2291 3284 apersson if (connp == NULL) 2292 3284 apersson goto discard_pkt; 2293 0 stevel 2294 11042 Erik if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2295 11042 Erik (ira->ira_flags & IRAF_IPSEC_SECURE)) { 2296 11042 Erik mp = ipsec_check_inbound_policy(mp, connp, 2297 11042 Erik ipha, NULL, ira); 2298 11042 Erik if (mp == NULL) { 2299 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2300 11042 Erik /* Note that mp is NULL */ 2301 11042 Erik ip_drop_input("ipIfStatsInDiscards", mp, ill); 2302 11042 Erik CONN_DEC_REF(connp); 2303 11042 Erik return; 2304 11042 Erik } 2305 11042 Erik } 2306 11042 Erik 2307 11042 Erik ira->ira_flags |= IRAF_ICMP_ERROR; 2308 11042 Erik ira->ira_ill = ira->ira_rill = NULL; 2309 11042 Erik if (IPCL_IS_TCP(connp)) { 2310 11042 Erik SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 2311 11042 Erik connp->conn_recvicmp, connp, ira, SQ_FILL, 2312 11042 Erik SQTAG_TCP_INPUT_ICMP_ERR); 2313 11042 Erik } else { 2314 11042 Erik /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 2315 11042 Erik (connp->conn_recv)(connp, mp, NULL, ira); 2316 11042 Erik CONN_DEC_REF(connp); 2317 11042 Erik } 2318 11042 Erik ira->ira_ill = ill; 2319 11042 Erik ira->ira_rill = rill; 2320 11042 Erik ira->ira_flags &= ~IRAF_ICMP_ERROR; 2321 0 stevel return; 2322 0 stevel 2323 0 stevel case IPPROTO_SCTP: 2324 0 stevel up = (uint16_t *)((uchar_t *)ipha + hdr_length); 2325 10616 Sebastien /* Find a SCTP client stream for this packet. 
*/ 2326 0 stevel ((uint16_t *)&ports)[0] = up[1]; 2327 0 stevel ((uint16_t *)&ports)[1] = up[0]; 2328 0 stevel 2329 11042 Erik ira->ira_flags |= IRAF_ICMP_ERROR; 2330 11042 Erik ip_fanout_sctp(mp, &ripha, NULL, ports, ira); 2331 11042 Erik ira->ira_flags &= ~IRAF_ICMP_ERROR; 2332 0 stevel return; 2333 0 stevel 2334 0 stevel case IPPROTO_ESP: 2335 11042 Erik case IPPROTO_AH: 2336 11042 Erik if (!ipsec_loaded(ipss)) { 2337 11042 Erik ip_proto_not_sup(mp, ira); 2338 11042 Erik return; 2339 11042 Erik } 2340 11042 Erik 2341 11042 Erik if (ipha->ipha_protocol == IPPROTO_ESP) 2342 11042 Erik mp = ipsecesp_icmp_error(mp, ira); 2343 11042 Erik else 2344 11042 Erik mp = ipsecah_icmp_error(mp, ira); 2345 11042 Erik if (mp == NULL) 2346 11042 Erik return; 2347 11042 Erik 2348 11042 Erik /* Just in case ipsec didn't preserve the NULL b_cont */ 2349 11042 Erik if (mp->b_cont != NULL) { 2350 11042 Erik if (!pullupmsg(mp, -1)) 2351 11042 Erik goto discard_pkt; 2352 11042 Erik } 2353 11042 Erik 2354 11042 Erik /* 2355 11042 Erik * Note that ira_pktlen and ira_ip_hdr_length are no longer 2356 11042 Erik * correct, but we don't use them any more here. 2357 11042 Erik * 2358 11042 Erik * If succesful, the mp has been modified to not include 2359 11042 Erik * the ESP/AH header so we can fanout to the ULP's icmp 2360 11042 Erik * error handler. 2361 11042 Erik */ 2362 11042 Erik if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) 2363 11042 Erik goto truncated; 2364 11042 Erik 2365 11042 Erik /* Verify the modified message before any further processes. 
*/ 2366 11042 Erik ipha = (ipha_t *)mp->b_rptr; 2367 11042 Erik hdr_length = IPH_HDR_LENGTH(ipha); 2368 11042 Erik icmph = (icmph_t *)&mp->b_rptr[hdr_length]; 2369 11042 Erik if (!icmp_inbound_verify_v4(mp, icmph, ira)) { 2370 11042 Erik freemsg(mp); 2371 11042 Erik return; 2372 11042 Erik } 2373 11042 Erik 2374 11042 Erik icmp_inbound_error_fanout_v4(mp, icmph, ira); 2375 11042 Erik return; 2376 11042 Erik 2377 11042 Erik case IPPROTO_ENCAP: { 2378 11042 Erik /* Look for self-encapsulated packets that caused an error */ 2379 11042 Erik ipha_t *in_ipha; 2380 11042 Erik 2381 11042 Erik /* 2382 11042 Erik * Caller has verified that length has to be 2383 11042 Erik * at least the size of IP header. 2384 11042 Erik */ 2385 11042 Erik ASSERT(hdr_length >= sizeof (ipha_t)); 2386 11042 Erik /* 2387 11042 Erik * Check the sanity of the inner IP header like 2388 11042 Erik * we did for the outer header. 2389 11042 Erik */ 2390 11042 Erik in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); 2391 11042 Erik if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { 2392 11042 Erik goto discard_pkt; 2393 11042 Erik } 2394 11042 Erik if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { 2395 11042 Erik goto discard_pkt; 2396 11042 Erik } 2397 11042 Erik /* Check for Self-encapsulated tunnels */ 2398 11042 Erik if (in_ipha->ipha_src == ipha->ipha_src && 2399 11042 Erik in_ipha->ipha_dst == ipha->ipha_dst) { 2400 11042 Erik 2401 11042 Erik mp = icmp_inbound_self_encap_error_v4(mp, ipha, 2402 11042 Erik in_ipha); 2403 11042 Erik if (mp == NULL) 2404 11042 Erik goto discard_pkt; 2405 11042 Erik 2406 11042 Erik /* 2407 11042 Erik * Just in case self_encap didn't preserve the NULL 2408 11042 Erik * b_cont 2409 11042 Erik */ 2410 11042 Erik if (mp->b_cont != NULL) { 2411 11042 Erik if (!pullupmsg(mp, -1)) 2412 11042 Erik goto discard_pkt; 2413 11042 Erik } 2414 11042 Erik /* 2415 11042 Erik * Note that ira_pktlen and ira_ip_hdr_length are no 2416 11042 Erik * longer correct, but we don't use them 
any more here. 2417 11042 Erik */ 2418 11042 Erik if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) 2419 11042 Erik goto truncated; 2420 11042 Erik 2421 11042 Erik /* 2422 11042 Erik * Verify the modified message before any further 2423 11042 Erik * processes. 2424 11042 Erik */ 2425 11042 Erik ipha = (ipha_t *)mp->b_rptr; 2426 11042 Erik hdr_length = IPH_HDR_LENGTH(ipha); 2427 11042 Erik icmph = (icmph_t *)&mp->b_rptr[hdr_length]; 2428 11042 Erik if (!icmp_inbound_verify_v4(mp, icmph, ira)) { 2429 0 stevel freemsg(mp); 2430 0 stevel return; 2431 0 stevel } 2432 11042 Erik 2433 11042 Erik /* 2434 11042 Erik * The packet in error is self-encapsualted. 2435 11042 Erik * And we are finding it further encapsulated 2436 11042 Erik * which we could not have possibly generated. 2437 11042 Erik */ 2438 11042 Erik if (ipha->ipha_protocol == IPPROTO_ENCAP) { 2439 11042 Erik goto discard_pkt; 2440 11042 Erik } 2441 11042 Erik icmp_inbound_error_fanout_v4(mp, icmph, ira); 2442 11042 Erik return; 2443 11042 Erik } 2444 11042 Erik /* No self-encapsulated */ 2445 11042 Erik /* FALLTHRU */ 2446 11042 Erik } 2447 10616 Sebastien case IPPROTO_IPV6: 2448 11042 Erik if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src, 2449 11042 Erik &ripha.ipha_dst, ipst)) != NULL) { 2450 11042 Erik ira->ira_flags |= IRAF_ICMP_ERROR; 2451 11042 Erik connp->conn_recvicmp(connp, mp, NULL, ira); 2452 11042 Erik CONN_DEC_REF(connp); 2453 11042 Erik ira->ira_flags &= ~IRAF_ICMP_ERROR; 2454 11042 Erik return; 2455 11042 Erik } 2456 10616 Sebastien /* 2457 10616 Sebastien * No IP tunnel is interested, fallthrough and see 2458 10616 Sebastien * if a raw socket will want it. 
2459 10616 Sebastien */ 2460 10616 Sebastien /* FALLTHRU */ 2461 10616 Sebastien default: 2462 11042 Erik ira->ira_flags |= IRAF_ICMP_ERROR; 2463 11042 Erik ip_fanout_proto_v4(mp, &ripha, ira); 2464 11042 Erik ira->ira_flags &= ~IRAF_ICMP_ERROR; 2465 0 stevel return; 2466 0 stevel } 2467 0 stevel /* NOTREACHED */ 2468 3284 apersson discard_pkt: 2469 3284 apersson BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2470 11042 Erik ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n")); 2471 11042 Erik ip_drop_input("ipIfStatsInDiscards", mp, ill); 2472 11042 Erik freemsg(mp); 2473 11042 Erik return; 2474 11042 Erik 2475 11042 Erik truncated: 2476 11042 Erik /* We pulled up everthing already. Must be truncated */ 2477 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); 2478 11042 Erik ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); 2479 11042 Erik freemsg(mp); 2480 0 stevel } 2481 0 stevel 2482 0 stevel /* 2483 0 stevel * Common IP options parser. 2484 0 stevel * 2485 0 stevel * Setup routine: fill in *optp with options-parsing state, then 2486 0 stevel * tail-call ipoptp_next to return the first option. 
2487 0 stevel */ 2488 0 stevel uint8_t 2489 0 stevel ipoptp_first(ipoptp_t *optp, ipha_t *ipha) 2490 0 stevel { 2491 0 stevel uint32_t totallen; /* total length of all options */ 2492 0 stevel 2493 0 stevel totallen = ipha->ipha_version_and_hdr_length - 2494 0 stevel (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); 2495 0 stevel totallen <<= 2; 2496 0 stevel optp->ipoptp_next = (uint8_t *)(&ipha[1]); 2497 11042 Erik optp->ipoptp_end = optp->ipoptp_next + totallen; 2498 11042 Erik optp->ipoptp_flags = 0; 2499 11042 Erik return (ipoptp_next(optp)); 2500 11042 Erik } 2501 11042 Erik 2502 11042 Erik /* Like above but without an ipha_t */ 2503 11042 Erik uint8_t 2504 11042 Erik ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt) 2505 11042 Erik { 2506 11042 Erik optp->ipoptp_next = opt; 2507 0 stevel optp->ipoptp_end = optp->ipoptp_next + totallen; 2508 0 stevel optp->ipoptp_flags = 0; 2509 0 stevel return (ipoptp_next(optp)); 2510 0 stevel } 2511 0 stevel 2512 0 stevel /* 2513 0 stevel * Common IP options parser: extract next option. 2514 0 stevel */ 2515 0 stevel uint8_t 2516 0 stevel ipoptp_next(ipoptp_t *optp) 2517 0 stevel { 2518 0 stevel uint8_t *end = optp->ipoptp_end; 2519 0 stevel uint8_t *cur = optp->ipoptp_next; 2520 0 stevel uint8_t opt, len, pointer; 2521 0 stevel 2522 0 stevel /* 2523 0 stevel * If cur > end already, then the ipoptp_end or ipoptp_next pointer 2524 0 stevel * has been corrupted. 2525 0 stevel */ 2526 0 stevel ASSERT(cur <= end); 2527 0 stevel 2528 0 stevel if (cur == end) 2529 0 stevel return (IPOPT_EOL); 2530 0 stevel 2531 0 stevel opt = cur[IPOPT_OPTVAL]; 2532 0 stevel 2533 0 stevel /* 2534 0 stevel * Skip any NOP options. 
2535 0 stevel */ 2536 0 stevel while (opt == IPOPT_NOP) { 2537 0 stevel cur++; 2538 0 stevel if (cur == end) 2539 0 stevel return (IPOPT_EOL); 2540 0 stevel opt = cur[IPOPT_OPTVAL]; 2541 0 stevel } 2542 0 stevel 2543 0 stevel if (opt == IPOPT_EOL) 2544 0 stevel return (IPOPT_EOL); 2545 0 stevel 2546 0 stevel /* 2547 0 stevel * Option requiring a length. 2548 0 stevel */ 2549 0 stevel if ((cur + 1) >= end) { 2550 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2551 0 stevel return (IPOPT_EOL); 2552 0 stevel } 2553 0 stevel len = cur[IPOPT_OLEN]; 2554 0 stevel if (len < 2) { 2555 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2556 0 stevel return (IPOPT_EOL); 2557 0 stevel } 2558 0 stevel optp->ipoptp_cur = cur; 2559 0 stevel optp->ipoptp_len = len; 2560 0 stevel optp->ipoptp_next = cur + len; 2561 0 stevel if (cur + len > end) { 2562 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2563 0 stevel return (IPOPT_EOL); 2564 0 stevel } 2565 0 stevel 2566 0 stevel /* 2567 0 stevel * For the options which require a pointer field, make sure 2568 0 stevel * its there, and make sure it points to either something 2569 0 stevel * inside this option, or the end of the option. 2570 0 stevel */ 2571 0 stevel switch (opt) { 2572 0 stevel case IPOPT_RR: 2573 0 stevel case IPOPT_TS: 2574 0 stevel case IPOPT_LSRR: 2575 0 stevel case IPOPT_SSRR: 2576 0 stevel if (len <= IPOPT_OFFSET) { 2577 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2578 0 stevel return (opt); 2579 0 stevel } 2580 0 stevel pointer = cur[IPOPT_OFFSET]; 2581 0 stevel if (pointer - 1 > len) { 2582 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2583 0 stevel return (opt); 2584 0 stevel } 2585 0 stevel break; 2586 0 stevel } 2587 0 stevel 2588 0 stevel /* 2589 0 stevel * Sanity check the pointer field based on the type of the 2590 0 stevel * option. 
2591 0 stevel */ 2592 0 stevel switch (opt) { 2593 0 stevel case IPOPT_RR: 2594 0 stevel case IPOPT_SSRR: 2595 0 stevel case IPOPT_LSRR: 2596 0 stevel if (pointer < IPOPT_MINOFF_SR) 2597 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2598 0 stevel break; 2599 0 stevel case IPOPT_TS: 2600 0 stevel if (pointer < IPOPT_MINOFF_IT) 2601 0 stevel optp->ipoptp_flags |= IPOPTP_ERROR; 2602 0 stevel /* 2603 0 stevel * Note that the Internet Timestamp option also 2604 0 stevel * contains two four bit fields (the Overflow field, 2605 0 stevel * and the Flag field), which follow the pointer 2606 0 stevel * field. We don't need to check that these fields 2607 0 stevel * fall within the length of the option because this 2608 0 stevel * was implicitely done above. We've checked that the 2609 0 stevel * pointer value is at least IPOPT_MINOFF_IT, and that 2610 0 stevel * it falls within the option. Since IPOPT_MINOFF_IT > 2611 0 stevel * IPOPT_POS_OV_FLG, we don't need the explicit check. 2612 0 stevel */ 2613 0 stevel ASSERT(len > IPOPT_POS_OV_FLG); 2614 0 stevel break; 2615 0 stevel } 2616 0 stevel 2617 0 stevel return (opt); 2618 1676 jpk } 2619 1676 jpk 2620 1676 jpk /* 2621 1676 jpk * Use the outgoing IP header to create an IP_OPTIONS option the way 2622 1676 jpk * it was passed down from the application. 2623 11042 Erik * 2624 11042 Erik * This is compatible with BSD in that it returns 2625 11042 Erik * the reverse source route with the final destination 2626 11042 Erik * as the last entry. The first 4 bytes of the option 2627 11042 Erik * will contain the final destination. 
2628 11042 Erik */ 2629 11042 Erik int 2630 11042 Erik ip_opt_get_user(conn_t *connp, uchar_t *buf) 2631 1676 jpk { 2632 1676 jpk ipoptp_t opts; 2633 11042 Erik uchar_t *opt; 2634 1676 jpk uint8_t optval; 2635 1676 jpk uint8_t optlen; 2636 1676 jpk uint32_t len = 0; 2637 11042 Erik uchar_t *buf1 = buf; 2638 11042 Erik uint32_t totallen; 2639 11042 Erik ipaddr_t dst; 2640 11042 Erik ip_pkt_t *ipp = &connp->conn_xmit_ipp; 2641 11042 Erik 2642 11042 Erik if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) 2643 11042 Erik return (0); 2644 11042 Erik 2645 11042 Erik totallen = ipp->ipp_ipv4_options_len; 2646 11042 Erik if (totallen & 0x3) 2647 11042 Erik return (0); 2648 1676 jpk 2649 1676 jpk buf += IP_ADDR_LEN; /* Leave room for final destination */ 2650 1676 jpk len += IP_ADDR_LEN; 2651 1676 jpk bzero(buf1, IP_ADDR_LEN); 2652 1676 jpk 2653 11042 Erik dst = connp->conn_faddr_v4; 2654 11042 Erik 2655 11042 Erik for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); 2656 1676 jpk optval != IPOPT_EOL; 2657 1676 jpk optval = ipoptp_next(&opts)) { 2658 1676 jpk int off; 2659 1676 jpk 2660 1676 jpk opt = opts.ipoptp_cur; 2661 11042 Erik if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { 2662 11042 Erik break; 2663 11042 Erik } 2664 1676 jpk optlen = opts.ipoptp_len; 2665 11042 Erik 2666 1676 jpk switch (optval) { 2667 1676 jpk case IPOPT_SSRR: 2668 1676 jpk case IPOPT_LSRR: 2669 1676 jpk 2670 1676 jpk /* 2671 11042 Erik * Insert destination as the first entry in the source 2672 1676 jpk * route and move down the entries on step. 2673 1676 jpk * The last entry gets placed at buf1. 
2674 1676 jpk */ 2675 1676 jpk buf[IPOPT_OPTVAL] = optval; 2676 1676 jpk buf[IPOPT_OLEN] = optlen; 2677 1676 jpk buf[IPOPT_OFFSET] = optlen; 2678 1676 jpk 2679 1676 jpk off = optlen - IP_ADDR_LEN; 2680 1676 jpk if (off < 0) { 2681 1676 jpk /* No entries in source route */ 2682 1676 jpk break; 2683 1676 jpk } 2684 11042 Erik /* Last entry in source route if not already set */ 2685 11042 Erik if (dst == INADDR_ANY) 2686 11042 Erik bcopy(opt + off, buf1, IP_ADDR_LEN); 2687 1676 jpk off -= IP_ADDR_LEN; 2688 1676 jpk 2689 1676 jpk while (off > 0) { 2690 1676 jpk bcopy(opt + off, 2691 1676 jpk buf + off + IP_ADDR_LEN, 2692 1676 jpk IP_ADDR_LEN); 2693 1676 jpk off -= IP_ADDR_LEN; 2694 1676 jpk } 2695 1676 jpk /* ipha_dst into first slot */ 2696 11042 Erik bcopy(&dst, buf + off + IP_ADDR_LEN, 2697 1676 jpk IP_ADDR_LEN); 2698 1676 jpk buf += optlen; 2699 1676 jpk len += optlen; 2700 1676 jpk break; 2701 1676 jpk 2702 1676 jpk default: 2703 1676 jpk bcopy(opt, buf, optlen); 2704 1676 jpk buf += optlen; 2705 1676 jpk len += optlen; 2706 1676 jpk break; 2707 1676 jpk } 2708 1676 jpk } 2709 1676 jpk done: 2710 1676 jpk /* Pad the resulting options */ 2711 1676 jpk while (len & 0x3) { 2712 1676 jpk *buf++ = IPOPT_EOL; 2713 1676 jpk len++; 2714 1676 jpk } 2715 1676 jpk return (len); 2716 0 stevel } 2717 0 stevel 2718 0 stevel /* 2719 0 stevel * Update any record route or timestamp options to include this host. 2720 0 stevel * Reverse any source route option. 2721 0 stevel * This routine assumes that the options are well formed i.e. that they 2722 0 stevel * have already been checked. 
2723 0 stevel */ 2724 0 stevel static void 2725 0 stevel icmp_options_update(ipha_t *ipha) 2726 0 stevel { 2727 0 stevel ipoptp_t opts; 2728 0 stevel uchar_t *opt; 2729 0 stevel uint8_t optval; 2730 0 stevel ipaddr_t src; /* Our local address */ 2731 0 stevel ipaddr_t dst; 2732 0 stevel 2733 0 stevel ip2dbg(("icmp_options_update\n")); 2734 0 stevel src = ipha->ipha_src; 2735 0 stevel dst = ipha->ipha_dst; 2736 0 stevel 2737 0 stevel for (optval = ipoptp_first(&opts, ipha); 2738 0 stevel optval != IPOPT_EOL; 2739 0 stevel optval = ipoptp_next(&opts)) { 2740 0 stevel ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); 2741 0 stevel opt = opts.ipoptp_cur; 2742 0 stevel ip2dbg(("icmp_options_update: opt %d, len %d\n", 2743 0 stevel optval, opts.ipoptp_len)); 2744 0 stevel switch (optval) { 2745 0 stevel int off1, off2; 2746 0 stevel case IPOPT_SSRR: 2747 0 stevel case IPOPT_LSRR: 2748 0 stevel /* 2749 0 stevel * Reverse the source route. The first entry 2750 0 stevel * should be the next to last one in the current 2751 0 stevel * source route (the last entry is our address). 2752 0 stevel * The last entry should be the final destination. 
2753 0 stevel */ 2754 0 stevel off1 = IPOPT_MINOFF_SR - 1; 2755 0 stevel off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 2756 0 stevel if (off2 < 0) { 2757 0 stevel /* No entries in source route */ 2758 0 stevel ip1dbg(( 2759 0 stevel "icmp_options_update: bad src route\n")); 2760 0 stevel break; 2761 0 stevel } 2762 0 stevel bcopy((char *)opt + off2, &dst, IP_ADDR_LEN); 2763 0 stevel bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN); 2764 0 stevel bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN); 2765 0 stevel off2 -= IP_ADDR_LEN; 2766 0 stevel 2767 0 stevel while (off1 < off2) { 2768 0 stevel bcopy((char *)opt + off1, &src, IP_ADDR_LEN); 2769 0 stevel bcopy((char *)opt + off2, (char *)opt + off1, 2770 0 stevel IP_ADDR_LEN); 2771 0 stevel bcopy(&src, (char *)opt + off2, IP_ADDR_LEN); 2772 0 stevel off1 += IP_ADDR_LEN; 2773 0 stevel off2 -= IP_ADDR_LEN; 2774 0 stevel } 2775 0 stevel opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 2776 0 stevel break; 2777 0 stevel } 2778 0 stevel } 2779 0 stevel } 2780 0 stevel 2781 0 stevel /* 2782 0 stevel * Process received ICMP Redirect messages. 2783 11042 Erik * Assumes the caller has verified that the headers are in the pulled up mblk. 2784 11042 Erik * Consumes mp. 2785 11042 Erik */ 2786 11042 Erik static void 2787 11042 Erik icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira) 2788 11042 Erik { 2789 11042 Erik ire_t *ire, *nire; 2790 11042 Erik ire_t *prev_ire; 2791 11042 Erik ipaddr_t src, dst, gateway; 2792 11042 Erik ip_stack_t *ipst = ira->ira_ill->ill_ipst; 2793 11042 Erik ipha_t *inner_ipha; /* Inner IP header */ 2794 11042 Erik 2795 11042 Erik /* Caller already pulled up everything. */ 2796 11042 Erik inner_ipha = (ipha_t *)&icmph[1]; 2797 0 stevel src = ipha->ipha_src; 2798 11042 Erik dst = inner_ipha->ipha_dst; 2799 0 stevel gateway = icmph->icmph_rd_gateway; 2800 0 stevel /* Make sure the new gateway is reachable somehow. 
*/ 2801 11042 Erik ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL, 2802 11042 Erik ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL); 2803 0 stevel /* 2804 0 stevel * Make sure we had a route for the dest in question and that 2805 0 stevel * that route was pointing to the old gateway (the source of the 2806 0 stevel * redirect packet.) 2807 11042 Erik * Note: this merely says that there is some IRE which matches that 2808 11042 Erik * gateway; not that the longest match matches that gateway. 2809 11042 Erik */ 2810 11042 Erik prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES, 2811 11042 Erik NULL, MATCH_IRE_GW, 0, ipst, NULL); 2812 0 stevel /* 2813 0 stevel * Check that 2814 0 stevel * the redirect was not from ourselves 2815 0 stevel * the new gateway and the old gateway are directly reachable 2816 0 stevel */ 2817 11042 Erik if (prev_ire == NULL || ire == NULL || 2818 11042 Erik (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || 2819 11042 Erik (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 2820 11042 Erik !(ire->ire_type & IRE_IF_ALL)) { 2821 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); 2822 11042 Erik ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill); 2823 0 stevel freemsg(mp); 2824 0 stevel if (ire != NULL) 2825 0 stevel ire_refrele(ire); 2826 0 stevel if (prev_ire != NULL) 2827 0 stevel ire_refrele(prev_ire); 2828 0 stevel return; 2829 0 stevel } 2830 0 stevel 2831 0 stevel ire_refrele(prev_ire); 2832 11042 Erik ire_refrele(ire); 2833 11042 Erik 2834 0 stevel /* 2835 0 stevel * TODO: more precise handling for cases 0, 2, 3, the latter two 2836 0 stevel * require TOS routing 2837 0 stevel */ 2838 0 stevel switch (icmph->icmph_code) { 2839 0 stevel case 0: 2840 0 stevel case 1: 2841 0 stevel /* TODO: TOS specificity for cases 2 and 3 */ 2842 0 stevel case 2: 2843 0 stevel case 3: 2844 0 stevel break; 2845 0 stevel default: 2846 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); 2847 
11042 Erik ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill); 2848 11042 Erik freemsg(mp); 2849 0 stevel return; 2850 0 stevel } 2851 0 stevel /* 2852 0 stevel * Create a Route Association. This will allow us to remember that 2853 0 stevel * someone we believe told us to use the particular gateway. 2854 0 stevel */ 2855 0 stevel ire = ire_create( 2856 4459 kcpoon (uchar_t *)&dst, /* dest addr */ 2857 4459 kcpoon (uchar_t *)&ip_g_all_ones, /* mask */ 2858 4459 kcpoon (uchar_t *)&gateway, /* gateway addr */ 2859 4459 kcpoon IRE_HOST, 2860 11042 Erik NULL, /* ill */ 2861 11042 Erik ALL_ZONES, 2862 4459 kcpoon (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 2863 4714 sowmini NULL, /* tsol_gc_t */ 2864 4459 kcpoon ipst); 2865 0 stevel 2866 0 stevel if (ire == NULL) { 2867 0 stevel freemsg(mp); 2868 11042 Erik return; 2869 11042 Erik } 2870 11042 Erik nire = ire_add(ire); 2871 11042 Erik /* Check if it was a duplicate entry */ 2872 11042 Erik if (nire != NULL && nire != ire) { 2873 11042 Erik ASSERT(nire->ire_identical_ref > 1); 2874 11042 Erik ire_delete(nire); 2875 11042 Erik ire_refrele(nire); 2876 11042 Erik nire = NULL; 2877 11042 Erik } 2878 11042 Erik ire = nire; 2879 11042 Erik if (ire != NULL) { 2880 11042 Erik ire_refrele(ire); /* Held in ire_add */ 2881 11042 Erik 2882 0 stevel /* tell routing sockets that we received a redirect */ 2883 0 stevel ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src, 2884 0 stevel (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, 2885 3448 dh155122 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst); 2886 0 stevel } 2887 0 stevel 2888 0 stevel /* 2889 3004 dd193516 * Delete any existing IRE_HOST type redirect ires for this destination. 2890 0 stevel * This together with the added IRE has the effect of 2891 0 stevel * modifying an existing redirect. 
2892 0 stevel */ 2893 11042 Erik prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL, 2894 11042 Erik ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL); 2895 3004 dd193516 if (prev_ire != NULL) { 2896 3004 dd193516 if (prev_ire ->ire_flags & RTF_DYNAMIC) 2897 3004 dd193516 ire_delete(prev_ire); 2898 0 stevel ire_refrele(prev_ire); 2899 0 stevel } 2900 0 stevel 2901 0 stevel freemsg(mp); 2902 0 stevel } 2903 0 stevel 2904 0 stevel /* 2905 0 stevel * Generate an ICMP parameter problem message. 2906 11042 Erik * When called from ip_output side a minimal ip_recv_attr_t needs to be 2907 11042 Erik * constructed by the caller. 2908 11042 Erik */ 2909 11042 Erik static void 2910 11042 Erik icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira) 2911 0 stevel { 2912 0 stevel icmph_t icmph; 2913 11042 Erik ip_stack_t *ipst = ira->ira_ill->ill_ipst; 2914 11042 Erik 2915 11042 Erik mp = icmp_pkt_err_ok(mp, ira); 2916 11042 Erik if (mp == NULL) 2917 11042 Erik return; 2918 0 stevel 2919 0 stevel bzero(&icmph, sizeof (icmph_t)); 2920 0 stevel icmph.icmph_type = ICMP_PARAM_PROBLEM; 2921 0 stevel icmph.icmph_pp_ptr = ptr; 2922 3448 dh155122 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs); 2923 11042 Erik icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); 2924 0 stevel } 2925 0 stevel 2926 0 stevel /* 2927 0 stevel * Build and ship an IPv4 ICMP message using the packet data in mp, and 2928 0 stevel * the ICMP header pointed to by "stuff". (May be called as writer.) 2929 0 stevel * Note: assumes that icmp_pkt_err_ok has been called to verify that 2930 0 stevel * an icmp error packet can be sent. 2931 0 stevel * Assigns an appropriate source address to the packet. If ipha_dst is 2932 11042 Erik * one of our addresses use it for source. Otherwise let ip_output_simple 2933 11042 Erik * pick the source address. 
2934 11042 Erik */ 2935 11042 Erik static void 2936 11042 Erik icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira) 2937 0 stevel { 2938 0 stevel ipaddr_t dst; 2939 0 stevel icmph_t *icmph; 2940 0 stevel ipha_t *ipha; 2941 0 stevel uint_t len_needed; 2942 0 stevel size_t msg_len; 2943 0 stevel mblk_t *mp1; 2944 0 stevel ipaddr_t src; 2945 0 stevel ire_t *ire; 2946 11042 Erik ip_xmit_attr_t ixas; 2947 11042 Erik ip_stack_t *ipst = ira->ira_ill->ill_ipst; 2948 11042 Erik 2949 11042 Erik ipha = (ipha_t *)mp->b_rptr; 2950 11042 Erik 2951 11042 Erik bzero(&ixas, sizeof (ixas)); 2952 11042 Erik ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 2953 11042 Erik ixas.ixa_zoneid = ira->ira_zoneid; 2954 11042 Erik ixas.ixa_ifindex = 0; 2955 11042 Erik ixas.ixa_ipst = ipst; 2956 11042 Erik ixas.ixa_cred = kcred; 2957 11042 Erik ixas.ixa_cpid = NOPID; 2958 11042 Erik ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ 2959 11042 Erik ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 2960 11042 Erik 2961 11042 Erik if (ira->ira_flags & IRAF_IPSEC_SECURE) { 2962 11042 Erik /* 2963 11042 Erik * Apply IPsec based on how IPsec was applied to 2964 11042 Erik * the packet that had the error. 2965 0 stevel * 2966 11042 Erik * If it was an outbound packet that caused the ICMP 2967 11042 Erik * error, then the caller will have setup the IRA 2968 11042 Erik * appropriately. 2969 11042 Erik */ 2970 11042 Erik if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { 2971 11042 Erik BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 2972 11042 Erik /* Note: mp already consumed and ip_drop_packet done */ 2973 11042 Erik return; 2974 11042 Erik } 2975 0 stevel } else { 2976 0 stevel /* 2977 0 stevel * This is in clear. The icmp message we are building 2978 11042 Erik * here should go out in clear, independent of our policy. 
2979 11042 Erik */ 2980 11042 Erik ixas.ixa_flags |= IXAF_NO_IPSEC; 2981 0 stevel } 2982 0 stevel 2983 0 stevel /* Remember our eventual destination */ 2984 0 stevel dst = ipha->ipha_src; 2985 0 stevel 2986 11042 Erik /* 2987 11042 Erik * If the packet was for one of our unicast addresses, make 2988 11042 Erik * sure we respond with that as the source. Otherwise 2989 11042 Erik * have ip_output_simple pick the source address. 2990 11042 Erik */ 2991 11042 Erik ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 2992 11042 Erik (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL, 2993 11042 Erik MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL); 2994 11042 Erik if (ire != NULL) { 2995 11042 Erik ire_refrele(ire); 2996 0 stevel src = ipha->ipha_dst; 2997 4823 seb } else { 2998 11042 Erik src = INADDR_ANY; 2999 11042 Erik ixas.ixa_flags |= IXAF_SET_SOURCE; 3000 11042 Erik } 3001 0 stevel 3002 0 stevel /* 3003 4564 wy83408 * Check if we can send back more then 8 bytes in addition to 3004 4564 wy83408 * the IP header. We try to send 64 bytes of data and the internal 3005 4564 wy83408 * header in the special cases of ipv4 encapsulated ipv4 or ipv6. 3006 0 stevel */ 3007 1676 jpk len_needed = IPH_HDR_LENGTH(ipha); 3008 4564 wy83408 if (ipha->ipha_protocol == IPPROTO_ENCAP || 3009 4564 wy83408 ipha->ipha_protocol == IPPROTO_IPV6) { 3010 4564 wy83408 if (!pullupmsg(mp, -1)) { 3011 4564 wy83408 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 3012