Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/dlpi.h>
     31 #include <sys/stropts.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/strsubr.h>
     34 #include <sys/strlog.h>
     35 #include <sys/strsun.h>
     36 #include <sys/zone.h>
     37 #define	_SUN_TPI_VERSION 2
     38 #include <sys/tihdr.h>
     39 #include <sys/xti_inet.h>
     40 #include <sys/ddi.h>
     41 #include <sys/suntpi.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/debug.h>
     44 #include <sys/kobj.h>
     45 #include <sys/modctl.h>
     46 #include <sys/atomic.h>
     47 #include <sys/policy.h>
     48 #include <sys/priv.h>
     49 #include <sys/taskq.h>
     50 
     51 #include <sys/systm.h>
     52 #include <sys/param.h>
     53 #include <sys/kmem.h>
     54 #include <sys/sdt.h>
     55 #include <sys/socket.h>
     56 #include <sys/vtrace.h>
     57 #include <sys/isa_defs.h>
     58 #include <sys/mac.h>
     59 #include <net/if.h>
     60 #include <net/if_arp.h>
     61 #include <net/route.h>
     62 #include <sys/sockio.h>
     63 #include <netinet/in.h>
     64 #include <net/if_dl.h>
     65 
     66 #include <inet/common.h>
     67 #include <inet/mi.h>
     68 #include <inet/mib2.h>
     69 #include <inet/nd.h>
     70 #include <inet/arp.h>
     71 #include <inet/snmpcom.h>
     72 #include <inet/optcom.h>
     73 #include <inet/kstatcom.h>
     74 
     75 #include <netinet/igmp_var.h>
     76 #include <netinet/ip6.h>
     77 #include <netinet/icmp6.h>
     78 #include <netinet/sctp.h>
     79 
     80 #include <inet/ip.h>
     81 #include <inet/ip_impl.h>
     82 #include <inet/ip6.h>
     83 #include <inet/ip6_asp.h>
     84 #include <inet/tcp.h>
     85 #include <inet/tcp_impl.h>
     86 #include <inet/ip_multi.h>
     87 #include <inet/ip_if.h>
     88 #include <inet/ip_ire.h>
     89 #include <inet/ip_ftable.h>
     90 #include <inet/ip_rts.h>
     91 #include <inet/ip_ndp.h>
     92 #include <inet/ip_listutils.h>
     93 #include <netinet/igmp.h>
     94 #include <netinet/ip_mroute.h>
     95 #include <inet/ipp_common.h>
     96 
     97 #include <net/pfkeyv2.h>
     98 #include <inet/sadb.h>
     99 #include <inet/ipsec_impl.h>
    100 #include <inet/iptun/iptun_impl.h>
    101 #include <inet/ipdrop.h>
    102 #include <inet/ip_netinfo.h>
    103 #include <inet/ilb_ip.h>
    104 
    105 #include <sys/ethernet.h>
    106 #include <net/if_types.h>
    107 #include <sys/cpuvar.h>
    108 
    109 #include <ipp/ipp.h>
    110 #include <ipp/ipp_impl.h>
    111 #include <ipp/ipgpc/ipgpc.h>
    112 
    113 #include <sys/pattr.h>
    114 #include <inet/ipclassifier.h>
    115 #include <inet/sctp_ip.h>
    116 #include <inet/sctp/sctp_impl.h>
    117 #include <inet/udp_impl.h>
    118 #include <inet/rawip_impl.h>
    119 #include <inet/rts_impl.h>
    120 
    121 #include <sys/tsol/label.h>
    122 #include <sys/tsol/tnet.h>
    123 
    124 #include <sys/squeue_impl.h>
    125 #include <inet/ip_arp.h>
    126 
    127 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    128 
    129 /*
    130  * Values for the squeue switch, ip_squeue_enter (default IP_SQUEUE_ENTER):
    131  * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
    132  * IP_SQUEUE_ENTER: SQ_PROCESS
    133  * IP_SQUEUE_FILL: SQ_FILL
    134  */
    135 int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
    136 
    137 int ip_squeue_flag;
    138 
    139 /*
    140  * Settable in /etc/system
    141  */
    142 int ip_poll_normal_ms = 100;
    143 int ip_poll_normal_ticks = 0;
    144 int ip_modclose_ackwait_ms = 3000;
    145 
    146 /*
    147  * Debug-support globals. It would be nice to have these present only in
    148  * DEBUG systems, but the current design of the global symbol checking logic
    149  * requires them to be unconditionally present.
    150  */
    151 uint_t ip_thread_data;			/* TSD key for debug support */
    152 krwlock_t ip_thread_rwlock;
    153 list_t	ip_thread_list;
    154 
    155 /*
    156  * Head/tail pair for a linked list of msgblks; used by the ip_snmp_ functions.
    157  */
    158 
    159 struct listptr_s {
    160 	mblk_t	*lp_head;	/* pointer to the first mblk of the list */
    161 	mblk_t	*lp_tail;	/* pointer to the last mblk of the list */
    162 };
    163 
    164 typedef struct listptr_s listptr_t;
    165 
    166 /*
    167  * Carries the lists of return data (one listptr_t per table) for
    168  * ip_snmp_get_mib2_ip_route_media and ip_snmp_get_mib2_ip6_route_media.
    169  */
    170 typedef struct iproutedata_s {
    171 	uint_t		ird_idx;
    172 	uint_t		ird_flags;	/* IRD_* flags; see below */
    173 	listptr_t	ird_route;	/* ipRouteEntryTable */
    174 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    175 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    176 } iproutedata_t;
    177 
    178 /* ird_flags: include ire_testhidden and IRE_IF_CLONE routes */
    179 #define	IRD_REPORT_ALL	0x01
    180 
    181 /*
    182  * Cluster specific hooks. These should be NULL when not booted as a cluster.
    183  */
    184 
    185 /*
    186  * Hook functions to enable cluster networking
    187  * On non-clustered systems these vectors must always be NULL.
    188  *
    189  * Hook function to check whether the specified IP address is a shared IP
    190  * address in the cluster.
    191  *
    192  */
    193 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    194     sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
    195 
    196 /*
    197  * Hook function to generate cluster wide ip fragment identifier
    198  */
    199 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    200     sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    201     void *args) = NULL;
    202 
    203 /*
    204  * Hook function to generate cluster wide SPI.
    205  */
    206 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    207     void *) = NULL;
    208 
    209 /*
    210  * Hook function to verify if the SPI is already utilized.
    211  */
    212 
    213 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    214 
    215 /*
    216  * Hook function to delete the SPI from the cluster wide repository.
    217  */
    218 
    219 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    220 
    221 /*
    222  * Hook function to inform the cluster when packet received on an IDLE SA
    223  */
    224 
    225 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    226     in6_addr_t, in6_addr_t, void *) = NULL;
    227 
    228 /*
    229  * Synchronization notes:
    230  *
    231  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    232  * MT level protection given by STREAMS. IP uses a combination of its own
    233  * internal serialization mechanism and standard Solaris locking techniques.
    234  * The internal serialization is per phyint.  This is used to serialize
    235  * plumbing operations, IPMP operations, most set ioctls, etc.
    236  *
    237  * Plumbing is a long sequence of operations involving message
    238  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    239  * involved in plumbing operations. A natural model is to serialize these
    240  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    241  * parallel without any interference. But various set ioctls on hme0 are best
    242  * serialized, along with IPMP operations and processing of DLPI control
    243  * messages received from drivers on a per phyint basis. This serialization is
    244  * provided by the ipsq_t and primitives operating on this. Details can
    245  * be found in ip_if.c above the core primitives operating on ipsq_t.
    246  *
    247  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    248  * Similarly, lookup of an ire by a thread also returns a refheld ire.
    249  * In addition ipif's and ill's referenced by the ire are also indirectly
    250  * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
    251  * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
    252  * address of an ipif has to go through the ipsq_t. This ensures that only
    253  * one such exclusive operation proceeds at any time on the ipif. It then
    254  * waits for all refcnts
    255  * associated with this ipif to come down to zero. The address is changed
    256  * only after the ipif has been quiesced. Then the ipif is brought up again.
    257  * More details are described above the comment in ip_sioctl_flags.
    258  *
    259  * Packet processing is based mostly on IREs and are fully multi-threaded
    260  * using standard Solaris MT techniques.
    261  *
    262  * There are explicit locks in IP to handle:
    263  * - The ip_g_head list maintained by mi_open_link() and friends.
    264  *
    265  * - The reassembly data structures (one lock per hash bucket)
    266  *
    267  * - conn_lock is meant to protect conn_t fields. The fields actually
    268  *   protected by conn_lock are documented in the conn_t definition.
    269  *
    270  * - ire_lock to protect some of the fields of the ire, IRE tables
    271  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    272  *
    273  * - ndp_g_lock and ncec_lock for protecting NCEs.
    274  *
    275  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    276  *
    277  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    278  *	* The AVL tree based global multi list of all ills.
    279  *	* The linked list of all ipifs of an ill
    280  *	* The <ipsq-xop> mapping
    281  *	* <ill-phyint> association
    282  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    283  *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
    284  *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
    285  *   writer for the actual duration of the insertion/deletion/change.
    286  *
    287  * - ill_lock:  This is a per ill mutex.
    288  *   It protects some members of the ill_t struct; see ip.h for details.
    289  *   It also protects the <ill-phyint> assoc.
    290  *   It also protects the list of ipifs hanging off the ill.
    291  *
    292  * - ipsq_lock: This is a per ipsq_t mutex lock.
    293  *   This protects some members of the ipsq_t struct; see ip.h for details.
    294  *   It also protects the <ipsq-ipxop> mapping
    295  *
    296  * - ipx_lock: This is a per ipxop_t mutex lock.
    297  *   This protects some members of the ipxop_t struct; see ip.h for details.
    298  *
    299  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    300  *   phyint_flags
    301  *
    302  * - ip_g_nd_lock: This is a global reader/writer lock.
    303  *   Any call to nd_load to load a new parameter to the ND table must hold the
    304  *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
    305  *   as reader.
    306  *
    307  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    308  *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
    309  *   uniqueness check also done atomically.
    310  *
    311  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    312  *   group list linked by ill_usesrc_grp_next. It also protects the
    313  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    314  *   group is being added or deleted.  This lock is taken as a reader when
    315  *   walking the list/group(eg: to get the number of members in a usesrc group).
    316  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    317  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    318  *   example, it is not necessary to take this lock in the initial portion
    319  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
    320  *   operations are executed exclusively and that ensures that the "usesrc
    321  *   group state" cannot change. The "usesrc group state" change can happen
    322  *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
    323  *
    324  * Changing <ill-phyint>, <ipsq-xop> associations:
    325  *
    326  * To change the <ill-phyint> association, the ill_g_lock must be held
    327  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    328  * must be held.
    329  *
    330  * To change the <ipsq-xop> association, the ill_g_lock must be held as
    331  * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
    332  * This is only done when ills are added or removed from IPMP groups.
    333  *
    334  * To add or delete an ipif from the list of ipifs hanging off the ill,
    335  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    336  * a writer on the associated ipsq.
    337  *
    338  * To add or delete an ill to the system, the ill_g_lock must be held as
    339  * writer and the thread must be a writer on the associated ipsq.
    340  *
    341  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    342  * must be a writer on the associated ipsq.
    343  *
    344  * Lock hierarchy
    345  *
    346  * Some lock hierarchy scenarios are listed below.
    347  *
    348  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
    349  * ill_g_lock -> ill_lock(s) -> phyint_lock
    350  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
    351  * ill_g_lock -> ip_addr_avail_lock
    352  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    353  * ill_g_lock -> ip_g_nd_lock
    354  * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
    355  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
    356  * arl_lock -> ill_lock
    357  * ips_ire_dep_lock -> irb_lock
    358  *
    359  * When more than 1 ill lock is needed to be held, all ill lock addresses
    360  * are sorted on address and locked starting from highest addressed lock
    361  * downward.
    362  *
    363  * Multicast scenarios
    364  * ips_ill_g_lock -> ill_mcast_lock
    365  * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
    366  * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
    367  * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
    368  * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
    369  * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
    370  *
    371  * IPsec scenarios
    372  *
    373  * ipsa_lock -> ill_g_lock -> ill_lock
    374  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    375  *
    376  * Trusted Solaris scenarios
    377  *
    378  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    379  * igsa_lock -> gcdb_lock
    380  * gcgrp_rwlock -> ire_lock
    381  * gcgrp_rwlock -> gcdb_lock
    382  *
    383  * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
    384  *
    385  * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
    386  * sq_lock -> conn_lock -> QLOCK(q)
    387  * ill_lock -> ft_lock -> fe_lock
    388  *
    389  * Routing/forwarding table locking notes:
    390  *
    391  * Lock acquisition order: Radix tree lock, irb_lock.
    392  * Requirements:
    393  * i.  Walker must not hold any locks during the walker callback.
    394  * ii  Walker must not see a truncated tree during the walk because of any node
    395  *     deletion.
    396  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    397  *     in many places in the code to walk the irb list. Thus even if all the
    398  *     ires in a bucket have been deleted, we still can't free the radix node
    399  *     until the ires have actually been inactive'd (freed).
    400  *
    401  * Tree traversal - Need to hold the global tree lock in read mode.
    402  * Before dropping the global tree lock, need to increment the ire_refcnt
    403  * to ensure that the radix node can't be deleted.
    404  *
    405  * Tree add - Need to hold the global tree lock in write mode to add a
    406  * radix node. To prevent the node from being deleted, increment the
    407  * irb_refcnt, after the node is added to the tree. The ire itself is
    408  * added later while holding the irb_lock, but not the tree lock.
    409  *
    410  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    411  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    412  * must be zero.
    413  *
    414  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    415  * global tree lock (read mode) for traversal.
    416  *
    417  * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
    418  * hence we will acquire irb_lock while holding ips_ire_dep_lock.
    419  *
    420  * IPsec notes :
    421  *
    422  * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
    423  * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
    424  * ip_xmit_attr_t has the
    425  * information used by the IPsec code for applying the right level of
    426  * protection. The information initialized by IP in the ip_xmit_attr_t
    427  * is determined by the per-socket policy or global policy in the system.
    428  * For inbound datagrams, the ip_recv_attr_t
    429  * starts out with nothing in it. It gets filled
    430  * with the right information if it goes through the AH/ESP code, which
    431  * happens if the incoming packet is secure. The information initialized
    432  * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
    433  * the policy requirements needed by per-socket policy or global policy
    434  * are met or not.
    435  *
    436  * For fully connected sockets i.e dst, src [addr, port] is known,
    437  * conn_policy_cached is set indicating that policy has been cached.
    438  * conn_in_enforce_policy may or may not be set depending on whether
    439  * there is a global policy match or per-socket policy match.
    440  * Policy inheriting happens in ip_policy_set once the destination is known.
    441  * Once the right policy is set on the conn_t, policy cannot change for
    442  * this socket. This makes life simpler for TCP (UDP ?) where
    443  * re-transmissions go out with the same policy. For symmetry, policy
    444  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    445  * it also implies that policy is latched i.e policy cannot change
    446  * on these sockets. As we have the right policy on the conn, we don't
    447  * have to lookup global policy for every outbound and inbound datagram
    448  * and thus serving as an optimization. Note that a global policy change
    449  * does not affect fully connected sockets if they have policy. If fully
    450  * connected sockets did not have any policy associated with it, global
    451  * policy change may affect them.
    452  *
    453  * IP Flow control notes:
    454  * ---------------------
    455  * Non-TCP streams are flow controlled by IP. The way this is accomplished
    456  * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
    457  * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
    458  * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
    459  * functions.
    460  *
    461  * Per Tx ring udp flow control:
    462  * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
    463  * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
    464  *
    465  * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
    466  * To achieve best performance, outgoing traffic needs to be fanned out among
    467  * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
    468  * traffic out of the NIC and it takes a fanout hint. UDP connections pass
    469  * the address of connp as fanout hint to mac_tx(). Under flow controlled
    470  * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
    471  * cookie points to a specific Tx ring that is blocked. The cookie is used to
    472  * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t points
    473  * to drain_lists (idl_t's). These drain lists will store the blocked UDP
    474  * connp's. The drain list is not a single list but a configurable number of
    475  * lists.
    476  *
    477  * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
    478  * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
    479  * which is equal to 128. This array in turn contains a pointer to idl_t[],
    480  * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
    481  * list will point to the list of connp's that are flow controlled.
    482  *
    483  *                      ---------------   -------   -------   -------
    484  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    485  *                   |  ---------------   -------   -------   -------
    486  *                   |  ---------------   -------   -------   -------
    487  *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    488  * ----------------  |  ---------------   -------   -------   -------
    489  * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
    490  * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
    491  *                   |  ---------------   -------   -------   -------
    492  *                   .        .              .         .         .
    493  *                   |  ---------------   -------   -------   -------
    494  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    495  *                      ---------------   -------   -------   -------
    496  *                      ---------------   -------   -------   -------
    497  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    498  *                   |  ---------------   -------   -------   -------
    499  *                   |  ---------------   -------   -------   -------
    500  * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    501  * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
    502  * ----------------  |        .              .         .         .
    503  *                   |  ---------------   -------   -------   -------
    504  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    505  *                      ---------------   -------   -------   -------
    506  *     .....
    507  * ----------------
    508  * |idl_tx_list[n]|-> ...
    509  * ----------------
    510  *
    511  * When mac_tx() returns a cookie, the cookie is used to hash into a
    512  * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is
    513  * called passing idl_tx_list. The connp gets inserted in a drain list
    514  * pointed to by idl_tx_list. conn_drain_list() asserts flow control for
    515  * the sockets (non stream based) and sets QFULL condition on the conn_wq
    516  * of streams sockets, or the su_txqfull for non-streams sockets.
    517  * connp->conn_direct_blocked will be set to indicate the blocked
    518  * condition.
    519  *
    520  * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
    521  * A cookie is passed in the call to ill_flow_enable() that identifies the
    522  * blocked Tx ring. This cookie is used to get to the idl_tx_list that
    523  * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
    524  * and goes through each conn in the drain list and calls conn_idl_remove
    525  * for the conn to clear the qfull condition for the conn, as well as to
    526  * remove the conn from the idl list. In addition, streams based sockets
    527  * will have the conn_wq enabled, causing ip_wsrv to run for the
    528  * conn. ip_wsrv drains the queued messages, and removes the conn from the
    529  * drain list, if all messages were drained. It also notifies the
    530  * conn_upcalls for the conn to signal that flow-control has opened up.
    531  *
    532  * In reality the drain list is not a single list, but a configurable number
    533  * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for
    534  * each conn in the list. conn_drain_insert and conn_drain_tail are the only
    535  * functions that manipulate this drain list. conn_drain_insert is called
    536  * from the protocol layer when conn_ip_output returns EWOULDBLOCK
    537  * (as opposed to from ip_wsrv context for STREAMS
    538  * case -- see below). The synchronization between drain insertion and flow
    539  * control wakeup is handled by using idl_txl->txl_lock.
    540  *
    541  * Flow control using STREAMS:
    542  * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
    543  * is used. On the send side, if the packet cannot be sent down to the
    544  * driver by IP, because of a canput failure, ip_xmit drops the packet
    545  * and returns EWOULDBLOCK to the caller, who may then invoke
    546  * ixa_check_drain_insert to insert the conn on the 0'th drain list.
    547  * When ip_wsrv runs on the ill_wq because flow control has been relieved, the
    548  * blocked conns in the 0'th drain list are drained as with the
    549  * non-STREAMS case.
    550  *
    551  * In both the STREAMS and non-STREAMS case, the sockfs upcall to set
    552  * qfull is done when the conn is inserted into the drain list
    553  * (conn_drain_insert()) and cleared when the conn is removed from the drain
    554  * list (conn_idl_remove()).
    555  *
    556  * IPQOS notes:
    557  *
    558  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    559  * and IPQoS modules. IPPF includes hooks in IP at different control points
    560  * (callout positions) which direct packets to IPQoS modules for policy
    561  * processing. Policies, if present, are global.
    562  *
    563  * The callout positions are located in the following paths:
    564  *		o local_in (packets destined for this host)
    565  *		o local_out (packets originating from this host)
    566  *		o fwd_in  (packets forwarded by this m/c - inbound)
    567  *		o fwd_out (packets forwarded by this m/c - outbound)
    568  * Hooks at these callout points can be enabled/disabled using the ndd variable
    569  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    570  * By default all the callout positions are enabled.
    571  *
    572  * Outbound (local_out)
    573  * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
    574  *
    575  * Inbound (local_in)
    576  * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
    577  *
    578  * Forwarding (in and out)
    579  * Hooks are placed in ire_recv_forward_v4/v6.
    580  *
    581  * IP Policy Framework processing (IPPF processing)
    582  * Policy processing for a packet is initiated by ip_process, which ascertains
    583  * that the classifier (ipgpc) is loaded and configured, failing which the
    584  * packet resumes normal processing in IP. If the classifier is present, the
    585  * packet is acted upon by one or more IPQoS modules (action instances), per
    586  * filters configured in ipgpc and resumes normal IP processing thereafter.
    587  * An action instance can drop a packet in course of its processing.
    588  *
    589  * Zones notes:
    590  *
    591  * The partitioning rules for networking are as follows:
    592  * 1) Packets coming from a zone must have a source address belonging to that
    593  * zone.
    594  * 2) Packets coming from a zone can only be sent on a physical interface on
    595  * which the zone has an IP address.
    596  * 3) Between two zones on the same machine, packet delivery is only allowed if
    597  * there's a matching route for the destination and zone in the forwarding
    598  * table.
    599  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    600  * different zones can bind to the same port with the wildcard address
    601  * (INADDR_ANY).
    602  *
    603  * The granularity of interface partitioning is at the logical interface level.
    604  * Therefore, every zone has its own IP addresses, and incoming packets can be
    605  * attributed to a zone unambiguously. A logical interface is placed into a zone
    606  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    607  * structure. Rule (1) is implemented by modifying the source address selection
    608  * algorithm so that the list of eligible addresses is filtered based on the
    609  * sending process zone.
    610  *
    611  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    612  * across all zones, depending on their type. Here is the break-up:
    613  *
    614  * IRE type				Shared/exclusive
    615  * --------				----------------
    616  * IRE_BROADCAST			Exclusive
    617  * IRE_DEFAULT (default routes)		Shared (*)
    618  * IRE_LOCAL				Exclusive (x)
    619  * IRE_LOOPBACK				Exclusive
    620  * IRE_PREFIX (net routes)		Shared (*)
    621  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    622  * IRE_IF_RESOLVER (interface routes)	Exclusive
    623  * IRE_IF_CLONE (interface routes)	Exclusive
    624  * IRE_HOST (host routes)		Shared (*)
    625  *
    626  * (*) A zone can only use a default or off-subnet route if the gateway is
    627  * directly reachable from the zone, that is, if the gateway's address matches
    628  * one of the zone's logical interfaces.
    629  *
    630  * (x) IRE_LOCAL are handled a bit differently.
    631  * When ip_restrict_interzone_loopback is set (the default),
    632  * ire_route_recursive restricts loopback using an IRE_LOCAL
    633  * between zones to the case when L2 would have conceptually looped the packet
    634  * back, i.e. the loopback which is required since neither Ethernet drivers
    635  * nor Ethernet hardware loops them back. This is the case when the normal
    636  * routes (ignoring IREs with different zoneids) would send out the packet on
    637  * the same ill as the ill with which the IRE_LOCAL is associated.
    638  *
    639  * Multiple zones can share a common broadcast address; typically all zones
    640  * share the 255.255.255.255 address. Incoming as well as locally originated
    641  * broadcast packets must be dispatched to all the zones on the broadcast
    642  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    643  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    644  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    645  * sent to every zone that has an IRE_BROADCAST entry for the destination
    646  * address on the input ill, see ip_input_broadcast().
    647  *
    648  * Applications in different zones can join the same multicast group address.
    649  * The same logic applies for multicast as for broadcast. ip_input_multicast
    650  * dispatches packets to all zones that have members on the physical interface.
    651  */
    652 
    653 /*
    654  * Squeue Fanout flags:
    655  *	0: No fanout.
    656  *	1: Fanout across all squeues
    657  */
    658 boolean_t	ip_squeue_fanout = 0;
    659 
    660 /*
    661  * Maximum dups allowed per packet.
    662  */
    663 uint_t ip_max_frag_dups = 10;
    664 
    665 /* RFC 1122 Conformance */
    666 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
    667 
    668 #define	ILL_MAX_NAMELEN			LIFNAMSIZ
    669 
    670 static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    671 		    cred_t *credp, boolean_t isv6);
    672 static mblk_t	*ip_xmit_attach_llhdr(mblk_t *, nce_t *);
    673 
    674 static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
    675 static void	icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
    676 static void	icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
    677     ip_recv_attr_t *);
    678 static void	icmp_options_update(ipha_t *);
    679 static void	icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
    680 static void	icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
    681 static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
    682 static void	icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
    683     ip_recv_attr_t *);
    684 static void	icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
    685 static void	icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
    686     ip_recv_attr_t *);
    687 
    688 mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    689 char		*ip_dot_addr(ipaddr_t, char *);
    690 mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    691 int		ip_close(queue_t *, int);
    692 static char	*ip_dot_saddr(uchar_t *, char *);
    693 static void	ip_lrput(queue_t *, mblk_t *);
    694 ipaddr_t	ip_net_mask(ipaddr_t);
    695 char		*ip_nv_lookup(nv_t *, int);
    696 static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    697 static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    698 static boolean_t	ip_param_register(IDP *ndp, ipparam_t *, size_t,
    699     ipndp_t *, size_t);
    700 static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    701 void	ip_rput(queue_t *, mblk_t *);
    702 static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    703 		    void *dummy_arg);
    704 int		ip_snmp_get(queue_t *, mblk_t *, int);
    705 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    706 		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
    707 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    708 		    ip_stack_t *);
    709 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
    710 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    711 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    712 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    713 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    714 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    715 		    ip_stack_t *ipst);
    716 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    717 		    ip_stack_t *ipst);
    718 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    719 		    ip_stack_t *ipst);
    720 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    721 		    ip_stack_t *ipst);
    722 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    723 		    ip_stack_t *ipst);
    724 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    725 		    ip_stack_t *ipst);
    726 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    727 		    ip_stack_t *ipst);
    728 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    729 		    ip_stack_t *ipst);
    730 static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    731 		    ip_stack_t *ipst);
    732 static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    733 		    ip_stack_t *ipst);
    734 static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    735 static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    736 static int	ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
    737 static int	ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
    738 int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    739 
    740 static mblk_t	*ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
    741 		    mblk_t *);
    742 
    743 static void	conn_drain_init(ip_stack_t *);
    744 static void	conn_drain_fini(ip_stack_t *);
    745 static void	conn_drain_tail(conn_t *connp, boolean_t closing);
    746 
    747 static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
    748 static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
    749 
    750 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    751 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    752 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    753 
    754 static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    755 
    756 static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    757     const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    758     ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
    759     const in6_addr_t *);
    760 
    761 static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    762 static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    763     caddr_t, cred_t *);
    764 static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    765     caddr_t cp, cred_t *cr);
    766 static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    767     cred_t *);
    768 static int	ip_squeue_switch(int);
    769 
    770 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    771 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    772 static int	ip_kstat_update(kstat_t *kp, int rw);
    773 static void	*icmp_kstat_init(netstackid_t);
    774 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    775 static int	icmp_kstat_update(kstat_t *kp, int rw);
    776 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    777 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    778 
    779 static void	ipobs_init(ip_stack_t *);
    780 static void	ipobs_fini(ip_stack_t *);
    781 
    782 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    783 
    784 /* How long, in seconds, we allow frags to hang around. */
    785 #define	IP_FRAG_TIMEOUT		15
    786 #define	IPV6_FRAG_TIMEOUT	60
    787 
    788 static long ip_rput_pullups;
    789 int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    790 
    791 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
    792 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
    793 
    794 int	ip_debug;
    795 
    796 /*
    797  * Multirouting/CGTP stuff
    798  */
    799 int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    800 
    801 /*
    802  * Named Dispatch Parameter Table.
    803  * All of these are alterable, within the min/max values given, at run time.
    804  */
    805 static ipparam_t	lcl_param_arr[] = {
    806 	/* min	max	value	name */
    807 	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
    808 	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
    809 	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
    810 	{  0,	1,	0,	"ip_respond_to_timestamp"},
    811 	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
    812 	{  0,	1,	1,	"ip_send_redirects"},
    813 	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
    814 	{  0,	10,	0,	"ip_mrtdebug"},
    815 	{  1,	8,	3,	"ip_ire_reclaim_fraction" },
    816 	{  1,	8,	3,	"ip_nce_reclaim_fraction" },
    817 	{  1,	8,	3,	"ip_dce_reclaim_fraction" },
    818 	{  1,	255,	255,	"ip_def_ttl" },
    819 	{  0,	1,	0,	"ip_forward_src_routed"},
    820 	{  0,	256,	32,	"ip_wroff_extra" },
    821 	{  2, 999999999, 60*20, "ip_pathmtu_interval" },	/* In seconds */
    822 	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
    823 	{  0,	1,	1,	"ip_path_mtu_discovery" },
    824 	{ 68,	65535,	576,	"ip_pmtu_min" },
    825 	{  0,	1,	0,	"ip_ignore_redirect" },
    826 	{  0,	1,	0,	"ip_arp_icmp_error" },
    827 	{  1,	254,	1,	"ip_broadcast_ttl" },
    828 	{  0,	99999,	100,	"ip_icmp_err_interval" },
    829 	{  1,	99999,	10,	"ip_icmp_err_burst" },
    830 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
    831 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
    832 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
    833 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
    834 	{  0,	1,	1,	"icmp_accept_clear_messages" },
    835 	{  0,	1,	1,	"igmp_accept_clear_messages" },
    836 	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
    837 				"ip_ndp_delay_first_probe_time"},
    838 	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
    839 				"ip_ndp_max_unicast_solicit"},
    840 	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
    841 	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
    842 	{  0,	1,	0,	"ip6_forward_src_routed"},
    843 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
    844 	{  0,	1,	1,	"ip6_send_redirects"},
    845 	{  0,	1,	0,	"ip6_ignore_redirect" },
    846 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
    847 
    848 	{  0,	2,	2,	"ip_src_check" },
    849 
    850 	{  0,	999999,	1000,	"ipsec_policy_log_interval" },
    851 
    852 	{  0,	1,	1,	"pim_accept_clear_messages" },
    853 	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
    854 	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
    855 	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
    856 	{  0,	15,	0,	"ip_policy_mask" },
    857 	{  0,	2,	2,	"ip_ecmp_behavior" },
    858 	{  0,	255,	1,	"ip_multirt_ttl" },
    859 	{  0,	3600,	60,	"ip_ire_badcnt_lifetime" },	/* In seconds */
    860 	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
    861 	{  0,	1000,	1,	"ip_max_temp_defend" },
    862 	/*
    863 	 * when a conflict of an active address is detected,
    864 	 * defend up to ip_max_defend times, within any
    865 	 * ip_defend_interval span.
    866 	 */
    867 	{  0,	1000,	3,	"ip_max_defend" },
    868 	{  0,	999999,	30,	"ip_defend_interval" },
    869 	{  0,	3600000, 300000, "ip_dup_recovery" },
    870 	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
    871 	{  0,	1,	1,	"ip_lso_outbound" },
    872 	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
    873 	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
    874 #ifdef DEBUG
    875 	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
    876 #else
    877 	{  0,	0,	0,	"" },
    878 #endif
    879 	/* delay before sending first probe: */
    880 	{  0,	20000,	1000,	"arp_probe_delay" },
    881 	{  0,	20000,	100,	"arp_fastprobe_delay" },
    882 	/* interval at which DAD probes are sent: */
    883 	{ 10,	20000,	1500,	"arp_probe_interval" },
    884 	{ 10,	20000,	150,	"arp_fastprobe_interval" },
    885 	/* setting probe count to 0 will disable ARP probing for DAD. */
    886 	{  0,	20,	3,	"arp_probe_count" },
    887 	{  0,	20,	3,	"arp_fastprobe_count" },
    888 
    889 	{  0,	3600000, 15000,	"ipv4_dad_announce_interval"},
    890 	{  0,	3600000, 15000,	"ipv6_dad_announce_interval"},
    891 	/*
    892 	 * Rate limiting parameters for DAD defense used in
    893 	 * ill_defend_rate_limit():
    894 	 * defend_rate : pkts/hour permitted
    895 	 * defend_interval : time that can elapse before we send out a
    896 	 *			DAD defense.
    897 	 * defend_period: denominator for defend_rate (in seconds).
    898 	 */
    899 	{  0,	3600000, 300000,	"arp_defend_interval"},
    900 	{  0,	20000, 100,		"arp_defend_rate"},
    901 	{  0,	3600000, 300000,	"ndp_defend_interval"},
    902 	{  0,	20000, 100,		"ndp_defend_rate"},
    903 	{  5,	86400,	3600,		"arp_defend_period"},
    904 	{  5,	86400,	3600,		"ndp_defend_period"},
    905 	{  0,	1,	1,		"ipv4_icmp_return_pmtu" },
    906 	{  0,	1,	1,		"ipv6_icmp_return_pmtu" },
    907 	/*
    908 	 * publish count/interval values used to announce local addresses
    909 	 * for IPv4, IPv6.
    910 	 */
    911 	{  1,	20,	5,	"ip_arp_publish_count" },
    912 	{  1000, 20000,	2000,	"ip_arp_publish_interval" },
    913 };
    914 
    915 /*
    916  * Extended NDP table
    917  * The addresses for the first two are filled in to be ips_ip_g_forward
    918  * and ips_ipv6_forward at init time.
    919  */
    920 static ipndp_t	lcl_ndp_arr[] = {
    921 	/* getf			setf		data			name */
    922 #define	IPNDP_IP_FORWARDING_OFFSET	0
    923 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    924 	    "ip_forwarding" },
    925 #define	IPNDP_IP6_FORWARDING_OFFSET	1
    926 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    927 	    "ip6_forwarding" },
    928 	{ ip_param_generic_get, ip_input_proc_set,
    929 	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
    930 	{ ip_param_generic_get, ip_int_set,
    931 	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
    932 #define	IPNDP_CGTP_FILTER_OFFSET	4
    933 	{  ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
    934 	    "ip_cgtp_filter" },
    935 	{  ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
    936 	    "ip_debug" },
    937 };
    938 
    939 /*
    940  * Table of IP ioctls encoding the various properties of the ioctl and
    941  * indexed based on the last byte of the ioctl command. Occasionally there
    942  * is a clash, and there is more than 1 ioctl with the same last byte.
    943  * In such a case 1 ioctl is encoded in the ndx table and the remaining
    944  * ioctls are encoded in the misc table. An entry in the ndx table is
    945  * retrieved by indexing on the last byte of the ioctl command and comparing
    946  * the ioctl command with the value in the ndx table. In the event of a
    947  * mismatch the misc table is then searched sequentially for the desired
    948  * ioctl command.
    949  *
    950  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
    951  */
    952 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
    953 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    954 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    955 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    956 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    957 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    958 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    959 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    960 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    961 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    962 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    963 
    964 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
    965 			MISC_CMD, ip_siocaddrt, NULL },
    966 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
    967 			MISC_CMD, ip_siocdelrt, NULL },
    968 
    969 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    970 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    971 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
    972 			IF_CMD, ip_sioctl_get_addr, NULL },
    973 
    974 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    975 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    976 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
    977 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
    978 
    979 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
    980 			IPI_PRIV | IPI_WR,
    981 			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
    982 	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
    983 			IPI_MODOK | IPI_GET_CMD,
    984 			IF_CMD, ip_sioctl_get_flags, NULL },
    985 
    986 	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    987 	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    988 
    989 	/* copyin size cannot be coded for SIOCGIFCONF */
    990 	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
    991 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
    992 
    993 	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    994 			IF_CMD, ip_sioctl_mtu, NULL },
    995 	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD,
    996 			IF_CMD, ip_sioctl_get_mtu, NULL },
    997 	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
    998 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
    999 	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1000 			IF_CMD, ip_sioctl_brdaddr, NULL },
   1001 	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
   1002 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
   1003 	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1004 			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1005 	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
   1006 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
   1007 	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
   1008 			IF_CMD, ip_sioctl_metric, NULL },
   1009 	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1010 
   1011 	/* See 166-168 below for extended SIOC*XARP ioctls */
   1012 	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1013 			ARP_CMD, ip_sioctl_arp, NULL },
   1014 	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
   1015 			ARP_CMD, ip_sioctl_arp, NULL },
   1016 	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1017 			ARP_CMD, ip_sioctl_arp, NULL },
   1018 
   1019 	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1020 	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1021 	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1022 	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1023 	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1024 	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1025 	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1026 	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1027 	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1028 	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1029 	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1030 	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1031 	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1032 	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1033 	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1034 	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1035 	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1036 	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1037 	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1038 	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1039 	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1040 
   1041 	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
   1042 			MISC_CMD, if_unitsel, if_unitsel_restart },
   1043 
   1044 	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1045 	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1046 	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1047 	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1048 	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1049 	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1050 	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1051 	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1052 	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1053 	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1054 	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1055 	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1056 	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1057 	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1058 	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1059 	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1060 	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1061 	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1062 
   1063 	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
   1064 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1065 			IF_CMD, ip_sioctl_sifname, NULL },
   1066 
   1067 	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1068 	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1069 	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1070 	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1071 	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1072 	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1073 	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1074 	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1075 	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1076 	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1077 	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1078 	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1079 	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1080 
   1081 	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
   1082 			MISC_CMD, ip_sioctl_get_ifnum, NULL },
   1083 	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
   1084 			IF_CMD, ip_sioctl_get_muxid, NULL },
   1085 	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
   1086 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
   1087 
   1088 	/* Both if and lif variants share same func */
   1089 	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
   1090 			IF_CMD, ip_sioctl_get_lifindex, NULL },
   1091 	/* Both if and lif variants share same func */
   1092 	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
   1093 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
   1094 
   1095 	/* copyin size cannot be coded for SIOCGIFCONF */
   1096 	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
   1097 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
   1098 	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1099 	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1100 	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1101 	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1102 	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1103 	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1104 	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1105 	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1106 	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1107 	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1108 	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1109 	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1110 	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1111 	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1112 	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1113 	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1114 	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1115 
   1116 	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
   1117 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
   1118 			ip_sioctl_removeif_restart },
   1119 	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
   1120 			IPI_GET_CMD | IPI_PRIV | IPI_WR,
   1121 			LIF_CMD, ip_sioctl_addif, NULL },
   1122 #define	SIOCLIFADDR_NDX 112
   1123 	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1124 			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
   1125 	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
   1126 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
   1127 	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1128 			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
   1129 	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
   1130 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
   1131 	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
   1132 			IPI_PRIV | IPI_WR,
   1133 			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
   1134 	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
   1135 			IPI_GET_CMD | IPI_MODOK,
   1136 			LIF_CMD, ip_sioctl_get_flags, NULL },
   1137 
   1138 	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1139 	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1140 
   1141 	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1142 			ip_sioctl_get_lifconf, NULL },
   1143 	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1144 			LIF_CMD, ip_sioctl_mtu, NULL },
   1145 	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
   1146 			LIF_CMD, ip_sioctl_get_mtu, NULL },
   1147 	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
   1148 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
   1149 	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1150 			LIF_CMD, ip_sioctl_brdaddr, NULL },
   1151 	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
   1152 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
   1153 	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1154 			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1155 	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
   1156 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
   1157 	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1158 			LIF_CMD, ip_sioctl_metric, NULL },
   1159 	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
   1160 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1161 			LIF_CMD, ip_sioctl_slifname,
   1162 			ip_sioctl_slifname_restart },
   1163 
   1164 	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
   1165 			MISC_CMD, ip_sioctl_get_lifnum, NULL },
   1166 	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
   1167 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
   1168 	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
   1169 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
   1170 	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
   1171 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
   1172 	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
   1173 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
   1174 	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1175 			LIF_CMD, ip_sioctl_token, NULL },
   1176 	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
   1177 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
   1178 	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1179 			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
   1180 	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
   1181 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
   1182 	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1183 			LIF_CMD, ip_sioctl_lnkinfo, NULL },
   1184 
   1185 	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
   1186 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
   1187 	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
   1188 			LIF_CMD, ip_siocdelndp_v6, NULL },
   1189 	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
   1190 			LIF_CMD, ip_siocqueryndp_v6, NULL },
   1191 	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
   1192 			LIF_CMD, ip_siocsetndp_v6, NULL },
   1193 	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1194 			MISC_CMD, ip_sioctl_tmyaddr, NULL },
   1195 	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1196 			MISC_CMD, ip_sioctl_tonlink, NULL },
   1197 	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
   1198 			MISC_CMD, ip_sioctl_tmysite, NULL },
   1199 	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1200 	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1201 	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
   1202 	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1203 	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1204 	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1205 	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1206 
   1207 	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1208 
   1209 	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
   1210 			LIF_CMD, ip_sioctl_get_binding, NULL },
   1211 	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
   1212 			IPI_PRIV | IPI_WR,
   1213 			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
   1214 	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
   1215 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
   1216 	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
   1217 			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
   1218 
   1219 	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
   1220 	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1221 	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1222 	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1223 
   1224 	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1225 
   1226 	/* These are handled in ip_sioctl_copyin_setup itself */
   1227 	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
   1228 			MISC_CMD, NULL, NULL },
   1229 	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
   1230 			MISC_CMD, NULL, NULL },
   1231 	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
   1232 
   1233 	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1234 			ip_sioctl_get_lifconf, NULL },
   1235 
   1236 	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1237 			XARP_CMD, ip_sioctl_arp, NULL },
   1238 	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
   1239 			XARP_CMD, ip_sioctl_arp, NULL },
   1240 	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1241 			XARP_CMD, ip_sioctl_arp, NULL },
   1242 
   1243 	/* SIOCPOPSOCKFS is not handled by IP */
   1244 	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
   1245 
   1246 	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
   1247 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
   1248 	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
   1249 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
   1250 			ip_sioctl_slifzone_restart },
   1251 	/* 172-174 are SCTP ioctls and not handled by IP */
   1252 	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1253 	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1254 	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1255 	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
   1256 			IPI_GET_CMD, LIF_CMD,
   1257 			ip_sioctl_get_lifusesrc, 0 },
   1258 	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
   1259 			IPI_PRIV | IPI_WR,
   1260 			LIF_CMD, ip_sioctl_slifusesrc,
   1261 			NULL },
   1262 	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
   1263 			ip_sioctl_get_lifsrcof, NULL },
   1264 	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
   1265 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1266 	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
   1267 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1268 	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
   1269 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1270 	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
   1271 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1272 	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1273 	/* SIOCSENABLESDP is handled by SDP */
   1274 	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
   1275 	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
   1276 	/* 185 */ { IPI_DONTCARE /* SIOCGIFHWADDR */, 0, 0, 0, NULL, NULL },
   1277 	/* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
   1278 	/* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
   1279 			ip_sioctl_ilb_cmd, NULL },
   1280 };
   1281 
/*
 * Number of entries in ip_ndx_ioctl_table, derived from the table's size so
 * it tracks additions to the table automatically.
 */
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1283 
   1284 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
   1285 	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1286 	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1287 	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1288 	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1289 	{ ND_GET,	0, 0, 0, NULL, NULL },
   1290 	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1291 	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
   1292 	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
   1293 		MISC_CMD, mrt_ioctl},
   1294 	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
   1295 		MISC_CMD, mrt_ioctl},
   1296 	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
   1297 		MISC_CMD, mrt_ioctl}
   1298 };
   1299 
   1300 int ip_misc_ioctl_count =
   1301     sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1302 
/*
 * Tunable: number of drainers required.  It has no explicit initializer
 * here, and may be overridden from /etc/system.
 */
int	conn_drain_nthreads;		/* Number of drainers reqd. */
					/* Settable in /etc/system */
/*
 * IRE bucket-count and ratio variables; these are only declared here, the
 * definitions live in ip_ire.c.
 */
/* Defined in ip_ire.c */
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
   1309 
/*
 * Table pairing each IRE_* type constant with a short printable name.
 * The list ends with an all-zero entry, which presumably acts as the
 * terminator for whoever walks the table -- confirm against the users of
 * ire_nv_tbl.
 */
static nv_t	ire_nv_arr[] = {
	{ IRE_BROADCAST, "BROADCAST" },
	{ IRE_LOCAL, "LOCAL" },
	{ IRE_LOOPBACK, "LOOPBACK" },
	{ IRE_DEFAULT, "DEFAULT" },
	{ IRE_PREFIX, "PREFIX" },
	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
	{ IRE_IF_RESOLVER, "IF_RESOLV" },
	{ IRE_IF_CLONE, "IF_CLONE" },
	{ IRE_HOST, "HOST" },
	{ IRE_MULTICAST, "MULTICAST" },
	{ IRE_NOROUTE, "NOROUTE" },
	{ 0 }
};

/* Non-static handle through which other files reach ire_nv_arr. */
nv_t	*ire_nv_tbl = ire_nv_arr;
   1326 
/*
 * Simple ICMP IP Header Template.
 *
 * Only the first positional member (IP_SIMPLE_HDR_VERSION) and the seventh
 * (IPPROTO_ICMP) are non-zero; every other member, including those not
 * listed, starts out as zero.
 */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
   1331 
/*
 * STREAMS module information shared by every qinit structure defined below:
 * module id, module name, minimum and maximum packet size, and the high and
 * low water marks, all taken from the IP_MOD_* constants.
 */
struct module_info ip_mod_info = {
	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
	IP_MOD_LOWAT
};
   1336 
/*
 * Duplicate static symbols within a module confuse mdb, so we avoid the
 * problem by making the symbols here distinct from those in udp.c.
 */
   1341 
/*
 * Entry points for IP as a device and as a module.
 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
 *
 * The initializers below are positional.  NOTE(review): the slot order is
 * assumed to be the usual STREAMS struct qinit order (put, service, open,
 * close, admin, module info) -- confirm against sys/stream.h.
 */

/* Read side for /dev/ip: ip_rput as put procedure, ip_openv4 as open. */
static struct qinit iprinitv4 = {
	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
	&ip_mod_info
};

/*
 * Read side for /dev/ip6: ip_rput_v6 as put procedure, ip_openv6 as open.
 * Unlike its IPv4 counterpart this one is not static.
 */
struct qinit iprinitv6 = {
	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
	&ip_mod_info
};

/*
 * Write side shared by both streamtabs: ip_wput_nondata in the first slot
 * and ip_wsrv in the second; no open/close entries.
 */
static struct qinit ipwinit = {
	(pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
	&ip_mod_info
};

/* Third streamtab slot (shared by IPv4 and IPv6): ip_lrput. */
static struct qinit iplrinit = {
	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
	&ip_mod_info
};

/* Fourth streamtab slot (shared by IPv4 and IPv6): ip_lwput. */
static struct qinit iplwinit = {
	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
	&ip_mod_info
};

/*
 * NOTE(review): the streamtab slots are presumably read init, write init,
 * lower (mux) read init and lower (mux) write init -- confirm against
 * sys/stream.h.  The two tables differ only in their first entry.
 */

/* For AF_INET aka /dev/ip */
struct streamtab ipinfov4 = {
	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
};

/* For AF_INET6 aka /dev/ip6 */
struct streamtab ipinfov6 = {
	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
};
   1380 
/*
 * Only present in DEBUG builds; defaults to B_FALSE.
 * NOTE(review): judging by the name this lets SCTP checksum handling be
 * skipped for testing -- confirm at the places that read it.
 */
#ifdef	DEBUG
boolean_t skip_sctp_cksum = B_FALSE;
#endif
   1384 
   1385 /*
   1386  * Generate an ICMP fragmentation needed message.
   1387  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   1388  * constructed by the caller.
   1389  */
   1390 void
   1391 icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
   1392 {
   1393 	icmph_t	icmph;
   1394 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1395 
   1396 	mp = icmp_pkt_err_ok(mp, ira);
   1397 	if (mp == NULL)
   1398 		return;
   1399 
   1400 	bzero(&icmph, sizeof (icmph_t));
   1401 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   1402 	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
   1403 	icmph.icmph_du_mtu = htons((uint16_t)mtu);
   1404 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
   1405 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   1406 
   1407 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   1408 }
   1409 
   1410 /*
   1411  * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
   1412  * If the ICMP message is consumed by IP, i.e., it should not be delivered
   1413  * to any IPPROTO_ICMP raw sockets, then it returns NULL.
 * Likewise, if the ICMP error is malformed (too short, etc), then it
   1415  * returns NULL. The caller uses this to determine whether or not to send
   1416  * to raw sockets.
   1417  *
   1418  * All error messages are passed to the matching transport stream.
   1419  *
 * The following cases are handled by icmp_inbound_v4:
   1421  * 1) It needs to send a reply back and possibly delivering it
   1422  *    to the "interested" upper clients.
   1423  * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
   1424  * 3) It needs to change some values in IP only.
   1425  * 4) It needs to change some values in IP and upper layers e.g TCP
   1426  *    by delivering an error to the upper layers.
   1427  *
 * We handle the above four cases in the context of IPsec in the
 * following way :
   1430  *
   1431  * 1) Send the reply back in the same way as the request came in.
   1432  *    If it came in encrypted, it goes out encrypted. If it came in
   1433  *    clear, it goes out in clear. Thus, this will prevent chosen
   1434  *    plain text attack.
   1435  * 2) The client may or may not expect things to come in secure.
   1436  *    If it comes in secure, the policy constraints are checked
   1437  *    before delivering it to the upper layers. If it comes in
   1438  *    clear, ipsec_inbound_accept_clear will decide whether to
   1439  *    accept this in clear or not. In both the cases, if the returned
   1440  *    message (IP header + 8 bytes) that caused the icmp message has
   1441  *    AH/ESP headers, it is sent up to AH/ESP for validation before
   1442  *    sending up. If there are only 8 bytes of returned message, then
   1443  *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
   1445  *    But this will be done only if icmp_accept_messages_in_clear is
   1446  *    zero.
   1447  * 4) If we need to change both in IP and ULP, then the decision taken
   1448  *    while affecting the values in IP and while delivering up to TCP
   1449  *    should be the same.
   1450  *
   1451  * 	There are two cases.
   1452  *
   1453  * 	a) If we reject data at the IP layer (ipsec_check_global_policy()
   1454  *	   failed), we will not deliver it to the ULP, even though they
   1455  *	   are *willing* to accept in *clear*. This is fine as our global
   1456  *	   disposition to icmp messages asks us reject the datagram.
   1457  *
   1458  *	b) If we accept data at the IP layer (ipsec_check_global_policy()
   1459  *	   succeeded or icmp_accept_messages_in_clear is 1), and not able
   1460  *	   to deliver it to ULP (policy failed), it can lead to
   1461  *	   consistency problems. The cases known at this time are
   1462  *	   ICMP_DESTINATION_UNREACHABLE  messages with following code
   1463  *	   values :
   1464  *
   1465  *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
   1466  *	     and Upper layer rejects. Then the communication will
   1467  *	     come to a stop. This is solved by making similar decisions
   1468  *	     at both levels. Currently, when we are unable to deliver
   1469  *	     to the Upper Layer (due to policy failures) while IP has
   1470  *	     adjusted dce_pmtu, the next outbound datagram would
   1471  *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
   1472  *	     will be with the right level of protection. Thus the right
   1473  *	     value will be communicated even if we are not able to
   1474  *	     communicate when we get from the wire initially. But this
   1475  *	     assumes there would be at least one outbound datagram after
   1476  *	     IP has adjusted its dce_pmtu value. To make things
   1477  *	     simpler, we accept in clear after the validation of
   1478  *	     AH/ESP headers.
   1479  *
   1480  *	   - Other ICMP ERRORS : We may not be able to deliver it to the
   1481  *	     upper layer depending on the level of protection the upper
   1482  *	     layer expects and the disposition in ipsec_inbound_accept_clear().
   1483  *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
   1484  *	     should be accepted in clear when the Upper layer expects secure.
   1485  *	     Thus the communication may get aborted by some bad ICMP
   1486  *	     packets.
   1487  */
/*
 * Summary of the contract implemented below:
 *
 *   mp   - the inbound ICMP packet, starting at the outer IP header.  On
 *          every return path mp has been freed, handed to a helper
 *          (icmp_send_reply_v4, icmp_redirect_v4 or
 *          icmp_inbound_error_fanout_v4), or returned to the caller.
 *   ira  - receive attributes; supplies the ill, zone id, header length,
 *          packet length and the multicast/broadcast flags used here.
 *
 * Returns NULL, mp itself (when IP has no interest in the message but an
 * ICMP client exists), or a copy of mp made for the caller to deliver.
 */
mblk_t *
icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmph_t		*icmph;
	ipha_t		*ipha;		/* Outer header */
	int		ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ipif_t		*ipif;
	uint32_t	ts;
	uint32_t	*tsp;
	timestruc_t	now;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	zoneid_t	zoneid = ira->ira_zoneid;
	int		len_needed;
	mblk_t		*mp_ret = NULL;	/* What the caller gets back */

	ipha = (ipha_t *)mp->b_rptr;

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);

	/*
	 * The first mblk must hold the outer IP header plus the fixed ICMP
	 * header.  If the whole packet is too short for that it is counted
	 * as truncated; otherwise try to pull the bytes up.
	 */
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		/* Last chance to get real. */
		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
		if (ipha == NULL) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport or if we handle the packet locally.
	 *
	 * Request types that IP answers itself (echo, timestamp, address
	 * mask) return directly from inside this switch; all other types
	 * break out and continue below.
	 */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		/* Redirects are counted even when configured to be ignored. */
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?).  We aim to please, you pick it.
		 * Default is do it.
		 */
		if (ira->ira_flags & IRAF_MULTICAST) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (ira->ira_flags & IRAF_BROADCAST) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		} else {
			/* unicast: always respond */
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Check db_ref to make sure we can modify the packet.
		 * If the data block is shared, work on a private copy and
		 * re-derive the header pointers from it.
		 */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/* The request is turned into the reply in place. */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		/* Not handled by IP; "interested" stays B_FALSE. */
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp) {
			if (ira->ira_flags & IRAF_MULTIBROADCAST)
				interested =
				    ipst->ips_ip_g_resp_to_timestamp_bcast;
			else
				interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Make sure we have enough of the packet: the ICMP header
		 * is followed by three 32-bit timestamps.
		 */
		len_needed = ip_hdr_length + ICMPH_SIZE +
		    3 * sizeof (uint32_t);

		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		/* The timestamps start right after the ICMP header. */
		tsp = (uint32_t *)&icmph[1];
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
			interested =
			    ipst->ips_ip_respond_to_address_mask_broadcast;
		} else {
			interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}
		/* The ICMP header is followed by one IPv4 address (mask). */
		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInTruncatedPkts);
				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
				    ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/*
		 * Need the ipif with the mask be the same as the source
		 * address of the mask reply. For unicast we have a specific
		 * ipif. For multicast/broadcast we only handle onlink
		 * senders, and use the source address to pick an ipif.
		 */
		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
		if (ipif == NULL) {
			/* Broadcast or multicast */
			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
			if (ipif == NULL) {
				/* No usable ipif: silently drop the request. */
				freemsg(mp);
				return (NULL);
			}
		}
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
		/* The mask has been copied out; drop the ipif reference. */
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		/*
		 * A failed copy is counted as a discard, but processing of
		 * the original continues; the caller then simply gets NULL.
		 */
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 *
	 * From here on every early exit returns mp_ret, so a copy made
	 * above is still handed back to the caller.
	 */
	if (mp->b_cont != NULL) {
		/* A length of -1 asks ip_pullup for the whole message. */
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, verify the message before any further
	 * processes.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	/*
	 * mp is handed to one of the helpers below and is not touched
	 * again afterwards.
	 */
	switch (icmph->icmph_type) {
	case ICMP_REDIRECT:
		icmp_redirect_v4(mp, ipha, icmph, ira);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
			/* Update DCE and adjust MTU in ICMP header if needed */
			icmp_inbound_too_big_v4(icmph, ira);
		}
		/* FALLTHRU */
	default:
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		break;
	}
	return (mp_ret);
}
   1824 
   1825 /*
   1826  * Send an ICMP echo, timestamp or address mask reply.
   1827  * The caller has already updated the payload part of the packet.
   1828  * We handle the ICMP checksum, IP source address selection and feed
   1829  * the packet into ip_output_simple.
   1830  */
   1831 static void
   1832 icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
   1833     ip_recv_attr_t *ira)
   1834 {
   1835 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
   1836 	ill_t		*ill = ira->ira_ill;
   1837 	ip_stack_t	*ipst = ill->ill_ipst;
   1838 	ip_xmit_attr_t	ixas;
   1839 
   1840 	/* Send out an ICMP packet */
   1841 	icmph->icmph_checksum = 0;
   1842 	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
   1843 	/* Reset time to live. */
   1844 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   1845 	{
   1846 		/* Swap source and destination addresses */
   1847 		ipaddr_t tmp;
   1848 
   1849 		tmp = ipha->ipha_src;
   1850 		ipha->ipha_src = ipha->ipha_dst;
   1851 		ipha->ipha_dst = tmp;
   1852 	}
   1853 	ipha->ipha_ident = 0;
   1854 	if (!IS_SIMPLE_IPH(ipha))
   1855 		icmp_options_update(ipha);
   1856 
   1857 	bzero(&ixas, sizeof (ixas));
   1858 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   1859 	ixas.ixa_zoneid = ira->ira_zoneid;
   1860 	ixas.ixa_cred = kcred;
   1861 	ixas.ixa_cpid = NOPID;
   1862 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   1863 	ixas.ixa_ifindex = 0;
   1864 	ixas.ixa_ipst = ipst;
   1865 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   1866 
   1867 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
   1868 		/*
   1869 		 * This packet should go out the same way as it
   1870 		 * came in i.e in clear, independent of the IPsec policy
   1871 		 * for transmitting packets.
   1872 		 */
   1873 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   1874 	} else {
   1875 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   1876 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1877 			/* Note: mp already consumed and ip_drop_packet done */
   1878 			return;
   1879 		}
   1880 	}
   1881 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1882 		/*
   1883 		 * Not one or our addresses (IRE_LOCALs), thus we let
   1884 		 * ip_output_simple pick the source.
   1885 		 */
   1886 		ipha->ipha_src = INADDR_ANY;
   1887 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   1888 	}
   1889 	/* Should we send with DF and use dce_pmtu? */
   1890 	if (ipst->ips_ipv4_icmp_return_pmtu) {
   1891 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
   1892 		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
   1893 	}
   1894 
   1895 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   1896 
   1897 	(void) ip_output_simple(mp, &ixas);
   1898 	ixa_cleanup(&ixas);
   1899 }
   1900 
   1901 /*
   1902  * Verify the ICMP messages for either for ICMP error or redirect packet.
   1903  * The caller should have fully pulled up the message. If it's a redirect
   1904  * packet, only basic checks on IP header will be done; otherwise, verify
   1905  * the packet by looking at the included ULP header.
   1906  *
   1907  * Called before icmp_inbound_error_fanout_v4 is called.
   1908  */
   1909 static boolean_t
   1910 icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   1911 {
   1912 	ill_t		*ill = ira->ira_ill;
   1913 	int		hdr_length;
   1914 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1915 	conn_t		*connp;
   1916 	ipha_t		*ipha;	/* Inner IP header */
   1917 
   1918 	ipha = (ipha_t *)&icmph[1];
   1919 	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
   1920 		goto truncated;
   1921 
   1922 	hdr_length = IPH_HDR_LENGTH(ipha);
   1923 
   1924 	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
   1925 		goto discard_pkt;
   1926 
   1927 	if (hdr_length < sizeof (ipha_t))
   1928 		goto truncated;
   1929 
   1930 	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
   1931 		goto truncated;
   1932 
   1933 	/*
   1934 	 * Stop here for ICMP_REDIRECT.
   1935 	 */
   1936 	if (icmph->icmph_type == ICMP_REDIRECT)
   1937 		return (B_TRUE);
   1938 
   1939 	/*
   1940 	 * ICMP errors only.
   1941 	 */
   1942 	switch (ipha->ipha_protocol) {
   1943 	case IPPROTO_UDP:
   1944 		/*
   1945 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1946 		 * transport header.
   1947 		 */
   1948 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1949 		    mp->b_wptr)
   1950 			goto truncated;
   1951 		break;
   1952 	case IPPROTO_TCP: {
   1953 		tcpha_t		*tcpha;
   1954 
   1955 		/*
   1956 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1957 		 * transport header.
   1958 		 */
   1959 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1960 		    mp->b_wptr)
   1961 			goto truncated;
   1962 
   1963 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   1964 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   1965 		    ipst);
   1966 		if (connp == NULL)
   1967 			goto discard_pkt;
   1968 
   1969 		if ((connp->conn_verifyicmp != NULL) &&
   1970 		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
   1971 			CONN_DEC_REF(connp);
   1972 			goto discard_pkt;
   1973 		}
   1974 		CONN_DEC_REF(connp);
   1975 		break;
   1976 	}
   1977 	case IPPROTO_SCTP:
   1978 		/*
   1979 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1980 		 * transport header.
   1981 		 */
   1982 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1983 		    mp->b_wptr)
   1984 			goto truncated;
   1985 		break;
   1986 	case IPPROTO_ESP:
   1987 	case IPPROTO_AH:
   1988 		break;
   1989 	case IPPROTO_ENCAP:
   1990 		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
   1991 		    mp->b_wptr)
   1992 			goto truncated;
   1993 		break;
   1994 	default:
   1995 		break;
   1996 	}
   1997 
   1998 	return (B_TRUE);
   1999 
   2000 discard_pkt:
   2001 	/* Bogus ICMP error. */
   2002 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2003 	return (B_FALSE);
   2004 
   2005 truncated:
   2006 	/* We pulled up everthing already. Must be truncated */
   2007 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2008 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2009 	return (B_FALSE);
   2010 }
   2011 
/*
 * Table from RFC 1191.
 *
 * MTU "plateau" values in strictly descending order.  When a
 * fragmentation-needed error carries no usable MTU,
 * icmp_inbound_too_big_v4() walks this table from the top and picks the
 * first entry that is smaller than the length of the offending packet.
 *
 * NOTE(review): RFC 1191 (Table 7-1) lists 1492 where this table has 1496,
 * and also has a 65535 entry that is absent here -- confirm whether the
 * differences are intentional.
 */
static int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
   2015 
   2016 /*
   2017  * Process received ICMP Packet too big.
   2018  * Just handles the DCE create/update, including using the above table of
   2019  * PMTU guesses. The caller is responsible for validating the packet before
   2020  * passing it in and also to fanout the ICMP error to any matching transport
   2021  * conns. Assumes the message has been fully pulled up and verified.
   2022  *
   2023  * Before getting here, the caller has called icmp_inbound_verify_v4()
   2024  * that should have verified with ULP to prevent undoing the changes we're
   2025  * going to make to DCE. For example, TCP might have verified that the packet
   2026  * which generated error is in the send window.
   2027  *
   2028  * In some cases modified this MTU in the ICMP header packet; the caller
   2029  * should pass to the matching ULP after this returns.
   2030  */
   2031 static void
   2032 icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
   2033 {
   2034 	dce_t		*dce;
   2035 	int		old_mtu;
   2036 	int		mtu, orig_mtu;
   2037 	ipaddr_t	dst;
   2038 	boolean_t	disable_pmtud;
   2039 	ill_t		*ill = ira->ira_ill;
   2040 	ip_stack_t	*ipst = ill->ill_ipst;
   2041 	uint_t		hdr_length;
   2042 	ipha_t		*ipha;
   2043 
   2044 	/* Caller already pulled up everything. */
   2045 	ipha = (ipha_t *)&icmph[1];
   2046 	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   2047 	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
   2048 	ASSERT(ill != NULL);
   2049 
   2050 	hdr_length = IPH_HDR_LENGTH(ipha);
   2051 
   2052 	/*
   2053 	 * We handle path MTU for source routed packets since the DCE
   2054 	 * is looked up using the final destination.
   2055 	 */
   2056 	dst = ip_get_dst(ipha);
   2057 
   2058 	dce = dce_lookup_and_add_v4(dst, ipst);
   2059 	if (dce == NULL) {
   2060 		/* Couldn't add a unique one - ENOMEM */
   2061 		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
   2062 		    ntohl(dst)));
   2063 		return;
   2064 	}
   2065 
   2066 	/* Check for MTU discovery advice as described in RFC 1191 */
   2067 	mtu = ntohs(icmph->icmph_du_mtu);
   2068 	orig_mtu = mtu;
   2069 	disable_pmtud = B_FALSE;
   2070 
   2071 	mutex_enter(&dce->dce_lock);
   2072 	if (dce->dce_flags & DCEF_PMTU)
   2073 		old_mtu = dce->dce_pmtu;
   2074 	else
   2075 		old_mtu = ill->ill_mtu;
   2076 
   2077 	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
   2078 		uint32_t length;
   2079 		int	i;
   2080 
   2081 		/*
   2082 		 * Use the table from RFC 1191 to figure out
   2083 		 * the next "plateau" based on the length in
   2084 		 * the original IP packet.
   2085 		 */
   2086 		length = ntohs(ipha->ipha_length);
   2087 		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
   2088 		    uint32_t, length);
   2089 		if (old_mtu <= length &&
   2090 		    old_mtu >= length - hdr_length) {
   2091 			/*
   2092 			 * Handle broken BSD 4.2 systems that
   2093 			 * return the wrong ipha_length in ICMP
   2094 			 * errors.
   2095 			 */
   2096 			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
   2097 			    length, old_mtu));
   2098 			length -= hdr_length;
   2099 		}
   2100 		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
   2101 			if (length > icmp_frag_size_table[i])
   2102 				break;
   2103 		}
   2104 		if (i == A_CNT(icmp_frag_size_table)) {
   2105 			/* Smaller than IP_MIN_MTU! */
   2106 			ip1dbg(("Too big for packet size %d\n",
   2107 			    length));
   2108 			disable_pmtud = B_TRUE;
   2109 			mtu = ipst->ips_ip_pmtu_min;
   2110 		} else {
   2111 			mtu = icmp_frag_size_table[i];
   2112 			ip1dbg(("Calculated mtu %d, packet size %d, "
   2113 			    "before %d\n", mtu, length, old_mtu));
   2114 			if (mtu < ipst->ips_ip_pmtu_min) {
   2115 				mtu = ipst->ips_ip_pmtu_min;
   2116 				disable_pmtud = B_TRUE;
   2117 			}
   2118 		}
   2119 	}
   2120 	if (disable_pmtud)
   2121 		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
   2122 	else
   2123 		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
   2124 
   2125 	dce->dce_pmtu = MIN(old_mtu, mtu);
   2126 	/* Prepare to send the new max frag size for the ULP. */
   2127 	icmph->icmph_du_zero = 0;
   2128 	icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
   2129 	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
   2130 	    dce, int, orig_mtu, int, mtu);
   2131 
   2132 	/* We now have a PMTU for sure */
   2133 	dce->dce_flags |= DCEF_PMTU;
   2134 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   2135 	mutex_exit(&dce->dce_lock);
   2136 	/*
   2137 	 * After dropping the lock the new value is visible to everyone.
   2138 	 * Then we bump the generation number so any cached values reinspect
   2139 	 * the dce_t.
   2140 	 */
   2141 	dce_increment_generation(dce);
   2142 	dce_refrele(dce);
   2143 }
   2144 
   2145 /*
   2146  * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
   2147  * calls this function.
   2148  */
   2149 static mblk_t *
   2150 icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
   2151 {
   2152 	int length;
   2153 
   2154 	ASSERT(mp->b_datap->db_type == M_DATA);
   2155 
   2156 	/* icmp_inbound_v4 has already pulled up the whole error packet */
   2157 	ASSERT(mp->b_cont == NULL);
   2158 
   2159 	/*
   2160 	 * The length that we want to overlay is the inner header
   2161 	 * and what follows it.
   2162 	 */
   2163 	length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
   2164 
   2165 	/*
   2166 	 * Overlay the inner header and whatever follows it over the
   2167 	 * outer header.
   2168 	 */
   2169 	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
   2170 
   2171 	/* Adjust for what we removed */
   2172 	mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
   2173 	return (mp);
   2174 }
   2175 
   2176 /*
   2177  * Try to pass the ICMP message upstream in case the ULP cares.
   2178  *
   2179  * If the packet that caused the ICMP error is secure, we send
   2180  * it to AH/ESP to make sure that the attached packet has a
   2181  * valid association. ipha in the code below points to the
   2182  * IP header of the packet that caused the error.
   2183  *
   2184  * For IPsec cases, we let the next-layer-up (which has access to
   2185  * cached policy on the conn_t, or can query the SPD directly)
   2186  * subtract out any IPsec overhead if they must.  We therefore make no
   2187  * adjustments here for IPsec overhead.
   2188  *
   2189  * IFN could have been generated locally or by some router.
   2190  *
 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
 * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
   2193  *	    This happens because IP adjusted its value of MTU on an
   2194  *	    earlier IFN message and could not tell the upper layer,
   2195  *	    the new adjusted value of MTU e.g. Packet was encrypted
   2196  *	    or there was not enough information to fanout to upper
   2197  *	    layers. Thus on the next outbound datagram, ire_send_wire
   2198  *	    generates the IFN, where IPsec processing has *not* been
   2199  *	    done.
   2200  *
 *	    Note that we retain ixa_fragsize across IPsec; thus once
 *	    we have picked ixa_fragsize and entered ipsec_out_process we do
 *	    not change the fragsize even if the path MTU changes before
 *	    we reach ip_output_post_ipsec.
   2205  *
   2206  *	    In the local case, IRAF_LOOPBACK will be set indicating
   2207  *	    that IFN was generated locally.
   2208  *
   2209  * ROUTER : IFN could be secure or non-secure.
   2210  *
   2211  *	    * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
   2212  *	      packet in error has AH/ESP headers to validate the AH/ESP
   2213  *	      headers. AH/ESP will verify whether there is a valid SA or
   2214  *	      not and send it back. We will fanout again if we have more
   2215  *	      data in the packet.
   2216  *
   2217  *	      If the packet in error does not have AH/ESP, we handle it
   2218  *	      like any other case.
   2219  *
   2220  *	    * NON_SECURE : If the packet in error has AH/ESP headers, we send it
   2221  *	      up to AH/ESP for validation. AH/ESP will verify whether there is a
   2222  *	      valid SA or not and send it back. We will fanout again if
   2223  *	      we have more data in the packet.
   2224  *
   2225  *	      If the packet in error does not have AH/ESP, we handle it
   2226  *	      like any other case.
   2227  *
   2228  * The caller must have called icmp_inbound_verify_v4.
   2229  */
static void
icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
	uint16_t	*up;	/* Pointer to ports in ULP header */
	uint32_t	ports;	/* reversed ports for fanout */
	ipha_t		ripha;	/* With reversed addresses */
	ipha_t		*ipha;  /* Inner IP header */
	uint_t		hdr_length; /* Inner IP header length */
	tcpha_t		*tcpha;
	conn_t		*connp;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
	ill_t		*rill = ira->ira_rill;

	/* Caller already pulled up everything. */
	ipha = (ipha_t *)&icmph[1];
	ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
	ASSERT(mp->b_cont == NULL);

	hdr_length = IPH_HDR_LENGTH(ipha);
	/* From here on the fanout is keyed on the protocol of the packet in error */
	ira->ira_protocol = ipha->ipha_protocol;

	/*
	 * We need a separate IP header with the source and destination
	 * addresses reversed to do fanout/classification because the ipha in
	 * the ICMP error is in the form we sent it out.
	 * Only the four fields set below are initialized in ripha.
	 */
	ripha.ipha_src = ipha->ipha_dst;
	ripha.ipha_dst = ipha->ipha_src;
	ripha.ipha_protocol = ipha->ipha_protocol;
	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;

	ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
	    ntohl(ipha->ipha_dst),
	    icmph->icmph_type, icmph->icmph_code));

	/*
	 * Every case below either hands mp to someone else and returns, or
	 * jumps to one of the drop labels at the bottom which free it.
	 */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/* Attempt to find a client stream based on port. */
		ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
		    ntohs(up[0]), ntohs(up[1])));

		/*
		 * Note that we send error to all matches.
		 * IRAF_ICMP_ERROR is only set for the duration of the call.
		 */
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_TCP:
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		/*
		 * Run the inbound policy check if the conn has policy or the
		 * error itself arrived protected by IPsec.
		 */
		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
			mp = ipsec_check_inbound_policy(mp, connp,
			    ipha, NULL, ira);
			if (mp == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				/* Note that mp is NULL */
				ip_drop_input("ipIfStatsInDiscards", mp, ill);
				/* Drop the reference obtained by the lookup */
				CONN_DEC_REF(connp);
				return;
			}
		}

		ira->ira_flags |= IRAF_ICMP_ERROR;
		/*
		 * The ill pointers are cleared while the message is handed
		 * to the conn and restored from the saved locals afterwards.
		 */
		ira->ira_ill = ira->ira_rill = NULL;
		if (IPCL_IS_TCP(connp)) {
			/* No CONN_DEC_REF here, unlike the raw-socket branch */
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    connp->conn_recvicmp, connp, ira, SQ_FILL,
			    SQTAG_TCP_INPUT_ICMP_ERR);
		} else {
			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
			(connp->conn_recv)(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
		}
		ira->ira_ill = ill;
		ira->ira_rill = rill;
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_SCTP:
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * The two 16-bit ports are swapped into "ports" because the
		 * header is in the form we sent it out.
		 */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH:
		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(mp, ira);
			return;
		}

		/* A NULL return means there is nothing left for us to process */
		if (ipha->ipha_protocol == IPPROTO_ESP)
			mp = ipsecesp_icmp_error(mp, ira);
		else
			mp = ipsecah_icmp_error(mp, ira);
		if (mp == NULL)
			return;

		/* Just in case ipsec didn't preserve the NULL b_cont */
		if (mp->b_cont != NULL) {
			if (!pullupmsg(mp, -1))
				goto discard_pkt;
		}

		/*
		 * Note that ira_pktlen and ira_ip_hdr_length are no longer
		 * correct, but we don't use them any more here.
		 *
		 * If successful, the mp has been modified to not include
		 * the ESP/AH header so we can fanout to the ULP's icmp
		 * error handler.
		 */
		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
			goto truncated;

		/*
		 * Verify the modified message before any further processing.
		 * All pointers are re-derived from the start of mp.
		 */
		ipha = (ipha_t *)mp->b_rptr;
		hdr_length = IPH_HDR_LENGTH(ipha);
		icmph = (icmph_t *)&mp->b_rptr[hdr_length];
		if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
			freemsg(mp);
			return;
		}

		/* Fan out again on whatever followed the AH/ESP header */
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		return;

	case IPPROTO_ENCAP: {
		/* Look for self-encapsulated packets that caused an error */
		ipha_t *in_ipha;

		/*
		 * Caller has verified that length has to be
		 * at least the size of IP header.
		 */
		ASSERT(hdr_length >= sizeof (ipha_t));
		/*
		 * Check the sanity of the inner IP header like
		 * we did for the outer header.
		 */
		in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
		if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
			goto discard_pkt;
		}
		if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
			goto discard_pkt;
		}
		/* Check for Self-encapsulated tunnels */
		if (in_ipha->ipha_src == ipha->ipha_src &&
		    in_ipha->ipha_dst == ipha->ipha_dst) {

			/* Collapse the inner header onto the outer one */
			mp = icmp_inbound_self_encap_error_v4(mp, ipha,
			    in_ipha);
			if (mp == NULL)
				goto discard_pkt;

			/*
			 * Just in case self_encap didn't preserve the NULL
			 * b_cont
			 */
			if (mp->b_cont != NULL) {
				if (!pullupmsg(mp, -1))
					goto discard_pkt;
			}
			/*
			 * Note that ira_pktlen and ira_ip_hdr_length are no
			 * longer correct, but we don't use them any more here.
			 */
			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
				goto truncated;

			/*
			 * Verify the modified message before any further
			 * processing.
			 */
			ipha = (ipha_t *)mp->b_rptr;
			hdr_length = IPH_HDR_LENGTH(ipha);
			icmph = (icmph_t *)&mp->b_rptr[hdr_length];
			if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
				freemsg(mp);
				return;
			}

			/*
			 * The packet in error is self-encapsulated.
			 * And we are finding it further encapsulated
			 * which we could not have possibly generated.
			 *
			 * NOTE(review): ipha was just re-pointed at
			 * mp->b_rptr, not at the header that follows icmph;
			 * confirm that this is the header the check is
			 * meant to inspect.
			 */
			if (ipha->ipha_protocol == IPPROTO_ENCAP) {
				goto discard_pkt;
			}
			icmp_inbound_error_fanout_v4(mp, icmph, ira);
			return;
		}
		/* No self-encapsulated */
		/* FALLTHRU */
	}
	case IPPROTO_IPV6:
		/* Offer the error to an IP tunnel matching the reversed addresses */
		if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
		    &ripha.ipha_dst, ipst)) != NULL) {
			ira->ira_flags |= IRAF_ICMP_ERROR;
			connp->conn_recvicmp(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
			ira->ira_flags &= ~IRAF_ICMP_ERROR;
			return;
		}
		/*
		 * No IP tunnel is interested, fallthrough and see
		 * if a raw socket will want it.
		 */
		/* FALLTHRU */
	default:
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_proto_v4(mp, &ripha, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;
	}
	/* NOTREACHED */
discard_pkt:
	/* Generic drop: count it as an input discard and free the message */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
	ip_drop_input("ipIfStatsInDiscards", mp, ill);
	freemsg(mp);
	return;

truncated:
	/* We pulled up everything already. Must be truncated */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
	freemsg(mp);
}
   2481 
   2482 /*
   2483  * Common IP options parser.
   2484  *
   2485  * Setup routine: fill in *optp with options-parsing state, then
   2486  * tail-call ipoptp_next to return the first option.
   2487  */
   2488 uint8_t
   2489 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
   2490 {
   2491 	uint32_t totallen; /* total length of all options */
   2492 
   2493 	totallen = ipha->ipha_version_and_hdr_length -
   2494 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   2495 	totallen <<= 2;
   2496 	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
   2497 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2498 	optp->ipoptp_flags = 0;
   2499 	return (ipoptp_next(optp));
   2500 }
   2501 
   2502 /* Like above but without an ipha_t */
   2503 uint8_t
   2504 ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
   2505 {
   2506 	optp->ipoptp_next = opt;
   2507 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2508 	optp->ipoptp_flags = 0;
   2509 	return (ipoptp_next(optp));
   2510 }
   2511 
   2512 /*
   2513  * Common IP options parser: extract next option.
   2514  */
   2515 uint8_t
   2516 ipoptp_next(ipoptp_t *optp)
   2517 {
   2518 	uint8_t *end = optp->ipoptp_end;
   2519 	uint8_t *cur = optp->ipoptp_next;
   2520 	uint8_t opt, len, pointer;
   2521 
   2522 	/*
   2523 	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
   2524 	 * has been corrupted.
   2525 	 */
   2526 	ASSERT(cur <= end);
   2527 
   2528 	if (cur == end)
   2529 		return (IPOPT_EOL);
   2530 
   2531 	opt = cur[IPOPT_OPTVAL];
   2532 
   2533 	/*
   2534 	 * Skip any NOP options.
   2535 	 */
   2536 	while (opt == IPOPT_NOP) {
   2537 		cur++;
   2538 		if (cur == end)
   2539 			return (IPOPT_EOL);
   2540 		opt = cur[IPOPT_OPTVAL];
   2541 	}
   2542 
   2543 	if (opt == IPOPT_EOL)
   2544 		return (IPOPT_EOL);
   2545 
   2546 	/*
   2547 	 * Option requiring a length.
   2548 	 */
   2549 	if ((cur + 1) >= end) {
   2550 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2551 		return (IPOPT_EOL);
   2552 	}
   2553 	len = cur[IPOPT_OLEN];
   2554 	if (len < 2) {
   2555 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2556 		return (IPOPT_EOL);
   2557 	}
   2558 	optp->ipoptp_cur = cur;
   2559 	optp->ipoptp_len = len;
   2560 	optp->ipoptp_next = cur + len;
   2561 	if (cur + len > end) {
   2562 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2563 		return (IPOPT_EOL);
   2564 	}
   2565 
   2566 	/*
   2567 	 * For the options which require a pointer field, make sure
   2568 	 * its there, and make sure it points to either something
   2569 	 * inside this option, or the end of the option.
   2570 	 */
   2571 	switch (opt) {
   2572 	case IPOPT_RR:
   2573 	case IPOPT_TS:
   2574 	case IPOPT_LSRR:
   2575 	case IPOPT_SSRR:
   2576 		if (len <= IPOPT_OFFSET) {
   2577 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2578 			return (opt);
   2579 		}
   2580 		pointer = cur[IPOPT_OFFSET];
   2581 		if (pointer - 1 > len) {
   2582 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2583 			return (opt);
   2584 		}
   2585 		break;
   2586 	}
   2587 
   2588 	/*
   2589 	 * Sanity check the pointer field based on the type of the
   2590 	 * option.
   2591 	 */
   2592 	switch (opt) {
   2593 	case IPOPT_RR:
   2594 	case IPOPT_SSRR:
   2595 	case IPOPT_LSRR:
   2596 		if (pointer < IPOPT_MINOFF_SR)
   2597 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2598 		break;
   2599 	case IPOPT_TS:
   2600 		if (pointer < IPOPT_MINOFF_IT)
   2601 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2602 		/*
   2603 		 * Note that the Internet Timestamp option also
   2604 		 * contains two four bit fields (the Overflow field,
   2605 		 * and the Flag field), which follow the pointer
   2606 		 * field.  We don't need to check that these fields
   2607 		 * fall within the length of the option because this
   2608 		 * was implicitely done above.  We've checked that the
   2609 		 * pointer value is at least IPOPT_MINOFF_IT, and that
   2610 		 * it falls within the option.  Since IPOPT_MINOFF_IT >
   2611 		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
   2612 		 */
   2613 		ASSERT(len > IPOPT_POS_OV_FLG);
   2614 		break;
   2615 	}
   2616 
   2617 	return (opt);
   2618 }
   2619 
   2620 /*
   2621  * Use the outgoing IP header to create an IP_OPTIONS option the way
   2622  * it was passed down from the application.
   2623  *
   2624  * This is compatible with BSD in that it returns
   2625  * the reverse source route with the final destination
   2626  * as the last entry. The first 4 bytes of the option
   2627  * will contain the final destination.
   2628  */
   2629 int
   2630 ip_opt_get_user(conn_t *connp, uchar_t *buf)
   2631 {
   2632 	ipoptp_t	opts;
   2633 	uchar_t		*opt;
   2634 	uint8_t		optval;
   2635 	uint8_t		optlen;
   2636 	uint32_t	len = 0;
   2637 	uchar_t		*buf1 = buf;
   2638 	uint32_t	totallen;
   2639 	ipaddr_t	dst;
   2640 	ip_pkt_t	*ipp = &connp->conn_xmit_ipp;
   2641 
   2642 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   2643 		return (0);
   2644 
   2645 	totallen = ipp->ipp_ipv4_options_len;
   2646 	if (totallen & 0x3)
   2647 		return (0);
   2648 
   2649 	buf += IP_ADDR_LEN;	/* Leave room for final destination */
   2650 	len += IP_ADDR_LEN;
   2651 	bzero(buf1, IP_ADDR_LEN);
   2652 
   2653 	dst = connp->conn_faddr_v4;
   2654 
   2655 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   2656 	    optval != IPOPT_EOL;
   2657 	    optval = ipoptp_next(&opts)) {
   2658 		int	off;
   2659 
   2660 		opt = opts.ipoptp_cur;
   2661 		if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   2662 			break;
   2663 		}
   2664 		optlen = opts.ipoptp_len;
   2665 
   2666 		switch (optval) {
   2667 		case IPOPT_SSRR:
   2668 		case IPOPT_LSRR:
   2669 
   2670 			/*
   2671 			 * Insert destination as the first entry in the source
   2672 			 * route and move down the entries on step.
   2673 			 * The last entry gets placed at buf1.
   2674 			 */
   2675 			buf[IPOPT_OPTVAL] = optval;
   2676 			buf[IPOPT_OLEN] = optlen;
   2677 			buf[IPOPT_OFFSET] = optlen;
   2678 
   2679 			off = optlen - IP_ADDR_LEN;
   2680 			if (off < 0) {
   2681 				/* No entries in source route */
   2682 				break;
   2683 			}
   2684 			/* Last entry in source route if not already set */
   2685 			if (dst == INADDR_ANY)
   2686 				bcopy(opt + off, buf1, IP_ADDR_LEN);
   2687 			off -= IP_ADDR_LEN;
   2688 
   2689 			while (off > 0) {
   2690 				bcopy(opt + off,
   2691 				    buf + off + IP_ADDR_LEN,
   2692 				    IP_ADDR_LEN);
   2693 				off -= IP_ADDR_LEN;
   2694 			}
   2695 			/* ipha_dst into first slot */
   2696 			bcopy(&dst, buf + off + IP_ADDR_LEN,
   2697 			    IP_ADDR_LEN);
   2698 			buf += optlen;
   2699 			len += optlen;
   2700 			break;
   2701 
   2702 		default:
   2703 			bcopy(opt, buf, optlen);
   2704 			buf += optlen;
   2705 			len += optlen;
   2706 			break;
   2707 		}
   2708 	}
   2709 done:
   2710 	/* Pad the resulting options */
   2711 	while (len & 0x3) {
   2712 		*buf++ = IPOPT_EOL;
   2713 		len++;
   2714 	}
   2715 	return (len);
   2716 }
   2717 
   2718 /*
   2719  * Update any record route or timestamp options to include this host.
   2720  * Reverse any source route option.
   2721  * This routine assumes that the options are well formed i.e. that they
   2722  * have already been checked.
   2723  */
   2724 static void
   2725 icmp_options_update(ipha_t *ipha)
   2726 {
   2727 	ipoptp_t	opts;
   2728 	uchar_t		*opt;
   2729 	uint8_t		optval;
   2730 	ipaddr_t	src;		/* Our local address */
   2731 	ipaddr_t	dst;
   2732 
   2733 	ip2dbg(("icmp_options_update\n"));
   2734 	src = ipha->ipha_src;
   2735 	dst = ipha->ipha_dst;
   2736 
   2737 	for (optval = ipoptp_first(&opts, ipha);
   2738 	    optval != IPOPT_EOL;
   2739 	    optval = ipoptp_next(&opts)) {
   2740 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   2741 		opt = opts.ipoptp_cur;
   2742 		ip2dbg(("icmp_options_update: opt %d, len %d\n",
   2743 		    optval, opts.ipoptp_len));
   2744 		switch (optval) {
   2745 			int off1, off2;
   2746 		case IPOPT_SSRR:
   2747 		case IPOPT_LSRR:
   2748 			/*
   2749 			 * Reverse the source route.  The first entry
   2750 			 * should be the next to last one in the current
   2751 			 * source route (the last entry is our address).
   2752 			 * The last entry should be the final destination.
   2753 			 */
   2754 			off1 = IPOPT_MINOFF_SR - 1;
   2755 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   2756 			if (off2 < 0) {
   2757 				/* No entries in source route */
   2758 				ip1dbg((
   2759 				    "icmp_options_update: bad src route\n"));
   2760 				break;
   2761 			}
   2762 			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
   2763 			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
   2764 			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
   2765 			off2 -= IP_ADDR_LEN;
   2766 
   2767 			while (off1 < off2) {
   2768 				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
   2769 				bcopy((char *)opt + off2, (char *)opt + off1,
   2770 				    IP_ADDR_LEN);
   2771 				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
   2772 				off1 += IP_ADDR_LEN;
   2773 				off2 -= IP_ADDR_LEN;
   2774 			}
   2775 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   2776 			break;
   2777 		}
   2778 	}
   2779 }
   2780 
   2781 /*
   2782  * Process received ICMP Redirect messages.
   2783  * Assumes the caller has verified that the headers are in the pulled up mblk.
   2784  * Consumes mp.
   2785  */
   2786 static void
   2787 icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
   2788 {
   2789 	ire_t		*ire, *nire;
   2790 	ire_t		*prev_ire;
   2791 	ipaddr_t  	src, dst, gateway;
   2792 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2793 	ipha_t		*inner_ipha;	/* Inner IP header */
   2794 
   2795 	/* Caller already pulled up everything. */
   2796 	inner_ipha = (ipha_t *)&icmph[1];
   2797 	src = ipha->ipha_src;
   2798 	dst = inner_ipha->ipha_dst;
   2799 	gateway = icmph->icmph_rd_gateway;
   2800 	/* Make sure the new gateway is reachable somehow. */
   2801 	ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
   2802 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
   2803 	/*
   2804 	 * Make sure we had a route for the dest in question and that
   2805 	 * that route was pointing to the old gateway (the source of the
   2806 	 * redirect packet.)
   2807 	 * Note: this merely says that there is some IRE which matches that
   2808 	 * gateway; not that the longest match matches that gateway.
   2809 	 */
   2810 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES,
   2811 	    NULL, MATCH_IRE_GW, 0, ipst, NULL);
   2812 	/*
   2813 	 * Check that
   2814 	 *	the redirect was not from ourselves
   2815 	 *	the new gateway and the old gateway are directly reachable
   2816 	 */
   2817 	if (prev_ire == NULL || ire == NULL ||
   2818 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
   2819 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   2820 	    !(ire->ire_type & IRE_IF_ALL)) {
   2821 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2822 		ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
   2823 		freemsg(mp);
   2824 		if (ire != NULL)
   2825 			ire_refrele(ire);
   2826 		if (prev_ire != NULL)
   2827 			ire_refrele(prev_ire);
   2828 		return;
   2829 	}
   2830 
   2831 	ire_refrele(prev_ire);
   2832 	ire_refrele(ire);
   2833 
   2834 	/*
   2835 	 * TODO: more precise handling for cases 0, 2, 3, the latter two
   2836 	 * require TOS routing
   2837 	 */
   2838 	switch (icmph->icmph_code) {
   2839 	case 0:
   2840 	case 1:
   2841 		/* TODO: TOS specificity for cases 2 and 3 */
   2842 	case 2:
   2843 	case 3:
   2844 		break;
   2845 	default:
   2846 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2847 		ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
   2848 		freemsg(mp);
   2849 		return;
   2850 	}
   2851 	/*
   2852 	 * Create a Route Association.  This will allow us to remember that
   2853 	 * someone we believe told us to use the particular gateway.
   2854 	 */
   2855 	ire = ire_create(
   2856 	    (uchar_t *)&dst,			/* dest addr */
   2857 	    (uchar_t *)&ip_g_all_ones,		/* mask */
   2858 	    (uchar_t *)&gateway,		/* gateway addr */
   2859 	    IRE_HOST,
   2860 	    NULL,				/* ill */
   2861 	    ALL_ZONES,
   2862 	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
   2863 	    NULL,				/* tsol_gc_t */
   2864 	    ipst);
   2865 
   2866 	if (ire == NULL) {
   2867 		freemsg(mp);
   2868 		return;
   2869 	}
   2870 	nire = ire_add(ire);
   2871 	/* Check if it was a duplicate entry */
   2872 	if (nire != NULL && nire != ire) {
   2873 		ASSERT(nire->ire_identical_ref > 1);
   2874 		ire_delete(nire);
   2875 		ire_refrele(nire);
   2876 		nire = NULL;
   2877 	}
   2878 	ire = nire;
   2879 	if (ire != NULL) {
   2880 		ire_refrele(ire);		/* Held in ire_add */
   2881 
   2882 		/* tell routing sockets that we received a redirect */
   2883 		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
   2884 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
   2885 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
   2886 	}
   2887 
   2888 	/*
   2889 	 * Delete any existing IRE_HOST type redirect ires for this destination.
   2890 	 * This together with the added IRE has the effect of
   2891 	 * modifying an existing redirect.
   2892 	 */
   2893 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
   2894 	    ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
   2895 	if (prev_ire != NULL) {
   2896 		if (prev_ire ->ire_flags & RTF_DYNAMIC)
   2897 			ire_delete(prev_ire);
   2898 		ire_refrele(prev_ire);
   2899 	}
   2900 
   2901 	freemsg(mp);
   2902 }
   2903 
   2904 /*
   2905  * Generate an ICMP parameter problem message.
   2906  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   2907  * constructed by the caller.
   2908  */
   2909 static void
   2910 icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
   2911 {
   2912 	icmph_t	icmph;
   2913 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2914 
   2915 	mp = icmp_pkt_err_ok(mp, ira);
   2916 	if (mp == NULL)
   2917 		return;
   2918 
   2919 	bzero(&icmph, sizeof (icmph_t));
   2920 	icmph.icmph_type = ICMP_PARAM_PROBLEM;
   2921 	icmph.icmph_pp_ptr = ptr;
   2922 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
   2923 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   2924 }
   2925 
   2926 /*
   2927  * Build and ship an IPv4 ICMP message using the packet data in mp, and
   2928  * the ICMP header pointed to by "stuff".  (May be called as writer.)
   2929  * Note: assumes that icmp_pkt_err_ok has been called to verify that
   2930  * an icmp error packet can be sent.
   2931  * Assigns an appropriate source address to the packet. If ipha_dst is
   2932  * one of our addresses use it for source. Otherwise let ip_output_simple
   2933  * pick the source address.
   2934  */
   2935 static void
   2936 icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
   2937 {
   2938 	ipaddr_t dst;
   2939 	icmph_t	*icmph;
   2940 	ipha_t	*ipha;
   2941 	uint_t	len_needed;
   2942 	size_t	msg_len;
   2943 	mblk_t	*mp1;
   2944 	ipaddr_t src;
   2945 	ire_t	*ire;
   2946 	ip_xmit_attr_t ixas;
   2947 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   2948 
   2949 	ipha = (ipha_t *)mp->b_rptr;
   2950 
   2951 	bzero(&ixas, sizeof (ixas));
   2952 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   2953 	ixas.ixa_zoneid = ira->ira_zoneid;
   2954 	ixas.ixa_ifindex = 0;
   2955 	ixas.ixa_ipst = ipst;
   2956 	ixas.ixa_cred = kcred;
   2957 	ixas.ixa_cpid = NOPID;
   2958 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   2959 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   2960 
   2961 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   2962 		/*
   2963 		 * Apply IPsec based on how IPsec was applied to
   2964 		 * the packet that had the error.
   2965 		 *
   2966 		 * If it was an outbound packet that caused the ICMP
   2967 		 * error, then the caller will have setup the IRA
   2968 		 * appropriately.
   2969 		 */
   2970 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   2971 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   2972 			/* Note: mp already consumed and ip_drop_packet done */
   2973 			return;
   2974 		}
   2975 	} else {
   2976 		/*
   2977 		 * This is in clear. The icmp message we are building
   2978 		 * here should go out in clear, independent of our policy.
   2979 		 */
   2980 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   2981 	}
   2982 
   2983 	/* Remember our eventual destination */
   2984 	dst = ipha->ipha_src;
   2985 
   2986 	/*
   2987 	 * If the packet was for one of our unicast addresses, make
   2988 	 * sure we respond with that as the source. Otherwise
   2989 	 * have ip_output_simple pick the source address.
   2990 	 */
   2991 	ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
   2992 	    (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
   2993 	    MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   2994 	if (ire != NULL) {
   2995 		ire_refrele(ire);
   2996 		src = ipha->ipha_dst;
   2997 	} else {
   2998 		src = INADDR_ANY;
   2999 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   3000 	}
   3001 
   3002 	/*
   3003 	 * Check if we can send back more then 8 bytes in addition to
   3004 	 * the IP header.  We try to send 64 bytes of data and the internal
   3005 	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
   3006 	 */
   3007 	len_needed = IPH_HDR_LENGTH(ipha);
   3008 	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
   3009 	    ipha->ipha_protocol == IPPROTO_IPV6) {
   3010 		if (!pullupmsg(mp, -1)) {
   3011 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3012 			ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
   3013 			freemsg(mp);
   3014 			return;
   3015 		}
   3016 		ipha = (ipha_t *)mp->b_rptr;
   3017 
   3018 		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   3019 			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
   3020 			    len_needed));
   3021 		} else {
   3022 			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
   3023 
   3024 			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
   3025 			len_needed += ip_hdr_length_v6(mp, ip6h);
   3026 		}
   3027 	}
   3028 	len_needed += ipst->ips_ip_icmp_return;
   3029 	msg_len = msgdsize(mp);
   3030 	if (msg_len > len_needed) {
   3031 		(void) adjmsg(mp, len_needed - msg_len);
   3032 		msg_len = len_needed;
   3033 	}
   3034 	mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
   3035 	if (mp1 == NULL) {
   3036 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
   3037 		freemsg(mp);
   3038 		return;
   3039 	}
   3040 	mp1->b_cont = mp;
   3041 	mp = mp1;
   3042 
   3043 	/*
   3044 	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
   3045 	 * node generates be accepted in peace by all on-host destinations.
   3046 	 * If we do NOT assume that all on-host destinations trust
   3047 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
   3048 	 * (Look for IXAF_TRUSTED_ICMP).
   3049 	 */
   3050 	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
   3051 
   3052 	ipha = (ipha_t *)mp->b_rptr;
   3053 	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
   3054 	*ipha = icmp_ipha;
   3055 	ipha->ipha_src = src;
   3056 	ipha->ipha_dst = dst;
   3057 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   3058 	msg_len += sizeof (icmp_ipha) + len;
   3059 	if (msg_len > IP_MAXPACKET) {
   3060 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
   3061 		msg_len = IP_MAXPACKET;
   3062 	}
   3063 	ipha->ipha_length = htons((uint16_t)msg_len);
   3064 	icmph = (icmph_t *)&ipha[1];
   3065 	bcopy(stuff, icmph, len);
   3066 	icmph->icmph_checksum = 0;
   3067 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
   3068 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   3069 
   3070 	(void) ip_output_simple(mp, &ixas);
   3071 	ixa_cleanup(&ixas);
   3072 }
   3073 
   3074 /*
   3075  * Determine if an ICMP error packet can be sent given the rate limit.
   3076  * The limit consists of an average frequency (icmp_pkt_err_interval measured
   3077  * in milliseconds) and a burst size. Burst size number of packets can
   3078  * be sent arbitrarely closely spaced.
   3079  * The state is tracked using two variables to implement an approximate
   3080  * token bucket filter:
   3081  *	icmp_pkt_err_last - lbolt value when the last burst started
   3082  *	icmp_pkt_err_sent - number of packets sent in current burst
   3083  */
   3084 boolean_t
   3085 icmp_err_rate_limit(ip_stack_t *ipst)
   3086 {
   3087 	clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
   3088 	uint_t refilled; /* Number of packets refilled in tbf since last */
   3089 	/* Guard against changes by loading into local variable */
   3090 	uint_t err_interval = ipst->ips_ip_icmp_err_interval;
   3091 
   3092 	if (err_interval == 0)
   3093 		return (B_FALSE);
   3094 
   3095 	if (ipst->ips_icmp_pkt_err_last > now) {
   3096 		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
   3097 		ipst->ips_icmp_pkt_err_last = 0;
   3098 		ipst->ips_icmp_pkt_err_sent = 0;
   3099 	}
   3100 	/*
   3101 	 * If we are in a burst update the token bucket filter.
   3102 	 * Update the "last" time to be close to "now" but make sure
   3103 	 * we don't loose precision.
   3104 	 */
   3105 	if (ipst->ips_icmp_pkt_err_sent != 0) {
   3106 		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
   3107 		if (refilled > ipst->ips_icmp_pkt_err_sent) {
   3108 			ipst->ips_icmp_pkt_err_sent = 0;
   3109 		} else {
   3110 			ipst->ips_icmp_pkt_err_sent -= refilled;
   3111 			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
   3112 		}
   3113 	}
   3114 	if (ipst->ips_icmp_pkt_err_sent == 0) {
   3115 		/* Start of new burst */
   3116 		ipst->ips_icmp_pkt_err_last = now;
   3117 	}
   3118 	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
   3119 		ipst->ips_icmp_pkt_err_sent++;
   3120 		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
   3121 		    ipst->ips_icmp_pkt_err_sent));
   3122 		return (B_FALSE);
   3123 	}
   3124 	ip1dbg(("icmp_err_rate_limit: dropped\n"));
   3125 	return (B_TRUE);
   3126 }
   3127 
   3128 /*
   3129  * Check if it is ok to send an IPv4 ICMP error packet in
   3130  * response to the IPv4 packet in mp.
   3131  * Free the message and return null if no
   3132  * ICMP error packet should be sent.
   3133  */
   3134 static mblk_t *
   3135 icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
   3136 {
   3137 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   3138 	icmph_t	*icmph;
   3139 	ipha_t	*ipha;
   3140 	uint_t	len_needed;
   3141 
   3142 	if (!mp)
   3143 		return (NULL);
   3144 	ipha = (ipha_t *)mp->b_rptr;
   3145 	if (ip_csum_hdr(ipha)) {
   3146 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
   3147 		ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
   3148 		freemsg(mp);
   3149 		return (NULL);
   3150 	}
   3151 	if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
   3152 	    ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
   3153 	    CLASSD(ipha->ipha_dst) ||
   3154 	    CLASSD(ipha->ipha_src) ||
   3155 	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
   3156 		/* Note: only errors to the fragment with offset 0 */
   3157 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3158 		freemsg(mp);
   3159 		return (NULL);
   3160 	}
   3161 	if (ipha->ipha_protocol == IPPROTO_ICMP) {
   3162 		/*
   3163 		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
   3164 		 * errors in response to any ICMP errors.
   3165 		 */
   3166 		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
   3167 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   3168 			if (!pullupmsg(mp, len_needed)) {
   3169 				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   3170 				freemsg(mp);
   3171 				return (NULL);
   3172 			}
   3173 			ipha = (ipha_t *)mp->b_rptr;
   3174 		}
   3175 		icmph = (icmph_t *)
   3176 		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
   3177 		switch (icmph->icmph_type) {
   3178 		case ICMP_DEST_UNREACHABLE:
   3179 		case ICMP_SOURCE_QUENCH:
   3180 		case ICMP_TIME_EXCEEDED:
   3181 		case ICMP_PARAM_PROBLEM:
   3182 		case ICMP_REDIRECT:
   3183 			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3184 			freemsg(mp);
   3185 			return (NULL);
   3186 		default:
   3187 			break;
   3188 		}
   3189 	}
   3190 	/*
   3191 	 * If this is a labeled system, then check to see if we're allowed to
   3192 	 * send a response to this particular sender.  If not, then just drop.
   3193 	 */
   3194 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
   3195 		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
   3196 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3197 		freemsg(mp);
   3198 		return (NULL);
   3199 	}
   3200 	if (icmp_err_rate_limit(ipst)) {
   3201 		/*
   3202 		 * Only send ICMP error packets every so often.
   3203 		 * This should be done on a per port/source basis,
   3204 		 * but for now this will suffice.
   3205 		 */
   3206 		freemsg(mp);
   3207 		return (NULL);
   3208 	}
   3209 	return (mp);
   3210 }
   3211 
   3212 /*
   3213  * Called when a packet was sent out the same link that it arrived on.
   3214  * Check if it is ok to send a redirect and then send it.
   3215  */
   3216 void
   3217 ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
   3218     ip_recv_attr_t *ira)
   3219 {
   3220 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   3221 	ipaddr_t	src, nhop;
   3222 	mblk_t		*mp1;
   3223 	ire_t		*nhop_ire;
   3224 
   3225 	/*
   3226 	 * Check the source address to see if it originated
   3227 	 * on the same logical subnet it is going back out on.
   3228 	 * If so, we should be able to send it a redirect.
   3229 	 * Avoid sending a redirect if the destination
   3230 	 * is directly connected (i.e., we matched an IRE_ONLINK),
   3231 	 * or if the packet was source routed out this interface.
   3232 	 *
   3233 	 * We avoid sending a redirect if the
   3234 	 * destination is directly connected
   3235 	 * because it is possible that multiple
   3236 	 * IP subnets may have been configured on
   3237 	 * the link, and the source may not
   3238 	 * be on the same subnet as ip destination,
   3239 	 * even though they are on the same
   3240 	 * physical link.
   3241 	 */
   3242 	if ((ire->ire_type & IRE_ONLINK) ||
   3243 	    ip_source_routed(ipha, ipst))
   3244 		return;
   3245 
   3246 	nhop_ire = ire_nexthop(ire);
   3247 	if (nhop_ire == NULL)
   3248 		return;
   3249 
   3250 	nhop = nhop_ire->ire_addr;
   3251 
   3252 	if (nhop_ire->ire_type & IRE_IF_CLONE) {
   3253 		ire_t	*ire2;
   3254 
   3255 		/* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
   3256 		mutex_enter(&nhop_ire->ire_lock);
   3257 		ire2 = nhop_ire->ire_dep_parent;
   3258 		if (ire2 != NULL)
   3259 			ire_refhold(ire2);
   3260 		mutex_exit(&nhop_ire->ire_lock);
   3261 		ire_refrele(nhop_ire);
   3262 		nhop_ire = ire2;
   3263 	}
   3264 	if (nhop_ire == NULL)
   3265 		return;
   3266 
   3267 	ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
   3268 
   3269 	src = ipha->ipha_src;
   3270 
   3271 	/*
   3272 	 * We look at the interface ire for the nexthop,
   3273 	 * to see if ipha_src is in the same subnet
   3274 	 * as the nexthop.
   3275 	 */
   3276 	if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
   3277 		/*
   3278 		 * The source is directly connected.
   3279 		 */
   3280 		mp1 = copymsg(mp);
   3281 		if (mp1 != NULL) {
   3282 			icmp_send_redirect(mp1, nhop, ira);
   3283 		}
   3284 	}
   3285 	ire_refrele(nhop_ire);
   3286 }
   3287 
   3288 /*
   3289  * Generate an ICMP redirect message.
   3290  */
   3291 static void
   3292 icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
   3293 {
   3294 	icmph_t	icmph;
   3295 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3296 
   3297 	mp = icmp_pkt_err_ok(mp, ira);
   3298 	if (mp == NULL)
   3299 		return;
   3300 
   3301 	bzero(&icmph, sizeof (icmph_t));
   3302 	icmph.icmph_type = ICMP_REDIRECT;
   3303 	icmph.icmph_code = 1;
   3304 	icmph.icmph_rd_gateway = gateway;
   3305 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
   3306 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3307 }
   3308 
   3309 /*
   3310  * Generate an ICMP time exceeded message.
   3311  */
   3312 void
   3313 icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3314 {
   3315 	icmph_t	icmph;
   3316 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3317 
   3318 	mp = icmp_pkt_err_ok(mp, ira);
   3319 	if (mp == NULL)
   3320 		return;
   3321 
   3322 	bzero(&icmph, sizeof (icmph_t));
   3323 	icmph.icmph_type = ICMP_TIME_EXCEEDED;
   3324 	icmph.icmph_code = code;
   3325 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
   3326 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3327 }
   3328 
   3329 /*
   3330  * Generate an ICMP unreachable message.
   3331  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   3332  * constructed by the caller.
   3333  */
   3334 void
   3335 icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3336 {
   3337 	icmph_t	icmph;
   3338 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3339 
   3340 	mp = icmp_pkt_err_ok(mp, ira);
   3341 	if (mp == NULL)
   3342 		return;
   3343 
   3344 	bzero(&icmph, sizeof (icmph_t));
   3345 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   3346 	icmph.icmph_code = code;
   3347 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   3348 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3349 }
   3350 
   3351 /*
   3352  * Latch in the IPsec state for a stream based the policy in the listener
   3353  * and the actions in the ip_recv_attr_t.
   3354  * Called directly from TCP and SCTP.
   3355  */
   3356 boolean_t
   3357 ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
   3358 {
   3359 	ASSERT(lconnp->conn_policy != NULL);
   3360 	ASSERT(connp->conn_policy == NULL);
   3361 
   3362 	IPPH_REFHOLD(lconnp->conn_policy);
   3363 	connp->conn_policy = lconnp->conn_policy;
   3364 
   3365 	if (ira->ira_ipsec_action != NULL) {
   3366 		if (connp->conn_latch == NULL) {
   3367 			connp->conn_latch = iplatch_create();
   3368 			if (connp->conn_latch == NULL)
   3369 				return (B_FALSE);
   3370 		}
   3371 		ipsec_latch_inbound(connp, ira);
   3372 	}
   3373 	return (B_TRUE);
   3374 }
   3375 
   3376 /*
   3377  * Verify whether or not the IP address is a valid local address.
   3378  * Could be a unicast, including one for a down interface.
   3379  * If allow_mcbc then a multicast or broadcast address is also
   3380  * acceptable.
   3381  *
   3382  * In the case of a broadcast/multicast address, however, the
   3383  * upper protocol is expected to reset the src address
   3384  * to zero when we return IPVL_MCAST/IPVL_BCAST so that
   3385  * no packets are emitted with broadcast/multicast address as
   3386  * source address (that violates hosts requirements RFC 1122)
   3387  * The addresses valid for bind are:
   3388  *	(1) - INADDR_ANY (0)
   3389  *	(2) - IP address of an UP interface
   3390  *	(3) - IP address of a DOWN interface
   3391  *	(4) - valid local IP broadcast addresses. In this case
   3392  *	the conn will only receive packets destined to
   3393  *	the specified broadcast address.
   3394  *	(5) - a multicast address. In this case
   3395  *	the conn will only receive packets destined to
   3396  *	the specified multicast address. Note: the
   3397  *	application still has to issue an
   3398  *	IP_ADD_MEMBERSHIP socket option.
   3399  *
   3400  * In all the above cases, the bound address must be valid in the current zone.
   3401  * When the address is loopback, multicast or broadcast, there might be many
   3402  * matching IREs so bind has to look up based on the zone.
   3403  */
   3404 ip_laddr_t
   3405 ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
   3406     ip_stack_t *ipst, boolean_t allow_mcbc)
   3407 {
   3408 	ire_t *src_ire;
   3409 
   3410 	ASSERT(src_addr != INADDR_ANY);
   3411 
   3412 	src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
   3413 	    NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   3414 
   3415 	/*
   3416 	 * If an address other than in6addr_any is requested,
   3417 	 * we verify that it is a valid address for bind
   3418 	 * Note: Following code is in if-else-if form for
   3419 	 * readability compared to a condition check.
   3420 	 */
   3421 	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
   3422 		/*
   3423 		 * (2) Bind to address of local UP interface
   3424 		 */
   3425 		ire_refrele(src_ire);
   3426 		return (IPVL_UNICAST_UP);
   3427 	} else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
   3428 		/*
   3429 		 * (4) Bind to broadcast address
   3430 		 */
   3431 		ire_refrele(src_ire);
   3432 		if (allow_mcbc)
   3433 			return (IPVL_BCAST);
   3434 		else
   3435 			return (IPVL_BAD);
   3436 	} else if (CLASSD(src_addr)) {
   3437 		/* (5) bind to multicast address. */
   3438 		if (src_ire != NULL)
   3439 			ire_refrele(src_ire);
   3440 
   3441 		if (allow_mcbc)
   3442 			return (IPVL_MCAST);
   3443 		else
   3444 			return (IPVL_BAD);
   3445 	} else {
   3446 		ipif_t *ipif;
   3447 
   3448 		/*
   3449 		 * (3) Bind to address of local DOWN interface?
   3450 		 * (ipif_lookup_addr() looks up all interfaces
   3451 		 * but we do not get here for UP interfaces
   3452 		 * - case (2) above)
   3453 		 */
   3454 		if (src_ire != NULL)
   3455 			ire_refrele(src_ire);
   3456 
   3457 		ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
   3458 		if (ipif == NULL)
   3459 			return (IPVL_BAD);
   3460 
   3461 		/* Not a useful source? */
   3462 		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
   3463 			ipif_refrele(ipif);
   3464 			return (IPVL_BAD);
   3465 		}
   3466 		ipif_refrele(ipif);
   3467 		return (IPVL_UNICAST_DOWN);
   3468 	}
   3469 }
   3470 
   3471 /*
   3472  * Insert in the bind fanout for IPv4 and IPv6.
   3473  * The caller should already have used ip_laddr_verify_v*() before calling
   3474  * this.
   3475  */
   3476 int
   3477 ip_laddr_fanout_insert(conn_t *connp)
   3478 {
   3479 	int		error;
   3480 
   3481 	/*
   3482 	 * Allow setting new policies. For example, disconnects result
   3483 	 * in us being called. As we would have set conn_policy_cached
   3484 	 * to B_TRUE before, we should set it to B_FALSE, so that policy
   3485 	 * can change after the disconnect.
   3486 	 */
   3487 	connp->conn_policy_cached = B_FALSE;
   3488 
   3489 	error = ipcl_bind_insert(connp);
   3490 	if (error != 0) {
   3491 		if (connp->conn_anon_port) {
   3492 			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   3493 			    connp->conn_mlp_type, connp->conn_proto,
   3494 			    ntohs(connp->conn_lport), B_FALSE);
   3495 		}
   3496 		connp->conn_mlp_type = mlptSingle;
   3497 	}
   3498 	return (error);
   3499 }
   3500 
/*
 * Verify that both the source and destination addresses are valid. If
 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
 * i.e. have no route to it.  Protocols like TCP want to verify destination
 * reachability, while tunnels do not.
 *
 * Determine the route, the interface, and (optionally) the source address
 * to use to reach a given destination.
 * Note that we allow connect to broadcast and multicast addresses when
 * IPDF_ALLOW_MCBC is set.
 * first_hop and dst_addr are normally the same, but if source routing
 * they will differ; in that case the first_hop is what we'll use for the
 * routing lookup but the dce and label checks will be done on dst_addr.
 *
 * If uinfo is set, then we fill in the best available information
 * we have for the destination. This is based on (in priority order) any
 * metrics and path MTU stored in a dce_t, route metrics, and finally the
 * ill_mtu.
 *
 * Tsol note: If we have a source route then dst_addr != firsthop. But we
 * always do the label check on dst_addr.
 *
 * Arguments:
 *	src_addrp - in/out. The current value is compared against the
 *		    loopback address; it is overwritten with the selected
 *		    source only when IPDF_SELECT_SRC is set.
 *	dst_addr  - final destination; must not be INADDR_ANY.
 *	firsthop  - address used for the route (and nce) lookup.
 *	ixa	  - transmit attributes. ixa_ire, ixa_ire_generation,
 *		    ixa_postfragfn, ixa_nce, ixa_dce, ixa_dce_generation,
 *		    ixa_fragsize, various ixa_flags and (conditionally)
 *		    ixa_src_generation and ixa_pmtu are updated.
 *	uinfo	  - optional; zeroed and filled in when non-NULL.
 *	flags	  - IPDF_* flags (IPDF_VERIFY_DST, IPDF_ALLOW_MCBC,
 *		    IPDF_SELECT_SRC, IPDF_UNIQUE_DCE, IPDF_ZONE_IS_GLOBAL).
 *	mac_mode  - passed through to tsol_check_dest() on labeled systems.
 *
 * Returns 0 on success or an errno value. Besides errors handed back by
 * tsol_check_dest(), ip_select_route_v4() and ip_select_source_v4(), this
 * function itself produces ENETUNREACH, EHOSTUNREACH and EADDRNOTAVAIL.
 * Note that a non-zero return does not imply that ixa was left untouched:
 * in the unreachable (IPDF_VERIFY_DST) and disallowed broadcast/multicast
 * cases the function runs to completion with ixa fully set up, and the
 * bad_addr exits taken after the "Cache things" step leave the new
 * ixa_ire in place.
 */
int
ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
    ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
{
	ire_t		*ire = NULL;
	int		error = 0;
	ipaddr_t	setsrc;				/* RTF_SETSRC */
	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
	ip_stack_t	*ipst = ixa->ixa_ipst;
	dce_t		*dce;
	uint_t		pmtu;
	uint_t		generation;
	nce_t		*nce;
	ill_t		*ill = NULL;
	boolean_t	multirt = B_FALSE;

	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	/*
	 * We never send to zero; the ULPs map it to the loopback address.
	 * We can't allow it since we use zero to mean uninitialized in some
	 * places.
	 */
	ASSERT(dst_addr != INADDR_ANY);

	if (is_system_labeled()) {
		ts_label_t *tsl = NULL;

		/* The label check is always done against the final dst_addr */
		error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
		if (error != 0)
			return (error);
		if (tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, tsl);
		}
	}

	setsrc = INADDR_ANY;
	/*
	 * Select a route; For IPMP interfaces, we would only select
	 * a "hidden" route (i.e., going through a specific under_ill)
	 * if ixa_ifindex has been specified.
	 */
	ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error,
	    &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0)
		goto bad_addr;

	/*
	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
	 * If IPDF_VERIFY_DST is set, the destination must be reachable;
	 * Otherwise the destination needn't be reachable.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure.  May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 *
		 * In both cases we let the code continue so some reasonable
		 * information is returned to the caller. That enables the
		 * caller to use (and even cache) the IRE. conn_ip_output will
		 * use the generation mismatch path to check for the unreachable
		 * case thereby avoiding any specific check in the main path.
		 */
		ASSERT(generation == IRE_GENERATION_VERIFY);
		if (flags & IPDF_VERIFY_DST) {
			/*
			 * Set errno but continue to set up ixa_ire to be
			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
			 * That allows callers to use ip_output to get an
			 * ICMP error back.
			 */
			if (!(ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
		}
	}

	/*
	 * Broadcast/multicast destination without permission: substitute
	 * the reject IRE and report ENETUNREACH, but keep going so that
	 * ixa is still set up consistently.
	 */
	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
	    !(flags & IPDF_ALLOW_MCBC)) {
		ire_refrele(ire);
		ire = ire_reject(ipst, B_FALSE);
		generation = IRE_GENERATION_VERIFY;
		error = ENETUNREACH;
	}

	/*
	 * Cache things. From here on the reference we hold on ire belongs
	 * to ixa_ire; error paths below set ire to NULL before jumping to
	 * bad_addr so that it is not released a second time.
	 */
	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/*
	 * NOTE(review): presumably converts our hold into a "notr" hold to
	 * pair with the ire_refrele_notr() used on ixa_ire - confirm.
	 */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = generation;

	/*
	 * For multicast with multirt we have a flag passed back from
	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
	 * possible multicast address.
	 * We also need a flag for multicast since we can't check
	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
	 */
	if (multirt) {
		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}
	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
		/* Get an nce to cache. */
		nce = ire_to_nce(ire, firsthop, NULL);
		if (nce == NULL) {
			/*
			 * Allocation failure? Keep any existing ixa_nce but
			 * force the route to be verified again later.
			 */
			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		} else {
			if (ixa->ixa_nce != NULL)
				nce_refrele(ixa->ixa_nce);
			ixa->ixa_nce = nce;
		}
	}

	/*
	 * If the source address is a loopback address, the
	 * destination had best be local or multicast.
	 * If we are sending to an IRE_LOCAL using a loopback source then
	 * it had better be the same zoneid.
	 */
	if (*src_addrp == htonl(INADDR_LOOPBACK)) {
		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
	}
	if (ire->ire_type & IRE_BROADCAST) {
		/*
		 * If the ULP didn't have a specified source, then we
		 * make sure we reselect the source when sending
		 * broadcasts out different interfaces.
		 */
		if (flags & IPDF_SELECT_SRC)
			ixa->ixa_flags |= IXAF_SET_SOURCE;
		else
			ixa->ixa_flags &= ~IXAF_SET_SOURCE;
	}

	/*
	 * Does the caller want us to pick a source address?
	 */
	if (flags & IPDF_SELECT_SRC) {
		ipaddr_t	src_addr;

		/*
		 * We use ire_nexthop_ill to avoid the under ipmp
		 * interface for source address selection. Note that for ipmp
		 * probe packets, ixa_ifindex would have been specified, and
		 * the ip_select_route() invocation would have picked an ire
		 * with ire_ill pointing at an under interface.
		 */
		ill = ire_nexthop_ill(ire);

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src_addr = htonl(INADDR_LOOPBACK);
			/* Make sure we look for a better source address */
			generation = SRC_GENERATION_VERIFY;
		} else {
			error = ip_select_source_v4(ill, setsrc, dst_addr,
			    ixa->ixa_multicast_ifaddr, zoneid,
			    ipst, &src_addr, &generation, NULL);
			if (error != 0) {
				ire = NULL;	/* Stored in ixa_ire */
				goto bad_addr;
			}
		}

		/*
		 * We allow the source address to be down.
		 * However, we check that we don't use the loopback address
		 * as a source when sending out on the wire.
		 */
		if ((src_addr == htonl(INADDR_LOOPBACK)) &&
		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}

		*src_addrp = src_addr;
		ixa->ixa_src_generation = generation;
	}

	/* Find the destination cache entry; keyed on the final dst_addr */
	if (flags & IPDF_UNIQUE_DCE) {
		/* Fallback to the default dce if allocation fails */
		dce = dce_lookup_and_add_v4(dst_addr, ipst);
		if (dce != NULL)
			generation = dce->dce_generation;
		else
			dce = dce_lookup_v4(dst_addr, ipst, &generation);
	} else {
		dce = dce_lookup_v4(dst_addr, ipst, &generation);
	}
	ASSERT(dce != NULL);
	if (ixa->ixa_dce != NULL)
		dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	/* Same hold conversion as done for ixa_ire above */
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = generation;

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	/*
	 * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
	 * However, we can't do it for IPv4 multicast or broadcast.
	 */
	if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;

	/*
	 * Set initial value for fragmentation limit. Either conn_ip_output
	 * or the ULP might update it when there are routing changes.
	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
	 */
	pmtu = ip_get_pmtu(ixa);
	ixa->ixa_fragsize = pmtu;
	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
		ixa->ixa_pmtu = pmtu;

	/*
	 * Extract information useful for some transports.
	 * First we look for DCE metrics. Then we take what we have in
	 * the metrics in the route, where the offlink is used if we have
	 * one.
	 */
	if (uinfo != NULL) {
		bzero(uinfo, sizeof (*uinfo));

		if (dce->dce_flags & DCEF_UINFO)
			*uinfo = dce->dce_uinfo;

		rts_merge_metrics(uinfo, &ire->ire_metrics);

		/* Allow ire_metrics to decrease the path MTU from above */
		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
			uinfo->iulp_mtu = pmtu;

		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
	}

	/* ill is only held when a source address was selected above */
	if (ill != NULL)
		ill_refrele(ill);

	/* May be non-zero (unreachable / mcbc not allowed) with ixa set up */
	return (error);

bad_addr:
	/* ire is NULL here if its reference was already moved to ixa_ire */
	if (ire != NULL)
		ire_refrele(ire);

	if (ill != NULL)
		ill_refrele(ill);

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	return (error);
}
   3833 
   3834 
   3835 /*
   3836  * Get the base MTU for the case when path MTU discovery is not used.
   3837  * Takes the MTU of the IRE into account.
   3838  */
   3839 uint_t
   3840 ip_get_base_mtu(ill_t *ill, ire_t *ire)
   3841 {
   3842 	uint_t mtu = ill->ill_mtu;
   3843 	uint_t iremtu = ire->ire_metrics.iulp_mtu;
   3844 
   3845 	if (iremtu != 0 && iremtu < mtu)
   3846 		mtu = iremtu;
   3847 
   3848 	return (mtu);
   3849 }
   3850 
   3851 /*
   3852  * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
   3853  * Assumes that ixa_ire, dce, and nce have already been set up.
   3854  *
   3855  * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
   3856  * We avoid path MTU discovery if it is disabled with ndd.
 * Furthermore, if the path MTU is too small, then we don't set DF for IPv4.
   3858  *
   3859  * NOTE: We also used to turn it off for source routed packets. That
   3860  * is no longer required since the dce is per final destination.
   3861  */
   3862 uint_t
   3863 ip_get_pmtu(ip_xmit_attr_t *ixa)
   3864 {
   3865 	ip_stack_t	*ipst = ixa->ixa_ipst;
   3866 	dce_t		*dce;
   3867 	nce_t		*nce;
   3868 	ire_t		*ire;
   3869 	uint_t		pmtu;
   3870 
   3871 	ire = ixa->ixa_ire;
   3872 	dce = ixa->ixa_dce;
   3873 	nce = ixa->ixa_nce;
   3874 
   3875 	/*
   3876 	 * If path MTU discovery has been turned off by ndd, then we ignore
   3877 	 * any dce_pmtu and for IPv4 we will not set DF.
   3878 	 */
   3879 	if (!ipst->ips_ip_path_mtu_discovery)
   3880 		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
   3881 
   3882 	pmtu = IP_MAXPACKET;
   3883 	/*
   3884 	 * Decide whether whether IPv4 sets DF
   3885 	 * For IPv6 "no DF" means to use the 1280 mtu
   3886 	 */
   3887 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3888 		ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3889 	} else {
   3890 		ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3891 		if (!(ixa->ixa_flags & IXAF_IS_IPV4))
   3892 			pmtu = IPV6_MIN_MTU;
   3893 	}
   3894 
   3895 	/* Check if the PMTU is to old before we use it */
   3896 	if ((dce->dce_flags & DCEF_PMTU) &&
   3897 	    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
   3898 	    ipst->ips_ip_pathmtu_interval) {
   3899 		/*
   3900 		 * Older than 20 minutes. Drop the path MTU information.
   3901 		 */
   3902 		mutex_enter(&dce->dce_lock);
   3903 		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
   3904 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   3905 		mutex_exit(&dce->dce_lock);
   3906 		dce_increment_generation(dce);
   3907 	}
   3908 
   3909 	/* The metrics on the route can lower the path MTU */
   3910 	if (ire->ire_metrics.iulp_mtu != 0 &&
   3911 	    ire->ire_metrics.iulp_mtu < pmtu)
   3912 		pmtu = ire->ire_metrics.iulp_mtu;
   3913 
   3914 	/*
   3915 	 * If the path MTU is smaller than some minimum, we still use dce_pmtu
   3916 	 * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
   3917 	 * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
   3918 	 */
   3919 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3920 		if (dce->dce_flags & DCEF_PMTU) {
   3921 			if (dce->dce_pmtu < pmtu)
   3922 				pmtu = dce->dce_pmtu;
   3923 
   3924 			if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
   3925 				ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
   3926 				ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3927 			} else {
   3928 				ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3929 				ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3930 			}
   3931 		} else {
   3932 			ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3933 			ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3934 		}
   3935 	}
   3936 
   3937 	/*
   3938 	 * If we have an IRE_LOCAL we use the loopback mtu instead of
   3939 	 * the ill for going out the wire i.e., IRE_LOCAL gets the same
   3940 	 * mtu as IRE_LOOPBACK.
   3941 	 */
   3942 	if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
   3943 		uint_t loopback_mtu;
   3944 
   3945 		loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
   3946 		    ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
   3947 
   3948 		if (loopback_mtu < pmtu)
   3949 			pmtu = loopback_mtu;
   3950 	} else if (nce != NULL) {
   3951 		/*
   3952 		 * Make sure we don't exceed the interface MTU.
   3953 		 * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
   3954 		 * an ill. We'd use the above IP_MAXPACKET in that case just
   3955 		 * to tell the transport something larger than zero.
   3956 		 */
   3957 		if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
   3958 			pmtu = nce->nce_common->ncec_ill->ill_mtu;
   3959 		if (nce->nce_common->ncec_ill != nce->nce_ill &&
   3960 		    nce->nce_ill->ill_mtu < pmtu) {
   3961 			/*
   3962 			 * for interfaces in an IPMP group, the mtu of
   3963 			 * the nce_ill (under_ill) could be different
   3964 			 * from the mtu of the ncec_ill, so we take the
   3965 			 * min of the two.
   3966 			 */
   3967 			pmtu = nce->nce_ill->ill_mtu;
   3968 		}
   3969 	}
   3970 
   3971 	/*
   3972 	 * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
   3973 	 * Only applies to IPv6.
   3974 	 */
   3975 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   3976 		if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
   3977 			switch (ixa->ixa_use_min_mtu) {
   3978 			case IPV6_USE_MIN_MTU_MULTICAST:
   3979 				if (ire->ire_type & IRE_MULTICAST)
   3980 					pmtu = IPV6_MIN_MTU;
   3981 				break;
   3982 			case IPV6_USE_MIN_MTU_ALWAYS:
   3983 				pmtu = IPV6_MIN_MTU;
   3984 				break;
   3985 			case IPV6_USE_MIN_MTU_NEVER:
   3986 				break;
   3987 			}
   3988 		} else {
   3989 			/* Default is IPV6_USE_MIN_MTU_MULTICAST */
   3990 			if (ire->ire_type & IRE_MULTICAST)
   3991 				pmtu = IPV6_MIN_MTU;
   3992 		}
   3993 	}
   3994 
   3995 	/*
   3996 	 * After receiving an ICMPv6 "packet too big" message with a
   3997 	 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
   3998 	 * will insert a 8-byte fragment header in every packet. We compensate
   3999 	 * for those cases by returning a smaller path MTU to the ULP.
   4000 	 *
   4001 	 * In the case of CGTP then ip_output will add a fragment header.
   4002 	 * Make sure there is room for it by telling a smaller number
   4003 	 * to the transport.
   4004 	 *
   4005 	 * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here
   4006 	 * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu()
   4007 	 * which is the size of the packets it can send.
   4008 	 */
   4009 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   4010 		if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
   4011 		    (ire->ire_flags & RTF_MULTIRT) ||
   4012 		    (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
   4013 			pmtu -= sizeof (ip6_frag_t);
   4014 			ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
   4015 		}
   4016 	}
   4017 
   4018 	return (pmtu);
   4019 }
   4020 
   4021 /*
   4022  * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
   4023  * the final piece where we don't.  Return a pointer to the first mblk in the
   4024  * result, and update the pointer to the next mblk to chew on.  If anything
   4025  * goes wrong (i.e., dupb fails), we waste everything in sight and return a
   4026  * NULL pointer.
   4027  */
   4028 mblk_t *
   4029 ip_carve_mp(mblk_t **mpp, ssize_t len)
   4030 {
   4031 	mblk_t	*mp0;
   4032 	mblk_t	*mp1;
   4033 	mblk_t	*mp2;
   4034 
   4035 	if (!len || !mpp || !(mp0 = *mpp))
   4036 		return (NULL);
   4037 	/* If we aren't going to consume the first mblk, we need a dup. */
   4038 	if (mp0->b_wptr - mp0->b_rptr > len) {
   4039 		mp1 = dupb(mp0);
   4040 		if (mp1) {
   4041 			/* Partition the data between the two mblks. */
   4042 			mp1->b_wptr = mp1->b_rptr + len;
   4043 			mp0->b_rptr = mp1->b_wptr;
   4044 			/*
   4045 			 * after adjustments if mblk not consumed is now
   4046 			 * unaligned, try to align it. If this fails free
   4047 			 * all messages and let upper layer recover.
   4048 			 */
   4049 			if (!OK_32PTR(mp0->b_rptr)) {
   4050 				if (!pullupmsg(mp0, -1)) {
   4051 					freemsg(mp0);
   4052 					freemsg(mp1);
   4053 					*mpp = NULL;
   4054 					return (NULL);
   4055 				}
   4056 			}
   4057 		}
   4058 		return (mp1);
   4059 	}
   4060 	/* Eat through as many mblks as we need to get len bytes. */
   4061 	len -= mp0->b_wptr - mp0->b_rptr;
   4062 	for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
   4063 		if (mp2->b_wptr - mp2->b_rptr > len) {
   4064 			/*
   4065 			 * We won't consume the entire last mblk.  Like
   4066 			 * above, dup and partition it.
   4067 			 */
   4068 			mp1->b_cont = dupb(mp2);
   4069 			mp1 = mp1->b_cont;
   4070 			if (!mp1) {
   4071 				/*
   4072 				 * Trouble.  Rather than go to a lot of
   4073 				 * trouble to clean up, we free the messages.
   4074 				 * This won't be any worse than losing it on
   4075 				 * the wire.
   4076 				 */
   4077 				freemsg(mp0);
   4078 				freemsg(mp2);
   4079 				*mpp = NULL;
   4080 				return (NULL);
   4081 			}
   4082 			mp1->b_wptr = mp1->b_rptr + len;
   4083 			mp2->b_rptr = mp1->b_wptr;
   4084 			/*
   4085 			 * after adjustments if mblk not consumed is now
   4086 			 * unaligned, try to align it. If this fails free
   4087 			 * all messages and let upper layer recover.
   4088 			 */
   4089 			if (!OK_32PTR(mp2->b_rptr)) {
   4090 				if (!pullupmsg(mp2, -1)) {
   4091 					freemsg(mp0);
   4092 					freemsg(mp2);
   4093 					*mpp = NULL;
   4094 					return (NULL);
   4095 				}
   4096 			}
   4097 			*mpp = mp2;
   4098 			return (mp0);
   4099 		}
   4100 		/* Decrement len by the amount we just got. */
   4101 		len -= mp2->b_wptr - mp2->b_rptr;
   4102 	}
   4103 	/*
   4104 	 * len should be reduced to zero now.  If not our caller has
   4105 	 * screwed up.
   4106 	 */
   4107 	if (len) {
   4108 		/* Shouldn't happen! */
   4109 		freemsg(mp0);
   4110 		*mpp = NULL;
   4111 		return (NULL);
   4112 	}
   4113 	/*
   4114 	 * We consumed up to exactly the end of an mblk.  Detach the part
   4115 	 * we are returning from the rest of the chain.
   4116 	 */
   4117 	mp1->b_cont = NULL;
   4118 	*mpp = mp2;
   4119 	return (mp0);
   4120 }
   4121 
/*
 * The ill stream is being unplumbed. Called from ip_close for a module
 * close.
 *
 * In order, this: enters the ipsq, marks the ill and all of its ipifs
 * condemned, flushes deferred DLPI messages, stops fragment reassembly,
 * calls ill_delete, waits for the ill to become freeable and for ipsq
 * waiters to go away, calls ill_delete_tail, waits for the ARP unbind,
 * drains flow-controlled conns, detaches the ill from the shared ARP
 * state, and finally destroys the ill's locks and frees the ill.
 *
 * Always returns 0.
 */
int
ip_modclose(ill_t *ill)
{
	boolean_t success;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	/*
	 * q, ipst and ai are captured up front: they are still needed after
	 * ill->ill_ipst has been cleared (see the ASSERT after
	 * arp_unbind_complete) and after the ill itself has been freed by
	 * mi_close_free.
	 */
	queue_t	*q = ill->ill_rq;
	ip_stack_t	*ipst = ill->ill_ipst;
	int	i;
	arl_ill_common_t *ai = ill->ill_common;

	/*
	 * The punlink prior to this may have initiated a capability
	 * negotiation. But ipsq_enter will block until that finishes or
	 * times out.
	 */
	success = ipsq_enter(ill, B_FALSE, NEW_OP);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill.
	 * and cause ipsq_enter to return failure.
	 */
	ASSERT(success);
	/* Remembered now because ipsq_exit is called after the ill is freed */
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns  error if ILL_CONDEMNED is set
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Send all the deferred DLPI messages downstream which came in
	 * during the small window right before ipsq_enter(). We do this
	 * without waiting for the ACKs because all the ACKs for M_PROTO
	 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
	 */
	ill_dlpi_send_deferred(ill);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_freeable checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away. Then we call ill_delete_tail which does the
	 * DL_UNBIND_REQ with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	while (!ill_is_freeable(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/*
	 * ill_delete_tail drops reference on ill_ipst, but we need to keep
	 * it held until the end of the function since the cleanup
	 * below needs to be able to use the ip_stack_t.
	 */
	netstack_hold(ipst->ips_netstack);

	/* qprocsoff is done via ill_delete_tail */
	ill_delete_tail(ill);
	/*
	 * synchronously wait for arp stream to unbind. After this, we
	 * cannot get any data packets up from the driver.
	 */
	arp_unbind_complete(ill);
	ASSERT(ill->ill_ipst == NULL);

	/*
	 * Walk through all conns and qenable those that have queued data.
	 * Close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	/*
	 * NOTE(review): the debug message below names ip_wsrv although it is
	 * emitted from ip_modclose; left unchanged here.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	for (i = 0; i < TX_FANOUT_SIZE; i++) {
		conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
	}

	/*
	 * ai can be null if this is an IPv6 ill, or if the IPv4
	 * stream is being torn down before ARP was plumbed (e.g.,
	 * /sbin/ifconfig plumbing a stream twice, and encountering
	 * an error).
	 */
	if (ai != NULL) {
		ASSERT(!ill->ill_isv6);
		mutex_enter(&ai->ai_lock);
		ai->ai_ill = NULL;
		if (ai->ai_arl == NULL) {
			/*
			 * Neither an ill nor an arl refers to the shared
			 * state any more, so it is destroyed here.
			 */
			mutex_destroy(&ai->ai_lock);
			kmem_free(ai, sizeof (*ai));
		} else {
			/*
			 * The arl side is still attached; presumably it
			 * waits on ai_ill_unplumb_done for this unplumb.
			 */
			cv_signal(&ai->ai_ill_unplumb_done);
			mutex_exit(&ai->ai_lock);
		}
	}

	mutex_enter(&ipst->ips_ip_mi_lock);
	mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
	mutex_exit(&ipst->ips_ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mutex_destroy(&ill->ill_saved_ire_lock);
	mutex_destroy(&ill->ill_lock);
	rw_destroy(&ill->ill_mcast_lock);
	mutex_destroy(&ill->ill_mcast_serializer);
	list_destroy(&ill->ill_nce);

	/*
	 * Now we are done with the module close pieces that
	 * need the netstack_t.
	 */
	netstack_rele(ipst->ips_netstack);

	/* The ill is gone after this; only the saved q and ipsq are used. */
	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	ipsq_exit(ipsq);

	return (0);
}
   4282 
   4283 /*
   4284  * This is called as part of close() for IP, UDP, ICMP, and RTS
   4285  * in order to quiesce the conn.
   4286  */
   4287 void
   4288 ip_quiesce_conn(conn_t *connp)
   4289 {
   4290 	boolean_t	drain_cleanup_reqd = B_FALSE;
   4291 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
   4292 	boolean_t	ilg_cleanup_reqd = B_FALSE;
   4293 	ip_stack_t	*ipst;
   4294 
   4295 	ASSERT(!IPCL_IS_TCP(connp));
   4296 	ipst = connp->conn_netstack->netstack_ip;
   4297 
   4298 	/*
   4299 	 * Mark the conn as closing, and this conn must not be
   4300 	 * inserted in future into any list. Eg. conn_drain_insert(),
   4301 	 * won't insert this conn into the conn_drain_list.
   4302 	 *
   4303 	 * conn_idl, and conn_ilg cannot get set henceforth.
   4304 	 */
   4305 	mutex_enter(&connp->conn_lock);
   4306 	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
   4307 	connp->conn_state_flags |= CONN_CLOSING;
   4308 	if (connp->conn_idl != NULL)
   4309 		drain_cleanup_reqd = B_TRUE;
   4310 	if (connp->conn_oper_pending_ill != NULL)
   4311 		conn_ioctl_cleanup_reqd = B_TRUE;
   4312 	if (connp->conn_dhcpinit_ill != NULL) {
   4313 		ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
   4314 		atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
   4315 		ill_set_inputfn(connp->conn_dhcpinit_ill);
   4316 		connp->conn_dhcpinit_ill = NULL;
   4317 	}
   4318 	if (connp->conn_ilg != NULL)
   4319 		ilg_cleanup_reqd = B_TRUE;
   4320 	mutex_exit(&connp->conn_lock);
   4321 
   4322 	if (conn_ioctl_cleanup_reqd)
   4323 		conn_ioctl_cleanup(connp);
   4324 
   4325 	if (is_system_labeled() && connp->conn_anon_port) {
   4326 		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   4327 		    connp->conn_mlp_type, connp->conn_proto,
   4328 		    ntohs(connp->conn_lport), B_FALSE);
   4329 		connp->conn_anon_port = 0;
   4330 	}
   4331 	connp->conn_mlp_type = mlptSingle;
   4332 
   4333 	/*
   4334 	 * Remove this conn from any fanout list it is on.
   4335 	 * and then wait for any threads currently operating
   4336 	 * on this endpoint to finish
   4337 	 */
   4338 	ipcl_hash_remove(connp);
   4339 
   4340 	/*
   4341 	 * Remove this conn from the drain list, and do
   4342 	 * any other cleanup that may be required.
   4343 	 * (Only non-tcp conns may have a non-null conn_idl.
   4344 	 * TCP conns are never flow controlled, and
   4345 	 * conn_idl will be null)
   4346 	 */
   4347 	if (drain_cleanup_reqd && connp->conn_idl != NULL) {
   4348 		mutex_enter(&connp->conn_idl->idl_lock);
   4349 		conn_drain_tail(connp, B_TRUE);
   4350 		mutex_exit(&connp->conn_idl->idl_lock);
   4351 	}
   4352 
   4353 	if (connp == ipst->ips_ip_g_mrouter)
   4354 		(void) ip_mrouter_done(ipst);
   4355 
   4356 	if (ilg_cleanup_reqd)
   4357 		ilg_delete_all(connp);
   4358 
   4359 	/*
   4360 	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
   4361 	 * callers from write side can't be there now because close
   4362 	 * is in progress. The only other caller is ipcl_walk
   4363 	 * which checks for the condemned flag.
   4364 	 */
   4365 	mutex_enter(&connp->conn_lock);
   4366 	connp->conn_state_flags |= CONN_CONDEMNED;
   4367 	while (connp->conn_ref != 1)
   4368 		cv_wait(&connp->conn_cv, &connp->conn_lock);
   4369 	connp->conn_state_flags |= CONN_QUIESCED;
   4370 	mutex_exit(&connp->conn_lock);
   4371 }
   4372 
   4373 /* ARGSUSED */
   4374 int
   4375 ip_close(queue_t *q, int flags)
   4376 {
   4377 	conn_t		*connp;
   4378 
   4379 	/*
   4380 	 * Call the appropriate delete routine depending on whether this is
   4381 	 * a module or device.
   4382 	 */
   4383 	if (WR(q)->q_next != NULL) {
   4384 		/* This is a module close */
   4385 		return (ip_modclose((ill_t *)q->q_ptr));
   4386 	}
   4387 
   4388 	connp = q->q_ptr;
   4389 	ip_quiesce_conn(connp);
   4390 
   4391 	qprocsoff(q);
   4392 
   4393 	/*
   4394 	 * Now we are truly single threaded on this stream, and can
   4395 	 * delete the things hanging off the connp, and finally the connp.
   4396 	 * We removed this connp from the fanout list, it cannot be
   4397 	 * accessed thru the fanouts, and we already waited for the
   4398 	 * conn_ref to drop to 0. We are already in close, so
   4399 	 * there cannot be any other thread from the top. qprocsoff
   4400 	 * has completed, and service has completed or won't run in
   4401 	 * future.
   4402 	 */
   4403 	ASSERT(connp->conn_ref == 1);
   4404 
   4405 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   4406 
   4407 	connp->conn_ref--;
   4408 	ipcl_conn_destroy(connp);
   4409 
   4410 	q->q_ptr = WR(q)->q_ptr = NULL;
   4411 	return (0);
   4412 }
   4413 
   4414 /*
   4415  * Wapper around putnext() so that ip_rts_request can merely use
   4416  * conn_recv.
   4417  */
   4418 /*ARGSUSED2*/
   4419 static void
   4420 ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4421 {
   4422 	conn_t *connp = (conn_t *)arg1;
   4423 
   4424 	putnext(connp->conn_rq, mp);
   4425 }
   4426 
/*
 * Dummy in case ICMP error delivery is attempted to a /dev/ip instance.
 * The message is simply freed; arg1, arg2 and ira are not used.
 */
/* ARGSUSED */
static void
ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	freemsg(mp);
}
   4434 
/*
 * Called when the module is about to be unloaded.
 *
 * Tears down the module-global state: the per-protocol globals, the
 * minor number arenas, the DEBUG-only thread tracking state, and finally
 * the NS_IP netstack registration.
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();

	/* Global state of the individual protocols and subsystems */
	icmp_ddi_g_destroy();
	rts_ddi_g_destroy();
	udp_ddi_g_destroy();
	sctp_ddi_g_destroy();
	tcp_ddi_g_destroy();
	ilb_ddi_g_destroy();
	dce_g_destroy();
	ipsec_policy_g_destroy();
	ipcl_g_destroy();
	ip_net_g_destroy();
	ip_ire_g_fini();
	/* The second (large) minor arena only exists on 64-bit kernels */
	inet_minor_destroy(ip_minor_arena_sa);
#if defined(_LP64)
	inet_minor_destroy(ip_minor_arena_la);
#endif

#ifdef DEBUG
	/* Thread reference tracking is only set up in DEBUG kernels */
	list_destroy(&ip_thread_list);
	rw_destroy(&ip_thread_rwlock);
	tsd_destroy(&ip_thread_data);
#endif

	netstack_unregister(NS_IP);
}
   4467 
/*
 * First step in cleanup of an IP stack instance; registered as the
 * netstack shutdown callback. arg is the ip_stack_t of the instance.
 * Nothing is freed here; that happens in ip_stack_fini().
 */
/* ARGSUSED */
static void
ip_stack_shutdown(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;

#ifdef NS_DEBUG
	printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
#endif

	/*
	 * Perform cleanup for special interfaces (loopback and IPMP).
	 */
	ip_interface_cleanup(ipst);

	/*
	 * The *_hook_shutdown()s start the process of notifying any
	 * consumers that things are going away.... nothing is destroyed.
	 */
	ipv4_hook_shutdown(ipst);
	ipv6_hook_shutdown(ipst);
	arp_hook_shutdown(ipst);

	/*
	 * Set the quit flag and signal the condition variable under its
	 * lock; presumably this tells the capability taskq dispatcher
	 * thread of this stack to exit.
	 */
	mutex_enter(&ipst->ips_capab_taskq_lock);
	ipst->ips_capab_taskq_quit = B_TRUE;
	cv_signal(&ipst->ips_capab_taskq_cv);
	mutex_exit(&ipst->ips_capab_taskq_lock);
}
   4499 
/*
 * Free the IP stack instance; registered as the netstack destroy
 * callback. arg is the ip_stack_t of the instance, stackid identifies the
 * stack for the kstat teardown. The ip_stack_t itself is freed at the
 * end, so it must not be used after this returns.
 */
static void
ip_stack_fini(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;
	int ret;

#ifdef NS_DEBUG
	printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
#endif
	/*
	 * At this point, all of the notifications that the events and
	 * protocols are going away have been run, meaning that we can
	 * now set about starting to clean things up.
	 */
	ipobs_fini(ipst);
	ipv4_hook_destroy(ipst);
	ipv6_hook_destroy(ipst);
	arp_hook_destroy(ipst);
	ip_net_destroy(ipst);

	mutex_destroy(&ipst->ips_capab_taskq_lock);
	cv_destroy(&ipst->ips_capab_taskq_cv);

	ipmp_destroy(ipst);
	rw_destroy(&ipst->ips_srcid_lock);

	/* Remove the kstats and clear the counters they exported */
	ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
	ipst->ips_ip_mibkp = NULL;
	icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
	ipst->ips_icmp_mibkp = NULL;
	ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
	ipst->ips_ip_kstat = NULL;
	bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
	ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
	ipst->ips_ip6_kstat = NULL;
	bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));

	/* Free the ndd tables; sizes match the lcl_* templates */
	nd_free(&ipst->ips_ip_g_nd);
	kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr));
	ipst->ips_param_arr = NULL;
	kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
	ipst->ips_ndp_arr = NULL;

	dce_stack_destroy(ipst);
	ip_mrouter_stack_destroy(ipst);

	mutex_destroy(&ipst->ips_ip_mi_lock);
	rw_destroy(&ipst->ips_ill_g_usesrc_lock);
	rw_destroy(&ipst->ips_ip_g_nd_lock);

	/*
	 * Cancel the IGMP and MLD timers. For each of them: a return of -1
	 * from untimeout() is expected only when no timeout id was recorded
	 * (id == 0); otherwise an id must have been recorded and it is
	 * cleared.
	 */
	ret = untimeout(ipst->ips_igmp_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_igmp_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
		ipst->ips_igmp_slowtimeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_slowtimeout_id != 0);
		ipst->ips_mld_slowtimeout_id = 0;
	}

	/* The timer locks are destroyed only after the timers are cancelled */
	mutex_destroy(&ipst->ips_igmp_timer_lock);
	mutex_destroy(&ipst->ips_mld_timer_lock);
	mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
	mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
	mutex_destroy(&ipst->ips_ip_addr_avail_lock);
	rw_destroy(&ipst->ips_ill_g_lock);

	ip_ire_fini(ipst);
	ip6_asp_free(ipst);
	conn_drain_fini(ipst);
	ipcl_destroy(ipst);

	/* Destroy each ndp_g_t's lock before freeing the structure */
	mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
	mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
	kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
	ipst->ips_ndp4 = NULL;
	kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
	ipst->ips_ndp6 = NULL;

	if (ipst->ips_loopback_ksp != NULL) {
		kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
		ipst->ips_loopback_ksp = NULL;
	}

	kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
	ipst->ips_phyint_g_list = NULL;
	kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
	ipst->ips_ill_g_heads = NULL;

	ldi_ident_release(ipst->ips_ldi_ident);
	/* Last step: the stack instance itself */
	kmem_free(ipst, sizeof (*ipst));
}
   4614 
   4615 /*
   4616  * This function is called from the TSD destructor, and is used to debug
   4617  * reference count issues in IP. See block comment in <inet/ip_if.h> for
   4618  * details.
   4619  */
   4620 static void
   4621 ip_thread_exit(void *phash)
   4622 {
   4623 	th_hash_t *thh = phash;
   4624 
   4625 	rw_enter(&ip_thread_rwlock, RW_WRITER);
   4626 	list_remove(&ip_thread_list, thh);
   4627 	rw_exit(&ip_thread_rwlock);
   4628 	mod_hash_destroy_hash(thh->thh_hash);
   4629 	kmem_free(thh, sizeof (*thh));
   4630 }
   4631 
/*
 * Called when the IP kernel module is loaded into the kernel.
 *
 * Sets up the module-global state: the squeue flag, the minor number
 * arena(s), the global state of the individual protocols and subsystems,
 * and the NS_IP netstack registration, through which ip_stack_init(),
 * ip_stack_shutdown() and ip_stack_fini() get called for every stack.
 * Failure to create a minor arena is fatal (CE_PANIC).
 */
void
ip_ddi_init(void)
{
	ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);

	/*
	 * For IP and TCP the minor numbers should start from 2 since we have 4
	 * initial devices: ip, ip6, tcp, tcp6.
	 *
	 * NOTE(review): the arenas below start at INET_MIN_DEV + 2; the
	 * "2" versus "4 initial devices" wording above is kept from the
	 * original comment and should be confirmed.
	 */
	/*
	 * If this is a 64-bit kernel, then create two separate arenas -
	 * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the
	 * other for socket apps in the range 2^^18 through 2^^32-1.
	 * Otherwise a single arena up to MAXMIN is created.
	 */
	ip_minor_arena_la = NULL;
	ip_minor_arena_sa = NULL;
#if defined(_LP64)
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
	if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
	    MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_la creation failed\n");
	}
#else
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
#endif
	/* Convert the polling interval from milliseconds to clock ticks */
	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);

	ipcl_g_init();
	ip_ire_g_init();
	ip_net_g_init();

#ifdef DEBUG
	/*
	 * Thread reference tracking, DEBUG kernels only: ip_thread_exit is
	 * the TSD destructor that cleans up the per-thread th_hash_t.
	 */
	tsd_create(&ip_thread_data, ip_thread_exit);
	rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
	list_create(&ip_thread_list, sizeof (th_hash_t),
	    offsetof(th_hash_t, thh_link));
#endif
	ipsec_policy_g_init();
	tcp_ddi_g_init();
	sctp_ddi_g_init();
	dce_g_init();

	/*
	 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of ip_stack_t's.
	 */
	netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
	    ip_stack_fini);

	tnet_init();

	udp_ddi_g_init();
	rts_ddi_g_init();
	icmp_ddi_g_init();
	ilb_ddi_g_init();
}
   4701 
   4702 /*
   4703  * Initialize the IP stack instance.
   4704  */
   4705 static void *
   4706 ip_stack_init(netstackid_t stackid, netstack_t *ns)
   4707 {
   4708 	ip_stack_t	*ipst;
   4709 	ipparam_t	*pa;
   4710 	ipndp_t		*na;
   4711 	major_t		major;
   4712 
   4713 #ifdef NS_DEBUG
   4714 	printf("ip_stack_init(stack %d)\n", stackid);
   4715 #endif
   4716 
   4717 	ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
   4718 	ipst->ips_netstack = ns;
   4719 
   4720 	ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
   4721 	    KM_SLEEP);
   4722 	ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
   4723 	    KM_SLEEP);
   4724 	ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4725 	ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4726 	mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4727 	mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4728 
   4729 	rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL);
   4730 	mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4731 	ipst->ips_igmp_deferred_next = INFINITY;
   4732 	mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4733 	ipst->ips_mld_deferred_next = INFINITY;
   4734 	mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4735 	mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4736 	mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
   4737 	mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
   4738 	rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
   4739 	rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
   4740 
   4741 	ipcl_init(ipst);
   4742 	ip_ire_init(ipst);
   4743 	ip6_asp_init(ipst);
   4744 	ipif_init(ipst);
   4745 	conn_drain_init(ipst);
   4746 	ip_mrouter_stack_init(ipst);
   4747 	dce_stack_init(ipst);
   4748 
   4749 	ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT;
   4750 	ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
   4751 	ipst->ips_ipv6_frag_timeout = IPV6_FRAG_TIMEOUT;
   4752 	ipst->ips_ipv6_frag_timo_ms = IPV6_FRAG_TIMEOUT * 1000;
   4753 
   4754 	ipst->ips_ip_multirt_log_interval = 1000;
   4755 
   4756 	ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT;
   4757 	ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT;
   4758 	ipst->ips_ill_index = 1;
   4759 
   4760 	ipst->ips_saved_ip_g_forward = -1;
   4761 	ipst->ips_reg_vif_num = ALL_VIFS; 	/* Index to Register vif */
   4762 
   4763 	pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
   4764 	ipst->ips_param_arr = pa;
   4765 	bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr));
   4766 
   4767 	na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP);
   4768 	ipst->ips_ndp_arr = na;
   4769 	bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
   4770 	ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data =
   4771 	    (caddr_t)&ipst->ips_ip_g_forward;
   4772 	ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data =
   4773 	    (caddr_t)&ipst->ips_ipv6_forward;
   4774 	ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name,
   4775 	    "ip_cgtp_filter") == 0);
   4776 	ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data =
   4777 	    (caddr_t)&ipst->ips_ip_cgtp_filter;
   4778 
   4779 	(void) ip_param_register(&ipst->ips_ip_g_nd,
   4780 	    ipst->ips_param_arr, A_CNT(lcl_param_arr),
   4781 	    ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr));
   4782 
   4783 	ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
   4784 	ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
   4785 	ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
   4786 	ipst->ips_ip6_kstat =
   4787 	    ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
   4788 
   4789 	ipst->ips_ip_src_id = 1;
   4790 	rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
   4791 
   4792 	ipst->ips_src_generation = SRC_GENERATION_INITIAL;
   4793 
   4794 	ip_net_init(ipst, ns);
   4795 	ipv4_hook_init(ipst);
   4796 	ipv6_hook_init(ipst);
   4797 	arp_hook_init(ipst);
   4798 	ipmp_init(ipst);
   4799 	ipobs_init(ipst);
   4800 
   4801 	/*
   4802 	 * Create the taskq dispatcher thread and initialize related stuff.
   4803 	 */
   4804 	ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
   4805 	    ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
   4806 	mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
   4807 	cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
   4808 
   4809 	major = mod_name_to_major(INET_NAME);
   4810 	(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
   4811 	return (ipst);
   4812 }
   4813 
   4814 /*
   4815  * Allocate and initialize a DLPI template of the specified length.  (May be
   4816  * called as writer.)
   4817  */
   4818 mblk_t *
   4819 ip_dlpi_alloc(size_t len, t_uscalar_t prim)
   4820 {
   4821 	mblk_t	*mp;
   4822 
   4823 	mp = allocb(len, BPRI_MED);
   4824 	if (!mp)
   4825 		return (NULL);
   4826 
   4827 	/*
   4828 	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
   4829 	 * of which we don't seem to use) are sent with M_PCPROTO, and
   4830 	 * that other DLPI are M_PROTO.
   4831 	 */
   4832 	if (prim == DL_INFO_REQ) {
   4833 		mp->b_datap->db_type = M_PCPROTO;
   4834 	} else {
   4835 		mp->b_datap->db_type = M_PROTO;
   4836 	}
   4837 
   4838 	mp->b_wptr = mp->b_rptr + len;
   4839 	bzero(mp->b_rptr, len);
   4840 	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
   4841 	return (mp);
   4842 }
   4843 
   4844 /*
   4845  * Allocate and initialize a DLPI notification.  (May be called as writer.)
   4846  */
   4847 mblk_t *
   4848 ip_dlnotify_alloc(uint_t notification, uint_t data)
   4849 {
   4850 	dl_notify_ind_t	*notifyp;
   4851 	mblk_t		*mp;
   4852 
   4853 	if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
   4854 		return (NULL);
   4855 
   4856 	notifyp = (dl_notify_ind_t *)mp->b_rptr;
   4857 	notifyp->dl_notification = notification;
   4858 	notifyp->dl_data = data;
   4859 	return (mp);
   4860 }
   4861 
   4862 /*
   4863  * Debug formatting routine.  Returns a character string representation of the
   4864  * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
   4865  * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
   4866  *
   4867  * Once the ndd table-printing interfaces are removed, this can be changed to
   4868  * standard dotted-decimal form.
   4869  */
   4870 char *
   4871 ip_dot_addr(ipaddr_t addr, char *buf)
   4872 {
   4873 	uint8_t *ap = (uint8_t *)&addr;
   4874 
   4875 	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
   4876 	    ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
   4877 	return (buf);
   4878 }
   4879 
   4880 /*
   4881  * Write the given MAC address as a printable string in the usual colon-
   4882  * separated format.
   4883  */
   4884 const char *
   4885 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
   4886 {
   4887 	char *bp;
   4888 
   4889 	if (alen == 0 || buflen < 4)
   4890 		return ("?");
   4891 	bp = buf;
   4892 	for (;;) {
   4893 		/*
   4894 		 * If there are more MAC address bytes available, but we won't
   4895 		 * have any room to print them, then add "..." to the string
   4896 		 * instead.  See below for the 'magic number' explanation.
   4897 		 */
   4898 		if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) {
   4899 			(void) strcpy(bp, "...");
   4900 			break;
   4901 		}
   4902 		(void) sprintf(bp, "%02x", *addr++);
   4903 		bp += 2;
   4904 		if (--alen == 0)
   4905 			break;
   4906 		*bp++ = ':';
   4907 		buflen -= 3;
   4908 		/*
   4909 		 * At this point, based on the first 'if' statement above,
   4910 		 * either alen == 1 and buflen >= 3, or alen > 1 and
   4911 		 * buflen >= 4.  The first case leaves room for the final "xx"
   4912 		 * number and trailing NUL byte.  The second leaves room for at
   4913 		 * least "...".  Thus the apparently 'magic' numbers chosen for
   4914 		 * that statement.
   4915 		 */
   4916 	}
   4917 	return (buf);
   4918 }
   4919 
   4920 /*
   4921  * Called when it is conceptually a ULP that would sent the packet
   4922  * e.g., port unreachable and protocol unreachable. Check that the packet
   4923  * would have passed the IPsec global policy before sending the error.
   4924  *
   4925  * Send an ICMP error after patching up the packet appropriately.
   4926  * Uses ip_drop_input and bumps the appropriate MIB.
   4927  */
   4928 void
   4929 ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
   4930     ip_recv_attr_t *ira)
   4931 {
   4932 	ipha_t		*ipha;
   4933 	boolean_t	secure;
   4934 	ill_t		*ill = ira->ira_ill;
   4935 	ip_stack_t	*ipst = ill->ill_ipst;
   4936 	netstack_t	*ns = ipst->ips_netstack;
   4937 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   4938 
   4939 	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
   4940 
   4941 	/*
   4942 	 * We are generating an icmp error for some inbound packet.
   4943 	 * Called from all ip_fanout_(udp, tcp, proto) functions.
   4944 	 * Before we generate an error, check with global policy
   4945 	 * to see whether this is allowed to enter the system. As
   4946 	 * there is no "conn", we are checking with global policy.
   4947 	 */
   4948 	ipha = (ipha_t *)mp->b_rptr;
   4949 	if (secure || ipss->ipsec_inbound_v4_policy_present) {
   4950 		mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
   4951 		if (mp == NULL)
   4952 			return;
   4953 	}
   4954 
   4955 	/* We never send errors for protocols that we do implement */
   4956 	if (ira->ira_protocol == IPPROTO_ICMP ||
   4957 	    ira->ira_protocol == IPPROTO_IGMP) {
   4958 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   4959 		ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
   4960 		freemsg(mp);
   4961 		return;
   4962 	}
   4963 	/*
   4964 	 * Have to correct checksum since
   4965 	 * the packet might have been
   4966 	 * fragmented and the reassembly code in ip_rput
   4967 	 * does not restore the IP checksum.
   4968 	 */
   4969 	ipha->ipha_hdr_checksum = 0;
   4970 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   4971 
   4972 	switch (icmp_type) {
   4973 	case ICMP_DEST_UNREACHABLE:
   4974 		switch (icmp_code) {
   4975 		case ICMP_PROTOCOL_UNREACHABLE:
   4976 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
   4977 			ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
   4978 			break;
   4979 		case ICMP_PORT_UNREACHABLE:
   4980 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   4981 			ip_drop_input("ipIfStatsNoPorts", mp, ill);
   4982 			break;
   4983 		}
   4984 
   4985 		icmp_unreachable(mp, icmp_code, ira);
   4986 		break;
   4987 	default:
   4988 #ifdef DEBUG
   4989 		panic("ip_fanout_send_icmp_v4: wrong type");
   4990 		/*NOTREACHED*/
   4991 #else
   4992 		freemsg(mp);
   4993 		break;
   4994 #endif
   4995 	}
   4996 }
   4997 
   4998 /*
   4999  * Used to send an ICMP error message when a packet is received for
   5000  * a protocol that is not supported. The mblk passed as argument
   5001  * is consumed by this function.
   5002  */
   5003 void
   5004 ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
   5005 {
   5006 	ipha_t		*ipha;
   5007 
   5008 	ipha = (ipha_t *)mp->b_rptr;
   5009 	if (ira->ira_flags & IRAF_IS_IPV4) {
   5010 		ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
   5011 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   5012 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   5013 	} else {
   5014 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
   5015 		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
   5016 		    ICMP6_PARAMPROB_NEXTHEADER, ira);
   5017 	}
   5018 }
   5019 
   5020 /*
   5021  * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
   5022  * Handles IPv4 and IPv6.
   5023  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   5024  * Caller is responsible for dropping references to the conn.
   5025  */
   5026 void
   5027 ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   5028     ip_recv_attr_t *ira)
   5029 {
   5030 	ill_t		*ill = ira->ira_ill;
   5031 	ip_stack_t	*ipst = ill->ill_ipst;
   5032 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5033 	boolean_t	secure;
   5034 	uint_t		protocol = ira->ira_protocol;
   5035 	iaflags_t	iraflags = ira->ira_flags;
   5036 	queue_t		*rq;
   5037 
   5038 	secure = iraflags & IRAF_IPSEC_SECURE;
   5039 
   5040 	rq = connp->conn_rq;
   5041 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
   5042 		switch (protocol) {
   5043 		case IPPROTO_ICMPV6:
   5044 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
   5045 			break;
   5046 		case IPPROTO_ICMP:
   5047 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
   5048 			break;
   5049 		default:
   5050 			BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
   5051 			break;
   5052 		}
   5053 		freemsg(mp);
   5054 		return;
   5055 	}
   5056 
   5057 	ASSERT(!(IPCL_IS_IPTUN(connp)));
   5058 
   5059 	if (((iraflags & IRAF_IS_IPV4) ?
   5060 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   5061 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   5062 	    secure) {
   5063 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   5064 		    ip6h, ira);
   5065 		if (mp == NULL) {
   5066 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5067 			/* Note that mp is NULL */
   5068 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5069 			return;
   5070 		}
   5071 	}
   5072 
   5073 	if (iraflags & IRAF_ICMP_ERROR) {
   5074 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   5075 	} else {
   5076 		ill_t *rill = ira->ira_rill;
   5077 
   5078 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   5079 		ira->ira_ill = ira->ira_rill = NULL;
   5080 		/* Send it upstream */
   5081 		(connp->conn_recv)(connp, mp, NULL, ira);
   5082 		ira->ira_ill = ill;
   5083 		ira->ira_rill = rill;
   5084 	}
   5085 }
   5086 
   5087 /*
   5088  * Handle protocols with which IP is less intimate.  There
   5089  * can be more than one stream bound to a particular
   5090  * protocol.  When this is the case, normally each one gets a copy
   5091  * of any incoming packets.
   5092  *
   5093  * IPsec NOTE :
   5094  *
   5095  * Don't allow a secure packet going up a non-secure connection.
   5096  * We don't allow this because
   5097  *
   5098  * 1) Reply might go out in clear which will be dropped at
   5099  *    the sending side.
   5100  * 2) If the reply goes out in clear it will give the
   5101  *    adversary enough information for getting the key in
   5102  *    most of the cases.
   5103  *
   5104  * Moreover getting a secure packet when we expect clear
   5105  * implies that SA's were added without checking for
   5106  * policy on both ends. This should not happen once ISAKMP
   5107  * is used to negotiate SAs as SAs will be added only after
   5108  * verifying the policy.
   5109  *
   5110  * Zones notes:
   5111  * Earlier in ip_input on a system with multiple shared-IP zones we
   5112  * duplicate the multicast and broadcast packets and send them up
   5113  * with each explicit zoneid that exists on that ill.
   5114  * This means that here we can match the zoneid with SO_ALLZONES being special.
   5115  */
   5116 void
   5117 ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   5118 {
   5119 	mblk_t		*mp1;
   5120 	ipaddr_t	laddr;
   5121 	conn_t		*connp, *first_connp, *next_connp;
   5122 	connf_t		*connfp;
   5123 	ill_t		*ill = ira->ira_ill;
   5124 	ip_stack_t	*ipst = ill->ill_ipst;
   5125 
   5126 	laddr = ipha->ipha_dst;
   5127 
   5128 	connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
   5129 	mutex_enter(&connfp->connf_lock);
   5130 	connp = connfp->connf_head;
   5131 	for (connp = connfp->connf_head; connp != NULL;
   5132 	    connp = connp->conn_next) {
   5133 		/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   5134 		if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   5135 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5136 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
   5137 			break;
   5138 		}
   5139 	}
   5140 
   5141 	if (connp == NULL) {
   5142 		/*
   5143 		 * No one bound to these addresses.  Is
   5144 		 * there a client that wants all
   5145 		 * unclaimed datagrams?
   5146 		 */
   5147 		mutex_exit(&connfp->connf_lock);
   5148 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   5149 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   5150 		return;
   5151 	}
   5152 
   5153 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5154 
   5155 	CONN_INC_REF(connp);
   5156 	first_connp = connp;
   5157 	connp = connp->conn_next;
   5158 
   5159 	for (;;) {
   5160 		while (connp != NULL) {
   5161 			/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   5162 			if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   5163 			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5164 			    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5165 			    ira, connp)))
   5166 				break;
   5167 			connp = connp->conn_next;
   5168 		}
   5169 
   5170 		if (connp == NULL) {
   5171 			/* No more interested clients */
   5172 			connp = first_connp;
   5173 			break;
   5174 		}
   5175 		if (((mp1 = dupmsg(mp)) == NULL) &&
   5176 		    ((mp1 = copymsg(mp)) == NULL)) {
   5177 			/* Memory allocation failed */
   5178 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5179 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5180 			connp = first_connp;
   5181 			break;
   5182 		}
   5183 
   5184 		CONN_INC_REF(connp);
   5185 		mutex_exit(&connfp->connf_lock);
   5186 
   5187 		ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
   5188 		    ira);
   5189 
   5190 		mutex_enter(&connfp->connf_lock);
   5191 		/* Follow the next pointer before releasing the conn. */
   5192 		next_connp = connp->conn_next;
   5193 		CONN_DEC_REF(connp);
   5194 		connp = next_connp;
   5195 	}
   5196 
   5197 	/* Last one.  Send it upstream. */
   5198 	mutex_exit(&connfp->connf_lock);
   5199 
   5200 	ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
   5201 
   5202 	CONN_DEC_REF(connp);
   5203 }
   5204 
   5205 /*
   5206  * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or
   5207  * pass it along to ESP if the SPI is non-zero.  Returns the mblk if the mblk
   5208  * is not consumed.
   5209  *
   5210  * One of three things can happen, all of which affect the passed-in mblk:
   5211  *
   5212  * 1.) The packet is stock UDP and gets its zero-SPI stripped.  Return mblk..
   5213  *
   5214  * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
   5215  *     ESP packet, and is passed along to ESP for consumption.  Return NULL.
   5216  *
   5217  * 3.) The packet is an ESP-in-UDP Keepalive.  Drop it and return NULL.
   5218  */
   5219 mblk_t *
   5220 zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
   5221 {
   5222 	int shift, plen, iph_len;
   5223 	ipha_t *ipha;
   5224 	udpha_t *udpha;
   5225 	uint32_t *spi;
   5226 	uint32_t esp_ports;
   5227 	uint8_t *orptr;
   5228 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   5229 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5230 
   5231 	ipha = (ipha_t *)mp->b_rptr;
   5232 	iph_len = ira->ira_ip_hdr_length;
   5233 	plen = ira->ira_pktlen;
   5234 
   5235 	if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
   5236 		/*
   5237 		 * Most likely a keepalive for the benefit of an intervening
   5238 		 * NAT.  These aren't for us, per se, so drop it.
   5239 		 *
   5240 		 * RFC 3947/8 doesn't say for sure what to do for 2-3
   5241 		 * byte packets (keepalives are 1-byte), but we'll drop them
   5242 		 * also.
   5243 		 */
   5244 		ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5245 		    DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
   5246 		return (NULL);
   5247 	}
   5248 
   5249 	if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
   5250 		/* might as well pull it all up - it might be ESP. */
   5251 		if (!pullupmsg(mp, -1)) {
   5252 			ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5253 			    DROPPER(ipss, ipds_esp_nomem),
   5254 			    &ipss->ipsec_dropper);
   5255 			return (NULL);
   5256 		}
   5257 
   5258 		ipha = (ipha_t *)mp->b_rptr;
   5259 	}
   5260 	spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
   5261 	if (*spi == 0) {
   5262 		/* UDP packet - remove 0-spi. */
   5263 		shift = sizeof (uint32_t);
   5264 	} else {
   5265 		/* ESP-in-UDP packet - reduce to ESP. */
   5266 		ipha->ipha_protocol = IPPROTO_ESP;
   5267 		shift = sizeof (udpha_t);
   5268 	}
   5269 
   5270 	/* Fix IP header */
   5271 	ira->ira_pktlen = (plen - shift);
   5272 	ipha->ipha_length = htons(ira->ira_pktlen);
   5273 	ipha->ipha_hdr_checksum = 0;
   5274 
   5275 	orptr = mp->b_rptr;
   5276 	mp->b_rptr += shift;
   5277 
   5278 	udpha = (udpha_t *)(orptr + iph_len);
   5279 	if (*spi == 0) {
   5280 		ASSERT((uint8_t *)ipha == orptr);
   5281 		udpha->uha_length = htons(plen - shift - iph_len);
   5282 		iph_len += sizeof (udpha_t);	/* For the call to ovbcopy(). */
   5283 		esp_ports = 0;
   5284 	} else {
   5285 		esp_ports = *((uint32_t *)udpha);
   5286 		ASSERT(esp_ports != 0);
   5287 	}
   5288 	ovbcopy(orptr, orptr + shift, iph_len);
   5289 	if (esp_ports != 0) /* Punt up for ESP processing. */ {
   5290 		ipha = (ipha_t *)(orptr + shift);
   5291 
   5292 		ira->ira_flags |= IRAF_ESP_UDP_PORTS;
   5293 		ira->ira_esp_udp_ports = esp_ports;
   5294 		ip_fanout_v4(mp, ipha, ira);
   5295 		return (NULL);
   5296 	}
   5297 	return (mp);
   5298 }
   5299 
   5300 /*
   5301  * Deliver a udp packet to the given conn, possibly applying ipsec policy.
   5302  * Handles IPv4 and IPv6.
   5303  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   5304  * Caller is responsible for dropping references to the conn.
   5305  */
   5306 void
   5307 ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   5308     ip_recv_attr_t *ira)
   5309 {
   5310 	ill_t		*ill = ira->ira_ill;
   5311 	ip_stack_t	*ipst = ill->ill_ipst;
   5312 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5313 	boolean_t	secure;
   5314 	iaflags_t	iraflags = ira->ira_flags;
   5315 
   5316 	secure = iraflags & IRAF_IPSEC_SECURE;
   5317 
   5318 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
   5319 	    !canputnext(connp->conn_rq)) {
   5320 		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
   5321 		freemsg(mp);
   5322 		return;
   5323 	}
   5324 
   5325 	if (((iraflags & IRAF_IS_IPV4) ?
   5326 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   5327 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   5328 	    secure) {
   5329 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   5330 		    ip6h, ira);
   5331 		if (mp == NULL) {
   5332 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5333 			/* Note that mp is NULL */
   5334 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5335 			return;
   5336 		}
   5337 	}
   5338 
   5339 	/*
   5340 	 * Since this code is not used for UDP unicast we don't need a NAT_T
   5341 	 * check. Only ip_fanout_v4 has that check.
   5342 	 */
   5343 	if (ira->ira_flags & IRAF_ICMP_ERROR) {
   5344 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   5345 	} else {
   5346 		ill_t *rill = ira->ira_rill;
   5347 
   5348 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   5349 		ira->ira_ill = ira->ira_rill = NULL;
   5350 		/* Send it upstream */
   5351 		(connp->conn_recv)(connp, mp, NULL, ira);
   5352 		ira->ira_ill = ill;
   5353 		ira->ira_rill = rill;
   5354 	}
   5355 }
   5356 
   5357 /*
   5358  * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
   5359  * (Unicast fanout is handled in ip_input_v4.)
   5360  *
   5361  * If SO_REUSEADDR is set all multicast and broadcast packets
   5362  * will be delivered to all conns bound to the same port.
   5363  *
   5364  * If there is at least one matching AF_INET receiver, then we will
   5365  * ignore any AF_INET6 receivers.
   5366  * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
   5367  * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
   5368  * packets.
   5369  *
   5370  * Zones notes:
   5371  * Earlier in ip_input on a system with multiple shared-IP zones we
   5372  * duplicate the multicast and broadcast packets and send them up
   5373  * with each explicit zoneid that exists on that ill.
   5374  * This means that here we can match the zoneid with SO_ALLZONES being special.
   5375  */
   5376 void
   5377 ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
   5378     ip_recv_attr_t *ira)
   5379 {
   5380 	ipaddr_t	laddr;
   5381 	in6_addr_t	v6faddr;
   5382 	conn_t		*connp;
   5383 	connf_t		*connfp;
   5384 	ipaddr_t	faddr;
   5385 	ill_t		*ill = ira->ira_ill;
   5386 	ip_stack_t	*ipst = ill->ill_ipst;
   5387 
   5388 	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
   5389 
   5390 	laddr = ipha->ipha_dst;
   5391 	faddr = ipha->ipha_src;
   5392 
   5393 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5394 	mutex_enter(&connfp->connf_lock);
   5395 	connp = connfp->connf_head;
   5396 
   5397 	/*
   5398 	 * If SO_REUSEADDR has been set on the first we send the
   5399 	 * packet to all clients that have joined the group and
   5400 	 * match the port.
   5401 	 */
   5402 	while (connp != NULL) {
   5403 		if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
   5404 		    conn_wantpacket(connp, ira, ipha) &&
   5405 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5406 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5407 			break;
   5408 		connp = connp->conn_next;
   5409 	}
   5410 
   5411 	if (connp == NULL)
   5412 		goto notfound;
   5413 
   5414 	CONN_INC_REF(connp);
   5415 
   5416 	if (connp->conn_reuseaddr) {
   5417 		conn_t		*first_connp = connp;
   5418 		conn_t		*next_connp;
   5419 		mblk_t		*mp1;
   5420 
   5421 		connp = connp->conn_next;
   5422 		for (;;) {
   5423 			while (connp != NULL) {
   5424 				if (IPCL_UDP_MATCH(connp, lport, laddr,
   5425 				    fport, faddr) &&
   5426 				    conn_wantpacket(connp, ira, ipha) &&
   5427 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5428 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5429 				    ira, connp)))
   5430 					break;
   5431 				connp = connp->conn_next;
   5432 			}
   5433 			if (connp == NULL) {
   5434 				/* No more interested clients */
   5435 				connp = first_connp;
   5436 				break;
   5437 			}
   5438 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5439 			    ((mp1 = copymsg(mp)) == NULL)) {
   5440 				/* Memory allocation failed */
   5441 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5442 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5443 				connp = first_connp;
   5444 				break;
   5445 			}
   5446 			CONN_INC_REF(connp);
   5447 			mutex_exit(&connfp->connf_lock);
   5448 
   5449 			IP_STAT(ipst, ip_udp_fanmb);
   5450 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5451 			    NULL, ira);
   5452 			mutex_enter(&connfp->connf_lock);
   5453 			/* Follow the next pointer before releasing the conn */
   5454 			next_connp = connp->conn_next;
   5455 			CONN_DEC_REF(connp);
   5456 			connp = next_connp;
   5457 		}
   5458 	}
   5459 
   5460 	/* Last one.  Send it upstream. */
   5461 	mutex_exit(&connfp->connf_lock);
   5462 	IP_STAT(ipst, ip_udp_fanmb);
   5463 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5464 	CONN_DEC_REF(connp);
   5465 	return;
   5466 
   5467 notfound:
   5468 	mutex_exit(&connfp->connf_lock);
   5469 	/*
   5470 	 * IPv6 endpoints bound to multicast IPv4-mapped addresses
   5471 	 * have already been matched above, since they live in the IPv4
   5472 	 * fanout tables. This implies we only need to
   5473 	 * check for IPv6 in6addr_any endpoints here.
   5474 	 * Thus we compare using ipv6_all_zeros instead of the destination
   5475 	 * address, except for the multicast group membership lookup which
   5476 	 * uses the IPv4 destination.
   5477 	 */
   5478 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
   5479 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5480 	mutex_enter(&connfp->connf_lock);
   5481 	connp = connfp->connf_head;
   5482 	/*
   5483 	 * IPv4 multicast packet being delivered to an AF_INET6
   5484 	 * in6addr_any endpoint.
   5485 	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
   5486 	 * and not conn_wantpacket_v6() since any multicast membership is
   5487 	 * for an IPv4-mapped multicast address.
   5488 	 */
   5489 	while (connp != NULL) {
   5490 		if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
   5491 		    fport, v6faddr) &&
   5492 		    conn_wantpacket(connp, ira, ipha) &&
   5493 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5494 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5495 			break;
   5496 		connp = connp->conn_next;
   5497 	}
   5498 
   5499 	if (connp == NULL) {
   5500 		/*
   5501 		 * No one bound to this port.  Is
   5502 		 * there a client that wants all
   5503 		 * unclaimed datagrams?
   5504 		 */
   5505 		mutex_exit(&connfp->connf_lock);
   5506 
   5507 		if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
   5508 		    NULL) {
   5509 			ASSERT(ira->ira_protocol == IPPROTO_UDP);
   5510 			ip_fanout_proto_v4(mp, ipha, ira);
   5511 		} else {
   5512 			/*
   5513 			 * We used to attempt to send an icmp error here, but
   5514 			 * since this is known to be a multicast packet
   5515 			 * and we don't send icmp errors in response to
   5516 			 * multicast, just drop the packet and give up sooner.
   5517 			 */
   5518 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   5519 			freemsg(mp);
   5520 		}
   5521 		return;
   5522 	}
   5523 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5524 
   5525 	/*
   5526 	 * If SO_REUSEADDR has been set on the first we send the
   5527 	 * packet to all clients that have joined the group and
   5528 	 * match the port.
   5529 	 */
   5530 	if (connp->conn_reuseaddr) {
   5531 		conn_t		*first_connp = connp;
   5532 		conn_t		*next_connp;
   5533 		mblk_t		*mp1;
   5534 
   5535 		CONN_INC_REF(connp);
   5536 		connp = connp->conn_next;
   5537 		for (;;) {
   5538 			while (connp != NULL) {
   5539 				if (IPCL_UDP_MATCH_V6(connp, lport,
   5540 				    ipv6_all_zeros, fport, v6faddr) &&
   5541 				    conn_wantpacket(connp, ira, ipha) &&
   5542 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5543 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5544 				    ira, connp)))
   5545 					break;
   5546 				connp = connp->conn_next;
   5547 			}
   5548 			if (connp == NULL) {
   5549 				/* No more interested clients */
   5550 				connp = first_connp;
   5551 				break;
   5552 			}
   5553 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5554 			    ((mp1 = copymsg(mp)) == NULL)) {
   5555 				/* Memory allocation failed */
   5556 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5557 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5558 				connp = first_connp;
   5559 				break;
   5560 			}
   5561 			CONN_INC_REF(connp);
   5562 			mutex_exit(&connfp->connf_lock);
   5563 
   5564 			IP_STAT(ipst, ip_udp_fanmb);
   5565 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5566 			    NULL, ira);
   5567 			mutex_enter(&connfp->connf_lock);
   5568 			/* Follow the next pointer before releasing the conn */
   5569 			next_connp = connp->conn_next;
   5570 			CONN_DEC_REF(connp);
   5571 			connp = next_connp;
   5572 		}
   5573 	}
   5574 
   5575 	/* Last one.  Send it upstream. */
   5576 	mutex_exit(&connfp->connf_lock);
   5577 	IP_STAT(ipst, ip_udp_fanmb);
   5578 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5579 	CONN_DEC_REF(connp);
   5580 }
   5581 
   5582 /*
   5583  * Split an incoming packet's IPv4 options into the label and the other options.
   5584  * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
   5585  * clearing out any leftover label or options.
   5586  * Otherwise it just makes ipp point into the packet.
   5587  *
 * Returns zero if ok; ENOMEM if the buffer couldn't be allocated; EINVAL if
 * an option in the header is malformed (bad or truncated option length).
   5589  */
   5590 int
   5591 ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
   5592 {
   5593 	uchar_t		*opt;
   5594 	uint32_t	totallen;
   5595 	uint32_t	optval;
   5596 	uint32_t	optlen;
   5597 
   5598 	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
   5599 	ipp->ipp_hoplimit = ipha->ipha_ttl;
   5600 	ipp->ipp_type_of_service = ipha->ipha_type_of_service;
   5601 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
   5602 
   5603 	/*
   5604 	 * Get length (in 4 byte octets) of IP header options.
   5605 	 */
   5606 	totallen = ipha->ipha_version_and_hdr_length -
   5607 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5608 
   5609 	if (totallen == 0) {
   5610 		if (!allocate)
   5611 			return (0);
   5612 
   5613 		/* Clear out anything from a previous packet */
   5614 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5615 			kmem_free(ipp->ipp_ipv4_options,
   5616 			    ipp->ipp_ipv4_options_len);
   5617 			ipp->ipp_ipv4_options = NULL;
   5618 			ipp->ipp_ipv4_options_len = 0;
   5619 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5620 		}
   5621 		if (ipp->ipp_fields & IPPF_LABEL_V4) {
   5622 			kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5623 			ipp->ipp_label_v4 = NULL;
   5624 			ipp->ipp_label_len_v4 = 0;
   5625 			ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5626 		}
   5627 		return (0);
   5628 	}
   5629 
   5630 	totallen <<= 2;
   5631 	opt = (uchar_t *)&ipha[1];
   5632 	if (!is_system_labeled()) {
   5633 
   5634 	copyall:
   5635 		if (!allocate) {
   5636 			if (totallen != 0) {
   5637 				ipp->ipp_ipv4_options = opt;
   5638 				ipp->ipp_ipv4_options_len = totallen;
   5639 				ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5640 			}
   5641 			return (0);
   5642 		}
   5643 		/* Just copy all of options */
   5644 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5645 			if (totallen == ipp->ipp_ipv4_options_len) {
   5646 				bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5647 				return (0);
   5648 			}
   5649 			kmem_free(ipp->ipp_ipv4_options,
   5650 			    ipp->ipp_ipv4_options_len);
   5651 			ipp->ipp_ipv4_options = NULL;
   5652 			ipp->ipp_ipv4_options_len = 0;
   5653 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5654 		}
   5655 		if (totallen == 0)
   5656 			return (0);
   5657 
   5658 		ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
   5659 		if (ipp->ipp_ipv4_options == NULL)
   5660 			return (ENOMEM);
   5661 		ipp->ipp_ipv4_options_len = totallen;
   5662 		ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5663 		bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5664 		return (0);
   5665 	}
   5666 
   5667 	if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
   5668 		kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5669 		ipp->ipp_label_v4 = NULL;
   5670 		ipp->ipp_label_len_v4 = 0;
   5671 		ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5672 	}
   5673 
   5674 	/*
   5675 	 * Search for CIPSO option.
   5676 	 * We assume CIPSO is first in options if it is present.
   5677 	 * If it isn't, then ipp_opt_ipv4_options will not include the options
   5678 	 * prior to the CIPSO option.
   5679 	 */
   5680 	while (totallen != 0) {
   5681 		switch (optval = opt[IPOPT_OPTVAL]) {
   5682 		case IPOPT_EOL:
   5683 			return (0);
   5684 		case IPOPT_NOP:
   5685 			optlen = 1;
   5686 			break;
   5687 		default:
   5688 			if (totallen <= IPOPT_OLEN)
   5689 				return (EINVAL);
   5690 			optlen = opt[IPOPT_OLEN];
   5691 			if (optlen < 2)
   5692 				return (EINVAL);
   5693 		}
   5694 		if (optlen > totallen)
   5695 			return (EINVAL);
   5696 
   5697 		switch (optval) {
   5698 		case IPOPT_COMSEC:
   5699 			if (!allocate) {
   5700 				ipp->ipp_label_v4 = opt;
   5701 				ipp->ipp_label_len_v4 = optlen;
   5702 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5703 			} else {
   5704 				ipp->ipp_label_v4 = kmem_alloc(optlen,
   5705 				    KM_NOSLEEP);
   5706 				if (ipp->ipp_label_v4 == NULL)
   5707 					return (ENOMEM);
   5708 				ipp->ipp_label_len_v4 = optlen;
   5709 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5710 				bcopy(opt, ipp->ipp_label_v4, optlen);
   5711 			}
   5712 			totallen -= optlen;
   5713 			opt += optlen;
   5714 
   5715 			/* Skip padding bytes until we get to a multiple of 4 */
   5716 			while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
   5717 				totallen--;
   5718 				opt++;
   5719 			}
   5720 			/* Remaining as ipp_ipv4_options */
   5721 			goto copyall;
   5722 		}
   5723 		totallen -= optlen;
   5724 		opt += optlen;
   5725 	}
   5726 	/* No CIPSO found; return everything as ipp_ipv4_options */
   5727 	totallen = ipha->ipha_version_and_hdr_length -
   5728 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5729 	totallen <<= 2;
   5730 	opt = (uchar_t *)&ipha[1];
   5731 	goto copyall;
   5732 }
   5733 
   5734 /*
   5735  * Efficient versions of lookup for an IRE when we only
   5736  * match the address.
   5737  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5738  * Does not handle multicast addresses.
   5739  */
   5740 uint_t
   5741 ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
   5742 {
   5743 	ire_t *ire;
   5744 	uint_t result;
   5745 
   5746 	ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
   5747 	ASSERT(ire != NULL);
   5748 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5749 		result = IRE_NOROUTE;
   5750 	else
   5751 		result = ire->ire_type;
   5752 	ire_refrele(ire);
   5753 	return (result);
   5754 }
   5755 
   5756 /*
   5757  * Efficient versions of lookup for an IRE when we only
   5758  * match the address.
   5759  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5760  * Does not handle multicast addresses.
   5761  */
   5762 uint_t
   5763 ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
   5764 {
   5765 	ire_t *ire;
   5766 	uint_t result;
   5767 
   5768 	ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
   5769 	ASSERT(ire != NULL);
   5770 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5771 		result = IRE_NOROUTE;
   5772 	else
   5773 		result = ire->ire_type;
   5774 	ire_refrele(ire);
   5775 	return (result);
   5776 }
   5777 
   5778 /*
   5779  * Nobody should be sending
   5780  * packets up this stream
   5781  */
   5782 static void
   5783 ip_lrput(queue_t *q, mblk_t *mp)
   5784 {
   5785 	switch (mp->b_datap->db_type) {
   5786 	case M_FLUSH:
   5787 		/* Turn around */
   5788 		if (*mp->b_rptr & FLUSHW) {
   5789 			*mp->b_rptr &= ~FLUSHR;
   5790 			qreply(q, mp);
   5791 			return;
   5792 		}
   5793 		break;
   5794 	}
   5795 	freemsg(mp);
   5796 }
   5797 
/*
 * Nobody should be sending packets down this stream, so every message
 * that arrives here is freed.  The queue argument is not used.
 */
/* ARGSUSED */
void
ip_lwput(queue_t *q, mblk_t *mp)
{
	freemsg(mp);
}
   5805 
   5806 /*
   5807  * Move the first hop in any source route to ipha_dst and remove that part of
   5808  * the source route.  Called by other protocols.  Errors in option formatting
   5809  * are ignored - will be handled by ip_output_options. Return the final
   5810  * destination (either ipha_dst or the last entry in a source route.)
   5811  */
   5812 ipaddr_t
   5813 ip_massage_options(ipha_t *ipha, netstack_t *ns)
   5814 {
   5815 	ipoptp_t	opts;
   5816 	uchar_t		*opt;
   5817 	uint8_t		optval;
   5818 	uint8_t		optlen;
   5819 	ipaddr_t	dst;
   5820 	int		i;
   5821 	ip_stack_t	*ipst = ns->netstack_ip;
   5822 
   5823 	ip2dbg(("ip_massage_options\n"));
   5824 	dst = ipha->ipha_dst;
   5825 	for (optval = ipoptp_first(&opts, ipha);
   5826 	    optval != IPOPT_EOL;
   5827 	    optval = ipoptp_next(&opts)) {
   5828 		opt = opts.ipoptp_cur;
   5829 		switch (optval) {
   5830 			uint8_t off;
   5831 		case IPOPT_SSRR:
   5832 		case IPOPT_LSRR:
   5833 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   5834 				ip1dbg(("ip_massage_options: bad src route\n"));
   5835 				break;
   5836 			}
   5837 			optlen = opts.ipoptp_len;
   5838 			off = opt[IPOPT_OFFSET];
   5839 			off--;
   5840 		redo_srr:
   5841 			if (optlen < IP_ADDR_LEN ||
   5842 			    off > optlen - IP_ADDR_LEN) {
   5843 				/* End of source route */
   5844 				ip1dbg(("ip_massage_options: end of SR\n"));
   5845 				break;
   5846 			}
   5847 			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   5848 			ip1dbg(("ip_massage_options: next hop 0x%x\n",
   5849 			    ntohl(dst)));
   5850 			/*
   5851 			 * Check if our address is present more than
   5852 			 * once as consecutive hops in source route.
   5853 			 * XXX verify per-interface ip_forwarding
   5854 			 * for source route?
   5855 			 */
   5856 			if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
   5857 				off += IP_ADDR_LEN;
   5858 				goto redo_srr;
   5859 			}
   5860 			if (dst == htonl(INADDR_LOOPBACK)) {
   5861 				ip1dbg(("ip_massage_options: loopback addr in "
   5862 				    "source route!\n"));
   5863 				break;
   5864 			}
   5865 			/*
   5866 			 * Update ipha_dst to be the first hop and remove the
   5867 			 * first hop from the source route (by overwriting
   5868 			 * part of the option with NOP options).
   5869 			 */
   5870 			ipha->ipha_dst = dst;
   5871 			/* Put the last entry in dst */
   5872 			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
   5873 			    3;
   5874 			bcopy(&opt[off], &dst, IP_ADDR_LEN);
   5875 
   5876 			ip1dbg(("ip_massage_options: last hop 0x%x\n",
   5877 			    ntohl(dst)));
   5878 			/* Move down and overwrite */
   5879 			opt[IP_ADDR_LEN] = opt[0];
   5880 			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
   5881 			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
   5882 			for (i = 0; i < IP_ADDR_LEN; i++)
   5883 				opt[i] = IPOPT_NOP;
   5884 			break;
   5885 		}
   5886 	}
   5887 	return (dst);
   5888 }
   5889 
   5890 /*
   5891  * Return the network mask
   5892  * associated with the specified address.
   5893  */
   5894 ipaddr_t
   5895 ip_net_mask(ipaddr_t addr)
   5896 {
   5897 	uchar_t	*up = (uchar_t *)&addr;
   5898 	ipaddr_t mask = 0;
   5899 	uchar_t	*maskp = (uchar_t *)&mask;
   5900 
   5901 #if defined(__i386) || defined(__amd64)
   5902 #define	TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5903 #endif
   5904 #ifdef  TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5905 	maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
   5906 #endif
   5907 	if (CLASSD(addr)) {
   5908 		maskp[0] = 0xF0;
   5909 		return (mask);
   5910 	}
   5911 
   5912 	/* We assume Class E default netmask to be 32 */
   5913 	if (CLASSE(addr))
   5914 		return (0xffffffffU);
   5915 
   5916 	if (addr == 0)
   5917 		return (0);
   5918 	maskp[0] = 0xFF;
   5919 	if ((up[0] & 0x80) == 0)
   5920 		return (mask);
   5921 
   5922 	maskp[1] = 0xFF;
   5923 	if ((up[0] & 0xC0) == 0x80)
   5924 		return (mask);
   5925 
   5926 	maskp[2] = 0xFF;
   5927 	if ((up[0] & 0xE0) == 0xC0)
   5928 		return (mask);
   5929 
   5930 	/* Otherwise return no mask */
   5931 	return ((ipaddr_t)0);
   5932 }
   5933 
   5934 /* Name/Value Table Lookup Routine */
   5935 char *
   5936 ip_nv_lookup(nv_t *nv, int value)
   5937 {
   5938 	if (!nv)
   5939 		return (NULL);
   5940 	for (; nv->nv_name; nv++) {
   5941 		if (nv->nv_value == value)
   5942 			return (nv->nv_name);
   5943 	}
   5944 	return ("unknown");
   5945 }
   5946 
   5947 static int
   5948 ip_wait_for_info_ack(ill_t *ill)
   5949 {
   5950 	int err;
   5951 
   5952 	mutex_enter(&ill->ill_lock);
   5953 	while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
   5954 		/*
   5955 		 * Return value of 0 indicates a pending signal.
   5956 		 */
   5957 		err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
   5958 		if (err == 0) {
   5959 			mutex_exit(&ill->ill_lock);
   5960 			return (EINTR);
   5961 		}
   5962 	}
   5963 	mutex_exit(&ill->ill_lock);
   5964 	/*
   5965 	 * ip_rput_other could have set an error  in ill_error on
   5966 	 * receipt of M_ERROR.
   5967 	 */
   5968 	return (ill->ill_error);
   5969 }
   5970 
   5971 /*
   5972  * This is a module open, i.e. this is a control stream for access
   5973  * to a DLPI device.  We allocate an ill_t as the instance data in
   5974  * this case.
   5975  */
   5976 static int
   5977 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   5978 {
   5979 	ill_t	*ill;
   5980 	int	err;
   5981 	zoneid_t zoneid;
   5982 	netstack_t *ns;
   5983 	ip_stack_t *ipst;
   5984 
   5985 	/*
   5986 	 * Prevent unprivileged processes from pushing IP so that
   5987 	 * they can't send raw IP.
   5988 	 */
   5989 	if (secpolicy_net_rawaccess(credp) != 0)
   5990 		return (EPERM);
   5991 
   5992 	ns = netstack_find_by_cred(credp);
   5993 	ASSERT(ns != NULL);
   5994 	ipst = ns->netstack_ip;
   5995 	ASSERT(ipst != NULL);
   5996 
   5997 	/*
   5998 	 * For exclusive stacks we set the zoneid to zero
   5999 	 * to make IP operate as if in the global zone.
   6000 	 */
   6001 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   6002 		zoneid = GLOBAL_ZONEID;
   6003 	else
   6004 		zoneid = crgetzoneid(credp);
   6005 
   6006 	ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
   6007 	q->q_ptr = WR(q)->q_ptr = ill;
   6008 	ill->ill_ipst = ipst;
   6009 	ill->ill_zoneid = zoneid;
   6010 
   6011 	/*
   6012 	 * ill_init initializes the ill fields and then sends down
   6013 	 * down a DL_INFO_REQ after calling qprocson.
   6014 	 */
   6015 	err = ill_init(q, ill);
   6016 
   6017 	if (err != 0) {
   6018 		mi_free(ill);
   6019 		netstack_rele(ipst->ips_netstack);
   6020 		q->q_ptr = NULL;
   6021 		WR(q)->q_ptr = NULL;
   6022 		return (err);
   6023 	}
   6024 
   6025 	/*
   6026 	 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
   6027 	 *
   6028 	 * ill_init initializes the ipsq marking this thread as
   6029 	 * writer
   6030 	 */
   6031 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   6032 	err = ip_wait_for_info_ack(ill);
   6033 	if (err == 0)
   6034 		ill->ill_credp = credp;
   6035 	else
   6036 		goto fail;
   6037 
   6038 	crhold(credp);
   6039 
   6040 	mutex_enter(&ipst->ips_ip_mi_lock);
   6041 	err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
   6042 	    sflag, credp);
   6043 	mutex_exit(&ipst->ips_ip_mi_lock);
   6044 fail:
   6045 	if (err) {
   6046 		(void) ip_close(q, 0);
   6047 		return (err);
   6048 	}
   6049 	return (0);
   6050 }
   6051 
/*
 * For /dev/ip aka AF_INET open.
 * Hands all arguments to ip_open() with isv6 set to B_FALSE and returns
 * its result.
 */
int
ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
}
   6058 
/*
 * For /dev/ip6 aka AF_INET6 open.
 * Hands all arguments to ip_open() with isv6 set to B_TRUE and returns
 * its result.
 */
int
ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
}
   6065 
   6066 /* IP open routine. */
   6067 int
   6068 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   6069     boolean_t isv6)
   6070 {
   6071 	conn_t 		*connp;
   6072 	major_t		maj;
   6073 	zoneid_t	zoneid;
   6074 	netstack_t	*ns;
   6075 	ip_stack_t	*ipst;
   6076 
   6077 	/* Allow reopen. */
   6078 	if (q->q_ptr != NULL)
   6079 		return (0);
   6080 
   6081 	if (sflag & MODOPEN) {
   6082 		/* This is a module open */
   6083 		return (ip_modopen(q, devp, flag, sflag, credp));
   6084 	}
   6085 
   6086 	if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
   6087 		/*
   6088 		 * Non streams based socket looking for a stream
   6089 		 * to access IP
   6090 		 */
   6091 		return (ip_helper_stream_setup(q, devp, flag, sflag,
   6092 		    credp, isv6));
   6093 	}
   6094 
   6095 	ns = netstack_find_by_cred(credp);
   6096 	ASSERT(ns != NULL);
   6097 	ipst = ns->netstack_ip;
   6098 	ASSERT(ipst != NULL);
   6099 
   6100 	/*
   6101 	 * For exclusive stacks we set the zoneid to zero
   6102 	 * to make IP operate as if in the global zone.
   6103 	 */
   6104 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   6105 		zoneid = GLOBAL_ZONEID;
   6106 	else
   6107 		zoneid = crgetzoneid(credp);
   6108 
   6109 	/*
   6110 	 * We are opening as a device. This is an IP client stream, and we
   6111 	 * allocate an conn_t as the instance data.
   6112 	 */
   6113 	connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
   6114 
   6115 	/*
   6116 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
   6117 	 * done by netstack_find_by_cred()
   6118 	 */
   6119 	netstack_rele(ipst->ips_netstack);
   6120 
   6121 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
   6122 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   6123 	connp->conn_ixa->ixa_zoneid = zoneid;
   6124 	connp->conn_zoneid = zoneid;
   6125 
   6126 	connp->conn_rq = q;
   6127 	q->q_ptr = WR(q)->q_ptr = connp;
   6128 
   6129 	/* Minor tells us which /dev entry was opened */
   6130 	if (isv6) {
   6131 		connp->conn_family = AF_INET6;
   6132 		connp->conn_ipversion = IPV6_VERSION;
   6133 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
   6134 		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
   6135 	} else {
   6136 		connp->conn_family = AF_INET;
   6137 		connp->conn_ipversion = IPV4_VERSION;
   6138 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
   6139 	}
   6140 
   6141 	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
   6142 	    ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
   6143 		connp->conn_minor_arena = ip_minor_arena_la;
   6144 	} else {
   6145 		/*
   6146 		 * Either minor numbers in the large arena were exhausted
   6147 		 * or a non socket application is doing the open.
   6148 		 * Try to allocate from the small arena.
   6149 		 */
   6150 		if ((connp->conn_dev =
   6151 		    inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   6152 			/* CONN_DEC_REF takes care of netstack_rele() */
   6153 			q->q_ptr = WR(q)->q_ptr = NULL;
   6154 			CONN_DEC_REF(connp);
   6155 			return (EBUSY);
   6156 		}
   6157 		connp->conn_minor_arena = ip_minor_arena_sa;
   6158 	}
   6159 
   6160 	maj = getemajor(*devp);
   6161 	*devp = makedevice(maj, (minor_t)connp->conn_dev);
   6162 
   6163 	/*
   6164 	 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
   6165 	 */
   6166 	connp->conn_cred = credp;
   6167 	/* Cache things in ixa without an extra refhold */
   6168 	connp->conn_ixa->ixa_cred = connp->conn_cred;
   6169 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
   6170 	if (is_system_labeled())
   6171 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
   6172 
   6173 	/*
   6174 	 * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
   6175 	 */
   6176 	connp->conn_recv = ip_conn_input;
   6177 	connp->conn_recvicmp = ip_conn_input_icmp;
   6178 
   6179 	crhold(connp->conn_cred);
   6180 
   6181 	/*
   6182 	 * If the caller has the process-wide flag set, then default to MAC
   6183 	 * exempt mode.  This allows read-down to unlabeled hosts.
   6184 	 */
   6185 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   6186 		connp->conn_mac_mode = CONN_MAC_AWARE;
   6187 
   6188 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   6189 
   6190 	connp->conn_rq = q;
   6191 	connp->conn_wq = WR(q);
   6192 
   6193 	/* Non-zero default values */
   6194 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
   6195 
   6196 	/*
   6197 	 * Make the conn globally visible to walkers
   6198 	 */
   6199 	ASSERT(connp->conn_ref == 1);
   6200 	mutex_enter(&connp->conn_lock);
   6201 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   6202 	mutex_exit(&connp->conn_lock);
   6203 
   6204 	qprocson(q);
   6205 
   6206 	return (0);
   6207 }
   6208 
   6209 /*
   6210  * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
   6211  * all of them are copied to the conn_t. If the req is "zero", the policy is
   6212  * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
   6213  * fields.
   6214  * We keep only the latest setting of the policy and thus policy setting
   6215  * is not incremental/cumulative.
   6216  *
   6217  * Requests to set policies with multiple alternative actions will
   6218  * go through a different API.
   6219  */
   6220 int
   6221 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
   6222 {
   6223 	uint_t ah_req = 0;
   6224 	uint_t esp_req = 0;
   6225 	uint_t se_req = 0;
   6226 	ipsec_act_t *actp = NULL;
   6227 	uint_t nact;
   6228 	ipsec_policy_head_t *ph;
   6229 	boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
   6230 	int error = 0;
   6231 	netstack_t	*ns = connp->conn_netstack;
   6232 	ip_stack_t	*ipst = ns->netstack_ip;
   6233 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   6234 
   6235 #define	REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
   6236 
   6237 	/*
   6238 	 * The IP_SEC_OPT option does not allow variable length parameters,
   6239 	 * hence a request cannot be NULL.
   6240 	 */
   6241 	if (req == NULL)
   6242 		return (EINVAL);
   6243 
   6244 	ah_req = req->ipsr_ah_req;
   6245 	esp_req = req->ipsr_esp_req;
   6246 	se_req = req->ipsr_self_encap_req;
   6247 
   6248 	/* Don't allow setting self-encap without one or more of AH/ESP. */
   6249 	if (se_req != 0 && esp_req == 0 && ah_req == 0)
   6250 		return (EINVAL);
   6251 
   6252 	/*
   6253 	 * Are we dealing with a request to reset the policy (i.e.
   6254 	 * zero requests).
   6255 	 */
   6256 	is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
   6257 	    (esp_req & REQ_MASK) == 0 &&
   6258 	    (se_req & REQ_MASK) == 0);
   6259 
   6260 	if (!is_pol_reset) {
   6261 		/*
   6262 		 * If we couldn't load IPsec, fail with "protocol
   6263 		 * not supported".
   6264 		 * IPsec may not have been loaded for a request with zero
   6265 		 * policies, so we don't fail in this case.
   6266 		 */
   6267 		mutex_enter(&ipss->ipsec_loader_lock);
   6268 		if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
   6269 			mutex_exit(&ipss->ipsec_loader_lock);
   6270 			return (EPROTONOSUPPORT);
   6271 		}
   6272 		mutex_exit(&ipss->ipsec_loader_lock);
   6273 
   6274 		/*
   6275 		 * Test for valid requests. Invalid algorithms
   6276 		 * need to be tested by IPsec code because new
   6277 		 * algorithms can be added dynamically.
   6278 		 */
   6279 		if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6280 		    (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6281 		    (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
   6282 			return (EINVAL);
   6283 		}
   6284 
   6285 		/*
   6286 		 * Only privileged users can issue these
   6287 		 * requests.
   6288 		 */
   6289 		if (((ah_req & IPSEC_PREF_NEVER) ||
   6290 		    (esp_req & IPSEC_PREF_NEVER) ||
   6291 		    (se_req & IPSEC_PREF_NEVER)) &&
   6292 		    secpolicy_ip_config(cr, B_FALSE) != 0) {
   6293 			return (EPERM);
   6294 		}
   6295 
   6296 		/*
   6297 		 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
   6298 		 * are mutually exclusive.
   6299 		 */
   6300 		if (((ah_req & REQ_MASK) == REQ_MASK) ||
   6301 		    ((esp_req & REQ_MASK) == REQ_MASK) ||
   6302 		    ((se_req & REQ_MASK) == REQ_MASK)) {
   6303 			/* Both of them are set */
   6304 			return (EINVAL);
   6305 		}
   6306 	}
   6307 
   6308 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   6309 
   6310 	/*
   6311 	 * If we have already cached policies in conn_connect(), don't
   6312 	 * let them change now. We cache policies for connections
   6313 	 * whose src,dst [addr, port] is known.
   6314 	 */
   6315 	if (connp->conn_policy_cached) {
   6316 		return (EINVAL);
   6317 	}
   6318 
   6319 	/*
   6320 	 * We have a zero policies, reset the connection policy if already
   6321 	 * set. This will cause the connection to inherit the
   6322 	 * global policy, if any.
   6323 	 */
   6324 	if (is_pol_reset) {
   6325 		if (connp->conn_policy != NULL) {
   6326 			IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
   6327 			connp->conn_policy = NULL;
   6328 		}
   6329 		connp->conn_in_enforce_policy = B_FALSE;
   6330 		connp->conn_out_enforce_policy = B_FALSE;
   6331 		return (0);
   6332 	}
   6333 
   6334 	ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
   6335 	    ipst->ips_netstack);
   6336 	if (ph == NULL)
   6337 		goto enomem;
   6338 
   6339 	ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
   6340 	if (actp == NULL)
   6341 		goto enomem;
   6342 
   6343 	/*
   6344 	 * Always insert IPv4 policy entries, since they can also apply to
   6345 	 * ipv6 sockets being used in ipv4-compat mode.
   6346 	 */
   6347 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6348 	    IPSEC_TYPE_INBOUND, ns))
   6349 		goto enomem;
   6350 	is_pol_inserted = B_TRUE;
   6351 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6352 	    IPSEC_TYPE_OUTBOUND, ns))
   6353 		goto enomem;
   6354 
   6355 	/*
   6356 	 * We're looking at a v6 socket, also insert the v6-specific
   6357 	 * entries.
   6358 	 */
   6359 	if (connp->conn_family == AF_INET6) {
   6360 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6361 		    IPSEC_TYPE_INBOUND, ns))
   6362 			goto enomem;
   6363 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6364 		    IPSEC_TYPE_OUTBOUND, ns))
   6365 			goto enomem;
   6366 	}
   6367 
   6368 	ipsec_actvec_free(actp, nact);
   6369 
   6370 	/*
   6371 	 * If the requests need security, set enforce_policy.
   6372 	 * If the requests are IPSEC_PREF_NEVER, one should
   6373 	 * still set conn_out_enforce_policy so that ip_set_destination
   6374 	 * marks the ip_xmit_attr_t appropriatly. This is needed so that
   6375 	 * for connections that we don't cache policy in at connect time,
   6376 	 * if global policy matches in ip_output_attach_policy, we
   6377 	 * don't wrongly inherit global policy. Similarly, we need
   6378 	 * to set conn_in_enforce_policy also so that we don't verify
   6379 	 * policy wrongly.
   6380 	 */
   6381 	if ((ah_req & REQ_MASK) != 0 ||
   6382 	    (esp_req & REQ_MASK) != 0 ||
   6383 	    (se_req & REQ_MASK) != 0) {
   6384 		connp->conn_in_enforce_policy = B_TRUE;
   6385 		connp->conn_out_enforce_policy = B_TRUE;
   6386 	}
   6387 
   6388 	return (error);
   6389 #undef REQ_MASK
   6390 
   6391 	/*
   6392 	 * Common memory-allocation-failure exit path.
   6393 	 */
   6394 enomem:
   6395 	if (actp != NULL)
   6396 		ipsec_actvec_free(actp, nact);
   6397 	if (is_pol_inserted)
   6398 		ipsec_polhead_flush(ph, ns);
   6399 	return (ENOMEM);
   6400 }
   6401 
   6402 /*
   6403  * Set socket options for joining and leaving multicast groups.
   6404  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6405  * The caller has already check that the option name is consistent with
   6406  * the address family of the socket.
   6407  */
   6408 int
   6409 ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
   6410     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6411 {
   6412 	int		*i1 = (int *)invalp;
   6413 	int		error = 0;
   6414 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6415 	struct ip_mreq	*v4_mreqp;
   6416 	struct ipv6_mreq *v6_mreqp;
   6417 	struct group_req *greqp;
   6418 	ire_t *ire;
   6419 	boolean_t done = B_FALSE;
   6420 	ipaddr_t ifaddr;
   6421 	in6_addr_t v6group;
   6422 	uint_t ifindex;
   6423 	boolean_t mcast_opt = B_TRUE;
   6424 	mcast_record_t fmode;
   6425 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6426 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6427 
   6428 	switch (name) {
   6429 	case IP_ADD_MEMBERSHIP:
   6430 	case IPV6_JOIN_GROUP:
   6431 		mcast_opt = B_FALSE;
   6432 		/* FALLTHRU */
   6433 	case MCAST_JOIN_GROUP:
   6434 		fmode = MODE_IS_EXCLUDE;
   6435 		optfn = ip_opt_add_group;
   6436 		break;
   6437 
   6438 	case IP_DROP_MEMBERSHIP:
   6439 	case IPV6_LEAVE_GROUP:
   6440 		mcast_opt = B_FALSE;
   6441 		/* FALLTHRU */
   6442 	case MCAST_LEAVE_GROUP:
   6443 		fmode = MODE_IS_INCLUDE;
   6444 		optfn = ip_opt_delete_group;
   6445 		break;
   6446 	default:
   6447 		ASSERT(0);
   6448 	}
   6449 
   6450 	if (mcast_opt) {
   6451 		struct sockaddr_in *sin;
   6452 		struct sockaddr_in6 *sin6;
   6453 
   6454 		greqp = (struct group_req *)i1;
   6455 		if (greqp->gr_group.ss_family == AF_INET) {
   6456 			sin = (struct sockaddr_in *)&(greqp->gr_group);
   6457 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
   6458 		} else {
   6459 			if (!inet6)
   6460 				return (EINVAL);	/* Not on INET socket */
   6461 
   6462 			sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
   6463 			v6group = sin6->sin6_addr;
   6464 		}
   6465 		ifaddr = INADDR_ANY;
   6466 		ifindex = greqp->gr_interface;
   6467 	} else if (inet6) {
   6468 		v6_mreqp = (struct ipv6_mreq *)i1;
   6469 		v6group = v6_mreqp->ipv6mr_multiaddr;
   6470 		ifaddr = INADDR_ANY;
   6471 		ifindex = v6_mreqp->ipv6mr_interface;
   6472 	} else {
   6473 		v4_mreqp = (struct ip_mreq *)i1;
   6474 		IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
   6475 		ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
   6476 		ifindex = 0;
   6477 	}
   6478 
   6479 	/*
   6480 	 * In the multirouting case, we need to replicate
   6481 	 * the request on all interfaces that will take part
   6482 	 * in replication.  We do so because multirouting is
   6483 	 * reflective, thus we will probably receive multi-
   6484 	 * casts on those interfaces.
   6485 	 * The ip_multirt_apply_membership() succeeds if
   6486 	 * the operation succeeds on at least one interface.
   6487 	 */
   6488 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6489 		ipaddr_t group;
   6490 
   6491 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6492 
   6493 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6494 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6495 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6496 	} else {
   6497 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6498 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6499 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6500 	}
   6501 	if (ire != NULL) {
   6502 		if (ire->ire_flags & RTF_MULTIRT) {
   6503 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6504 			    checkonly, &v6group, fmode, &ipv6_all_zeros);
   6505 			done = B_TRUE;
   6506 		}
   6507 		ire_refrele(ire);
   6508 	}
   6509 
   6510 	if (!done) {
   6511 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6512 		    fmode, &ipv6_all_zeros);
   6513 	}
   6514 	return (error);
   6515 }
   6516 
   6517 /*
   6518  * Set socket options for joining and leaving multicast groups
   6519  * for specific sources.
   6520  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6521  * The caller has already check that the option name is consistent with
   6522  * the address family of the socket.
   6523  */
   6524 int
   6525 ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
   6526     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6527 {
   6528 	int		*i1 = (int *)invalp;
   6529 	int		error = 0;
   6530 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6531 	struct ip_mreq_source *imreqp;
   6532 	struct group_source_req *gsreqp;
   6533 	in6_addr_t v6group, v6src;
   6534 	uint32_t ifindex;
   6535 	ipaddr_t ifaddr;
   6536 	boolean_t mcast_opt = B_TRUE;
   6537 	mcast_record_t fmode;
   6538 	ire_t *ire;
   6539 	boolean_t done = B_FALSE;
   6540 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6541 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6542 
   6543 	switch (name) {
   6544 	case IP_BLOCK_SOURCE:
   6545 		mcast_opt = B_FALSE;
   6546 		/* FALLTHRU */
   6547 	case MCAST_BLOCK_SOURCE:
   6548 		fmode = MODE_IS_EXCLUDE;
   6549 		optfn = ip_opt_add_group;
   6550 		break;
   6551 
   6552 	case IP_UNBLOCK_SOURCE:
   6553 		mcast_opt = B_FALSE;
   6554 		/* FALLTHRU */
   6555 	case MCAST_UNBLOCK_SOURCE:
   6556 		fmode = MODE_IS_EXCLUDE;
   6557 		optfn = ip_opt_delete_group;
   6558 		break;
   6559 
   6560 	case IP_ADD_SOURCE_MEMBERSHIP:
   6561 		mcast_opt = B_FALSE;
   6562 		/* FALLTHRU */
   6563 	case MCAST_JOIN_SOURCE_GROUP:
   6564 		fmode = MODE_IS_INCLUDE;
   6565 		optfn = ip_opt_add_group;
   6566 		break;
   6567 
   6568 	case IP_DROP_SOURCE_MEMBERSHIP:
   6569 		mcast_opt = B_FALSE;
   6570 		/* FALLTHRU */
   6571 	case MCAST_LEAVE_SOURCE_GROUP:
   6572 		fmode = MODE_IS_INCLUDE;
   6573 		optfn = ip_opt_delete_group;
   6574 		break;
   6575 	default:
   6576 		ASSERT(0);
   6577 	}
   6578 
   6579 	if (mcast_opt) {
   6580 		gsreqp = (struct group_source_req *)i1;
   6581 		ifindex = gsreqp->gsr_interface;
   6582 		if (gsreqp->gsr_group.ss_family == AF_INET) {
   6583 			struct sockaddr_in *s;
   6584 			s = (struct sockaddr_in *)&gsreqp->gsr_group;
   6585 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
   6586 			s = (struct sockaddr_in *)&gsreqp->gsr_source;
   6587 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
   6588 		} else {
   6589 			struct sockaddr_in6 *s6;
   6590 
   6591 			if (!inet6)
   6592 				return (EINVAL);	/* Not on INET socket */
   6593 
   6594 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
   6595 			v6group = s6->sin6_addr;
   6596 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
   6597 			v6src = s6->sin6_addr;
   6598 		}
   6599 		ifaddr = INADDR_ANY;
   6600 	} else {
   6601 		imreqp = (struct ip_mreq_source *)i1;
   6602 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
   6603 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
   6604 		ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
   6605 		ifindex = 0;
   6606 	}
   6607 
   6608 	/*
   6609 	 * Handle src being mapped INADDR_ANY by changing it to unspecified.
   6610 	 */
   6611 	if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
   6612 		v6src = ipv6_all_zeros;
   6613 
   6614 	/*
   6615 	 * In the multirouting case, we need to replicate
   6616 	 * the request as noted in the mcast cases above.
   6617 	 */
   6618 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6619 		ipaddr_t group;
   6620 
   6621 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6622 
   6623 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6624 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6625 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6626 	} else {
   6627 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6628 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6629 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6630 	}
   6631 	if (ire != NULL) {
   6632 		if (ire->ire_flags & RTF_MULTIRT) {
   6633 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6634 			    checkonly, &v6group, fmode, &v6src);
   6635 			done = B_TRUE;
   6636 		}
   6637 		ire_refrele(ire);
   6638 	}
   6639 	if (!done) {
   6640 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6641 		    fmode, &v6src);
   6642 	}
   6643 	return (error);
   6644 }
   6645 
   6646 /*
   6647  * Given a destination address and a pointer to where to put the information
   6648  * this routine fills in the mtuinfo.
   6649  * The socket must be connected.
   6650  * For sctp conn_faddr is the primary address.
   6651  */
   6652 int
   6653 ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
   6654 {
   6655 	uint32_t	pmtu = IP_MAXPACKET;
   6656 	uint_t		scopeid;
   6657 
   6658 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
   6659 		return (-1);
   6660 
   6661 	/* In case we never sent or called ip_set_destination_v4/v6 */
   6662 	if (ixa->ixa_ire != NULL)
   6663 		pmtu = ip_get_pmtu(ixa);
   6664 
   6665 	if (ixa->ixa_flags & IXAF_SCOPEID_SET)
   6666 		scopeid = ixa->ixa_scopeid;
   6667 	else
   6668 		scopeid = 0;
   6669 
   6670 	bzero(mtuinfo, sizeof (*mtuinfo));
   6671 	mtuinfo->ip6m_addr.sin6_family = AF_INET6;
   6672 	mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
   6673 	mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
   6674 	mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
   6675 	mtuinfo->ip6m_mtu = pmtu;
   6676 
   6677 	return (sizeof (struct ip6_mtuinfo));
   6678 }
   6679 
   6680 /* Named Dispatch routine to get a current value out of our parameter table. */
   6681 /* ARGSUSED */
   6682 static int
   6683 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   6684 {
   6685 	ipparam_t *ippa = (ipparam_t *)cp;
   6686 
   6687 	(void) mi_mpprintf(mp, "%d", ippa->ip_param_value);
   6688 	return (0);
   6689 }
   6690 
   6691 /* ARGSUSED */
   6692 static int
   6693 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   6694 {
   6695 
   6696 	(void) mi_mpprintf(mp, "%d", *(int *)cp);
   6697 	return (0);
   6698 }
   6699 
   6700 /*
   6701  * Set ip{,6}_forwarding values.  This means walking through all of the
   6702  * ill's and toggling their forwarding values.
   6703  */
   6704 /* ARGSUSED */
   6705 static int
   6706 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr)
   6707 {
   6708 	long new_value;
   6709 	int *forwarding_value = (int *)cp;
   6710 	ill_t *ill;
   6711 	boolean_t isv6;
   6712 	ill_walk_context_t ctx;
   6713 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   6714 
   6715 	isv6 = (forwarding_value == &ipst->ips_ipv6_forward);
   6716 
   6717 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   6718 	    new_value < 0 || new_value > 1) {
   6719 		return (EINVAL);
   6720 	}
   6721 
   6722 	*forwarding_value = new_value;
   6723 
   6724 	/*
   6725 	 * Regardless of the current value of ip_forwarding, set all per-ill
   6726 	 * values of ip_forwarding to the value being set.
   6727 	 *
   6728 	 * Bring all the ill's up to date with the new global value.
   6729 	 */
   6730 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6731 
   6732 	if (isv6)
   6733 		ill = ILL_START_WALK_V6(&ctx, ipst);
   6734 	else
   6735 		ill = ILL_START_WALK_V4(&ctx, ipst);
   6736 
   6737 	for (; ill != NULL; ill = ill_next(&ctx, ill))
   6738 		(void) ill_forward_set(ill, new_value != 0);
   6739 
   6740 	rw_exit(&ipst->ips_ill_g_lock);
   6741 	return (0);
   6742 }
   6743 
   6744 /*
   6745  * Walk through the param array specified registering each element with the
   6746  * Named Dispatch handler. This is called only during init. So it is ok
   6747  * not to acquire any locks
   6748  */
   6749 static boolean_t
   6750 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt,
   6751     ipndp_t *ipnd, size_t ipnd_cnt)
   6752 {
   6753 	for (; ippa_cnt-- > 0; ippa++) {
   6754 		if (ippa->ip_param_name && ippa->ip_param_name[0]) {
   6755 			if (!nd_load(ndp, ippa->ip_param_name,
   6756 			    ip_param_get, ip_param_set, (caddr_t)ippa)) {
   6757 				nd_free(ndp);
   6758 				return (B_FALSE);
   6759 			}
   6760 		}
   6761 	}
   6762 
   6763 	for (; ipnd_cnt-- > 0; ipnd++) {
   6764 		if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) {
   6765 			if (!nd_load(ndp, ipnd->ip_ndp_name,
   6766 			    ipnd->ip_ndp_getf, ipnd->ip_ndp_setf,
   6767 			    ipnd->ip_ndp_data)) {
   6768 				nd_free(ndp);
   6769 				return (B_FALSE);
   6770 			}
   6771 		}
   6772 	}
   6773 
   6774 	return (B_TRUE);
   6775 }
   6776 
   6777 /* Named Dispatch routine to negotiate a new value for one of our parameters. */
   6778 /* ARGSUSED */
   6779 static int
   6780 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr)
   6781 {
   6782 	long		new_value;
   6783 	ipparam_t	*ippa = (ipparam_t *)cp;
   6784 
   6785 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   6786 	    new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) {
   6787 		return (EINVAL);
   6788 	}
   6789 	ippa->ip_param_value = new_value;
   6790 	return (0);
   6791 }
   6792 
/*
 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases.
 * When an ipf is passed here for the first time, if
 * we already have in-order fragments on the queue, we convert from the fast-
 * path reassembly scheme to the hard-case scheme.  From then on, additional
 * fragments are reassembled here.  We keep track of the start and end offsets
 * of each piece, and the number of holes in the chain.  When the hole count
 * goes to zero, we are done!
 *
 * The ipf_count will be updated to account for any mblk(s) added (pointed to
 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
 * ipfb_count and ill_frag_count by the difference of ipf_count before and
 * after the call to ip_reassemble().
 *
 * Arguments:
 *	mp	- the new fragment; every mblk on its b_cont chain is unlinked
 *		  and inserted as a separate piece.
 *	ipf	- reassembly state of the datagram; ipf_mp->b_cont heads the
 *		  list of pieces queued so far and ipf_tail_mp is its tail.
 *	start	- offset of the first byte of mp within the datagram.
 *	more	- zero if this fragment claims to be the last one.
 *	ill	- used here only to bump the reassembly MIB counters.
 *	msg_len	- byte count added to ipf_count for this fragment.
 *
 * Returns IP_REASS_PARTIAL while holes remain, IP_REASS_COMPLETE once
 * ipf_hole_cnt has dropped to zero, and IP_REASS_FAILED when the fragment
 * contradicts a last fragment seen earlier (a different datagram end, or
 * data starting at or beyond that end).
 */
int
ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
    size_t msg_len)
{
	uint_t	end;
	mblk_t	*next_mp;
	mblk_t	*mp1;
	uint_t	offset;
	/* ipf_num_dups is bumped at most once per call, however many mblks. */
	boolean_t incr_dups = B_TRUE;
	/* Set once the header length has been backed out of 'end'. */
	boolean_t offset_zero_seen = B_FALSE;
	/* Set once the "second last fragment" size check has been done. */
	boolean_t pkt_boundary_checked = B_FALSE;

	/* If start == 0 then ipf_nf_hdr_len has to be set. */
	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);

	/* Add in byte count */
	ipf->ipf_count += msg_len;
	if (ipf->ipf_end) {
		/*
		 * We were part way through in-order reassembly, but now there
		 * is a hole.  We walk through messages already queued, and
		 * mark them for hard case reassembly.  We know that up till
		 * now they were in order starting from offset zero.
		 */
		offset = 0;
		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
			IP_REASS_SET_START(mp1, offset);
			if (offset == 0) {
				/*
				 * Back the header length out so that the end
				 * offset computed below does not count the
				 * ipf_nf_hdr_len bytes at the front of this
				 * mblk.
				 */
				ASSERT(ipf->ipf_nf_hdr_len != 0);
				offset = -ipf->ipf_nf_hdr_len;
			}
			offset += mp1->b_wptr - mp1->b_rptr;
			IP_REASS_SET_END(mp1, offset);
		}
		/* One hole at the end. */
		ipf->ipf_hole_cnt = 1;
		/* Brand it as a hard case, forever. */
		ipf->ipf_end = 0;
	}
	/*
	 * Walk through all the new pieces.  Note that every 'continue' below
	 * jumps to the loop condition, which advances start to end and mp to
	 * next_mp.
	 */
	do {
		end = start + (mp->b_wptr - mp->b_rptr);
		/*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get wrong value in the
		 * second pass of the loop if first mblk is exactly the
		 * size of ipf_nf_hdr_len.
		 */
		if (start == 0 && !offset_zero_seen) {
			/* First segment */
			ASSERT(ipf->ipf_nf_hdr_len != 0);
			end -= ipf->ipf_nf_hdr_len;
			offset_zero_seen = B_TRUE;
		}
		/* Remember the rest of the chain; mp is unlinked below. */
		next_mp = mp->b_cont;
		/*
		 * We are checking to see if there is any interesting data
		 * to process.  If there isn't and the mblk isn't the
		 * one which carries the unfragmentable header then we
		 * drop it.  It's possible to have just the unfragmentable
		 * header come through without any data.  That needs to be
		 * saved.
		 *
		 * If the assert at the top of this function holds then the
		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
		 * is infrequently traveled enough that the test is left in
		 * to protect against future code changes which break that
		 * invariant.
		 */
		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
			/* Empty.  Blast it. */
			IP_REASS_SET_START(mp, 0);
			IP_REASS_SET_END(mp, 0);
			/*
			 * If the ipf points to the mblk we are about to free,
			 * update ipf to point to the next mblk (or NULL
			 * if none).
			 */
			if (ipf->ipf_mp->b_cont == mp)
				ipf->ipf_mp->b_cont = next_mp;
			freeb(mp);
			continue;
		}
		/* Detach this mblk and stamp it with the range it covers. */
		mp->b_cont = NULL;
		IP_REASS_SET_START(mp, start);
		IP_REASS_SET_END(mp, end);
		if (!ipf->ipf_tail_mp) {
			/* Nothing queued yet: this piece becomes the list. */
			ipf->ipf_tail_mp = mp;
			ipf->ipf_mp->b_cont = mp;
			if (start == 0 || !more) {
				ipf->ipf_hole_cnt = 1;
				/*
				 * if the first fragment comes in more than one
				 * mblk, this loop will be executed for each
				 * mblk. Need to adjust hole count so exiting
				 * this routine will leave hole count at 1.
				 */
				if (next_mp)
					ipf->ipf_hole_cnt++;
			} else
				ipf->ipf_hole_cnt = 2;
			continue;
		} else if (ipf->ipf_last_frag_seen && !more &&
		    !pkt_boundary_checked) {
			/*
			 * We check datagram boundary only if this fragment
			 * claims to be the last fragment and we have seen a
			 * last fragment in the past too. We do this only
			 * once for a given fragment.
			 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 gets handled as a complete packet. These
			 * fragments should not reach here.
			 */

			if (start + msgdsize(mp) !=
			    IP_REASS_END(ipf->ipf_tail_mp)) {
				/*
				 * We have two fragments both of which claim
				 * to be the last fragment but gives conflicting
				 * information about the whole datagram size.
				 * Something fishy is going on. Drop the
				 * fragment and free up the reassembly list.
				 */
				return (IP_REASS_FAILED);
			}

			/*
			 * We shouldn't come to this code block again for this
			 * particular fragment.
			 */
			pkt_boundary_checked = B_TRUE;
		}

		/* New stuff at or beyond tail? */
		offset = IP_REASS_END(ipf->ipf_tail_mp);
		if (start >= offset) {
			if (ipf->ipf_last_frag_seen) {
				/* current fragment is beyond last fragment */
				return (IP_REASS_FAILED);
			}
			/* Link it on end. */
			ipf->ipf_tail_mp->b_cont = mp;
			ipf->ipf_tail_mp = mp;
			if (more) {
				/* A gap before this piece is a new hole. */
				if (start != offset)
					ipf->ipf_hole_cnt++;
			} else if (start == offset && next_mp == NULL)
					ipf->ipf_hole_cnt--;
			continue;
		}
		mp1 = ipf->ipf_mp->b_cont;
		offset = IP_REASS_START(mp1);
		/* New stuff at the front? */
		if (start < offset) {
			if (start == 0) {
				if (end >= offset) {
					/* Nailed the hole at the beginning. */
					ipf->ipf_hole_cnt--;
				}
			} else if (end < offset) {
				/*
				 * A hole, stuff, and a hole where there used
				 * to be just a hole.
				 */
				ipf->ipf_hole_cnt++;
			}
			mp->b_cont = mp1;
			/* Check for overlap. */
			while (end > offset) {
				if (end < IP_REASS_END(mp1)) {
					/*
					 * Partial overlap with mp1: trim the
					 * overlapping bytes off the tail of
					 * the new piece.
					 */
					mp->b_wptr -= end - offset;
					IP_REASS_SET_END(mp, offset);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmPartDups);
					break;
				}
				/* Did we cover another hole? */
				if ((mp1->b_cont &&
				    IP_REASS_END(mp1) !=
				    IP_REASS_START(mp1->b_cont) &&
				    end >= IP_REASS_START(mp1->b_cont)) ||
				    (!ipf->ipf_last_frag_seen && !more)) {
					ipf->ipf_hole_cnt--;
				}
				/* Clip out mp1. */
				if ((mp->b_cont = mp1->b_cont) == NULL) {
					/*
					 * After clipping out mp1, this guy
					 * is now hanging off the end.
					 */
					ipf->ipf_tail_mp = mp;
				}
				IP_REASS_SET_START(mp1, 0);
				IP_REASS_SET_END(mp1, 0);
				/* Subtract byte count */
				ipf->ipf_count -= mp1->b_datap->db_lim -
				    mp1->b_datap->db_base;
				freeb(mp1);
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsReasmPartDups);
				mp1 = mp->b_cont;
				if (!mp1)
					break;
				offset = IP_REASS_START(mp1);
			}
			/* The new piece is now the head of the list. */
			ipf->ipf_mp->b_cont = mp;
			continue;
		}
		/*
		 * The new piece starts somewhere between the start of the head
		 * and before the end of the tail.
		 */
		for (; mp1; mp1 = mp1->b_cont) {
			offset = IP_REASS_END(mp1);
			if (start < offset) {
				if (end <= offset) {
					/* Nothing new. */
					IP_REASS_SET_START(mp, 0);
					IP_REASS_SET_END(mp, 0);
					/* Subtract byte count */
					ipf->ipf_count -= mp->b_datap->db_lim -
					    mp->b_datap->db_base;
					if (incr_dups) {
						ipf->ipf_num_dups++;
						incr_dups = B_FALSE;
					}
					freeb(mp);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmDuplicates);
					break;
				}
				/*
				 * Trim redundant stuff off beginning of new
				 * piece.
				 */
				IP_REASS_SET_START(mp, offset);
				mp->b_rptr += offset - start;
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsReasmPartDups);
				start = offset;
				if (!mp1->b_cont) {
					/*
					 * After trimming, this guy is now
					 * hanging off the end.
					 */
					mp1->b_cont = mp;
					ipf->ipf_tail_mp = mp;
					if (!more) {
						ipf->ipf_hole_cnt--;
					}
					break;
				}
			}
			/* Not before the next piece yet; keep walking. */
			if (start >= IP_REASS_START(mp1->b_cont))
				continue;
			/* Fill a hole */
			if (start > offset)
				ipf->ipf_hole_cnt++;
			/* Splice the new piece in right after mp1. */
			mp->b_cont = mp1->b_cont;
			mp1->b_cont = mp;
			mp1 = mp->b_cont;
			offset = IP_REASS_START(mp1);
			if (end >= offset) {
				ipf->ipf_hole_cnt--;
				/* Check for overlap. */
				while (end > offset) {
					if (end < IP_REASS_END(mp1)) {
						mp->b_wptr -= end - offset;
						IP_REASS_SET_END(mp, offset);
						/*
						 * TODO we might bump
						 * this up twice if there is
						 * overlap at both ends.
						 */
						BUMP_MIB(ill->ill_ip_mib,
						    ipIfStatsReasmPartDups);
						break;
					}
					/* Did we cover another hole? */
					if ((mp1->b_cont &&
					    IP_REASS_END(mp1)
					    != IP_REASS_START(mp1->b_cont) &&
					    end >=
					    IP_REASS_START(mp1->b_cont)) ||
					    (!ipf->ipf_last_frag_seen &&
					    !more)) {
						ipf->ipf_hole_cnt--;
					}
					/* Clip out mp1. */
					if ((mp->b_cont = mp1->b_cont) ==
					    NULL) {
						/*
						 * After clipping out mp1,
						 * this guy is now hanging
						 * off the end.
						 */
						ipf->ipf_tail_mp = mp;
					}
					IP_REASS_SET_START(mp1, 0);
					IP_REASS_SET_END(mp1, 0);
					/* Subtract byte count */
					ipf->ipf_count -=
					    mp1->b_datap->db_lim -
					    mp1->b_datap->db_base;
					freeb(mp1);
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsReasmPartDups);
					mp1 = mp->b_cont;
					if (!mp1)
						break;
					offset = IP_REASS_START(mp1);
				}
			}
			break;
		}
	} while (start = end, mp = next_mp);

	/* Fragment just processed could be the last one. Remember this fact */
	if (!more)
		ipf->ipf_last_frag_seen = B_TRUE;

	/* Still got holes? */
	if (ipf->ipf_hole_cnt)
		return (IP_REASS_PARTIAL);
	/* Clean up overloaded fields to avoid upstream disasters. */
	for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
		IP_REASS_SET_START(mp1, 0);
		IP_REASS_SET_END(mp1, 0);
	}
	return (IP_REASS_COMPLETE);
}
   7139 
   7140 /*
   7141  * Fragmentation reassembly.  Each ILL has a hash table for
   7142  * queuing packets undergoing reassembly for all IPIFs
   7143  * associated with the ILL.  The hash is based on the packet
   7144  * IP ident field.  The ILL frag hash table was allocated
   7145  * as a timer block at the time the ILL was created.  Whenever
   7146  * there is anything on the reassembly queue, the timer will
   7147  * be running.  Returns the reassembled packet if reassembly completes.
   7148  */
   7149 mblk_t *
   7150 ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   7151 {
   7152 	uint32_t	frag_offset_flags;
   7153 	mblk_t		*t_mp;
   7154 	ipaddr_t	dst;
   7155 	uint8_t		proto = ipha->ipha_protocol;
   7156 	uint32_t	sum_val;
   7157 	uint16_t	sum_flags;
   7158 	ipf_t		*ipf;
   7159 	ipf_t		**ipfp;
   7160 	ipfb_t		*ipfb;
   7161 	uint16_t	ident;
   7162 	uint32_t	offset;
   7163 	ipaddr_t	src;
   7164 	uint_t		hdr_length;
   7165 	uint32_t	end;
   7166 	mblk_t		*mp1;
   7167 	mblk_t		*tail_mp;
   7168 	size_t		count;
   7169 	size_t		msg_len;
   7170 	uint8_t		ecn_info = 0;
   7171 	uint32_t	packet_size;
   7172 	boolean_t	pruned = B_FALSE;
   7173 	ill_t		*ill = ira->ira_ill;
   7174 	ip_stack_t	*ipst = ill->ill_ipst;
   7175 
   7176 	/*
   7177 	 * Drop the fragmented as early as possible, if
   7178 	 * we don't have resource(s) to re-assemble.
   7179 	 */
   7180 	if (ipst->ips_ip_reass_queue_bytes == 0) {
   7181 		freemsg(mp);
   7182 		return (NULL);
   7183 	}
   7184 
   7185 	/* Check for fragmentation offset; return if there's none */
   7186 	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
   7187 	    (IPH_MF | IPH_OFFSET)) == 0)
   7188 		return (mp);
   7189 
   7190 	/*
   7191 	 * We utilize hardware computed checksum info only for UDP since
   7192 	 * IP fragmentation is a normal occurrence for the protocol.  In
   7193 	 * addition, checksum offload support for IP fragments carrying
   7194 	 * UDP payload is commonly implemented across network adapters.
   7195 	 */
   7196 	ASSERT(ira->ira_rill != NULL);
   7197 	if (proto == IPPROTO_UDP && dohwcksum &&
   7198 	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
   7199 	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
   7200 		mblk_t *mp1 = mp->b_cont;
   7201 		int32_t len;
   7202 
   7203 		/* Record checksum information from the packet */
   7204 		sum_val = (uint32_t)DB_CKSUM16(mp);
   7205 		sum_flags = DB_CKSUMFLAGS(mp);
   7206 
   7207 		/* IP payload offset from beginning of mblk */
   7208 		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
   7209 
   7210 		if ((sum_flags & HCK_PARTIALCKSUM) &&
   7211 		    (mp1 == NULL || mp1->b_cont == NULL) &&
   7212 		    offset >= DB_CKSUMSTART(mp) &&
   7213 		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
   7214 			uint32_t adj;
   7215 			/*
   7216 			 * Partial checksum has been calculated by hardware
   7217 			 * and attached to the packet; in addition, any
   7218 			 * prepended extraneous data is even byte aligned.
   7219 			 * If any such data exists, we adjust the checksum;
   7220 			 * this would also handle any postpended data.
   7221 			 */
   7222 			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
   7223 			    mp, mp1, len, adj);
   7224 
   7225 			/* One's complement subtract extraneous checksum */
   7226 			if (adj >= sum_val)
   7227 				sum_val = ~(adj - sum_val) & 0xFFFF;
   7228 			else
   7229 				sum_val -= adj;
   7230 		}
   7231 	} else {
   7232 		sum_val = 0;
   7233 		sum_flags = 0;
   7234 	}
   7235 
   7236 	/* Clear hardware checksumming flag */
   7237 	DB_CKSUMFLAGS(mp) = 0;
   7238 
   7239 	ident = ipha->ipha_ident;
   7240 	offset = (frag_offset_flags << 3) & 0xFFFF;
   7241 	src = ipha->ipha_src;
   7242 	dst = ipha->ipha_dst;
   7243 	hdr_length = IPH_HDR_LENGTH(ipha);
   7244 	end = ntohs(ipha->ipha_length) - hdr_length;
   7245 
   7246 	/* If end == 0 then we have a packet with no data, so just free it */
   7247 	if (end == 0) {
   7248 		freemsg(mp);
   7249 		return (NULL);
   7250 	}
   7251 
   7252 	/* Record the ECN field info. */
   7253 	ecn_info = (ipha->ipha_type_of_service & 0x3);
   7254 	if (offset != 0) {
   7255 		/*
   7256 		 * If this isn't the first piece, strip the header, and
   7257 		 * add the offset to the end value.
   7258 		 */
   7259 		mp->b_rptr += hdr_length;
   7260 		end += offset;
   7261 	}
   7262 
   7263 	/* Handle vnic loopback of fragments */
   7264 	if (mp->b_datap->db_ref > 2)
   7265 		msg_len = 0;
   7266 	else
   7267 		msg_len = MBLKSIZE(mp);
   7268 
   7269 	tail_mp = mp;
   7270 	while (tail_mp->b_cont != NULL) {
   7271 		tail_mp = tail_mp->b_cont;
   7272 		if (tail_mp->b_datap->db_ref <= 2)
   7273 			msg_len += MBLKSIZE(tail_mp);
   7274 	}
   7275 
   7276 	/* If the reassembly list for this ILL will get too big, prune it */
   7277 	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
   7278 	    ipst->ips_ip_reass_queue_bytes) {
   7279 		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
   7280 		    uint_t, ill->ill_frag_count,
   7281 		    uint_t, ipst->ips_ip_reass_queue_bytes);
   7282 		ill_frag_prune(ill,
   7283 		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
   7284 		    (ipst->ips_ip_reass_queue_bytes - msg_len));
   7285 		pruned = B_TRUE;
   7286 	}
   7287 
   7288 	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
   7289 	mutex_enter(&ipfb->ipfb_lock);
   7290 
   7291 	ipfp = &ipfb->ipfb_ipf;
   7292 	/* Try to find an existing fragment queue for this packet. */
   7293 	for (;;) {
   7294 		ipf = ipfp[0];
   7295 		if (ipf != NULL) {
   7296 			/*
   7297 			 * It has to match on ident and src/dst address.
   7298 			 */
   7299 			if (ipf->ipf_ident == ident &&
   7300 			    ipf->ipf_src == src &&
   7301 			    ipf->ipf_dst == dst &&
   7302 			    ipf->ipf_protocol == proto) {
   7303 				/*
   7304 				 * If we have received too many
   7305 				 * duplicate fragments for this packet
   7306 				 * free it.
   7307 				 */
   7308 				if (ipf->ipf_num_dups > ip_max_frag_dups) {
   7309 					ill_frag_free_pkts(ill, ipfb, ipf, 1);
   7310 					freemsg(mp);
   7311 					mutex_exit(&ipfb->ipfb_lock);
   7312 					return (NULL);
   7313 				}
   7314 				/* Found it. */
   7315 				break;
   7316 			}
   7317 			ipfp = &ipf->ipf_hash_next;
   7318 			continue;
   7319 		}
   7320 
   7321 		/*
   7322 		 * If we pruned the list, do we want to store this new
   7323 		 * fragment?. We apply an optimization here based on the
   7324 		 * fact that most fragments will be received in order.
   7325 		 * So if the offset of this incoming fragment is zero,
   7326 		 * it is the first fragment of a new packet. We will
   7327 		 * keep it.  Otherwise drop the fragment, as we have
   7328 		 * probably pruned the packet already (since the
   7329 		 * packet cannot be found).
   7330 		 */
   7331 		if (pruned && offset != 0) {
   7332 			mutex_exit(&ipfb->ipfb_lock);
   7333 			freemsg(mp);
   7334 			return (NULL);
   7335 		}
   7336 
   7337 		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
   7338 			/*
   7339 			 * Too many fragmented packets in this hash
   7340 			 * bucket. Free the oldest.
   7341 			 */
   7342 			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
   7343 		}
   7344 
   7345 		/* New guy.  Allocate a frag message. */
   7346 		mp1 = allocb(sizeof (*ipf), BPRI_MED);
   7347 		if (mp1 == NULL) {
   7348 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7349 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7350 			freemsg(mp);
   7351 reass_done:
   7352 			mutex_exit(&ipfb->ipfb_lock);
   7353 			return (NULL);
   7354 		}
   7355 
   7356 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
   7357 		mp1->b_cont = mp;
   7358 
   7359 		/* Initialize the fragment header. */
   7360 		ipf = (ipf_t *)mp1->b_rptr;
   7361 		ipf->ipf_mp = mp1;
   7362 		ipf->ipf_ptphn = ipfp;
   7363 		ipfp[0] = ipf;
   7364 		ipf->ipf_hash_next = NULL;
   7365 		ipf->ipf_ident = ident;
   7366 		ipf->ipf_protocol = proto;
   7367 		ipf->ipf_src = src;
   7368 		ipf->ipf_dst = dst;
   7369 		ipf->ipf_nf_hdr_len = 0;
   7370 		/* Record reassembly start time. */
   7371 		ipf->ipf_timestamp = gethrestime_sec();
   7372 		/* Record ipf generation and account for frag header */
   7373 		ipf->ipf_gen = ill->ill_ipf_gen++;
   7374 		ipf->ipf_count = MBLKSIZE(mp1);
   7375 		ipf->ipf_last_frag_seen = B_FALSE;
   7376 		ipf->ipf_ecn = ecn_info;
   7377 		ipf->ipf_num_dups = 0;
   7378 		ipfb->ipfb_frag_pkts++;
   7379 		ipf->ipf_checksum = 0;
   7380 		ipf->ipf_checksum_flags = 0;
   7381 
   7382 		/* Store checksum value in fragment header */
   7383 		if (sum_flags != 0) {
   7384 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7385 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7386 			ipf->ipf_checksum = sum_val;
   7387 			ipf->ipf_checksum_flags = sum_flags;
   7388 		}
   7389 
   7390 		/*
   7391 		 * We handle reassembly two ways.  In the easy case,
   7392 		 * where all the fragments show up in order, we do
   7393 		 * minimal bookkeeping, and just clip new pieces on
   7394 		 * the end.  If we ever see a hole, then we go off
   7395 		 * to ip_reassemble which has to mark the pieces and
   7396 		 * keep track of the number of holes, etc.  Obviously,
   7397 		 * the point of having both mechanisms is so we can
   7398 		 * handle the easy case as efficiently as possible.
   7399 		 */
   7400 		if (offset == 0) {
   7401 			/* Easy case, in-order reassembly so far. */
   7402 			ipf->ipf_count += msg_len;
   7403 			ipf->ipf_tail_mp = tail_mp;
   7404 			/*
   7405 			 * Keep track of next expected offset in
   7406 			 * ipf_end.
   7407 			 */
   7408 			ipf->ipf_end = end;
   7409 			ipf->ipf_nf_hdr_len = hdr_length;
   7410 		} else {
   7411 			/* Hard case, hole at the beginning. */
   7412 			ipf->ipf_tail_mp = NULL;
   7413 			/*
   7414 			 * ipf_end == 0 means that we have given up
   7415 			 * on easy reassembly.
   7416 			 */
   7417 			ipf->ipf_end = 0;
   7418 
   7419 			/* Forget checksum offload from now on */
   7420 			ipf->ipf_checksum_flags = 0;
   7421 
   7422 			/*
   7423 			 * ipf_hole_cnt is set by ip_reassemble.
   7424 			 * ipf_count is updated by ip_reassemble.
   7425 			 * No need to check for return value here
   7426 			 * as we don't expect reassembly to complete
   7427 			 * or fail for the first fragment itself.
   7428 			 */
   7429 			(void) ip_reassemble(mp, ipf,
   7430 			    (frag_offset_flags & IPH_OFFSET) << 3,
   7431 			    (frag_offset_flags & IPH_MF), ill, msg_len);
   7432 		}
   7433 		/* Update per ipfb and ill byte counts */
   7434 		ipfb->ipfb_count += ipf->ipf_count;
   7435 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7436 		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
   7437 		/* If the frag timer wasn't already going, start it. */
   7438 		mutex_enter(&ill->ill_lock);
   7439 		ill_frag_timer_start(ill);
   7440 		mutex_exit(&ill->ill_lock);
   7441 		goto reass_done;
   7442 	}
   7443 
   7444 	/*
   7445 	 * If the packet's flag has changed (it could be coming up
   7446 	 * from an interface different than the previous, therefore
   7447 	 * possibly different checksum capability), then forget about
   7448 	 * any stored checksum states.  Otherwise add the value to
   7449 	 * the existing one stored in the fragment header.
   7450 	 */
   7451 	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
   7452 		sum_val += ipf->ipf_checksum;
   7453 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7454 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7455 		ipf->ipf_checksum = sum_val;
   7456 	} else if (ipf->ipf_checksum_flags != 0) {
   7457 		/* Forget checksum offload from now on */
   7458 		ipf->ipf_checksum_flags = 0;
   7459 	}
   7460 
   7461 	/*
   7462 	 * We have a new piece of a datagram which is already being
   7463 	 * reassembled.  Update the ECN info if all IP fragments
   7464 	 * are ECN capable.  If there is one which is not, clear
   7465 	 * all the info.  If there is at least one which has CE
   7466 	 * code point, IP needs to report that up to transport.
   7467 	 */
   7468 	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
   7469 		if (ecn_info == IPH_ECN_CE)
   7470 			ipf->ipf_ecn = IPH_ECN_CE;
   7471 	} else {
   7472 		ipf->ipf_ecn = IPH_ECN_NECT;
   7473 	}
   7474 	if (offset && ipf->ipf_end == offset) {
   7475 		/* The new fragment fits at the end */
   7476 		ipf->ipf_tail_mp->b_cont = mp;
   7477 		/* Update the byte count */
   7478 		ipf->ipf_count += msg_len;
   7479 		/* Update per ipfb and ill byte counts */
   7480 		ipfb->ipfb_count += msg_len;
   7481 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7482 		atomic_add_32(&ill->ill_frag_count, msg_len);
   7483 		if (frag_offset_flags & IPH_MF) {
   7484 			/* More to come. */
   7485 			ipf->ipf_end = end;
   7486 			ipf->ipf_tail_mp = tail_mp;
   7487 			goto reass_done;
   7488 		}
   7489 	} else {
   7490 		/* Go do the hard cases. */
   7491 		int ret;
   7492 
   7493 		if (offset == 0)
   7494 			ipf->ipf_nf_hdr_len = hdr_length;
   7495 
   7496 		/* Save current byte count */
   7497 		count = ipf->ipf_count;
   7498 		ret = ip_reassemble(mp, ipf,
   7499 		    (frag_offset_flags & IPH_OFFSET) << 3,
   7500 		    (frag_offset_flags & IPH_MF), ill, msg_len);
   7501 		/* Count of bytes added and subtracted (freeb()ed) */
   7502 		count = ipf->ipf_count - count;
   7503 		if (count) {
   7504 			/* Update per ipfb and ill byte counts */
   7505 			ipfb->ipfb_count += count;
   7506 			ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
   7507 			atomic_add_32(&ill->ill_frag_count, count);
   7508 		}
   7509 		if (ret == IP_REASS_PARTIAL) {
   7510 			goto reass_done;
   7511 		} else if (ret == IP_REASS_FAILED) {
   7512 			/* Reassembly failed. Free up all resources */
   7513 			ill_frag_free_pkts(ill, ipfb, ipf, 1);
   7514 			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
   7515 				IP_REASS_SET_START(t_mp, 0);
   7516 				IP_REASS_SET_END(t_mp, 0);
   7517 			}
   7518 			freemsg(mp);
   7519 			goto reass_done;
   7520 		}
   7521 		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
   7522 	}
   7523 	/*
   7524 	 * We have completed reassembly.  Unhook the frag header from
   7525 	 * the reassembly list.
   7526 	 *
   7527 	 * Before we free the frag header, record the ECN info
   7528 	 * to report back to the transport.
   7529 	 */
   7530 	ecn_info = ipf->ipf_ecn;
   7531 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
   7532 	ipfp = ipf->ipf_ptphn;
   7533 
   7534 	/* We need to supply these to caller */
   7535 	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
   7536 		sum_val = ipf->ipf_checksum;
   7537 	else
   7538 		sum_val = 0;
   7539 
   7540 	mp1 = ipf->ipf_mp;
   7541 	count = ipf->ipf_count;
   7542 	ipf = ipf->ipf_hash_next;
   7543 	if (ipf != NULL)
   7544 		ipf->ipf_ptphn = ipfp;
   7545 	ipfp[0] = ipf;
   7546 	atomic_add_32(&ill->ill_frag_count, -count);
   7547 	ASSERT(ipfb->ipfb_count >= count);
   7548 	ipfb->ipfb_count -= count;
   7549 	ipfb->ipfb_frag_pkts--;
   7550 	mutex_exit(&ipfb->ipfb_lock);
   7551 	/* Ditch the frag header. */
   7552 	mp = mp1->b_cont;
   7553 
   7554 	freeb(mp1);
   7555 
   7556 	/* Restore original IP length in header. */
   7557 	packet_size = (uint32_t)msgdsize(mp);
   7558 	if (packet_size > IP_MAXPACKET) {
   7559 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7560 		ip_drop_input("Reassembled packet too large", mp, ill);
   7561 		freemsg(mp);
   7562 		return (NULL);
   7563 	}
   7564 
   7565 	if (DB_REF(mp) > 1) {
   7566 		mblk_t *mp2 = copymsg(mp);
   7567 
   7568 		if (mp2 == NULL) {
   7569 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7570 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7571 			freemsg(mp);
   7572 			return (NULL);
   7573 		}
   7574 		freemsg(mp);
   7575 		mp = mp2;
   7576 	}
   7577 	ipha = (ipha_t *)mp->b_rptr;
   7578 
   7579 	ipha->ipha_length = htons((uint16_t)packet_size);
   7580 	/* We're now complete, zip the frag state */
   7581 	ipha->ipha_fragment_offset_and_flags = 0;
   7582 	/* Record the ECN info. */
   7583 	ipha->ipha_type_of_service &= 0xFC;
   7584 	ipha->ipha_type_of_service |= ecn_info;
   7585 
   7586 	/* Update the receive attributes */
   7587 	ira->ira_pktlen = packet_size;
   7588 	ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   7589 
   7590 	/* Reassembly is successful; set checksum information in packet */
   7591 	DB_CKSUM16(mp) = (uint16_t)sum_val;
   7592 	DB_CKSUMFLAGS(mp) = sum_flags;
   7593 	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
   7594 
   7595 	return (mp);
   7596 }
   7597 
   7598 /*
   7599  * Pullup function that should be used for IP input in order to
   7600  * ensure we do not lose the L2 source address; we need the L2 source
   7601  * address for IP_RECVSLLA and for ndp_input.
   7602  *
   7603  * We return either NULL or b_rptr.
   7604  */
   7605 void *
   7606 ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
   7607 {
   7608 	ill_t		*ill = ira->ira_ill;
   7609 
   7610 	if (ip_rput_pullups++ == 0) {
   7611 		(void) m