Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/dlpi.h>
     31 #include <sys/stropts.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/strsubr.h>
     34 #include <sys/strlog.h>
     35 #include <sys/strsun.h>
     36 #include <sys/zone.h>
     37 #define	_SUN_TPI_VERSION 2
     38 #include <sys/tihdr.h>
     39 #include <sys/xti_inet.h>
     40 #include <sys/ddi.h>
     41 #include <sys/suntpi.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/debug.h>
     44 #include <sys/kobj.h>
     45 #include <sys/modctl.h>
     46 #include <sys/atomic.h>
     47 #include <sys/policy.h>
     48 #include <sys/priv.h>
     49 #include <sys/taskq.h>
     50 
     51 #include <sys/systm.h>
     52 #include <sys/param.h>
     53 #include <sys/kmem.h>
     54 #include <sys/sdt.h>
     55 #include <sys/socket.h>
     56 #include <sys/vtrace.h>
     57 #include <sys/isa_defs.h>
     58 #include <sys/mac.h>
     59 #include <net/if.h>
     60 #include <net/if_arp.h>
     61 #include <net/route.h>
     62 #include <sys/sockio.h>
     63 #include <netinet/in.h>
     64 #include <net/if_dl.h>
     65 
     66 #include <inet/common.h>
     67 #include <inet/mi.h>
     68 #include <inet/mib2.h>
     69 #include <inet/nd.h>
     70 #include <inet/arp.h>
     71 #include <inet/snmpcom.h>
     72 #include <inet/optcom.h>
     73 #include <inet/kstatcom.h>
     74 
     75 #include <netinet/igmp_var.h>
     76 #include <netinet/ip6.h>
     77 #include <netinet/icmp6.h>
     78 #include <netinet/sctp.h>
     79 
     80 #include <inet/ip.h>
     81 #include <inet/ip_impl.h>
     82 #include <inet/ip6.h>
     83 #include <inet/ip6_asp.h>
     84 #include <inet/tcp.h>
     85 #include <inet/tcp_impl.h>
     86 #include <inet/ip_multi.h>
     87 #include <inet/ip_if.h>
     88 #include <inet/ip_ire.h>
     89 #include <inet/ip_ftable.h>
     90 #include <inet/ip_rts.h>
     91 #include <inet/ip_ndp.h>
     92 #include <inet/ip_listutils.h>
     93 #include <netinet/igmp.h>
     94 #include <netinet/ip_mroute.h>
     95 #include <inet/ipp_common.h>
     96 
     97 #include <net/pfkeyv2.h>
     98 #include <inet/sadb.h>
     99 #include <inet/ipsec_impl.h>
    100 #include <inet/iptun/iptun_impl.h>
    101 #include <inet/ipdrop.h>
    102 #include <inet/ip_netinfo.h>
    103 #include <inet/ilb_ip.h>
    104 
    105 #include <sys/ethernet.h>
    106 #include <net/if_types.h>
    107 #include <sys/cpuvar.h>
    108 
    109 #include <ipp/ipp.h>
    110 #include <ipp/ipp_impl.h>
    111 #include <ipp/ipgpc/ipgpc.h>
    112 
    113 #include <sys/pattr.h>
    114 #include <inet/ipclassifier.h>
    115 #include <inet/sctp_ip.h>
    116 #include <inet/sctp/sctp_impl.h>
    117 #include <inet/udp_impl.h>
    118 #include <inet/rawip_impl.h>
    119 #include <inet/rts_impl.h>
    120 
    121 #include <sys/tsol/label.h>
    122 #include <sys/tsol/tnet.h>
    123 
    124 #include <sys/squeue_impl.h>
    125 #include <inet/ip_arp.h>
    126 
    127 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    128 
    129 /*
    130  * Values for squeue switch:
    131  * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
    132  * IP_SQUEUE_ENTER: SQ_PROCESS
    133  * IP_SQUEUE_FILL: SQ_FILL
    134  */
    135 int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
    136 
    137 int ip_squeue_flag;	/* presumably SQ_* flag chosen by ip_squeue_enter; confirm */
    138 
    139 /*
    140  * Settable in /etc/system
    141  */
    142 int ip_poll_normal_ms = 100;
    143 int ip_poll_normal_ticks = 0;
    144 int ip_modclose_ackwait_ms = 3000;
    145 
    146 /*
    147  * It would be nice to have these present only in DEBUG systems, but the
    148  * current design of the global symbol checking logic requires them to be
    149  * unconditionally present.
    150  */
    151 uint_t ip_thread_data;			/* TSD key for debug support */
    152 krwlock_t ip_thread_rwlock;		/* presumably guards ip_thread_list; confirm */
    153 list_t	ip_thread_list;
    154 
    155 /*
    156  * Head/tail of a linked list of mblk_t msgblks. Used by ip_snmp_ functions.
    157  */
    158 
    159 struct listptr_s {
    160 	mblk_t	*lp_head;	/* pointer to the head of the list */
    161 	mblk_t	*lp_tail;	/* pointer to the tail of the list */
    162 };
    163 
    164 typedef struct listptr_s listptr_t;
    165 
    166 /*
    167  * This is used by ip_snmp_get_mib2_ip_route_media and
    168  * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
    169  */
    170 typedef struct iproutedata_s {
    171 	uint_t		ird_idx;
    172 	uint_t		ird_flags;	/* IRD_* flags; see below */
    173 	listptr_t	ird_route;	/* ipRouteEntryTable */
    174 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    175 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    176 } iproutedata_t;
    177 
    178 /* ird_flags: include ire_testhidden and IRE_IF_CLONE routes */
    179 #define	IRD_REPORT_ALL	0x01
    180 
    181 /*
    182  * Cluster specific hooks. These should be NULL when booted as a non-cluster system.
    183  */
    184 
    185 /*
    186  * Hook functions to enable cluster networking.
    187  * On non-clustered systems these vectors must always be NULL.
    188  *
    189  * Hook function to check whether the specified IP address is a shared IP
    190  * address in the cluster.
    191  *
    192  */
    193 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    194     sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
    195 
    196 /*
    197  * Hook function to generate a cluster wide IP fragment identifier.
    198  */
    199 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    200     sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    201     void *args) = NULL;
    202 
    203 /*
    204  * Hook function to generate cluster wide SPI.
    205  */
    206 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    207     void *) = NULL;
    208 
    209 /*
    210  * Hook function to verify if the SPI is already utilized.
    211  */
    212 
    213 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    214 
    215 /*
    216  * Hook function to delete the SPI from the cluster wide repository.
    217  */
    218 
    219 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    220 
    221 /*
    222  * Hook function to inform the cluster when a packet is received on an IDLE SA.
    223  */
    224 
    225 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    226     in6_addr_t, in6_addr_t, void *) = NULL;
    227 
    228 /*
    229  * Synchronization notes:
    230  *
    231  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    232  * MT level protection given by STREAMS. IP uses a combination of its own
    233  * internal serialization mechanism and standard Solaris locking techniques.
    234  * The internal serialization is per phyint.  This is used to serialize
    235  * plumbing operations, IPMP operations, most set ioctls, etc.
    236  *
    237  * Plumbing is a long sequence of operations involving message
    238  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    239  * involved in plumbing operations. A natural model is to serialize these
    240  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    241  * parallel without any interference. But various set ioctls on hme0 are best
    242  * serialized, along with IPMP operations and processing of DLPI control
    243  * messages received from drivers on a per phyint basis. This serialization is
    244  * provided by the ipsq_t and primitives operating on this. Details can
    245  * be found in ip_if.c above the core primitives operating on ipsq_t.
    246  *
    247  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    248  * Similarly lookup of an ire by a thread also returns a refheld ire.
    249  * In addition ipif's and ill's referenced by the ire are also indirectly
    250  * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
    251  * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
    252  * address of an ipif has to go through the ipsq_t. This ensures that only
    253  * one such exclusive operation proceeds at any time on the ipif. It then
    254  * waits for all refcnts
    255  * associated with this ipif to come down to zero. The address is changed
    256  * only after the ipif has been quiesced. Then the ipif is brought up again.
    257  * More details are described above the comment in ip_sioctl_flags.
    258  *
    259  * Packet processing is based mostly on IREs and are fully multi-threaded
    260  * using standard Solaris MT techniques.
    261  *
    262  * There are explicit locks in IP to handle:
    263  * - The ip_g_head list maintained by mi_open_link() and friends.
    264  *
    265  * - The reassembly data structures (one lock per hash bucket)
    266  *
    267  * - conn_lock is meant to protect conn_t fields. The fields actually
    268  *   protected by conn_lock are documented in the conn_t definition.
    269  *
    270  * - ire_lock to protect some of the fields of the ire, IRE tables
    271  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    272  *
    273  * - ndp_g_lock and ncec_lock for protecting NCEs.
    274  *
    275  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    276  *
    277  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    278  *	* The AVL tree based global multi list of all ills.
    279  *	* The linked list of all ipifs of an ill
    280  *	* The <ipsq-xop> mapping
    281  *	* <ill-phyint> association
    282  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    283  *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
    284  *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
    285  *   writer for the actual duration of the insertion/deletion/change.
    286  *
    287  * - ill_lock:  This is a per ill mutex.
    288  *   It protects some members of the ill_t struct; see ip.h for details.
    289  *   It also protects the <ill-phyint> assoc.
    290  *   It also protects the list of ipifs hanging off the ill.
    291  *
    292  * - ipsq_lock: This is a per ipsq_t mutex lock.
    293  *   This protects some members of the ipsq_t struct; see ip.h for details.
    294  *   It also protects the <ipsq-ipxop> mapping
    295  *
    296  * - ipx_lock: This is a per ipxop_t mutex lock.
    297  *   This protects some members of the ipxop_t struct; see ip.h for details.
    298  *
    299  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    300  *   phyint_flags
    301  *
    302  * - ip_g_nd_lock: This is a global reader/writer lock.
    303  *   Any call to nd_load to load a new parameter to the ND table must hold the
    304  *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
    305  *   as reader.
    306  *
    307  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    308  *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
    309  *   uniqueness check also done atomically.
    310  *
    311  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    312  *   group list linked by ill_usesrc_grp_next. It also protects the
    313  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    314  *   group is being added or deleted.  This lock is taken as a reader when
    315  *   walking the list/group(eg: to get the number of members in a usesrc group).
    316  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    317  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    318  *   example, it is not necessary to take this lock in the initial portion
    319  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
    320  *   operations are executed exclusively and that ensures that the "usesrc
    321  *   group state" cannot change. The "usesrc group state" change can happen
    322  *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
    323  *
    324  * Changing <ill-phyint>, <ipsq-xop> associations:
    325  *
    326  * To change the <ill-phyint> association, the ill_g_lock must be held
    327  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    328  * must be held.
    329  *
    330  * To change the <ipsq-xop> association, the ill_g_lock must be held as
    331  * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
    332  * This is only done when ills are added or removed from IPMP groups.
    333  *
    334  * To add or delete an ipif from the list of ipifs hanging off the ill,
    335  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    336  * a writer on the associated ipsq.
    337  *
    338  * To add or delete an ill to the system, the ill_g_lock must be held as
    339  * writer and the thread must be a writer on the associated ipsq.
    340  *
    341  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    342  * must be a writer on the associated ipsq.
    343  *
    344  * Lock hierarchy
    345  *
    346  * Some lock hierarchy scenarios are listed below.
    347  *
    348  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
    349  * ill_g_lock -> ill_lock(s) -> phyint_lock
    350  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
    351  * ill_g_lock -> ip_addr_avail_lock
    352  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    353  * ill_g_lock -> ip_g_nd_lock
    354  * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
    355  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
    356  * arl_lock -> ill_lock
    357  * ips_ire_dep_lock -> irb_lock
    358  *
    359  * When more than 1 ill lock is needed to be held, all ill lock addresses
    360  * are sorted on address and locked starting from highest addressed lock
    361  * downward.
    362  *
    363  * Multicast scenarios
    364  * ips_ill_g_lock -> ill_mcast_lock
    365  * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
    366  * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
    367  * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
    368  * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
    369  * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
    370  *
    371  * IPsec scenarios
    372  *
    373  * ipsa_lock -> ill_g_lock -> ill_lock
    374  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    375  *
    376  * Trusted Solaris scenarios
    377  *
    378  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    379  * igsa_lock -> gcdb_lock
    380  * gcgrp_rwlock -> ire_lock
    381  * gcgrp_rwlock -> gcdb_lock
    382  *
    383  * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
    384  *
    385  * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
    386  * sq_lock -> conn_lock -> QLOCK(q)
    387  * ill_lock -> ft_lock -> fe_lock
    388  *
    389  * Routing/forwarding table locking notes:
    390  *
    391  * Lock acquisition order: Radix tree lock, irb_lock.
    392  * Requirements:
    393  * i.  Walker must not hold any locks during the walker callback.
    394  * ii  Walker must not see a truncated tree during the walk because of any node
    395  *     deletion.
    396  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    397  *     in many places in the code to walk the irb list. Thus even if all the
    398  *     ires in a bucket have been deleted, we still can't free the radix node
    399  *     until the ires have actually been inactive'd (freed).
    400  *
    401  * Tree traversal - Need to hold the global tree lock in read mode.
    402  * Before dropping the global tree lock, need to increment either the
    403  * ire_refcnt or the irb_refcnt to ensure that the radix node can't be deleted.
    404  *
    405  * Tree add - Need to hold the global tree lock in write mode to add a
    406  * radix node. To prevent the node from being deleted, increment the
    407  * irb_refcnt, after the node is added to the tree. The ire itself is
    408  * added later while holding the irb_lock, but not the tree lock.
    409  *
    410  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    411  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    412  * must be zero.
    413  *
    414  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    415  * global tree lock (read mode) for traversal.
    416  *
    417  * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
    418  * hence we will acquire irb_lock while holding ips_ire_dep_lock.
    419  *
    420  * IPsec notes :
    421  *
    422  * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
    423  * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
    424  * ip_xmit_attr_t has the
    425  * information used by the IPsec code for applying the right level of
    426  * protection. The information initialized by IP in the ip_xmit_attr_t
    427  * is determined by the per-socket policy or global policy in the system.
    428  * For inbound datagrams, the ip_recv_attr_t
    429  * starts out with nothing in it. It gets filled
    430  * with the right information if it goes through the AH/ESP code, which
    431  * happens if the incoming packet is secure. The information initialized
    432  * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
    433  * the policy requirements needed by per-socket policy or global policy
    434  * is met or not.
    435  *
    436  * For fully connected sockets i.e dst, src [addr, port] is known,
    437  * conn_policy_cached is set indicating that policy has been cached.
    438  * conn_in_enforce_policy may or may not be set depending on whether
    439  * there is a global policy match or per-socket policy match.
    440  * Policy inheriting happens in ip_policy_set once the destination is known.
    441  * Once the right policy is set on the conn_t, policy cannot change for
    442  * this socket. This makes life simpler for TCP (UDP ?) where
    443  * re-transmissions go out with the same policy. For symmetry, policy
    444  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    445  * it also implies that policy is latched i.e policy cannot change
    446  * on these sockets. As we have the right policy on the conn, we don't
    447  * have to lookup global policy for every outbound and inbound datagram
    448  * and thus serving as an optimization. Note that a global policy change
    449  * does not affect fully connected sockets if they have policy. If fully
    450  * connected sockets did not have any policy associated with it, global
    451  * policy change may affect them.
    452  *
    453  * IP Flow control notes:
    454  * ---------------------
    455  * Non-TCP streams are flow controlled by IP. The way this is accomplished
    456  * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
    457  * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
    458  * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
    459  * functions.
    460  *
    461  * Per Tx ring udp flow control:
    462  * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
    463  * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
    464  *
    465  * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
    466  * To achieve best performance, outgoing traffic needs to be fanned out among
    467  * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
    468  * traffic out of the NIC and it takes a fanout hint. UDP connections pass
    469  * the address of connp as fanout hint to mac_tx(). Under flow controlled
    470  * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
    471  * cookie points to a specific Tx ring that is blocked. The cookie is used to
    472  * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t
    473  * points to drain_lists (idl_t's). These drain lists store the blocked UDP
    474  * connp's. The drain list is not a single list but a configurable number of
    475  * lists.
    476  *
    477  * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
    478  * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
    479  * which is equal to 128. This array in turn contains a pointer to idl_t[],
    480  * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
    481  * list will point to the list of connp's that are flow controlled.
    482  *
    483  *                      ---------------   -------   -------   -------
    484  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    485  *                   |  ---------------   -------   -------   -------
    486  *                   |  ---------------   -------   -------   -------
    487  *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    488  * ----------------  |  ---------------   -------   -------   -------
    489  * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
    490  * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
    491  *                   |  ---------------   -------   -------   -------
    492  *                   .        .              .         .         .
    493  *                   |  ---------------   -------   -------   -------
    494  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    495  *                      ---------------   -------   -------   -------
    496  *                      ---------------   -------   -------   -------
    497  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    498  *                   |  ---------------   -------   -------   -------
    499  *                   |  ---------------   -------   -------   -------
    500  * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    501  * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
    502  * ----------------  |        .              .         .         .
    503  *                   |  ---------------   -------   -------   -------
    504  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    505  *                      ---------------   -------   -------   -------
    506  *     .....
    507  * ----------------
    508  * |idl_tx_list[n]|-> ...
    509  * ----------------
    510  *
    511  * When mac_tx() returns a cookie, the cookie is used to hash into a
    512  * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is
    513  * called passing idl_tx_list. The connp gets inserted in a drain list
    514  * pointed to by idl_tx_list. conn_drain_list() asserts flow control for
    515  * the sockets (non stream based) and sets QFULL condition on the conn_wq
    516  * of streams sockets, or the su_txqfull for non-streams sockets.
    517  * connp->conn_direct_blocked will be set to indicate the blocked
    518  * condition.
    519  *
    520  * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
    521  * A cookie is passed in the call to ill_flow_enable() that identifies the
    522  * blocked Tx ring. This cookie is used to get to the idl_tx_list that
    523  * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
    524  * and goes through each conn in the drain list and calls conn_idl_remove
    525  * for the conn to clear the qfull condition for the conn, as well as to
    526  * remove the conn from the idl list. In addition, streams based sockets
    527  * will have the conn_wq enabled, causing ip_wsrv to run for the
    528  * conn. ip_wsrv drains the queued messages, and removes the conn from the
    529  * drain list, if all messages were drained. It also notifies the
    530  * conn_upcalls for the conn to signal that flow-control has opened up.
    531  *
    532  * In reality the drain list is not a single list, but a configurable number
    533  * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for
    534  * each conn in the list. conn_drain_insert and conn_drain_tail are the only
    535  * functions that manipulate this drain list. conn_drain_insert is called
    536  * from the protocol layer when conn_ip_output returns EWOULDBLOCK.
    537  * (as opposed to from ip_wsrv context for STREAMS
    538  * case -- see below). The synchronization between drain insertion and flow
    539  * control wakeup is handled by using idl_txl->txl_lock.
    540  *
    541  * Flow control using STREAMS:
    542  * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
    543  * is used. On the send side, if the packet cannot be sent down to the
    544  * driver by IP, because of a canput failure, ip_xmit drops the packet
    545  * and returns EWOULDBLOCK to the caller, who may then invoke
    546  * ixa_check_drain_insert to insert the conn on the 0'th drain list.
    547  * When ip_wsrv runs on the ill_wq because flow control has been relieved, the
    548  * blocked conns in the 0'th drain list are drained as with the
    549  * non-STREAMS case.
    550  *
    551  * In both the STREAMS and non-STREAMS case, the sockfs upcall to set
    552  * qfull is done when the conn is inserted into the drain list
    553  * (conn_drain_insert()) and cleared when the conn is removed from the drain
    554  * list (conn_idl_remove()).
    555  *
    556  * IPQOS notes:
    557  *
    558  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    559  * and IPQoS modules. IPPF includes hooks in IP at different control points
    560  * (callout positions) which direct packets to IPQoS modules for policy
    561  * processing. Policies, if present, are global.
    562  *
    563  * The callout positions are located in the following paths:
    564  *		o local_in (packets destined for this host)
    565  *		o local_out (packets originating from this host)
    566  *		o fwd_in  (packets forwarded by this m/c - inbound)
    567  *		o fwd_out (packets forwarded by this m/c - outbound)
    568  * Hooks at these callout points can be enabled/disabled using the ndd variable
    569  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    570  * By default all the callout positions are enabled.
    571  *
    572  * Outbound (local_out)
    573  * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
    574  *
    575  * Inbound (local_in)
    576  * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
    577  *
    578  * Forwarding (in and out)
    579  * Hooks are placed in ire_recv_forward_v4/v6.
    580  *
    581  * IP Policy Framework processing (IPPF processing)
    582  * Policy processing for a packet is initiated by ip_process, which ascertains
    583  * that the classifier (ipgpc) is loaded and configured, failing which the
    584  * packet resumes normal processing in IP. If the classifier is present, the
    585  * packet is acted upon by one or more IPQoS modules (action instances), per
    586  * filters configured in ipgpc and resumes normal IP processing thereafter.
    587  * An action instance can drop a packet in course of its processing.
    588  *
    589  * Zones notes:
    590  *
    591  * The partitioning rules for networking are as follows:
    592  * 1) Packets coming from a zone must have a source address belonging to that
    593  * zone.
    594  * 2) Packets coming from a zone can only be sent on a physical interface on
    595  * which the zone has an IP address.
    596  * 3) Between two zones on the same machine, packet delivery is only allowed if
    597  * there's a matching route for the destination and zone in the forwarding
    598  * table.
    599  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    600  * different zones can bind to the same port with the wildcard address
    601  * (INADDR_ANY).
    602  *
    603  * The granularity of interface partitioning is at the logical interface level.
    604  * Therefore, every zone has its own IP addresses, and incoming packets can be
    605  * attributed to a zone unambiguously. A logical interface is placed into a zone
    606  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    607  * structure. Rule (1) is implemented by modifying the source address selection
    608  * algorithm so that the list of eligible addresses is filtered based on the
    609  * sending process zone.
    610  *
    611  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    612  * across all zones, depending on their type. Here is the break-up:
    613  *
    614  * IRE type				Shared/exclusive
    615  * --------				----------------
    616  * IRE_BROADCAST			Exclusive
    617  * IRE_DEFAULT (default routes)		Shared (*)
    618  * IRE_LOCAL				Exclusive (x)
    619  * IRE_LOOPBACK				Exclusive
    620  * IRE_PREFIX (net routes)		Shared (*)
    621  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    622  * IRE_IF_RESOLVER (interface routes)	Exclusive
    623  * IRE_IF_CLONE (interface routes)	Exclusive
    624  * IRE_HOST (host routes)		Shared (*)
    625  *
    626  * (*) A zone can only use a default or off-subnet route if the gateway is
    627  * directly reachable from the zone, that is, if the gateway's address matches
    628  * one of the zone's logical interfaces.
    629  *
    630  * (x) IRE_LOCAL are handled a bit differently.
    631  * When ip_restrict_interzone_loopback is set (the default),
    632  * ire_route_recursive restricts loopback using an IRE_LOCAL
    633  * between zones to the case when L2 would have conceptually looped the packet
    634  * back, i.e. the loopback which is required since neither Ethernet drivers
    635  * nor Ethernet hardware loops them back. This is the case when the normal
    636  * routes (ignoring IREs with different zoneids) would send out the packet on
    637  * the same ill as the ill with which the IRE_LOCAL is associated.
    638  *
    639  * Multiple zones can share a common broadcast address; typically all zones
    640  * share the 255.255.255.255 address. Incoming as well as locally originated
    641  * broadcast packets must be dispatched to all the zones on the broadcast
    642  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    643  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    644  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    645  * sent to every zone that has an IRE_BROADCAST entry for the destination
    646  * address on the input ill, see ip_input_broadcast().
    647  *
    648  * Applications in different zones can join the same multicast group address.
    649  * The same logic applies for multicast as for broadcast. ip_input_multicast
    650  * dispatches packets to all zones that have members on the physical interface.
    651  */
    652 
    653 /*
    654  * Squeue Fanout flags:
    655  *	0: No fanout (the initial setting).
    656  *	1: Fanout across all squeues.
         *
         * Registered in lcl_ndp_arr under the name "ip_squeue_fanout", with
         * ip_int_set as its set routine.
    657  */
    658 boolean_t	ip_squeue_fanout = 0;
    659 
    660 /*
    661  * Maximum dups allowed per packet (initially 10).
         * NOTE(review): going by the variable name, "dups" presumably counts
         * duplicate fragments of a packet -- confirm at the point of use.
    662  */
    663 uint_t ip_max_frag_dups = 10;
    664 
    665 /* RFC 1122 conformance: forwarding defaults to IP_FORWARD_NEVER. */
    666 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
    667 
        /* Forward declarations of routines that are private to this file. */
    668 static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    669 		    cred_t *credp, boolean_t isv6);
    670 static mblk_t	*ip_xmit_attach_llhdr(mblk_t *, nce_t *);
    671 
        /* File-local ICMP helpers. */
    672 static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
    673 static void	icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
    674 static void	icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
    675     ip_recv_attr_t *);
    676 static void	icmp_options_update(ipha_t *);
    677 static void	icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
    678 static void	icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
    679 static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
    680 static void	icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
    681     ip_recv_attr_t *);
    682 static void	icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
    683 static void	icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
    684     ip_recv_attr_t *);
    685 
        /*
         * General IP routines. The ones declared without "static" have external
         * linkage; the rest are private to this file.
         */
    686 mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    687 char		*ip_dot_addr(ipaddr_t, char *);
    688 mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    689 int		ip_close(queue_t *, int);
    690 static char	*ip_dot_saddr(uchar_t *, char *);
    691 static void	ip_lrput(queue_t *, mblk_t *);
    692 ipaddr_t	ip_net_mask(ipaddr_t);
    693 char		*ip_nv_lookup(nv_t *, int);
    694 static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    695 static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    696 static boolean_t	ip_param_register(IDP *ndp, ipparam_t *, size_t,
    697     ipndp_t *, size_t);
    698 static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    699 void	ip_rput(queue_t *, mblk_t *);
    700 static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    701 		    void *dummy_arg);
        /*
         * MIB-2 (SNMP) support. Every ip_snmp_get_mib2_*() helper takes a queue
         * and an mblk and returns an mblk_t *.
         */
    702 int		ip_snmp_get(queue_t *, mblk_t *, int);
    703 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    704 		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
    705 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    706 		    ip_stack_t *);
    707 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
    708 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    709 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    710 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    711 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    712 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    713 		    ip_stack_t *ipst);
    714 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    715 		    ip_stack_t *ipst);
    716 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    717 		    ip_stack_t *ipst);
    718 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    719 		    ip_stack_t *ipst);
    720 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    721 		    ip_stack_t *ipst);
    722 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    723 		    ip_stack_t *ipst);
    724 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    725 		    ip_stack_t *ipst);
    726 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    727 		    ip_stack_t *ipst);
    728 static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    729 		    ip_stack_t *ipst);
    730 static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    731 		    ip_stack_t *ipst);
    732 static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    733 static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    734 static int	ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
    735 static int	ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
    736 int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    737 
        /* File-local fragmentation helper. */
    738 static mblk_t	*ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
    739 		    mblk_t *);
    740 
        /* File-local conn drain and conn walk routines. */
    741 static void	conn_drain_init(ip_stack_t *);
    742 static void	conn_drain_fini(ip_stack_t *);
    743 static void	conn_drain_tail(conn_t *connp, boolean_t closing);
    744 
    745 static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
    746 static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
    747 
        /*
         * Per-netstack init/shutdown/fini routines. ip_stack_init() returns a
         * void *; the other two take a void * argument.
         * NOTE(review): presumably that argument is the pointer returned by
         * ip_stack_init() -- confirm where these are registered.
         */
    748 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    749 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    750 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    751 
        /*
         * File-local get/set handlers and related helpers. ip_forward_set,
         * ip_cgtp_filter_get, ip_cgtp_filter_set, ip_input_proc_set and
         * ip_int_set are installed as getf/setf routines in lcl_ndp_arr.
         */
    752 static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    753 
    754 static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    755     const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    756     ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
    757     const in6_addr_t *);
    758 
    759 static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    760 static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    761     caddr_t, cred_t *);
    762 static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    763     caddr_t cp, cred_t *cr);
    764 static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    765     cred_t *);
    766 static int	ip_squeue_switch(int);
    767 
        /* File-local kstat init/fini/update routines for IP and ICMP. */
    768 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    769 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    770 static int	ip_kstat_update(kstat_t *kp, int rw);
    771 static void	*icmp_kstat_init(netstackid_t);
    772 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    773 static int	icmp_kstat_update(kstat_t *kp, int rw);
    774 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    775 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    776 
        /* File-local ipobs init/fini routines, one call per ip_stack_t. */
    777 static void	ipobs_init(ip_stack_t *);
    778 static void	ipobs_fini(ip_stack_t *);
    779 
        /* Global "all ones" IPv4 address, initialized from IP_HOST_MASK. */
    780 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    781 
    782 /* How long, in seconds, we allow fragments to hang around (IPv4, IPv6). */
    783 #define	IP_FRAG_TIMEOUT		15
    784 #define	IPV6_FRAG_TIMEOUT	60
    785 
        /* NOTE(review): presumably counts pullups done in ip_rput -- confirm. */
    786 static long ip_rput_pullups;
    787 int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    788 
        /* Minor number arenas: one for the small range, one for the large range. */
    789 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
    790 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
    791 
    792 int	ip_debug;	/* registered in lcl_ndp_arr as "ip_debug" */
    793 
    794 /*
    795  * Multirouting/CGTP: revision of the CGTP hooks, set from CGTP_FILTER_REV.
    796  */
    797 int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    798 
    799 /*
    800  * Named Dispatch Parameter Table.
    801  * All of these are alterable, within the min/max values given, at run time.
         * Each entry is { min, max, initial value, name }.
    802  */
    803 static ipparam_t	lcl_param_arr[] = {
    804 	/* min	max	value	name */
    805 	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
    806 	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
    807 	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
    808 	{  0,	1,	0,	"ip_respond_to_timestamp"},
    809 	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
    810 	{  0,	1,	1,	"ip_send_redirects"},
    811 	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
    812 	{  0,	10,	0,	"ip_mrtdebug"},
    813 	{  1,	8,	3,	"ip_ire_reclaim_fraction" },
    814 	{  1,	8,	3,	"ip_nce_reclaim_fraction" },
    815 	{  1,	8,	3,	"ip_dce_reclaim_fraction" },
    816 	{  1,	255,	255,	"ip_def_ttl" },
    817 	{  0,	1,	0,	"ip_forward_src_routed"},
    818 	{  0,	256,	32,	"ip_wroff_extra" },
    819 	{  2, 999999999, 60*20, "ip_pathmtu_interval" },	/* In seconds */
    820 	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
    821 	{  0,	1,	1,	"ip_path_mtu_discovery" },
    822 	{ 68,	65535,	576,	"ip_pmtu_min" },
    823 	{  0,	1,	0,	"ip_ignore_redirect" },
    824 	{  0,	1,	0,	"ip_arp_icmp_error" },
    825 	{  1,	254,	1,	"ip_broadcast_ttl" },
    826 	{  0,	99999,	100,	"ip_icmp_err_interval" },
    827 	{  1,	99999,	10,	"ip_icmp_err_burst" },
    828 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
    829 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
    830 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
    831 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
    832 	{  0,	1,	1,	"icmp_accept_clear_messages" },
    833 	{  0,	1,	1,	"igmp_accept_clear_messages" },
    834 	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
    835 				"ip_ndp_delay_first_probe_time"},
    836 	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
    837 				"ip_ndp_max_unicast_solicit"},
    838 	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
    839 	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
    840 	{  0,	1,	0,	"ip6_forward_src_routed"},
    841 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
    842 	{  0,	1,	1,	"ip6_send_redirects"},
    843 	{  0,	1,	0,	"ip6_ignore_redirect" },
    844 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
    845 
    846 	{  0,	2,	2,	"ip_src_check" },
    847 
    848 	{  0,	999999,	1000,	"ipsec_policy_log_interval" },
    849 
    850 	{  0,	1,	1,	"pim_accept_clear_messages" },
    851 	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
    852 	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
    853 	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
    854 	{  0,	15,	0,	"ip_policy_mask" },
    855 	{  0,	2,	2,	"ip_ecmp_behavior" },
    856 	{  0,	255,	1,	"ip_multirt_ttl" },
    857 	{  0,	3600,	60,	"ip_ire_badcnt_lifetime" },	/* In seconds */
    858 	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
    859 	{  0,	1000,	1,	"ip_max_temp_defend" },
    860 	/*
    861 	 * When a conflict of an active address is detected,
    862 	 * defend up to ip_max_defend times, within any
    863 	 * ip_defend_interval span.
    864 	 */
    865 	{  0,	1000,	3,	"ip_max_defend" },
    866 	{  0,	999999,	30,	"ip_defend_interval" },
    867 	{  0,	3600000, 300000, "ip_dup_recovery" },
    868 	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
    869 	{  0,	1,	1,	"ip_lso_outbound" },
    870 	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
    871 	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
    872 #ifdef DEBUG
    873 	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
    874 #else
        	/*
        	 * Unnamed placeholder: gives the table the same number of entries
        	 * as a DEBUG build, so the entries after it keep their position.
        	 */
    875 	{  0,	0,	0,	"" },
    876 #endif
    877 	/* delay before sending first probe: */
    878 	{  0,	20000,	1000,	"arp_probe_delay" },
    879 	{  0,	20000,	100,	"arp_fastprobe_delay" },
    880 	/* interval at which DAD probes are sent: */
    881 	{ 10,	20000,	1500,	"arp_probe_interval" },
    882 	{ 10,	20000,	150,	"arp_fastprobe_interval" },
    883 	/* setting probe count to 0 will disable ARP probing for DAD. */
    884 	{  0,	20,	3,	"arp_probe_count" },
    885 	{  0,	20,	3,	"arp_fastprobe_count" },
    886 
    887 	{  0,	3600000, 15000,	"ipv4_dad_announce_interval"},
    888 	{  0,	3600000, 15000,	"ipv6_dad_announce_interval"},
    889 	/*
    890 	 * Rate limiting parameters for DAD defense used in
    891 	 * ill_defend_rate_limit():
    892 	 * defend_rate : pkts permitted per defend_period (pkts/hour initially)
    893 	 * defend_interval : time that can elapse before we send out a
    894 	 *			DAD defense.
    895 	 * defend_period: denominator for defend_rate (in seconds).
    896 	 */
    897 	{  0,	3600000, 300000,	"arp_defend_interval"},
    898 	{  0,	20000, 100,		"arp_defend_rate"},
    899 	{  0,	3600000, 300000,	"ndp_defend_interval"},
    900 	{  0,	20000, 100,		"ndp_defend_rate"},
    901 	{  5,	86400,	3600,		"arp_defend_period"},
    902 	{  5,	86400,	3600,		"ndp_defend_period"},
    903 	{  0,	1,	1,		"ipv4_icmp_return_pmtu" },
    904 	{  0,	1,	1,		"ipv6_icmp_return_pmtu" },
    905 	/*
    906 	 * Publish count/interval values used to announce local addresses
    907 	 * for IPv4, IPv6.
    908 	 */
    909 	{  1,	20,	5,	"ip_arp_publish_count" },
    910 	{  1000, 20000,	2000,	"ip_arp_publish_interval" },
    911 };
    912 
    913 /*
    914  * Extended NDP table: variables that come with their own get/set routines.
    915  * Each entry is { getf, setf, data, name }. The data addresses for the first
    916  * two are NULL here and are filled in to be ips_ip_g_forward and
         * ips_ipv6_forward at init time.
         *
         * Each IPNDP_*_OFFSET macro is the array index of the entry that follows
         * it; keep the values in step when entries are added, removed or moved.
    917  */
    918 static ipndp_t	lcl_ndp_arr[] = {
    919 	/* getf			setf		data			name */
    920 #define	IPNDP_IP_FORWARDING_OFFSET	0
    921 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    922 	    "ip_forwarding" },
    923 #define	IPNDP_IP6_FORWARDING_OFFSET	1
    924 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    925 	    "ip6_forwarding" },
    926 	{ ip_param_generic_get, ip_input_proc_set,
    927 	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
    928 	{ ip_param_generic_get, ip_int_set,
    929 	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
    930 #define	IPNDP_CGTP_FILTER_OFFSET	4
    931 	{  ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
    932 	    "ip_cgtp_filter" },
    933 	{  ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
    934 	    "ip_debug" },
    935 };
    936 
    937 /*
    938  * Table of IP ioctls encoding the various properties of the ioctl and
    939  * indexed based on the last byte of the ioctl command. Occasionally there
    940  * is a clash, and there is more than 1 ioctl with the same last byte.
    941  * In such a case 1 ioctl is encoded in the ndx table and the remaining
    942  * ioctls are encoded in the misc table. An entry in the ndx table is
    943  * retrieved by indexing on the last byte of the ioctl command and comparing
    944  * the ioctl command with the value in the ndx table. In the event of a
    945  * mismatch the misc table is then searched sequentially for the desired
    946  * ioctl command.
    947  *
    948  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
    949  */
    950 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
    951 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    952 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    953 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    954 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    955 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    956 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    957 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    958 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    959 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    960 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    961 
    962 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
    963 			MISC_CMD, ip_siocaddrt, NULL },
    964 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
    965 			MISC_CMD, ip_siocdelrt, NULL },
    966 
    967 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    968 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    969 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
    970 			IF_CMD, ip_sioctl_get_addr, NULL },
    971 
    972 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    973 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    974 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
    975 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
    976 
    977 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
    978 			IPI_PRIV | IPI_WR,
    979 			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
    980 	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
    981 			IPI_MODOK | IPI_GET_CMD,
    982 			IF_CMD, ip_sioctl_get_flags, NULL },
    983 
    984 	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    985 	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    986 
    987 	/* copyin size cannot be coded for SIOCGIFCONF */
    988 	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
    989 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
    990 
    991 	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    992 			IF_CMD, ip_sioctl_mtu, NULL },
    993 	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD,
    994 			IF_CMD, ip_sioctl_get_mtu, NULL },
    995 	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
    996 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
    997 	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    998 			IF_CMD, ip_sioctl_brdaddr, NULL },
    999 	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
   1000 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
   1001 	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1002 			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1003 	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
   1004 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
   1005 	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
   1006 			IF_CMD, ip_sioctl_metric, NULL },
   1007 	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1008 
   1009 	/* See 166-168 below for extended SIOC*XARP ioctls */
   1010 	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1011 			ARP_CMD, ip_sioctl_arp, NULL },
   1012 	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
   1013 			ARP_CMD, ip_sioctl_arp, NULL },
   1014 	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1015 			ARP_CMD, ip_sioctl_arp, NULL },
   1016 
   1017 	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1018 	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1019 	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1020 	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1021 	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1022 	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1023 	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1024 	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1025 	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1026 	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1027 	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1028 	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1029 	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1030 	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1031 	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1032 	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1033 	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1034 	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1035 	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1036 	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1037 	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1038 
   1039 	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
   1040 			MISC_CMD, if_unitsel, if_unitsel_restart },
   1041 
   1042 	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1043 	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1044 	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1045 	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1046 	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1047 	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1048 	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1049 	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1050 	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1051 	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1052 	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1053 	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1054 	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1055 	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1056 	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1057 	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1058 	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1059 	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1060 
   1061 	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
   1062 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1063 			IF_CMD, ip_sioctl_sifname, NULL },
   1064 
   1065 	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1066 	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1067 	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1068 	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1069 	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1070 	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1071 	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1072 	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1073 	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1074 	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1075 	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1076 	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1077 	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1078 
   1079 	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
   1080 			MISC_CMD, ip_sioctl_get_ifnum, NULL },
   1081 	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
   1082 			IF_CMD, ip_sioctl_get_muxid, NULL },
   1083 	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
   1084 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
   1085 
   1086 	/* Both if and lif variants share same func */
   1087 	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
   1088 			IF_CMD, ip_sioctl_get_lifindex, NULL },
   1089 	/* Both if and lif variants share same func */
   1090 	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
   1091 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
   1092 
   1093 	/* copyin size cannot be coded for SIOCGIFCONF */
   1094 	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
   1095 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
   1096 	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1097 	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1098 	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1099 	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1100 	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1101 	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1102 	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1103 	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1104 	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1105 	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1106 	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1107 	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1108 	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1109 	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1110 	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1111 	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1112 	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1113 
   1114 	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
   1115 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
   1116 			ip_sioctl_removeif_restart },
   1117 	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
   1118 			IPI_GET_CMD | IPI_PRIV | IPI_WR,
   1119 			LIF_CMD, ip_sioctl_addif, NULL },
   1120 #define	SIOCLIFADDR_NDX 112
   1121 	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1122 			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
   1123 	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
   1124 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
   1125 	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1126 			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
   1127 	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
   1128 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
   1129 	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
   1130 			IPI_PRIV | IPI_WR,
   1131 			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
   1132 	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
   1133 			IPI_GET_CMD | IPI_MODOK,
   1134 			LIF_CMD, ip_sioctl_get_flags, NULL },
   1135 
   1136 	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1137 	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1138 
   1139 	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1140 			ip_sioctl_get_lifconf, NULL },
   1141 	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1142 			LIF_CMD, ip_sioctl_mtu, NULL },
   1143 	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
   1144 			LIF_CMD, ip_sioctl_get_mtu, NULL },
   1145 	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
   1146 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
   1147 	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1148 			LIF_CMD, ip_sioctl_brdaddr, NULL },
   1149 	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
   1150 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
   1151 	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1152 			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1153 	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
   1154 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
   1155 	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1156 			LIF_CMD, ip_sioctl_metric, NULL },
   1157 	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
   1158 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1159 			LIF_CMD, ip_sioctl_slifname,
   1160 			ip_sioctl_slifname_restart },
   1161 
   1162 	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
   1163 			MISC_CMD, ip_sioctl_get_lifnum, NULL },
   1164 	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
   1165 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
   1166 	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
   1167 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
   1168 	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
   1169 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
   1170 	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
   1171 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
   1172 	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1173 			LIF_CMD, ip_sioctl_token, NULL },
   1174 	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
   1175 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
   1176 	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1177 			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
   1178 	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
   1179 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
   1180 	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1181 			LIF_CMD, ip_sioctl_lnkinfo, NULL },
   1182 
   1183 	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
   1184 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
   1185 	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
   1186 			LIF_CMD, ip_siocdelndp_v6, NULL },
   1187 	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
   1188 			LIF_CMD, ip_siocqueryndp_v6, NULL },
   1189 	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
   1190 			LIF_CMD, ip_siocsetndp_v6, NULL },
   1191 	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1192 			MISC_CMD, ip_sioctl_tmyaddr, NULL },
   1193 	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1194 			MISC_CMD, ip_sioctl_tonlink, NULL },
   1195 	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
   1196 			MISC_CMD, ip_sioctl_tmysite, NULL },
   1197 	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1198 	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1199 	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
   1200 	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1201 	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1202 	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1203 	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1204 
   1205 	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1206 
   1207 	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
   1208 			LIF_CMD, ip_sioctl_get_binding, NULL },
   1209 	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
   1210 			IPI_PRIV | IPI_WR,
   1211 			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
   1212 	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
   1213 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
   1214 	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
   1215 			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
   1216 
   1217 	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
   1218 	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1219 	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1220 	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1221 
   1222 	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1223 
   1224 	/* These are handled in ip_sioctl_copyin_setup itself */
   1225 	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
   1226 			MISC_CMD, NULL, NULL },
   1227 	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
   1228 			MISC_CMD, NULL, NULL },
   1229 	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
   1230 
   1231 	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1232 			ip_sioctl_get_lifconf, NULL },
   1233 
   1234 	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1235 			XARP_CMD, ip_sioctl_arp, NULL },
   1236 	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
   1237 			XARP_CMD, ip_sioctl_arp, NULL },
   1238 	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1239 			XARP_CMD, ip_sioctl_arp, NULL },
   1240 
   1241 	/* SIOCPOPSOCKFS is not handled by IP */
   1242 	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
   1243 
   1244 	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
   1245 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
   1246 	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
   1247 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
   1248 			ip_sioctl_slifzone_restart },
   1249 	/* 172-174 are SCTP ioctls and not handled by IP */
   1250 	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1251 	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1252 	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1253 	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
   1254 			IPI_GET_CMD, LIF_CMD,
   1255 			ip_sioctl_get_lifusesrc, 0 },
   1256 	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
   1257 			IPI_PRIV | IPI_WR,
   1258 			LIF_CMD, ip_sioctl_slifusesrc,
   1259 			NULL },
   1260 	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
   1261 			ip_sioctl_get_lifsrcof, NULL },
   1262 	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
   1263 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1264 	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
   1265 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1266 	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
   1267 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1268 	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
   1269 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1270 	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1271 	/* SIOCSENABLESDP is handled by SDP */
   1272 	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
   1273 	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
   1274 	/* 185 */ { IPI_DONTCARE /* SIOCGIFHWADDR */, 0, 0, 0, NULL, NULL },
   1275 	/* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
   1276 	/* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
   1277 			ip_sioctl_ilb_cmd, NULL },
   1278 };
   1279 
        /* Number of entries in ip_ndx_ioctl_table, derived from the array size. */
   1280 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1281 
        /*
         * A second ioctl table, kept separate from ip_ndx_ioctl_table.
         * Rows use the same ip_ioctl_cmd_t initializer order as that table;
         * judging by the rows there, the members are: command, argument size,
         * IPI_* flags, command type, handler and restart handler.
         *
         * The STREAMS link/unlink, ND_GET/ND_SET and IP_IOCTL rows name no
         * handler (NULL). The last three rows are handled by mrt_ioctl; they
         * give only five initializers, so the final member is implicitly zero.
         */
   1282 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
   1283 	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1284 	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1285 	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1286 	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1287 	{ ND_GET,	0, 0, 0, NULL, NULL },
   1288 	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1289 	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
   1290 	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
   1291 		MISC_CMD, mrt_ioctl},
   1292 	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
   1293 		MISC_CMD, mrt_ioctl},
   1294 	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
   1295 		MISC_CMD, mrt_ioctl}
   1296 };
   1297 
        /* Number of entries in ip_misc_ioctl_table, derived from the array size. */
   1298 int ip_misc_ioctl_count =
   1299     sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1300 
        /* No initializer: as a file-scope object this starts out as zero. */
   1301 int	conn_drain_nthreads;		/* Number of drainers required. */
   1302 					/* Settable in /etc/system */
   1303 /*
         * Defined in ip_ire.c; only declared here.
         * NOTE(review): judging by the names these are the IRE bucket-count
         * limits and the memory/CPU ratios used to size them - confirm in
         * ip_ire.c.
         */
   1304 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
   1305 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
   1306 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
   1307 
        /*
         * Pairs each IRE_* type constant with a short printable name.
         * The list ends with an all-zero entry that serves as the terminator.
         */
   1308 static nv_t	ire_nv_arr[] = {
   1309 	{ IRE_BROADCAST, "BROADCAST" },
   1310 	{ IRE_LOCAL, "LOCAL" },
   1311 	{ IRE_LOOPBACK, "LOOPBACK" },
   1312 	{ IRE_DEFAULT, "DEFAULT" },
   1313 	{ IRE_PREFIX, "PREFIX" },
   1314 	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
   1315 	{ IRE_IF_RESOLVER, "IF_RESOLV" },
   1316 	{ IRE_IF_CLONE, "IF_CLONE" },
   1317 	{ IRE_HOST, "HOST" },
   1318 	{ IRE_MULTICAST, "MULTICAST" },
   1319 	{ IRE_NOROUTE, "NOROUTE" },
   1320 	{ 0 }
   1321 };
   1322 
        /* Non-static pointer through which the static array above is reachable. */
   1323 nv_t	*ire_nv_tbl = ire_nv_arr;
   1324 
   1325 /*
         * Simple ICMP IP Header Template.
         *
         * Initialized positionally: the first member is IP_SIMPLE_HDR_VERSION
         * and the seventh is IPPROTO_ICMP; everything in between, and every
         * member after the seventh, is zero.
         * NOTE(review): the first and seventh members are presumably the
         * version/header-length byte and the protocol field - confirm against
         * the ipha_t definition.
         */
   1326 static ipha_t icmp_ipha = {
   1327 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
   1328 };
   1329 
        /*
         * STREAMS module_info for IP. Every qinit structure defined below
         * points at this one instance. The initializers are positional; going
         * by the macro names they supply the module id, module name, minimum
         * and maximum packet size, and the high and low water marks.
         */
   1330 struct module_info ip_mod_info = {
   1331 	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
   1332 	IP_MOD_LOWAT
   1333 };
   1334 
   1335 /*
   1336  * Duplicate static symbols within a module confuse mdb; so we avoid the
   1337  * problem by making the symbols here distinct from those in udp.c.
   1338  */
   1339 
   1340 /*
   1341  * Entry points for IP as a device and as a module.
   1342  * We have separate open functions for the /dev/ip and /dev/ip6 devices.
         *
         * Each qinit below is initialized positionally. Per qinit(9S) the
         * members are, in order: put procedure, service procedure, open,
         * close, admin and the module_info pointer. The admin slot is NULL
         * in all five, and all five share ip_mod_info.
   1343  */
        /* Read side used by ipinfov4: put is ip_rput, open is ip_openv4. */
   1344 static struct qinit iprinitv4 = {
   1345 	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
   1346 	&ip_mod_info
   1347 };
   1348 
        /* Read side used by ipinfov6: put is ip_rput_v6, open is ip_openv6. */
   1349 struct qinit iprinitv6 = {
   1350 	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
   1351 	&ip_mod_info
   1352 };
   1353 
        /*
         * Write side, shared by ipinfov4 and ipinfov6. The only qinit here
         * with a service procedure (ip_wsrv); it has no open or close entry.
         */
   1354 static struct qinit ipwinit = {
   1355 	(pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
   1356 	&ip_mod_info
   1357 };
   1358 
        /*
         * Third member of both streamtabs below, i.e. the lower (multiplexor)
         * read side per streamtab(9S). Put is ip_lrput.
         */
   1359 static struct qinit iplrinit = {
   1360 	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
   1361 	&ip_mod_info
   1362 };
   1363 
        /*
         * Fourth member of both streamtabs below, i.e. the lower (multiplexor)
         * write side per streamtab(9S). Put is ip_lwput; nothing else is set.
         */
   1364 static struct qinit iplwinit = {
   1365 	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
   1366 	&ip_mod_info
   1367 };
   1368 
   1369 /*
         * For AF_INET aka /dev/ip.
         * Members are positional; per streamtab(9S) they are the read init,
         * write init, lower (mux) read init and lower (mux) write init.
         */
   1370 struct streamtab ipinfov4 = {
   1371 	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
   1372 };
   1373 
   1374 /*
         * For AF_INET6 aka /dev/ip6.
         * Identical to ipinfov4 except for the read-side qinit (iprinitv6).
         */
   1375 struct streamtab ipinfov6 = {
   1376 	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
   1377 };
   1378 
   1379 #ifdef	DEBUG
        /*
         * Exists only in DEBUG builds and defaults to B_FALSE.
         * NOTE(review): presumably lets SCTP checksum handling be bypassed
         * for testing; its users are not in this part of the file - confirm
         * before relying on it.
         */
   1380 boolean_t skip_sctp_cksum = B_FALSE;
   1381 #endif
   1382 
   1383 /*
   1384  * Generate an ICMP fragmentation needed message.
   1385  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   1386  * constructed by the caller.
         *
         * mp  - the offending packet. It is first passed to icmp_pkt_err_ok();
         *	 if that returns NULL nothing is sent (presumably mp has been
         *	 consumed by then - confirm in icmp_pkt_err_ok()).
         * mtu - value to report. It is truncated to 16 bits and stored in
         *	 network byte order in the icmph_du_mtu field.
         * ira - receive attributes. ira->ira_ill is dereferenced
         *	 unconditionally to find the IP stack, so it must be valid.
         *
         * The message is built as a destination-unreachable ICMP header on the
         * stack and handed, together with mp, to icmp_pkt().
   1387  */
   1388 void
   1389 icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
   1390 {
   1391 	icmph_t	icmph;
   1392 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1393 
   1394 	mp = icmp_pkt_err_ok(mp, ira);
   1395 	if (mp == NULL)
   1396 		return;
   1397 
        	/* Start from an all-zero header so unset fields are well defined. */
   1398 	bzero(&icmph, sizeof (icmph_t));
   1399 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   1400 	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
   1401 	icmph.icmph_du_mtu = htons((uint16_t)mtu);
        	/* Counted both as a frag-needed and as a dest-unreachable message. */
   1402 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
   1403 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   1404 
   1405 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   1406 }
   1407 
   1408 /*
   1409  * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
   1410  * If the ICMP message is consumed by IP, i.e., it should not be delivered
   1411  * to any IPPROTO_ICMP raw sockets, then it returns NULL.
   1412  * Likewise, if the ICMP error is malformed (too short, etc), then it
   1413  * returns NULL. The caller uses this to determine whether or not to send
   1414  * to raw sockets.
   1415  *
   1416  * All error messages are passed to the matching transport stream.
   1417  *
   1418  * The following cases are handled by icmp_inbound_v4:
   1419  * 1) It needs to send a reply back and possibly delivering it
   1420  *    to the "interested" upper clients.
   1421  * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
   1422  * 3) It needs to change some values in IP only.
   1423  * 4) It needs to change some values in IP and upper layers e.g TCP
   1424  *    by delivering an error to the upper layers.
   1425  *
   1426  * We handle the above four cases in the context of IPsec in the
   1427  * following way :
   1428  *
   1429  * 1) Send the reply back in the same way as the request came in.
   1430  *    If it came in encrypted, it goes out encrypted. If it came in
   1431  *    clear, it goes out in clear. Thus, this will prevent chosen
   1432  *    plain text attack.
   1433  * 2) The client may or may not expect things to come in secure.
   1434  *    If it comes in secure, the policy constraints are checked
   1435  *    before delivering it to the upper layers. If it comes in
   1436  *    clear, ipsec_inbound_accept_clear will decide whether to
   1437  *    accept this in clear or not. In both the cases, if the returned
   1438  *    message (IP header + 8 bytes) that caused the icmp message has
   1439  *    AH/ESP headers, it is sent up to AH/ESP for validation before
   1440  *    sending up. If there are only 8 bytes of returned message, then
   1441  *    upper client will not be notified.
   1442  * 3) Check with global policy to see whether it matches the constraints.
   1443  *    But this will be done only if icmp_accept_messages_in_clear is
   1444  *    zero.
   1445  * 4) If we need to change both in IP and ULP, then the decision taken
   1446  *    while affecting the values in IP and while delivering up to TCP
   1447  *    should be the same.
   1448  *
   1449  * 	There are two cases.
   1450  *
   1451  * 	a) If we reject data at the IP layer (ipsec_check_global_policy()
   1452  *	   failed), we will not deliver it to the ULP, even though they
   1453  *	   are *willing* to accept in *clear*. This is fine as our global
   1454  *	   disposition to icmp messages asks us to reject the datagram.
   1455  *
   1456  *	b) If we accept data at the IP layer (ipsec_check_global_policy()
   1457  *	   succeeded or icmp_accept_messages_in_clear is 1), and not able
   1458  *	   to deliver it to ULP (policy failed), it can lead to
   1459  *	   consistency problems. The cases known at this time are
   1460  *	   ICMP_DESTINATION_UNREACHABLE  messages with following code
   1461  *	   values :
   1462  *
   1463  *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
   1464  *	     and Upper layer rejects. Then the communication will
   1465  *	     come to a stop. This is solved by making similar decisions
   1466  *	     at both levels. Currently, when we are unable to deliver
   1467  *	     to the Upper Layer (due to policy failures) while IP has
   1468  *	     adjusted dce_pmtu, the next outbound datagram would
   1469  *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
   1470  *	     will be with the right level of protection. Thus the right
   1471  *	     value will be communicated even if we are not able to
   1472  *	     communicate when we get from the wire initially. But this
   1473  *	     assumes there would be at least one outbound datagram after
   1474  *	     IP has adjusted its dce_pmtu value. To make things
   1475  *	     simpler, we accept in clear after the validation of
   1476  *	     AH/ESP headers.
   1477  *
   1478  *	   - Other ICMP ERRORS : We may not be able to deliver it to the
   1479  *	     upper layer depending on the level of protection the upper
   1480  *	     layer expects and the disposition in ipsec_inbound_accept_clear().
   1481  *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
   1482  *	     should be accepted in clear when the Upper layer expects secure.
   1483  *	     Thus the communication may get aborted by some bad ICMP
   1484  *	     packets.
   1485  */
   1486 mblk_t *
   1487 icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
   1488 {
   1489 	icmph_t		*icmph;
   1490 	ipha_t		*ipha;		/* Outer header */
   1491 	int		ip_hdr_length;	/* Outer header length */
   1492 	boolean_t	interested;
   1493 	ipif_t		*ipif;
   1494 	uint32_t	ts;
   1495 	uint32_t	*tsp;
   1496 	timestruc_t	now;
   1497 	ill_t		*ill = ira->ira_ill;
   1498 	ip_stack_t	*ipst = ill->ill_ipst;
   1499 	zoneid_t	zoneid = ira->ira_zoneid;
   1500 	int		len_needed;
   1501 	mblk_t		*mp_ret = NULL;	/* What the caller gets back */
   1502 
   1503 	ipha = (ipha_t *)mp->b_rptr;
   1504 
   1505 	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
   1506 
   1507 	ip_hdr_length = ira->ira_ip_hdr_length;
        	/*
        	 * The first mblk must hold the IP header plus the fixed ICMP header.
        	 * If the whole packet is shorter than that it is truncated; otherwise
        	 * the data is merely spread over several mblks and gets pulled up.
        	 */
   1508 	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
   1509 		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
   1510 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   1511 			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   1512 			freemsg(mp);
   1513 			return (NULL);
   1514 		}
   1515 		/* Last chance to get real. */
   1516 		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
   1517 		if (ipha == NULL) {
   1518 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   1519 			freemsg(mp);
   1520 			return (NULL);
   1521 		}
   1522 	}
   1523 
   1524 	/* The IP header will always be a multiple of four bytes */
   1525 	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1526 	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
   1527 	    icmph->icmph_code));
   1528 
   1529 	/*
   1530 	 * We will set "interested" to "true" if we should pass a copy to
   1531 	 * the transport or if we handle the packet locally.
        	 *
        	 * Requests that IP answers itself (echo, timestamp, address mask)
        	 * return NULL straight out of this switch: the request mblk is
        	 * turned into the reply in place. Everything else falls out of the
        	 * switch with "interested" set or clear.
   1532 	 */
   1533 	interested = B_FALSE;
   1534 	switch (icmph->icmph_type) {
   1535 	case ICMP_ECHO_REPLY:
   1536 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
   1537 		break;
   1538 	case ICMP_DEST_UNREACHABLE:
   1539 		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
   1540 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
   1541 		interested = B_TRUE;	/* Pass up to transport */
   1542 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
   1543 		break;
   1544 	case ICMP_SOURCE_QUENCH:
   1545 		interested = B_TRUE;	/* Pass up to transport */
   1546 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
   1547 		break;
   1548 	case ICMP_REDIRECT:
        		/* Counted even when redirects are being ignored. */
   1549 		if (!ipst->ips_ip_ignore_redirect)
   1550 			interested = B_TRUE;
   1551 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
   1552 		break;
   1553 	case ICMP_ECHO_REQUEST:
   1554 		/*
   1555 		 * Whether to respond to echo requests that come in as IP
   1556 		 * broadcasts or as IP multicast is subject to debate
   1557 		 * (what isn't?).  We aim to please, you pick it.
   1558 		 * Default is do it.
   1559 		 */
   1560 		if (ira->ira_flags & IRAF_MULTICAST) {
   1561 			/* multicast: respond based on tunable */
   1562 			interested = ipst->ips_ip_g_resp_to_echo_mcast;
   1563 		} else if (ira->ira_flags & IRAF_BROADCAST) {
   1564 			/* broadcast: respond based on tunable */
   1565 			interested = ipst->ips_ip_g_resp_to_echo_bcast;
   1566 		} else {
   1567 			/* unicast: always respond */
   1568 			interested = B_TRUE;
   1569 		}
   1570 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
   1571 		if (!interested) {
   1572 			/* We never pass these to RAW sockets */
   1573 			freemsg(mp);
   1574 			return (NULL);
   1575 		}
   1576 
   1577 		/* Check db_ref to make sure we can modify the packet. */
   1578 		if (mp->b_datap->db_ref > 1) {
   1579 			mblk_t	*mp1;
   1580 
        			/* The original is released whether or not the copy worked. */
   1581 			mp1 = copymsg(mp);
   1582 			freemsg(mp);
   1583 			if (!mp1) {
   1584 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1585 				return (NULL);
   1586 			}
   1587 			mp = mp1;
        			/* Both header pointers must follow the new message. */
   1588 			ipha = (ipha_t *)mp->b_rptr;
   1589 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1590 		}
   1591 		icmph->icmph_type = ICMP_ECHO_REPLY;
   1592 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
   1593 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1594 		return (NULL);
   1595 
   1596 	case ICMP_ROUTER_ADVERTISEMENT:
   1597 	case ICMP_ROUTER_SOLICITATION:
   1598 		break;
   1599 	case ICMP_TIME_EXCEEDED:
   1600 		interested = B_TRUE;	/* Pass up to transport */
   1601 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
   1602 		break;
   1603 	case ICMP_PARAM_PROBLEM:
   1604 		interested = B_TRUE;	/* Pass up to transport */
   1605 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
   1606 		break;
   1607 	case ICMP_TIME_STAMP_REQUEST:
   1608 		/* Response to Time Stamp Requests is local policy. */
   1609 		if (ipst->ips_ip_g_resp_to_timestamp) {
   1610 			if (ira->ira_flags & IRAF_MULTIBROADCAST)
   1611 				interested =
   1612 				    ipst->ips_ip_g_resp_to_timestamp_bcast;
   1613 			else
   1614 				interested = B_TRUE;
   1615 		}
   1616 		if (!interested) {
   1617 			/* We never pass these to RAW sockets */
   1618 			freemsg(mp);
   1619 			return (NULL);
   1620 		}
   1621 
   1622 		/* Make sure we have enough of the packet */
        		/* Three 32-bit timestamps follow the ICMP header. */
   1623 		len_needed = ip_hdr_length + ICMPH_SIZE +
   1624 		    3 * sizeof (uint32_t);
   1625 
   1626 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   1627 			ipha = ip_pullup(mp, len_needed, ira);
   1628 			if (ipha == NULL) {
   1629 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1630 				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
   1631 				    mp, ill);
   1632 				freemsg(mp);
   1633 				return (NULL);
   1634 			}
   1635 			/* Refresh following the pullup. */
   1636 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1637 		}
   1638 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
   1639 		/* Check db_ref to make sure we can modify the packet. */
   1640 		if (mp->b_datap->db_ref > 1) {
   1641 			mblk_t	*mp1;
   1642 
   1643 			mp1 = copymsg(mp);
   1644 			freemsg(mp);
   1645 			if (!mp1) {
   1646 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1647 				return (NULL);
   1648 			}
   1649 			mp = mp1;
   1650 			ipha = (ipha_t *)mp->b_rptr;
   1651 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1652 		}
   1653 		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
   1654 		tsp = (uint32_t *)&icmph[1];
   1655 		tsp++;		/* Skip past 'originate time' */
   1656 		/* Compute # of milliseconds since midnight */
   1657 		gethrestime(&now);
   1658 		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   1659 		    now.tv_nsec / (NANOSEC / MILLISEC);
        		/* The same instant is used for both timestamps. */
   1660 		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
   1661 		*tsp++ = htonl(ts);	/* Lay in 'send time' */
   1662 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
   1663 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1664 		return (NULL);
   1665 
   1666 	case ICMP_TIME_STAMP_REPLY:
   1667 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
   1668 		break;
   1669 	case ICMP_INFO_REQUEST:
   1670 		/* Per RFC 1122 3.2.2.7, ignore this. */
   1671 	case ICMP_INFO_REPLY:
   1672 		break;
   1673 	case ICMP_ADDRESS_MASK_REQUEST:
   1674 		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1675 			interested =
   1676 			    ipst->ips_ip_respond_to_address_mask_broadcast;
   1677 		} else {
   1678 			interested = B_TRUE;
   1679 		}
   1680 		if (!interested) {
   1681 			/* We never pass these to RAW sockets */
   1682 			freemsg(mp);
   1683 			return (NULL);
   1684 		}
        		/* One IPv4 address (the mask) follows the ICMP header. */
   1685 		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
   1686 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   1687 			ipha = ip_pullup(mp, len_needed, ira);
   1688 			if (ipha == NULL) {
   1689 				BUMP_MIB(ill->ill_ip_mib,
   1690 				    ipIfStatsInTruncatedPkts);
   1691 				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
   1692 				    ill);
   1693 				freemsg(mp);
   1694 				return (NULL);
   1695 			}
   1696 			/* Refresh following the pullup. */
   1697 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1698 		}
   1699 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
   1700 		/* Check db_ref to make sure we can modify the packet. */
   1701 		if (mp->b_datap->db_ref > 1) {
   1702 			mblk_t	*mp1;
   1703 
   1704 			mp1 = copymsg(mp);
   1705 			freemsg(mp);
   1706 			if (!mp1) {
   1707 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1708 				return (NULL);
   1709 			}
   1710 			mp = mp1;
   1711 			ipha = (ipha_t *)mp->b_rptr;
   1712 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1713 		}
   1714 		/*
   1715 		 * Need the ipif with the mask be the same as the source
   1716 		 * address of the mask reply. For unicast we have a specific
   1717 		 * ipif. For multicast/broadcast we only handle onlink
   1718 		 * senders, and use the source address to pick an ipif.
   1719 		 */
   1720 		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
   1721 		if (ipif == NULL) {
   1722 			/* Broadcast or multicast */
   1723 			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
   1724 			if (ipif == NULL) {
   1725 				freemsg(mp);
   1726 				return (NULL);
   1727 			}
   1728 		}
   1729 		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
   1730 		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
        		/* Done with the ipif once its mask has been copied out. */
   1731 		ipif_refrele(ipif);
   1732 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
   1733 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1734 		return (NULL);
   1735 
   1736 	case ICMP_ADDRESS_MASK_REPLY:
   1737 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
   1738 		break;
   1739 	default:
   1740 		interested = B_TRUE;	/* Pass up to transport */
   1741 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
   1742 		break;
   1743 	}
   1744 	/*
   1745 	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
   1746 	 * if there isn't one.
   1747 	 */
   1748 	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
   1749 		/* If there is an ICMP client and we want one too, copy it. */
   1750 
   1751 		if (!interested) {
   1752 			/* Caller will deliver to RAW sockets */
   1753 			return (mp);
   1754 		}
   1755 		mp_ret = copymsg(mp);
   1756 		if (mp_ret == NULL) {
        			/*
        			 * Only the raw-socket copy is lost: mp itself is
        			 * still processed below and NULL is returned.
        			 */
   1757 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1758 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
   1759 		}
   1760 	} else if (!interested) {
   1761 		/* Neither we nor raw sockets are interested. Drop packet now */
   1762 		freemsg(mp);
   1763 		return (NULL);
   1764 	}
   1765 
   1766 	/*
   1767 	 * ICMP error or redirect packet. Make sure we have enough of
   1768 	 * the header and that db_ref == 1 since we might end up modifying
   1769 	 * the packet.
        	 *
        	 * From here on every exit returns mp_ret (the copy made above, or
        	 * NULL); mp itself is never handed back to the caller.
   1770 	 */
   1771 	if (mp->b_cont != NULL) {
        		/* A length of -1 asks ip_pullup for the whole message. */
   1772 		if (ip_pullup(mp, -1, ira) == NULL) {
   1773 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1774 			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
   1775 			    mp, ill);
   1776 			freemsg(mp);
   1777 			return (mp_ret);
   1778 		}
   1779 	}
   1780 
   1781 	if (mp->b_datap->db_ref > 1) {
   1782 		mblk_t	*mp1;
   1783 
   1784 		mp1 = copymsg(mp);
   1785 		if (mp1 == NULL) {
   1786 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1787 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
   1788 			freemsg(mp);
   1789 			return (mp_ret);
   1790 		}
   1791 		freemsg(mp);
   1792 		mp = mp1;
   1793 	}
   1794 
   1795 	/*
   1796 	 * In case mp has changed, verify the message before any further
   1797 	 * processes.
   1798 	 */
   1799 	ipha = (ipha_t *)mp->b_rptr;
   1800 	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
        	/* The verifier does not free mp, so a rejected message is freed here. */
   1801 	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   1802 		freemsg(mp);
   1803 		return (mp_ret);
   1804 	}
   1805 
        	/*
        	 * No freemsg() follows either call below, so the callees are
        	 * expected to consume mp.
        	 */
   1806 	switch (icmph->icmph_type) {
   1807 	case ICMP_REDIRECT:
   1808 		icmp_redirect_v4(mp, ipha, icmph, ira);
   1809 		break;
   1810 	case ICMP_DEST_UNREACHABLE:
   1811 		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
   1812 			/* Update DCE and adjust MTU in icmp header if needed */
   1813 			icmp_inbound_too_big_v4(icmph, ira);
   1814 		}
   1815 		/* FALLTHRU */
   1816 	default:
   1817 		icmp_inbound_error_fanout_v4(mp, icmph, ira);
   1818 		break;
   1819 	}
   1820 	return (mp_ret);
   1821 }
   1822 
   1823 /*
   1824  * Send an ICMP echo, timestamp or address mask reply.
   1825  * The caller has already updated the payload part of the packet.
   1826  * We handle the ICMP checksum, IP source address selection and feed
   1827  * the packet into ip_output_simple.
         *
         * mp    - the request, already rewritten into the reply by the caller.
         *	   It is modified in place, so the caller must own it exclusively.
         * ipha  - outer IP header inside mp.
         * icmph - ICMP header inside mp.
         * ira   - receive attributes of the request; they supply the header
         *	   length, zone, label and IPsec state used for the reply.
         *
         * mp is consumed on every path: it is either passed to
         * ip_output_simple() or, per the note below, already disposed of when
         * ipsec_in_to_out() fails.
   1828  */
   1829 static void
   1830 icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
   1831     ip_recv_attr_t *ira)
   1832 {
   1833 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
   1834 	ill_t		*ill = ira->ira_ill;
   1835 	ip_stack_t	*ipst = ill->ill_ipst;
   1836 	ip_xmit_attr_t	ixas;
   1837 
   1838 	/* Send out an ICMP packet */
        	/* The checksum field must be zero while the checksum is computed. */
   1839 	icmph->icmph_checksum = 0;
   1840 	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
   1841 	/* Reset time to live. */
   1842 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   1843 	{
   1844 		/* Swap source and destination addresses */
   1845 		ipaddr_t tmp;
   1846 
   1847 		tmp = ipha->ipha_src;
   1848 		ipha->ipha_src = ipha->ipha_dst;
   1849 		ipha->ipha_dst = tmp;
   1850 	}
   1851 	ipha->ipha_ident = 0;
        	/* Only headers that carry IP options need the options fixed up. */
   1852 	if (!IS_SIMPLE_IPH(ipha))
   1853 		icmp_options_update(ipha);
   1854 
        	/* Build transmit attributes on the stack, starting from all zeroes. */
   1855 	bzero(&ixas, sizeof (ixas));
   1856 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   1857 	ixas.ixa_zoneid = ira->ira_zoneid;
   1858 	ixas.ixa_cred = kcred;
   1859 	ixas.ixa_cpid = NOPID;
   1860 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   1861 	ixas.ixa_ifindex = 0;
   1862 	ixas.ixa_ipst = ipst;
   1863 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   1864 
   1865 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
   1866 		/*
   1867 		 * This packet should go out the same way as it
   1868 		 * came in i.e in clear, independent of the IPsec policy
   1869 		 * for transmitting packets.
   1870 		 */
   1871 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   1872 	} else {
        		/* Request was protected: carry the IPsec state over to ixas. */
   1873 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   1874 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1875 			/* Note: mp already consumed and ip_drop_packet done */
   1876 			return;
   1877 		}
   1878 	}
   1879 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1880 		/*
   1881 		 * Not one of our addresses (IRE_LOCALs), thus we let
   1882 		 * ip_output_simple pick the source.
        		 * This overrides the swapped-in source set above.
   1883 		 */
   1884 		ipha->ipha_src = INADDR_ANY;
   1885 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   1886 	}
   1887 	/* Should we send with DF and use dce_pmtu? */
   1888 	if (ipst->ips_ipv4_icmp_return_pmtu) {
   1889 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
   1890 		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
   1891 	}
   1892 
   1893 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   1894 
        	/* The send result is deliberately ignored; ixas is always cleaned up. */
   1895 	(void) ip_output_simple(mp, &ixas);
   1896 	ixa_cleanup(&ixas);
   1897 }
   1898 
   1899 /*
   1900  * Verify an ICMP message that is either an ICMP error or a redirect.
   1901  * The caller should have fully pulled up the message. If it's a redirect
   1902  * packet, only basic checks on IP header will be done; otherwise, verify
   1903  * the packet by looking at the included ULP header.
   1904  *
   1905  * Called before icmp_inbound_error_fanout_v4 is called.
         *
         * mp    - the fully pulled-up message; only its b_wptr is used, as the
         *	   end-of-data bound. mp is never freed here, so the caller
         *	   keeps ownership whatever the result.
         * icmph - the ICMP header inside mp; the embedded (inner) IP header is
         *	   taken to start immediately after it.
         * ira   - receive attributes, used for the ill/stack and passed to the
         *	   TCP verification callback.
         *
         * Returns B_TRUE if the message passed the checks. Returns B_FALSE
         * after bumping ipIfStatsInDiscards (bogus message) or
         * ipIfStatsInTruncatedPkts (too short).
   1906  */
   1907 static boolean_t
   1908 icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   1909 {
   1910 	ill_t		*ill = ira->ira_ill;
   1911 	int		hdr_length;
   1912 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1913 	conn_t		*connp;
   1914 	ipha_t		*ipha;	/* Inner IP header */
   1915 
   1916 	ipha = (ipha_t *)&icmph[1];
        	/* A minimal inner header must be present before any field is read. */
   1917 	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
   1918 		goto truncated;
   1919 
   1920 	hdr_length = IPH_HDR_LENGTH(ipha);
   1921 
   1922 	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
   1923 		goto discard_pkt;
   1924 
   1925 	if (hdr_length < sizeof (ipha_t))
   1926 		goto truncated;
   1927 
        	/* The inner header, including any options, must fit as well. */
   1928 	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
   1929 		goto truncated;
   1930 
   1931 	/*
   1932 	 * Stop here for ICMP_REDIRECT.
   1933 	 */
   1934 	if (icmph->icmph_type == ICMP_REDIRECT)
   1935 		return (B_TRUE);
   1936 
   1937 	/*
   1938 	 * ICMP errors only.
        	 * The checks depend on the protocol of the packet that triggered
        	 * the error, as recorded in the inner IP header.
   1939 	 */
   1940 	switch (ipha->ipha_protocol) {
   1941 	case IPPROTO_UDP:
   1942 		/*
   1943 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1944 		 * transport header.
   1945 		 */
   1946 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1947 		    mp->b_wptr)
   1948 			goto truncated;
   1949 		break;
   1950 	case IPPROTO_TCP: {
   1951 		tcpha_t		*tcpha;
   1952 
   1953 		/*
   1954 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1955 		 * transport header.
   1956 		 */
   1957 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1958 		    mp->b_wptr)
   1959 			goto truncated;
   1960 
        		/*
        		 * The error must match a TCP connection. If that connection
        		 * has a conn_verifyicmp callback, the callback must accept
        		 * the error too. The reference returned by the lookup is
        		 * dropped on both the accept and the reject path.
        		 */
   1961 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   1962 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   1963 		    ipst);
   1964 		if (connp == NULL)
   1965 			goto discard_pkt;
   1966 
   1967 		if ((connp->conn_verifyicmp != NULL) &&
   1968 		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
   1969 			CONN_DEC_REF(connp);
   1970 			goto discard_pkt;
   1971 		}
   1972 		CONN_DEC_REF(connp);
   1973 		break;
   1974 	}
   1975 	case IPPROTO_SCTP:
   1976 		/*
   1977 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1978 		 * transport header.
   1979 		 */
   1980 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1981 		    mp->b_wptr)
   1982 			goto truncated;
   1983 		break;
   1984 	case IPPROTO_ESP:
   1985 	case IPPROTO_AH:
        		/* No additional checks are made here for IPsec headers. */
   1986 		break;
   1987 	case IPPROTO_ENCAP:
        		/* IP-in-IP: a second, minimal IP header must follow the first. */
   1988 		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
   1989 		    mp->b_wptr)
   1990 			goto truncated;
   1991 		break;
   1992 	default:
   1993 		break;
   1994 	}
   1995 
   1996 	return (B_TRUE);
   1997 
   1998 discard_pkt:
   1999 	/* Bogus ICMP error. */
   2000 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2001 	return (B_FALSE);
   2002 
   2003 truncated:
   2004 	/* We pulled up everything already. Must be truncated */
   2005 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2006 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2007 	return (B_FALSE);
   2008 }
   2009 
   2010 /*
         * Table from RFC 1191.
         * MTU "plateau" values in strictly descending order.
         * icmp_inbound_too_big_v4() scans it from the front and stops at the
         * first entry smaller than the offending packet's length, so the
         * descending order is required.
         */
   2011 static int icmp_frag_size_table[] =
   2012 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
   2013 
   2014 /*
   2015  * Process received ICMP Packet too big.
   2016  * Just handles the DCE create/update, including using the above table of
   2017  * PMTU guesses. The caller is responsible for validating the packet before
   2018  * passing it in and also to fanout the ICMP error to any matching transport
   2019  * conns. Assumes the message has been fully pulled up and verified.
   2020  *
   2021  * Before getting here, the caller has called icmp_inbound_verify_v4()
   2022  * that should have verified with ULP to prevent undoing the changes we're
   2023  * going to make to DCE. For example, TCP might have verified that the packet
   2024  * which generated error is in the send window.
   2025  *
   2026  * This always rewrites the MTU field in the ICMP header with the resulting
   2027  * path MTU (and clears icmph_du_zero); the caller should pass the packet to
         * the matching ULP after this returns.
         *
         * icmph - the "fragmentation needed" ICMP header; the inner IP header
         *	   is taken to start immediately after it.
         * ira   - receive attributes; ira->ira_ill supplies the IP stack and
         *	   the fallback MTU when the DCE has no path MTU yet.
         *
         * If no DCE can be found or created, the function returns without
         * touching the ICMP header.
   2028  */
   2029 static void
   2030 icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
   2031 {
   2032 	dce_t		*dce;
   2033 	int		old_mtu;
   2034 	int		mtu, orig_mtu;
   2035 	ipaddr_t	dst;
   2036 	boolean_t	disable_pmtud;
   2037 	ill_t		*ill = ira->ira_ill;
   2038 	ip_stack_t	*ipst = ill->ill_ipst;
   2039 	uint_t		hdr_length;
   2040 	ipha_t		*ipha;
   2041 
   2042 	/* Caller already pulled up everything. */
   2043 	ipha = (ipha_t *)&icmph[1];
   2044 	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   2045 	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
   2046 	ASSERT(ill != NULL);
   2047 
        	/* Inner header length; only used by the BSD 4.2 workaround below. */
   2048 	hdr_length = IPH_HDR_LENGTH(ipha);
   2049 
   2050 	/*
   2051 	 * We handle path MTU for source routed packets since the DCE
   2052 	 * is looked up using the final destination.
   2053 	 */
   2054 	dst = ip_get_dst(ipha);
   2055 
        	/* The DCE obtained here is released by dce_refrele() at the end. */
   2056 	dce = dce_lookup_and_add_v4(dst, ipst);
   2057 	if (dce == NULL) {
   2058 		/* Couldn't add a unique one - ENOMEM */
   2059 		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
   2060 		    ntohl(dst)));
   2061 		return;
   2062 	}
   2063 
   2064 	/* Check for MTU discovery advice as described in RFC 1191 */
   2065 	mtu = ntohs(icmph->icmph_du_mtu);
   2066 	orig_mtu = mtu;		/* Kept only for the DTrace probe below */
   2067 	disable_pmtud = B_FALSE;
   2068 
   2069 	mutex_enter(&dce->dce_lock);
        	/* Baseline: the recorded path MTU if there is one, else the ill's MTU. */
   2070 	if (dce->dce_flags & DCEF_PMTU)
   2071 		old_mtu = dce->dce_pmtu;
   2072 	else
   2073 		old_mtu = ill->ill_mtu;
   2074 
        	/*
        	 * The reported MTU is not used as-is when the field that should be
        	 * zero is not, or when the value is below the configured minimum.
        	 * In that case a plateau is guessed from the original packet length.
        	 */
   2075 	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
   2076 		uint32_t length;
   2077 		int	i;
   2078 
   2079 		/*
   2080 		 * Use the table from RFC 1191 to figure out
   2081 		 * the next "plateau" based on the length in
   2082 		 * the original IP packet.
   2083 		 */
   2084 		length = ntohs(ipha->ipha_length);
   2085 		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
   2086 		    uint32_t, length);
        		/*
        		 * NOTE(review): old_mtu is an int and length is a uint32_t,
        		 * so these comparisons are carried out unsigned. That is
        		 * harmless as long as old_mtu is never negative.
        		 */
   2087 		if (old_mtu <= length &&
   2088 		    old_mtu >= length - hdr_length) {
   2089 			/*
   2090 			 * Handle broken BSD 4.2 systems that
   2091 			 * return the wrong ipha_length in ICMP
   2092 			 * errors.
   2093 			 */
   2094 			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
   2095 			    length, old_mtu));
   2096 			length -= hdr_length;
   2097 		}
        		/* Find the first (largest) plateau strictly below length. */
   2098 		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
   2099 			if (length > icmp_frag_size_table[i])
   2100 				break;
   2101 		}
   2102 		if (i == A_CNT(icmp_frag_size_table)) {
   2103 			/* Smaller than IP_MIN_MTU! */
   2104 			ip1dbg(("Too big for packet size %d\n",
   2105 			    length));
   2106 			disable_pmtud = B_TRUE;
   2107 			mtu = ipst->ips_ip_pmtu_min;
   2108 		} else {
   2109 			mtu = icmp_frag_size_table[i];
   2110 			ip1dbg(("Calculated mtu %d, packet size %d, "
   2111 			    "before %d\n", mtu, length, old_mtu));
        			/* Never go below the configured minimum path MTU. */
   2112 			if (mtu < ipst->ips_ip_pmtu_min) {
   2113 				mtu = ipst->ips_ip_pmtu_min;
   2114 				disable_pmtud = B_TRUE;
   2115 			}
   2116 		}
   2117 	}
        	/* Record, or clear, the fact that the path MTU hit the minimum. */
   2118 	if (disable_pmtud)
   2119 		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
   2120 	else
   2121 		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
   2122 
        	/* The path MTU is only ever lowered here, never raised. */
   2123 	dce->dce_pmtu = MIN(old_mtu, mtu);
   2124 	/* Prepare to send the new max frag size for the ULP. */
   2125 	icmph->icmph_du_zero = 0;
   2126 	icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
   2127 	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
   2128 	    dce, int, orig_mtu, int, mtu);
   2129 
   2130 	/* We now have a PMTU for sure */
   2131 	dce->dce_flags |= DCEF_PMTU;
   2132 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   2133 	mutex_exit(&dce->dce_lock);
   2134 	/*
   2135 	 * After dropping the lock the new value is visible to everyone.
   2136 	 * Then we bump the generation number so any cached values reinspect
   2137 	 * the dce_t.
   2138 	 */
   2139 	dce_increment_generation(dce);
   2140 	dce_refrele(dce);
   2141 }
   2142 
   2143 /*
   2144  * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
   2145  * calls this function.
   2146  */
   2147 static mblk_t *
   2148 icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
   2149 {
   2150 	int length;
   2151 
   2152 	ASSERT(mp->b_datap->db_type == M_DATA);
   2153 
   2154 	/* icmp_inbound_v4 has already pulled up the whole error packet */
   2155 	ASSERT(mp->b_cont == NULL);
   2156 
   2157 	/*
   2158 	 * The length that we want to overlay is the inner header
   2159 	 * and what follows it.
   2160 	 */
   2161 	length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
   2162 
   2163 	/*
   2164 	 * Overlay the inner header and whatever follows it over the
   2165 	 * outer header.
   2166 	 */
   2167 	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
   2168 
   2169 	/* Adjust for what we removed */
   2170 	mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
   2171 	return (mp);
   2172 }
   2173 
   2174 /*
   2175  * Try to pass the ICMP message upstream in case the ULP cares.
   2176  *
   2177  * If the packet that caused the ICMP error is secure, we send
   2178  * it to AH/ESP to make sure that the attached packet has a
   2179  * valid association. ipha in the code below points to the
   2180  * IP header of the packet that caused the error.
   2181  *
   2182  * For IPsec cases, we let the next-layer-up (which has access to
   2183  * cached policy on the conn_t, or can query the SPD directly)
   2184  * subtract out any IPsec overhead if they must.  We therefore make no
   2185  * adjustments here for IPsec overhead.
   2186  *
   2187  * IFN could have been generated locally or by some router.
   2188  *
   2189  * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
   2190  * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN.
   2191  *	    This happens because IP adjusted its value of MTU on an
   2192  *	    earlier IFN message and could not tell the upper layer,
   2193  *	    the new adjusted value of MTU e.g. Packet was encrypted
   2194  *	    or there was not enough information to fanout to upper
   2195  *	    layers. Thus on the next outbound datagram, ire_send_wire
   2196  *	    generates the IFN, where IPsec processing has *not* been
   2197  *	    done.
   2198  *
   2199  *	    Note that we retain ixa_fragsize across IPsec thus once
   2200  *	    we have picking ixa_fragsize and entered ipsec_out_process we do
   2201  *	    no change the fragsize even if the path MTU changes before
   2202  *	    we reach ip_output_post_ipsec.
   2203  *
   2204  *	    In the local case, IRAF_LOOPBACK will be set indicating
   2205  *	    that IFN was generated locally.
   2206  *
   2207  * ROUTER : IFN could be secure or non-secure.
   2208  *
   2209  *	    * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
   2210  *	      packet in error has AH/ESP headers to validate the AH/ESP
   2211  *	      headers. AH/ESP will verify whether there is a valid SA or
   2212  *	      not and send it back. We will fanout again if we have more
   2213  *	      data in the packet.
   2214  *
   2215  *	      If the packet in error does not have AH/ESP, we handle it
   2216  *	      like any other case.
   2217  *
   2218  *	    * NON_SECURE : If the packet in error has AH/ESP headers, we send it
   2219  *	      up to AH/ESP for validation. AH/ESP will verify whether there is a
   2220  *	      valid SA or not and send it back. We will fanout again if
   2221  *	      we have more data in the packet.
   2222  *
   2223  *	      If the packet in error does not have AH/ESP, we handle it
   2224  *	      like any other case.
   2225  *
   2226  * The caller must have called icmp_inbound_verify_v4.
   2227  */
   2228 static void
   2229 icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   2230 {
   2231 	uint16_t	*up;	/* Pointer to ports in ULP header */
   2232 	uint32_t	ports;	/* reversed ports for fanout */
   2233 	ipha_t		ripha;	/* With reversed addresses */
   2234 	ipha_t		*ipha;  /* Inner IP header */
   2235 	uint_t		hdr_length; /* Inner IP header length */
   2236 	tcpha_t		*tcpha;
   2237 	conn_t		*connp;
   2238 	ill_t		*ill = ira->ira_ill;
   2239 	ip_stack_t	*ipst = ill->ill_ipst;
   2240 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   2241 	ill_t		*rill = ira->ira_rill;
   2242 
   2243 	/* Caller already pulled up everything. */
   2244 	ipha = (ipha_t *)&icmph[1];
   2245 	ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
   2246 	ASSERT(mp->b_cont == NULL);
   2247 
   2248 	hdr_length = IPH_HDR_LENGTH(ipha);
   2249 	ira->ira_protocol = ipha->ipha_protocol;
   2250 
   2251 	/*
   2252 	 * We need a separate IP header with the source and destination
   2253 	 * addresses reversed to do fanout/classification because the ipha in
   2254 	 * the ICMP error is in the form we sent it out.
   2255 	 */
   2256 	ripha.ipha_src = ipha->ipha_dst;
   2257 	ripha.ipha_dst = ipha->ipha_src;
   2258 	ripha.ipha_protocol = ipha->ipha_protocol;
   2259 	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
   2260 
   2261 	ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
   2262 	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
   2263 	    ntohl(ipha->ipha_dst),
   2264 	    icmph->icmph_type, icmph->icmph_code));
   2265 
   2266 	switch (ipha->ipha_protocol) {
   2267 	case IPPROTO_UDP:
   2268 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2269 
   2270 		/* Attempt to find a client stream based on port. */
   2271 		ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
   2272 		    ntohs(up[0]), ntohs(up[1])));
   2273 
   2274 		/* Note that we send error to all matches. */
   2275 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2276 		ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
   2277 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2278 		return;
   2279 
   2280 	case IPPROTO_TCP:
   2281 		/*
   2282 		 * Find a TCP client stream for this packet.
   2283 		 * Note that we do a reverse lookup since the header is
   2284 		 * in the form we sent it out.
   2285 		 */
   2286 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   2287 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   2288 		    ipst);
   2289 		if (connp == NULL)
   2290 			goto discard_pkt;
   2291 
   2292 		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
   2293 		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
   2294 			mp = ipsec_check_inbound_policy(mp, connp,
   2295 			    ipha, NULL, ira);
   2296 			if (mp == NULL) {
   2297 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2298 				/* Note that mp is NULL */
   2299 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2300 				CONN_DEC_REF(connp);
   2301 				return;
   2302 			}
   2303 		}
   2304 
   2305 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2306 		ira->ira_ill = ira->ira_rill = NULL;
   2307 		if (IPCL_IS_TCP(connp)) {
   2308 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   2309 			    connp->conn_recvicmp, connp, ira, SQ_FILL,
   2310 			    SQTAG_TCP_INPUT_ICMP_ERR);
   2311 		} else {
   2312 			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
   2313 			(connp->conn_recv)(connp, mp, NULL, ira);
   2314 			CONN_DEC_REF(connp);
   2315 		}
   2316 		ira->ira_ill = ill;
   2317 		ira->ira_rill = rill;
   2318 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2319 		return;
   2320 
   2321 	case IPPROTO_SCTP:
   2322 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2323 		/* Find a SCTP client stream for this packet. */
   2324 		((uint16_t *)&ports)[0] = up[1];
   2325 		((uint16_t *)&ports)[1] = up[0];
   2326 
   2327 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2328 		ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
   2329 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2330 		return;
   2331 
   2332 	case IPPROTO_ESP:
   2333 	case IPPROTO_AH:
   2334 		if (!ipsec_loaded(ipss)) {
   2335 			ip_proto_not_sup(mp, ira);
   2336 			return;
   2337 		}
   2338 
   2339 		if (ipha->ipha_protocol == IPPROTO_ESP)
   2340 			mp = ipsecesp_icmp_error(mp, ira);
   2341 		else
   2342 			mp = ipsecah_icmp_error(mp, ira);
   2343 		if (mp == NULL)
   2344 			return;
   2345 
   2346 		/* Just in case ipsec didn't preserve the NULL b_cont */
   2347 		if (mp->b_cont != NULL) {
   2348 			if (!pullupmsg(mp, -1))
   2349 				goto discard_pkt;
   2350 		}
   2351 
   2352 		/*
   2353 		 * Note that ira_pktlen and ira_ip_hdr_length are no longer
   2354 		 * correct, but we don't use them any more here.
   2355 		 *
   2356 		 * If succesful, the mp has been modified to not include
   2357 		 * the ESP/AH header so we can fanout to the ULP's icmp
   2358 		 * error handler.
   2359 		 */
   2360 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2361 			goto truncated;
   2362 
   2363 		/* Verify the modified message before any further processes. */
   2364 		ipha = (ipha_t *)mp->b_rptr;
   2365 		hdr_length = IPH_HDR_LENGTH(ipha);
   2366 		icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2367 		if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2368 			freemsg(mp);
   2369 			return;
   2370 		}
   2371 
   2372 		icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2373 		return;
   2374 
   2375 	case IPPROTO_ENCAP: {
   2376 		/* Look for self-encapsulated packets that caused an error */
   2377 		ipha_t *in_ipha;
   2378 
   2379 		/*
   2380 		 * Caller has verified that length has to be
   2381 		 * at least the size of IP header.
   2382 		 */
   2383 		ASSERT(hdr_length >= sizeof (ipha_t));
   2384 		/*
   2385 		 * Check the sanity of the inner IP header like
   2386 		 * we did for the outer header.
   2387 		 */
   2388 		in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
   2389 		if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
   2390 			goto discard_pkt;
   2391 		}
   2392 		if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
   2393 			goto discard_pkt;
   2394 		}
   2395 		/* Check for Self-encapsulated tunnels */
   2396 		if (in_ipha->ipha_src == ipha->ipha_src &&
   2397 		    in_ipha->ipha_dst == ipha->ipha_dst) {
   2398 
   2399 			mp = icmp_inbound_self_encap_error_v4(mp, ipha,
   2400 			    in_ipha);
   2401 			if (mp == NULL)
   2402 				goto discard_pkt;
   2403 
   2404 			/*
   2405 			 * Just in case self_encap didn't preserve the NULL
   2406 			 * b_cont
   2407 			 */
   2408 			if (mp->b_cont != NULL) {
   2409 				if (!pullupmsg(mp, -1))
   2410 					goto discard_pkt;
   2411 			}
   2412 			/*
   2413 			 * Note that ira_pktlen and ira_ip_hdr_length are no
   2414 			 * longer correct, but we don't use them any more here.
   2415 			 */
   2416 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2417 				goto truncated;
   2418 
   2419 			/*
   2420 			 * Verify the modified message before any further
   2421 			 * processes.
   2422 			 */
   2423 			ipha = (ipha_t *)mp->b_rptr;
   2424 			hdr_length = IPH_HDR_LENGTH(ipha);
   2425 			icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2426 			if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2427 				freemsg(mp);
   2428 				return;
   2429 			}
   2430 
   2431 			/*
   2432 			 * The packet in error is self-encapsualted.
   2433 			 * And we are finding it further encapsulated
   2434 			 * which we could not have possibly generated.
   2435 			 */
   2436 			if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2437 				goto discard_pkt;
   2438 			}
   2439 			icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2440 			return;
   2441 		}
   2442 		/* No self-encapsulated */
   2443 		/* FALLTHRU */
   2444 	}
   2445 	case IPPROTO_IPV6:
   2446 		if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
   2447 		    &ripha.ipha_dst, ipst)) != NULL) {
   2448 			ira->ira_flags |= IRAF_ICMP_ERROR;
   2449 			connp->conn_recvicmp(connp, mp, NULL, ira);
   2450 			CONN_DEC_REF(connp);
   2451 			ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2452 			return;
   2453 		}
   2454 		/*
   2455 		 * No IP tunnel is interested, fallthrough and see
   2456 		 * if a raw socket will want it.
   2457 		 */
   2458 		/* FALLTHRU */
   2459 	default:
   2460 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2461 		ip_fanout_proto_v4(mp, &ripha, ira);
   2462 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2463 		return;
   2464 	}
   2465 	/* NOTREACHED */
   2466 discard_pkt:
   2467 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2468 	ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
   2469 	ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2470 	freemsg(mp);
   2471 	return;
   2472 
   2473 truncated:
   2474 	/* We pulled up everthing already. Must be truncated */
   2475 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2476 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2477 	freemsg(mp);
   2478 }
   2479 
   2480 /*
   2481  * Common IP options parser.
   2482  *
   2483  * Setup routine: fill in *optp with options-parsing state, then
   2484  * tail-call ipoptp_next to return the first option.
   2485  */
   2486 uint8_t
   2487 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
   2488 {
   2489 	uint32_t totallen; /* total length of all options */
   2490 
   2491 	totallen = ipha->ipha_version_and_hdr_length -
   2492 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   2493 	totallen <<= 2;
   2494 	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
   2495 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2496 	optp->ipoptp_flags = 0;
   2497 	return (ipoptp_next(optp));
   2498 }
   2499 
   2500 /* Like above but without an ipha_t */
   2501 uint8_t
   2502 ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
   2503 {
   2504 	optp->ipoptp_next = opt;
   2505 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2506 	optp->ipoptp_flags = 0;
   2507 	return (ipoptp_next(optp));
   2508 }
   2509 
   2510 /*
   2511  * Common IP options parser: extract next option.
   2512  */
   2513 uint8_t
   2514 ipoptp_next(ipoptp_t *optp)
   2515 {
   2516 	uint8_t *end = optp->ipoptp_end;
   2517 	uint8_t *cur = optp->ipoptp_next;
   2518 	uint8_t opt, len, pointer;
   2519 
   2520 	/*
   2521 	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
   2522 	 * has been corrupted.
   2523 	 */
   2524 	ASSERT(cur <= end);
   2525 
   2526 	if (cur == end)
   2527 		return (IPOPT_EOL);
   2528 
   2529 	opt = cur[IPOPT_OPTVAL];
   2530 
   2531 	/*
   2532 	 * Skip any NOP options.
   2533 	 */
   2534 	while (opt == IPOPT_NOP) {
   2535 		cur++;
   2536 		if (cur == end)
   2537 			return (IPOPT_EOL);
   2538 		opt = cur[IPOPT_OPTVAL];
   2539 	}
   2540 
   2541 	if (opt == IPOPT_EOL)
   2542 		return (IPOPT_EOL);
   2543 
   2544 	/*
   2545 	 * Option requiring a length.
   2546 	 */
   2547 	if ((cur + 1) >= end) {
   2548 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2549 		return (IPOPT_EOL);
   2550 	}
   2551 	len = cur[IPOPT_OLEN];
   2552 	if (len < 2) {
   2553 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2554 		return (IPOPT_EOL);
   2555 	}
   2556 	optp->ipoptp_cur = cur;
   2557 	optp->ipoptp_len = len;
   2558 	optp->ipoptp_next = cur + len;
   2559 	if (cur + len > end) {
   2560 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2561 		return (IPOPT_EOL);
   2562 	}
   2563 
   2564 	/*
   2565 	 * For the options which require a pointer field, make sure
   2566 	 * its there, and make sure it points to either something
   2567 	 * inside this option, or the end of the option.
   2568 	 */
   2569 	switch (opt) {
   2570 	case IPOPT_RR:
   2571 	case IPOPT_TS:
   2572 	case IPOPT_LSRR:
   2573 	case IPOPT_SSRR:
   2574 		if (len <= IPOPT_OFFSET) {
   2575 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2576 			return (opt);
   2577 		}
   2578 		pointer = cur[IPOPT_OFFSET];
   2579 		if (pointer - 1 > len) {
   2580 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2581 			return (opt);
   2582 		}
   2583 		break;
   2584 	}
   2585 
   2586 	/*
   2587 	 * Sanity check the pointer field based on the type of the
   2588 	 * option.
   2589 	 */
   2590 	switch (opt) {
   2591 	case IPOPT_RR:
   2592 	case IPOPT_SSRR:
   2593 	case IPOPT_LSRR:
   2594 		if (pointer < IPOPT_MINOFF_SR)
   2595 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2596 		break;
   2597 	case IPOPT_TS:
   2598 		if (pointer < IPOPT_MINOFF_IT)
   2599 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2600 		/*
   2601 		 * Note that the Internet Timestamp option also
   2602 		 * contains two four bit fields (the Overflow field,
   2603 		 * and the Flag field), which follow the pointer
   2604 		 * field.  We don't need to check that these fields
   2605 		 * fall within the length of the option because this
   2606 		 * was implicitely done above.  We've checked that the
   2607 		 * pointer value is at least IPOPT_MINOFF_IT, and that
   2608 		 * it falls within the option.  Since IPOPT_MINOFF_IT >
   2609 		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
   2610 		 */
   2611 		ASSERT(len > IPOPT_POS_OV_FLG);
   2612 		break;
   2613 	}
   2614 
   2615 	return (opt);
   2616 }
   2617 
   2618 /*
   2619  * Use the outgoing IP header to create an IP_OPTIONS option the way
   2620  * it was passed down from the application.
   2621  *
   2622  * This is compatible with BSD in that it returns
   2623  * the reverse source route with the final destination
   2624  * as the last entry. The first 4 bytes of the option
   2625  * will contain the final destination.
   2626  */
   2627 int
   2628 ip_opt_get_user(conn_t *connp, uchar_t *buf)
   2629 {
   2630 	ipoptp_t	opts;
   2631 	uchar_t		*opt;
   2632 	uint8_t		optval;
   2633 	uint8_t		optlen;
   2634 	uint32_t	len = 0;
   2635 	uchar_t		*buf1 = buf;
   2636 	uint32_t	totallen;
   2637 	ipaddr_t	dst;
   2638 	ip_pkt_t	*ipp = &connp->conn_xmit_ipp;
   2639 
   2640 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   2641 		return (0);
   2642 
   2643 	totallen = ipp->ipp_ipv4_options_len;
   2644 	if (totallen & 0x3)
   2645 		return (0);
   2646 
   2647 	buf += IP_ADDR_LEN;	/* Leave room for final destination */
   2648 	len += IP_ADDR_LEN;
   2649 	bzero(buf1, IP_ADDR_LEN);
   2650 
   2651 	dst = connp->conn_faddr_v4;
   2652 
   2653 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   2654 	    optval != IPOPT_EOL;
   2655 	    optval = ipoptp_next(&opts)) {
   2656 		int	off;
   2657 
   2658 		opt = opts.ipoptp_cur;
   2659 		if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   2660 			break;
   2661 		}
   2662 		optlen = opts.ipoptp_len;
   2663 
   2664 		switch (optval) {
   2665 		case IPOPT_SSRR:
   2666 		case IPOPT_LSRR:
   2667 
   2668 			/*
   2669 			 * Insert destination as the first entry in the source
   2670 			 * route and move down the entries on step.
   2671 			 * The last entry gets placed at buf1.
   2672 			 */
   2673 			buf[IPOPT_OPTVAL] = optval;
   2674 			buf[IPOPT_OLEN] = optlen;
   2675 			buf[IPOPT_OFFSET] = optlen;
   2676 
   2677 			off = optlen - IP_ADDR_LEN;
   2678 			if (off < 0) {
   2679 				/* No entries in source route */
   2680 				break;
   2681 			}
   2682 			/* Last entry in source route if not already set */
   2683 			if (dst == INADDR_ANY)
   2684 				bcopy(opt + off, buf1, IP_ADDR_LEN);
   2685 			off -= IP_ADDR_LEN;
   2686 
   2687 			while (off > 0) {
   2688 				bcopy(opt + off,
   2689 				    buf + off + IP_ADDR_LEN,
   2690 				    IP_ADDR_LEN);
   2691 				off -= IP_ADDR_LEN;
   2692 			}
   2693 			/* ipha_dst into first slot */
   2694 			bcopy(&dst, buf + off + IP_ADDR_LEN,
   2695 			    IP_ADDR_LEN);
   2696 			buf += optlen;
   2697 			len += optlen;
   2698 			break;
   2699 
   2700 		default:
   2701 			bcopy(opt, buf, optlen);
   2702 			buf += optlen;
   2703 			len += optlen;
   2704 			break;
   2705 		}
   2706 	}
   2707 done:
   2708 	/* Pad the resulting options */
   2709 	while (len & 0x3) {
   2710 		*buf++ = IPOPT_EOL;
   2711 		len++;
   2712 	}
   2713 	return (len);
   2714 }
   2715 
   2716 /*
   2717  * Update any record route or timestamp options to include this host.
   2718  * Reverse any source route option.
   2719  * This routine assumes that the options are well formed i.e. that they
   2720  * have already been checked.
   2721  */
   2722 static void
   2723 icmp_options_update(ipha_t *ipha)
   2724 {
   2725 	ipoptp_t	opts;
   2726 	uchar_t		*opt;
   2727 	uint8_t		optval;
   2728 	ipaddr_t	src;		/* Our local address */
   2729 	ipaddr_t	dst;
   2730 
   2731 	ip2dbg(("icmp_options_update\n"));
   2732 	src = ipha->ipha_src;
   2733 	dst = ipha->ipha_dst;
   2734 
   2735 	for (optval = ipoptp_first(&opts, ipha);
   2736 	    optval != IPOPT_EOL;
   2737 	    optval = ipoptp_next(&opts)) {
   2738 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   2739 		opt = opts.ipoptp_cur;
   2740 		ip2dbg(("icmp_options_update: opt %d, len %d\n",
   2741 		    optval, opts.ipoptp_len));
   2742 		switch (optval) {
   2743 			int off1, off2;
   2744 		case IPOPT_SSRR:
   2745 		case IPOPT_LSRR:
   2746 			/*
   2747 			 * Reverse the source route.  The first entry
   2748 			 * should be the next to last one in the current
   2749 			 * source route (the last entry is our address).
   2750 			 * The last entry should be the final destination.
   2751 			 */
   2752 			off1 = IPOPT_MINOFF_SR - 1;
   2753 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   2754 			if (off2 < 0) {
   2755 				/* No entries in source route */
   2756 				ip1dbg((
   2757 				    "icmp_options_update: bad src route\n"));
   2758 				break;
   2759 			}
   2760 			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
   2761 			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
   2762 			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
   2763 			off2 -= IP_ADDR_LEN;
   2764 
   2765 			while (off1 < off2) {
   2766 				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
   2767 				bcopy((char *)opt + off2, (char *)opt + off1,
   2768 				    IP_ADDR_LEN);
   2769 				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
   2770 				off1 += IP_ADDR_LEN;
   2771 				off2 -= IP_ADDR_LEN;
   2772 			}
   2773 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   2774 			break;
   2775 		}
   2776 	}
   2777 }
   2778 
   2779 /*
   2780  * Process received ICMP Redirect messages.
   2781  * Assumes the caller has verified that the headers are in the pulled up mblk.
   2782  * Consumes mp.
   2783  */
   2784 static void
   2785 icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
   2786 {
   2787 	ire_t		*ire, *nire;
   2788 	ire_t		*prev_ire;
   2789 	ipaddr_t  	src, dst, gateway;
   2790 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2791 	ipha_t		*inner_ipha;	/* Inner IP header */
   2792 
   2793 	/* Caller already pulled up everything. */
   2794 	inner_ipha = (ipha_t *)&icmph[1];
   2795 	src = ipha->ipha_src;
   2796 	dst = inner_ipha->ipha_dst;
   2797 	gateway = icmph->icmph_rd_gateway;
   2798 	/* Make sure the new gateway is reachable somehow. */
   2799 	ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
   2800 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
   2801 	/*
   2802 	 * Make sure we had a route for the dest in question and that
   2803 	 * that route was pointing to the old gateway (the source of the
   2804 	 * redirect packet.)
   2805 	 * We do longest match and then compare ire_gateway_addr below.
   2806 	 */
   2807 	prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
   2808 	    NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
   2809 	/*
   2810 	 * Check that
   2811 	 *	the redirect was not from ourselves
   2812 	 *	the new gateway and the old gateway are directly reachable
   2813 	 */
   2814 	if (prev_ire == NULL || ire == NULL ||
   2815 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
   2816 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   2817 	    !(ire->ire_type & IRE_IF_ALL) ||
   2818 	    prev_ire->ire_gateway_addr != src) {
   2819 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2820 		ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
   2821 		freemsg(mp);
   2822 		if (ire != NULL)
   2823 			ire_refrele(ire);
   2824 		if (prev_ire != NULL)
   2825 			ire_refrele(prev_ire);
   2826 		return;
   2827 	}
   2828 
   2829 	ire_refrele(prev_ire);
   2830 	ire_refrele(ire);
   2831 
   2832 	/*
   2833 	 * TODO: more precise handling for cases 0, 2, 3, the latter two
   2834 	 * require TOS routing
   2835 	 */
   2836 	switch (icmph->icmph_code) {
   2837 	case 0:
   2838 	case 1:
   2839 		/* TODO: TOS specificity for cases 2 and 3 */
   2840 	case 2:
   2841 	case 3:
   2842 		break;
   2843 	default:
   2844 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2845 		ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
   2846 		freemsg(mp);
   2847 		return;
   2848 	}
   2849 	/*
   2850 	 * Create a Route Association.  This will allow us to remember that
   2851 	 * someone we believe told us to use the particular gateway.
   2852 	 */
   2853 	ire = ire_create(
   2854 	    (uchar_t *)&dst,			/* dest addr */
   2855 	    (uchar_t *)&ip_g_all_ones,		/* mask */
   2856 	    (uchar_t *)&gateway,		/* gateway addr */
   2857 	    IRE_HOST,
   2858 	    NULL,				/* ill */
   2859 	    ALL_ZONES,
   2860 	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
   2861 	    NULL,				/* tsol_gc_t */
   2862 	    ipst);
   2863 
   2864 	if (ire == NULL) {
   2865 		freemsg(mp);
   2866 		return;
   2867 	}
   2868 	nire = ire_add(ire);
   2869 	/* Check if it was a duplicate entry */
   2870 	if (nire != NULL && nire != ire) {
   2871 		ASSERT(nire->ire_identical_ref > 1);
   2872 		ire_delete(nire);
   2873 		ire_refrele(nire);
   2874 		nire = NULL;
   2875 	}
   2876 	ire = nire;
   2877 	if (ire != NULL) {
   2878 		ire_refrele(ire);		/* Held in ire_add */
   2879 
   2880 		/* tell routing sockets that we received a redirect */
   2881 		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
   2882 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
   2883 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
   2884 	}
   2885 
   2886 	/*
   2887 	 * Delete any existing IRE_HOST type redirect ires for this destination.
   2888 	 * This together with the added IRE has the effect of
   2889 	 * modifying an existing redirect.
   2890 	 */
   2891 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
   2892 	    ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
   2893 	if (prev_ire != NULL) {
   2894 		if (prev_ire ->ire_flags & RTF_DYNAMIC)
   2895 			ire_delete(prev_ire);
   2896 		ire_refrele(prev_ire);
   2897 	}
   2898 
   2899 	freemsg(mp);
   2900 }
   2901 
   2902 /*
   2903  * Generate an ICMP parameter problem message.
   2904  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   2905  * constructed by the caller.
   2906  */
   2907 static void
   2908 icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
   2909 {
   2910 	icmph_t	icmph;
   2911 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2912 
   2913 	mp = icmp_pkt_err_ok(mp, ira);
   2914 	if (mp == NULL)
   2915 		return;
   2916 
   2917 	bzero(&icmph, sizeof (icmph_t));
   2918 	icmph.icmph_type = ICMP_PARAM_PROBLEM;
   2919 	icmph.icmph_pp_ptr = ptr;
   2920 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
   2921 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   2922 }
   2923 
   2924 /*
   2925  * Build and ship an IPv4 ICMP message using the packet data in mp, and
   2926  * the ICMP header pointed to by "stuff".  (May be called as writer.)
   2927  * Note: assumes that icmp_pkt_err_ok has been called to verify that
   2928  * an icmp error packet can be sent.
   2929  * Assigns an appropriate source address to the packet. If ipha_dst is
   2930  * one of our addresses use it for source. Otherwise let ip_output_simple
   2931  * pick the source address.
   2932  */
   2933 static void
   2934 icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
   2935 {
   2936 	ipaddr_t dst;
   2937 	icmph_t	*icmph;
   2938 	ipha_t	*ipha;
   2939 	uint_t	len_needed;
   2940 	size_t	msg_len;
   2941 	mblk_t	*mp1;
   2942 	ipaddr_t src;
   2943 	ire_t	*ire;
   2944 	ip_xmit_attr_t ixas;
   2945 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   2946 
   2947 	ipha = (ipha_t *)mp->b_rptr;
   2948 
   2949 	bzero(&ixas, sizeof (ixas));
   2950 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   2951 	ixas.ixa_zoneid = ira->ira_zoneid;
   2952 	ixas.ixa_ifindex = 0;
   2953 	ixas.ixa_ipst = ipst;
   2954 	ixas.ixa_cred = kcred;
   2955 	ixas.ixa_cpid = NOPID;
   2956 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   2957 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   2958 
   2959 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   2960 		/*
   2961 		 * Apply IPsec based on how IPsec was applied to
   2962 		 * the packet that had the error.
   2963 		 *
   2964 		 * If it was an outbound packet that caused the ICMP
   2965 		 * error, then the caller will have setup the IRA
   2966 		 * appropriately.
   2967 		 */
   2968 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   2969 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   2970 			/* Note: mp already consumed and ip_drop_packet done */
   2971 			return;
   2972 		}
   2973 	} else {
   2974 		/*
   2975 		 * This is in clear. The icmp message we are building
   2976 		 * here should go out in clear, independent of our policy.
   2977 		 */
   2978 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   2979 	}
   2980 
   2981 	/* Remember our eventual destination */
   2982 	dst = ipha->ipha_src;
   2983 
   2984 	/*
   2985 	 * If the packet was for one of our unicast addresses, make
   2986 	 * sure we respond with that as the source. Otherwise
   2987 	 * have ip_output_simple pick the source address.
   2988 	 */
   2989 	ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
   2990 	    (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
   2991 	    MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   2992 	if (ire != NULL) {
   2993 		ire_refrele(ire);
   2994 		src = ipha->ipha_dst;
   2995 	} else {
   2996 		src = INADDR_ANY;
   2997 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   2998 	}
   2999 
   3000 	/*
   3001 	 * Check if we can send back more then 8 bytes in addition to
   3002 	 * the IP header.  We try to send 64 bytes of data and the internal
   3003 	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
   3004 	 */
   3005 	len_needed = IPH_HDR_LENGTH(ipha);
   3006 	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
   3007 	    ipha->ipha_protocol == IPPROTO_IPV6) {
   3008 		if (!pullupmsg(mp, -1)) {
   3009 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3010 			ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
   3011 			freemsg(mp);
   3012 			return;
   3013 		}
   3014 		ipha = (ipha_t *)mp->b_rptr;
   3015 
   3016 		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   3017 			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
   3018 			    len_needed));
   3019 		} else {
   3020 			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
   3021 
   3022 			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
   3023 			len_needed += ip_hdr_length_v6(mp, ip6h);
   3024 		}
   3025 	}
   3026 	len_needed += ipst->ips_ip_icmp_return;
   3027 	msg_len = msgdsize(mp);
   3028 	if (msg_len > len_needed) {
   3029 		(void) adjmsg(mp, len_needed - msg_len);
   3030 		msg_len = len_needed;
   3031 	}
   3032 	mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
   3033 	if (mp1 == NULL) {
   3034 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
   3035 		freemsg(mp);
   3036 		return;
   3037 	}
   3038 	mp1->b_cont = mp;
   3039 	mp = mp1;
   3040 
   3041 	/*
   3042 	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
   3043 	 * node generates be accepted in peace by all on-host destinations.
   3044 	 * If we do NOT assume that all on-host destinations trust
   3045 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
   3046 	 * (Look for IXAF_TRUSTED_ICMP).
   3047 	 */
   3048 	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
   3049 
   3050 	ipha = (ipha_t *)mp->b_rptr;
   3051 	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
   3052 	*ipha = icmp_ipha;
   3053 	ipha->ipha_src = src;
   3054 	ipha->ipha_dst = dst;
   3055 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   3056 	msg_len += sizeof (icmp_ipha) + len;
   3057 	if (msg_len > IP_MAXPACKET) {
   3058 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
   3059 		msg_len = IP_MAXPACKET;
   3060 	}
   3061 	ipha->ipha_length = htons((uint16_t)msg_len);
   3062 	icmph = (icmph_t *)&ipha[1];
   3063 	bcopy(stuff, icmph, len);
   3064 	icmph->icmph_checksum = 0;
   3065 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
   3066 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   3067 
   3068 	(void) ip_output_simple(mp, &ixas);
   3069 	ixa_cleanup(&ixas);
   3070 }
   3071 
   3072 /*
   3073  * Determine if an ICMP error packet can be sent given the rate limit.
   3074  * The limit consists of an average frequency (icmp_pkt_err_interval measured
   3075  * in milliseconds) and a burst size. Burst size number of packets can
   3076  * be sent arbitrarely closely spaced.
   3077  * The state is tracked using two variables to implement an approximate
   3078  * token bucket filter:
   3079  *	icmp_pkt_err_last - lbolt value when the last burst started
   3080  *	icmp_pkt_err_sent - number of packets sent in current burst
   3081  */
   3082 boolean_t
   3083 icmp_err_rate_limit(ip_stack_t *ipst)
   3084 {
   3085 	clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
   3086 	uint_t refilled; /* Number of packets refilled in tbf since last */
   3087 	/* Guard against changes by loading into local variable */
   3088 	uint_t err_interval = ipst->ips_ip_icmp_err_interval;
   3089 
   3090 	if (err_interval == 0)
   3091 		return (B_FALSE);
   3092 
   3093 	if (ipst->ips_icmp_pkt_err_last > now) {
   3094 		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
   3095 		ipst->ips_icmp_pkt_err_last = 0;
   3096 		ipst->ips_icmp_pkt_err_sent = 0;
   3097 	}
   3098 	/*
   3099 	 * If we are in a burst update the token bucket filter.
   3100 	 * Update the "last" time to be close to "now" but make sure
   3101 	 * we don't loose precision.
   3102 	 */
   3103 	if (ipst->ips_icmp_pkt_err_sent != 0) {
   3104 		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
   3105 		if (refilled > ipst->ips_icmp_pkt_err_sent) {
   3106 			ipst->ips_icmp_pkt_err_sent = 0;
   3107 		} else {
   3108 			ipst->ips_icmp_pkt_err_sent -= refilled;
   3109 			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
   3110 		}
   3111 	}
   3112 	if (ipst->ips_icmp_pkt_err_sent == 0) {
   3113 		/* Start of new burst */
   3114 		ipst->ips_icmp_pkt_err_last = now;
   3115 	}
   3116 	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
   3117 		ipst->ips_icmp_pkt_err_sent++;
   3118 		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
   3119 		    ipst->ips_icmp_pkt_err_sent));
   3120 		return (B_FALSE);
   3121 	}
   3122 	ip1dbg(("icmp_err_rate_limit: dropped\n"));
   3123 	return (B_TRUE);
   3124 }
   3125 
   3126 /*
   3127  * Check if it is ok to send an IPv4 ICMP error packet in
   3128  * response to the IPv4 packet in mp.
   3129  * Free the message and return null if no
   3130  * ICMP error packet should be sent.
   3131  */
   3132 static mblk_t *
   3133 icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
   3134 {
   3135 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   3136 	icmph_t	*icmph;
   3137 	ipha_t	*ipha;
   3138 	uint_t	len_needed;
   3139 
   3140 	if (!mp)
   3141 		return (NULL);
   3142 	ipha = (ipha_t *)mp->b_rptr;
   3143 	if (ip_csum_hdr(ipha)) {
   3144 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
   3145 		ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
   3146 		freemsg(mp);
   3147 		return (NULL);
   3148 	}
   3149 	if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
   3150 	    ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
   3151 	    CLASSD(ipha->ipha_dst) ||
   3152 	    CLASSD(ipha->ipha_src) ||
   3153 	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
   3154 		/* Note: only errors to the fragment with offset 0 */
   3155 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3156 		freemsg(mp);
   3157 		return (NULL);
   3158 	}
   3159 	if (ipha->ipha_protocol == IPPROTO_ICMP) {
   3160 		/*
   3161 		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
   3162 		 * errors in response to any ICMP errors.
   3163 		 */
   3164 		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
   3165 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   3166 			if (!pullupmsg(mp, len_needed)) {
   3167 				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   3168 				freemsg(mp);
   3169 				return (NULL);
   3170 			}
   3171 			ipha = (ipha_t *)mp->b_rptr;
   3172 		}
   3173 		icmph = (icmph_t *)
   3174 		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
   3175 		switch (icmph->icmph_type) {
   3176 		case ICMP_DEST_UNREACHABLE:
   3177 		case ICMP_SOURCE_QUENCH:
   3178 		case ICMP_TIME_EXCEEDED:
   3179 		case ICMP_PARAM_PROBLEM:
   3180 		case ICMP_REDIRECT:
   3181 			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3182 			freemsg(mp);
   3183 			return (NULL);
   3184 		default:
   3185 			break;
   3186 		}
   3187 	}
   3188 	/*
   3189 	 * If this is a labeled system, then check to see if we're allowed to
   3190 	 * send a response to this particular sender.  If not, then just drop.
   3191 	 */
   3192 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
   3193 		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
   3194 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3195 		freemsg(mp);
   3196 		return (NULL);
   3197 	}
   3198 	if (icmp_err_rate_limit(ipst)) {
   3199 		/*
   3200 		 * Only send ICMP error packets every so often.
   3201 		 * This should be done on a per port/source basis,
   3202 		 * but for now this will suffice.
   3203 		 */
   3204 		freemsg(mp);
   3205 		return (NULL);
   3206 	}
   3207 	return (mp);
   3208 }
   3209 
   3210 /*
   3211  * Called when a packet was sent out the same link that it arrived on.
   3212  * Check if it is ok to send a redirect and then send it.
   3213  */
   3214 void
   3215 ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
   3216     ip_recv_attr_t *ira)
   3217 {
   3218 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   3219 	ipaddr_t	src, nhop;
   3220 	mblk_t		*mp1;
   3221 	ire_t		*nhop_ire;
   3222 
   3223 	/*
   3224 	 * Check the source address to see if it originated
   3225 	 * on the same logical subnet it is going back out on.
   3226 	 * If so, we should be able to send it a redirect.
   3227 	 * Avoid sending a redirect if the destination
   3228 	 * is directly connected (i.e., we matched an IRE_ONLINK),
   3229 	 * or if the packet was source routed out this interface.
   3230 	 *
   3231 	 * We avoid sending a redirect if the
   3232 	 * destination is directly connected
   3233 	 * because it is possible that multiple
   3234 	 * IP subnets may have been configured on
   3235 	 * the link, and the source may not
   3236 	 * be on the same subnet as ip destination,
   3237 	 * even though they are on the same
   3238 	 * physical link.
   3239 	 */
   3240 	if ((ire->ire_type & IRE_ONLINK) ||
   3241 	    ip_source_routed(ipha, ipst))
   3242 		return;
   3243 
   3244 	nhop_ire = ire_nexthop(ire);
   3245 	if (nhop_ire == NULL)
   3246 		return;
   3247 
   3248 	nhop = nhop_ire->ire_addr;
   3249 
   3250 	if (nhop_ire->ire_type & IRE_IF_CLONE) {
   3251 		ire_t	*ire2;
   3252 
   3253 		/* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
   3254 		mutex_enter(&nhop_ire->ire_lock);
   3255 		ire2 = nhop_ire->ire_dep_parent;
   3256 		if (ire2 != NULL)
   3257 			ire_refhold(ire2);
   3258 		mutex_exit(&nhop_ire->ire_lock);
   3259 		ire_refrele(nhop_ire);
   3260 		nhop_ire = ire2;
   3261 	}
   3262 	if (nhop_ire == NULL)
   3263 		return;
   3264 
   3265 	ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
   3266 
   3267 	src = ipha->ipha_src;
   3268 
   3269 	/*
   3270 	 * We look at the interface ire for the nexthop,
   3271 	 * to see if ipha_src is in the same subnet
   3272 	 * as the nexthop.
   3273 	 */
   3274 	if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
   3275 		/*
   3276 		 * The source is directly connected.
   3277 		 */
   3278 		mp1 = copymsg(mp);
   3279 		if (mp1 != NULL) {
   3280 			icmp_send_redirect(mp1, nhop, ira);
   3281 		}
   3282 	}
   3283 	ire_refrele(nhop_ire);
   3284 }
   3285 
   3286 /*
   3287  * Generate an ICMP redirect message.
   3288  */
   3289 static void
   3290 icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
   3291 {
   3292 	icmph_t	icmph;
   3293 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3294 
   3295 	mp = icmp_pkt_err_ok(mp, ira);
   3296 	if (mp == NULL)
   3297 		return;
   3298 
   3299 	bzero(&icmph, sizeof (icmph_t));
   3300 	icmph.icmph_type = ICMP_REDIRECT;
   3301 	icmph.icmph_code = 1;
   3302 	icmph.icmph_rd_gateway = gateway;
   3303 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
   3304 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3305 }
   3306 
   3307 /*
   3308  * Generate an ICMP time exceeded message.
   3309  */
   3310 void
   3311 icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3312 {
   3313 	icmph_t	icmph;
   3314 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3315 
   3316 	mp = icmp_pkt_err_ok(mp, ira);
   3317 	if (mp == NULL)
   3318 		return;
   3319 
   3320 	bzero(&icmph, sizeof (icmph_t));
   3321 	icmph.icmph_type = ICMP_TIME_EXCEEDED;
   3322 	icmph.icmph_code = code;
   3323 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
   3324 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3325 }
   3326 
   3327 /*
   3328  * Generate an ICMP unreachable message.
   3329  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   3330  * constructed by the caller.
   3331  */
   3332 void
   3333 icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3334 {
   3335 	icmph_t	icmph;
   3336 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3337 
   3338 	mp = icmp_pkt_err_ok(mp, ira);
   3339 	if (mp == NULL)
   3340 		return;
   3341 
   3342 	bzero(&icmph, sizeof (icmph_t));
   3343 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   3344 	icmph.icmph_code = code;
   3345 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   3346 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3347 }
   3348 
   3349 /*
   3350  * Latch in the IPsec state for a stream based the policy in the listener
   3351  * and the actions in the ip_recv_attr_t.
   3352  * Called directly from TCP and SCTP.
   3353  */
   3354 boolean_t
   3355 ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
   3356 {
   3357 	ASSERT(lconnp->conn_policy != NULL);
   3358 	ASSERT(connp->conn_policy == NULL);
   3359 
   3360 	IPPH_REFHOLD(lconnp->conn_policy);
   3361 	connp->conn_policy = lconnp->conn_policy;
   3362 
   3363 	if (ira->ira_ipsec_action != NULL) {
   3364 		if (connp->conn_latch == NULL) {
   3365 			connp->conn_latch = iplatch_create();
   3366 			if (connp->conn_latch == NULL)
   3367 				return (B_FALSE);
   3368 		}
   3369 		ipsec_latch_inbound(connp, ira);
   3370 	}
   3371 	return (B_TRUE);
   3372 }
   3373 
   3374 /*
   3375  * Verify whether or not the IP address is a valid local address.
   3376  * Could be a unicast, including one for a down interface.
   3377  * If allow_mcbc then a multicast or broadcast address is also
   3378  * acceptable.
   3379  *
   3380  * In the case of a broadcast/multicast address, however, the
   3381  * upper protocol is expected to reset the src address
   3382  * to zero when we return IPVL_MCAST/IPVL_BCAST so that
   3383  * no packets are emitted with broadcast/multicast address as
   3384  * source address (that violates hosts requirements RFC 1122)
   3385  * The addresses valid for bind are:
   3386  *	(1) - INADDR_ANY (0)
   3387  *	(2) - IP address of an UP interface
   3388  *	(3) - IP address of a DOWN interface
   3389  *	(4) - valid local IP broadcast addresses. In this case
   3390  *	the conn will only receive packets destined to
   3391  *	the specified broadcast address.
   3392  *	(5) - a multicast address. In this case
   3393  *	the conn will only receive packets destined to
   3394  *	the specified multicast address. Note: the
   3395  *	application still has to issue an
   3396  *	IP_ADD_MEMBERSHIP socket option.
   3397  *
   3398  * In all the above cases, the bound address must be valid in the current zone.
   3399  * When the address is loopback, multicast or broadcast, there might be many
   3400  * matching IREs so bind has to look up based on the zone.
   3401  */
   3402 ip_laddr_t
   3403 ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
   3404     ip_stack_t *ipst, boolean_t allow_mcbc)
   3405 {
   3406 	ire_t *src_ire;
   3407 
   3408 	ASSERT(src_addr != INADDR_ANY);
   3409 
   3410 	src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
   3411 	    NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   3412 
   3413 	/*
   3414 	 * If an address other than in6addr_any is requested,
   3415 	 * we verify that it is a valid address for bind
   3416 	 * Note: Following code is in if-else-if form for
   3417 	 * readability compared to a condition check.
   3418 	 */
   3419 	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
   3420 		/*
   3421 		 * (2) Bind to address of local UP interface
   3422 		 */
   3423 		ire_refrele(src_ire);
   3424 		return (IPVL_UNICAST_UP);
   3425 	} else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
   3426 		/*
   3427 		 * (4) Bind to broadcast address
   3428 		 */
   3429 		ire_refrele(src_ire);
   3430 		if (allow_mcbc)
   3431 			return (IPVL_BCAST);
   3432 		else
   3433 			return (IPVL_BAD);
   3434 	} else if (CLASSD(src_addr)) {
   3435 		/* (5) bind to multicast address. */
   3436 		if (src_ire != NULL)
   3437 			ire_refrele(src_ire);
   3438 
   3439 		if (allow_mcbc)
   3440 			return (IPVL_MCAST);
   3441 		else
   3442 			return (IPVL_BAD);
   3443 	} else {
   3444 		ipif_t *ipif;
   3445 
   3446 		/*
   3447 		 * (3) Bind to address of local DOWN interface?
   3448 		 * (ipif_lookup_addr() looks up all interfaces
   3449 		 * but we do not get here for UP interfaces
   3450 		 * - case (2) above)
   3451 		 */
   3452 		if (src_ire != NULL)
   3453 			ire_refrele(src_ire);
   3454 
   3455 		ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
   3456 		if (ipif == NULL)
   3457 			return (IPVL_BAD);
   3458 
   3459 		/* Not a useful source? */
   3460 		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
   3461 			ipif_refrele(ipif);
   3462 			return (IPVL_BAD);
   3463 		}
   3464 		ipif_refrele(ipif);
   3465 		return (IPVL_UNICAST_DOWN);
   3466 	}
   3467 }
   3468 
   3469 /*
   3470  * Insert in the bind fanout for IPv4 and IPv6.
   3471  * The caller should already have used ip_laddr_verify_v*() before calling
   3472  * this.
   3473  */
   3474 int
   3475 ip_laddr_fanout_insert(conn_t *connp)
   3476 {
   3477 	int		error;
   3478 
   3479 	/*
   3480 	 * Allow setting new policies. For example, disconnects result
   3481 	 * in us being called. As we would have set conn_policy_cached
   3482 	 * to B_TRUE before, we should set it to B_FALSE, so that policy
   3483 	 * can change after the disconnect.
   3484 	 */
   3485 	connp->conn_policy_cached = B_FALSE;
   3486 
   3487 	error = ipcl_bind_insert(connp);
   3488 	if (error != 0) {
   3489 		if (connp->conn_anon_port) {
   3490 			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   3491 			    connp->conn_mlp_type, connp->conn_proto,
   3492 			    ntohs(connp->conn_lport), B_FALSE);
   3493 		}
   3494 		connp->conn_mlp_type = mlptSingle;
   3495 	}
   3496 	return (error);
   3497 }
   3498 
/*
 * Set up the transmit attributes (ixa) for sending to an IPv4 destination.
 *
 * Verify that both the source and destination addresses are valid. If
 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
 * i.e. have no route to it.  Protocols like TCP want to verify destination
 * reachability, while tunnels do not.
 *
 * Determine the route, the interface, and (optionally) the source address
 * to use to reach a given destination.
 * Note that we allow connect to broadcast and multicast addresses when
 * IPDF_ALLOW_MCBC is set.
 * first_hop and dst_addr are normally the same, but if source routing
 * they will differ; in that case the first_hop is what we'll use for the
 * routing lookup but the dce and label checks will be done on dst_addr.
 *
 * If uinfo is set, then we fill in the best available information
 * we have for the destination. This is based on (in priority order) any
 * metrics and path MTU stored in a dce_t, route metrics, and finally the
 * ill_mtu.
 *
 * Tsol note: If we have a source route then dst_addr != firsthop. But we
 * always do the label check on dst_addr.
 *
 * Parameters:
 *	src_addrp - in/out source address. Read for the loopback source
 *		    checks; overwritten with the selected source when
 *		    IPDF_SELECT_SRC is set.
 *	dst_addr  - final destination; must not be INADDR_ANY.
 *	firsthop  - address used for the route and nce lookups.
 *	ixa	  - transmit attributes to fill in. On the normal path
 *		    ixa_ire, ixa_ire_generation, ixa_postfragfn, ixa_nce,
 *		    ixa_dce, ixa_dce_generation, ixa_fragsize and some
 *		    ixa_flags bits are updated (ixa_pmtu and
 *		    ixa_src_generation conditionally).
 *	uinfo	  - optional; zeroed and filled with metrics when non-NULL.
 *	flags	  - IPDF_* bits (IPDF_VERIFY_DST, IPDF_ALLOW_MCBC,
 *		    IPDF_SELECT_SRC, IPDF_UNIQUE_DCE, IPDF_ZONE_IS_GLOBAL).
 *	mac_mode  - handed to tsol_check_dest() on labeled systems.
 *
 * Returns 0 on success, otherwise an errno value: ENETUNREACH,
 * EHOSTUNREACH or EADDRNOTAVAIL from the checks below, or whatever
 * tsol_check_dest(), ip_select_route_v4() or ip_select_source_v4()
 * reported. Note that for ENETUNREACH/EHOSTUNREACH the ixa is still set
 * up (pointing at a reject/blackhole IRE) before the error is returned.
 */
int
ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
    ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
{
	ire_t		*ire = NULL;
	int		error = 0;
	ipaddr_t	setsrc;				/* RTF_SETSRC */
	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
	ip_stack_t	*ipst = ixa->ixa_ipst;
	dce_t		*dce;
	uint_t		pmtu;
	uint_t		generation;
	nce_t		*nce;
	ill_t		*ill = NULL;
	boolean_t	multirt = B_FALSE;

	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	/*
	 * We never send to zero; the ULPs map it to the loopback address.
	 * We can't allow it since we use zero to mean uninitialized in some
	 * places.
	 */
	ASSERT(dst_addr != INADDR_ANY);

	if (is_system_labeled()) {
		ts_label_t *tsl = NULL;

		error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
		if (error != 0)
			return (error);
		if (tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, tsl);
		}
	}

	setsrc = INADDR_ANY;
	/*
	 * Select a route; For IPMP interfaces, we would only select
	 * a "hidden" route (i.e., going through a specific under_ill)
	 * if ixa_ifindex has been specified.
	 */
	ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error,
	    &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0)
		goto bad_addr;

	/*
	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
	 * If IPDF_VERIFY_DST is set, the destination must be reachable;
	 * Otherwise the destination needn't be reachable.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure.  May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 *
		 * In both cases we let the code continue so some reasonable
		 * information is returned to the caller. That enables the
		 * caller to use (and even cache) the IRE. conn_ip_output will
		 * use the generation mismatch path to check for the unreachable
		 * case thereby avoiding any specific check in the main path.
		 */
		ASSERT(generation == IRE_GENERATION_VERIFY);
		if (flags & IPDF_VERIFY_DST) {
			/*
			 * Set errno but continue to set up ixa_ire to be
			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
			 * That allows callers to use ip_output to get an
			 * ICMP error back.
			 */
			if (!(ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
		}
	}

	/*
	 * Broadcast/multicast not allowed: substitute the reject IRE and,
	 * like the reject case above, keep going so the ixa gets set up.
	 */
	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
	    !(flags & IPDF_ALLOW_MCBC)) {
		ire_refrele(ire);
		ire = ire_reject(ipst, B_FALSE);
		generation = IRE_GENERATION_VERIFY;
		error = ENETUNREACH;
	}

	/*
	 * Cache things. From here on the IRE is owned by ixa->ixa_ire;
	 * error paths below set the local ire to NULL before jumping to
	 * bad_addr so that it is not released there.
	 */
	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/*
	 * NOTE(review): presumably swaps the traced hold for an untraced
	 * ("notr") one to match ire_refrele_notr() above - confirm.
	 */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = generation;

	/*
	 * For multicast with multirt we have a flag passed back from
	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
	 * possible multicast address.
	 * We also need a flag for multicast since we can't check
	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
	 */
	if (multirt) {
		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}
	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
		/* Get an nce to cache. */
		nce = ire_to_nce(ire, firsthop, NULL);
		if (nce == NULL) {
			/* Allocation failure? */
			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		} else {
			if (ixa->ixa_nce != NULL)
				nce_refrele(ixa->ixa_nce);
			ixa->ixa_nce = nce;
		}
	}

	/*
	 * If the source address is a loopback address, the
	 * destination had best be local or multicast.
	 * If we are sending to an IRE_LOCAL using a loopback source then
	 * it had better be the same zoneid.
	 */
	if (*src_addrp == htonl(INADDR_LOOPBACK)) {
		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
	}
	if (ire->ire_type & IRE_BROADCAST) {
		/*
		 * If the ULP didn't have a specified source, then we
		 * make sure we reselect the source when sending
		 * broadcasts out different interfaces.
		 */
		if (flags & IPDF_SELECT_SRC)
			ixa->ixa_flags |= IXAF_SET_SOURCE;
		else
			ixa->ixa_flags &= ~IXAF_SET_SOURCE;
	}

	/*
	 * Does the caller want us to pick a source address?
	 */
	if (flags & IPDF_SELECT_SRC) {
		ipaddr_t	src_addr;

		/*
		 * We use ire_nexthop_ill to avoid the under ipmp
		 * interface for source address selection. Note that for ipmp
		 * probe packets, ixa_ifindex would have been specified, and
		 * the ip_select_route() invocation would have picked an ire
		 * with ire_ill pointing at an under interface.
		 */
		ill = ire_nexthop_ill(ire);

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src_addr = htonl(INADDR_LOOPBACK);
			/* Make sure we look for a better source address */
			generation = SRC_GENERATION_VERIFY;
		} else {
			/*
			 * NOTE(review): this assignment replaces any
			 * ENETUNREACH/EHOSTUNREACH recorded above; presumably
			 * ill is NULL for reject/blackhole IREs so the two
			 * never meet - confirm.
			 */
			error = ip_select_source_v4(ill, setsrc, dst_addr,
			    ixa->ixa_multicast_ifaddr, zoneid,
			    ipst, &src_addr, &generation, NULL);
			if (error != 0) {
				ire = NULL;	/* Stored in ixa_ire */
				goto bad_addr;
			}
		}

		/*
		 * We allow the source address to be down.
		 * However, we check that we don't use the loopback address
		 * as a source when sending out on the wire.
		 */
		if ((src_addr == htonl(INADDR_LOOPBACK)) &&
		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}

		*src_addrp = src_addr;
		ixa->ixa_src_generation = generation;
	}

	/* Look up the destination cache entry; generation is reused for it */
	if (flags & IPDF_UNIQUE_DCE) {
		/* Fallback to the default dce if allocation fails */
		dce = dce_lookup_and_add_v4(dst_addr, ipst);
		if (dce != NULL)
			generation = dce->dce_generation;
		else
			dce = dce_lookup_v4(dst_addr, ipst, &generation);
	} else {
		dce = dce_lookup_v4(dst_addr, ipst, &generation);
	}
	ASSERT(dce != NULL);
	if (ixa->ixa_dce != NULL)
		dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = generation;

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	/*
	 * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
	 * However, we can't do it for IPv4 multicast or broadcast.
	 */
	if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;

	/*
	 * Set initial value for fragmentation limit. Either conn_ip_output
	 * or ULP might update it when there are routing changes.
	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
	 */
	pmtu = ip_get_pmtu(ixa);
	ixa->ixa_fragsize = pmtu;
	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
		ixa->ixa_pmtu = pmtu;

	/*
	 * Extract information useful for some transports.
	 * First we look for DCE metrics. Then we take what we have in
	 * the metrics in the route, where the offlink is used if we have
	 * one.
	 */
	if (uinfo != NULL) {
		bzero(uinfo, sizeof (*uinfo));

		if (dce->dce_flags & DCEF_UINFO)
			*uinfo = dce->dce_uinfo;

		rts_merge_metrics(uinfo, &ire->ire_metrics);

		/* Allow ire_metrics to decrease the path MTU from above */
		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
			uinfo->iulp_mtu = pmtu;

		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
	}

	/* ill is only held when a source address was selected above */
	if (ill != NULL)
		ill_refrele(ill);

	/* May be non-zero (unreachable destination) even on this path */
	return (error);

bad_addr:
	/* ire is NULL here once ownership has moved to ixa->ixa_ire */
	if (ire != NULL)
		ire_refrele(ire);

	if (ill != NULL)
		ill_refrele(ill);

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	return (error);
}
   3831 
   3832 
   3833 /*
   3834  * Get the base MTU for the case when path MTU discovery is not used.
   3835  * Takes the MTU of the IRE into account.
   3836  */
   3837 uint_t
   3838 ip_get_base_mtu(ill_t *ill, ire_t *ire)
   3839 {
   3840 	uint_t mtu = ill->ill_mtu;
   3841 	uint_t iremtu = ire->ire_metrics.iulp_mtu;
   3842 
   3843 	if (iremtu != 0 && iremtu < mtu)
   3844 		mtu = iremtu;
   3845 
   3846 	return (mtu);
   3847 }
   3848 
   3849 /*
   3850  * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
   3851  * Assumes that ixa_ire, dce, and nce have already been set up.
   3852  *
   3853  * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
   3854  * We avoid path MTU discovery if it is disabled with ndd.
   3855  * Furtermore, if the path MTU is too small, then we don't set DF for IPv4.
   3856  *
   3857  * NOTE: We also used to turn it off for source routed packets. That
   3858  * is no longer required since the dce is per final destination.
   3859  */
   3860 uint_t
   3861 ip_get_pmtu(ip_xmit_attr_t *ixa)
   3862 {
   3863 	ip_stack_t	*ipst = ixa->ixa_ipst;
   3864 	dce_t		*dce;
   3865 	nce_t		*nce;
   3866 	ire_t		*ire;
   3867 	uint_t		pmtu;
   3868 
   3869 	ire = ixa->ixa_ire;
   3870 	dce = ixa->ixa_dce;
   3871 	nce = ixa->ixa_nce;
   3872 
   3873 	/*
   3874 	 * If path MTU discovery has been turned off by ndd, then we ignore
   3875 	 * any dce_pmtu and for IPv4 we will not set DF.
   3876 	 */
   3877 	if (!ipst->ips_ip_path_mtu_discovery)
   3878 		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
   3879 
   3880 	pmtu = IP_MAXPACKET;
   3881 	/*
   3882 	 * Decide whether whether IPv4 sets DF
   3883 	 * For IPv6 "no DF" means to use the 1280 mtu
   3884 	 */
   3885 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3886 		ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3887 	} else {
   3888 		ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3889 		if (!(ixa->ixa_flags & IXAF_IS_IPV4))
   3890 			pmtu = IPV6_MIN_MTU;
   3891 	}
   3892 
   3893 	/* Check if the PMTU is to old before we use it */
   3894 	if ((dce->dce_flags & DCEF_PMTU) &&
   3895 	    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
   3896 	    ipst->ips_ip_pathmtu_interval) {
   3897 		/*
   3898 		 * Older than 20 minutes. Drop the path MTU information.
   3899 		 */
   3900 		mutex_enter(&dce->dce_lock);
   3901 		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
   3902 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   3903 		mutex_exit(&dce->dce_lock);
   3904 		dce_increment_generation(dce);
   3905 	}
   3906 
   3907 	/* The metrics on the route can lower the path MTU */
   3908 	if (ire->ire_metrics.iulp_mtu != 0 &&
   3909 	    ire->ire_metrics.iulp_mtu < pmtu)
   3910 		pmtu = ire->ire_metrics.iulp_mtu;
   3911 
   3912 	/*
   3913 	 * If the path MTU is smaller than some minimum, we still use dce_pmtu
   3914 	 * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
   3915 	 * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
   3916 	 */
   3917 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3918 		if (dce->dce_flags & DCEF_PMTU) {
   3919 			if (dce->dce_pmtu < pmtu)
   3920 				pmtu = dce->dce_pmtu;
   3921 
   3922 			if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
   3923 				ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
   3924 				ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3925 			} else {
   3926 				ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3927 				ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3928 			}
   3929 		} else {
   3930 			ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3931 			ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3932 		}
   3933 	}
   3934 
   3935 	/*
   3936 	 * If we have an IRE_LOCAL we use the loopback mtu instead of
   3937 	 * the ill for going out the wire i.e., IRE_LOCAL gets the same
   3938 	 * mtu as IRE_LOOPBACK.
   3939 	 */
   3940 	if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
   3941 		uint_t loopback_mtu;
   3942 
   3943 		loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
   3944 		    ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
   3945 
   3946 		if (loopback_mtu < pmtu)
   3947 			pmtu = loopback_mtu;
   3948 	} else if (nce != NULL) {
   3949 		/*
   3950 		 * Make sure we don't exceed the interface MTU.
   3951 		 * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
   3952 		 * an ill. We'd use the above IP_MAXPACKET in that case just
   3953 		 * to tell the transport something larger than zero.
   3954 		 */
   3955 		if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
   3956 			pmtu = nce->nce_common->ncec_ill->ill_mtu;
   3957 		if (nce->nce_common->ncec_ill != nce->nce_ill &&
   3958 		    nce->nce_ill->ill_mtu < pmtu) {
   3959 			/*
   3960 			 * for interfaces in an IPMP group, the mtu of
   3961 			 * the nce_ill (under_ill) could be different
   3962 			 * from the mtu of the ncec_ill, so we take the
   3963 			 * min of the two.
   3964 			 */
   3965 			pmtu = nce->nce_ill->ill_mtu;
   3966 		}
   3967 	}
   3968 
   3969 	/*
   3970 	 * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
   3971 	 * Only applies to IPv6.
   3972 	 */
   3973 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   3974 		if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
   3975 			switch (ixa->ixa_use_min_mtu) {
   3976 			case IPV6_USE_MIN_MTU_MULTICAST:
   3977 				if (ire->ire_type & IRE_MULTICAST)
   3978 					pmtu = IPV6_MIN_MTU;
   3979 				break;
   3980 			case IPV6_USE_MIN_MTU_ALWAYS:
   3981 				pmtu = IPV6_MIN_MTU;
   3982 				break;
   3983 			case IPV6_USE_MIN_MTU_NEVER:
   3984 				break;
   3985 			}
   3986 		} else {
   3987 			/* Default is IPV6_USE_MIN_MTU_MULTICAST */
   3988 			if (ire->ire_type & IRE_MULTICAST)
   3989 				pmtu = IPV6_MIN_MTU;
   3990 		}
   3991 	}
   3992 
   3993 	/*
   3994 	 * After receiving an ICMPv6 "packet too big" message with a
   3995 	 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
   3996 	 * will insert a 8-byte fragment header in every packet. We compensate
   3997 	 * for those cases by returning a smaller path MTU to the ULP.
   3998 	 *
   3999 	 * In the case of CGTP then ip_output will add a fragment header.
   4000 	 * Make sure there is room for it by telling a smaller number
   4001 	 * to the transport.
   4002 	 *
   4003 	 * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here
   4004 	 * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu()
   4005 	 * which is the size of the packets it can send.
   4006 	 */
   4007 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   4008 		if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
   4009 		    (ire->ire_flags & RTF_MULTIRT) ||
   4010 		    (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
   4011 			pmtu -= sizeof (ip6_frag_t);
   4012 			ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
   4013 		}
   4014 	}
   4015 
   4016 	return (pmtu);
   4017 }
   4018 
   4019 /*
   4020  * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
   4021  * the final piece where we don't.  Return a pointer to the first mblk in the
   4022  * result, and update the pointer to the next mblk to chew on.  If anything
   4023  * goes wrong (i.e., dupb fails), we waste everything in sight and return a
   4024  * NULL pointer.
   4025  */
   4026 mblk_t *
   4027 ip_carve_mp(mblk_t **mpp, ssize_t len)
   4028 {
   4029 	mblk_t	*mp0;
   4030 	mblk_t	*mp1;
   4031 	mblk_t	*mp2;
   4032 
   4033 	if (!len || !mpp || !(mp0 = *mpp))
   4034 		return (NULL);
   4035 	/* If we aren't going to consume the first mblk, we need a dup. */
   4036 	if (mp0->b_wptr - mp0->b_rptr > len) {
   4037 		mp1 = dupb(mp0);
   4038 		if (mp1) {
   4039 			/* Partition the data between the two mblks. */
   4040 			mp1->b_wptr = mp1->b_rptr + len;
   4041 			mp0->b_rptr = mp1->b_wptr;
   4042 			/*
   4043 			 * after adjustments if mblk not consumed is now
   4044 			 * unaligned, try to align it. If this fails free
   4045 			 * all messages and let upper layer recover.
   4046 			 */
   4047 			if (!OK_32PTR(mp0->b_rptr)) {
   4048 				if (!pullupmsg(mp0, -1)) {
   4049 					freemsg(mp0);
   4050 					freemsg(mp1);
   4051 					*mpp = NULL;
   4052 					return (NULL);
   4053 				}
   4054 			}
   4055 		}
   4056 		return (mp1);
   4057 	}
   4058 	/* Eat through as many mblks as we need to get len bytes. */
   4059 	len -= mp0->b_wptr - mp0->b_rptr;
   4060 	for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
   4061 		if (mp2->b_wptr - mp2->b_rptr > len) {
   4062 			/*
   4063 			 * We won't consume the entire last mblk.  Like
   4064 			 * above, dup and partition it.
   4065 			 */
   4066 			mp1->b_cont = dupb(mp2);
   4067 			mp1 = mp1->b_cont;
   4068 			if (!mp1) {
   4069 				/*
   4070 				 * Trouble.  Rather than go to a lot of
   4071 				 * trouble to clean up, we free the messages.
   4072 				 * This won't be any worse than losing it on
   4073 				 * the wire.
   4074 				 */
   4075 				freemsg(mp0);
   4076 				freemsg(mp2);
   4077 				*mpp = NULL;
   4078 				return (NULL);
   4079 			}
   4080 			mp1->b_wptr = mp1->b_rptr + len;
   4081 			mp2->b_rptr = mp1->b_wptr;
   4082 			/*
   4083 			 * after adjustments if mblk not consumed is now
   4084 			 * unaligned, try to align it. If this fails free
   4085 			 * all messages and let upper layer recover.
   4086 			 */
   4087 			if (!OK_32PTR(mp2->b_rptr)) {
   4088 				if (!pullupmsg(mp2, -1)) {
   4089 					freemsg(mp0);
   4090 					freemsg(mp2);
   4091 					*mpp = NULL;
   4092 					return (NULL);
   4093 				}
   4094 			}
   4095 			*mpp = mp2;
   4096 			return (mp0);
   4097 		}
   4098 		/* Decrement len by the amount we just got. */
   4099 		len -= mp2->b_wptr - mp2->b_rptr;
   4100 	}
   4101 	/*
   4102 	 * len should be reduced to zero now.  If not our caller has
   4103 	 * screwed up.
   4104 	 */
   4105 	if (len) {
   4106 		/* Shouldn't happen! */
   4107 		freemsg(mp0);
   4108 		*mpp = NULL;
   4109 		return (NULL);
   4110 	}
   4111 	/*
   4112 	 * We consumed up to exactly the end of an mblk.  Detach the part
   4113 	 * we are returning from the rest of the chain.
   4114 	 */
   4115 	mp1->b_cont = NULL;
   4116 	*mpp = mp2;
   4117 	return (mp0);
   4118 }
   4119 
/*
 * Close routine used when the ill stream is being unplumbed. Called from
 * ip_close() for a module close.
 *
 * Enters the ipsq, marks the ill and all of its ipifs condemned, brings the
 * ill down with ill_delete()/ill_delete_tail() once it is quiescent, drains
 * the conns on the TX fanout lists, and finally frees the ill instance data
 * and clears q_ptr on both queues of the stream.
 *
 * Always returns 0.
 */
int
ip_modclose(ill_t *ill)
{
	boolean_t success;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	queue_t	*q = ill->ill_rq;
	ip_stack_t	*ipst = ill->ill_ipst;
	int	i;
	arl_ill_common_t *ai = ill->ill_common;

	/*
	 * The punlink prior to this may have initiated a capability
	 * negotiation. But ipsq_enter will block until that finishes or
	 * times out.
	 */
	success = ipsq_enter(ill, B_FALSE, NEW_OP);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill
	 * and caused ipsq_enter to return failure.
	 */
	ASSERT(success);
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns an error if ILL_CONDEMNED is set.
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Send all the deferred DLPI messages downstream which came in
	 * during the small window right before ipsq_enter(). We do this
	 * without waiting for the ACKs because all the ACKs for M_PROTO
	 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
	 */
	ill_dlpi_send_deferred(ill);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer.
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_freeable checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away. Then we call ill_delete_tail which does the
	 * DL_UNBIND_REQ with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	while (!ill_is_freeable(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/*
	 * ill_delete_tail drops the reference on ill_ipst, but we need to
	 * keep it held until the end of the function since the cleanup
	 * below needs to be able to use the ip_stack_t. This is why ipst
	 * was saved in a local at the top.
	 */
	netstack_hold(ipst->ips_netstack);

	/* qprocsoff is done via ill_delete_tail */
	ill_delete_tail(ill);
	/*
	 * Synchronously wait for the arp stream to unbind. After this, we
	 * cannot get any data packets up from the driver.
	 */
	arp_unbind_complete(ill);
	ASSERT(ill->ill_ipst == NULL);

	/*
	 * Walk through all conns and qenable those that have queued data.
	 * Close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	for (i = 0; i < TX_FANOUT_SIZE; i++) {
		conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
	}

	/*
	 * ai can be null if this is an IPv6 ill, or if the IPv4
	 * stream is being torn down before ARP was plumbed (e.g.,
	 * /sbin/ifconfig plumbing a stream twice, and encountering
	 * an error).
	 *
	 * The ill side detaches itself from the shared structure here.
	 * If no arl is attached any more (ai_arl == NULL) the structure
	 * is freed; otherwise ai_ill_unplumb_done is signalled and the
	 * structure is left in place.
	 */
	if (ai != NULL) {
		ASSERT(!ill->ill_isv6);
		mutex_enter(&ai->ai_lock);
		ai->ai_ill = NULL;
		if (ai->ai_arl == NULL) {
			mutex_destroy(&ai->ai_lock);
			kmem_free(ai, sizeof (*ai));
		} else {
			cv_signal(&ai->ai_ill_unplumb_done);
			mutex_exit(&ai->ai_lock);
		}
	}

	/* Unlink the ill instance data from the per-stack ips_ip_g_head. */
	mutex_enter(&ipst->ips_ip_mi_lock);
	mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
	mutex_exit(&ipst->ips_ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	/* Destroy the synchronization objects and list embedded in the ill. */
	mutex_destroy(&ill->ill_saved_ire_lock);
	mutex_destroy(&ill->ill_lock);
	rw_destroy(&ill->ill_mcast_lock);
	mutex_destroy(&ill->ill_mcast_serializer);
	list_destroy(&ill->ill_nce);

	/*
	 * Now we are done with the module close pieces that
	 * need the netstack_t.
	 */
	netstack_rele(ipst->ips_netstack);

	/* Free the instance data; ill must not be dereferenced after this. */
	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	/* ipsq was saved in a local above, so it is still usable here. */
	ipsq_exit(ipsq);

	return (0);
}
   4280 
   4281 /*
   4282  * This is called as part of close() for IP, UDP, ICMP, and RTS
   4283  * in order to quiesce the conn.
   4284  */
   4285 void
   4286 ip_quiesce_conn(conn_t *connp)
   4287 {
   4288 	boolean_t	drain_cleanup_reqd = B_FALSE;
   4289 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
   4290 	boolean_t	ilg_cleanup_reqd = B_FALSE;
   4291 	ip_stack_t	*ipst;
   4292 
   4293 	ASSERT(!IPCL_IS_TCP(connp));
   4294 	ipst = connp->conn_netstack->netstack_ip;
   4295 
   4296 	/*
   4297 	 * Mark the conn as closing, and this conn must not be
   4298 	 * inserted in future into any list. Eg. conn_drain_insert(),
   4299 	 * won't insert this conn into the conn_drain_list.
   4300 	 *
   4301 	 * conn_idl, and conn_ilg cannot get set henceforth.
   4302 	 */
   4303 	mutex_enter(&connp->conn_lock);
   4304 	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
   4305 	connp->conn_state_flags |= CONN_CLOSING;
   4306 	if (connp->conn_idl != NULL)
   4307 		drain_cleanup_reqd = B_TRUE;
   4308 	if (connp->conn_oper_pending_ill != NULL)
   4309 		conn_ioctl_cleanup_reqd = B_TRUE;
   4310 	if (connp->conn_dhcpinit_ill != NULL) {
   4311 		ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
   4312 		atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
   4313 		ill_set_inputfn(connp->conn_dhcpinit_ill);
   4314 		connp->conn_dhcpinit_ill = NULL;
   4315 	}
   4316 	if (connp->conn_ilg != NULL)
   4317 		ilg_cleanup_reqd = B_TRUE;
   4318 	mutex_exit(&connp->conn_lock);
   4319 
   4320 	if (conn_ioctl_cleanup_reqd)
   4321 		conn_ioctl_cleanup(connp);
   4322 
   4323 	if (is_system_labeled() && connp->conn_anon_port) {
   4324 		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   4325 		    connp->conn_mlp_type, connp->conn_proto,
   4326 		    ntohs(connp->conn_lport), B_FALSE);
   4327 		connp->conn_anon_port = 0;
   4328 	}
   4329 	connp->conn_mlp_type = mlptSingle;
   4330 
   4331 	/*
   4332 	 * Remove this conn from any fanout list it is on.
   4333 	 * and then wait for any threads currently operating
   4334 	 * on this endpoint to finish
   4335 	 */
   4336 	ipcl_hash_remove(connp);
   4337 
   4338 	/*
   4339 	 * Remove this conn from the drain list, and do
   4340 	 * any other cleanup that may be required.
   4341 	 * (Only non-tcp conns may have a non-null conn_idl.
   4342 	 * TCP conns are never flow controlled, and
   4343 	 * conn_idl will be null)
   4344 	 */
   4345 	if (drain_cleanup_reqd && connp->conn_idl != NULL) {
   4346 		mutex_enter(&connp->conn_idl->idl_lock);
   4347 		conn_drain_tail(connp, B_TRUE);
   4348 		mutex_exit(&connp->conn_idl->idl_lock);
   4349 	}
   4350 
   4351 	if (connp == ipst->ips_ip_g_mrouter)
   4352 		(void) ip_mrouter_done(ipst);
   4353 
   4354 	if (ilg_cleanup_reqd)
   4355 		ilg_delete_all(connp);
   4356 
   4357 	/*
   4358 	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
   4359 	 * callers from write side can't be there now because close
   4360 	 * is in progress. The only other caller is ipcl_walk
   4361 	 * which checks for the condemned flag.
   4362 	 */
   4363 	mutex_enter(&connp->conn_lock);
   4364 	connp->conn_state_flags |= CONN_CONDEMNED;
   4365 	while (connp->conn_ref != 1)
   4366 		cv_wait(&connp->conn_cv, &connp->conn_lock);
   4367 	connp->conn_state_flags |= CONN_QUIESCED;
   4368 	mutex_exit(&connp->conn_lock);
   4369 }
   4370 
   4371 /* ARGSUSED */
   4372 int
   4373 ip_close(queue_t *q, int flags)
   4374 {
   4375 	conn_t		*connp;
   4376 
   4377 	/*
   4378 	 * Call the appropriate delete routine depending on whether this is
   4379 	 * a module or device.
   4380 	 */
   4381 	if (WR(q)->q_next != NULL) {
   4382 		/* This is a module close */
   4383 		return (ip_modclose((ill_t *)q->q_ptr));
   4384 	}
   4385 
   4386 	connp = q->q_ptr;
   4387 	ip_quiesce_conn(connp);
   4388 
   4389 	qprocsoff(q);
   4390 
   4391 	/*
   4392 	 * Now we are truly single threaded on this stream, and can
   4393 	 * delete the things hanging off the connp, and finally the connp.
   4394 	 * We removed this connp from the fanout list, it cannot be
   4395 	 * accessed thru the fanouts, and we already waited for the
   4396 	 * conn_ref to drop to 0. We are already in close, so
   4397 	 * there cannot be any other thread from the top. qprocsoff
   4398 	 * has completed, and service has completed or won't run in
   4399 	 * future.
   4400 	 */
   4401 	ASSERT(connp->conn_ref == 1);
   4402 
   4403 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   4404 
   4405 	connp->conn_ref--;
   4406 	ipcl_conn_destroy(connp);
   4407 
   4408 	q->q_ptr = WR(q)->q_ptr = NULL;
   4409 	return (0);
   4410 }
   4411 
   4412 /*
   4413  * Wapper around putnext() so that ip_rts_request can merely use
   4414  * conn_recv.
   4415  */
   4416 /*ARGSUSED2*/
   4417 static void
   4418 ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4419 {
   4420 	conn_t *connp = (conn_t *)arg1;
   4421 
   4422 	putnext(connp->conn_rq, mp);
   4423 }
   4424 
/*
 * Dummy in case ICMP error delivery is attempted to a /dev/ip instance.
 * The message is simply freed; arg1, arg2 and ira are unused.
 */
/* ARGSUSED */
static void
ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	freemsg(mp);
}
   4432 
/*
 * Called when the module is about to be unloaded.
 *
 * Undoes the global (not per-stack) setup performed by ip_ddi_init():
 * tears down the global state of the protocols and subsystems, destroys
 * the minor number arenas and, last, unregisters IP from the netstack
 * framework.
 */
void
ip_ddi_destroy(void)
{
	tnet_fini();

	/* Global teardown of the individual protocols and subsystems. */
	icmp_ddi_g_destroy();
	rts_ddi_g_destroy();
	udp_ddi_g_destroy();
	sctp_ddi_g_destroy();
	tcp_ddi_g_destroy();
	ilb_ddi_g_destroy();
	dce_g_destroy();
	ipsec_policy_g_destroy();
	ipcl_g_destroy();
	ip_net_g_destroy();
	ip_ire_g_fini();
	/* The second ("la") minor arena only exists on 64-bit kernels. */
	inet_minor_destroy(ip_minor_arena_sa);
#if defined(_LP64)
	inet_minor_destroy(ip_minor_arena_la);
#endif

#ifdef DEBUG
	/* Thread tracking state that ip_ddi_init() sets up for DEBUG only. */
	list_destroy(&ip_thread_list);
	rw_destroy(&ip_thread_rwlock);
	tsd_destroy(&ip_thread_data);
#endif

	netstack_unregister(NS_IP);
}
   4465 
/*
 * First step in cleanup of an IP stack instance. Registered as the
 * shutdown callback with netstack_register() in ip_ddi_init(); arg is the
 * ip_stack_t of the instance.
 *
 * Nothing is freed here: special interfaces are cleaned up, hook consumers
 * are notified, and the capability taskq dispatcher is told to quit.
 */
/* ARGSUSED */
static void
ip_stack_shutdown(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;

#ifdef NS_DEBUG
	printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
#endif

	/*
	 * Perform cleanup for special interfaces (loopback and IPMP).
	 */
	ip_interface_cleanup(ipst);

	/*
	 * The *_hook_shutdown()s start the process of notifying any
	 * consumers that things are going away.... nothing is destroyed.
	 */
	ipv4_hook_shutdown(ipst);
	ipv6_hook_shutdown(ipst);
	arp_hook_shutdown(ipst);

	/*
	 * Set the quit flag under the taskq lock and signal the cv so that
	 * a waiter on ips_capab_taskq_cv notices it.
	 */
	mutex_enter(&ipst->ips_capab_taskq_lock);
	ipst->ips_capab_taskq_quit = B_TRUE;
	cv_signal(&ipst->ips_capab_taskq_cv);
	mutex_exit(&ipst->ips_capab_taskq_lock);
}
   4497 
/*
 * Free the IP stack instance. Registered as the destroy callback with
 * netstack_register() in ip_ddi_init(); arg is the ip_stack_t of the
 * instance, which is freed at the end of this function.
 *
 * Releases what ip_stack_init() set up: hooks, kstats, ndd parameter
 * copies, locks, timers and the per-stack tables.
 */
static void
ip_stack_fini(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;
	int ret;

#ifdef NS_DEBUG
	printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
#endif
	/*
	 * At this point, all of the notifications that the events and
	 * protocols are going away have been run, meaning that we can
	 * now set about starting to clean things up.
	 */
	ipobs_fini(ipst);
	ipv4_hook_destroy(ipst);
	ipv6_hook_destroy(ipst);
	arp_hook_destroy(ipst);
	ip_net_destroy(ipst);

	/* Synchronization objects of the capability taskq dispatcher. */
	mutex_destroy(&ipst->ips_capab_taskq_lock);
	cv_destroy(&ipst->ips_capab_taskq_cv);

	ipmp_destroy(ipst);
	rw_destroy(&ipst->ips_srcid_lock);

	/* Remove the kstats and clear the pointers/statistics behind them. */
	ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
	ipst->ips_ip_mibkp = NULL;
	icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
	ipst->ips_icmp_mibkp = NULL;
	ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
	ipst->ips_ip_kstat = NULL;
	bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
	ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
	ipst->ips_ip6_kstat = NULL;
	bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));

	/*
	 * Free the ndd table and the per-stack copies of lcl_param_arr and
	 * lcl_ndp_arr (sized by those templates).
	 */
	nd_free(&ipst->ips_ip_g_nd);
	kmem_free(ipst->ips_param_arr, sizeof (lcl_param_arr));
	ipst->ips_param_arr = NULL;
	kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
	ipst->ips_ndp_arr = NULL;

	dce_stack_destroy(ipst);
	ip_mrouter_stack_destroy(ipst);

	mutex_destroy(&ipst->ips_ip_mi_lock);
	rw_destroy(&ipst->ips_ill_g_usesrc_lock);
	rw_destroy(&ipst->ips_ip_g_nd_lock);

	/*
	 * Cancel the IGMP and MLD timers. For each one the ASSERTs encode
	 * the expectation that untimeout() returns -1 exactly when no
	 * timeout id was recorded (id == 0); otherwise the id is reset.
	 */
	ret = untimeout(ipst->ips_igmp_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_igmp_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
		ipst->ips_igmp_slowtimeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_slowtimeout_id != 0);
		ipst->ips_mld_slowtimeout_id = 0;
	}

	/* The timers are cancelled; their locks can go now. */
	mutex_destroy(&ipst->ips_igmp_timer_lock);
	mutex_destroy(&ipst->ips_mld_timer_lock);
	mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
	mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
	mutex_destroy(&ipst->ips_ip_addr_avail_lock);
	rw_destroy(&ipst->ips_ill_g_lock);

	ip_ire_fini(ipst);
	ip6_asp_free(ipst);
	conn_drain_fini(ipst);
	ipcl_destroy(ipst);

	/* Destroy the locks inside the ndp tables before freeing them. */
	mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
	mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
	kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
	ipst->ips_ndp4 = NULL;
	kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
	ipst->ips_ndp6 = NULL;

	if (ipst->ips_loopback_ksp != NULL) {
		kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
		ipst->ips_loopback_ksp = NULL;
	}

	/* Sizes match the allocations made in ip_stack_init(). */
	kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
	ipst->ips_phyint_g_list = NULL;
	kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
	ipst->ips_ill_g_heads = NULL;

	ldi_ident_release(ipst->ips_ldi_ident);
	/* Finally free the stack instance itself. */
	kmem_free(ipst, sizeof (*ipst));
}
   4612 
/*
 * This function is called from the TSD destructor, and is used to debug
 * reference count issues in IP. See block comment in <inet/ip_if.h> for
 * details. It is installed with tsd_create() in ip_ddi_init() under DEBUG.
 *
 * phash is the th_hash_t of the exiting thread: it is unlinked from
 * ip_thread_list, its mod_hash is destroyed and the structure is freed.
 */
static void
ip_thread_exit(void *phash)
{
	th_hash_t *thh = phash;

	/* The list is modified, so the rwlock is taken as writer. */
	rw_enter(&ip_thread_rwlock, RW_WRITER);
	list_remove(&ip_thread_list, thh);
	rw_exit(&ip_thread_rwlock);
	/* No longer reachable through the list; safe to tear down. */
	mod_hash_destroy_hash(thh->thh_hash);
	kmem_free(thh, sizeof (*thh));
}
   4629 
   4630 /*
   4631  * Called when the IP kernel module is loaded into the kernel
   4632  */
   4633 void
   4634 ip_ddi_init(void)
   4635 {
   4636 	ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
   4637 
   4638 	/*
   4639 	 * For IP and TCP the minor numbers should start from 2 since we have 4
   4640 	 * initial devices: ip, ip6, tcp, tcp6.
   4641 	 */
   4642 	/*
   4643 	 * If this is a 64-bit kernel, then create two separate arenas -
   4644 	 * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the
   4645 	 * other for socket apps in the range 2^^18 through 2^^32-1.
   4646 	 */
   4647 	ip_minor_arena_la = NULL;
   4648 	ip_minor_arena_sa = NULL;
   4649 #if defined(_LP64)
   4650 	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
   4651 	    INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
   4652 		cmn_err(CE_PANIC,
   4653 		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
   4654 	}
   4655 	if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
   4656 	    MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
   4657 		cmn_err(CE_PANIC,
   4658 		    "ip_ddi_init: ip_minor_arena_la creation failed\n");
   4659 	}
   4660 #else
   4661 	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
   4662 	    INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
   4663 		cmn_err(CE_PANIC,
   4664 		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
   4665 	}
   4666 #endif
   4667 	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
   4668 
   4669 	ipcl_g_init();
   4670 	ip_ire_g_init();
   4671 	ip_net_g_init();
   4672 
   4673 #ifdef DEBUG
   4674 	tsd_create(&ip_thread_data, ip_thread_exit);
   4675 	rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
   4676 	list_create(&ip_thread_list, sizeof (th_hash_t),
   4677 	    offsetof(th_hash_t, thh_link));
   4678 #endif
   4679 	ipsec_policy_g_init();
   4680 	tcp_ddi_g_init();
   4681 	sctp_ddi_g_init();
   4682 	dce_g_init();
   4683 
   4684 	/*
   4685 	 * We want to be informed each time a stack is created or
   4686 	 * destroyed in the kernel, so we can maintain the
   4687 	 * set of udp_stack_t's.
   4688 	 */
   4689 	netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
   4690 	    ip_stack_fini);
   4691 
   4692 	tnet_init();
   4693 
   4694 	udp_ddi_g_init();
   4695 	rts_ddi_g_init();
   4696 	icmp_ddi_g_init();
   4697 	ilb_ddi_g_init();
   4698 }
   4699 
   4700 /*
   4701  * Initialize the IP stack instance.
   4702  */
   4703 static void *
   4704 ip_stack_init(netstackid_t stackid, netstack_t *ns)
   4705 {
   4706 	ip_stack_t	*ipst;
   4707 	ipparam_t	*pa;
   4708 	ipndp_t		*na;
   4709 	major_t		major;
   4710 
   4711 #ifdef NS_DEBUG
   4712 	printf("ip_stack_init(stack %d)\n", stackid);
   4713 #endif
   4714 
   4715 	ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
   4716 	ipst->ips_netstack = ns;
   4717 
   4718 	ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
   4719 	    KM_SLEEP);
   4720 	ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
   4721 	    KM_SLEEP);
   4722 	ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4723 	ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4724 	mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4725 	mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4726 
   4727 	rw_init(&ipst->ips_ip_g_nd_lock, NULL, RW_DEFAULT, NULL);
   4728 	mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4729 	ipst->ips_igmp_deferred_next = INFINITY;
   4730 	mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4731 	ipst->ips_mld_deferred_next = INFINITY;
   4732 	mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4733 	mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4734 	mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
   4735 	mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
   4736 	rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
   4737 	rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
   4738 
   4739 	ipcl_init(ipst);
   4740 	ip_ire_init(ipst);
   4741 	ip6_asp_init(ipst);
   4742 	ipif_init(ipst);
   4743 	conn_drain_init(ipst);
   4744 	ip_mrouter_stack_init(ipst);
   4745 	dce_stack_init(ipst);
   4746 
   4747 	ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT;
   4748 	ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
   4749 	ipst->ips_ipv6_frag_timeout = IPV6_FRAG_TIMEOUT;
   4750 	ipst->ips_ipv6_frag_timo_ms = IPV6_FRAG_TIMEOUT * 1000;
   4751 
   4752 	ipst->ips_ip_multirt_log_interval = 1000;
   4753 
   4754 	ipst->ips_ip_g_forward = IP_FORWARD_DEFAULT;
   4755 	ipst->ips_ipv6_forward = IP_FORWARD_DEFAULT;
   4756 	ipst->ips_ill_index = 1;
   4757 
   4758 	ipst->ips_saved_ip_g_forward = -1;
   4759 	ipst->ips_reg_vif_num = ALL_VIFS; 	/* Index to Register vif */
   4760 
   4761 	pa = (ipparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
   4762 	ipst->ips_param_arr = pa;
   4763 	bcopy(lcl_param_arr, ipst->ips_param_arr, sizeof (lcl_param_arr));
   4764 
   4765 	na = (ipndp_t *)kmem_alloc(sizeof (lcl_ndp_arr), KM_SLEEP);
   4766 	ipst->ips_ndp_arr = na;
   4767 	bcopy(lcl_ndp_arr, ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
   4768 	ipst->ips_ndp_arr[IPNDP_IP_FORWARDING_OFFSET].ip_ndp_data =
   4769 	    (caddr_t)&ipst->ips_ip_g_forward;
   4770 	ipst->ips_ndp_arr[IPNDP_IP6_FORWARDING_OFFSET].ip_ndp_data =
   4771 	    (caddr_t)&ipst->ips_ipv6_forward;
   4772 	ASSERT(strcmp(ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_name,
   4773 	    "ip_cgtp_filter") == 0);
   4774 	ipst->ips_ndp_arr[IPNDP_CGTP_FILTER_OFFSET].ip_ndp_data =
   4775 	    (caddr_t)&ipst->ips_ip_cgtp_filter;
   4776 
   4777 	(void) ip_param_register(&ipst->ips_ip_g_nd,
   4778 	    ipst->ips_param_arr, A_CNT(lcl_param_arr),
   4779 	    ipst->ips_ndp_arr, A_CNT(lcl_ndp_arr));
   4780 
   4781 	ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
   4782 	ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
   4783 	ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
   4784 	ipst->ips_ip6_kstat =
   4785 	    ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
   4786 
   4787 	ipst->ips_ip_src_id = 1;
   4788 	rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
   4789 
   4790 	ipst->ips_src_generation = SRC_GENERATION_INITIAL;
   4791 
   4792 	ip_net_init(ipst, ns);
   4793 	ipv4_hook_init(ipst);
   4794 	ipv6_hook_init(ipst);
   4795 	arp_hook_init(ipst);
   4796 	ipmp_init(ipst);
   4797 	ipobs_init(ipst);
   4798 
   4799 	/*
   4800 	 * Create the taskq dispatcher thread and initialize related stuff.
   4801 	 */
   4802 	ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
   4803 	    ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
   4804 	mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
   4805 	cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
   4806 
   4807 	major = mod_name_to_major(INET_NAME);
   4808 	(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
   4809 	return (ipst);
   4810 }
   4811 
   4812 /*
   4813  * Allocate and initialize a DLPI template of the specified length.  (May be
   4814  * called as writer.)
   4815  */
   4816 mblk_t *
   4817 ip_dlpi_alloc(size_t len, t_uscalar_t prim)
   4818 {
   4819 	mblk_t	*mp;
   4820 
   4821 	mp = allocb(len, BPRI_MED);
   4822 	if (!mp)
   4823 		return (NULL);
   4824 
   4825 	/*
   4826 	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
   4827 	 * of which we don't seem to use) are sent with M_PCPROTO, and
   4828 	 * that other DLPI are M_PROTO.
   4829 	 */
   4830 	if (prim == DL_INFO_REQ) {
   4831 		mp->b_datap->db_type = M_PCPROTO;
   4832 	} else {
   4833 		mp->b_datap->db_type = M_PROTO;
   4834 	}
   4835 
   4836 	mp->b_wptr = mp->b_rptr + len;
   4837 	bzero(mp->b_rptr, len);
   4838 	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
   4839 	return (mp);
   4840 }
   4841 
   4842 /*
   4843  * Allocate and initialize a DLPI notification.  (May be called as writer.)
   4844  */
   4845 mblk_t *
   4846 ip_dlnotify_alloc(uint_t notification, uint_t data)
   4847 {
   4848 	dl_notify_ind_t	*notifyp;
   4849 	mblk_t		*mp;
   4850 
   4851 	if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
   4852 		return (NULL);
   4853 
   4854 	notifyp = (dl_notify_ind_t *)mp->b_rptr;
   4855 	notifyp->dl_notification = notification;
   4856 	notifyp->dl_data = data;
   4857 	return (mp);
   4858 }
   4859 
   4860 /*
   4861  * Debug formatting routine.  Returns a character string representation of the
   4862  * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
   4863  * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
   4864  *
   4865  * Once the ndd table-printing interfaces are removed, this can be changed to
   4866  * standard dotted-decimal form.
   4867  */
   4868 char *
   4869 ip_dot_addr(ipaddr_t addr, char *buf)
   4870 {
   4871 	uint8_t *ap = (uint8_t *)&addr;
   4872 
   4873 	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
   4874 	    ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
   4875 	return (buf);
   4876 }
   4877 
   4878 /*
   4879  * Write the given MAC address as a printable string in the usual colon-
   4880  * separated format.
   4881  */
   4882 const char *
   4883 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
   4884 {
   4885 	char *bp;
   4886 
   4887 	if (alen == 0 || buflen < 4)
   4888 		return ("?");
   4889 	bp = buf;
   4890 	for (;;) {
   4891 		/*
   4892 		 * If there are more MAC address bytes available, but we won't
   4893 		 * have any room to print them, then add "..." to the string
   4894 		 * instead.  See below for the 'magic number' explanation.
   4895 		 */
   4896 		if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) {
   4897 			(void) strcpy(bp, "...");
   4898 			break;
   4899 		}
   4900 		(void) sprintf(bp, "%02x", *addr++);
   4901 		bp += 2;
   4902 		if (--alen == 0)
   4903 			break;
   4904 		*bp++ = ':';
   4905 		buflen -= 3;
   4906 		/*
   4907 		 * At this point, based on the first 'if' statement above,
   4908 		 * either alen == 1 and buflen >= 3, or alen > 1 and
   4909 		 * buflen >= 4.  The first case leaves room for the final "xx"
   4910 		 * number and trailing NUL byte.  The second leaves room for at
   4911 		 * least "...".  Thus the apparently 'magic' numbers chosen for
   4912 		 * that statement.
   4913 		 */
   4914 	}
   4915 	return (buf);
   4916 }
   4917 
   4918 /*
   4919  * Called when it is conceptually a ULP that would sent the packet
   4920  * e.g., port unreachable and protocol unreachable. Check that the packet
   4921  * would have passed the IPsec global policy before sending the error.
   4922  *
   4923  * Send an ICMP error after patching up the packet appropriately.
   4924  * Uses ip_drop_input and bumps the appropriate MIB.
   4925  */
   4926 void
   4927 ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
   4928     ip_recv_attr_t *ira)
   4929 {
   4930 	ipha_t		*ipha;
   4931 	boolean_t	secure;
   4932 	ill_t		*ill = ira->ira_ill;
   4933 	ip_stack_t	*ipst = ill->ill_ipst;
   4934 	netstack_t	*ns = ipst->ips_netstack;
   4935 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   4936 
   4937 	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
   4938 
   4939 	/*
   4940 	 * We are generating an icmp error for some inbound packet.
   4941 	 * Called from all ip_fanout_(udp, tcp, proto) functions.
   4942 	 * Before we generate an error, check with global policy
   4943 	 * to see whether this is allowed to enter the system. As
   4944 	 * there is no "conn", we are checking with global policy.
   4945 	 */
   4946 	ipha = (ipha_t *)mp->b_rptr;
   4947 	if (secure || ipss->ipsec_inbound_v4_policy_present) {
   4948 		mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
   4949 		if (mp == NULL)
   4950 			return;
   4951 	}
   4952 
   4953 	/* We never send errors for protocols that we do implement */
   4954 	if (ira->ira_protocol == IPPROTO_ICMP ||
   4955 	    ira->ira_protocol == IPPROTO_IGMP) {
   4956 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   4957 		ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
   4958 		freemsg(mp);
   4959 		return;
   4960 	}
   4961 	/*
   4962 	 * Have to correct checksum since
   4963 	 * the packet might have been
   4964 	 * fragmented and the reassembly code in ip_rput
   4965 	 * does not restore the IP checksum.
   4966 	 */
   4967 	ipha->ipha_hdr_checksum = 0;
   4968 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   4969 
   4970 	switch (icmp_type) {
   4971 	case ICMP_DEST_UNREACHABLE:
   4972 		switch (icmp_code) {
   4973 		case ICMP_PROTOCOL_UNREACHABLE:
   4974 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
   4975 			ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
   4976 			break;
   4977 		case ICMP_PORT_UNREACHABLE:
   4978 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   4979 			ip_drop_input("ipIfStatsNoPorts", mp, ill);
   4980 			break;
   4981 		}
   4982 
   4983 		icmp_unreachable(mp, icmp_code, ira);
   4984 		break;
   4985 	default:
   4986 #ifdef DEBUG
   4987 		panic("ip_fanout_send_icmp_v4: wrong type");
   4988 		/*NOTREACHED*/
   4989 #else
   4990 		freemsg(mp);
   4991 		break;
   4992 #endif
   4993 	}
   4994 }
   4995 
   4996 /*
   4997  * Used to send an ICMP error message when a packet is received for
   4998  * a protocol that is not supported. The mblk passed as argument
   4999  * is consumed by this function.
   5000  */
   5001 void
   5002 ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
   5003 {
   5004 	ipha_t		*ipha;
   5005 
   5006 	ipha = (ipha_t *)mp->b_rptr;
   5007 	if (ira->ira_flags & IRAF_IS_IPV4) {
   5008 		ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
   5009 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   5010 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   5011 	} else {
   5012 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
   5013 		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
   5014 		    ICMP6_PARAMPROB_NEXTHEADER, ira);
   5015 	}
   5016 }
   5017 
   5018 /*
   5019  * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
   5020  * Handles IPv4 and IPv6.
   5021  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   5022  * Caller is responsible for dropping references to the conn.
   5023  */
   5024 void
   5025 ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   5026     ip_recv_attr_t *ira)
   5027 {
   5028 	ill_t		*ill = ira->ira_ill;
   5029 	ip_stack_t	*ipst = ill->ill_ipst;
   5030 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5031 	boolean_t	secure;
   5032 	uint_t		protocol = ira->ira_protocol;
   5033 	iaflags_t	iraflags = ira->ira_flags;
   5034 	queue_t		*rq;
   5035 
   5036 	secure = iraflags & IRAF_IPSEC_SECURE;
   5037 
   5038 	rq = connp->conn_rq;
   5039 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
   5040 		switch (protocol) {
   5041 		case IPPROTO_ICMPV6:
   5042 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
   5043 			break;
   5044 		case IPPROTO_ICMP:
   5045 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
   5046 			break;
   5047 		default:
   5048 			BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
   5049 			break;
   5050 		}
   5051 		freemsg(mp);
   5052 		return;
   5053 	}
   5054 
   5055 	ASSERT(!(IPCL_IS_IPTUN(connp)));
   5056 
   5057 	if (((iraflags & IRAF_IS_IPV4) ?
   5058 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   5059 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   5060 	    secure) {
   5061 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   5062 		    ip6h, ira);
   5063 		if (mp == NULL) {
   5064 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5065 			/* Note that mp is NULL */
   5066 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5067 			return;
   5068 		}
   5069 	}
   5070 
   5071 	if (iraflags & IRAF_ICMP_ERROR) {
   5072 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   5073 	} else {
   5074 		ill_t *rill = ira->ira_rill;
   5075 
   5076 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   5077 		ira->ira_ill = ira->ira_rill = NULL;
   5078 		/* Send it upstream */
   5079 		(connp->conn_recv)(connp, mp, NULL, ira);
   5080 		ira->ira_ill = ill;
   5081 		ira->ira_rill = rill;
   5082 	}
   5083 }
   5084 
   5085 /*
   5086  * Handle protocols with which IP is less intimate.  There
   5087  * can be more than one stream bound to a particular
   5088  * protocol.  When this is the case, normally each one gets a copy
   5089  * of any incoming packets.
   5090  *
   5091  * IPsec NOTE :
   5092  *
   5093  * Don't allow a secure packet going up a non-secure connection.
   5094  * We don't allow this because
   5095  *
   5096  * 1) Reply might go out in clear which will be dropped at
   5097  *    the sending side.
   5098  * 2) If the reply goes out in clear it will give the
   5099  *    adversary enough information for getting the key in
   5100  *    most of the cases.
   5101  *
   5102  * Moreover getting a secure packet when we expect clear
   5103  * implies that SA's were added without checking for
   5104  * policy on both ends. This should not happen once ISAKMP
   5105  * is used to negotiate SAs as SAs will be added only after
   5106  * verifying the policy.
   5107  *
   5108  * Zones notes:
   5109  * Earlier in ip_input on a system with multiple shared-IP zones we
   5110  * duplicate the multicast and broadcast packets and send them up
   5111  * with each explicit zoneid that exists on that ill.
   5112  * This means that here we can match the zoneid with SO_ALLZONES being special.
   5113  */
   5114 void
   5115 ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   5116 {
   5117 	mblk_t		*mp1;
   5118 	ipaddr_t	laddr;
   5119 	conn_t		*connp, *first_connp, *next_connp;
   5120 	connf_t		*connfp;
   5121 	ill_t		*ill = ira->ira_ill;
   5122 	ip_stack_t	*ipst = ill->ill_ipst;
   5123 
   5124 	laddr = ipha->ipha_dst;
   5125 
   5126 	connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
   5127 	mutex_enter(&connfp->connf_lock);
   5128 	connp = connfp->connf_head;
   5129 	for (connp = connfp->connf_head; connp != NULL;
   5130 	    connp = connp->conn_next) {
   5131 		/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   5132 		if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   5133 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5134 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
   5135 			break;
   5136 		}
   5137 	}
   5138 
   5139 	if (connp == NULL) {
   5140 		/*
   5141 		 * No one bound to these addresses.  Is
   5142 		 * there a client that wants all
   5143 		 * unclaimed datagrams?
   5144 		 */
   5145 		mutex_exit(&connfp->connf_lock);
   5146 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   5147 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   5148 		return;
   5149 	}
   5150 
   5151 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5152 
   5153 	CONN_INC_REF(connp);
   5154 	first_connp = connp;
   5155 	connp = connp->conn_next;
   5156 
   5157 	for (;;) {
   5158 		while (connp != NULL) {
   5159 			/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   5160 			if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   5161 			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5162 			    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5163 			    ira, connp)))
   5164 				break;
   5165 			connp = connp->conn_next;
   5166 		}
   5167 
   5168 		if (connp == NULL) {
   5169 			/* No more interested clients */
   5170 			connp = first_connp;
   5171 			break;
   5172 		}
   5173 		if (((mp1 = dupmsg(mp)) == NULL) &&
   5174 		    ((mp1 = copymsg(mp)) == NULL)) {
   5175 			/* Memory allocation failed */
   5176 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5177 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5178 			connp = first_connp;
   5179 			break;
   5180 		}
   5181 
   5182 		CONN_INC_REF(connp);
   5183 		mutex_exit(&connfp->connf_lock);
   5184 
   5185 		ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
   5186 		    ira);
   5187 
   5188 		mutex_enter(&connfp->connf_lock);
   5189 		/* Follow the next pointer before releasing the conn. */
   5190 		next_connp = connp->conn_next;
   5191 		CONN_DEC_REF(connp);
   5192 		connp = next_connp;
   5193 	}
   5194 
   5195 	/* Last one.  Send it upstream. */
   5196 	mutex_exit(&connfp->connf_lock);
   5197 
   5198 	ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
   5199 
   5200 	CONN_DEC_REF(connp);
   5201 }
   5202 
   5203 /*
   5204  * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or
   5205  * pass it along to ESP if the SPI is non-zero.  Returns the mblk if the mblk
   5206  * is not consumed.
   5207  *
   5208  * One of three things can happen, all of which affect the passed-in mblk:
   5209  *
   5210  * 1.) The packet is stock UDP and gets its zero-SPI stripped.  Return mblk..
   5211  *
   5212  * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
   5213  *     ESP packet, and is passed along to ESP for consumption.  Return NULL.
   5214  *
   5215  * 3.) The packet is an ESP-in-UDP Keepalive.  Drop it and return NULL.
   5216  */
   5217 mblk_t *
   5218 zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
   5219 {
   5220 	int shift, plen, iph_len;
   5221 	ipha_t *ipha;
   5222 	udpha_t *udpha;
   5223 	uint32_t *spi;
   5224 	uint32_t esp_ports;
   5225 	uint8_t *orptr;
   5226 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   5227 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5228 
   5229 	ipha = (ipha_t *)mp->b_rptr;
   5230 	iph_len = ira->ira_ip_hdr_length;
   5231 	plen = ira->ira_pktlen;
   5232 
   5233 	if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
   5234 		/*
   5235 		 * Most likely a keepalive for the benefit of an intervening
   5236 		 * NAT.  These aren't for us, per se, so drop it.
   5237 		 *
   5238 		 * RFC 3947/8 doesn't say for sure what to do for 2-3
   5239 		 * byte packets (keepalives are 1-byte), but we'll drop them
   5240 		 * also.
   5241 		 */
   5242 		ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5243 		    DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
   5244 		return (NULL);
   5245 	}
   5246 
   5247 	if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
   5248 		/* might as well pull it all up - it might be ESP. */
   5249 		if (!pullupmsg(mp, -1)) {
   5250 			ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5251 			    DROPPER(ipss, ipds_esp_nomem),
   5252 			    &ipss->ipsec_dropper);
   5253 			return (NULL);
   5254 		}
   5255 
   5256 		ipha = (ipha_t *)mp->b_rptr;
   5257 	}
   5258 	spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
   5259 	if (*spi == 0) {
   5260 		/* UDP packet - remove 0-spi. */
   5261 		shift = sizeof (uint32_t);
   5262 	} else {
   5263 		/* ESP-in-UDP packet - reduce to ESP. */
   5264 		ipha->ipha_protocol = IPPROTO_ESP;
   5265 		shift = sizeof (udpha_t);
   5266 	}
   5267 
   5268 	/* Fix IP header */
   5269 	ira->ira_pktlen = (plen - shift);
   5270 	ipha->ipha_length = htons(ira->ira_pktlen);
   5271 	ipha->ipha_hdr_checksum = 0;
   5272 
   5273 	orptr = mp->b_rptr;
   5274 	mp->b_rptr += shift;
   5275 
   5276 	udpha = (udpha_t *)(orptr + iph_len);
   5277 	if (*spi == 0) {
   5278 		ASSERT((uint8_t *)ipha == orptr);
   5279 		udpha->uha_length = htons(plen - shift - iph_len);
   5280 		iph_len += sizeof (udpha_t);	/* For the call to ovbcopy(). */
   5281 		esp_ports = 0;
   5282 	} else {
   5283 		esp_ports = *((uint32_t *)udpha);
   5284 		ASSERT(esp_ports != 0);
   5285 	}
   5286 	ovbcopy(orptr, orptr + shift, iph_len);
   5287 	if (esp_ports != 0) /* Punt up for ESP processing. */ {
   5288 		ipha = (ipha_t *)(orptr + shift);
   5289 
   5290 		ira->ira_flags |= IRAF_ESP_UDP_PORTS;
   5291 		ira->ira_esp_udp_ports = esp_ports;
   5292 		ip_fanout_v4(mp, ipha, ira);
   5293 		return (NULL);
   5294 	}
   5295 	return (mp);
   5296 }
   5297 
   5298 /*
   5299  * Deliver a udp packet to the given conn, possibly applying ipsec policy.
   5300  * Handles IPv4 and IPv6.
   5301  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   5302  * Caller is responsible for dropping references to the conn.
   5303  */
   5304 void
   5305 ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   5306     ip_recv_attr_t *ira)
   5307 {
   5308 	ill_t		*ill = ira->ira_ill;
   5309 	ip_stack_t	*ipst = ill->ill_ipst;
   5310 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5311 	boolean_t	secure;
   5312 	iaflags_t	iraflags = ira->ira_flags;
   5313 
   5314 	secure = iraflags & IRAF_IPSEC_SECURE;
   5315 
   5316 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
   5317 	    !canputnext(connp->conn_rq)) {
   5318 		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
   5319 		freemsg(mp);
   5320 		return;
   5321 	}
   5322 
   5323 	if (((iraflags & IRAF_IS_IPV4) ?
   5324 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   5325 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   5326 	    secure) {
   5327 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   5328 		    ip6h, ira);
   5329 		if (mp == NULL) {
   5330 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5331 			/* Note that mp is NULL */
   5332 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5333 			return;
   5334 		}
   5335 	}
   5336 
   5337 	/*
   5338 	 * Since this code is not used for UDP unicast we don't need a NAT_T
   5339 	 * check. Only ip_fanout_v4 has that check.
   5340 	 */
   5341 	if (ira->ira_flags & IRAF_ICMP_ERROR) {
   5342 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   5343 	} else {
   5344 		ill_t *rill = ira->ira_rill;
   5345 
   5346 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   5347 		ira->ira_ill = ira->ira_rill = NULL;
   5348 		/* Send it upstream */
   5349 		(connp->conn_recv)(connp, mp, NULL, ira);
   5350 		ira->ira_ill = ill;
   5351 		ira->ira_rill = rill;
   5352 	}
   5353 }
   5354 
   5355 /*
   5356  * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
   5357  * (Unicast fanout is handled in ip_input_v4.)
   5358  *
   5359  * If SO_REUSEADDR is set all multicast and broadcast packets
   5360  * will be delivered to all conns bound to the same port.
   5361  *
   5362  * If there is at least one matching AF_INET receiver, then we will
   5363  * ignore any AF_INET6 receivers.
   5364  * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
   5365  * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
   5366  * packets.
   5367  *
   5368  * Zones notes:
   5369  * Earlier in ip_input on a system with multiple shared-IP zones we
   5370  * duplicate the multicast and broadcast packets and send them up
   5371  * with each explicit zoneid that exists on that ill.
   5372  * This means that here we can match the zoneid with SO_ALLZONES being special.
   5373  */
   5374 void
   5375 ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
   5376     ip_recv_attr_t *ira)
   5377 {
   5378 	ipaddr_t	laddr;
   5379 	in6_addr_t	v6faddr;
   5380 	conn_t		*connp;
   5381 	connf_t		*connfp;
   5382 	ipaddr_t	faddr;
   5383 	ill_t		*ill = ira->ira_ill;
   5384 	ip_stack_t	*ipst = ill->ill_ipst;
   5385 
   5386 	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
   5387 
   5388 	laddr = ipha->ipha_dst;
   5389 	faddr = ipha->ipha_src;
   5390 
   5391 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5392 	mutex_enter(&connfp->connf_lock);
   5393 	connp = connfp->connf_head;
   5394 
   5395 	/*
   5396 	 * If SO_REUSEADDR has been set on the first we send the
   5397 	 * packet to all clients that have joined the group and
   5398 	 * match the port.
   5399 	 */
   5400 	while (connp != NULL) {
   5401 		if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
   5402 		    conn_wantpacket(connp, ira, ipha) &&
   5403 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5404 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5405 			break;
   5406 		connp = connp->conn_next;
   5407 	}
   5408 
   5409 	if (connp == NULL)
   5410 		goto notfound;
   5411 
   5412 	CONN_INC_REF(connp);
   5413 
   5414 	if (connp->conn_reuseaddr) {
   5415 		conn_t		*first_connp = connp;
   5416 		conn_t		*next_connp;
   5417 		mblk_t		*mp1;
   5418 
   5419 		connp = connp->conn_next;
   5420 		for (;;) {
   5421 			while (connp != NULL) {
   5422 				if (IPCL_UDP_MATCH(connp, lport, laddr,
   5423 				    fport, faddr) &&
   5424 				    conn_wantpacket(connp, ira, ipha) &&
   5425 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5426 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5427 				    ira, connp)))
   5428 					break;
   5429 				connp = connp->conn_next;
   5430 			}
   5431 			if (connp == NULL) {
   5432 				/* No more interested clients */
   5433 				connp = first_connp;
   5434 				break;
   5435 			}
   5436 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5437 			    ((mp1 = copymsg(mp)) == NULL)) {
   5438 				/* Memory allocation failed */
   5439 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5440 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5441 				connp = first_connp;
   5442 				break;
   5443 			}
   5444 			CONN_INC_REF(connp);
   5445 			mutex_exit(&connfp->connf_lock);
   5446 
   5447 			IP_STAT(ipst, ip_udp_fanmb);
   5448 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5449 			    NULL, ira);
   5450 			mutex_enter(&connfp->connf_lock);
   5451 			/* Follow the next pointer before releasing the conn */
   5452 			next_connp = connp->conn_next;
   5453 			CONN_DEC_REF(connp);
   5454 			connp = next_connp;
   5455 		}
   5456 	}
   5457 
   5458 	/* Last one.  Send it upstream. */
   5459 	mutex_exit(&connfp->connf_lock);
   5460 	IP_STAT(ipst, ip_udp_fanmb);
   5461 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5462 	CONN_DEC_REF(connp);
   5463 	return;
   5464 
   5465 notfound:
   5466 	mutex_exit(&connfp->connf_lock);
   5467 	/*
   5468 	 * IPv6 endpoints bound to multicast IPv4-mapped addresses
   5469 	 * have already been matched above, since they live in the IPv4
   5470 	 * fanout tables. This implies we only need to
   5471 	 * check for IPv6 in6addr_any endpoints here.
   5472 	 * Thus we compare using ipv6_all_zeros instead of the destination
   5473 	 * address, except for the multicast group membership lookup which
   5474 	 * uses the IPv4 destination.
   5475 	 */
   5476 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
   5477 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5478 	mutex_enter(&connfp->connf_lock);
   5479 	connp = connfp->connf_head;
   5480 	/*
   5481 	 * IPv4 multicast packet being delivered to an AF_INET6
   5482 	 * in6addr_any endpoint.
   5483 	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
   5484 	 * and not conn_wantpacket_v6() since any multicast membership is
   5485 	 * for an IPv4-mapped multicast address.
   5486 	 */
   5487 	while (connp != NULL) {
   5488 		if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
   5489 		    fport, v6faddr) &&
   5490 		    conn_wantpacket(connp, ira, ipha) &&
   5491 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5492 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5493 			break;
   5494 		connp = connp->conn_next;
   5495 	}
   5496 
   5497 	if (connp == NULL) {
   5498 		/*
   5499 		 * No one bound to this port.  Is
   5500 		 * there a client that wants all
   5501 		 * unclaimed datagrams?
   5502 		 */
   5503 		mutex_exit(&connfp->connf_lock);
   5504 
   5505 		if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
   5506 		    NULL) {
   5507 			ASSERT(ira->ira_protocol == IPPROTO_UDP);
   5508 			ip_fanout_proto_v4(mp, ipha, ira);
   5509 		} else {
   5510 			/*
   5511 			 * We used to attempt to send an icmp error here, but
   5512 			 * since this is known to be a multicast packet
   5513 			 * and we don't send icmp errors in response to
   5514 			 * multicast, just drop the packet and give up sooner.
   5515 			 */
   5516 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   5517 			freemsg(mp);
   5518 		}
   5519 		return;
   5520 	}
   5521 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5522 
   5523 	/*
   5524 	 * If SO_REUSEADDR has been set on the first we send the
   5525 	 * packet to all clients that have joined the group and
   5526 	 * match the port.
   5527 	 */
   5528 	if (connp->conn_reuseaddr) {
   5529 		conn_t		*first_connp = connp;
   5530 		conn_t		*next_connp;
   5531 		mblk_t		*mp1;
   5532 
   5533 		CONN_INC_REF(connp);
   5534 		connp = connp->conn_next;
   5535 		for (;;) {
   5536 			while (connp != NULL) {
   5537 				if (IPCL_UDP_MATCH_V6(connp, lport,
   5538 				    ipv6_all_zeros, fport, v6faddr) &&
   5539 				    conn_wantpacket(connp, ira, ipha) &&
   5540 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5541 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5542 				    ira, connp)))
   5543 					break;
   5544 				connp = connp->conn_next;
   5545 			}
   5546 			if (connp == NULL) {
   5547 				/* No more interested clients */
   5548 				connp = first_connp;
   5549 				break;
   5550 			}
   5551 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5552 			    ((mp1 = copymsg(mp)) == NULL)) {
   5553 				/* Memory allocation failed */
   5554 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5555 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5556 				connp = first_connp;
   5557 				break;
   5558 			}
   5559 			CONN_INC_REF(connp);
   5560 			mutex_exit(&connfp->connf_lock);
   5561 
   5562 			IP_STAT(ipst, ip_udp_fanmb);
   5563 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5564 			    NULL, ira);
   5565 			mutex_enter(&connfp->connf_lock);
   5566 			/* Follow the next pointer before releasing the conn */
   5567 			next_connp = connp->conn_next;
   5568 			CONN_DEC_REF(connp);
   5569 			connp = next_connp;
   5570 		}
   5571 	}
   5572 
   5573 	/* Last one.  Send it upstream. */
   5574 	mutex_exit(&connfp->connf_lock);
   5575 	IP_STAT(ipst, ip_udp_fanmb);
   5576 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5577 	CONN_DEC_REF(connp);
   5578 }
   5579 
   5580 /*
   5581  * Split an incoming packet's IPv4 options into the label and the other options.
   5582  * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
   5583  * clearing out any leftover label or options.
   5584  * Otherwise it just makes ipp point into the packet.
   5585  *
   5586  * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
   5587  */
   5588 int
   5589 ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
   5590 {
   5591 	uchar_t		*opt;
   5592 	uint32_t	totallen;
   5593 	uint32_t	optval;
   5594 	uint32_t	optlen;
   5595 
   5596 	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
   5597 	ipp->ipp_hoplimit = ipha->ipha_ttl;
   5598 	ipp->ipp_type_of_service = ipha->ipha_type_of_service;
   5599 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
   5600 
   5601 	/*
   5602 	 * Get length (in 4 byte octets) of IP header options.
   5603 	 */
   5604 	totallen = ipha->ipha_version_and_hdr_length -
   5605 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5606 
   5607 	if (totallen == 0) {
   5608 		if (!allocate)
   5609 			return (0);
   5610 
   5611 		/* Clear out anything from a previous packet */
   5612 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5613 			kmem_free(ipp->ipp_ipv4_options,
   5614 			    ipp->ipp_ipv4_options_len);
   5615 			ipp->ipp_ipv4_options = NULL;
   5616 			ipp->ipp_ipv4_options_len = 0;
   5617 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5618 		}
   5619 		if (ipp->ipp_fields & IPPF_LABEL_V4) {
   5620 			kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5621 			ipp->ipp_label_v4 = NULL;
   5622 			ipp->ipp_label_len_v4 = 0;
   5623 			ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5624 		}
   5625 		return (0);
   5626 	}
   5627 
   5628 	totallen <<= 2;
   5629 	opt = (uchar_t *)&ipha[1];
   5630 	if (!is_system_labeled()) {
   5631 
   5632 	copyall:
   5633 		if (!allocate) {
   5634 			if (totallen != 0) {
   5635 				ipp->ipp_ipv4_options = opt;
   5636 				ipp->ipp_ipv4_options_len = totallen;
   5637 				ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5638 			}
   5639 			return (0);
   5640 		}
   5641 		/* Just copy all of options */
   5642 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5643 			if (totallen == ipp->ipp_ipv4_options_len) {
   5644 				bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5645 				return (0);
   5646 			}
   5647 			kmem_free(ipp->ipp_ipv4_options,
   5648 			    ipp->ipp_ipv4_options_len);
   5649 			ipp->ipp_ipv4_options = NULL;
   5650 			ipp->ipp_ipv4_options_len = 0;
   5651 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5652 		}
   5653 		if (totallen == 0)
   5654 			return (0);
   5655 
   5656 		ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
   5657 		if (ipp->ipp_ipv4_options == NULL)
   5658 			return (ENOMEM);
   5659 		ipp->ipp_ipv4_options_len = totallen;
   5660 		ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5661 		bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5662 		return (0);
   5663 	}
   5664 
   5665 	if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
   5666 		kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5667 		ipp->ipp_label_v4 = NULL;
   5668 		ipp->ipp_label_len_v4 = 0;
   5669 		ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5670 	}
   5671 
   5672 	/*
   5673 	 * Search for CIPSO option.
   5674 	 * We assume CIPSO is first in options if it is present.
   5675 	 * If it isn't, then ipp_opt_ipv4_options will not include the options
   5676 	 * prior to the CIPSO option.
   5677 	 */
   5678 	while (totallen != 0) {
   5679 		switch (optval = opt[IPOPT_OPTVAL]) {
   5680 		case IPOPT_EOL:
   5681 			return (0);
   5682 		case IPOPT_NOP:
   5683 			optlen = 1;
   5684 			break;
   5685 		default:
   5686 			if (totallen <= IPOPT_OLEN)
   5687 				return (EINVAL);
   5688 			optlen = opt[IPOPT_OLEN];
   5689 			if (optlen < 2)
   5690 				return (EINVAL);
   5691 		}
   5692 		if (optlen > totallen)
   5693 			return (EINVAL);
   5694 
   5695 		switch (optval) {
   5696 		case IPOPT_COMSEC:
   5697 			if (!allocate) {
   5698 				ipp->ipp_label_v4 = opt;
   5699 				ipp->ipp_label_len_v4 = optlen;
   5700 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5701 			} else {
   5702 				ipp->ipp_label_v4 = kmem_alloc(optlen,
   5703 				    KM_NOSLEEP);
   5704 				if (ipp->ipp_label_v4 == NULL)
   5705 					return (ENOMEM);
   5706 				ipp->ipp_label_len_v4 = optlen;
   5707 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5708 				bcopy(opt, ipp->ipp_label_v4, optlen);
   5709 			}
   5710 			totallen -= optlen;
   5711 			opt += optlen;
   5712 
   5713 			/* Skip padding bytes until we get to a multiple of 4 */
   5714 			while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
   5715 				totallen--;
   5716 				opt++;
   5717 			}
   5718 			/* Remaining as ipp_ipv4_options */
   5719 			goto copyall;
   5720 		}
   5721 		totallen -= optlen;
   5722 		opt += optlen;
   5723 	}
   5724 	/* No CIPSO found; return everything as ipp_ipv4_options */
   5725 	totallen = ipha->ipha_version_and_hdr_length -
   5726 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5727 	totallen <<= 2;
   5728 	opt = (uchar_t *)&ipha[1];
   5729 	goto copyall;
   5730 }
   5731 
   5732 /*
   5733  * Efficient versions of lookup for an IRE when we only
   5734  * match the address.
   5735  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5736  * Does not handle multicast addresses.
   5737  */
   5738 uint_t
   5739 ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
   5740 {
   5741 	ire_t *ire;
   5742 	uint_t result;
   5743 
   5744 	ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
   5745 	ASSERT(ire != NULL);
   5746 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5747 		result = IRE_NOROUTE;
   5748 	else
   5749 		result = ire->ire_type;
   5750 	ire_refrele(ire);
   5751 	return (result);
   5752 }
   5753 
   5754 /*
   5755  * Efficient versions of lookup for an IRE when we only
   5756  * match the address.
   5757  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5758  * Does not handle multicast addresses.
   5759  */
   5760 uint_t
   5761 ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
   5762 {
   5763 	ire_t *ire;
   5764 	uint_t result;
   5765 
   5766 	ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
   5767 	ASSERT(ire != NULL);
   5768 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5769 		result = IRE_NOROUTE;
   5770 	else
   5771 		result = ire->ire_type;
   5772 	ire_refrele(ire);
   5773 	return (result);
   5774 }
   5775 
   5776 /*
   5777  * Nobody should be sending
   5778  * packets up this stream
   5779  */
   5780 static void
   5781 ip_lrput(queue_t *q, mblk_t *mp)
   5782 {
   5783 	switch (mp->b_datap->db_type) {
   5784 	case M_FLUSH:
   5785 		/* Turn around */
   5786 		if (*mp->b_rptr & FLUSHW) {
   5787 			*mp->b_rptr &= ~FLUSHR;
   5788 			qreply(q, mp);
   5789 			return;
   5790 		}
   5791 		break;
   5792 	}
   5793 	freemsg(mp);
   5794 }
   5795 
   5796 /* Nobody should be sending packets down this stream */
   5797 /* ARGSUSED */
   5798 void
   5799 ip_lwput(queue_t *q, mblk_t *mp)
   5800 {
   5801 	freemsg(mp);
   5802 }
   5803 
   5804 /*
   5805  * Move the first hop in any source route to ipha_dst and remove that part of
   5806  * the source route.  Called by other protocols.  Errors in option formatting
   5807  * are ignored - will be handled by ip_output_options. Return the final
   5808  * destination (either ipha_dst or the last entry in a source route.)
   5809  */
   5810 ipaddr_t
   5811 ip_massage_options(ipha_t *ipha, netstack_t *ns)
   5812 {
   5813 	ipoptp_t	opts;
   5814 	uchar_t		*opt;
   5815 	uint8_t		optval;
   5816 	uint8_t		optlen;
   5817 	ipaddr_t	dst;
   5818 	int		i;
   5819 	ip_stack_t	*ipst = ns->netstack_ip;
   5820 
   5821 	ip2dbg(("ip_massage_options\n"));
   5822 	dst = ipha->ipha_dst;
   5823 	for (optval = ipoptp_first(&opts, ipha);
   5824 	    optval != IPOPT_EOL;
   5825 	    optval = ipoptp_next(&opts)) {
   5826 		opt = opts.ipoptp_cur;
   5827 		switch (optval) {
   5828 			uint8_t off;
   5829 		case IPOPT_SSRR:
   5830 		case IPOPT_LSRR:
   5831 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   5832 				ip1dbg(("ip_massage_options: bad src route\n"));
   5833 				break;
   5834 			}
   5835 			optlen = opts.ipoptp_len;
   5836 			off = opt[IPOPT_OFFSET];
   5837 			off--;
   5838 		redo_srr:
   5839 			if (optlen < IP_ADDR_LEN ||
   5840 			    off > optlen - IP_ADDR_LEN) {
   5841 				/* End of source route */
   5842 				ip1dbg(("ip_massage_options: end of SR\n"));
   5843 				break;
   5844 			}
   5845 			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   5846 			ip1dbg(("ip_massage_options: next hop 0x%x\n",
   5847 			    ntohl(dst)));
   5848 			/*
   5849 			 * Check if our address is present more than
   5850 			 * once as consecutive hops in source route.
   5851 			 * XXX verify per-interface ip_forwarding
   5852 			 * for source route?
   5853 			 */
   5854 			if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
   5855 				off += IP_ADDR_LEN;
   5856 				goto redo_srr;
   5857 			}
   5858 			if (dst == htonl(INADDR_LOOPBACK)) {
   5859 				ip1dbg(("ip_massage_options: loopback addr in "
   5860 				    "source route!\n"));
   5861 				break;
   5862 			}
   5863 			/*
   5864 			 * Update ipha_dst to be the first hop and remove the
   5865 			 * first hop from the source route (by overwriting
   5866 			 * part of the option with NOP options).
   5867 			 */
   5868 			ipha->ipha_dst = dst;
   5869 			/* Put the last entry in dst */
   5870 			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
   5871 			    3;
   5872 			bcopy(&opt[off], &dst, IP_ADDR_LEN);
   5873 
   5874 			ip1dbg(("ip_massage_options: last hop 0x%x\n",
   5875 			    ntohl(dst)));
   5876 			/* Move down and overwrite */
   5877 			opt[IP_ADDR_LEN] = opt[0];
   5878 			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
   5879 			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
   5880 			for (i = 0; i < IP_ADDR_LEN; i++)
   5881 				opt[i] = IPOPT_NOP;
   5882 			break;
   5883 		}
   5884 	}
   5885 	return (dst);
   5886 }
   5887 
   5888 /*
   5889  * Return the network mask
   5890  * associated with the specified address.
   5891  */
   5892 ipaddr_t
   5893 ip_net_mask(ipaddr_t addr)
   5894 {
   5895 	uchar_t	*up = (uchar_t *)&addr;
   5896 	ipaddr_t mask = 0;
   5897 	uchar_t	*maskp = (uchar_t *)&mask;
   5898 
   5899 #if defined(__i386) || defined(__amd64)
   5900 #define	TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5901 #endif
   5902 #ifdef  TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5903 	maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
   5904 #endif
   5905 	if (CLASSD(addr)) {
   5906 		maskp[0] = 0xF0;
   5907 		return (mask);
   5908 	}
   5909 
   5910 	/* We assume Class E default netmask to be 32 */
   5911 	if (CLASSE(addr))
   5912 		return (0xffffffffU);
   5913 
   5914 	if (addr == 0)
   5915 		return (0);
   5916 	maskp[0] = 0xFF;
   5917 	if ((up[0] & 0x80) == 0)
   5918 		return (mask);
   5919 
   5920 	maskp[1] = 0xFF;
   5921 	if ((up[0] & 0xC0) == 0x80)
   5922 		return (mask);
   5923 
   5924 	maskp[2] = 0xFF;
   5925 	if ((up[0] & 0xE0) == 0xC0)
   5926 		return (mask);
   5927 
   5928 	/* Otherwise return no mask */
   5929 	return ((ipaddr_t)0);
   5930 }
   5931 
   5932 /* Name/Value Table Lookup Routine */
   5933 char *
   5934 ip_nv_lookup(nv_t *nv, int value)
   5935 {
   5936 	if (!nv)
   5937 		return (NULL);
   5938 	for (; nv->nv_name; nv++) {
   5939 		if (nv->nv_value == value)
   5940 			return (nv->nv_name);
   5941 	}
   5942 	return ("unknown");
   5943 }
   5944 
   5945 static int
   5946 ip_wait_for_info_ack(ill_t *ill)
   5947 {
   5948 	int err;
   5949 
   5950 	mutex_enter(&ill->ill_lock);
   5951 	while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
   5952 		/*
   5953 		 * Return value of 0 indicates a pending signal.
   5954 		 */
   5955 		err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
   5956 		if (err == 0) {
   5957 			mutex_exit(&ill->ill_lock);
   5958 			return (EINTR);
   5959 		}
   5960 	}
   5961 	mutex_exit(&ill->ill_lock);
   5962 	/*
   5963 	 * ip_rput_other could have set an error  in ill_error on
   5964 	 * receipt of M_ERROR.
   5965 	 */
   5966 	return (ill->ill_error);
   5967 }
   5968 
   5969 /*
   5970  * This is a module open, i.e. this is a control stream for access
   5971  * to a DLPI device.  We allocate an ill_t as the instance data in
   5972  * this case.
   5973  */
   5974 static int
   5975 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   5976 {
   5977 	ill_t	*ill;
   5978 	int	err;
   5979 	zoneid_t zoneid;
   5980 	netstack_t *ns;
   5981 	ip_stack_t *ipst;
   5982 
   5983 	/*
   5984 	 * Prevent unprivileged processes from pushing IP so that
   5985 	 * they can't send raw IP.
   5986 	 */
   5987 	if (secpolicy_net_rawaccess(credp) != 0)
   5988 		return (EPERM);
   5989 
   5990 	ns = netstack_find_by_cred(credp);
   5991 	ASSERT(ns != NULL);
   5992 	ipst = ns->netstack_ip;
   5993 	ASSERT(ipst != NULL);
   5994 
   5995 	/*
   5996 	 * For exclusive stacks we set the zoneid to zero
   5997 	 * to make IP operate as if in the global zone.
   5998 	 */
   5999 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   6000 		zoneid = GLOBAL_ZONEID;
   6001 	else
   6002 		zoneid = crgetzoneid(credp);
   6003 
   6004 	ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
   6005 	q->q_ptr = WR(q)->q_ptr = ill;
   6006 	ill->ill_ipst = ipst;
   6007 	ill->ill_zoneid = zoneid;
   6008 
   6009 	/*
   6010 	 * ill_init initializes the ill fields and then sends down
   6011 	 * down a DL_INFO_REQ after calling qprocson.
   6012 	 */
   6013 	err = ill_init(q, ill);
   6014 
   6015 	if (err != 0) {
   6016 		mi_free(ill);
   6017 		netstack_rele(ipst->ips_netstack);
   6018 		q->q_ptr = NULL;
   6019 		WR(q)->q_ptr = NULL;
   6020 		return (err);
   6021 	}
   6022 
   6023 	/*
   6024 	 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
   6025 	 *
   6026 	 * ill_init initializes the ipsq marking this thread as
   6027 	 * writer
   6028 	 */
   6029 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   6030 	err = ip_wait_for_info_ack(ill);
   6031 	if (err == 0)
   6032 		ill->ill_credp = credp;
   6033 	else
   6034 		goto fail;
   6035 
   6036 	crhold(credp);
   6037 
   6038 	mutex_enter(&ipst->ips_ip_mi_lock);
   6039 	err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
   6040 	    sflag, credp);
   6041 	mutex_exit(&ipst->ips_ip_mi_lock);
   6042 fail:
   6043 	if (err) {
   6044 		(void) ip_close(q, 0);
   6045 		return (err);
   6046 	}
   6047 	return (0);
   6048 }
   6049 
   6050 /* For /dev/ip aka AF_INET open */
   6051 int
   6052 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   6053 {
   6054 	return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
   6055 }
   6056 
   6057 /* For /dev/ip6 aka AF_INET6 open */
   6058 int
   6059 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   6060 {
   6061 	return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
   6062 }
   6063 
   6064 /* IP open routine. */
   6065 int
   6066 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   6067     boolean_t isv6)
   6068 {
   6069 	conn_t 		*connp;
   6070 	major_t		maj;
   6071 	zoneid_t	zoneid;
   6072 	netstack_t	*ns;
   6073 	ip_stack_t	*ipst;
   6074 
   6075 	/* Allow reopen. */
   6076 	if (q->q_ptr != NULL)
   6077 		return (0);
   6078 
   6079 	if (sflag & MODOPEN) {
   6080 		/* This is a module open */
   6081 		return (ip_modopen(q, devp, flag, sflag, credp));
   6082 	}
   6083 
   6084 	if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
   6085 		/*
   6086 		 * Non streams based socket looking for a stream
   6087 		 * to access IP
   6088 		 */
   6089 		return (ip_helper_stream_setup(q, devp, flag, sflag,
   6090 		    credp, isv6));
   6091 	}
   6092 
   6093 	ns = netstack_find_by_cred(credp);
   6094 	ASSERT(ns != NULL);
   6095 	ipst = ns->netstack_ip;
   6096 	ASSERT(ipst != NULL);
   6097 
   6098 	/*
   6099 	 * For exclusive stacks we set the zoneid to zero
   6100 	 * to make IP operate as if in the global zone.
   6101 	 */
   6102 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   6103 		zoneid = GLOBAL_ZONEID;
   6104 	else
   6105 		zoneid = crgetzoneid(credp);
   6106 
   6107 	/*
   6108 	 * We are opening as a device. This is an IP client stream, and we
   6109 	 * allocate an conn_t as the instance data.
   6110 	 */
   6111 	connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
   6112 
   6113 	/*
   6114 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
   6115 	 * done by netstack_find_by_cred()
   6116 	 */
   6117 	netstack_rele(ipst->ips_netstack);
   6118 
   6119 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
   6120 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   6121 	connp->conn_ixa->ixa_zoneid = zoneid;
   6122 	connp->conn_zoneid = zoneid;
   6123 
   6124 	connp->conn_rq = q;
   6125 	q->q_ptr = WR(q)->q_ptr = connp;
   6126 
   6127 	/* Minor tells us which /dev entry was opened */
   6128 	if (isv6) {
   6129 		connp->conn_family = AF_INET6;
   6130 		connp->conn_ipversion = IPV6_VERSION;
   6131 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
   6132 		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
   6133 	} else {
   6134 		connp->conn_family = AF_INET;
   6135 		connp->conn_ipversion = IPV4_VERSION;
   6136 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
   6137 	}
   6138 
   6139 	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
   6140 	    ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
   6141 		connp->conn_minor_arena = ip_minor_arena_la;
   6142 	} else {
   6143 		/*
   6144 		 * Either minor numbers in the large arena were exhausted
   6145 		 * or a non socket application is doing the open.
   6146 		 * Try to allocate from the small arena.
   6147 		 */
   6148 		if ((connp->conn_dev =
   6149 		    inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   6150 			/* CONN_DEC_REF takes care of netstack_rele() */
   6151 			q->q_ptr = WR(q)->q_ptr = NULL;
   6152 			CONN_DEC_REF(connp);
   6153 			return (EBUSY);
   6154 		}
   6155 		connp->conn_minor_arena = ip_minor_arena_sa;
   6156 	}
   6157 
   6158 	maj = getemajor(*devp);
   6159 	*devp = makedevice(maj, (minor_t)connp->conn_dev);
   6160 
   6161 	/*
   6162 	 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
   6163 	 */
   6164 	connp->conn_cred = credp;
   6165 	/* Cache things in ixa without an extra refhold */
   6166 	connp->conn_ixa->ixa_cred = connp->conn_cred;
   6167 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
   6168 	if (is_system_labeled())
   6169 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
   6170 
   6171 	/*
   6172 	 * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
   6173 	 */
   6174 	connp->conn_recv = ip_conn_input;
   6175 	connp->conn_recvicmp = ip_conn_input_icmp;
   6176 
   6177 	crhold(connp->conn_cred);
   6178 
   6179 	/*
   6180 	 * If the caller has the process-wide flag set, then default to MAC
   6181 	 * exempt mode.  This allows read-down to unlabeled hosts.
   6182 	 */
   6183 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   6184 		connp->conn_mac_mode = CONN_MAC_AWARE;
   6185 
   6186 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   6187 
   6188 	connp->conn_rq = q;
   6189 	connp->conn_wq = WR(q);
   6190 
   6191 	/* Non-zero default values */
   6192 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
   6193 
   6194 	/*
   6195 	 * Make the conn globally visible to walkers
   6196 	 */
   6197 	ASSERT(connp->conn_ref == 1);
   6198 	mutex_enter(&connp->conn_lock);
   6199 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   6200 	mutex_exit(&connp->conn_lock);
   6201 
   6202 	qprocson(q);
   6203 
   6204 	return (0);
   6205 }
   6206 
   6207 /*
   6208  * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
   6209  * all of them are copied to the conn_t. If the req is "zero", the policy is
   6210  * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
   6211  * fields.
   6212  * We keep only the latest setting of the policy and thus policy setting
   6213  * is not incremental/cumulative.
   6214  *
   6215  * Requests to set policies with multiple alternative actions will
   6216  * go through a different API.
   6217  */
   6218 int
   6219 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
   6220 {
   6221 	uint_t ah_req = 0;
   6222 	uint_t esp_req = 0;
   6223 	uint_t se_req = 0;
   6224 	ipsec_act_t *actp = NULL;
   6225 	uint_t nact;
   6226 	ipsec_policy_head_t *ph;
   6227 	boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
   6228 	int error = 0;
   6229 	netstack_t	*ns = connp->conn_netstack;
   6230 	ip_stack_t	*ipst = ns->netstack_ip;
   6231 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   6232 
   6233 #define	REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
   6234 
   6235 	/*
   6236 	 * The IP_SEC_OPT option does not allow variable length parameters,
   6237 	 * hence a request cannot be NULL.
   6238 	 */
   6239 	if (req == NULL)
   6240 		return (EINVAL);
   6241 
   6242 	ah_req = req->ipsr_ah_req;
   6243 	esp_req = req->ipsr_esp_req;
   6244 	se_req = req->ipsr_self_encap_req;
   6245 
   6246 	/* Don't allow setting self-encap without one or more of AH/ESP. */
   6247 	if (se_req != 0 && esp_req == 0 && ah_req == 0)
   6248 		return (EINVAL);
   6249 
   6250 	/*
   6251 	 * Are we dealing with a request to reset the policy (i.e.
   6252 	 * zero requests).
   6253 	 */
   6254 	is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
   6255 	    (esp_req & REQ_MASK) == 0 &&
   6256 	    (se_req & REQ_MASK) == 0);
   6257 
   6258 	if (!is_pol_reset) {
   6259 		/*
   6260 		 * If we couldn't load IPsec, fail with "protocol
   6261 		 * not supported".
   6262 		 * IPsec may not have been loaded for a request with zero
   6263 		 * policies, so we don't fail in this case.
   6264 		 */
   6265 		mutex_enter(&ipss->ipsec_loader_lock);
   6266 		if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
   6267 			mutex_exit(&ipss->ipsec_loader_lock);
   6268 			return (EPROTONOSUPPORT);
   6269 		}
   6270 		mutex_exit(&ipss->ipsec_loader_lock);
   6271 
   6272 		/*
   6273 		 * Test for valid requests. Invalid algorithms
   6274 		 * need to be tested by IPsec code because new
   6275 		 * algorithms can be added dynamically.
   6276 		 */
   6277 		if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6278 		    (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6279 		    (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
   6280 			return (EINVAL);
   6281 		}
   6282 
   6283 		/*
   6284 		 * Only privileged users can issue these
   6285 		 * requests.
   6286 		 */
   6287 		if (((ah_req & IPSEC_PREF_NEVER) ||
   6288 		    (esp_req & IPSEC_PREF_NEVER) ||
   6289 		    (se_req & IPSEC_PREF_NEVER)) &&
   6290 		    secpolicy_ip_config(cr, B_FALSE) != 0) {
   6291 			return (EPERM);
   6292 		}
   6293 
   6294 		/*
   6295 		 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
   6296 		 * are mutually exclusive.
   6297 		 */
   6298 		if (((ah_req & REQ_MASK) == REQ_MASK) ||
   6299 		    ((esp_req & REQ_MASK) == REQ_MASK) ||
   6300 		    ((se_req & REQ_MASK) == REQ_MASK)) {
   6301 			/* Both of them are set */
   6302 			return (EINVAL);
   6303 		}
   6304 	}
   6305 
   6306 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   6307 
   6308 	/*
   6309 	 * If we have already cached policies in conn_connect(), don't
   6310 	 * let them change now. We cache policies for connections
   6311 	 * whose src,dst [addr, port] is known.
   6312 	 */
   6313 	if (connp->conn_policy_cached) {
   6314 		return (EINVAL);
   6315 	}
   6316 
   6317 	/*
   6318 	 * We have a zero policies, reset the connection policy if already
   6319 	 * set. This will cause the connection to inherit the
   6320 	 * global policy, if any.
   6321 	 */
   6322 	if (is_pol_reset) {
   6323 		if (connp->conn_policy != NULL) {
   6324 			IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
   6325 			connp->conn_policy = NULL;
   6326 		}
   6327 		connp->conn_in_enforce_policy = B_FALSE;
   6328 		connp->conn_out_enforce_policy = B_FALSE;
   6329 		return (0);
   6330 	}
   6331 
   6332 	ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
   6333 	    ipst->ips_netstack);
   6334 	if (ph == NULL)
   6335 		goto enomem;
   6336 
   6337 	ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
   6338 	if (actp == NULL)
   6339 		goto enomem;
   6340 
   6341 	/*
   6342 	 * Always insert IPv4 policy entries, since they can also apply to
   6343 	 * ipv6 sockets being used in ipv4-compat mode.
   6344 	 */
   6345 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6346 	    IPSEC_TYPE_INBOUND, ns))
   6347 		goto enomem;
   6348 	is_pol_inserted = B_TRUE;
   6349 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6350 	    IPSEC_TYPE_OUTBOUND, ns))
   6351 		goto enomem;
   6352 
   6353 	/*
   6354 	 * We're looking at a v6 socket, also insert the v6-specific
   6355 	 * entries.
   6356 	 */
   6357 	if (connp->conn_family == AF_INET6) {
   6358 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6359 		    IPSEC_TYPE_INBOUND, ns))
   6360 			goto enomem;
   6361 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6362 		    IPSEC_TYPE_OUTBOUND, ns))
   6363 			goto enomem;
   6364 	}
   6365 
   6366 	ipsec_actvec_free(actp, nact);
   6367 
   6368 	/*
   6369 	 * If the requests need security, set enforce_policy.
   6370 	 * If the requests are IPSEC_PREF_NEVER, one should
   6371 	 * still set conn_out_enforce_policy so that ip_set_destination
   6372 	 * marks the ip_xmit_attr_t appropriatly. This is needed so that
   6373 	 * for connections that we don't cache policy in at connect time,
   6374 	 * if global policy matches in ip_output_attach_policy, we
   6375 	 * don't wrongly inherit global policy. Similarly, we need
   6376 	 * to set conn_in_enforce_policy also so that we don't verify
   6377 	 * policy wrongly.
   6378 	 */
   6379 	if ((ah_req & REQ_MASK) != 0 ||
   6380 	    (esp_req & REQ_MASK) != 0 ||
   6381 	    (se_req & REQ_MASK) != 0) {
   6382 		connp->conn_in_enforce_policy = B_TRUE;
   6383 		connp->conn_out_enforce_policy = B_TRUE;
   6384 	}
   6385 
   6386 	return (error);
   6387 #undef REQ_MASK
   6388 
   6389 	/*
   6390 	 * Common memory-allocation-failure exit path.
   6391 	 */
   6392 enomem:
   6393 	if (actp != NULL)
   6394 		ipsec_actvec_free(actp, nact);
   6395 	if (is_pol_inserted)
   6396 		ipsec_polhead_flush(ph, ns);
   6397 	return (ENOMEM);
   6398 }
   6399 
   6400 /*
   6401  * Set socket options for joining and leaving multicast groups.
   6402  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6403  * The caller has already check that the option name is consistent with
   6404  * the address family of the socket.
   6405  */
   6406 int
   6407 ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
   6408     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6409 {
   6410 	int		*i1 = (int *)invalp;
   6411 	int		error = 0;
   6412 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6413 	struct ip_mreq	*v4_mreqp;
   6414 	struct ipv6_mreq *v6_mreqp;
   6415 	struct group_req *greqp;
   6416 	ire_t *ire;
   6417 	boolean_t done = B_FALSE;
   6418 	ipaddr_t ifaddr;
   6419 	in6_addr_t v6group;
   6420 	uint_t ifindex;
   6421 	boolean_t mcast_opt = B_TRUE;
   6422 	mcast_record_t fmode;
   6423 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6424 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6425 
   6426 	switch (name) {
   6427 	case IP_ADD_MEMBERSHIP:
   6428 	case IPV6_JOIN_GROUP:
   6429 		mcast_opt = B_FALSE;
   6430 		/* FALLTHRU */
   6431 	case MCAST_JOIN_GROUP:
   6432 		fmode = MODE_IS_EXCLUDE;
   6433 		optfn = ip_opt_add_group;
   6434 		break;
   6435 
   6436 	case IP_DROP_MEMBERSHIP:
   6437 	case IPV6_LEAVE_GROUP:
   6438 		mcast_opt = B_FALSE;
   6439 		/* FALLTHRU */
   6440 	case MCAST_LEAVE_GROUP:
   6441 		fmode = MODE_IS_INCLUDE;
   6442 		optfn = ip_opt_delete_group;
   6443 		break;
   6444 	default:
   6445 		ASSERT(0);
   6446 	}
   6447 
   6448 	if (mcast_opt) {
   6449 		struct sockaddr_in *sin;
   6450 		struct sockaddr_in6 *sin6;
   6451 
   6452 		greqp = (struct group_req *)i1;
   6453 		if (greqp->gr_group.ss_family == AF_INET) {
   6454 			sin = (struct sockaddr_in *)&(greqp->gr_group);
   6455 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
   6456 		} else {
   6457 			if (!inet6)
   6458 				return (EINVAL);	/* Not on INET socket */
   6459 
   6460 			sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
   6461 			v6group = sin6->sin6_addr;
   6462 		}
   6463 		ifaddr = INADDR_ANY;
   6464 		ifindex = greqp->gr_interface;
   6465 	} else if (inet6) {
   6466 		v6_mreqp = (struct ipv6_mreq *)i1;
   6467 		v6group = v6_mreqp->ipv6mr_multiaddr;
   6468 		ifaddr = INADDR_ANY;
   6469 		ifindex = v6_mreqp->ipv6mr_interface;
   6470 	} else {
   6471 		v4_mreqp = (struct ip_mreq *)i1;
   6472 		IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
   6473 		ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
   6474 		ifindex = 0;
   6475 	}
   6476 
   6477 	/*
   6478 	 * In the multirouting case, we need to replicate
   6479 	 * the request on all interfaces that will take part
   6480 	 * in replication.  We do so because multirouting is
   6481 	 * reflective, thus we will probably receive multi-
   6482 	 * casts on those interfaces.
   6483 	 * The ip_multirt_apply_membership() succeeds if
   6484 	 * the operation succeeds on at least one interface.
   6485 	 */
   6486 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6487 		ipaddr_t group;
   6488 
   6489 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6490 
   6491 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6492 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6493 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6494 	} else {
   6495 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6496 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6497 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6498 	}
   6499 	if (ire != NULL) {
   6500 		if (ire->ire_flags & RTF_MULTIRT) {
   6501 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6502 			    checkonly, &v6group, fmode, &ipv6_all_zeros);
   6503 			done = B_TRUE;
   6504 		}
   6505 		ire_refrele(ire);
   6506 	}
   6507 
   6508 	if (!done) {
   6509 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6510 		    fmode, &ipv6_all_zeros);
   6511 	}
   6512 	return (error);
   6513 }
   6514 
   6515 /*
   6516  * Set socket options for joining and leaving multicast groups
   6517  * for specific sources.
   6518  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6519  * The caller has already check that the option name is consistent with
   6520  * the address family of the socket.
   6521  */
   6522 int
   6523 ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
   6524     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6525 {
   6526 	int		*i1 = (int *)invalp;
   6527 	int		error = 0;
   6528 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6529 	struct ip_mreq_source *imreqp;
   6530 	struct group_source_req *gsreqp;
   6531 	in6_addr_t v6group, v6src;
   6532 	uint32_t ifindex;
   6533 	ipaddr_t ifaddr;
   6534 	boolean_t mcast_opt = B_TRUE;
   6535 	mcast_record_t fmode;
   6536 	ire_t *ire;
   6537 	boolean_t done = B_FALSE;
   6538 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6539 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6540 
   6541 	switch (name) {
   6542 	case IP_BLOCK_SOURCE:
   6543 		mcast_opt = B_FALSE;
   6544 		/* FALLTHRU */
   6545 	case MCAST_BLOCK_SOURCE:
   6546 		fmode = MODE_IS_EXCLUDE;
   6547 		optfn = ip_opt_add_group;
   6548 		break;
   6549 
   6550 	case IP_UNBLOCK_SOURCE:
   6551 		mcast_opt = B_FALSE;
   6552 		/* FALLTHRU */
   6553 	case MCAST_UNBLOCK_SOURCE:
   6554 		fmode = MODE_IS_EXCLUDE;
   6555 		optfn = ip_opt_delete_group;
   6556 		break;
   6557 
   6558 	case IP_ADD_SOURCE_MEMBERSHIP:
   6559 		mcast_opt = B_FALSE;
   6560 		/* FALLTHRU */
   6561 	case MCAST_JOIN_SOURCE_GROUP:
   6562 		fmode = MODE_IS_INCLUDE;
   6563 		optfn = ip_opt_add_group;
   6564 		break;
   6565 
   6566 	case IP_DROP_SOURCE_MEMBERSHIP:
   6567 		mcast_opt = B_FALSE;
   6568 		/* FALLTHRU */
   6569 	case MCAST_LEAVE_SOURCE_GROUP:
   6570 		fmode = MODE_IS_INCLUDE;
   6571 		optfn = ip_opt_delete_group;
   6572 		break;
   6573 	default:
   6574 		ASSERT(0);
   6575 	}
   6576 
   6577 	if (mcast_opt) {
   6578 		gsreqp = (struct group_source_req *)i1;
   6579 		ifindex = gsreqp->gsr_interface;
   6580 		if (gsreqp->gsr_group.ss_family == AF_INET) {
   6581 			struct sockaddr_in *s;
   6582 			s = (struct sockaddr_in *)&gsreqp->gsr_group;
   6583 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
   6584 			s = (struct sockaddr_in *)&gsreqp->gsr_source;
   6585 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
   6586 		} else {
   6587 			struct sockaddr_in6 *s6;
   6588 
   6589 			if (!inet6)
   6590 				return (EINVAL);	/* Not on INET socket */
   6591 
   6592 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
   6593 			v6group = s6->sin6_addr;
   6594 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
   6595 			v6src = s6->sin6_addr;
   6596 		}
   6597 		ifaddr = INADDR_ANY;
   6598 	} else {
   6599 		imreqp = (struct ip_mreq_source *)i1;
   6600 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
   6601 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
   6602 		ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
   6603 		ifindex = 0;
   6604 	}
   6605 
   6606 	/*
   6607 	 * Handle src being mapped INADDR_ANY by changing it to unspecified.
   6608 	 */
   6609 	if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
   6610 		v6src = ipv6_all_zeros;
   6611 
   6612 	/*
   6613 	 * In the multirouting case, we need to replicate
   6614 	 * the request as noted in the mcast cases above.
   6615 	 */
   6616 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6617 		ipaddr_t group;
   6618 
   6619 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6620 
   6621 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6622 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6623 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6624 	} else {
   6625 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6626 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6627 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6628 	}
   6629 	if (ire != NULL) {
   6630 		if (ire->ire_flags & RTF_MULTIRT) {
   6631 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6632 			    checkonly, &v6group, fmode, &v6src);
   6633 			done = B_TRUE;
   6634 		}
   6635 		ire_refrele(ire);
   6636 	}
   6637 	if (!done) {
   6638 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6639 		    fmode, &v6src);
   6640 	}
   6641 	return (error);
   6642 }
   6643 
   6644 /*
   6645  * Given a destination address and a pointer to where to put the information
   6646  * this routine fills in the mtuinfo.
   6647  * The socket must be connected.
   6648  * For sctp conn_faddr is the primary address.
   6649  */
   6650 int
   6651 ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
   6652 {
   6653 	uint32_t	pmtu = IP_MAXPACKET;
   6654 	uint_t		scopeid;
   6655 
   6656 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
   6657 		return (-1);
   6658 
   6659 	/* In case we never sent or called ip_set_destination_v4/v6 */
   6660 	if (ixa->ixa_ire != NULL)
   6661 		pmtu = ip_get_pmtu(ixa);
   6662 
   6663 	if (ixa->ixa_flags & IXAF_SCOPEID_SET)
   6664 		scopeid = ixa->ixa_scopeid;
   6665 	else
   6666 		scopeid = 0;
   6667 
   6668 	bzero(mtuinfo, sizeof (*mtuinfo));
   6669 	mtuinfo->ip6m_addr.sin6_family = AF_INET6;
   6670 	mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
   6671 	mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
   6672 	mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
   6673 	mtuinfo->ip6m_mtu = pmtu;
   6674 
   6675 	return (sizeof (struct ip6_mtuinfo));
   6676 }
   6677 
   6678 /* Named Dispatch routine to get a current value out of our parameter table. */
   6679 /* ARGSUSED */
   6680 static int
   6681 ip_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   6682 {
   6683 	ipparam_t *ippa = (ipparam_t *)cp;
   6684 
   6685 	(void) mi_mpprintf(mp, "%d", ippa->ip_param_value);
   6686 	return (0);
   6687 }
   6688 
   6689 /* ARGSUSED */
   6690 static int
   6691 ip_param_generic_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   6692 {
   6693 
   6694 	(void) mi_mpprintf(mp, "%d", *(int *)cp);
   6695 	return (0);
   6696 }
   6697 
   6698 /*
   6699  * Set ip{,6}_forwarding values.  This means walking through all of the
   6700  * ill's and toggling their forwarding values.
   6701  */
   6702 /* ARGSUSED */
   6703 static int
   6704 ip_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr)
   6705 {
   6706 	long new_value;
   6707 	int *forwarding_value = (int *)cp;
   6708 	ill_t *ill;
   6709 	boolean_t isv6;
   6710 	ill_walk_context_t ctx;
   6711 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   6712 
   6713 	isv6 = (forwarding_value == &ipst->ips_ipv6_forward);
   6714 
   6715 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   6716 	    new_value < 0 || new_value > 1) {
   6717 		return (EINVAL);
   6718 	}
   6719 
   6720 	*forwarding_value = new_value;
   6721 
   6722 	/*
   6723 	 * Regardless of the current value of ip_forwarding, set all per-ill
   6724 	 * values of ip_forwarding to the value being set.
   6725 	 *
   6726 	 * Bring all the ill's up to date with the new global value.
   6727 	 */
   6728 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6729 
   6730 	if (isv6)
   6731 		ill = ILL_START_WALK_V6(&ctx, ipst);
   6732 	else
   6733 		ill = ILL_START_WALK_V4(&ctx, ipst);
   6734 
   6735 	for (; ill != NULL; ill = ill_next(&ctx, ill))
   6736 		(void) ill_forward_set(ill, new_value != 0);
   6737 
   6738 	rw_exit(&ipst->ips_ill_g_lock);
   6739 	return (0);
   6740 }
   6741 
   6742 /*
   6743  * Walk through the param array specified registering each element with the
   6744  * Named Dispatch handler. This is called only during init. So it is ok
   6745  * not to acquire any locks
   6746  */
   6747 static boolean_t
   6748 ip_param_register(IDP *ndp, ipparam_t *ippa, size_t ippa_cnt,
   6749     ipndp_t *ipnd, size_t ipnd_cnt)
   6750 {
   6751 	for (; ippa_cnt-- > 0; ippa++) {
   6752 		if (ippa->ip_param_name && ippa->ip_param_name[0]) {
   6753 			if (!nd_load(ndp, ippa->ip_param_name,
   6754 			    ip_param_get, ip_param_set, (caddr_t)ippa)) {
   6755 				nd_free(ndp);
   6756 				return (B_FALSE);
   6757 			}
   6758 		}
   6759 	}
   6760 
   6761 	for (; ipnd_cnt-- > 0; ipnd++) {
   6762 		if (ipnd->ip_ndp_name && ipnd->ip_ndp_name[0]) {
   6763 			if (!nd_load(ndp, ipnd->ip_ndp_name,
   6764 			    ipnd->ip_ndp_getf, ipnd->ip_ndp_setf,
   6765 			    ipnd->ip_ndp_data)) {
   6766 				nd_free(ndp);
   6767 				return (B_FALSE);
   6768 			}
   6769 		}
   6770 	}
   6771 
   6772 	return (B_TRUE);
   6773 }
   6774 
   6775 /* Named Dispatch routine to negotiate a new value for one of our parameters. */
   6776 /* ARGSUSED */
   6777 static int
   6778 ip_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr)
   6779 {
   6780 	long		new_value;
   6781 	ipparam_t	*ippa = (ipparam_t *)cp;
   6782 
   6783 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   6784 	    new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) {
   6785 		return (EINVAL);
   6786 	}
   6787 	ippa->ip_param_value = new_value;
   6788 	return (0);
   6789 }
   6790 
   6791 /*
   6792  * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases,
   6793  * When an ipf is passed here for the first time, if
   6794  * we already have in-order fragments on the queue, we convert from the fast-
   6795  * path reassembly scheme to the hard-case scheme.  From then on, additional
   6796  * fragments are reassembled here.  We keep track of the start and end offsets
   6797  * of each piece, and the number of holes in the chain.  When the hole count
   6798  * goes to zero, we are done!
   6799  *
   6800  * The ipf_count will be updated to account for any mblk(s) added (pointed to
   6801  * by mp) or subtracted (freeb()ed dups), upon return the caller must update
   6802  * ipfb_count and ill_frag_count by the difference of ipf_count before and
   6803  * after the call to ip_reassemble().
         *
         * Arguments:
         *	mp	first mblk of the new piece; its b_cont chain is walked
         *		one mblk at a time and each mblk is either linked into
         *		the queue or freed.
         *	ipf	reassembly state; the queued pieces hang off
         *		ipf_mp->b_cont and the last one is ipf_tail_mp.
         *	start	byte offset of the first byte of mp.
         *	more	non-zero when further fragments follow this one; it is
         *		only ever tested for truth here.
         *	ill	used here only to bump the per-ill MIB counters.
         *	msg_len	byte count added to ipf_count for the new mblk(s).
         *
         * Returns IP_REASS_FAILED when the new piece conflicts with a last
         * fragment seen earlier, IP_REASS_PARTIAL while holes remain, and
         * IP_REASS_COMPLETE once ipf_hole_cnt is zero (the start/end marks on
         * the queued mblks are cleared before returning in that case).
   6804  */
   6805 int
   6806 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
   6807     size_t msg_len)
   6808 {
   6809 	uint_t	end;
   6810 	mblk_t	*next_mp;
   6811 	mblk_t	*mp1;
   6812 	uint_t	offset;
        	/* Ensures ipf_num_dups is bumped at most once per call. */
   6813 	boolean_t incr_dups = B_TRUE;
        	/* Set once ipf_nf_hdr_len has been deducted from 'end'. */
   6814 	boolean_t offset_zero_seen = B_FALSE;
        	/* Set once the last-fragment boundary check has passed. */
   6815 	boolean_t pkt_boundary_checked = B_FALSE;
   6816 
   6817 	/* If start == 0 then ipf_nf_hdr_len has to be set. */
   6818 	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
   6819 
   6820 	/* Add in byte count */
   6821 	ipf->ipf_count += msg_len;
   6822 	if (ipf->ipf_end) {
   6823 		/*
   6824 		 * We were part way through in-order reassembly, but now there
   6825 		 * is a hole.  We walk through messages already queued, and
   6826 		 * mark them for hard case reassembly.  We know that up till
   6827 		 * now they were in order starting from offset zero.
   6828 		 */
   6829 		offset = 0;
   6830 		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
   6831 			IP_REASS_SET_START(mp1, offset);
   6832 			if (offset == 0) {
   6833 				ASSERT(ipf->ipf_nf_hdr_len != 0);
        				/*
        				 * offset is unsigned, so this wraps; adding
        				 * the mblk length below then yields that
        				 * length minus the header length.
        				 */
   6834 				offset = -ipf->ipf_nf_hdr_len;
   6835 			}
   6836 			offset += mp1->b_wptr - mp1->b_rptr;
   6837 			IP_REASS_SET_END(mp1, offset);
   6838 		}
   6839 		/* One hole at the end. */
   6840 		ipf->ipf_hole_cnt = 1;
   6841 		/* Brand it as a hard case, forever. */
   6842 		ipf->ipf_end = 0;
   6843 	}
   6844 	/* Walk through all the new pieces. */
   6845 	do {
   6846 		end = start + (mp->b_wptr - mp->b_rptr);
   6847 		/*
   6848 		 * If start is 0, decrease 'end' only for the first mblk of
   6849 		 * the fragment. Otherwise 'end' can get wrong value in the
   6850 		 * second pass of the loop if first mblk is exactly the
   6851 		 * size of ipf_nf_hdr_len.
   6852 		 */
   6853 		if (start == 0 && !offset_zero_seen) {
   6854 			/* First segment */
   6855 			ASSERT(ipf->ipf_nf_hdr_len != 0);
   6856 			end -= ipf->ipf_nf_hdr_len;
   6857 			offset_zero_seen = B_TRUE;
   6858 		}
   6859 		next_mp = mp->b_cont;
   6860 		/*
   6861 		 * We are checking to see if there is any interesting data
   6862 		 * to process.  If there isn't and the mblk isn't the
   6863 		 * one which carries the unfragmentable header then we
   6864 		 * drop it.  It's possible to have just the unfragmentable
   6865 		 * header come through without any data.  That needs to be
   6866 		 * saved.
   6867 		 *
   6868 		 * If the assert at the top of this function holds then the
   6869 		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
   6870 		 * is infrequently traveled enough that the test is left in
   6871 		 * to protect against future code changes which break that
   6872 		 * invariant.
        		 *
        		 * NOTE(review): unlike the other freeb() paths in this
        		 * function, ipf_count is not reduced for the mblk freed
        		 * here - confirm whether that is intended.
   6873 		 */
   6874 		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
   6875 			/* Empty.  Blast it. */
   6876 			IP_REASS_SET_START(mp, 0);
   6877 			IP_REASS_SET_END(mp, 0);
   6878 			/*
   6879 			 * If the ipf points to the mblk we are about to free,
   6880 			 * update ipf to point to the next mblk (or NULL
   6881 			 * if none).
   6882 			 */
   6883 			if (ipf->ipf_mp->b_cont == mp)
   6884 				ipf->ipf_mp->b_cont = next_mp;
   6885 			freeb(mp);
   6886 			continue;
   6887 		}
        		/* Detach mp; the rest of the chain is held in next_mp. */
   6888 		mp->b_cont = NULL;
   6889 		IP_REASS_SET_START(mp, start);
   6890 		IP_REASS_SET_END(mp, end);
        		/* Nothing queued yet: mp becomes both head and tail. */
   6891 		if (!ipf->ipf_tail_mp) {
   6892 			ipf->ipf_tail_mp = mp;
   6893 			ipf->ipf_mp->b_cont = mp;
   6894 			if (start == 0 || !more) {
   6895 				ipf->ipf_hole_cnt = 1;
   6896 				/*
   6897 				 * if the first fragment comes in more than one
   6898 				 * mblk, this loop will be executed for each
   6899 				 * mblk. Need to adjust hole count so exiting
   6900 				 * this routine will leave hole count at 1.
   6901 				 */
   6902 				if (next_mp)
   6903 					ipf->ipf_hole_cnt++;
   6904 			} else
   6905 				ipf->ipf_hole_cnt = 2;
   6906 			continue;
   6907 		} else if (ipf->ipf_last_frag_seen && !more &&
   6908 		    !pkt_boundary_checked) {
   6909 			/*
   6910 			 * We check datagram boundary only if this fragment
   6911 			 * claims to be the last fragment and we have seen a
   6912 			 * last fragment in the past too. We do this only
   6913 			 * once for a given fragment.
   6914 			 *
   6915 			 * start cannot be 0 here as fragments with start=0
   6916 			 * and MF=0 gets handled as a complete packet. These
   6917 			 * fragments should not reach here.
        			 *
        			 * NOTE(review): mp->b_cont was cleared above, so
        			 * msgdsize(mp) presumably covers only this mblk,
        			 * and next_mp is not linked anywhere when we
        			 * return IP_REASS_FAILED - confirm the caller copes
        			 * with a fragment that spans several mblks.
   6918 			 */
   6919 
   6920 			if (start + msgdsize(mp) !=
   6921 			    IP_REASS_END(ipf->ipf_tail_mp)) {
   6922 				/*
   6923 				 * We have two fragments both of which claim
   6924 				 * to be the last fragment but gives conflicting
   6925 				 * information about the whole datagram size.
   6926 				 * Something fishy is going on. Drop the
   6927 				 * fragment and free up the reassembly list.
   6928 				 */
   6929 				return (IP_REASS_FAILED);
   6930 			}
   6931 
   6932 			/*
   6933 			 * We shouldn't come to this code block again for this
   6934 			 * particular fragment.
   6935 			 */
   6936 			pkt_boundary_checked = B_TRUE;
   6937 		}
   6938 
   6939 		/* New stuff at or beyond tail? */
   6940 		offset = IP_REASS_END(ipf->ipf_tail_mp);
   6941 		if (start >= offset) {
   6942 			if (ipf->ipf_last_frag_seen) {
   6943 				/* current fragment is beyond last fragment */
   6944 				return (IP_REASS_FAILED);
   6945 			}
   6946 			/* Link it on end. */
   6947 			ipf->ipf_tail_mp->b_cont = mp;
   6948 			ipf->ipf_tail_mp = mp;
        			/*
        			 * A gap before a non-final piece adds a hole; a
        			 * final piece that abuts the old tail, with no more
        			 * mblks to come, closes the hole at the end.
        			 */
   6949 			if (more) {
   6950 				if (start != offset)
   6951 					ipf->ipf_hole_cnt++;
   6952 			} else if (start == offset && next_mp == NULL)
   6953 					ipf->ipf_hole_cnt--;
   6954 			continue;
   6955 		}
   6956 		mp1 = ipf->ipf_mp->b_cont;
   6957 		offset = IP_REASS_START(mp1);
   6958 		/* New stuff at the front? */
   6959 		if (start < offset) {
   6960 			if (start == 0) {
   6961 				if (end >= offset) {
   6962 					/* Nailed the hole at the beginning. */
   6963 					ipf->ipf_hole_cnt--;
   6964 				}
   6965 			} else if (end < offset) {
   6966 				/*
   6967 				 * A hole, stuff, and a hole where there used
   6968 				 * to be just a hole.
   6969 				 */
   6970 				ipf->ipf_hole_cnt++;
   6971 			}
   6972 			mp->b_cont = mp1;
   6973 			/* Check for overlap. */
   6974 			while (end > offset) {
        				/*
        				 * mp ends inside mp1: trim the tail of mp back
        				 * to where mp1 starts and stop.
        				 */
   6975 				if (end < IP_REASS_END(mp1)) {
   6976 					mp->b_wptr -= end - offset;
   6977 					IP_REASS_SET_END(mp, offset);
   6978 					BUMP_MIB(ill->ill_ip_mib,
   6979 					    ipIfStatsReasmPartDups);
   6980 					break;
   6981 				}
   6982 				/* Did we cover another hole? */
   6983 				if ((mp1->b_cont &&
   6984 				    IP_REASS_END(mp1) !=
   6985 				    IP_REASS_START(mp1->b_cont) &&
   6986 				    end >= IP_REASS_START(mp1->b_cont)) ||
   6987 				    (!ipf->ipf_last_frag_seen && !more)) {
   6988 					ipf->ipf_hole_cnt--;
   6989 				}
   6990 				/* Clip out mp1. */
   6991 				if ((mp->b_cont = mp1->b_cont) == NULL) {
   6992 					/*
   6993 					 * After clipping out mp1, this guy
   6994 					 * is now hanging off the end.
   6995 					 */
   6996 					ipf->ipf_tail_mp = mp;
   6997 				}
   6998 				IP_REASS_SET_START(mp1, 0);
   6999 				IP_REASS_SET_END(mp1, 0);
   7000 				/* Subtract byte count */
   7001 				ipf->ipf_count -= mp1->b_datap->db_lim -
   7002 				    mp1->b_datap->db_base;
   7003 				freeb(mp1);
   7004 				BUMP_MIB(ill->ill_ip_mib,
   7005 				    ipIfStatsReasmPartDups);
   7006 				mp1 = mp->b_cont;
   7007 				if (!mp1)
   7008 					break;
   7009 				offset = IP_REASS_START(mp1);
   7010 			}
        			/* mp is the new head of the queued chain. */
   7011 			ipf->ipf_mp->b_cont = mp;
   7012 			continue;
   7013 		}
   7014 		/*
   7015 		 * The new piece starts somewhere between the start of the head
   7016 		 * and before the end of the tail.
   7017 		 */
   7018 		for (; mp1; mp1 = mp1->b_cont) {
   7019 			offset = IP_REASS_END(mp1);
   7020 			if (start < offset) {
   7021 				if (end <= offset) {
   7022 					/* Nothing new. */
   7023 					IP_REASS_SET_START(mp, 0);
   7024 					IP_REASS_SET_END(mp, 0);
   7025 					/* Subtract byte count */
   7026 					ipf->ipf_count -= mp->b_datap->db_lim -
   7027 					    mp->b_datap->db_base;
   7028 					if (incr_dups) {
   7029 						ipf->ipf_num_dups++;
   7030 						incr_dups = B_FALSE;
   7031 					}
   7032 					freeb(mp);
   7033 					BUMP_MIB(ill->ill_ip_mib,
   7034 					    ipIfStatsReasmDuplicates);
   7035 					break;
   7036 				}
   7037 				/*
   7038 				 * Trim redundant stuff off beginning of new
   7039 				 * piece.
   7040 				 */
   7041 				IP_REASS_SET_START(mp, offset);
   7042 				mp->b_rptr += offset - start;
   7043 				BUMP_MIB(ill->ill_ip_mib,
   7044 				    ipIfStatsReasmPartDups);
   7045 				start = offset;
   7046 				if (!mp1->b_cont) {
   7047 					/*
   7048 					 * After trimming, this guy is now
   7049 					 * hanging off the end.
   7050 					 */
   7051 					mp1->b_cont = mp;
   7052 					ipf->ipf_tail_mp = mp;
   7053 					if (!more) {
   7054 						ipf->ipf_hole_cnt--;
   7055 					}
   7056 					break;
   7057 				}
   7058 			}
        			/*
        			 * Not before the next queued piece yet: keep walking.
        			 *
        			 * NOTE(review): mp1->b_cont is dereferenced here
        			 * without a NULL check; presumably it cannot be NULL
        			 * because pieces starting at or beyond the tail's end
        			 * were linked on above - confirm.
        			 */
   7059 			if (start >= IP_REASS_START(mp1->b_cont))
   7060 				continue;
   7061 			/* Fill a hole */
   7062 			if (start > offset)
   7063 				ipf->ipf_hole_cnt++;
        			/* Splice mp in between mp1 and its successor. */
   7064 			mp->b_cont = mp1->b_cont;
   7065 			mp1->b_cont = mp;
   7066 			mp1 = mp->b_cont;
   7067 			offset = IP_REASS_START(mp1);
   7068 			if (end >= offset) {
   7069 				ipf->ipf_hole_cnt--;
   7070 				/* Check for overlap. */
   7071 				while (end > offset) {
   7072 					if (end < IP_REASS_END(mp1)) {
   7073 						mp->b_wptr -= end - offset;
   7074 						IP_REASS_SET_END(mp, offset);
   7075 						/*
   7076 						 * TODO we might bump
   7077 						 * this up twice if there is
   7078 						 * overlap at both ends.
   7079 						 */
   7080 						BUMP_MIB(ill->ill_ip_mib,
   7081 						    ipIfStatsReasmPartDups);
   7082 						break;
   7083 					}
   7084 					/* Did we cover another hole? */
   7085 					if ((mp1->b_cont &&
   7086 					    IP_REASS_END(mp1)
   7087 					    != IP_REASS_START(mp1->b_cont) &&
   7088 					    end >=
   7089 					    IP_REASS_START(mp1->b_cont)) ||
   7090 					    (!ipf->ipf_last_frag_seen &&
   7091 					    !more)) {
   7092 						ipf->ipf_hole_cnt--;
   7093 					}
   7094 					/* Clip out mp1. */
   7095 					if ((mp->b_cont = mp1->b_cont) ==
   7096 					    NULL) {
   7097 						/*
   7098 						 * After clipping out mp1,
   7099 						 * this guy is now hanging
   7100 						 * off the end.
   7101 						 */
   7102 						ipf->ipf_tail_mp = mp;
   7103 					}
   7104 					IP_REASS_SET_START(mp1, 0);
   7105 					IP_REASS_SET_END(mp1, 0);
   7106 					/* Subtract byte count */
   7107 					ipf->ipf_count -=
   7108 					    mp1->b_datap->db_lim -
   7109 					    mp1->b_datap->db_base;
   7110 					freeb(mp1);
   7111 					BUMP_MIB(ill->ill_ip_mib,
   7112 					    ipIfStatsReasmPartDups);
   7113 					mp1 = mp->b_cont;
   7114 					if (!mp1)
   7115 						break;
   7116 					offset = IP_REASS_START(mp1);
   7117 				}
   7118 			}
   7119 			break;
   7120 		}
        		/*
        		 * Every 'continue' above lands on the loop condition below,
        		 * so the next mblk always starts where this one ended.
        		 */
   7121 	} while (start = end, mp = next_mp);
   7122 
   7123 	/* Fragment just processed could be the last one. Remember this fact */
   7124 	if (!more)
   7125 		ipf->ipf_last_frag_seen = B_TRUE;
   7126 
   7127 	/* Still got holes? */
   7128 	if (ipf->ipf_hole_cnt)
   7129 		return (IP_REASS_PARTIAL);
   7130 	/* Clean up overloaded fields to avoid upstream disasters. */
   7131 	for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
   7132 		IP_REASS_SET_START(mp1, 0);
   7133 		IP_REASS_SET_END(mp1, 0);
   7134 	}
   7135 	return (IP_REASS_COMPLETE);
   7136 }
   7137 
   7138 /*
   7139  * Fragmentation reassembly.  Each ILL has a hash table for
   7140  * queuing packets undergoing reassembly for all IPIFs
   7141  * associated with the ILL.  The hash is based on the packet
   7142  * IP ident field.  The ILL frag hash table was allocated
   7143  * as a timer block at the time the ILL was created.  Whenever
   7144  * there is anything on the reassembly queue, the timer will
   7145  * be running.  Returns the reassembled packet if reassembly completes.
   7146  */
   7147 mblk_t *
   7148 ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   7149 {
   7150 	uint32_t	frag_offset_flags;
   7151 	mblk_t		*t_mp;
   7152 	ipaddr_t	dst;
   7153 	uint8_t		proto = ipha->ipha_protocol;
   7154 	uint32_t	sum_val;
   7155 	uint16_t	sum_flags;
   7156 	ipf_t		*ipf;
   7157 	ipf_t		**ipfp;
   7158 	ipfb_t		*ipfb;
   7159 	uint16_t	ident;
   7160 	uint32_t	offset;
   7161 	ipaddr_t	src;
   7162 	uint_t		hdr_length;
   7163 	uint32_t	end;
   7164 	mblk_t		*mp1;
   7165 	mblk_t		*tail_mp;
   7166 	size_t		count;
   7167 	size_t		msg_len;
   7168 	uint8_t		ecn_info = 0;
   7169 	uint32_t	packet_size;
   7170 	boolean_t	pruned = B_FALSE;
   7171 	ill_t		*ill = ira->ira_ill;
   7172 	ip_stack_t	*ipst = ill->ill_ipst;
   7173 
   7174 	/*
   7175 	 * Drop the fragmented as early as possible, if
   7176 	 * we don't have resource(s) to re-assemble.
   7177 	 */
   7178 	if (ipst->ips_ip_reass_queue_bytes == 0) {
   7179 		freemsg(mp);
   7180 		return (NULL);
   7181 	}
   7182 
   7183 	/* Check for fragmentation offset; return if there's none */
   7184 	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
   7185 	    (IPH_MF | IPH_OFFSET)) == 0)
   7186 		return (mp);
   7187 
   7188 	/*
   7189 	 * We utilize hardware computed checksum info only for UDP since
   7190 	 * IP fragmentation is a normal occurrence for the protocol.  In
   7191 	 * addition, checksum offload support for IP fragments carrying
   7192 	 * UDP payload is commonly implemented across network adapters.
   7193 	 */
   7194 	ASSERT(ira->ira_rill != NULL);
   7195 	if (proto == IPPROTO_UDP && dohwcksum &&
   7196 	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
   7197 	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
   7198 		mblk_t *mp1 = mp->b_cont;
   7199 		int32_t len;
   7200 
   7201 		/* Record checksum information from the packet */
   7202 		sum_val = (uint32_t)DB_CKSUM16(mp);
   7203 		sum_flags = DB_CKSUMFLAGS(mp);
   7204 
   7205 		/* IP payload offset from beginning of mblk */
   7206 		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
   7207 
   7208 		if ((sum_flags & HCK_PARTIALCKSUM) &&
   7209 		    (mp1 == NULL || mp1->b_cont == NULL) &&
   7210 		    offset >= DB_CKSUMSTART(mp) &&
   7211 		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
   7212 			uint32_t adj;
   7213 			/*
   7214 			 * Partial checksum has been calculated by hardware
   7215 			 * and attached to the packet; in addition, any
   7216 			 * prepended extraneous data is even byte aligned.
   7217 			 * If any such data exists, we adjust the checksum;
   7218 			 * this would also handle any postpended data.
   7219 			 */
   7220 			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
   7221 			    mp, mp1, len, adj);
   7222 
   7223 			/* One's complement subtract extraneous checksum */
   7224 			if (adj >= sum_val)
   7225 				sum_val = ~(adj - sum_val) & 0xFFFF;
   7226 			else
   7227 				sum_val -= adj;
   7228 		}
   7229 	} else {
   7230 		sum_val = 0;
   7231 		sum_flags = 0;
   7232 	}
   7233 
   7234 	/* Clear hardware checksumming flag */
   7235 	DB_CKSUMFLAGS(mp) = 0;
   7236 
   7237 	ident = ipha->ipha_ident;
   7238 	offset = (frag_offset_flags << 3) & 0xFFFF;
   7239 	src = ipha->ipha_src;
   7240 	dst = ipha->ipha_dst;
   7241 	hdr_length = IPH_HDR_LENGTH(ipha);
   7242 	end = ntohs(ipha->ipha_length) - hdr_length;
   7243 
   7244 	/* If end == 0 then we have a packet with no data, so just free it */
   7245 	if (end == 0) {
   7246 		freemsg(mp);
   7247 		return (NULL);
   7248 	}
   7249 
   7250 	/* Record the ECN field info. */
   7251 	ecn_info = (ipha->ipha_type_of_service & 0x3);
   7252 	if (offset != 0) {
   7253 		/*
   7254 		 * If this isn't the first piece, strip the header, and
   7255 		 * add the offset to the end value.
   7256 		 */
   7257 		mp->b_rptr += hdr_length;
   7258 		end += offset;
   7259 	}
   7260 
   7261 	/* Handle vnic loopback of fragments */
   7262 	if (mp->b_datap->db_ref > 2)
   7263 		msg_len = 0;
   7264 	else
   7265 		msg_len = MBLKSIZE(mp);
   7266 
   7267 	tail_mp = mp;
   7268 	while (tail_mp->b_cont != NULL) {
   7269 		tail_mp = tail_mp->b_cont;
   7270 		if (tail_mp->b_datap->db_ref <= 2)
   7271 			msg_len += MBLKSIZE(tail_mp);
   7272 	}
   7273 
   7274 	/* If the reassembly list for this ILL will get too big, prune it */
   7275 	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
   7276 	    ipst->ips_ip_reass_queue_bytes) {
   7277 		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
   7278 		    uint_t, ill->ill_frag_count,
   7279 		    uint_t, ipst->ips_ip_reass_queue_bytes);
   7280 		ill_frag_prune(ill,
   7281 		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
   7282 		    (ipst->ips_ip_reass_queue_bytes - msg_len));
   7283 		pruned = B_TRUE;
   7284 	}
   7285 
   7286 	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
   7287 	mutex_enter(&ipfb->ipfb_lock);
   7288 
   7289 	ipfp = &ipfb->ipfb_ipf;
   7290 	/* Try to find an existing fragment queue for this packet. */
   7291 	for (;;) {
   7292 		ipf = ipfp[0];
   7293 		if (ipf != NULL) {
   7294 			/*
   7295 			 * It has to match on ident and src/dst address.
   7296 			 */
   7297 			if (ipf->ipf_ident == ident &&
   7298 			    ipf->ipf_src == src &&
   7299 			    ipf->ipf_dst == dst &&
   7300 			    ipf->ipf_protocol == proto) {
   7301 				/*
   7302 				 * If we have received too many
   7303 				 * duplicate fragments for this packet
   7304 				 * free it.
   7305 				 */
   7306 				if (ipf->ipf_num_dups > ip_max_frag_dups) {
   7307 					ill_frag_free_pkts(ill, ipfb, ipf, 1);
   7308 					freemsg(mp);
   7309 					mutex_exit(&ipfb->ipfb_lock);
   7310 					return (NULL);
   7311 				}
   7312 				/* Found it. */
   7313 				break;
   7314 			}
   7315 			ipfp = &ipf->ipf_hash_next;
   7316 			continue;
   7317 		}
   7318 
   7319 		/*
   7320 		 * If we pruned the list, do we want to store this new
   7321 		 * fragment?. We apply an optimization here based on the
   7322 		 * fact that most fragments will be received in order.
   7323 		 * So if the offset of this incoming fragment is zero,
   7324 		 * it is the first fragment of a new packet. We will
   7325 		 * keep it.  Otherwise drop the fragment, as we have
   7326 		 * probably pruned the packet already (since the
   7327 		 * packet cannot be found).
   7328 		 */
   7329 		if (pruned && offset != 0) {
   7330 			mutex_exit(&ipfb->ipfb_lock);
   7331 			freemsg(mp);
   7332 			return (NULL);
   7333 		}
   7334 
   7335 		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
   7336 			/*
   7337 			 * Too many fragmented packets in this hash
   7338 			 * bucket. Free the oldest.
   7339 			 */
   7340 			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
   7341 		}
   7342 
   7343 		/* New guy.  Allocate a frag message. */
   7344 		mp1 = allocb(sizeof (*ipf), BPRI_MED);
   7345 		if (mp1 == NULL) {
   7346 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7347 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7348 			freemsg(mp);
   7349 reass_done:
   7350 			mutex_exit(&ipfb->ipfb_lock);
   7351 			return (NULL);
   7352 		}
   7353 
   7354 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
   7355 		mp1->b_cont = mp;
   7356 
   7357 		/* Initialize the fragment header. */
   7358 		ipf = (ipf_t *)mp1->b_rptr;
   7359 		ipf->ipf_mp = mp1;
   7360 		ipf->ipf_ptphn = ipfp;
   7361 		ipfp[0] = ipf;
   7362 		ipf->ipf_hash_next = NULL;
   7363 		ipf->ipf_ident = ident;
   7364 		ipf->ipf_protocol = proto;
   7365 		ipf->ipf_src = src;
   7366 		ipf->ipf_dst = dst;
   7367 		ipf->ipf_nf_hdr_len = 0;
   7368 		/* Record reassembly start time. */
   7369 		ipf->ipf_timestamp = gethrestime_sec();
   7370 		/* Record ipf generation and account for frag header */
   7371 		ipf->ipf_gen = ill->ill_ipf_gen++;
   7372 		ipf->ipf_count = MBLKSIZE(mp1);
   7373 		ipf->ipf_last_frag_seen = B_FALSE;
   7374 		ipf->ipf_ecn = ecn_info;
   7375 		ipf->ipf_num_dups = 0;
   7376 		ipfb->ipfb_frag_pkts++;
   7377 		ipf->ipf_checksum = 0;
   7378 		ipf->ipf_checksum_flags = 0;
   7379 
   7380 		/* Store checksum value in fragment header */
   7381 		if (sum_flags != 0) {
   7382 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7383 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7384 			ipf->ipf_checksum = sum_val;
   7385 			ipf->ipf_checksum_flags = sum_flags;
   7386 		}
   7387 
   7388 		/*
   7389 		 * We handle reassembly two ways.  In the easy case,
   7390 		 * where all the fragments show up in order, we do
   7391 		 * minimal bookkeeping, and just clip new pieces on
   7392 		 * the end.  If we ever see a hole, then we go off
   7393 		 * to ip_reassemble which has to mark the pieces and
   7394 		 * keep track of the number of holes, etc.  Obviously,
   7395 		 * the point of having both mechanisms is so we can
   7396 		 * handle the easy case as efficiently as possible.
   7397 		 */
   7398 		if (offset == 0) {
   7399 			/* Easy case, in-order reassembly so far. */
   7400 			ipf->ipf_count += msg_len;
   7401 			ipf->ipf_tail_mp = tail_mp;
   7402 			/*
   7403 			 * Keep track of next expected offset in
   7404 			 * ipf_end.
   7405 			 */
   7406 			ipf->ipf_end = end;
   7407 			ipf->ipf_nf_hdr_len = hdr_length;
   7408 		} else {
   7409 			/* Hard case, hole at the beginning. */
   7410 			ipf->ipf_tail_mp = NULL;
   7411 			/*
   7412 			 * ipf_end == 0 means that we have given up
   7413 			 * on easy reassembly.
   7414 			 */
   7415 			ipf->ipf_end = 0;
   7416 
   7417 			/* Forget checksum offload from now on */
   7418 			ipf->ipf_checksum_flags = 0;
   7419 
   7420 			/*
   7421 			 * ipf_hole_cnt is set by ip_reassemble.
   7422 			 * ipf_count is updated by ip_reassemble.
   7423 			 * No need to check for return value here
   7424 			 * as we don't expect reassembly to complete
   7425 			 * or fail for the first fragment itself.
   7426 			 */
   7427 			(void) ip_reassemble(mp, ipf,
   7428 			    (frag_offset_flags & IPH_OFFSET) << 3,
   7429 			    (frag_offset_flags & IPH_MF), ill, msg_len);
   7430 		}
   7431 		/* Update per ipfb and ill byte counts */
   7432 		ipfb->ipfb_count += ipf->ipf_count;
   7433 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7434 		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
   7435 		/* If the frag timer wasn't already going, start it. */
   7436 		mutex_enter(&ill->ill_lock);
   7437 		ill_frag_timer_start(ill);
   7438 		mutex_exit(&ill->ill_lock);
   7439 		goto reass_done;
   7440 	}
   7441 
   7442 	/*
   7443 	 * If the packet's flag has changed (it could be coming up
   7444 	 * from an interface different than the previous, therefore
   7445 	 * possibly different checksum capability), then forget about
   7446 	 * any stored checksum states.  Otherwise add the value to
   7447 	 * the existing one stored in the fragment header.
   7448 	 */
   7449 	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
   7450 		sum_val += ipf->ipf_checksum;
   7451 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7452 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7453 		ipf->ipf_checksum = sum_val;
   7454 	} else if (ipf->ipf_checksum_flags != 0) {
   7455 		/* Forget checksum offload from now on */
   7456 		ipf->ipf_checksum_flags = 0;
   7457 	}
   7458 
   7459 	/*
   7460 	 * We have a new piece of a datagram which is already being
   7461 	 * reassembled.  Update the ECN info if all IP fragments
   7462 	 * are ECN capable.  If there is one which is not, clear
   7463 	 * all the info.  If there is at least one which has CE
   7464 	 * code point, IP needs to report that up to transport.
   7465 	 */
   7466 	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
   7467 		if (ecn_info == IPH_ECN_CE)
   7468 			ipf->ipf_ecn = IPH_ECN_CE;
   7469 	} else {
   7470 		ipf->ipf_ecn = IPH_ECN_NECT;
   7471 	}
   7472 	if (offset && ipf->ipf_end == offset) {
   7473 		/* The new fragment fits at the end */
   7474 		ipf->ipf_tail_mp->b_cont = mp;
   7475 		/* Update the byte count */
   7476 		ipf->ipf_count += msg_len;
   7477 		/* Update per ipfb and ill byte counts */
   7478 		ipfb->ipfb_count += msg_len;
   7479 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7480 		atomic_add_32(&ill->ill_frag_count, msg_len);
   7481 		if (frag_offset_flags & IPH_MF) {
   7482 			/* More to come. */
   7483 			ipf->ipf_end = end;
   7484 			ipf->ipf_tail_mp = tail_mp;
   7485 			goto reass_done;
   7486 		}
   7487 	} else {
   7488 		/* Go do the hard cases. */
   7489 		int ret;
   7490 
   7491 		if (offset == 0)
   7492 			ipf->ipf_nf_hdr_len = hdr_length;
   7493 
   7494 		/* Save current byte count */
   7495 		count = ipf->ipf_count;
   7496 		ret = ip_reassemble(mp, ipf,
   7497 		    (frag_offset_flags & IPH_OFFSET) << 3,
   7498 		    (frag_offset_flags & IPH_MF), ill, msg_len);
   7499 		/* Count of bytes added and subtracted (freeb()ed) */
   7500 		count = ipf->ipf_count - count;
   7501 		if (count) {
   7502 			/* Update per ipfb and ill byte counts */
   7503 			ipfb->ipfb_count += count;
   7504 			ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
   7505 			atomic_add_32(&ill->ill_frag_count, count);
   7506 		}
   7507 		if (ret == IP_REASS_PARTIAL) {
   7508 			goto reass_done;
   7509 		} else if (ret == IP_REASS_FAILED) {
   7510 			/* Reassembly failed. Free up all resources */
   7511 			ill_frag_free_pkts(ill, ipfb, ipf, 1);
   7512 			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
   7513 				IP_REASS_SET_START(t_mp, 0);
   7514 				IP_REASS_SET_END(t_mp, 0);
   7515 			}
   7516 			freemsg(mp);
   7517 			goto reass_done;
   7518 		}
   7519 		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
   7520 	}
   7521 	/*
   7522 	 * We have completed reassembly.  Unhook the frag header from
   7523 	 * the reassembly list.
   7524 	 *
   7525 	 * Before we free the frag header, record the ECN info
   7526 	 * to report back to the transport.
   7527 	 */
   7528 	ecn_info = ipf->ipf_ecn;
   7529 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
   7530 	ipfp = ipf->ipf_ptphn;
   7531 
   7532 	/* We need to supply these to caller */
   7533 	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
   7534 		sum_val = ipf->ipf_checksum;
   7535 	else
   7536 		sum_val = 0;
   7537 
   7538 	mp1 = ipf->ipf_mp;
   7539 	count = ipf->ipf_count;
   7540 	ipf = ipf->ipf_hash_next;
   7541 	if (ipf != NULL)
   7542 		ipf->ipf_ptphn = ipfp;
   7543 	ipfp[0] = ipf;
   7544 	atomic_add_32(&ill->ill_frag_count, -count);
   7545 	ASSERT(ipfb->ipfb_count >= count);
   7546 	ipfb->ipfb_count -= count;
   7547 	ipfb->ipfb_frag_pkts--;
   7548 	mutex_exit(&ipfb->ipfb_lock);
   7549 	/* Ditch the frag header. */
   7550 	mp = mp1->b_cont;
   7551 
   7552 	freeb(mp1);
   7553 
   7554 	/* Restore original IP length in header. */
   7555 	packet_size = (uint32_t)msgdsize(mp);
   7556 	if (packet_size > IP_MAXPACKET) {
   7557 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7558 		ip_drop_input("Reassembled packet too large", mp, ill);
   7559 		freemsg(mp);
   7560 		return (NULL);
   7561 	}
   7562 
   7563 	if (DB_REF(mp) > 1) {
   7564 		mblk_t *mp2 = copymsg(mp);
   7565 
   7566 		if (mp2 == NULL) {
   7567 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7568 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7569 			freemsg(mp);
   7570 			return (NULL);
   7571 		}
   7572 		freemsg(mp);
   7573 		mp = mp2;
   7574 	}
   7575 	ipha = (ipha_t *)mp->b_rptr;
   7576 
   7577 	ipha->ipha_length = htons((uint16_t)packet_size);
   7578 	/* We're now complete, zip the frag state */
   7579 	ipha->ipha_fragment_offset_and_flags = 0;
   7580 	/* Record the ECN info. */
   7581 	ipha->ipha_type_of_service &= 0xFC;
   7582 	ipha->ipha_type_of_service |= ecn_info;
   7583 
   7584 	/* Update the receive attributes */
   7585 	ira->ira_pktlen = packet_size;
   7586 	ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   7587 
   7588 	/* Reassembly is successful; set checksum information in packet */
   7589 	DB_CKSUM16(mp) = (uint16_t)sum_val;
   7590 	DB_CKSUMFLAGS(mp) = sum_flags;
   7591 	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
   7592 
   7593 	return (mp);
   7594 }
   7595 
   7596 /*
   7597  * Pullup function that should be used for IP input in order to
   7598  * ensure we do not lose the L2 source address; we need the L2 source
   7599  * address for IP_RECVSLLA and for ndp_input.
   7600  *
   7601  * We return either NULL or b_rptr.
   7602  */
   7603 void *
   7604 ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
   7605 {
   7606 	ill_t		*ill = ira->ira_ill;
   7607 
   7608 	if (ip_rput_pullups++ == 0) {
   7609 		(void) mi_strlog(ill->ill_rq, 1,