Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/dlpi.h>
     31 #include <sys/stropts.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/strsubr.h>
     34 #include <sys/strlog.h>
     35 #include <sys/strsun.h>
     36 #include <sys/zone.h>
     37 #define	_SUN_TPI_VERSION 2
     38 #include <sys/tihdr.h>
     39 #include <sys/xti_inet.h>
     40 #include <sys/ddi.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/kobj.h>
     44 #include <sys/modctl.h>
     45 #include <sys/atomic.h>
     46 #include <sys/policy.h>
     47 #include <sys/priv.h>
     48 #include <sys/taskq.h>
     49 
     50 #include <sys/systm.h>
     51 #include <sys/param.h>
     52 #include <sys/kmem.h>
     53 #include <sys/sdt.h>
     54 #include <sys/socket.h>
     55 #include <sys/vtrace.h>
     56 #include <sys/isa_defs.h>
     57 #include <sys/mac.h>
     58 #include <net/if.h>
     59 #include <net/if_arp.h>
     60 #include <net/route.h>
     61 #include <sys/sockio.h>
     62 #include <netinet/in.h>
     63 #include <net/if_dl.h>
     64 
     65 #include <inet/common.h>
     66 #include <inet/mi.h>
     67 #include <inet/mib2.h>
     68 #include <inet/nd.h>
     69 #include <inet/arp.h>
     70 #include <inet/snmpcom.h>
     71 #include <inet/optcom.h>
     72 #include <inet/kstatcom.h>
     73 
     74 #include <netinet/igmp_var.h>
     75 #include <netinet/ip6.h>
     76 #include <netinet/icmp6.h>
     77 #include <netinet/sctp.h>
     78 
     79 #include <inet/ip.h>
     80 #include <inet/ip_impl.h>
     81 #include <inet/ip6.h>
     82 #include <inet/ip6_asp.h>
     83 #include <inet/tcp.h>
     84 #include <inet/tcp_impl.h>
     85 #include <inet/ip_multi.h>
     86 #include <inet/ip_if.h>
     87 #include <inet/ip_ire.h>
     88 #include <inet/ip_ftable.h>
     89 #include <inet/ip_rts.h>
     90 #include <inet/ip_ndp.h>
     91 #include <inet/ip_listutils.h>
     92 #include <netinet/igmp.h>
     93 #include <netinet/ip_mroute.h>
     94 #include <inet/ipp_common.h>
     95 
     96 #include <net/pfkeyv2.h>
     97 #include <inet/ipsec_info.h>
     98 #include <inet/sadb.h>
     99 #include <inet/ipsec_impl.h>
    100 #include <sys/iphada.h>
    101 #include <inet/tun.h>
    102 #include <inet/ipdrop.h>
    103 #include <inet/ip_netinfo.h>
    104 
    105 #include <sys/ethernet.h>
    106 #include <net/if_types.h>
    107 #include <sys/cpuvar.h>
    108 
    109 #include <ipp/ipp.h>
    110 #include <ipp/ipp_impl.h>
    111 #include <ipp/ipgpc/ipgpc.h>
    112 
    113 #include <sys/multidata.h>
    114 #include <sys/pattr.h>
    115 
    116 #include <inet/ipclassifier.h>
    117 #include <inet/sctp_ip.h>
    118 #include <inet/sctp/sctp_impl.h>
    119 #include <inet/udp_impl.h>
    120 #include <inet/rawip_impl.h>
    121 #include <inet/rts_impl.h>
    122 
    123 #include <sys/tsol/label.h>
    124 #include <sys/tsol/tnet.h>
    125 
    126 #include <rpc/pmap_prot.h>
    127 #include <sys/squeue_impl.h>
    128 
    129 /*
    130  * Values for squeue switch:
    131  * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
    132  * IP_SQUEUE_ENTER: SQ_PROCESS
    133  * IP_SQUEUE_FILL: SQ_FILL
    134  */
    135 int ip_squeue_enter = 2;	/* Settable in /etc/system */
    136 
    137 int ip_squeue_flag;	/* NOTE(review): presumably an SQ_* value; confirm */
    138 #define	SET_BPREV_FLAG(x)	((mblk_t *)(uintptr_t)(x))
    139 
    140 /*
    141  * Tunables; settable in /etc/system.
    142  */
    143 int ip_poll_normal_ms = 100;
    144 int ip_poll_normal_ticks = 0;
    145 int ip_modclose_ackwait_ms = 3000;
    146 
    147 /*
    148  * Debug-support globals. Ideally these would be present only in DEBUG
    149  * systems, but the current design of the global symbol checking logic
    150  * requires them to be unconditionally present.
    151  */
    152 uint_t ip_thread_data;			/* TSD key for debug support */
    153 krwlock_t ip_thread_rwlock;	/* presumably guards ip_thread_list; confirm */
    154 list_t	ip_thread_list;
    155 
    156 /*
    157  * Head/tail pair describing a linked list of msgblks (mblk_t).
    158  * Used by the ip_snmp_ functions.
    159 
    160 struct listptr_s {
    161 	mblk_t	*lp_head;	/* first mblk in the list */
    162 	mblk_t	*lp_tail;	/* last mblk in the list */
    163 };
    164 
    165 typedef struct listptr_s listptr_t;
    166 
    167 /*
    168  * Carries the lists of return data, one listptr_t per table, for
    169  * ip_snmp_get_mib2_ip_route_media and ip_snmp_get_mib2_ip6_route_media.
    170  */
    171 typedef struct iproutedata_s {
    172 	uint_t		ird_idx;
    173 	uint_t		ird_flags;	/* IRD_* flags; see below */
    174 	listptr_t	ird_route;	/* ipRouteEntryTable */
    175 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    176 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    177 } iproutedata_t;
    178 
    179 #define	IRD_REPORT_TESTHIDDEN	0x01	/* include IRE_MARK_TESTHIDDEN routes */
    180 
    181 /*
    182  * Cluster-specific hooks; these should be NULL when not booted as a cluster.
    183  */
    184 
    185 /*
    186  * Hook functions to enable cluster networking.
    187  * On non-clustered systems these vectors must always be NULL.
    188  *
    189  * Hook function to check whether the specified IP address is a shared IP
    190  * address in the cluster.
    191  *
    192  */
    193 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    194     sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
    195 
    196 /*
    197  * Hook function to generate a cluster-wide IP fragment identifier.
    198  */
    199 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    200     sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    201     void *args) = NULL;
    202 
    203 /*
    204  * Hook function to generate a cluster-wide SPI.
    205  */
    206 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    207     void *) = NULL;
    208 
    209 /*
    210  * Hook function to verify whether the SPI is already utilized.
    211  */
    212 
    213 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    214 
    215 /*
    216  * Hook function to delete the SPI from the cluster-wide repository.
    217  */
    218 
    219 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    220 
    221 /*
    222  * Hook function to inform the cluster when a packet is received on an IDLE SA.
    223  */
    224 
    225 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    226     in6_addr_t, in6_addr_t, void *) = NULL;
    227 
    228 /*
    229  * Synchronization notes:
    230  *
    231  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    232  * MT level protection given by STREAMS. IP uses a combination of its own
    233  * internal serialization mechanism and standard Solaris locking techniques.
    234  * The internal serialization is per phyint.  This is used to serialize
    235  * plumbing operations, certain multicast operations, most set ioctls,
    236  * igmp/mld timers etc.
    237  *
    238  * Plumbing is a long sequence of operations involving message
    239  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    240  * involved in plumbing operations. A natural model is to serialize these
    241  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    242  * parallel without any interference. But various set ioctls on hme0 are best
    243  * serialized, along with multicast join/leave operations, igmp/mld timer
    244  * operations, and processing of DLPI control messages received from drivers
    245  * on a per phyint basis.  This serialization is provided by the ipsq_t and
    246  * primitives operating on this. Details can be found in ip_if.c above the
    247  * core primitives operating on ipsq_t.
    248  *
    249  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    250  * Similarly, lookup of an ire by a thread also returns a refheld ire.
    251  * In addition ipif's and ill's referenced by the ire are also indirectly
    252  * refheld. Thus no ipif or ill can vanish nor can critical parameters like
    253  * the ipif's address or netmask change as long as an ipif is refheld
    254  * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
    255  * address of an ipif has to go through the ipsq_t. This ensures that only
    256  * 1 such exclusive operation proceeds at any time on the ipif. It then
    257  * deletes all ires associated with this ipif, and waits for all refcnts
    258  * associated with this ipif to come down to zero. The address is changed
    259  * only after the ipif has been quiesced. Then the ipif is brought up again.
    260  * More details are described above the comment in ip_sioctl_flags.
    261  *
    262  * Packet processing is based mostly on IREs and is fully multi-threaded
    263  * using standard Solaris MT techniques.
    264  *
    265  * There are explicit locks in IP to handle:
    266  * - The ip_g_head list maintained by mi_open_link() and friends.
    267  *
    268  * - The reassembly data structures (one lock per hash bucket)
    269  *
    270  * - conn_lock is meant to protect conn_t fields. The fields actually
    271  *   protected by conn_lock are documented in the conn_t definition.
    272  *
    273  * - ire_lock to protect some of the fields of the ire, IRE tables
    274  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    275  *
    276  * - ndp_g_lock and nce_lock for protecting NCEs.
    277  *
    278  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    279  *
    280  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    281  *	* The AVL tree based global multi list of all ills.
    282  *	* The linked list of all ipifs of an ill
    283  *	* The <ipsq-xop> mapping
    284  *	* <ill-phyint> association
    285  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    286  *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
    287  *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
    288  *   writer for the actual duration of the insertion/deletion/change.
    289  *
    290  * - ill_lock:  This is a per ill mutex.
    291  *   It protects some members of the ill_t struct; see ip.h for details.
    292  *   It also protects the <ill-phyint> assoc.
    293  *   It also protects the list of ipifs hanging off the ill.
    294  *
    295  * - ipsq_lock: This is a per ipsq_t mutex lock.
    296  *   This protects some members of the ipsq_t struct; see ip.h for details.
    297  *   It also protects the <ipsq-ipxop> mapping
    298  *
    299  * - ipx_lock: This is a per ipxop_t mutex lock.
    300  *   This protects some members of the ipxop_t struct; see ip.h for details.
    301  *
    302  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    303  *   phyint_flags
    304  *
    305  * - ip_g_nd_lock: This is a global reader/writer lock.
    306  *   Any call to nd_load to load a new parameter to the ND table must hold the
    307  *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
    308  *   as reader.
    309  *
    310  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    311  *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
    312  *   uniqueness check also done atomically.
    313  *
    314  * - ipsec_capab_ills_lock: This readers/writer lock protects the global
    315  *   lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
    316  *   as a writer when adding or deleting elements from these lists, and
    317  *   as a reader when walking these lists to send a SADB update to the
    318  *   IPsec capable ills.
    319  *
    320  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    321  *   group list linked by ill_usesrc_grp_next. It also protects the
    322  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    323  *   group is being added or deleted.  This lock is taken as a reader when
    324  *   walking the list/group(eg: to get the number of members in a usesrc group).
    325  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    326  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    327  *   example, it is not necessary to take this lock in the initial portion
    328  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
    329  *   operations are executed exclusively and that ensures that the "usesrc
    330  *   group state" cannot change. The "usesrc group state" change can happen
    331  *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
    332  *
    333  * Changing <ill-phyint>, <ipsq-xop> associations:
    334  *
    335  * To change the <ill-phyint> association, the ill_g_lock must be held
    336  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    337  * must be held.
    338  *
    339  * To change the <ipsq-xop> association, the ill_g_lock must be held as
    340  * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
    341  * This is only done when ills are added or removed from IPMP groups.
    342  *
    343  * To add or delete an ipif from the list of ipifs hanging off the ill,
    344  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    345  * a writer on the associated ipsq.
    346  *
    347  * To add or delete an ill to the system, the ill_g_lock must be held as
    348  * writer and the thread must be a writer on the associated ipsq.
    349  *
    350  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    351  * must be a writer on the associated ipsq.
    352  *
    353  * Lock hierarchy
    354  *
    355  * Some lock hierarchy scenarios are listed below.
    356  *
    357  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
    358  * ill_g_lock -> ill_lock(s) -> phyint_lock
    359  * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
    360  * ill_g_lock -> ip_addr_avail_lock
    361  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    362  * ill_g_lock -> ip_g_nd_lock
    363  *
    364  * When more than 1 ill lock is needed to be held, all ill lock addresses
    365  * are sorted on address and locked starting from highest addressed lock
    366  * downward.
    367  *
    368  * IPsec scenarios
    369  *
    370  * ipsa_lock -> ill_g_lock -> ill_lock
    371  * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
    372  * ipsec_capab_ills_lock -> ipsa_lock
    373  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    374  *
    375  * Trusted Solaris scenarios
    376  *
    377  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    378  * igsa_lock -> gcdb_lock
    379  * gcgrp_rwlock -> ire_lock
    380  * gcgrp_rwlock -> gcdb_lock
    381  *
    382  * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
    383  *
    384  * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
    385  * sq_lock -> conn_lock -> QLOCK(q)
    386  * ill_lock -> ft_lock -> fe_lock
    387  *
    388  * Routing/forwarding table locking notes:
    389  *
    390  * Lock acquisition order: Radix tree lock, irb_lock.
    391  * Requirements:
    392  * i.  Walker must not hold any locks during the walker callback.
    393  * ii  Walker must not see a truncated tree during the walk because of any node
    394  *     deletion.
    395  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    396  *     in many places in the code to walk the irb list. Thus even if all the
    397  *     ires in a bucket have been deleted, we still can't free the radix node
    398  *     until the ires have actually been inactive'd (freed).
    399  *
    400  * Tree traversal - Need to hold the global tree lock in read mode.
    401  * Before dropping the global tree lock, need to either increment the ire_refcnt
    402  * to ensure that the radix node can't be deleted.
    403  *
    404  * Tree add - Need to hold the global tree lock in write mode to add a
    405  * radix node. To prevent the node from being deleted, increment the
    406  * irb_refcnt, after the node is added to the tree. The ire itself is
    407  * added later while holding the irb_lock, but not the tree lock.
    408  *
    409  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    410  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    411  * must be zero.
    412  *
    413  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    414  * global tree lock (read mode) for traversal.
    415  *
    416  * IPsec notes :
    417  *
    418  * IP interacts with the IPsec code (AH/ESP) by tagging a M_CTL message
    419  * in front of the actual packet. For outbound datagrams, the M_CTL
    420  * contains a ipsec_out_t (defined in ipsec_info.h), which has the
    421  * information used by the IPsec code for applying the right level of
    422  * protection. The information initialized by IP in the ipsec_out_t
    423  * is determined by the per-socket policy or global policy in the system.
    424  * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in
    425  * ipsec_info.h) which starts out with nothing in it. It gets filled
    426  * with the right information if it goes through the AH/ESP code, which
    427  * happens if the incoming packet is secure. The information initialized
    428  * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether
    429  * the policy requirements needed by per-socket policy or global policy
    430  * are met or not.
    431  *
    432  * If there is both per-socket policy (set using setsockopt) and there
    433  * is also global policy match for the 5 tuples of the socket,
    434  * ipsec_override_policy() makes the decision of which one to use.
    435  *
    436  * For fully connected sockets i.e dst, src [addr, port] is known,
    437  * conn_policy_cached is set indicating that policy has been cached.
    438  * conn_in_enforce_policy may or may not be set depending on whether
    439  * there is a global policy match or per-socket policy match.
    440  * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
    441  * Once the right policy is set on the conn_t, policy cannot change for
    442  * this socket. This makes life simpler for TCP (UDP ?) where
    443  * re-transmissions go out with the same policy. For symmetry, policy
    444  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    445  * it also implies that policy is latched i.e policy cannot change
    446  * on these sockets. As we have the right policy on the conn, we don't
    447  * have to lookup global policy for every outbound and inbound datagram
    448  * and thus serving as an optimization. Note that a global policy change
    449  * does not affect fully connected sockets if they have policy. If fully
    450  * connected sockets did not have any policy associated with it, global
    451  * policy change may affect them.
    452  *
    453  * IP Flow control notes:
    454  * ---------------------
    455  * Non-TCP streams are flow controlled by IP. The way this is accomplished
    456  * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
    457  * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
    458  * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
    459  * functions.
    460  *
    461  * Per Tx ring udp flow control:
    462  * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
    463  * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
    464  *
    465  * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
    466  * To achieve best performance, outgoing traffic needs to be fanned out among
    467  * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
    468  * traffic out of the NIC and it takes a fanout hint. UDP connections pass
    469  * the address of connp as fanout hint to mac_tx(). Under flow controlled
    470  * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
    471  * cookie points to a specific Tx ring that is blocked. The cookie is used to
    472  * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t points
    473  * to drain_lists (idl_t's). These drain lists will store the blocked UDP
    474  * connp's. The drain list is not a single list but a configurable number of
    475  * lists.
    476  *
    477  * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
    478  * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
    479  * which is equal to 128. This array in turn contains a pointer to idl_t[],
    480  * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
    481  * list will point to the list of connp's that are flow controlled.
    482  *
    483  *                      ---------------   -------   -------   -------
    484  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    485  *                   |  ---------------   -------   -------   -------
    486  *                   |  ---------------   -------   -------   -------
    487  *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    488  * ----------------  |  ---------------   -------   -------   -------
    489  * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
    490  * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
    491  *                   |  ---------------   -------   -------   -------
    492  *                   .        .              .         .         .
    493  *                   |  ---------------   -------   -------   -------
    494  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    495  *                      ---------------   -------   -------   -------
    496  *                      ---------------   -------   -------   -------
    497  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    498  *                   |  ---------------   -------   -------   -------
    499  *                   |  ---------------   -------   -------   -------
    500  * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    501  * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
    502  * ----------------  |        .              .         .         .
    503  *                   |  ---------------   -------   -------   -------
    504  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    505  *                      ---------------   -------   -------   -------
    506  *     .....
    507  * ----------------
    508  * |idl_tx_list[n]|-> ...
    509  * ----------------
    510  *
    511  * When mac_tx() returns a cookie, the cookie is used to hash into a
    512  * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is
    513  * called passing idl_tx_list. The connp gets inserted in a drain list
    514  * pointed to by idl_tx_list. conn_drain_list() asserts flow control for
    515  * the sockets (non stream based) and sets QFULL condition for conn_wq.
    516  * connp->conn_direct_blocked will be set to indicate the blocked
    517  * condition.
    518  *
    519  * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
    520  * A cookie is passed in the call to ill_flow_enable() that identifies the
    521  * blocked Tx ring. This cookie is used to get to the idl_tx_list that
    522  * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
    523  * and goes through each of the drain list (q)enabling the conn_wq of the
    524  * first conn in each of the drain list. This causes ip_wsrv to run for the
    525  * conn. ip_wsrv drains the queued messages, and removes the conn from the
    526  * drain list, if all messages were drained. It also qenables the next conn
    527  * in the drain list to continue the drain process.
    528  *
    529  * In reality the drain list is not a single list, but a configurable number
    530  * of lists. conn_drain_walk() in the IP module, qenables the first conn in
    531  * each list. If the ip_wsrv of the next qenabled conn does not run, because
    532  * the stream closes, ip_close takes responsibility to qenable the next conn
    533  * in the drain list. conn_drain_insert and conn_drain_tail are the only
    534  * functions that manipulate this drain list. conn_drain_insert is called in
    535  * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS
    536  * case -- see below). The synchronization between drain insertion and flow
    537  * control wakeup is handled by using idl_txl->txl_lock.
    538  *
    539  * Flow control using STREAMS:
    540  * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
    541  * is used. On the send side, if the packet cannot be sent down to the
    542  * driver by IP, because of a canput failure, IP does a putq on the conn_wq.
    543  * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts
    544  * the conn in a list of conn's that need to be drained when the flow
    545  * control condition subsides. The blocked connps are put in the first member
    546  * of ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv
    547  * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0].
    548  * ips_idl_tx_list[0] contains the drain lists of blocked conns. The
    549  * conn_wq of the first conn in the drain lists is (q)enabled to run.
    550  * ip_wsrv on this conn drains the queued messages, and removes the conn
    551  * from the drain list, if all messages were drained. It also qenables the
    552  * next conn in the drain list to continue the drain process.
    553  *
    554  * If the ip_wsrv of the next qenabled conn does not run, because the
    555  * stream closes, ip_close takes responsibility to qenable the next conn in
    556  * the drain list. The directly called ip_wput path always does a putq, if
    557  * it cannot putnext. Thus synchronization problems are handled between
    558  * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
    559  * functions that manipulate this drain list. Furthermore conn_drain_insert
    560  * is called only from ip_wsrv for the STREAMS case, and there can be only 1
    561  * instance of ip_wsrv running on a queue at any time. conn_drain_tail can
    562  * be simultaneously called from both ip_wsrv and ip_close.
    563  *
    564  * IPQOS notes:
    565  *
    566  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    567  * and IPQoS modules. IPPF includes hooks in IP at different control points
    568  * (callout positions) which direct packets to IPQoS modules for policy
    569  * processing. Policies, if present, are global.
    570  *
    571  * The callout positions are located in the following paths:
    572  *		o local_in (packets destined for this host)
    573  *		o local_out (packets originating from this host)
    574  *		o fwd_in  (packets forwarded by this m/c - inbound)
    575  *		o fwd_out (packets forwarded by this m/c - outbound)
    576  * Hooks at these callout points can be enabled/disabled using the ndd variable
    577  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    578  * By default all the callout positions are enabled.
    579  *
    580  * Outbound (local_out)
    581  * Hooks are placed in ip_wput_ire and ipsec_out_process.
    582  *
    583  * Inbound (local_in)
    584  * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
    585  * TCP and UDP fanout routines.
    586  *
    587  * Forwarding (in and out)
    588  * Hooks are placed in ip_rput_forward.
    589  *
    590  * IP Policy Framework processing (IPPF processing)
    591  * Policy processing for a packet is initiated by ip_process, which ascertains
    592  * that the classifier (ipgpc) is loaded and configured, failing which the
    593  * packet resumes normal processing in IP. If the classifier is present, the
    594  * packet is acted upon by one or more IPQoS modules (action instances), per
    595  * filters configured in ipgpc and resumes normal IP processing thereafter.
    596  * An action instance can drop a packet in course of its processing.
    597  *
    598  * A boolean variable, ip_policy, is used in all the fanout routines that can
    599  * invoke ip_process for a packet. This variable indicates if the packet should
    600  * be sent for policy processing. The variable is set to B_TRUE by default,
    601  * i.e. when the routines are invoked in the normal ip processing path for a
    602  * packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout;
    603  * ip_policy is set to B_FALSE for all the routines called in these two
    604  * functions because, in the former case,  we don't process loopback traffic
    605  * currently while in the latter, the packets have already been processed in
    606  * icmp_inbound.
    607  *
    608  * Zones notes:
    609  *
    610  * The partitioning rules for networking are as follows:
    611  * 1) Packets coming from a zone must have a source address belonging to that
    612  * zone.
    613  * 2) Packets coming from a zone can only be sent on a physical interface on
    614  * which the zone has an IP address.
    615  * 3) Between two zones on the same machine, packet delivery is only allowed if
    616  * there's a matching route for the destination and zone in the forwarding
    617  * table.
    618  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    619  * different zones can bind to the same port with the wildcard address
    620  * (INADDR_ANY).
    621  *
    622  * The granularity of interface partitioning is at the logical interface level.
    623  * Therefore, every zone has its own IP addresses, and incoming packets can be
    624  * attributed to a zone unambiguously. A logical interface is placed into a zone
    625  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    626  * structure. Rule (1) is implemented by modifying the source address selection
    627  * algorithm so that the list of eligible addresses is filtered based on the
    628  * sending process zone.
    629  *
    630  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    631  * across all zones, depending on their type. Here is the break-up:
    632  *
    633  * IRE type				Shared/exclusive
    634  * --------				----------------
    635  * IRE_BROADCAST			Exclusive
    636  * IRE_DEFAULT (default routes)		Shared (*)
    637  * IRE_LOCAL				Exclusive (x)
    638  * IRE_LOOPBACK				Exclusive
    639  * IRE_PREFIX (net routes)		Shared (*)
    640  * IRE_CACHE				Exclusive
    641  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    642  * IRE_IF_RESOLVER (interface routes)	Exclusive
    643  * IRE_HOST (host routes)		Shared (*)
    644  *
    645  * (*) A zone can only use a default or off-subnet route if the gateway is
    646  * directly reachable from the zone, that is, if the gateway's address matches
    647  * one of the zone's logical interfaces.
    648  *
    649  * (x) IRE_LOCAL are handled a bit differently, since for all other entries
    650  * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source
    651  * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP
    652  * address of the zone itself (the destination). Since IRE_LOCAL is used
    653  * for communication between zones, ip_wput_ire has special logic to set
    654  * the right source address when sending using an IRE_LOCAL.
    655  *
    656  * Furthermore, when ip_restrict_interzone_loopback is set (the default),
    657  * ire_cache_lookup restricts loopback using an IRE_LOCAL
    658  * between zones to the case when L2 would have conceptually looped the packet
    659  * back, i.e. the loopback which is required since neither Ethernet drivers
    660  * nor Ethernet hardware loops them back. This is the case when the normal
    661  * routes (ignoring IREs with different zoneids) would send out the packet on
    662  * the same ill as the ill with which the IRE_LOCAL is associated.
    663  *
    664  * Multiple zones can share a common broadcast address; typically all zones
    665  * share the 255.255.255.255 address. Incoming as well as locally originated
    666  * broadcast packets must be dispatched to all the zones on the broadcast
    667  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    668  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    669  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    670  * sent to every zone that has an IRE_BROADCAST entry for the destination
    671  * address on the input ill, see conn_wantpacket().
    672  *
    673  * Applications in different zones can join the same multicast group address.
    674  * For IPv4, group memberships are per-logical interface, so they're already
    675  * inherently part of a zone. For IPv6, group memberships are per-physical
    676  * interface, so we distinguish IPv6 group memberships based on group address,
    677  * interface and zoneid. In both cases, received multicast packets are sent to
    678  * every zone for which a group membership entry exists. On IPv6 we need to
    679  * check that the target zone still has an address on the receiving physical
    680  * interface; it could have been removed since the application issued the
    681  * IPV6_JOIN_GROUP.
    682  */
    683 
    684 /*
    685  * Squeue Fanout flags:
    686  *	0: No fanout.
    687  *	1: Fanout across all squeues
    688  */
    689 boolean_t	ip_squeue_fanout = 0;
    690 
    691 /*
    692  * Maximum dups allowed per packet.
    693  */
    694 uint_t ip_max_frag_dups = 10;
    695 
    696 #define	IS_SIMPLE_IPH(ipha)						\
    697 	((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
    698 
    699 /* RFC 1122 Conformance */
    700 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
    701 
    702 #define	ILL_MAX_NAMELEN			LIFNAMSIZ
    703 
    704 static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
    705 
    706 static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    707 		    cred_t *credp, boolean_t isv6);
    708 static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t,
    709 		    ipha_t **);
    710 
    711 static void	icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
    712 		    ip_stack_t *);
    713 static void	icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
    714 		    uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
    715 static ipaddr_t	icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
    716 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
    717 		    mblk_t *, int, ip_stack_t *);
    718 static void	icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
    719 		    icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
    720 		    ill_t *, zoneid_t);
    721 static void	icmp_options_update(ipha_t *);
    722 static void	icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
    723 		    ip_stack_t *);
    724 static void	icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
    725 		    zoneid_t zoneid, ip_stack_t *);
    726 static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
    727 static void	icmp_redirect(ill_t *, mblk_t *);
    728 static void	icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
    729 		    ip_stack_t *);
    730 
    731 static void	ip_arp_news(queue_t *, mblk_t *);
    732 static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *);
    733 mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    734 char		*ip_dot_addr(ipaddr_t, char *);
    735 mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    736 int		ip_close(queue_t *, int);
    737 static char	*ip_dot_saddr(uchar_t *, char *);
    738 static void	ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    739 		    boolean_t, boolean_t, ill_t *, zoneid_t);
    740 static void	ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    741 		    boolean_t, boolean_t, zoneid_t);
    742 static void	ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
    743 		    boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
    744 static void	ip_lrput(queue_t *, mblk_t *);
    745 ipaddr_t	ip_net_mask(ipaddr_t);
    746 void		ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
    747 		    ip_stack_t *);
    748 static void	ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
    749 		    conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
    750 char		*ip_nv_lookup(nv_t *, int);
    751 static boolean_t	ip_check_for_ipsec_opt(queue_t *, mblk_t *);
    752 static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    753 static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    754 static boolean_t	ip_param_register(IDP *ndp, ipparam_t *, size_t,
    755     ipndp_t *, size_t);
    756 static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    757 void	ip_rput(queue_t *, mblk_t *);
    758 static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    759 		    void *dummy_arg);
    760 void	ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
    761 static int	ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
    762     ip_stack_t *);
    763 static boolean_t	ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
    764 			    ire_t *, ip_stack_t *);
    765 static boolean_t	ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
    766 			    mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
    767 static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
    768     ip_stack_t *);
    769 static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *,
    770     uint32_t *, uint16_t *);
    771 int		ip_snmp_get(queue_t *, mblk_t *, int);
    772 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    773 		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
    774 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    775 		    ip_stack_t *);
    776 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
    777 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    778 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    779 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    780 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    781 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    782 		    ip_stack_t *ipst);
    783 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    784 		    ip_stack_t *ipst);
    785 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    786 		    ip_stack_t *ipst);
    787 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    788 		    ip_stack_t *ipst);
    789 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    790 		    ip_stack_t *ipst);
    791 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    792 		    ip_stack_t *ipst);
    793 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    794 		    ip_stack_t *ipst);
    795 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    796 		    ip_stack_t *ipst);
    797 static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    798 		    ip_stack_t *ipst);
    799 static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    800 		    ip_stack_t *ipst);
    801 static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    802 static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    803 static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
    804 int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    805 static boolean_t	ip_source_routed(ipha_t *, ip_stack_t *);
    806 static boolean_t	ip_source_route_included(ipha_t *);
    807 static void	ip_trash_ire_reclaim_stack(ip_stack_t *);
    808 
    809 static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
    810 		    zoneid_t, ip_stack_t *, conn_t *);
    811 static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *,
    812 		    mblk_t *);
    813 static void	ip_wput_local_options(ipha_t *, ip_stack_t *);
    814 static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
    815 		    zoneid_t, ip_stack_t *);
    816 
    817 static void	conn_drain_init(ip_stack_t *);
    818 static void	conn_drain_fini(ip_stack_t *);
    819 static void	conn_drain_tail(conn_t *connp, boolean_t closing);
    820 
    821 static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
    822 static void	conn_setqfull(conn_t *);
    823 static void	conn_clrqfull(conn_t *);
    824 
    825 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    826 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    827 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    828 
    829 static boolean_t	conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
    830     zoneid_t);
    831 static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    832     void *dummy_arg);
    833 
    834 static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    835 
    836 static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    837     ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
    838     conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
    839 static void	ip_multirt_bad_mtu(ire_t *, uint32_t);
    840 
    841 static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    842 static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    843     caddr_t, cred_t *);
    844 extern int	ip_helper_stream_setup(queue_t *, dev_t *, int, int,
    845     cred_t *, boolean_t);
    846 static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    847     caddr_t cp, cred_t *cr);
    848 static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    849     cred_t *);
    850 static int	ip_squeue_switch(int);
    851 
    852 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    853 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    854 static int	ip_kstat_update(kstat_t *kp, int rw);
    855 static void	*icmp_kstat_init(netstackid_t);
    856 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    857 static int	icmp_kstat_update(kstat_t *kp, int rw);
    858 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    859 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    860 
    861 static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
    862     ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);
    863 
    864 static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
    865     ipha_t *, ill_t *, boolean_t, boolean_t);
    866 
    867 static void ipobs_init(ip_stack_t *);
    868 static void ipobs_fini(ip_stack_t *);
    869 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    870 
    871 /* How long, in seconds, we allow frags to hang around. */
    872 #define	IP_FRAG_TIMEOUT		15
    873 #define	IPV6_FRAG_TIMEOUT	60
    874 
    875 /*
    876  * Threshold which determines whether MDT should be used when
    877  * generating IP fragments; payload size must be greater than
    878  * this threshold for MDT to take place.
    879  */
    880 #define	IP_WPUT_FRAG_MDT_MIN	32768
    881 
    882 /* Settable in /etc/system only */
    883 int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
    884 
    885 static long ip_rput_pullups;
    886 int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    887 
    888 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
    889 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
    890 
    891 int	ip_debug;
    892 
    893 #ifdef DEBUG
    894 uint32_t ipsechw_debug = 0;
    895 #endif
    896 
    897 /*
    898  * Multirouting/CGTP stuff
    899  */
    900 int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    901 
    902 /*
    903  * XXX following really should only be in a header. Would need more
    904  * header and .c clean up first.
    905  */
    906 extern optdb_obj_t	ip_opt_obj;
    907 
    908 ulong_t ip_squeue_enter_unbound = 0;
    909 
    910 /*
    911  * Named Dispatch Parameter Table.
    912  * All of these are alterable, within the min/max values given, at run time.
    913  */
    914 static ipparam_t	lcl_param_arr[] = {
    915 	/* min	max	value	name */
    916 	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
    917 	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
    918 	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
    919 	{  0,	1,	0,	"ip_respond_to_timestamp"},
    920 	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
    921 	{  0,	1,	1,	"ip_send_redirects"},
    922 	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
    923 	{  0,	10,	0,	"ip_mrtdebug"},
    924 	{  5000, 999999999,	60000, "ip_ire_timer_interval" },
    925 	{  60000, 999999999,	1200000, "ip_ire_arp_interval" },
    926 	{  60000, 999999999,	60000, "ip_ire_redirect_interval" },
    927 	{  1,	255,	255,	"ip_def_ttl" },
    928 	{  0,	1,	0,	"ip_forward_src_routed"},
    929 	{  0,	256,	32,	"ip_wroff_extra" },
    930 	{  5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
    931 	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
    932 	{  0,	1,	1,	"ip_path_mtu_discovery" },
    933 	{  0,	240,	30,	"ip_ignore_delete_time" },
    934 	{  0,	1,	0,	"ip_ignore_redirect" },
    935 	{  0,	1,	1,	"ip_output_queue" },
    936 	{  1,	254,	1,	"ip_broadcast_ttl" },
    937 	{  0,	99999,	100,	"ip_icmp_err_interval" },
    938 	{  1,	99999,	10,	"ip_icmp_err_burst" },
    939 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
    940 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
    941 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
    942 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
    943 	{  0,	1,	1,	"icmp_accept_clear_messages" },
    944 	{  0,	1,	1,	"igmp_accept_clear_messages" },
    945 	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
    946 				"ip_ndp_delay_first_probe_time"},
    947 	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
    948 				"ip_ndp_max_unicast_solicit"},
    949 	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
    950 	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
    951 	{  0,	1,	0,	"ip6_forward_src_routed"},
    952 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
    953 	{  0,	1,	1,	"ip6_send_redirects"},
    954 	{  0,	1,	0,	"ip6_ignore_redirect" },
    955 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
    956 
    957 	{  1,	8,	3,	"ip_ire_reclaim_fraction" },
    958 
    959 	{  0,	999999,	1000,	"ipsec_policy_log_interval" },
    960 
    961 	{  0,	1,	1,	"pim_accept_clear_messages" },
    962 	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
    963 	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
    964 	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
    965 	{  0,	15,	0,	"ip_policy_mask" },
    966 	{  1000, 60000, 1000,	"ip_multirt_resolution_interval" },
    967 	{  0,	255,	1,	"ip_multirt_ttl" },
    968 	{  0,	1,	1,	"ip_multidata_outbound" },
    969 	{  0,	3600000, 300000, "ip_ndp_defense_interval" },
    970 	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
    971 	{  0,	1000,	1,	"ip_max_temp_defend" },
    972 	{  0,	1000,	3,	"ip_max_defend" },
    973 	{  0,	999999,	30,	"ip_defend_interval" },
    974 	{  0,	3600000, 300000, "ip_dup_recovery" },
    975 	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
    976 	{  0,	1,	1,	"ip_lso_outbound" },
    977 	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
    978 	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
    979 	{ 68,	65535,	576,	"ip_pmtu_min" },
    980 #ifdef DEBUG
    981 	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
    982 #else
    983 	{  0,	0,	0,	"" },	/* unnamed filler; same entry count as DEBUG */
    984 #endif
    985 };
    986 
    987 /*
    988  * Extended NDP table
    989  * The addresses for the first two are filled in to be ips_ip_g_forward
    990  * and ips_ipv6_forward at init time.
    991  */
    992 static ipndp_t	lcl_ndp_arr[] = {
    993 	/* getf			setf		data			name */
    994 #define	IPNDP_IP_FORWARDING_OFFSET	0	/* index of ip_forwarding entry */
    995 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    996 	    "ip_forwarding" },
    997 #define	IPNDP_IP6_FORWARDING_OFFSET	1	/* index of ip6_forwarding entry */
    998 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    999 	    "ip6_forwarding" },
   1000 	{ ip_param_generic_get, ip_input_proc_set,
   1001 	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
   1002 	{ ip_param_generic_get, ip_int_set,
   1003 	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
   1004 #define	IPNDP_CGTP_FILTER_OFFSET	4	/* index of ip_cgtp_filter entry */
   1005 	{  ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
   1006 	    "ip_cgtp_filter" },
   1007 	{  ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
   1008 	    "ip_debug" },
   1009 };
   1010 
   1011 /*
   1012  * Table of IP ioctls encoding the various properties of the ioctl and
   1013  * indexed based on the last byte of the ioctl command. Occasionally there
   1014  * is a clash, and there is more than 1 ioctl with the same last byte.
   1015  * In such a case 1 ioctl is encoded in the ndx table and the remaining
   1016  * ioctls are encoded in the misc table. An entry in the ndx table is
   1017  * retrieved by indexing on the last byte of the ioctl command and comparing
   1018  * the ioctl command with the value in the ndx table. In the event of a
   1019  * mismatch the misc table is then searched sequentially for the desired
   1020  * ioctl command.
   1021  *
   1022  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
   1023  */
   1024 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
   1025 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1026 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1027 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1028 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1029 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1030 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1031 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1032 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1033 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1034 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1035 
   1036 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
   1037 			MISC_CMD, ip_siocaddrt, NULL },
   1038 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
   1039 			MISC_CMD, ip_siocdelrt, NULL },
   1040 
   1041 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1042 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
   1043 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
   1044 			IF_CMD, ip_sioctl_get_addr, NULL },
   1045 
   1046 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1047 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
   1048 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
   1049 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
   1050 
   1051 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
   1052 			IPI_PRIV | IPI_WR,
   1053 			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
   1054 	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
   1055 			IPI_MODOK | IPI_GET_CMD,
   1056 			IF_CMD, ip_sioctl_get_flags, NULL },
   1057 
   1058 	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1059 	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1060 
   1061 	/* copyin size cannot be coded for SIOCGIFCONF */
   1062 	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
   1063 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
   1064 
   1065 	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1066 			IF_CMD, ip_sioctl_mtu, NULL },
   1067 	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD,
   1068 			IF_CMD, ip_sioctl_get_mtu, NULL },
   1069 	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
   1070 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
   1071 	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1072 			IF_CMD, ip_sioctl_brdaddr, NULL },
   1073 	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
   1074 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
   1075 	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
   1076 			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1077 	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
   1078 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
   1079 	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
   1080 			IF_CMD, ip_sioctl_metric, NULL },
   1081 	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1082 
   1083 	/* See 166-168 below for extended SIOC*XARP ioctls */
   1084 	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1085 			ARP_CMD, ip_sioctl_arp, NULL },
   1086 	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
   1087 			ARP_CMD, ip_sioctl_arp, NULL },
   1088 	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
   1089 			ARP_CMD, ip_sioctl_arp, NULL },
   1090 
   1091 	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1092 	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1093 	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1094 	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1095 	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1096 	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1097 	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1098 	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1099 	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1100 	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1101 	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1102 	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1103 	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1104 	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1105 	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1106 	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1107 	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1108 	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1109 	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1110 	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1111 	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1112 
   1113 	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
   1114 			MISC_CMD, if_unitsel, if_unitsel_restart },
   1115 
   1116 	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1117 	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1118 	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1119 	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1120 	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1121 	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1122 	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1123 	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1124 	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1125 	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1126 	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1127 	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1128 	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1129 	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1130 	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1131 	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1132 	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1133 	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1134 
   1135 	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
   1136 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1137 			IF_CMD, ip_sioctl_sifname, NULL },
   1138 
   1139 	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1140 	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1141 	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1142 	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1143 	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1144 	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1145 	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1146 	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1147 	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1148 	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1149 	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1150 	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1151 	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1152 
   1153 	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
   1154 			MISC_CMD, ip_sioctl_get_ifnum, NULL },
   1155 	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
   1156 			IF_CMD, ip_sioctl_get_muxid, NULL },
   1157 	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
   1158 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
   1159 
   1160 	/* Both if and lif variants share same func */
   1161 	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
   1162 			IF_CMD, ip_sioctl_get_lifindex, NULL },
   1163 	/* Both if and lif variants share same func */
   1164 	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
   1165 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
   1166 
   1167 	/* copyin size cannot be coded for SIOCGIFCONF */
   1168 	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
   1169 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
   1170 	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1171 	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1172 	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1173 	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1174 	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1175 	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1176 	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1177 	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1178 	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1179 	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1180 	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1181 	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1182 	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1183 	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1184 	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1185 	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1186 	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1187 
   1188 	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
   1189 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
   1190 			ip_sioctl_removeif_restart },
   1191 	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
   1192 			IPI_GET_CMD | IPI_PRIV | IPI_WR,
   1193 			LIF_CMD, ip_sioctl_addif, NULL },
   1194 #define	SIOCLIFADDR_NDX 112
   1195 	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1196 			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
   1197 	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
   1198 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
   1199 	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1200 			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
   1201 	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
   1202 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
   1203 	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
   1204 			IPI_PRIV | IPI_WR,
   1205 			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
   1206 	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
   1207 			IPI_GET_CMD | IPI_MODOK,
   1208 			LIF_CMD, ip_sioctl_get_flags, NULL },
   1209 
   1210 	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1211 	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1212 
   1213 	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1214 			ip_sioctl_get_lifconf, NULL },
   1215 	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1216 			LIF_CMD, ip_sioctl_mtu, NULL },
   1217 	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
   1218 			LIF_CMD, ip_sioctl_get_mtu, NULL },
   1219 	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
   1220 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
   1221 	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1222 			LIF_CMD, ip_sioctl_brdaddr, NULL },
   1223 	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
   1224 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
   1225 	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1226 			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
   1227 	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
   1228 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
   1229 	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1230 			LIF_CMD, ip_sioctl_metric, NULL },
   1231 	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
   1232 			IPI_PRIV | IPI_WR | IPI_MODOK,
   1233 			LIF_CMD, ip_sioctl_slifname,
   1234 			ip_sioctl_slifname_restart },
   1235 
   1236 	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
   1237 			MISC_CMD, ip_sioctl_get_lifnum, NULL },
   1238 	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
   1239 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
   1240 	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
   1241 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
   1242 	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
   1243 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
   1244 	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
   1245 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
   1246 	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1247 			LIF_CMD, ip_sioctl_token, NULL },
   1248 	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
   1249 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
   1250 	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1251 			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
   1252 	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
   1253 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
   1254 	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1255 			LIF_CMD, ip_sioctl_lnkinfo, NULL },
   1256 
   1257 	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
   1258 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
   1259 	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
   1260 			LIF_CMD, ip_siocdelndp_v6, NULL },
   1261 	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
   1262 			LIF_CMD, ip_siocqueryndp_v6, NULL },
   1263 	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
   1264 			LIF_CMD, ip_siocsetndp_v6, NULL },
   1265 	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1266 			MISC_CMD, ip_sioctl_tmyaddr, NULL },
   1267 	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1268 			MISC_CMD, ip_sioctl_tonlink, NULL },
   1269 	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
   1270 			MISC_CMD, ip_sioctl_tmysite, NULL },
   1271 	/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), 0,
   1272 			TUN_CMD, ip_sioctl_tunparam, NULL },
   1273 	/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
   1274 		    IPI_PRIV | IPI_WR,
   1275 		    TUN_CMD, ip_sioctl_tunparam, NULL },
   1276 
   1277 	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
   1278 	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1279 	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1280 	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1281 	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1282 
   1283 	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1284 
   1285 	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
   1286 			LIF_CMD, ip_sioctl_get_binding, NULL },
   1287 	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
   1288 			IPI_PRIV | IPI_WR,
   1289 			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
   1290 	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
   1291 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
   1292 	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
   1293 			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
   1294 
   1295 	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
   1296 	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1297 	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1298 	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1299 
   1300 	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1301 
   1302 	/* These are handled in ip_sioctl_copyin_setup itself */
   1303 	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
   1304 			MISC_CMD, NULL, NULL },
   1305 	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
   1306 			MISC_CMD, NULL, NULL },
   1307 	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
   1308 
   1309 	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1310 			ip_sioctl_get_lifconf, NULL },
   1311 
   1312 	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1313 			XARP_CMD, ip_sioctl_arp, NULL },
   1314 	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
   1315 			XARP_CMD, ip_sioctl_arp, NULL },
   1316 	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1317 			XARP_CMD, ip_sioctl_arp, NULL },
   1318 
   1319 	/* SIOCPOPSOCKFS is not handled by IP */
   1320 	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
   1321 
   1322 	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
   1323 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
   1324 	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
   1325 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
   1326 			ip_sioctl_slifzone_restart },
   1327 	/* 172-174 are SCTP ioctls and not handled by IP */
   1328 	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1329 	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1330 	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1331 	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
   1332 			IPI_GET_CMD, LIF_CMD,
   1333 			ip_sioctl_get_lifusesrc, 0 },
   1334 	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
   1335 			IPI_PRIV | IPI_WR,
   1336 			LIF_CMD, ip_sioctl_slifusesrc,
   1337 			NULL },
   1338 	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
   1339 			ip_sioctl_get_lifsrcof, NULL },
   1340 	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
   1341 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1342 	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
   1343 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1344 	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
   1345 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1346 	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
   1347 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1348 	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1349 	/* SIOCSENABLESDP is handled by SDP */
   1350 	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
   1351 	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
   1352 };
   1353 
   1354 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1355 
   1356 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
   1357 	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
   1358 		IPI_GET_CMD, TUN_CMD, ip_sioctl_tunparam, NULL },
   1359 	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
   1360 		TUN_CMD, ip_sioctl_tunparam, NULL },
   1361 	{ I_LINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
   1362 	{ I_UNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
   1363 	{ I_PLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
   1364 	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
   1365 	{ ND_GET,	0, IPI_PASS_DOWN, 0, NULL, NULL },
   1366 	{ ND_SET,	0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
   1367 	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
   1368 	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
   1369 		MISC_CMD, mrt_ioctl},
   1370 	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
   1371 		MISC_CMD, mrt_ioctl},
   1372 	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
   1373 		MISC_CMD, mrt_ioctl}
   1374 };
   1375 
   1376 int ip_misc_ioctl_count =
   1377     sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1378 
   1379 int	conn_drain_nthreads;		/* Number of drainers reqd. */
   1380 					/* Settable in /etc/system */
   1381 /* Defined in ip_ire.c */
   1382 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
   1383 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
   1384 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
   1385 
   1386 static nv_t	ire_nv_arr[] = {
   1387 	{ IRE_BROADCAST, "BROADCAST" },
   1388 	{ IRE_LOCAL, "LOCAL" },
   1389 	{ IRE_LOOPBACK, "LOOPBACK" },
   1390 	{ IRE_CACHE, "CACHE" },
   1391 	{ IRE_DEFAULT, "DEFAULT" },
   1392 	{ IRE_PREFIX, "PREFIX" },
   1393 	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
   1394 	{ IRE_IF_RESOLVER, "IF_RESOLV" },
   1395 	{ IRE_HOST, "HOST" },
   1396 	{ 0 }
   1397 };
   1398 
   1399 nv_t	*ire_nv_tbl = ire_nv_arr;
   1400 
   1401 /* Simple ICMP IP Header Template */
   1402 static ipha_t icmp_ipha = {
   1403 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
   1404 };
   1405 
   1406 struct module_info ip_mod_info = {
   1407 	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
   1408 	IP_MOD_LOWAT
   1409 };
   1410 
   1411 /*
   1412  * Duplicate static symbols within a module confuse mdb; so we avoid the
   1413  * problem by making the symbols here distinct from those in udp.c.
   1414  */
   1415 
   1416 /*
   1417  * Entry points for IP as a device and as a module.
   1418  * FIXME: down the road we might want a separate module and driver qinit.
   1419  * We have separate open functions for the /dev/ip and /dev/ip6 devices.
   1420  */
   1421 static struct qinit iprinitv4 = {
   1422 	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
   1423 	&ip_mod_info
   1424 };
   1425 
   1426 struct qinit iprinitv6 = {
   1427 	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
   1428 	&ip_mod_info
   1429 };
   1430 
   1431 static struct qinit ipwinitv4 = {
   1432 	(pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL,
   1433 	&ip_mod_info
   1434 };
   1435 
   1436 struct qinit ipwinitv6 = {
   1437 	(pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL,
   1438 	&ip_mod_info
   1439 };
   1440 
   1441 static struct qinit iplrinit = {
   1442 	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
   1443 	&ip_mod_info
   1444 };
   1445 
   1446 static struct qinit iplwinit = {
   1447 	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
   1448 	&ip_mod_info
   1449 };
   1450 
   1451 /* For AF_INET aka /dev/ip */
   1452 struct streamtab ipinfov4 = {
   1453 	&iprinitv4, &ipwinitv4, &iplrinit, &iplwinit
   1454 };
   1455 
   1456 /* For AF_INET6 aka /dev/ip6 */
   1457 struct streamtab ipinfov6 = {
   1458 	&iprinitv6, &ipwinitv6, &iplrinit, &iplwinit
   1459 };
   1460 
   1461 #ifdef	DEBUG
   1462 static boolean_t skip_sctp_cksum = B_FALSE;
   1463 #endif
   1464 
   1465 /*
   1466  * Prepend the zoneid using an ipsec_out_t for later use by functions like
   1467  * ip_rput_v6(), ip_output(), etc.  If the message
   1468  * block already has a M_CTL at the front of it, then simply set the zoneid
   1469  * appropriately.
   1470  */
   1471 mblk_t *
   1472 ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
   1473 {
   1474 	mblk_t		*first_mp;
   1475 	ipsec_out_t	*io;
   1476 
   1477 	ASSERT(zoneid != ALL_ZONES);
   1478 	if (mp->b_datap->db_type == M_CTL) {
   1479 		io = (ipsec_out_t *)mp->b_rptr;
   1480 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
   1481 		io->ipsec_out_zoneid = zoneid;
   1482 		return (mp);
   1483 	}
   1484 
   1485 	first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack);
   1486 	if (first_mp == NULL)
   1487 		return (NULL);
   1488 	io = (ipsec_out_t *)first_mp->b_rptr;
   1489 	/* This is not a secure packet */
   1490 	io->ipsec_out_secure = B_FALSE;
   1491 	io->ipsec_out_zoneid = zoneid;
   1492 	first_mp->b_cont = mp;
   1493 	return (first_mp);
   1494 }
   1495 
   1496 /*
   1497  * Copy an M_CTL-tagged message, preserving reference counts appropriately.
   1498  */
   1499 mblk_t *
   1500 ip_copymsg(mblk_t *mp)
   1501 {
   1502 	mblk_t *nmp;
   1503 	ipsec_info_t *in;
   1504 
   1505 	if (mp->b_datap->db_type != M_CTL)
   1506 		return (copymsg(mp));
   1507 
   1508 	in = (ipsec_info_t *)mp->b_rptr;
   1509 
   1510 	/*
   1511 	 * Note that M_CTL is also used for delivering ICMP error messages
   1512 	 * upstream to transport layers.
   1513 	 */
   1514 	if (in->ipsec_info_type != IPSEC_OUT &&
   1515 	    in->ipsec_info_type != IPSEC_IN)
   1516 		return (copymsg(mp));
   1517 
   1518 	nmp = copymsg(mp->b_cont);
   1519 
   1520 	if (in->ipsec_info_type == IPSEC_OUT) {
   1521 		return (ipsec_out_tag(mp, nmp,
   1522 		    ((ipsec_out_t *)in)->ipsec_out_ns));
   1523 	} else {
   1524 		return (ipsec_in_tag(mp, nmp,
   1525 		    ((ipsec_in_t *)in)->ipsec_in_ns));
   1526 	}
   1527 }
   1528 
   1529 /* Generate an ICMP fragmentation needed message. */
   1530 static void
   1531 icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
   1532     ip_stack_t *ipst)
   1533 {
   1534 	icmph_t	icmph;
   1535 	mblk_t *first_mp;
   1536 	boolean_t mctl_present;
   1537 
   1538 	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
   1539 
   1540 	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
   1541 		if (mctl_present)
   1542 			freeb(first_mp);
   1543 		return;
   1544 	}
   1545 
   1546 	bzero(&icmph, sizeof (icmph_t));
   1547 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   1548 	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
   1549 	icmph.icmph_du_mtu = htons((uint16_t)mtu);
   1550 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
   1551 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   1552 	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
   1553 	    ipst);
   1554 }
   1555 
   1556 /*
   1557  * icmp_inbound deals with ICMP messages in the following ways.
   1558  *
   1559  * 1) It needs to send a reply back and possibly deliver it
   1560  *    to the "interested" upper clients.
   1561  * 2) It needs to send it to the upper clients only.
   1562  * 3) It needs to change some values in IP only.
   1563  * 4) It needs to change some values in IP and upper layers e.g TCP.
   1564  *
   1565  * We need to accommodate icmp messages coming in clear until we get
   1566  * everything secure from the wire. If icmp_accept_clear_messages
   1567  * is zero we check with the global policy and act accordingly. If
   1568  * it is non-zero, we accept the message without any checks. But
   1569  * *this does not mean* that this will be delivered to the upper
   1570  * clients. By accepting we might send replies back, change our MTU
   1571  * value etc. but delivery to the ULP/clients depends on their policy
   1572  * dispositions.
   1573  *
   1574  * We handle the above 4 cases in the context of IPsec in the
   1575  * following way :
   1576  *
   1577  * 1) Send the reply back in the same way as the request came in.
   1578  *    If it came in encrypted, it goes out encrypted. If it came in
   1579  *    clear, it goes out in clear. Thus, this will prevent chosen
   1580  *    plain text attack.
   1581  * 2) The client may or may not expect things to come in secure.
   1582  *    If it comes in secure, the policy constraints are checked
   1583  *    before delivering it to the upper layers. If it comes in
   1584  *    clear, ipsec_inbound_accept_clear will decide whether to
   1585  *    accept this in clear or not. In both the cases, if the returned
   1586  *    message (IP header + 8 bytes) that caused the icmp message has
   1587  *    AH/ESP headers, it is sent up to AH/ESP for validation before
   1588  *    sending up. If there are only 8 bytes of returned message, then
   1589  *    upper client will not be notified.
   1590  * 3) Check with global policy to see whether it matches the constraints.
   1591  *    But this will be done only if icmp_accept_clear_messages is
   1592  *    zero.
   1593  * 4) If we need to change both in IP and ULP, then the decision taken
   1594  *    while affecting the values in IP and while delivering up to TCP
   1595  *    should be the same.
   1596  *
   1597  * 	There are two cases.
   1598  *
   1599  * 	a) If we reject data at the IP layer (ipsec_check_global_policy()
   1600  *	   failed), we will not deliver it to the ULP, even though they
   1601  *	   are *willing* to accept in *clear*. This is fine as our global
   1602  *	   disposition to icmp messages asks us to reject the datagram.
   1603  *
   1604  *	b) If we accept data at the IP layer (ipsec_check_global_policy()
   1605  *	   succeeded or icmp_accept_clear_messages is 1), and not able
   1606  *	   to deliver it to ULP (policy failed), it can lead to
   1607  *	   consistency problems. The cases known at this time are
   1608  *	   ICMP_DESTINATION_UNREACHABLE  messages with following code
   1609  *	   values :
   1610  *
   1611  *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
   1612  *	     and Upper layer rejects. Then the communication will
   1613  *	     come to a stop. This is solved by making similar decisions
   1614  *	     at both levels. Currently, when we are unable to deliver
   1615  *	     to the Upper Layer (due to policy failures) while IP has
   1616  *	     adjusted ire_max_frag, the next outbound datagram would
   1617  *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
   1618  *	     will be with the right level of protection. Thus the right
   1619  *	     value will be communicated even if we are not able to
   1620  *	     communicate when we get from the wire initially. But this
   1621  *	     assumes there would be at least one outbound datagram after
   1622  *	     IP has adjusted its ire_max_frag value. To make things
   1623  *	     simpler, we accept in clear after the validation of
   1624  *	     AH/ESP headers.
   1625  *
   1626  *	   - Other ICMP ERRORS : We may not be able to deliver it to the
   1627  *	     upper layer depending on the level of protection the upper
   1628  *	     layer expects and the disposition in ipsec_inbound_accept_clear().
   1629  *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
   1630  *	     should be accepted in clear when the Upper layer expects secure.
   1631  *	     Thus the communication may get aborted by some bad ICMP
   1632  *	     packets.
   1633  *
   1634  * IPQoS Notes:
   1635  * The only instance when a packet is sent for processing is when there
   1636  * isn't an ICMP client and if we are interested in it.
   1637  * If there is a client, IPPF processing will take place in the
   1638  * ip_fanout_proto routine.
   1639  *
   1640  * Zones notes:
   1641  * The packet is only processed in the context of the specified zone: typically
   1642  * only this zone will reply to an echo request, and only interested clients in
   1643  * this zone will receive a copy of the packet. This means that the caller must
   1644  * call icmp_inbound() for each relevant zone.
   1645  */
   1646 static void
   1647 icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
   1648     int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
   1649     ill_t *recv_ill, zoneid_t zoneid)
   1650 {
   1651 	icmph_t	*icmph;
   1652 	ipha_t	*ipha;
   1653 	int	iph_hdr_length;
   1654 	int	hdr_length;
   1655 	boolean_t	interested;
   1656 	uint32_t	ts;
   1657 	uchar_t	*wptr;
   1658 	ipif_t	*ipif;
   1659 	mblk_t *first_mp;
   1660 	ipsec_in_t *ii;
   1661 	timestruc_t now;
   1662 	uint32_t ill_index;
   1663 	ip_stack_t *ipst;
   1664 
   1665 	ASSERT(ill != NULL);
   1666 	ipst = ill->ill_ipst;
   1667 
   1668 	first_mp = mp;
   1669 	if (mctl_present) {
   1670 		mp = first_mp->b_cont;
   1671 		ASSERT(mp != NULL);
   1672 	}
   1673 
   1674 	ipha = (ipha_t *)mp->b_rptr;
   1675 	if (ipst->ips_icmp_accept_clear_messages == 0) {
   1676 		first_mp = ipsec_check_global_policy(first_mp, NULL,
   1677 		    ipha, NULL, mctl_present, ipst->ips_netstack);
   1678 		if (first_mp == NULL)
   1679 			return;
   1680 	}
   1681 
   1682 	/*
   1683 	 * On a labeled system, we have to check whether the zone itself is
   1684 	 * permitted to receive raw traffic.
   1685 	 */
   1686 	if (is_system_labeled()) {
   1687 		if (zoneid == ALL_ZONES)
   1688 			zoneid = tsol_packet_to_zoneid(mp);
   1689 		if (!tsol_can_accept_raw(mp, B_FALSE)) {
   1690 			ip1dbg(("icmp_inbound: zone %d can't receive raw",
   1691 			    zoneid));
   1692 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   1693 			freemsg(first_mp);
   1694 			return;
   1695 		}
   1696 	}
   1697 
   1698 	/*
   1699 	 * We have accepted the ICMP message. It means that we will
   1700 	 * respond to the packet if needed. It may not be delivered
   1701 	 * to the upper client depending on the policy constraints
   1702 	 * and the disposition in ipsec_inbound_accept_clear.
   1703 	 */
   1704 
   1705 	ASSERT(ill != NULL);
   1706 
   1707 	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
   1708 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   1709 	if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
   1710 		/* Last chance to get real. */
   1711 		if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
   1712 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   1713 			freemsg(first_mp);
   1714 			return;
   1715 		}
   1716 		/* Refresh iph following the pullup. */
   1717 		ipha = (ipha_t *)mp->b_rptr;
   1718 	}
   1719 	/* ICMP header checksum, including checksum field, should be zero. */
   1720 	if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
   1721 	    IP_CSUM(mp, iph_hdr_length, 0)) {
   1722 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
   1723 		freemsg(first_mp);
   1724 		return;
   1725 	}
   1726 	/* The IP header will always be a multiple of four bytes */
   1727 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1728 	ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
   1729 	    icmph->icmph_code));
   1730 	wptr = (uchar_t *)icmph + ICMPH_SIZE;
   1731 	/* We will set "interested" to "true" if we want a copy */
   1732 	interested = B_FALSE;
   1733 	switch (icmph->icmph_type) {
   1734 	case ICMP_ECHO_REPLY:
   1735 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
   1736 		break;
   1737 	case ICMP_DEST_UNREACHABLE:
   1738 		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
   1739 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
   1740 		interested = B_TRUE;	/* Pass up to transport */
   1741 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
   1742 		break;
   1743 	case ICMP_SOURCE_QUENCH:
   1744 		interested = B_TRUE;	/* Pass up to transport */
   1745 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
   1746 		break;
   1747 	case ICMP_REDIRECT:
   1748 		if (!ipst->ips_ip_ignore_redirect)
   1749 			interested = B_TRUE;
   1750 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
   1751 		break;
   1752 	case ICMP_ECHO_REQUEST:
   1753 		/*
   1754 		 * Whether to respond to echo requests that come in as IP
   1755 		 * broadcasts or as IP multicast is subject to debate
   1756 		 * (what isn't?).  We aim to please, you pick it.
   1757 		 * Default is do it.
   1758 		 */
   1759 		if (!broadcast && !CLASSD(ipha->ipha_dst)) {
   1760 			/* unicast: always respond */
   1761 			interested = B_TRUE;
   1762 		} else if (CLASSD(ipha->ipha_dst)) {
   1763 			/* multicast: respond based on tunable */
   1764 			interested = ipst->ips_ip_g_resp_to_echo_mcast;
   1765 		} else if (broadcast) {
   1766 			/* broadcast: respond based on tunable */
   1767 			interested = ipst->ips_ip_g_resp_to_echo_bcast;
   1768 		}
   1769 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
   1770 		break;
   1771 	case ICMP_ROUTER_ADVERTISEMENT:
   1772 	case ICMP_ROUTER_SOLICITATION:
   1773 		break;
   1774 	case ICMP_TIME_EXCEEDED:
   1775 		interested = B_TRUE;	/* Pass up to transport */
   1776 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
   1777 		break;
   1778 	case ICMP_PARAM_PROBLEM:
   1779 		interested = B_TRUE;	/* Pass up to transport */
   1780 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
   1781 		break;
   1782 	case ICMP_TIME_STAMP_REQUEST:
   1783 		/* Response to Time Stamp Requests is local policy. */
   1784 		if (ipst->ips_ip_g_resp_to_timestamp &&
   1785 		    /* So is whether to respond if it was an IP broadcast. */
   1786 		    (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) {
   1787 			int tstamp_len = 3 * sizeof (uint32_t);
   1788 
   1789 			if (wptr +  tstamp_len > mp->b_wptr) {
   1790 				if (!pullupmsg(mp, wptr + tstamp_len -
   1791 				    mp->b_rptr)) {
   1792 					BUMP_MIB(ill->ill_ip_mib,
   1793 					    ipIfStatsInDiscards);
   1794 					freemsg(first_mp);
   1795 					return;
   1796 				}
   1797 				/* Refresh ipha following the pullup. */
   1798 				ipha = (ipha_t *)mp->b_rptr;
   1799 				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1800 				wptr = (uchar_t *)icmph + ICMPH_SIZE;
   1801 			}
   1802 			interested = B_TRUE;
   1803 		}
   1804 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
   1805 		break;
   1806 	case ICMP_TIME_STAMP_REPLY:
   1807 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
   1808 		break;
   1809 	case ICMP_INFO_REQUEST:
   1810 		/* Per RFC 1122 3.2.2.7, ignore this. */
   1811 	case ICMP_INFO_REPLY:
   1812 		break;
   1813 	case ICMP_ADDRESS_MASK_REQUEST:
   1814 		if ((ipst->ips_ip_respond_to_address_mask_broadcast ||
   1815 		    !broadcast) &&
   1816 		    /* TODO m_pullup of complete header? */
   1817 		    (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) {
   1818 			interested = B_TRUE;
   1819 		}
   1820 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
   1821 		break;
   1822 	case ICMP_ADDRESS_MASK_REPLY:
   1823 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
   1824 		break;
   1825 	default:
   1826 		interested = B_TRUE;	/* Pass up to transport */
   1827 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
   1828 		break;
   1829 	}
   1830 	/* See if there is an ICMP client. */
   1831 	if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) {
   1832 		/* If there is an ICMP client and we want one too, copy it. */
   1833 		mblk_t *first_mp1;
   1834 
   1835 		if (!interested) {
   1836 			ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
   1837 			    ip_policy, recv_ill, zoneid);
   1838 			return;
   1839 		}
   1840 		first_mp1 = ip_copymsg(first_mp);
   1841 		if (first_mp1 != NULL) {
   1842 			ip_fanout_proto(q, first_mp1, ill, ipha,
   1843 			    0, mctl_present, ip_policy, recv_ill, zoneid);
   1844 		}
   1845 	} else if (!interested) {
   1846 		freemsg(first_mp);
   1847 		return;
   1848 	} else {
   1849 		/*
   1850 		 * Initiate policy processing for this packet if ip_policy
   1851 		 * is true.
   1852 		 */
   1853 		if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
   1854 			ill_index = ill->ill_phyint->phyint_ifindex;
   1855 			ip_process(IPP_LOCAL_IN, &mp, ill_index);
   1856 			if (mp == NULL) {
   1857 				if (mctl_present) {
   1858 					freeb(first_mp);
   1859 				}
   1860 				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   1861 				return;
   1862 			}
   1863 		}
   1864 	}
   1865 	/* We want to do something with it. */
   1866 	/* Check db_ref to make sure we can modify the packet. */
   1867 	if (mp->b_datap->db_ref > 1) {
   1868 		mblk_t	*first_mp1;
   1869 
   1870 		first_mp1 = ip_copymsg(first_mp);
   1871 		freemsg(first_mp);
   1872 		if (!first_mp1) {
   1873 			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1874 			return;
   1875 		}
   1876 		first_mp = first_mp1;
   1877 		if (mctl_present) {
   1878 			mp = first_mp->b_cont;
   1879 			ASSERT(mp != NULL);
   1880 		} else {
   1881 			mp = first_mp;
   1882 		}
   1883 		ipha = (ipha_t *)mp->b_rptr;
   1884 		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1885 		wptr = (uchar_t *)icmph + ICMPH_SIZE;
   1886 	}
   1887 	switch (icmph->icmph_type) {
   1888 	case ICMP_ADDRESS_MASK_REQUEST:
   1889 		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
   1890 		if (ipif == NULL) {
   1891 			freemsg(first_mp);
   1892 			return;
   1893 		}
   1894 		/*
   1895 		 * outging interface must be IPv4
   1896 		 */
   1897 		ASSERT(ipif != NULL && !ipif->ipif_isv6);
   1898 		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
   1899 		bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
   1900 		ipif_refrele(ipif);
   1901 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
   1902 		break;
   1903 	case ICMP_ECHO_REQUEST:
   1904 		icmph->icmph_type = ICMP_ECHO_REPLY;
   1905 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
   1906 		break;
   1907 	case ICMP_TIME_STAMP_REQUEST: {
   1908 		uint32_t *tsp;
   1909 
   1910 		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
   1911 		tsp = (uint32_t *)wptr;
   1912 		tsp++;		/* Skip past 'originate time' */
   1913 		/* Compute # of milliseconds since midnight */
   1914 		gethrestime(&now);
   1915 		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   1916 		    now.tv_nsec / (NANOSEC / MILLISEC);
   1917 		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
   1918 		*tsp++ = htonl(ts);	/* Lay in 'send time' */
   1919 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
   1920 		break;
   1921 	}
   1922 	default:
   1923 		ipha = (ipha_t *)&icmph[1];
   1924 		if ((uchar_t *)&ipha[1] > mp->b_wptr) {
   1925 			if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
   1926 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1927 				freemsg(first_mp);
   1928 				return;
   1929 			}
   1930 			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1931 			ipha = (ipha_t *)&icmph[1];
   1932 		}
   1933 		if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) {
   1934 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1935 			freemsg(first_mp);
   1936 			return;
   1937 		}
   1938 		hdr_length = IPH_HDR_LENGTH(ipha);
   1939 		if (hdr_length < sizeof (ipha_t)) {
   1940 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1941 			freemsg(first_mp);
   1942 			return;
   1943 		}
   1944 		if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
   1945 			if (!pullupmsg(mp,
   1946 			    (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
   1947 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1948 				freemsg(first_mp);
   1949 				return;
   1950 			}
   1951 			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1952 			ipha = (ipha_t *)&icmph[1];
   1953 		}
   1954 		switch (icmph->icmph_type) {
   1955 		case ICMP_REDIRECT:
   1956 			/*
   1957 			 * As there is no upper client to deliver, we don't
   1958 			 * need the first_mp any more.
   1959 			 */
   1960 			if (mctl_present) {
   1961 				freeb(first_mp);
   1962 			}
   1963 			icmp_redirect(ill, mp);
   1964 			return;
   1965 		case ICMP_DEST_UNREACHABLE:
   1966 			if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
   1967 				if (!icmp_inbound_too_big(icmph, ipha, ill,
   1968 				    zoneid, mp, iph_hdr_length, ipst)) {
   1969 					freemsg(first_mp);
   1970 					return;
   1971 				}
   1972 				/*
   1973 				 * icmp_inbound_too_big() may alter mp.
   1974 				 * Resynch ipha and icmph accordingly.
   1975 				 */
   1976 				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   1977 				ipha = (ipha_t *)&icmph[1];
   1978 			}
   1979 			/* FALLTHRU */
   1980 		default :
   1981 			/*
   1982 			 * IPQoS notes: Since we have already done IPQoS
   1983 			 * processing we don't want to do it again in
   1984 			 * the fanout routines called by
   1985 			 * icmp_inbound_error_fanout, hence the last
   1986 			 * argument, ip_policy, is B_FALSE.
   1987 			 */
   1988 			icmp_inbound_error_fanout(q, ill, first_mp, icmph,
   1989 			    ipha, iph_hdr_length, hdr_length, mctl_present,
   1990 			    B_FALSE, recv_ill, zoneid);
   1991 		}
   1992 		return;
   1993 	}
   1994 	/* Send out an ICMP packet */
   1995 	icmph->icmph_checksum = 0;
   1996 	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
   1997 	if (broadcast || CLASSD(ipha->ipha_dst)) {
   1998 		ipif_t	*ipif_chosen;
   1999 		/*
   2000 		 * Make it look like it was directed to us, so we don't look
   2001 		 * like a fool with a broadcast or multicast source address.
   2002 		 */
   2003 		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
   2004 		/*
   2005 		 * Make sure that we haven't grabbed an interface that's DOWN.
   2006 		 */
   2007 		if (ipif != NULL) {
   2008 			ipif_chosen = ipif_select_source(ipif->ipif_ill,
   2009 			    ipha->ipha_src, zoneid);
   2010 			if (ipif_chosen != NULL) {
   2011 				ipif_refrele(ipif);
   2012 				ipif = ipif_chosen;
   2013 			}
   2014 		}
   2015 		if (ipif == NULL) {
   2016 			ip0dbg(("icmp_inbound: "
   2017 			    "No source for broadcast/multicast:\n"
   2018 			    "\tsrc 0x%x dst 0x%x ill %p "
   2019 			    "ipif_lcl_addr 0x%x\n",
   2020 			    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
   2021 			    (void *)ill,
   2022 			    ill->ill_ipif->ipif_lcl_addr));
   2023 			freemsg(first_mp);
   2024 			return;
   2025 		}
   2026 		ASSERT(ipif != NULL && !ipif->ipif_isv6);
   2027 		ipha->ipha_dst = ipif->ipif_src_addr;
   2028 		ipif_refrele(ipif);
   2029 	}
   2030 	/* Reset time to live. */
   2031 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   2032 	{
   2033 		/* Swap source and destination addresses */
   2034 		ipaddr_t tmp;
   2035 
   2036 		tmp = ipha->ipha_src;
   2037 		ipha->ipha_src = ipha->ipha_dst;
   2038 		ipha->ipha_dst = tmp;
   2039 	}
   2040 	ipha->ipha_ident = 0;
   2041 	if (!IS_SIMPLE_IPH(ipha))
   2042 		icmp_options_update(ipha);
   2043 
   2044 	if (!mctl_present) {
   2045 		/*
   2046 		 * This packet should go out the same way as it
   2047 		 * came in i.e in clear. To make sure that global
   2048 		 * policy will not be applied to this in ip_wput_ire,
   2049 		 * we attach a IPSEC_IN mp and clear ipsec_in_secure.
   2050 		 */
   2051 		ASSERT(first_mp == mp);
   2052 		first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
   2053 		if (first_mp == NULL) {
   2054 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2055 			freemsg(mp);
   2056 			return;
   2057 		}
   2058 		ii = (ipsec_in_t *)first_mp->b_rptr;
   2059 
   2060 		/* This is not a secure packet */
   2061 		ii->ipsec_in_secure = B_FALSE;
   2062 		first_mp->b_cont = mp;
   2063 	} else {
   2064 		ii = (ipsec_in_t *)first_mp->b_rptr;
   2065 		ii->ipsec_in_ns = ipst->ips_netstack;	/* No netstack_hold */
   2066 	}
   2067 	ii->ipsec_in_zoneid = zoneid;
   2068 	ASSERT(zoneid != ALL_ZONES);
   2069 	if (!ipsec_in_to_out(first_mp, ipha, NULL)) {
   2070 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2071 		return;
   2072 	}
   2073 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   2074 	put(WR(q), first_mp);
   2075 }
   2076 
   2077 static ipaddr_t
   2078 icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp)
   2079 {
   2080 	conn_t *connp;
   2081 	connf_t *connfp;
   2082 	ipaddr_t nexthop_addr = INADDR_ANY;
   2083 	int hdr_length = IPH_HDR_LENGTH(ipha);
   2084 	uint16_t *up;
   2085 	uint32_t ports;
   2086 	ip_stack_t *ipst = ill->ill_ipst;
   2087 
   2088 	up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2089 	switch (ipha->ipha_protocol) {
   2090 		case IPPROTO_TCP:
   2091 		{
   2092 			tcph_t *tcph;
   2093 
   2094 			/* do a reverse lookup */
   2095 			tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
   2096 			connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph,
   2097 			    TCPS_LISTEN, ipst);
   2098 			break;
   2099 		}
   2100 		case IPPROTO_UDP:
   2101 		{
   2102 			uint32_t dstport, srcport;
   2103 
   2104 			((uint16_t *)&ports)[0] = up[1];
   2105 			((uint16_t *)&ports)[1] = up[0];
   2106 
   2107 			/* Extract ports in net byte order */
   2108 			dstport = htons(ntohl(ports) & 0xFFFF);
   2109 			srcport = htons(ntohl(ports) >> 16);
   2110 
   2111 			connfp = &ipst->ips_ipcl_udp_fanout[
   2112 			    IPCL_UDP_HASH(dstport, ipst)];
   2113 			mutex_enter(&connfp->connf_lock);
   2114 			connp = connfp->connf_head;
   2115 
   2116 			/* do a reverse lookup */
   2117 			while ((connp != NULL) &&
   2118 			    (!IPCL_UDP_MATCH(connp, dstport,
   2119 			    ipha->ipha_src, srcport, ipha->ipha_dst) ||
   2120 			    !IPCL_ZONE_MATCH(connp, zoneid))) {
   2121 				connp = connp->conn_next;
   2122 			}
   2123 			if (connp != NULL)
   2124 				CONN_INC_REF(connp);
   2125 			mutex_exit(&connfp->connf_lock);
   2126 			break;
   2127 		}
   2128 		case IPPROTO_SCTP:
   2129 		{
   2130 			in6_addr_t map_src, map_dst;
   2131 
   2132 			IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src);
   2133 			IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst);
   2134 			((uint16_t *)&ports)[0] = up[1];
   2135 			((uint16_t *)&ports)[1] = up[0];
   2136 
   2137 			connp = sctp_find_conn(&map_src, &map_dst, ports,
   2138 			    zoneid, ipst->ips_netstack->netstack_sctp);
   2139 			if (connp == NULL) {
   2140 				connp = ipcl_classify_raw(mp, IPPROTO_SCTP,
   2141 				    zoneid, ports, ipha, ipst);
   2142 			} else {
   2143 				CONN_INC_REF(connp);
   2144 				SCTP_REFRELE(CONN2SCTP(connp));
   2145 			}
   2146 			break;
   2147 		}
   2148 		default:
   2149 		{
   2150 			ipha_t ripha;
   2151 
   2152 			ripha.ipha_src = ipha->ipha_dst;
   2153 			ripha.ipha_dst = ipha->ipha_src;
   2154 			ripha.ipha_protocol = ipha->ipha_protocol;
   2155 
   2156 			connfp = &ipst->ips_ipcl_proto_fanout[
   2157 			    ipha->ipha_protocol];
   2158 			mutex_enter(&connfp->connf_lock);
   2159 			connp = connfp->connf_head;
   2160 			for (connp = connfp->connf_head; connp != NULL;
   2161 			    connp = connp->conn_next) {
   2162 				if (IPCL_PROTO_MATCH(connp,
   2163 				    ipha->ipha_protocol, &ripha, ill,
   2164 				    0, zoneid)) {
   2165 					CONN_INC_REF(connp);
   2166 					break;
   2167 				}
   2168 			}
   2169 			mutex_exit(&connfp->connf_lock);
   2170 		}
   2171 	}
   2172 	if (connp != NULL) {
   2173 		if (connp->conn_nexthop_set)
   2174 			nexthop_addr = connp->conn_nexthop_v4;
   2175 		CONN_DEC_REF(connp);
   2176 	}
   2177 	return (nexthop_addr);
   2178 }
   2179 
   2180 /* Table from RFC 1191 */
   2181 static int icmp_frag_size_table[] =
   2182 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
   2183 
   2184 /*
   2185  * Process received ICMP Packet too big.
   2186  * After updating any IRE it does the fanout to any matching transport streams.
   2187  * Assumes the message has been pulled up till the IP header that caused
   2188  * the error.
   2189  *
   2190  * Returns B_FALSE on failure and B_TRUE on success.
   2191  */
   2192 static boolean_t
   2193 icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
   2194     zoneid_t zoneid, mblk_t *mp, int iph_hdr_length,
   2195     ip_stack_t *ipst)
   2196 {
   2197 	ire_t	*ire, *first_ire;
   2198 	int	mtu, orig_mtu;
   2199 	int	hdr_length;
   2200 	ipaddr_t nexthop_addr;
   2201 	boolean_t disable_pmtud;
   2202 
   2203 	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   2204 	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
   2205 	ASSERT(ill != NULL);
   2206 
   2207 	hdr_length = IPH_HDR_LENGTH(ipha);
   2208 
   2209 	/* Drop if the original packet contained a source route */
   2210 	if (ip_source_route_included(ipha)) {
   2211 		return (B_FALSE);
   2212 	}
   2213 	/*
   2214 	 * Verify we have atleast ICMP_MIN_TP_HDR_LENGTH bytes of transport
   2215 	 * header.
   2216 	 */
   2217 	if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   2218 	    mp->b_wptr) {
   2219 		if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
   2220 		    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
   2221 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2222 			ip1dbg(("icmp_inbound_too_big: insufficient hdr\n"));
   2223 			return (B_FALSE);
   2224 		}
   2225 		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2226 		ipha = (ipha_t *)&icmph[1];
   2227 	}
   2228 	nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp);
   2229 	if (nexthop_addr != INADDR_ANY) {
   2230 		/* nexthop set */
   2231 		first_ire = ire_ctable_lookup(ipha->ipha_dst,
   2232 		    nexthop_addr, 0, NULL, ALL_ZONES, msg_getlabel(mp),
   2233 		    MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst);
   2234 	} else {
   2235 		/* nexthop not set */
   2236 		first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE,
   2237 		    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
   2238 	}
   2239 
   2240 	if (!first_ire) {
   2241 		ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
   2242 		    ntohl(ipha->ipha_dst)));
   2243 		return (B_FALSE);
   2244 	}
   2245 
   2246 	/* Check for MTU discovery advice as described in RFC 1191 */
   2247 	mtu = ntohs(icmph->icmph_du_mtu);
   2248 	orig_mtu = mtu;
   2249 	disable_pmtud = B_FALSE;
   2250 
   2251 	rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
   2252 	for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
   2253 	    ire = ire->ire_next) {
   2254 		/*
   2255 		 * Look for the connection to which this ICMP message is
   2256 		 * directed. If it has the IP_NEXTHOP option set, then the
   2257 		 * search is limited to IREs with the MATCH_IRE_PRIVATE
   2258 		 * option. Else the search is limited to regular IREs.
   2259 		 */
   2260 		if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
   2261 		    (nexthop_addr != ire->ire_gateway_addr)) ||
   2262 		    (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
   2263 		    (nexthop_addr != INADDR_ANY)))
   2264 			continue;
   2265 
   2266 		mutex_enter(&ire->ire_lock);
   2267 		if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
   2268 			uint32_t length;
   2269 			int	i;
   2270 
   2271 			/*
   2272 			 * Use the table from RFC 1191 to figure out
   2273 			 * the next "plateau" based on the length in
   2274 			 * the original IP packet.
   2275 			 */
   2276 			length = ntohs(ipha->ipha_length);
   2277 			DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire,
   2278 			    uint32_t, length);
   2279 			if (ire->ire_max_frag <= length &&
   2280 			    ire->ire_max_frag >= length - hdr_length) {
   2281 				/*
   2282 				 * Handle broken BSD 4.2 systems that
   2283 				 * return the wrong iph_length in ICMP
   2284 				 * errors.
   2285 				 */
   2286 				length -= hdr_length;
   2287 			}
   2288 			for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
   2289 				if (length > icmp_frag_size_table[i])
   2290 					break;
   2291 			}
   2292 			if (i == A_CNT(icmp_frag_size_table)) {
   2293 				/* Smaller than 68! */
   2294 				disable_pmtud = B_TRUE;
   2295 				mtu = ipst->ips_ip_pmtu_min;
   2296 			} else {
   2297 				mtu = icmp_frag_size_table[i];
   2298 				if (mtu < ipst->ips_ip_pmtu_min) {
   2299 					mtu = ipst->ips_ip_pmtu_min;
   2300 					disable_pmtud = B_TRUE;
   2301 				}
   2302 			}
   2303 			/* Fool the ULP into believing our guessed PMTU. */
   2304 			icmph->icmph_du_zero = 0;
   2305 			icmph->icmph_du_mtu = htons(mtu);
   2306 		}
   2307 		if (disable_pmtud)
   2308 			ire->ire_frag_flag = 0;
   2309 		/* Reduce the IRE max frag value as advised. */
   2310 		ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
   2311 		if (ire->ire_max_frag == mtu) {
   2312 			/* Decreased it */
   2313 			ire->ire_marks |= IRE_MARK_PMTU;
   2314 		}
   2315 		mutex_exit(&ire->ire_lock);
   2316 		DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *,
   2317 		    ire, int, orig_mtu, int, mtu);
   2318 	}
   2319 	rw_exit(&first_ire->ire_bucket->irb_lock);
   2320 	ire_refrele(first_ire);
   2321 	return (B_TRUE);
   2322 }
   2323 
   2324 /*
   2325  * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout
   2326  * calls this function.
   2327  */
   2328 static mblk_t *
   2329 icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
   2330 {
   2331 	ipha_t *ipha;
   2332 	icmph_t *icmph;
   2333 	ipha_t *in_ipha;
   2334 	int length;
   2335 
   2336 	ASSERT(mp->b_datap->db_type == M_DATA);
   2337 
   2338 	/*
   2339 	 * For Self-encapsulated packets, we added an extra IP header
   2340 	 * without the options. Inner IP header is the one from which
   2341 	 * the outer IP header was formed. Thus, we need to remove the
   2342 	 * outer IP header. To do this, we pullup the whole message
   2343 	 * and overlay whatever follows the outer IP header over the
   2344 	 * outer IP header.
   2345 	 */
   2346 
   2347 	if (!pullupmsg(mp, -1))
   2348 		return (NULL);
   2349 
   2350 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2351 	ipha = (ipha_t *)&icmph[1];
   2352 	in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
   2353 
   2354 	/*
   2355 	 * The length that we want to overlay is following the inner
   2356 	 * IP header. Subtracting the IP header + icmp header + outer
   2357 	 * IP header's length should give us the length that we want to
   2358 	 * overlay.
   2359 	 */
   2360 	length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) -
   2361 	    hdr_length;
   2362 	/*
   2363 	 * Overlay whatever follows the inner header over the
   2364 	 * outer header.
   2365 	 */
   2366 	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
   2367 
   2368 	/* Set the wptr to account for the outer header */
   2369 	mp->b_wptr -= hdr_length;
   2370 	return (mp);
   2371 }
   2372 
   2373 /*
   2374  * Try to pass the ICMP message upstream in case the ULP cares.
   2375  *
   2376  * If the packet that caused the ICMP error is secure, we send
   2377  * it to AH/ESP to make sure that the attached packet has a
   2378  * valid association. ipha in the code below points to the
   2379  * IP header of the packet that caused the error.
   2380  *
   2381  * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently
   2382  * in the context of IPsec. Normally we tell the upper layer
   2383  * whenever we send the ire (including ip_bind), the IPsec header
   2384  * length in ire_ipsec_overhead. TCP can deduce the MSS as it
   2385  * has both the MTU (ire_max_frag) and the ire_ipsec_overhead.
   2386  * Similarly, we pass the new MTU icmph_du_mtu and TCP does the
   2387  * same thing. As TCP has the IPsec options size that needs to be
   2388  * adjusted, we just pass the MTU unchanged.
   2389  *
   2390  * IFN could have been generated locally or by some router.
   2391  *
   2392  * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this.
   2393  *	    This happens because IP adjusted its value of MTU on an
   2394  *	    earlier IFN message and could not tell the upper layer,
   2395  *	    the new adjusted value of MTU e.g. Packet was encrypted
   2396  *	    or there was not enough information to fanout to upper
   2397  *	    layers. Thus on the next outbound datagram, ip_wput_ire
   2398  *	    generates the IFN, where IPsec processing has *not* been
   2399  *	    done.
   2400  *
   2401  *	   *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed
   2402  *	    could have generated this. This happens because ire_max_frag
   2403  *	    value in IP was set to a new value, while the IPsec processing
   2404  *	    was being done and after we made the fragmentation check in
   2405  *	    ip_wput_ire. Thus on return from IPsec processing,
   2406  *	    ip_wput_ipsec_out finds that the new length is > ire_max_frag
   2407  *	    and generates the IFN. As IPsec processing is over, we fanout
   2408  *	    to AH/ESP to remove the header.
   2409  *
   2410  *	    In both these cases, ipsec_in_loopback will be set indicating
   2411  *	    that IFN was generated locally.
   2412  *
   2413  * ROUTER : IFN could be secure or non-secure.
   2414  *
   2415  *	    * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
   2416  *	      packet in error has AH/ESP headers to validate the AH/ESP
   2417  *	      headers. AH/ESP will verify whether there is a valid SA or
   2418  *	      not and send it back. We will fanout again if we have more
   2419  *	      data in the packet.
   2420  *
   2421  *	      If the packet in error does not have AH/ESP, we handle it
   2422  *	      like any other case.
   2423  *
   2424  *	    * NON_SECURE : If the packet in error has AH/ESP headers,
   2425  *	      we attach a dummy ipsec_in and send it up to AH/ESP
   2426  *	      for validation. AH/ESP will verify whether there is a
   2427  *	      valid SA or not and send it back. We will fanout again if
   2428  *	      we have more data in the packet.
   2429  *
   2430  *	      If the packet in error does not have AH/ESP, we handle it
   2431  *	      like any other case.
   2432  */
   2433 static void
   2434 icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
   2435     icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
   2436     boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
   2437     zoneid_t zoneid)
   2438 {
   2439 	uint16_t *up;	/* Pointer to ports in ULP header */
   2440 	uint32_t ports;	/* reversed ports for fanout */
   2441 	ipha_t ripha;	/* With reversed addresses */
   2442 	mblk_t *first_mp;
   2443 	ipsec_in_t *ii;
   2444 	tcph_t	*tcph;
   2445 	conn_t	*connp;
   2446 	ip_stack_t *ipst;
   2447 
   2448 	ASSERT(ill != NULL);
   2449 
   2450 	ASSERT(recv_ill != NULL);
   2451 	ipst = recv_ill->ill_ipst;
   2452 
   2453 	first_mp = mp;
   2454 	if (mctl_present) {
   2455 		mp = first_mp->b_cont;
   2456 		ASSERT(mp != NULL);
   2457 
   2458 		ii = (ipsec_in_t *)first_mp->b_rptr;
   2459 		ASSERT(ii->ipsec_in_type == IPSEC_IN);
   2460 	} else {
   2461 		ii = NULL;
   2462 	}
   2463 
   2464 	switch (ipha->ipha_protocol) {
   2465 	case IPPROTO_UDP:
   2466 		/*
   2467 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   2468 		 * transport header.
   2469 		 */
   2470 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   2471 		    mp->b_wptr) {
   2472 			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
   2473 			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
   2474 				goto discard_pkt;
   2475 			}
   2476 			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2477 			ipha = (ipha_t *)&icmph[1];
   2478 		}
   2479 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2480 
   2481 		/*
   2482 		 * Attempt to find a client stream based on port.
   2483 		 * Note that we do a reverse lookup since the header is
   2484 		 * in the form we sent it out.
   2485 		 * The ripha header is only used for the IP_UDP_MATCH and we
   2486 		 * only set the src and dst addresses and protocol.
   2487 		 */
   2488 		ripha.ipha_src = ipha->ipha_dst;
   2489 		ripha.ipha_dst = ipha->ipha_src;
   2490 		ripha.ipha_protocol = ipha->ipha_protocol;
   2491 		((uint16_t *)&ports)[0] = up[1];
   2492 		((uint16_t *)&ports)[1] = up[0];
   2493 		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
   2494 		    ntohl(ipha->ipha_src), ntohs(up[0]),
   2495 		    ntohl(ipha->ipha_dst), ntohs(up[1]),
   2496 		    icmph->icmph_type, icmph->icmph_code));
   2497 
   2498 		/* Have to change db_type after any pullupmsg */
   2499 		DB_TYPE(mp) = M_CTL;
   2500 
   2501 		ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
   2502 		    mctl_present, ip_policy, recv_ill, zoneid);
   2503 		return;
   2504 
   2505 	case IPPROTO_TCP:
   2506 		/*
   2507 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   2508 		 * transport header.
   2509 		 */
   2510 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   2511 		    mp->b_wptr) {
   2512 			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
   2513 			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
   2514 				goto discard_pkt;
   2515 			}
   2516 			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2517 			ipha = (ipha_t *)&icmph[1];
   2518 		}
   2519 		/*
   2520 		 * Find a TCP client stream for this packet.
   2521 		 * Note that we do a reverse lookup since the header is
   2522 		 * in the form we sent it out.
   2523 		 */
   2524 		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
   2525 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
   2526 		    ipst);
   2527 		if (connp == NULL)
   2528 			goto discard_pkt;
   2529 
   2530 		/* Have to change db_type after any pullupmsg */
   2531 		DB_TYPE(mp) = M_CTL;
   2532 		SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
   2533 		    SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR);
   2534 		return;
   2535 
   2536 	case IPPROTO_SCTP:
   2537 		/*
   2538 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   2539 		 * transport header.
   2540 		 */
   2541 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   2542 		    mp->b_wptr) {
   2543 			if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
   2544 			    ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
   2545 				goto discard_pkt;
   2546 			}
   2547 			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2548 			ipha = (ipha_t *)&icmph[1];
   2549 		}
   2550 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2551 		/*
   2552 		 * Find a SCTP client stream for this packet.
   2553 		 * Note that we do a reverse lookup since the header is
   2554 		 * in the form we sent it out.
   2555 		 * The ripha header is only used for the matching and we
   2556 		 * only set the src and dst addresses, protocol, and version.
   2557 		 */
   2558 		ripha.ipha_src = ipha->ipha_dst;
   2559 		ripha.ipha_dst = ipha->ipha_src;
   2560 		ripha.ipha_protocol = ipha->ipha_protocol;
   2561 		ripha.ipha_version_and_hdr_length =
   2562 		    ipha->ipha_version_and_hdr_length;
   2563 		((uint16_t *)&ports)[0] = up[1];
   2564 		((uint16_t *)&ports)[1] = up[0];
   2565 
   2566 		/* Have to change db_type after any pullupmsg */
   2567 		DB_TYPE(mp) = M_CTL;
   2568 		ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
   2569 		    mctl_present, ip_policy, zoneid);
   2570 		return;
   2571 
   2572 	case IPPROTO_ESP:
   2573 	case IPPROTO_AH: {
   2574 		int ipsec_rc;
   2575 		ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
   2576 
   2577 		/*
   2578 		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
   2579 		 * We will re-use the IPSEC_IN if it is already present as
   2580 		 * AH/ESP will not affect any fields in the IPSEC_IN for
   2581 		 * ICMP errors. If there is no IPSEC_IN, allocate a new
   2582 		 * one and attach it in the front.
   2583 		 */
   2584 		if (ii != NULL) {
   2585 			/*
   2586 			 * ip_fanout_proto_again converts the ICMP errors
   2587 			 * that come back from AH/ESP to M_DATA so that
   2588 			 * if it is non-AH/ESP and we do a pullupmsg in
   2589 			 * this function, it would work. Convert it back
   2590 			 * to M_CTL before we send up as this is a ICMP
   2591 			 * error. This could have been generated locally or
   2592 			 * by some router. Validate the inner IPsec
   2593 			 * headers.
   2594 			 *
   2595 			 * NOTE : ill_index is used by ip_fanout_proto_again
   2596 			 * to locate the ill.
   2597 			 */
   2598 			ASSERT(ill != NULL);
   2599 			ii->ipsec_in_ill_index =
   2600 			    ill->ill_phyint->phyint_ifindex;
   2601 			ii->ipsec_in_rill_index =
   2602 			    recv_ill->ill_phyint->phyint_ifindex;
   2603 			DB_TYPE(first_mp->b_cont) = M_CTL;
   2604 		} else {
   2605 			/*
   2606 			 * IPSEC_IN is not present. We attach a ipsec_in
   2607 			 * message and send up to IPsec for validating
   2608 			 * and removing the IPsec headers. Clear
   2609 			 * ipsec_in_secure so that when we return
   2610 			 * from IPsec, we don't mistakenly think that this
   2611 			 * is a secure packet came from the network.
   2612 			 *
   2613 			 * NOTE : ill_index is used by ip_fanout_proto_again
   2614 			 * to locate the ill.
   2615 			 */
   2616 			ASSERT(first_mp == mp);
   2617 			first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
   2618 			if (first_mp == NULL) {
   2619 				freemsg(mp);
   2620 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2621 				return;
   2622 			}
   2623 			ii = (ipsec_in_t *)first_mp->b_rptr;
   2624 
   2625 			/* This is not a secure packet */
   2626 			ii->ipsec_in_secure = B_FALSE;
   2627 			first_mp->b_cont = mp;
   2628 			DB_TYPE(mp) = M_CTL;
   2629 			ASSERT(ill != NULL);
   2630 			ii->ipsec_in_ill_index =
   2631 			    ill->ill_phyint->phyint_ifindex;
   2632 			ii->ipsec_in_rill_index =
   2633 			    recv_ill->ill_phyint->phyint_ifindex;
   2634 		}
   2635 		ip2dbg(("icmp_inbound_error: ipsec\n"));
   2636 
   2637 		if (!ipsec_loaded(ipss)) {
   2638 			ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
   2639 			return;
   2640 		}
   2641 
   2642 		if (ipha->ipha_protocol == IPPROTO_ESP)
   2643 			ipsec_rc = ipsecesp_icmp_error(first_mp);
   2644 		else
   2645 			ipsec_rc = ipsecah_icmp_error(first_mp);
   2646 		if (ipsec_rc == IPSEC_STATUS_FAILED)
   2647 			return;
   2648 
   2649 		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
   2650 		return;
   2651 	}
   2652 	default:
   2653 		/*
   2654 		 * The ripha header is only used for the lookup and we
   2655 		 * only set the src and dst addresses and protocol.
   2656 		 */
   2657 		ripha.ipha_src = ipha->ipha_dst;
   2658 		ripha.ipha_dst = ipha->ipha_src;
   2659 		ripha.ipha_protocol = ipha->ipha_protocol;
   2660 		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
   2661 		    ripha.ipha_protocol, ntohl(ipha->ipha_src),
   2662 		    ntohl(ipha->ipha_dst),
   2663 		    icmph->icmph_type, icmph->icmph_code));
   2664 		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2665 			ipha_t *in_ipha;
   2666 
   2667 			if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
   2668 			    mp->b_wptr) {
   2669 				if (!pullupmsg(mp, (uchar_t *)ipha +
   2670 				    hdr_length + sizeof (ipha_t) -
   2671 				    mp->b_rptr)) {
   2672 					goto discard_pkt;
   2673 				}
   2674 				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2675 				ipha = (ipha_t *)&icmph[1];
   2676 			}
   2677 			/*
   2678 			 * Caller has verified that length has to be
   2679 			 * at least the size of IP header.
   2680 			 */
   2681 			ASSERT(hdr_length >= sizeof (ipha_t));
   2682 			/*
   2683 			 * Check the sanity of the inner IP header like
   2684 			 * we did for the outer header.
   2685 			 */
   2686 			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
   2687 			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
   2688 				goto discard_pkt;
   2689 			}
   2690 			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
   2691 				goto discard_pkt;
   2692 			}
   2693 			/* Check for Self-encapsulated tunnels */
   2694 			if (in_ipha->ipha_src == ipha->ipha_src &&
   2695 			    in_ipha->ipha_dst == ipha->ipha_dst) {
   2696 
   2697 				mp = icmp_inbound_self_encap_error(mp,
   2698 				    iph_hdr_length, hdr_length);
   2699 				if (mp == NULL)
   2700 					goto discard_pkt;
   2701 				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   2702 				ipha = (ipha_t *)&icmph[1];
   2703 				hdr_length = IPH_HDR_LENGTH(ipha);
   2704 				/*
   2705 				 * The packet in error is self-encapsualted.
   2706 				 * And we are finding it further encapsulated
   2707 				 * which we could not have possibly generated.
   2708 				 */
   2709 				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2710 					goto discard_pkt;
   2711 				}
   2712 				icmp_inbound_error_fanout(q, ill, first_mp,
   2713 				    icmph, ipha, iph_hdr_length, hdr_length,
   2714 				    mctl_present, ip_policy, recv_ill, zoneid);
   2715 				return;
   2716 			}
   2717 		}
   2718 		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
   2719 		    ipha->ipha_protocol == IPPROTO_IPV6) &&
   2720 		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
   2721 		    ii != NULL &&
   2722 		    ii->ipsec_in_loopback &&
   2723 		    ii->ipsec_in_secure) {
   2724 			/*
   2725 			 * For IP tunnels that get a looped-back
   2726 			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
   2727 			 * reported new MTU to take into account the IPsec
   2728 			 * headers protecting this configured tunnel.
   2729 			 *
   2730 			 * This allows the tunnel module (tun.c) to blindly
   2731 			 * accept the MTU reported in an ICMP "too big"
   2732 			 * message.
   2733 			 *
   2734 			 * Non-looped back ICMP messages will just be
   2735 			 * handled by the security protocols (if needed),
   2736 			 * and the first subsequent packet will hit this
   2737 			 * path.
   2738 			 */
   2739 			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
   2740 			    ipsec_in_extra_length(first_mp));
   2741 		}
   2742 		/* Have to change db_type after any pullupmsg */
   2743 		DB_TYPE(mp) = M_CTL;
   2744 
   2745 		ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
   2746 		    ip_policy, recv_ill, zoneid);
   2747 		return;
   2748 	}
   2749 	/* NOTREACHED */
   2750 discard_pkt:
   2751 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2752 drop_pkt:;
   2753 	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
   2754 	freemsg(first_mp);
   2755 }
   2756 
   2757 /*
   2758  * Common IP options parser.
   2759  *
   2760  * Setup routine: fill in *optp with options-parsing state, then
   2761  * tail-call ipoptp_next to return the first option.
   2762  */
   2763 uint8_t
   2764 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
   2765 {
   2766 	uint32_t totallen; /* total length of all options */
   2767 
   2768 	totallen = ipha->ipha_version_and_hdr_length -
   2769 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   2770 	totallen <<= 2;
   2771 	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
   2772 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2773 	optp->ipoptp_flags = 0;
   2774 	return (ipoptp_next(optp));
   2775 }
   2776 
   2777 /*
   2778  * Common IP options parser: extract next option.
   2779  */
   2780 uint8_t
   2781 ipoptp_next(ipoptp_t *optp)
   2782 {
   2783 	uint8_t *end = optp->ipoptp_end;
   2784 	uint8_t *cur = optp->ipoptp_next;
   2785 	uint8_t opt, len, pointer;
   2786 
   2787 	/*
   2788 	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
   2789 	 * has been corrupted.
   2790 	 */
   2791 	ASSERT(cur <= end);
   2792 
   2793 	if (cur == end)
   2794 		return (IPOPT_EOL);
   2795 
   2796 	opt = cur[IPOPT_OPTVAL];
   2797 
   2798 	/*
   2799 	 * Skip any NOP options.
   2800 	 */
   2801 	while (opt == IPOPT_NOP) {
   2802 		cur++;
   2803 		if (cur == end)
   2804 			return (IPOPT_EOL);
   2805 		opt = cur[IPOPT_OPTVAL];
   2806 	}
   2807 
   2808 	if (opt == IPOPT_EOL)
   2809 		return (IPOPT_EOL);
   2810 
   2811 	/*
   2812 	 * Option requiring a length.
   2813 	 */
   2814 	if ((cur + 1) >= end) {
   2815 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2816 		return (IPOPT_EOL);
   2817 	}
   2818 	len = cur[IPOPT_OLEN];
   2819 	if (len < 2) {
   2820 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2821 		return (IPOPT_EOL);
   2822 	}
   2823 	optp->ipoptp_cur = cur;
   2824 	optp->ipoptp_len = len;
   2825 	optp->ipoptp_next = cur + len;
   2826 	if (cur + len > end) {
   2827 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2828 		return (IPOPT_EOL);
   2829 	}
   2830 
   2831 	/*
   2832 	 * For the options which require a pointer field, make sure
   2833 	 * its there, and make sure it points to either something
   2834 	 * inside this option, or the end of the option.
   2835 	 */
   2836 	switch (opt) {
   2837 	case IPOPT_RR:
   2838 	case IPOPT_TS:
   2839 	case IPOPT_LSRR:
   2840 	case IPOPT_SSRR:
   2841 		if (len <= IPOPT_OFFSET) {
   2842 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2843 			return (opt);
   2844 		}
   2845 		pointer = cur[IPOPT_OFFSET];
   2846 		if (pointer - 1 > len) {
   2847 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2848 			return (opt);
   2849 		}
   2850 		break;
   2851 	}
   2852 
   2853 	/*
   2854 	 * Sanity check the pointer field based on the type of the
   2855 	 * option.
   2856 	 */
   2857 	switch (opt) {
   2858 	case IPOPT_RR:
   2859 	case IPOPT_SSRR:
   2860 	case IPOPT_LSRR:
   2861 		if (pointer < IPOPT_MINOFF_SR)
   2862 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2863 		break;
   2864 	case IPOPT_TS:
   2865 		if (pointer < IPOPT_MINOFF_IT)
   2866 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2867 		/*
   2868 		 * Note that the Internet Timestamp option also
   2869 		 * contains two four bit fields (the Overflow field,
   2870 		 * and the Flag field), which follow the pointer
   2871 		 * field.  We don't need to check that these fields
   2872 		 * fall within the length of the option because this
   2873 		 * was implicitely done above.  We've checked that the
   2874 		 * pointer value is at least IPOPT_MINOFF_IT, and that
   2875 		 * it falls within the option.  Since IPOPT_MINOFF_IT >
   2876 		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
   2877 		 */
   2878 		ASSERT(len > IPOPT_POS_OV_FLG);
   2879 		break;
   2880 	}
   2881 
   2882 	return (opt);
   2883 }
   2884 
   2885 /*
   2886  * Use the outgoing IP header to create an IP_OPTIONS option the way
   2887  * it was passed down from the application.
   2888  */
   2889 int
   2890 ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
   2891 {
   2892 	ipoptp_t	opts;
   2893 	const uchar_t	*opt;
   2894 	uint8_t		optval;
   2895 	uint8_t		optlen;
   2896 	uint32_t	len = 0;
   2897 	uchar_t	*buf1 = buf;
   2898 
   2899 	buf += IP_ADDR_LEN;	/* Leave room for final destination */
   2900 	len += IP_ADDR_LEN;
   2901 	bzero(buf1, IP_ADDR_LEN);
   2902 
   2903 	/*
   2904 	 * OK to cast away const here, as we don't store through the returned
   2905 	 * opts.ipoptp_cur pointer.
   2906 	 */
   2907 	for (optval = ipoptp_first(&opts, (ipha_t *)ipha);
   2908 	    optval != IPOPT_EOL;
   2909 	    optval = ipoptp_next(&opts)) {
   2910 		int	off;
   2911 
   2912 		opt = opts.ipoptp_cur;
   2913 		optlen = opts.ipoptp_len;
   2914 		switch (optval) {
   2915 		case IPOPT_SSRR:
   2916 		case IPOPT_LSRR:
   2917 
   2918 			/*
   2919 			 * Insert ipha_dst as the first entry in the source
   2920 			 * route and move down the entries on step.
   2921 			 * The last entry gets placed at buf1.
   2922 			 */
   2923 			buf[IPOPT_OPTVAL] = optval;
   2924 			buf[IPOPT_OLEN] = optlen;
   2925 			buf[IPOPT_OFFSET] = optlen;
   2926 
   2927 			off = optlen - IP_ADDR_LEN;
   2928 			if (off < 0) {
   2929 				/* No entries in source route */
   2930 				break;
   2931 			}
   2932 			/* Last entry in source route */
   2933 			bcopy(opt + off, buf1, IP_ADDR_LEN);
   2934 			off -= IP_ADDR_LEN;
   2935 
   2936 			while (off > 0) {
   2937 				bcopy(opt + off,
   2938 				    buf + off + IP_ADDR_LEN,
   2939 				    IP_ADDR_LEN);
   2940 				off -= IP_ADDR_LEN;
   2941 			}
   2942 			/* ipha_dst into first slot */
   2943 			bcopy(&ipha->ipha_dst,
   2944 			    buf + off + IP_ADDR_LEN,
   2945 			    IP_ADDR_LEN);
   2946 			buf += optlen;
   2947 			len += optlen;
   2948 			break;
   2949 
   2950 		case IPOPT_COMSEC:
   2951 		case IPOPT_SECURITY:
   2952 			/* if passing up a label is not ok, then remove */
   2953 			if (is_system_labeled())
   2954 				break;
   2955 			/* FALLTHROUGH */
   2956 		default:
   2957 			bcopy(opt, buf, optlen);
   2958 			buf += optlen;
   2959 			len += optlen;
   2960 			break;
   2961 		}
   2962 	}
   2963 done:
   2964 	/* Pad the resulting options */
   2965 	while (len & 0x3) {
   2966 		*buf++ = IPOPT_EOL;
   2967 		len++;
   2968 	}
   2969 	return (len);
   2970 }
   2971 
   2972 /*
   2973  * Update any record route or timestamp options to include this host.
   2974  * Reverse any source route option.
   2975  * This routine assumes that the options are well formed i.e. that they
   2976  * have already been checked.
   2977  */
   2978 static void
   2979 icmp_options_update(ipha_t *ipha)
   2980 {
   2981 	ipoptp_t	opts;
   2982 	uchar_t		*opt;
   2983 	uint8_t		optval;
   2984 	ipaddr_t	src;		/* Our local address */
   2985 	ipaddr_t	dst;
   2986 
   2987 	ip2dbg(("icmp_options_update\n"));
   2988 	src = ipha->ipha_src;
   2989 	dst = ipha->ipha_dst;
   2990 
   2991 	for (optval = ipoptp_first(&opts, ipha);
   2992 	    optval != IPOPT_EOL;
   2993 	    optval = ipoptp_next(&opts)) {
   2994 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   2995 		opt = opts.ipoptp_cur;
   2996 		ip2dbg(("icmp_options_update: opt %d, len %d\n",
   2997 		    optval, opts.ipoptp_len));
   2998 		switch (optval) {
   2999 			int off1, off2;
   3000 		case IPOPT_SSRR:
   3001 		case IPOPT_LSRR:
   3002 			/*
   3003 			 * Reverse the source route.  The first entry
   3004 			 * should be the next to last one in the current
   3005 			 * source route (the last entry is our address).
   3006 			 * The last entry should be the final destination.
   3007 			 */
   3008 			off1 = IPOPT_MINOFF_SR - 1;
   3009 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   3010 			if (off2 < 0) {
   3011 				/* No entries in source route */
   3012 				ip1dbg((
   3013 				    "icmp_options_update: bad src route\n"));
   3014 				break;
   3015 			}
   3016 			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
   3017 			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
   3018 			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
   3019 			off2 -= IP_ADDR_LEN;
   3020 
   3021 			while (off1 < off2) {
   3022 				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
   3023 				bcopy((char *)opt + off2, (char *)opt + off1,
   3024 				    IP_ADDR_LEN);
   3025 				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
   3026 				off1 += IP_ADDR_LEN;
   3027 				off2 -= IP_ADDR_LEN;
   3028 			}
   3029 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   3030 			break;
   3031 		}
   3032 	}
   3033 }
   3034 
   3035 /*
   3036  * Process received ICMP Redirect messages.
   3037  */
   3038 static void
   3039 icmp_redirect(ill_t *ill, mblk_t *mp)
   3040 {
   3041 	ipha_t	*ipha;
   3042 	int	iph_hdr_length;
   3043 	icmph_t	*icmph;
   3044 	ipha_t	*ipha_err;
   3045 	ire_t	*ire;
   3046 	ire_t	*prev_ire;
   3047 	ire_t	*save_ire;
   3048 	ipaddr_t  src, dst, gateway;
   3049 	iulp_t	ulp_info = { 0 };
   3050 	int	error;
   3051 	ip_stack_t *ipst;
   3052 
   3053 	ASSERT(ill != NULL);
   3054 	ipst = ill->ill_ipst;
   3055 
   3056 	ipha = (ipha_t *)mp->b_rptr;
   3057 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   3058 	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
   3059 	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
   3060 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   3061 		freemsg(mp);
   3062 		return;
   3063 	}
   3064 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   3065 	ipha_err = (ipha_t *)&icmph[1];
   3066 	src = ipha->ipha_src;
   3067 	dst = ipha_err->ipha_dst;
   3068 	gateway = icmph->icmph_rd_gateway;
   3069 	/* Make sure the new gateway is reachable somehow. */
   3070 	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
   3071 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
   3072 	/*
   3073 	 * Make sure we had a route for the dest in question and that
   3074 	 * that route was pointing to the old gateway (the source of the
   3075 	 * redirect packet.)
   3076 	 */
   3077 	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
   3078 	    NULL, MATCH_IRE_GW, ipst);
   3079 	/*
   3080 	 * Check that
   3081 	 *	the redirect was not from ourselves
   3082 	 *	the new gateway and the old gateway are directly reachable
   3083 	 */
   3084 	if (!prev_ire ||
   3085 	    !ire ||
   3086 	    ire->ire_type == IRE_LOCAL) {
   3087 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   3088 		freemsg(mp);
   3089 		if (ire != NULL)
   3090 			ire_refrele(ire);
   3091 		if (prev_ire != NULL)
   3092 			ire_refrele(prev_ire);
   3093 		return;
   3094 	}
   3095 
   3096 	/*
   3097 	 * Should we use the old ULP info to create the new gateway?  From
   3098 	 * a user's perspective, we should inherit the info so that it
   3099 	 * is a "smooth" transition.  If we do not do that, then new
   3100 	 * connections going thru the new gateway will have no route metrics,
   3101 	 * which is counter-intuitive to user.  From a network point of
   3102 	 * view, this may or may not make sense even though the new gateway
   3103 	 * is still directly connected to us so the route metrics should not
   3104 	 * change much.
   3105 	 *
   3106 	 * But if the old ire_uinfo is not initialized, we do another
   3107 	 * recursive lookup on the dest using the new gateway.  There may
   3108 	 * be a route to that.  If so, use it to initialize the redirect
   3109 	 * route.
   3110 	 */
   3111 	if (prev_ire->ire_uinfo.iulp_set) {
   3112 		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
   3113 	} else {
   3114 		ire_t *tmp_ire;
   3115 		ire_t *sire;
   3116 
   3117 		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
   3118 		    ALL_ZONES, 0, NULL,
   3119 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
   3120 		    ipst);
   3121 		if (sire != NULL) {
   3122 			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
   3123 			/*
   3124 			 * If sire != NULL, ire_ftable_lookup() should not
   3125 			 * return a NULL value.
   3126 			 */
   3127 			ASSERT(tmp_ire != NULL);
   3128 			ire_refrele(tmp_ire);
   3129 			ire_refrele(sire);
   3130 		} else if (tmp_ire != NULL) {
   3131 			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
   3132 			    sizeof (iulp_t));
   3133 			ire_refrele(tmp_ire);
   3134 		}
   3135 	}
   3136 	if (prev_ire->ire_type == IRE_CACHE)
   3137 		ire_delete(prev_ire);
   3138 	ire_refrele(prev_ire);
   3139 	/*
   3140 	 * TODO: more precise handling for cases 0, 2, 3, the latter two
   3141 	 * require TOS routing
   3142 	 */
   3143 	switch (icmph->icmph_code) {
   3144 	case 0:
   3145 	case 1:
   3146 		/* TODO: TOS specificity for cases 2 and 3 */
   3147 	case 2:
   3148 	case 3:
   3149 		break;
   3150 	default:
   3151 		freemsg(mp);
   3152 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   3153 		ire_refrele(ire);
   3154 		return;
   3155 	}
   3156 	/*
   3157 	 * Create a Route Association.  This will allow us to remember that
   3158 	 * someone we believe told us to use the particular gateway.
   3159 	 */
   3160 	save_ire = ire;
   3161 	ire = ire_create(
   3162 	    (uchar_t *)&dst,			/* dest addr */
   3163 	    (uchar_t *)&ip_g_all_ones,		/* mask */
   3164 	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
   3165 	    (uchar_t *)&gateway,		/* gateway addr */
   3166 	    &save_ire->ire_max_frag,		/* max frag */
   3167 	    NULL,				/* no src nce */
   3168 	    NULL,				/* no rfq */
   3169 	    NULL,				/* no stq */
   3170 	    IRE_HOST,
   3171 	    NULL,				/* ipif */
   3172 	    0,					/* cmask */
   3173 	    0,					/* phandle */
   3174 	    0,					/* ihandle */
   3175 	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
   3176 	    &ulp_info,
   3177 	    NULL,				/* tsol_gc_t */
   3178 	    NULL,				/* gcgrp */
   3179 	    ipst);
   3180 
   3181 	if (ire == NULL) {
   3182 		freemsg(mp);
   3183 		ire_refrele(save_ire);
   3184 		return;
   3185 	}
   3186 	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
   3187 	ire_refrele(save_ire);
   3188 	atomic_inc_32(&ipst->ips_ip_redirect_cnt);
   3189 
   3190 	if (error == 0) {
   3191 		ire_refrele(ire);		/* Held in ire_add_v4 */
   3192 		/* tell routing sockets that we received a redirect */
   3193 		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
   3194 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
   3195 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
   3196 	}
   3197 
   3198 	/*
   3199 	 * Delete any existing IRE_HOST type redirect ires for this destination.
   3200 	 * This together with the added IRE has the effect of
   3201 	 * modifying an existing redirect.
   3202 	 */
   3203 	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
   3204 	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
   3205 	if (prev_ire != NULL) {
   3206 		if (prev_ire ->ire_flags & RTF_DYNAMIC)
   3207 			ire_delete(prev_ire);
   3208 		ire_refrele(prev_ire);
   3209 	}
   3210 
   3211 	freemsg(mp);
   3212 }
   3213 
   3214 /*
   3215  * Generate an ICMP parameter problem message.
   3216  */
   3217 static void
   3218 icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
   3219 	ip_stack_t *ipst)
   3220 {
   3221 	icmph_t	icmph;
   3222 	boolean_t mctl_present;
   3223 	mblk_t *first_mp;
   3224 
   3225 	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
   3226 
   3227 	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
   3228 		if (mctl_present)
   3229 			freeb(first_mp);
   3230 		return;
   3231 	}
   3232 
   3233 	bzero(&icmph, sizeof (icmph_t));
   3234 	icmph.icmph_type = ICMP_PARAM_PROBLEM;
   3235 	icmph.icmph_pp_ptr = ptr;
   3236 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
   3237 	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
   3238 	    ipst);
   3239 }
   3240 
   3241 /*
   3242  * Build and ship an IPv4 ICMP message using the packet data in mp, and
   3243  * the ICMP header pointed to by "stuff".  (May be called as writer.)
   3244  * Note: assumes that icmp_pkt_err_ok has been called to verify that
   3245  * an icmp error packet can be sent.
   3246  * Assigns an appropriate source address to the packet. If ipha_dst is
   3247  * one of our addresses use it for source. Otherwise pick a source based
   3248  * on a route lookup back to ipha_src.
   3249  * Note that ipha_src must be set here since the
   3250  * packet is likely to arrive on an ill queue in ip_wput() which will
   3251  * not set a source address.
   3252  */
   3253 static void
   3254 icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
   3255     boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
   3256 {
   3257 	ipaddr_t dst;
   3258 	icmph_t	*icmph;
   3259 	ipha_t	*ipha;
   3260 	uint_t	len_needed;
   3261 	size_t	msg_len;
   3262 	mblk_t	*mp1;
   3263 	ipaddr_t src;
   3264 	ire_t	*ire;
   3265 	mblk_t *ipsec_mp;
   3266 	ipsec_out_t	*io = NULL;
   3267 
   3268 	if (mctl_present) {
   3269 		/*
   3270 		 * If it is :
   3271 		 *
   3272 		 * 1) a IPSEC_OUT, then this is caused by outbound
   3273 		 *    datagram originating on this host. IPsec processing
   3274 		 *    may or may not have been done. Refer to comments above
   3275 		 *    icmp_inbound_error_fanout for details.
   3276 		 *
   3277 		 * 2) a IPSEC_IN if we are generating a icmp_message
   3278 		 *    for an incoming datagram destined for us i.e called
   3279 		 *    from ip_fanout_send_icmp.
   3280 		 */
   3281 		ipsec_info_t *in;
   3282 		ipsec_mp = mp;
   3283 		mp = ipsec_mp->b_cont;
   3284 
   3285 		in = (ipsec_info_t *)ipsec_mp->b_rptr;
   3286 		ipha = (ipha_t *)mp->b_rptr;
   3287 
   3288 		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
   3289 		    in->ipsec_info_type == IPSEC_IN);
   3290 
   3291 		if (in->ipsec_info_type == IPSEC_IN) {
   3292 			/*
   3293 			 * Convert the IPSEC_IN to IPSEC_OUT.
   3294 			 */
   3295 			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
   3296 				BUMP_MIB(&ipst->ips_ip_mib,
   3297 				    ipIfStatsOutDiscards);
   3298 				return;
   3299 			}
   3300 			io = (ipsec_out_t *)ipsec_mp->b_rptr;
   3301 		} else {
   3302 			ASSERT(in->ipsec_info_type == IPSEC_OUT);
   3303 			io = (ipsec_out_t *)in;
   3304 			/*
   3305 			 * Clear out ipsec_out_proc_begin, so we do a fresh
   3306 			 * ire lookup.
   3307 			 */
   3308 			io->ipsec_out_proc_begin = B_FALSE;
   3309 		}
   3310 		ASSERT(zoneid != ALL_ZONES);
   3311 		/*
   3312 		 * The IPSEC_IN (now an IPSEC_OUT) didn't have its zoneid
   3313 		 * initialized.  We need to do that now.
   3314 		 */
   3315 		io->ipsec_out_zoneid = zoneid;
   3316 	} else {
   3317 		/*
   3318 		 * This is in clear. The icmp message we are building
   3319 		 * here should go out in clear.
   3320 		 *
   3321 		 * Pardon the convolution of it all, but it's easier to
   3322 		 * allocate a "use cleartext" IPSEC_IN message and convert
   3323 		 * it than it is to allocate a new one.
   3324 		 */
   3325 		ipsec_in_t *ii;
   3326 		ASSERT(DB_TYPE(mp) == M_DATA);
   3327 		ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
   3328 		if (ipsec_mp == NULL) {
   3329 			freemsg(mp);
   3330 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3331 			return;
   3332 		}
   3333 		ii = (ipsec_in_t *)ipsec_mp->b_rptr;
   3334 
   3335 		/* This is not a secure packet */
   3336 		ii->ipsec_in_secure = B_FALSE;
   3337 		/*
   3338 		 * For trusted extensions using a shared IP address we can
   3339 		 * send using any zoneid.
   3340 		 */
   3341 		if (zoneid == ALL_ZONES)
   3342 			ii->ipsec_in_zoneid = GLOBAL_ZONEID;
   3343 		else
   3344 			ii->ipsec_in_zoneid = zoneid;
   3345 		ipsec_mp->b_cont = mp;
   3346 		ipha = (ipha_t *)mp->b_rptr;
   3347 		/*
   3348 		 * Convert the IPSEC_IN to IPSEC_OUT.
   3349 		 */
   3350 		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
   3351 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3352 			return;
   3353 		}
   3354 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
   3355 	}
   3356 
   3357 	/* Remember our eventual destination */
   3358 	dst = ipha->ipha_src;
   3359 
   3360 	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
   3361 	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
   3362 	if (ire != NULL &&
   3363 	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
   3364 		src = ipha->ipha_dst;
   3365 	} else {
   3366 		if (ire != NULL)
   3367 			ire_refrele(ire);
   3368 		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
   3369 		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
   3370 		    ipst);
   3371 		if (ire == NULL) {
   3372 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
   3373 			freemsg(ipsec_mp);
   3374 			return;
   3375 		}
   3376 		src = ire->ire_src_addr;
   3377 	}
   3378 
   3379 	if (ire != NULL)
   3380 		ire_refrele(ire);
   3381 
   3382 	/*
   3383 	 * Check if we can send back more then 8 bytes in addition to
   3384 	 * the IP header.  We try to send 64 bytes of data and the internal
   3385 	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
   3386 	 */
   3387 	len_needed = IPH_HDR_LENGTH(ipha);
   3388 	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
   3389 	    ipha->ipha_protocol == IPPROTO_IPV6) {
   3390 
   3391 		if (!pullupmsg(mp, -1)) {
   3392 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3393 			freemsg(ipsec_mp);
   3394 			return;
   3395 		}
   3396 		ipha = (ipha_t *)mp->b_rptr;
   3397 
   3398 		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   3399 			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
   3400 			    len_needed));
   3401 		} else {
   3402 			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
   3403 
   3404 			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
   3405 			len_needed += ip_hdr_length_v6(mp, ip6h);
   3406 		}
   3407 	}
   3408 	len_needed += ipst->ips_ip_icmp_return;
   3409 	msg_len = msgdsize(mp);
   3410 	if (msg_len > len_needed) {
   3411 		(void) adjmsg(mp, len_needed - msg_len);
   3412 		msg_len = len_needed;
   3413 	}
   3414 	/* Make sure we propagate the cred/label for TX */
   3415 	mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
   3416 	if (mp1 == NULL) {
   3417 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
   3418 		freemsg(ipsec_mp);
   3419 		return;
   3420 	}
   3421 	mp1->b_cont = mp;
   3422 	mp = mp1;
   3423 	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
   3424 	    ipsec_mp->b_rptr == (uint8_t *)io &&
   3425 	    io->ipsec_out_type == IPSEC_OUT);
   3426 	ipsec_mp->b_cont = mp;
   3427 
   3428 	/*
   3429 	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
   3430 	 * node generates be accepted in peace by all on-host destinations.
   3431 	 * If we do NOT assume that all on-host destinations trust
   3432 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
   3433 	 * (Look for ipsec_out_icmp_loopback).
   3434 	 */
   3435 	io->ipsec_out_icmp_loopback = B_TRUE;
   3436 
   3437 	ipha = (ipha_t *)mp->b_rptr;
   3438 	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
   3439 	*ipha = icmp_ipha;
   3440 	ipha->ipha_src = src;
   3441 	ipha->ipha_dst = dst;
   3442 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   3443 	msg_len += sizeof (icmp_ipha) + len;
   3444 	if (msg_len > IP_MAXPACKET) {
   3445 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
   3446 		msg_len = IP_MAXPACKET;
   3447 	}
   3448 	ipha->ipha_length = htons((uint16_t)msg_len);
   3449 	icmph = (icmph_t *)&ipha[1];
   3450 	bcopy(stuff, icmph, len);
   3451 	icmph->icmph_checksum = 0;
   3452 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
   3453 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   3454 	put(q, ipsec_mp);
   3455 }
   3456 
   3457 /*
   3458  * Determine if an ICMP error packet can be sent given the rate limit.
   3459  * The limit consists of an average frequency (icmp_pkt_err_interval measured
   3460  * in milliseconds) and a burst size. Burst size number of packets can
   3461  * be sent arbitrarely closely spaced.
   3462  * The state is tracked using two variables to implement an approximate
   3463  * token bucket filter:
   3464  *	icmp_pkt_err_last - lbolt value when the last burst started
   3465  *	icmp_pkt_err_sent - number of packets sent in current burst
   3466  */
   3467 boolean_t
   3468 icmp_err_rate_limit(ip_stack_t *ipst)
   3469 {
   3470 	clock_t now = TICK_TO_MSEC(lbolt);
   3471 	uint_t refilled; /* Number of packets refilled in tbf since last */
   3472 	/* Guard against changes by loading into local variable */
   3473 	uint_t err_interval = ipst->ips_ip_icmp_err_interval;
   3474 
   3475 	if (err_interval == 0)
   3476 		return (B_FALSE);
   3477 
   3478 	if (ipst->ips_icmp_pkt_err_last > now) {
   3479 		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
   3480 		ipst->ips_icmp_pkt_err_last = 0;
   3481 		ipst->ips_icmp_pkt_err_sent = 0;
   3482 	}
   3483 	/*
   3484 	 * If we are in a burst update the token bucket filter.
   3485 	 * Update the "last" time to be close to "now" but make sure
   3486 	 * we don't loose precision.
   3487 	 */
   3488 	if (ipst->ips_icmp_pkt_err_sent != 0) {
   3489 		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
   3490 		if (refilled > ipst->ips_icmp_pkt_err_sent) {
   3491 			ipst->ips_icmp_pkt_err_sent = 0;
   3492 		} else {
   3493 			ipst->ips_icmp_pkt_err_sent -= refilled;
   3494 			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
   3495 		}
   3496 	}
   3497 	if (ipst->ips_icmp_pkt_err_sent == 0) {
   3498 		/* Start of new burst */
   3499 		ipst->ips_icmp_pkt_err_last = now;
   3500 	}
   3501 	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
   3502 		ipst->ips_icmp_pkt_err_sent++;
   3503 		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
   3504 		    ipst->ips_icmp_pkt_err_sent));
   3505 		return (B_FALSE);
   3506 	}
   3507 	ip1dbg(("icmp_err_rate_limit: dropped\n"));
   3508 	return (B_TRUE);
   3509 }
   3510 
   3511 /*
   3512  * Check if it is ok to send an IPv4 ICMP error packet in
   3513  * response to the IPv4 packet in mp.
   3514  * Free the message and return null if no
   3515  * ICMP error packet should be sent.
   3516  */
   3517 static mblk_t *
   3518 icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
   3519 {
   3520 	icmph_t	*icmph;
   3521 	ipha_t	*ipha;
   3522 	uint_t	len_needed;
   3523 	ire_t	*src_ire;
   3524 	ire_t	*dst_ire;
   3525 
   3526 	if (!mp)
   3527 		return (NULL);
   3528 	ipha = (ipha_t *)mp->b_rptr;
   3529 	if (ip_csum_hdr(ipha)) {
   3530 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
   3531 		freemsg(mp);
   3532 		return (NULL);
   3533 	}
   3534 	src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
   3535 	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
   3536 	dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
   3537 	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
   3538 	if (src_ire != NULL || dst_ire != NULL ||
   3539 	    CLASSD(ipha->ipha_dst) ||
   3540 	    CLASSD(ipha->ipha_src) ||
   3541 	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
   3542 		/* Note: only errors to the fragment with offset 0 */
   3543 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3544 		freemsg(mp);
   3545 		if (src_ire != NULL)
   3546 			ire_refrele(src_ire);
   3547 		if (dst_ire != NULL)
   3548 			ire_refrele(dst_ire);
   3549 		return (NULL);
   3550 	}
   3551 	if (ipha->ipha_protocol == IPPROTO_ICMP) {
   3552 		/*
   3553 		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
   3554 		 * errors in response to any ICMP errors.
   3555 		 */
   3556 		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
   3557 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   3558 			if (!pullupmsg(mp, len_needed)) {
   3559 				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   3560 				freemsg(mp);
   3561 				return (NULL);
   3562 			}
   3563 			ipha = (ipha_t *)mp->b_rptr;
   3564 		}
   3565 		icmph = (icmph_t *)
   3566 		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
   3567 		switch (icmph->icmph_type) {
   3568 		case ICMP_DEST_UNREACHABLE:
   3569 		case ICMP_SOURCE_QUENCH:
   3570 		case ICMP_TIME_EXCEEDED:
   3571 		case ICMP_PARAM_PROBLEM:
   3572 		case ICMP_REDIRECT:
   3573 			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3574 			freemsg(mp);
   3575 			return (NULL);
   3576 		default:
   3577 			break;
   3578 		}
   3579 	}
   3580 	/*
   3581 	 * If this is a labeled system, then check to see if we're allowed to
   3582 	 * send a response to this particular sender.  If not, then just drop.
   3583 	 */
   3584 	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
   3585 		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
   3586 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3587 		freemsg(mp);
   3588 		return (NULL);
   3589 	}
   3590 	if (icmp_err_rate_limit(ipst)) {
   3591 		/*
   3592 		 * Only send ICMP error packets every so often.
   3593 		 * This should be done on a per port/source basis,
   3594 		 * but for now this will suffice.
   3595 		 */
   3596 		freemsg(mp);
   3597 		return (NULL);
   3598 	}
   3599 	return (mp);
   3600 }
   3601 
   3602 /*
   3603  * Generate an ICMP redirect message.
   3604  */
   3605 static void
   3606 icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst)
   3607 {
   3608 	icmph_t	icmph;
   3609 
   3610 	/*
   3611 	 * We are called from ip_rput where we could
   3612 	 * not have attached an IPSEC_IN.
   3613 	 */
   3614 	ASSERT(mp->b_datap->db_type == M_DATA);
   3615 
   3616 	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
   3617 		return;
   3618 	}
   3619 
   3620 	bzero(&icmph, sizeof (icmph_t));
   3621 	icmph.icmph_type = ICMP_REDIRECT;
   3622 	icmph.icmph_code = 1;
   3623 	icmph.icmph_rd_gateway = gateway;
   3624 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
   3625 	/* Redirects sent by router, and router is global zone */
   3626 	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst);
   3627 }
   3628 
   3629 /*
   3630  * Generate an ICMP time exceeded message.
   3631  */
   3632 void
   3633 icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
   3634     ip_stack_t *ipst)
   3635 {
   3636 	icmph_t	icmph;
   3637 	boolean_t mctl_present;
   3638 	mblk_t *first_mp;
   3639 
   3640 	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
   3641 
   3642 	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
   3643 		if (mctl_present)
   3644 			freeb(first_mp);
   3645 		return;
   3646 	}
   3647 
   3648 	bzero(&icmph, sizeof (icmph_t));
   3649 	icmph.icmph_type = ICMP_TIME_EXCEEDED;
   3650 	icmph.icmph_code = code;
   3651 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
   3652 	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
   3653 	    ipst);
   3654 }
   3655 
   3656 /*
   3657  * Generate an ICMP unreachable message.
   3658  */
   3659 void
   3660 icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
   3661     ip_stack_t *ipst)
   3662 {
   3663 	icmph_t	icmph;
   3664 	mblk_t *