OpenGrok

Cross Reference: ip.c
xref: /onnv/onnv-gate/usr/src/uts/common/inet/ip/ip.c
Home | History | Annotate | Line # | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
     24  * Copyright (c) 1990 Mentat Inc.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/stream.h>
     29 #include <sys/dlpi.h>
     30 #include <sys/stropts.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/strsubr.h>
     33 #include <sys/strlog.h>
     34 #include <sys/strsun.h>
     35 #include <sys/zone.h>
     36 #define	_SUN_TPI_VERSION 2
     37 #include <sys/tihdr.h>
     38 #include <sys/xti_inet.h>
     39 #include <sys/ddi.h>
     40 #include <sys/suntpi.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/kobj.h>
     44 #include <sys/modctl.h>
     45 #include <sys/atomic.h>
     46 #include <sys/policy.h>
     47 #include <sys/priv.h>
     48 #include <sys/taskq.h>
     49 
     50 #include <sys/systm.h>
     51 #include <sys/param.h>
     52 #include <sys/kmem.h>
     53 #include <sys/sdt.h>
     54 #include <sys/socket.h>
     55 #include <sys/vtrace.h>
     56 #include <sys/isa_defs.h>
     57 #include <sys/mac.h>
     58 #include <net/if.h>
     59 #include <net/if_arp.h>
     60 #include <net/route.h>
     61 #include <sys/sockio.h>
     62 #include <netinet/in.h>
     63 #include <net/if_dl.h>
     64 
     65 #include <inet/common.h>
     66 #include <inet/mi.h>
     67 #include <inet/mib2.h>
     68 #include <inet/nd.h>
     69 #include <inet/arp.h>
     70 #include <inet/snmpcom.h>
     71 #include <inet/optcom.h>
     72 #include <inet/kstatcom.h>
     73 
     74 #include <netinet/igmp_var.h>
     75 #include <netinet/ip6.h>
     76 #include <netinet/icmp6.h>
     77 #include <netinet/sctp.h>
     78 
     79 #include <inet/ip.h>
     80 #include <inet/ip_impl.h>
     81 #include <inet/ip6.h>
     82 #include <inet/ip6_asp.h>
     83 #include <inet/tcp.h>
     84 #include <inet/tcp_impl.h>
     85 #include <inet/ip_multi.h>
     86 #include <inet/ip_if.h>
     87 #include <inet/ip_ire.h>
     88 #include <inet/ip_ftable.h>
     89 #include <inet/ip_rts.h>
     90 #include <inet/ip_ndp.h>
     91 #include <inet/ip_listutils.h>
     92 #include <netinet/igmp.h>
     93 #include <netinet/ip_mroute.h>
     94 #include <inet/ipp_common.h>
     95 
     96 #include <net/pfkeyv2.h>
     97 #include <inet/sadb.h>
     98 #include <inet/ipsec_impl.h>
     99 #include <inet/iptun/iptun_impl.h>
    100 #include <inet/ipdrop.h>
    101 #include <inet/ip_netinfo.h>
    102 #include <inet/ilb_ip.h>
    103 
    104 #include <sys/ethernet.h>
    105 #include <net/if_types.h>
    106 #include <sys/cpuvar.h>
    107 
    108 #include <ipp/ipp.h>
    109 #include <ipp/ipp_impl.h>
    110 #include <ipp/ipgpc/ipgpc.h>
    111 
    112 #include <sys/pattr.h>
    113 #include <inet/ipclassifier.h>
    114 #include <inet/sctp_ip.h>
    115 #include <inet/sctp/sctp_impl.h>
    116 #include <inet/udp_impl.h>
    117 #include <inet/rawip_impl.h>
    118 #include <inet/rts_impl.h>
    119 
    120 #include <sys/tsol/label.h>
    121 #include <sys/tsol/tnet.h>
    122 
    123 #include <sys/squeue_impl.h>
    124 #include <inet/ip_arp.h>
    125 
    126 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    127 
    128 /*
    129  * Values for squeue switch:
    130  * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
    131  * IP_SQUEUE_ENTER: SQ_PROCESS
    132  * IP_SQUEUE_FILL: SQ_FILL
    133  */
    134 int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
    135 
    136 int ip_squeue_flag;
    137 
    138 /*
    139  * Settable in /etc/system
    140  */
    141 int ip_poll_normal_ms = 100;
    142 int ip_poll_normal_ticks = 0;
    143 int ip_modclose_ackwait_ms = 3000;
    144 
    145 /*
    146  * It would be nice to have these present only in DEBUG systems, but the
    147  * current design of the global symbol checking logic requires them to be
    148  * unconditionally present.
    149  */
    150 uint_t ip_thread_data;			/* TSD key for debug support */
    151 krwlock_t ip_thread_rwlock;
    152 list_t	ip_thread_list;
    153 
    154 /*
    155  * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
    156  */
    157 
    158 struct listptr_s {
    159 	mblk_t	*lp_head;	/* first mblk of the list */
    160 	mblk_t	*lp_tail;	/* last mblk of the list */
    161 };
    162 
    163 typedef struct listptr_s listptr_t;
    164 
    165 /*
    166  * This is used by ip_snmp_get_mib2_ip_route_media and
    167  * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
    168  */
    169 typedef struct iproutedata_s {
    170 	uint_t		ird_idx;
    171 	uint_t		ird_flags;	/* IRD_* flags; see IRD_REPORT_ALL below */
    172 	listptr_t	ird_route;	/* ipRouteEntryTable */
    173 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    174 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    175 } iproutedata_t;
    176 
    177 /* Include ire_testhidden and IRE_IF_CLONE routes */
    178 #define	IRD_REPORT_ALL	0x01
    179 
    180 /*
    181  * Cluster specific hooks. These should be NULL when not booted as a cluster.
    182  */
    183 
    184 /*
    185  * Hook functions to enable cluster networking
    186  * On non-clustered systems these vectors must always be NULL.
    187  *
    188  * Hook function to check whether the specified IP address is a shared IP
    189  * address in the cluster
    190  *
    191  */
    192 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    193     sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
    194 
    195 /*
    196  * Hook function to generate cluster wide ip fragment identifier
    197  */
    198 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    199     sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    200     void *args) = NULL;
    201 
    202 /*
    203  * Hook function to generate cluster wide SPI.
    204  */
    205 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    206     void *) = NULL;
    207 
    208 /*
    209  * Hook function to verify if the SPI is already utilized.
    210  */
    211 
    212 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    213 
    214 /*
    215  * Hook function to delete the SPI from the cluster wide repository.
    216  */
    217 
    218 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    219 
    220 /*
    221  * Hook function to inform the cluster when packet received on an IDLE SA
    222  */
    223 
    224 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    225     in6_addr_t, in6_addr_t, void *) = NULL;
    226 
    227 /*
    228  * Synchronization notes:
    229  *
    230  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    231  * MT level protection given by STREAMS. IP uses a combination of its own
    232  * internal serialization mechanism and standard Solaris locking techniques.
    233  * The internal serialization is per phyint.  This is used to serialize
    234  * plumbing operations, IPMP operations, most set ioctls, etc.
    235  *
    236  * Plumbing is a long sequence of operations involving message
    237  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    238  * involved in plumbing operations. A natural model is to serialize these
    239  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    240  * parallel without any interference. But various set ioctls on hme0 are best
    241  * serialized, along with IPMP operations and processing of DLPI control
    242  * messages received from drivers on a per phyint basis. This serialization is
    243  * provided by the ipsq_t and primitives operating on this. Details can
    244  * be found in ip_if.c above the core primitives operating on ipsq_t.
    245  *
    246  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    247  * Similarly, lookup of an ire by a thread also returns a refheld ire.
    248  * In addition ipif's and ill's referenced by the ire are also indirectly
    249  * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
    250  * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
    251  * address of an ipif has to go through the ipsq_t. This ensures that only
    252  * one such exclusive operation proceeds at any time on the ipif. It then
    253  * waits for all refcnts
    254  * associated with this ipif to come down to zero. The address is changed
    255  * only after the ipif has been quiesced. Then the ipif is brought up again.
    256  * More details are described above the comment in ip_sioctl_flags.
    257  *
    258  * Packet processing is based mostly on IREs and are fully multi-threaded
    259  * using standard Solaris MT techniques.
    260  *
    261  * There are explicit locks in IP to handle:
    262  * - The ip_g_head list maintained by mi_open_link() and friends.
    263  *
    264  * - The reassembly data structures (one lock per hash bucket)
    265  *
    266  * - conn_lock is meant to protect conn_t fields. The fields actually
    267  *   protected by conn_lock are documented in the conn_t definition.
    268  *
    269  * - ire_lock to protect some of the fields of the ire, IRE tables
    270  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    271  *
    272  * - ndp_g_lock and ncec_lock for protecting NCEs.
    273  *
    274  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    275  *
    276  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    277  *	* The AVL tree based global multi list of all ills.
    278  *	* The linked list of all ipifs of an ill
    279  *	* The <ipsq-xop> mapping
    280  *	* <ill-phyint> association
    281  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    282  *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
    283  *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
    284  *   writer for the actual duration of the insertion/deletion/change.
    285  *
    286  * - ill_lock:  This is a per ill mutex.
    287  *   It protects some members of the ill_t struct; see ip.h for details.
    288  *   It also protects the <ill-phyint> assoc.
    289  *   It also protects the list of ipifs hanging off the ill.
    290  *
    291  * - ipsq_lock: This is a per ipsq_t mutex lock.
    292  *   This protects some members of the ipsq_t struct; see ip.h for details.
    293  *   It also protects the <ipsq-ipxop> mapping
    294  *
    295  * - ipx_lock: This is a per ipxop_t mutex lock.
    296  *   This protects some members of the ipxop_t struct; see ip.h for details.
    297  *
    298  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    299  *   phyint_flags
    300  *
    301  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    302  *   This lock is held in ipif_up_done so that marking the ipif IPIF_UP and
    303  *   the uniqueness check are done atomically.
    304  *
    305  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    306  *   group list linked by ill_usesrc_grp_next. It also protects the
    307  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    308  *   group is being added or deleted.  This lock is taken as a reader when
    309  *   walking the list/group(eg: to get the number of members in a usesrc group).
    310  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    311  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    312  *   example, it is not necessary to take this lock in the initial portion
    313  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
    314  *   operations are executed exclusively and that ensures that the "usesrc
    315  *   group state" cannot change. The "usesrc group state" change can happen
    316  *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
    317  *
    318  * Changing <ill-phyint>, <ipsq-xop> associations:
    319  *
    320  * To change the <ill-phyint> association, the ill_g_lock must be held
    321  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    322  * must be held.
    323  *
    324  * To change the <ipsq-xop> association, the ill_g_lock must be held as
    325  * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
    326  * This is only done when ills are added or removed from IPMP groups.
    327  *
    328  * To add or delete an ipif from the list of ipifs hanging off the ill,
    329  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    330  * a writer on the associated ipsq.
    331  *
    332  * To add or delete an ill to the system, the ill_g_lock must be held as
    333  * writer and the thread must be a writer on the associated ipsq.
    334  *
    335  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    336  * must be a writer on the associated ipsq.
    337  *
    338  * Lock hierarchy
    339  *
    340  * Some lock hierarchy scenarios are listed below.
    341  *
    342  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
    343  * ill_g_lock -> ill_lock(s) -> phyint_lock
    344  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
    345  * ill_g_lock -> ip_addr_avail_lock
    346  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    347  * ill_g_lock -> ip_g_nd_lock
    348  * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
    349  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
    350  * arl_lock -> ill_lock
    351  * ips_ire_dep_lock -> irb_lock
    352  *
    353  * When more than 1 ill lock is needed to be held, all ill lock addresses
    354  * are sorted on address and locked starting from highest addressed lock
    355  * downward.
    356  *
    357  * Multicast scenarios
    358  * ips_ill_g_lock -> ill_mcast_lock
    359  * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
    360  * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
    361  * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
    362  * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
    363  * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
    364  *
    365  * IPsec scenarios
    366  *
    367  * ipsa_lock -> ill_g_lock -> ill_lock
    368  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    369  *
    370  * Trusted Solaris scenarios
    371  *
    372  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    373  * igsa_lock -> gcdb_lock
    374  * gcgrp_rwlock -> ire_lock
    375  * gcgrp_rwlock -> gcdb_lock
    376  *
    377  * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
    378  *
    379  * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
    380  * sq_lock -> conn_lock -> QLOCK(q)
    381  * ill_lock -> ft_lock -> fe_lock
    382  *
    383  * Routing/forwarding table locking notes:
    384  *
    385  * Lock acquisition order: Radix tree lock, irb_lock.
    386  * Requirements:
    387  * i.  Walker must not hold any locks during the walker callback.
    388  * ii  Walker must not see a truncated tree during the walk because of any node
    389  *     deletion.
    390  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    391  *     in many places in the code to walk the irb list. Thus even if all the
    392  *     ires in a bucket have been deleted, we still can't free the radix node
    393  *     until the ires have actually been inactive'd (freed).
    394  *
    395  * Tree traversal - Need to hold the global tree lock in read mode.
    396  * Before dropping the global tree lock, need to increment the ire_refcnt
    397  * to ensure that the radix node can't be deleted.
    398  *
    399  * Tree add - Need to hold the global tree lock in write mode to add a
    400  * radix node. To prevent the node from being deleted, increment the
    401  * irb_refcnt, after the node is added to the tree. The ire itself is
    402  * added later while holding the irb_lock, but not the tree lock.
    403  *
    404  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    405  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    406  * must be zero.
    407  *
    408  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    409  * global tree lock (read mode) for traversal.
    410  *
    411  * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
    412  * hence we will acquire irb_lock while holding ips_ire_dep_lock.
    413  *
    414  * IPsec notes :
    415  *
    416  * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
    417  * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
    418  * ip_xmit_attr_t has the
    419  * information used by the IPsec code for applying the right level of
    420  * protection. The information initialized by IP in the ip_xmit_attr_t
    421  * is determined by the per-socket policy or global policy in the system.
    422  * For inbound datagrams, the ip_recv_attr_t
    423  * starts out with nothing in it. It gets filled
    424  * with the right information if it goes through the AH/ESP code, which
    425  * happens if the incoming packet is secure. The information initialized
    426  * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
    427  * the policy requirements needed by per-socket policy or global policy
    428  * is met or not.
    429  *
    430  * For fully connected sockets i.e dst, src [addr, port] is known,
    431  * conn_policy_cached is set indicating that policy has been cached.
    432  * conn_in_enforce_policy may or may not be set depending on whether
    433  * there is a global policy match or per-socket policy match.
    434  * Policy inheriting happens in ip_policy_set once the destination is known.
    435  * Once the right policy is set on the conn_t, policy cannot change for
    436  * this socket. This makes life simpler for TCP (UDP ?) where
    437  * re-transmissions go out with the same policy. For symmetry, policy
    438  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    439  * it also implies that policy is latched i.e policy cannot change
    440  * on these sockets. As we have the right policy on the conn, we don't
    441  * have to lookup global policy for every outbound and inbound datagram
    442  * and thus serving as an optimization. Note that a global policy change
    443  * does not affect fully connected sockets if they have policy. If fully
    444  * connected sockets did not have any policy associated with it, global
    445  * policy change may affect them.
    446  *
    447  * IP Flow control notes:
    448  * ---------------------
    449  * Non-TCP streams are flow controlled by IP. The way this is accomplished
    450  * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
    451  * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
    452  * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
    453  * functions.
    454  *
    455  * Per Tx ring udp flow control:
    456  * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
    457  * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
    458  *
    459  * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
    460  * To achieve best performance, outgoing traffic needs to be fanned out among
    461  * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
    462  * traffic out of the NIC and it takes a fanout hint. UDP connections pass
    463  * the address of connp as fanout hint to mac_tx(). Under flow controlled
    464  * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
    465  * cookie points to a specific Tx ring that is blocked. The cookie is used to
    466  * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t
    467  * points to drain_lists (idl_t's). These drain lists store the blocked UDP
    468  * connp's. The drain list is not a single list but a configurable number of
    469  * lists.
    470  *
    471  * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
    472  * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
    473  * which is equal to 128. This array in turn contains a pointer to idl_t[],
    474  * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
    475  * list will point to the list of connp's that are flow controlled.
    476  *
    477  *                      ---------------   -------   -------   -------
    478  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    479  *                   |  ---------------   -------   -------   -------
    480  *                   |  ---------------   -------   -------   -------
    481  *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    482  * ----------------  |  ---------------   -------   -------   -------
    483  * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
    484  * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
    485  *                   |  ---------------   -------   -------   -------
    486  *                   .        .              .         .         .
    487  *                   |  ---------------   -------   -------   -------
    488  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    489  *                      ---------------   -------   -------   -------
    490  *                      ---------------   -------   -------   -------
    491  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    492  *                   |  ---------------   -------   -------   -------
    493  *                   |  ---------------   -------   -------   -------
    494  * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    495  * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
    496  * ----------------  |        .              .         .         .
    497  *                   |  ---------------   -------   -------   -------
    498  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    499  *                      ---------------   -------   -------   -------
    500  *     .....
    501  * ----------------
    502  * |idl_tx_list[n]|-> ...
    503  * ----------------
    504  *
    505  * When mac_tx() returns a cookie, the cookie is hashed into an index into
    506  * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
    507  * to insert the conn onto.  conn_drain_insert() asserts flow control for the
    508  * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
    509  * Further, conn_blocked is set to indicate that the conn is blocked.
    510  *
    511  * GLDv3 calls ill_flow_enable() when flow control is relieved.  The cookie
    512  * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
    513  * is again hashed to locate the appropriate idl_tx_list, which is then
    514  * drained via conn_walk_drain().  conn_walk_drain() goes through each conn in
    515  * the drain list and calls conn_drain_remove() to clear flow control (via
    516  * calling su_txq_full() or clearing QFULL), and remove the conn from the
    517  * drain list.
    518  *
    519  * Note that the drain list is not a single list but a (configurable) array of
    520  * lists (8 elements by default).  Synchronization between drain insertion and
    521  * flow control wakeup is handled by using idl_txl->txl_lock, and only
    522  * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
    523  *
    524  * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
    525  * On the send side, if the packet cannot be sent down to the driver by IP
    526  * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
    527  * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
    528  * the 0'th drain list.  When ip_wsrv() runs on the ill_wq because flow
    529  * control has been relieved, the blocked conns in the 0'th drain list are
    530  * drained as in the non-STREAMS case.
    531  *
    532  * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
    533  * is done when the conn is inserted into the drain list (conn_drain_insert())
    534  * and cleared when the conn is removed from it (conn_drain_remove()).
    535  *
    536  * IPQOS notes:
    537  *
    538  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    539  * and IPQoS modules. IPPF includes hooks in IP at different control points
    540  * (callout positions) which direct packets to IPQoS modules for policy
    541  * processing. Policies, if present, are global.
    542  *
    543  * The callout positions are located in the following paths:
    544  *		o local_in (packets destined for this host)
    545  *		o local_out (packets originating from this host)
    546  *		o fwd_in  (packets forwarded by this m/c - inbound)
    547  *		o fwd_out (packets forwarded by this m/c - outbound)
    548  * Hooks at these callout points can be enabled/disabled using the ndd variable
    549  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    550  * By default all the callout positions are enabled.
    551  *
    552  * Outbound (local_out)
    553  * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
    554  *
    555  * Inbound (local_in)
    556  * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
    557  *
    558  * Forwarding (in and out)
    559  * Hooks are placed in ire_recv_forward_v4/v6.
    560  *
    561  * IP Policy Framework processing (IPPF processing)
    562  * Policy processing for a packet is initiated by ip_process, which ascertains
    563  * that the classifier (ipgpc) is loaded and configured, failing which the
    564  * packet resumes normal processing in IP. If the classifier is present, the
    565  * packet is acted upon by one or more IPQoS modules (action instances), per
    566  * filters configured in ipgpc and resumes normal IP processing thereafter.
    567  * An action instance can drop a packet in course of its processing.
    568  *
    569  * Zones notes:
    570  *
    571  * The partitioning rules for networking are as follows:
    572  * 1) Packets coming from a zone must have a source address belonging to that
    573  * zone.
    574  * 2) Packets coming from a zone can only be sent on a physical interface on
    575  * which the zone has an IP address.
    576  * 3) Between two zones on the same machine, packet delivery is only allowed if
    577  * there's a matching route for the destination and zone in the forwarding
    578  * table.
    579  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    580  * different zones can bind to the same port with the wildcard address
    581  * (INADDR_ANY).
    582  *
    583  * The granularity of interface partitioning is at the logical interface level.
    584  * Therefore, every zone has its own IP addresses, and incoming packets can be
    585  * attributed to a zone unambiguously. A logical interface is placed into a zone
    586  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    587  * structure. Rule (1) is implemented by modifying the source address selection
    588  * algorithm so that the list of eligible addresses is filtered based on the
    589  * sending process zone.
    590  *
    591  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    592  * across all zones, depending on their type. Here is the break-up:
    593  *
    594  * IRE type				Shared/exclusive
    595  * --------				----------------
    596  * IRE_BROADCAST			Exclusive
    597  * IRE_DEFAULT (default routes)		Shared (*)
    598  * IRE_LOCAL				Exclusive (x)
    599  * IRE_LOOPBACK				Exclusive
    600  * IRE_PREFIX (net routes)		Shared (*)
    601  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    602  * IRE_IF_RESOLVER (interface routes)	Exclusive
    603  * IRE_IF_CLONE (interface routes)	Exclusive
    604  * IRE_HOST (host routes)		Shared (*)
    605  *
    606  * (*) A zone can only use a default or off-subnet route if the gateway is
    607  * directly reachable from the zone, that is, if the gateway's address matches
    608  * one of the zone's logical interfaces.
    609  *
    610  * (x) IRE_LOCAL are handled a bit differently.
    611  * When ip_restrict_interzone_loopback is set (the default),
    612  * ire_route_recursive restricts loopback using an IRE_LOCAL
    613  * between zones to the case when L2 would have conceptually looped the packet
    614  * back, i.e. the loopback which is required since neither Ethernet drivers
    615  * nor Ethernet hardware loops them back. This is the case when the normal
    616  * routes (ignoring IREs with different zoneids) would send out the packet on
    617  * the same ill as the ill with which the IRE_LOCAL is associated.
    618  *
    619  * Multiple zones can share a common broadcast address; typically all zones
    620  * share the 255.255.255.255 address. Incoming as well as locally originated
    621  * broadcast packets must be dispatched to all the zones on the broadcast
    622  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    623  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    624  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    625  * sent to every zone that has an IRE_BROADCAST entry for the destination
    626  * address on the input ill, see ip_input_broadcast().
    627  *
    628  * Applications in different zones can join the same multicast group address.
    629  * The same logic applies for multicast as for broadcast. ip_input_multicast
    630  * dispatches packets to all zones that have members on the physical interface.
    631  */
    632 
    633 /*
    634  * Squeue Fanout flags:
    635  *	0: No fanout.
    636  *	1: Fanout across all squeues
    637  */
    638 boolean_t	ip_squeue_fanout = 0;
    639 
    640 /*
    641  * Maximum dups allowed per packet.
    642  */
    643 uint_t ip_max_frag_dups = 10;
    644 
    645 static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    646 		    cred_t *credp, boolean_t isv6);
    647 static mblk_t	*ip_xmit_attach_llhdr(mblk_t *, nce_t *);
    648 
        /* ICMP helpers; every routine in this group is file-local (static). */
    649 static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
    650 static void	icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
    651 static void	icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
    652     ip_recv_attr_t *);
    653 static void	icmp_options_update(ipha_t *);
    654 static void	icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
    655 static void	icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
    656 static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
    657 static void	icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
    658     ip_recv_attr_t *);
    659 static void	icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
    660 static void	icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
    661     ip_recv_attr_t *);
    662 
        /*
         * General ip_* prototypes. This group mixes externally visible routines
         * (no storage class) with file-local ones (static), and includes the
         * ip_snmp_* family.
         */
    663 mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    664 char		*ip_dot_addr(ipaddr_t, char *);
    665 mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    666 int		ip_close(queue_t *, int);
    667 static char	*ip_dot_saddr(uchar_t *, char *);
    668 static void	ip_lrput(queue_t *, mblk_t *);
    669 ipaddr_t	ip_net_mask(ipaddr_t);
    670 char		*ip_nv_lookup(nv_t *, int);
    671 void	ip_rput(queue_t *, mblk_t *);
    672 static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    673 		    void *dummy_arg);
    674 int		ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
    675 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    676 		    mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
    677 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    678 		    ip_stack_t *, boolean_t);
    679 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
    680 		    boolean_t);
    681 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    682 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    683 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    684 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    685 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    686 		    ip_stack_t *ipst, boolean_t);
    687 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    688 		    ip_stack_t *ipst, boolean_t);
    689 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    690 		    ip_stack_t *ipst);
    691 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    692 		    ip_stack_t *ipst);
    693 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    694 		    ip_stack_t *ipst);
    695 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    696 		    ip_stack_t *ipst);
    697 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    698 		    ip_stack_t *ipst);
    699 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    700 		    ip_stack_t *ipst);
    701 static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    702 		    ip_stack_t *ipst);
    703 static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    704 		    ip_stack_t *ipst);
    705 static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    706 static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    707 static int	ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
    708 static int	ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
    709 int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    710 
    711 static mblk_t	*ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
    712 		    mblk_t *);
    713 
        /* conn_t drain helpers (file-local). */
    714 static void	conn_drain_init(ip_stack_t *);
    715 static void	conn_drain_fini(ip_stack_t *);
    716 static void	conn_drain(conn_t *connp, boolean_t closing);
    717 
    718 static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
    719 static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
    720 
        /*
         * Routines keyed by netstackid_t.
         * NOTE(review): presumably the per-netstack create/shutdown/destroy
         * callbacks -- confirm at the place they are registered.
         */
    721 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    722 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    723 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    724 
    725 static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    726     const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    727     ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
    728     const in6_addr_t *);
    729 
    730 static int	ip_squeue_switch(int);
    731 
        /* kstat setup, teardown and update routines (file-local). */
    732 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    733 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    734 static int	ip_kstat_update(kstat_t *kp, int rw);
    735 static void	*icmp_kstat_init(netstackid_t);
    736 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    737 static int	icmp_kstat_update(kstat_t *kp, int rw);
    738 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    739 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    740 
    741 static void	ipobs_init(ip_stack_t *);
    742 static void	ipobs_fini(ip_stack_t *);
    743 
    744 static int	ip_tp_cpu_update(cpu_setup_t, int, void *);
    745 
    746 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    747 
        /*
         * NOTE(review): ip_rput_pullups looks like a counter of pullups done on
         * the ip_rput() path; nothing in this section updates it -- confirm at its
         * use sites.
         */
    748 static long ip_rput_pullups;
    749 int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    750 
        /* Two vmem arenas for minor numbers, split by the ranges noted on each. */
    751 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
    752 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
    753 
        /* NOTE(review): presumably a debug-output level; confirm where it is read. */
    754 int	ip_debug;
    755 
    756 /*
    757  * Multirouting/CGTP stuff
    758  */
    759 int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    760 
    761 /*
    762  * IP tunables related declarations. Definitions are in ip_tunables.c
         * Only the property table and its element count are imported here.
    763  */
    764 extern mod_prop_info_t ip_propinfo_tbl[];
    765 extern int ip_propinfo_count;
    766 
    767 /*
    768  * Table of IP ioctls encoding the various properties of the ioctl and
    769  * indexed based on the last byte of the ioctl command. Occasionally there
    770  * is a clash, and there is more than 1 ioctl with the same last byte.
    771  * In such a case 1 ioctl is encoded in the ndx table and the remaining
    772  * ioctls are encoded in the misc table. An entry in the ndx table is
    773  * retrieved by indexing on the last byte of the ioctl command and comparing
    774  * the ioctl command with the value in the ndx table. In the event of a
    775  * mismatch the misc table is then searched sequentially for the desired
    776  * ioctl command.
    777  *
    778  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
    779  */
    780 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
    781 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    782 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    783 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    784 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    785 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    786 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    787 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    788 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    789 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    790 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    791 
    792 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
    793 			MISC_CMD, ip_siocaddrt, NULL },
    794 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
    795 			MISC_CMD, ip_siocdelrt, NULL },
    796 
    797 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    798 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    799 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
    800 			IF_CMD, ip_sioctl_get_addr, NULL },
    801 
    802 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    803 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    804 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
    805 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
    806 
    807 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
    808 			IPI_PRIV | IPI_WR,
    809 			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
    810 	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
    811 			IPI_MODOK | IPI_GET_CMD,
    812 			IF_CMD, ip_sioctl_get_flags, NULL },
    813 
    814 	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    815 	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    816 
    817 	/* copyin size cannot be coded for SIOCGIFCONF */
    818 	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
    819 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
    820 
    821 	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    822 			IF_CMD, ip_sioctl_mtu, NULL },
    823 	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD,
    824 			IF_CMD, ip_sioctl_get_mtu, NULL },
    825 	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
    826 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
    827 	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    828 			IF_CMD, ip_sioctl_brdaddr, NULL },
    829 	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
    830 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
    831 	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    832 			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
    833 	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
    834 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
    835 	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
    836 			IF_CMD, ip_sioctl_metric, NULL },
    837 	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    838 
    839 	/* See 166-168 below for extended SIOC*XARP ioctls */
    840 	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
    841 			ARP_CMD, ip_sioctl_arp, NULL },
    842 	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
    843 			ARP_CMD, ip_sioctl_arp, NULL },
    844 	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
    845 			ARP_CMD, ip_sioctl_arp, NULL },
    846 
    847 	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    848 	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    849 	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    850 	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    851 	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    852 	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    853 	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    854 	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    855 	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    856 	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    857 	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    858 	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    859 	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    860 	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    861 	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    862 	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    863 	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    864 	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    865 	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    866 	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    867 	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    868 
    869 	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
    870 			MISC_CMD, if_unitsel, if_unitsel_restart },
    871 
    872 	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    873 	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    874 	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    875 	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    876 	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    877 	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    878 	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    879 	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    880 	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    881 	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    882 	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    883 	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    884 	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    885 	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    886 	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    887 	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    888 	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    889 	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    890 
    891 	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
    892 			IPI_PRIV | IPI_WR | IPI_MODOK,
    893 			IF_CMD, ip_sioctl_sifname, NULL },
    894 
    895 	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    896 	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    897 	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    898 	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    899 	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    900 	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    901 	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    902 	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    903 	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    904 	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    905 	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    906 	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    907 	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    908 
    909 	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
    910 			MISC_CMD, ip_sioctl_get_ifnum, NULL },
    911 	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
    912 			IF_CMD, ip_sioctl_get_muxid, NULL },
    913 	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
    914 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
    915 
    916 	/* Both if and lif variants share same func */
    917 	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
    918 			IF_CMD, ip_sioctl_get_lifindex, NULL },
    919 	/* Both if and lif variants share same func */
    920 	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
    921 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
    922 
    923 	/* copyin size cannot be coded for SIOCGIFCONF */
    924 	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
    925 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
    926 	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    927 	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    928 	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    929 	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    930 	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    931 	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    932 	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    933 	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    934 	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    935 	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    936 	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    937 	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    938 	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    939 	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    940 	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    941 	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    942 	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    943 
    944 	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
    945 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
    946 			ip_sioctl_removeif_restart },
    947 	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
    948 			IPI_GET_CMD | IPI_PRIV | IPI_WR,
    949 			LIF_CMD, ip_sioctl_addif, NULL },
    950 #define	SIOCLIFADDR_NDX 112
    951 	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    952 			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    953 	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
    954 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
    955 	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    956 			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    957 	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
    958 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
    959 	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
    960 			IPI_PRIV | IPI_WR,
    961 			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
    962 	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
    963 			IPI_GET_CMD | IPI_MODOK,
    964 			LIF_CMD, ip_sioctl_get_flags, NULL },
    965 
    966 	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    967 	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    968 
    969 	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
    970 			ip_sioctl_get_lifconf, NULL },
    971 	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    972 			LIF_CMD, ip_sioctl_mtu, NULL },
    973 	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
    974 			LIF_CMD, ip_sioctl_get_mtu, NULL },
    975 	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
    976 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
    977 	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    978 			LIF_CMD, ip_sioctl_brdaddr, NULL },
    979 	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
    980 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
    981 	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    982 			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
    983 	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
    984 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
    985 	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    986 			LIF_CMD, ip_sioctl_metric, NULL },
    987 	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
    988 			IPI_PRIV | IPI_WR | IPI_MODOK,
    989 			LIF_CMD, ip_sioctl_slifname,
    990 			ip_sioctl_slifname_restart },
    991 
    992 	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
    993 			MISC_CMD, ip_sioctl_get_lifnum, NULL },
    994 	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
    995 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
    996 	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
    997 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
    998 	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
    999 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
   1000 	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
   1001 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
   1002 	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1003 			LIF_CMD, ip_sioctl_token, NULL },
   1004 	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
   1005 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
   1006 	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1007 			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
   1008 	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
   1009 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
   1010 	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1011 			LIF_CMD, ip_sioctl_lnkinfo, NULL },
   1012 
   1013 	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
   1014 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
   1015 	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
   1016 			LIF_CMD, ip_siocdelndp_v6, NULL },
   1017 	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
   1018 			LIF_CMD, ip_siocqueryndp_v6, NULL },
   1019 	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
   1020 			LIF_CMD, ip_siocsetndp_v6, NULL },
   1021 	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1022 			MISC_CMD, ip_sioctl_tmyaddr, NULL },
   1023 	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
   1024 			MISC_CMD, ip_sioctl_tonlink, NULL },
   1025 	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
   1026 			MISC_CMD, ip_sioctl_tmysite, NULL },
   1027 	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1028 	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1029 	/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
   1030 	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1031 	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1032 	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1033 	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
   1034 
   1035 	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1036 
   1037 	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
   1038 			LIF_CMD, ip_sioctl_get_binding, NULL },
   1039 	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
   1040 			IPI_PRIV | IPI_WR,
   1041 			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
   1042 	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
   1043 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
   1044 	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
   1045 			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
   1046 
   1047 	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
   1048 	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1049 	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1050 	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1051 
   1052 	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1053 
   1054 	/* These are handled in ip_sioctl_copyin_setup itself */
   1055 	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
   1056 			MISC_CMD, NULL, NULL },
   1057 	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
   1058 			MISC_CMD, NULL, NULL },
   1059 	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
   1060 
   1061 	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
   1062 			ip_sioctl_get_lifconf, NULL },
   1063 
   1064 	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1065 			XARP_CMD, ip_sioctl_arp, NULL },
   1066 	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
   1067 			XARP_CMD, ip_sioctl_arp, NULL },
   1068 	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
   1069 			XARP_CMD, ip_sioctl_arp, NULL },
   1070 
   1071 	/* SIOCPOPSOCKFS is not handled by IP */
   1072 	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
   1073 
   1074 	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
   1075 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
   1076 	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
   1077 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
   1078 			ip_sioctl_slifzone_restart },
   1079 	/* 172-174 are SCTP ioctls and not handled by IP */
   1080 	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1081 	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1082 	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1083 	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
   1084 			IPI_GET_CMD, LIF_CMD,
   1085 			ip_sioctl_get_lifusesrc, 0 },
   1086 	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
   1087 			IPI_PRIV | IPI_WR,
   1088 			LIF_CMD, ip_sioctl_slifusesrc,
   1089 			NULL },
   1090 	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
   1091 			ip_sioctl_get_lifsrcof, NULL },
   1092 	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
   1093 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1094 	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
   1095 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1096 	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
   1097 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1098 	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
   1099 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
   1100 	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
   1101 	/* SIOCSENABLESDP is handled by SDP */
   1102 	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
   1103 	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
   1104 	/* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
   1105 			IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
   1106 	/* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
   1107 	/* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
   1108 			ip_sioctl_ilb_cmd, NULL },
   1109 	/* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
   1110 	/* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL},
   1111 	/* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
   1112 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
   1113 	/* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
   1114 			LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
   1115 	/* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
   1116 			LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
   1117 };
   1118 
   1119 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1120 
   1121 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
   1122 	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1123 	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1124 	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1125 	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1126 	{ ND_GET,	0, 0, 0, NULL, NULL },
   1127 	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
   1128 	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
   1129 	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
   1130 		MISC_CMD, mrt_ioctl},
   1131 	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
   1132 		MISC_CMD, mrt_ioctl},
   1133 	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
   1134 		MISC_CMD, mrt_ioctl}
   1135 };
   1136 
   1137 int ip_misc_ioctl_count =
   1138     sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
   1139 
   1140 int	conn_drain_nthreads;		/* Number of drainers required. */
   1141 					/* Settable in /etc/system; */
        					/* no initializer, so it starts at 0. */
   1142 /* Defined in ip_ire.c: IRE bucket-count limits and sizing ratios. */
   1143 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
   1144 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
   1145 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
   1146 
   1147 static nv_t	ire_nv_arr[] = {
        	/* Each entry pairs an IRE_* type constant with a short display name. */
   1148 	{ IRE_BROADCAST, "BROADCAST" },
   1149 	{ IRE_LOCAL, "LOCAL" },
   1150 	{ IRE_LOOPBACK, "LOOPBACK" },
   1151 	{ IRE_DEFAULT, "DEFAULT" },
   1152 	{ IRE_PREFIX, "PREFIX" },
   1153 	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
   1154 	{ IRE_IF_RESOLVER, "IF_RESOLV" },
   1155 	{ IRE_IF_CLONE, "IF_CLONE" },
   1156 	{ IRE_HOST, "HOST" },
   1157 	{ IRE_MULTICAST, "MULTICAST" },
   1158 	{ IRE_NOROUTE, "NOROUTE" },
        	/*
        	 * All-zero last entry. NOTE(review): presumably the end-of-table
        	 * marker for ip_nv_lookup() -- confirm against that routine.
        	 */
   1159 	{ 0 }
   1160 };
   1161 
        /* Non-static pointer through which the file-local array is reachable. */
   1162 nv_t	*ire_nv_tbl = ire_nv_arr;
   1163 
   1164 /*
         * Simple ICMP IP Header Template.
         *
         * Positional initializer: the first member is IP_SIMPLE_HDR_VERSION, the
         * next five are zero and the seventh is IPPROTO_ICMP; members after that
         * are left to C's implicit zero initialization.
         * NOTE(review): ipha_t is declared outside this section; the seventh
         * member is presumably the protocol field -- confirm against the header.
         */
   1165 static ipha_t icmp_ipha = {
   1166 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
   1167 };
   1168 
   1169 struct module_info ip_mod_info = {
        	/*
        	 * Positional initializer built from the IP_MOD_* constants, in order:
        	 * id, name, minimum and maximum packet size, high and low water mark.
        	 * Every qinit structure defined in this file points at this object.
        	 */
   1170 	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
   1171 	IP_MOD_LOWAT
   1172 };
   1173 
   1174 /*
   1175  * Duplicate static symbols within a module confuse mdb; so we avoid the
   1176  * problem by making the symbols here distinct from those in udp.c.
   1177  */
   1178 
   1179 /*
   1180  * Entry points for IP as a device and as a module.
   1181  * We have separate open functions for the /dev/ip and /dev/ip6 devices.
         *
         * Each qinit below is initialized positionally and all of them point at
         * ip_mod_info. NOTE(review): per qinit(9S) the slots are, in order, put
         * procedure, service procedure, open, close, admin and module_info --
         * confirm against the header in use.
   1182  */
        /* Read side used by the /dev/ip streamtab (opens through ip_openv4). */
   1183 static struct qinit iprinitv4 = {
   1184 	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
   1185 	&ip_mod_info
   1186 };
   1187 
        /* Read side used by the /dev/ip6 streamtab (opens through ip_openv6). */
   1188 struct qinit iprinitv6 = {
   1189 	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
   1190 	&ip_mod_info
   1191 };
   1192 
        /* Write side shared by both streamtabs; the only qinit with a service slot. */
   1193 static struct qinit ipwinit = {
   1194 	(pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
   1195 	&ip_mod_info
   1196 };
   1197 
        /*
         * iplrinit and iplwinit fill the third and fourth streamtab slots.
         * NOTE(review): per streamtab(9S) those are the lower (multiplexor) read
         * and write sides -- confirm.
         */
   1198 static struct qinit iplrinit = {
   1199 	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
   1200 	&ip_mod_info
   1201 };
   1202 
   1203 static struct qinit iplwinit = {
   1204 	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
   1205 	&ip_mod_info
   1206 };
   1207 
   1208 /* For AF_INET aka /dev/ip */
   1209 struct streamtab ipinfov4 = {
   1210 	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
   1211 };
   1212 
   1213 /* For AF_INET6 aka /dev/ip6; differs from ipinfov4 only in the read side. */
   1214 struct streamtab ipinfov6 = {
   1215 	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
   1216 };
   1217 
   1218 #ifdef	DEBUG
        /*
         * Only defined in DEBUG builds; starts out B_FALSE.
         * NOTE(review): going by its name this lets SCTP checksum handling be
         * skipped -- confirm against the code that reads it.
         */
   1219 boolean_t skip_sctp_cksum = B_FALSE;
   1220 #endif
   1221 
   1222 /*
   1223  * Generate an ICMP fragmentation needed message.
   1224  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   1225  * constructed by the caller.
   1226  */
   1227 void
   1228 icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
   1229 {
   1230 	icmph_t	icmph;
   1231 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1232 
   1233 	mp = icmp_pkt_err_ok(mp, ira);
   1234 	if (mp == NULL)
   1235 		return;
   1236 
   1237 	bzero(&icmph, sizeof (icmph_t));
   1238 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   1239 	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
   1240 	icmph.icmph_du_mtu = htons((uint16_t)mtu);
   1241 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
   1242 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   1243 
   1244 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   1245 }
   1246 
   1247 /*
   1248  * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
   1249  * If the ICMP message is consumed by IP, i.e., it should not be delivered
   1250  * to any IPPROTO_ICMP raw sockets, then it returns NULL.
   1251  * Likewise, if the ICMP error is malformed (too short, etc), then it
   1252  * returns NULL. The caller uses this to determine whether or not to send
   1253  * to raw sockets.
   1254  *
   1255  * All error messages are passed to the matching transport stream.
   1256  *
   1257  * The following cases are handled by icmp_inbound_v4:
   1258  * 1) It needs to send a reply back and possibly deliver it
   1259  *    to the "interested" upper clients.
   1260  * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
   1261  * 3) It needs to change some values in IP only.
   1262  * 4) It needs to change some values in IP and upper layers e.g TCP
   1263  *    by delivering an error to the upper layers.
   1264  *
   1265  * We handle the above four cases in the context of IPsec in the
   1266  * following way :
   1267  *
   1268  * 1) Send the reply back in the same way as the request came in.
   1269  *    If it came in encrypted, it goes out encrypted. If it came in
   1270  *    clear, it goes out in clear. Thus, this will prevent chosen
   1271  *    plain text attack.
   1272  * 2) The client may or may not expect things to come in secure.
   1273  *    If it comes in secure, the policy constraints are checked
   1274  *    before delivering it to the upper layers. If it comes in
   1275  *    clear, ipsec_inbound_accept_clear will decide whether to
   1276  *    accept this in clear or not. In both the cases, if the returned
   1277  *    message (IP header + 8 bytes) that caused the icmp message has
   1278  *    AH/ESP headers, it is sent up to AH/ESP for validation before
   1279  *    sending up. If there are only 8 bytes of returned message, then
   1280  *    upper client will not be notified.
   1281  * 3) Check with global policy to see whether it matches the constraints.
   1282  *    But this will be done only if icmp_accept_messages_in_clear is
   1283  *    zero.
   1284  * 4) If we need to change both in IP and ULP, then the decision taken
   1285  *    while affecting the values in IP and while delivering up to TCP
   1286  *    should be the same.
   1287  *
   1288  * 	There are two cases.
   1289  *
   1290  * 	a) If we reject data at the IP layer (ipsec_check_global_policy()
   1291  *	   failed), we will not deliver it to the ULP, even though they
   1292  *	   are *willing* to accept in *clear*. This is fine as our global
   1293  *	   disposition to icmp messages asks us to reject the datagram.
   1294  *
   1295  *	b) If we accept data at the IP layer (ipsec_check_global_policy()
   1296  *	   succeeded or icmp_accept_messages_in_clear is 1), and not able
   1297  *	   to deliver it to ULP (policy failed), it can lead to
   1298  *	   consistency problems. The cases known at this time are
   1299  *	   ICMP_DESTINATION_UNREACHABLE  messages with following code
   1300  *	   values :
   1301  *
   1302  *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
   1303  *	     and Upper layer rejects. Then the communication will
   1304  *	     come to a stop. This is solved by making similar decisions
   1305  *	     at both levels. Currently, when we are unable to deliver
   1306  *	     to the Upper Layer (due to policy failures) while IP has
   1307  *	     adjusted dce_pmtu, the next outbound datagram would
   1308  *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
   1309  *	     will be with the right level of protection. Thus the right
   1310  *	     value will be communicated even if we are not able to
   1311  *	     communicate when we get from the wire initially. But this
   1312  *	     assumes there would be at least one outbound datagram after
   1313  *	     IP has adjusted its dce_pmtu value. To make things
   1314  *	     simpler, we accept in clear after the validation of
   1315  *	     AH/ESP headers.
   1316  *
   1317  *	   - Other ICMP ERRORS : We may not be able to deliver it to the
   1318  *	     upper layer depending on the level of protection the upper
   1319  *	     layer expects and the disposition in ipsec_inbound_accept_clear().
   1320  *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
   1321  *	     should be accepted in clear when the Upper layer expects secure.
   1322  *	     Thus the communication may get aborted by some bad ICMP
   1323  *	     packets.
   1324  */
   1325 mblk_t *
   1326 icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
   1327 {
   1328 	icmph_t		*icmph;
   1329 	ipha_t		*ipha;		/* Outer header */
   1330 	int		ip_hdr_length;	/* Outer header length */
   1331 	boolean_t	interested;
   1332 	ipif_t		*ipif;
   1333 	uint32_t	ts;
   1334 	uint32_t	*tsp;
   1335 	timestruc_t	now;
   1336 	ill_t		*ill = ira->ira_ill;
   1337 	ip_stack_t	*ipst = ill->ill_ipst;
   1338 	zoneid_t	zoneid = ira->ira_zoneid;
   1339 	int		len_needed;
   1340 	mblk_t		*mp_ret = NULL;
   1341 
   1342 	ipha = (ipha_t *)mp->b_rptr;
   1343 
   1344 	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
   1345 
   1346 	ip_hdr_length = ira->ira_ip_hdr_length;
   1347 	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
   1348 		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
   1349 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   1350 			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   1351 			freemsg(mp);
   1352 			return (NULL);
   1353 		}
   1354 		/* Last chance to get real. */
   1355 		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
   1356 		if (ipha == NULL) {
   1357 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   1358 			freemsg(mp);
   1359 			return (NULL);
   1360 		}
   1361 	}
   1362 
   1363 	/* The IP header will always be a multiple of four bytes */
   1364 	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1365 	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
   1366 	    icmph->icmph_code));
   1367 
   1368 	/*
   1369 	 * We will set "interested" to "true" if we should pass a copy to
   1370 	 * the transport or if we handle the packet locally.
   1371 	 */
   1372 	interested = B_FALSE;
   1373 	switch (icmph->icmph_type) {
   1374 	case ICMP_ECHO_REPLY:
   1375 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
   1376 		break;
   1377 	case ICMP_DEST_UNREACHABLE:
   1378 		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
   1379 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
   1380 		interested = B_TRUE;	/* Pass up to transport */
   1381 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
   1382 		break;
   1383 	case ICMP_SOURCE_QUENCH:
   1384 		interested = B_TRUE;	/* Pass up to transport */
   1385 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
   1386 		break;
   1387 	case ICMP_REDIRECT:
   1388 		if (!ipst->ips_ip_ignore_redirect)
   1389 			interested = B_TRUE;
   1390 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
   1391 		break;
   1392 	case ICMP_ECHO_REQUEST:
   1393 		/*
   1394 		 * Whether to respond to echo requests that come in as IP
   1395 		 * broadcasts or as IP multicast is subject to debate
   1396 		 * (what isn't?).  We aim to please, you pick it.
   1397 		 * Default is do it.
   1398 		 */
   1399 		if (ira->ira_flags & IRAF_MULTICAST) {
   1400 			/* multicast: respond based on tunable */
   1401 			interested = ipst->ips_ip_g_resp_to_echo_mcast;
   1402 		} else if (ira->ira_flags & IRAF_BROADCAST) {
   1403 			/* broadcast: respond based on tunable */
   1404 			interested = ipst->ips_ip_g_resp_to_echo_bcast;
   1405 		} else {
   1406 			/* unicast: always respond */
   1407 			interested = B_TRUE;
   1408 		}
   1409 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
   1410 		if (!interested) {
   1411 			/* We never pass these to RAW sockets */
   1412 			freemsg(mp);
   1413 			return (NULL);
   1414 		}
   1415 
   1416 		/* Check db_ref to make sure we can modify the packet. */
   1417 		if (mp->b_datap->db_ref > 1) {
   1418 			mblk_t	*mp1;
   1419 
   1420 			mp1 = copymsg(mp);
   1421 			freemsg(mp);
   1422 			if (!mp1) {
   1423 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1424 				return (NULL);
   1425 			}
   1426 			mp = mp1;
   1427 			ipha = (ipha_t *)mp->b_rptr;
   1428 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1429 		}
   1430 		icmph->icmph_type = ICMP_ECHO_REPLY;
   1431 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
   1432 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1433 		return (NULL);
   1434 
   1435 	case ICMP_ROUTER_ADVERTISEMENT:
   1436 	case ICMP_ROUTER_SOLICITATION:
   1437 		break;
   1438 	case ICMP_TIME_EXCEEDED:
   1439 		interested = B_TRUE;	/* Pass up to transport */
   1440 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
   1441 		break;
   1442 	case ICMP_PARAM_PROBLEM:
   1443 		interested = B_TRUE;	/* Pass up to transport */
   1444 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
   1445 		break;
   1446 	case ICMP_TIME_STAMP_REQUEST:
   1447 		/* Response to Time Stamp Requests is local policy. */
   1448 		if (ipst->ips_ip_g_resp_to_timestamp) {
   1449 			if (ira->ira_flags & IRAF_MULTIBROADCAST)
   1450 				interested =
   1451 				    ipst->ips_ip_g_resp_to_timestamp_bcast;
   1452 			else
   1453 				interested = B_TRUE;
   1454 		}
   1455 		if (!interested) {
   1456 			/* We never pass these to RAW sockets */
   1457 			freemsg(mp);
   1458 			return (NULL);
   1459 		}
   1460 
   1461 		/* Make sure we have enough of the packet */
   1462 		len_needed = ip_hdr_length + ICMPH_SIZE +
   1463 		    3 * sizeof (uint32_t);
   1464 
   1465 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   1466 			ipha = ip_pullup(mp, len_needed, ira);
   1467 			if (ipha == NULL) {
   1468 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1469 				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
   1470 				    mp, ill);
   1471 				freemsg(mp);
   1472 				return (NULL);
   1473 			}
   1474 			/* Refresh following the pullup. */
   1475 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1476 		}
   1477 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
   1478 		/* Check db_ref to make sure we can modify the packet. */
   1479 		if (mp->b_datap->db_ref > 1) {
   1480 			mblk_t	*mp1;
   1481 
   1482 			mp1 = copymsg(mp);
   1483 			freemsg(mp);
   1484 			if (!mp1) {
   1485 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1486 				return (NULL);
   1487 			}
   1488 			mp = mp1;
   1489 			ipha = (ipha_t *)mp->b_rptr;
   1490 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1491 		}
   1492 		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
   1493 		tsp = (uint32_t *)&icmph[1];
   1494 		tsp++;		/* Skip past 'originate time' */
   1495 		/* Compute # of milliseconds since midnight */
   1496 		gethrestime(&now);
   1497 		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   1498 		    now.tv_nsec / (NANOSEC / MILLISEC);
   1499 		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
   1500 		*tsp++ = htonl(ts);	/* Lay in 'send time' */
   1501 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
   1502 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1503 		return (NULL);
   1504 
   1505 	case ICMP_TIME_STAMP_REPLY:
   1506 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
   1507 		break;
   1508 	case ICMP_INFO_REQUEST:
   1509 		/* Per RFC 1122 3.2.2.7, ignore this. */
   1510 	case ICMP_INFO_REPLY:
   1511 		break;
   1512 	case ICMP_ADDRESS_MASK_REQUEST:
   1513 		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1514 			interested =
   1515 			    ipst->ips_ip_respond_to_address_mask_broadcast;
   1516 		} else {
   1517 			interested = B_TRUE;
   1518 		}
   1519 		if (!interested) {
   1520 			/* We never pass these to RAW sockets */
   1521 			freemsg(mp);
   1522 			return (NULL);
   1523 		}
   1524 		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
   1525 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   1526 			ipha = ip_pullup(mp, len_needed, ira);
   1527 			if (ipha == NULL) {
   1528 				BUMP_MIB(ill->ill_ip_mib,
   1529 				    ipIfStatsInTruncatedPkts);
   1530 				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
   1531 				    ill);
   1532 				freemsg(mp);
   1533 				return (NULL);
   1534 			}
   1535 			/* Refresh following the pullup. */
   1536 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1537 		}
   1538 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
   1539 		/* Check db_ref to make sure we can modify the packet. */
   1540 		if (mp->b_datap->db_ref > 1) {
   1541 			mblk_t	*mp1;
   1542 
   1543 			mp1 = copymsg(mp);
   1544 			freemsg(mp);
   1545 			if (!mp1) {
   1546 				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   1547 				return (NULL);
   1548 			}
   1549 			mp = mp1;
   1550 			ipha = (ipha_t *)mp->b_rptr;
   1551 			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1552 		}
   1553 		/*
   1554 		 * Need the ipif with the mask be the same as the source
   1555 		 * address of the mask reply. For unicast we have a specific
   1556 		 * ipif. For multicast/broadcast we only handle onlink
   1557 		 * senders, and use the source address to pick an ipif.
   1558 		 */
   1559 		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
   1560 		if (ipif == NULL) {
   1561 			/* Broadcast or multicast */
   1562 			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
   1563 			if (ipif == NULL) {
   1564 				freemsg(mp);
   1565 				return (NULL);
   1566 			}
   1567 		}
   1568 		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
   1569 		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
   1570 		ipif_refrele(ipif);
   1571 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
   1572 		icmp_send_reply_v4(mp, ipha, icmph, ira);
   1573 		return (NULL);
   1574 
   1575 	case ICMP_ADDRESS_MASK_REPLY:
   1576 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
   1577 		break;
   1578 	default:
   1579 		interested = B_TRUE;	/* Pass up to transport */
   1580 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
   1581 		break;
   1582 	}
   1583 	/*
   1584 	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
   1585 	 * if there isn't one.
   1586 	 */
   1587 	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
   1588 		/* If there is an ICMP client and we want one too, copy it. */
   1589 
   1590 		if (!interested) {
   1591 			/* Caller will deliver to RAW sockets */
   1592 			return (mp);
   1593 		}
   1594 		mp_ret = copymsg(mp);
   1595 		if (mp_ret == NULL) {
   1596 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1597 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
   1598 		}
   1599 	} else if (!interested) {
   1600 		/* Neither we nor raw sockets are interested. Drop packet now */
   1601 		freemsg(mp);
   1602 		return (NULL);
   1603 	}
   1604 
   1605 	/*
   1606 	 * ICMP error or redirect packet. Make sure we have enough of
   1607 	 * the header and that db_ref == 1 since we might end up modifying
   1608 	 * the packet.
   1609 	 */
   1610 	if (mp->b_cont != NULL) {
   1611 		if (ip_pullup(mp, -1, ira) == NULL) {
   1612 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1613 			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
   1614 			    mp, ill);
   1615 			freemsg(mp);
   1616 			return (mp_ret);
   1617 		}
   1618 	}
   1619 
   1620 	if (mp->b_datap->db_ref > 1) {
   1621 		mblk_t	*mp1;
   1622 
   1623 		mp1 = copymsg(mp);
   1624 		if (mp1 == NULL) {
   1625 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1626 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
   1627 			freemsg(mp);
   1628 			return (mp_ret);
   1629 		}
   1630 		freemsg(mp);
   1631 		mp = mp1;
   1632 	}
   1633 
   1634 	/*
   1635 	 * In case mp has changed, verify the message before any further
   1636 	 * processes.
   1637 	 */
   1638 	ipha = (ipha_t *)mp->b_rptr;
   1639 	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
   1640 	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   1641 		freemsg(mp);
   1642 		return (mp_ret);
   1643 	}
   1644 
   1645 	switch (icmph->icmph_type) {
   1646 	case ICMP_REDIRECT:
   1647 		icmp_redirect_v4(mp, ipha, icmph, ira);
   1648 		break;
   1649 	case ICMP_DEST_UNREACHABLE:
   1650 		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
   1651 			/* Update DCE and adjust MTU is icmp header if needed */
   1652 			icmp_inbound_too_big_v4(icmph, ira);
   1653 		}
   1654 		/* FALLTHRU */
   1655 	default:
   1656 		icmp_inbound_error_fanout_v4(mp, icmph, ira);
   1657 		break;
   1658 	}
   1659 	return (mp_ret);
   1660 }
   1661 
   1662 /*
   1663  * Send an ICMP echo, timestamp or address mask reply.
   1664  * The caller has already updated the payload part of the packet.
   1665  * We handle the ICMP checksum, IP source address selection and feed
   1666  * the packet into ip_output_simple.
   1667  */
   1668 static void
   1669 icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
   1670     ip_recv_attr_t *ira)
   1671 {
   1672 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
   1673 	ill_t		*ill = ira->ira_ill;
   1674 	ip_stack_t	*ipst = ill->ill_ipst;
   1675 	ip_xmit_attr_t	ixas;
   1676 
   1677 	/* Send out an ICMP packet */
   1678 	icmph->icmph_checksum = 0;
   1679 	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
   1680 	/* Reset time to live. */
   1681 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   1682 	{
   1683 		/* Swap source and destination addresses */
   1684 		ipaddr_t tmp;
   1685 
   1686 		tmp = ipha->ipha_src;
   1687 		ipha->ipha_src = ipha->ipha_dst;
   1688 		ipha->ipha_dst = tmp;
   1689 	}
   1690 	ipha->ipha_ident = 0;
   1691 	if (!IS_SIMPLE_IPH(ipha))
   1692 		icmp_options_update(ipha);
   1693 
   1694 	bzero(&ixas, sizeof (ixas));
   1695 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   1696 	ixas.ixa_zoneid = ira->ira_zoneid;
   1697 	ixas.ixa_cred = kcred;
   1698 	ixas.ixa_cpid = NOPID;
   1699 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   1700 	ixas.ixa_ifindex = 0;
   1701 	ixas.ixa_ipst = ipst;
   1702 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   1703 
   1704 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
   1705 		/*
   1706 		 * This packet should go out the same way as it
   1707 		 * came in i.e in clear, independent of the IPsec policy
   1708 		 * for transmitting packets.
   1709 		 */
   1710 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   1711 	} else {
   1712 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   1713 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1714 			/* Note: mp already consumed and ip_drop_packet done */
   1715 			return;
   1716 		}
   1717 	}
   1718 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1719 		/*
   1720 		 * Not one or our addresses (IRE_LOCALs), thus we let
   1721 		 * ip_output_simple pick the source.
   1722 		 */
   1723 		ipha->ipha_src = INADDR_ANY;
   1724 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   1725 	}
   1726 	/* Should we send with DF and use dce_pmtu? */
   1727 	if (ipst->ips_ipv4_icmp_return_pmtu) {
   1728 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
   1729 		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
   1730 	}
   1731 
   1732 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   1733 
   1734 	(void) ip_output_simple(mp, &ixas);
   1735 	ixa_cleanup(&ixas);
   1736 }
   1737 
   1738 /*
   1739  * Verify the ICMP messages for either for ICMP error or redirect packet.
   1740  * The caller should have fully pulled up the message. If it's a redirect
   1741  * packet, only basic checks on IP header will be done; otherwise, verify
   1742  * the packet by looking at the included ULP header.
   1743  *
   1744  * Called before icmp_inbound_error_fanout_v4 is called.
   1745  */
   1746 static boolean_t
   1747 icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   1748 {
   1749 	ill_t		*ill = ira->ira_ill;
   1750 	int		hdr_length;
   1751 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1752 	conn_t		*connp;
   1753 	ipha_t		*ipha;	/* Inner IP header */
   1754 
   1755 	ipha = (ipha_t *)&icmph[1];
   1756 	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
   1757 		goto truncated;
   1758 
   1759 	hdr_length = IPH_HDR_LENGTH(ipha);
   1760 
   1761 	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
   1762 		goto discard_pkt;
   1763 
   1764 	if (hdr_length < sizeof (ipha_t))
   1765 		goto truncated;
   1766 
   1767 	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
   1768 		goto truncated;
   1769 
   1770 	/*
   1771 	 * Stop here for ICMP_REDIRECT.
   1772 	 */
   1773 	if (icmph->icmph_type == ICMP_REDIRECT)
   1774 		return (B_TRUE);
   1775 
   1776 	/*
   1777 	 * ICMP errors only.
   1778 	 */
   1779 	switch (ipha->ipha_protocol) {
   1780 	case IPPROTO_UDP:
   1781 		/*
   1782 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1783 		 * transport header.
   1784 		 */
   1785 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1786 		    mp->b_wptr)
   1787 			goto truncated;
   1788 		break;
   1789 	case IPPROTO_TCP: {
   1790 		tcpha_t		*tcpha;
   1791 
   1792 		/*
   1793 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1794 		 * transport header.
   1795 		 */
   1796 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1797 		    mp->b_wptr)
   1798 			goto truncated;
   1799 
   1800 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   1801 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   1802 		    ipst);
   1803 		if (connp == NULL)
   1804 			goto discard_pkt;
   1805 
   1806 		if ((connp->conn_verifyicmp != NULL) &&
   1807 		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
   1808 			CONN_DEC_REF(connp);
   1809 			goto discard_pkt;
   1810 		}
   1811 		CONN_DEC_REF(connp);
   1812 		break;
   1813 	}
   1814 	case IPPROTO_SCTP:
   1815 		/*
   1816 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1817 		 * transport header.
   1818 		 */
   1819 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1820 		    mp->b_wptr)
   1821 			goto truncated;
   1822 		break;
   1823 	case IPPROTO_ESP:
   1824 	case IPPROTO_AH:
   1825 		break;
   1826 	case IPPROTO_ENCAP:
   1827 		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
   1828 		    mp->b_wptr)
   1829 			goto truncated;
   1830 		break;
   1831 	default:
   1832 		break;
   1833 	}
   1834 
   1835 	return (B_TRUE);
   1836 
   1837 discard_pkt:
   1838 	/* Bogus ICMP error. */
   1839 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1840 	return (B_FALSE);
   1841 
   1842 truncated:
   1843 	/* We pulled up everthing already. Must be truncated */
   1844 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   1845 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   1846 	return (B_FALSE);
   1847 }
   1848 
/*
 * Path MTU values from the table in RFC 1191, listed from largest to
 * smallest.
 */
static int icmp_frag_size_table[] = {
	32000, 17914, 8166, 4352, 2002,
	1496, 1006, 508, 296, 68
};
   1852 
   1853 /*
   1854  * Process received ICMP Packet too big.
   1855  * Just handles the DCE create/update, including using the above table of
   1856  * PMTU guesses. The caller is responsible for validating the packet before
   1857  * passing it in and also to fanout the ICMP error to any matching transport
   1858  * conns. Assumes the message has been fully pulled up and verified.
   1859  *
   1860  * Before getting here, the caller has called icmp_inbound_verify_v4()
   1861  * that should have verified with ULP to prevent undoing the changes we're
   1862  * going to make to DCE. For example, TCP might have verified that the packet
   1863  * which generated error is in the send window.
   1864  *
   1865  * In some cases modified this MTU in the ICMP header packet; the caller
   1866  * should pass to the matching ULP after this returns.
   1867  */
   1868 static void
   1869 icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
   1870 {
   1871 	dce_t		*dce;
   1872 	int		old_mtu;
   1873 	int		mtu, orig_mtu;
   1874 	ipaddr_t	dst;
   1875 	boolean_t	disable_pmtud;
   1876 	ill_t		*ill = ira->ira_ill;
   1877 	ip_stack_t	*ipst = ill->ill_ipst;
   1878 	uint_t		hdr_length;
   1879 	ipha_t		*ipha;
   1880 
   1881 	/* Caller already pulled up everything. */
   1882 	ipha = (ipha_t *)&icmph[1];
   1883 	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   1884 	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
   1885 	ASSERT(ill != NULL);
   1886 
   1887 	hdr_length = IPH_HDR_LENGTH(ipha);
   1888 
   1889 	/*
   1890 	 * We handle path MTU for source routed packets since the DCE
   1891 	 * is looked up using the final destination.
   1892 	 */
   1893 	dst = ip_get_dst(ipha);
   1894 
   1895 	dce = dce_lookup_and_add_v4(dst, ipst);
   1896 	if (dce == NULL) {
   1897 		/* Couldn't add a unique one - ENOMEM */
   1898 		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
   1899 		    ntohl(dst)));
   1900 		return;
   1901 	}
   1902 
   1903 	/* Check for MTU discovery advice as described in RFC 1191 */
   1904 	mtu = ntohs(icmph->icmph_du_mtu);
   1905 	orig_mtu = mtu;
   1906 	disable_pmtud = B_FALSE;
   1907 
   1908 	mutex_enter(&dce->dce_lock);
   1909 	if (dce->dce_flags & DCEF_PMTU)
   1910 		old_mtu = dce->dce_pmtu;
   1911 	else
   1912 		old_mtu = ill->ill_mtu;
   1913 
   1914 	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
   1915 		uint32_t length;
   1916 		int	i;
   1917 
   1918 		/*
   1919 		 * Use the table from RFC 1191 to figure out
   1920 		 * the next "plateau" based on the length in
   1921 		 * the original IP packet.
   1922 		 */
   1923 		length = ntohs(ipha->ipha_length);
   1924 		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
   1925 		    uint32_t, length);
   1926 		if (old_mtu <= length &&
   1927 		    old_mtu >= length - hdr_length) {
   1928 			/*
   1929 			 * Handle broken BSD 4.2 systems that
   1930 			 * return the wrong ipha_length in ICMP
   1931 			 * errors.
   1932 			 */
   1933 			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
   1934 			    length, old_mtu));
   1935 			length -= hdr_length;
   1936 		}
   1937 		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
   1938 			if (length > icmp_frag_size_table[i])
   1939 				break;
   1940 		}
   1941 		if (i == A_CNT(icmp_frag_size_table)) {
   1942 			/* Smaller than IP_MIN_MTU! */
   1943 			ip1dbg(("Too big for packet size %d\n",
   1944 			    length));
   1945 			disable_pmtud = B_TRUE;
   1946 			mtu = ipst->ips_ip_pmtu_min;
   1947 		} else {
   1948 			mtu = icmp_frag_size_table[i];
   1949 			ip1dbg(("Calculated mtu %d, packet size %d, "
   1950 			    "before %d\n", mtu, length, old_mtu));
   1951 			if (mtu < ipst->ips_ip_pmtu_min) {
   1952 				mtu = ipst->ips_ip_pmtu_min;
   1953 				disable_pmtud = B_TRUE;
   1954 			}
   1955 		}
   1956 	}
   1957 	if (disable_pmtud)
   1958 		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
   1959 	else
   1960 		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
   1961 
   1962 	dce->dce_pmtu = MIN(old_mtu, mtu);
   1963 	/* Prepare to send the new max frag size for the ULP. */
   1964 	icmph->icmph_du_zero = 0;
   1965 	icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
   1966 	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
   1967 	    dce, int, orig_mtu, int, mtu);
   1968 
   1969 	/* We now have a PMTU for sure */
   1970 	dce->dce_flags |= DCEF_PMTU;
   1971 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   1972 	mutex_exit(&dce->dce_lock);
   1973 	/*
   1974 	 * After dropping the lock the new value is visible to everyone.
   1975 	 * Then we bump the generation number so any cached values reinspect
   1976 	 * the dce_t.
   1977 	 */
   1978 	dce_increment_generation(dce);
   1979 	dce_refrele(dce);
   1980 }
   1981 
   1982 /*
   1983  * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
   1984  * calls this function.
   1985  */
   1986 static mblk_t *
   1987 icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
   1988 {
   1989 	int length;
   1990 
   1991 	ASSERT(mp->b_datap->db_type == M_DATA);
   1992 
   1993 	/* icmp_inbound_v4 has already pulled up the whole error packet */
   1994 	ASSERT(mp->b_cont == NULL);
   1995 
   1996 	/*
   1997 	 * The length that we want to overlay is the inner header
   1998 	 * and what follows it.
   1999 	 */
   2000 	length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
   2001 
   2002 	/*
   2003 	 * Overlay the inner header and whatever follows it over the
   2004 	 * outer header.
   2005 	 */
   2006 	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
   2007 
   2008 	/* Adjust for what we removed */
   2009 	mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
   2010 	return (mp);
   2011 }
   2012 
   2013 /*
   2014  * Try to pass the ICMP message upstream in case the ULP cares.
   2015  *
   2016  * If the packet that caused the ICMP error is secure, we send
   2017  * it to AH/ESP to make sure that the attached packet has a
   2018  * valid association. ipha in the code below points to the
   2019  * IP header of the packet that caused the error.
   2020  *
   2021  * For IPsec cases, we let the next-layer-up (which has access to
   2022  * cached policy on the conn_t, or can query the SPD directly)
   2023  * subtract out any IPsec overhead if they must.  We therefore make no
   2024  * adjustments here for IPsec overhead.
   2025  *
   2026  * IFN could have been generated locally or by some router.
   2027  *
   2028  * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
   2029  * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
   2030  *	    This happens because IP adjusted its value of MTU on an
   2031  *	    earlier IFN message and could not tell the upper layer,
   2032  *	    the new adjusted value of MTU e.g. Packet was encrypted
   2033  *	    or there was not enough information to fanout to upper
   2034  *	    layers. Thus on the next outbound datagram, ire_send_wire
   2035  *	    generates the IFN, where IPsec processing has *not* been
   2036  *	    done.
   2037  *
   2038  *	    Note that we retain ixa_fragsize across IPsec; thus once
   2039  *	    we have picked ixa_fragsize and entered ipsec_out_process we do
   2040  *	    not change the fragsize even if the path MTU changes before
   2041  *	    we reach ip_output_post_ipsec.
   2042  *
   2043  *	    In the local case, IRAF_LOOPBACK will be set indicating
   2044  *	    that IFN was generated locally.
   2045  *
   2046  * ROUTER : IFN could be secure or non-secure.
   2047  *
   2048  *	    * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
   2049  *	      packet in error has AH/ESP headers to validate the AH/ESP
   2050  *	      headers. AH/ESP will verify whether there is a valid SA or
   2051  *	      not and send it back. We will fanout again if we have more
   2052  *	      data in the packet.
   2053  *
   2054  *	      If the packet in error does not have AH/ESP, we handle it
   2055  *	      like any other case.
   2056  *
   2057  *	    * NON_SECURE : If the packet in error has AH/ESP headers, we send it
   2058  *	      up to AH/ESP for validation. AH/ESP will verify whether there is a
   2059  *	      valid SA or not and send it back. We will fanout again if
   2060  *	      we have more data in the packet.
   2061  *
   2062  *	      If the packet in error does not have AH/ESP, we handle it
   2063  *	      like any other case.
   2064  *
   2065  * The caller must have called icmp_inbound_verify_v4.
   2066  */
   2067 static void
   2068 icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   2069 {
   2070 	uint16_t	*up;	/* Pointer to ports in ULP header */
   2071 	uint32_t	ports;	/* reversed ports for fanout */
   2072 	ipha_t		ripha;	/* With reversed addresses */
   2073 	ipha_t		*ipha;  /* Inner IP header */
   2074 	uint_t		hdr_length; /* Inner IP header length */
   2075 	tcpha_t		*tcpha;
   2076 	conn_t		*connp;
   2077 	ill_t		*ill = ira->ira_ill;
   2078 	ip_stack_t	*ipst = ill->ill_ipst;
   2079 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   2080 	ill_t		*rill = ira->ira_rill;
   2081 
   2082 	/* Caller already pulled up everything. */
   2083 	ipha = (ipha_t *)&icmph[1];
   2084 	ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
   2085 	ASSERT(mp->b_cont == NULL);
   2086 
   2087 	hdr_length = IPH_HDR_LENGTH(ipha);
   2088 	ira->ira_protocol = ipha->ipha_protocol;
   2089 
   2090 	/*
   2091 	 * We need a separate IP header with the source and destination
   2092 	 * addresses reversed to do fanout/classification because the ipha in
   2093 	 * the ICMP error is in the form we sent it out.
   2094 	 */
   2095 	ripha.ipha_src = ipha->ipha_dst;
   2096 	ripha.ipha_dst = ipha->ipha_src;
   2097 	ripha.ipha_protocol = ipha->ipha_protocol;
   2098 	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
   2099 
   2100 	ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
   2101 	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
   2102 	    ntohl(ipha->ipha_dst),
   2103 	    icmph->icmph_type, icmph->icmph_code));
   2104 
   2105 	switch (ipha->ipha_protocol) {
   2106 	case IPPROTO_UDP:
   2107 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2108 
   2109 		/* Attempt to find a client stream based on port. */
   2110 		ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
   2111 		    ntohs(up[0]), ntohs(up[1])));
   2112 
   2113 		/* Note that we send error to all matches. */
   2114 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2115 		ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
   2116 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2117 		return;
   2118 
   2119 	case IPPROTO_TCP:
   2120 		/*
   2121 		 * Find a TCP client stream for this packet.
   2122 		 * Note that we do a reverse lookup since the header is
   2123 		 * in the form we sent it out.
   2124 		 */
   2125 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   2126 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   2127 		    ipst);
   2128 		if (connp == NULL)
   2129 			goto discard_pkt;
   2130 
   2131 		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
   2132 		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
   2133 			mp = ipsec_check_inbound_policy(mp, connp,
   2134 			    ipha, NULL, ira);
   2135 			if (mp == NULL) {
   2136 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2137 				/* Note that mp is NULL */
   2138 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2139 				CONN_DEC_REF(connp);
   2140 				return;
   2141 			}
   2142 		}
   2143 
   2144 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2145 		ira->ira_ill = ira->ira_rill = NULL;
   2146 		if (IPCL_IS_TCP(connp)) {
   2147 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   2148 			    connp->conn_recvicmp, connp, ira, SQ_FILL,
   2149 			    SQTAG_TCP_INPUT_ICMP_ERR);
   2150 		} else {
   2151 			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
   2152 			(connp->conn_recv)(connp, mp, NULL, ira);
   2153 			CONN_DEC_REF(connp);
   2154 		}
   2155 		ira->ira_ill = ill;
   2156 		ira->ira_rill = rill;
   2157 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2158 		return;
   2159 
   2160 	case IPPROTO_SCTP:
   2161 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2162 		/* Find a SCTP client stream for this packet. */
   2163 		((uint16_t *)&ports)[0] = up[1];
   2164 		((uint16_t *)&ports)[1] = up[0];
   2165 
   2166 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2167 		ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
   2168 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2169 		return;
   2170 
   2171 	case IPPROTO_ESP:
   2172 	case IPPROTO_AH:
   2173 		if (!ipsec_loaded(ipss)) {
   2174 			ip_proto_not_sup(mp, ira);
   2175 			return;
   2176 		}
   2177 
   2178 		if (ipha->ipha_protocol == IPPROTO_ESP)
   2179 			mp = ipsecesp_icmp_error(mp, ira);
   2180 		else
   2181 			mp = ipsecah_icmp_error(mp, ira);
   2182 		if (mp == NULL)
   2183 			return;
   2184 
   2185 		/* Just in case ipsec didn't preserve the NULL b_cont */
   2186 		if (mp->b_cont != NULL) {
   2187 			if (!pullupmsg(mp, -1))
   2188 				goto discard_pkt;
   2189 		}
   2190 
   2191 		/*
   2192 		 * Note that ira_pktlen and ira_ip_hdr_length are no longer
   2193 		 * correct, but we don't use them any more here.
   2194 		 *
   2195 		 * If succesful, the mp has been modified to not include
   2196 		 * the ESP/AH header so we can fanout to the ULP's icmp
   2197 		 * error handler.
   2198 		 */
   2199 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2200 			goto truncated;
   2201 
   2202 		/* Verify the modified message before any further processes. */
   2203 		ipha = (ipha_t *)mp->b_rptr;
   2204 		hdr_length = IPH_HDR_LENGTH(ipha);
   2205 		icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2206 		if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2207 			freemsg(mp);
   2208 			return;
   2209 		}
   2210 
   2211 		icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2212 		return;
   2213 
   2214 	case IPPROTO_ENCAP: {
   2215 		/* Look for self-encapsulated packets that caused an error */
   2216 		ipha_t *in_ipha;
   2217 
   2218 		/*
   2219 		 * Caller has verified that length has to be
   2220 		 * at least the size of IP header.
   2221 		 */
   2222 		ASSERT(hdr_length >= sizeof (ipha_t));
   2223 		/*
   2224 		 * Check the sanity of the inner IP header like
   2225 		 * we did for the outer header.
   2226 		 */
   2227 		in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
   2228 		if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
   2229 			goto discard_pkt;
   2230 		}
   2231 		if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
   2232 			goto discard_pkt;
   2233 		}
   2234 		/* Check for Self-encapsulated tunnels */
   2235 		if (in_ipha->ipha_src == ipha->ipha_src &&
   2236 		    in_ipha->ipha_dst == ipha->ipha_dst) {
   2237 
   2238 			mp = icmp_inbound_self_encap_error_v4(mp, ipha,
   2239 			    in_ipha);
   2240 			if (mp == NULL)
   2241 				goto discard_pkt;
   2242 
   2243 			/*
   2244 			 * Just in case self_encap didn't preserve the NULL
   2245 			 * b_cont
   2246 			 */
   2247 			if (mp->b_cont != NULL) {
   2248 				if (!pullupmsg(mp, -1))
   2249 					goto discard_pkt;
   2250 			}
   2251 			/*
   2252 			 * Note that ira_pktlen and ira_ip_hdr_length are no
   2253 			 * longer correct, but we don't use them any more here.
   2254 			 */
   2255 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2256 				goto truncated;
   2257 
   2258 			/*
   2259 			 * Verify the modified message before any further
   2260 			 * processes.
   2261 			 */
   2262 			ipha = (ipha_t *)mp->b_rptr;
   2263 			hdr_length = IPH_HDR_LENGTH(ipha);
   2264 			icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2265 			if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2266 				freemsg(mp);
   2267 				return;
   2268 			}
   2269 
   2270 			/*
   2271 			 * The packet in error is self-encapsualted.
   2272 			 * And we are finding it further encapsulated
   2273 			 * which we could not have possibly generated.
   2274 			 */
   2275 			if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2276 				goto discard_pkt;
   2277 			}
   2278 			icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2279 			return;
   2280 		}
   2281 		/* No self-encapsulated */
   2282 		/* FALLTHRU */
   2283 	}
   2284 	case IPPROTO_IPV6:
   2285 		if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
   2286 		    &ripha.ipha_dst, ipst)) != NULL) {
   2287 			ira->ira_flags |= IRAF_ICMP_ERROR;
   2288 			connp->conn_recvicmp(connp, mp, NULL, ira);
   2289 			CONN_DEC_REF(connp);
   2290 			ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2291 			return;
   2292 		}
   2293 		/*
   2294 		 * No IP tunnel is interested, fallthrough and see
   2295 		 * if a raw socket will want it.
   2296 		 */
   2297 		/* FALLTHRU */
   2298 	default:
   2299 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2300 		ip_fanout_proto_v4(mp, &ripha, ira);
   2301 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2302 		return;
   2303 	}
   2304 	/* NOTREACHED */
   2305 discard_pkt:
   2306 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2307 	ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
   2308 	ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2309 	freemsg(mp);
   2310 	return;
   2311 
   2312 truncated:
   2313 	/* We pulled up everthing already. Must be truncated */
   2314 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2315 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2316 	freemsg(mp);
   2317 }
   2318 
   2319 /*
   2320  * Common IP options parser.
   2321  *
   2322  * Setup routine: fill in *optp with options-parsing state, then
   2323  * tail-call ipoptp_next to return the first option.
   2324  */
   2325 uint8_t
   2326 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
   2327 {
   2328 	uint32_t totallen; /* total length of all options */
   2329 
   2330 	totallen = ipha->ipha_version_and_hdr_length -
   2331 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   2332 	totallen <<= 2;
   2333 	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
   2334 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2335 	optp->ipoptp_flags = 0;
   2336 	return (ipoptp_next(optp));
   2337 }
   2338 
   2339 /* Like above but without an ipha_t */
   2340 uint8_t
   2341 ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
   2342 {
   2343 	optp->ipoptp_next = opt;
   2344 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2345 	optp->ipoptp_flags = 0;
   2346 	return (ipoptp_next(optp));
   2347 }
   2348 
   2349 /*
   2350  * Common IP options parser: extract next option.
   2351  */
   2352 uint8_t
   2353 ipoptp_next(ipoptp_t *optp)
   2354 {
   2355 	uint8_t *end = optp->ipoptp_end;
   2356 	uint8_t *cur = optp->ipoptp_next;
   2357 	uint8_t opt, len, pointer;
   2358 
   2359 	/*
   2360 	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
   2361 	 * has been corrupted.
   2362 	 */
   2363 	ASSERT(cur <= end);
   2364 
   2365 	if (cur == end)
   2366 		return (IPOPT_EOL);
   2367 
   2368 	opt = cur[IPOPT_OPTVAL];
   2369 
   2370 	/*
   2371 	 * Skip any NOP options.
   2372 	 */
   2373 	while (opt == IPOPT_NOP) {
   2374 		cur++;
   2375 		if (cur == end)
   2376 			return (IPOPT_EOL);
   2377 		opt = cur[IPOPT_OPTVAL];
   2378 	}
   2379 
   2380 	if (opt == IPOPT_EOL)
   2381 		return (IPOPT_EOL);
   2382 
   2383 	/*
   2384 	 * Option requiring a length.
   2385 	 */
   2386 	if ((cur + 1) >= end) {
   2387 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2388 		return (IPOPT_EOL);
   2389 	}
   2390 	len = cur[IPOPT_OLEN];
   2391 	if (len < 2) {
   2392 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2393 		return (IPOPT_EOL);
   2394 	}
   2395 	optp->ipoptp_cur = cur;
   2396 	optp->ipoptp_len = len;
   2397 	optp->ipoptp_next = cur + len;
   2398 	if (cur + len > end) {
   2399 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2400 		return (IPOPT_EOL);
   2401 	}
   2402 
   2403 	/*
   2404 	 * For the options which require a pointer field, make sure
   2405 	 * its there, and make sure it points to either something
   2406 	 * inside this option, or the end of the option.
   2407 	 */
   2408 	switch (opt) {
   2409 	case IPOPT_RR:
   2410 	case IPOPT_TS:
   2411 	case IPOPT_LSRR:
   2412 	case IPOPT_SSRR:
   2413 		if (len <= IPOPT_OFFSET) {
   2414 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2415 			return (opt);
   2416 		}
   2417 		pointer = cur[IPOPT_OFFSET];
   2418 		if (pointer - 1 > len) {
   2419 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2420 			return (opt);
   2421 		}
   2422 		break;
   2423 	}
   2424 
   2425 	/*
   2426 	 * Sanity check the pointer field based on the type of the
   2427 	 * option.
   2428 	 */
   2429 	switch (opt) {
   2430 	case IPOPT_RR:
   2431 	case IPOPT_SSRR:
   2432 	case IPOPT_LSRR:
   2433 		if (pointer < IPOPT_MINOFF_SR)
   2434 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2435 		break;
   2436 	case IPOPT_TS:
   2437 		if (pointer < IPOPT_MINOFF_IT)
   2438 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2439 		/*
   2440 		 * Note that the Internet Timestamp option also
   2441 		 * contains two four bit fields (the Overflow field,
   2442 		 * and the Flag field), which follow the pointer
   2443 		 * field.  We don't need to check that these fields
   2444 		 * fall within the length of the option because this
   2445 		 * was implicitely done above.  We've checked that the
   2446 		 * pointer value is at least IPOPT_MINOFF_IT, and that
   2447 		 * it falls within the option.  Since IPOPT_MINOFF_IT >
   2448 		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
   2449 		 */
   2450 		ASSERT(len > IPOPT_POS_OV_FLG);
   2451 		break;
   2452 	}
   2453 
   2454 	return (opt);
   2455 }
   2456 
   2457 /*
   2458  * Use the outgoing IP header to create an IP_OPTIONS option the way
   2459  * it was passed down from the application.
   2460  *
   2461  * This is compatible with BSD in that it returns
   2462  * the reverse source route with the final destination
   2463  * as the last entry. The first 4 bytes of the option
   2464  * will contain the final destination.
   2465  */
   2466 int
   2467 ip_opt_get_user(conn_t *connp, uchar_t *buf)
   2468 {
   2469 	ipoptp_t	opts;
   2470 	uchar_t		*opt;
   2471 	uint8_t		optval;
   2472 	uint8_t		optlen;
   2473 	uint32_t	len = 0;
   2474 	uchar_t		*buf1 = buf;
   2475 	uint32_t	totallen;
   2476 	ipaddr_t	dst;
   2477 	ip_pkt_t	*ipp = &connp->conn_xmit_ipp;
   2478 
   2479 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   2480 		return (0);
   2481 
   2482 	totallen = ipp->ipp_ipv4_options_len;
   2483 	if (totallen & 0x3)
   2484 		return (0);
   2485 
   2486 	buf += IP_ADDR_LEN;	/* Leave room for final destination */
   2487 	len += IP_ADDR_LEN;
   2488 	bzero(buf1, IP_ADDR_LEN);
   2489 
   2490 	dst = connp->conn_faddr_v4;
   2491 
   2492 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   2493 	    optval != IPOPT_EOL;
   2494 	    optval = ipoptp_next(&opts)) {
   2495 		int	off;
   2496 
   2497 		opt = opts.ipoptp_cur;
   2498 		if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   2499 			break;
   2500 		}
   2501 		optlen = opts.ipoptp_len;
   2502 
   2503 		switch (optval) {
   2504 		case IPOPT_SSRR:
   2505 		case IPOPT_LSRR:
   2506 
   2507 			/*
   2508 			 * Insert destination as the first entry in the source
   2509 			 * route and move down the entries on step.
   2510 			 * The last entry gets placed at buf1.
   2511 			 */
   2512 			buf[IPOPT_OPTVAL] = optval;
   2513 			buf[IPOPT_OLEN] = optlen;
   2514 			buf[IPOPT_OFFSET] = optlen;
   2515 
   2516 			off = optlen - IP_ADDR_LEN;
   2517 			if (off < 0) {
   2518 				/* No entries in source route */
   2519 				break;
   2520 			}
   2521 			/* Last entry in source route if not already set */
   2522 			if (dst == INADDR_ANY)
   2523 				bcopy(opt + off, buf1, IP_ADDR_LEN);
   2524 			off -= IP_ADDR_LEN;
   2525 
   2526 			while (off > 0) {
   2527 				bcopy(opt + off,
   2528 				    buf + off + IP_ADDR_LEN,
   2529 				    IP_ADDR_LEN);
   2530 				off -= IP_ADDR_LEN;
   2531 			}
   2532 			/* ipha_dst into first slot */
   2533 			bcopy(&dst, buf + off + IP_ADDR_LEN,
   2534 			    IP_ADDR_LEN);
   2535 			buf += optlen;
   2536 			len += optlen;
   2537 			break;
   2538 
   2539 		default:
   2540 			bcopy(opt, buf, optlen);
   2541 			buf += optlen;
   2542 			len += optlen;
   2543 			break;
   2544 		}
   2545 	}
   2546 done:
   2547 	/* Pad the resulting options */
   2548 	while (len & 0x3) {
   2549 		*buf++ = IPOPT_EOL;
   2550 		len++;
   2551 	}
   2552 	return (len);
   2553 }
   2554 
   2555 /*
   2556  * Update any record route or timestamp options to include this host.
   2557  * Reverse any source route option.
   2558  * This routine assumes that the options are well formed i.e. that they
   2559  * have already been checked.
   2560  */
   2561 static void
   2562 icmp_options_update(ipha_t *ipha)
   2563 {
   2564 	ipoptp_t	opts;
   2565 	uchar_t		*opt;
   2566 	uint8_t		optval;
   2567 	ipaddr_t	src;		/* Our local address */
   2568 	ipaddr_t	dst;
   2569 
   2570 	ip2dbg(("icmp_options_update\n"));
   2571 	src = ipha->ipha_src;
   2572 	dst = ipha->ipha_dst;
   2573 
   2574 	for (optval = ipoptp_first(&opts, ipha);
   2575 	    optval != IPOPT_EOL;
   2576 	    optval = ipoptp_next(&opts)) {
   2577 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   2578 		opt = opts.ipoptp_cur;
   2579 		ip2dbg(("icmp_options_update: opt %d, len %d\n",
   2580 		    optval, opts.ipoptp_len));
   2581 		switch (optval) {
   2582 			int off1, off2;
   2583 		case IPOPT_SSRR:
   2584 		case IPOPT_LSRR:
   2585 			/*
   2586 			 * Reverse the source route.  The first entry
   2587 			 * should be the next to last one in the current
   2588 			 * source route (the last entry is our address).
   2589 			 * The last entry should be the final destination.
   2590 			 */
   2591 			off1 = IPOPT_MINOFF_SR - 1;
   2592 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   2593 			if (off2 < 0) {
   2594 				/* No entries in source route */
   2595 				ip1dbg((
   2596 				    "icmp_options_update: bad src route\n"));
   2597 				break;
   2598 			}
   2599 			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
   2600 			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
   2601 			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
   2602 			off2 -= IP_ADDR_LEN;
   2603 
   2604 			while (off1 < off2) {
   2605 				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
   2606 				bcopy((char *)opt + off2, (char *)opt + off1,
   2607 				    IP_ADDR_LEN);
   2608 				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
   2609 				off1 += IP_ADDR_LEN;
   2610 				off2 -= IP_ADDR_LEN;
   2611 			}
   2612 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   2613 			break;
   2614 		}
   2615 	}
   2616 }
   2617 
   2618 /*
   2619  * Process received ICMP Redirect messages.
   2620  * Assumes the caller has verified that the headers are in the pulled up mblk.
   2621  * Consumes mp.
   2622  */
   2623 static void
   2624 icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
   2625 {
   2626 	ire_t		*ire, *nire;
   2627 	ire_t		*prev_ire;
   2628 	ipaddr_t  	src, dst, gateway;
   2629 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2630 	ipha_t		*inner_ipha;	/* Inner IP header */
   2631 
   2632 	/* Caller already pulled up everything. */
   2633 	inner_ipha = (ipha_t *)&icmph[1];
   2634 	src = ipha->ipha_src;
   2635 	dst = inner_ipha->ipha_dst;
   2636 	gateway = icmph->icmph_rd_gateway;
   2637 	/* Make sure the new gateway is reachable somehow. */
   2638 	ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
   2639 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
   2640 	/*
   2641 	 * Make sure we had a route for the dest in question and that
   2642 	 * that route was pointing to the old gateway (the source of the
   2643 	 * redirect packet.)
   2644 	 * We do longest match and then compare ire_gateway_addr below.
   2645 	 */
   2646 	prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
   2647 	    NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
   2648 	/*
   2649 	 * Check that
   2650 	 *	the redirect was not from ourselves
   2651 	 *	the new gateway and the old gateway are directly reachable
   2652 	 */
   2653 	if (prev_ire == NULL || ire == NULL ||
   2654 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
   2655 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   2656 	    !(ire->ire_type & IRE_IF_ALL) ||
   2657 	    prev_ire->ire_gateway_addr != src) {
   2658 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2659 		ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
   2660 		freemsg(mp);
   2661 		if (ire != NULL)
   2662 			ire_refrele(ire);
   2663 		if (prev_ire != NULL)
   2664 			ire_refrele(prev_ire);
   2665 		return;
   2666 	}
   2667 
   2668 	ire_refrele(prev_ire);
   2669 	ire_refrele(ire);
   2670 
   2671 	/*
   2672 	 * TODO: more precise handling for cases 0, 2, 3, the latter two
   2673 	 * require TOS routing
   2674 	 */
   2675 	switch (icmph->icmph_code) {
   2676 	case 0:
   2677 	case 1:
   2678 		/* TODO: TOS specificity for cases 2 and 3 */
   2679 	case 2:
   2680 	case 3:
   2681 		break;
   2682 	default:
   2683 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2684 		ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
   2685 		freemsg(mp);
   2686 		return;
   2687 	}
   2688 	/*
   2689 	 * Create a Route Association.  This will allow us to remember that
   2690 	 * someone we believe told us to use the particular gateway.
   2691 	 */
   2692 	ire = ire_create(
   2693 	    (uchar_t *)&dst,			/* dest addr */
   2694 	    (uchar_t *)&ip_g_all_ones,		/* mask */
   2695 	    (uchar_t *)&gateway,		/* gateway addr */
   2696 	    IRE_HOST,
   2697 	    NULL,				/* ill */
   2698 	    ALL_ZONES,
   2699 	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
   2700 	    NULL,				/* tsol_gc_t */
   2701 	    ipst);
   2702 
   2703 	if (ire == NULL) {
   2704 		freemsg(mp);
   2705 		return;
   2706 	}
   2707 	nire = ire_add(ire);
   2708 	/* Check if it was a duplicate entry */
   2709 	if (nire != NULL && nire != ire) {
   2710 		ASSERT(nire->ire_identical_ref > 1);
   2711 		ire_delete(nire);
   2712 		ire_refrele(nire);
   2713 		nire = NULL;
   2714 	}
   2715 	ire = nire;
   2716 	if (ire != NULL) {
   2717 		ire_refrele(ire);		/* Held in ire_add */
   2718 
   2719 		/* tell routing sockets that we received a redirect */
   2720 		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
   2721 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
   2722 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
   2723 	}
   2724 
   2725 	/*
   2726 	 * Delete any existing IRE_HOST type redirect ires for this destination.
   2727 	 * This together with the added IRE has the effect of
   2728 	 * modifying an existing redirect.
   2729 	 */
   2730 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
   2731 	    ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
   2732 	if (prev_ire != NULL) {
   2733 		if (prev_ire ->ire_flags & RTF_DYNAMIC)
   2734 			ire_delete(prev_ire);
   2735 		ire_refrele(prev_ire);
   2736 	}
   2737 
   2738 	freemsg(mp);
   2739 }
   2740 
   2741 /*
   2742  * Generate an ICMP parameter problem message.
   2743  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   2744  * constructed by the caller.
   2745  */
   2746 static void
   2747 icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
   2748 {
   2749 	icmph_t	icmph;
   2750 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2751 
   2752 	mp = icmp_pkt_err_ok(mp, ira);
   2753 	if (mp == NULL)
   2754 		return;
   2755 
   2756 	bzero(&icmph, sizeof (icmph_t));
   2757 	icmph.icmph_type = ICMP_PARAM_PROBLEM;
   2758 	icmph.icmph_pp_ptr = ptr;
   2759 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
   2760 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   2761 }
   2762 
   2763 /*
   2764  * Build and ship an IPv4 ICMP message using the packet data in mp, and
   2765  * the ICMP header pointed to by "stuff".  (May be called as writer.)
   2766  * Note: assumes that icmp_pkt_err_ok has been called to verify that
   2767  * an icmp error packet can be sent.
   2768  * Assigns an appropriate source address to the packet. If ipha_dst is
   2769  * one of our addresses use it for source. Otherwise let ip_output_simple
   2770  * pick the source address.
   2771  */
   2772 static void
   2773 icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
   2774 {
   2775 	ipaddr_t dst;
   2776 	icmph_t	*icmph;
   2777 	ipha_t	*ipha;
   2778 	uint_t	len_needed;
   2779 	size_t	msg_len;
   2780 	mblk_t	*mp1;
   2781 	ipaddr_t src;
   2782 	ire_t	*ire;
   2783 	ip_xmit_attr_t ixas;
   2784 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   2785 
   2786 	ipha = (ipha_t *)mp->b_rptr;
   2787 
   2788 	bzero(&ixas, sizeof (ixas));
   2789 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   2790 	ixas.ixa_zoneid = ira->ira_zoneid;
   2791 	ixas.ixa_ifindex = 0;
   2792 	ixas.ixa_ipst = ipst;
   2793 	ixas.ixa_cred = kcred;
   2794 	ixas.ixa_cpid = NOPID;
   2795 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   2796 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   2797 
   2798 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   2799 		/*
   2800 		 * Apply IPsec based on how IPsec was applied to
   2801 		 * the packet that had the error.
   2802 		 *
   2803 		 * If it was an outbound packet that caused the ICMP
   2804 		 * error, then the caller will have setup the IRA
   2805 		 * appropriately.
   2806 		 */
   2807 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   2808 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   2809 			/* Note: mp already consumed and ip_drop_packet done */
   2810 			return;
   2811 		}
   2812 	} else {
   2813 		/*
   2814 		 * This is in clear. The icmp message we are building
   2815 		 * here should go out in clear, independent of our policy.
   2816 		 */
   2817 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   2818 	}
   2819 
   2820 	/* Remember our eventual destination */
   2821 	dst = ipha->ipha_src;
   2822 
   2823 	/*
   2824 	 * If the packet was for one of our unicast addresses, make
   2825 	 * sure we respond with that as the source. Otherwise
   2826 	 * have ip_output_simple pick the source address.
   2827 	 */
   2828 	ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
   2829 	    (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
   2830 	    MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   2831 	if (ire != NULL) {
   2832 		ire_refrele(ire);
   2833 		src = ipha->ipha_dst;
   2834 	} else {
   2835 		src = INADDR_ANY;
   2836 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   2837 	}
   2838 
   2839 	/*
   2840 	 * Check if we can send back more then 8 bytes in addition to
   2841 	 * the IP header.  We try to send 64 bytes of data and the internal
   2842 	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
   2843 	 */
   2844 	len_needed = IPH_HDR_LENGTH(ipha);
   2845 	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
   2846 	    ipha->ipha_protocol == IPPROTO_IPV6) {
   2847 		if (!pullupmsg(mp, -1)) {
   2848 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   2849 			ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
   2850 			freemsg(mp);
   2851 			return;
   2852 		}
   2853 		ipha = (ipha_t *)mp->b_rptr;
   2854 
   2855 		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2856 			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
   2857 			    len_needed));
   2858 		} else {
   2859 			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
   2860 
   2861 			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
   2862 			len_needed += ip_hdr_length_v6(mp, ip6h);
   2863 		}
   2864 	}
   2865 	len_needed += ipst->ips_ip_icmp_return;
   2866 	msg_len = msgdsize(mp);
   2867 	if (msg_len > len_needed) {
   2868 		(void) adjmsg(mp, len_needed - msg_len);
   2869 		msg_len = len_needed;
   2870 	}
   2871 	mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
   2872 	if (mp1 == NULL) {
   2873 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
   2874 		freemsg(mp);
   2875 		return;
   2876 	}
   2877 	mp1->b_cont = mp;
   2878 	mp = mp1;
   2879 
   2880 	/*
   2881 	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
   2882 	 * node generates be accepted in peace by all on-host destinations.
   2883 	 * If we do NOT assume that all on-host destinations trust
   2884 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
   2885 	 * (Look for IXAF_TRUSTED_ICMP).
   2886 	 */
   2887 	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
   2888 
   2889 	ipha = (ipha_t *)mp->b_rptr;
   2890 	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
   2891 	*ipha = icmp_ipha;
   2892 	ipha->ipha_src = src;
   2893 	ipha->ipha_dst = dst;
   2894 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   2895 	msg_len += sizeof (icmp_ipha) + len;
   2896 	if (msg_len > IP_MAXPACKET) {
   2897 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
   2898 		msg_len = IP_MAXPACKET;
   2899 	}
   2900 	ipha->ipha_length = htons((uint16_t)msg_len);
   2901 	icmph = (icmph_t *)&ipha[1];
   2902 	bcopy(stuff, icmph, len);
   2903 	icmph->icmph_checksum = 0;
   2904 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
   2905 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   2906 
   2907 	(void) ip_output_simple(mp, &ixas);
   2908 	ixa_cleanup(&ixas);
   2909 }
   2910 
   2911 /*
   2912  * Determine if an ICMP error packet can be sent given the rate limit.
   2913  * The limit consists of an average frequency (icmp_pkt_err_interval measured
   2914  * in milliseconds) and a burst size. Burst size number of packets can
   2915  * be sent arbitrarely closely spaced.
   2916  * The state is tracked using two variables to implement an approximate
   2917  * token bucket filter:
   2918  *	icmp_pkt_err_last - lbolt value when the last burst started
   2919  *	icmp_pkt_err_sent - number of packets sent in current burst
   2920  */
   2921 boolean_t
   2922 icmp_err_rate_limit(ip_stack_t *ipst)
   2923 {
   2924 	clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
   2925 	uint_t refilled; /* Number of packets refilled in tbf since last */
   2926 	/* Guard against changes by loading into local variable */
   2927 	uint_t err_interval = ipst->ips_ip_icmp_err_interval;
   2928 
   2929 	if (err_interval == 0)
   2930 		return (B_FALSE);
   2931 
   2932 	if (ipst->ips_icmp_pkt_err_last > now) {
   2933 		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
   2934 		ipst->ips_icmp_pkt_err_last = 0;
   2935 		ipst->ips_icmp_pkt_err_sent = 0;
   2936 	}
   2937 	/*
   2938 	 * If we are in a burst update the token bucket filter.
   2939 	 * Update the "last" time to be close to "now" but make sure
   2940 	 * we don't loose precision.
   2941 	 */
   2942 	if (ipst->ips_icmp_pkt_err_sent != 0) {
   2943 		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
   2944 		if (refilled > ipst->ips_icmp_pkt_err_sent) {
   2945 			ipst->ips_icmp_pkt_err_sent = 0;
   2946 		} else {
   2947 			ipst->ips_icmp_pkt_err_sent -= refilled;
   2948 			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
   2949 		}
   2950 	}
   2951 	if (ipst->ips_icmp_pkt_err_sent == 0) {
   2952 		/* Start of new burst */
   2953 		ipst->ips_icmp_pkt_err_last = now;
   2954 	}
   2955 	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
   2956 		ipst->ips_icmp_pkt_err_sent++;
   2957 		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
   2958 		    ipst->ips_icmp_pkt_err_sent));
   2959 		return (B_FALSE);
   2960 	}
   2961 	ip1dbg(("icmp_err_rate_limit: dropped\n"));
   2962 	return (B_TRUE);
   2963 }
   2964 
   2965 /*
   2966  * Check if it is ok to send an IPv4 ICMP error packet in
   2967  * response to the IPv4 packet in mp.
   2968  * Free the message and return null if no
   2969  * ICMP error packet should be sent.
   2970  */
   2971 static mblk_t *
   2972 icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
   2973 {
   2974 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2975 	icmph_t	*icmph;
   2976 	ipha_t	*ipha;
   2977 	uint_t	len_needed;
   2978 
   2979 	if (!mp)
   2980 		return (NULL);
   2981 	ipha = (ipha_t *)mp->b_rptr;
   2982 	if (ip_csum_hdr(ipha)) {
   2983 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
   2984 		ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
   2985 		freemsg(mp);
   2986 		return (NULL);
   2987 	}
   2988 	if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
   2989 	    ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
   2990 	    CLASSD(ipha->ipha_dst) ||
   2991 	    CLASSD(ipha->ipha_src) ||
   2992 	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
   2993 		/* Note: only errors to the fragment with offset 0 */
   2994 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   2995 		freemsg(mp);
   2996 		return (NULL);
   2997 	}
   2998 	if (ipha->ipha_protocol == IPPROTO_ICMP) {
   2999 		/*
   3000 		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
   3001 		 * errors in response to any ICMP errors.
   3002 		 */
   3003 		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
   3004 		if (mp->b_wptr - mp->b_rptr < len_needed) {
   3005 			if (!pullupmsg(mp, len_needed)) {
   3006 				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
   3007 				freemsg(mp);
   3008 				return (NULL);
   3009 			}
   3010 			ipha = (ipha_t *)mp->b_rptr;
   3011 		}
   3012 		icmph = (icmph_t *)
   3013 		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
   3014 		switch (icmph->icmph_type) {
   3015 		case ICMP_DEST_UNREACHABLE:
   3016 		case ICMP_SOURCE_QUENCH:
   3017 		case ICMP_TIME_EXCEEDED:
   3018 		case ICMP_PARAM_PROBLEM:
   3019 		case ICMP_REDIRECT:
   3020 			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3021 			freemsg(mp);
   3022 			return (NULL);
   3023 		default:
   3024 			break;
   3025 		}
   3026 	}
   3027 	/*
   3028 	 * If this is a labeled system, then check to see if we're allowed to
   3029 	 * send a response to this particular sender.  If not, then just drop.
   3030 	 */
   3031 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
   3032 		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
   3033 		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
   3034 		freemsg(mp);
   3035 		return (NULL);
   3036 	}
   3037 	if (icmp_err_rate_limit(ipst)) {
   3038 		/*
   3039 		 * Only send ICMP error packets every so often.
   3040 		 * This should be done on a per port/source basis,
   3041 		 * but for now this will suffice.
   3042 		 */
   3043 		freemsg(mp);
   3044 		return (NULL);
   3045 	}
   3046 	return (mp);
   3047 }
   3048 
   3049 /*
   3050  * Called when a packet was sent out the same link that it arrived on.
   3051  * Check if it is ok to send a redirect and then send it.
   3052  */
   3053 void
   3054 ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
   3055     ip_recv_attr_t *ira)
   3056 {
   3057 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   3058 	ipaddr_t	src, nhop;
   3059 	mblk_t		*mp1;
   3060 	ire_t		*nhop_ire;
   3061 
   3062 	/*
   3063 	 * Check the source address to see if it originated
   3064 	 * on the same logical subnet it is going back out on.
   3065 	 * If so, we should be able to send it a redirect.
   3066 	 * Avoid sending a redirect if the destination
   3067 	 * is directly connected (i.e., we matched an IRE_ONLINK),
   3068 	 * or if the packet was source routed out this interface.
   3069 	 *
   3070 	 * We avoid sending a redirect if the
   3071 	 * destination is directly connected
   3072 	 * because it is possible that multiple
   3073 	 * IP subnets may have been configured on
   3074 	 * the link, and the source may not
   3075 	 * be on the same subnet as ip destination,
   3076 	 * even though they are on the same
   3077 	 * physical link.
   3078 	 */
   3079 	if ((ire->ire_type & IRE_ONLINK) ||
   3080 	    ip_source_routed(ipha, ipst))
   3081 		return;
   3082 
   3083 	nhop_ire = ire_nexthop(ire);
   3084 	if (nhop_ire == NULL)
   3085 		return;
   3086 
   3087 	nhop = nhop_ire->ire_addr;
   3088 
   3089 	if (nhop_ire->ire_type & IRE_IF_CLONE) {
   3090 		ire_t	*ire2;
   3091 
   3092 		/* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
   3093 		mutex_enter(&nhop_ire->ire_lock);
   3094 		ire2 = nhop_ire->ire_dep_parent;
   3095 		if (ire2 != NULL)
   3096 			ire_refhold(ire2);
   3097 		mutex_exit(&nhop_ire->ire_lock);
   3098 		ire_refrele(nhop_ire);
   3099 		nhop_ire = ire2;
   3100 	}
   3101 	if (nhop_ire == NULL)
   3102 		return;
   3103 
   3104 	ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
   3105 
   3106 	src = ipha->ipha_src;
   3107 
   3108 	/*
   3109 	 * We look at the interface ire for the nexthop,
   3110 	 * to see if ipha_src is in the same subnet
   3111 	 * as the nexthop.
   3112 	 */
   3113 	if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
   3114 		/*
   3115 		 * The source is directly connected.
   3116 		 */
   3117 		mp1 = copymsg(mp);
   3118 		if (mp1 != NULL) {
   3119 			icmp_send_redirect(mp1, nhop, ira);
   3120 		}
   3121 	}
   3122 	ire_refrele(nhop_ire);
   3123 }
   3124 
   3125 /*
   3126  * Generate an ICMP redirect message.
   3127  */
   3128 static void
   3129 icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
   3130 {
   3131 	icmph_t	icmph;
   3132 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3133 
   3134 	mp = icmp_pkt_err_ok(mp, ira);
   3135 	if (mp == NULL)
   3136 		return;
   3137 
   3138 	bzero(&icmph, sizeof (icmph_t));
   3139 	icmph.icmph_type = ICMP_REDIRECT;
   3140 	icmph.icmph_code = 1;
   3141 	icmph.icmph_rd_gateway = gateway;
   3142 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
   3143 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3144 }
   3145 
   3146 /*
   3147  * Generate an ICMP time exceeded message.
   3148  */
   3149 void
   3150 icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3151 {
   3152 	icmph_t	icmph;
   3153 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3154 
   3155 	mp = icmp_pkt_err_ok(mp, ira);
   3156 	if (mp == NULL)
   3157 		return;
   3158 
   3159 	bzero(&icmph, sizeof (icmph_t));
   3160 	icmph.icmph_type = ICMP_TIME_EXCEEDED;
   3161 	icmph.icmph_code = code;
   3162 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
   3163 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3164 }
   3165 
   3166 /*
   3167  * Generate an ICMP unreachable message.
   3168  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   3169  * constructed by the caller.
   3170  */
   3171 void
   3172 icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
   3173 {
   3174 	icmph_t	icmph;
   3175 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   3176 
   3177 	mp = icmp_pkt_err_ok(mp, ira);
   3178 	if (mp == NULL)
   3179 		return;
   3180 
   3181 	bzero(&icmph, sizeof (icmph_t));
   3182 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
   3183 	icmph.icmph_code = code;
   3184 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
   3185 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   3186 }
   3187 
   3188 /*
   3189  * Latch in the IPsec state for a stream based the policy in the listener
   3190  * and the actions in the ip_recv_attr_t.
   3191  * Called directly from TCP and SCTP.
   3192  */
   3193 boolean_t
   3194 ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
   3195 {
   3196 	ASSERT(lconnp->conn_policy != NULL);
   3197 	ASSERT(connp->conn_policy == NULL);
   3198 
   3199 	IPPH_REFHOLD(lconnp->conn_policy);
   3200 	connp->conn_policy = lconnp->conn_policy;
   3201 
   3202 	if (ira->ira_ipsec_action != NULL) {
   3203 		if (connp->conn_latch == NULL) {
   3204 			connp->conn_latch = iplatch_create();
   3205 			if (connp->conn_latch == NULL)
   3206 				return (B_FALSE);
   3207 		}
   3208 		ipsec_latch_inbound(connp, ira);
   3209 	}
   3210 	return (B_TRUE);
   3211 }
   3212 
   3213 /*
   3214  * Verify whether or not the IP address is a valid local address.
   3215  * Could be a unicast, including one for a down interface.
   3216  * If allow_mcbc then a multicast or broadcast address is also
   3217  * acceptable.
   3218  *
   3219  * In the case of a broadcast/multicast address, however, the
   3220  * upper protocol is expected to reset the src address
   3221  * to zero when we return IPVL_MCAST/IPVL_BCAST so that
   3222  * no packets are emitted with broadcast/multicast address as
   3223  * source address (that violates hosts requirements RFC 1122)
   3224  * The addresses valid for bind are:
   3225  *	(1) - INADDR_ANY (0)
   3226  *	(2) - IP address of an UP interface
   3227  *	(3) - IP address of a DOWN interface
   3228  *	(4) - valid local IP broadcast addresses. In this case
   3229  *	the conn will only receive packets destined to
   3230  *	the specified broadcast address.
   3231  *	(5) - a multicast address. In this case
   3232  *	the conn will only receive packets destined to
   3233  *	the specified multicast address. Note: the
   3234  *	application still has to issue an
   3235  *	IP_ADD_MEMBERSHIP socket option.
   3236  *
   3237  * In all the above cases, the bound address must be valid in the current zone.
   3238  * When the address is loopback, multicast or broadcast, there might be many
   3239  * matching IREs so bind has to look up based on the zone.
   3240  */
   3241 ip_laddr_t
   3242 ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
   3243     ip_stack_t *ipst, boolean_t allow_mcbc)
   3244 {
   3245 	ire_t *src_ire;
   3246 
   3247 	ASSERT(src_addr != INADDR_ANY);
   3248 
   3249 	src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
   3250 	    NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   3251 
   3252 	/*
   3253 	 * If an address other than in6addr_any is requested,
   3254 	 * we verify that it is a valid address for bind
   3255 	 * Note: Following code is in if-else-if form for
   3256 	 * readability compared to a condition check.
   3257 	 */
   3258 	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
   3259 		/*
   3260 		 * (2) Bind to address of local UP interface
   3261 		 */
   3262 		ire_refrele(src_ire);
   3263 		return (IPVL_UNICAST_UP);
   3264 	} else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
   3265 		/*
   3266 		 * (4) Bind to broadcast address
   3267 		 */
   3268 		ire_refrele(src_ire);
   3269 		if (allow_mcbc)
   3270 			return (IPVL_BCAST);
   3271 		else
   3272 			return (IPVL_BAD);
   3273 	} else if (CLASSD(src_addr)) {
   3274 		/* (5) bind to multicast address. */
   3275 		if (src_ire != NULL)
   3276 			ire_refrele(src_ire);
   3277 
   3278 		if (allow_mcbc)
   3279 			return (IPVL_MCAST);
   3280 		else
   3281 			return (IPVL_BAD);
   3282 	} else {
   3283 		ipif_t *ipif;
   3284 
   3285 		/*
   3286 		 * (3) Bind to address of local DOWN interface?
   3287 		 * (ipif_lookup_addr() looks up all interfaces
   3288 		 * but we do not get here for UP interfaces
   3289 		 * - case (2) above)
   3290 		 */
   3291 		if (src_ire != NULL)
   3292 			ire_refrele(src_ire);
   3293 
   3294 		ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
   3295 		if (ipif == NULL)
   3296 			return (IPVL_BAD);
   3297 
   3298 		/* Not a useful source? */
   3299 		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
   3300 			ipif_refrele(ipif);
   3301 			return (IPVL_BAD);
   3302 		}
   3303 		ipif_refrele(ipif);
   3304 		return (IPVL_UNICAST_DOWN);
   3305 	}
   3306 }
   3307 
   3308 /*
   3309  * Insert in the bind fanout for IPv4 and IPv6.
   3310  * The caller should already have used ip_laddr_verify_v*() before calling
   3311  * this.
   3312  */
   3313 int
   3314 ip_laddr_fanout_insert(conn_t *connp)
   3315 {
   3316 	int		error;
   3317 
   3318 	/*
   3319 	 * Allow setting new policies. For example, disconnects result
   3320 	 * in us being called. As we would have set conn_policy_cached
   3321 	 * to B_TRUE before, we should set it to B_FALSE, so that policy
   3322 	 * can change after the disconnect.
   3323 	 */
   3324 	connp->conn_policy_cached = B_FALSE;
   3325 
   3326 	error = ipcl_bind_insert(connp);
   3327 	if (error != 0) {
   3328 		if (connp->conn_anon_port) {
   3329 			(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   3330 			    connp->conn_mlp_type, connp->conn_proto,
   3331 			    ntohs(connp->conn_lport), B_FALSE);
   3332 		}
   3333 		connp->conn_mlp_type = mlptSingle;
   3334 	}
   3335 	return (error);
   3336 }
   3337 
   3338 /*
   3339  * Verify that both the source and destination addresses are valid. If
   3340  * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
   3341  * i.e. have no route to it.  Protocols like TCP want to verify destination
   3342  * reachability, while tunnels do not.
   3343  *
   3344  * Determine the route, the interface, and (optionally) the source address
   3345  * to use to reach a given destination.
   3346  * Note that we allow connect to broadcast and multicast addresses when
   3347  * IPDF_ALLOW_MCBC is set.
   3348  * first_hop and dst_addr are normally the same, but if source routing
   3349  * they will differ; in that case the first_hop is what we'll use for the
   3350  * routing lookup but the dce and label checks will be done on dst_addr.
   3351  *
   3352  * If uinfo is set, then we fill in the best available information
   3353  * we have for the destination. This is based on (in priority order) any
   3354  * metrics and path MTU stored in a dce_t, route metrics, and finally the
   3355  * ill_mtu/ill_mc_mtu.
   3356  *
   3357  * Tsol note: If we have a source route then dst_addr != firsthop. But we
   3358  * always do the label check on dst_addr.
         *
         * Arguments:
         *	src_addrp	in/out source address. Read for the route lookup
         *			and the loopback checks; overwritten with the
         *			selected source only when IPDF_SELECT_SRC is set.
         *	dst_addr	final destination; used for the label check, the
         *			dce lookup and source selection.
         *	firsthop	address used for the route and nce lookups.
         *	ixa		transmit attributes to fill in: ixa_ire, ixa_dce,
         *			ixa_nce, their generation numbers, ixa_postfragfn,
         *			ixa_fragsize/ixa_pmtu and several ixa_flags bits.
         *	uinfo		optional (may be NULL) metrics to return.
         *	flags		IPDF_* flags (IPDF_VERIFY_DST, IPDF_ALLOW_MCBC,
         *			IPDF_SELECT_SRC, IPDF_UNIQUE_DCE,
         *			IPDF_ZONE_IS_GLOBAL).
         *	mac_mode	passed through to tsol_check_dest().
         *
         * Returns 0 or an errno value. For an unreachable destination
         * (ENETUNREACH/EHOSTUNREACH) the ixa is still completely set up before
         * the error is returned; the bad_addr exits skip the remaining setup.
   3359  */
   3360 int
   3361 ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
   3362     ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
   3363 {
   3364 	ire_t		*ire = NULL;
   3365 	int		error = 0;
   3366 	ipaddr_t	setsrc;				/* RTF_SETSRC */
   3367 	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
   3368 	ip_stack_t	*ipst = ixa->ixa_ipst;
   3369 	dce_t		*dce;
   3370 	uint_t		pmtu;
   3371 	uint_t		generation;
   3372 	nce_t		*nce;
   3373 	ill_t		*ill = NULL;
   3374 	boolean_t	multirt = B_FALSE;
   3375 
   3376 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
   3377 
   3378 	/*
   3379 	 * We never send to zero; the ULPs map it to the loopback address.
   3380 	 * We can't allow it since we use zero to mean uninitialized in some
   3381 	 * places.
   3382 	 */
   3383 	ASSERT(dst_addr != INADDR_ANY);
   3384 
   3385 	if (is_system_labeled()) {
   3386 		ts_label_t *tsl = NULL;
   3387 
   3388 		error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
   3389 		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
   3390 		if (error != 0)
   3391 			return (error);
   3392 		if (tsl != NULL) {
   3393 			/* Update the label */
   3394 			ip_xmit_attr_replace_tsl(ixa, tsl);
   3395 		}
   3396 	}
   3397 
   3398 	setsrc = INADDR_ANY;
   3399 	/*
   3400 	 * Select a route; For IPMP interfaces, we would only select
   3401 	 * a "hidden" route (i.e., going through a specific under_ill)
   3402 	 * if ixa_ifindex has been specified.
   3403 	 */
   3404 	ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
   3405 	    &generation, &setsrc, &error, &multirt);
   3406 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
   3407 	if (error != 0)
   3408 		goto bad_addr;
   3409 
   3410 	/*
   3411 	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
   3412 	 * If IPDF_VERIFY_DST is set, the destination must be reachable;
   3413 	 * Otherwise the destination needn't be reachable.
   3414 	 *
   3415 	 * If we match on a reject or black hole, then we've got a
   3416 	 * local failure.  May as well fail out the connect() attempt,
   3417 	 * since it's never going to succeed.
   3418 	 */
   3419 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
   3420 		/*
   3421 		 * If we're verifying destination reachability, we always want
   3422 		 * to complain here.
   3423 		 *
   3424 		 * If we're not verifying destination reachability but the
   3425 		 * destination has a route, we still want to fail on the
   3426 		 * temporary address and broadcast address tests.
   3427 		 *
   3428 		 * In both cases we let the code continue so some reasonable
   3429 		 * information is returned to the caller. That enables the
   3430 		 * caller to use (and even cache) the IRE. conn_ip_output will
   3431 		 * use the generation mismatch path to check for the unreachable
   3432 		 * case thereby avoiding any specific check in the main path.
   3433 		 */
   3434 		ASSERT(generation == IRE_GENERATION_VERIFY);
   3435 		if (flags & IPDF_VERIFY_DST) {
   3436 			/*
   3437 			 * Set errno but continue to set up ixa_ire to be
   3438 			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
   3439 			 * That allows callers to use ip_output to get an
   3440 			 * ICMP error back.
   3441 			 */
   3442 			if (!(ire->ire_type & IRE_HOST))
   3443 				error = ENETUNREACH;
   3444 			else
   3445 				error = EHOSTUNREACH;
   3446 		}
   3447 	}
   3448 
        	/*
        	 * Broadcast/multicast not permitted by the caller: substitute the
        	 * reject ire and report ENETUNREACH, but keep going so that the
        	 * ixa is still set up.
        	 */
   3449 	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
   3450 	    !(flags & IPDF_ALLOW_MCBC)) {
   3451 		ire_refrele(ire);
   3452 		ire = ire_reject(ipst, B_FALSE);
   3453 		generation = IRE_GENERATION_VERIFY;
   3454 		error = ENETUNREACH;
   3455 	}
   3456 
   3457 	/* Cache things */
   3458 	if (ixa->ixa_ire != NULL)
   3459 		ire_refrele_notr(ixa->ixa_ire);
        	/*
        	 * NOTE(review): presumably this swaps our traced hold for an
        	 * untraced ("notr") one to match the ire_refrele_notr() used when
        	 * ixa_ire is replaced; only DEBUG kernels distinguish the two.
        	 */
   3460 #ifdef DEBUG
   3461 	ire_refhold_notr(ire);
   3462 	ire_refrele(ire);
   3463 #endif
   3464 	ixa->ixa_ire = ire;
   3465 	ixa->ixa_ire_generation = generation;
   3466 
   3467 	/*
   3468 	 * Ensure that ixa_dce is always set any time that ixa_ire is set,
   3469 	 * since some callers will send a packet to conn_ip_output() even if
   3470 	 * there's an error.
   3471 	 */
   3472 	if (flags & IPDF_UNIQUE_DCE) {
   3473 		/* Fallback to the default dce if allocation fails */
   3474 		dce = dce_lookup_and_add_v4(dst_addr, ipst);
   3475 		if (dce != NULL)
   3476 			generation = dce->dce_generation;
   3477 		else
   3478 			dce = dce_lookup_v4(dst_addr, ipst, &generation);
   3479 	} else {
   3480 		dce = dce_lookup_v4(dst_addr, ipst, &generation);
   3481 	}
   3482 	ASSERT(dce != NULL);
   3483 	if (ixa->ixa_dce != NULL)
   3484 		dce_refrele_notr(ixa->ixa_dce);
   3485 #ifdef DEBUG
   3486 	dce_refhold_notr(dce);
   3487 	dce_refrele(dce);
   3488 #endif
   3489 	ixa->ixa_dce = dce;
   3490 	ixa->ixa_dce_generation = generation;
   3491 
   3492 	/*
   3493 	 * For multicast with multirt we have a flag passed back from
   3494 	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
   3495 	 * possible multicast address.
   3496 	 * We also need a flag for multicast since we can't check
   3497 	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
   3498 	 */
   3499 	if (multirt) {
   3500 		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
   3501 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
   3502 	} else {
   3503 		ixa->ixa_postfragfn = ire->ire_postfragfn;
   3504 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
   3505 	}
   3506 	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   3507 		/* Get an nce to cache. */
   3508 		nce = ire_to_nce(ire, firsthop, NULL);
   3509 		if (nce == NULL) {
   3510 			/* Allocation failure? */
   3511 			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
   3512 		} else {
   3513 			if (ixa->ixa_nce != NULL)
   3514 				nce_refrele(ixa->ixa_nce);
   3515 			ixa->ixa_nce = nce;
   3516 		}
   3517 	}
   3518 
   3519 	/*
   3520 	 * If the source address is a loopback address, the
   3521 	 * destination had best be local or multicast.
   3522 	 * If we are sending to an IRE_LOCAL using a loopback source then
   3523 	 * it had better be the same zoneid.
        	 *
        	 * From here on our ire reference belongs to ixa_ire, so the local
        	 * pointer is cleared before any jump to bad_addr to keep that
        	 * path from releasing it.
   3524 	 */
   3525 	if (*src_addrp == htonl(INADDR_LOOPBACK)) {
   3526 		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
   3527 			ire = NULL;	/* Stored in ixa_ire */
   3528 			error = EADDRNOTAVAIL;
   3529 			goto bad_addr;
   3530 		}
   3531 		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
   3532 			ire = NULL;	/* Stored in ixa_ire */
   3533 			error = EADDRNOTAVAIL;
   3534 			goto bad_addr;
   3535 		}
   3536 	}
   3537 	if (ire->ire_type & IRE_BROADCAST) {
   3538 		/*
   3539 		 * If the ULP didn't have a specified source, then we
   3540 		 * make sure we reselect the source when sending
   3541 		 * broadcasts out different interfaces.
   3542 		 */
   3543 		if (flags & IPDF_SELECT_SRC)
   3544 			ixa->ixa_flags |= IXAF_SET_SOURCE;
   3545 		else
   3546 			ixa->ixa_flags &= ~IXAF_SET_SOURCE;
   3547 	}
   3548 
   3549 	/*
   3550 	 * Does the caller want us to pick a source address?
   3551 	 */
   3552 	if (flags & IPDF_SELECT_SRC) {
   3553 		ipaddr_t	src_addr;
   3554 
   3555 		/*
   3556 		 * We use ire_nexthop_ill to avoid the under ipmp
   3557 		 * interface for source address selection. Note that for ipmp
   3558 		 * probe packets, ixa_ifindex would have been specified, and
   3559 		 * the ip_select_route() invocation would have picked an ire
   3560 		 * with ire_ill pointing at an under interface.
   3561 		 */
   3562 		ill = ire_nexthop_ill(ire);
   3563 
   3564 		/* If unreachable we have no ill but need some source */
   3565 		if (ill == NULL) {
   3566 			src_addr = htonl(INADDR_LOOPBACK);
   3567 			/* Make sure we look for a better source address */
   3568 			generation = SRC_GENERATION_VERIFY;
   3569 		} else {
   3570 			error = ip_select_source_v4(ill, setsrc, dst_addr,
   3571 			    ixa->ixa_multicast_ifaddr, zoneid,
   3572 			    ipst, &src_addr, &generation, NULL);
   3573 			if (error != 0) {
   3574 				ire = NULL;	/* Stored in ixa_ire */
   3575 				goto bad_addr;
   3576 			}
   3577 		}
   3578 
   3579 		/*
   3580 		 * We allow the source address to be down.
   3581 		 * However, we check that we don't use the loopback address
   3582 		 * as a source when sending out on the wire.
   3583 		 */
   3584 		if ((src_addr == htonl(INADDR_LOOPBACK)) &&
   3585 		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
   3586 		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   3587 			ire = NULL;	/* Stored in ixa_ire */
   3588 			error = EADDRNOTAVAIL;
   3589 			goto bad_addr;
   3590 		}
   3591 
   3592 		*src_addrp = src_addr;
   3593 		ixa->ixa_src_generation = generation;
   3594 	}
   3595 
   3596 	/*
   3597 	 * Make sure we don't leave an unreachable ixa_nce in place
   3598 	 * since ip_select_route is used when we unplumb i.e., remove
   3599 	 * references on ixa_ire, ixa_nce, and ixa_dce.
   3600 	 */
   3601 	nce = ixa->ixa_nce;
   3602 	if (nce != NULL && nce->nce_is_condemned) {
   3603 		nce_refrele(nce);
   3604 		ixa->ixa_nce = NULL;
   3605 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
   3606 	}
   3607 
   3608 	/*
   3609 	 * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
   3610 	 * However, we can't do it for IPv4 multicast or broadcast.
   3611 	 */
   3612 	if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
   3613 		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
   3614 
   3615 	/*
   3616 	 * Set initial value for fragmentation limit. Either conn_ip_output
   3617 	 * or ULP might update it when there are routing changes.
   3618 	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
   3619 	 */
   3620 	pmtu = ip_get_pmtu(ixa);
   3621 	ixa->ixa_fragsize = pmtu;
   3622 	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
   3623 	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
   3624 		ixa->ixa_pmtu = pmtu;
   3625 
   3626 	/*
   3627 	 * Extract information useful for some transports.
   3628 	 * First we look for DCE metrics. Then we take what we have in
   3629 	 * the metrics in the route, where the offlink is used if we have
   3630 	 * one.
   3631 	 */
   3632 	if (uinfo != NULL) {
   3633 		bzero(uinfo, sizeof (*uinfo));
   3634 
   3635 		if (dce->dce_flags & DCEF_UINFO)
   3636 			*uinfo = dce->dce_uinfo;
   3637 
   3638 		rts_merge_metrics(uinfo, &ire->ire_metrics);
   3639 
   3640 		/* Allow ire_metrics to decrease the path MTU from above */
   3641 		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
   3642 			uinfo->iulp_mtu = pmtu;
   3643 
   3644 		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
   3645 		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
   3646 		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
   3647 	}
   3648 
   3649 	if (ill != NULL)
   3650 		ill_refrele(ill);
   3651 
   3652 	return (error);
   3653 
        	/*
        	 * Early exit. ire is non-NULL here only when the reference has not
        	 * been handed to ixa_ire (i.e. route selection itself failed).
        	 */
   3654 bad_addr:
   3655 	if (ire != NULL)
   3656 		ire_refrele(ire);
   3657 
   3658 	if (ill != NULL)
   3659 		ill_refrele(ill);
   3660 
   3661 	/*
   3662 	 * Make sure we don't leave an unreachable ixa_nce in place
   3663 	 * since ip_select_route is used when we unplumb i.e., remove
   3664 	 * references on ixa_ire, ixa_nce, and ixa_dce.
   3665 	 */
   3666 	nce = ixa->ixa_nce;
   3667 	if (nce != NULL && nce->nce_is_condemned) {
   3668 		nce_refrele(nce);
   3669 		ixa->ixa_nce = NULL;
   3670 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
   3671 	}
   3672 
   3673 	return (error);
   3674 }
   3675 
   3676 
   3677 /*
   3678  * Get the base MTU for the case when path MTU discovery is not used.
   3679  * Takes the MTU of the IRE into account.
   3680  */
   3681 uint_t
   3682 ip_get_base_mtu(ill_t *ill, ire_t *ire)
   3683 {
   3684 	uint_t mtu;
   3685 	uint_t iremtu = ire->ire_metrics.iulp_mtu;
   3686 
   3687 	if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
   3688 		mtu = ill->ill_mc_mtu;
   3689 	else
   3690 		mtu = ill->ill_mtu;
   3691 
   3692 	if (iremtu != 0 && iremtu < mtu)
   3693 		mtu = iremtu;
   3694 
   3695 	return (mtu);
   3696 }
   3697 
   3698 /*
   3699  * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
   3700  * Assumes that ixa_ire, dce, and nce have already been set up.
   3701  *
   3702  * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
   3703  * We avoid path MTU discovery if it is disabled with ndd.
   3704  * Furtermore, if the path MTU is too small, then we don't set DF for IPv4.
   3705  *
   3706  * NOTE: We also used to turn it off for source routed packets. That
   3707  * is no longer required since the dce is per final destination.
   3708  */
   3709 uint_t
   3710 ip_get_pmtu(ip_xmit_attr_t *ixa)
   3711 {
   3712 	ip_stack_t	*ipst = ixa->ixa_ipst;
   3713 	dce_t		*dce;
   3714 	nce_t		*nce;
   3715 	ire_t		*ire;
   3716 	uint_t		pmtu;
   3717 
   3718 	ire = ixa->ixa_ire;
   3719 	dce = ixa->ixa_dce;
   3720 	nce = ixa->ixa_nce;
   3721 
   3722 	/*
   3723 	 * If path MTU discovery has been turned off by ndd, then we ignore
   3724 	 * any dce_pmtu and for IPv4 we will not set DF.
   3725 	 */
   3726 	if (!ipst->ips_ip_path_mtu_discovery)
   3727 		ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
   3728 
   3729 	pmtu = IP_MAXPACKET;
   3730 	/*
   3731 	 * Decide whether whether IPv4 sets DF
   3732 	 * For IPv6 "no DF" means to use the 1280 mtu
   3733 	 */
   3734 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3735 		ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3736 	} else {
   3737 		ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3738 		if (!(ixa->ixa_flags & IXAF_IS_IPV4))
   3739 			pmtu = IPV6_MIN_MTU;
   3740 	}
   3741 
   3742 	/* Check if the PMTU is to old before we use it */
   3743 	if ((dce->dce_flags & DCEF_PMTU) &&
   3744 	    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
   3745 	    ipst->ips_ip_pathmtu_interval) {
   3746 		/*
   3747 		 * Older than 20 minutes. Drop the path MTU information.
   3748 		 */
   3749 		mutex_enter(&dce->dce_lock);
   3750 		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
   3751 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   3752 		mutex_exit(&dce->dce_lock);
   3753 		dce_increment_generation(dce);
   3754 	}
   3755 
   3756 	/* The metrics on the route can lower the path MTU */
   3757 	if (ire->ire_metrics.iulp_mtu != 0 &&
   3758 	    ire->ire_metrics.iulp_mtu < pmtu)
   3759 		pmtu = ire->ire_metrics.iulp_mtu;
   3760 
   3761 	/*
   3762 	 * If the path MTU is smaller than some minimum, we still use dce_pmtu
   3763 	 * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
   3764 	 * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
   3765 	 */
   3766 	if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
   3767 		if (dce->dce_flags & DCEF_PMTU) {
   3768 			if (dce->dce_pmtu < pmtu)
   3769 				pmtu = dce->dce_pmtu;
   3770 
   3771 			if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
   3772 				ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
   3773 				ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
   3774 			} else {
   3775 				ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3776 				ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3777 			}
   3778 		} else {
   3779 			ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
   3780 			ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
   3781 		}
   3782 	}
   3783 
   3784 	/*
   3785 	 * If we have an IRE_LOCAL we use the loopback mtu instead of
   3786 	 * the ill for going out the wire i.e., IRE_LOCAL gets the same
   3787 	 * mtu as IRE_LOOPBACK.
   3788 	 */
   3789 	if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
   3790 		uint_t loopback_mtu;
   3791 
   3792 		loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
   3793 		    ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
   3794 
   3795 		if (loopback_mtu < pmtu)
   3796 			pmtu = loopback_mtu;
   3797 	} else if (nce != NULL) {
   3798 		/*
   3799 		 * Make sure we don't exceed the interface MTU.
   3800 		 * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
   3801 		 * an ill. We'd use the above IP_MAXPACKET in that case just
   3802 		 * to tell the transport something larger than zero.
   3803 		 */
   3804 		if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
   3805 			if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
   3806 				pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
   3807 			if (nce->nce_common->ncec_ill != nce->nce_ill &&
   3808 			    nce->nce_ill->ill_mc_mtu < pmtu) {
   3809 				/*
   3810 				 * for interfaces in an IPMP group, the mtu of
   3811 				 * the nce_ill (under_ill) could be different
   3812 				 * from the mtu of the ncec_ill, so we take the
   3813 				 * min of the two.
   3814 				 */
   3815 				pmtu = nce->nce_ill->ill_mc_mtu;
   3816 			}
   3817 		} else {
   3818 			if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
   3819 				pmtu = nce->nce_common->ncec_ill->ill_mtu;
   3820 			if (nce->nce_common->ncec_ill != nce->nce_ill &&
   3821 			    nce->nce_ill->ill_mtu < pmtu) {
   3822 				/*
   3823 				 * for interfaces in an IPMP group, the mtu of
   3824 				 * the nce_ill (under_ill) could be different
   3825 				 * from the mtu of the ncec_ill, so we take the
   3826 				 * min of the two.
   3827 				 */
   3828 				pmtu = nce->nce_ill->ill_mtu;
   3829 			}
   3830 		}
   3831 	}
   3832 
   3833 	/*
   3834 	 * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
   3835 	 * Only applies to IPv6.
   3836 	 */
   3837 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   3838 		if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
   3839 			switch (ixa->ixa_use_min_mtu) {
   3840 			case IPV6_USE_MIN_MTU_MULTICAST:
   3841 				if (ire->ire_type & IRE_MULTICAST)
   3842 					pmtu = IPV6_MIN_MTU;
   3843 				break;
   3844 			case IPV6_USE_MIN_MTU_ALWAYS:
   3845 				pmtu = IPV6_MIN_MTU;
   3846 				break;
   3847 			case IPV6_USE_MIN_MTU_NEVER:
   3848 				break;
   3849 			}
   3850 		} else {
   3851 			/* Default is IPV6_USE_MIN_MTU_MULTICAST */
   3852 			if (ire->ire_type & IRE_MULTICAST)
   3853 				pmtu = IPV6_MIN_MTU;
   3854 		}
   3855 	}
   3856 
   3857 	/*
   3858 	 * After receiving an ICMPv6 "packet too big" message with a
   3859 	 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
   3860 	 * will insert a 8-byte fragment header in every packet. We compensate
   3861 	 * for those cases by returning a smaller path MTU to the ULP.
   3862 	 *
   3863 	 * In the case of CGTP then ip_output will add a fragment header.
   3864 	 * Make sure there is room for it by telling a smaller number
   3865 	 * to the transport.
   3866 	 *
   3867 	 * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here
   3868 	 * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu()
   3869 	 * which is the size of the packets it can send.
   3870 	 */
   3871 	if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
   3872 		if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
   3873 		    (ire->ire_flags & RTF_MULTIRT) ||
   3874 		    (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
   3875 			pmtu -= sizeof (ip6_frag_t);
   3876 			ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
   3877 		}
   3878 	}
   3879 
   3880 	return (pmtu);
   3881 }
   3882 
/*
 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
 * the final piece where we don't.  Return a pointer to the first mblk in the
 * result, and update the pointer to the next mblk to chew on.
 *
 * Arguments:
 *	mpp	in/out pointer to the head of the chain still to be carved.
 *	len	number of bytes wanted in the returned message.
 *
 * Outcomes, as implemented below:
 *  - len == 0, mpp == NULL or *mpp == NULL: NULL is returned and nothing is
 *    touched.
 *  - The first mblk holds more than len bytes: a dup of it, trimmed to len
 *    bytes, is returned.  *mpp is left alone; the original first mblk simply
 *    has its b_rptr advanced.  If dupb() fails here NULL is returned but the
 *    chain is NOT freed and *mpp is unchanged.
 *  - Otherwise whole mblks are handed over to the result; if the last one
 *    needed is only partially used it is dup'ed and split.  *mpp is set to
 *    the first mblk that still has unconsumed data (possibly NULL).
 *  - Any other failure (dupb() on a later mblk, pullupmsg(), or a chain that
 *    is shorter than len) frees everything and sets *mpp to NULL.
 */
mblk_t *
ip_carve_mp(mblk_t **mpp, ssize_t len)
{
	mblk_t	*mp0;	/* head of the chain; also head of the result */
	mblk_t	*mp1;	/* last mblk that belongs to the result */
	mblk_t	*mp2;	/* mblk currently being examined */

	if (!len || !mpp || !(mp0 = *mpp))
		return (NULL);
	/* If we aren't going to consume the first mblk, we need a dup. */
	if (mp0->b_wptr - mp0->b_rptr > len) {
		mp1 = dupb(mp0);
		if (mp1) {
			/* Partition the data between the two mblks. */
			mp1->b_wptr = mp1->b_rptr + len;
			mp0->b_rptr = mp1->b_wptr;
			/*
			 * After the adjustment the remainder (mp0) may start
			 * on an unaligned address; try to realign it with a
			 * full pullup.  If that fails, free both the
			 * remainder and the dup and let the upper layer
			 * recover.
			 */
			if (!OK_32PTR(mp0->b_rptr)) {
				if (!pullupmsg(mp0, -1)) {
					freemsg(mp0);
					freemsg(mp1);
					*mpp = NULL;
					return (NULL);
				}
			}
		}
		/* mp1 is NULL here if dupb() failed; the chain is untouched. */
		return (mp1);
	}
	/* Eat through as many mblks as we need to get len bytes. */
	len -= mp0->b_wptr - mp0->b_rptr;
	/*
	 * mp2 walks the continuation chain while mp1 trails one mblk behind
	 * it.  The walk stops at the end of the chain or as soon as len has
	 * been satisfied exactly.
	 */
	for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
		if (mp2->b_wptr - mp2->b_rptr > len) {
			/*
			 * We won't consume the entire last mblk.  Like
			 * above, dup and partition it.  Linking the dup in
			 * through mp1->b_cont also detaches mp2 (and what
			 * follows it) from the result chain.
			 */
			mp1->b_cont = dupb(mp2);
			mp1 = mp1->b_cont;
			if (!mp1) {
				/*
				 * Trouble.  Rather than go to a lot of
				 * trouble to clean up, we free the messages.
				 * This won't be any worse than losing it on
				 * the wire.
				 */
				freemsg(mp0);
				freemsg(mp2);
				*mpp = NULL;
				return (NULL);
			}
			mp1->b_wptr = mp1->b_rptr + len;
			mp2->b_rptr = mp1->b_wptr;
			/*
			 * After the adjustment the remainder (mp2) may start
			 * on an unaligned address; try to realign it.  If
			 * that fails, free all messages and let the upper
			 * layer recover.
			 */
			if (!OK_32PTR(mp2->b_rptr)) {
				if (!pullupmsg(mp2, -1)) {
					freemsg(mp0);
					freemsg(mp2);
					*mpp = NULL;
					return (NULL);
				}
			}
			*mpp = mp2;
			return (mp0);
		}
		/* Decrement len by the amount we just got. */
		len -= mp2->b_wptr - mp2->b_rptr;
	}
	/*
	 * len should be reduced to zero now.  If not, the chain held fewer
	 * bytes than the caller asked for.
	 */
	if (len) {
		/* Shouldn't happen! */
		freemsg(mp0);
		*mpp = NULL;
		return (NULL);
	}
	/*
	 * We consumed up to exactly the end of an mblk.  Detach the part
	 * we are returning from the rest of the chain.
	 */
	mp1->b_cont = NULL;
	*mpp = mp2;
	return (mp0);
}
   3983 
/*
 * The ill stream is being unplumbed. Called from ip_close.
 *
 * Tears the ill down in stages: become writer on the ipsq, mark the ill and
 * its ipifs condemned, flush deferred DLPI messages, stop reassembly, delete
 * the ill and wait for it to become quiescent, finish the delete, drain
 * flow-controlled conns, then release the locks and memory owned by the ill.
 * Always returns 0.
 */
int
ip_modclose(ill_t *ill)
{
	boolean_t success;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	/*
	 * q, ipst and ai are captured up front; they are still needed after
	 * the ill has been dismantled (ill_ipst is asserted to be NULL once
	 * ill_delete_tail has run).
	 */
	queue_t	*q = ill->ill_rq;
	ip_stack_t	*ipst = ill->ill_ipst;
	int	i;
	arl_ill_common_t *ai = ill->ill_common;

	/*
	 * The punlink prior to this may have initiated a capability
	 * negotiation. But ipsq_enter will block until that finishes or
	 * times out.
	 */
	success = ipsq_enter(ill, B_FALSE, NEW_OP);

	/*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill
	 * and caused ipsq_enter to return failure.
	 */
	ASSERT(success);
	ipsq = ill->ill_phyint->phyint_ipsq;

	/*
	 * Mark it condemned. No new reference will be made to this ill.
	 * Lookup functions will return an error. Threads that try to
	 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
	 * that the refcnt will drop down to zero.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CONDEMNED;
	for (ipif = ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {
		ipif->ipif_state_flags |= IPIF_CONDEMNED;
	}
	/*
	 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns an error if ILL_CONDEMNED is set.
	 */
	cv_broadcast(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);

	/*
	 * Send all the deferred DLPI messages downstream which came in
	 * during the small window right before ipsq_enter(). We do this
	 * without waiting for the ACKs because all the ACKs for M_PROTO
	 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
	 */
	ill_dlpi_send_deferred(ill);

	/*
	 * Shut down fragmentation reassembly.
	 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer.
	 */
	(void) untimeout(ill->ill_frag_timer_id);
	(void) ill_frag_timeout(ill, 0);

	/*
	 * Call ill_delete to bring down the ipifs, ilms and ill on
	 * this ill. Then wait for the refcnts to drop to zero.
	 * ill_is_freeable checks whether the ill is really quiescent.
	 * Then make sure that threads that are waiting to enter the
	 * ipsq have seen the error returned by ipsq_enter and have
	 * gone away. Then we call ill_delete_tail which does the
	 * DL_UNBIND_REQ with the driver and then qprocsoff.
	 */
	ill_delete(ill);
	mutex_enter(&ill->ill_lock);
	while (!ill_is_freeable(ill))
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	while (ill->ill_waiters)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);

	/*
	 * ill_delete_tail drops reference on ill_ipst, but we need to keep
	 * it held until the end of the function since the cleanup
	 * below needs to be able to use the ip_stack_t.
	 */
	netstack_hold(ipst->ips_netstack);

	/* qprocsoff is done via ill_delete_tail */
	ill_delete_tail(ill);
	/*
	 * Synchronously wait for the arp stream to unbind. After this, we
	 * cannot get any data packets up from the driver.
	 */
	arp_unbind_complete(ill);
	ASSERT(ill->ill_ipst == NULL);

	/*
	 * Walk through all conns and qenable those that have queued data.
	 * Close synchronization needs this to
	 * be done to ensure that all upper layers blocked
	 * due to flow control to the closing device
	 * get unblocked.
	 *
	 * NOTE(review): the debug message below still names ip_wsrv even
	 * though it is emitted from ip_modclose.
	 */
	ip1dbg(("ip_wsrv: walking\n"));
	for (i = 0; i < TX_FANOUT_SIZE; i++) {
		conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
	}

	/*
	 * ai can be null if this is an IPv6 ill, or if the IPv4
	 * stream is being torn down before ARP was plumbed (e.g.,
	 * /sbin/ifconfig plumbing a stream twice, and encountering
	 * an error).
	 *
	 * If the ARP side (ai_arl) is already gone we are the last user and
	 * free the shared structure; otherwise just signal the ARP side that
	 * the IP half is done.
	 * NOTE(review): in the free path ai_lock is destroyed while still
	 * held by this thread - confirm mutex_destroy() permits that.
	 */
	if (ai != NULL) {
		ASSERT(!ill->ill_isv6);
		mutex_enter(&ai->ai_lock);
		ai->ai_ill = NULL;
		if (ai->ai_arl == NULL) {
			mutex_destroy(&ai->ai_lock);
			kmem_free(ai, sizeof (*ai));
		} else {
			cv_signal(&ai->ai_ill_unplumb_done);
			mutex_exit(&ai->ai_lock);
		}
	}

	/* Unlink this instance from the per-stack list of open instances. */
	mutex_enter(&ipst->ips_ip_mi_lock);
	mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
	mutex_exit(&ipst->ips_ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (ill->ill_credp != NULL)
		crfree(ill->ill_credp);

	mutex_destroy(&ill->ill_saved_ire_lock);
	mutex_destroy(&ill->ill_lock);
	rw_destroy(&ill->ill_mcast_lock);
	mutex_destroy(&ill->ill_mcast_serializer);
	list_destroy(&ill->ill_nce);

	/*
	 * Now we are done with the module close pieces that
	 * need the netstack_t.
	 */
	netstack_rele(ipst->ips_netstack);

	/* Free the ill itself and clear the queue pointers that led to it. */
	mi_close_free((IDP)ill);
	q->q_ptr = WR(q)->q_ptr = NULL;

	/* ipsq was saved earlier, so leaving it does not touch the ill. */
	ipsq_exit(ipsq);

	return (0);
}
   4144 
/*
 * This is called as part of close() for IP, UDP, ICMP, and RTS
 * in order to quiesce the conn.  TCP conns must not be passed in (see the
 * ASSERT below).
 *
 * On return the conn is marked CONN_CLOSING, CONN_CONDEMNED and
 * CONN_QUIESCED, has been removed from the fanout and drain lists, and its
 * reference count has dropped to 1.
 */
void
ip_quiesce_conn(conn_t *connp)
{
	/*
	 * The *_reqd flags are sampled under conn_lock below and acted upon
	 * after the lock has been dropped.
	 */
	boolean_t	drain_cleanup_reqd = B_FALSE;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	boolean_t	ilg_cleanup_reqd = B_FALSE;
	ip_stack_t	*ipst;

	ASSERT(!IPCL_IS_TCP(connp));
	ipst = connp->conn_netstack->netstack_ip;

	/*
	 * Mark the conn as closing, and this conn must not be
	 * inserted in future into any list. Eg. conn_drain_insert(),
	 * won't insert this conn into the conn_drain_list.
	 *
	 * conn_idl, and conn_ilg cannot get set henceforth.
	 */
	mutex_enter(&connp->conn_lock);
	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
	connp->conn_state_flags |= CONN_CLOSING;
	if (connp->conn_idl != NULL)
		drain_cleanup_reqd = B_TRUE;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	if (connp->conn_dhcpinit_ill != NULL) {
		/*
		 * Drop this conn's contribution to the ill's dhcpinit count
		 * and let the ill re-select its input function.
		 */
		ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
		atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
		ill_set_inputfn(connp->conn_dhcpinit_ill);
		connp->conn_dhcpinit_ill = NULL;
	}
	if (connp->conn_ilg != NULL)
		ilg_cleanup_reqd = B_TRUE;
	mutex_exit(&connp->conn_lock);

	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	/*
	 * On a labeled system, undo the anonymous multilevel-port setup for
	 * the local port (last argument B_FALSE).
	 */
	if (is_system_labeled() && connp->conn_anon_port) {
		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
		    connp->conn_mlp_type, connp->conn_proto,
		    ntohs(connp->conn_lport), B_FALSE);
		connp->conn_anon_port = 0;
	}
	connp->conn_mlp_type = mlptSingle;

	/*
	 * Remove this conn from any fanout list it is on.
	 * and then wait for any threads currently operating
	 * on this endpoint to finish
	 */
	ipcl_hash_remove(connp);

	/*
	 * Remove this conn from the drain list, and do any other cleanup that
	 * may be required.  (TCP conns are never flow controlled, and
	 * conn_idl will be NULL.)  conn_idl is re-checked here because the
	 * flag was sampled before conn_lock was dropped.
	 */
	if (drain_cleanup_reqd && connp->conn_idl != NULL) {
		idl_t *idl = connp->conn_idl;

		mutex_enter(&idl->idl_lock);
		conn_drain(connp, B_TRUE);
		mutex_exit(&idl->idl_lock);
	}

	/* If this conn is the stack's multicast router, shut that down. */
	if (connp == ipst->ips_ip_g_mrouter)
		(void) ip_mrouter_done(ipst);

	if (ilg_cleanup_reqd)
		ilg_delete_all(connp);

	/*
	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
	 * callers from write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk
	 * which checks for the condemned flag.
	 *
	 * Wait until only one reference remains before declaring the conn
	 * quiesced.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	while (connp->conn_ref != 1)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	connp->conn_state_flags |= CONN_QUIESCED;
	mutex_exit(&connp->conn_lock);
}
   4234 
   4235 /* ARGSUSED */
   4236 int
   4237 ip_close(queue_t *q, int flags)
   4238 {
   4239 	conn_t		*connp;
   4240 
   4241 	/*
   4242 	 * Call the appropriate delete routine depending on whether this is
   4243 	 * a module or device.
   4244 	 */
   4245 	if (WR(q)->q_next != NULL) {
   4246 		/* This is a module close */
   4247 		return (ip_modclose((ill_t *)q->q_ptr));
   4248 	}
   4249 
   4250 	connp = q->q_ptr;
   4251 	ip_quiesce_conn(connp);
   4252 
   4253 	qprocsoff(q);
   4254 
   4255 	/*
   4256 	 * Now we are truly single threaded on this stream, and can
   4257 	 * delete the things hanging off the connp, and finally the connp.
   4258 	 * We removed this connp from the fanout list, it cannot be
   4259 	 * accessed thru the fanouts, and we already waited for the
   4260 	 * conn_ref to drop to 0. We are already in close, so
   4261 	 * there cannot be any other thread from the top. qprocsoff
   4262 	 * has completed, and service has completed or won't run in
   4263 	 * future.
   4264 	 */
   4265 	ASSERT(connp->conn_ref == 1);
   4266 
   4267 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   4268 
   4269 	connp->conn_ref--;
   4270 	ipcl_conn_destroy(connp);
   4271 
   4272 	q->q_ptr = WR(q)->q_ptr = NULL;
   4273 	return (0);
   4274 }
   4275 
   4276 /*
   4277  * Wapper around putnext() so that ip_rts_request can merely use
   4278  * conn_recv.
   4279  */
   4280 /*ARGSUSED2*/
   4281 static void
   4282 ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4283 {
   4284 	conn_t *connp = (conn_t *)arg1;
   4285 
   4286 	putnext(connp->conn_rq, mp);
   4287 }
   4288 
   4289 /* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
   4290 /* ARGSUSED */
   4291 static void
   4292 ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4293 {
   4294 	freemsg(mp);
   4295 }
   4296 
/*
 * Called when the module is about to be unloaded.
 *
 * Undoes the global (non per-stack) setup: the CPU setup callback is
 * unregistered first, then the transports and the global IP subsystems are
 * destroyed, the minor number arenas are released, and finally IP is
 * unregistered from the netstack framework.
 */
void
ip_ddi_destroy(void)
{
	/* This needs to be called before destroying any transports. */
	mutex_enter(&cpu_lock);
	unregister_cpu_setup_func(ip_tp_cpu_update, NULL);
	mutex_exit(&cpu_lock);

	tnet_fini();

	icmp_ddi_g_destroy();
	rts_ddi_g_destroy();
	udp_ddi_g_destroy();
	sctp_ddi_g_destroy();
	tcp_ddi_g_destroy();
	ilb_ddi_g_destroy();
	dce_g_destroy();
	ipsec_policy_g_destroy();
	ipcl_g_destroy();
	ip_net_g_destroy();
	ip_ire_g_fini();
	inet_minor_destroy(ip_minor_arena_sa);
	/* The second ("la") arena only exists on 64-bit kernels. */
#if defined(_LP64)
	inet_minor_destroy(ip_minor_arena_la);
#endif

	/* Per-thread reference tracking state exists only in DEBUG builds. */
#ifdef DEBUG
	list_destroy(&ip_thread_list);
	rw_destroy(&ip_thread_rwlock);
	tsd_destroy(&ip_thread_data);
#endif

	netstack_unregister(NS_IP);
}
   4334 
   4335 /*
   4336  * First step in cleanup.
   4337  */
   4338 /* ARGSUSED */
   4339 static void
   4340 ip_stack_shutdown(netstackid_t stackid, void *arg)
   4341 {
   4342 	ip_stack_t *ipst = (ip_stack_t *)arg;
   4343 
   4344 #ifdef NS_DEBUG
   4345 	printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
   4346 #endif
   4347 
   4348 	/*
   4349 	 * Perform cleanup for special interfaces (loopback and IPMP).
   4350 	 */
   4351 	ip_interface_cleanup(ipst);
   4352 
   4353 	/*
   4354 	 * The *_hook_shutdown()s start the process of notifying any
   4355 	 * consumers that things are going away.... nothing is destroyed.
   4356 	 */
   4357 	ipv4_hook_shutdown(ipst);
   4358 	ipv6_hook_shutdown(ipst);
   4359 	arp_hook_shutdown(ipst);
   4360 
   4361 	mutex_enter(&ipst->ips_capab_taskq_lock);
   4362 	ipst->ips_capab_taskq_quit = B_TRUE;
   4363 	cv_signal(&ipst->ips_capab_taskq_cv);
   4364 	mutex_exit(&ipst->ips_capab_taskq_lock);
   4365 }
   4366 
/*
 * Free the IP stack instance.
 *
 * "arg" is the ip_stack_t being destroyed and "stackid" identifies the
 * netstack whose kstats are removed.  Everything hanging off the
 * ip_stack_t is torn down and the structure itself is freed last, so the
 * pointer must not be used after this returns.
 */
static void
ip_stack_fini(netstackid_t stackid, void *arg)
{
	ip_stack_t *ipst = (ip_stack_t *)arg;
	int ret;

#ifdef NS_DEBUG
	printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
#endif
	/*
	 * At this point, all of the notifications that the events and
	 * protocols are going away have been run, meaning that we can
	 * now set about starting to clean things up.
	 */
	ipobs_fini(ipst);
	ipv4_hook_destroy(ipst);
	ipv6_hook_destroy(ipst);
	arp_hook_destroy(ipst);
	ip_net_destroy(ipst);

	ipmp_destroy(ipst);

	/*
	 * Remove the kstats.  The IPv4/IPv6 statistics blocks live inside
	 * the ip_stack_t, so they are zeroed rather than freed.
	 */
	ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
	ipst->ips_ip_mibkp = NULL;
	icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
	ipst->ips_icmp_mibkp = NULL;
	ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
	ipst->ips_ip_kstat = NULL;
	bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
	ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
	ipst->ips_ip6_kstat = NULL;
	bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));

	/* Free this stack's private copy of the property table. */
	kmem_free(ipst->ips_propinfo_tbl,
	    ip_propinfo_count * sizeof (mod_prop_info_t));
	ipst->ips_propinfo_tbl = NULL;

	dce_stack_destroy(ipst);
	ip_mrouter_stack_destroy(ipst);

	/*
	 * Cancel the four IGMP/MLD timers.  In each case a return of -1
	 * from untimeout() is expected to coincide with a saved id of 0;
	 * any other return is expected to coincide with a non-zero id,
	 * which is then cleared.
	 */
	ret = untimeout(ipst->ips_igmp_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_igmp_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
		ipst->ips_igmp_slowtimeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_timeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	ret = untimeout(ipst->ips_mld_slowtimeout_id);
	if (ret == -1) {
		ASSERT(ipst->ips_mld_slowtimeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_slowtimeout_id != 0);
		ipst->ips_mld_slowtimeout_id = 0;
	}

	ip_ire_fini(ipst);
	ip6_asp_free(ipst);
	conn_drain_fini(ipst);
	ipcl_destroy(ipst);

	/* Destroy both ndp locks before freeing the structures holding them. */
	mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
	mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
	kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
	ipst->ips_ndp4 = NULL;
	kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
	ipst->ips_ndp6 = NULL;

	if (ipst->ips_loopback_ksp != NULL) {
		kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
		ipst->ips_loopback_ksp = NULL;
	}

	/* Destroy the remaining locks and condition variables. */
	mutex_destroy(&ipst->ips_capab_taskq_lock);
	cv_destroy(&ipst->ips_capab_taskq_cv);

	rw_destroy(&ipst->ips_srcid_lock);

	mutex_destroy(&ipst->ips_ip_mi_lock);
	rw_destroy(&ipst->ips_ill_g_usesrc_lock);

	mutex_destroy(&ipst->ips_igmp_timer_lock);
	mutex_destroy(&ipst->ips_mld_timer_lock);
	mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
	mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
	mutex_destroy(&ipst->ips_ip_addr_avail_lock);
	rw_destroy(&ipst->ips_ill_g_lock);

	kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
	ipst->ips_phyint_g_list = NULL;
	kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
	ipst->ips_ill_g_heads = NULL;

	/* Last of all, drop the LDI identity and free the stack itself. */
	ldi_ident_release(ipst->ips_ldi_ident);
	kmem_free(ipst, sizeof (*ipst));
}
   4479 
   4480 /*
   4481  * This function is called from the TSD destructor, and is used to debug
   4482  * reference count issues in IP. See block comment in <inet/ip_if.h> for
   4483  * details.
   4484  */
   4485 static void
   4486 ip_thread_exit(void *phash)
   4487 {
   4488 	th_hash_t *thh = phash;
   4489 
   4490 	rw_enter(&ip_thread_rwlock, RW_WRITER);
   4491 	list_remove(&ip_thread_list, thh);
   4492 	rw_exit(&ip_thread_rwlock);
   4493 	mod_hash_destroy_hash(thh->thh_hash);
   4494 	kmem_free(thh, sizeof (*thh));
   4495 }
   4496 
/*
 * Called when the IP kernel module is loaded into the kernel.
 *
 * Performs the global (non per-stack) initialization: selects the squeue
 * entry mode, creates the minor number arena(s), initializes the global IP
 * subsystems and the transports, registers the per-stack callbacks with the
 * netstack framework, and finally registers the CPU setup callback.
 */
void
ip_ddi_init(void)
{
	ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);

	/*
	 * Minor numbers handed out by the arenas start at INET_MIN_DEV + 2.
	 * NOTE(review): the original rationale read "should start from 2
	 * since we have 4 initial devices: ip, ip6, tcp, tcp6" - confirm
	 * which minors are actually reserved.
	 */
	/*
	 * If this is a 64-bit kernel, then create two separate arenas:
	 * ip_minor_arena_sa covering INET_MIN_DEV + 2 through MAXMIN32 and
	 * ip_minor_arena_la covering MAXMIN32 + 1 through MAXMIN64.  The
	 * original comment described these as the range for TLIs
	 * (through 2^^18-1) and the range for socket apps (2^^18 through
	 * 2^^32-1) respectively.  A 32-bit kernel gets the single arena
	 * ip_minor_arena_sa, covering INET_MIN_DEV + 2 through MAXMIN.
	 * Failure to create an arena is fatal.
	 */
	ip_minor_arena_la = NULL;
	ip_minor_arena_sa = NULL;
#if defined(_LP64)
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
	if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
	    MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_la creation failed\n");
	}
#else
	if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
	    INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
		cmn_err(CE_PANIC,
		    "ip_ddi_init: ip_minor_arena_sa creation failed\n");
	}
#endif
	/* Convert the poll interval from milliseconds to clock ticks. */
	ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);

	ipcl_g_init();
	ip_ire_g_init();
	ip_net_g_init();

	/*
	 * DEBUG builds keep per-thread state; ip_thread_exit is installed as
	 * the TSD destructor.
	 */
#ifdef DEBUG
	tsd_create(&ip_thread_data, ip_thread_exit);
	rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
	list_create(&ip_thread_list, sizeof (th_hash_t),
	    offsetof(th_hash_t, thh_link));
#endif
	ipsec_policy_g_init();
	tcp_ddi_g_init();
	sctp_ddi_g_init();
	dce_g_init();

	/*
	 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of ip_stack_t's.
	 */
	netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
	    ip_stack_fini);

	tnet_init();

	udp_ddi_g_init();
	rts_ddi_g_init();
	icmp_ddi_g_init();
	ilb_ddi_g_init();

	/* This needs to be called after all transports are initialized. */
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(ip_tp_cpu_update, NULL);
	mutex_exit(&cpu_lock);
}
   4571 
   4572 /*
   4573  * Initialize the IP stack instance.
   4574  */
   4575 static void *
   4576 ip_stack_init(netstackid_t stackid, netstack_t *ns)
   4577 {
   4578 	ip_stack_t	*ipst;
   4579 	size_t		arrsz;
   4580 	major_t		major;
   4581 
   4582 #ifdef NS_DEBUG
   4583 	printf("ip_stack_init(stack %d)\n", stackid);
   4584 #endif
   4585 
   4586 	ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
   4587 	ipst->ips_netstack = ns;
   4588 
   4589 	ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
   4590 	    KM_SLEEP);
   4591 	ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
   4592 	    KM_SLEEP);
   4593 	ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4594 	ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
   4595 	mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4596 	mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
   4597 
   4598 	mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4599 	ipst->ips_igmp_deferred_next = INFINITY;
   4600 	mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
   4601 	ipst->ips_mld_deferred_next = INFINITY;
   4602 	mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4603 	mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
   4604 	mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
   4605 	mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
   4606 	rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
   4607 	rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
   4608 
   4609 	ipcl_init(ipst);
   4610 	ip_ire_init(ipst);
   4611 	ip6_asp_init(ipst);
   4612 	ipif_init(ipst);
   4613 	conn_drain_init(ipst);
   4614 	ip_mrouter_stack_init(ipst);
   4615 	dce_stack_init(ipst);
   4616 
   4617 	ipst->ips_ip_multirt_log_interval = 1000;
   4618 
   4619 	ipst->ips_ill_index = 1;
   4620 
   4621 	ipst->ips_saved_ip_forwarding = -1;
   4622 	ipst->ips_reg_vif_num = ALL_VIFS; 	/* Index to Register vif */
   4623 
   4624 	arrsz = ip_propinfo_count * sizeof (mod_prop_info_t);
   4625 	ipst->ips_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
   4626 	bcopy(ip_propinfo_tbl, ipst->ips_propinfo_tbl, arrsz);
   4627 
   4628 	ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
   4629 	ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
   4630 	ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
   4631 	ipst->ips_ip6_kstat =
   4632 	    ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
   4633 
   4634 	ipst->ips_ip_src_id = 1;
   4635 	rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
   4636 
   4637 	ipst->ips_src_generation = SRC_GENERATION_INITIAL;
   4638 
   4639 	ip_net_init(ipst, ns);
   4640 	ipv4_hook_init(ipst);
   4641 	ipv6_hook_init(ipst);
   4642 	arp_hook_init(ipst);
   4643 	ipmp_init(ipst);
   4644 	ipobs_init(ipst);
   4645 
   4646 	/*
   4647 	 * Create the taskq dispatcher thread and initialize related stuff.
   4648 	 */
   4649 	ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
   4650 	    ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
   4651 	mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
   4652 	cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
   4653 
   4654 	major = mod_name_to_major(INET_NAME);
   4655 	(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
   4656 	return (ipst);
   4657 }
   4658 
   4659 /*
   4660  * Allocate and initialize a DLPI template of the specified length.  (May be
   4661  * called as writer.)
   4662  */
   4663 mblk_t *
   4664 ip_dlpi_alloc(size_t len, t_uscalar_t prim)
   4665 {
   4666 	mblk_t	*mp;
   4667 
   4668 	mp = allocb(len, BPRI_MED);
   4669 	if (!mp)
   4670 		return (NULL);
   4671 
   4672 	/*
   4673 	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
   4674 	 * of which we don't seem to use) are sent with M_PCPROTO, and
   4675 	 * that other DLPI are M_PROTO.
   4676 	 */
   4677 	if (prim == DL_INFO_REQ) {
   4678 		mp->b_datap->db_type = M_PCPROTO;
   4679 	} else {
   4680 		mp->b_datap->db_type = M_PROTO;
   4681 	}
   4682 
   4683 	mp->b_wptr = mp->b_rptr + len;
   4684 	bzero(mp->b_rptr, len);
   4685 	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
   4686 	return (mp);
   4687 }
   4688 
   4689 /*
   4690  * Allocate and initialize a DLPI notification.  (May be called as writer.)
   4691  */
   4692 mblk_t *
   4693 ip_dlnotify_alloc(uint_t notification, uint_t data)
   4694 {
   4695 	dl_notify_ind_t	*notifyp;
   4696 	mblk_t		*mp;
   4697 
   4698 	if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
   4699 		return (NULL);
   4700 
   4701 	notifyp = (dl_notify_ind_t *)mp->b_rptr;
   4702 	notifyp->dl_notification = notification;
   4703 	notifyp->dl_data = data;
   4704 	return (mp);
   4705 }
   4706 
   4707 mblk_t *
   4708 ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
   4709 {
   4710 	dl_notify_ind_t	*notifyp;
   4711 	mblk_t		*mp;
   4712 
   4713 	if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
   4714 		return (NULL);
   4715 
   4716 	notifyp = (dl_notify_ind_t *)mp->b_rptr;
   4717 	notifyp->dl_notification = notification;
   4718 	notifyp->dl_data1 = data1;
   4719 	notifyp->dl_data2 = data2;
   4720 	return (mp);
   4721 }
   4722 
   4723 /*
   4724  * Debug formatting routine.  Returns a character string representation of the
   4725  * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
   4726  * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
   4727  *
   4728  * Once the ndd table-printing interfaces are removed, this can be changed to
   4729  * standard dotted-decimal form.
   4730  */
   4731 char *
   4732 ip_dot_addr(ipaddr_t addr, char *buf)
   4733 {
   4734 	uint8_t *ap = (uint8_t *)&addr;
   4735 
   4736 	(void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
   4737 	    ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
   4738 	return (buf);
   4739 }
   4740 
/*
 * Render the "alen"-byte MAC address at "addr" into "buf" (capacity
 * "buflen") as lowercase hex bytes separated by colons, and return buf.
 *
 * If the address is empty or the buffer holds fewer than 4 bytes, the
 * literal string "?" is returned and buf is not written.  If the buffer
 * runs out before the whole address has been printed, the output is cut
 * short and ends in "...".
 */
const char *
mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
{
	char	*out = buf;
	size_t	left = alen;	/* address bytes still to print */
	size_t	room = buflen;	/* buffer space still available */

	if (left == 0 || room < 4)
		return ("?");

	for (;;) {
		/*
		 * With more than one byte left we must be sure that, after
		 * printing "xx:", there is still space for what follows:
		 * the last "xx" plus NUL (3 bytes) when exactly two bytes
		 * remain, or at least "..." plus NUL (4 bytes) when more
		 * remain.  If not, emit the ellipsis now.
		 */
		if (left >= 2) {
			size_t need = (left == 2) ? 6 : 7;

			if (room < need) {
				(void) strcpy(out, "...");
				break;
			}
		}

		(void) sprintf(out, "%02x", *addr);
		addr++;
		out += 2;

		left--;
		if (left == 0)
			break;

		*out = ':';
		out++;
		room -= 3;
	}
	return (buf);
}
   4780 
   4781 /*
   4782  * Called when it is conceptually a ULP that would sent the packet
   4783  * e.g., port unreachable and protocol unreachable. Check that the packet
   4784  * would have passed the IPsec global policy before sending the error.
   4785  *
   4786  * Send an ICMP error after patching up the packet appropriately.
   4787  * Uses ip_drop_input and bumps the appropriate MIB.
   4788  */
   4789 void
   4790 ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
   4791     ip_recv_attr_t *ira)
   4792 {
   4793 	ipha_t		*ipha;
   4794 	boolean_t	secure;
   4795 	ill_t		*ill = ira->ira_ill;
   4796 	ip_stack_t	*ipst = ill->ill_ipst;
   4797 	netstack_t	*ns = ipst->ips_netstack;
   4798 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   4799 
   4800 	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
   4801 
   4802 	/*
   4803 	 * We are generating an icmp error for some inbound packet.
   4804 	 * Called from all ip_fanout_(udp, tcp, proto) functions.
   4805 	 * Before we generate an error, check with global policy
   4806 	 * to see whether this is allowed to enter the system. As
   4807 	 * there is no "conn", we are checking with global policy.
   4808 	 */
   4809 	ipha = (ipha_t *)mp->b_rptr;
   4810 	if (secure || ipss->ipsec_inbound_v4_policy_present) {
   4811 		mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
   4812 		if (mp == NULL)
   4813 			return;
   4814 	}
   4815 
   4816 	/* We never send errors for protocols that we do implement */
   4817 	if (ira->ira_protocol == IPPROTO_ICMP ||
   4818 	    ira->ira_protocol == IPPROTO_IGMP) {
   4819 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   4820 		ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
   4821 		freemsg(mp);
   4822 		return;
   4823 	}
   4824 	/*
   4825 	 * Have to correct checksum since
   4826 	 * the packet might have been
   4827 	 * fragmented and the reassembly code in ip_rput
   4828 	 * does not restore the IP checksum.
   4829 	 */
   4830 	ipha->ipha_hdr_checksum = 0;
   4831 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   4832 
   4833 	switch (icmp_type) {
   4834 	case ICMP_DEST_UNREACHABLE:
   4835 		switch (icmp_code) {
   4836 		case ICMP_PROTOCOL_UNREACHABLE:
   4837 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
   4838 			ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
   4839 			break;
   4840 		case ICMP_PORT_UNREACHABLE:
   4841 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   4842 			ip_drop_input("ipIfStatsNoPorts", mp, ill);
   4843 			break;
   4844 		}
   4845 
   4846 		icmp_unreachable(mp, icmp_code, ira);
   4847 		break;
   4848 	default:
   4849 #ifdef DEBUG
   4850 		panic("ip_fanout_send_icmp_v4: wrong type");
   4851 		/*NOTREACHED*/
   4852 #else
   4853 		freemsg(mp);
   4854 		break;
   4855 #endif
   4856 	}
   4857 }
   4858 
   4859 /*
   4860  * Used to send an ICMP error message when a packet is received for
   4861  * a protocol that is not supported. The mblk passed as argument
   4862  * is consumed by this function.
   4863  */
   4864 void
   4865 ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
   4866 {
   4867 	ipha_t		*ipha;
   4868 
   4869 	ipha = (ipha_t *)mp->b_rptr;
   4870 	if (ira->ira_flags & IRAF_IS_IPV4) {
   4871 		ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
   4872 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   4873 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   4874 	} else {
   4875 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
   4876 		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
   4877 		    ICMP6_PARAMPROB_NEXTHEADER, ira);
   4878 	}
   4879 }
   4880 
   4881 /*
   4882  * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
   4883  * Handles IPv4 and IPv6.
   4884  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   4885  * Caller is responsible for dropping references to the conn.
   4886  */
   4887 void
   4888 ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   4889     ip_recv_attr_t *ira)
   4890 {
   4891 	ill_t		*ill = ira->ira_ill;
   4892 	ip_stack_t	*ipst = ill->ill_ipst;
   4893 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   4894 	boolean_t	secure;
   4895 	uint_t		protocol = ira->ira_protocol;
   4896 	iaflags_t	iraflags = ira->ira_flags;
   4897 	queue_t		*rq;
   4898 
   4899 	secure = iraflags & IRAF_IPSEC_SECURE;
   4900 
   4901 	rq = connp->conn_rq;
   4902 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
   4903 		switch (protocol) {
   4904 		case IPPROTO_ICMPV6:
   4905 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
   4906 			break;
   4907 		case IPPROTO_ICMP:
   4908 			BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
   4909 			break;
   4910 		default:
   4911 			BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
   4912 			break;
   4913 		}
   4914 		freemsg(mp);
   4915 		return;
   4916 	}
   4917 
   4918 	ASSERT(!(IPCL_IS_IPTUN(connp)));
   4919 
   4920 	if (((iraflags & IRAF_IS_IPV4) ?
   4921 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   4922 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   4923 	    secure) {
   4924 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   4925 		    ip6h, ira);
   4926 		if (mp == NULL) {
   4927 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   4928 			/* Note that mp is NULL */
   4929 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   4930 			return;
   4931 		}
   4932 	}
   4933 
   4934 	if (iraflags & IRAF_ICMP_ERROR) {
   4935 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   4936 	} else {
   4937 		ill_t *rill = ira->ira_rill;
   4938 
   4939 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   4940 		ira->ira_ill = ira->ira_rill = NULL;
   4941 		/* Send it upstream */
   4942 		(connp->conn_recv)(connp, mp, NULL, ira);
   4943 		ira->ira_ill = ill;
   4944 		ira->ira_rill = rill;
   4945 	}
   4946 }
   4947 
   4948 /*
   4949  * Handle protocols with which IP is less intimate.  There
   4950  * can be more than one stream bound to a particular
   4951  * protocol.  When this is the case, normally each one gets a copy
   4952  * of any incoming packets.
   4953  *
   4954  * IPsec NOTE :
   4955  *
   4956  * Don't allow a secure packet going up a non-secure connection.
   4957  * We don't allow this because
   4958  *
   4959  * 1) Reply might go out in clear which will be dropped at
   4960  *    the sending side.
   4961  * 2) If the reply goes out in clear it will give the
   4962  *    adversary enough information for getting the key in
   4963  *    most of the cases.
   4964  *
   4965  * Moreover getting a secure packet when we expect clear
   4966  * implies that SA's were added without checking for
   4967  * policy on both ends. This should not happen once ISAKMP
   4968  * is used to negotiate SAs as SAs will be added only after
   4969  * verifying the policy.
   4970  *
   4971  * Zones notes:
   4972  * Earlier in ip_input on a system with multiple shared-IP zones we
   4973  * duplicate the multicast and broadcast packets and send them up
   4974  * with each explicit zoneid that exists on that ill.
   4975  * This means that here we can match the zoneid with SO_ALLZONES being special.
   4976  */
   4977 void
   4978 ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   4979 {
   4980 	mblk_t		*mp1;
   4981 	ipaddr_t	laddr;
   4982 	conn_t		*connp, *first_connp, *next_connp;
   4983 	connf_t		*connfp;
   4984 	ill_t		*ill = ira->ira_ill;
   4985 	ip_stack_t	*ipst = ill->ill_ipst;
   4986 
   4987 	laddr = ipha->ipha_dst;
   4988 
   4989 	connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
   4990 	mutex_enter(&connfp->connf_lock);
   4991 	connp = connfp->connf_head;
   4992 	for (connp = connfp->connf_head; connp != NULL;
   4993 	    connp = connp->conn_next) {
   4994 		/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   4995 		if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   4996 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   4997 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
   4998 			break;
   4999 		}
   5000 	}
   5001 
   5002 	if (connp == NULL) {
   5003 		/*
   5004 		 * No one bound to these addresses.  Is
   5005 		 * there a client that wants all
   5006 		 * unclaimed datagrams?
   5007 		 */
   5008 		mutex_exit(&connfp->connf_lock);
   5009 		ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
   5010 		    ICMP_PROTOCOL_UNREACHABLE, ira);
   5011 		return;
   5012 	}
   5013 
   5014 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5015 
   5016 	CONN_INC_REF(connp);
   5017 	first_connp = connp;
   5018 	connp = connp->conn_next;
   5019 
   5020 	for (;;) {
   5021 		while (connp != NULL) {
   5022 			/* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
   5023 			if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
   5024 			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5025 			    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5026 			    ira, connp)))
   5027 				break;
   5028 			connp = connp->conn_next;
   5029 		}
   5030 
   5031 		if (connp == NULL) {
   5032 			/* No more interested clients */
   5033 			connp = first_connp;
   5034 			break;
   5035 		}
   5036 		if (((mp1 = dupmsg(mp)) == NULL) &&
   5037 		    ((mp1 = copymsg(mp)) == NULL)) {
   5038 			/* Memory allocation failed */
   5039 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5040 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5041 			connp = first_connp;
   5042 			break;
   5043 		}
   5044 
   5045 		CONN_INC_REF(connp);
   5046 		mutex_exit(&connfp->connf_lock);
   5047 
   5048 		ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
   5049 		    ira);
   5050 
   5051 		mutex_enter(&connfp->connf_lock);
   5052 		/* Follow the next pointer before releasing the conn. */
   5053 		next_connp = connp->conn_next;
   5054 		CONN_DEC_REF(connp);
   5055 		connp = next_connp;
   5056 	}
   5057 
   5058 	/* Last one.  Send it upstream. */
   5059 	mutex_exit(&connfp->connf_lock);
   5060 
   5061 	ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
   5062 
   5063 	CONN_DEC_REF(connp);
   5064 }
   5065 
   5066 /*
   5067  * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or
   5068  * pass it along to ESP if the SPI is non-zero.  Returns the mblk if the mblk
   5069  * is not consumed.
   5070  *
   5071  * One of three things can happen, all of which affect the passed-in mblk:
   5072  *
   5073  * 1.) The packet is stock UDP and gets its zero-SPI stripped.  Return mblk.
   5074  *
   5075  * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
   5076  *     ESP packet, and is passed along to ESP for consumption.  Return NULL.
   5077  *
   5078  * 3.) The packet is an ESP-in-UDP Keepalive.  Drop it and return NULL.
   5079  */
   5080 mblk_t *
   5081 zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
   5082 {
   5083 	int shift, plen, iph_len;
   5084 	ipha_t *ipha;
   5085 	udpha_t *udpha;
   5086 	uint32_t *spi;
   5087 	uint32_t esp_ports;
   5088 	uint8_t *orptr;
   5089 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   5090 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5091 
   5092 	ipha = (ipha_t *)mp->b_rptr;
   5093 	iph_len = ira->ira_ip_hdr_length;
   5094 	plen = ira->ira_pktlen;
   5095 
   5096 	if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
   5097 		/*
   5098 		 * Most likely a keepalive for the benefit of an intervening
   5099 		 * NAT.  These aren't for us, per se, so drop it.
   5100 		 *
   5101 		 * RFC 3947/8 doesn't say for sure what to do for 2-3
   5102 		 * byte packets (keepalives are 1-byte), but we'll drop them
   5103 		 * also.
   5104 		 */
   5105 		ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5106 		    DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
   5107 		return (NULL);
   5108 	}
   5109 
   5110 	if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
   5111 		/* might as well pull it all up - it might be ESP. */
   5112 		if (!pullupmsg(mp, -1)) {
   5113 			ip_drop_packet(mp, B_TRUE, ira->ira_ill,
   5114 			    DROPPER(ipss, ipds_esp_nomem),
   5115 			    &ipss->ipsec_dropper);
   5116 			return (NULL);
   5117 		}
   5118 
   5119 		ipha = (ipha_t *)mp->b_rptr;
   5120 	}
   5121 	spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
   5122 	if (*spi == 0) {
   5123 		/* UDP packet - remove 0-spi. */
   5124 		shift = sizeof (uint32_t);
   5125 	} else {
   5126 		/* ESP-in-UDP packet - reduce to ESP. */
   5127 		ipha->ipha_protocol = IPPROTO_ESP;
   5128 		shift = sizeof (udpha_t);
   5129 	}
   5130 
   5131 	/* Fix IP header */
   5132 	ira->ira_pktlen = (plen - shift);
   5133 	ipha->ipha_length = htons(ira->ira_pktlen);
   5134 	ipha->ipha_hdr_checksum = 0;
   5135 
   5136 	orptr = mp->b_rptr;
   5137 	mp->b_rptr += shift;
   5138 
   5139 	udpha = (udpha_t *)(orptr + iph_len);
   5140 	if (*spi == 0) {
   5141 		ASSERT((uint8_t *)ipha == orptr);
   5142 		udpha->uha_length = htons(plen - shift - iph_len);
   5143 		iph_len += sizeof (udpha_t);	/* For the call to ovbcopy(). */
   5144 		esp_ports = 0;
   5145 	} else {
   5146 		esp_ports = *((uint32_t *)udpha);
   5147 		ASSERT(esp_ports != 0);
   5148 	}
   5149 	ovbcopy(orptr, orptr + shift, iph_len);
   5150 	if (esp_ports != 0) /* Punt up for ESP processing. */ {
   5151 		ipha = (ipha_t *)(orptr + shift);
   5152 
   5153 		ira->ira_flags |= IRAF_ESP_UDP_PORTS;
   5154 		ira->ira_esp_udp_ports = esp_ports;
   5155 		ip_fanout_v4(mp, ipha, ira);
   5156 		return (NULL);
   5157 	}
   5158 	return (mp);
   5159 }
   5160 
   5161 /*
   5162  * Deliver a udp packet to the given conn, possibly applying ipsec policy.
   5163  * Handles IPv4 and IPv6.
   5164  * We are responsible for disposing of mp, such as by freemsg() or putnext()
   5165  * Caller is responsible for dropping references to the conn.
   5166  */
   5167 void
   5168 ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
   5169     ip_recv_attr_t *ira)
   5170 {
   5171 	ill_t		*ill = ira->ira_ill;
   5172 	ip_stack_t	*ipst = ill->ill_ipst;
   5173 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   5174 	boolean_t	secure;
   5175 	iaflags_t	iraflags = ira->ira_flags;
   5176 
   5177 	secure = iraflags & IRAF_IPSEC_SECURE;
   5178 
   5179 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
   5180 	    !canputnext(connp->conn_rq)) {
   5181 		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
   5182 		freemsg(mp);
   5183 		return;
   5184 	}
   5185 
   5186 	if (((iraflags & IRAF_IS_IPV4) ?
   5187 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   5188 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   5189 	    secure) {
   5190 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   5191 		    ip6h, ira);
   5192 		if (mp == NULL) {
   5193 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5194 			/* Note that mp is NULL */
   5195 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5196 			return;
   5197 		}
   5198 	}
   5199 
   5200 	/*
   5201 	 * Since this code is not used for UDP unicast we don't need a NAT_T
   5202 	 * check. Only ip_fanout_v4 has that check.
   5203 	 */
   5204 	if (ira->ira_flags & IRAF_ICMP_ERROR) {
   5205 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   5206 	} else {
   5207 		ill_t *rill = ira->ira_rill;
   5208 
   5209 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   5210 		ira->ira_ill = ira->ira_rill = NULL;
   5211 		/* Send it upstream */
   5212 		(connp->conn_recv)(connp, mp, NULL, ira);
   5213 		ira->ira_ill = ill;
   5214 		ira->ira_rill = rill;
   5215 	}
   5216 }
   5217 
   5218 /*
   5219  * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
   5220  * (Unicast fanout is handled in ip_input_v4.)
   5221  *
   5222  * If SO_REUSEADDR is set all multicast and broadcast packets
   5223  * will be delivered to all conns bound to the same port.
   5224  *
   5225  * If there is at least one matching AF_INET receiver, then we will
   5226  * ignore any AF_INET6 receivers.
   5227  * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
   5228  * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
   5229  * packets.
   5230  *
   5231  * Zones notes:
   5232  * Earlier in ip_input on a system with multiple shared-IP zones we
   5233  * duplicate the multicast and broadcast packets and send them up
   5234  * with each explicit zoneid that exists on that ill.
   5235  * This means that here we can match the zoneid with SO_ALLZONES being special.
   5236  */
   5237 void
   5238 ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
   5239     ip_recv_attr_t *ira)
   5240 {
   5241 	ipaddr_t	laddr;
   5242 	in6_addr_t	v6faddr;
   5243 	conn_t		*connp;
   5244 	connf_t		*connfp;
   5245 	ipaddr_t	faddr;
   5246 	ill_t		*ill = ira->ira_ill;
   5247 	ip_stack_t	*ipst = ill->ill_ipst;
   5248 
   5249 	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
   5250 
   5251 	laddr = ipha->ipha_dst;
   5252 	faddr = ipha->ipha_src;
   5253 
   5254 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5255 	mutex_enter(&connfp->connf_lock);
   5256 	connp = connfp->connf_head;
   5257 
   5258 	/*
   5259 	 * If SO_REUSEADDR has been set on the first we send the
   5260 	 * packet to all clients that have joined the group and
   5261 	 * match the port.
   5262 	 */
   5263 	while (connp != NULL) {
   5264 		if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
   5265 		    conn_wantpacket(connp, ira, ipha) &&
   5266 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5267 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5268 			break;
   5269 		connp = connp->conn_next;
   5270 	}
   5271 
   5272 	if (connp == NULL)
   5273 		goto notfound;
   5274 
   5275 	CONN_INC_REF(connp);
   5276 
   5277 	if (connp->conn_reuseaddr) {
   5278 		conn_t		*first_connp = connp;
   5279 		conn_t		*next_connp;
   5280 		mblk_t		*mp1;
   5281 
   5282 		connp = connp->conn_next;
   5283 		for (;;) {
   5284 			while (connp != NULL) {
   5285 				if (IPCL_UDP_MATCH(connp, lport, laddr,
   5286 				    fport, faddr) &&
   5287 				    conn_wantpacket(connp, ira, ipha) &&
   5288 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5289 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5290 				    ira, connp)))
   5291 					break;
   5292 				connp = connp->conn_next;
   5293 			}
   5294 			if (connp == NULL) {
   5295 				/* No more interested clients */
   5296 				connp = first_connp;
   5297 				break;
   5298 			}
   5299 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5300 			    ((mp1 = copymsg(mp)) == NULL)) {
   5301 				/* Memory allocation failed */
   5302 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5303 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5304 				connp = first_connp;
   5305 				break;
   5306 			}
   5307 			CONN_INC_REF(connp);
   5308 			mutex_exit(&connfp->connf_lock);
   5309 
   5310 			IP_STAT(ipst, ip_udp_fanmb);
   5311 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5312 			    NULL, ira);
   5313 			mutex_enter(&connfp->connf_lock);
   5314 			/* Follow the next pointer before releasing the conn */
   5315 			next_connp = connp->conn_next;
   5316 			CONN_DEC_REF(connp);
   5317 			connp = next_connp;
   5318 		}
   5319 	}
   5320 
   5321 	/* Last one.  Send it upstream. */
   5322 	mutex_exit(&connfp->connf_lock);
   5323 	IP_STAT(ipst, ip_udp_fanmb);
   5324 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5325 	CONN_DEC_REF(connp);
   5326 	return;
   5327 
   5328 notfound:
   5329 	mutex_exit(&connfp->connf_lock);
   5330 	/*
   5331 	 * IPv6 endpoints bound to multicast IPv4-mapped addresses
   5332 	 * have already been matched above, since they live in the IPv4
   5333 	 * fanout tables. This implies we only need to
   5334 	 * check for IPv6 in6addr_any endpoints here.
   5335 	 * Thus we compare using ipv6_all_zeros instead of the destination
   5336 	 * address, except for the multicast group membership lookup which
   5337 	 * uses the IPv4 destination.
   5338 	 */
   5339 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
   5340 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   5341 	mutex_enter(&connfp->connf_lock);
   5342 	connp = connfp->connf_head;
   5343 	/*
   5344 	 * IPv4 multicast packet being delivered to an AF_INET6
   5345 	 * in6addr_any endpoint.
   5346 	 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
   5347 	 * and not conn_wantpacket_v6() since any multicast membership is
   5348 	 * for an IPv4-mapped multicast address.
   5349 	 */
   5350 	while (connp != NULL) {
   5351 		if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
   5352 		    fport, v6faddr) &&
   5353 		    conn_wantpacket(connp, ira, ipha) &&
   5354 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5355 		    tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
   5356 			break;
   5357 		connp = connp->conn_next;
   5358 	}
   5359 
   5360 	if (connp == NULL) {
   5361 		/*
   5362 		 * No one bound to this port.  Is
   5363 		 * there a client that wants all
   5364 		 * unclaimed datagrams?
   5365 		 */
   5366 		mutex_exit(&connfp->connf_lock);
   5367 
   5368 		if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
   5369 		    NULL) {
   5370 			ASSERT(ira->ira_protocol == IPPROTO_UDP);
   5371 			ip_fanout_proto_v4(mp, ipha, ira);
   5372 		} else {
   5373 			/*
   5374 			 * We used to attempt to send an icmp error here, but
   5375 			 * since this is known to be a multicast packet
   5376 			 * and we don't send icmp errors in response to
   5377 			 * multicast, just drop the packet and give up sooner.
   5378 			 */
   5379 			BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
   5380 			freemsg(mp);
   5381 		}
   5382 		return;
   5383 	}
   5384 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
   5385 
   5386 	/*
   5387 	 * If SO_REUSEADDR has been set on the first we send the
   5388 	 * packet to all clients that have joined the group and
   5389 	 * match the port.
   5390 	 */
   5391 	if (connp->conn_reuseaddr) {
   5392 		conn_t		*first_connp = connp;
   5393 		conn_t		*next_connp;
   5394 		mblk_t		*mp1;
   5395 
   5396 		CONN_INC_REF(connp);
   5397 		connp = connp->conn_next;
   5398 		for (;;) {
   5399 			while (connp != NULL) {
   5400 				if (IPCL_UDP_MATCH_V6(connp, lport,
   5401 				    ipv6_all_zeros, fport, v6faddr) &&
   5402 				    conn_wantpacket(connp, ira, ipha) &&
   5403 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
   5404 				    tsol_receive_local(mp, &laddr, IPV4_VERSION,
   5405 				    ira, connp)))
   5406 					break;
   5407 				connp = connp->conn_next;
   5408 			}
   5409 			if (connp == NULL) {
   5410 				/* No more interested clients */
   5411 				connp = first_connp;
   5412 				break;
   5413 			}
   5414 			if (((mp1 = dupmsg(mp)) == NULL) &&
   5415 			    ((mp1 = copymsg(mp)) == NULL)) {
   5416 				/* Memory allocation failed */
   5417 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   5418 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   5419 				connp = first_connp;
   5420 				break;
   5421 			}
   5422 			CONN_INC_REF(connp);
   5423 			mutex_exit(&connfp->connf_lock);
   5424 
   5425 			IP_STAT(ipst, ip_udp_fanmb);
   5426 			ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
   5427 			    NULL, ira);
   5428 			mutex_enter(&connfp->connf_lock);
   5429 			/* Follow the next pointer before releasing the conn */
   5430 			next_connp = connp->conn_next;
   5431 			CONN_DEC_REF(connp);
   5432 			connp = next_connp;
   5433 		}
   5434 	}
   5435 
   5436 	/* Last one.  Send it upstream. */
   5437 	mutex_exit(&connfp->connf_lock);
   5438 	IP_STAT(ipst, ip_udp_fanmb);
   5439 	ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
   5440 	CONN_DEC_REF(connp);
   5441 }
   5442 
   5443 /*
   5444  * Split an incoming packet's IPv4 options into the label and the other options.
   5445  * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
   5446  * clearing out any leftover label or options.
   5447  * Otherwise it just makes ipp point into the packet.
   5448  *
   5449  * Returns 0 if ok, ENOMEM if allocation fails, EINVAL on a malformed option.
   5450  */
   5451 int
   5452 ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
   5453 {
   5454 	uchar_t		*opt;
   5455 	uint32_t	totallen;
   5456 	uint32_t	optval;
   5457 	uint32_t	optlen;
   5458 
   5459 	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
   5460 	ipp->ipp_hoplimit = ipha->ipha_ttl;
   5461 	ipp->ipp_type_of_service = ipha->ipha_type_of_service;
   5462 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
   5463 
   5464 	/*
   5465 	 * Get length (in 4 byte octets) of IP header options.
   5466 	 */
   5467 	totallen = ipha->ipha_version_and_hdr_length -
   5468 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5469 
   5470 	if (totallen == 0) {
   5471 		if (!allocate)
   5472 			return (0);
   5473 
   5474 		/* Clear out anything from a previous packet */
   5475 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5476 			kmem_free(ipp->ipp_ipv4_options,
   5477 			    ipp->ipp_ipv4_options_len);
   5478 			ipp->ipp_ipv4_options = NULL;
   5479 			ipp->ipp_ipv4_options_len = 0;
   5480 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5481 		}
   5482 		if (ipp->ipp_fields & IPPF_LABEL_V4) {
   5483 			kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5484 			ipp->ipp_label_v4 = NULL;
   5485 			ipp->ipp_label_len_v4 = 0;
   5486 			ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5487 		}
   5488 		return (0);
   5489 	}
   5490 
   5491 	totallen <<= 2;
   5492 	opt = (uchar_t *)&ipha[1];
   5493 	if (!is_system_labeled()) {
   5494 
   5495 	copyall:
   5496 		if (!allocate) {
   5497 			if (totallen != 0) {
   5498 				ipp->ipp_ipv4_options = opt;
   5499 				ipp->ipp_ipv4_options_len = totallen;
   5500 				ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5501 			}
   5502 			return (0);
   5503 		}
   5504 		/* Just copy all of options */
   5505 		if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   5506 			if (totallen == ipp->ipp_ipv4_options_len) {
   5507 				bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5508 				return (0);
   5509 			}
   5510 			kmem_free(ipp->ipp_ipv4_options,
   5511 			    ipp->ipp_ipv4_options_len);
   5512 			ipp->ipp_ipv4_options = NULL;
   5513 			ipp->ipp_ipv4_options_len = 0;
   5514 			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
   5515 		}
   5516 		if (totallen == 0)
   5517 			return (0);
   5518 
   5519 		ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
   5520 		if (ipp->ipp_ipv4_options == NULL)
   5521 			return (ENOMEM);
   5522 		ipp->ipp_ipv4_options_len = totallen;
   5523 		ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
   5524 		bcopy(opt, ipp->ipp_ipv4_options, totallen);
   5525 		return (0);
   5526 	}
   5527 
   5528 	if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
   5529 		kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   5530 		ipp->ipp_label_v4 = NULL;
   5531 		ipp->ipp_label_len_v4 = 0;
   5532 		ipp->ipp_fields &= ~IPPF_LABEL_V4;
   5533 	}
   5534 
   5535 	/*
   5536 	 * Search for CIPSO option.
   5537 	 * We assume CIPSO is first in options if it is present.
   5538 	 * If it isn't, then ipp_opt_ipv4_options will not include the options
   5539 	 * prior to the CIPSO option.
   5540 	 */
   5541 	while (totallen != 0) {
   5542 		switch (optval = opt[IPOPT_OPTVAL]) {
   5543 		case IPOPT_EOL:
   5544 			return (0);
   5545 		case IPOPT_NOP:
   5546 			optlen = 1;
   5547 			break;
   5548 		default:
   5549 			if (totallen <= IPOPT_OLEN)
   5550 				return (EINVAL);
   5551 			optlen = opt[IPOPT_OLEN];
   5552 			if (optlen < 2)
   5553 				return (EINVAL);
   5554 		}
   5555 		if (optlen > totallen)
   5556 			return (EINVAL);
   5557 
   5558 		switch (optval) {
   5559 		case IPOPT_COMSEC:
   5560 			if (!allocate) {
   5561 				ipp->ipp_label_v4 = opt;
   5562 				ipp->ipp_label_len_v4 = optlen;
   5563 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5564 			} else {
   5565 				ipp->ipp_label_v4 = kmem_alloc(optlen,
   5566 				    KM_NOSLEEP);
   5567 				if (ipp->ipp_label_v4 == NULL)
   5568 					return (ENOMEM);
   5569 				ipp->ipp_label_len_v4 = optlen;
   5570 				ipp->ipp_fields |= IPPF_LABEL_V4;
   5571 				bcopy(opt, ipp->ipp_label_v4, optlen);
   5572 			}
   5573 			totallen -= optlen;
   5574 			opt += optlen;
   5575 
   5576 			/* Skip padding bytes until we get to a multiple of 4 */
   5577 			while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
   5578 				totallen--;
   5579 				opt++;
   5580 			}
   5581 			/* Remaining as ipp_ipv4_options */
   5582 			goto copyall;
   5583 		}
   5584 		totallen -= optlen;
   5585 		opt += optlen;
   5586 	}
   5587 	/* No CIPSO found; return everything as ipp_ipv4_options */
   5588 	totallen = ipha->ipha_version_and_hdr_length -
   5589 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   5590 	totallen <<= 2;
   5591 	opt = (uchar_t *)&ipha[1];
   5592 	goto copyall;
   5593 }
   5594 
   5595 /*
   5596  * Efficient versions of lookup for an IRE when we only
   5597  * match the address.
   5598  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5599  * Does not handle multicast addresses.
   5600  */
   5601 uint_t
   5602 ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
   5603 {
   5604 	ire_t *ire;
   5605 	uint_t result;
   5606 
   5607 	ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
   5608 	ASSERT(ire != NULL);
   5609 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5610 		result = IRE_NOROUTE;
   5611 	else
   5612 		result = ire->ire_type;
   5613 	ire_refrele(ire);
   5614 	return (result);
   5615 }
   5616 
   5617 /*
   5618  * Efficient versions of lookup for an IRE when we only
   5619  * match the address.
   5620  * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
   5621  * Does not handle multicast addresses.
   5622  */
   5623 uint_t
   5624 ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
   5625 {
   5626 	ire_t *ire;
   5627 	uint_t result;
   5628 
   5629 	ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
   5630 	ASSERT(ire != NULL);
   5631 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   5632 		result = IRE_NOROUTE;
   5633 	else
   5634 		result = ire->ire_type;
   5635 	ire_refrele(ire);
   5636 	return (result);
   5637 }
   5638 
   5639 /*
   5640  * Nobody should be sending
   5641  * packets up this stream
   5642  */
   5643 static void
   5644 ip_lrput(queue_t *q, mblk_t *mp)
   5645 {
   5646 	switch (mp->b_datap->db_type) {
   5647 	case M_FLUSH:
   5648 		/* Turn around */
   5649 		if (*mp->b_rptr & FLUSHW) {
   5650 			*mp->b_rptr &= ~FLUSHR;
   5651 			qreply(q, mp);
   5652 			return;
   5653 		}
   5654 		break;
   5655 	}
   5656 	freemsg(mp);
   5657 }
   5658 
   5659 /* Nobody should be sending packets down this stream */
   5660 /* ARGSUSED */
   5661 void
   5662 ip_lwput(queue_t *q, mblk_t *mp)
   5663 {
   5664 	freemsg(mp);
   5665 }
   5666 
   5667 /*
   5668  * Move the first hop in any source route to ipha_dst and remove that part of
   5669  * the source route.  Called by other protocols.  Errors in option formatting
   5670  * are ignored - will be handled by ip_output_options. Return the final
   5671  * destination (either ipha_dst or the last entry in a source route.)
   5672  */
   5673 ipaddr_t
   5674 ip_massage_options(ipha_t *ipha, netstack_t *ns)
   5675 {
   5676 	ipoptp_t	opts;
   5677 	uchar_t		*opt;
   5678 	uint8_t		optval;
   5679 	uint8_t		optlen;
   5680 	ipaddr_t	dst;
   5681 	int		i;
   5682 	ip_stack_t	*ipst = ns->netstack_ip;
   5683 
   5684 	ip2dbg(("ip_massage_options\n"));
   5685 	dst = ipha->ipha_dst;
   5686 	for (optval = ipoptp_first(&opts, ipha);
   5687 	    optval != IPOPT_EOL;
   5688 	    optval = ipoptp_next(&opts)) {
   5689 		opt = opts.ipoptp_cur;
   5690 		switch (optval) {
   5691 			uint8_t off;
   5692 		case IPOPT_SSRR:
   5693 		case IPOPT_LSRR:
   5694 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   5695 				ip1dbg(("ip_massage_options: bad src route\n"));
   5696 				break;
   5697 			}
   5698 			optlen = opts.ipoptp_len;
   5699 			off = opt[IPOPT_OFFSET];
   5700 			off--;
   5701 		redo_srr:
   5702 			if (optlen < IP_ADDR_LEN ||
   5703 			    off > optlen - IP_ADDR_LEN) {
   5704 				/* End of source route */
   5705 				ip1dbg(("ip_massage_options: end of SR\n"));
   5706 				break;
   5707 			}
   5708 			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   5709 			ip1dbg(("ip_massage_options: next hop 0x%x\n",
   5710 			    ntohl(dst)));
   5711 			/*
   5712 			 * Check if our address is present more than
   5713 			 * once as consecutive hops in source route.
   5714 			 * XXX verify per-interface ip_forwarding
   5715 			 * for source route?
   5716 			 */
   5717 			if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
   5718 				off += IP_ADDR_LEN;
   5719 				goto redo_srr;
   5720 			}
   5721 			if (dst == htonl(INADDR_LOOPBACK)) {
   5722 				ip1dbg(("ip_massage_options: loopback addr in "
   5723 				    "source route!\n"));
   5724 				break;
   5725 			}
   5726 			/*
   5727 			 * Update ipha_dst to be the first hop and remove the
   5728 			 * first hop from the source route (by overwriting
   5729 			 * part of the option with NOP options).
   5730 			 */
   5731 			ipha->ipha_dst = dst;
   5732 			/* Put the last entry in dst */
   5733 			off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
   5734 			    3;
   5735 			bcopy(&opt[off], &dst, IP_ADDR_LEN);
   5736 
   5737 			ip1dbg(("ip_massage_options: last hop 0x%x\n",
   5738 			    ntohl(dst)));
   5739 			/* Move down and overwrite */
   5740 			opt[IP_ADDR_LEN] = opt[0];
   5741 			opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
   5742 			opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
   5743 			for (i = 0; i < IP_ADDR_LEN; i++)
   5744 				opt[i] = IPOPT_NOP;
   5745 			break;
   5746 		}
   5747 	}
   5748 	return (dst);
   5749 }
   5750 
   5751 /*
   5752  * Return the network mask
   5753  * associated with the specified address.
   5754  */
   5755 ipaddr_t
   5756 ip_net_mask(ipaddr_t addr)
   5757 {
   5758 	uchar_t	*up = (uchar_t *)&addr;
   5759 	ipaddr_t mask = 0;
   5760 	uchar_t	*maskp = (uchar_t *)&mask;
   5761 
   5762 #if defined(__i386) || defined(__amd64)
   5763 #define	TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5764 #endif
   5765 #ifdef  TOTALLY_BRAIN_DAMAGED_C_COMPILER
   5766 	maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
   5767 #endif
   5768 	if (CLASSD(addr)) {
   5769 		maskp[0] = 0xF0;
   5770 		return (mask);
   5771 	}
   5772 
   5773 	/* We assume Class E default netmask to be 32 */
   5774 	if (CLASSE(addr))
   5775 		return (0xffffffffU);
   5776 
   5777 	if (addr == 0)
   5778 		return (0);
   5779 	maskp[0] = 0xFF;
   5780 	if ((up[0] & 0x80) == 0)
   5781 		return (mask);
   5782 
   5783 	maskp[1] = 0xFF;
   5784 	if ((up[0] & 0xC0) == 0x80)
   5785 		return (mask);
   5786 
   5787 	maskp[2] = 0xFF;
   5788 	if ((up[0] & 0xE0) == 0xC0)
   5789 		return (mask);
   5790 
   5791 	/* Otherwise return no mask */
   5792 	return ((ipaddr_t)0);
   5793 }
   5794 
   5795 /* Name/Value Table Lookup Routine */
   5796 char *
   5797 ip_nv_lookup(nv_t *nv, int value)
   5798 {
   5799 	if (!nv)
   5800 		return (NULL);
   5801 	for (; nv->nv_name; nv++) {
   5802 		if (nv->nv_value == value)
   5803 			return (nv->nv_name);
   5804 	}
   5805 	return ("unknown");
   5806 }
   5807 
   5808 static int
   5809 ip_wait_for_info_ack(ill_t *ill)
   5810 {
   5811 	int err;
   5812 
   5813 	mutex_enter(&ill->ill_lock);
   5814 	while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
   5815 		/*
   5816 		 * Return value of 0 indicates a pending signal.
   5817 		 */
   5818 		err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
   5819 		if (err == 0) {
   5820 			mutex_exit(&ill->ill_lock);
   5821 			return (EINTR);
   5822 		}
   5823 	}
   5824 	mutex_exit(&ill->ill_lock);
   5825 	/*
   5826 	 * ip_rput_other could have set an error  in ill_error on
   5827 	 * receipt of M_ERROR.
   5828 	 */
   5829 	return (ill->ill_error);
   5830 }
   5831 
   5832 /*
   5833  * This is a module open, i.e. this is a control stream for access
   5834  * to a DLPI device.  We allocate an ill_t as the instance data in
   5835  * this case.
   5836  */
   5837 static int
   5838 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   5839 {
   5840 	ill_t	*ill;
   5841 	int	err;
   5842 	zoneid_t zoneid;
   5843 	netstack_t *ns;
   5844 	ip_stack_t *ipst;
   5845 
   5846 	/*
   5847 	 * Prevent unprivileged processes from pushing IP so that
   5848 	 * they can't send raw IP.
   5849 	 */
   5850 	if (secpolicy_net_rawaccess(credp) != 0)
   5851 		return (EPERM);
   5852 
   5853 	ns = netstack_find_by_cred(credp);
   5854 	ASSERT(ns != NULL);
   5855 	ipst = ns->netstack_ip;
   5856 	ASSERT(ipst != NULL);
   5857 
   5858 	/*
   5859 	 * For exclusive stacks we set the zoneid to zero
   5860 	 * to make IP operate as if in the global zone.
   5861 	 */
   5862 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   5863 		zoneid = GLOBAL_ZONEID;
   5864 	else
   5865 		zoneid = crgetzoneid(credp);
   5866 
   5867 	ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
   5868 	q->q_ptr = WR(q)->q_ptr = ill;
   5869 	ill->ill_ipst = ipst;
   5870 	ill->ill_zoneid = zoneid;
   5871 
   5872 	/*
   5873 	 * ill_init initializes the ill fields and then sends down
   5874 	 * down a DL_INFO_REQ after calling qprocson.
   5875 	 */
   5876 	err = ill_init(q, ill);
   5877 
   5878 	if (err != 0) {
   5879 		mi_free(ill);
   5880 		netstack_rele(ipst->ips_netstack);
   5881 		q->q_ptr = NULL;
   5882 		WR(q)->q_ptr = NULL;
   5883 		return (err);
   5884 	}
   5885 
   5886 	/*
   5887 	 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
   5888 	 *
   5889 	 * ill_init initializes the ipsq marking this thread as
   5890 	 * writer
   5891 	 */
   5892 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   5893 	err = ip_wait_for_info_ack(ill);
   5894 	if (err == 0)
   5895 		ill->ill_credp = credp;
   5896 	else
   5897 		goto fail;
   5898 
   5899 	crhold(credp);
   5900 
   5901 	mutex_enter(&ipst->ips_ip_mi_lock);
   5902 	err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
   5903 	    sflag, credp);
   5904 	mutex_exit(&ipst->ips_ip_mi_lock);
   5905 fail:
   5906 	if (err) {
   5907 		(void) ip_close(q, 0);
   5908 		return (err);
   5909 	}
   5910 	return (0);
   5911 }
   5912 
   5913 /* For /dev/ip aka AF_INET open */
   5914 int
   5915 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   5916 {
   5917 	return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
   5918 }
   5919 
   5920 /* For /dev/ip6 aka AF_INET6 open */
   5921 int
   5922 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   5923 {
   5924 	return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
   5925 }
   5926 
   5927 /* IP open routine. */
   5928 int
   5929 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   5930     boolean_t isv6)
   5931 {
   5932 	conn_t 		*connp;
   5933 	major_t		maj;
   5934 	zoneid_t	zoneid;
   5935 	netstack_t	*ns;
   5936 	ip_stack_t	*ipst;
   5937 
   5938 	/* Allow reopen. */
   5939 	if (q->q_ptr != NULL)
   5940 		return (0);
   5941 
   5942 	if (sflag & MODOPEN) {
   5943 		/* This is a module open */
   5944 		return (ip_modopen(q, devp, flag, sflag, credp));
   5945 	}
   5946 
   5947 	if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
   5948 		/*
   5949 		 * Non streams based socket looking for a stream
   5950 		 * to access IP
   5951 		 */
   5952 		return (ip_helper_stream_setup(q, devp, flag, sflag,
   5953 		    credp, isv6));
   5954 	}
   5955 
   5956 	ns = netstack_find_by_cred(credp);
   5957 	ASSERT(ns != NULL);
   5958 	ipst = ns->netstack_ip;
   5959 	ASSERT(ipst != NULL);
   5960 
   5961 	/*
   5962 	 * For exclusive stacks we set the zoneid to zero
   5963 	 * to make IP operate as if in the global zone.
   5964 	 */
   5965 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
   5966 		zoneid = GLOBAL_ZONEID;
   5967 	else
   5968 		zoneid = crgetzoneid(credp);
   5969 
   5970 	/*
   5971 	 * We are opening as a device. This is an IP client stream, and we
   5972 	 * allocate an conn_t as the instance data.
   5973 	 */
   5974 	connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
   5975 
   5976 	/*
   5977 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
   5978 	 * done by netstack_find_by_cred()
   5979 	 */
   5980 	netstack_rele(ipst->ips_netstack);
   5981 
   5982 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
   5983 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   5984 	connp->conn_ixa->ixa_zoneid = zoneid;
   5985 	connp->conn_zoneid = zoneid;
   5986 
   5987 	connp->conn_rq = q;
   5988 	q->q_ptr = WR(q)->q_ptr = connp;
   5989 
   5990 	/* Minor tells us which /dev entry was opened */
   5991 	if (isv6) {
   5992 		connp->conn_family = AF_INET6;
   5993 		connp->conn_ipversion = IPV6_VERSION;
   5994 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
   5995 		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
   5996 	} else {
   5997 		connp->conn_family = AF_INET;
   5998 		connp->conn_ipversion = IPV4_VERSION;
   5999 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
   6000 	}
   6001 
   6002 	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
   6003 	    ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
   6004 		connp->conn_minor_arena = ip_minor_arena_la;
   6005 	} else {
   6006 		/*
   6007 		 * Either minor numbers in the large arena were exhausted
   6008 		 * or a non socket application is doing the open.
   6009 		 * Try to allocate from the small arena.
   6010 		 */
   6011 		if ((connp->conn_dev =
   6012 		    inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   6013 			/* CONN_DEC_REF takes care of netstack_rele() */
   6014 			q->q_ptr = WR(q)->q_ptr = NULL;
   6015 			CONN_DEC_REF(connp);
   6016 			return (EBUSY);
   6017 		}
   6018 		connp->conn_minor_arena = ip_minor_arena_sa;
   6019 	}
   6020 
   6021 	maj = getemajor(*devp);
   6022 	*devp = makedevice(maj, (minor_t)connp->conn_dev);
   6023 
   6024 	/*
   6025 	 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
   6026 	 */
   6027 	connp->conn_cred = credp;
   6028 	connp->conn_cpid = curproc->p_pid;
   6029 	/* Cache things in ixa without an extra refhold */
   6030 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
   6031 	connp->conn_ixa->ixa_cred = connp->conn_cred;
   6032 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
   6033 	if (is_system_labeled())
   6034 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
   6035 
   6036 	/*
   6037 	 * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
   6038 	 */
   6039 	connp->conn_recv = ip_conn_input;
   6040 	connp->conn_recvicmp = ip_conn_input_icmp;
   6041 
   6042 	crhold(connp->conn_cred);
   6043 
   6044 	/*
   6045 	 * If the caller has the process-wide flag set, then default to MAC
   6046 	 * exempt mode.  This allows read-down to unlabeled hosts.
   6047 	 */
   6048 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   6049 		connp->conn_mac_mode = CONN_MAC_AWARE;
   6050 
   6051 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   6052 
   6053 	connp->conn_rq = q;
   6054 	connp->conn_wq = WR(q);
   6055 
   6056 	/* Non-zero default values */
   6057 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
   6058 
   6059 	/*
   6060 	 * Make the conn globally visible to walkers
   6061 	 */
   6062 	ASSERT(connp->conn_ref == 1);
   6063 	mutex_enter(&connp->conn_lock);
   6064 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   6065 	mutex_exit(&connp->conn_lock);
   6066 
   6067 	qprocson(q);
   6068 
   6069 	return (0);
   6070 }
   6071 
   6072 /*
   6073  * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
   6074  * all of them are copied to the conn_t. If the req is "zero", the policy is
   6075  * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
   6076  * fields.
   6077  * We keep only the latest setting of the policy and thus policy setting
   6078  * is not incremental/cumulative.
   6079  *
   6080  * Requests to set policies with multiple alternative actions will
   6081  * go through a different API.
   6082  */
   6083 int
   6084 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
   6085 {
   6086 	uint_t ah_req = 0;
   6087 	uint_t esp_req = 0;
   6088 	uint_t se_req = 0;
   6089 	ipsec_act_t *actp = NULL;
   6090 	uint_t nact;
   6091 	ipsec_policy_head_t *ph;
   6092 	boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
   6093 	int error = 0;
   6094 	netstack_t	*ns = connp->conn_netstack;
   6095 	ip_stack_t	*ipst = ns->netstack_ip;
   6096 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
   6097 
   6098 #define	REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
   6099 
   6100 	/*
   6101 	 * The IP_SEC_OPT option does not allow variable length parameters,
   6102 	 * hence a request cannot be NULL.
   6103 	 */
   6104 	if (req == NULL)
   6105 		return (EINVAL);
   6106 
   6107 	ah_req = req->ipsr_ah_req;
   6108 	esp_req = req->ipsr_esp_req;
   6109 	se_req = req->ipsr_self_encap_req;
   6110 
   6111 	/* Don't allow setting self-encap without one or more of AH/ESP. */
   6112 	if (se_req != 0 && esp_req == 0 && ah_req == 0)
   6113 		return (EINVAL);
   6114 
   6115 	/*
   6116 	 * Are we dealing with a request to reset the policy (i.e.
   6117 	 * zero requests).
   6118 	 */
   6119 	is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
   6120 	    (esp_req & REQ_MASK) == 0 &&
   6121 	    (se_req & REQ_MASK) == 0);
   6122 
   6123 	if (!is_pol_reset) {
   6124 		/*
   6125 		 * If we couldn't load IPsec, fail with "protocol
   6126 		 * not supported".
   6127 		 * IPsec may not have been loaded for a request with zero
   6128 		 * policies, so we don't fail in this case.
   6129 		 */
   6130 		mutex_enter(&ipss->ipsec_loader_lock);
   6131 		if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
   6132 			mutex_exit(&ipss->ipsec_loader_lock);
   6133 			return (EPROTONOSUPPORT);
   6134 		}
   6135 		mutex_exit(&ipss->ipsec_loader_lock);
   6136 
   6137 		/*
   6138 		 * Test for valid requests. Invalid algorithms
   6139 		 * need to be tested by IPsec code because new
   6140 		 * algorithms can be added dynamically.
   6141 		 */
   6142 		if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6143 		    (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
   6144 		    (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
   6145 			return (EINVAL);
   6146 		}
   6147 
   6148 		/*
   6149 		 * Only privileged users can issue these
   6150 		 * requests.
   6151 		 */
   6152 		if (((ah_req & IPSEC_PREF_NEVER) ||
   6153 		    (esp_req & IPSEC_PREF_NEVER) ||
   6154 		    (se_req & IPSEC_PREF_NEVER)) &&
   6155 		    secpolicy_ip_config(cr, B_FALSE) != 0) {
   6156 			return (EPERM);
   6157 		}
   6158 
   6159 		/*
   6160 		 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
   6161 		 * are mutually exclusive.
   6162 		 */
   6163 		if (((ah_req & REQ_MASK) == REQ_MASK) ||
   6164 		    ((esp_req & REQ_MASK) == REQ_MASK) ||
   6165 		    ((se_req & REQ_MASK) == REQ_MASK)) {
   6166 			/* Both of them are set */
   6167 			return (EINVAL);
   6168 		}
   6169 	}
   6170 
   6171 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   6172 
   6173 	/*
   6174 	 * If we have already cached policies in conn_connect(), don't
   6175 	 * let them change now. We cache policies for connections
   6176 	 * whose src,dst [addr, port] is known.
   6177 	 */
   6178 	if (connp->conn_policy_cached) {
   6179 		return (EINVAL);
   6180 	}
   6181 
   6182 	/*
   6183 	 * We have a zero policies, reset the connection policy if already
   6184 	 * set. This will cause the connection to inherit the
   6185 	 * global policy, if any.
   6186 	 */
   6187 	if (is_pol_reset) {
   6188 		if (connp->conn_policy != NULL) {
   6189 			IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
   6190 			connp->conn_policy = NULL;
   6191 		}
   6192 		connp->conn_in_enforce_policy = B_FALSE;
   6193 		connp->conn_out_enforce_policy = B_FALSE;
   6194 		return (0);
   6195 	}
   6196 
   6197 	ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
   6198 	    ipst->ips_netstack);
   6199 	if (ph == NULL)
   6200 		goto enomem;
   6201 
   6202 	ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
   6203 	if (actp == NULL)
   6204 		goto enomem;
   6205 
   6206 	/*
   6207 	 * Always insert IPv4 policy entries, since they can also apply to
   6208 	 * ipv6 sockets being used in ipv4-compat mode.
   6209 	 */
   6210 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6211 	    IPSEC_TYPE_INBOUND, ns))
   6212 		goto enomem;
   6213 	is_pol_inserted = B_TRUE;
   6214 	if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
   6215 	    IPSEC_TYPE_OUTBOUND, ns))
   6216 		goto enomem;
   6217 
   6218 	/*
   6219 	 * We're looking at a v6 socket, also insert the v6-specific
   6220 	 * entries.
   6221 	 */
   6222 	if (connp->conn_family == AF_INET6) {
   6223 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6224 		    IPSEC_TYPE_INBOUND, ns))
   6225 			goto enomem;
   6226 		if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
   6227 		    IPSEC_TYPE_OUTBOUND, ns))
   6228 			goto enomem;
   6229 	}
   6230 
   6231 	ipsec_actvec_free(actp, nact);
   6232 
   6233 	/*
   6234 	 * If the requests need security, set enforce_policy.
   6235 	 * If the requests are IPSEC_PREF_NEVER, one should
   6236 	 * still set conn_out_enforce_policy so that ip_set_destination
   6237 	 * marks the ip_xmit_attr_t appropriatly. This is needed so that
   6238 	 * for connections that we don't cache policy in at connect time,
   6239 	 * if global policy matches in ip_output_attach_policy, we
   6240 	 * don't wrongly inherit global policy. Similarly, we need
   6241 	 * to set conn_in_enforce_policy also so that we don't verify
   6242 	 * policy wrongly.
   6243 	 */
   6244 	if ((ah_req & REQ_MASK) != 0 ||
   6245 	    (esp_req & REQ_MASK) != 0 ||
   6246 	    (se_req & REQ_MASK) != 0) {
   6247 		connp->conn_in_enforce_policy = B_TRUE;
   6248 		connp->conn_out_enforce_policy = B_TRUE;
   6249 	}
   6250 
   6251 	return (error);
   6252 #undef REQ_MASK
   6253 
   6254 	/*
   6255 	 * Common memory-allocation-failure exit path.
   6256 	 */
   6257 enomem:
   6258 	if (actp != NULL)
   6259 		ipsec_actvec_free(actp, nact);
   6260 	if (is_pol_inserted)
   6261 		ipsec_polhead_flush(ph, ns);
   6262 	return (ENOMEM);
   6263 }
   6264 
   6265 /*
   6266  * Set socket options for joining and leaving multicast groups.
   6267  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6268  * The caller has already check that the option name is consistent with
   6269  * the address family of the socket.
   6270  */
   6271 int
   6272 ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
   6273     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6274 {
   6275 	int		*i1 = (int *)invalp;
   6276 	int		error = 0;
   6277 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6278 	struct ip_mreq	*v4_mreqp;
   6279 	struct ipv6_mreq *v6_mreqp;
   6280 	struct group_req *greqp;
   6281 	ire_t *ire;
   6282 	boolean_t done = B_FALSE;
   6283 	ipaddr_t ifaddr;
   6284 	in6_addr_t v6group;
   6285 	uint_t ifindex;
   6286 	boolean_t mcast_opt = B_TRUE;
   6287 	mcast_record_t fmode;
   6288 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6289 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6290 
   6291 	switch (name) {
   6292 	case IP_ADD_MEMBERSHIP:
   6293 	case IPV6_JOIN_GROUP:
   6294 		mcast_opt = B_FALSE;
   6295 		/* FALLTHRU */
   6296 	case MCAST_JOIN_GROUP:
   6297 		fmode = MODE_IS_EXCLUDE;
   6298 		optfn = ip_opt_add_group;
   6299 		break;
   6300 
   6301 	case IP_DROP_MEMBERSHIP:
   6302 	case IPV6_LEAVE_GROUP:
   6303 		mcast_opt = B_FALSE;
   6304 		/* FALLTHRU */
   6305 	case MCAST_LEAVE_GROUP:
   6306 		fmode = MODE_IS_INCLUDE;
   6307 		optfn = ip_opt_delete_group;
   6308 		break;
   6309 	default:
   6310 		ASSERT(0);
   6311 	}
   6312 
   6313 	if (mcast_opt) {
   6314 		struct sockaddr_in *sin;
   6315 		struct sockaddr_in6 *sin6;
   6316 
   6317 		greqp = (struct group_req *)i1;
   6318 		if (greqp->gr_group.ss_family == AF_INET) {
   6319 			sin = (struct sockaddr_in *)&(greqp->gr_group);
   6320 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
   6321 		} else {
   6322 			if (!inet6)
   6323 				return (EINVAL);	/* Not on INET socket */
   6324 
   6325 			sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
   6326 			v6group = sin6->sin6_addr;
   6327 		}
   6328 		ifaddr = INADDR_ANY;
   6329 		ifindex = greqp->gr_interface;
   6330 	} else if (inet6) {
   6331 		v6_mreqp = (struct ipv6_mreq *)i1;
   6332 		v6group = v6_mreqp->ipv6mr_multiaddr;
   6333 		ifaddr = INADDR_ANY;
   6334 		ifindex = v6_mreqp->ipv6mr_interface;
   6335 	} else {
   6336 		v4_mreqp = (struct ip_mreq *)i1;
   6337 		IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
   6338 		ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
   6339 		ifindex = 0;
   6340 	}
   6341 
   6342 	/*
   6343 	 * In the multirouting case, we need to replicate
   6344 	 * the request on all interfaces that will take part
   6345 	 * in replication.  We do so because multirouting is
   6346 	 * reflective, thus we will probably receive multi-
   6347 	 * casts on those interfaces.
   6348 	 * The ip_multirt_apply_membership() succeeds if
   6349 	 * the operation succeeds on at least one interface.
   6350 	 */
   6351 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6352 		ipaddr_t group;
   6353 
   6354 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6355 
   6356 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6357 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6358 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6359 	} else {
   6360 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6361 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6362 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6363 	}
   6364 	if (ire != NULL) {
   6365 		if (ire->ire_flags & RTF_MULTIRT) {
   6366 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6367 			    checkonly, &v6group, fmode, &ipv6_all_zeros);
   6368 			done = B_TRUE;
   6369 		}
   6370 		ire_refrele(ire);
   6371 	}
   6372 
   6373 	if (!done) {
   6374 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6375 		    fmode, &ipv6_all_zeros);
   6376 	}
   6377 	return (error);
   6378 }
   6379 
   6380 /*
   6381  * Set socket options for joining and leaving multicast groups
   6382  * for specific sources.
   6383  * Common to IPv4 and IPv6; inet6 indicates the type of socket.
   6384  * The caller has already check that the option name is consistent with
   6385  * the address family of the socket.
   6386  */
   6387 int
   6388 ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
   6389     uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
   6390 {
   6391 	int		*i1 = (int *)invalp;
   6392 	int		error = 0;
   6393 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   6394 	struct ip_mreq_source *imreqp;
   6395 	struct group_source_req *gsreqp;
   6396 	in6_addr_t v6group, v6src;
   6397 	uint32_t ifindex;
   6398 	ipaddr_t ifaddr;
   6399 	boolean_t mcast_opt = B_TRUE;
   6400 	mcast_record_t fmode;
   6401 	ire_t *ire;
   6402 	boolean_t done = B_FALSE;
   6403 	int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
   6404 	    ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
   6405 
   6406 	switch (name) {
   6407 	case IP_BLOCK_SOURCE:
   6408 		mcast_opt = B_FALSE;
   6409 		/* FALLTHRU */
   6410 	case MCAST_BLOCK_SOURCE:
   6411 		fmode = MODE_IS_EXCLUDE;
   6412 		optfn = ip_opt_add_group;
   6413 		break;
   6414 
   6415 	case IP_UNBLOCK_SOURCE:
   6416 		mcast_opt = B_FALSE;
   6417 		/* FALLTHRU */
   6418 	case MCAST_UNBLOCK_SOURCE:
   6419 		fmode = MODE_IS_EXCLUDE;
   6420 		optfn = ip_opt_delete_group;
   6421 		break;
   6422 
   6423 	case IP_ADD_SOURCE_MEMBERSHIP:
   6424 		mcast_opt = B_FALSE;
   6425 		/* FALLTHRU */
   6426 	case MCAST_JOIN_SOURCE_GROUP:
   6427 		fmode = MODE_IS_INCLUDE;
   6428 		optfn = ip_opt_add_group;
   6429 		break;
   6430 
   6431 	case IP_DROP_SOURCE_MEMBERSHIP:
   6432 		mcast_opt = B_FALSE;
   6433 		/* FALLTHRU */
   6434 	case MCAST_LEAVE_SOURCE_GROUP:
   6435 		fmode = MODE_IS_INCLUDE;
   6436 		optfn = ip_opt_delete_group;
   6437 		break;
   6438 	default:
   6439 		ASSERT(0);
   6440 	}
   6441 
   6442 	if (mcast_opt) {
   6443 		gsreqp = (struct group_source_req *)i1;
   6444 		ifindex = gsreqp->gsr_interface;
   6445 		if (gsreqp->gsr_group.ss_family == AF_INET) {
   6446 			struct sockaddr_in *s;
   6447 			s = (struct sockaddr_in *)&gsreqp->gsr_group;
   6448 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
   6449 			s = (struct sockaddr_in *)&gsreqp->gsr_source;
   6450 			IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
   6451 		} else {
   6452 			struct sockaddr_in6 *s6;
   6453 
   6454 			if (!inet6)
   6455 				return (EINVAL);	/* Not on INET socket */
   6456 
   6457 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
   6458 			v6group = s6->sin6_addr;
   6459 			s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
   6460 			v6src = s6->sin6_addr;
   6461 		}
   6462 		ifaddr = INADDR_ANY;
   6463 	} else {
   6464 		imreqp = (struct ip_mreq_source *)i1;
   6465 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
   6466 		IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
   6467 		ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
   6468 		ifindex = 0;
   6469 	}
   6470 
   6471 	/*
   6472 	 * Handle src being mapped INADDR_ANY by changing it to unspecified.
   6473 	 */
   6474 	if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
   6475 		v6src = ipv6_all_zeros;
   6476 
   6477 	/*
   6478 	 * In the multirouting case, we need to replicate
   6479 	 * the request as noted in the mcast cases above.
   6480 	 */
   6481 	if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
   6482 		ipaddr_t group;
   6483 
   6484 		IN6_V4MAPPED_TO_IPADDR(&v6group, group);
   6485 
   6486 		ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
   6487 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6488 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6489 	} else {
   6490 		ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
   6491 		    IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
   6492 		    MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
   6493 	}
   6494 	if (ire != NULL) {
   6495 		if (ire->ire_flags & RTF_MULTIRT) {
   6496 			error = ip_multirt_apply_membership(optfn, ire, connp,
   6497 			    checkonly, &v6group, fmode, &v6src);
   6498 			done = B_TRUE;
   6499 		}
   6500 		ire_refrele(ire);
   6501 	}
   6502 	if (!done) {
   6503 		error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
   6504 		    fmode, &v6src);
   6505 	}
   6506 	return (error);
   6507 }
   6508 
   6509 /*
   6510  * Given a destination address and a pointer to where to put the information
   6511  * this routine fills in the mtuinfo.
   6512  * The socket must be connected.
   6513  * For sctp conn_faddr is the primary address.
   6514  */
   6515 int
   6516 ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
   6517 {
   6518 	uint32_t	pmtu = IP_MAXPACKET;
   6519 	uint_t		scopeid;
   6520 
   6521 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
   6522 		return (-1);
   6523 
   6524 	/* In case we never sent or called ip_set_destination_v4/v6 */
   6525 	if (ixa->ixa_ire != NULL)
   6526 		pmtu = ip_get_pmtu(ixa);
   6527 
   6528 	if (ixa->ixa_flags & IXAF_SCOPEID_SET)
   6529 		scopeid = ixa->ixa_scopeid;
   6530 	else
   6531 		scopeid = 0;
   6532 
   6533 	bzero(mtuinfo, sizeof (*mtuinfo));
   6534 	mtuinfo->ip6m_addr.sin6_family = AF_INET6;
   6535 	mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
   6536 	mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
   6537 	mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
   6538 	mtuinfo->ip6m_mtu = pmtu;
   6539 
   6540 	return (sizeof (struct ip6_mtuinfo));
   6541 }
   6542 
   6543 /*
   6544  * When the src multihoming is changed from weak to [strong, preferred]
   6545  * ip_ire_rebind_walker is called to walk the list of all ire_t entries
   6546  * and identify routes that were created by user-applications in the
   6547  * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
   6548  * currently defined. These routes are then 'rebound', i.e., their ire_ill
   6549  * is selected by finding an interface route for the gateway.
   6550  */
   6551 /* ARGSUSED */
   6552 void
   6553 ip_ire_rebind_walker(ire_t *ire, void *notused)
   6554 {
   6555 	if (!ire->ire_unbound || ire->ire_ill != NULL)
   6556 		return;
   6557 	ire_rebind(ire);
   6558 	ire_delete(ire);
   6559 }
   6560 
   6561 /*
   6562  * When the src multihoming is changed from  [strong, preferred] to weak,
   6563  * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
   6564  * set any entries that were created by user-applications in the unbound state
   6565  * (i.e., without RTA_IFP) back to having a NULL ire_ill.
   6566  */
   6567 /* ARGSUSED */
   6568 void
   6569 ip_ire_unbind_walker(ire_t *ire, void *notused)
   6570 {
   6571 	ire_t *new_ire;
   6572 
   6573 	if (!ire->ire_unbound || ire->ire_ill == NULL)
   6574 		return;
   6575 	if (ire->ire_ipversion == IPV6_VERSION) {
   6576 		new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
   6577 		    &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
   6578 		    ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
   6579 	} else {
   6580 		new_ire = ire_create((uchar_t *)&ire->ire_addr,
   6581 		    (uchar_t *)&ire->ire_mask,
   6582 		    (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
   6583 		    ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
   6584 	}
   6585 	if (new_ire == NULL)
   6586 		return;
   6587 	new_ire->ire_unbound = B_TRUE;
   6588 	/*
   6589 	 * The bound ire must first be deleted so that we don't return
   6590 	 * the existing one on the attempt to add the unbound new_ire.
   6591 	 */
   6592 	ire_delete(ire);
   6593 	new_ire = ire_add(new_ire);
   6594 	if (new_ire != NULL)
   6595 		ire_refrele(new_ire);
   6596 }
   6597 
   6598 /*
   6599  * When the settings of ip*_strict_src_multihoming tunables are changed,
   6600  * all cached routes need to be recomputed. This recomputation needs to be
   6601  * done when going from weaker to stronger modes so that the cached ire
   6602  * for the connection does not violate the current ip*_strict_src_multihoming
   6603  * setting. It also needs to be done when going from stronger to weaker modes,
   6604  * so that we fall back to matching on the longest-matching-route (as opposed
   6605  * to a shorter match that may have been selected in the strong mode
   6606  * to satisfy src_multihoming settings).
   6607  *
   6608  * The cached ixa_ire entires for all conn_t entries are marked as
   6609  * "verify" so that they will be recomputed for the next packet.
   6610  */
   6611 void
   6612 conn_ire_revalidate(conn_t *connp, void *arg)
   6613 {
   6614 	boolean_t isv6 = (boolean_t)arg;
   6615 
   6616 	if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
   6617 	    (!isv6 && connp->conn_ipversion != IPV4_VERSION))
   6618 		return;
   6619 	connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
   6620 }
   6621 
   6622 /*
   6623  * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases,
   6624  * When an ipf is passed here for the first time, if
   6625  * we already have in-order fragments on the queue, we convert from the fast-
   6626  * path reassembly scheme to the hard-case scheme.  From then on, additional
   6627  * fragments are reassembled here.  We keep track of the start and end offsets
   6628  * of each piece, and the number of holes in the chain.  When the hole count
   6629  * goes to zero, we are done!
   6630  *
   6631  * The ipf_count will be updated to account for any mblk(s) added (pointed to
   6632  * by mp) or subtracted (freeb()ed dups), upon return the caller must update
   6633  * ipfb_count and ill_frag_count by the difference of ipf_count before and
   6634  * after the call to ip_reassemble().
   6635  */
   6636 int
   6637 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
   6638     size_t msg_len)
   6639 {
   6640 	uint_t	end;
   6641 	mblk_t	*next_mp;
   6642 	mblk_t	*mp1;
   6643 	uint_t	offset;
   6644 	boolean_t incr_dups = B_TRUE;
   6645 	boolean_t offset_zero_seen = B_FALSE;
   6646 	boolean_t pkt_boundary_checked = B_FALSE;
   6647 
   6648 	/* If start == 0 then ipf_nf_hdr_len has to be set. */
   6649 	ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
   6650 
   6651 	/* Add in byte count */
   6652 	ipf->ipf_count += msg_len;
   6653 	if (ipf->ipf_end) {
   6654 		/*
   6655 		 * We were part way through in-order reassembly, but now there
   6656 		 * is a hole.  We walk through messages already queued, and
   6657 		 * mark them for hard case reassembly.  We know that up till
   6658 		 * now they were in order starting from offset zero.
   6659 		 */
   6660 		offset = 0;
   6661 		for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
   6662 			IP_REASS_SET_START(mp1, offset);
   6663 			if (offset == 0) {
   6664 				ASSERT(ipf->ipf_nf_hdr_len != 0);
   6665 				offset = -ipf->ipf_nf_hdr_len;
   6666 			}
   6667 			offset += mp1->b_wptr - mp1->b_rptr;
   6668 			IP_REASS_SET_END(mp1, offset);
   6669 		}
   6670 		/* One hole at the end. */
   6671 		ipf->ipf_hole_cnt = 1;
   6672 		/* Brand it as a hard case, forever. */
   6673 		ipf->ipf_end = 0;
   6674 	}
   6675 	/* Walk through all the new pieces. */
   6676 	do {
   6677 		end = start + (mp->b_wptr - mp->b_rptr);
   6678 		/*
   6679 		 * If start is 0, decrease 'end' only for the first mblk of
   6680 		 * the fragment. Otherwise 'end' can get wrong value in the
   6681 		 * second pass of the loop if first mblk is exactly the
   6682 		 * size of ipf_nf_hdr_len.
   6683 		 */
   6684 		if (start == 0 && !offset_zero_seen) {
   6685 			/* First segment */
   6686 			ASSERT(ipf->ipf_nf_hdr_len != 0);
   6687 			end -= ipf->ipf_nf_hdr_len;
   6688 			offset_zero_seen = B_TRUE;
   6689 		}
   6690 		next_mp = mp->b_cont;
   6691 		/*
   6692 		 * We are checking to see if there is any interesing data
   6693 		 * to process.  If there isn't and the mblk isn't the
   6694 		 * one which carries the unfragmentable header then we
   6695 		 * drop it.  It's possible to have just the unfragmentable
   6696 		 * header come through without any data.  That needs to be
   6697 		 * saved.
   6698 		 *
   6699 		 * If the assert at the top of this function holds then the
   6700 		 * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
   6701 		 * is infrequently traveled enough that the test is left in
   6702 		 * to protect against future code changes which break that
   6703 		 * invariant.
   6704 		 */
   6705 		if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
   6706 			/* Empty.  Blast it. */
   6707 			IP_REASS_SET_START(mp, 0);
   6708 			IP_REASS_SET_END(mp, 0);
   6709 			/*
   6710 			 * If the ipf points to the mblk we are about to free,
   6711 			 * update ipf to point to the next mblk (or NULL
   6712 			 * if none).
   6713 			 */
   6714 			if (ipf->ipf_mp->b_cont == mp)
   6715 				ipf->ipf_mp->b_cont = next_mp;
   6716 			freeb(mp);
   6717 			continue;
   6718 		}
   6719 		mp->b_cont = NULL;
   6720 		IP_REASS_SET_START(mp, start);
   6721 		IP_REASS_SET_END(mp, end);
   6722 		if (!ipf->ipf_tail_mp) {
   6723 			ipf->ipf_tail_mp = mp;
   6724 			ipf->ipf_mp->b_cont = mp;
   6725 			if (start == 0 || !more) {
   6726 				ipf->ipf_hole_cnt = 1;
   6727 				/*
   6728 				 * if the first fragment comes in more than one
   6729 				 * mblk, this loop will be executed for each
   6730 				 * mblk. Need to adjust hole count so exiting
   6731 				 * this routine will leave hole count at 1.
   6732 				 */
   6733 				if (next_mp)
   6734 					ipf->ipf_hole_cnt++;
   6735 			} else
   6736 				ipf->ipf_hole_cnt = 2;
   6737 			continue;
   6738 		} else if (ipf->ipf_last_frag_seen && !more &&
   6739 		    !pkt_boundary_checked) {
   6740 			/*
   6741 			 * We check datagram boundary only if this fragment
   6742 			 * claims to be the last fragment and we have seen a
   6743 			 * last fragment in the past too. We do this only
   6744 			 * once for a given fragment.
   6745 			 *
   6746 			 * start cannot be 0 here as fragments with start=0
   6747 			 * and MF=0 gets handled as a complete packet. These
   6748 			 * fragments should not reach here.
   6749 			 */
   6750 
   6751 			if (start + msgdsize(mp) !=
   6752 			    IP_REASS_END(ipf->ipf_tail_mp)) {
   6753 				/*
   6754 				 * We have two fragments both of which claim
   6755 				 * to be the last fragment but gives conflicting
   6756 				 * information about the whole datagram size.
   6757 				 * Something fishy is going on. Drop the
   6758 				 * fragment and free up the reassembly list.
   6759 				 */
   6760 				return (IP_REASS_FAILED);
   6761 			}
   6762 
   6763 			/*
   6764 			 * We shouldn't come to this code block again for this
   6765 			 * particular fragment.
   6766 			 */
   6767 			pkt_boundary_checked = B_TRUE;
   6768 		}
   6769 
   6770 		/* New stuff at or beyond tail? */
   6771 		offset = IP_REASS_END(ipf->ipf_tail_mp);
   6772 		if (start >= offset) {
   6773 			if (ipf->ipf_last_frag_seen) {
   6774 				/* current fragment is beyond last fragment */
   6775 				return (IP_REASS_FAILED);
   6776 			}
   6777 			/* Link it on end. */
   6778 			ipf->ipf_tail_mp->b_cont = mp;
   6779 			ipf->ipf_tail_mp = mp;
   6780 			if (more) {
   6781 				if (start != offset)
   6782 					ipf->ipf_hole_cnt++;
   6783 			} else if (start == offset && next_mp == NULL)
   6784 					ipf->ipf_hole_cnt--;
   6785 			continue;
   6786 		}
   6787 		mp1 = ipf->ipf_mp->b_cont;
   6788 		offset = IP_REASS_START(mp1);
   6789 		/* New stuff at the front? */
   6790 		if (start < offset) {
   6791 			if (start == 0) {
   6792 				if (end >= offset) {
   6793 					/* Nailed the hole at the begining. */
   6794 					ipf->ipf_hole_cnt--;
   6795 				}
   6796 			} else if (end < offset) {
   6797 				/*
   6798 				 * A hole, stuff, and a hole where there used
   6799 				 * to be just a hole.
   6800 				 */
   6801 				ipf->ipf_hole_cnt++;
   6802 			}
   6803 			mp->b_cont = mp1;
   6804 			/* Check for overlap. */
   6805 			while (end > offset) {
   6806 				if (end < IP_REASS_END(mp1)) {
   6807 					mp->b_wptr -= end - offset;
   6808 					IP_REASS_SET_END(mp, offset);
   6809 					BUMP_MIB(ill->ill_ip_mib,
   6810 					    ipIfStatsReasmPartDups);
   6811 					break;
   6812 				}
   6813 				/* Did we cover another hole? */
   6814 				if ((mp1->b_cont &&
   6815 				    IP_REASS_END(mp1) !=
   6816 				    IP_REASS_START(mp1->b_cont) &&
   6817 				    end >= IP_REASS_START(mp1->b_cont)) ||
   6818 				    (!ipf->ipf_last_frag_seen && !more)) {
   6819 					ipf->ipf_hole_cnt--;
   6820 				}
   6821 				/* Clip out mp1. */
   6822 				if ((mp->b_cont = mp1->b_cont) == NULL) {
   6823 					/*
   6824 					 * After clipping out mp1, this guy
   6825 					 * is now hanging off the end.
   6826 					 */
   6827 					ipf->ipf_tail_mp = mp;
   6828 				}
   6829 				IP_REASS_SET_START(mp1, 0);
   6830 				IP_REASS_SET_END(mp1, 0);
   6831 				/* Subtract byte count */
   6832 				ipf->ipf_count -= mp1->b_datap->db_lim -
   6833 				    mp1->b_datap->db_base;
   6834 				freeb(mp1);
   6835 				BUMP_MIB(ill->ill_ip_mib,
   6836 				    ipIfStatsReasmPartDups);
   6837 				mp1 = mp->b_cont;
   6838 				if (!mp1)
   6839 					break;
   6840 				offset = IP_REASS_START(mp1);
   6841 			}
   6842 			ipf->ipf_mp->b_cont = mp;
   6843 			continue;
   6844 		}
   6845 		/*
   6846 		 * The new piece starts somewhere between the start of the head
   6847 		 * and before the end of the tail.
   6848 		 */
   6849 		for (; mp1; mp1 = mp1->b_cont) {
   6850 			offset = IP_REASS_END(mp1);
   6851 			if (start < offset) {
   6852 				if (end <= offset) {
   6853 					/* Nothing new. */
   6854 					IP_REASS_SET_START(mp, 0);
   6855 					IP_REASS_SET_END(mp, 0);
   6856 					/* Subtract byte count */
   6857 					ipf->ipf_count -= mp->b_datap->db_lim -
   6858 					    mp->b_datap->db_base;
   6859 					if (incr_dups) {
   6860 						ipf->ipf_num_dups++;
   6861 						incr_dups = B_FALSE;
   6862 					}
   6863 					freeb(mp);
   6864 					BUMP_MIB(ill->ill_ip_mib,
   6865 					    ipIfStatsReasmDuplicates);
   6866 					break;
   6867 				}
   6868 				/*
   6869 				 * Trim redundant stuff off beginning of new
   6870 				 * piece.
   6871 				 */
   6872 				IP_REASS_SET_START(mp, offset);
   6873 				mp->b_rptr += offset - start;
   6874 				BUMP_MIB(ill->ill_ip_mib,
   6875 				    ipIfStatsReasmPartDups);
   6876 				start = offset;
   6877 				if (!mp1->b_cont) {
   6878 					/*
   6879 					 * After trimming, this guy is now
   6880 					 * hanging off the end.
   6881 					 */
   6882 					mp1->b_cont = mp;
   6883 					ipf->ipf_tail_mp = mp;
   6884 					if (!more) {
   6885 						ipf->ipf_hole_cnt--;
   6886 					}
   6887 					break;
   6888 				}
   6889 			}
   6890 			if (start >= IP_REASS_START(mp1->b_cont))
   6891 				continue;
   6892 			/* Fill a hole */
   6893 			if (start > offset)
   6894 				ipf->ipf_hole_cnt++;
   6895 			mp->b_cont = mp1->b_cont;
   6896 			mp1->b_cont = mp;
   6897 			mp1 = mp->b_cont;
   6898 			offset = IP_REASS_START(mp1);
   6899 			if (end >= offset) {
   6900 				ipf->ipf_hole_cnt--;
   6901 				/* Check for overlap. */
   6902 				while (end > offset) {
   6903 					if (end < IP_REASS_END(mp1)) {
   6904 						mp->b_wptr -= end - offset;
   6905 						IP_REASS_SET_END(mp, offset);
   6906 						/*
   6907 						 * TODO we might bump
   6908 						 * this up twice if there is
   6909 						 * overlap at both ends.
   6910 						 */
   6911 						BUMP_MIB(ill->ill_ip_mib,
   6912 						    ipIfStatsReasmPartDups);
   6913 						break;
   6914 					}
   6915 					/* Did we cover another hole? */
   6916 					if ((mp1->b_cont &&
   6917 					    IP_REASS_END(mp1)
   6918 					    != IP_REASS_START(mp1->b_cont) &&
   6919 					    end >=
   6920 					    IP_REASS_START(mp1->b_cont)) ||
   6921 					    (!ipf->ipf_last_frag_seen &&
   6922 					    !more)) {
   6923 						ipf->ipf_hole_cnt--;
   6924 					}
   6925 					/* Clip out mp1. */
   6926 					if ((mp->b_cont = mp1->b_cont) ==
   6927 					    NULL) {
   6928 						/*
   6929 						 * After clipping out mp1,
   6930 						 * this guy is now hanging
   6931 						 * off the end.
   6932 						 */
   6933 						ipf->ipf_tail_mp = mp;
   6934 					}
   6935 					IP_REASS_SET_START(mp1, 0);
   6936 					IP_REASS_SET_END(mp1, 0);
   6937 					/* Subtract byte count */
   6938 					ipf->ipf_count -=
   6939 					    mp1->b_datap->db_lim -
   6940 					    mp1->b_datap->db_base;
   6941 					freeb(mp1);
   6942 					BUMP_MIB(ill->ill_ip_mib,
   6943 					    ipIfStatsReasmPartDups);
   6944 					mp1 = mp->b_cont;
   6945 					if (!mp1)
   6946 						break;
   6947 					offset = IP_REASS_START(mp1);
   6948 				}
   6949 			}
   6950 			break;
   6951 		}
   6952 	} while (start = end, mp = next_mp);
   6953 
   6954 	/* Fragment just processed could be the last one. Remember this fact */
   6955 	if (!more)
   6956 		ipf->ipf_last_frag_seen = B_TRUE;
   6957 
   6958 	/* Still got holes? */
   6959 	if (ipf->ipf_hole_cnt)
   6960 		return (IP_REASS_PARTIAL);
   6961 	/* Clean up overloaded fields to avoid upstream disasters. */
   6962 	for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
   6963 		IP_REASS_SET_START(mp1, 0);
   6964 		IP_REASS_SET_END(mp1, 0);
   6965 	}
   6966 	return (IP_REASS_COMPLETE);
   6967 }
   6968 
   6969 /*
   6970  * Fragmentation reassembly.  Each ILL has a hash table for
   6971  * queuing packets undergoing reassembly for all IPIFs
   6972  * associated with the ILL.  The hash is based on the packet
   6973  * IP ident field.  The ILL frag hash table was allocated
   6974  * as a timer block at the time the ILL was created.  Whenever
   6975  * there is anything on the reassembly queue, the timer will
   6976  * be running.  Returns the reassembled packet if reassembly completes.
         *
         * Arguments:
         *	mp	- the received IPv4 packet; it is consumed (queued or
         *		  freed) on every path that returns NULL.
         *	ipha	- IPv4 header of mp.
         *	ira	- receive attributes; ira_pktlen and ira_ip_hdr_length
         *		  are rewritten when a datagram is completed.
         *
         * Returns mp untouched when the header carries neither the MF flag
         * nor a fragment offset, the reassembled datagram when this fragment
         * completes it, and NULL otherwise (fragment queued or dropped).
   6977  */
   6978 mblk_t *
   6979 ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   6980 {
   6981 	uint32_t	frag_offset_flags;
   6982 	mblk_t		*t_mp;
   6983 	ipaddr_t	dst;
   6984 	uint8_t		proto = ipha->ipha_protocol;
   6985 	uint32_t	sum_val;
   6986 	uint16_t	sum_flags;
   6987 	ipf_t		*ipf;
   6988 	ipf_t		**ipfp;
   6989 	ipfb_t		*ipfb;
   6990 	uint16_t	ident;
   6991 	uint32_t	offset;
   6992 	ipaddr_t	src;
   6993 	uint_t		hdr_length;
   6994 	uint32_t	end;
   6995 	mblk_t		*mp1;
   6996 	mblk_t		*tail_mp;
   6997 	size_t		count;
   6998 	size_t		msg_len;
   6999 	uint8_t		ecn_info = 0;
   7000 	uint32_t	packet_size;
   7001 	boolean_t	pruned = B_FALSE;
   7002 	ill_t		*ill = ira->ira_ill;
   7003 	ip_stack_t	*ipst = ill->ill_ipst;
   7004 
   7005 	/*
   7006 	 * Drop the fragmented as early as possible, if
   7007 	 * we don't have resource(s) to re-assemble.
   7008 	 */
   7009 	if (ipst->ips_ip_reass_queue_bytes == 0) {
   7010 		freemsg(mp);
   7011 		return (NULL);
   7012 	}
   7013 
   7014 	/* Check for fragmentation offset; return if there's none */
   7015 	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
   7016 	    (IPH_MF | IPH_OFFSET)) == 0)
   7017 		return (mp);
   7018 
   7019 	/*
   7020 	 * We utilize hardware computed checksum info only for UDP since
   7021 	 * IP fragmentation is a normal occurrence for the protocol.  In
   7022 	 * addition, checksum offload support for IP fragments carrying
   7023 	 * UDP payload is commonly implemented across network adapters.
   7024 	 */
   7025 	ASSERT(ira->ira_rill != NULL);
   7026 	if (proto == IPPROTO_UDP && dohwcksum &&
   7027 	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
   7028 	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
        		/* NOTE(review): this mp1 shadows the function-scope mp1. */
   7029 		mblk_t *mp1 = mp->b_cont;
   7030 		int32_t len;
   7031 
   7032 		/* Record checksum information from the packet */
   7033 		sum_val = (uint32_t)DB_CKSUM16(mp);
   7034 		sum_flags = DB_CKSUMFLAGS(mp);
   7035 
   7036 		/* IP payload offset from beginning of mblk */
   7037 		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
   7038 
   7039 		if ((sum_flags & HCK_PARTIALCKSUM) &&
   7040 		    (mp1 == NULL || mp1->b_cont == NULL) &&
   7041 		    offset >= DB_CKSUMSTART(mp) &&
   7042 		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
   7043 			uint32_t adj;
   7044 			/*
   7045 			 * Partial checksum has been calculated by hardware
   7046 			 * and attached to the packet; in addition, any
   7047 			 * prepended extraneous data is even byte aligned.
   7048 			 * If any such data exists, we adjust the checksum;
   7049 			 * this would also handle any postpended data.
   7050 			 */
   7051 			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
   7052 			    mp, mp1, len, adj);
   7053 
   7054 			/* One's complement subtract extraneous checksum */
   7055 			if (adj >= sum_val)
   7056 				sum_val = ~(adj - sum_val) & 0xFFFF;
   7057 			else
   7058 				sum_val -= adj;
   7059 		}
   7060 	} else {
   7061 		sum_val = 0;
   7062 		sum_flags = 0;
   7063 	}
   7064 
   7065 	/* Clear hardware checksumming flag */
   7066 	DB_CKSUMFLAGS(mp) = 0;
   7067 
   7068 	ident = ipha->ipha_ident;
        	/*
        	 * From here on offset is the fragment offset in bytes: the header
        	 * field is shifted left by 3 and the result is masked to 16 bits.
        	 * NOTE(review): presumably the mask discards the flag bits that the
        	 * shift moved above bit 15 - confirm against IPH_MF.
        	 */
   7069 	offset = (frag_offset_flags << 3) & 0xFFFF;
   7070 	src = ipha->ipha_src;
   7071 	dst = ipha->ipha_dst;
   7072 	hdr_length = IPH_HDR_LENGTH(ipha);
        	/* For now end is just the payload length of this fragment. */
   7073 	end = ntohs(ipha->ipha_length) - hdr_length;
   7074 
   7075 	/* If end == 0 then we have a packet with no data, so just free it */
   7076 	if (end == 0) {
   7077 		freemsg(mp);
   7078 		return (NULL);
   7079 	}
   7080 
   7081 	/* Record the ECN field info. */
   7082 	ecn_info = (ipha->ipha_type_of_service & 0x3);
   7083 	if (offset != 0) {
   7084 		/*
   7085 		 * If this isn't the first piece, strip the header, and
   7086 		 * add the offset to the end value.
   7087 		 */
   7088 		mp->b_rptr += hdr_length;
   7089 		end += offset;
   7090 	}
   7091 
        	/*
        	 * Compute msg_len, the byte count charged against the reassembly
        	 * limits: MBLKSIZE of every mblk in the chain whose data block
        	 * reference count is at most 2.  The walk also leaves tail_mp on
        	 * the last mblk of the fragment.
        	 */
   7092 	/* Handle vnic loopback of fragments */
   7093 	if (mp->b_datap->db_ref > 2)
   7094 		msg_len = 0;
   7095 	else
   7096 		msg_len = MBLKSIZE(mp);
   7097 
   7098 	tail_mp = mp;
   7099 	while (tail_mp->b_cont != NULL) {
   7100 		tail_mp = tail_mp->b_cont;
   7101 		if (tail_mp->b_datap->db_ref <= 2)
   7102 			msg_len += MBLKSIZE(tail_mp);
   7103 	}
   7104 
   7105 	/* If the reassembly list for this ILL will get too big, prune it */
   7106 	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
   7107 	    ipst->ips_ip_reass_queue_bytes) {
   7108 		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
   7109 		    uint_t, ill->ill_frag_count,
   7110 		    uint_t, ipst->ips_ip_reass_queue_bytes);
   7111 		ill_frag_prune(ill,
   7112 		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
   7113 		    (ipst->ips_ip_reass_queue_bytes - msg_len));
   7114 		pruned = B_TRUE;
   7115 	}
   7116 
        	/* Everything below up to the unlock runs under the bucket lock. */
   7117 	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
   7118 	mutex_enter(&ipfb->ipfb_lock);
   7119 
   7120 	ipfp = &ipfb->ipfb_ipf;
   7121 	/* Try to find an existing fragment queue for this packet. */
   7122 	for (;;) {
   7123 		ipf = ipfp[0];
   7124 		if (ipf != NULL) {
   7125 			/*
   7126 			 * It has to match on ident and src/dst address.
   7127 			 */
   7128 			if (ipf->ipf_ident == ident &&
   7129 			    ipf->ipf_src == src &&
   7130 			    ipf->ipf_dst == dst &&
   7131 			    ipf->ipf_protocol == proto) {
   7132 				/*
   7133 				 * If we have received too many
   7134 				 * duplicate fragments for this packet
   7135 				 * free it.
   7136 				 */
   7137 				if (ipf->ipf_num_dups > ip_max_frag_dups) {
   7138 					ill_frag_free_pkts(ill, ipfb, ipf, 1);
   7139 					freemsg(mp);
   7140 					mutex_exit(&ipfb->ipfb_lock);
   7141 					return (NULL);
   7142 				}
   7143 				/* Found it. */
   7144 				break;
   7145 			}
   7146 			ipfp = &ipf->ipf_hash_next;
   7147 			continue;
   7148 		}
   7149 
        		/*
        		 * End of the bucket list without a match: ipfp now points at
        		 * the link where a new entry gets appended.
        		 */
   7150 		/*
   7151 		 * If we pruned the list, do we want to store this new
   7152 		 * fragment?. We apply an optimization here based on the
   7153 		 * fact that most fragments will be received in order.
   7154 		 * So if the offset of this incoming fragment is zero,
   7155 		 * it is the first fragment of a new packet. We will
   7156 		 * keep it.  Otherwise drop the fragment, as we have
   7157 		 * probably pruned the packet already (since the
   7158 		 * packet cannot be found).
   7159 		 */
   7160 		if (pruned && offset != 0) {
   7161 			mutex_exit(&ipfb->ipfb_lock);
   7162 			freemsg(mp);
   7163 			return (NULL);
   7164 		}
   7165 
        		/*
        		 * NOTE(review): the entry freed below is the head of the
        		 * bucket list, while ipfp points at the link of its last
        		 * entry.  If the list held a single entry, ipfp would point
        		 * into the freed ipf - confirm that MAX_FRAG_PKTS rules
        		 * this out.
        		 */
   7166 		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
   7167 			/*
   7168 			 * Too many fragmented packets in this hash
   7169 			 * bucket. Free the oldest.
   7170 			 */
   7171 			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
   7172 		}
   7173 
   7174 		/* New guy.  Allocate a frag message. */
   7175 		mp1 = allocb(sizeof (*ipf), BPRI_MED);
   7176 		if (mp1 == NULL) {
   7177 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7178 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7179 			freemsg(mp);
        			/*
        			 * Shared exit for every path that returns NULL while
        			 * still holding the bucket lock; reached by goto from
        			 * the code further down as well.
        			 */
   7180 reass_done:
   7181 			mutex_exit(&ipfb->ipfb_lock);
   7182 			return (NULL);
   7183 		}
   7184 
   7185 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
        		/* The ipf_t lives in mp1, which heads the queued chain. */
   7186 		mp1->b_cont = mp;
   7187 
   7188 		/* Initialize the fragment header. */
   7189 		ipf = (ipf_t *)mp1->b_rptr;
   7190 		ipf->ipf_mp = mp1;
   7191 		ipf->ipf_ptphn = ipfp;
   7192 		ipfp[0] = ipf;
   7193 		ipf->ipf_hash_next = NULL;
   7194 		ipf->ipf_ident = ident;
   7195 		ipf->ipf_protocol = proto;
   7196 		ipf->ipf_src = src;
   7197 		ipf->ipf_dst = dst;
   7198 		ipf->ipf_nf_hdr_len = 0;
   7199 		/* Record reassembly start time. */
   7200 		ipf->ipf_timestamp = gethrestime_sec();
   7201 		/* Record ipf generation and account for frag header */
   7202 		ipf->ipf_gen = ill->ill_ipf_gen++;
   7203 		ipf->ipf_count = MBLKSIZE(mp1);
   7204 		ipf->ipf_last_frag_seen = B_FALSE;
   7205 		ipf->ipf_ecn = ecn_info;
   7206 		ipf->ipf_num_dups = 0;
   7207 		ipfb->ipfb_frag_pkts++;
   7208 		ipf->ipf_checksum = 0;
   7209 		ipf->ipf_checksum_flags = 0;
   7210 
   7211 		/* Store checksum value in fragment header */
   7212 		if (sum_flags != 0) {
        			/* Fold the 32-bit sum into 16 bits, twice for the carry. */
   7213 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7214 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7215 			ipf->ipf_checksum = sum_val;
   7216 			ipf->ipf_checksum_flags = sum_flags;
   7217 		}
   7218 
   7219 		/*
   7220 		 * We handle reassembly two ways.  In the easy case,
   7221 		 * where all the fragments show up in order, we do
   7222 		 * minimal bookkeeping, and just clip new pieces on
   7223 		 * the end.  If we ever see a hole, then we go off
   7224 		 * to ip_reassemble which has to mark the pieces and
   7225 		 * keep track of the number of holes, etc.  Obviously,
   7226 		 * the point of having both mechanisms is so we can
   7227 		 * handle the easy case as efficiently as possible.
   7228 		 */
   7229 		if (offset == 0) {
   7230 			/* Easy case, in-order reassembly so far. */
   7231 			ipf->ipf_count += msg_len;
   7232 			ipf->ipf_tail_mp = tail_mp;
   7233 			/*
   7234 			 * Keep track of next expected offset in
   7235 			 * ipf_end.
   7236 			 */
   7237 			ipf->ipf_end = end;
   7238 			ipf->ipf_nf_hdr_len = hdr_length;
   7239 		} else {
   7240 			/* Hard case, hole at the beginning. */
   7241 			ipf->ipf_tail_mp = NULL;
   7242 			/*
   7243 			 * ipf_end == 0 means that we have given up
   7244 			 * on easy reassembly.
   7245 			 */
   7246 			ipf->ipf_end = 0;
   7247 
   7248 			/* Forget checksum offload from now on */
   7249 			ipf->ipf_checksum_flags = 0;
   7250 
   7251 			/*
   7252 			 * ipf_hole_cnt is set by ip_reassemble.
   7253 			 * ipf_count is updated by ip_reassemble.
   7254 			 * No need to check for return value here
   7255 			 * as we don't expect reassembly to complete
   7256 			 * or fail for the first fragment itself.
   7257 			 */
   7258 			(void) ip_reassemble(mp, ipf,
   7259 			    (frag_offset_flags & IPH_OFFSET) << 3,
   7260 			    (frag_offset_flags & IPH_MF), ill, msg_len);
   7261 		}
   7262 		/* Update per ipfb and ill byte counts */
   7263 		ipfb->ipfb_count += ipf->ipf_count;
   7264 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7265 		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
   7266 		/* If the frag timer wasn't already going, start it. */
   7267 		mutex_enter(&ill->ill_lock);
   7268 		ill_frag_timer_start(ill);
   7269 		mutex_exit(&ill->ill_lock);
   7270 		goto reass_done;
   7271 	}
   7272 
        	/* Only reached via the break above: ipf is the matching entry. */
   7273 	/*
   7274 	 * If the packet's flag has changed (it could be coming up
   7275 	 * from an interface different than the previous, therefore
   7276 	 * possibly different checksum capability), then forget about
   7277 	 * any stored checksum states.  Otherwise add the value to
   7278 	 * the existing one stored in the fragment header.
   7279 	 */
   7280 	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
   7281 		sum_val += ipf->ipf_checksum;
   7282 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7283 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
   7284 		ipf->ipf_checksum = sum_val;
   7285 	} else if (ipf->ipf_checksum_flags != 0) {
   7286 		/* Forget checksum offload from now on */
   7287 		ipf->ipf_checksum_flags = 0;
   7288 	}
   7289 
   7290 	/*
   7291 	 * We have a new piece of a datagram which is already being
   7292 	 * reassembled.  Update the ECN info if all IP fragments
   7293 	 * are ECN capable.  If there is one which is not, clear
   7294 	 * all the info.  If there is at least one which has CE
   7295 	 * code point, IP needs to report that up to transport.
   7296 	 */
   7297 	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
   7298 		if (ecn_info == IPH_ECN_CE)
   7299 			ipf->ipf_ecn = IPH_ECN_CE;
   7300 	} else {
   7301 		ipf->ipf_ecn = IPH_ECN_NECT;
   7302 	}
        	/*
        	 * Fast path: still in in-order mode (ipf_end is non-zero) and this
        	 * fragment starts exactly where the queued data ends.
        	 */
   7303 	if (offset && ipf->ipf_end == offset) {
   7304 		/* The new fragment fits at the end */
   7305 		ipf->ipf_tail_mp->b_cont = mp;
   7306 		/* Update the byte count */
   7307 		ipf->ipf_count += msg_len;
   7308 		/* Update per ipfb and ill byte counts */
   7309 		ipfb->ipfb_count += msg_len;
   7310 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
   7311 		atomic_add_32(&ill->ill_frag_count, msg_len);
   7312 		if (frag_offset_flags & IPH_MF) {
   7313 			/* More to come. */
   7314 			ipf->ipf_end = end;
   7315 			ipf->ipf_tail_mp = tail_mp;
   7316 			goto reass_done;
   7317 		}
        		/* MF clear: the datagram is complete, fall through. */
   7318 	} else {
   7319 		/* Go do the hard cases. */
   7320 		int ret;
   7321 
   7322 		if (offset == 0)
   7323 			ipf->ipf_nf_hdr_len = hdr_length;
   7324 
   7325 		/* Save current byte count */
   7326 		count = ipf->ipf_count;
   7327 		ret = ip_reassemble(mp, ipf,
   7328 		    (frag_offset_flags & IPH_OFFSET) << 3,
   7329 		    (frag_offset_flags & IPH_MF), ill, msg_len);
   7330 		/* Count of bytes added and subtracted (freeb()ed) */
   7331 		count = ipf->ipf_count - count;
   7332 		if (count) {
   7333 			/* Update per ipfb and ill byte counts */
   7334 			ipfb->ipfb_count += count;
   7335 			ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
   7336 			atomic_add_32(&ill->ill_frag_count, count);
   7337 		}
   7338 		if (ret == IP_REASS_PARTIAL) {
   7339 			goto reass_done;
   7340 		} else if (ret == IP_REASS_FAILED) {
   7341 			/* Reassembly failed. Free up all resources */
   7342 			ill_frag_free_pkts(ill, ipfb, ipf, 1);
        			/*
        			 * NOTE(review): this walk and the freemsg() assume
        			 * that mp is still the head of whatever part of the
        			 * fragment ip_reassemble did not queue or free -
        			 * confirm for fragments that span several mblks.
        			 */
   7343 			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
   7344 				IP_REASS_SET_START(t_mp, 0);
   7345 				IP_REASS_SET_END(t_mp, 0);
   7346 			}
   7347 			freemsg(mp);
   7348 			goto reass_done;
   7349 		}
   7350 		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
   7351 	}
   7352 	/*
   7353 	 * We have completed reassembly.  Unhook the frag header from
   7354 	 * the reassembly list.
   7355 	 *
   7356 	 * Before we free the frag header, record the ECN info
   7357 	 * to report back to the transport.
   7358 	 */
   7359 	ecn_info = ipf->ipf_ecn;
   7360 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
   7361 	ipfp = ipf->ipf_ptphn;
   7362 
   7363 	/* We need to supply these to caller */
   7364 	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
   7365 		sum_val = ipf->ipf_checksum;
   7366 	else
   7367 		sum_val = 0;
   7368 
        	/* Unlink ipf from the bucket list and give back its byte count. */
   7369 	mp1 = ipf->ipf_mp;
   7370 	count = ipf->ipf_count;
   7371 	ipf = ipf->ipf_hash_next;
   7372 	if (ipf != NULL)
   7373 		ipf->ipf_ptphn = ipfp;
   7374 	ipfp[0] = ipf;
   7375 	atomic_add_32(&ill->ill_frag_count, -count);
   7376 	ASSERT(ipfb->ipfb_count >= count);
   7377 	ipfb->ipfb_count -= count;
   7378 	ipfb->ipfb_frag_pkts--;
   7379 	mutex_exit(&ipfb->ipfb_lock);
   7380 	/* Ditch the frag header. */
   7381 	mp = mp1->b_cont;
   7382 
   7383 	freeb(mp1);
   7384 
   7385 	/* Restore original IP length in header. */
   7386 	packet_size = (uint32_t)msgdsize(mp);
   7387 	if (packet_size > IP_MAXPACKET) {
   7388 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7389 		ip_drop_input("Reassembled packet too large", mp, ill);
   7390 		freemsg(mp);
   7391 		return (NULL);
   7392 	}
   7393 
        	/*
        	 * The header is rewritten below, so when the first data block has
        	 * more than one reference continue with a private copy of the
        	 * message instead.
        	 */
   7394 	if (DB_REF(mp) > 1) {
   7395 		mblk_t *mp2 = copymsg(mp);
   7396 
   7397 		if (mp2 == NULL) {
   7398 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7399 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7400 			freemsg(mp);
   7401 			return (NULL);
   7402 		}
   7403 		freemsg(mp);
   7404 		mp = mp2;
   7405 	}
   7406 	ipha = (ipha_t *)mp->b_rptr;
   7407 
   7408 	ipha->ipha_length = htons((uint16_t)packet_size);
   7409 	/* We're now complete, zip the frag state */
   7410 	ipha->ipha_fragment_offset_and_flags = 0;
   7411 	/* Record the ECN info. */
   7412 	ipha->ipha_type_of_service &= 0xFC;
   7413 	ipha->ipha_type_of_service |= ecn_info;
   7414 
   7415 	/* Update the receive attributes */
   7416 	ira->ira_pktlen = packet_size;
   7417 	ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   7418 
   7419 	/* Reassembly is successful; set checksum information in packet */
   7420 	DB_CKSUM16(mp) = (uint16_t)sum_val;
   7421 	DB_CKSUMFLAGS(mp) = sum_flags;
   7422 	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
   7423 
   7424 	return (mp);
   7425 }
   7426 
   7427 /*
   7428  * Pullup function that should be used for IP input in order to
   7429  * ensure we do not loose the L2 source address; we need the l2 source
   7430  * address for IP_RECVSLLA and for ndp_input.
   7431  *
   7432  * We return either NULL or b_rptr.
   7433  */
   7434 void *
   7435 ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
   7436 {
   7437 	ill_t		*ill = ira->ira_ill;
   7438 
   7439 	if (ip_rput_pullups++ == 0) {
   7440 		(void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
   7441 		    "ip_pullup: %s forced us to "
   7442 		    " pullup pkt, hdr len %ld, hdr addr %p",
   7443 		    ill->ill_name, len, (void *)mp->b_rptr);
   7444 	}
   7445 	if (!(ira->ira_flags & IRAF_L2SRC_SET))
   7446 		ip_setl2src(mp, ira, ira->ira_rill);
   7447 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   7448 	if (!pullupmsg(mp, len))
   7449 		return (NULL);
   7450 	else
   7451 		return (mp->b_rptr);
   7452 }
   7453 
   7454 /*
   7455  * Make sure ira_l2src has an address. If we don't have one fill with zeros.
   7456  * When called from the ULP ira_rill will be NULL hence the caller has to
   7457  * pass in the ill.
   7458  */
   7459 /* ARGSUSED */
   7460 void
   7461 ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
   7462 {
   7463 	const uchar_t *addr;
   7464 	int alen;
   7465 
   7466 	if (ira->ira_flags & IRAF_L2SRC_SET)
   7467 		return;
   7468 
   7469 	ASSERT(ill != NULL);
   7470 	alen = ill->ill_phys_addr_length;
   7471 	ASSERT(alen <= sizeof (ira->ira_l2src));
   7472 	if (ira->ira_mhip != NULL &&
   7473 	    (addr = ira->ira_mhip->mhi_saddr) != NULL) {
   7474 		bcopy(addr, ira->ira_l2src, alen);
   7475 	} else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
   7476 	    (addr = ill->ill_phys_addr) != NULL) {
   7477 		bcopy(addr, ira->ira_l2src, alen);
   7478 	} else {
   7479 		bzero(ira->ira_l2src, alen);
   7480 	}
   7481 	ira->ira_flags |= IRAF_L2SRC_SET;
   7482 }
   7483 
   7484 /*
   7485  * check ip header length and align it.
   7486  */
   7487 mblk_t *
   7488 ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
   7489 {
   7490 	ill_t	*ill = ira->ira_ill;
   7491 	ssize_t len;
   7492 
   7493 	len = MBLKL(mp);
   7494 
   7495 	if (!OK_32PTR(mp->b_rptr))
   7496 		IP_STAT(ill->ill_ipst, ip_notaligned);
   7497 	else
   7498 		IP_STAT(ill->ill_ipst, ip_recv_pullup);
   7499 
   7500 	/* Guard against bogus device drivers */
   7501 	if (len < 0) {
   7502 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7503 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7504 		freemsg(mp);
   7505 		return (NULL);
   7506 	}
   7507 
   7508 	if (len == 0) {
   7509 		/* GLD sometimes sends up mblk with b_rptr == b_wptr! */
   7510 		mblk_t *mp1 = mp->b_cont;
   7511 
   7512 		if (!(ira->ira_flags & IRAF_L2SRC_SET))
   7513 			ip_setl2src(mp, ira, ira->ira_rill);
   7514 		ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   7515 
   7516 		freeb(mp);
   7517 		mp = mp1;
   7518 		if (mp == NULL)
   7519 			return (NULL);
   7520 
   7521 		if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
   7522 			return (mp);
   7523 	}
   7524 	if (ip_pullup(mp, min_size, ira) == NULL) {
   7525 		if (msgdsize(mp) < min_size) {
   7526 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7527 			ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7528 		} else {
   7529 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7530 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7531 		}
   7532 		freemsg(mp);
   7533 		return (NULL);
   7534 	}
   7535 	return (mp);
   7536 }
   7537 
   7538 /*
   7539  * Common code for IPv4 and IPv6 to check and pullup multi-mblks
   7540  */
   7541 mblk_t *
   7542 ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len,	uint_t pkt_len,
   7543     uint_t min_size, ip_recv_attr_t *ira)
   7544 {
   7545 	ill_t	*ill = ira->ira_ill;
   7546 
   7547 	/*
   7548 	 * Make sure we have data length consistent
   7549 	 * with the IP header.
   7550 	 */
   7551 	if (mp->b_cont == NULL) {
   7552 		/* pkt_len is based on ipha_len, not the mblk length */
   7553 		if (pkt_len < min_size) {
   7554 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7555 			ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7556 			freemsg(mp);
   7557 			return (NULL);
   7558 		}
   7559 		if (len < 0) {
   7560 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   7561 			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   7562 			freemsg(mp);
   7563 			return (NULL);
   7564 		}
   7565 		/* Drop any pad */
   7566 		mp->b_wptr = rptr + pkt_len;
   7567 	} else if ((len += msgdsize(mp->b_cont)) != 0) {
   7568 		ASSERT(pkt_len >= min_size);
   7569 		if (pkt_len < min_size) {
   7570 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7571 			ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7572 			freemsg(mp);
   7573 			return (NULL);
   7574 		}
   7575 		if (len < 0) {
   7576 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   7577 			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   7578 			freemsg(mp);
   7579 			return (NULL);
   7580 		}
   7581 		/* Drop any pad */
   7582 		(void) adjmsg(mp, -len);
   7583 		/*
   7584 		 * adjmsg may have freed an mblk from the chain, hence
   7585 		 * invalidate any hw checksum here. This will force IP to
   7586 		 * calculate the checksum in sw, but only for this packet.
   7587 		 */
   7588 		DB_CKSUMFLAGS(mp) = 0;
   7589 		IP_STAT(ill->ill_ipst, ip_multimblk);
   7590 	}
   7591 	return (mp);
   7592 }
   7593 
   7594 /*
   7595  * Check that the IPv4 opt_len is consistent with the packet and pullup
   7596  * the options.
   7597  */
   7598 mblk_t *
   7599 ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
   7600     ip_recv_attr_t *ira)
   7601 {
   7602 	ill_t	*ill = ira->ira_ill;
   7603 	ssize_t len;
   7604 
   7605 	/* Assume no IPv6 packets arrive over the IPv4 queue */
   7606 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
   7607 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7608 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
   7609 		ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
   7610 		freemsg(mp);
   7611 		return (NULL);
   7612 	}
   7613 
   7614 	if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
   7615 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7616 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7617 		freemsg(mp);
   7618 		return (NULL);
   7619 	}
   7620 	/*
   7621 	 * Recompute complete header length and make sure we
   7622 	 * have access to all of it.
   7623 	 */
   7624 	len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
   7625 	if (len > (mp->b_wptr - mp->b_rptr)) {
   7626 		if (len > pkt_len) {
   7627 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
   7628 			ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
   7629 			freemsg(mp);
   7630 			return (NULL);
   7631 		}
   7632 		if (ip_pullup(mp, len, ira) == NULL) {
   7633 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   7634 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   7635 			freemsg(mp);
   7636 			return (NULL);
   7637 		}
   7638 	}
   7639 	return (mp);
   7640 }
   7641 
   7642 /*
   7643  * Returns a new ire, or the same ire, or NULL.
   7644  * If a different IRE is returned, then it is held; the caller
   7645  * needs to release it.
   7646  * In no case is there any hold/release on the ire argument.
   7647  */
   7648 ire_t *
   7649 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
   7650 {
   7651 	ire_t		*new_ire;
   7652 	ill_t		*ire_ill;
   7653 	uint_t		ifindex;
   7654 	ip_stack_t	*ipst = ill->ill_ipst;
   7655 	boolean_t	strict_check = B_FALSE;
   7656 
   7657 	/*
   7658 	 * IPMP common case: if IRE and ILL are in the same group, there's no
   7659 	 * issue (e.g. packet received on an underlying interface matched an
   7660 	 * IRE_LOCAL on its associated group interface).
   7661 	 */
   7662 	ASSERT(ire->ire_ill != NULL);
   7663 	if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
   7664 		return (ire);
   7665 
   7666 	/*
   7667 	 * Do another ire lookup here, using the ingress ill, to see if the
   7668 	 * interface is in a usesrc group.
   7669 	 * As long as the ills belong to the same group, we don't consider
   7670 	 * them to be arriving on the wrong interface. Thus, if the switch
   7671 	 * is doing inbound load spreading, we won't drop packets when the
   7672 	 * ip*_strict_dst_multihoming switch is on.
   7673 	 * We also need to check for IPIF_UNNUMBERED point2point interfaces
   7674 	 * where the local address may not be unique. In this case we were
   7675 	 * at the mercy of the initial ire lookup and the IRE_LOCAL it
   7676 	 * actually returned. The new lookup, which is more specific, should
   7677 	 * only find the IRE_LOCAL associated with the ingress ill if one
   7678 	 * exists.
   7679 	 */
   7680 	if (ire->ire_ipversion == IPV4_VERSION) {
   7681 		if (ipst->ips_ip_strict_dst_multihoming)
   7682 			strict_check = B_TRUE;
   7683 		new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
   7684 		    IRE_LOCAL, ill, ALL_ZONES, NULL,
   7685 		    (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
   7686 	} else {
   7687 		ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
   7688 		if (ipst->ips_ipv6_strict_dst_multihoming)
   7689 			strict_check = B_TRUE;
   7690 		new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
   7691 		    IRE_LOCAL, ill, ALL_ZONES, NULL,
   7692 		    (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
   7693 	}
   7694 	/*
   7695 	 * If the same ire that was returned in ip_input() is found then this
   7696 	 * is an indication that usesrc groups are in use. The packet
   7697 	 * arrived on a different ill in the group than the one associated with
   7698 	 * the destination address.  If a different ire was found then the same
   7699 	 * IP address must be hosted on multiple ills. This is possible with
   7700 	 * unnumbered point2point interfaces. We switch to use this new ire in
   7701 	 * order to have accurate interface statistics.
   7702 	 */
   7703 	if (new_ire != NULL) {
   7704 		/* Note: held in one case but not the other? Caller handles */
   7705 		if (new_ire != ire)
   7706 			return (new_ire);
   7707 		/* Unchanged */
   7708 		ire_refrele(new_ire);
   7709 		return (ire);
   7710 	}
   7711 
   7712 	/*
   7713 	 * Chase pointers once and store locally.
   7714 	 */
   7715 	ASSERT(ire->ire_ill != NULL);
   7716 	ire_ill = ire->ire_ill;
   7717 	ifindex = ill->ill_usesrc_ifindex;
   7718 
   7719 	/*
   7720 	 * Check if it's a legal address on the 'usesrc' interface.
   7721 	 * For IPMP data addresses the IRE_LOCAL is the upper, hence we
   7722 	 * can just check phyint_ifindex.
   7723 	 */
   7724 	if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
   7725 		return (ire);
   7726 	}
   7727 
   7728 	/*
   7729 	 * If the ip*_strict_dst_multihoming switch is on then we can
   7730 	 * only accept this packet if the interface is marked as routing.
   7731 	 */
   7732 	if (!(strict_check))
   7733 		return (ire);
   7734 
   7735 	if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
   7736 		return (ire);
   7737 	}
   7738 	return (NULL);
   7739 }
   7740 
   7741 /*
   7742  * This function is used to construct a mac_header_info_s from a
   7743  * DL_UNITDATA_IND message.
   7744  * The address fields in the mhi structure points into the message,
   7745  * thus the caller can't use those fields after freeing the message.
   7746  *
   7747  * We determine whether the packet received is a non-unicast packet
   7748  * and in doing so, determine whether or not it is broadcast vs multicast.
   7749  * For it to be a broadcast packet, we must have the appropriate mblk_t
   7750  * hanging off the ill_t.  If this is either not present or doesn't match
   7751  * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
   7752  * to be multicast.  Thus NICs that have no broadcast address (or no
   7753  * capability for one, such as point to point links) cannot return as
   7754  * the packet being broadcast.
   7755  */
   7756 void
   7757 ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
   7758 {
   7759 	dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
   7760 	mblk_t *bmp;
   7761 	uint_t extra_offset;
   7762 
   7763 	bzero(mhip, sizeof (struct mac_header_info_s));
   7764 
   7765 	mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
   7766 
   7767 	if (ill->ill_sap_length < 0)
   7768 		extra_offset = 0;
   7769 	else
   7770 		extra_offset = ill->ill_sap_length;
   7771 
   7772 	mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
   7773 	    extra_offset;
   7774 	mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
   7775 	    extra_offset;
   7776 
   7777 	if (!ind->dl_group_address)
   7778 		return;
   7779 
   7780 	/* Multicast or broadcast */
   7781 	mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
   7782 
   7783 	if (ind->dl_dest_addr_offset > sizeof (*ind) &&
   7784 	    ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
   7785 	    (bmp = ill->ill_bcast_mp) != NULL) {
   7786 		dl_unitdata_req_t *dlur;
   7787 		uint8_t *bphys_addr;
   7788 
   7789 		dlur = (dl_unitdata_req_t *)bmp->b_rptr;
   7790 		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
   7791 		    extra_offset;
   7792 
   7793 		if (bcmp(mhip->mhi_daddr, bphys_addr,
   7794 		    ind->dl_dest_addr_length) == 0)
   7795 			mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
   7796 	}
   7797 }
   7798 
   7799 /*
   7800  * This function is used to construct a mac_header_info_s from a
   7801  * M_DATA fastpath message from a DLPI driver.
   7802  * The address fields in the mhi structure points into the message,
   7803  * thus the caller can't use those fields after freeing the message.
   7804  *
   7805  * We determine whether the packet received is a non-unicast packet
   7806  * and in doing so, determine whether or not it is broadcast vs multicast.
   7807  * For it to be a broadcast packet, we must have the appropriate mblk_t
   7808  * hanging off the ill_t.  If this is either not present or doesn't match
   7809  * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
   7810  * to be multicast.  Thus NICs that have no broadcast address (or no
   7811  * capability for one, such as point to point links) cannot return as
   7812  * the packet being broadcast.
   7813  */
   7814 void
   7815 ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
   7816 {
   7817 	mblk_t *bmp;
   7818 	struct ether_header *pether;
   7819 
   7820 	bzero(mhip, sizeof (struct mac_header_info_s));
   7821 
   7822 	mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
   7823 
   7824 	pether = (struct ether_header *)((char *)mp->b_rptr
   7825 	    - sizeof (struct ether_header));
   7826 
   7827 	/*
   7828 	 * Make sure the interface is an ethernet type, since we don't
   7829 	 * know the header format for anything but Ethernet. Also make
   7830 	 * sure we are pointing correctly above db_base.
   7831 	 */
   7832 	if (ill->ill_type != IFT_ETHER)
   7833 		return;
   7834 
   7835 retry:
   7836 	if ((uchar_t *)pether < mp->b_datap->db_base)
   7837 		return;
   7838 
   7839 	/* Is there a VLAN tag? */
   7840 	if (ill->ill_isv6) {
   7841 		if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
   7842 			pether = (struct ether_header *)((char *)pether - 4);
   7843 			goto retry;
   7844 		}
   7845 	} else {
   7846 		if (pether->ether_type != htons(ETHERTYPE_IP)) {
   7847 			pether = (struct ether_header *)((char *)pether - 4);
   7848 			goto retry;
   7849 		}
   7850 	}
   7851 	mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
   7852 	mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
   7853 
   7854 	if (!(mhip->mhi_daddr[0] & 0x01))
   7855 		return;
   7856 
   7857 	/* Multicast or broadcast */
   7858 	mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
   7859 
   7860 	if ((bmp = ill->ill_bcast_mp) != NULL) {
   7861 		dl_unitdata_req_t *dlur;
   7862 		uint8_t *bphys_addr;
   7863 		uint_t	addrlen;
   7864 
   7865 		dlur = (dl_unitdata_req_t *)bmp->b_rptr;
   7866 		addrlen = dlur->dl_dest_addr_length;
   7867 		if (ill->ill_sap_length < 0) {
   7868 			bphys_addr = (uchar_t *)dlur +
   7869 			    dlur->dl_dest_addr_offset;
   7870 			addrlen += ill->ill_sap_length;
   7871 		} else {
   7872 			bphys_addr = (uchar_t *)dlur +
   7873 			    dlur->dl_dest_addr_offset +
   7874 			    ill->ill_sap_length;
   7875 			addrlen -= ill->ill_sap_length;
   7876 		}
   7877 		if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
   7878 			mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
   7879 	}
   7880 }
   7881 
   7882 /*
   7883  * Handle anything but M_DATA messages
   7884  * We see the DL_UNITDATA_IND which are part
   7885  * of the data path, and also the other messages from the driver.
   7886  */
   7887 void
   7888 ip_rput_notdata(ill_t *ill, mblk_t *mp)
   7889 {
   7890 	mblk_t		*first_mp;
   7891 	struct iocblk   *iocp;
   7892 	struct mac_header_info_s mhi;
   7893 
   7894 	switch (DB_TYPE(mp)) {
   7895 	case M_PROTO:
   7896 	case M_PCPROTO: {
   7897 		if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
   7898 		    DL_UNITDATA_IND) {
   7899 			/* Go handle anything other than data elsewhere. */
   7900 			ip_rput_dlpi(ill, mp);
   7901 			return;
   7902 		}
   7903 
   7904 		first_mp = mp;
   7905 		mp = first_mp->b_cont;
   7906 		first_mp->b_cont = NULL;
   7907 
   7908 		if (mp == NULL) {
   7909 			freeb(first_mp);
   7910 			return;
   7911 		}
   7912 		ip_dlur_to_mhi(ill, first_mp, &mhi);
   7913 		if (ill->ill_isv6)
   7914 			ip_input_v6(ill, NULL, mp, &mhi);
   7915 		else
   7916 			ip_input(ill, NULL, mp, &mhi);
   7917 
   7918 		/* Ditch the DLPI header. */
   7919 		freeb(first_mp);
   7920 		return;
   7921 	}
   7922 	case M_IOCACK:
   7923 		iocp = (struct iocblk *)mp->b_rptr;
   7924 		switch (iocp->ioc_cmd) {
   7925 		case DL_IOC_HDR_INFO:
   7926 			ill_fastpath_ack(ill, mp);
   7927 			return;
   7928 		default:
   7929 			putnext(ill->ill_rq, mp);
   7930 			return;
   7931 		}
   7932 		/* FALLTHRU */
   7933 	case M_ERROR:
   7934 	case M_HANGUP:
   7935 		mutex_enter(&ill->ill_lock);
   7936 		if (ill->ill_state_flags & ILL_CONDEMNED) {
   7937 			mutex_exit(&ill->ill_lock);
   7938 			freemsg(mp);
   7939 			return;
   7940 		}
   7941 		ill_refhold_locked(ill);
   7942 		mutex_exit(&ill->ill_lock);
   7943 		qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
   7944 		    B_FALSE);
   7945 		return;
   7946 	case M_CTL:
   7947 		putnext(ill->ill_rq, mp);
   7948 		return;
   7949 	case M_IOCNAK:
   7950 		ip1dbg(("got iocnak "));
   7951 		iocp = (struct iocblk *)mp->b_rptr;
   7952 		switch (iocp->ioc_cmd) {
   7953 		case DL_IOC_HDR_INFO:
   7954 			ip_rput_other(NULL, ill->ill_rq, mp, NULL);
   7955 			return;
   7956 		default:
   7957 			break;
   7958 		}
   7959 		/* FALLTHRU */
   7960 	default:
   7961 		putnext(ill->ill_rq, mp);
   7962 		return;
   7963 	}
   7964 }
   7965 
   7966 /* Read side put procedure.  Packets coming from the wire arrive here. */
   7967 void
   7968 ip_rput(queue_t *q, mblk_t *mp)
   7969 {
   7970 	ill_t	*ill;
   7971 	union DL_primitives *dl;
   7972 
   7973 	ill = (ill_t *)q->q_ptr;
   7974 
   7975 	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
   7976 		/*
   7977 		 * If things are opening or closing, only accept high-priority
   7978 		 * DLPI messages.  (On open ill->ill_ipif has not yet been
   7979 		 * created; on close, things hanging off the ill may have been
   7980 		 * freed already.)
   7981 		 */
   7982 		dl = (union DL_primitives *)mp->b_rptr;
   7983 		if (DB_TYPE(mp) != M_PCPROTO ||
   7984 		    dl->dl_primitive == DL_UNITDATA_IND) {
   7985 			inet_freemsg(mp);
   7986 			return;
   7987 		}
   7988 	}
   7989 	if (DB_TYPE(mp) == M_DATA) {
   7990 		struct mac_header_info_s mhi;
   7991 
   7992 		ip_mdata_to_mhi(ill, mp, &mhi);
   7993 		ip_input(ill, NULL, mp, &mhi);
   7994 	} else {
   7995 		ip_rput_notdata(ill, mp);
   7996 	}
   7997 }
   7998 
   7999 /*
   8000  * Move the information to a copy.
   8001  */
   8002 mblk_t *
   8003 ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
   8004 {
   8005 	mblk_t		*mp1;
   8006 	ill_t		*ill = ira->ira_ill;
   8007 	ip_stack_t	*ipst = ill->ill_ipst;
   8008 
   8009 	IP_STAT(ipst, ip_db_ref);
   8010 
   8011 	/* Make sure we have ira_l2src before we loose the original mblk */
   8012 	if (!(ira->ira_flags & IRAF_L2SRC_SET))
   8013 		ip_setl2src(mp, ira, ira->ira_rill);
   8014 
   8015 	mp1 = copymsg(mp);
   8016 	if (mp1 == NULL) {
   8017 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   8018 		ip_drop_input("ipIfStatsInDiscards", mp, ill);
   8019 		freemsg(mp);
   8020 		return (NULL);
   8021 	}
   8022 	/* preserve the hardware checksum flags and data, if present */
   8023 	if (DB_CKSUMFLAGS(mp) != 0) {
   8024 		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
   8025 		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
   8026 		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
   8027 		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
   8028 		DB_CKSUM16(mp1) = DB_CKSUM16(mp);
   8029 	}
   8030 	freemsg(mp);
   8031 	return (mp1);
   8032 }
   8033 
   8034 static void
   8035 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
   8036     t_uscalar_t err)
   8037 {
   8038 	if (dl_err == DL_SYSERR) {
   8039 		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
   8040 		    "%s: %s failed: DL_SYSERR (errno %u)\n",
   8041 		    ill->ill_name, dl_primstr(prim), err);
   8042 		return;
   8043 	}
   8044 
   8045 	(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
   8046 	    "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim),
   8047 	    dl_errstr(dl_err));
   8048 }
   8049 
   8050 /*
   8051  * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
   8052  * than DL_UNITDATA_IND messages. If we need to process this message
   8053  * exclusively, we call qwriter_ip, in which case we also need to call
   8054  * ill_refhold before that, since qwriter_ip does an ill_refrele.
   8055  */
   8056 void
   8057 ip_rput_dlpi(ill_t *ill, mblk_t *mp)
   8058 {
   8059 	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
   8060 	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
   8061 	queue_t		*q = ill->ill_rq;
   8062 	t_uscalar_t	prim = dloa->dl_primitive;
   8063 	t_uscalar_t	reqprim = DL_PRIM_INVAL;
   8064 
   8065 	DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
   8066 	    char *, dl_primstr(prim), ill_t *, ill);
   8067 	ip1dbg(("ip_rput_dlpi"));
   8068 
   8069 	/*
   8070 	 * If we received an ACK but didn't send a request for it, then it
   8071 	 * can't be part of any pending operation; discard up-front.
   8072 	 */
   8073 	switch (prim) {
   8074 	case DL_ERROR_ACK:
   8075 		reqprim = dlea->dl_error_primitive;
   8076 		ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s "
   8077 		    "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim),
   8078 		    reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno,
   8079 		    dlea->dl_unix_errno));
   8080 		break;
   8081 	case DL_OK_ACK:
   8082 		reqprim = dloa->dl_correct_primitive;
   8083 		break;
   8084 	case DL_INFO_ACK:
   8085 		reqprim = DL_INFO_REQ;
   8086 		break;
   8087 	case DL_BIND_ACK:
   8088 		reqprim = DL_BIND_REQ;
   8089 		break;
   8090 	case DL_PHYS_ADDR_ACK:
   8091 		reqprim = DL_PHYS_ADDR_REQ;
   8092 		break;
   8093 	case DL_NOTIFY_ACK:
   8094 		reqprim = DL_NOTIFY_REQ;
   8095 		break;
   8096 	case DL_CAPABILITY_ACK:
   8097 		reqprim = DL_CAPABILITY_REQ;
   8098 		break;
   8099 	}
   8100 
   8101 	if (prim != DL_NOTIFY_IND) {
   8102 		if (reqprim == DL_PRIM_INVAL ||
   8103 		    !ill_dlpi_pending(ill, reqprim)) {
   8104 			/* Not a DLPI message we support or expected */
   8105 			freemsg(mp);
   8106 			return;
   8107 		}
   8108 		ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim),
   8109 		    dl_primstr(reqprim)));
   8110 	}
   8111 
   8112 	switch (reqprim) {
   8113 	case DL_UNBIND_REQ:
   8114 		/*
   8115 		 * NOTE: we mark the unbind as complete even if we got a
   8116 		 * DL_ERROR_ACK, since there's not much else we can do.
   8117 		 */
   8118 		mutex_enter(&ill->ill_lock);
   8119 		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
   8120 		cv_signal(&ill->ill_cv);
   8121 		mutex_exit(&ill->ill_lock);
   8122 		break;
   8123 
   8124 	case DL_ENABMULTI_REQ:
   8125 		if (prim == DL_OK_ACK) {
   8126 			if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
   8127 				ill->ill_dlpi_multicast_state = IDS_OK;
   8128 		}
   8129 		break;
   8130 	}
   8131 
   8132 	/*
   8133 	 * The message is one we're waiting for (or DL_NOTIFY_IND), but we
   8134 	 * need to become writer to continue to process it.  Because an
   8135 	 * exclusive operation doesn't complete until replies to all queued
   8136 	 * DLPI messages have been received, we know we're in the middle of an
   8137 	 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND).
   8138 	 *
   8139 	 * As required by qwriter_ip(), we refhold the ill; it will refrele.
   8140 	 * Since this is on the ill stream we unconditionally bump up the
   8141 	 * refcount without doing ILL_CAN_LOOKUP().
   8142 	 */
   8143 	ill_refhold(ill);
   8144 	if (prim == DL_NOTIFY_IND)
   8145 		qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE);
   8146 	else
   8147 		qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE);
   8148 }
   8149 
   8150 /*
   8151  * Handling of DLPI messages that require exclusive access to the ipsq.
   8152  *
   8153  * Need to do ipsq_pending_mp_get on ioctl completion, which could
   8154  * happen here. (along with mi_copy_done)
   8155  */
   8156 /* ARGSUSED */
   8157 static void
   8158 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   8159 {
   8160 	dl_ok_ack_t	*dloa = (dl_ok_ack_t *)mp->b_rptr;
   8161 	dl_error_ack_t	*dlea = (dl_error_ack_t *)dloa;
   8162 	int		err = 0;
   8163 	ill_t		*ill = (ill_t *)q->q_ptr;
   8164 	ipif_t		*ipif = NULL;
   8165 	mblk_t		*mp1 = NULL;
   8166 	conn_t		*connp = NULL;
   8167 	t_uscalar_t	paddrreq;
   8168 	mblk_t		*mp_hw;
   8169 	boolean_t	success;
   8170 	boolean_t	ioctl_aborted = B_FALSE;
   8171 	boolean_t	log = B_TRUE;
   8172 
   8173 	DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
   8174 	    char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
   8175 
   8176 	ip1dbg(("ip_rput_dlpi_writer .."));
   8177 	ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
   8178 	ASSERT(IAM_WRITER_ILL(ill));
   8179 
   8180 	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
   8181 	/*
   8182 	 * The current ioctl could have been aborted by the user and a new
   8183 	 * ioctl to bring up another ill could have started. We could still
   8184 	 * get a response from the driver later.
   8185 	 */
   8186 	if (ipif != NULL && ipif->ipif_ill != ill)
   8187 		ioctl_aborted = B_TRUE;
   8188 
   8189 	switch (dloa->dl_primitive) {
   8190 	case DL_ERROR_ACK:
   8191 		ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
   8192 		    dl_primstr(dlea->dl_error_primitive)));
   8193 
   8194 		DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
   8195 		    char *, dl_primstr(dlea->dl_error_primitive),
   8196 		    ill_t *, ill);
   8197 
   8198 		switch (dlea->dl_error_primitive) {
   8199 		case DL_DISABMULTI_REQ:
   8200 			ill_dlpi_done(ill, dlea->dl_error_primitive);
   8201 			break;
   8202 		case DL_PROMISCON_REQ:
   8203 		case DL_PROMISCOFF_REQ:
   8204 		case DL_UNBIND_REQ:
   8205 		case DL_ATTACH_REQ:
   8206 		case DL_INFO_REQ:
   8207 			ill_dlpi_done(ill, dlea->dl_error_primitive);
   8208 			break;
   8209 		case DL_NOTIFY_REQ:
   8210 			ill_dlpi_done(ill, DL_NOTIFY_REQ);
   8211 			log = B_FALSE;
   8212 			break;
   8213 		case DL_PHYS_ADDR_REQ:
   8214 			/*
   8215 			 * For IPv6 only, there are two additional
   8216 			 * phys_addr_req's sent to the driver to get the
   8217 			 * IPv6 token and lla. This allows IP to acquire
   8218 			 * the hardware address format for a given interface
   8219 			 * without having built in knowledge of the hardware
   8220 			 * address. ill_phys_addr_pend keeps track of the last
   8221 			 * DL_PAR sent so we know which response we are
   8222 			 * dealing with. ill_dlpi_done will update
   8223 			 * ill_phys_addr_pend when it sends the next req.
   8224 			 * We don't complete the IOCTL until all three DL_PARs
   8225 			 * have been attempted, so set *_len to 0 and break.
   8226 			 */
   8227 			paddrreq = ill->ill_phys_addr_pend;
   8228 			ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
   8229 			if (paddrreq == DL_IPV6_TOKEN) {
   8230 				ill->ill_token_length = 0;
   8231 				log = B_FALSE;
   8232 				break;
   8233 			} else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
   8234 				ill->ill_nd_lla_len = 0;
   8235 				log = B_FALSE;
   8236 				break;
   8237 			}
   8238 			/*
   8239 			 * Something went wrong with the DL_PHYS_ADDR_REQ.
   8240 			 * We presumably have an IOCTL hanging out waiting
   8241 			 * for completion. Find it and complete the IOCTL
   8242 			 * with the error noted.
   8243 			 * However, ill_dl_phys was called on an ill queue
   8244 			 * (from SIOCSLIFNAME), thus conn_pending_ill is not
   8245 			 * set. But the ioctl is known to be pending on ill_wq.
   8246 			 */
   8247 			if (!ill->ill_ifname_pending)
   8248 				break;
   8249 			ill->ill_ifname_pending = 0;
   8250 			if (!ioctl_aborted)
   8251 				mp1 = ipsq_pending_mp_get(ipsq, &connp);
   8252 			if (mp1 != NULL) {
   8253 				/*
   8254 				 * This operation (SIOCSLIFNAME) must have
   8255 				 * happened on the ill. Assert there is no conn
   8256 				 */
   8257 				ASSERT(connp == NULL);
   8258 				q = ill->ill_wq;
   8259 			}
   8260 			break;
   8261 		case DL_BIND_REQ:
   8262 			ill_dlpi_done(ill, DL_BIND_REQ);
   8263 			if (ill->ill_ifname_pending)
   8264 				break;
   8265 			mutex_enter(&ill->ill_lock);
   8266 			ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
   8267 			mutex_exit(&ill->ill_lock);
   8268 			/*
   8269 			 * Something went wrong with the bind.  We presumably
   8270 			 * have an IOCTL hanging out waiting for completion.
   8271 			 * Find it, take down the interface that was coming
   8272 			 * up, and complete the IOCTL with the error noted.
   8273 			 */
   8274 			if (!ioctl_aborted)
   8275 				mp1 = ipsq_pending_mp_get(ipsq, &connp);
   8276 			if (mp1 != NULL) {
   8277 				/*
   8278 				 * This might be a result of a DL_NOTE_REPLUMB
   8279 				 * notification. In that case, connp is NULL.
   8280 				 */
   8281 				if (connp != NULL)
   8282 					q = CONNP_TO_WQ(connp);
   8283 
   8284 				(void) ipif_down(ipif, NULL, NULL);
   8285 				/* error is set below the switch */
   8286 			}
   8287 			break;
   8288 		case DL_ENABMULTI_REQ:
   8289 			ill_dlpi_done(ill, DL_ENABMULTI_REQ);
   8290 
   8291 			if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
   8292 				ill->ill_dlpi_multicast_state = IDS_FAILED;
   8293 			if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
   8294 
   8295 				printf("ip: joining multicasts failed (%d)"
   8296 				    " on %s - will use link layer "
   8297 				    "broadcasts for multicast\n",
   8298 				    dlea->dl_errno, ill->ill_name);
   8299 
   8300 				/*
   8301 				 * Set up for multi_bcast; We are the
   8302 				 * writer, so ok to access ill->ill_ipif
   8303 				 * without any lock.
   8304 				 */
   8305 				mutex_enter(&ill->ill_phyint->phyint_lock);
   8306 				ill->ill_phyint->phyint_flags |=
   8307 				    PHYI_MULTI_BCAST;
   8308 				mutex_exit(&ill->ill_phyint->phyint_lock);
   8309 
   8310 			}
   8311 			freemsg(mp);	/* Don't want to pass this up */
   8312 			return;
   8313 		case DL_CAPABILITY_REQ:
   8314 			ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
   8315 			    "DL_CAPABILITY REQ\n"));
   8316 			if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
   8317 				ill->ill_dlpi_capab_state = IDCS_FAILED;
   8318 			ill_capability_done(ill);
   8319 			freemsg(mp);
   8320 			return;
   8321 		}
   8322 		/*
   8323 		 * Note the error for IOCTL completion (mp1 is set when
   8324 		 * ready to complete ioctl). If ill_ifname_pending_err is
   8325 		 * set, an error occured during plumbing (ill_ifname_pending),
   8326 		 * so we want to report that error.
   8327 		 *
   8328 		 * NOTE: there are two addtional DL_PHYS_ADDR_REQ's
   8329 		 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
   8330 		 * expected to get errack'd if the driver doesn't support
   8331 		 * these flags (e.g. ethernet). log will be set to B_FALSE
   8332 		 * if these error conditions are encountered.
   8333 		 */
   8334 		if (mp1 != NULL) {
   8335 			if (ill->ill_ifname_pending_err != 0)  {
   8336 				err = ill->ill_ifname_pending_err;
   8337 				ill->ill_ifname_pending_err = 0;
   8338 			} else {
   8339 				err = dlea->dl_unix_errno ?
   8340 				    dlea->dl_unix_errno : ENXIO;
   8341 			}
   8342 		/*
   8343 		 * If we're plumbing an interface and an error hasn't already
   8344 		 * been saved, set ill_ifname_pending_err to the error passed
   8345 		 * up. Ignore the error if log is B_FALSE (see comment above).
   8346 		 */
   8347 		} else if (log && ill->ill_ifname_pending &&
   8348 		    ill->ill_ifname_pending_err == 0) {
   8349 			ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
   8350 			    dlea->dl_unix_errno : ENXIO;
   8351 		}
   8352 
   8353 		if (log)
   8354 			ip_dlpi_error(ill, dlea->dl_error_primitive,
   8355 			    dlea->dl_errno, dlea->dl_unix_errno);
   8356 		break;
   8357 	case DL_CAPABILITY_ACK:
   8358 		ill_capability_ack(ill, mp);
   8359 		/*
   8360 		 * The message has been handed off to ill_capability_ack
   8361 		 * and must not be freed below
   8362 		 */
   8363 		mp = NULL;
   8364 		break;
   8365 
   8366 	case DL_INFO_ACK:
   8367 		/* Call a routine to handle this one. */
   8368 		ill_dlpi_done(ill, DL_INFO_REQ);
   8369 		ip_ll_subnet_defaults(ill, mp);
   8370 		ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
   8371 		return;
   8372 	case DL_BIND_ACK:
   8373 		/*
   8374 		 * We should have an IOCTL waiting on this unless
   8375 		 * sent by ill_dl_phys, in which case just return
   8376 		 */
   8377 		ill_dlpi_done(ill, DL_BIND_REQ);
   8378 
   8379 		if (ill->ill_ifname_pending) {
   8380 			DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
   8381 			    ill_t *, ill, mblk_t *, mp);
   8382 			break;
   8383 		}
   8384 		mutex_enter(&ill->ill_lock);
   8385 		ill->ill_dl_up = 1;
   8386 		ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
   8387 		mutex_exit(&ill->ill_lock);
   8388 
   8389 		if (!ioctl_aborted)
   8390 			mp1 = ipsq_pending_mp_get(ipsq, &connp);
   8391 		if (mp1 == NULL) {
   8392 			DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
   8393 			break;
   8394 		}
   8395 		/*
   8396 		 * mp1 was added by ill_dl_up(). if that is a result of
   8397 		 * a DL_NOTE_REPLUMB notification, connp could be NULL.
   8398 		 */
   8399 		if (connp != NULL)
   8400 			q = CONNP_TO_WQ(connp);
   8401 		/*
   8402 		 * We are exclusive. So nothing can change even after
   8403 		 * we get the pending mp.
   8404 		 */
   8405 		ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
   8406 		DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
   8407 		ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
   8408 
   8409 		/*
   8410 		 * Now bring up the resolver; when that is complete, we'll
   8411 		 * create IREs.  Note that we intentionally mirror what
   8412 		 * ipif_up() would have done, because we got here by way of
   8413 		 * ill_dl_up(), which stopped ipif_up()'s processing.
   8414 		 */
   8415 		if (ill->ill_isv6) {
   8416 			/*
   8417 			 * v6 interfaces.
   8418 			 * Unlike ARP which has to do another bind
   8419 			 * and attach, once we get here we are
   8420 			 * done with NDP
   8421 			 */
   8422 			(void) ipif_resolver_up(ipif, Res_act_initial);
   8423 			if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
   8424 				err = ipif_up_done_v6(ipif);
   8425 		} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
   8426 			/*
   8427 			 * ARP and other v4 external resolvers.
   8428 			 * Leave the pending mblk intact so that
   8429 			 * the ioctl completes in ip_rput().
   8430 			 */
   8431 			if (connp != NULL)
   8432 				mutex_enter(&connp->conn_lock);
   8433 			mutex_enter(&ill->ill_lock);
   8434 			success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
   8435 			mutex_exit(&ill->ill_lock);
   8436 			if (connp != NULL)
   8437 				mutex_exit(&connp->conn_lock);
   8438 			if (success) {
   8439 				err = ipif_resolver_up(ipif, Res_act_initial);
   8440 				if (err == EINPROGRESS) {
   8441 					freemsg(mp);
   8442 					return;
   8443 				}
   8444 				mp1 = ipsq_pending_mp_get(ipsq, &connp);
   8445 			} else {
   8446 				/* The conn has started closing */
   8447 				err = EINTR;
   8448 			}
   8449 		} else {
   8450 			/*
   8451 			 * This one is complete. Reply to pending ioctl.
   8452 			 */
   8453 			(void) ipif_resolver_up(ipif, Res_act_initial);
   8454 			err = ipif_up_done(ipif);
   8455 		}
   8456 
   8457 		if ((err == 0) && (ill->ill_up_ipifs)) {
   8458 			err = ill_up_ipifs(ill, q, mp1);
   8459 			if (err == EINPROGRESS) {
   8460 				freemsg(mp);
   8461 				return;
   8462 			}
   8463 		}
   8464 
   8465 		/*
   8466 		 * If we have a moved ipif to bring up, and everything has
   8467 		 * succeeded to this point, bring it up on the IPMP ill.
   8468 		 * Otherwise, leave it down -- the admin can try to bring it
   8469 		 * up by hand if need be.
   8470 		 */
   8471 		if (ill->ill_move_ipif != NULL) {
   8472 			if (err != 0) {
   8473 				ill->ill_move_ipif = NULL;
   8474 			} else {
   8475 				ipif = ill->ill_move_ipif;
   8476 				ill->ill_move_ipif = NULL;
   8477 				err = ipif_up(ipif, q, mp1);
   8478 				if (err == EINPROGRESS) {
   8479 					freemsg(mp);
   8480 					return;
   8481 				}
   8482 			}
   8483 		}
   8484 		break;
   8485 
   8486 	case DL_NOTIFY_IND: {
   8487 		dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
   8488 		uint_t orig_mtu, orig_mc_mtu;
   8489 
   8490 		switch (notify->dl_notification) {
   8491 		case DL_NOTE_PHYS_ADDR:
   8492 			err = ill_set_phys_addr(ill, mp);
   8493 			break;
   8494 
   8495 		case DL_NOTE_REPLUMB:
   8496 			/*
   8497 			 * Directly return after calling ill_replumb().
   8498 			 * Note that we should not free mp as it is reused
   8499 			 * in the ill_replumb() function.
   8500 			 */
   8501 			err = ill_replumb(ill, mp);
   8502 			return;
   8503 
   8504 		case DL_NOTE_FASTPATH_FLUSH:
   8505 			nce_flush(ill, B_FALSE);
   8506 			break;
   8507 
   8508 		case DL_NOTE_SDU_SIZE:
   8509 		case DL_NOTE_SDU_SIZE2:
   8510 			/*
   8511 			 * The dce and fragmentation code can cope with
   8512 			 * this changing while packets are being sent.
   8513 			 * When packets are sent ip_output will discover
   8514 			 * a change.
   8515 			 *
   8516 			 * Change the MTU size of the interface.
   8517 			 */
   8518 			mutex_enter(&ill->ill_lock);
   8519 			orig_mtu = ill->ill_mtu;
   8520 			orig_mc_mtu = ill->ill_mc_mtu;
   8521 			switch (notify->dl_notification) {
   8522 			case DL_NOTE_SDU_SIZE:
   8523 				ill->ill_current_frag =
   8524 				    (uint_t)notify->dl_data;
   8525 				ill->ill_mc_mtu = (uint_t)notify->dl_data;
   8526 				break;
   8527 			case DL_NOTE_SDU_SIZE2:
   8528 				ill->ill_current_frag =
   8529 				    (uint_t)notify->dl_data1;
   8530 				ill->ill_mc_mtu = (uint_t)notify->dl_data2;
   8531 				break;
   8532 			}
   8533 			if (ill->ill_current_frag > ill->ill_max_frag)
   8534 				ill->ill_max_frag = ill->ill_current_frag;
   8535 
   8536 			if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
   8537 				ill->ill_mtu = ill->ill_current_frag;
   8538 
   8539 				/*
   8540 				 * If ill_user_mtu was set (via
   8541 				 * SIOCSLIFLNKINFO), clamp ill_mtu at it.
   8542 				 */
   8543 				if (ill->ill_user_mtu != 0 &&
   8544 				    ill->ill_user_mtu < ill->ill_mtu)
   8545 					ill->ill_mtu = ill->ill_user_mtu;
   8546 
   8547 				if (ill->ill_user_mtu != 0 &&
   8548 				    ill->ill_user_mtu < ill->ill_mc_mtu)
   8549 					ill->ill_mc_mtu = ill->ill_user_mtu;
   8550 
   8551 				if (ill->ill_isv6) {
   8552 					if (ill->ill_mtu < IPV6_MIN_MTU)
   8553 						ill->ill_mtu = IPV6_MIN_MTU;
   8554 					if (ill->ill_mc_mtu < IPV6_MIN_MTU)
   8555 						ill->ill_mc_mtu = IPV6_MIN_MTU;
   8556 				} else {
   8557 					if (ill->ill_mtu < IP_MIN_MTU)
   8558 						ill->ill_mtu = IP_MIN_MTU;
   8559 					if (ill->ill_mc_mtu < IP_MIN_MTU)
   8560 						ill->ill_mc_mtu = IP_MIN_MTU;
   8561 				}
   8562 			} else if (ill->ill_mc_mtu > ill->ill_mtu) {
   8563 				ill->ill_mc_mtu = ill->ill_mtu;
   8564 			}
   8565 
   8566 			mutex_exit(&ill->ill_lock);
   8567 			/*
   8568 			 * Make sure all dce_generation checks find out
   8569 			 * that ill_mtu/ill_mc_mtu has changed.
   8570 			 */
   8571 			if (orig_mtu != ill->ill_mtu ||
   8572 			    orig_mc_mtu != ill->ill_mc_mtu) {
   8573 				dce_increment_all_generations(ill->ill_isv6,
   8574 				    ill->ill_ipst);
   8575 			}
   8576 
   8577 			/*
   8578 			 * Refresh IPMP meta-interface MTU if necessary.
   8579 			 */
   8580 			if (IS_UNDER_IPMP(ill))
   8581 				ipmp_illgrp_refresh_mtu(ill->ill_grp);
   8582 			break;
   8583 
   8584 		case DL_NOTE_LINK_UP:
   8585 		case DL_NOTE_LINK_DOWN: {
   8586 			/*
   8587 			 * We are writer. ill / phyint / ipsq assocs stable.
   8588 			 * The RUNNING flag reflects the state of the link.
   8589 			 */
   8590 			phyint_t *phyint = ill->ill_phyint;
   8591 			uint64_t new_phyint_flags;
   8592 			boolean_t changed = B_FALSE;
   8593 			boolean_t went_up;
   8594 
   8595 			went_up = notify->dl_notification == DL_NOTE_LINK_UP;
   8596 			mutex_enter(&phyint->phyint_lock);
   8597 
   8598 			new_phyint_flags = went_up ?
   8599 			    phyint->phyint_flags | PHYI_RUNNING :
   8600 			    phyint->phyint_flags & ~PHYI_RUNNING;
   8601 
   8602 			if (IS_IPMP(ill)) {
   8603 				new_phyint_flags = went_up ?
   8604 				    new_phyint_flags & ~PHYI_FAILED :
   8605 				    new_phyint_flags | PHYI_FAILED;
   8606 			}
   8607 
   8608 			if (new_phyint_flags != phyint->phyint_flags) {
   8609 				phyint->phyint_flags = new_phyint_flags;
   8610 				changed = B_TRUE;
   8611 			}
   8612 			mutex_exit(&phyint->phyint_lock);
   8613 			/*
   8614 			 * ill_restart_dad handles the DAD restart and routing
   8615 			 * socket notification logic.
   8616 			 */
   8617 			if (changed) {
   8618 				ill_restart_dad(phyint->phyint_illv4, went_up);
   8619 				ill_restart_dad(phyint->phyint_illv6, went_up);
   8620 			}
   8621 			break;
   8622 		}
   8623 		case DL_NOTE_PROMISC_ON_PHYS: {
   8624 			phyint_t *phyint = ill->ill_phyint;
   8625 
   8626 			mutex_enter(&phyint->phyint_lock);
   8627 			phyint->phyint_flags |= PHYI_PROMISC;
   8628 			mutex_exit(&phyint->phyint_lock);
   8629 			break;
   8630 		}
   8631 		case DL_NOTE_PROMISC_OFF_PHYS: {
   8632 			phyint_t *phyint = ill->ill_phyint;
   8633 
   8634 			mutex_enter(&phyint->phyint_lock);
   8635 			phyint->phyint_flags &= ~PHYI_PROMISC;
   8636 			mutex_exit(&phyint->phyint_lock);
   8637 			break;
   8638 		}
   8639 		case DL_NOTE_CAPAB_RENEG:
   8640 			/*
   8641 			 * Something changed on the driver side.
   8642 			 * It wants us to renegotiate the capabilities
   8643 			 * on this ill. One possible cause is the aggregation
   8644 			 * interface under us where a port got added or
   8645 			 * went away.
   8646 			 *
   8647 			 * If the capability negotiation is already done
   8648 			 * or is in progress, reset the capabilities and
   8649 			 * mark the ill's ill_capab_reneg to be B_TRUE,
   8650 			 * so that when the ack comes back, we can start
   8651 			 * the renegotiation process.
   8652 			 *
   8653 			 * Note that if ill_capab_reneg is already B_TRUE
   8654 			 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case),
   8655 			 * the capability resetting request has been sent
   8656 			 * and the renegotiation has not been started yet;
   8657 			 * nothing needs to be done in this case.
   8658 			 */
   8659 			ipsq_current_start(ipsq, ill->ill_ipif, 0);
   8660 			ill_capability_reset(ill, B_TRUE);
   8661 			ipsq_current_finish(ipsq);
   8662 			break;
   8663 
   8664 		case DL_NOTE_ALLOWED_IPS:
   8665 			ill_set_allowed_ips(ill, mp);
   8666 			break;
   8667 		default:
   8668 			ip0dbg(("ip_rput_dlpi_writer: unknown notification "
   8669 			    "type 0x%x for DL_NOTIFY_IND\n",
   8670 			    notify->dl_notification));
   8671 			break;
   8672 		}
   8673 
   8674 		/*
   8675 		 * As this is an asynchronous operation, we
   8676 		 * should not call ill_dlpi_done
   8677 		 */
   8678 		break;
   8679 	}
   8680 	case DL_NOTIFY_ACK: {
   8681 		dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;
   8682 
   8683 		if (noteack->dl_notifications & DL_NOTE_LINK_UP)
   8684 			ill->ill_note_link = 1;
   8685 		ill_dlpi_done(ill, DL_NOTIFY_REQ);
   8686 		break;
   8687 	}
   8688 	case DL_PHYS_ADDR_ACK: {
   8689 		/*
   8690 		 * As part of plumbing the interface via SIOCSLIFNAME,
   8691 		 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
   8692 		 * whose answers we receive here.  As each answer is received,
   8693 		 * we call ill_dlpi_done() to dispatch the next request as
   8694 		 * we're processing the current one.  Once all answers have
   8695 		 * been received, we use ipsq_pending_mp_get() to dequeue the
   8696 		 * outstanding IOCTL and reply to it.  (Because ill_dl_phys()
   8697 		 * is invoked from an ill queue, conn_oper_pending_ill is not
   8698 		 * available, but we know the ioctl is pending on ill_wq.)
   8699 		 */
   8700 		uint_t	paddrlen, paddroff;
   8701 		uint8_t	*addr;
   8702 
   8703 		paddrreq = ill->ill_phys_addr_pend;
   8704 		paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
   8705 		paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;
   8706 		addr = mp->b_rptr + paddroff;
   8707 
   8708 		ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
   8709 		if (paddrreq == DL_IPV6_TOKEN) {
   8710 			/*
   8711 			 * bcopy to low-order bits of ill_token
   8712 			 *
   8713 			 * XXX Temporary hack - currently, all known tokens
   8714 			 * are 64 bits, so I'll cheat for the moment.
   8715 			 */
   8716 			bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
   8717 			ill->ill_token_length = paddrlen;
   8718 			break;
   8719 		} else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
   8720 			ASSERT(ill->ill_nd_lla_mp == NULL);
   8721 			ill_set_ndmp(ill, mp, paddroff, paddrlen);
   8722 			mp = NULL;
   8723 			break;
   8724 		} else if (paddrreq == DL_CURR_DEST_ADDR) {
   8725 			ASSERT(ill->ill_dest_addr_mp == NULL);
   8726 			ill->ill_dest_addr_mp = mp;
   8727 			ill->ill_dest_addr = addr;
   8728 			mp = NULL;
   8729 			if (ill->ill_isv6) {
   8730 				ill_setdesttoken(ill);
   8731 				ipif_setdestlinklocal(ill->ill_ipif);
   8732 			}
   8733 			break;
   8734 		}
   8735 
   8736 		ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
   8737 		ASSERT(ill->ill_phys_addr_mp == NULL);
   8738 		if (!ill->ill_ifname_pending)
   8739 			break;
   8740 		ill->ill_ifname_pending = 0;
   8741 		if (!ioctl_aborted)
   8742 			mp1 = ipsq_pending_mp_get(ipsq, &connp);
   8743 		if (mp1 != NULL) {
   8744 			ASSERT(connp == NULL);
   8745 			q = ill->ill_wq;
   8746 		}
   8747 		/*
   8748 		 * If any error acks received during the plumbing sequence,
   8749 		 * ill_ifname_pending_err will be set. Break out and send up
   8750 		 * the error to the pending ioctl.
   8751 		 */
   8752 		if (ill->ill_ifname_pending_err != 0) {
   8753 			err = ill->ill_ifname_pending_err;
   8754 			ill->ill_ifname_pending_err = 0;
   8755 			break;
   8756 		}
   8757 
   8758 		ill->ill_phys_addr_mp = mp;
   8759 		ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
   8760 		mp = NULL;
   8761 
   8762 		/*
   8763 		 * If paddrlen or ill_phys_addr_length is zero, the DLPI
   8764 		 * provider doesn't support physical addresses.  We check both
   8765 		 * paddrlen and ill_phys_addr_length because sppp (PPP) does
   8766 		 * not have physical addresses, but historically advertises a
   8767 		 * physical address length of 0 in its DL_INFO_ACK, but 6 in
   8768 		 * its DL_PHYS_ADDR_ACK.
   8769 		 */
   8770 		if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
   8771 			ill->ill_phys_addr = NULL;
   8772 		} else if (paddrlen != ill->ill_phys_addr_length) {
   8773 			ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
   8774 			    paddrlen, ill->ill_phys_addr_length));
   8775 			err = EINVAL;
   8776 			break;
   8777 		}
   8778 
   8779 		if (ill->ill_nd_lla_mp == NULL) {
   8780 			if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
   8781 				err = ENOMEM;
   8782 				break;
   8783 			}
   8784 			ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
   8785 		}
   8786 
   8787 		if (ill->ill_isv6) {
   8788 			ill_setdefaulttoken(ill);
   8789 			ipif_setlinklocal(ill->ill_ipif);
   8790 		}
   8791 		break;
   8792 	}
   8793 	case DL_OK_ACK:
   8794 		ip2dbg(("DL_OK_ACK %s (0x%x)\n",
   8795 		    dl_primstr((int)dloa->dl_correct_primitive),
   8796 		    dloa->dl_correct_primitive));
   8797 		DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
   8798 		    char *, dl_primstr(dloa->dl_correct_primitive),
   8799 		    ill_t *, ill);
   8800 
   8801 		switch (dloa->dl_correct_primitive) {
   8802 		case DL_ENABMULTI_REQ:
   8803 		case DL_DISABMULTI_REQ:
   8804 			ill_dlpi_done(ill, dloa->dl_correct_primitive);
   8805 			break;
   8806 		case DL_PROMISCON_REQ:
   8807 		case DL_PROMISCOFF_REQ:
   8808 		case DL_UNBIND_REQ:
   8809 		case DL_ATTACH_REQ:
   8810 			ill_dlpi_done(ill, dloa->dl_correct_primitive);
   8811 			break;
   8812 		}
   8813 		break;
   8814 	default:
   8815 		break;
   8816 	}
   8817 
   8818 	freemsg(mp);
   8819 	if (mp1 == NULL)
   8820 		return;
   8821 
   8822 	/*
   8823 	 * The operation must complete without EINPROGRESS since
   8824 	 * ipsq_pending_mp_get() has removed the mblk (mp1).  Otherwise,
   8825 	 * the operation will be stuck forever inside the IPSQ.
   8826 	 */
   8827 	ASSERT(err != EINPROGRESS);
   8828 
   8829 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
   8830 	    int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
   8831 	    ipif_t *, NULL);
   8832 
   8833 	switch (ipsq->ipsq_xop->ipx_current_ioctl) {
   8834 	case 0:
   8835 		ipsq_current_finish(ipsq);
   8836 		break;
   8837 
   8838 	case SIOCSLIFNAME:
   8839 	case IF_UNITSEL: {
   8840 		ill_t *ill_other = ILL_OTHER(ill);
   8841 
   8842 		/*
   8843 		 * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
   8844 		 * ill has a peer which is in an IPMP group, then place ill
   8845 		 * into the same group.  One catch: although ifconfig plumbs
   8846 		 * the appropriate IPMP meta-interface prior to plumbing this
   8847 		 * ill, it is possible for multiple ifconfig applications to
   8848 		 * race (or for another application to adjust plumbing), in
   8849 		 * which case the IPMP meta-interface we need will be missing.
   8850 		 * If so, kick the phyint out of the group.
   8851 		 */
   8852 		if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
   8853 			ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
   8854 			ipmp_illgrp_t	*illg;
   8855 
   8856 			illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
   8857 			if (illg == NULL)
   8858 				ipmp_phyint_leave_grp(ill->ill_phyint);
   8859 			else
   8860 				ipmp_ill_join_illgrp(ill, illg);
   8861 		}
   8862 
   8863 		if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
   8864 			ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
   8865 		else
   8866 			ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
   8867 		break;
   8868 	}
   8869 	case SIOCLIFADDIF:
   8870 		ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
   8871 		break;
   8872 
   8873 	default:
   8874 		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
   8875 		break;
   8876 	}
   8877 }
   8878 
   8879 /*
   8880  * ip_rput_other is called by ip_rput to handle messages modifying the global
   8881  * state in IP.  If 'ipsq' is non-NULL, caller is writer on it.
   8882  */
   8883 /* ARGSUSED */
   8884 void
   8885 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   8886 {
   8887 	ill_t		*ill = q->q_ptr;
   8888 	struct iocblk	*iocp;
   8889 
   8890 	ip1dbg(("ip_rput_other "));
   8891 	if (ipsq != NULL) {
   8892 		ASSERT(IAM_WRITER_IPSQ(ipsq));
   8893 		ASSERT(ipsq->ipsq_xop ==
   8894 		    ill->ill_phyint->phyint_ipsq->ipsq_xop);
   8895 	}
   8896 
   8897 	switch (mp->b_datap->db_type) {
   8898 	case M_ERROR:
   8899 	case M_HANGUP:
   8900 		/*
   8901 		 * The device has a problem.  We force the ILL down.  It can
   8902 		 * be brought up again manually using SIOCSIFFLAGS (via
   8903 		 * ifconfig or equivalent).
   8904 		 */
   8905 		ASSERT(ipsq != NULL);
   8906 		if (mp->b_rptr < mp->b_wptr)
   8907 			ill->ill_error = (int)(*mp->b_rptr & 0xFF);
   8908 		if (ill->ill_error == 0)
   8909 			ill->ill_error = ENXIO;
   8910 		if (!ill_down_start(q, mp))
   8911 			return;
   8912 		ipif_all_down_tail(ipsq, q, mp, NULL);
   8913 		break;
   8914 	case M_IOCNAK: {
   8915 		iocp = (struct iocblk *)mp->b_rptr;
   8916 
   8917 		ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO);
   8918 		/*
   8919 		 * If this was the first attempt, turn off the fastpath
   8920 		 * probing.
   8921 		 */
   8922 		mutex_enter(&ill->ill_lock);
   8923 		if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
   8924 			ill->ill_dlpi_fastpath_state = IDS_FAILED;
   8925 			mutex_exit(&ill->ill_lock);
   8926 			/*
   8927 			 * don't flush the nce_t entries: we use them
   8928 			 * as an index to the ncec itself.
   8929 			 */
   8930 			ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
   8931 			    ill->ill_name));
   8932 		} else {
   8933 			mutex_exit(&ill->ill_lock);
   8934 		}
   8935 		freemsg(mp);
   8936 		break;
   8937 	}
   8938 	default:
   8939 		ASSERT(0);
   8940 		break;
   8941 	}
   8942 }
   8943 
   8944 /*
   8945  * Update any source route, record route or timestamp options
   8946  * When it fails it has consumed the message and BUMPed the MIB.
   8947  */
   8948 boolean_t
   8949 ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
   8950     ip_recv_attr_t *ira)
   8951 {
   8952 	ipoptp_t	opts;
   8953 	uchar_t		*opt;
   8954 	uint8_t		optval;
   8955 	uint8_t		optlen;
   8956 	ipaddr_t	dst;
   8957 	ipaddr_t	ifaddr;
   8958 	uint32_t	ts;
   8959 	timestruc_t	now;
   8960 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   8961 
   8962 	ip2dbg(("ip_forward_options\n"));
   8963 	dst = ipha->ipha_dst;
   8964 	for (optval = ipoptp_first(&opts, ipha);
   8965 	    optval != IPOPT_EOL;
   8966 	    optval = ipoptp_next(&opts)) {
   8967 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   8968 		opt = opts.ipoptp_cur;
   8969 		optlen = opts.ipoptp_len;
   8970 		ip2dbg(("ip_forward_options: opt %d, len %d\n",
   8971 		    optval, opts.ipoptp_len));
   8972 		switch (optval) {
   8973 			uint32_t off;
   8974 		case IPOPT_SSRR:
   8975 		case IPOPT_LSRR:
   8976 			/* Check if adminstratively disabled */
   8977 			if (!ipst->ips_ip_forward_src_routed) {
   8978 				BUMP_MIB(dst_ill->ill_ip_mib,
   8979 				    ipIfStatsForwProhibits);
   8980 				ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
   8981 				    mp, dst_ill);
   8982 				icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
   8983 				    ira);
   8984 				return (B_FALSE);
   8985 			}
   8986 			if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   8987 				/*
   8988 				 * Must be partial since ip_input_options
   8989 				 * checked for strict.
   8990 				 */
   8991 				break;
   8992 			}
   8993 			off = opt[IPOPT_OFFSET];
   8994 			off--;
   8995 		redo_srr:
   8996 			if (optlen < IP_ADDR_LEN ||
   8997 			    off > optlen - IP_ADDR_LEN) {
   8998 				/* End of source route */
   8999 				ip1dbg((
   9000 				    "ip_forward_options: end of SR\n"));
   9001 				break;
   9002 			}
   9003 			/* Pick a reasonable address on the outbound if */
   9004 			ASSERT(dst_ill != NULL);
   9005 			if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
   9006 			    INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
   9007 			    NULL) != 0) {
   9008 				/* No source! Shouldn't happen */
   9009 				ifaddr = INADDR_ANY;
   9010 			}
   9011 			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   9012 			bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
   9013 			ip1dbg(("ip_forward_options: next hop 0x%x\n",
   9014 			    ntohl(dst)));
   9015 
   9016 			/*
   9017 			 * Check if our address is present more than
   9018 			 * once as consecutive hops in source route.
   9019 			 */
   9020 			if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
   9021 				off += IP_ADDR_LEN;
   9022 				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9023 				goto redo_srr;
   9024 			}
   9025 			ipha->ipha_dst = dst;
   9026 			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9027 			break;
   9028 		case IPOPT_RR:
   9029 			off = opt[IPOPT_OFFSET];
   9030 			off--;
   9031 			if (optlen < IP_ADDR_LEN ||
   9032 			    off > optlen - IP_ADDR_LEN) {
   9033 				/* No more room - ignore */
   9034 				ip1dbg((
   9035 				    "ip_forward_options: end of RR\n"));
   9036 				break;
   9037 			}
   9038 			/* Pick a reasonable address on the outbound if */
   9039 			ASSERT(dst_ill != NULL);
   9040 			if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
   9041 			    INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
   9042 			    NULL) != 0) {
   9043 				/* No source! Shouldn't happen */
   9044 				ifaddr = INADDR_ANY;
   9045 			}
   9046 			bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
   9047 			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9048 			break;
   9049 		case IPOPT_TS:
   9050 			/* Insert timestamp if there is room */
   9051 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   9052 			case IPOPT_TS_TSONLY:
   9053 				off = IPOPT_TS_TIMELEN;
   9054 				break;
   9055 			case IPOPT_TS_PRESPEC:
   9056 			case IPOPT_TS_PRESPEC_RFC791:
   9057 				/* Verify that the address matched */
   9058 				off = opt[IPOPT_OFFSET] - 1;
   9059 				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   9060 				if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   9061 					/* Not for us */
   9062 					break;
   9063 				}
   9064 				/* FALLTHRU */
   9065 			case IPOPT_TS_TSANDADDR:
   9066 				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
   9067 				break;
   9068 			default:
   9069 				/*
   9070 				 * ip_*put_options should have already
   9071 				 * dropped this packet.
   9072 				 */
   9073 				cmn_err(CE_PANIC, "ip_forward_options: "
   9074 				    "unknown IT - bug in ip_input_options?\n");
   9075 				return (B_TRUE);	/* Keep "lint" happy */
   9076 			}
   9077 			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
   9078 				/* Increase overflow counter */
   9079 				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
   9080 				opt[IPOPT_POS_OV_FLG] =
   9081 				    (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
   9082 				    (off << 4));
   9083 				break;
   9084 			}
   9085 			off = opt[IPOPT_OFFSET] - 1;
   9086 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   9087 			case IPOPT_TS_PRESPEC:
   9088 			case IPOPT_TS_PRESPEC_RFC791:
   9089 			case IPOPT_TS_TSANDADDR:
   9090 				/* Pick a reasonable addr on the outbound if */
   9091 				ASSERT(dst_ill != NULL);
   9092 				if (ip_select_source_v4(dst_ill, INADDR_ANY,
   9093 				    dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
   9094 				    NULL, NULL) != 0) {
   9095 					/* No source! Shouldn't happen */
   9096 					ifaddr = INADDR_ANY;
   9097 				}
   9098 				bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
   9099 				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9100 				/* FALLTHRU */
   9101 			case IPOPT_TS_TSONLY:
   9102 				off = opt[IPOPT_OFFSET] - 1;
   9103 				/* Compute # of milliseconds since midnight */
   9104 				gethrestime(&now);
   9105 				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   9106 				    now.tv_nsec / (NANOSEC / MILLISEC);
   9107 				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
   9108 				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
   9109 				break;
   9110 			}
   9111 			break;
   9112 		}
   9113 	}
   9114 	return (B_TRUE);
   9115 }
   9116 
   9117 /*
   9118  * Call ill_frag_timeout to do garbage collection. ill_frag_timeout
   9119  * returns 'true' if there are still fragments left on the queue, in
   9120  * which case we restart the timer.
   9121  */
   9122 void
   9123 ill_frag_timer(void *arg)
   9124 {
   9125 	ill_t	*ill = (ill_t *)arg;
   9126 	boolean_t frag_pending;
   9127 	ip_stack_t *ipst = ill->ill_ipst;
   9128 	time_t	timeout;
   9129 
   9130 	mutex_enter(&ill->ill_lock);
   9131 	ASSERT(!ill->ill_fragtimer_executing);
   9132 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   9133 		ill->ill_frag_timer_id = 0;
   9134 		mutex_exit(&ill->ill_lock);
   9135 		return;
   9136 	}
   9137 	ill->ill_fragtimer_executing = 1;
   9138 	mutex_exit(&ill->ill_lock);
   9139 
   9140 	timeout = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
   9141 	    ipst->ips_ip_reassembly_timeout);
   9142 
   9143 	frag_pending = ill_frag_timeout(ill, timeout);
   9144 
   9145 	/*
   9146 	 * Restart the timer, if we have fragments pending or if someone
   9147 	 * wanted us to be scheduled again.
   9148 	 */
   9149 	mutex_enter(&ill->ill_lock);
   9150 	ill->ill_fragtimer_executing = 0;
   9151 	ill->ill_frag_timer_id = 0;
   9152 	if (frag_pending || ill->ill_fragtimer_needrestart)
   9153 		ill_frag_timer_start(ill);
   9154 	mutex_exit(&ill->ill_lock);
   9155 }
   9156 
   9157 void
   9158 ill_frag_timer_start(ill_t *ill)
   9159 {
   9160 	ip_stack_t *ipst = ill->ill_ipst;
   9161 	clock_t	timeo_ms;
   9162 
   9163 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   9164 
   9165 	/* If the ill is closing or opening don't proceed */
   9166 	if (ill->ill_state_flags & ILL_CONDEMNED)
   9167 		return;
   9168 
   9169 	if (ill->ill_fragtimer_executing) {
   9170 		/*
   9171 		 * ill_frag_timer is currently executing. Just record the
   9172 		 * the fact that we want the timer to be restarted.
   9173 		 * ill_frag_timer will post a timeout before it returns,
   9174 		 * ensuring it will be called again.
   9175 		 */
   9176 		ill->ill_fragtimer_needrestart = 1;
   9177 		return;
   9178 	}
   9179 
   9180 	if (ill->ill_frag_timer_id == 0) {
   9181 		timeo_ms = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
   9182 		    ipst->ips_ip_reassembly_timeout) * SECONDS;
   9183 
   9184 		/*
   9185 		 * The timer is neither running nor is the timeout handler
   9186 		 * executing. Post a timeout so that ill_frag_timer will be
   9187 		 * called
   9188 		 */
   9189 		ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
   9190 		    MSEC_TO_TICK(timeo_ms >> 1));
   9191 		ill->ill_fragtimer_needrestart = 0;
   9192 	}
   9193 }
   9194 
   9195 /*
   9196  * Update any source route, record route or timestamp options.
   9197  * Check that we are at end of strict source route.
   9198  * The options have already been checked for sanity in ip_input_options().
   9199  */
   9200 boolean_t
   9201 ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
   9202 {
   9203 	ipoptp_t	opts;
   9204 	uchar_t		*opt;
   9205 	uint8_t		optval;
   9206 	uint8_t		optlen;
   9207 	ipaddr_t	dst;
   9208 	ipaddr_t	ifaddr;
   9209 	uint32_t	ts;
   9210 	timestruc_t	now;
   9211 	ill_t		*ill = ira->ira_ill;
   9212 	ip_stack_t	*ipst = ill->ill_ipst;
   9213 
   9214 	ip2dbg(("ip_input_local_options\n"));
   9215 
   9216 	for (optval = ipoptp_first(&opts, ipha);
   9217 	    optval != IPOPT_EOL;
   9218 	    optval = ipoptp_next(&opts)) {
   9219 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   9220 		opt = opts.ipoptp_cur;
   9221 		optlen = opts.ipoptp_len;
   9222 		ip2dbg(("ip_input_local_options: opt %d, len %d\n",
   9223 		    optval, optlen));
   9224 		switch (optval) {
   9225 			uint32_t off;
   9226 		case IPOPT_SSRR:
   9227 		case IPOPT_LSRR:
   9228 			off = opt[IPOPT_OFFSET];
   9229 			off--;
   9230 			if (optlen < IP_ADDR_LEN ||
   9231 			    off > optlen - IP_ADDR_LEN) {
   9232 				/* End of source route */
   9233 				ip1dbg(("ip_input_local_options: end of SR\n"));
   9234 				break;
   9235 			}
   9236 			/*
   9237 			 * This will only happen if two consecutive entries
   9238 			 * in the source route contains our address or if
   9239 			 * it is a packet with a loose source route which
   9240 			 * reaches us before consuming the whole source route
   9241 			 */
   9242 			ip1dbg(("ip_input_local_options: not end of SR\n"));
   9243 			if (optval == IPOPT_SSRR) {
   9244 				goto bad_src_route;
   9245 			}
   9246 			/*
   9247 			 * Hack: instead of dropping the packet truncate the
   9248 			 * source route to what has been used by filling the
   9249 			 * rest with IPOPT_NOP.
   9250 			 */
   9251 			opt[IPOPT_OLEN] = (uint8_t)off;
   9252 			while (off < optlen) {
   9253 				opt[off++] = IPOPT_NOP;
   9254 			}
   9255 			break;
   9256 		case IPOPT_RR:
   9257 			off = opt[IPOPT_OFFSET];
   9258 			off--;
   9259 			if (optlen < IP_ADDR_LEN ||
   9260 			    off > optlen - IP_ADDR_LEN) {
   9261 				/* No more room - ignore */
   9262 				ip1dbg((
   9263 				    "ip_input_local_options: end of RR\n"));
   9264 				break;
   9265 			}
   9266 			/* Pick a reasonable address on the outbound if */
   9267 			if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
   9268 			    INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
   9269 			    NULL) != 0) {
   9270 				/* No source! Shouldn't happen */
   9271 				ifaddr = INADDR_ANY;
   9272 			}
   9273 			bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
   9274 			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9275 			break;
   9276 		case IPOPT_TS:
   9277 			/* Insert timestamp if there is romm */
   9278 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   9279 			case IPOPT_TS_TSONLY:
   9280 				off = IPOPT_TS_TIMELEN;
   9281 				break;
   9282 			case IPOPT_TS_PRESPEC:
   9283 			case IPOPT_TS_PRESPEC_RFC791:
   9284 				/* Verify that the address matched */
   9285 				off = opt[IPOPT_OFFSET] - 1;
   9286 				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   9287 				if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   9288 					/* Not for us */
   9289 					break;
   9290 				}
   9291 				/* FALLTHRU */
   9292 			case IPOPT_TS_TSANDADDR:
   9293 				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
   9294 				break;
   9295 			default:
   9296 				/*
   9297 				 * ip_*put_options should have already
   9298 				 * dropped this packet.
   9299 				 */
   9300 				cmn_err(CE_PANIC, "ip_input_local_options: "
   9301 				    "unknown IT - bug in ip_input_options?\n");
   9302 				return (B_TRUE);	/* Keep "lint" happy */
   9303 			}
   9304 			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
   9305 				/* Increase overflow counter */
   9306 				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
   9307 				opt[IPOPT_POS_OV_FLG] =
   9308 				    (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
   9309 				    (off << 4));
   9310 				break;
   9311 			}
   9312 			off = opt[IPOPT_OFFSET] - 1;
   9313 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   9314 			case IPOPT_TS_PRESPEC:
   9315 			case IPOPT_TS_PRESPEC_RFC791:
   9316 			case IPOPT_TS_TSANDADDR:
   9317 				/* Pick a reasonable addr on the outbound if */
   9318 				if (ip_select_source_v4(ill, INADDR_ANY,
   9319 				    ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
   9320 				    &ifaddr, NULL, NULL) != 0) {
   9321 					/* No source! Shouldn't happen */
   9322 					ifaddr = INADDR_ANY;
   9323 				}
   9324 				bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
   9325 				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   9326 				/* FALLTHRU */
   9327 			case IPOPT_TS_TSONLY:
   9328 				off = opt[IPOPT_OFFSET] - 1;
   9329 				/* Compute # of milliseconds since midnight */
   9330 				gethrestime(&now);
   9331 				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   9332 				    now.tv_nsec / (NANOSEC / MILLISEC);
   9333 				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
   9334 				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
   9335 				break;
   9336 			}
   9337 			break;
   9338 		}
   9339 	}
   9340 	return (B_TRUE);
   9341 
   9342 bad_src_route:
   9343 	/* make sure we clear any indication of a hardware checksum */
   9344 	DB_CKSUMFLAGS(mp) = 0;
   9345 	ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
   9346 	icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
   9347 	return (B_FALSE);
   9348 
   9349 }
   9350 
   9351 /*
   9352  * Process IP options in an inbound packet.  Always returns the nexthop.
   9353  * Normally this is the passed in nexthop, but if there is an option
   9354  * that effects the nexthop (such as a source route) that will be returned.
   9355  * Sets *errorp if there is an error, in which case an ICMP error has been sent
   9356  * and mp freed.
   9357  */
   9358 ipaddr_t
   9359 ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
   9360     ip_recv_attr_t *ira, int *errorp)
   9361 {
   9362 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   9363 	ipoptp_t	opts;
   9364 	uchar_t		*opt;
   9365 	uint8_t		optval;
   9366 	uint8_t		optlen;
   9367 	intptr_t	code = 0;
   9368 	ire_t		*ire;
   9369 
   9370 	ip2dbg(("ip_input_options\n"));
   9371 	*errorp = 0;
   9372 	for (optval = ipoptp_first(&opts, ipha);
   9373 	    optval != IPOPT_EOL;
   9374 	    optval = ipoptp_next(&opts)) {
   9375 		opt = opts.ipoptp_cur;
   9376 		optlen = opts.ipoptp_len;
   9377 		ip2dbg(("ip_input_options: opt %d, len %d\n",
   9378 		    optval, optlen));
   9379 		/*
   9380 		 * Note: we need to verify the checksum before we
   9381 		 * modify anything thus this routine only extracts the next
   9382 		 * hop dst from any source route.
   9383 		 */
   9384 		switch (optval) {
   9385 			uint32_t off;
   9386 		case IPOPT_SSRR:
   9387 		case IPOPT_LSRR:
   9388 			if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   9389 				if (optval == IPOPT_SSRR) {
   9390 					ip1dbg(("ip_input_options: not next"
   9391 					    " strict source route 0x%x\n",
   9392 					    ntohl(dst)));
   9393 					code = (char *)&ipha->ipha_dst -
   9394 					    (char *)ipha;
   9395 					goto param_prob; /* RouterReq's */
   9396 				}
   9397 				ip2dbg(("ip_input_options: "
   9398 				    "not next source route 0x%x\n",
   9399 				    ntohl(dst)));
   9400 				break;
   9401 			}
   9402 
   9403 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   9404 				ip1dbg((
   9405 				    "ip_input_options: bad option offset\n"));
   9406 				code = (char *)&opt[IPOPT_OLEN] -
   9407 				    (char *)ipha;
   9408 				goto param_prob;
   9409 			}
   9410 			off = opt[IPOPT_OFFSET];
   9411 			off--;
   9412 		redo_srr:
   9413 			if (optlen < IP_ADDR_LEN ||
   9414 			    off > optlen - IP_ADDR_LEN) {
   9415 				/* End of source route */
   9416 				ip1dbg(("ip_input_options: end of SR\n"));
   9417 				break;
   9418 			}
   9419 			bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   9420 			ip1dbg(("ip_input_options: next hop 0x%x\n",
   9421 			    ntohl(dst)));
   9422 
   9423 			/*
   9424 			 * Check if our address is present more than
   9425 			 * once as consecutive hops in source route.
   9426 			 * XXX verify per-interface ip_forwarding
   9427 			 * for source route?
   9428 			 */
   9429 			if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
   9430 				off += IP_ADDR_LEN;
   9431 				goto redo_srr;
   9432 			}
   9433 
   9434 			if (dst == htonl(INADDR_LOOPBACK)) {
   9435 				ip1dbg(("ip_input_options: loopback addr in "
   9436 				    "source route!\n"));
   9437 				goto bad_src_route;
   9438 			}
   9439 			/*
   9440 			 * For strict: verify that dst is directly
   9441 			 * reachable.
   9442 			 */
   9443 			if (optval == IPOPT_SSRR) {
   9444 				ire = ire_ftable_lookup_v4(dst, 0, 0,
   9445 				    IRE_INTERFACE, NULL, ALL_ZONES,
   9446 				    ira->ira_tsl,
   9447 				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
   9448 				    NULL);
   9449 				if (ire == NULL) {
   9450 					ip1dbg(("ip_input_options: SSRR not "
   9451 					    "directly reachable: 0x%x\n",
   9452 					    ntohl(dst)));
   9453 					goto bad_src_route;
   9454 				}
   9455 				ire_refrele(ire);
   9456 			}
   9457 			/*
   9458 			 * Defer update of the offset and the record route
   9459 			 * until the packet is forwarded.
   9460 			 */
   9461 			break;
   9462 		case IPOPT_RR:
   9463 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   9464 				ip1dbg((
   9465 				    "ip_input_options: bad option offset\n"));
   9466 				code = (char *)&opt[IPOPT_OLEN] -
   9467 				    (char *)ipha;
   9468 				goto param_prob;
   9469 			}
   9470 			break;
   9471 		case IPOPT_TS:
   9472 			/*
   9473 			 * Verify that length >= 5 and that there is either
   9474 			 * room for another timestamp or that the overflow
   9475 			 * counter is not maxed out.
   9476 			 */
   9477 			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
   9478 			if (optlen < IPOPT_MINLEN_IT) {
   9479 				goto param_prob;
   9480 			}
   9481 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   9482 				ip1dbg((
   9483 				    "ip_input_options: bad option offset\n"));
   9484 				code = (char *)&opt[IPOPT_OFFSET] -
   9485 				    (char *)ipha;
   9486 				goto param_prob;
   9487 			}
   9488 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   9489 			case IPOPT_TS_TSONLY:
   9490 				off = IPOPT_TS_TIMELEN;
   9491 				break;
   9492 			case IPOPT_TS_TSANDADDR:
   9493 			case IPOPT_TS_PRESPEC:
   9494 			case IPOPT_TS_PRESPEC_RFC791:
   9495 				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
   9496 				break;
   9497 			default:
   9498 				code = (char *)&opt[IPOPT_POS_OV_FLG] -
   9499 				    (char *)ipha;
   9500 				goto param_prob;
   9501 			}
   9502 			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
   9503 			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
   9504 				/*
   9505 				 * No room and the overflow counter is 15
   9506 				 * already.
   9507 				 */
   9508 				goto param_prob;
   9509 			}
   9510 			break;
   9511 		}
   9512 	}
   9513 
   9514 	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
   9515 		return (dst);
   9516 	}
   9517 
   9518 	ip1dbg(("ip_input_options: error processing IP options."));
   9519 	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
   9520 
   9521 param_prob:
   9522 	/* make sure we clear any indication of a hardware checksum */
   9523 	DB_CKSUMFLAGS(mp) = 0;
   9524 	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
   9525 	icmp_param_problem(mp, (uint8_t)code, ira);
   9526 	*errorp = -1;
   9527 	return (dst);
   9528 
   9529 bad_src_route:
   9530 	/* make sure we clear any indication of a hardware checksum */
   9531 	DB_CKSUMFLAGS(mp) = 0;
   9532 	ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
   9533 	icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
   9534 	*errorp = -1;
   9535 	return (dst);
   9536 }
   9537 
   9538 /*
   9539  * IP & ICMP info in >=14 msg's ...
   9540  *  - ip fixed part (mib2_ip_t)
   9541  *  - icmp fixed part (mib2_icmp_t)
   9542  *  - ipAddrEntryTable (ip 20)		all IPv4 ipifs
   9543  *  - ipRouteEntryTable (ip 21)		all IPv4 IREs
   9544  *  - ipNetToMediaEntryTable (ip 22)	all IPv4 Neighbor Cache entries
   9545  *  - ipRouteAttributeTable (ip 102)	labeled routes
   9546  *  - ip multicast membership (ip_member_t)
   9547  *  - ip multicast source filtering (ip_grpsrc_t)
   9548  *  - igmp fixed part (struct igmpstat)
   9549  *  - multicast routing stats (struct mrtstat)
   9550  *  - multicast routing vifs (array of struct vifctl)
   9551  *  - multicast routing routes (array of struct mfcctl)
   9552  *  - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
   9553  *					One per ill plus one generic
   9554  *  - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
   9555  *					One per ill plus one generic
   9556  *  - ipv6RouteEntry			all IPv6 IREs
   9557  *  - ipv6RouteAttributeTable (ip6 102)	labeled routes
   9558  *  - ipv6NetToMediaEntry		all IPv6 Neighbor Cache entries
   9559  *  - ipv6AddrEntry			all IPv6 ipifs
   9560  *  - ipv6 multicast membership (ipv6_member_t)
   9561  *  - ipv6 multicast source filtering (ipv6_grpsrc_t)
   9562  *
   9563  * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
   9564  * already filled in by the caller.
   9565  * If legacy_req is true then MIB structures need to be truncated to their
   9566  * legacy sizes before being returned.
   9567  * Return value of 0 indicates that no messages were sent and caller
   9568  * should free mpctl.
   9569  */
   9570 int
   9571 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
   9572 {
   9573 	ip_stack_t *ipst;
   9574 	sctp_stack_t *sctps;
   9575 
   9576 	if (q->q_next != NULL) {
   9577 		ipst = ILLQ_TO_IPST(q);
   9578 	} else {
   9579 		ipst = CONNQ_TO_IPST(q);
   9580 	}
   9581 	ASSERT(ipst != NULL);
   9582 	sctps = ipst->ips_netstack->netstack_sctp;
   9583 
   9584 	if (mpctl == NULL || mpctl->b_cont == NULL) {
   9585 		return (0);
   9586 	}
   9587 
   9588 	/*
   9589 	 * For the purposes of the (broken) packet shell use
   9590 	 * of the level we make sure MIB2_TCP/MIB2_UDP can be used
   9591 	 * to make TCP and UDP appear first in the list of mib items.
   9592 	 * TBD: We could expand this and use it in netstat so that
   9593 	 * the kernel doesn't have to produce large tables (connections,
   9594 	 * routes, etc) when netstat only wants the statistics or a particular
   9595 	 * table.
   9596 	 */
   9597 	if (!(level == MIB2_TCP || level == MIB2_UDP)) {
   9598 		if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) {
   9599 			return (1);
   9600 		}
   9601 	}
   9602 
   9603 	if (level != MIB2_TCP) {
   9604 		if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
   9605 			return (1);
   9606 		}
   9607 	}
   9608 
   9609 	if (level != MIB2_UDP) {
   9610 		if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
   9611 			return (1);
   9612 		}
   9613 	}
   9614 
   9615 	if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
   9616 	    ipst, legacy_req)) == NULL) {
   9617 		return (1);
   9618 	}
   9619 
   9620 	if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst,
   9621 	    legacy_req)) == NULL) {
   9622 		return (1);
   9623 	}
   9624 
   9625 	if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) {
   9626 		return (1);
   9627 	}
   9628 
   9629 	if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) {
   9630 		return (1);
   9631 	}
   9632 
   9633 	if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) {
   9634 		return (1);
   9635 	}
   9636 
   9637 	if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) {
   9638 		return (1);
   9639 	}
   9640 
   9641 	if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst,
   9642 	    legacy_req)) == NULL) {
   9643 		return (1);
   9644 	}
   9645 
   9646 	if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst,
   9647 	    legacy_req)) == NULL) {
   9648 		return (1);
   9649 	}
   9650 
   9651 	if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) {
   9652 		return (1);
   9653 	}
   9654 
   9655 	if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) {
   9656 		return (1);
   9657 	}
   9658 
   9659 	if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) {
   9660 		return (1);
   9661 	}
   9662 
   9663 	if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) {
   9664 		return (1);
   9665 	}
   9666 
   9667 	if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) {
   9668 		return (1);
   9669 	}
   9670 
   9671 	if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) {
   9672 		return (1);
   9673 	}
   9674 
   9675 	mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
   9676 	if (mpctl == NULL)
   9677 		return (1);
   9678 
   9679 	mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
   9680 	if (mpctl == NULL)
   9681 		return (1);
   9682 
   9683 	if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
   9684 		return (1);
   9685 	}
   9686 	if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
   9687 		return (1);
   9688 	}
   9689 	freemsg(mpctl);
   9690 	return (1);
   9691 }
   9692 
   9693 /* Get global (legacy) IPv4 statistics */
   9694 static mblk_t *
   9695 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
   9696     ip_stack_t *ipst, boolean_t legacy_req)
   9697 {
   9698 	mib2_ip_t		old_ip_mib;
   9699 	struct opthdr		*optp;
   9700 	mblk_t			*mp2ctl;
   9701 	mib2_ipAddrEntry_t	mae;
   9702 
   9703 	/*
   9704 	 * make a copy of the original message
   9705 	 */
   9706 	mp2ctl = copymsg(mpctl);
   9707 
   9708 	/* fixed length IP structure... */
   9709 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9710 	optp->level = MIB2_IP;
   9711 	optp->name = 0;
   9712 	SET_MIB(old_ip_mib.ipForwarding,
   9713 	    (WE_ARE_FORWARDING(ipst) ? 1 : 2));
   9714 	SET_MIB(old_ip_mib.ipDefaultTTL,
   9715 	    (uint32_t)ipst->ips_ip_def_ttl);
   9716 	SET_MIB(old_ip_mib.ipReasmTimeout,
   9717 	    ipst->ips_ip_reassembly_timeout);
   9718 	SET_MIB(old_ip_mib.ipAddrEntrySize,
   9719 	    (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
   9720 	    sizeof (mib2_ipAddrEntry_t));
   9721 	SET_MIB(old_ip_mib.ipRouteEntrySize,
   9722 	    sizeof (mib2_ipRouteEntry_t));
   9723 	SET_MIB(old_ip_mib.ipNetToMediaEntrySize,
   9724 	    sizeof (mib2_ipNetToMediaEntry_t));
   9725 	SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
   9726 	SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
   9727 	SET_MIB(old_ip_mib.ipRouteAttributeSize,
   9728 	    sizeof (mib2_ipAttributeEntry_t));
   9729 	SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
   9730 	SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
   9731 
   9732 	/*
   9733 	 * Grab the statistics from the new IP MIB
   9734 	 */
   9735 	SET_MIB(old_ip_mib.ipInReceives,
   9736 	    (uint32_t)ipmib->ipIfStatsHCInReceives);
   9737 	SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors);
   9738 	SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors);
   9739 	SET_MIB(old_ip_mib.ipForwDatagrams,
   9740 	    (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams);
   9741 	SET_MIB(old_ip_mib.ipInUnknownProtos,
   9742 	    ipmib->ipIfStatsInUnknownProtos);
   9743 	SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards);
   9744 	SET_MIB(old_ip_mib.ipInDelivers,
   9745 	    (uint32_t)ipmib->ipIfStatsHCInDelivers);
   9746 	SET_MIB(old_ip_mib.ipOutRequests,
   9747 	    (uint32_t)ipmib->ipIfStatsHCOutRequests);
   9748 	SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards);
   9749 	SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes);
   9750 	SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds);
   9751 	SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs);
   9752 	SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails);
   9753 	SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs);
   9754 	SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails);
   9755 	SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates);
   9756 
   9757 	/* ipRoutingDiscards is not being used */
   9758 	SET_MIB(old_ip_mib.ipRoutingDiscards, 0);
   9759 	SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs);
   9760 	SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts);
   9761 	SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs);
   9762 	SET_MIB(old_ip_mib.ipReasmDuplicates,
   9763 	    ipmib->ipIfStatsReasmDuplicates);
   9764 	SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups);
   9765 	SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits);
   9766 	SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs);
   9767 	SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows);
   9768 	SET_MIB(old_ip_mib.rawipInOverflows,
   9769 	    ipmib->rawipIfStatsInOverflows);
   9770 
   9771 	SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded);
   9772 	SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed);
   9773 	SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion);
   9774 	SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion);
   9775 	SET_MIB(old_ip_mib.ipOutSwitchIPv6,
   9776 	    ipmib->ipIfStatsOutSwitchIPVersion);
   9777 
   9778 	if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib,
   9779 	    (int)sizeof (old_ip_mib))) {
   9780 		ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
   9781 		    (uint_t)sizeof (old_ip_mib)));
   9782 	}
   9783 
   9784 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   9785 	ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
   9786 	    (int)optp->level, (int)optp->name, (int)optp->len));
   9787 	qreply(q, mpctl);
   9788 	return (mp2ctl);
   9789 }
   9790 
   9791 /* Per interface IPv4 statistics */
   9792 static mblk_t *
   9793 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
   9794     boolean_t legacy_req)
   9795 {
   9796 	struct opthdr		*optp;
   9797 	mblk_t			*mp2ctl;
   9798 	ill_t			*ill;
   9799 	ill_walk_context_t	ctx;
   9800 	mblk_t			*mp_tail = NULL;
   9801 	mib2_ipIfStatsEntry_t	global_ip_mib;
   9802 	mib2_ipAddrEntry_t	mae;
   9803 
   9804 	/*
   9805 	 * Make a copy of the original message
   9806 	 */
   9807 	mp2ctl = copymsg(mpctl);
   9808 
   9809 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9810 	optp->level = MIB2_IP;
   9811 	optp->name = MIB2_IP_TRAFFIC_STATS;
   9812 	/* Include "unknown interface" ip_mib */
   9813 	ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
   9814 	ipst->ips_ip_mib.ipIfStatsIfIndex =
   9815 	    MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
   9816 	SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding,
   9817 	    (ipst->ips_ip_forwarding ? 1 : 2));
   9818 	SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL,
   9819 	    (uint32_t)ipst->ips_ip_def_ttl);
   9820 	SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize,
   9821 	    sizeof (mib2_ipIfStatsEntry_t));
   9822 	SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize,
   9823 	    sizeof (mib2_ipAddrEntry_t));
   9824 	SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize,
   9825 	    sizeof (mib2_ipRouteEntry_t));
   9826 	SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize,
   9827 	    sizeof (mib2_ipNetToMediaEntry_t));
   9828 	SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize,
   9829 	    sizeof (ip_member_t));
   9830 	SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize,
   9831 	    sizeof (ip_grpsrc_t));
   9832 
   9833 	bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib));
   9834 
   9835 	if (legacy_req) {
   9836 		SET_MIB(global_ip_mib.ipIfStatsAddrEntrySize,
   9837 		    LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t));
   9838 	}
   9839 
   9840 	if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   9841 	    (char *)&global_ip_mib, (int)sizeof (global_ip_mib))) {
   9842 		ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
   9843 		    "failed to allocate %u bytes\n",
   9844 		    (uint_t)sizeof (global_ip_mib)));
   9845 	}
   9846 
   9847 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   9848 	ill = ILL_START_WALK_V4(&ctx, ipst);
   9849 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   9850 		ill->ill_ip_mib->ipIfStatsIfIndex =
   9851 		    ill->ill_phyint->phyint_ifindex;
   9852 		SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
   9853 		    (ipst->ips_ip_forwarding ? 1 : 2));
   9854 		SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL,
   9855 		    (uint32_t)ipst->ips_ip_def_ttl);
   9856 
   9857 		ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib);
   9858 		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   9859 		    (char *)ill->ill_ip_mib,
   9860 		    (int)sizeof (*ill->ill_ip_mib))) {
   9861 			ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
   9862 			    "failed to allocate %u bytes\n",
   9863 			    (uint_t)sizeof (*ill->ill_ip_mib)));
   9864 		}
   9865 	}
   9866 	rw_exit(&ipst->ips_ill_g_lock);
   9867 
   9868 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   9869 	ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
   9870 	    "level %d, name %d, len %d\n",
   9871 	    (int)optp->level, (int)optp->name, (int)optp->len));
   9872 	qreply(q, mpctl);
   9873 
   9874 	if (mp2ctl == NULL)
   9875 		return (NULL);
   9876 
   9877 	return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst,
   9878 	    legacy_req));
   9879 }
   9880 
   9881 /* Global IPv4 ICMP statistics */
   9882 static mblk_t *
   9883 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   9884 {
   9885 	struct opthdr		*optp;
   9886 	mblk_t			*mp2ctl;
   9887 
   9888 	/*
   9889 	 * Make a copy of the original message
   9890 	 */
   9891 	mp2ctl = copymsg(mpctl);
   9892 
   9893 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9894 	optp->level = MIB2_ICMP;
   9895 	optp->name = 0;
   9896 	if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib,
   9897 	    (int)sizeof (ipst->ips_icmp_mib))) {
   9898 		ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
   9899 		    (uint_t)sizeof (ipst->ips_icmp_mib)));
   9900 	}
   9901 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   9902 	ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
   9903 	    (int)optp->level, (int)optp->name, (int)optp->len));
   9904 	qreply(q, mpctl);
   9905 	return (mp2ctl);
   9906 }
   9907 
   9908 /* Global IPv4 IGMP statistics */
   9909 static mblk_t *
   9910 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   9911 {
   9912 	struct opthdr		*optp;
   9913 	mblk_t			*mp2ctl;
   9914 
   9915 	/*
   9916 	 * make a copy of the original message
   9917 	 */
   9918 	mp2ctl = copymsg(mpctl);
   9919 
   9920 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9921 	optp->level = EXPER_IGMP;
   9922 	optp->name = 0;
   9923 	if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat,
   9924 	    (int)sizeof (ipst->ips_igmpstat))) {
   9925 		ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
   9926 		    (uint_t)sizeof (ipst->ips_igmpstat)));
   9927 	}
   9928 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   9929 	ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
   9930 	    (int)optp->level, (int)optp->name, (int)optp->len));
   9931 	qreply(q, mpctl);
   9932 	return (mp2ctl);
   9933 }
   9934 
   9935 /* Global IPv4 Multicast Routing statistics */
   9936 static mblk_t *
   9937 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   9938 {
   9939 	struct opthdr		*optp;
   9940 	mblk_t			*mp2ctl;
   9941 
   9942 	/*
   9943 	 * make a copy of the original message
   9944 	 */
   9945 	mp2ctl = copymsg(mpctl);
   9946 
   9947 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9948 	optp->level = EXPER_DVMRP;
   9949 	optp->name = 0;
   9950 	if (!ip_mroute_stats(mpctl->b_cont, ipst)) {
   9951 		ip0dbg(("ip_mroute_stats: failed\n"));
   9952 	}
   9953 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   9954 	ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
   9955 	    (int)optp->level, (int)optp->name, (int)optp->len));
   9956 	qreply(q, mpctl);
   9957 	return (mp2ctl);
   9958 }
   9959 
   9960 /* IPv4 address information */
   9961 static mblk_t *
   9962 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
   9963     boolean_t legacy_req)
   9964 {
   9965 	struct opthdr		*optp;
   9966 	mblk_t			*mp2ctl;
   9967 	mblk_t			*mp_tail = NULL;
   9968 	ill_t			*ill;
   9969 	ipif_t			*ipif;
   9970 	uint_t			bitval;
   9971 	mib2_ipAddrEntry_t	mae;
   9972 	size_t			mae_size;
   9973 	zoneid_t		zoneid;
   9974 	ill_walk_context_t	ctx;
   9975 
   9976 	/*
   9977 	 * make a copy of the original message
   9978 	 */
   9979 	mp2ctl = copymsg(mpctl);
   9980 
   9981 	mae_size = (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
   9982 	    sizeof (mib2_ipAddrEntry_t);
   9983 
   9984 	/* ipAddrEntryTable */
   9985 
   9986 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   9987 	optp->level = MIB2_IP;
   9988 	optp->name = MIB2_IP_ADDR;
   9989 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   9990 
   9991 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   9992 	ill = ILL_START_WALK_V4(&ctx, ipst);
   9993 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   9994 		for (ipif = ill->ill_ipif; ipif != NULL;
   9995 		    ipif = ipif->ipif_next) {
   9996 			if (ipif->ipif_zoneid != zoneid &&
   9997 			    ipif->ipif_zoneid != ALL_ZONES)
   9998 				continue;
   9999 			/* Sum of count from dead IRE_LO* and our current */
   10000 			mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
   10001 			if (ipif->ipif_ire_local != NULL) {
   10002 				mae.ipAdEntInfo.ae_ibcnt +=
   10003 				    ipif->ipif_ire_local->ire_ib_pkt_count;
   10004 			}
   10005 			mae.ipAdEntInfo.ae_obcnt = 0;
   10006 			mae.ipAdEntInfo.ae_focnt = 0;
   10007 
   10008 			ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
   10009 			    OCTET_LENGTH);
   10010 			mae.ipAdEntIfIndex.o_length =
   10011 			    mi_strlen(mae.ipAdEntIfIndex.o_bytes);
   10012 			mae.ipAdEntAddr = ipif->ipif_lcl_addr;
   10013 			mae.ipAdEntNetMask = ipif->ipif_net_mask;
   10014 			mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
   10015 			mae.ipAdEntInfo.ae_subnet_len =
   10016 			    ip_mask_to_plen(ipif->ipif_net_mask);
   10017 			mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
   10018 			for (bitval = 1;
   10019 			    bitval &&
   10020 			    !(bitval & ipif->ipif_brd_addr);
   10021 			    bitval <<= 1)
   10022 				noop;
   10023 			mae.ipAdEntBcastAddr = bitval;
   10024 			mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
   10025 			mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
   10026 			mae.ipAdEntInfo.ae_metric  = ipif->ipif_ill->ill_metric;
   10027 			mae.ipAdEntInfo.ae_broadcast_addr =
   10028 			    ipif->ipif_brd_addr;
   10029 			mae.ipAdEntInfo.ae_pp_dst_addr =
   10030 			    ipif->ipif_pp_dst_addr;
   10031 			mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
   10032 			    ill->ill_flags | ill->ill_phyint->phyint_flags;
   10033 			mae.ipAdEntRetransmitTime =
   10034 			    ill->ill_reachable_retrans_time;
   10035 
   10036 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10037 			    (char *)&mae, (int)mae_size)) {
   10038 				ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
   10039 				    "allocate %u bytes\n", (uint_t)mae_size));
   10040 			}
   10041 		}
   10042 	}
   10043 	rw_exit(&ipst->ips_ill_g_lock);
   10044 
   10045 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10046 	ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
   10047 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10048 	qreply(q, mpctl);
   10049 	return (mp2ctl);
   10050 }
   10051 
   10052 /* IPv6 address information */
   10053 static mblk_t *
   10054 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
   10055     boolean_t legacy_req)
   10056 {
   10057 	struct opthdr		*optp;
   10058 	mblk_t			*mp2ctl;
   10059 	mblk_t			*mp_tail = NULL;
   10060 	ill_t			*ill;
   10061 	ipif_t			*ipif;
   10062 	mib2_ipv6AddrEntry_t	mae6;
   10063 	size_t			mae6_size;
   10064 	zoneid_t		zoneid;
   10065 	ill_walk_context_t	ctx;
   10066 
   10067 	/*
   10068 	 * make a copy of the original message
   10069 	 */
   10070 	mp2ctl = copymsg(mpctl);
   10071 
   10072 	mae6_size = (legacy_req) ?
   10073 	    LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t) :
   10074 	    sizeof (mib2_ipv6AddrEntry_t);
   10075 
   10076 	/* ipv6AddrEntryTable */
   10077 
   10078 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10079 	optp->level = MIB2_IP6;
   10080 	optp->name = MIB2_IP6_ADDR;
   10081 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10082 
   10083 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10084 	ill = ILL_START_WALK_V6(&ctx, ipst);
   10085 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10086 		for (ipif = ill->ill_ipif; ipif != NULL;
   10087 		    ipif = ipif->ipif_next) {
   10088 			if (ipif->ipif_zoneid != zoneid &&
   10089 			    ipif->ipif_zoneid != ALL_ZONES)
   10090 				continue;
   10091 			/* Sum of count from dead IRE_LO* and our current */
   10092 			mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
   10093 			if (ipif->ipif_ire_local != NULL) {
   10094 				mae6.ipv6AddrInfo.ae_ibcnt +=
   10095 				    ipif->ipif_ire_local->ire_ib_pkt_count;
   10096 			}
   10097 			mae6.ipv6AddrInfo.ae_obcnt = 0;
   10098 			mae6.ipv6AddrInfo.ae_focnt = 0;
   10099 
   10100 			ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
   10101 			    OCTET_LENGTH);
   10102 			mae6.ipv6AddrIfIndex.o_length =
   10103 			    mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
   10104 			mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
   10105 			mae6.ipv6AddrPfxLength =
   10106 			    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
   10107 			mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
   10108 			mae6.ipv6AddrInfo.ae_subnet_len =
   10109 			    mae6.ipv6AddrPfxLength;
   10110 			mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
   10111 
   10112 			/* Type: stateless(1), stateful(2), unknown(3) */
   10113 			if (ipif->ipif_flags & IPIF_ADDRCONF)
   10114 				mae6.ipv6AddrType = 1;
   10115 			else
   10116 				mae6.ipv6AddrType = 2;
   10117 			/* Anycast: true(1), false(2) */
   10118 			if (ipif->ipif_flags & IPIF_ANYCAST)
   10119 				mae6.ipv6AddrAnycastFlag = 1;
   10120 			else
   10121 				mae6.ipv6AddrAnycastFlag = 2;
   10122 
   10123 			/*
   10124 			 * Address status: preferred(1), deprecated(2),
   10125 			 * invalid(3), inaccessible(4), unknown(5)
   10126 			 */
   10127 			if (ipif->ipif_flags & IPIF_NOLOCAL)
   10128 				mae6.ipv6AddrStatus = 3;
   10129 			else if (ipif->ipif_flags & IPIF_DEPRECATED)
   10130 				mae6.ipv6AddrStatus = 2;
   10131 			else
   10132 				mae6.ipv6AddrStatus = 1;
   10133 			mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
   10134 			mae6.ipv6AddrInfo.ae_metric  =
   10135 			    ipif->ipif_ill->ill_metric;
   10136 			mae6.ipv6AddrInfo.ae_pp_dst_addr =
   10137 			    ipif->ipif_v6pp_dst_addr;
   10138 			mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
   10139 			    ill->ill_flags | ill->ill_phyint->phyint_flags;
   10140 			mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET;
   10141 			mae6.ipv6AddrIdentifier = ill->ill_token;
   10142 			mae6.ipv6AddrIdentifierLen = ill->ill_token_length;
   10143 			mae6.ipv6AddrReachableTime = ill->ill_reachable_time;
   10144 			mae6.ipv6AddrRetransmitTime =
   10145 			    ill->ill_reachable_retrans_time;
   10146 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10147 			    (char *)&mae6, (int)mae6_size)) {
   10148 				ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
   10149 				    "allocate %u bytes\n",
   10150 				    (uint_t)mae6_size));
   10151 			}
   10152 		}
   10153 	}
   10154 	rw_exit(&ipst->ips_ill_g_lock);
   10155 
   10156 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10157 	ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
   10158 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10159 	qreply(q, mpctl);
   10160 	return (mp2ctl);
   10161 }
   10162 
   10163 /* IPv4 multicast group membership. */
   10164 static mblk_t *
   10165 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10166 {
   10167 	struct opthdr		*optp;
   10168 	mblk_t			*mp2ctl;
   10169 	ill_t			*ill;
   10170 	ipif_t			*ipif;
   10171 	ilm_t			*ilm;
   10172 	ip_member_t		ipm;
   10173 	mblk_t			*mp_tail = NULL;
   10174 	ill_walk_context_t	ctx;
   10175 	zoneid_t		zoneid;
   10176 
   10177 	/*
   10178 	 * make a copy of the original message
   10179 	 */
   10180 	mp2ctl = copymsg(mpctl);
   10181 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10182 
   10183 	/* ipGroupMember table */
   10184 	optp = (struct opthdr *)&mpctl->b_rptr[
   10185 	    sizeof (struct T_optmgmt_ack)];
   10186 	optp->level = MIB2_IP;
   10187 	optp->name = EXPER_IP_GROUP_MEMBERSHIP;
   10188 
   10189 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10190 	ill = ILL_START_WALK_V4(&ctx, ipst);
   10191 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10192 		/* Make sure the ill isn't going away. */
   10193 		if (!ill_check_and_refhold(ill))
   10194 			continue;
   10195 		rw_exit(&ipst->ips_ill_g_lock);
   10196 		rw_enter(&ill->ill_mcast_lock, RW_READER);
   10197 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
   10198 			if (ilm->ilm_zoneid != zoneid &&
   10199 			    ilm->ilm_zoneid != ALL_ZONES)
   10200 				continue;
   10201 
   10202 			/* Is there an ipif for ilm_ifaddr? */
   10203 			for (ipif = ill->ill_ipif; ipif != NULL;
   10204 			    ipif = ipif->ipif_next) {
   10205 				if (!IPIF_IS_CONDEMNED(ipif) &&
   10206 				    ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
   10207 				    ilm->ilm_ifaddr != INADDR_ANY)
   10208 					break;
   10209 			}
   10210 			if (ipif != NULL) {
   10211 				ipif_get_name(ipif,
   10212 				    ipm.ipGroupMemberIfIndex.o_bytes,
   10213 				    OCTET_LENGTH);
   10214 			} else {
   10215 				ill_get_name(ill,
   10216 				    ipm.ipGroupMemberIfIndex.o_bytes,
   10217 				    OCTET_LENGTH);
   10218 			}
   10219 			ipm.ipGroupMemberIfIndex.o_length =
   10220 			    mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
   10221 
   10222 			ipm.ipGroupMemberAddress = ilm->ilm_addr;
   10223 			ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
   10224 			ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
   10225 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10226 			    (char *)&ipm, (int)sizeof (ipm))) {
   10227 				ip1dbg(("ip_snmp_get_mib2_ip_group: "
   10228 				    "failed to allocate %u bytes\n",
   10229 				    (uint_t)sizeof (ipm)));
   10230 			}
   10231 		}
   10232 		rw_exit(&ill->ill_mcast_lock);
   10233 		ill_refrele(ill);
   10234 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10235 	}
   10236 	rw_exit(&ipst->ips_ill_g_lock);
   10237 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10238 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
   10239 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10240 	qreply(q, mpctl);
   10241 	return (mp2ctl);
   10242 }
   10243 
   10244 /* IPv6 multicast group membership. */
   10245 static mblk_t *
   10246 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10247 {
   10248 	struct opthdr		*optp;
   10249 	mblk_t			*mp2ctl;
   10250 	ill_t			*ill;
   10251 	ilm_t			*ilm;
   10252 	ipv6_member_t		ipm6;
   10253 	mblk_t			*mp_tail = NULL;
   10254 	ill_walk_context_t	ctx;
   10255 	zoneid_t		zoneid;
   10256 
   10257 	/*
   10258 	 * make a copy of the original message
   10259 	 */
   10260 	mp2ctl = copymsg(mpctl);
   10261 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10262 
   10263 	/* ip6GroupMember table */
   10264 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10265 	optp->level = MIB2_IP6;
   10266 	optp->name = EXPER_IP6_GROUP_MEMBERSHIP;
   10267 
   10268 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10269 	ill = ILL_START_WALK_V6(&ctx, ipst);
   10270 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10271 		/* Make sure the ill isn't going away. */
   10272 		if (!ill_check_and_refhold(ill))
   10273 			continue;
   10274 		rw_exit(&ipst->ips_ill_g_lock);
   10275 		/*
   10276 		 * Normally we don't have any members on under IPMP interfaces.
   10277 		 * We report them as a debugging aid.
   10278 		 */
   10279 		rw_enter(&ill->ill_mcast_lock, RW_READER);
   10280 		ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
   10281 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
   10282 			if (ilm->ilm_zoneid != zoneid &&
   10283 			    ilm->ilm_zoneid != ALL_ZONES)
   10284 				continue;	/* not this zone */
   10285 			ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
   10286 			ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
   10287 			ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
   10288 			if (!snmp_append_data2(mpctl->b_cont,
   10289 			    &mp_tail,
   10290 			    (char *)&ipm6, (int)sizeof (ipm6))) {
   10291 				ip1dbg(("ip_snmp_get_mib2_ip6_group: "
   10292 				    "failed to allocate %u bytes\n",
   10293 				    (uint_t)sizeof (ipm6)));
   10294 			}
   10295 		}
   10296 		rw_exit(&ill->ill_mcast_lock);
   10297 		ill_refrele(ill);
   10298 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10299 	}
   10300 	rw_exit(&ipst->ips_ill_g_lock);
   10301 
   10302 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10303 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
   10304 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10305 	qreply(q, mpctl);
   10306 	return (mp2ctl);
   10307 }
   10308 
   10309 /* IP multicast filtered sources */
   10310 static mblk_t *
   10311 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10312 {
   10313 	struct opthdr		*optp;
   10314 	mblk_t			*mp2ctl;
   10315 	ill_t			*ill;
   10316 	ipif_t			*ipif;
   10317 	ilm_t			*ilm;
   10318 	ip_grpsrc_t		ips;
   10319 	mblk_t			*mp_tail = NULL;
   10320 	ill_walk_context_t	ctx;
   10321 	zoneid_t		zoneid;
   10322 	int			i;
   10323 	slist_t			*sl;
   10324 
   10325 	/*
   10326 	 * make a copy of the original message
   10327 	 */
   10328 	mp2ctl = copymsg(mpctl);
   10329 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10330 
   10331 	/* ipGroupSource table */
   10332 	optp = (struct opthdr *)&mpctl->b_rptr[
   10333 	    sizeof (struct T_optmgmt_ack)];
   10334 	optp->level = MIB2_IP;
   10335 	optp->name = EXPER_IP_GROUP_SOURCES;
   10336 
   10337 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10338 	ill = ILL_START_WALK_V4(&ctx, ipst);
   10339 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10340 		/* Make sure the ill isn't going away. */
   10341 		if (!ill_check_and_refhold(ill))
   10342 			continue;
   10343 		rw_exit(&ipst->ips_ill_g_lock);
   10344 		rw_enter(&ill->ill_mcast_lock, RW_READER);
   10345 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
   10346 			sl = ilm->ilm_filter;
   10347 			if (ilm->ilm_zoneid != zoneid &&
   10348 			    ilm->ilm_zoneid != ALL_ZONES)
   10349 				continue;
   10350 			if (SLIST_IS_EMPTY(sl))
   10351 				continue;
   10352 
   10353 			/* Is there an ipif for ilm_ifaddr? */
   10354 			for (ipif = ill->ill_ipif; ipif != NULL;
   10355 			    ipif = ipif->ipif_next) {
   10356 				if (!IPIF_IS_CONDEMNED(ipif) &&
   10357 				    ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
   10358 				    ilm->ilm_ifaddr != INADDR_ANY)
   10359 					break;
   10360 			}
   10361 			if (ipif != NULL) {
   10362 				ipif_get_name(ipif,
   10363 				    ips.ipGroupSourceIfIndex.o_bytes,
   10364 				    OCTET_LENGTH);
   10365 			} else {
   10366 				ill_get_name(ill,
   10367 				    ips.ipGroupSourceIfIndex.o_bytes,
   10368 				    OCTET_LENGTH);
   10369 			}
   10370 			ips.ipGroupSourceIfIndex.o_length =
   10371 			    mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
   10372 
   10373 			ips.ipGroupSourceGroup = ilm->ilm_addr;
   10374 			for (i = 0; i < sl->sl_numsrc; i++) {
   10375 				if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
   10376 					continue;
   10377 				IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
   10378 				    ips.ipGroupSourceAddress);
   10379 				if (snmp_append_data2(mpctl->b_cont, &mp_tail,
   10380 				    (char *)&ips, (int)sizeof (ips)) == 0) {
   10381 					ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
   10382 					    " failed to allocate %u bytes\n",
   10383 					    (uint_t)sizeof (ips)));
   10384 				}
   10385 			}
   10386 		}
   10387 		rw_exit(&ill->ill_mcast_lock);
   10388 		ill_refrele(ill);
   10389 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10390 	}
   10391 	rw_exit(&ipst->ips_ill_g_lock);
   10392 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10393 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
   10394 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10395 	qreply(q, mpctl);
   10396 	return (mp2ctl);
   10397 }
   10398 
   10399 /* IPv6 multicast filtered sources. */
   10400 static mblk_t *
   10401 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10402 {
   10403 	struct opthdr		*optp;
   10404 	mblk_t			*mp2ctl;
   10405 	ill_t			*ill;
   10406 	ilm_t			*ilm;
   10407 	ipv6_grpsrc_t		ips6;
   10408 	mblk_t			*mp_tail = NULL;
   10409 	ill_walk_context_t	ctx;
   10410 	zoneid_t		zoneid;
   10411 	int			i;
   10412 	slist_t			*sl;
   10413 
   10414 	/*
   10415 	 * make a copy of the original message
   10416 	 */
   10417 	mp2ctl = copymsg(mpctl);
   10418 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10419 
   10420 	/* ip6GroupMember table */
   10421 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10422 	optp->level = MIB2_IP6;
   10423 	optp->name = EXPER_IP6_GROUP_SOURCES;
   10424 
   10425 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10426 	ill = ILL_START_WALK_V6(&ctx, ipst);
   10427 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10428 		/* Make sure the ill isn't going away. */
   10429 		if (!ill_check_and_refhold(ill))
   10430 			continue;
   10431 		rw_exit(&ipst->ips_ill_g_lock);
   10432 		/*
   10433 		 * Normally we don't have any members on under IPMP interfaces.
   10434 		 * We report them as a debugging aid.
   10435 		 */
   10436 		rw_enter(&ill->ill_mcast_lock, RW_READER);
   10437 		ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
   10438 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
   10439 			sl = ilm->ilm_filter;
   10440 			if (ilm->ilm_zoneid != zoneid &&
   10441 			    ilm->ilm_zoneid != ALL_ZONES)
   10442 				continue;
   10443 			if (SLIST_IS_EMPTY(sl))
   10444 				continue;
   10445 			ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
   10446 			for (i = 0; i < sl->sl_numsrc; i++) {
   10447 				ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
   10448 				if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10449 				    (char *)&ips6, (int)sizeof (ips6))) {
   10450 					ip1dbg(("ip_snmp_get_mib2_ip6_"
   10451 					    "group_src: failed to allocate "
   10452 					    "%u bytes\n",
   10453 					    (uint_t)sizeof (ips6)));
   10454 				}
   10455 			}
   10456 		}
   10457 		rw_exit(&ill->ill_mcast_lock);
   10458 		ill_refrele(ill);
   10459 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10460 	}
   10461 	rw_exit(&ipst->ips_ill_g_lock);
   10462 
   10463 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10464 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
   10465 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10466 	qreply(q, mpctl);
   10467 	return (mp2ctl);
   10468 }
   10469 
   10470 /* Multicast routing virtual interface table. */
   10471 static mblk_t *
   10472 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10473 {
   10474 	struct opthdr		*optp;
   10475 	mblk_t			*mp2ctl;
   10476 
   10477 	/*
   10478 	 * make a copy of the original message
   10479 	 */
   10480 	mp2ctl = copymsg(mpctl);
   10481 
   10482 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10483 	optp->level = EXPER_DVMRP;
   10484 	optp->name = EXPER_DVMRP_VIF;
   10485 	if (!ip_mroute_vif(mpctl->b_cont, ipst)) {
   10486 		ip0dbg(("ip_mroute_vif: failed\n"));
   10487 	}
   10488 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10489 	ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
   10490 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10491 	qreply(q, mpctl);
   10492 	return (mp2ctl);
   10493 }
   10494 
   10495 /* Multicast routing table. */
   10496 static mblk_t *
   10497 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10498 {
   10499 	struct opthdr		*optp;
   10500 	mblk_t			*mp2ctl;
   10501 
   10502 	/*
   10503 	 * make a copy of the original message
   10504 	 */
   10505 	mp2ctl = copymsg(mpctl);
   10506 
   10507 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10508 	optp->level = EXPER_DVMRP;
   10509 	optp->name = EXPER_DVMRP_MRT;
   10510 	if (!ip_mroute_mrt(mpctl->b_cont, ipst)) {
   10511 		ip0dbg(("ip_mroute_mrt: failed\n"));
   10512 	}
   10513 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10514 	ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
   10515 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10516 	qreply(q, mpctl);
   10517 	return (mp2ctl);
   10518 }
   10519 
   10520 /*
   10521  * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
   10522  * in one IRE walk.
   10523  */
   10524 static mblk_t *
   10525 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
   10526     ip_stack_t *ipst)
   10527 {
   10528 	struct opthdr	*optp;
   10529 	mblk_t		*mp2ctl;	/* Returned */
   10530 	mblk_t		*mp3ctl;	/* nettomedia */
   10531 	mblk_t		*mp4ctl;	/* routeattrs */
   10532 	iproutedata_t	ird;
   10533 	zoneid_t	zoneid;
   10534 
   10535 	/*
   10536 	 * make copies of the original message
   10537 	 *	- mp2ctl is returned unchanged to the caller for his use
   10538 	 *	- mpctl is sent upstream as ipRouteEntryTable
   10539 	 *	- mp3ctl is sent upstream as ipNetToMediaEntryTable
   10540 	 *	- mp4ctl is sent upstream as ipRouteAttributeTable
   10541 	 */
   10542 	mp2ctl = copymsg(mpctl);
   10543 	mp3ctl = copymsg(mpctl);
   10544 	mp4ctl = copymsg(mpctl);
   10545 	if (mp3ctl == NULL || mp4ctl == NULL) {
   10546 		freemsg(mp4ctl);
   10547 		freemsg(mp3ctl);
   10548 		freemsg(mp2ctl);
   10549 		freemsg(mpctl);
   10550 		return (NULL);
   10551 	}
   10552 
   10553 	bzero(&ird, sizeof (ird));
   10554 
   10555 	ird.ird_route.lp_head = mpctl->b_cont;
   10556 	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
   10557 	ird.ird_attrs.lp_head = mp4ctl->b_cont;
   10558 	/*
   10559 	 * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
   10560 	 * then also include ire_testhidden IREs and IRE_IF_CLONE.  This is
   10561 	 * intended a temporary solution until a proper MIB API is provided
   10562 	 * that provides complete filtering/caller-opt-in.
   10563 	 */
   10564 	if (level == EXPER_IP_AND_ALL_IRES)
   10565 		ird.ird_flags |= IRD_REPORT_ALL;
   10566 
   10567 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10568 	ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
   10569 
   10570 	/* ipRouteEntryTable in mpctl */
   10571 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10572 	optp->level = MIB2_IP;
   10573 	optp->name = MIB2_IP_ROUTE;
   10574 	optp->len = msgdsize(ird.ird_route.lp_head);
   10575 	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
   10576 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10577 	qreply(q, mpctl);
   10578 
   10579 	/* ipNetToMediaEntryTable in mp3ctl */
   10580 	ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
   10581 
   10582 	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10583 	optp->level = MIB2_IP;
   10584 	optp->name = MIB2_IP_MEDIA;
   10585 	optp->len = msgdsize(ird.ird_netmedia.lp_head);
   10586 	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
   10587 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10588 	qreply(q, mp3ctl);
   10589 
   10590 	/* ipRouteAttributeTable in mp4ctl */
   10591 	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10592 	optp->level = MIB2_IP;
   10593 	optp->name = EXPER_IP_RTATTR;
   10594 	optp->len = msgdsize(ird.ird_attrs.lp_head);
   10595 	ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
   10596 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10597 	if (optp->len == 0)
   10598 		freemsg(mp4ctl);
   10599 	else
   10600 		qreply(q, mp4ctl);
   10601 
   10602 	return (mp2ctl);
   10603 }
   10604 
   10605 /*
   10606  * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
   10607  * ipv6NetToMediaEntryTable in an NDP walk.
   10608  */
   10609 static mblk_t *
   10610 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
   10611     ip_stack_t *ipst)
   10612 {
   10613 	struct opthdr	*optp;
   10614 	mblk_t		*mp2ctl;	/* Returned */
   10615 	mblk_t		*mp3ctl;	/* nettomedia */
   10616 	mblk_t		*mp4ctl;	/* routeattrs */
   10617 	iproutedata_t	ird;
   10618 	zoneid_t	zoneid;
   10619 
   10620 	/*
   10621 	 * make copies of the original message
   10622 	 *	- mp2ctl is returned unchanged to the caller for his use
   10623 	 *	- mpctl is sent upstream as ipv6RouteEntryTable
   10624 	 *	- mp3ctl is sent upstream as ipv6NetToMediaEntryTable
   10625 	 *	- mp4ctl is sent upstream as ipv6RouteAttributeTable
   10626 	 */
   10627 	mp2ctl = copymsg(mpctl);
   10628 	mp3ctl = copymsg(mpctl);
   10629 	mp4ctl = copymsg(mpctl);
   10630 	if (mp3ctl == NULL || mp4ctl == NULL) {
   10631 		freemsg(mp4ctl);
   10632 		freemsg(mp3ctl);
   10633 		freemsg(mp2ctl);
   10634 		freemsg(mpctl);
   10635 		return (NULL);
   10636 	}
   10637 
   10638 	bzero(&ird, sizeof (ird));
   10639 
   10640 	ird.ird_route.lp_head = mpctl->b_cont;
   10641 	ird.ird_netmedia.lp_head = mp3ctl->b_cont;
   10642 	ird.ird_attrs.lp_head = mp4ctl->b_cont;
   10643 	/*
   10644 	 * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
   10645 	 * then also include ire_testhidden IREs and IRE_IF_CLONE.  This is
   10646 	 * intended a temporary solution until a proper MIB API is provided
   10647 	 * that provides complete filtering/caller-opt-in.
   10648 	 */
   10649 	if (level == EXPER_IP_AND_ALL_IRES)
   10650 		ird.ird_flags |= IRD_REPORT_ALL;
   10651 
   10652 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   10653 	ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
   10654 
   10655 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10656 	optp->level = MIB2_IP6;
   10657 	optp->name = MIB2_IP6_ROUTE;
   10658 	optp->len = msgdsize(ird.ird_route.lp_head);
   10659 	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
   10660 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10661 	qreply(q, mpctl);
   10662 
   10663 	/* ipv6NetToMediaEntryTable in mp3ctl */
   10664 	ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
   10665 
   10666 	optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10667 	optp->level = MIB2_IP6;
   10668 	optp->name = MIB2_IP6_MEDIA;
   10669 	optp->len = msgdsize(ird.ird_netmedia.lp_head);
   10670 	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
   10671 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10672 	qreply(q, mp3ctl);
   10673 
   10674 	/* ipv6RouteAttributeTable in mp4ctl */
   10675 	optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10676 	optp->level = MIB2_IP6;
   10677 	optp->name = EXPER_IP_RTATTR;
   10678 	optp->len = msgdsize(ird.ird_attrs.lp_head);
   10679 	ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
   10680 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10681 	if (optp->len == 0)
   10682 		freemsg(mp4ctl);
   10683 	else
   10684 		qreply(q, mp4ctl);
   10685 
   10686 	return (mp2ctl);
   10687 }
   10688 
   10689 /*
   10690  * IPv6 mib: One per ill
   10691  */
   10692 static mblk_t *
   10693 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
   10694     boolean_t legacy_req)
   10695 {
   10696 	struct opthdr		*optp;
   10697 	mblk_t			*mp2ctl;
   10698 	ill_t			*ill;
   10699 	ill_walk_context_t	ctx;
   10700 	mblk_t			*mp_tail = NULL;
   10701 	mib2_ipv6AddrEntry_t	mae6;
   10702 	mib2_ipIfStatsEntry_t	*ise;
   10703 	size_t			ise_size, iae_size;
   10704 
   10705 	/*
   10706 	 * Make a copy of the original message
   10707 	 */
   10708 	mp2ctl = copymsg(mpctl);
   10709 
   10710 	/* fixed length IPv6 structure ... */
   10711 
   10712 	if (legacy_req) {
   10713 		ise_size = LEGACY_MIB_SIZE(&ipst->ips_ip6_mib,
   10714 		    mib2_ipIfStatsEntry_t);
   10715 		iae_size = LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t);
   10716 	} else {
   10717 		ise_size = sizeof (mib2_ipIfStatsEntry_t);
   10718 		iae_size = sizeof (mib2_ipv6AddrEntry_t);
   10719 	}
   10720 
   10721 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10722 	optp->level = MIB2_IP6;
   10723 	optp->name = 0;
   10724 	/* Include "unknown interface" ip6_mib */
   10725 	ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
   10726 	ipst->ips_ip6_mib.ipIfStatsIfIndex =
   10727 	    MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
   10728 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
   10729 	    ipst->ips_ipv6_forwarding ? 1 : 2);
   10730 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
   10731 	    ipst->ips_ipv6_def_hops);
   10732 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
   10733 	    sizeof (mib2_ipIfStatsEntry_t));
   10734 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
   10735 	    sizeof (mib2_ipv6AddrEntry_t));
   10736 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
   10737 	    sizeof (mib2_ipv6RouteEntry_t));
   10738 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
   10739 	    sizeof (mib2_ipv6NetToMediaEntry_t));
   10740 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
   10741 	    sizeof (ipv6_member_t));
   10742 	SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
   10743 	    sizeof (ipv6_grpsrc_t));
   10744 
   10745 	/*
   10746 	 * Synchronize 64- and 32-bit counters
   10747 	 */
   10748 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
   10749 	    ipIfStatsHCInReceives);
   10750 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
   10751 	    ipIfStatsHCInDelivers);
   10752 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
   10753 	    ipIfStatsHCOutRequests);
   10754 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
   10755 	    ipIfStatsHCOutForwDatagrams);
   10756 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
   10757 	    ipIfStatsHCOutMcastPkts);
   10758 	SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
   10759 	    ipIfStatsHCInMcastPkts);
   10760 
   10761 	if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10762 	    (char *)&ipst->ips_ip6_mib, (int)ise_size)) {
   10763 		ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
   10764 		    (uint_t)ise_size));
   10765 	} else if (legacy_req) {
   10766 		/* Adjust the EntrySize fields for legacy requests. */
   10767 		ise =
   10768 		    (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
   10769 		SET_MIB(ise->ipIfStatsEntrySize, ise_size);
   10770 		SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
   10771 	}
   10772 
   10773 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10774 	ill = ILL_START_WALK_V6(&ctx, ipst);
   10775 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10776 		ill->ill_ip_mib->ipIfStatsIfIndex =
   10777 		    ill->ill_phyint->phyint_ifindex;
   10778 		SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
   10779 		    ipst->ips_ipv6_forwarding ? 1 : 2);
   10780 		SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
   10781 		    ill->ill_max_hops);
   10782 
   10783 		/*
   10784 		 * Synchronize 64- and 32-bit counters
   10785 		 */
   10786 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
   10787 		    ipIfStatsHCInReceives);
   10788 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
   10789 		    ipIfStatsHCInDelivers);
   10790 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
   10791 		    ipIfStatsHCOutRequests);
   10792 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
   10793 		    ipIfStatsHCOutForwDatagrams);
   10794 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
   10795 		    ipIfStatsHCOutMcastPkts);
   10796 		SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
   10797 		    ipIfStatsHCInMcastPkts);
   10798 
   10799 		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10800 		    (char *)ill->ill_ip_mib, (int)ise_size)) {
   10801 			ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
   10802 			"%u bytes\n", (uint_t)ise_size));
   10803 		} else if (legacy_req) {
   10804 			/* Adjust the EntrySize fields for legacy requests. */
   10805 			ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr -
   10806 			    (int)ise_size);
   10807 			SET_MIB(ise->ipIfStatsEntrySize, ise_size);
   10808 			SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
   10809 		}
   10810 	}
   10811 	rw_exit(&ipst->ips_ill_g_lock);
   10812 
   10813 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10814 	ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
   10815 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10816 	qreply(q, mpctl);
   10817 	return (mp2ctl);
   10818 }
   10819 
   10820 /*
   10821  * ICMPv6 mib: One per ill
   10822  */
   10823 static mblk_t *
   10824 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
   10825 {
   10826 	struct opthdr		*optp;
   10827 	mblk_t			*mp2ctl;
   10828 	ill_t			*ill;
   10829 	ill_walk_context_t	ctx;
   10830 	mblk_t			*mp_tail = NULL;
   10831 	/*
   10832 	 * Make a copy of the original message
   10833 	 */
   10834 	mp2ctl = copymsg(mpctl);
   10835 
   10836 	/* fixed length ICMPv6 structure ... */
   10837 
   10838 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
   10839 	optp->level = MIB2_ICMP6;
   10840 	optp->name = 0;
   10841 	/* Include "unknown interface" icmp6_mib */
   10842 	ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
   10843 	    MIB2_UNKNOWN_INTERFACE; /* netstat flag */
   10844 	ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
   10845 	    sizeof (mib2_ipv6IfIcmpEntry_t);
   10846 	if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10847 	    (char *)&ipst->ips_icmp6_mib,
   10848 	    (int)sizeof (ipst->ips_icmp6_mib))) {
   10849 		ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
   10850 		    (uint_t)sizeof (ipst->ips_icmp6_mib)));
   10851 	}
   10852 
   10853 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   10854 	ill = ILL_START_WALK_V6(&ctx, ipst);
   10855 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   10856 		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
   10857 		    ill->ill_phyint->phyint_ifindex;
   10858 		if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
   10859 		    (char *)ill->ill_icmp6_mib,
   10860 		    (int)sizeof (*ill->ill_icmp6_mib))) {
   10861 			ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
   10862 			    "%u bytes\n",
   10863 			    (uint_t)sizeof (*ill->ill_icmp6_mib)));
   10864 		}
   10865 	}
   10866 	rw_exit(&ipst->ips_ill_g_lock);
   10867 
   10868 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
   10869 	ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
   10870 	    (int)optp->level, (int)optp->name, (int)optp->len));
   10871 	qreply(q, mpctl);
   10872 	return (mp2ctl);
   10873 }
   10874 
   10875 /*
   10876  * ire_walk routine to create both ipRouteEntryTable and
   10877  * ipRouteAttributeTable in one IRE walk
   10878  */
   10879 static void
   10880 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
   10881 {
   10882 	ill_t				*ill;
   10883 	mib2_ipRouteEntry_t		*re;
   10884 	mib2_ipAttributeEntry_t		iaes;
   10885 	tsol_ire_gw_secattr_t		*attrp;
   10886 	tsol_gc_t			*gc = NULL;
   10887 	tsol_gcgrp_t			*gcgrp = NULL;
   10888 	ip_stack_t			*ipst = ire->ire_ipst;
   10889 
   10890 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
   10891 
   10892 	if (!(ird->ird_flags & IRD_REPORT_ALL)) {
   10893 		if (ire->ire_testhidden)
   10894 			return;
   10895 		if (ire->ire_type & IRE_IF_CLONE)
   10896 			return;
   10897 	}
   10898 
   10899 	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
   10900 		return;
   10901 
   10902 	if ((attrp = ire->ire_gw_secattr) != NULL) {
   10903 		mutex_enter(&attrp->igsa_lock);
   10904 		if ((gc = attrp->igsa_gc) != NULL) {
   10905 			gcgrp = gc->gc_grp;
   10906 			ASSERT(gcgrp != NULL);
   10907 			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
   10908 		}
   10909 		mutex_exit(&attrp->igsa_lock);
   10910 	}
   10911 	/*
   10912 	 * Return all IRE types for route table... let caller pick and choose
   10913 	 */
   10914 	re->ipRouteDest = ire->ire_addr;
   10915 	ill = ire->ire_ill;
   10916 	re->ipRouteIfIndex.o_length = 0;
   10917 	if (ill != NULL) {
   10918 		ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
   10919 		re->ipRouteIfIndex.o_length =
   10920 		    mi_strlen(re->ipRouteIfIndex.o_bytes);
   10921 	}
   10922 	re->ipRouteMetric1 = -1;
   10923 	re->ipRouteMetric2 = -1;
   10924 	re->ipRouteMetric3 = -1;
   10925 	re->ipRouteMetric4 = -1;
   10926 
   10927 	re->ipRouteNextHop = ire->ire_gateway_addr;
   10928 	/* indirect(4), direct(3), or invalid(2) */
   10929 	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
   10930 		re->ipRouteType = 2;
   10931 	else if (ire->ire_type & IRE_ONLINK)
   10932 		re->ipRouteType = 3;
   10933 	else
   10934 		re->ipRouteType = 4;
   10935 
   10936 	re->ipRouteProto = -1;
   10937 	re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
   10938 	re->ipRouteMask = ire->ire_mask;
   10939 	re->ipRouteMetric5 = -1;
   10940 	re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
   10941 	if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
   10942 		re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
   10943 
   10944 	re->ipRouteInfo.re_frag_flag	= 0;
   10945 	re->ipRouteInfo.re_rtt		= 0;
   10946 	re->ipRouteInfo.re_src_addr	= 0;
   10947 	re->ipRouteInfo.re_ref		= ire->ire_refcnt;
   10948 	re->ipRouteInfo.re_obpkt	= ire->ire_ob_pkt_count;
   10949 	re->ipRouteInfo.re_ibpkt	= ire->ire_ib_pkt_count;
   10950 	re->ipRouteInfo.re_flags	= ire->ire_flags;
   10951 
   10952 	/* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
   10953 	if (ire->ire_type & IRE_INTERFACE) {
   10954 		ire_t *child;
   10955 
   10956 		rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
   10957 		child = ire->ire_dep_children;
   10958 		while (child != NULL) {
   10959 			re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
   10960 			re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
   10961 			child = child->ire_dep_sib_next;
   10962 		}
   10963 		rw_exit(&ipst->ips_ire_dep_lock);
   10964 	}
   10965 
   10966 	if (ire->ire_flags & RTF_DYNAMIC) {
   10967 		re->ipRouteInfo.re_ire_type	= IRE_HOST_REDIRECT;
   10968 	} else {
   10969 		re->ipRouteInfo.re_ire_type	= ire->ire_type;
   10970 	}
   10971 
   10972 	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
   10973 	    (char *)re, (int)sizeof (*re))) {
   10974 		ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
   10975 		    (uint_t)sizeof (*re)));
   10976 	}
   10977 
   10978 	if (gc != NULL) {
   10979 		iaes.iae_routeidx = ird->ird_idx;
   10980 		iaes.iae_doi = gc->gc_db->gcdb_doi;
   10981 		iaes.iae_slrange = gc->gc_db->gcdb_slrange;
   10982 
   10983 		if (!snmp_append_data2(ird->ird_attrs.lp_head,
   10984 		    &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
   10985 			ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
   10986 			    "bytes\n", (uint_t)sizeof (iaes)));
   10987 		}
   10988 	}
   10989 
   10990 	/* bump route index for next pass */
   10991 	ird->ird_idx++;
   10992 
   10993 	kmem_free(re, sizeof (*re));
   10994 	if (gcgrp != NULL)
   10995 		rw_exit(&gcgrp->gcgrp_rwlock);
   10996 }
   10997 
   10998 /*
   10999  * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable.
   11000  */
   11001 static void
   11002 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
   11003 {
   11004 	ill_t				*ill;
   11005 	mib2_ipv6RouteEntry_t		*re;
   11006 	mib2_ipAttributeEntry_t		iaes;
   11007 	tsol_ire_gw_secattr_t		*attrp;
   11008 	tsol_gc_t			*gc = NULL;
   11009 	tsol_gcgrp_t			*gcgrp = NULL;
   11010 	ip_stack_t			*ipst = ire->ire_ipst;
   11011 
   11012 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
   11013 
   11014 	if (!(ird->ird_flags & IRD_REPORT_ALL)) {
   11015 		if (ire->ire_testhidden)
   11016 			return;
   11017 		if (ire->ire_type & IRE_IF_CLONE)
   11018 			return;
   11019 	}
   11020 
   11021 	if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
   11022 		return;
   11023 
   11024 	if ((attrp = ire->ire_gw_secattr) != NULL) {
   11025 		mutex_enter(&attrp->igsa_lock);
   11026 		if ((gc = attrp->igsa_gc) != NULL) {
   11027 			gcgrp = gc->gc_grp;
   11028 			ASSERT(gcgrp != NULL);
   11029 			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
   11030 		}
   11031 		mutex_exit(&attrp->igsa_lock);
   11032 	}
   11033 	/*
   11034 	 * Return all IRE types for route table... let caller pick and choose
   11035 	 */
   11036 	re->ipv6RouteDest = ire->ire_addr_v6;
   11037 	re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
   11038 	re->ipv6RouteIndex = 0;	/* Unique when multiple with same dest/plen */
   11039 	re->ipv6RouteIfIndex.o_length = 0;
   11040 	ill = ire->ire_ill;
   11041 	if (ill != NULL) {
   11042 		ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
   11043 		re->ipv6RouteIfIndex.o_length =
   11044 		    mi_strlen(re->ipv6RouteIfIndex.o_bytes);
   11045 	}
   11046 
   11047 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
   11048 
   11049 	mutex_enter(&ire->ire_lock);
   11050 	re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
   11051 	mutex_exit(&ire->ire_lock);
   11052 
   11053 	/* remote(4), local(3), or discard(2) */
   11054 	if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
   11055 		re->ipv6RouteType = 2;
   11056 	else if (ire->ire_type & IRE_ONLINK)
   11057 		re->ipv6RouteType = 3;
   11058 	else
   11059 		re->ipv6RouteType = 4;
   11060 
   11061 	re->ipv6RouteProtocol	= -1;
   11062 	re->ipv6RoutePolicy	= 0;
   11063 	re->ipv6RouteAge	= gethrestime_sec() - ire->ire_create_time;
   11064 	re->ipv6RouteNextHopRDI	= 0;
   11065 	re->ipv6RouteWeight	= 0;
   11066 	re->ipv6RouteMetric	= 0;
   11067 	re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
   11068 	if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
   11069 		re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
   11070 
   11071 	re->ipv6RouteInfo.re_frag_flag	= 0;
   11072 	re->ipv6RouteInfo.re_rtt	= 0;
   11073 	re->ipv6RouteInfo.re_src_addr	= ipv6_all_zeros;
   11074 	re->ipv6RouteInfo.re_obpkt	= ire->ire_ob_pkt_count;
   11075 	re->ipv6RouteInfo.re_ibpkt	= ire->ire_ib_pkt_count;
   11076 	re->ipv6RouteInfo.re_ref	= ire->ire_refcnt;
   11077 	re->ipv6RouteInfo.re_flags	= ire->ire_flags;
   11078 
   11079 	/* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
   11080 	if (ire->ire_type & IRE_INTERFACE) {
   11081 		ire_t *child;
   11082 
   11083 		rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
   11084 		child = ire->ire_dep_children;
   11085 		while (child != NULL) {
   11086 			re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
   11087 			re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
   11088 			child = child->ire_dep_sib_next;
   11089 		}
   11090 		rw_exit(&ipst->ips_ire_dep_lock);
   11091 	}
   11092 	if (ire->ire_flags & RTF_DYNAMIC) {
   11093 		re->ipv6RouteInfo.re_ire_type	= IRE_HOST_REDIRECT;
   11094 	} else {
   11095 		re->ipv6RouteInfo.re_ire_type	= ire->ire_type;
   11096 	}
   11097 
   11098 	if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
   11099 	    (char *)re, (int)sizeof (*re))) {
   11100 		ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
   11101 		    (uint_t)sizeof (*re)));
   11102 	}
   11103 
   11104 	if (gc != NULL) {
   11105 		iaes.iae_routeidx = ird->ird_idx;
   11106 		iaes.iae_doi = gc->gc_db->gcdb_doi;
   11107 		iaes.iae_slrange = gc->gc_db->gcdb_slrange;
   11108 
   11109 		if (!snmp_append_data2(ird->ird_attrs.lp_head,
   11110 		    &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
   11111 			ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
   11112 			    "bytes\n", (uint_t)sizeof (iaes)));
   11113 		}
   11114 	}
   11115 
   11116 	/* bump route index for next pass */
   11117 	ird->ird_idx++;
   11118 
   11119 	kmem_free(re, sizeof (*re));
   11120 	if (gcgrp != NULL)
   11121 		rw_exit(&gcgrp->gcgrp_rwlock);
   11122 }
   11123 
   11124 /*
   11125  * ncec_walk routine to create ipv6NetToMediaEntryTable
   11126  */
   11127 static int
   11128 ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird)
   11129 {
   11130 	ill_t				*ill;
   11131 	mib2_ipv6NetToMediaEntry_t	ntme;
   11132 
   11133 	ill = ncec->ncec_ill;
   11134 	/* skip arpce entries, and loopback ncec entries */
   11135 	if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
   11136 		return (0);
   11137 	/*
   11138 	 * Neighbor cache entry attached to IRE with on-link
   11139 	 * destination.
   11140 	 * We report all IPMP groups on ncec_ill which is normally the upper.
   11141 	 */
   11142 	ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
   11143 	ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
   11144 	ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
   11145 	if (ncec->ncec_lladdr != NULL) {
   11146 		bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
   11147 		    ntme.ipv6NetToMediaPhysAddress.o_length);
   11148 	}
   11149 	/*
   11150 	 * Note: Returns ND_* states. Should be:
   11151 	 * reachable(1), stale(2), delay(3), probe(4),
   11152 	 * invalid(5), unknown(6)
   11153 	 */
   11154 	ntme.ipv6NetToMediaState = ncec->ncec_state;
   11155 	ntme.ipv6NetToMediaLastUpdated = 0;
   11156 
   11157 	/* other(1), dynamic(2), static(3), local(4) */
   11158 	if (NCE_MYADDR(ncec)) {
   11159 		ntme.ipv6NetToMediaType = 4;
   11160 	} else if (ncec->ncec_flags & NCE_F_PUBLISH) {
   11161 		ntme.ipv6NetToMediaType = 1; /* proxy */
   11162 	} else if (ncec->ncec_flags & NCE_F_STATIC) {
   11163 		ntme.ipv6NetToMediaType = 3;
   11164 	} else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
   11165 		ntme.ipv6NetToMediaType = 1;
   11166 	} else {
   11167 		ntme.ipv6NetToMediaType = 2;
   11168 	}
   11169 
   11170 	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
   11171 	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
   11172 		ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
   11173 		    (uint_t)sizeof (ntme)));
   11174 	}
   11175 	return (0);
   11176 }
   11177 
   11178 int
   11179 nce2ace(ncec_t *ncec)
   11180 {
   11181 	int flags = 0;
   11182 
   11183 	if (NCE_ISREACHABLE(ncec))
   11184 		flags |= ACE_F_RESOLVED;
   11185 	if (ncec->ncec_flags & NCE_F_AUTHORITY)
   11186 		flags |= ACE_F_AUTHORITY;
   11187 	if (ncec->ncec_flags & NCE_F_PUBLISH)
   11188 		flags |= ACE_F_PUBLISH;
   11189 	if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
   11190 		flags |= ACE_F_PERMANENT;
   11191 	if (NCE_MYADDR(ncec))
   11192 		flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
   11193 	if (ncec->ncec_flags & NCE_F_UNVERIFIED)
   11194 		flags |= ACE_F_UNVERIFIED;
   11195 	if (ncec->ncec_flags & NCE_F_AUTHORITY)
   11196 		flags |= ACE_F_AUTHORITY;
   11197 	if (ncec->ncec_flags & NCE_F_DELAYED)
   11198 		flags |= ACE_F_DELAYED;
   11199 	return (flags);
   11200 }
   11201 
   11202 /*
   11203  * ncec_walk routine to create ipNetToMediaEntryTable
   11204  */
   11205 static int
   11206 ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird)
   11207 {
   11208 	ill_t				*ill;
   11209 	mib2_ipNetToMediaEntry_t	ntme;
   11210 	const char			*name = "unknown";
   11211 	ipaddr_t			ncec_addr;
   11212 
   11213 	ill = ncec->ncec_ill;
   11214 	if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
   11215 	    ill->ill_net_type == IRE_LOOPBACK)
   11216 		return (0);
   11217 
   11218 	/* We report all IPMP groups on ncec_ill which is normally the upper. */
   11219 	name = ill->ill_name;
   11220 	/* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
   11221 	if (NCE_MYADDR(ncec)) {
   11222 		ntme.ipNetToMediaType = 4;
   11223 	} else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
   11224 		ntme.ipNetToMediaType = 1;
   11225 	} else {
   11226 		ntme.ipNetToMediaType = 3;
   11227 	}
   11228 	ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
   11229 	bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
   11230 	    ntme.ipNetToMediaIfIndex.o_length);
   11231 
   11232 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
   11233 	bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
   11234 
   11235 	ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
   11236 	ncec_addr = INADDR_BROADCAST;
   11237 	bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
   11238 	    sizeof (ncec_addr));
   11239 	/*
   11240 	 * map all the flags to the ACE counterpart.
   11241 	 */
   11242 	ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
   11243 
   11244 	ntme.ipNetToMediaPhysAddress.o_length =
   11245 	    MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
   11246 
   11247 	if (!NCE_ISREACHABLE(ncec))
   11248 		ntme.ipNetToMediaPhysAddress.o_length = 0;
   11249 	else {
   11250 		if (ncec->ncec_lladdr != NULL) {
   11251 			bcopy(ncec->ncec_lladdr,
   11252 			    ntme.ipNetToMediaPhysAddress.o_bytes,
   11253 			    ntme.ipNetToMediaPhysAddress.o_length);
   11254 		}
   11255 	}
   11256 
   11257 	if (!snmp_append_data2(ird->ird_netmedia.lp_head,
   11258 	    &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
   11259 		ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
   11260 		    (uint_t)sizeof (ntme)));
   11261 	}
   11262 	return (0);
   11263 }
   11264 
   11265 /*
   11266  * return (0) if invalid set request, 1 otherwise, including non-tcp requests
   11267  */
   11268 /* ARGSUSED */
   11269 int
   11270 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
   11271 {
   11272 	switch (level) {
   11273 	case MIB2_IP:
   11274 	case MIB2_ICMP:
   11275 		switch (name) {
   11276 		default:
   11277 			break;
   11278 		}
   11279 		return (1);
   11280 	default:
   11281 		return (1);
   11282 	}
   11283 }
   11284 
   11285 /*
   11286  * When there exists both a 64- and 32-bit counter of a particular type
   11287  * (i.e., InReceives), only the 64-bit counters are added.
   11288  */
   11289 void
   11290 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
   11291 {
   11292 	UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
   11293 	UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
   11294 	UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
   11295 	UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
   11296 	UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
   11297 	UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
   11298 	UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
   11299 	UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
   11300 	UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
   11301 	UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
   11302 	UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
   11303 	UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
   11304 	UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
   11305 	UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
   11306 	UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
   11307 	UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
   11308 	UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
   11309 	UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
   11310 	UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
   11311 	UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
   11312 	UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
   11313 	UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
   11314 	    o2->ipIfStatsInWrongIPVersion);
   11315 	UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
   11316 	    o2->ipIfStatsInWrongIPVersion);
   11317 	UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
   11318 	    o2->ipIfStatsOutSwitchIPVersion);
   11319 	UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
   11320 	UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
   11321 	UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
   11322 	    o2->ipIfStatsHCInForwDatagrams);
   11323 	UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
   11324 	UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
   11325 	UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
   11326 	    o2->ipIfStatsHCOutForwDatagrams);
   11327 	UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
   11328 	UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
   11329 	UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
   11330 	UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
   11331 	UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
   11332 	UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
   11333 	UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
   11334 	    o2->ipIfStatsHCOutMcastOctets);
   11335 	UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
   11336 	UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
   11337 	UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
   11338 	UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
   11339 	UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
   11340 	UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
   11341 	UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
   11342 }
   11343 
   11344 void
   11345 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
   11346 {
   11347 	UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
   11348 	UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
   11349 	UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs);
   11350 	UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs);
   11351 	UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds);
   11352 	UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems);
   11353 	UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs);
   11354 	UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos);
   11355 	UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies);
   11356 	UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits,
   11357 	    o2->ipv6IfIcmpInRouterSolicits);
   11358 	UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements,
   11359 	    o2->ipv6IfIcmpInRouterAdvertisements);
   11360 	UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits,
   11361 	    o2->ipv6IfIcmpInNeighborSolicits);
   11362 	UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements,
   11363 	    o2->ipv6IfIcmpInNeighborAdvertisements);
   11364 	UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects);
   11365 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries,
   11366 	    o2->ipv6IfIcmpInGroupMembQueries);
   11367 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses,
   11368 	    o2->ipv6IfIcmpInGroupMembResponses);
   11369 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions,
   11370 	    o2->ipv6IfIcmpInGroupMembReductions);
   11371 	UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs);
   11372 	UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors);
   11373 	UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs,
   11374 	    o2->ipv6IfIcmpOutDestUnreachs);
   11375 	UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs,
   11376 	    o2->ipv6IfIcmpOutAdminProhibs);
   11377 	UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds);
   11378 	UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems,
   11379 	    o2->ipv6IfIcmpOutParmProblems);
   11380 	UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs);
   11381 	UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos);
   11382 	UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies);
   11383 	UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits,
   11384 	    o2->ipv6IfIcmpOutRouterSolicits);
   11385 	UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements,
   11386 	    o2->ipv6IfIcmpOutRouterAdvertisements);
   11387 	UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits,
   11388 	    o2->ipv6IfIcmpOutNeighborSolicits);
   11389 	UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements,
   11390 	    o2->ipv6IfIcmpOutNeighborAdvertisements);
   11391 	UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects);
   11392 	UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries,
   11393 	    o2->ipv6IfIcmpOutGroupMembQueries);
   11394 	UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses,
   11395 	    o2->ipv6IfIcmpOutGroupMembResponses);
   11396 	UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions,
   11397 	    o2->ipv6IfIcmpOutGroupMembReductions);
   11398 	UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows);
   11399 	UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit);
   11400 	UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements,
   11401 	    o2->ipv6IfIcmpInBadNeighborAdvertisements);
   11402 	UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations,
   11403 	    o2->ipv6IfIcmpInBadNeighborSolicitations);
   11404 	UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects);
   11405 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal,
   11406 	    o2->ipv6IfIcmpInGroupMembTotal);
   11407 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries,
   11408 	    o2->ipv6IfIcmpInGroupMembBadQueries);
   11409 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports,
   11410 	    o2->ipv6IfIcmpInGroupMembBadReports);
   11411 	UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports,
   11412 	    o2->ipv6IfIcmpInGroupMembOurReports);
   11413 }
   11414 
   11415 /*
   11416  * Called before the options are updated to check if this packet will
   11417  * be source routed from here.
   11418  * This routine assumes that the options are well formed i.e. that they
   11419  * have already been checked.
   11420  */
   11421 boolean_t
   11422 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
   11423 {
   11424 	ipoptp_t	opts;
   11425 	uchar_t		*opt;
   11426 	uint8_t		optval;
   11427 	uint8_t		optlen;
   11428 	ipaddr_t	dst;
   11429 
   11430 	if (IS_SIMPLE_IPH(ipha)) {
   11431 		ip2dbg(("not source routed\n"));
   11432 		return (B_FALSE);
   11433 	}
   11434 	dst = ipha->ipha_dst;
   11435 	for (optval = ipoptp_first(&opts, ipha);
   11436 	    optval != IPOPT_EOL;
   11437 	    optval = ipoptp_next(&opts)) {
   11438 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   11439 		opt = opts.ipoptp_cur;
   11440 		optlen = opts.ipoptp_len;
   11441 		ip2dbg(("ip_source_routed: opt %d, len %d\n",
   11442 		    optval, optlen));
   11443 		switch (optval) {
   11444 			uint32_t off;
   11445 		case IPOPT_SSRR:
   11446 		case IPOPT_LSRR:
   11447 			/*
   11448 			 * If dst is one of our addresses and there are some
   11449 			 * entries left in the source route return (true).
   11450 			 */
   11451 			if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   11452 				ip2dbg(("ip_source_routed: not next"
   11453 				    " source route 0x%x\n",
   11454 				    ntohl(dst)));
   11455 				return (B_FALSE);
   11456 			}
   11457 			off = opt[IPOPT_OFFSET];
   11458 			off--;
   11459 			if (optlen < IP_ADDR_LEN ||
   11460 			    off > optlen - IP_ADDR_LEN) {
   11461 				/* End of source route */
   11462 				ip1dbg(("ip_source_routed: end of SR\n"));
   11463 				return (B_FALSE);
   11464 			}
   11465 			return (B_TRUE);
   11466 		}
   11467 	}
   11468 	ip2dbg(("not source routed\n"));
   11469 	return (B_FALSE);
   11470 }
   11471 
   11472 /*
   11473  * ip_unbind is called by the transports to remove a conn from
   11474  * the fanout table.
   11475  */
   11476 void
   11477 ip_unbind(conn_t *connp)
   11478 {
   11479 
   11480 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
   11481 
   11482 	if (is_system_labeled() && connp->conn_anon_port) {
   11483 		(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
   11484 		    connp->conn_mlp_type, connp->conn_proto,
   11485 		    ntohs(connp->conn_lport), B_FALSE);
   11486 		connp->conn_anon_port = 0;
   11487 	}
   11488 	connp->conn_mlp_type = mlptSingle;
   11489 
   11490 	ipcl_hash_remove(connp);
   11491 }
   11492 
   11493 /*
   11494  * Used for deciding the MSS size for the upper layer. Thus
   11495  * we need to check the outbound policy values in the conn.
   11496  */
   11497 int
   11498 conn_ipsec_length(conn_t *connp)
   11499 {
   11500 	ipsec_latch_t *ipl;
   11501 
   11502 	ipl = connp->conn_latch;
   11503 	if (ipl == NULL)
   11504 		return (0);
   11505 
   11506 	if (connp->conn_ixa->ixa_ipsec_policy == NULL)
   11507 		return (0);
   11508 
   11509 	return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
   11510 }
   11511 
   11512 /*
   11513  * Returns an estimate of the IPsec headers size. This is used if
   11514  * we don't want to call into IPsec to get the exact size.
   11515  */
   11516 int
   11517 ipsec_out_extra_length(ip_xmit_attr_t *ixa)
   11518 {
   11519 	ipsec_action_t *a;
   11520 
   11521 	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
   11522 		return (0);
   11523 
   11524 	a = ixa->ixa_ipsec_action;
   11525 	if (a == NULL) {
   11526 		ASSERT(ixa->ixa_ipsec_policy != NULL);
   11527 		a = ixa->ixa_ipsec_policy->ipsp_act;
   11528 	}
   11529 	ASSERT(a != NULL);
   11530 
   11531 	return (a->ipa_ovhd);
   11532 }
   11533 
   11534 /*
   11535  * If there are any source route options, return the true final
   11536  * destination. Otherwise, return the destination.
   11537  */
   11538 ipaddr_t
   11539 ip_get_dst(ipha_t *ipha)
   11540 {
   11541 	ipoptp_t	opts;
   11542 	uchar_t		*opt;
   11543 	uint8_t		optval;
   11544 	uint8_t		optlen;
   11545 	ipaddr_t	dst;
   11546 	uint32_t off;
   11547 
   11548 	dst = ipha->ipha_dst;
   11549 
   11550 	if (IS_SIMPLE_IPH(ipha))
   11551 		return (dst);
   11552 
   11553 	for (optval = ipoptp_first(&opts, ipha);
   11554 	    optval != IPOPT_EOL;
   11555 	    optval = ipoptp_next(&opts)) {
   11556 		opt = opts.ipoptp_cur;
   11557 		optlen = opts.ipoptp_len;
   11558 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   11559 		switch (optval) {
   11560 		case IPOPT_SSRR:
   11561 		case IPOPT_LSRR:
   11562 			off = opt[IPOPT_OFFSET];
   11563 			/*
   11564 			 * If one of the conditions is true, it means
   11565 			 * end of options and dst already has the right
   11566 			 * value.
   11567 			 */
   11568 			if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) {
   11569 				off = optlen - IP_ADDR_LEN;
   11570 				bcopy(&opt[off], &dst, IP_ADDR_LEN);
   11571 			}
   11572 			return (dst);
   11573 		default:
   11574 			break;
   11575 		}
   11576 	}
   11577 
   11578 	return (dst);
   11579 }
   11580 
   11581 /*
   11582  * Outbound IP fragmentation routine.
   11583  * Assumes the caller has checked whether or not fragmentation should
   11584  * be allowed. Here we copy the DF bit from the header to all the generated
   11585  * fragments.
   11586  */
   11587 int
   11588 ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
   11589     uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
   11590     zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
   11591 {
   11592 	int		i1;
   11593 	int		hdr_len;
   11594 	mblk_t		*hdr_mp;
   11595 	ipha_t		*ipha;
   11596 	int		ip_data_end;
   11597 	int		len;
   11598 	mblk_t		*mp = mp_orig;
   11599 	int		offset;
   11600 	ill_t		*ill = nce->nce_ill;
   11601 	ip_stack_t	*ipst = ill->ill_ipst;
   11602 	mblk_t		*carve_mp;
   11603 	uint32_t	frag_flag;
   11604 	uint_t		priority = mp->b_band;
   11605 	int		error = 0;
   11606 
   11607 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
   11608 
   11609 	if (pkt_len != msgdsize(mp)) {
   11610 		ip0dbg(("Packet length mismatch: %d, %ld\n",
   11611 		    pkt_len, msgdsize(mp)));
   11612 		freemsg(mp);
   11613 		return (EINVAL);
   11614 	}
   11615 
   11616 	if (max_frag == 0) {
   11617 		ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
   11618 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11619 		ip_drop_output("FragFails: zero max_frag", mp, ill);
   11620 		freemsg(mp);
   11621 		return (EINVAL);
   11622 	}
   11623 
   11624 	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
   11625 	ipha = (ipha_t *)mp->b_rptr;
   11626 	ASSERT(ntohs(ipha->ipha_length) == pkt_len);
   11627 	frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
   11628 
   11629 	/*
   11630 	 * Establish the starting offset.  May not be zero if we are fragging
   11631 	 * a fragment that is being forwarded.
   11632 	 */
   11633 	offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
   11634 
   11635 	/* TODO why is this test needed? */
   11636 	if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
   11637 		/* TODO: notify ulp somehow */
   11638 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11639 		ip_drop_output("FragFails: bad starting offset", mp, ill);
   11640 		freemsg(mp);
   11641 		return (EINVAL);
   11642 	}
   11643 
   11644 	hdr_len = IPH_HDR_LENGTH(ipha);
   11645 	ipha->ipha_hdr_checksum = 0;
   11646 
   11647 	/*
   11648 	 * Establish the number of bytes maximum per frag, after putting
   11649 	 * in the header.
   11650 	 */
   11651 	len = (max_frag - hdr_len) & ~7;
   11652 
   11653 	/* Get a copy of the header for the trailing frags */
   11654 	hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
   11655 	    mp);
   11656 	if (hdr_mp == NULL) {
   11657 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11658 		ip_drop_output("FragFails: no hdr_mp", mp, ill);
   11659 		freemsg(mp);
   11660 		return (ENOBUFS);
   11661 	}
   11662 
   11663 	/* Store the starting offset, with the MoreFrags flag. */
   11664 	i1 = offset | IPH_MF | frag_flag;
   11665 	ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);
   11666 
   11667 	/* Establish the ending byte offset, based on the starting offset. */
   11668 	offset <<= 3;
   11669 	ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
   11670 
   11671 	/* Store the length of the first fragment in the IP header. */
   11672 	i1 = len + hdr_len;
   11673 	ASSERT(i1 <= IP_MAXPACKET);
   11674 	ipha->ipha_length = htons((uint16_t)i1);
   11675 
   11676 	/*
   11677 	 * Compute the IP header checksum for the first frag.  We have to
   11678 	 * watch out that we stop at the end of the header.
   11679 	 */
   11680 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   11681 
   11682 	/*
   11683 	 * Now carve off the first frag.  Note that this will include the
   11684 	 * original IP header.
   11685 	 */
   11686 	if (!(mp = ip_carve_mp(&mp_orig, i1))) {
   11687 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11688 		ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
   11689 		freeb(hdr_mp);
   11690 		freemsg(mp_orig);
   11691 		return (ENOBUFS);
   11692 	}
   11693 
   11694 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
   11695 
   11696 	error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
   11697 	    ixa_cookie);
   11698 	if (error != 0 && error != EWOULDBLOCK) {
   11699 		/* No point in sending the other fragments */
   11700 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11701 		ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
   11702 		freeb(hdr_mp);
   11703 		freemsg(mp_orig);
   11704 		return (error);
   11705 	}
   11706 
   11707 	/* No need to redo state machine in loop */
   11708 	ixaflags &= ~IXAF_REACH_CONF;
   11709 
   11710 	/* Advance the offset to the second frag starting point. */
   11711 	offset += len;
   11712 	/*
   11713 	 * Update hdr_len from the copied header - there might be less options
   11714 	 * in the later fragments.
   11715 	 */
   11716 	hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
   11717 	/* Loop until done. */
   11718 	for (;;) {
   11719 		uint16_t	offset_and_flags;
   11720 		uint16_t	ip_len;
   11721 
   11722 		if (ip_data_end - offset > len) {
   11723 			/*
   11724 			 * Carve off the appropriate amount from the original
   11725 			 * datagram.
   11726 			 */
   11727 			if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
   11728 				mp = NULL;
   11729 				break;
   11730 			}
   11731 			/*
   11732 			 * More frags after this one.  Get another copy
   11733 			 * of the header.
   11734 			 */
   11735 			if (carve_mp->b_datap->db_ref == 1 &&
   11736 			    hdr_mp->b_wptr - hdr_mp->b_rptr <
   11737 			    carve_mp->b_rptr - carve_mp->b_datap->db_base) {
   11738 				/* Inline IP header */
   11739 				carve_mp->b_rptr -= hdr_mp->b_wptr -
   11740 				    hdr_mp->b_rptr;
   11741 				bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
   11742 				    hdr_mp->b_wptr - hdr_mp->b_rptr);
   11743 				mp = carve_mp;
   11744 			} else {
   11745 				if (!(mp = copyb(hdr_mp))) {
   11746 					freemsg(carve_mp);
   11747 					break;
   11748 				}
   11749 				/* Get priority marking, if any. */
   11750 				mp->b_band = priority;
   11751 				mp->b_cont = carve_mp;
   11752 			}
   11753 			ipha = (ipha_t *)mp->b_rptr;
   11754 			offset_and_flags = IPH_MF;
   11755 		} else {
   11756 			/*
   11757 			 * Last frag.  Consume the header. Set len to
   11758 			 * the length of this last piece.
   11759 			 */
   11760 			len = ip_data_end - offset;
   11761 
   11762 			/*
   11763 			 * Carve off the appropriate amount from the original
   11764 			 * datagram.
   11765 			 */
   11766 			if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
   11767 				mp = NULL;
   11768 				break;
   11769 			}
   11770 			if (carve_mp->b_datap->db_ref == 1 &&
   11771 			    hdr_mp->b_wptr - hdr_mp->b_rptr <
   11772 			    carve_mp->b_rptr - carve_mp->b_datap->db_base) {
   11773 				/* Inline IP header */
   11774 				carve_mp->b_rptr -= hdr_mp->b_wptr -
   11775 				    hdr_mp->b_rptr;
   11776 				bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
   11777 				    hdr_mp->b_wptr - hdr_mp->b_rptr);
   11778 				mp = carve_mp;
   11779 				freeb(hdr_mp);
   11780 				hdr_mp = mp;
   11781 			} else {
   11782 				mp = hdr_mp;
   11783 				/* Get priority marking, if any. */
   11784 				mp->b_band = priority;
   11785 				mp->b_cont = carve_mp;
   11786 			}
   11787 			ipha = (ipha_t *)mp->b_rptr;
   11788 			/* A frag of a frag might have IPH_MF non-zero */
   11789 			offset_and_flags =
   11790 			    ntohs(ipha->ipha_fragment_offset_and_flags) &
   11791 			    IPH_MF;
   11792 		}
   11793 		offset_and_flags |= (uint16_t)(offset >> 3);
   11794 		offset_and_flags |= (uint16_t)frag_flag;
   11795 		/* Store the offset and flags in the IP header. */
   11796 		ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
   11797 
   11798 		/* Store the length in the IP header. */
   11799 		ip_len = (uint16_t)(len + hdr_len);
   11800 		ipha->ipha_length = htons(ip_len);
   11801 
   11802 		/*
   11803 		 * Set the IP header checksum.	Note that mp is just
   11804 		 * the header, so this is easy to pass to ip_csum.
   11805 		 */
   11806 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   11807 
   11808 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
   11809 
   11810 		error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
   11811 		    nolzid, ixa_cookie);
   11812 		/* All done if we just consumed the hdr_mp. */
   11813 		if (mp == hdr_mp) {
   11814 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
   11815 			return (error);
   11816 		}
   11817 		if (error != 0 && error != EWOULDBLOCK) {
   11818 			DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
   11819 			    mblk_t *, hdr_mp);
   11820 			/* No point in sending the other fragments */
   11821 			break;
   11822 		}
   11823 
   11824 		/* Otherwise, advance and loop. */
   11825 		offset += len;
   11826 	}
   11827 	/* Clean up following allocation failure. */
   11828 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
   11829 	ip_drop_output("FragFails: loop ended", NULL, ill);
   11830 	if (mp != hdr_mp)
   11831 		freeb(hdr_mp);
   11832 	if (mp != mp_orig)
   11833 		freemsg(mp_orig);
   11834 	return (error);
   11835 }
   11836 
   11837 /*
   11838  * Copy the header plus those options which have the copy bit set
   11839  */
   11840 static mblk_t *
   11841 ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
   11842     mblk_t *src)
   11843 {
   11844 	mblk_t	*mp;
   11845 	uchar_t	*up;
   11846 
   11847 	/*
   11848 	 * Quick check if we need to look for options without the copy bit
   11849 	 * set
   11850 	 */
   11851 	mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src);
   11852 	if (!mp)
   11853 		return (mp);
   11854 	mp->b_rptr += ipst->ips_ip_wroff_extra;
   11855 	if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
   11856 		bcopy(rptr, mp->b_rptr, hdr_len);
   11857 		mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra;
   11858 		return (mp);
   11859 	}
   11860 	up  = mp->b_rptr;
   11861 	bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
   11862 	up += IP_SIMPLE_HDR_LENGTH;
   11863 	rptr += IP_SIMPLE_HDR_LENGTH;
   11864 	hdr_len -= IP_SIMPLE_HDR_LENGTH;
   11865 	while (hdr_len > 0) {
   11866 		uint32_t optval;
   11867 		uint32_t optlen;
   11868 
   11869 		optval = *rptr;
   11870 		if (optval == IPOPT_EOL)
   11871 			break;
   11872 		if (optval == IPOPT_NOP)
   11873 			optlen = 1;
   11874 		else
   11875 			optlen = rptr[1];
   11876 		if (optval & IPOPT_COPY) {
   11877 			bcopy(rptr, up, optlen);
   11878 			up += optlen;
   11879 		}
   11880 		rptr += optlen;
   11881 		hdr_len -= optlen;
   11882 	}
   11883 	/*
   11884 	 * Make sure that we drop an even number of words by filling
   11885 	 * with EOL to the next word boundary.
   11886 	 */
   11887 	for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
   11888 	    hdr_len & 0x3; hdr_len++)
   11889 		*up++ = IPOPT_EOL;
   11890 	mp->b_wptr = up;
   11891 	/* Update header length */
   11892 	mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
   11893 	return (mp);
   11894 }
   11895 
   11896 /*
   11897  * Update any source route, record route, or timestamp options when
   11898  * sending a packet back to ourselves.
   11899  * Check that we are at end of strict source route.
   11900  * The options have been sanity checked by ip_output_options().
   11901  */
   11902 void
   11903 ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
   11904 {
   11905 	ipoptp_t	opts;
   11906 	uchar_t		*opt;
   11907 	uint8_t		optval;
   11908 	uint8_t		optlen;
   11909 	ipaddr_t	dst;
   11910 	uint32_t	ts;
   11911 	timestruc_t	now;
   11912 
   11913 	for (optval = ipoptp_first(&opts, ipha);
   11914 	    optval != IPOPT_EOL;
   11915 	    optval = ipoptp_next(&opts)) {
   11916 		opt = opts.ipoptp_cur;
   11917 		optlen = opts.ipoptp_len;
   11918 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   11919 		switch (optval) {
   11920 			uint32_t off;
   11921 		case IPOPT_SSRR:
   11922 		case IPOPT_LSRR:
   11923 			off = opt[IPOPT_OFFSET];
   11924 			off--;
   11925 			if (optlen < IP_ADDR_LEN ||
   11926 			    off > optlen - IP_ADDR_LEN) {
   11927 				/* End of source route */
   11928 				break;
   11929 			}
   11930 			/*
   11931 			 * This will only happen if two consecutive entries
   11932 			 * in the source route contains our address or if
   11933 			 * it is a packet with a loose source route which
   11934 			 * reaches us before consuming the whole source route
   11935 			 */
   11936 
   11937 			if (optval == IPOPT_SSRR) {
   11938 				return;
   11939 			}
   11940 			/*
   11941 			 * Hack: instead of dropping the packet truncate the
   11942 			 * source route to what has been used by filling the
   11943 			 * rest with IPOPT_NOP.
   11944 			 */
   11945 			opt[IPOPT_OLEN] = (uint8_t)off;
   11946 			while (off < optlen) {
   11947 				opt[off++] = IPOPT_NOP;
   11948 			}
   11949 			break;
   11950 		case IPOPT_RR:
   11951 			off = opt[IPOPT_OFFSET];
   11952 			off--;
   11953 			if (optlen < IP_ADDR_LEN ||
   11954 			    off > optlen - IP_ADDR_LEN) {
   11955 				/* No more room - ignore */
   11956 				ip1dbg((
   11957 				    "ip_output_local_options: end of RR\n"));
   11958 				break;
   11959 			}
   11960 			dst = htonl(INADDR_LOOPBACK);
   11961 			bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
   11962 			opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   11963 			break;
   11964 		case IPOPT_TS:
   11965 			/* Insert timestamp if there is romm */
   11966 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   11967 			case IPOPT_TS_TSONLY:
   11968 				off = IPOPT_TS_TIMELEN;
   11969 				break;
   11970 			case IPOPT_TS_PRESPEC:
   11971 			case IPOPT_TS_PRESPEC_RFC791:
   11972 				/* Verify that the address matched */
   11973 				off = opt[IPOPT_OFFSET] - 1;
   11974 				bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
   11975 				if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
   11976 					/* Not for us */
   11977 					break;
   11978 				}
   11979 				/* FALLTHRU */
   11980 			case IPOPT_TS_TSANDADDR:
   11981 				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
   11982 				break;
   11983 			default:
   11984 				/*
   11985 				 * ip_*put_options should have already
   11986 				 * dropped this packet.
   11987 				 */
   11988 				cmn_err(CE_PANIC, "ip_output_local_options: "
   11989 				    "unknown IT - bug in ip_output_options?\n");
   11990 				return;	/* Keep "lint" happy */
   11991 			}
   11992 			if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
   11993 				/* Increase overflow counter */
   11994 				off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
   11995 				opt[IPOPT_POS_OV_FLG] = (uint8_t)
   11996 				    (opt[IPOPT_POS_OV_FLG] & 0x0F) |
   11997 				    (off << 4);
   11998 				break;
   11999 			}
   12000 			off = opt[IPOPT_OFFSET] - 1;
   12001 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   12002 			case IPOPT_TS_PRESPEC:
   12003 			case IPOPT_TS_PRESPEC_RFC791:
   12004 			case IPOPT_TS_TSANDADDR:
   12005 				dst = htonl(INADDR_LOOPBACK);
   12006 				bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
   12007 				opt[IPOPT_OFFSET] += IP_ADDR_LEN;
   12008 				/* FALLTHRU */
   12009 			case IPOPT_TS_TSONLY:
   12010 				off = opt[IPOPT_OFFSET] - 1;
   12011 				/* Compute # of milliseconds since midnight */
   12012 				gethrestime(&now);
   12013 				ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
   12014 				    now.tv_nsec / (NANOSEC / MILLISEC);
   12015 				bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
   12016 				opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
   12017 				break;
   12018 			}
   12019 			break;
   12020 		}
   12021 	}
   12022 }
   12023 
   12024 /*
   12025  * Prepend an M_DATA fastpath header, and if none present prepend a
   12026  * DL_UNITDATA_REQ. Frees the mblk on failure.
   12027  *
   12028  * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
   12029  * If there is a change to them, the nce will be deleted (condemned) and
   12030  * a new nce_t will be created when packets are sent. Thus we need no locks
   12031  * to access those fields.
   12032  *
   12033  * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
   12034  * we place b_band in dl_priority.dl_max.
   12035  */
   12036 static mblk_t *
   12037 ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
   12038 {
   12039 	uint_t	hlen;
   12040 	mblk_t *mp1;
   12041 	uint_t	priority;
   12042 	uchar_t *rptr;
   12043 
   12044 	rptr = mp->b_rptr;
   12045 
   12046 	ASSERT(DB_TYPE(mp) == M_DATA);
   12047 	priority = mp->b_band;
   12048 
   12049 	ASSERT(nce != NULL);
   12050 	if ((mp1 = nce->nce_fp_mp) != NULL) {
   12051 		hlen = MBLKL(mp1);
   12052 		/*
   12053 		 * Check if we have enough room to prepend fastpath
   12054 		 * header
   12055 		 */
   12056 		if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
   12057 			rptr -= hlen;
   12058 			bcopy(mp1->b_rptr, rptr, hlen);
   12059 			/*
   12060 			 * Set the b_rptr to the start of the link layer
   12061 			 * header
   12062 			 */
   12063 			mp->b_rptr = rptr;
   12064 			return (mp);
   12065 		}
   12066 		mp1 = copyb(mp1);
   12067 		if (mp1 == NULL) {
   12068 			ill_t *ill = nce->nce_ill;
   12069 
   12070 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   12071 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
   12072 			freemsg(mp);
   12073 			return (NULL);
   12074 		}
   12075 		mp1->b_band = priority;
   12076 		mp1->b_cont = mp;
   12077 		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
   12078 		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
   12079 		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
   12080 		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
   12081 		DB_LSOMSS(mp1) = DB_LSOMSS(mp);
   12082 		DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
   12083 		/*
   12084 		 * XXX disable ICK_VALID and compute checksum
   12085 		 * here; can happen if nce_fp_mp changes and
   12086 		 * it can't be copied now due to insufficient
   12087 		 * space. (unlikely, fp mp can change, but it
   12088 		 * does not increase in length)
   12089 		 */
   12090 		return (mp1);
   12091 	}
   12092 	mp1 = copyb(nce->nce_dlur_mp);
   12093 
   12094 	if (mp1 == NULL) {
   12095 		ill_t *ill = nce->nce_ill;
   12096 
   12097 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   12098 		ip_drop_output("ipIfStatsOutDiscards", mp, ill);
   12099 		freemsg(mp);
   12100 		return (NULL);
   12101 	}
   12102 	mp1->b_cont = mp;
   12103 	if (priority != 0) {
   12104 		mp1->b_band = priority;
   12105 		((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
   12106 		    priority;
   12107 	}
   12108 	return (mp1);
   12109 #undef rptr
   12110 }
   12111 
   12112 /*
   12113  * Finish the outbound IPsec processing. This function is called from
   12114  * ipsec_out_process() if the IPsec packet was processed
   12115  * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
   12116  * asynchronously.
   12117  *
   12118  * This is common to IPv4 and IPv6.
   12119  */
   12120 int
   12121 ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
   12122 {
   12123 	iaflags_t	ixaflags = ixa->ixa_flags;
   12124 	uint_t		pktlen;
   12125 
   12126 
   12127 	/* AH/ESP don't update ixa_pktlen when they modify the packet */
   12128 	if (ixaflags & IXAF_IS_IPV4) {
   12129 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   12130 
   12131 		ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
   12132 		pktlen = ntohs(ipha->ipha_length);
   12133 	} else {
   12134 		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
   12135 
   12136 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
   12137 		pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   12138 	}
   12139 
   12140 	/*
   12141 	 * We release any hard reference on the SAs here to make
   12142 	 * sure the SAs can be garbage collected. ipsr_sa has a soft reference
   12143 	 * on the SAs.
   12144 	 * If in the future we want the hard latching of the SAs in the
   12145 	 * ip_xmit_attr_t then we should remove this.
   12146 	 */
   12147 	if (ixa->ixa_ipsec_esp_sa != NULL) {
   12148 		IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
   12149 		ixa->ixa_ipsec_esp_sa = NULL;
   12150 	}
   12151 	if (ixa->ixa_ipsec_ah_sa != NULL) {
   12152 		IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
   12153 		ixa->ixa_ipsec_ah_sa = NULL;
   12154 	}
   12155 
   12156 	/* Do we need to fragment? */
   12157 	if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
   12158 	    pktlen > ixa->ixa_fragsize) {
   12159 		if (ixaflags & IXAF_IS_IPV4) {
   12160 			ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
   12161 			/*
   12162 			 * We check for the DF case in ipsec_out_process
   12163 			 * hence this only handles the non-DF case.
   12164 			 */
   12165 			return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
   12166 			    pktlen, ixa->ixa_fragsize,
   12167 			    ixa->ixa_xmit_hint, ixa->ixa_zoneid,
   12168 			    ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
   12169 			    &ixa->ixa_cookie));
   12170 		} else {
   12171 			mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
   12172 			if (mp == NULL) {
   12173 				/* MIB and ip_drop_output already done */
   12174 				return (ENOMEM);
   12175 			}
   12176 			pktlen += sizeof (ip6_frag_t);
   12177 			if (pktlen > ixa->ixa_fragsize) {
   12178 				return (ip_fragment_v6(mp, ixa->ixa_nce,
   12179 				    ixa->ixa_flags, pktlen,
   12180 				    ixa->ixa_fragsize, ixa->ixa_xmit_hint,
   12181 				    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
   12182 				    ixa->ixa_postfragfn, &ixa->ixa_cookie));
   12183 			}
   12184 		}
   12185 	}
   12186 	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
   12187 	    pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
   12188 	    ixa->ixa_no_loop_zoneid, NULL));
   12189 }
   12190 
   12191 /*
   12192  * Finish the inbound IPsec processing. This function is called from
   12193  * ipsec_out_process() if the IPsec packet was processed
   12194  * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
   12195  * asynchronously.
   12196  *
   12197  * This is common to IPv4 and IPv6.
   12198  */
   12199 void
   12200 ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
   12201 {
   12202 	iaflags_t	iraflags = ira->ira_flags;
   12203 
   12204 	/* Length might have changed */
   12205 	if (iraflags & IRAF_IS_IPV4) {
   12206 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   12207 
   12208 		ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
   12209 		ira->ira_pktlen = ntohs(ipha->ipha_length);
   12210 		ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   12211 		ira->ira_protocol = ipha->ipha_protocol;
   12212 
   12213 		ip_fanout_v4(mp, ipha, ira);
   12214 	} else {
   12215 		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
   12216 		uint8_t		*nexthdrp;
   12217 
   12218 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
   12219 		ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   12220 		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
   12221 		    &nexthdrp)) {
   12222 			/* Malformed packet */
   12223 			BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
   12224 			ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
   12225 			freemsg(mp);
   12226 			return;
   12227 		}
   12228 		ira->ira_protocol = *nexthdrp;
   12229 		ip_fanout_v6(mp, ip6h, ira);
   12230 	}
   12231 }
   12232 
   12233 /*
   12234  * Select which AH & ESP SA's to use (if any) for the outbound packet.
   12235  *
   12236  * If this function returns B_TRUE, the requested SA's have been filled
   12237  * into the ixa_ipsec_*_sa pointers.
   12238  *
   12239  * If the function returns B_FALSE, the packet has been "consumed", most
   12240  * likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
   12241  *
   12242  * The SA references created by the protocol-specific "select"
   12243  * function will be released in ip_output_post_ipsec.
   12244  */
   12245 static boolean_t
   12246 ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
   12247 {
   12248 	boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
   12249 	ipsec_policy_t *pp;
   12250 	ipsec_action_t *ap;
   12251 
   12252 	ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
   12253 	ASSERT((ixa->ixa_ipsec_policy != NULL) ||
   12254 	    (ixa->ixa_ipsec_action != NULL));
   12255 
   12256 	ap = ixa->ixa_ipsec_action;
   12257 	if (ap == NULL) {
   12258 		pp = ixa->ixa_ipsec_policy;
   12259 		ASSERT(pp != NULL);
   12260 		ap = pp->ipsp_act;
   12261 		ASSERT(ap != NULL);
   12262 	}
   12263 
   12264 	/*
   12265 	 * We have an action.  now, let's select SA's.
   12266 	 * A side effect of setting ixa_ipsec_*_sa is that it will
   12267 	 * be cached in the conn_t.
   12268 	 */
   12269 	if (ap->ipa_want_esp) {
   12270 		if (ixa->ixa_ipsec_esp_sa == NULL) {
   12271 			need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
   12272 			    IPPROTO_ESP);
   12273 		}
   12274 		ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
   12275 	}
   12276 
   12277 	if (ap->ipa_want_ah) {
   12278 		if (ixa->ixa_ipsec_ah_sa == NULL) {
   12279 			need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
   12280 			    IPPROTO_AH);
   12281 		}
   12282 		ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
   12283 		/*
   12284 		 * The ESP and AH processing order needs to be preserved
   12285 		 * when both protocols are required (ESP should be applied
   12286 		 * before AH for an outbound packet). Force an ESP ACQUIRE
   12287 		 * when both ESP and AH are required, and an AH ACQUIRE
   12288 		 * is needed.
   12289 		 */
   12290 		if (ap->ipa_want_esp && need_ah_acquire)
   12291 			need_esp_acquire = B_TRUE;
   12292 	}
   12293 
   12294 	/*
   12295 	 * Send an ACQUIRE (extended, regular, or both) if we need one.
   12296 	 * Release SAs that got referenced, but will not be used until we
   12297 	 * acquire _all_ of the SAs we need.
   12298 	 */
   12299 	if (need_ah_acquire || need_esp_acquire) {
   12300 		if (ixa->ixa_ipsec_ah_sa != NULL) {
   12301 			IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
   12302 			ixa->ixa_ipsec_ah_sa = NULL;
   12303 		}
   12304 		if (ixa->ixa_ipsec_esp_sa != NULL) {
   12305 			IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
   12306 			ixa->ixa_ipsec_esp_sa = NULL;
   12307 		}
   12308 
   12309 		sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
   12310 		return (B_FALSE);
   12311 	}
   12312 
   12313 	return (B_TRUE);
   12314 }
   12315 
   12316 /*
   12317  * Handle IPsec output processing.
   12318  * This function is only entered once for a given packet.
   12319  * We try to do things synchronously, but if we need to have user-level
   12320  * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
   12321  * will be completed
   12322  *  - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
   12323  *  - when asynchronous ESP is done it will do AH
   12324  *
   12325  * In all cases we come back in ip_output_post_ipsec() to fragment and
   12326  * send out the packet.
   12327  */
   12328 int
   12329 ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
   12330 {
   12331 	ill_t		*ill = ixa->ixa_nce->nce_ill;
   12332 	ip_stack_t	*ipst = ixa->ixa_ipst;
   12333 	ipsec_stack_t	*ipss;
   12334 	ipsec_policy_t	*pp;
   12335 	ipsec_action_t	*ap;
   12336 
   12337 	ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
   12338 
   12339 	ASSERT((ixa->ixa_ipsec_policy != NULL) ||
   12340 	    (ixa->ixa_ipsec_action != NULL));
   12341 
   12342 	ipss = ipst->ips_netstack->netstack_ipsec;
   12343 	if (!ipsec_loaded(ipss)) {
   12344 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   12345 		ip_drop_packet(mp, B_TRUE, ill,
   12346 		    DROPPER(ipss, ipds_ip_ipsec_not_loaded),
   12347 		    &ipss->ipsec_dropper);
   12348 		return (ENOTSUP);
   12349 	}
   12350 
   12351 	ap = ixa->ixa_ipsec_action;
   12352 	if (ap == NULL) {
   12353 		pp = ixa->ixa_ipsec_policy;
   12354 		ASSERT(pp != NULL);
   12355 		ap = pp->ipsp_act;
   12356 		ASSERT(ap != NULL);
   12357 	}
   12358 
   12359 	/* Handle explicit drop action and bypass. */
   12360 	switch (ap->ipa_act.ipa_type) {
   12361 	case IPSEC_ACT_DISCARD:
   12362 	case IPSEC_ACT_REJECT:
   12363 		ip_drop_packet(mp, B_FALSE, ill,
   12364 		    DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
   12365 		return (EHOSTUNREACH);	/* IPsec policy failure */
   12366 	case IPSEC_ACT_BYPASS:
   12367 		return (ip_output_post_ipsec(mp, ixa));
   12368 	}
   12369 
   12370 	/*
   12371 	 * The order of processing is first insert a IP header if needed.
   12372 	 * Then insert the ESP header and then the AH header.
   12373 	 */
   12374 	if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
   12375 		/*
   12376 		 * First get the outer IP header before sending
   12377 		 * it to ESP.
   12378 		 */
   12379 		ipha_t *oipha, *iipha;
   12380 		mblk_t *outer_mp, *inner_mp;
   12381 
   12382 		if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
   12383 			(void) mi_strlog(ill->ill_rq, 0,
   12384 			    SL_ERROR|SL_TRACE|SL_CONSOLE,
   12385 			    "ipsec_out_process: "
   12386 			    "Self-Encapsulation failed: Out of memory\n");
   12387 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   12388 			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
   12389 			freemsg(mp);
   12390 			return (ENOBUFS);
   12391 		}
   12392 		inner_mp = mp;
   12393 		ASSERT(inner_mp->b_datap->db_type == M_DATA);
   12394 		oipha = (ipha_t *)outer_mp->b_rptr;
   12395 		iipha = (ipha_t *)inner_mp->b_rptr;
   12396 		*oipha = *iipha;
   12397 		outer_mp->b_wptr += sizeof (ipha_t);
   12398 		oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
   12399 		    sizeof (ipha_t));
   12400 		oipha->ipha_protocol = IPPROTO_ENCAP;
   12401 		oipha->ipha_version_and_hdr_length =
   12402 		    IP_SIMPLE_HDR_VERSION;
   12403 		oipha->ipha_hdr_checksum = 0;
   12404 		oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
   12405 		outer_mp->b_cont = inner_mp;
   12406 		mp = outer_mp;
   12407 
   12408 		ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
   12409 	}
   12410 
   12411 	/* If we need to wait for a SA then we can't return any errno */
   12412 	if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
   12413 	    (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
   12414 	    !ipsec_out_select_sa(mp, ixa))
   12415 		return (0);
   12416 
   12417 	/*
   12418 	 * By now, we know what SA's to use.  Toss over to ESP & AH
   12419 	 * to do the heavy lifting.
   12420 	 */
   12421 	if (ap->ipa_want_esp) {
   12422 		ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
   12423 
   12424 		mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
   12425 		if (mp == NULL) {
   12426 			/*
   12427 			 * Either it failed or is pending. In the former case
   12428 			 * ipIfStatsInDiscards was increased.
   12429 			 */
   12430 			return (0);
   12431 		}
   12432 	}
   12433 
   12434 	if (ap->ipa_want_ah) {
   12435 		ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
   12436 
   12437 		mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
   12438 		if (mp == NULL) {
   12439 			/*
   12440 			 * Either it failed or is pending. In the former case
   12441 			 * ipIfStatsInDiscards was increased.
   12442 			 */
   12443 			return (0);
   12444 		}
   12445 	}
   12446 	/*
   12447 	 * We are done with IPsec processing. Send it over
   12448 	 * the wire.
   12449 	 */
   12450 	return (ip_output_post_ipsec(mp, ixa));
   12451 }
   12452 
   12453 /*
   12454  * ioctls that go through a down/up sequence may need to wait for the down
   12455  * to complete. This involves waiting for the ire and ipif refcnts to go down
   12456  * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
   12457  */
   12458 /* ARGSUSED */
   12459 void
   12460 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   12461 {
   12462 	struct iocblk *iocp;
   12463 	mblk_t *mp1;
   12464 	ip_ioctl_cmd_t *ipip;
   12465 	int err;
   12466 	sin_t	*sin;
   12467 	struct lifreq *lifr;
   12468 	struct ifreq *ifr;
   12469 
   12470 	iocp = (struct iocblk *)mp->b_rptr;
   12471 	ASSERT(ipsq != NULL);
   12472 	/* Existence of mp1 verified in ip_wput_nondata */
   12473 	mp1 = mp->b_cont->b_cont;
   12474 	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
   12475 	if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
   12476 		/*
   12477 		 * Special case where ipx_current_ipif is not set:
   12478 		 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
   12479 		 * We are here as were not able to complete the operation in
   12480 		 * ipif_set_values because we could not become exclusive on
   12481 		 * the new ipsq.
   12482 		 */
   12483 		ill_t *ill = q->q_ptr;
   12484 		ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
   12485 	}
   12486 	ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
   12487 
   12488 	if (ipip->ipi_cmd_type == IF_CMD) {
   12489 		/* This a old style SIOC[GS]IF* command */
   12490 		ifr = (struct ifreq *)mp1->b_rptr;
   12491 		sin = (sin_t *)&ifr->ifr_addr;
   12492 	} else if (ipip->ipi_cmd_type == LIF_CMD) {
   12493 		/* This a new style SIOC[GS]LIF* command */
   12494 		lifr = (struct lifreq *)mp1->b_rptr;
   12495 		sin = (sin_t *)&lifr->lifr_addr;
   12496 	} else {
   12497 		sin = NULL;
   12498 	}
   12499 
   12500 	err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
   12501 	    q, mp, ipip, mp1->b_rptr);
   12502 
   12503 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
   12504 	    int, ipip->ipi_cmd,
   12505 	    ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
   12506 	    ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
   12507 
   12508 	ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
   12509 }
   12510 
   12511 /*
   12512  * ioctl processing
   12513  *
   12514  * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
   12515  * the ioctl command in the ioctl tables, determines the copyin data size
   12516  * from the ipi_copyin_size field, and does an mi_copyin() of that size.
   12517  *
   12518  * ioctl processing then continues when the M_IOCDATA makes its way down to
   12519  * ip_wput_nondata().  The ioctl is looked up again in the ioctl table, its
   12520  * associated 'conn' is refheld till the end of the ioctl and the general
   12521  * ioctl processing function ip_process_ioctl() is called to extract the
   12522  * arguments and process the ioctl.  To simplify extraction, ioctl commands
   12523  * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
   12524  * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
   12525  * is used to extract the ioctl's arguments.
   12526  *
   12527  * ip_process_ioctl determines if the ioctl needs to be serialized, and if
   12528  * so goes thru the serialization primitive ipsq_try_enter. Then the
   12529  * appropriate function to handle the ioctl is called based on the entry in
   12530  * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
   12531  * which also refreleases the 'conn' that was refheld at the start of the
   12532  * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
   12533  *
   12534  * Many exclusive ioctls go thru an internal down up sequence as part of
   12535  * the operation. For example an attempt to change the IP address of an
   12536  * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
   12537  * does all the cleanup such as deleting all ires that use this address.
   12538  * Then we need to wait till all references to the interface go away.
   12539  */
   12540 void
   12541 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
   12542 {
   12543 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   12544 	ip_ioctl_cmd_t *ipip = arg;
   12545 	ip_extract_func_t *extract_funcp;
   12546 	cmd_info_t ci;
   12547 	int err;
   12548 	boolean_t entered_ipsq = B_FALSE;
   12549 
   12550 	ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
   12551 
   12552 	if (ipip == NULL)
   12553 		ipip = ip_sioctl_lookup(iocp->ioc_cmd);
   12554 
   12555 	/*
   12556 	 * SIOCLIFADDIF needs to go thru a special path since the
   12557 	 * ill may not exist yet. This happens in the case of lo0
   12558 	 * which is created using this ioctl.
   12559 	 */
   12560 	if (ipip->ipi_cmd == SIOCLIFADDIF) {
   12561 		err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
   12562 		DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
   12563 		    int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
   12564 		ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
   12565 		return;
   12566 	}
   12567 
   12568 	ci.ci_ipif = NULL;
   12569 	switch (ipip->ipi_cmd_type) {
   12570 	case MISC_CMD:
   12571 	case MSFILT_CMD:
   12572 		/*
   12573 		 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
   12574 		 */
   12575 		if (ipip->ipi_cmd == IF_UNITSEL) {
   12576 			/* ioctl comes down the ill */
   12577 			ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
   12578 			ipif_refhold(ci.ci_ipif);
   12579 		}
   12580 		err = 0;
   12581 		ci.ci_sin = NULL;
   12582 		ci.ci_sin6 = NULL;
   12583 		ci.ci_lifr = NULL;
   12584 		extract_funcp = NULL;
   12585 		break;
   12586 
   12587 	case IF_CMD:
   12588 	case LIF_CMD:
   12589 		extract_funcp = ip_extract_lifreq;
   12590 		break;
   12591 
   12592 	case ARP_CMD:
   12593 	case XARP_CMD:
   12594 		extract_funcp = ip_extract_arpreq;
   12595 		break;
   12596 
   12597 	default:
   12598 		ASSERT(0);
   12599 	}
   12600 
   12601 	if (extract_funcp != NULL) {
   12602 		err = (*extract_funcp)(q, mp, ipip, &ci);
   12603 		if (err != 0) {
   12604 			DTRACE_PROBE4(ipif__ioctl,
   12605 			    char *, "ip_process_ioctl finish err",
   12606 			    int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
   12607 			ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
   12608 			return;
   12609 		}
   12610 
   12611 		/*
   12612 		 * All of the extraction functions return a refheld ipif.
   12613 		 */
   12614 		ASSERT(ci.ci_ipif != NULL);
   12615 	}
   12616 
   12617 	if (!(ipip->ipi_flags & IPI_WR)) {
   12618 		/*
   12619 		 * A return value of EINPROGRESS means the ioctl is
   12620 		 * either queued and waiting for some reason or has
   12621 		 * already completed.
   12622 		 */
   12623 		err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
   12624 		    ci.ci_lifr);
   12625 		if (ci.ci_ipif != NULL) {
   12626 			DTRACE_PROBE4(ipif__ioctl,
   12627 			    char *, "ip_process_ioctl finish RD",
   12628 			    int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
   12629 			    ipif_t *, ci.ci_ipif);
   12630 			ipif_refrele(ci.ci_ipif);
   12631 		} else {
   12632 			DTRACE_PROBE4(ipif__ioctl,
   12633 			    char *, "ip_process_ioctl finish RD",
   12634 			    int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
   12635 		}
   12636 		ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
   12637 		return;
   12638 	}
   12639 
   12640 	ASSERT(ci.ci_ipif != NULL);
   12641 
   12642 	/*
   12643 	 * If ipsq is non-NULL, we are already being called exclusively
   12644 	 */
   12645 	ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
   12646 	if (ipsq == NULL) {
   12647 		ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
   12648 		    NEW_OP, B_TRUE);
   12649 		if (ipsq == NULL) {
   12650 			ipif_refrele(ci.ci_ipif);
   12651 			return;
   12652 		}
   12653 		entered_ipsq = B_TRUE;
   12654 	}
   12655 	/*
   12656 	 * Release the ipif so that ipif_down and friends that wait for
   12657 	 * references to go away are not misled about the current ipif_refcnt
   12658 	 * values. We are writer so we can access the ipif even after releasing
   12659 	 * the ipif.
   12660 	 */
   12661 	ipif_refrele(ci.ci_ipif);
   12662 
   12663 	ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
   12664 
   12665 	/*
   12666 	 * A return value of EINPROGRESS means the ioctl is
   12667 	 * either queued and waiting for some reason or has
   12668 	 * already completed.
   12669 	 */
   12670 	err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
   12671 
   12672 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
   12673 	    int, ipip->ipi_cmd,
   12674 	    ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
   12675 	    ipif_t *, ci.ci_ipif);
   12676 	ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
   12677 
   12678 	if (entered_ipsq)
   12679 		ipsq_exit(ipsq);
   12680 }
   12681 
   12682 /*
   12683  * Complete the ioctl. Typically ioctls use the mi package and need to
   12684  * do mi_copyout/mi_copy_done.
   12685  */
   12686 void
   12687 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
   12688 {
   12689 	conn_t	*connp = NULL;
   12690 
   12691 	if (err == EINPROGRESS)
   12692 		return;
   12693 
   12694 	if (CONN_Q(q)) {
   12695 		connp = Q_TO_CONN(q);
   12696 		ASSERT(connp->conn_ref >= 2);
   12697 	}
   12698 
   12699 	switch (mode) {
   12700 	case COPYOUT:
   12701 		if (err == 0)
   12702 			mi_copyout(q, mp);
   12703 		else
   12704 			mi_copy_done(q, mp, err);
   12705 		break;
   12706 
   12707 	case NO_COPYOUT:
   12708 		mi_copy_done(q, mp, err);
   12709 		break;
   12710 
   12711 	default:
   12712 		ASSERT(mode == CONN_CLOSE);	/* aborted through CONN_CLOSE */
   12713 		break;
   12714 	}
   12715 
   12716 	/*
   12717 	 * The conn refhold and ioctlref placed on the conn at the start of the
   12718 	 * ioctl are released here.
   12719 	 */
   12720 	if (connp != NULL) {
   12721 		CONN_DEC_IOCTLREF(connp);
   12722 		CONN_OPER_PENDING_DONE(connp);
   12723 	}
   12724 
   12725 	if (ipsq != NULL)
   12726 		ipsq_current_finish(ipsq);
   12727 }
   12728 
   12729 /* Handles all non data messages */
   12730 void
   12731 ip_wput_nondata(queue_t *q, mblk_t *mp)
   12732 {
   12733 	mblk_t		*mp1;
   12734 	struct iocblk	*iocp;
   12735 	ip_ioctl_cmd_t	*ipip;
   12736 	conn_t		*connp;
   12737 	cred_t		*cr;
   12738 	char		*proto_str;
   12739 
   12740 	if (CONN_Q(q))
   12741 		connp = Q_TO_CONN(q);
   12742 	else
   12743 		connp = NULL;
   12744 
   12745 	switch (DB_TYPE(mp)) {
   12746 	case M_IOCTL:
   12747 		/*
   12748 		 * IOCTL processing begins in ip_sioctl_copyin_setup which
   12749 		 * will arrange to copy in associated control structures.
   12750 		 */
   12751 		ip_sioctl_copyin_setup(q, mp);
   12752 		return;
   12753 	case M_IOCDATA:
   12754 		/*
   12755 		 * Ensure that this is associated with one of our trans-
   12756 		 * parent ioctls.  If it's not ours, discard it if we're
   12757 		 * running as a driver, or pass it on if we're a module.
   12758 		 */
   12759 		iocp = (struct iocblk *)mp->b_rptr;
   12760 		ipip = ip_sioctl_lookup(iocp->ioc_cmd);
   12761 		if (ipip == NULL) {
   12762 			if (q->q_next == NULL) {
   12763 				goto nak;
   12764 			} else {
   12765 				putnext(q, mp);
   12766 			}
   12767 			return;
   12768 		}
   12769 		if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
   12770 			/*
   12771 			 * The ioctl is one we recognise, but is not consumed
   12772 			 * by IP as a module and we are a module, so we drop
   12773 			 */
   12774 			goto nak;
   12775 		}
   12776 
   12777 		/* IOCTL continuation following copyin or copyout. */
   12778 		if (mi_copy_state(q, mp, NULL) == -1) {
   12779 			/*
   12780 			 * The copy operation failed.  mi_copy_state already
   12781 			 * cleaned up, so we're out of here.
   12782 			 */
   12783 			return;
   12784 		}
   12785 		/*
   12786 		 * If we just completed a copy in, we become writer and
   12787 		 * continue processing in ip_sioctl_copyin_done.  If it
   12788 		 * was a copy out, we call mi_copyout again.  If there is
   12789 		 * nothing more to copy out, it will complete the IOCTL.
   12790 		 */
   12791 		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
   12792 			if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
   12793 				mi_copy_done(q, mp, EPROTO);
   12794 				return;
   12795 			}
   12796 			/*
   12797 			 * Check for cases that need more copying.  A return
   12798 			 * value of 0 means a second copyin has been started,
   12799 			 * so we return; a return value of 1 means no more
   12800 			 * copying is needed, so we continue.
   12801 			 */
   12802 			if (ipip->ipi_cmd_type == MSFILT_CMD &&
   12803 			    MI_COPY_COUNT(mp) == 1) {
   12804 				if (ip_copyin_msfilter(q, mp) == 0)
   12805 					return;
   12806 			}
   12807 			/*
   12808 			 * Refhold the conn, till the ioctl completes. This is
   12809 			 * needed in case the ioctl ends up in the pending mp
   12810 			 * list. Every mp in the ipx_pending_mp list must have
   12811 			 * a refhold on the conn to resume processing. The
   12812 			 * refhold is released when the ioctl completes
   12813 			 * (whether normally or abnormally). An ioctlref is also
   12814 			 * placed on the conn to prevent TCP from removing the
   12815 			 * queue needed to send the ioctl reply back.
   12816 			 * In all cases ip_ioctl_finish is called to finish
   12817 			 * the ioctl and release the refholds.
   12818 			 */
   12819 			if (connp != NULL) {
   12820 				/* This is not a reentry */
   12821 				CONN_INC_REF(connp);
   12822 				CONN_INC_IOCTLREF(connp);
   12823 			} else {
   12824 				if (!(ipip->ipi_flags & IPI_MODOK)) {
   12825 					mi_copy_done(q, mp, EINVAL);
   12826 					return;
   12827 				}
   12828 			}
   12829 
   12830 			ip_process_ioctl(NULL, q, mp, ipip);
   12831 
   12832 		} else {
   12833 			mi_copyout(q, mp);
   12834 		}
   12835 		return;
   12836 
   12837 	case M_IOCNAK:
   12838 		/*
   12839 		 * The only way we could get here is if a resolver didn't like
   12840 		 * an IOCTL we sent it.	 This shouldn't happen.
   12841 		 */
   12842 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
   12843 		    "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
   12844 		    ((struct iocblk *)mp->b_rptr)->ioc_cmd);
   12845 		freemsg(mp);
   12846 		return;
   12847 	case M_IOCACK:
   12848 		/* /dev/ip shouldn't see this */
   12849 		goto nak;
   12850 	case M_FLUSH:
   12851 		if (*mp->b_rptr & FLUSHW)
   12852 			flushq(q, FLUSHALL);
   12853 		if (q->q_next) {
   12854 			putnext(q, mp);
   12855 			return;
   12856 		}
   12857 		if (*mp->b_rptr & FLUSHR) {
   12858 			*mp->b_rptr &= ~FLUSHW;
   12859 			qreply(q, mp);
   12860 			return;
   12861 		}
   12862 		freemsg(mp);
   12863 		return;
   12864 	case M_CTL:
   12865 		break;
   12866 	case M_PROTO:
   12867 	case M_PCPROTO:
   12868 		/*
   12869 		 * The only PROTO messages we expect are SNMP-related.
   12870 		 */
   12871 		switch (((union T_primitives *)mp->b_rptr)->type) {
   12872 		case T_SVR4_OPTMGMT_REQ:
   12873 			ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
   12874 			    "flags %x\n",
   12875 			    ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
   12876 
   12877 			if (connp == NULL) {
   12878 				proto_str = "T_SVR4_OPTMGMT_REQ";
   12879 				goto protonak;
   12880 			}
   12881 
   12882 			/*
   12883 			 * All Solaris components should pass a db_credp
   12884 			 * for this TPI message, hence we ASSERT.
   12885 			 * But in case there is some other M_PROTO that looks
   12886 			 * like a TPI message sent by some other kernel
   12887 			 * component, we check and return an error.
   12888 			 */
   12889 			cr = msg_getcred(mp, NULL);
   12890 			ASSERT(cr != NULL);
   12891 			if (cr == NULL) {
   12892 				mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
   12893 				if (mp != NULL)
   12894 					qreply(q, mp);
   12895 				return;
   12896 			}
   12897 
   12898 			if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
   12899 				proto_str = "Bad SNMPCOM request?";
   12900 				goto protonak;
   12901 			}
   12902 			return;
   12903 		default:
   12904 			ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
   12905 			    (int)*(uint_t *)mp->b_rptr));
   12906 			freemsg(mp);
   12907 			return;
   12908 		}
   12909 	default:
   12910 		break;
   12911 	}
   12912 	if (q->q_next) {
   12913 		putnext(q, mp);
   12914 	} else
   12915 		freemsg(mp);
   12916 	return;
   12917 
   12918 nak:
   12919 	iocp->ioc_error = EINVAL;
   12920 	mp->b_datap->db_type = M_IOCNAK;
   12921 	iocp->ioc_count = 0;
   12922 	qreply(q, mp);
   12923 	return;
   12924 
   12925 protonak:
   12926 	cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
   12927 	if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
   12928 		qreply(q, mp);
   12929 }
   12930 
   12931 /*
   12932  * Process IP options in an outbound packet.  Verify that the nexthop in a
   12933  * strict source route is onlink.
   12934  * Returns non-zero if something fails in which case an ICMP error has been
   12935  * sent and mp freed.
   12936  *
   12937  * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
   12938  */
   12939 int
   12940 ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
   12941 {
   12942 	ipoptp_t	opts;
   12943 	uchar_t		*opt;
   12944 	uint8_t		optval;
   12945 	uint8_t		optlen;
   12946 	ipaddr_t	dst;
   12947 	intptr_t	code = 0;
   12948 	ire_t		*ire;
   12949 	ip_stack_t	*ipst = ixa->ixa_ipst;
   12950 	ip_recv_attr_t	iras;
   12951 
   12952 	ip2dbg(("ip_output_options\n"));
   12953 
   12954 	dst = ipha->ipha_dst;
   12955 	for (optval = ipoptp_first(&opts, ipha);
   12956 	    optval != IPOPT_EOL;
   12957 	    optval = ipoptp_next(&opts)) {
   12958 		opt = opts.ipoptp_cur;
   12959 		optlen = opts.ipoptp_len;
   12960 		ip2dbg(("ip_output_options: opt %d, len %d\n",
   12961 		    optval, optlen));
   12962 		switch (optval) {
   12963 			uint32_t off;
   12964 		case IPOPT_SSRR:
   12965 		case IPOPT_LSRR:
   12966 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   12967 				ip1dbg((
   12968 				    "ip_output_options: bad option offset\n"));
   12969 				code = (char *)&opt[IPOPT_OLEN] -
   12970 				    (char *)ipha;
   12971 				goto param_prob;
   12972 			}
   12973 			off = opt[IPOPT_OFFSET];
   12974 			ip1dbg(("ip_output_options: next hop 0x%x\n",
   12975 			    ntohl(dst)));
   12976 			/*
   12977 			 * For strict: verify that dst is directly
   12978 			 * reachable.
   12979 			 */
   12980 			if (optval == IPOPT_SSRR) {
   12981 				ire = ire_ftable_lookup_v4(dst, 0, 0,
   12982 				    IRE_INTERFACE, NULL, ALL_ZONES,
   12983 				    ixa->ixa_tsl,
   12984 				    MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
   12985 				    NULL);
   12986 				if (ire == NULL) {
   12987 					ip1dbg(("ip_output_options: SSRR not"
   12988 					    " directly reachable: 0x%x\n",
   12989 					    ntohl(dst)));
   12990 					goto bad_src_route;
   12991 				}
   12992 				ire_refrele(ire);
   12993 			}
   12994 			break;
   12995 		case IPOPT_RR:
   12996 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   12997 				ip1dbg((
   12998 				    "ip_output_options: bad option offset\n"));
   12999 				code = (char *)&opt[IPOPT_OLEN] -
   13000 				    (char *)ipha;
   13001 				goto param_prob;
   13002 			}
   13003 			break;
   13004 		case IPOPT_TS:
   13005 			/*
   13006 			 * Verify that length >=5 and that there is either
   13007 			 * room for another timestamp or that the overflow
   13008 			 * counter is not maxed out.
   13009 			 */
   13010 			code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
   13011 			if (optlen < IPOPT_MINLEN_IT) {
   13012 				goto param_prob;
   13013 			}
   13014 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   13015 				ip1dbg((
   13016 				    "ip_output_options: bad option offset\n"));
   13017 				code = (char *)&opt[IPOPT_OFFSET] -
   13018 				    (char *)ipha;
   13019 				goto param_prob;
   13020 			}
   13021 			switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
   13022 			case IPOPT_TS_TSONLY:
   13023 				off = IPOPT_TS_TIMELEN;
   13024 				break;
   13025 			case IPOPT_TS_TSANDADDR:
   13026 			case IPOPT_TS_PRESPEC:
   13027 			case IPOPT_TS_PRESPEC_RFC791:
   13028 				off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
   13029 				break;
   13030 			default:
   13031 				code = (char *)&opt[IPOPT_POS_OV_FLG] -
   13032 				    (char *)ipha;
   13033 				goto param_prob;
   13034 			}
   13035 			if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
   13036 			    (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
   13037 				/*
   13038 				 * No room and the overflow counter is 15
   13039 				 * already.
   13040 				 */
   13041 				goto param_prob;
   13042 			}
   13043 			break;
   13044 		}
   13045 	}
   13046 
   13047 	if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
   13048 		return (0);
   13049 
   13050 	ip1dbg(("ip_output_options: error processing IP options."));
   13051 	code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
   13052 
   13053 param_prob:
   13054 	bzero(&iras, sizeof (iras));
   13055 	iras.ira_ill = iras.ira_rill = ill;
   13056 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   13057 	iras.ira_rifindex = iras.ira_ruifindex;
   13058 	iras.ira_flags = IRAF_IS_IPV4;
   13059 
   13060 	ip_drop_output("ip_output_options", mp, ill);
   13061 	icmp_param_problem(mp, (uint8_t)code, &iras);
   13062 	ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   13063 	return (-1);
   13064 
   13065 bad_src_route:
   13066 	bzero(&iras, sizeof (iras));
   13067 	iras.ira_ill = iras.ira_rill = ill;
   13068 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   13069 	iras.ira_rifindex = iras.ira_ruifindex;
   13070 	iras.ira_flags = IRAF_IS_IPV4;
   13071 
   13072 	ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
   13073 	icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
   13074 	ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   13075 	return (-1);
   13076 }
   13077 
   13078 /*
   13079  * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
   13080  * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
   13081  * thru /etc/system.
   13082  */
   13083 #define	CONN_MAXDRAINCNT	64
   13084 
   13085 static void
   13086 conn_drain_init(ip_stack_t *ipst)
   13087 {
   13088 	int i, j;
   13089 	idl_tx_list_t *itl_tx;
   13090 
   13091 	ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
   13092 
   13093 	if ((ipst->ips_conn_drain_list_cnt == 0) ||
   13094 	    (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
   13095 		/*
   13096 		 * Default value of the number of drainers is the
   13097 		 * number of cpus, subject to maximum of 8 drainers.
   13098 		 */
   13099 		if (boot_max_ncpus != -1)
   13100 			ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
   13101 		else
   13102 			ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
   13103 	}
   13104 
   13105 	ipst->ips_idl_tx_list =
   13106 	    kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
   13107 	for (i = 0; i < TX_FANOUT_SIZE; i++) {
   13108 		itl_tx =  &ipst->ips_idl_tx_list[i];
   13109 		itl_tx->txl_drain_list =
   13110 		    kmem_zalloc(ipst->ips_conn_drain_list_cnt *
   13111 		    sizeof (idl_t), KM_SLEEP);
   13112 		mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
   13113 		for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
   13114 			mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
   13115 			    MUTEX_DEFAULT, NULL);
   13116 			itl_tx->txl_drain_list[j].idl_itl = itl_tx;
   13117 		}
   13118 	}
   13119 }
   13120 
   13121 static void
   13122 conn_drain_fini(ip_stack_t *ipst)
   13123 {
   13124 	int i;
   13125 	idl_tx_list_t *itl_tx;
   13126 
   13127 	for (i = 0; i < TX_FANOUT_SIZE; i++) {
   13128 		itl_tx =  &ipst->ips_idl_tx_list[i];
   13129 		kmem_free(itl_tx->txl_drain_list,
   13130 		    ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
   13131 	}
   13132 	kmem_free(ipst->ips_idl_tx_list,
   13133 	    TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
   13134 	ipst->ips_idl_tx_list = NULL;
   13135 }
   13136 
   13137 /*
   13138  * Flow control has blocked us from proceeding.  Insert the given conn in one
   13139  * of the conn drain lists.  When flow control is unblocked, either ip_wsrv()
   13140  * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
   13141  * will call conn_walk_drain().  See the flow control notes at the top of this
   13142  * file for more details.
   13143  */
   13144 void
   13145 conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
   13146 {
   13147 	idl_t	*idl = tx_list->txl_drain_list;
   13148 	uint_t	index;
   13149 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   13150 
   13151 	mutex_enter(&connp->conn_lock);
   13152 	if (connp->conn_state_flags & CONN_CLOSING) {
   13153 		/*
   13154 		 * The conn is closing as a result of which CONN_CLOSING
   13155 		 * is set. Return.
   13156 		 */
   13157 		mutex_exit(&connp->conn_lock);
   13158 		return;
   13159 	} else if (connp->conn_idl == NULL) {
   13160 		/*
   13161 		 * Assign the next drain list round robin. We dont' use
   13162 		 * a lock, and thus it may not be strictly round robin.
   13163 		 * Atomicity of load/stores is enough to make sure that
   13164 		 * conn_drain_list_index is always within bounds.
   13165 		 */
   13166 		index = tx_list->txl_drain_index;
   13167 		ASSERT(index < ipst->ips_conn_drain_list_cnt);
   13168 		connp->conn_idl = &tx_list->txl_drain_list[index];
   13169 		index++;
   13170 		if (index == ipst->ips_conn_drain_list_cnt)
   13171 			index = 0;
   13172 		tx_list->txl_drain_index = index;
   13173 	} else {
   13174 		ASSERT(connp->conn_idl->idl_itl == tx_list);
   13175 	}
   13176 	mutex_exit(&connp->conn_lock);
   13177 
   13178 	idl = connp->conn_idl;
   13179 	mutex_enter(&idl->idl_lock);
   13180 	if ((connp->conn_drain_prev != NULL) ||
   13181 	    (connp->conn_state_flags & CONN_CLOSING)) {
   13182 		/*
   13183 		 * The conn is either already in the drain list or closing.
   13184 		 * (We needed to check for CONN_CLOSING again since close can
   13185 		 * sneak in between dropping conn_lock and acquiring idl_lock.)
   13186 		 */
   13187 		mutex_exit(&idl->idl_lock);
   13188 		return;
   13189 	}
   13190 
   13191 	/*
   13192 	 * The conn is not in the drain list. Insert it at the
   13193 	 * tail of the drain list. The drain list is circular
   13194 	 * and doubly linked. idl_conn points to the 1st element
   13195 	 * in the list.
   13196 	 */
   13197 	if (idl->idl_conn == NULL) {
   13198 		idl->idl_conn = connp;
   13199 		connp->conn_drain_next = connp;
   13200 		connp->conn_drain_prev = connp;
   13201 	} else {
   13202 		conn_t *head = idl->idl_conn;
   13203 
   13204 		connp->conn_drain_next = head;
   13205 		connp->conn_drain_prev = head->conn_drain_prev;
   13206 		head->conn_drain_prev->conn_drain_next = connp;
   13207 		head->conn_drain_prev = connp;
   13208 	}
   13209 	/*
   13210 	 * For non streams based sockets assert flow control.
   13211 	 */
   13212 	conn_setqfull(connp, NULL);
   13213 	mutex_exit(&idl->idl_lock);
   13214 }
   13215 
   13216 static void
   13217 conn_drain_remove(conn_t *connp)
   13218 {
   13219 	idl_t *idl = connp->conn_idl;
   13220 
   13221 	if (idl != NULL) {
   13222 		/*
   13223 		 * Remove ourself from the drain list.
   13224 		 */
   13225 		if (connp->conn_drain_next == connp) {
   13226 			/* Singleton in the list */
   13227 			ASSERT(connp->conn_drain_prev == connp);
   13228 			idl->idl_conn = NULL;
   13229 		} else {
   13230 			connp->conn_drain_prev->conn_drain_next =
   13231 			    connp->conn_drain_next;
   13232 			connp->conn_drain_next->conn_drain_prev =
   13233 			    connp->conn_drain_prev;
   13234 			if (idl->idl_conn == connp)
   13235 				idl->idl_conn = connp->conn_drain_next;
   13236 		}
   13237 
   13238 		/*
   13239 		 * NOTE: because conn_idl is associated with a specific drain
   13240 		 * list which in turn is tied to the index the TX ring
   13241 		 * (txl_cookie) hashes to, and because the TX ring can change
   13242 		 * over the lifetime of the conn_t, we must clear conn_idl so
   13243 		 * a subsequent conn_drain_insert() will set conn_idl again
   13244 		 * based on the latest txl_cookie.
   13245 		 */
   13246 		connp->conn_idl = NULL;
   13247 	}
   13248 	connp->conn_drain_next = NULL;
   13249 	connp->conn_drain_prev = NULL;
   13250 
   13251 	conn_clrqfull(connp, NULL);
   13252 	/*
   13253 	 * For streams based sockets open up flow control.
   13254 	 */
   13255 	if (!IPCL_IS_NONSTR(connp))
   13256 		enableok(connp->conn_wq);
   13257 }
   13258 
   13259 /*
   13260  * This conn is closing, and we are called from ip_close. OR
   13261  * this conn is draining because flow-control on the ill has been relieved.
   13262  *
   13263  * We must also need to remove conn's on this idl from the list, and also
   13264  * inform the sockfs upcalls about the change in flow-control.
   13265  */
   13266 static void
   13267 conn_drain(conn_t *connp, boolean_t closing)
   13268 {
   13269 	idl_t *idl;
   13270 	conn_t *next_connp;
   13271 
   13272 	/*
   13273 	 * connp->conn_idl is stable at this point, and no lock is needed
   13274 	 * to check it. If we are called from ip_close, close has already
   13275 	 * set CONN_CLOSING, thus freezing the value of conn_idl, and
   13276 	 * called us only because conn_idl is non-null. If we are called thru
   13277 	 * service, conn_idl could be null, but it cannot change because
   13278 	 * service is single-threaded per queue, and there cannot be another
   13279 	 * instance of service trying to call conn_drain_insert on this conn
   13280 	 * now.
   13281 	 */
   13282 	ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
   13283 
   13284 	/*
   13285 	 * If the conn doesn't exist or is not on a drain list, bail.
   13286 	 */
   13287 	if (connp == NULL || connp->conn_idl == NULL ||
   13288 	    connp->conn_drain_prev == NULL) {
   13289 		return;
   13290 	}
   13291 
   13292 	idl = connp->conn_idl;
   13293 	ASSERT(MUTEX_HELD(&idl->idl_lock));
   13294 
   13295 	if (!closing) {
   13296 		next_connp = connp->conn_drain_next;
   13297 		while (next_connp != connp) {
   13298 			conn_t *delconnp = next_connp;
   13299 
   13300 			next_connp = next_connp->conn_drain_next;
   13301 			conn_drain_remove(delconnp);
   13302 		}
   13303 		ASSERT(connp->conn_drain_next == idl->idl_conn);
   13304 	}
   13305 	conn_drain_remove(connp);
   13306 }
   13307 
   13308 /*
   13309  * Write service routine. Shared perimeter entry point.
   13310  * The device queue's messages has fallen below the low water mark and STREAMS
   13311  * has backenabled the ill_wq. Send sockfs notification about flow-control on
   13312  * each waiting conn.
   13313  */
   13314 void
   13315 ip_wsrv(queue_t *q)
   13316 {
   13317 	ill_t	*ill;
   13318 
   13319 	ill = (ill_t *)q->q_ptr;
   13320 	if (ill->ill_state_flags == 0) {
   13321 		ip_stack_t *ipst = ill->ill_ipst;
   13322 
   13323 		/*
   13324 		 * The device flow control has opened up.
   13325 		 * Walk through conn drain lists and qenable the
   13326 		 * first conn in each list. This makes sense only
   13327 		 * if the stream is fully plumbed and setup.
   13328 		 * Hence the ill_state_flags check above.
   13329 		 */
   13330 		ip1dbg(("ip_wsrv: walking\n"));
   13331 		conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
   13332 		enableok(ill->ill_wq);
   13333 	}
   13334 }
   13335 
   13336 /*
   13337  * Callback to disable flow control in IP.
   13338  *
   13339  * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
   13340  * is enabled.
   13341  *
   13342  * When MAC_TX() is not able to send any more packets, dld sets its queue
   13343  * to QFULL and enable the STREAMS flow control. Later, when the underlying
   13344  * driver is able to continue to send packets, it calls mac_tx_(ring_)update()
   13345  * function and wakes up corresponding mac worker threads, which in turn
   13346  * calls this callback function, and disables flow control.
   13347  */
   13348 void
   13349 ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
   13350 {
   13351 	ill_t *ill = (ill_t *)arg;
   13352 	ip_stack_t *ipst = ill->ill_ipst;
   13353 	idl_tx_list_t *idl_txl;
   13354 
   13355 	idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
   13356 	mutex_enter(&idl_txl->txl_lock);
   13357 	/* add code to to set a flag to indicate idl_txl is enabled */
   13358 	conn_walk_drain(ipst, idl_txl);
   13359 	mutex_exit(&idl_txl->txl_lock);
   13360 }
   13361 
   13362 /*
   13363  * Flow control has been relieved and STREAMS has backenabled us; drain
   13364  * all the conn lists on `tx_list'.
   13365  */
   13366 static void
   13367 conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
   13368 {
   13369 	int i;
   13370 	idl_t *idl;
   13371 
   13372 	IP_STAT(ipst, ip_conn_walk_drain);
   13373 
   13374 	for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
   13375 		idl = &tx_list->txl_drain_list[i];
   13376 		mutex_enter(&idl->idl_lock);
   13377 		conn_drain(idl->idl_conn, B_FALSE);
   13378 		mutex_exit(&idl->idl_lock);
   13379 	}
   13380 }
   13381 
   13382 /*
   13383  * Determine if the ill and multicast aspects of that packets
   13384  * "matches" the conn.
   13385  */
   13386 boolean_t
   13387 conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
   13388 {
   13389 	ill_t		*ill = ira->ira_rill;
   13390 	zoneid_t	zoneid = ira->ira_zoneid;
   13391 	uint_t		in_ifindex;
   13392 	ipaddr_t	dst, src;
   13393 
   13394 	dst = ipha->ipha_dst;
   13395 	src = ipha->ipha_src;
   13396 
   13397 	/*
   13398 	 * conn_incoming_ifindex is set by IP_BOUND_IF which limits
   13399 	 * unicast, broadcast and multicast reception to
   13400 	 * conn_incoming_ifindex.
   13401 	 * conn_wantpacket is called for unicast, broadcast and
   13402 	 * multicast packets.
   13403 	 */
   13404 	in_ifindex = connp->conn_incoming_ifindex;
   13405 
   13406 	/* mpathd can bind to the under IPMP interface, which we allow */
   13407 	if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
   13408 		if (!IS_UNDER_IPMP(ill))
   13409 			return (B_FALSE);
   13410 
   13411 		if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
   13412 			return (B_FALSE);
   13413 	}
   13414 
   13415 	if (!IPCL_ZONE_MATCH(connp, zoneid))
   13416 		return (B_FALSE);
   13417 
   13418 	if (!(ira->ira_flags & IRAF_MULTICAST))
   13419 		return (B_TRUE);
   13420 
   13421 	if (connp->conn_multi_router) {
   13422 		/* multicast packet and multicast router socket: send up */
   13423 		return (B_TRUE);
   13424 	}
   13425 
   13426 	if (ipha->ipha_protocol == IPPROTO_PIM ||
   13427 	    ipha->ipha_protocol == IPPROTO_RSVP)
   13428 		return (B_TRUE);
   13429 
   13430 	return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
   13431 }
   13432 
   13433 void
   13434 conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
   13435 {
   13436 	if (IPCL_IS_NONSTR(connp)) {
   13437 		(*connp->conn_upcalls->su_txq_full)
   13438 		    (connp->conn_upper_handle, B_TRUE);
   13439 		if (flow_stopped != NULL)
   13440 			*flow_stopped = B_TRUE;
   13441 	} else {
   13442 		queue_t *q = connp->conn_wq;
   13443 
   13444 		ASSERT(q != NULL);
   13445 		if (!(q->q_flag & QFULL)) {
   13446 			mutex_enter(QLOCK(q));
   13447 			if (!(q->q_flag & QFULL)) {
   13448 				/* still need to set QFULL */
   13449 				q->q_flag |= QFULL;
   13450 				/* set flow_stopped to true under QLOCK */
   13451 				if (flow_stopped != NULL)
   13452 					*flow_stopped = B_TRUE;
   13453 				mutex_exit(QLOCK(q));
   13454 			} else {
   13455 				/* flow_stopped is left unchanged */
   13456 				mutex_exit(QLOCK(q));
   13457 			}
   13458 		}
   13459 	}
   13460 }
   13461 
   13462 void
   13463 conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
   13464 {
   13465 	if (IPCL_IS_NONSTR(connp)) {
   13466 		(*connp->conn_upcalls->su_txq_full)
   13467 		    (connp->conn_upper_handle, B_FALSE);
   13468 		if (flow_stopped != NULL)
   13469 			*flow_stopped = B_FALSE;
   13470 	} else {
   13471 		queue_t *q = connp->conn_wq;
   13472 
   13473 		ASSERT(q != NULL);
   13474 		if (q->q_flag & QFULL) {
   13475 			mutex_enter(QLOCK(q));
   13476 			if (q->q_flag & QFULL) {
   13477 				q->q_flag &= ~QFULL;
   13478 				/* set flow_stopped to false under QLOCK */
   13479 				if (flow_stopped != NULL)
   13480 					*flow_stopped = B_FALSE;
   13481 				mutex_exit(QLOCK(q));
   13482 				if (q->q_flag & QWANTW)
   13483 					qbackenable(q, 0);
   13484 			} else {
   13485 				/* flow_stopped is left unchanged */
   13486 				mutex_exit(QLOCK(q));
   13487 			}
   13488 		}
   13489 	}
   13490 
   13491 	mutex_enter(&connp->conn_lock);
   13492 	connp->conn_blocked = B_FALSE;
   13493 	mutex_exit(&connp->conn_lock);
   13494 }
   13495 
   13496 /*
   13497  * Return the length in bytes of the IPv4 headers (base header, label, and
   13498  * other IP options) that will be needed based on the
   13499  * ip_pkt_t structure passed by the caller.
   13500  *
   13501  * The returned length does not include the length of the upper level
   13502  * protocol (ULP) header.
   13503  * The caller needs to check that the length doesn't exceed the max for IPv4.
   13504  */
   13505 int
   13506 ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
   13507 {
   13508 	int len;
   13509 
   13510 	len = IP_SIMPLE_HDR_LENGTH;
   13511 	if (ipp->ipp_fields & IPPF_LABEL_V4) {
   13512 		ASSERT(ipp->ipp_label_len_v4 != 0);
   13513 		/* We need to round up here */
   13514 		len += (ipp->ipp_label_len_v4 + 3) & ~3;
   13515 	}
   13516 
   13517 	if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   13518 		ASSERT(ipp->ipp_ipv4_options_len != 0);
   13519 		ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
   13520 		len += ipp->ipp_ipv4_options_len;
   13521 	}
   13522 	return (len);
   13523 }
   13524 
   13525 /*
   13526  * All-purpose routine to build an IPv4 header with options based
   13527  * on the abstract ip_pkt_t.
   13528  *
   13529  * The caller has to set the source and destination address as well as
   13530  * ipha_length. The caller has to massage any source route and compensate
   13531  * for the ULP pseudo-header checksum due to the source route.
   13532  */
   13533 void
   13534 ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
   13535     uint8_t protocol)
   13536 {
   13537 	ipha_t	*ipha = (ipha_t *)buf;
   13538 	uint8_t *cp;
   13539 
   13540 	/* Initialize IPv4 header */
   13541 	ipha->ipha_type_of_service = ipp->ipp_type_of_service;
   13542 	ipha->ipha_length = 0;	/* Caller will set later */
   13543 	ipha->ipha_ident = 0;
   13544 	ipha->ipha_fragment_offset_and_flags = 0;
   13545 	ipha->ipha_ttl = ipp->ipp_unicast_hops;
   13546 	ipha->ipha_protocol = protocol;
   13547 	ipha->ipha_hdr_checksum = 0;
   13548 
   13549 	if ((ipp->ipp_fields & IPPF_ADDR) &&
   13550 	    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
   13551 		ipha->ipha_src = ipp->ipp_addr_v4;
   13552 
   13553 	cp = (uint8_t *)&ipha[1];
   13554 	if (ipp->ipp_fields & IPPF_LABEL_V4) {
   13555 		ASSERT(ipp->ipp_label_len_v4 != 0);
   13556 		bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
   13557 		cp += ipp->ipp_label_len_v4;
   13558 		/* We need to round up here */
   13559 		while ((uintptr_t)cp & 0x3) {
   13560 			*cp++ = IPOPT_NOP;
   13561 		}
   13562 	}
   13563 
   13564 	if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
   13565 		ASSERT(ipp->ipp_ipv4_options_len != 0);
   13566 		ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
   13567 		bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
   13568 		cp += ipp->ipp_ipv4_options_len;
   13569 	}
   13570 	ipha->ipha_version_and_hdr_length =
   13571 	    (uint8_t)((IP_VERSION << 4) + buf_len / 4);
   13572 
   13573 	ASSERT((int)(cp - buf) == buf_len);
   13574 }
   13575 
   13576 /* Allocate the private structure */
   13577 static int
   13578 ip_priv_alloc(void **bufp)
   13579 {
   13580 	void	*buf;
   13581 
   13582 	if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
   13583 		return (ENOMEM);
   13584 
   13585 	*bufp = buf;
   13586 	return (0);
   13587 }
   13588 
   13589 /* Function to delete the private structure */
   13590 void
   13591 ip_priv_free(void *buf)
   13592 {
   13593 	ASSERT(buf != NULL);
   13594 	kmem_free(buf, sizeof (ip_priv_t));
   13595 }
   13596 
   13597 /*
   13598  * The entry point for IPPF processing.
   13599  * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
   13600  * routine just returns.
   13601  *
   13602  * When called, ip_process generates an ipp_packet_t structure
   13603  * which holds the state information for this packet and invokes the
   13604  * the classifier (via ipp_packet_process). The classification, depending on
   13605  * configured filters, results in a list of actions for this packet. Invoking
   13606  * an action may cause the packet to be dropped, in which case we return NULL.
   13607  * proc indicates the callout position for
   13608  * this packet and ill is the interface this packet arrived on or will leave
   13609  * on (inbound and outbound resp.).
   13610  *
   13611  * We do the processing on the rill (mapped to the upper if ipmp), but MIB
   13612  * on the ill corrsponding to the destination IP address.
   13613  */
   13614 mblk_t *
   13615 ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
   13616 {
   13617 	ip_priv_t	*priv;
   13618 	ipp_action_id_t	aid;
   13619 	int		rc = 0;
   13620 	ipp_packet_t	*pp;
   13621 
   13622 	/* If the classifier is not loaded, return  */
   13623 	if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
   13624 		return (mp);
   13625 	}
   13626 
   13627 	ASSERT(mp != NULL);
   13628 
   13629 	/* Allocate the packet structure */
   13630 	rc = ipp_packet_alloc(&pp, "ip", aid);
   13631 	if (rc != 0)
   13632 		goto drop;
   13633 
   13634 	/* Allocate the private structure */
   13635 	rc = ip_priv_alloc((void **)&priv);
   13636 	if (rc != 0) {
   13637 		ipp_packet_free(pp);
   13638 		goto drop;
   13639 	}
   13640 	priv->proc = proc;
   13641 	priv->ill_index = ill_get_upper_ifindex(rill);
   13642 
   13643 	ipp_packet_set_private(pp, priv, ip_priv_free);
   13644 	ipp_packet_set_data(pp, mp);
   13645 
   13646 	/* Invoke the classifier */
   13647 	rc = ipp_packet_process(&pp);
   13648 	if (pp != NULL) {
   13649 		mp = ipp_packet_get_data(pp);
   13650 		ipp_packet_free(pp);
   13651 		if (rc != 0)
   13652 			goto drop;
   13653 		return (mp);
   13654 	} else {
   13655 		/* No mp to trace in ip_drop_input/ip_drop_output  */
   13656 		mp = NULL;
   13657 	}
   13658 drop:
   13659 	if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
   13660 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   13661 		ip_drop_input("ip_process", mp, ill);
   13662 	} else {
   13663 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   13664 		ip_drop_output("ip_process", mp, ill);
   13665 	}
   13666 	freemsg(mp);
   13667 	return (NULL);
   13668 }
   13669 
   13670 /*
   13671  * Propagate a multicast group membership operation (add/drop) on
   13672  * all the interfaces crossed by the related multirt routes.
   13673  * The call is considered successful if the operation succeeds
   13674  * on at least one interface.
   13675  *
   13676  * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
   13677  * multicast addresses with the ire argument being the first one.
   13678  * We walk the bucket to find all the of those.
   13679  *
   13680  * Common to IPv4 and IPv6.
   13681  */
   13682 static int
   13683 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
   13684     const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
   13685     ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
   13686     mcast_record_t fmode, const in6_addr_t *v6src)
   13687 {
   13688 	ire_t		*ire_gw;
   13689 	irb_t		*irb;
   13690 	int		ifindex;
   13691 	int		error = 0;
   13692 	int		result;
   13693 	ip_stack_t	*ipst = ire->ire_ipst;
   13694 	ipaddr_t	group;
   13695 	boolean_t	isv6;
   13696 	int		match_flags;
   13697 
   13698 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
   13699 		IN6_V4MAPPED_TO_IPADDR(v6group, group);
   13700 		isv6 = B_FALSE;
   13701 	} else {
   13702 		isv6 = B_TRUE;
   13703 	}
   13704 
   13705 	irb = ire->ire_bucket;
   13706 	ASSERT(irb != NULL);
   13707 
   13708 	result = 0;
   13709 	irb_refhold(irb);
   13710 	for (; ire != NULL; ire = ire->ire_next) {
   13711 		if ((ire->ire_flags & RTF_MULTIRT) == 0)
   13712 			continue;
   13713 
   13714 		/* We handle -ifp routes by matching on the ill if set */
   13715 		match_flags = MATCH_IRE_TYPE;
   13716 		if (ire->ire_ill != NULL)
   13717 			match_flags |= MATCH_IRE_ILL;
   13718 
   13719 		if (isv6) {
   13720 			if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
   13721 				continue;
   13722 
   13723 			ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
   13724 			    0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
   13725 			    match_flags, 0, ipst, NULL);
   13726 		} else {
   13727 			if (ire->ire_addr != group)
   13728 				continue;
   13729 
   13730 			ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
   13731 			    0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
   13732 			    match_flags, 0, ipst, NULL);
   13733 		}
   13734 		/* No interface route exists for the gateway; skip this ire. */
   13735 		if (ire_gw == NULL)
   13736 			continue;
   13737 		if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
   13738 			ire_refrele(ire_gw);
   13739 			continue;
   13740 		}
   13741 		ASSERT(ire_gw->ire_ill != NULL);	/* IRE_INTERFACE */
   13742 		ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
   13743 
   13744 		/*
   13745 		 * The operation is considered a success if
   13746 		 * it succeeds at least once on any one interface.
   13747 		 */
   13748 		error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
   13749 		    fmode, v6src);
   13750 		if (error == 0)
   13751 			result = CGTP_MCAST_SUCCESS;
   13752 
   13753 		ire_refrele(ire_gw);
   13754 	}
   13755 	irb_refrele(irb);
   13756 	/*
   13757 	 * Consider the call as successful if we succeeded on at least
   13758 	 * one interface. Otherwise, return the last encountered error.
   13759 	 */
   13760 	return (result == CGTP_MCAST_SUCCESS ? 0 : error);
   13761 }
   13762 
   13763 /*
   13764  * Return the expected CGTP hooks version number.
   13765  */
   13766 int
   13767 ip_cgtp_filter_supported(void)
   13768 {
   13769 	return (ip_cgtp_filter_rev);
   13770 }
   13771 
   13772 /*
   13773  * CGTP hooks can be registered by invoking this function.
   13774  * Checks that the version number matches.
   13775  */
   13776 int
   13777 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
   13778 {
   13779 	netstack_t *ns;
   13780 	ip_stack_t *ipst;
   13781 
   13782 	if (ops->cfo_filter_rev != CGTP_FILTER_REV)
   13783 		return (ENOTSUP);
   13784 
   13785 	ns = netstack_find_by_stackid(stackid);
   13786 	if (ns == NULL)
   13787 		return (EINVAL);
   13788 	ipst = ns->netstack_ip;
   13789 	ASSERT(ipst != NULL);
   13790 
   13791 	if (ipst->ips_ip_cgtp_filter_ops != NULL) {
   13792 		netstack_rele(ns);
   13793 		return (EALREADY);
   13794 	}
   13795 
   13796 	ipst->ips_ip_cgtp_filter_ops = ops;
   13797 
   13798 	ill_set_inputfn_all(ipst);
   13799 
   13800 	netstack_rele(ns);
   13801 	return (0);
   13802 }
   13803 
   13804 /*
   13805  * CGTP hooks can be unregistered by invoking this function.
   13806  * Returns ENXIO if there was no registration.
   13807  * Returns EBUSY if the ndd variable has not been turned off.
   13808  */
   13809 int
   13810 ip_cgtp_filter_unregister(netstackid_t stackid)
   13811 {
   13812 	netstack_t *ns;
   13813 	ip_stack_t *ipst;
   13814 
   13815 	ns = netstack_find_by_stackid(stackid);
   13816 	if (ns == NULL)
   13817 		return (EINVAL);
   13818 	ipst = ns->netstack_ip;
   13819 	ASSERT(ipst != NULL);
   13820 
   13821 	if (ipst->ips_ip_cgtp_filter) {
   13822 		netstack_rele(ns);
   13823 		return (EBUSY);
   13824 	}
   13825 
   13826 	if (ipst->ips_ip_cgtp_filter_ops == NULL) {
   13827 		netstack_rele(ns);
   13828 		return (ENXIO);
   13829 	}
   13830 	ipst->ips_ip_cgtp_filter_ops = NULL;
   13831 
   13832 	ill_set_inputfn_all(ipst);
   13833 
   13834 	netstack_rele(ns);
   13835 	return (0);
   13836 }
   13837 
   13838 /*
   13839  * Check whether there is a CGTP filter registration.
   13840  * Returns non-zero if there is a registration, otherwise returns zero.
   13841  * Note: returns zero if bad stackid.
   13842  */
   13843 int
   13844 ip_cgtp_filter_is_registered(netstackid_t stackid)
   13845 {
   13846 	netstack_t *ns;
   13847 	ip_stack_t *ipst;
   13848 	int ret;
   13849 
   13850 	ns = netstack_find_by_stackid(stackid);
   13851 	if (ns == NULL)
   13852 		return (0);
   13853 	ipst = ns->netstack_ip;
   13854 	ASSERT(ipst != NULL);
   13855 
   13856 	if (ipst->ips_ip_cgtp_filter_ops != NULL)
   13857 		ret = 1;
   13858 	else
   13859 		ret = 0;
   13860 
   13861 	netstack_rele(ns);
   13862 	return (ret);
   13863 }
   13864 
   13865 static int
   13866 ip_squeue_switch(int val)
   13867 {
   13868 	int rval;
   13869 
   13870 	switch (val) {
   13871 	case IP_SQUEUE_ENTER_NODRAIN:
   13872 		rval = SQ_NODRAIN;
   13873 		break;
   13874 	case IP_SQUEUE_ENTER:
   13875 		rval = SQ_PROCESS;
   13876 		break;
   13877 	case IP_SQUEUE_FILL:
   13878 	default:
   13879 		rval = SQ_FILL;
   13880 		break;
   13881 	}
   13882 	return (rval);
   13883 }
   13884 
   13885 static void *
   13886 ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
   13887 {
   13888 	kstat_t *ksp;
   13889 
   13890 	ip_stat_t template = {
   13891 		{ "ip_udp_fannorm", 		KSTAT_DATA_UINT64 },
   13892 		{ "ip_udp_fanmb", 		KSTAT_DATA_UINT64 },
   13893 		{ "ip_recv_pullup", 		KSTAT_DATA_UINT64 },
   13894 		{ "ip_db_ref",			KSTAT_DATA_UINT64 },
   13895 		{ "ip_notaligned",		KSTAT_DATA_UINT64 },
   13896 		{ "ip_multimblk",		KSTAT_DATA_UINT64 },
   13897 		{ "ip_opt",			KSTAT_DATA_UINT64 },
   13898 		{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
   13899 		{ "ip_conn_flputbq",		KSTAT_DATA_UINT64 },
   13900 		{ "ip_conn_walk_drain",		KSTAT_DATA_UINT64 },
   13901 		{ "ip_out_sw_cksum",		KSTAT_DATA_UINT64 },
   13902 		{ "ip_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
   13903 		{ "ip_in_sw_cksum",		KSTAT_DATA_UINT64 },
   13904 		{ "ip_ire_reclaim_calls",	KSTAT_DATA_UINT64 },
   13905 		{ "ip_ire_reclaim_deleted",	KSTAT_DATA_UINT64 },
   13906 		{ "ip_nce_reclaim_calls",	KSTAT_DATA_UINT64 },
   13907 		{ "ip_nce_reclaim_deleted",	KSTAT_DATA_UINT64 },
   13908 		{ "ip_dce_reclaim_calls",	KSTAT_DATA_UINT64 },
   13909 		{ "ip_dce_reclaim_deleted",	KSTAT_DATA_UINT64 },
   13910 		{ "ip_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
   13911 		{ "ip_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
   13912 		{ "ip_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
   13913 		{ "ip_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
   13914 		{ "ip_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
   13915 		{ "ip_udp_in_sw_cksum_err",	KSTAT_DATA_UINT64 },
   13916 		{ "conn_in_recvdstaddr",	KSTAT_DATA_UINT64 },
   13917 		{ "conn_in_recvopts",		KSTAT_DATA_UINT64 },
   13918 		{ "conn_in_recvif",		KSTAT_DATA_UINT64 },
   13919 		{ "conn_in_recvslla",		KSTAT_DATA_UINT64 },
   13920 		{ "conn_in_recvucred",		KSTAT_DATA_UINT64 },
   13921 		{ "conn_in_recvttl",		KSTAT_DATA_UINT64 },
   13922 		{ "conn_in_recvhopopts",	KSTAT_DATA_UINT64 },
   13923 		{ "conn_in_recvhoplimit",	KSTAT_DATA_UINT64 },
   13924 		{ "conn_in_recvdstopts",	KSTAT_DATA_UINT64 },
   13925 		{ "conn_in_recvrthdrdstopts",	KSTAT_DATA_UINT64 },
   13926 		{ "conn_in_recvrthdr",		KSTAT_DATA_UINT64 },
   13927 		{ "conn_in_recvpktinfo",	KSTAT_DATA_UINT64 },
   13928 		{ "conn_in_recvtclass",		KSTAT_DATA_UINT64 },
   13929 		{ "conn_in_timestamp",		KSTAT_DATA_UINT64 },
   13930 	};
   13931 
   13932 	ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
   13933 	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
   13934 	    KSTAT_FLAG_VIRTUAL, stackid);
   13935 
   13936 	if (ksp == NULL)
   13937 		return (NULL);
   13938 
   13939 	bcopy(&template, ip_statisticsp, sizeof (template));
   13940 	ksp->ks_data = (void *)ip_statisticsp;
   13941 	ksp->ks_private = (void *)(uintptr_t)stackid;
   13942 
   13943 	kstat_install(ksp);
   13944 	return (ksp);
   13945 }
   13946 
   13947 static void
   13948 ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
   13949 {
   13950 	if (ksp != NULL) {
   13951 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
   13952 		kstat_delete_netstack(ksp, stackid);
   13953 	}
   13954 }
   13955 
   13956 static void *
   13957 ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
   13958 {
   13959 	kstat_t	*ksp;
   13960 
   13961 	ip_named_kstat_t template = {
   13962 		{ "forwarding",		KSTAT_DATA_UINT32, 0 },
   13963 		{ "defaultTTL",		KSTAT_DATA_UINT32, 0 },
   13964 		{ "inReceives",		KSTAT_DATA_UINT64, 0 },
   13965 		{ "inHdrErrors",	KSTAT_DATA_UINT32, 0 },
   13966 		{ "inAddrErrors",	KSTAT_DATA_UINT32, 0 },
   13967 		{ "forwDatagrams",	KSTAT_DATA_UINT64, 0 },
   13968 		{ "inUnknownProtos",	KSTAT_DATA_UINT32, 0 },
   13969 		{ "inDiscards",		KSTAT_DATA_UINT32, 0 },
   13970 		{ "inDelivers",		KSTAT_DATA_UINT64, 0 },
   13971 		{ "outRequests",	KSTAT_DATA_UINT64, 0 },
   13972 		{ "outDiscards",	KSTAT_DATA_UINT32, 0 },
   13973 		{ "outNoRoutes",	KSTAT_DATA_UINT32, 0 },
   13974 		{ "reasmTimeout",	KSTAT_DATA_UINT32, 0 },
   13975 		{ "reasmReqds",		KSTAT_DATA_UINT32, 0 },
   13976 		{ "reasmOKs",		KSTAT_DATA_UINT32, 0 },
   13977 		{ "reasmFails",		KSTAT_DATA_UINT32, 0 },
   13978 		{ "fragOKs",		KSTAT_DATA_UINT32, 0 },
   13979 		{ "fragFails",		KSTAT_DATA_UINT32, 0 },
   13980 		{ "fragCreates",	KSTAT_DATA_UINT32, 0 },
   13981 		{ "addrEntrySize",	KSTAT_DATA_INT32, 0 },
   13982 		{ "routeEntrySize",	KSTAT_DATA_INT32, 0 },
   13983 		{ "netToMediaEntrySize",	KSTAT_DATA_INT32, 0 },
   13984 		{ "routingDiscards",	KSTAT_DATA_UINT32, 0 },
   13985 		{ "inErrs",		KSTAT_DATA_UINT32, 0 },
   13986 		{ "noPorts",		KSTAT_DATA_UINT32, 0 },
   13987 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
   13988 		{ "reasmDuplicates",	KSTAT_DATA_UINT32, 0 },
   13989 		{ "reasmPartDups",	KSTAT_DATA_UINT32, 0 },
   13990 		{ "forwProhibits",	KSTAT_DATA_UINT32, 0 },
   13991 		{ "udpInCksumErrs",	KSTAT_DATA_UINT32, 0 },
   13992 		{ "udpInOverflows",	KSTAT_DATA_UINT32, 0 },
   13993 		{ "rawipInOverflows",	KSTAT_DATA_UINT32, 0 },
   13994 		{ "ipsecInSucceeded",	KSTAT_DATA_UINT32, 0 },
   13995 		{ "ipsecInFailed",	KSTAT_DATA_INT32, 0 },
   13996 		{ "memberEntrySize",	KSTAT_DATA_INT32, 0 },
   13997 		{ "inIPv6",		KSTAT_DATA_UINT32, 0 },
   13998 		{ "outIPv6",		KSTAT_DATA_UINT32, 0 },
   13999 		{ "outSwitchIPv6",	KSTAT_DATA_UINT32, 0 },
   14000 	};
   14001 
   14002 	ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
   14003 	    NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
   14004 	if (ksp == NULL || ksp->ks_data == NULL)
   14005 		return (NULL);
   14006 
   14007 	template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2;
   14008 	template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
   14009 	template.reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
   14010 	template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
   14011 	template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);
   14012 
   14013 	template.netToMediaEntrySize.value.i32 =
   14014 	    sizeof (mib2_ipNetToMediaEntry_t);
   14015 
   14016 	template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);
   14017 
   14018 	bcopy(&template, ksp->ks_data, sizeof (template));
   14019 	ksp->ks_update = ip_kstat_update;
   14020 	ksp->ks_private = (void *)(uintptr_t)stackid;
   14021 
   14022 	kstat_install(ksp);
   14023 	return (ksp);
   14024 }
   14025 
   14026 static void
   14027 ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
   14028 {
   14029 	if (ksp != NULL) {
   14030 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
   14031 		kstat_delete_netstack(ksp, stackid);
   14032 	}
   14033 }
   14034 
   14035 static int
   14036 ip_kstat_update(kstat_t *kp, int rw)
   14037 {
   14038 	ip_named_kstat_t *ipkp;
   14039 	mib2_ipIfStatsEntry_t ipmib;
   14040 	ill_walk_context_t ctx;
   14041 	ill_t *ill;
   14042 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
   14043 	netstack_t	*ns;
   14044 	ip_stack_t	*ipst;
   14045 
   14046 	if (kp == NULL || kp->ks_data == NULL)
   14047 		return (EIO);
   14048 
   14049 	if (rw == KSTAT_WRITE)
   14050 		return (EACCES);
   14051 
   14052 	ns = netstack_find_by_stackid(stackid);
   14053 	if (ns == NULL)
   14054 		return (-1);
   14055 	ipst = ns->netstack_ip;
   14056 	if (ipst == NULL) {
   14057 		netstack_rele(ns);
   14058 		return (-1);
   14059 	}
   14060 	ipkp = (ip_named_kstat_t *)kp->ks_data;
   14061 
   14062 	bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
   14063 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   14064 	ill = ILL_START_WALK_V4(&ctx, ipst);
   14065 	for (; ill != NULL; ill = ill_next(&ctx, ill))
   14066 		ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
   14067 	rw_exit(&ipst->ips_ill_g_lock);
   14068 
   14069 	ipkp->forwarding.value.ui32 =		ipmib.ipIfStatsForwarding;
   14070 	ipkp->defaultTTL.value.ui32 =		ipmib.ipIfStatsDefaultTTL;
   14071 	ipkp->inReceives.value.ui64 =		ipmib.ipIfStatsHCInReceives;
   14072 	ipkp->inHdrErrors.value.ui32 =		ipmib.ipIfStatsInHdrErrors;
   14073 	ipkp->inAddrErrors.value.ui32 =		ipmib.ipIfStatsInAddrErrors;
   14074 	ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
   14075 	ipkp->inUnknownProtos.value.ui32 =	ipmib.ipIfStatsInUnknownProtos;
   14076 	ipkp->inDiscards.value.ui32 =		ipmib.ipIfStatsInDiscards;
   14077 	ipkp->inDelivers.value.ui64 =		ipmib.ipIfStatsHCInDelivers;
   14078 	ipkp->outRequests.value.ui64 =		ipmib.ipIfStatsHCOutRequests;
   14079 	ipkp->outDiscards.value.ui32 =		ipmib.ipIfStatsOutDiscards;
   14080 	ipkp->outNoRoutes.value.ui32 =		ipmib.ipIfStatsOutNoRoutes;
   14081 	ipkp->reasmTimeout.value.ui32 =		ipst->ips_ip_reassembly_timeout;
   14082 	ipkp->reasmReqds.value.ui32 =		ipmib.ipIfStatsReasmReqds;
   14083 	ipkp->reasmOKs.value.ui32 =		ipmib.ipIfStatsReasmOKs;
   14084 	ipkp->reasmFails.value.ui32 =		ipmib.ipIfStatsReasmFails;
   14085 	ipkp->fragOKs.value.ui32 =		ipmib.ipIfStatsOutFragOKs;
   14086 	ipkp->fragFails.value.ui32 =		ipmib.ipIfStatsOutFragFails;
   14087 	ipkp->fragCreates.value.ui32 =		ipmib.ipIfStatsOutFragCreates;
   14088 
   14089 	ipkp->routingDiscards.value.ui32 =	0;
   14090 	ipkp->inErrs.value.ui32 =		ipmib.tcpIfStatsInErrs;
   14091 	ipkp->noPorts.value.ui32 =		ipmib.udpIfStatsNoPorts;
   14092 	ipkp->inCksumErrs.value.ui32 =		ipmib.ipIfStatsInCksumErrs;
   14093 	ipkp->reasmDuplicates.value.ui32 =	ipmib.ipIfStatsReasmDuplicates;
   14094 	ipkp->reasmPartDups.value.ui32 =	ipmib.ipIfStatsReasmPartDups;
   14095 	ipkp->forwProhibits.value.ui32 =	ipmib.ipIfStatsForwProhibits;
   14096 	ipkp->udpInCksumErrs.value.ui32 =	ipmib.udpIfStatsInCksumErrs;
   14097 	ipkp->udpInOverflows.value.ui32 =	ipmib.udpIfStatsInOverflows;
   14098 	ipkp->rawipInOverflows.value.ui32 =	ipmib.rawipIfStatsInOverflows;
   14099 	ipkp->ipsecInSucceeded.value.ui32 =	ipmib.ipsecIfStatsInSucceeded;
   14100 	ipkp->ipsecInFailed.value.i32 =		ipmib.ipsecIfStatsInFailed;
   14101 
   14102 	ipkp->inIPv6.value.ui32 =	ipmib.ipIfStatsInWrongIPVersion;
   14103 	ipkp->outIPv6.value.ui32 =	ipmib.ipIfStatsOutWrongIPVersion;
   14104 	ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;
   14105 
   14106 	netstack_rele(ns);
   14107 
   14108 	return (0);
   14109 }
   14110 
   14111 static void *
   14112 icmp_kstat_init(netstackid_t stackid)
   14113 {
   14114 	kstat_t	*ksp;
   14115 
   14116 	icmp_named_kstat_t template = {
   14117 		{ "inMsgs",		KSTAT_DATA_UINT32 },
   14118 		{ "inErrors",		KSTAT_DATA_UINT32 },
   14119 		{ "inDestUnreachs",	KSTAT_DATA_UINT32 },
   14120 		{ "inTimeExcds",	KSTAT_DATA_UINT32 },
   14121 		{ "inParmProbs",	KSTAT_DATA_UINT32 },
   14122 		{ "inSrcQuenchs",	KSTAT_DATA_UINT32 },
   14123 		{ "inRedirects",	KSTAT_DATA_UINT32 },
   14124 		{ "inEchos",		KSTAT_DATA_UINT32 },
   14125 		{ "inEchoReps",		KSTAT_DATA_UINT32 },
   14126 		{ "inTimestamps",	KSTAT_DATA_UINT32 },
   14127 		{ "inTimestampReps",	KSTAT_DATA_UINT32 },
   14128 		{ "inAddrMasks",	KSTAT_DATA_UINT32 },
   14129 		{ "inAddrMaskReps",	KSTAT_DATA_UINT32 },
   14130 		{ "outMsgs",		KSTAT_DATA_UINT32 },
   14131 		{ "outErrors",		KSTAT_DATA_UINT32 },
   14132 		{ "outDestUnreachs",	KSTAT_DATA_UINT32 },
   14133 		{ "outTimeExcds",	KSTAT_DATA_UINT32 },
   14134 		{ "outParmProbs",	KSTAT_DATA_UINT32 },
   14135 		{ "outSrcQuenchs",	KSTAT_DATA_UINT32 },
   14136 		{ "outRedirects",	KSTAT_DATA_UINT32 },
   14137 		{ "outEchos",		KSTAT_DATA_UINT32 },
   14138 		{ "outEchoReps",	KSTAT_DATA_UINT32 },
   14139 		{ "outTimestamps",	KSTAT_DATA_UINT32 },
   14140 		{ "outTimestampReps",	KSTAT_DATA_UINT32 },
   14141 		{ "outAddrMasks",	KSTAT_DATA_UINT32 },
   14142 		{ "outAddrMaskReps",	KSTAT_DATA_UINT32 },
   14143 		{ "inChksumErrs",	KSTAT_DATA_UINT32 },
   14144 		{ "inUnknowns",		KSTAT_DATA_UINT32 },
   14145 		{ "inFragNeeded",	KSTAT_DATA_UINT32 },
   14146 		{ "outFragNeeded",	KSTAT_DATA_UINT32 },
   14147 		{ "outDrops",		KSTAT_DATA_UINT32 },
   14148 		{ "inOverFlows",	KSTAT_DATA_UINT32 },
   14149 		{ "inBadRedirects",	KSTAT_DATA_UINT32 },
   14150 	};
   14151 
   14152 	ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
   14153 	    NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
   14154 	if (ksp == NULL || ksp->ks_data == NULL)
   14155 		return (NULL);
   14156 
   14157 	bcopy(&template, ksp->ks_data, sizeof (template));
   14158 
   14159 	ksp->ks_update = icmp_kstat_update;
   14160 	ksp->ks_private = (void *)(uintptr_t)stackid;
   14161 
   14162 	kstat_install(ksp);
   14163 	return (ksp);
   14164 }
   14165 
   14166 static void
   14167 icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
   14168 {
   14169 	if (ksp != NULL) {
   14170 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
   14171 		kstat_delete_netstack(ksp, stackid);
   14172 	}
   14173 }
   14174 
   14175 static int
   14176 icmp_kstat_update(kstat_t *kp, int rw)
   14177 {
   14178 	icmp_named_kstat_t *icmpkp;
   14179 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
   14180 	netstack_t	*ns;
   14181 	ip_stack_t	*ipst;
   14182 
   14183 	if ((kp == NULL) || (kp->ks_data == NULL))
   14184 		return (EIO);
   14185 
   14186 	if (rw == KSTAT_WRITE)
   14187 		return (EACCES);
   14188 
   14189 	ns = netstack_find_by_stackid(stackid);
   14190 	if (ns == NULL)
   14191 		return (-1);
   14192 	ipst = ns->netstack_ip;
   14193 	if (ipst == NULL) {
   14194 		netstack_rele(ns);
   14195 		return (-1);
   14196 	}
   14197 	icmpkp = (icmp_named_kstat_t *)kp->ks_data;
   14198 
   14199 	icmpkp->inMsgs.value.ui32 =	    ipst->ips_icmp_mib.icmpInMsgs;
   14200 	icmpkp->inErrors.value.ui32 =	    ipst->ips_icmp_mib.icmpInErrors;
   14201 	icmpkp->inDestUnreachs.value.ui32 =
   14202 	    ipst->ips_icmp_mib.icmpInDestUnreachs;
   14203 	icmpkp->inTimeExcds.value.ui32 =    ipst->ips_icmp_mib.icmpInTimeExcds;
   14204 	icmpkp->inParmProbs.value.ui32 =    ipst->ips_icmp_mib.icmpInParmProbs;
   14205 	icmpkp->inSrcQuenchs.value.ui32 =   ipst->ips_icmp_mib.icmpInSrcQuenchs;
   14206 	icmpkp->inRedirects.value.ui32 =    ipst->ips_icmp_mib.icmpInRedirects;
   14207 	icmpkp->inEchos.value.ui32 =	    ipst->ips_icmp_mib.icmpInEchos;
   14208 	icmpkp->inEchoReps.value.ui32 =	    ipst->ips_icmp_mib.icmpInEchoReps;
   14209 	icmpkp->inTimestamps.value.ui32 =   ipst->ips_icmp_mib.icmpInTimestamps;
   14210 	icmpkp->inTimestampReps.value.ui32 =
   14211 	    ipst->ips_icmp_mib.icmpInTimestampReps;
   14212 	icmpkp->inAddrMasks.value.ui32 =    ipst->ips_icmp_mib.icmpInAddrMasks;
   14213 	icmpkp->inAddrMaskReps.value.ui32 =
   14214 	    ipst->ips_icmp_mib.icmpInAddrMaskReps;
   14215 	icmpkp->outMsgs.value.ui32 =	    ipst->ips_icmp_mib.icmpOutMsgs;
   14216 	icmpkp->outErrors.value.ui32 =	    ipst->ips_icmp_mib.icmpOutErrors;
   14217 	icmpkp->outDestUnreachs.value.ui32 =
   14218 	    ipst->ips_icmp_mib.icmpOutDestUnreachs;
   14219 	icmpkp->outTimeExcds.value.ui32 =   ipst->ips_icmp_mib.icmpOutTimeExcds;
   14220 	icmpkp->outParmProbs.value.ui32 =   ipst->ips_icmp_mib.icmpOutParmProbs;
   14221 	icmpkp->outSrcQuenchs.value.ui32 =
   14222 	    ipst->ips_icmp_mib.icmpOutSrcQuenchs;
   14223 	icmpkp->outRedirects.value.ui32 =   ipst->ips_icmp_mib.icmpOutRedirects;
   14224 	icmpkp->outEchos.value.ui32 =	    ipst->ips_icmp_mib.icmpOutEchos;
   14225 	icmpkp->outEchoReps.value.ui32 =    ipst->ips_icmp_mib.icmpOutEchoReps;
   14226 	icmpkp->outTimestamps.value.ui32 =
   14227 	    ipst->ips_icmp_mib.icmpOutTimestamps;
   14228 	icmpkp->outTimestampReps.value.ui32 =
   14229 	    ipst->ips_icmp_mib.icmpOutTimestampReps;
   14230 	icmpkp->outAddrMasks.value.ui32 =
   14231 	    ipst->ips_icmp_mib.icmpOutAddrMasks;
   14232 	icmpkp->outAddrMaskReps.value.ui32 =
   14233 	    ipst->ips_icmp_mib.icmpOutAddrMaskReps;
   14234 	icmpkp->inCksumErrs.value.ui32 =    ipst->ips_icmp_mib.icmpInCksumErrs;
   14235 	icmpkp->inUnknowns.value.ui32 =	    ipst->ips_icmp_mib.icmpInUnknowns;
   14236 	icmpkp->inFragNeeded.value.ui32 =   ipst->ips_icmp_mib.icmpInFragNeeded;
   14237 	icmpkp->outFragNeeded.value.ui32 =
   14238 	    ipst->ips_icmp_mib.icmpOutFragNeeded;
   14239 	icmpkp->outDrops.value.ui32 =	    ipst->ips_icmp_mib.icmpOutDrops;
   14240 	icmpkp->inOverflows.value.ui32 =    ipst->ips_icmp_mib.icmpInOverflows;
   14241 	icmpkp->inBadRedirects.value.ui32 =
   14242 	    ipst->ips_icmp_mib.icmpInBadRedirects;
   14243 
   14244 	netstack_rele(ns);
   14245 	return (0);
   14246 }
   14247 
   14248 /*
   14249  * This is the fanout function for raw socket opened for SCTP.  Note
   14250  * that it is called after SCTP checks that there is no socket which
   14251  * wants a packet.  Then before SCTP handles this out of the blue packet,
   14252  * this function is called to see if there is any raw socket for SCTP.
   14253  * If there is and it is bound to the correct address, the packet will
   14254  * be sent to that socket.  Note that only one raw socket can be bound to
   14255  * a port.  This is assured in ipcl_sctp_hash_insert();
   14256  */
   14257 void
   14258 ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
   14259     ip_recv_attr_t *ira)
   14260 {
   14261 	conn_t		*connp;
   14262 	queue_t		*rq;
   14263 	boolean_t	secure;
   14264 	ill_t		*ill = ira->ira_ill;
   14265 	ip_stack_t	*ipst = ill->ill_ipst;
   14266 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   14267 	sctp_stack_t	*sctps = ipst->ips_netstack->netstack_sctp;
   14268 	iaflags_t	iraflags = ira->ira_flags;
   14269 	ill_t		*rill = ira->ira_rill;
   14270 
   14271 	secure = iraflags & IRAF_IPSEC_SECURE;
   14272 
   14273 	connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
   14274 	    ira, ipst);
   14275 	if (connp == NULL) {
   14276 		/*
   14277 		 * Although raw sctp is not summed, OOB chunks must be.
   14278 		 * Drop the packet here if the sctp checksum failed.
   14279 		 */
   14280 		if (iraflags & IRAF_SCTP_CSUM_ERR) {
   14281 			SCTPS_BUMP_MIB(sctps, sctpChecksumError);
   14282 			freemsg(mp);
   14283 			return;
   14284 		}
   14285 		ira->ira_ill = ira->ira_rill = NULL;
   14286 		sctp_ootb_input(mp, ira, ipst);
   14287 		ira->ira_ill = ill;
   14288 		ira->ira_rill = rill;
   14289 		return;
   14290 	}
   14291 	rq = connp->conn_rq;
   14292 	if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
   14293 		CONN_DEC_REF(connp);
   14294 		BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
   14295 		freemsg(mp);
   14296 		return;
   14297 	}
   14298 	if (((iraflags & IRAF_IS_IPV4) ?
   14299 	    CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
   14300 	    CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
   14301 	    secure) {
   14302 		mp = ipsec_check_inbound_policy(mp, connp, ipha,
   14303 		    ip6h, ira);
   14304 		if (mp == NULL) {
   14305 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   14306 			/* Note that mp is NULL */
   14307 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   14308 			CONN_DEC_REF(connp);
   14309 			return;
   14310 		}
   14311 	}
   14312 
   14313 	if (iraflags & IRAF_ICMP_ERROR) {
   14314 		(connp->conn_recvicmp)(connp, mp, NULL, ira);
   14315 	} else {
   14316 		ill_t *rill = ira->ira_rill;
   14317 
   14318 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
   14319 		/* This is the SOCK_RAW, IPPROTO_SCTP case. */
   14320 		ira->ira_ill = ira->ira_rill = NULL;
   14321 		(connp->conn_recv)(connp, mp, NULL, ira);
   14322 		ira->ira_ill = ill;
   14323 		ira->ira_rill = rill;
   14324 	}
   14325 	CONN_DEC_REF(connp);
   14326 }
   14327 
   14328 /*
   14329  * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
   14330  * header before the ip payload.
   14331  */
   14332 static void
   14333 ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
   14334 {
   14335 	int len = (mp->b_wptr - mp->b_rptr);
   14336 	mblk_t *ip_mp;
   14337 
   14338 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   14339 	if (is_fp_mp || len != fp_mp_len) {
   14340 		if (len > fp_mp_len) {
   14341 			/*
   14342 			 * fastpath header and ip header in the first mblk
   14343 			 */
   14344 			mp->b_rptr += fp_mp_len;
   14345 		} else {
   14346 			/*
   14347 			 * ip_xmit_attach_llhdr had to prepend an mblk to
   14348 			 * attach the fastpath header before ip header.
   14349 			 */
   14350 			ip_mp = mp->b_cont;
   14351 			freeb(mp);
   14352 			mp = ip_mp;
   14353 			mp->b_rptr += (fp_mp_len - len);
   14354 		}
   14355 	} else {
   14356 		ip_mp = mp->b_cont;
   14357 		freeb(mp);
   14358 		mp = ip_mp;
   14359 	}
   14360 	ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
   14361 	freemsg(mp);
   14362 }
   14363 
/*
 * Normal post fragmentation function.
 *
 * Send a packet using the passed in nce. This handles both IPv4 and IPv6
 * using the same state machine.
 *
 * We return an error on failure. In particular we return EWOULDBLOCK
 * when the driver flow controls. In that case this ensures that ip_wsrv runs
 * (currently by canputnext failure resulting in backenabling from GLD.)
 * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
 * indication that they can flow control until ip_wsrv() tells them to restart.
 *
 * If the nce passed by caller is incomplete, this function
 * queues the packet and if necessary, sends ARP request and bails.
 * If the Neighbor Cache passed is fully resolved, we simply prepend
 * the link-layer header to the packet, do ipsec hw acceleration
 * work if necessary, and send the packet out on the wire.
 *
 * Arguments:
 *	mp		M_DATA message holding the IP packet (asserted).
 *	nce		neighbor cache entry; supplies the ill and the ncec.
 *	ixaflags	transmit flags.  The ones consulted here are
 *			IXAF_NO_TRACE, IXAF_IS_IPV4, IXAF_NO_PFHOOK,
 *			IXAF_NO_DEV_FLOW_CTL and IXAF_REACH_CONF.
 *	pkt_len		must equal msgdsize(mp) on entry (asserted); it is
 *			recomputed if a firewall hook ran.
 *	xmit_hint	passed through to the DLD direct transmit function.
 *	szone		IP zoneid of the sender; mapped with IP_REAL_ZONEID()
 *			before being handed to ipobs_hook().
 *	nolzid		not used by this function (see ARGSUSED6).
 *	ixacookie	if non-NULL, set to the cookie returned by the DLD
 *			direct transmit function, or to 0 when canputnext()
 *			failed, whenever EWOULDBLOCK is returned.
 *
 * Returns:
 *	0		packet was sent, queued on the ncec awaiting
 *			resolution, or dropped because the ncec is
 *			ND_UNREACHABLE.
 *	EWOULDBLOCK	the driver flow controlled.
 *	ENOBUFS		ip_xmit_attach_llhdr() failed.
 *	ENETUNREACH	the ncec was in an unexpected state.
 *	other		the error set by the firewall hooks when they consumed
 *			the packet.
 */
/* ARGSUSED6 */
int
ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
    uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
{
	queue_t		*wq;
	ill_t		*ill = nce->nce_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint64_t	delta;
	boolean_t	isv6 = ill->ill_isv6;
	boolean_t	fp_mp;
	ncec_t		*ncec = nce->nce_common;
	/* Sampled once on entry; used for all NUD time arithmetic below. */
	int64_t		now = LBOLT_FASTPATH64;
	boolean_t	is_probe;

	DTRACE_PROBE1(ip__xmit, nce_t *, nce);

	ASSERT(mp != NULL);
	ASSERT(mp->b_datap->db_type == M_DATA);
	ASSERT(pkt_len == msgdsize(mp));

	/*
	 * If we have already been here and are coming back after ARP/ND.
	 * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
	 * in that case since they have seen the packet when it came here
	 * the first time.
	 */
	if (ixaflags & IXAF_NO_TRACE)
		goto sendit;

	if (ixaflags & IXAF_IS_IPV4) {
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		ASSERT(!isv6);
		ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
		if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
		    !(ixaflags & IXAF_NO_PFHOOK)) {
			int	error;

			FW_HOOKS(ipst->ips_ip4_physical_out_event,
			    ipst->ips_ipv4firewall_physical_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__physical__out__end,
			    mblk_t *, mp);
			/* A NULL mp means the hooks consumed the packet. */
			if (mp == NULL)
				return (error);

			/* The length could have changed */
			pkt_len = msgdsize(mp);
		}
		if (ipst->ips_ip4_observe.he_interested) {
			/*
			 * Note that for TX the zoneid is the sending
			 * zone, whether or not MLP is in play.
			 * Since the szone argument is the IP zoneid (i.e.,
			 * zero for exclusive-IP zones) and ipobs wants
			 * the system zoneid, we map it here.
			 */
			szone = IP_REAL_ZONEID(szone, ipst);

			/*
			 * On the outbound path the destination zone will be
			 * unknown as we're sending this packet out on the
			 * wire.
			 */
			ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
			    ill, ipst);
		}
		/*
		 * NOTE(review): ipha was read from mp->b_rptr before the
		 * hooks ran; if the hooks can replace mp this pointer may be
		 * stale -- confirm against FW_HOOKS.
		 */
		DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
		    void_ip_t *, ipha,  __dtrace_ipsr_ill_t *, ill,
		    ipha_t *, ipha, ip6_t *, NULL, int, 0);
	} else {
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		ASSERT(isv6);
		ASSERT(pkt_len ==
		    ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
		if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
		    !(ixaflags & IXAF_NO_PFHOOK)) {
			int	error;

			FW_HOOKS6(ipst->ips_ip6_physical_out_event,
			    ipst->ips_ipv6firewall_physical_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__physical__out__end,
			    mblk_t *, mp);
			/* A NULL mp means the hooks consumed the packet. */
			if (mp == NULL)
				return (error);

			/* The length could have changed */
			pkt_len = msgdsize(mp);
		}
		if (ipst->ips_ip6_observe.he_interested) {
			/* See above */
			szone = IP_REAL_ZONEID(szone, ipst);

			ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
			    ill, ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
		    void_ip_t *, ip6h,  __dtrace_ipsr_ill_t *, ill,
		    ipha_t *, NULL, ip6_t *, ip6h, int, 0);
	}

sendit:
	/*
	 * We check the state without a lock because the state can never
	 * move "backwards" to initial or incomplete.
	 */
	switch (ncec->ncec_state) {
	case ND_REACHABLE:
	case ND_STALE:
	case ND_DELAY:
	case ND_PROBE:
		mp = ip_xmit_attach_llhdr(mp, nce);
		if (mp == NULL) {
			/*
			 * ip_xmit_attach_llhdr has increased
			 * ipIfStatsOutDiscards and called ip_drop_output()
			 */
			return (ENOBUFS);
		}
		/*
		 * check if nce_fastpath completed and we tagged on a
		 * copy of nce_fp_mp in ip_xmit_attach_llhdr().
		 */
		fp_mp = (mp->b_datap->db_type == M_DATA);

		if (fp_mp &&
		    (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
			ill_dld_direct_t *idd;

			idd = &ill->ill_dld_capab->idc_direct;
			/*
			 * Send the packet directly to DLD, where it
			 * may be queued depending on the availability
			 * of transmit resources at the media layer.
			 * Return value should be taken into
			 * account and flow control the TCP.
			 */
			/*
			 * The transmit counters are bumped before the call,
			 * so they are also counted when EWOULDBLOCK is
			 * returned below.
			 */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
			UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
			    pkt_len);

			if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
				/* Caller opted out: ignore the cookie. */
				(void) idd->idd_tx_df(idd->idd_tx_dh, mp,
				    (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
			} else {
				uintptr_t cookie;

				/* A non-zero cookie signals flow control. */
				if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
				    mp, (uintptr_t)xmit_hint, 0)) != 0) {
					if (ixacookie != NULL)
						*ixacookie = cookie;
					return (EWOULDBLOCK);
				}
			}
		} else {
			wq = ill->ill_wq;

			if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
			    !canputnext(wq)) {
				/*
				 * STREAMS path is flow controlled: there is
				 * no cookie, and the packet is dropped here.
				 */
				if (ixacookie != NULL)
					*ixacookie = 0;
				ip_xmit_flowctl_drop(ill, mp, fp_mp,
				    nce->nce_fp_mp != NULL ?
				    MBLKL(nce->nce_fp_mp) : 0);
				return (EWOULDBLOCK);
			}
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
			UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
			    pkt_len);
			putnext(wq, mp);
		}

		/*
		 * The rest of this function implements Neighbor Unreachability
		 * detection. Determine if the ncec is eligible for NUD.
		 */
		if (ncec->ncec_flags & NCE_F_NONUD)
			return (0);

		ASSERT(ncec->ncec_state != ND_INCOMPLETE);

		/*
		 * Check for upper layer advice
		 */
		if (ixaflags & IXAF_REACH_CONF) {
			timeout_id_t tid;

			/*
			 * It should be o.k. to check the state without
			 * a lock here, at most we lose an advice.
			 */
			ncec->ncec_last = TICK_TO_MSEC(now);
			if (ncec->ncec_state != ND_REACHABLE) {
				/*
				 * Grab and clear the timeout id under the
				 * lock, but cancel it only after dropping
				 * the lock.
				 */
				mutex_enter(&ncec->ncec_lock);
				ncec->ncec_state = ND_REACHABLE;
				tid = ncec->ncec_timeout_id;
				ncec->ncec_timeout_id = 0;
				mutex_exit(&ncec->ncec_lock);
				(void) untimeout(tid);
				if (ip_debug > 2) {
					/* ip1dbg */
					/*
					 * NOTE(review): AF_INET6 is passed
					 * regardless of isv6 -- confirm this
					 * is what pr_addr_dbg() expects for
					 * an IPv4 ncec_addr.
					 */
					pr_addr_dbg("ip_xmit: state"
					    " for %s changed to"
					    " REACHABLE\n", AF_INET6,
					    &ncec->ncec_addr);
				}
			}
			return (0);
		}

		/* Milliseconds since ncec_last was last updated. */
		delta =  TICK_TO_MSEC(now) - ncec->ncec_last;
		ip1dbg(("ip_xmit: delta = %" PRId64
		    " ill_reachable_time = %d \n", delta,
		    ill->ill_reachable_time));
		if (delta > (uint64_t)ill->ill_reachable_time) {
			/* Every arm of this switch drops ncec_lock itself. */
			mutex_enter(&ncec->ncec_lock);
			switch (ncec->ncec_state) {
			case ND_REACHABLE:
				ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
				/* FALLTHROUGH */
			case ND_STALE:
				/*
				 * ND_REACHABLE is identical to
				 * ND_STALE in this specific case. If
				 * reachable time has expired for this
				 * neighbor (delta is greater than
				 * reachable time), conceptually, the
				 * neighbor cache is no longer in
				 * REACHABLE state, but already in
				 * STALE state.  So the correct
				 * transition here is to ND_DELAY.
				 */
				ncec->ncec_state = ND_DELAY;
				mutex_exit(&ncec->ncec_lock);
				nce_restart_timer(ncec,
				    ipst->ips_delay_first_probe_time);
				if (ip_debug > 3) {
					/* ip2dbg */
					pr_addr_dbg("ip_xmit: state"
					    " for %s changed to"
					    " DELAY\n", AF_INET6,
					    &ncec->ncec_addr);
				}
				break;
			case ND_DELAY:
			case ND_PROBE:
				mutex_exit(&ncec->ncec_lock);
				/* Timers have already started */
				break;
			case ND_UNREACHABLE:
				/*
				 * nce_timer has detected that this ncec
				 * is unreachable and initiated deleting
				 * this ncec.
				 * This is a harmless race where we found the
				 * ncec before it was deleted and have
				 * just sent out a packet using this
				 * unreachable ncec.
				 */
				mutex_exit(&ncec->ncec_lock);
				break;
			default:
				ASSERT(0);
				mutex_exit(&ncec->ncec_lock);
			}
		}
		return (0);

	case ND_INCOMPLETE:
		/*
		 * the state could have changed since we didn't hold the lock.
		 * Re-verify state under lock.
		 */
		is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
		mutex_enter(&ncec->ncec_lock);
		if (NCE_ISREACHABLE(ncec)) {
			mutex_exit(&ncec->ncec_lock);
			goto sendit;
		}
		/* queue the packet */
		nce_queue_mp(ncec, mp, is_probe);
		mutex_exit(&ncec->ncec_lock);
		DTRACE_PROBE2(ip__xmit__incomplete,
		    (ncec_t *), ncec, (mblk_t *), mp);
		return (0);

	case ND_INITIAL:
		/*
		 * State could have changed since we didn't hold the lock, so
		 * re-verify state.
		 */
		is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
		mutex_enter(&ncec->ncec_lock);
		if (NCE_ISREACHABLE(ncec))  {
			mutex_exit(&ncec->ncec_lock);
			goto sendit;
		}
		nce_queue_mp(ncec, mp, is_probe);
		/*
		 * Only the thread that moves the entry out of ND_INITIAL
		 * starts resolution; ip_ndp_resolve() is called after the
		 * lock has been dropped.
		 */
		if (ncec->ncec_state == ND_INITIAL) {
			ncec->ncec_state = ND_INCOMPLETE;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * figure out the source we want to use
			 * and resolve it.
			 */
			ip_ndp_resolve(ncec);
		} else  {
			mutex_exit(&ncec->ncec_lock);
		}
		return (0);

	case ND_UNREACHABLE:
		/* The packet is dropped, but the caller still sees success. */
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
		    mp, ill);
		freemsg(mp);
		return (0);

	default:
		ASSERT(0);
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - ND_other",
		    mp, ill);
		freemsg(mp);
		return (ENETUNREACH);
	}
}
   14712 
   14713 /*
   14714  * Return B_TRUE if the buffers differ in length or content.
   14715  * This is used for comparing extension header buffers.
   14716  * Note that an extension header would be declared different
   14717  * even if all that changed was the next header value in that header i.e.
   14718  * what really changed is the next extension header.
   14719  */
   14720 boolean_t
   14721 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
   14722     uint_t blen)
   14723 {
   14724 	if (!b_valid)
   14725 		blen = 0;
   14726 
   14727 	if (alen != blen)
   14728 		return (B_TRUE);
   14729 	if (alen == 0)
   14730 		return (B_FALSE);	/* Both zero length */
   14731 	return (bcmp(abuf, bbuf, alen));
   14732 }
   14733 
   14734 /*
   14735  * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
   14736  * Return B_FALSE if memory allocation fails - don't change any state!
   14737  */
   14738 boolean_t
   14739 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
   14740     const void *src, uint_t srclen)
   14741 {
   14742 	void *dst;
   14743 
   14744 	if (!src_valid)
   14745 		srclen = 0;
   14746 
   14747 	ASSERT(*dstlenp == 0);
   14748 	if (src != NULL && srclen != 0) {
   14749 		dst = mi_alloc(srclen, BPRI_MED);
   14750 		if (dst == NULL)
   14751 			return (B_FALSE);
   14752 	} else {
   14753 		dst = NULL;
   14754 	}
   14755 	if (*dstp != NULL)
   14756 		mi_free(*dstp);
   14757 	*dstp = dst;
   14758 	*dstlenp = dst == NULL ? 0 : srclen;
   14759 	return (B_TRUE);
   14760 }
   14761 
   14762 /*
   14763  * Replace what is in *dst, *dstlen with the source.
   14764  * Assumes ip_allocbuf has already been called.
   14765  */
   14766 void
   14767 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
   14768     const void *src, uint_t srclen)
   14769 {
   14770 	if (!src_valid)
   14771 		srclen = 0;
   14772 
   14773 	ASSERT(*dstlenp == srclen);
   14774 	if (src != NULL && srclen != 0)
   14775 		bcopy(src, *dstp, srclen);
   14776 }
   14777 
   14778 /*
   14779  * Free the storage pointed to by the members of an ip_pkt_t.
   14780  */
   14781 void
   14782 ip_pkt_free(ip_pkt_t *ipp)
   14783 {
   14784 	uint_t	fields = ipp->ipp_fields;
   14785 
   14786 	if (fields & IPPF_HOPOPTS) {
   14787 		kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
   14788 		ipp->ipp_hopopts = NULL;
   14789 		ipp->ipp_hopoptslen = 0;
   14790 	}
   14791 	if (fields & IPPF_RTHDRDSTOPTS) {
   14792 		kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
   14793 		ipp->ipp_rthdrdstopts = NULL;
   14794 		ipp->ipp_rthdrdstoptslen = 0;
   14795 	}
   14796 	if (fields & IPPF_DSTOPTS) {
   14797 		kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
   14798 		ipp->ipp_dstopts = NULL;
   14799 		ipp->ipp_dstoptslen = 0;
   14800 	}
   14801 	if (fields & IPPF_RTHDR) {
   14802 		kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
   14803 		ipp->ipp_rthdr = NULL;
   14804 		ipp->ipp_rthdrlen = 0;
   14805 	}
   14806 	if (fields & IPPF_IPV4_OPTIONS) {
   14807 		kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
   14808 		ipp->ipp_ipv4_options = NULL;
   14809 		ipp->ipp_ipv4_options_len = 0;
   14810 	}
   14811 	if (fields & IPPF_LABEL_V4) {
   14812 		kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
   14813 		ipp->ipp_label_v4 = NULL;
   14814 		ipp->ipp_label_len_v4 = 0;
   14815 	}
   14816 	if (fields & IPPF_LABEL_V6) {
   14817 		kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
   14818 		ipp->ipp_label_v6 = NULL;
   14819 		ipp->ipp_label_len_v6 = 0;
   14820 	}
   14821 	ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
   14822 	    IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
   14823 }
   14824 
   14825 /*
   14826  * Copy from src to dst and allocate as needed.
   14827  * Returns zero or ENOMEM.
   14828  *
   14829  * The caller must initialize dst to zero.
   14830  */
   14831 int
   14832 ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
   14833 {
   14834 	uint_t	fields = src->ipp_fields;
   14835 
   14836 	/* Start with fields that don't require memory allocation */
   14837 	dst->ipp_fields = fields &
   14838 	    ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
   14839 	    IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
   14840 
   14841 	dst->ipp_addr = src->ipp_addr;
   14842 	dst->ipp_unicast_hops = src->ipp_unicast_hops;
   14843 	dst->ipp_hoplimit = src->ipp_hoplimit;
   14844 	dst->ipp_tclass = src->ipp_tclass;
   14845 	dst->ipp_type_of_service = src->ipp_type_of_service;
   14846 
   14847 	if (!(fields & (IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
   14848 	    IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6)))
   14849 		return (0);
   14850 
   14851 	if (fields & IPPF_HOPOPTS) {
   14852 		dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
   14853 		if (dst->ipp_hopopts == NULL) {
   14854 			ip_pkt_free(dst);
   14855 			return (ENOMEM);
   14856 		}
   14857 		dst->ipp_fields |= IPPF_HOPOPTS;
   14858 		bcopy(src->ipp_hopopts, dst->ipp_hopopts,
   14859 		    src->ipp_hopoptslen);
   14860 		dst->ipp_hopoptslen = src->ipp_hopoptslen;
   14861 	}
   14862 	if (fields & IPPF_RTHDRDSTOPTS) {
   14863 		dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
   14864 		    kmflag);
   14865 		if (dst->ipp_rthdrdstopts == NULL) {
   14866 			ip_pkt_free(dst);
   14867 			return (ENOMEM);
   14868 		}
   14869 		dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
   14870 		bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
   14871 		    src->ipp_rthdrdstoptslen);
   14872 		dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
   14873 	}
   14874 	if (fields & IPPF_DSTOPTS) {
   14875 		dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
   14876 		if (dst->ipp_dstopts == NULL) {
   14877 			ip_pkt_free(dst);
   14878 			return (ENOMEM);
   14879 		}
   14880 		dst->ipp_fields |= IPPF_DSTOPTS;
   14881 		bcopy(src->ipp_dstopts, dst->ipp_dstopts,
   14882 		    src->ipp_dstoptslen);
   14883 		dst->ipp_dstoptslen = src->ipp_dstoptslen;
   14884 	}
   14885 	if (fields & IPPF_RTHDR) {
   14886 		dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
   14887 		if (dst->ipp_rthdr == NULL) {
   14888 			ip_pkt_free(dst);
   14889 			return (ENOMEM);
   14890 		}
   14891 		dst->ipp_fields |= IPPF_RTHDR;
   14892 		bcopy(src->ipp_rthdr, dst->ipp_rthdr,
   14893 		    src->ipp_rthdrlen);
   14894 		dst->ipp_rthdrlen = src->ipp_rthdrlen;
   14895 	}
   14896 	if (fields & IPPF_IPV4_OPTIONS) {
   14897 		dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
   14898 		    kmflag);
   14899 		if (dst->ipp_ipv4_options == NULL) {
   14900 			ip_pkt_free(dst);
   14901 			return (ENOMEM);
   14902 		}
   14903 		dst->ipp_fields |= IPPF_IPV4_OPTIONS;
   14904 		bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
   14905 		    src->ipp_ipv4_options_len);
   14906 		dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
   14907 	}
   14908 	if (fields & IPPF_LABEL_V4) {
   14909 		dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
   14910 		if (dst->ipp_label_v4 == NULL) {
   14911 			ip_pkt_free(dst);
   14912 			return (ENOMEM);
   14913 		}
   14914 		dst->ipp_fields |= IPPF_LABEL_V4;
   14915 		bcopy(src->ipp_label_v4, dst->ipp_label_v4,
   14916 		    src->ipp_label_len_v4);
   14917 		dst->ipp_label_len_v4 = src->ipp_label_len_v4;
   14918 	}
   14919 	if (fields & IPPF_LABEL_V6) {
   14920 		dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
   14921 		if (dst->ipp_label_v6 == NULL) {
   14922 			ip_pkt_free(dst);
   14923 			return (ENOMEM);
   14924 		}
   14925 		dst->ipp_fields |= IPPF_LABEL_V6;
   14926 		bcopy(src->ipp_label_v6, dst->ipp_label_v6,
   14927 		    src->ipp_label_len_v6);
   14928 		dst->ipp_label_len_v6 = src->ipp_label_len_v6;
   14929 	}
   14930 	if (fields & IPPF_FRAGHDR) {
   14931 		dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
   14932 		if (dst->ipp_fraghdr == NULL) {
   14933 			ip_pkt_free(dst);
   14934 			return (ENOMEM);
   14935 		}
   14936 		dst->ipp_fields |= IPPF_FRAGHDR;
   14937 		bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
   14938 		    src->ipp_fraghdrlen);
   14939 		dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
   14940 	}
   14941 	return (0);
   14942 }
   14943 
   14944 /*
   14945  * Returns INADDR_ANY if no source route
   14946  */
   14947 ipaddr_t
   14948 ip_pkt_source_route_v4(const ip_pkt_t *ipp)
   14949 {
   14950 	ipaddr_t	nexthop = INADDR_ANY;
   14951 	ipoptp_t	opts;
   14952 	uchar_t		*opt;
   14953 	uint8_t		optval;
   14954 	uint8_t		optlen;
   14955 	uint32_t	totallen;
   14956 
   14957 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   14958 		return (INADDR_ANY);
   14959 
   14960 	totallen = ipp->ipp_ipv4_options_len;
   14961 	if (totallen & 0x3)
   14962 		return (INADDR_ANY);
   14963 
   14964 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   14965 	    optval != IPOPT_EOL;
   14966 	    optval = ipoptp_next(&opts)) {
   14967 		opt = opts.ipoptp_cur;
   14968 		switch (optval) {
   14969 			uint8_t off;
   14970 		case IPOPT_SSRR:
   14971 		case IPOPT_LSRR:
   14972 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   14973 				break;
   14974 			}
   14975 			optlen = opts.ipoptp_len;
   14976 			off = opt[IPOPT_OFFSET];
   14977 			off--;
   14978 			if (optlen < IP_ADDR_LEN ||
   14979 			    off > optlen - IP_ADDR_LEN) {
   14980 				/* End of source route */
   14981 				break;
   14982 			}
   14983 			bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
   14984 			if (nexthop == htonl(INADDR_LOOPBACK)) {
   14985 				/* Ignore */
   14986 				nexthop = INADDR_ANY;
   14987 				break;
   14988 			}
   14989 			break;
   14990 		}
   14991 	}
   14992 	return (nexthop);
   14993 }
   14994 
   14995 /*
   14996  * Reverse a source route.
   14997  */
   14998 void
   14999 ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
   15000 {
   15001 	ipaddr_t	tmp;
   15002 	ipoptp_t	opts;
   15003 	uchar_t		*opt;
   15004 	uint8_t		optval;
   15005 	uint32_t	totallen;
   15006 
   15007 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   15008 		return;
   15009 
   15010 	totallen = ipp->ipp_ipv4_options_len;
   15011 	if (totallen & 0x3)
   15012 		return;
   15013 
   15014 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   15015 	    optval != IPOPT_EOL;
   15016 	    optval = ipoptp_next(&opts)) {
   15017 		uint8_t off1, off2;
   15018 
   15019 		opt = opts.ipoptp_cur;
   15020 		switch (optval) {
   15021 		case IPOPT_SSRR:
   15022 		case IPOPT_LSRR:
   15023 			if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   15024 				break;
   15025 			}
   15026 			off1 = IPOPT_MINOFF_SR - 1;
   15027 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   15028 			while (off2 > off1) {
   15029 				bcopy(opt + off2, &tmp, IP_ADDR_LEN);
   15030 				bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
   15031 				bcopy(&tmp, opt + off2, IP_ADDR_LEN);
   15032 				off2 -= IP_ADDR_LEN;
   15033 				off1 += IP_ADDR_LEN;
   15034 			}
   15035 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   15036 			break;
   15037 		}
   15038 	}
   15039 }
   15040 
   15041 /*
   15042  * Returns NULL if no routing header
   15043  */
   15044 in6_addr_t *
   15045 ip_pkt_source_route_v6(const ip_pkt_t *ipp)
   15046 {
   15047 	in6_addr_t	*nexthop = NULL;
   15048 	ip6_rthdr0_t	*rthdr;
   15049 
   15050 	if (!(ipp->ipp_fields & IPPF_RTHDR))
   15051 		return (NULL);
   15052 
   15053 	rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
   15054 	if (rthdr->ip6r0_segleft == 0)
   15055 		return (NULL);
   15056 
   15057 	nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
   15058 	return (nexthop);
   15059 }
   15060 
   15061 zoneid_t
   15062 ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
   15063     zoneid_t lookup_zoneid)
   15064 {
   15065 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   15066 	ire_t		*ire;
   15067 	int		ire_flags = MATCH_IRE_TYPE;
   15068 	zoneid_t	zoneid = ALL_ZONES;
   15069 
   15070 	if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
   15071 		return (ALL_ZONES);
   15072 
   15073 	if (lookup_zoneid != ALL_ZONES)
   15074 		ire_flags |= MATCH_IRE_ZONEONLY;
   15075 	ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
   15076 	    NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
   15077 	if (ire != NULL) {
   15078 		zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
   15079 		ire_refrele(ire);
   15080 	}
   15081 	return (zoneid);
   15082 }
   15083 
   15084 zoneid_t
   15085 ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
   15086     ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
   15087 {
   15088 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   15089 	ire_t		*ire;
   15090 	int		ire_flags = MATCH_IRE_TYPE;
   15091 	zoneid_t	zoneid = ALL_ZONES;
   15092 
   15093 	if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
   15094 		return (ALL_ZONES);
   15095 
   15096 	if (IN6_IS_ADDR_LINKLOCAL(addr))
   15097 		ire_flags |= MATCH_IRE_ILL;
   15098 
   15099 	if (lookup_zoneid != ALL_ZONES)
   15100 		ire_flags |= MATCH_IRE_ZONEONLY;
   15101 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
   15102 	    ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
   15103 	if (ire != NULL) {
   15104 		zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
   15105 		ire_refrele(ire);
   15106 	}
   15107 	return (zoneid);
   15108 }
   15109 
   15110 /*
   15111  * IP obserability hook support functions.
   15112  */
   15113 static void
   15114 ipobs_init(ip_stack_t *ipst)
   15115 {
   15116 	netid_t id;
   15117 
   15118 	id = net_getnetidbynetstackid(ipst->ips_netstack->netstack_stackid);
   15119 
   15120 	ipst->ips_ip4_observe_pr = net_protocol_lookup(id, NHF_INET);
   15121 	VERIFY(ipst->ips_ip4_observe_pr != NULL);
   15122 
   15123 	ipst->ips_ip6_observe_pr = net_protocol_lookup(id, NHF_INET6);
   15124 	VERIFY(ipst->ips_ip6_observe_pr != NULL);
   15125 }
   15126 
/*
 * Release the IPv4 and IPv6 observability protocol handles stored in the
 * ip_stack_t.  Each net_protocol_release() call is required to return 0.
 */
static void
ipobs_fini(ip_stack_t *ipst)
{

	VERIFY(net_protocol_release(ipst->ips_ip4_observe_pr) == 0);
	VERIFY(net_protocol_release(ipst->ips_ip6_observe_pr) == 0);
}
   15134 
   15135 /*
   15136  * hook_pkt_observe_t is composed in network byte order so that the
   15137  * entire mblk_t chain handed into hook_run can be used as-is.
   15138  * The caveat is that use of the fields, such as the zone fields,
   15139  * requires conversion into host byte order first.
   15140  */
   15141 void
   15142 ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
   15143     const ill_t *ill, ip_stack_t *ipst)
   15144 {
   15145 	hook_pkt_observe_t *hdr;
   15146 	uint64_t grifindex;
   15147 	mblk_t *imp;
   15148 
   15149 	imp = allocb(sizeof (*hdr), BPRI_HI);
   15150 	if (imp == NULL)
   15151 		return;
   15152 
   15153 	hdr = (hook_pkt_observe_t *)imp->b_rptr;
   15154 	/*
   15155 	 * b_wptr is set to make the apparent size of the data in the mblk_t
   15156 	 * to exclude the pointers at the end of hook_pkt_observer_t.
   15157 	 */
   15158 	imp->b_wptr = imp->b_rptr + sizeof (dl_ipnetinfo_t);
   15159 	imp->b_cont = mp;
   15160 
   15161 	ASSERT(DB_TYPE(mp) == M_DATA);
   15162 
   15163 	if (IS_UNDER_IPMP(ill))
   15164 		grifindex = ipmp_ill_get_ipmp_ifindex(ill);
   15165 	else
   15166 		grifindex = 0;
   15167 
   15168 	hdr->hpo_version = 1;
   15169 	hdr->hpo_htype = htons(htype);
   15170 	hdr->hpo_pktlen = htonl((ulong_t)msgdsize(mp));
   15171 	hdr->hpo_ifindex = htonl(ill->ill_phyint->phyint_ifindex);
   15172 	hdr->hpo_grifindex = htonl(grifindex);
   15173 	hdr->hpo_zsrc = htonl(zsrc);
   15174 	hdr->hpo_zdst = htonl(zdst);
   15175 	hdr->hpo_pkt = imp;
   15176 	hdr->hpo_ctx = ipst->ips_netstack;
   15177 
   15178 	if (ill->ill_isv6) {
   15179 		hdr->hpo_family = AF_INET6;
   15180 		(void) hook_run(ipst->ips_ipv6_net_data->netd_hooks,
   15181 		    ipst->ips_ipv6observing, (hook_data_t)hdr);
   15182 	} else {
   15183 		hdr->hpo_family = AF_INET;
   15184 		(void) hook_run(ipst->ips_ipv4_net_data->netd_hooks,
   15185 		    ipst->ips_ipv4observing, (hook_data_t)hdr);
   15186 	}
   15187 
   15188 	imp->b_cont = NULL;
   15189 	freemsg(imp);
   15190 }
   15191 
   15192 /*
   15193  * Utility routine that checks if `v4srcp' is a valid address on underlying
   15194  * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
   15195  * associated with `v4srcp' on success.  NOTE: if this is not called from
   15196  * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
   15197  * group during or after this lookup.
   15198  */
   15199 boolean_t
   15200 ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
   15201 {
   15202 	ipif_t *ipif;
   15203 
   15204 	ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
   15205 	if (ipif != NULL) {
   15206 		if (ipifp != NULL)
   15207 			*ipifp = ipif;
   15208 		else
   15209 			ipif_refrele(ipif);
   15210 		return (B_TRUE);
   15211 	}
   15212 
   15213 	ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
   15214 	    *v4srcp));
   15215 	return (B_FALSE);
   15216 }
   15217 
   15218 /*
   15219  * Transport protocol call back function for CPU state change.
   15220  */
   15221 /* ARGSUSED */
   15222 static int
   15223 ip_tp_cpu_update(cpu_setup_t what, int id, void *arg)
   15224 {
   15225 	processorid_t cpu_seqid;
   15226 	netstack_handle_t nh;
   15227 	netstack_t *ns;
   15228 
   15229 	ASSERT(MUTEX_HELD(&cpu_lock));
   15230 
   15231 	switch (what) {
   15232 	case CPU_CONFIG:
   15233 	case CPU_ON:
   15234 	case CPU_INIT:
   15235 	case CPU_CPUPART_IN:
   15236 		cpu_seqid = cpu[id]->cpu_seqid;
   15237 		netstack_next_init(&nh);
   15238 		while ((ns = netstack_next(&nh)) != NULL) {
   15239 			tcp_stack_cpu_add(ns->netstack_tcp, cpu_seqid);
   15240 			sctp_stack_cpu_add(ns->netstack_sctp, cpu_seqid);
   15241 			udp_stack_cpu_add(ns->netstack_udp, cpu_seqid);
   15242 			netstack_rele(ns);
   15243 		}
   15244 		netstack_next_fini(&nh);
   15245 		break;
   15246 	case CPU_UNCONFIG:
   15247 	case CPU_OFF:
   15248 	case CPU_CPUPART_OUT:
   15249 		/*
   15250 		 * Nothing to do.  We don't remove the per CPU stats from
   15251 		 * the IP stack even when the CPU goes offline.
   15252 		 */
   15253 		break;
   15254 	default:
   15255 		break;
   15256 	}
   15257 	return (0);
   15258 }
   15259