Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 /*
     28  * This file contains the interface control functions for IP.
     29  */
     30 
     31 #include <sys/types.h>
     32 #include <sys/stream.h>
     33 #include <sys/dlpi.h>
     34 #include <sys/stropts.h>
     35 #include <sys/strsun.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/strsubr.h>
     38 #include <sys/strlog.h>
     39 #include <sys/ddi.h>
     40 #include <sys/sunddi.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/kstat.h>
     43 #include <sys/debug.h>
     44 #include <sys/zone.h>
     45 #include <sys/sunldi.h>
     46 #include <sys/file.h>
     47 #include <sys/bitmap.h>
     48 #include <sys/cpuvar.h>
     49 #include <sys/time.h>
     50 #include <sys/ctype.h>
     51 #include <sys/kmem.h>
     52 #include <sys/systm.h>
     53 #include <sys/param.h>
     54 #include <sys/socket.h>
     55 #include <sys/isa_defs.h>
     56 #include <net/if.h>
     57 #include <net/if_arp.h>
     58 #include <net/if_types.h>
     59 #include <net/if_dl.h>
     60 #include <net/route.h>
     61 #include <sys/sockio.h>
     62 #include <netinet/in.h>
     63 #include <netinet/ip6.h>
     64 #include <netinet/icmp6.h>
     65 #include <netinet/igmp_var.h>
     66 #include <sys/policy.h>
     67 #include <sys/ethernet.h>
     68 #include <sys/callb.h>
     69 #include <sys/md5.h>
     70 
     71 #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
     72 #include <inet/mi.h>
     73 #include <inet/nd.h>
     74 #include <inet/arp.h>
     75 #include <inet/ip_arp.h>
     76 #include <inet/mib2.h>
     77 #include <inet/ip.h>
     78 #include <inet/ip6.h>
     79 #include <inet/ip6_asp.h>
     80 #include <inet/tcp.h>
     81 #include <inet/ip_multi.h>
     82 #include <inet/ip_ire.h>
     83 #include <inet/ip_ftable.h>
     84 #include <inet/ip_rts.h>
     85 #include <inet/ip_ndp.h>
     86 #include <inet/ip_if.h>
     87 #include <inet/ip_impl.h>
     88 #include <inet/sctp_ip.h>
     89 #include <inet/ip_netinfo.h>
     90 #include <inet/ilb_ip.h>
     91 
     92 #include <netinet/igmp.h>
     93 #include <inet/ip_listutils.h>
     94 #include <inet/ipclassifier.h>
     95 #include <sys/mac_client.h>
     96 #include <sys/dld.h>
     97 
     98 #include <sys/systeminfo.h>
     99 #include <sys/bootconf.h>
    100 
    101 #include <sys/tsol/tndb.h>
    102 #include <sys/tsol/tnet.h>
    103 
    104 /* The character which tells where the ill_name ends */
    105 #define	IPIF_SEPARATOR_CHAR	':'
    106 
/*
 * IP ioctl function table entry.  Each entry describes one IP-private ioctl
 * command; the table built from these entries is ip_ioctl_ftbl[].
 */
typedef struct ipft_s {
	int	ipft_cmd;	/* ioctl command this entry is for */
	pfi_t	ipft_pfi;	/* handler function for the command */
	/* presumably the minimum ioctl payload size -- TODO confirm at use */
	int	ipft_min_size;
	int	ipft_flags;	/* IPFT_F_* flags defined below */
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
    116 
    117 static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    118 static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    119 		    char *value, caddr_t cp, cred_t *ioc_cr);
    120 
    121 static boolean_t ill_is_quiescent(ill_t *);
    122 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
    123 static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
    124 static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    125     mblk_t *mp, boolean_t need_up);
    126 static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    127     mblk_t *mp, boolean_t need_up);
    128 static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    129     queue_t *q, mblk_t *mp, boolean_t need_up);
    130 static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    131     mblk_t *mp);
    132 static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    133     mblk_t *mp);
    134 static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    135     queue_t *q, mblk_t *mp, boolean_t need_up);
    136 static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    137     int ioccmd, struct linkblk *li);
    138 static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
    139 static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
    140 static void	ipsq_flush(ill_t *ill);
    141 
    142 static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    143     queue_t *q, mblk_t *mp, boolean_t need_up);
    144 static void	ipsq_delete(ipsq_t *);
    145 
    146 static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    147     boolean_t initialize, boolean_t insert);
    148 static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
    149 static void	ipif_delete_bcast_ires(ipif_t *ipif);
    150 static int	ipif_add_ires_v4(ipif_t *, boolean_t);
    151 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    152 		    boolean_t isv6);
    153 static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
    154 static void	ipif_free(ipif_t *ipif);
    155 static void	ipif_free_tail(ipif_t *ipif);
    156 static void	ipif_set_default(ipif_t *ipif);
    157 static int	ipif_set_values(queue_t *q, mblk_t *mp,
    158     char *interf_name, uint_t *ppa);
    159 static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    160     queue_t *q);
    161 static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    162     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    163     ip_stack_t *);
    164 
    165 static int	ill_alloc_ppa(ill_if_t *, ill_t *);
    166 static void	ill_delete_interface_type(ill_if_t *);
    167 static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
    168 static void	ill_dl_down(ill_t *ill);
    169 static void	ill_down(ill_t *ill);
    170 static void	ill_down_ipifs(ill_t *, boolean_t);
    171 static void	ill_free_mib(ill_t *ill);
    172 static void	ill_glist_delete(ill_t *);
    173 static void	ill_phyint_reinit(ill_t *ill);
    174 static void	ill_set_nce_router_flags(ill_t *, boolean_t);
    175 static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    176 static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    177 
    178 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
    179 static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
    180 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
    181 static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
    182 static ip_v4mapinfo_func_t ip_ether_v4_mapping;
    183 static ip_v6mapinfo_func_t ip_ether_v6_mapping;
    184 static ip_v4mapinfo_func_t ip_ib_v4_mapping;
    185 static ip_v6mapinfo_func_t ip_ib_v6_mapping;
    186 static ip_v4mapinfo_func_t ip_mbcast_mapping;
    187 static void 	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
    188 static void 	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
    189 static void	phyint_free(phyint_t *);
    190 
    191 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
    192 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    193 static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    194 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    195 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
    196 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    197     dl_capability_sub_t *);
    198 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
    199 static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
    200 static void	ill_capability_dld_ack(ill_t *, mblk_t *,
    201 		    dl_capability_sub_t *);
    202 static void	ill_capability_dld_enable(ill_t *);
    203 static void	ill_capability_ack_thr(void *);
    204 static void	ill_capability_lso_enable(ill_t *);
    205 
    206 static ill_t	*ill_prev_usesrc(ill_t *);
    207 static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
    208 static void	ill_disband_usesrc_group(ill_t *);
    209 static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
    210 
    211 #ifdef DEBUG
    212 static	void	ill_trace_cleanup(const ill_t *);
    213 static	void	ipif_trace_cleanup(const ipif_t *);
    214 #endif
    215 
/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.  The default of 0 is set here;
 * NOTE(review): the name suggests this governs fragment reassembly pruning,
 * confirm against the code that reads it.
 */
int ip_min_frag_prune_time = 0;
    221 
/*
 * Table of IP-private ioctls.  Each row is { command, handler, minimum size,
 * IPFT_F_* flags }.  Both IRE delete commands share the ip_ire_delete handler
 * and differ only in whether a reply is expected.  The trailing all-zero
 * entry presumably terminates the table walk -- confirm in the lookup code.
 */
static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
    229 
/*
 * Simple ICMP IP Header Template.
 * The initializer is positional: the first member is set to
 * IP_SIMPLE_HDR_VERSION, the next five are zero, and the seventh is
 * IPPROTO_ICMP (presumably the protocol field -- ipha_t is declared
 * elsewhere).  All remaining members are implicitly zero.
 */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
    234 
/*
 * Six bytes of 0xFF.  NOTE(review): presumably used as an all-ones
 * (broadcast) link-layer address; confirm at the use sites.
 */
static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
    236 
/*
 * Per-media-type table, one ip_m_t row per DLPI MAC type.
 *
 * ip_m_t is declared elsewhere, so the column meanings below are inferred
 * from the initializers and the function typedefs used in the forward
 * declarations -- confirm against the ip_m_t definition:
 *   1. DLPI MAC type (DL_* / SUNW_DL_*)
 *   2. interface type (IFT_*)
 *   3. value used for IPv4 (an ETHERTYPE_* or IPPROTO_* constant)
 *   4. value used for IPv6 (an ETHERTYPE_* or IPPROTO_* constant)
 *   5. IPv4 address mapping function (NULL for VNI and IPMP)
 *   6. IPv6 address mapping function (NULL for VNI and IPMP)
 *   7. IPv6 interface id function
 *   8. IPv6 destination interface id function
 *
 * NOTE(review): the tunnel rows (DL_IPV4, DL_IPV6, DL_6TO4) place
 * ip_mbcast_mapping, which is forward-declared as an ip_v4mapinfo_func_t,
 * in both mapping columns.
 * NOTE(review): the final DL_OTHER row looks like a catch-all default;
 * verify how ip_m_lookup() treats it.
 */
static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
    273 
static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";	/* name of the loopback interface */
/*
 * NOTE(review): presumably appended to an interface name to form the
 * per-interface forwarding tunables handled by nd_ill_forward_get/set;
 * confirm where these suffixes are used.
 */
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */

/*
 * When set search for unused ipif_seqid
 * NOTE(review): the sentence above does not obviously describe a static
 * ipif_t; presumably ipif_zero is an all-zero ipif used for initialization.
 * Confirm against its users and correct this comment.
 */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
    289 
    290 /*
    291  * Allocate per-interface mibs.
    292  * Returns true if ok. False otherwise.
    293  *  ipsq  may not yet be allocated (loopback case ).
    294  */
    295 static boolean_t
    296 ill_allocate_mibs(ill_t *ill)
    297 {
    298 	/* Already allocated? */
    299 	if (ill->ill_ip_mib != NULL) {
    300 		if (ill->ill_isv6)
    301 			ASSERT(ill->ill_icmp6_mib != NULL);
    302 		return (B_TRUE);
    303 	}
    304 
    305 	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
    306 	    KM_NOSLEEP);
    307 	if (ill->ill_ip_mib == NULL) {
    308 		return (B_FALSE);
    309 	}
    310 
    311 	/* Setup static information */
    312 	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
    313 	    sizeof (mib2_ipIfStatsEntry_t));
    314 	if (ill->ill_isv6) {
    315 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
    316 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    317 		    sizeof (mib2_ipv6AddrEntry_t));
    318 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    319 		    sizeof (mib2_ipv6RouteEntry_t));
    320 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    321 		    sizeof (mib2_ipv6NetToMediaEntry_t));
    322 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    323 		    sizeof (ipv6_member_t));
    324 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    325 		    sizeof (ipv6_grpsrc_t));
    326 	} else {
    327 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
    328 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    329 		    sizeof (mib2_ipAddrEntry_t));
    330 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    331 		    sizeof (mib2_ipRouteEntry_t));
    332 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    333 		    sizeof (mib2_ipNetToMediaEntry_t));
    334 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    335 		    sizeof (ip_member_t));
    336 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    337 		    sizeof (ip_grpsrc_t));
    338 
    339 		/*
    340 		 * For a v4 ill, we are done at this point, because per ill
    341 		 * icmp mibs are only used for v6.
    342 		 */
    343 		return (B_TRUE);
    344 	}
    345 
    346 	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
    347 	    KM_NOSLEEP);
    348 	if (ill->ill_icmp6_mib == NULL) {
    349 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    350 		ill->ill_ip_mib = NULL;
    351 		return (B_FALSE);
    352 	}
    353 	/* static icmp info */
    354 	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
    355 	    sizeof (mib2_ipv6IfIcmpEntry_t);
    356 	/*
    357 	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
    358 	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
    359 	 * -> ill_phyint_reinit
    360 	 */
    361 	return (B_TRUE);
    362 }
    363 
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 *
 * This is the first half of the teardown: it brings down the ipifs and
 * detaches the ill from the subsystems called below.  The ipif structures,
 * capabilities and MIBs are released later by ill_delete_tail().
 *
 * NOTE(review): the comment on ill_delete_tail() names ip_modclose as the
 * caller rather than ip_close; confirm which is current.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;	/* predecessor of ill in its usesrc group */
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter
	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e low on memory),
	 * then no interfaces to clean up. In this case just clean up the
	 * ill.
	 *
	 * NOTE(review): the claim above that ipif_free unlinks and frees the
	 * ipif looks stale.  The loop below reads ipif->ipif_next after
	 * calling ipif_free(ipif), and ill_delete_tail() later walks the same
	 * ill_ipif list calling ipif_free_tail(), so the structure must still
	 * be linked and valid here.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 *
	 * A non-NULL ill_usesrc_grp_next means this ill is linked into a
	 * usesrc group.  An ill_usesrc_ifindex of 0 marks the usesrc ILL
	 * itself, in which case the whole group is disbanded; otherwise this
	 * ill is a consumer and is simply unlinked by pointing its
	 * predecessor at its successor.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
    459 
    460 static void
    461 ipif_non_duplicate(ipif_t *ipif)
    462 {
    463 	ill_t *ill = ipif->ipif_ill;
    464 	mutex_enter(&ill->ill_lock);
    465 	if (ipif->ipif_flags & IPIF_DUPLICATE) {
    466 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
    467 		ASSERT(ill->ill_ipif_dup_count > 0);
    468 		ill->ill_ipif_dup_count--;
    469 	}
    470 	mutex_exit(&ill->ill_lock);
    471 }
    472 
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose
 *
 * Second half of the ill teardown: finishes bringing down each ipif, waits
 * for any DLPI unbind still in progress, then releases the resources hanging
 * off the ill (capability structures, ipifs, fragment table, retained
 * control messages, MIBs) and finally drops the netstack hold.  The ill_t
 * itself is not freed here.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;		/* cursor over the ill's retained mblk slots */
	ipif_t	*ipif;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Clear any duplicate-address marking and complete the down
	 * processing on every ipif; the dup count must be zero afterwards.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	/* Every ill except loopback has its read queue turned off here. */
	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.  Each pointer is cleared after the free so the
	 * ill never holds a dangling capability pointer.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/*
	 * Release the ipifs one at a time from the head of the list.  This
	 * loop only terminates if ipif_free_tail() unlinks the ipif it is
	 * given from ill_ipif.
	 */
	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down ->ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp.  This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 *
	 * NOTE(review): ill_glist_delete() is forward-declared as returning
	 * void, so the (void) cast below is redundant.
	 */
	(void) ill_glist_delete(ill);

	/* Unregister the ill's ndd name, if it has one. */
	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	/*
	 * Tear down the fragment hash table: destroy each bucket lock, then
	 * free the backing allocation (ill_frag_ptr) and clear both pointers.
	 */
	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/*
	 * Free all retained control messages.
	 *
	 * The mblk pointers from ill_first_mp_to_free through
	 * ill_last_mp_to_free (inclusive) are treated as an array.  Each slot
	 * may head a b_next-linked chain of messages: pop one message at a
	 * time, clear b_next/b_prev on every block of it, and free it.
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	/* Fold the per-ill counters into the global MIBs and free them. */
	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}
    619 
    620 static void
    621 ill_free_mib(ill_t *ill)
    622 {
    623 	ip_stack_t *ipst = ill->ill_ipst;
    624 
    625 	/*
    626 	 * MIB statistics must not be lost, so when an interface
    627 	 * goes away the counter values will be added to the global
    628 	 * MIBs.
    629 	 */
    630 	if (ill->ill_ip_mib != NULL) {
    631 		if (ill->ill_isv6) {
    632 			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
    633 			    ill->ill_ip_mib);
    634 		} else {
    635 			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
    636 			    ill->ill_ip_mib);
    637 		}
    638 
    639 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    640 		ill->ill_ip_mib = NULL;
    641 	}
    642 	if (ill->ill_icmp6_mib != NULL) {
    643 		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
    644 		    ill->ill_icmp6_mib);
    645 		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
    646 		ill->ill_icmp6_mib = NULL;
    647 	}
    648 }
    649 
    650 /*
    651  * Concatenate together a physical address and a sap.
    652  *
    653  * Sap_lengths are interpreted as follows:
    654  *   sap_length == 0	==>	no sap
    655  *   sap_length > 0	==>	sap is at the head of the dlpi address
    656  *   sap_length < 0	==>	sap is at the tail of the dlpi address
    657  */
    658 static void
    659 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    660     t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
    661 {
    662 	uint16_t sap_addr = (uint16_t)sap_src;
    663 
    664 	if (sap_length == 0) {
    665 		if (phys_src == NULL)
    666 			bzero(dst, phys_length);
    667 		else
    668 			bcopy(phys_src, dst, phys_length);
    669 	} else if (sap_length < 0) {
    670 		if (phys_src == NULL)
    671 			bzero(dst, phys_length);
    672 		else
    673 			bcopy(phys_src, dst, phys_length);
    674 		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    675 	} else {
    676 		bcopy(&sap_addr, dst, sizeof (sap_addr));
    677 		if (phys_src == NULL)
    678 			bzero((char *)dst + sap_length, phys_length);
    679 		else
    680 			bcopy(phys_src, (char *)dst + sap_length, phys_length);
    681 	}
    682 }
    683 
    684 /*
    685  * Generate a dl_unitdata_req mblk for the device and address given.
    686  * addr_length is the length of the physical portion of the address.
    687  * If addr is NULL include an all zero address of the specified length.
    688  * TRUE? In any case, addr_length is taken to be the entire length of the
    689  * dlpi address, including the absolute value of sap_length.
    690  */
    691 mblk_t *
    692 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    693 		t_scalar_t sap_length)
    694 {
    695 	dl_unitdata_req_t *dlur;
    696 	mblk_t	*mp;
    697 	t_scalar_t	abs_sap_length;		/* absolute value */
    698 
    699 	abs_sap_length = ABS(sap_length);
    700 	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
    701 	    DL_UNITDATA_REQ);
    702 	if (mp == NULL)
    703 		return (NULL);
    704 	dlur = (dl_unitdata_req_t *)mp->b_rptr;
    705 	/* HACK: accomodate incompatible DLPI drivers */
    706 	if (addr_length == 8)
    707 		addr_length = 6;
    708 	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    709 	dlur->dl_dest_addr_offset = sizeof (*dlur);
    710 	dlur->dl_priority.dl_min = 0;
    711 	dlur->dl_priority.dl_max = 0;
    712 	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
    713 	    (uchar_t *)&dlur[1]);
    714 	return (mp);
    715 }
    716 
    717 /*
    718  * Add the pending mp to the list. There can be only 1 pending mp
    719  * in the list. Any exclusive ioctl that needs to wait for a response
    720  * from another module or driver needs to use this function to set
    721  * the ipx_pending_mp to the ioctl mblk and wait for the response from
    722  * the other module/driver. This is also used while waiting for the
    723  * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
    724  */
    725 boolean_t
    726 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    727     int waitfor)
    728 {
    729 	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
    730 
    731 	ASSERT(IAM_WRITER_IPIF(ipif));
    732 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    733 	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    734 	ASSERT(ipx->ipx_pending_mp == NULL);
    735 	/*
    736 	 * The caller may be using a different ipif than the one passed into
    737 	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
    738 	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
    739 	 * that `ipx_current_ipif == ipif'.
    740 	 */
    741 	ASSERT(ipx->ipx_current_ipif != NULL);
    742 
    743 	/*
    744 	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
    745 	 * driver.
    746 	 */
    747 	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
    748 	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
    749 	    (DB_TYPE(add_mp) == M_PCPROTO));
    750 
    751 	if (connp != NULL) {
    752 		ASSERT(MUTEX_HELD(&connp->conn_lock));
    753 		/*
    754 		 * Return error if the conn has started closing. The conn
    755 		 * could have finished cleaning up the pending mp list,
    756 		 * If so we should not add another mp to the list negating
    757 		 * the cleanup.
    758 		 */
    759 		if (connp->conn_state_flags & CONN_CLOSING)
    760 			return (B_FALSE);
    761 	}
    762 	mutex_enter(&ipx->ipx_lock);
    763 	ipx->ipx_pending_ipif = ipif;
    764 	/*
    765 	 * Note down the queue in b_queue. This will be returned by
    766 	 * ipsq_pending_mp_get. Caller will then use these values to restart
    767 	 * the processing
    768 	 */
    769 	add_mp->b_next = NULL;
    770 	add_mp->b_queue = q;
    771 	ipx->ipx_pending_mp = add_mp;
    772 	ipx->ipx_waitfor = waitfor;
    773 	mutex_exit(&ipx->ipx_lock);
    774 
    775 	if (connp != NULL)
    776 		connp->conn_oper_pending_ill = ipif->ipif_ill;
    777 
    778 	return (B_TRUE);
    779 }
    780 
    781 /*
    782  * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
    783  * queued in the list.
    784  */
    785 mblk_t *
    786 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
    787 {
    788 	mblk_t	*curr = NULL;
    789 	ipxop_t	*ipx = ipsq->ipsq_xop;
    790 
    791 	*connpp = NULL;
    792 	mutex_enter(&ipx->ipx_lock);
    793 	if (ipx->ipx_pending_mp == NULL) {
    794 		mutex_exit(&ipx->ipx_lock);
    795 		return (NULL);
    796 	}
    797 
    798 	/* There can be only 1 such excl message */
    799 	curr = ipx->ipx_pending_mp;
    800 	ASSERT(curr->b_next == NULL);
    801 	ipx->ipx_pending_ipif = NULL;
    802 	ipx->ipx_pending_mp = NULL;
    803 	ipx->ipx_waitfor = 0;
    804 	mutex_exit(&ipx->ipx_lock);
    805 
    806 	if (CONN_Q(curr->b_queue)) {
    807 		/*
    808 		 * This mp did a refhold on the conn, at the start of the ioctl.
    809 		 * So we can safely return a pointer to the conn to the caller.
    810 		 */
    811 		*connpp = Q_TO_CONN(curr->b_queue);
    812 	} else {
    813 		*connpp = NULL;
    814 	}
    815 	curr->b_next = NULL;
    816 	curr->b_prev = NULL;
    817 	return (curr);
    818 }
    819 
    820 /*
    821  * Cleanup the ioctl mp queued in ipx_pending_mp
    822  * - Called in the ill_delete path
    823  * - Called in the M_ERROR or M_HANGUP path on the ill.
    824  * - Called in the conn close path.
    825  */
    826 boolean_t
    827 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
    828 {
    829 	mblk_t	*mp;
    830 	ipxop_t	*ipx;
    831 	queue_t	*q;
    832 	ipif_t	*ipif;
    833 	int	cmd;
    834 
    835 	ASSERT(IAM_WRITER_ILL(ill));
    836 	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
    837 
    838 	/*
    839 	 * If connp is null, unconditionally clean up the ipx_pending_mp.
    840 	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
    841 	 * even if it is meant for another ill, since we have to enqueue
    842 	 * a new mp now in ipx_pending_mp to complete the ipif_down.
    843 	 * If connp is non-null we are called from the conn close path.
    844 	 */
    845 	mutex_enter(&ipx->ipx_lock);
    846 	mp = ipx->ipx_pending_mp;
    847 	if (mp == NULL || (connp != NULL &&
    848 	    mp->b_queue != CONNP_TO_WQ(connp))) {
    849 		mutex_exit(&ipx->ipx_lock);
    850 		return (B_FALSE);
    851 	}
    852 	/* Now remove from the ipx_pending_mp */
    853 	ipx->ipx_pending_mp = NULL;
    854 	q = mp->b_queue;
    855 	mp->b_next = NULL;
    856 	mp->b_prev = NULL;
    857 	mp->b_queue = NULL;
    858 
    859 	ipif = ipx->ipx_pending_ipif;
    860 	ipx->ipx_pending_ipif = NULL;
    861 	ipx->ipx_waitfor = 0;
    862 	ipx->ipx_current_ipif = NULL;
    863 	cmd = ipx->ipx_current_ioctl;
    864 	ipx->ipx_current_ioctl = 0;
    865 	ipx->ipx_current_done = B_TRUE;
    866 	mutex_exit(&ipx->ipx_lock);
    867 
    868 	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
    869 		DTRACE_PROBE4(ipif__ioctl,
    870 		    char *, "ipsq_pending_mp_cleanup",
    871 		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
    872 		    ipif_t *, ipif);
    873 		if (connp == NULL) {
    874 			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
    875 		} else {
    876 			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
    877 			mutex_enter(&ipif->ipif_ill->ill_lock);
    878 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
    879 			mutex_exit(&ipif->ipif_ill->ill_lock);
    880 		}
    881 	} else {
    882 		/*
    883 		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
    884 		 * be just inet_freemsg. we have to restart it
    885 		 * otherwise the thread will be stuck.
    886 		 */
    887 		inet_freemsg(mp);
    888 	}
    889 	return (B_TRUE);
    890 }
    891 
    892 /*
    893  * Called in the conn close path and ill delete path
    894  */
    895 static void
    896 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
    897 {
    898 	ipsq_t	*ipsq;
    899 	mblk_t	*prev;
    900 	mblk_t	*curr;
    901 	mblk_t	*next;
    902 	queue_t	*q;
    903 	mblk_t	*tmp_list = NULL;
    904 
    905 	ASSERT(IAM_WRITER_ILL(ill));
    906 	if (connp != NULL)
    907 		q = CONNP_TO_WQ(connp);
    908 	else
    909 		q = ill->ill_wq;
    910 
    911 	ipsq = ill->ill_phyint->phyint_ipsq;
    912 	/*
    913 	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
    914 	 * In the case of ioctl from a conn, there can be only 1 mp
    915 	 * queued on the ipsq. If an ill is being unplumbed, only messages
    916 	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
    917 	 * ioctls meant for this ill form conn's are not flushed. They will
    918 	 * be processed during ipsq_exit and will not find the ill and will
    919 	 * return error.
    920 	 */
    921 	mutex_enter(&ipsq->ipsq_lock);
    922 	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
    923 	    curr = next) {
    924 		next = curr->b_next;
    925 		if (curr->b_queue == q || curr->b_queue == RD(q)) {
    926 			/* Unlink the mblk from the pending mp list */
    927 			if (prev != NULL) {
    928 				prev->b_next = curr->b_next;
    929 			} else {
    930 				ASSERT(ipsq->ipsq_xopq_mphead == curr);
    931 				ipsq->ipsq_xopq_mphead = curr->b_next;
    932 			}
    933 			if (ipsq->ipsq_xopq_mptail == curr)
    934 				ipsq->ipsq_xopq_mptail = prev;
    935 			/*
    936 			 * Create a temporary list and release the ipsq lock
    937 			 * New elements are added to the head of the tmp_list
    938 			 */
    939 			curr->b_next = tmp_list;
    940 			tmp_list = curr;
    941 		} else {
    942 			prev = curr;
    943 		}
    944 	}
    945 	mutex_exit(&ipsq->ipsq_lock);
    946 
    947 	while (tmp_list != NULL) {
    948 		curr = tmp_list;
    949 		tmp_list = curr->b_next;
    950 		curr->b_next = NULL;
    951 		curr->b_prev = NULL;
    952 		curr->b_queue = NULL;
    953 		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
    954 			DTRACE_PROBE4(ipif__ioctl,
    955 			    char *, "ipsq_xopq_mp_cleanup",
    956 			    int, 0, ill_t *, NULL, ipif_t *, NULL);
    957 			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
    958 			    CONN_CLOSE : NO_COPYOUT, NULL);
    959 		} else {
    960 			/*
    961 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
    962 			 * this can't be just inet_freemsg. we have to
    963 			 * restart it otherwise the thread will be stuck.
    964 			 */
    965 			inet_freemsg(curr);
    966 		}
    967 	}
    968 }
    969 
    970 /*
    971  * This conn has started closing. Cleanup any pending ioctl from this conn.
    972  * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
    973  */
    974 void
    975 conn_ioctl_cleanup(conn_t *connp)
    976 {
    977 	ipsq_t	*ipsq;
    978 	ill_t	*ill;
    979 	boolean_t refheld;
    980 
    981 	/*
    982 	 * Is any exclusive ioctl pending ? If so clean it up. If the
    983 	 * ioctl has not yet started, the mp is pending in the list headed by
    984 	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
    985 	 * ipx_pending_mp. If the ioctl timed out in the streamhead but
    986 	 * is currently executing now the mp is not queued anywhere but
    987 	 * conn_oper_pending_ill is null. The conn close will wait
    988 	 * till the conn_ref drops to zero.
    989 	 */
    990 	mutex_enter(&connp->conn_lock);
    991 	ill = connp->conn_oper_pending_ill;
    992 	if (ill == NULL) {
    993 		mutex_exit(&connp->conn_lock);
    994 		return;
    995 	}
    996 
    997 	/*
    998 	 * We may not be able to refhold the ill if the ill/ipif
    999 	 * is changing. But we need to make sure that the ill will
   1000 	 * not vanish. So we just bump up the ill_waiter count.
   1001 	 */
   1002 	refheld = ill_waiter_inc(ill);
   1003 	mutex_exit(&connp->conn_lock);
   1004 	if (refheld) {
   1005 		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
   1006 			ill_waiter_dcr(ill);
   1007 			/*
   1008 			 * Check whether this ioctl has started and is
   1009 			 * pending. If it is not found there then check
   1010 			 * whether this ioctl has not even started and is in
   1011 			 * the ipsq_xopq list.
   1012 			 */
   1013 			if (!ipsq_pending_mp_cleanup(ill, connp))
   1014 				ipsq_xopq_mp_cleanup(ill, connp);
   1015 			ipsq = ill->ill_phyint->phyint_ipsq;
   1016 			ipsq_exit(ipsq);
   1017 			return;
   1018 		}
   1019 	}
   1020 
   1021 	/*
   1022 	 * The ill is also closing and we could not bump up the
   1023 	 * ill_waiter_count or we could not enter the ipsq. Leave
   1024 	 * the cleanup to ill_delete
   1025 	 */
   1026 	mutex_enter(&connp->conn_lock);
   1027 	while (connp->conn_oper_pending_ill != NULL)
   1028 		cv_wait(&connp->conn_refcv, &connp->conn_lock);
   1029 	mutex_exit(&connp->conn_lock);
   1030 	if (refheld)
   1031 		ill_waiter_dcr(ill);
   1032 }
   1033 
   1034 /*
   1035  * ipcl_walk function for cleaning up conn_*_ill fields.
   1036  * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
   1037  * conn_bound_if in place. We prefer dropping
   1038  * packets instead of sending them out the wrong interface, or accepting
   1039  * packets from the wrong ifindex.
   1040  */
   1041 static void
   1042 conn_cleanup_ill(conn_t *connp, caddr_t arg)
   1043 {
   1044 	ill_t	*ill = (ill_t *)arg;
   1045 
   1046 	mutex_enter(&connp->conn_lock);
   1047 	if (connp->conn_dhcpinit_ill == ill) {
   1048 		connp->conn_dhcpinit_ill = NULL;
   1049 		ASSERT(ill->ill_dhcpinit != 0);
   1050 		atomic_dec_32(&ill->ill_dhcpinit);
   1051 		ill_set_inputfn(ill);
   1052 	}
   1053 	mutex_exit(&connp->conn_lock);
   1054 }
   1055 
   1056 static int
   1057 ill_down_ipifs_tail(ill_t *ill)
   1058 {
   1059 	ipif_t	*ipif;
   1060 	int err;
   1061 
   1062 	ASSERT(IAM_WRITER_ILL(ill));
   1063 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   1064 		ipif_non_duplicate(ipif);
   1065 		/*
   1066 		 * ipif_down_tail will call arp_ll_down on the last ipif
   1067 		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
   1068 		 */
   1069 		if ((err = ipif_down_tail(ipif)) != 0)
   1070 			return (err);
   1071 	}
   1072 	return (0);
   1073 }
   1074 
   1075 /* ARGSUSED */
   1076 void
   1077 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   1078 {
   1079 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   1080 	(void) ill_down_ipifs_tail(q->q_ptr);
   1081 	freemsg(mp);
   1082 	ipsq_current_finish(ipsq);
   1083 }
   1084 
   1085 /*
   1086  * ill_down_start is called when we want to down this ill and bring it up again
   1087  * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
   1088  * all interfaces, but don't tear down any plumbing.
   1089  */
   1090 boolean_t
   1091 ill_down_start(queue_t *q, mblk_t *mp)
   1092 {
   1093 	ill_t	*ill = q->q_ptr;
   1094 	ipif_t	*ipif;
   1095 
   1096 	ASSERT(IAM_WRITER_ILL(ill));
   1097 	mutex_enter(&ill->ill_lock);
   1098 	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
   1099 	/* no more nce addition allowed */
   1100 	mutex_exit(&ill->ill_lock);
   1101 
   1102 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
   1103 		(void) ipif_down(ipif, NULL, NULL);
   1104 
   1105 	ill_down(ill);
   1106 
   1107 	/*
   1108 	 * Walk all CONNs that can have a reference on an ire or nce for this
   1109 	 * ill (we actually walk all that now have stale references).
   1110 	 */
   1111 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);
   1112 
   1113 	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
   1114 	if (ill->ill_isv6)
   1115 		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);
   1116 
   1117 
   1118 	(void) ipsq_pending_mp_cleanup(ill, NULL);
   1119 
   1120 	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
   1121 
   1122 	/*
   1123 	 * Atomically test and add the pending mp if references are active.
   1124 	 */
   1125 	mutex_enter(&ill->ill_lock);
   1126 	if (!ill_is_quiescent(ill)) {
   1127 		/* call cannot fail since `conn_t *' argument is NULL */
   1128 		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
   1129 		    mp, ILL_DOWN);
   1130 		mutex_exit(&ill->ill_lock);
   1131 		return (B_FALSE);
   1132 	}
   1133 	mutex_exit(&ill->ill_lock);
   1134 	return (B_TRUE);
   1135 }
   1136 
   1137 static void
   1138 ill_down(ill_t *ill)
   1139 {
   1140 	mblk_t	*mp;
   1141 	ip_stack_t	*ipst = ill->ill_ipst;
   1142 
   1143 	/*
   1144 	 * Blow off any IREs dependent on this ILL.
   1145 	 * The caller needs to handle conn_ixa_cleanup
   1146 	 */
   1147 	ill_delete_ires(ill);
   1148 
   1149 	ire_walk_ill(0, 0, ill_downi, ill, ill);
   1150 
   1151 	/* Remove any conn_*_ill depending on this ill */
   1152 	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
   1153 
   1154 	/*
   1155 	 * Free state for additional IREs.
   1156 	 */
   1157 	mutex_enter(&ill->ill_saved_ire_lock);
   1158 	mp = ill->ill_saved_ire_mp;
   1159 	ill->ill_saved_ire_mp = NULL;
   1160 	ill->ill_saved_ire_cnt = 0;
   1161 	mutex_exit(&ill->ill_saved_ire_lock);
   1162 	freemsg(mp);
   1163 }
   1164 
   1165 /*
   1166  * ire_walk routine used to delete every IRE that depends on
   1167  * 'ill'.  (Always called as writer.)
   1168  *
   1169  * Note: since the routes added by the kernel are deleted separately,
   1170  * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
   1171  *
   1172  * We also remove references on ire_nce_cache entries that refer to the ill.
   1173  */
   1174 void
   1175 ill_downi(ire_t *ire, char *ill_arg)
   1176 {
   1177 	ill_t	*ill = (ill_t *)ill_arg;
   1178 	nce_t	*nce;
   1179 
   1180 	mutex_enter(&ire->ire_lock);
   1181 	nce = ire->ire_nce_cache;
   1182 	if (nce != NULL && nce->nce_ill == ill)
   1183 		ire->ire_nce_cache = NULL;
   1184 	else
   1185 		nce = NULL;
   1186 	mutex_exit(&ire->ire_lock);
   1187 	if (nce != NULL)
   1188 		nce_refrele(nce);
   1189 	if (ire->ire_ill == ill)
   1190 		ire_delete(ire);
   1191 }
   1192 
   1193 /* Remove IRE_IF_CLONE on this ill */
   1194 void
   1195 ill_downi_if_clone(ire_t *ire, char *ill_arg)
   1196 {
   1197 	ill_t	*ill = (ill_t *)ill_arg;
   1198 
   1199 	ASSERT(ire->ire_type & IRE_IF_CLONE);
   1200 	if (ire->ire_ill == ill)
   1201 		ire_delete(ire);
   1202 }
   1203 
   1204 /* Consume an M_IOCACK of the fastpath probe. */
   1205 void
   1206 ill_fastpath_ack(ill_t *ill, mblk_t *mp)
   1207 {
   1208 	mblk_t	*mp1 = mp;
   1209 
   1210 	/*
   1211 	 * If this was the first attempt turn on the fastpath probing.
   1212 	 */
   1213 	mutex_enter(&ill->ill_lock);
   1214 	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
   1215 		ill->ill_dlpi_fastpath_state = IDS_OK;
   1216 	mutex_exit(&ill->ill_lock);
   1217 
   1218 	/* Free the M_IOCACK mblk, hold on to the data */
   1219 	mp = mp->b_cont;
   1220 	freeb(mp1);
   1221 	if (mp == NULL)
   1222 		return;
   1223 	if (mp->b_cont != NULL)
   1224 		nce_fastpath_update(ill, mp);
   1225 	else
   1226 		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
   1227 	freemsg(mp);
   1228 }
   1229 
   1230 /*
   1231  * Throw an M_IOCTL message downstream asking "do you know fastpath?"
   1232  * The data portion of the request is a dl_unitdata_req_t template for
   1233  * what we would send downstream in the absence of a fastpath confirmation.
   1234  */
   1235 int
   1236 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
   1237 {
   1238 	struct iocblk	*ioc;
   1239 	mblk_t	*mp;
   1240 
   1241 	if (dlur_mp == NULL)
   1242 		return (EINVAL);
   1243 
   1244 	mutex_enter(&ill->ill_lock);
   1245 	switch (ill->ill_dlpi_fastpath_state) {
   1246 	case IDS_FAILED:
   1247 		/*
   1248 		 * Driver NAKed the first fastpath ioctl - assume it doesn't
   1249 		 * support it.
   1250 		 */
   1251 		mutex_exit(&ill->ill_lock);
   1252 		return (ENOTSUP);
   1253 	case IDS_UNKNOWN:
   1254 		/* This is the first probe */
   1255 		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
   1256 		break;
   1257 	default:
   1258 		break;
   1259 	}
   1260 	mutex_exit(&ill->ill_lock);
   1261 
   1262 	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
   1263 		return (EAGAIN);
   1264 
   1265 	mp->b_cont = copyb(dlur_mp);
   1266 	if (mp->b_cont == NULL) {
   1267 		freeb(mp);
   1268 		return (EAGAIN);
   1269 	}
   1270 
   1271 	ioc = (struct iocblk *)mp->b_rptr;
   1272 	ioc->ioc_count = msgdsize(mp->b_cont);
   1273 
   1274 	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
   1275 	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
   1276 	putnext(ill->ill_wq, mp);
   1277 	return (0);
   1278 }
   1279 
   1280 void
   1281 ill_capability_probe(ill_t *ill)
   1282 {
   1283 	mblk_t	*mp;
   1284 
   1285 	ASSERT(IAM_WRITER_ILL(ill));
   1286 
   1287 	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
   1288 	    ill->ill_dlpi_capab_state != IDCS_FAILED)
   1289 		return;
   1290 
   1291 	/*
   1292 	 * We are starting a new cycle of capability negotiation.
   1293 	 * Free up the capab reset messages of any previous incarnation.
   1294 	 * We will do a fresh allocation when we get the response to our probe
   1295 	 */
   1296 	if (ill->ill_capab_reset_mp != NULL) {
   1297 		freemsg(ill->ill_capab_reset_mp);
   1298 		ill->ill_capab_reset_mp = NULL;
   1299 	}
   1300 
   1301 	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
   1302 
   1303 	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
   1304 	if (mp == NULL)
   1305 		return;
   1306 
   1307 	ill_capability_send(ill, mp);
   1308 	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
   1309 }
   1310 
   1311 void
   1312 ill_capability_reset(ill_t *ill, boolean_t reneg)
   1313 {
   1314 	ASSERT(IAM_WRITER_ILL(ill));
   1315 
   1316 	if (ill->ill_dlpi_capab_state != IDCS_OK)
   1317 		return;
   1318 
   1319 	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
   1320 
   1321 	ill_capability_send(ill, ill->ill_capab_reset_mp);
   1322 	ill->ill_capab_reset_mp = NULL;
   1323 	/*
   1324 	 * We turn off all capabilities except those pertaining to
   1325 	 * direct function call capabilities viz. ILL_CAPAB_DLD*
   1326 	 * which will be turned off by the corresponding reset functions.
   1327 	 */
   1328 	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM  | ILL_CAPAB_ZEROCOPY);
   1329 }
   1330 
   1331 static void
   1332 ill_capability_reset_alloc(ill_t *ill)
   1333 {
   1334 	mblk_t *mp;
   1335 	size_t	size = 0;
   1336 	int	err;
   1337 	dl_capability_req_t	*capb;
   1338 
   1339 	ASSERT(IAM_WRITER_ILL(ill));
   1340 	ASSERT(ill->ill_capab_reset_mp == NULL);
   1341 
   1342 	if (ILL_HCKSUM_CAPABLE(ill)) {
   1343 		size += sizeof (dl_capability_sub_t) +
   1344 		    sizeof (dl_capab_hcksum_t);
   1345 	}
   1346 
   1347 	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
   1348 		size += sizeof (dl_capability_sub_t) +
   1349 		    sizeof (dl_capab_zerocopy_t);
   1350 	}
   1351 
   1352 	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
   1353 		size += sizeof (dl_capability_sub_t) +
   1354 		    sizeof (dl_capab_dld_t);
   1355 	}
   1356 
   1357 	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
   1358 	    STR_NOSIG, &err);
   1359 
   1360 	mp->b_datap->db_type = M_PROTO;
   1361 	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
   1362 
   1363 	capb = (dl_capability_req_t *)mp->b_rptr;
   1364 	capb->dl_primitive = DL_CAPABILITY_REQ;
   1365 	capb->dl_sub_offset = sizeof (dl_capability_req_t);
   1366 	capb->dl_sub_length = size;
   1367 
   1368 	mp->b_wptr += sizeof (dl_capability_req_t);
   1369 
   1370 	/*
   1371 	 * Each handler fills in the corresponding dl_capability_sub_t
   1372 	 * inside the mblk,
   1373 	 */
   1374 	ill_capability_hcksum_reset_fill(ill, mp);
   1375 	ill_capability_zerocopy_reset_fill(ill, mp);
   1376 	ill_capability_dld_reset_fill(ill, mp);
   1377 
   1378 	ill->ill_capab_reset_mp = mp;
   1379 }
   1380 
   1381 static void
   1382 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
   1383 {
   1384 	dl_capab_id_t *id_ic;
   1385 	uint_t sub_dl_cap = outers->dl_cap;
   1386 	dl_capability_sub_t *inners;
   1387 	uint8_t *capend;
   1388 
   1389 	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
   1390 
   1391 	/*
   1392 	 * Note: range checks here are not absolutely sufficient to
   1393 	 * make us robust against malformed messages sent by drivers;
   1394 	 * this is in keeping with the rest of IP's dlpi handling.
   1395 	 * (Remember, it's coming from something else in the kernel
   1396 	 * address space)
   1397 	 */
   1398 
   1399 	capend = (uint8_t *)(outers + 1) + outers->dl_length;
   1400 	if (capend > mp->b_wptr) {
   1401 		cmn_err(CE_WARN, "ill_capability_id_ack: "
   1402 		    "malformed sub-capability too long for mblk");
   1403 		return;
   1404 	}
   1405 
   1406 	id_ic = (dl_capab_id_t *)(outers + 1);
   1407 
   1408 	if (outers->dl_length < sizeof (*id_ic) ||
   1409 	    (inners = &id_ic->id_subcap,
   1410 	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
   1411 		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
   1412 		    "encapsulated capab type %d too long for mblk",
   1413 		    inners->dl_cap);
   1414 		return;
   1415 	}
   1416 
   1417 	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
   1418 		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
   1419 		    "isn't as expected; pass-thru module(s) detected, "
   1420 		    "discarding capability\n", inners->dl_cap));
   1421 		return;
   1422 	}
   1423 
   1424 	/* Process the encapsulated sub-capability */
   1425 	ill_capability_dispatch(ill, mp, inners);
   1426 }
   1427 
   1428 static void
   1429 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
   1430 {
   1431 	dl_capability_sub_t *dl_subcap;
   1432 
   1433 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   1434 		return;
   1435 
   1436 	/*
   1437 	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
   1438 	 * initialized below since it is not used by DLD.
   1439 	 */
   1440 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1441 	dl_subcap->dl_cap = DL_CAPAB_DLD;
   1442 	dl_subcap->dl_length = sizeof (dl_capab_dld_t);
   1443 
   1444 	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
   1445 }
   1446 
   1447 static void
   1448 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
   1449 {
   1450 	/*
   1451 	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
   1452 	 * is only to get the VRRP capability.
   1453 	 */
   1454 	if (ill->ill_ipif_up_count == 0) {
   1455 		if (subp->dl_cap == DL_CAPAB_VRRP)
   1456 			ill_capability_vrrp_ack(ill, mp, subp);
   1457 		return;
   1458 	}
   1459 
   1460 	switch (subp->dl_cap) {
   1461 	case DL_CAPAB_HCKSUM:
   1462 		ill_capability_hcksum_ack(ill, mp, subp);
   1463 		break;
   1464 	case DL_CAPAB_ZEROCOPY:
   1465 		ill_capability_zerocopy_ack(ill, mp, subp);
   1466 		break;
   1467 	case DL_CAPAB_DLD:
   1468 		ill_capability_dld_ack(ill, mp, subp);
   1469 		break;
   1470 	case DL_CAPAB_VRRP:
   1471 		break;
   1472 	default:
   1473 		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
   1474 		    subp->dl_cap));
   1475 	}
   1476 }
   1477 
   1478 /*
   1479  * Process the vrrp capability received from a DLS Provider. isub must point
   1480  * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
   1481  */
   1482 static void
   1483 ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1484 {
   1485 	dl_capab_vrrp_t	*vrrp;
   1486 	uint_t		sub_dl_cap = isub->dl_cap;
   1487 	uint8_t		*capend;
   1488 
   1489 	ASSERT(IAM_WRITER_ILL(ill));
   1490 	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);
   1491 
   1492 	/*
   1493 	 * Note: range checks here are not absolutely sufficient to
   1494 	 * make us robust against malformed messages sent by drivers;
   1495 	 * this is in keeping with the rest of IP's dlpi handling.
   1496 	 * (Remember, it's coming from something else in the kernel
   1497 	 * address space)
   1498 	 */
   1499 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1500 	if (capend > mp->b_wptr) {
   1501 		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
   1502 		    "malformed sub-capability too long for mblk");
   1503 		return;
   1504 	}
   1505 	vrrp = (dl_capab_vrrp_t *)(isub + 1);
   1506 
   1507 	/*
   1508 	 * Compare the IP address family and set ILLF_VRRP for the right ill.
   1509 	 */
   1510 	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
   1511 	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
   1512 		ill->ill_flags |= ILLF_VRRP;
   1513 	}
   1514 }
   1515 
   1516 /*
   1517  * Process a hardware checksum offload capability negotiation ack received
   1518  * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
   1519  * of a DL_CAPABILITY_ACK message.
   1520  */
   1521 static void
   1522 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1523 {
   1524 	dl_capability_req_t	*ocap;
   1525 	dl_capab_hcksum_t	*ihck, *ohck;
   1526 	ill_hcksum_capab_t	**ill_hcksum;
   1527 	mblk_t			*nmp = NULL;
   1528 	uint_t			sub_dl_cap = isub->dl_cap;
   1529 	uint8_t			*capend;
   1530 
   1531 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
   1532 
   1533 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
   1534 
   1535 	/*
   1536 	 * Note: range checks here are not absolutely sufficient to
   1537 	 * make us robust against malformed messages sent by drivers;
   1538 	 * this is in keeping with the rest of IP's dlpi handling.
   1539 	 * (Remember, it's coming from something else in the kernel
   1540 	 * address space)
   1541 	 */
   1542 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1543 	if (capend > mp->b_wptr) {
   1544 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1545 		    "malformed sub-capability too long for mblk");
   1546 		return;
   1547 	}
   1548 
   1549 	/*
   1550 	 * There are two types of acks we process here:
   1551 	 * 1. acks in reply to a (first form) generic capability req
   1552 	 *    (no ENABLE flag set)
   1553 	 * 2. acks in reply to a ENABLE capability req.
   1554 	 *    (ENABLE flag set)
   1555 	 */
   1556 	ihck = (dl_capab_hcksum_t *)(isub + 1);
   1557 
   1558 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
   1559 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
   1560 		    "unsupported hardware checksum "
   1561 		    "sub-capability (version %d, expected %d)",
   1562 		    ihck->hcksum_version, HCKSUM_VERSION_1);
   1563 		return;
   1564 	}
   1565 
   1566 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
   1567 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
   1568 		    "checksum capability isn't as expected; pass-thru "
   1569 		    "module(s) detected, discarding capability\n"));
   1570 		return;
   1571 	}
   1572 
   1573 #define	CURR_HCKSUM_CAPAB				\
   1574 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
   1575 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
   1576 
   1577 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
   1578 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
   1579 		/* do ENABLE processing */
   1580 		if (*ill_hcksum == NULL) {
   1581 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
   1582 			    KM_NOSLEEP);
   1583 
   1584 			if (*ill_hcksum == NULL) {
   1585 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1586 				    "could not enable hcksum version %d "
   1587 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
   1588 				    ill->ill_name);
   1589 				return;
   1590 			}
   1591 		}
   1592 
   1593 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
   1594 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
   1595 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
   1596 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
   1597 		    "has enabled hardware checksumming\n ",
   1598 		    ill->ill_name));
   1599 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
   1600 		/*
   1601 		 * Enabling hardware checksum offload
   1602 		 * Currently IP supports {TCP,UDP}/IPv4
   1603 		 * partial and full cksum offload and
   1604 		 * IPv4 header checksum offload.
   1605 		 * Allocate new mblk which will
   1606 		 * contain a new capability request
   1607 		 * to enable hardware checksum offload.
   1608 		 */
   1609 		uint_t	size;
   1610 		uchar_t	*rptr;
   1611 
   1612 		size = sizeof (dl_capability_req_t) +
   1613 		    sizeof (dl_capability_sub_t) + isub->dl_length;
   1614 
   1615 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1616 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1617 			    "could not enable hardware cksum for %s (ENOMEM)\n",
   1618 			    ill->ill_name);
   1619 			return;
   1620 		}
   1621 
   1622 		rptr = nmp->b_rptr;
   1623 		/* initialize dl_capability_req_t */
   1624 		ocap = (dl_capability_req_t *)nmp->b_rptr;
   1625 		ocap->dl_sub_offset =
   1626 		    sizeof (dl_capability_req_t);
   1627 		ocap->dl_sub_length =
   1628 		    sizeof (dl_capability_sub_t) +
   1629 		    isub->dl_length;
   1630 		nmp->b_rptr += sizeof (dl_capability_req_t);
   1631 
   1632 		/* initialize dl_capability_sub_t */
   1633 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
   1634 		nmp->b_rptr += sizeof (*isub);
   1635 
   1636 		/* initialize dl_capab_hcksum_t */
   1637 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
   1638 		bcopy(ihck, ohck, sizeof (*ihck));
   1639 
   1640 		nmp->b_rptr = rptr;
   1641 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
   1642 
   1643 		/* Set ENABLE flag */
   1644 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
   1645 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
   1646 
   1647 		/*
   1648 		 * nmp points to a DL_CAPABILITY_REQ message to enable
   1649 		 * hardware checksum acceleration.
   1650 		 */
   1651 		ill_capability_send(ill, nmp);
   1652 	} else {
   1653 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
   1654 		    "advertised %x hardware checksum capability flags\n",
   1655 		    ill->ill_name, ihck->hcksum_txflags));
   1656 	}
   1657 }
   1658 
   1659 static void
   1660 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
   1661 {
   1662 	dl_capab_hcksum_t *hck_subcap;
   1663 	dl_capability_sub_t *dl_subcap;
   1664 
   1665 	if (!ILL_HCKSUM_CAPABLE(ill))
   1666 		return;
   1667 
   1668 	ASSERT(ill->ill_hcksum_capab != NULL);
   1669 
   1670 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1671 	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
   1672 	dl_subcap->dl_length = sizeof (*hck_subcap);
   1673 
   1674 	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
   1675 	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
   1676 	hck_subcap->hcksum_txflags = 0;
   1677 
   1678 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
   1679 }
   1680 
   1681 static void
   1682 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1683 {
   1684 	mblk_t *nmp = NULL;
   1685 	dl_capability_req_t *oc;
   1686 	dl_capab_zerocopy_t *zc_ic, *zc_oc;
   1687 	ill_zerocopy_capab_t **ill_zerocopy_capab;
   1688 	uint_t sub_dl_cap = isub->dl_cap;
   1689 	uint8_t *capend;
   1690 
   1691 	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
   1692 
   1693 	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
   1694 
   1695 	/*
   1696 	 * Note: range checks here are not absolutely sufficient to
   1697 	 * make us robust against malformed messages sent by drivers;
   1698 	 * this is in keeping with the rest of IP's dlpi handling.
   1699 	 * (Remember, it's coming from something else in the kernel
   1700 	 * address space)
   1701 	 */
   1702 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1703 	if (capend > mp->b_wptr) {
   1704 		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1705 		    "malformed sub-capability too long for mblk");
   1706 		return;
   1707 	}
   1708 
   1709 	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
   1710 	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
   1711 		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
   1712 		    "unsupported ZEROCOPY sub-capability (version %d, "
   1713 		    "expected %d)", zc_ic->zerocopy_version,
   1714 		    ZEROCOPY_VERSION_1);
   1715 		return;
   1716 	}
   1717 
   1718 	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
   1719 		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
   1720 		    "capability isn't as expected; pass-thru module(s) "
   1721 		    "detected, discarding capability\n"));
   1722 		return;
   1723 	}
   1724 
   1725 	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
   1726 		if (*ill_zerocopy_capab == NULL) {
   1727 			*ill_zerocopy_capab =
   1728 			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
   1729 			    KM_NOSLEEP);
   1730 
   1731 			if (*ill_zerocopy_capab == NULL) {
   1732 				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1733 				    "could not enable Zero-copy version %d "
   1734 				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
   1735 				    ill->ill_name);
   1736 				return;
   1737 			}
   1738 		}
   1739 
   1740 		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
   1741 		    "supports Zero-copy version %d\n", ill->ill_name,
   1742 		    ZEROCOPY_VERSION_1));
   1743 
   1744 		(*ill_zerocopy_capab)->ill_zerocopy_version =
   1745 		    zc_ic->zerocopy_version;
   1746 		(*ill_zerocopy_capab)->ill_zerocopy_flags =
   1747 		    zc_ic->zerocopy_flags;
   1748 
   1749 		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
   1750 	} else {
   1751 		uint_t size;
   1752 		uchar_t *rptr;
   1753 
   1754 		size = sizeof (dl_capability_req_t) +
   1755 		    sizeof (dl_capability_sub_t) +
   1756 		    sizeof (dl_capab_zerocopy_t);
   1757 
   1758 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1759 			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1760 			    "could not enable zerocopy for %s (ENOMEM)\n",
   1761 			    ill->ill_name);
   1762 			return;
   1763 		}
   1764 
   1765 		rptr = nmp->b_rptr;
   1766 		/* initialize dl_capability_req_t */
   1767 		oc = (dl_capability_req_t *)rptr;
   1768 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
   1769 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
   1770 		    sizeof (dl_capab_zerocopy_t);
   1771 		rptr += sizeof (dl_capability_req_t);
   1772 
   1773 		/* initialize dl_capability_sub_t */
   1774 		bcopy(isub, rptr, sizeof (*isub));
   1775 		rptr += sizeof (*isub);
   1776 
   1777 		/* initialize dl_capab_zerocopy_t */
   1778 		zc_oc = (dl_capab_zerocopy_t *)rptr;
   1779 		*zc_oc = *zc_ic;
   1780 
   1781 		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
   1782 		    "to enable zero-copy version %d\n", ill->ill_name,
   1783 		    ZEROCOPY_VERSION_1));
   1784 
   1785 		/* set VMSAFE_MEM flag */
   1786 		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
   1787 
   1788 		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
   1789 		ill_capability_send(ill, nmp);
   1790 	}
   1791 }
   1792 
   1793 static void
   1794 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
   1795 {
   1796 	dl_capab_zerocopy_t *zerocopy_subcap;
   1797 	dl_capability_sub_t *dl_subcap;
   1798 
   1799 	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
   1800 		return;
   1801 
   1802 	ASSERT(ill->ill_zerocopy_capab != NULL);
   1803 
   1804 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1805 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
   1806 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
   1807 
   1808 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
   1809 	zerocopy_subcap->zerocopy_version =
   1810 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
   1811 	zerocopy_subcap->zerocopy_flags = 0;
   1812 
   1813 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
   1814 }
   1815 
   1816 /*
   1817  * DLD capability
   1818  * Refer to dld.h for more information regarding the purpose and usage
   1819  * of this capability.
   1820  */
   1821 static void
   1822 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1823 {
   1824 	dl_capab_dld_t		*dld_ic, dld;
   1825 	uint_t			sub_dl_cap = isub->dl_cap;
   1826 	uint8_t			*capend;
   1827 	ill_dld_capab_t		*idc;
   1828 
   1829 	ASSERT(IAM_WRITER_ILL(ill));
   1830 	ASSERT(sub_dl_cap == DL_CAPAB_DLD);
   1831 
   1832 	/*
   1833 	 * Note: range checks here are not absolutely sufficient to
   1834 	 * make us robust against malformed messages sent by drivers;
   1835 	 * this is in keeping with the rest of IP's dlpi handling.
   1836 	 * (Remember, it's coming from something else in the kernel
   1837 	 * address space)
   1838 	 */
   1839 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1840 	if (capend > mp->b_wptr) {
   1841 		cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1842 		    "malformed sub-capability too long for mblk");
   1843 		return;
   1844 	}
   1845 	dld_ic = (dl_capab_dld_t *)(isub + 1);
   1846 	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
   1847 		cmn_err(CE_CONT, "ill_capability_dld_ack: "
   1848 		    "unsupported DLD sub-capability (version %d, "
   1849 		    "expected %d)", dld_ic->dld_version,
   1850 		    DLD_CURRENT_VERSION);
   1851 		return;
   1852 	}
   1853 	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
   1854 		ip1dbg(("ill_capability_dld_ack: mid token for dld "
   1855 		    "capability isn't as expected; pass-thru module(s) "
   1856 		    "detected, discarding capability\n"));
   1857 		return;
   1858 	}
   1859 
   1860 	/*
   1861 	 * Copy locally to ensure alignment.
   1862 	 */
   1863 	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
   1864 
   1865 	if ((idc = ill->ill_dld_capab) == NULL) {
   1866 		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
   1867 		if (idc == NULL) {
   1868 			cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1869 			    "could not enable DLD version %d "
   1870 			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
   1871 			    ill->ill_name);
   1872 			return;
   1873 		}
   1874 		ill->ill_dld_capab = idc;
   1875 	}
   1876 	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
   1877 	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
   1878 	ip1dbg(("ill_capability_dld_ack: interface %s "
   1879 	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
   1880 
   1881 	ill_capability_dld_enable(ill);
   1882 }
   1883 
   1884 /*
   1885  * Typically capability negotiation between IP and the driver happens via
   1886  * DLPI message exchange. However GLD also offers a direct function call
   1887  * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
   1888  * But arbitrary function calls into IP or GLD are not permitted, since both
   1889  * of them are protected by their own perimeter mechanism. The perimeter can
   1890  * be viewed as a coarse lock or serialization mechanism. The hierarchy of
   1891  * these perimeters is IP -> MAC. Thus for example to enable the squeue
   1892  * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
   1893  * to enter the mac perimeter and then do the direct function calls into
   1894  * GLD to enable squeue polling. The ring related callbacks from the mac into
   1895  * the stack to add, bind, quiesce, restart or cleanup a ring are all
   1896  * protected by the mac perimeter.
   1897  */
   1898 static void
   1899 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
   1900 {
   1901 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1902 	int			err;
   1903 
   1904 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
   1905 	    DLD_ENABLE);
   1906 	ASSERT(err == 0);
   1907 }
   1908 
   1909 static void
   1910 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
   1911 {
   1912 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1913 	int			err;
   1914 
   1915 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
   1916 	    DLD_DISABLE);
   1917 	ASSERT(err == 0);
   1918 }
   1919 
   1920 boolean_t
   1921 ill_mac_perim_held(ill_t *ill)
   1922 {
   1923 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1924 
   1925 	return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
   1926 	    DLD_QUERY));
   1927 }
   1928 
   1929 static void
   1930 ill_capability_direct_enable(ill_t *ill)
   1931 {
   1932 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1933 	ill_dld_direct_t	*idd = &idc->idc_direct;
   1934 	dld_capab_direct_t	direct;
   1935 	int			rc;
   1936 
   1937 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   1938 
   1939 	bzero(&direct, sizeof (direct));
   1940 	direct.di_rx_cf = (uintptr_t)ip_input;
   1941 	direct.di_rx_ch = ill;
   1942 
   1943 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
   1944 	    DLD_ENABLE);
   1945 	if (rc == 0) {
   1946 		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
   1947 		idd->idd_tx_dh = direct.di_tx_dh;
   1948 		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
   1949 		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
   1950 		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
   1951 		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
   1952 		ASSERT(idd->idd_tx_cb_df != NULL);
   1953 		ASSERT(idd->idd_tx_fctl_df != NULL);
   1954 		ASSERT(idd->idd_tx_df != NULL);
   1955 		/*
   1956 		 * One time registration of flow enable callback function
   1957 		 */
   1958 		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
   1959 		    ill_flow_enable, ill);
   1960 		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
   1961 		DTRACE_PROBE1(direct_on, (ill_t *), ill);
   1962 	} else {
   1963 		cmn_err(CE_WARN, "warning: could not enable DIRECT "
   1964 		    "capability, rc = %d\n", rc);
   1965 		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
   1966 	}
   1967 }
   1968 
   1969 static void
   1970 ill_capability_poll_enable(ill_t *ill)
   1971 {
   1972 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1973 	dld_capab_poll_t	poll;
   1974 	int			rc;
   1975 
   1976 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   1977 
   1978 	bzero(&poll, sizeof (poll));
   1979 	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
   1980 	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
   1981 	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
   1982 	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
   1983 	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
   1984 	poll.poll_ring_ch = ill;
   1985 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
   1986 	    DLD_ENABLE);
   1987 	if (rc == 0) {
   1988 		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
   1989 		DTRACE_PROBE1(poll_on, (ill_t *), ill);
   1990 	} else {
   1991 		ip1dbg(("warning: could not enable POLL "
   1992 		    "capability, rc = %d\n", rc));
   1993 		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
   1994 	}
   1995 }
   1996 
   1997 /*
   1998  * Enable the LSO capability.
   1999  */
   2000 static void
   2001 ill_capability_lso_enable(ill_t *ill)
   2002 {
   2003 	ill_dld_capab_t	*idc = ill->ill_dld_capab;
   2004 	dld_capab_lso_t	lso;
   2005 	int rc;
   2006 
   2007 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   2008 
   2009 	if (ill->ill_lso_capab == NULL) {
   2010 		ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
   2011 		    KM_NOSLEEP);
   2012 		if (ill->ill_lso_capab == NULL) {
   2013 			cmn_err(CE_WARN, "ill_capability_lso_enable: "
   2014 			    "could not enable LSO for %s (ENOMEM)\n",
   2015 			    ill->ill_name);
   2016 			return;
   2017 		}
   2018 	}
   2019 
   2020 	bzero(&lso, sizeof (lso));
   2021 	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
   2022 	    DLD_ENABLE)) == 0) {
   2023 		ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
   2024 		ill->ill_lso_capab->ill_lso_max = lso.lso_max;
   2025 		ill->ill_capabilities |= ILL_CAPAB_LSO;
   2026 		ip1dbg(("ill_capability_lso_enable: interface %s "
   2027 		    "has enabled LSO\n ", ill->ill_name));
   2028 	} else {
   2029 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
   2030 		ill->ill_lso_capab = NULL;
   2031 		DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
   2032 	}
   2033 }
   2034 
   2035 static void
   2036 ill_capability_dld_enable(ill_t *ill)
   2037 {
   2038 	mac_perim_handle_t mph;
   2039 
   2040 	ASSERT(IAM_WRITER_ILL(ill));
   2041 
   2042 	if (ill->ill_isv6)
   2043 		return;
   2044 
   2045 	ill_mac_perim_enter(ill, &mph);
   2046 	if (!ill->ill_isv6) {
   2047 		ill_capability_direct_enable(ill);
   2048 		ill_capability_poll_enable(ill);
   2049 		ill_capability_lso_enable(ill);
   2050 	}
   2051 	ill->ill_capabilities |= ILL_CAPAB_DLD;
   2052 	ill_mac_perim_exit(ill, mph);
   2053 }
   2054 
   2055 static void
   2056 ill_capability_dld_disable(ill_t *ill)
   2057 {
   2058 	ill_dld_capab_t	*idc;
   2059 	ill_dld_direct_t *idd;
   2060 	mac_perim_handle_t	mph;
   2061 
   2062 	ASSERT(IAM_WRITER_ILL(ill));
   2063 
   2064 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   2065 		return;
   2066 
   2067 	ill_mac_perim_enter(ill, &mph);
   2068 
   2069 	idc = ill->ill_dld_capab;
   2070 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
   2071 		/*
   2072 		 * For performance we avoid locks in the transmit data path
   2073 		 * and don't maintain a count of the number of threads using
   2074 		 * direct calls. Thus some threads could be using direct
   2075 		 * transmit calls to GLD, even after the capability mechanism
   2076 		 * turns it off. This is still safe since the handles used in
   2077 		 * the direct calls continue to be valid until the unplumb is
   2078 		 * completed. Remove the callback that was added (1-time) at
   2079 		 * capab enable time.
   2080 		 */
   2081 		mutex_enter(&ill->ill_lock);
   2082 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
   2083 		mutex_exit(&ill->ill_lock);
   2084 		if (ill->ill_flownotify_mh != NULL) {
   2085 			idd = &idc->idc_direct;
   2086 			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
   2087 			    ill->ill_flownotify_mh);
   2088 			ill->ill_flownotify_mh = NULL;
   2089 		}
   2090 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
   2091 		    NULL, DLD_DISABLE);
   2092 	}
   2093 
   2094 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
   2095 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
   2096 		ip_squeue_clean_all(ill);
   2097 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
   2098 		    NULL, DLD_DISABLE);
   2099 	}
   2100 
   2101 	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
   2102 		ASSERT(ill->ill_lso_capab != NULL);
   2103 		/*
   2104 		 * Clear the capability flag for LSO but retain the
   2105 		 * ill_lso_capab structure since it's possible that another
   2106 		 * thread is still referring to it.  The structure only gets
   2107 		 * deallocated when we destroy the ill.
   2108 		 */
   2109 
   2110 		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
   2111 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
   2112 		    NULL, DLD_DISABLE);
   2113 	}
   2114 
   2115 	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
   2116 	ill_mac_perim_exit(ill, mph);
   2117 }
   2118 
   2119 /*
   2120  * Capability Negotiation protocol
   2121  *
   2122  * We don't wait for DLPI capability operations to finish during interface
   2123  * bringup or teardown. Doing so would introduce more asynchrony and the
   2124  * interface up/down operations will need multiple return and restarts.
   2125  * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
   2126  * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
   2127  * exclusive operation won't start until the DLPI operations of the previous
   2128  * exclusive operation complete.
   2129  *
   2130  * The capability state machine is shown below.
   2131  *
   2132  * state		next state		event, action
   2133  *
   2134  * IDCS_UNKNOWN 	IDCS_PROBE_SENT		ill_capability_probe
   2135  * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
   2136  * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
   2137  * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
   2138  * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
   2139  * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
   2140  * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
   2141  *						    ill_capability_probe.
   2142  */
   2143 
   2144 /*
   2145  * Dedicated thread started from ip_stack_init that handles capability
   2146  * disable. This thread ensures the taskq dispatch does not fail by waiting
   2147  * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
   2148  * that direct calls to DLD are done in a cv_waitable context.
   2149  */
   2150 void
   2151 ill_taskq_dispatch(ip_stack_t *ipst)
   2152 {
   2153 	callb_cpr_t cprinfo;
   2154 	char 	name[64];
   2155 	mblk_t	*mp;
   2156 
   2157 	(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
   2158 	    ipst->ips_netstack->netstack_stackid);
   2159 	CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
   2160 	    name);
   2161 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2162 
   2163 	for (;;) {
   2164 		mp = ipst->ips_capab_taskq_head;
   2165 		while (mp != NULL) {
   2166 			ipst->ips_capab_taskq_head = mp->b_next;
   2167 			if (ipst->ips_capab_taskq_head == NULL)
   2168 				ipst->ips_capab_taskq_tail = NULL;
   2169 			mutex_exit(&ipst->ips_capab_taskq_lock);
   2170 			mp->b_next = NULL;
   2171 
   2172 			VERIFY(taskq_dispatch(system_taskq,
   2173 			    ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
   2174 			mutex_enter(&ipst->ips_capab_taskq_lock);
   2175 			mp = ipst->ips_capab_taskq_head;
   2176 		}
   2177 
   2178 		if (ipst->ips_capab_taskq_quit)
   2179 			break;
   2180 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2181 		cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
   2182 		CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
   2183 	}
   2184 	VERIFY(ipst->ips_capab_taskq_head == NULL);
   2185 	VERIFY(ipst->ips_capab_taskq_tail == NULL);
   2186 	CALLB_CPR_EXIT(&cprinfo);
   2187 	thread_exit();
   2188 }
   2189 
   2190 /*
   2191  * Consume a new-style hardware capabilities negotiation ack.
   2192  * Called via taskq on receipt of DL_CAPABILITY_ACK.
   2193  */
   2194 static void
   2195 ill_capability_ack_thr(void *arg)
   2196 {
   2197 	mblk_t	*mp = arg;
   2198 	dl_capability_ack_t *capp;
   2199 	dl_capability_sub_t *subp, *endp;
   2200 	ill_t	*ill;
   2201 	boolean_t reneg;
   2202 
   2203 	ill = (ill_t *)mp->b_prev;
   2204 	mp->b_prev = NULL;
   2205 
   2206 	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
   2207 
   2208 	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
   2209 	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
   2210 		/*
   2211 		 * We have received the ack for our DL_CAPAB reset request.
   2212 		 * There isnt' anything in the message that needs processing.
   2213 		 * All message based capabilities have been disabled, now
   2214 		 * do the function call based capability disable.
   2215 		 */
   2216 		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
   2217 		ill_capability_dld_disable(ill);
   2218 		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
   2219 		if (reneg)
   2220 			ill_capability_probe(ill);
   2221 		goto done;
   2222 	}
   2223 
   2224 	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
   2225 		ill->ill_dlpi_capab_state = IDCS_OK;
   2226 
   2227 	capp = (dl_capability_ack_t *)mp->b_rptr;
   2228 
   2229 	if (capp->dl_sub_length == 0) {
   2230 		/* no new-style capabilities */
   2231 		goto done;
   2232 	}
   2233 
   2234 	/* make sure the driver supplied correct dl_sub_length */
   2235 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
   2236 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
   2237 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
   2238 		goto done;
   2239 	}
   2240 
   2241 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
   2242 	/*
   2243 	 * There are sub-capabilities. Process the ones we know about.
   2244 	 * Loop until we don't have room for another sub-cap header..
   2245 	 */
   2246 	for (subp = SC(capp, capp->dl_sub_offset),
   2247 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
   2248 	    subp <= endp;
   2249 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
   2250 
   2251 		switch (subp->dl_cap) {
   2252 		case DL_CAPAB_ID_WRAPPER:
   2253 			ill_capability_id_ack(ill, mp, subp);
   2254 			break;
   2255 		default:
   2256 			ill_capability_dispatch(ill, mp, subp);
   2257 			break;
   2258 		}
   2259 	}
   2260 #undef SC
   2261 done:
   2262 	inet_freemsg(mp);
   2263 	ill_capability_done(ill);
   2264 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   2265 }
   2266 
   2267 /*
   2268  * This needs to be started in a taskq thread to provide a cv_waitable
   2269  * context.
   2270  */
   2271 void
   2272 ill_capability_ack(ill_t *ill, mblk_t *mp)
   2273 {
   2274 	ip_stack_t	*ipst = ill->ill_ipst;
   2275 
   2276 	mp->b_prev = (mblk_t *)ill;
   2277 	ASSERT(mp->b_next == NULL);
   2278 
   2279 	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
   2280 	    TQ_NOSLEEP) != 0)
   2281 		return;
   2282 
   2283 	/*
   2284 	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
   2285 	 * which will do the dispatch using TQ_SLEEP to guarantee success.
   2286 	 */
   2287 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2288 	if (ipst->ips_capab_taskq_head == NULL) {
   2289 		ASSERT(ipst->ips_capab_taskq_tail == NULL);
   2290 		ipst->ips_capab_taskq_head = mp;
   2291 	} else {
   2292 		ipst->ips_capab_taskq_tail->b_next = mp;
   2293 	}
   2294 	ipst->ips_capab_taskq_tail = mp;
   2295 
   2296 	cv_signal(&ipst->ips_capab_taskq_cv);
   2297 	mutex_exit(&ipst->ips_capab_taskq_lock);
   2298 }
   2299 
   2300 /*
   2301  * This routine is called to scan the fragmentation reassembly table for
   2302  * the specified ILL for any packets that are starting to smell.
   2303  * dead_interval is the maximum time in seconds that will be tolerated.  It
   2304  * will either be the value specified in ip_g_frag_timeout, or zero if the
   2305  * ILL is shutting down and it is time to blow everything off.
   2306  *
   2307  * It returns the number of seconds (as a time_t) that the next frag timer
   2308  * should be scheduled for, 0 meaning that the timer doesn't need to be
   2309  * re-started.  Note that the method of calculating next_timeout isn't
   2310  * entirely accurate since time will flow between the time we grab
   2311  * current_time and the time we schedule the next timeout.  This isn't a
   2312  * big problem since this is the timer for sending an ICMP reassembly time
   2313  * exceeded messages, and it doesn't have to be exactly accurate.
   2314  *
   2315  * This function is
   2316  * sometimes called as writer, although this is not required.
   2317  */
   2318 time_t
   2319 ill_frag_timeout(ill_t *ill, time_t dead_interval)
   2320 {
   2321 	ipfb_t	*ipfb;
   2322 	ipfb_t	*endp;
   2323 	ipf_t	*ipf;
   2324 	ipf_t	*ipfnext;
   2325 	mblk_t	*mp;
   2326 	time_t	current_time = gethrestime_sec();
   2327 	time_t	next_timeout = 0;
   2328 	uint32_t	hdr_length;
   2329 	mblk_t	*send_icmp_head;
   2330 	mblk_t	*send_icmp_head_v6;
   2331 	ip_stack_t *ipst = ill->ill_ipst;
   2332 	ip_recv_attr_t iras;
   2333 
   2334 	bzero(&iras, sizeof (iras));
   2335 	iras.ira_flags = 0;
   2336 	iras.ira_ill = iras.ira_rill = ill;
   2337 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   2338 	iras.ira_rifindex = iras.ira_ruifindex;
   2339 
   2340 	ipfb = ill->ill_frag_hash_tbl;
   2341 	if (ipfb == NULL)
   2342 		return (B_FALSE);
   2343 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
   2344 	/* Walk the frag hash table. */
   2345 	for (; ipfb < endp; ipfb++) {
   2346 		send_icmp_head = NULL;
   2347 		send_icmp_head_v6 = NULL;
   2348 		mutex_enter(&ipfb->ipfb_lock);
   2349 		while ((ipf = ipfb->ipfb_ipf) != 0) {
   2350 			time_t frag_time = current_time - ipf->ipf_timestamp;
   2351 			time_t frag_timeout;
   2352 
   2353 			if (frag_time < dead_interval) {
   2354 				/*
   2355 				 * There are some outstanding fragments
   2356 				 * that will timeout later.  Make note of
   2357 				 * the time so that we can reschedule the
   2358 				 * next timeout appropriately.
   2359 				 */
   2360 				frag_timeout = dead_interval - frag_time;
   2361 				if (next_timeout == 0 ||
   2362 				    frag_timeout < next_timeout) {
   2363 					next_timeout = frag_timeout;
   2364 				}
   2365 				break;
   2366 			}
   2367 			/* Time's up.  Get it out of here. */
   2368 			hdr_length = ipf->ipf_nf_hdr_len;
   2369 			ipfnext = ipf->ipf_hash_next;
   2370 			if (ipfnext)
   2371 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
   2372 			*ipf->ipf_ptphn = ipfnext;
   2373 			mp = ipf->ipf_mp->b_cont;
   2374 			for (; mp; mp = mp->b_cont) {
   2375 				/* Extra points for neatness. */
   2376 				IP_REASS_SET_START(mp, 0);
   2377 				IP_REASS_SET_END(mp, 0);
   2378 			}
   2379 			mp = ipf->ipf_mp->b_cont;
   2380 			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
   2381 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
   2382 			ipfb->ipfb_count -= ipf->ipf_count;
   2383 			ASSERT(ipfb->ipfb_frag_pkts > 0);
   2384 			ipfb->ipfb_frag_pkts--;
   2385 			/*
   2386 			 * We do not send any icmp message from here because
   2387 			 * we currently are holding the ipfb_lock for this
   2388 			 * hash chain. If we try and send any icmp messages
   2389 			 * from here we may end up via a put back into ip
   2390 			 * trying to get the same lock, causing a recursive
   2391 			 * mutex panic. Instead we build a list and send all
   2392 			 * the icmp messages after we have dropped the lock.
   2393 			 */
   2394 			if (ill->ill_isv6) {
   2395 				if (hdr_length != 0) {
   2396 					mp->b_next = send_icmp_head_v6;
   2397 					send_icmp_head_v6 = mp;
   2398 				} else {
   2399 					freemsg(mp);
   2400 				}
   2401 			} else {
   2402 				if (hdr_length != 0) {
   2403 					mp->b_next = send_icmp_head;
   2404 					send_icmp_head = mp;
   2405 				} else {
   2406 					freemsg(mp);
   2407 				}
   2408 			}
   2409 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2410 			ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
   2411 			freeb(ipf->ipf_mp);
   2412 		}
   2413 		mutex_exit(&ipfb->ipfb_lock);
   2414 		/*
   2415 		 * Now need to send any icmp messages that we delayed from
   2416 		 * above.
   2417 		 */
   2418 		while (send_icmp_head_v6 != NULL) {
   2419 			ip6_t *ip6h;
   2420 
   2421 			mp = send_icmp_head_v6;
   2422 			send_icmp_head_v6 = send_icmp_head_v6->b_next;
   2423 			mp->b_next = NULL;
   2424 			ip6h = (ip6_t *)mp->b_rptr;
   2425 			iras.ira_flags = 0;
   2426 			/*
   2427 			 * This will result in an incorrect ALL_ZONES zoneid
   2428 			 * for multicast packets, but we
   2429 			 * don't send ICMP errors for those in any case.
   2430 			 */
   2431 			iras.ira_zoneid =
   2432 			    ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
   2433 			    ill, ipst);
   2434 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2435 			icmp_time_exceeded_v6(mp,
   2436 			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
   2437 			    &iras);
   2438 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2439 		}
   2440 		while (send_icmp_head != NULL) {
   2441 			ipaddr_t dst;
   2442 
   2443 			mp = send_icmp_head;
   2444 			send_icmp_head = send_icmp_head->b_next;
   2445 			mp->b_next = NULL;
   2446 
   2447 			dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
   2448 
   2449 			iras.ira_flags = IRAF_IS_IPV4;
   2450 			/*
   2451 			 * This will result in an incorrect ALL_ZONES zoneid
   2452 			 * for broadcast and multicast packets, but we
   2453 			 * don't send ICMP errors for those in any case.
   2454 			 */
   2455 			iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
   2456 			    ill, ipst);
   2457 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2458 			icmp_time_exceeded(mp,
   2459 			    ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
   2460 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2461 		}
   2462 	}
   2463 	/*
   2464 	 * A non-dying ILL will use the return value to decide whether to
   2465 	 * restart the frag timer, and for how long.
   2466 	 */
   2467 	return (next_timeout);
   2468 }
   2469 
   2470 /*
   2471  * This routine is called when the approximate count of mblk memory used
   2472  * for the specified ILL has exceeded max_count.
   2473  */
   2474 void
   2475 ill_frag_prune(ill_t *ill, uint_t max_count)
   2476 {
   2477 	ipfb_t	*ipfb;
   2478 	ipf_t	*ipf;
   2479 	size_t	count;
   2480 	clock_t now;
   2481 
   2482 	/*
   2483 	 * If we are here within ip_min_frag_prune_time msecs remove
   2484 	 * ill_frag_free_num_pkts oldest packets from each bucket and increment
   2485 	 * ill_frag_free_num_pkts.
   2486 	 */
   2487 	mutex_enter(&ill->ill_lock);
   2488 	now = ddi_get_lbolt();
   2489 	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
   2490 	    (ip_min_frag_prune_time != 0 ?
   2491 	    ip_min_frag_prune_time : msec_per_tick)) {
   2492 
   2493 		ill->ill_frag_free_num_pkts++;
   2494 
   2495 	} else {
   2496 		ill->ill_frag_free_num_pkts = 0;
   2497 	}
   2498 	ill->ill_last_frag_clean_time = now;
   2499 	mutex_exit(&ill->ill_lock);
   2500 
   2501 	/*
   2502 	 * free ill_frag_free_num_pkts oldest packets from each bucket.
   2503 	 */
   2504 	if (ill->ill_frag_free_num_pkts != 0) {
   2505 		int ix;
   2506 
   2507 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2508 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2509 			mutex_enter(&ipfb->ipfb_lock);
   2510 			if (ipfb->ipfb_ipf != NULL) {
   2511 				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
   2512 				    ill->ill_frag_free_num_pkts);
   2513 			}
   2514 			mutex_exit(&ipfb->ipfb_lock);
   2515 		}
   2516 	}
   2517 	/*
   2518 	 * While the reassembly list for this ILL is too big, prune a fragment
   2519 	 * queue by age, oldest first.
   2520 	 */
   2521 	while (ill->ill_frag_count > max_count) {
   2522 		int	ix;
   2523 		ipfb_t	*oipfb = NULL;
   2524 		uint_t	oldest = UINT_MAX;
   2525 
   2526 		count = 0;
   2527 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2528 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2529 			mutex_enter(&ipfb->ipfb_lock);
   2530 			ipf = ipfb->ipfb_ipf;
   2531 			if (ipf != NULL && ipf->ipf_gen < oldest) {
   2532 				oldest = ipf->ipf_gen;
   2533 				oipfb = ipfb;
   2534 			}
   2535 			count += ipfb->ipfb_count;
   2536 			mutex_exit(&ipfb->ipfb_lock);
   2537 		}
   2538 		if (oipfb == NULL)
   2539 			break;
   2540 
   2541 		if (count <= max_count)
   2542 			return;	/* Somebody beat us to it, nothing to do */
   2543 		mutex_enter(&oipfb->ipfb_lock);
   2544 		ipf = oipfb->ipfb_ipf;
   2545 		if (ipf != NULL) {
   2546 			ill_frag_free_pkts(ill, oipfb, ipf, 1);
   2547 		}
   2548 		mutex_exit(&oipfb->ipfb_lock);
   2549 	}
   2550 }
   2551 
   2552 /*
   2553  * free 'free_cnt' fragmented packets starting at ipf.
   2554  */
   2555 void
   2556 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
   2557 {
   2558 	size_t	count;
   2559 	mblk_t	*mp;
   2560 	mblk_t	*tmp;
   2561 	ipf_t **ipfp = ipf->ipf_ptphn;
   2562 
   2563 	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
   2564 	ASSERT(ipfp != NULL);
   2565 	ASSERT(ipf != NULL);
   2566 
   2567 	while (ipf != NULL && free_cnt-- > 0) {
   2568 		count = ipf->ipf_count;
   2569 		mp = ipf->ipf_mp;
   2570 		ipf = ipf->ipf_hash_next;
   2571 		for (tmp = mp; tmp; tmp = tmp->b_cont) {
   2572 			IP_REASS_SET_START(tmp, 0);
   2573 			IP_REASS_SET_END(tmp, 0);
   2574 		}
   2575 		atomic_add_32(&ill->ill_frag_count, -count);
   2576 		ASSERT(ipfb->ipfb_count >= count);
   2577 		ipfb->ipfb_count -= count;
   2578 		ASSERT(ipfb->ipfb_frag_pkts > 0);
   2579 		ipfb->ipfb_frag_pkts--;
   2580 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2581 		ip_drop_input("ipIfStatsReasmFails", mp, ill);
   2582 		freemsg(mp);
   2583 	}
   2584 
   2585 	if (ipf)
   2586 		ipf->ipf_ptphn = ipfp;
   2587 	ipfp[0] = ipf;
   2588 }
   2589 
/*
 * Warning text logged by nd_ill_forward_get() and nd_ill_forward_set()
 * whenever the obsolete per-interface forwarding ndd variables are used.
 */
#define	ND_FORWARD_WARNING	"The <if>:ip*_forwarding ndd variables are " \
	"obsolete and may be removed in a future release of Solaris.  Use " \
	"ifconfig(1M) to manipulate the forwarding status of an interface."
   2593 
   2594 /*
   2595  * For obsolete per-interface forwarding configuration;
   2596  * called in response to ND_GET.
   2597  */
   2598 /* ARGSUSED */
   2599 static int
   2600 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   2601 {
   2602 	ill_t *ill = (ill_t *)cp;
   2603 
   2604 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2605 
   2606 	(void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0);
   2607 	return (0);
   2608 }
   2609 
   2610 /*
   2611  * For obsolete per-interface forwarding configuration;
   2612  * called in response to ND_SET.
   2613  */
   2614 /* ARGSUSED */
   2615 static int
   2616 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
   2617     cred_t *ioc_cr)
   2618 {
   2619 	long value;
   2620 	int retval;
   2621 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   2622 
   2623 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2624 
   2625 	if (ddi_strtol(valuestr, NULL, 10, &value) != 0 ||
   2626 	    value < 0 || value > 1) {
   2627 		return (EINVAL);
   2628 	}
   2629 
   2630 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   2631 	retval = ill_forward_set((ill_t *)cp, (value != 0));
   2632 	rw_exit(&ipst->ips_ill_g_lock);
   2633 	return (retval);
   2634 }
   2635 
   2636 /*
   2637  * Helper function for ill_forward_set().
   2638  */
   2639 static void
   2640 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
   2641 {
   2642 	ip_stack_t	*ipst = ill->ill_ipst;
   2643 
   2644 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2645 
   2646 	ip1dbg(("ill_forward_set: %s %s forwarding on %s",
   2647 	    (enable ? "Enabling" : "Disabling"),
   2648 	    (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
   2649 	mutex_enter(&ill->ill_lock);
   2650 	if (enable)
   2651 		ill->ill_flags |= ILLF_ROUTER;
   2652 	else
   2653 		ill->ill_flags &= ~ILLF_ROUTER;
   2654 	mutex_exit(&ill->ill_lock);
   2655 	if (ill->ill_isv6)
   2656 		ill_set_nce_router_flags(ill, enable);
   2657 	/* Notify routing socket listeners of this change. */
   2658 	if (ill->ill_ipif != NULL)
   2659 		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
   2660 }
   2661 
   2662 /*
   2663  * Set an ill's ILLF_ROUTER flag appropriately.  Send up RTS_IFINFO routing
   2664  * socket messages for each interface whose flags we change.
   2665  */
   2666 int
   2667 ill_forward_set(ill_t *ill, boolean_t enable)
   2668 {
   2669 	ipmp_illgrp_t *illg;
   2670 	ip_stack_t *ipst = ill->ill_ipst;
   2671 
   2672 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2673 
   2674 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
   2675 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)))
   2676 		return (0);
   2677 
   2678 	if (IS_LOOPBACK(ill))
   2679 		return (EINVAL);
   2680 
   2681 	if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
   2682 		/*
   2683 		 * Update all of the interfaces in the group.
   2684 		 */
   2685 		illg = ill->ill_grp;
   2686 		ill = list_head(&illg->ig_if);
   2687 		for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
   2688 			ill_forward_set_on_ill(ill, enable);
   2689 
   2690 		/*
   2691 		 * Update the IPMP meta-interface.
   2692 		 */
   2693 		ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
   2694 		return (0);
   2695 	}
   2696 
   2697 	ill_forward_set_on_ill(ill, enable);
   2698 	return (0);
   2699 }
   2700 
   2701 /*
   2702  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
   2703  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
   2704  * set or clear.
   2705  */
   2706 static void
   2707 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
   2708 {
   2709 	ipif_t *ipif;
   2710 	ncec_t *ncec;
   2711 	nce_t *nce;
   2712 
   2713 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   2714 		/*
   2715 		 * NOTE: we match across the illgrp because nce's for
   2716 		 * addresses on IPMP interfaces have an nce_ill that points to
   2717 		 * the bound underlying ill.
   2718 		 */
   2719 		nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
   2720 		if (nce != NULL) {
   2721 			ncec = nce->nce_common;
   2722 			mutex_enter(&ncec->ncec_lock);
   2723 			if (enable)
   2724 				ncec->ncec_flags |= NCE_F_ISROUTER;
   2725 			else
   2726 				ncec->ncec_flags &= ~NCE_F_ISROUTER;
   2727 			mutex_exit(&ncec->ncec_lock);
   2728 			nce_refrele(nce);
   2729 		}
   2730 	}
   2731 }
   2732 
   2733 /*
   2734  * Given an ill with a _valid_ name, add the ip_forwarding ndd variable
   2735  * for this ill.  Make sure the v6/v4 question has been answered about this
   2736  * ill.  The creation of this ndd variable is only for backwards compatibility.
   2737  * The preferred way to control per-interface IP forwarding is through the
   2738  * ILLF_ROUTER interface flag.
   2739  */
   2740 static int
   2741 ill_set_ndd_name(ill_t *ill)
   2742 {
   2743 	char *suffix;
   2744 	ip_stack_t	*ipst = ill->ill_ipst;
   2745 
   2746 	ASSERT(IAM_WRITER_ILL(ill));
   2747 
   2748 	if (ill->ill_isv6)
   2749 		suffix = ipv6_forward_suffix;
   2750 	else
   2751 		suffix = ipv4_forward_suffix;
   2752 
   2753 	ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
   2754 	bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
   2755 	/*
   2756 	 * Copies over the '\0'.
   2757 	 * Note that strlen(suffix) is always bounded.
   2758 	 */
   2759 	bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
   2760 	    strlen(suffix) + 1);
   2761 
   2762 	/*
   2763 	 * Use of the nd table requires holding the reader lock.
   2764 	 * Modifying the nd table thru nd_load/nd_unload requires
   2765 	 * the writer lock.
   2766 	 */
   2767 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
   2768 	if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
   2769 	    nd_ill_forward_set, (caddr_t)ill)) {
   2770 		/*
   2771 		 * If the nd_load failed, it only meant that it could not
   2772 		 * allocate a new bunch of room for further NDD expansion.
   2773 		 * Because of that, the ill_ndd_name will be set to 0, and
   2774 		 * this interface is at the mercy of the global ip_forwarding
   2775 		 * variable.
   2776 		 */
   2777 		rw_exit(&ipst->ips_ip_g_nd_lock);
   2778 		ill->ill_ndd_name = NULL;
   2779 		return (ENOMEM);
   2780 	}
   2781 	rw_exit(&ipst->ips_ip_g_nd_lock);
   2782 	return (0);
   2783 }
   2784 
   2785 /*
   2786  * Intializes the context structure and returns the first ill in the list
   2787  * cuurently start_list and end_list can have values:
   2788  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
   2789  * IP_V4_G_HEAD		Traverse IPV4 list only.
   2790  * IP_V6_G_HEAD		Traverse IPV6 list only.
   2791  */
   2792 
   2793 /*
   2794  * We don't check for CONDEMNED ills here. Caller must do that if
   2795  * necessary under the ill lock.
   2796  */
   2797 ill_t *
   2798 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
   2799     ip_stack_t *ipst)
   2800 {
   2801 	ill_if_t *ifp;
   2802 	ill_t *ill;
   2803 	avl_tree_t *avl_tree;
   2804 
   2805 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   2806 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
   2807 
   2808 	/*
   2809 	 * setup the lists to search
   2810 	 */
   2811 	if (end_list != MAX_G_HEADS) {
   2812 		ctx->ctx_current_list = start_list;
   2813 		ctx->ctx_last_list = end_list;
   2814 	} else {
   2815 		ctx->ctx_last_list = MAX_G_HEADS - 1;
   2816 		ctx->ctx_current_list = 0;
   2817 	}
   2818 
   2819 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
   2820 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2821 		if (ifp != (ill_if_t *)
   2822 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2823 			avl_tree = &ifp->illif_avl_by_ppa;
   2824 			ill = avl_first(avl_tree);
   2825 			/*
   2826 			 * ill is guaranteed to be non NULL or ifp should have
   2827 			 * not existed.
   2828 			 */
   2829 			ASSERT(ill != NULL);
   2830 			return (ill);
   2831 		}
   2832 		ctx->ctx_current_list++;
   2833 	}
   2834 
   2835 	return (NULL);
   2836 }
   2837 
   2838 /*
   2839  * returns the next ill in the list. ill_first() must have been called
   2840  * before calling ill_next() or bad things will happen.
   2841  */
   2842 
   2843 /*
   2844  * We don't check for CONDEMNED ills here. Caller must do that if
   2845  * necessary under the ill lock.
   2846  */
   2847 ill_t *
   2848 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
   2849 {
   2850 	ill_if_t *ifp;
   2851 	ill_t *ill;
   2852 	ip_stack_t	*ipst = lastill->ill_ipst;
   2853 
   2854 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
   2855 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
   2856 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
   2857 	    AVL_AFTER)) != NULL) {
   2858 		return (ill);
   2859 	}
   2860 
   2861 	/* goto next ill_ifp in the list. */
   2862 	ifp = lastill->ill_ifptr->illif_next;
   2863 
   2864 	/* make sure not at end of circular list */
   2865 	while (ifp ==
   2866 	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2867 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
   2868 			return (NULL);
   2869 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2870 	}
   2871 
   2872 	return (avl_first(&ifp->illif_avl_by_ppa));
   2873 }
   2874 
   2875 /*
   2876  * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
   2877  * The final number (PPA) must not have any leading zeros.  Upon success, a
   2878  * pointer to the start of the PPA is returned; otherwise NULL is returned.
   2879  */
   2880 static char *
   2881 ill_get_ppa_ptr(char *name)
   2882 {
   2883 	int namelen = strlen(name);
   2884 	int end_ndx = namelen - 1;
   2885 	int ppa_ndx, i;
   2886 
   2887 	/*
   2888 	 * Check that the first character is [a-zA-Z], and that the last
   2889 	 * character is [0-9].
   2890 	 */
   2891 	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
   2892 		return (NULL);
   2893 
   2894 	/*
   2895 	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
   2896 	 */
   2897 	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
   2898 		if (!isdigit(name[ppa_ndx - 1]))
   2899 			break;
   2900 
   2901 	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
   2902 		return (NULL);
   2903 
   2904 	/*
   2905 	 * Check that the intermediate characters are [a-z0-9.]
   2906 	 */
   2907 	for (i = 1; i < ppa_ndx; i++) {
   2908 		if (!isalpha(name[i]) && !isdigit(name[i]) &&
   2909 		    name[i] != '.' && name[i] != '_') {
   2910 			return (NULL);
   2911 		}
   2912 	}
   2913 
   2914 	return (name + ppa_ndx);
   2915 }
   2916 
   2917 /*
   2918  * use avl tree to locate the ill.
   2919  */
   2920 static ill_t *
   2921 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
   2922 {
   2923 	char *ppa_ptr = NULL;
   2924 	int len;
   2925 	uint_t ppa;
   2926 	ill_t *ill = NULL;
   2927 	ill_if_t *ifp;
   2928 	int list;
   2929 
   2930 	/*
   2931 	 * get ppa ptr
   2932 	 */
   2933 	if (isv6)
   2934 		list = IP_V6_G_HEAD;
   2935 	else
   2936 		list = IP_V4_G_HEAD;
   2937 
   2938 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
   2939 		return (NULL);
   2940 	}
   2941 
   2942 	len = ppa_ptr - name + 1;
   2943 
   2944 	ppa = stoi(&ppa_ptr);
   2945 
   2946 	ifp = IP_VX_ILL_G_LIST(list, ipst);
   2947 
   2948 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   2949 		/*
   2950 		 * match is done on len - 1 as the name is not null
   2951 		 * terminated it contains ppa in addition to the interface
   2952 		 * name.
   2953 		 */
   2954 		if ((ifp->illif_name_len == len) &&
   2955 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
   2956 			break;
   2957 		} else {
   2958 			ifp = ifp->illif_next;
   2959 		}
   2960 	}
   2961 
   2962 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   2963 		/*
   2964 		 * Even the interface type does not exist.
   2965 		 */
   2966 		return (NULL);
   2967 	}
   2968 
   2969 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
   2970 	if (ill != NULL) {
   2971 		mutex_enter(&ill->ill_lock);
   2972 		if (ILL_CAN_LOOKUP(ill)) {
   2973 			ill_refhold_locked(ill);
   2974 			mutex_exit(&ill->ill_lock);
   2975 			return (ill);
   2976 		}
   2977 		mutex_exit(&ill->ill_lock);
   2978 	}
   2979 	return (NULL);
   2980 }
   2981 
   2982 /*
   2983  * comparison function for use with avl.
   2984  */
   2985 static int
   2986 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
   2987 {
   2988 	uint_t ppa;
   2989 	uint_t ill_ppa;
   2990 
   2991 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
   2992 
   2993 	ppa = *((uint_t *)ppa_ptr);
   2994 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
   2995 	/*
   2996 	 * We want the ill with the lowest ppa to be on the
   2997 	 * top.
   2998 	 */
   2999 	if (ill_ppa < ppa)
   3000 		return (1);
   3001 	if (ill_ppa > ppa)
   3002 		return (-1);
   3003 	return (0);
   3004 }
   3005 
   3006 /*
   3007  * remove an interface type from the global list.
   3008  */
   3009 static void
   3010 ill_delete_interface_type(ill_if_t *interface)
   3011 {
   3012 	ASSERT(interface != NULL);
   3013 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
   3014 
   3015 	avl_destroy(&interface->illif_avl_by_ppa);
   3016 	if (interface->illif_ppa_arena != NULL)
   3017 		vmem_destroy(interface->illif_ppa_arena);
   3018 
   3019 	remque(interface);
   3020 
   3021 	mi_free(interface);
   3022 }
   3023 
/*
 * Remove an ill from the global list.
 *
 * Under ips_ill_g_lock held as writer: takes the ill out of its interface
 * type's AVL tree (returning its ppa to the arena if one exists and
 * deleting the interface type once it is empty), dispatches a final
 * NE_UNPLUMB event, and detaches the ill from its phyint.  If the phyint
 * is then referenced by neither a v4 nor a v6 ill, it is removed from the
 * phyint AVL trees (when it has a non-zero ifindex) and freed after the
 * lock has been dropped.  A NULL ill is ignored.
 */
static void
ill_glist_delete(ill_t *ill)
{
	ip_stack_t	*ipst;
	phyint_t	*phyi;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

	/*
	 * If the ill was never inserted into the AVL tree
	 * we skip the if branch.
	 */
	if (ill->ill_ifptr != NULL) {
		/*
		 * remove from AVL tree and free ppa number
		 */
		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);

		/* Arena addresses are ppa + 1; see ill_alloc_ppa(). */
		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
			vmem_free(ill->ill_ifptr->illif_ppa_arena,
			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
		}
		/* Last ill of this type: the type itself goes away too. */
		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
			ill_delete_interface_type(ill->ill_ifptr);
		}

		/*
		 * Indicate ill is no longer in the list.
		 */
		ill->ill_ifptr = NULL;
		ill->ill_name_length = 0;
		ill->ill_name[0] = '\0';
		ill->ill_ppa = UINT_MAX;
	}

	/*
	 * Generate one last event for this ill.
	 * NOTE(review): if the branch above ran, ill_name and
	 * ill_name_length have already been cleared, so the event carries
	 * an empty name -- confirm this is intended.
	 */
	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
	    ill->ill_name_length);

	ASSERT(ill->ill_phyint != NULL);
	phyi = ill->ill_phyint;
	ill->ill_phyint = NULL;

	/*
	 * ill_init allocates a phyint always to store the copy
	 * of flags relevant to phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 to be initialized always.
	 */
	if (ill->ill_flags & ILLF_IPV6)
		phyi->phyint_illv6 = NULL;
	else
		phyi->phyint_illv4 = NULL;

	/* The phyint is still in use by the ill of the other IP version. */
	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return;
	}

	/*
	 * There are no ills left on this phyint; pull it out of the phyint
	 * avl trees, and free it.
	 */
	if (phyi->phyint_ifindex > 0) {
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    phyi);
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    phyi);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	phyint_free(phyi);
}
   3105 
/*
 * Allocate a ppa for 'ill' within interface type 'ifp'.
 *
 * If ill->ill_ppa is UINT_MAX on entry, any unused ppa is chosen and stored
 * in ill->ill_ppa; otherwise the specific ppa already in ill->ill_ppa is
 * requested.
 *
 * While the number of plumbed interfaces of this type does not exceed
 * ill_no_arena, a linear search of the AVL tree is used to find an unused
 * ppa.  Once the number goes beyond ill_no_arena we switch to a vmem arena.
 * Note: a ppa value of zero cannot be allocated from the arena directly, as
 * zero is the arena's return value for an error condition; arena addresses
 * therefore start at one and are decremented by one to obtain the ppa.
 *
 * Returns 0 on success, EAGAIN if no ppa could be chosen, or EEXIST if the
 * requested ppa could not be allocated (most likely because it is in use).
 */
static int
ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
{
	ill_t *tmp_ill;
	uint_t start, end;
	int ppa;

	if (ifp->illif_ppa_arena == NULL &&
	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
		/*
		 * Create an arena.
		 */
		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
		/*
		 * Allocate what has already been assigned: reserve the
		 * address (ppa + 1) of every ill already in the tree.
		 */
		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
		    tmp_ill, AVL_AFTER)) {
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    /* minaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
			    /* maxaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
			    VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0) {
				/*
				 * Give up on the arena; the linear search
				 * at the end of the function is used.
				 */
				ip1dbg(("ill_alloc_ppa: ppa allocation"
				    " failed while switching"));
				vmem_destroy(ifp->illif_ppa_arena);
				ifp->illif_ppa_arena = NULL;
				break;
			}
		}
	}

	if (ifp->illif_ppa_arena != NULL) {
		if (ill->ill_ppa == UINT_MAX) {
			/* Any free ppa will do. */
			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
			    1, VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0)
				return (EAGAIN);
			/* Arena address back to ppa. */
			ill->ill_ppa = --ppa;
		} else {
			/* Request exactly the address for the given ppa. */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1, 		/* size */
			    1, 		/* align/quantum */
			    0, 		/* phase */
			    0, 		/* nocross */
			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
			    VM_NOSLEEP|VM_FIRSTFIT);
			/*
			 * Most likely the allocation failed because
			 * the requested ppa was in use.
			 */
			if (ppa == 0)
				return (EEXIST);
		}
		return (0);
	}

	/*
	 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get a unused ppa.
	 */
	if (ill->ill_ppa == UINT_MAX) {
		end = UINT_MAX - 1;
		start = 0;
	} else {
		end = start = ill->ill_ppa;
	}

	/*
	 * Starting at 'start', step through consecutive in-use ppas until a
	 * gap is found or 'end' has been passed.
	 */
	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
		if (start++ >= end) {
			if (ill->ill_ppa == UINT_MAX)
				return (EAGAIN);
			else
				return (EEXIST);
		}
		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
	}
	ill->ill_ppa = start;
	return (0);
}
   3203 
   3204 /*
   3205  * Insert ill into the list of configured ill's. Once this function completes,
   3206  * the ill is globally visible and is available through lookups. More precisely
   3207  * this happens after the caller drops the ill_g_lock.
   3208  */
   3209 static int
   3210 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
   3211 {
   3212 	ill_if_t *ill_interface;
   3213 	avl_index_t where = 0;
   3214 	int error;
   3215 	int name_length;
   3216 	int index;
   3217 	boolean_t check_length = B_FALSE;
   3218 	ip_stack_t	*ipst = ill->ill_ipst;
   3219 
   3220 	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
   3221 
   3222 	name_length = mi_strlen(name) + 1;
   3223 
   3224 	if (isv6)
   3225 		index = IP_V6_G_HEAD;
   3226 	else
   3227 		index = IP_V4_G_HEAD;
   3228 
   3229 	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
   3230 	/*
   3231 	 * Search for interface type based on name
   3232 	 */
   3233 	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
   3234 		if ((ill_interface->illif_name_len == name_length) &&
   3235 		    (strcmp(ill_interface->illif_name, name) == 0)) {
   3236 			break;
   3237 		}
   3238 		ill_interface = ill_interface->illif_next;
   3239 	}
   3240 
   3241 	/*
   3242 	 * Interface type not found, create one.
   3243 	 */
   3244 	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
   3245 		ill_g_head_t ghead;
   3246 
   3247 		/*
   3248 		 * allocate ill_if_t structure
   3249 		 */
   3250 		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
   3251 		if (ill_interface == NULL) {
   3252 			return (ENOMEM);
   3253 		}
   3254 
   3255 		(void) strcpy(ill_interface->illif_name, name);
   3256 		ill_interface->illif_name_len = name_length;
   3257 
   3258 		avl_create(&ill_interface->illif_avl_by_ppa,
   3259 		    ill_compare_ppa, sizeof (ill_t),
   3260 		    offsetof(struct ill_s, ill_avl_byppa));
   3261 
   3262 		/*
   3263 		 * link the structure in the back to maintain order
   3264 		 * of configuration for ifconfig output.
   3265 		 */
   3266 		ghead = ipst->ips_ill_g_heads[index];
   3267 		insque(ill_interface, ghead.ill_g_list_tail);
   3268 	}
   3269 
   3270 	if (ill->ill_ppa == UINT_MAX)
   3271 		check_length = B_TRUE;
   3272 
   3273 	error = ill_alloc_ppa(ill_interface, ill);
   3274 	if (error != 0) {
   3275 		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
   3276 			ill_delete_interface_type(ill->ill_ifptr);
   3277 		return (error);
   3278 	}
   3279 
   3280 	/*
   3281 	 * When the ppa is choosen by the system, check that there is
   3282 	 * enough space to insert ppa. if a specific ppa was passed in this
   3283 	 * check is not required as the interface name passed in will have
   3284 	 * the right ppa in it.
   3285 	 */
   3286 	if (check_length) {
   3287 		/*
   3288 		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
   3289 		 */
   3290 		char buf[sizeof (uint_t) * 3];
   3291 
   3292 		/*
   3293 		 * convert ppa to string to calculate the amount of space
   3294 		 * required for it in the name.
   3295 		 */
   3296 		numtos(ill->ill_ppa, buf);
   3297 
   3298 		/* Do we have enough space to insert ppa ? */
   3299 
   3300 		if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
   3301 			/* Free ppa and interface type struct */
   3302 			if (ill_interface->illif_ppa_arena != NULL) {
   3303 				vmem_free(ill_interface->illif_ppa_arena,
   3304 				    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
   3305 			}
   3306 			if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
   3307 				ill_delete_interface_type(ill->ill_ifptr);
   3308 
   3309 			return (EINVAL);
   3310 		}
   3311 	}
   3312 
   3313 	(void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
   3314 	ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
   3315 
   3316 	(void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
   3317 	    &where);
   3318 	ill->ill_ifptr = ill_interface;
   3319 	avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
   3320 
   3321 	ill_phyint_reinit(ill);
   3322 	return (0);
   3323 }
   3324 
   3325 /* Initialize the per phyint ipsq used for serialization */
   3326 static boolean_t
   3327 ipsq_init(ill_t *ill, boolean_t enter)
   3328 {
   3329 	ipsq_t  *ipsq;
   3330 	ipxop_t	*ipx;
   3331 
   3332 	if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
   3333 		return (B_FALSE);
   3334 
   3335 	ill->ill_phyint->phyint_ipsq = ipsq;
   3336 	ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
   3337 	ipx->ipx_ipsq = ipsq;
   3338 	ipsq->ipsq_next = ipsq;
   3339 	ipsq->ipsq_phyint = ill->ill_phyint;
   3340 	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
   3341 	mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
   3342 	ipsq->ipsq_ipst = ill->ill_ipst;	/* No netstack_hold */
   3343 	if (enter) {
   3344 		ipx->ipx_writer = curthread;
   3345 		ipx->ipx_forced = B_FALSE;
   3346 		ipx->ipx_reentry_cnt = 1;
   3347 #ifdef DEBUG
   3348 		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   3349 #endif
   3350 	}
   3351 	return (B_TRUE);
   3352 }
   3353 
/*
 * ill_init is called by ip_open when a device control stream is opened.
 * It does a few initializations, and shoots a DL_INFO_REQ message down
 * to the driver.  The response is later picked up in ip_rput_dlpi and
 * used to set up default mechanisms for talking to the driver.  (Always
 * called as writer.)
 *
 * If this function returns error, ip_open will call ip_close which in
 * turn will call ill_delete to clean up any memory allocated here that
 * is not yet freed.
 *
 * Returns 0 on success, or ENOMEM if any of the allocations made here
 * (info message, fragment table/name buffer, phyint, ipsq) fails.
 */
int
ill_init(queue_t *q, ill_t *ill)
{
	int	count;
	dl_info_req_t	*dlir;
	mblk_t	*info_mp;
	uchar_t *frag_ptr;

	/*
	 * The ill is initialized to zero by mi_alloc*(). In addition
	 * some fields already contain valid values, initialized in
	 * ip_open(), before we reach here.
	 */
	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
	mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_saved_ire_cnt = 0;

	ill->ill_rq = q;
	ill->ill_wq = WR(q);

	/* Sized for the larger of the DL_INFO_REQ and the DL_INFO_ACK. */
	info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
	    BPRI_HI);
	if (info_mp == NULL)
		return (ENOMEM);

	/*
	 * Allocate sufficient space to contain our fragment hash table and
	 * the device name.
	 */
	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
	    2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix));
	if (frag_ptr == NULL) {
		freemsg(info_mp);
		return (ENOMEM);
	}
	/* One buffer: the hash table first, the name area right behind it. */
	ill->ill_frag_ptr = frag_ptr;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;
	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL) {
		/*
		 * NOTE(review): ill_frag_ptr, ill_frag_hash_tbl and ill_name
		 * still point into the buffer freed here -- confirm that the
		 * caller's cleanup does not use or free them again.
		 */
		freemsg(info_mp);
		mi_free(frag_ptr);
		return (ENOMEM);
	}

	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	/*
	 * For now pretend this is a v4 ill. We need to set phyint_ill*
	 * at this point because of the following reason. If we can't
	 * enter the ipsq at some point and cv_wait, the writer that
	 * wakes us up tries to locate us using the list of all phyints
	 * in an ipsq and the ills from the phyint thru the phyint_ill*.
	 * If we don't set it now, we risk a missed wakeup.
	 */
	ill->ill_phyint->phyint_illv4 = ill;
	/* UINT_MAX marks the ppa as not yet assigned. */
	ill->ill_ppa = UINT_MAX;
	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));

	ill_set_inputfn(ill);

	if (!ipsq_init(ill, B_TRUE)) {
		/*
		 * NOTE(review): as above, the ill keeps pointers to the
		 * freed fragment buffer and, here, to the freed phyint as
		 * well -- confirm the caller's cleanup tolerates this.
		 */
		freemsg(info_mp);
		mi_free(frag_ptr);
		mi_free(ill->ill_phyint);
		return (ENOMEM);
	}

	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;

	/* Frag queue limit stuff */
	ill->ill_frag_count = 0;
	ill->ill_ipf_gen = 0;

	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/*
	 * Initialize IPv6 configuration variables.  The IP module is always
	 * opened as an IPv4 module.  Instead tracking down the cases where
	 * it switches to do ipv6, we'll just initialize the IPv6 configuration
	 * here for convenience, this has no effect until the ill is set to do
	 * IPv6.
	 */
	ill->ill_reachable_time = ND_REACHABLE_TIME;
	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
	ill->ill_max_buf = ND_MAX_Q;
	ill->ill_refcnt = 0;

	/* Send down the Info Request to the driver. */
	info_mp->b_datap->db_type = M_PCPROTO;
	dlir = (dl_info_req_t *)info_mp->b_rptr;
	/* The message holds exactly one dl_info_req_t. */
	info_mp->b_wptr = (uchar_t *)&dlir[1];
	dlir->dl_primitive = DL_INFO_REQ;

	ill->ill_dlpi_pending = DL_PRIM_INVAL;

	qprocson(q);
	ill_dlpi_send(ill, info_mp);

	return (0);
}
   3478 
   3479 /*
   3480  * ill_dls_info
   3481  * creates datalink socket info from the device.
   3482  */
   3483 int
   3484 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
   3485 {
   3486 	size_t	len;
   3487 
   3488 	sdl->sdl_family = AF_LINK;
   3489 	sdl->sdl_index = ill_get_upper_ifindex(ill);
   3490 	sdl->sdl_type = ill->ill_type;
   3491 	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
   3492 	len = strlen(sdl->sdl_data);
   3493 	ASSERT(len < 256);
   3494 	sdl->sdl_nlen = (uchar_t)len;
   3495 	sdl->sdl_alen = ill->ill_phys_addr_length;
   3496 	sdl->sdl_slen = 0;
   3497 	if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
   3498 		bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
   3499 
   3500 	return (sizeof (struct sockaddr_dl));
   3501 }
   3502 
   3503 /*
   3504  * ill_xarp_info
   3505  * creates xarp info from the device.
   3506  */
   3507 static int
   3508 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
   3509 {
   3510 	sdl->sdl_family = AF_LINK;
   3511 	sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
   3512 	sdl->sdl_type = ill->ill_type;
   3513 	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
   3514 	sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
   3515 	sdl->sdl_alen = ill->ill_phys_addr_length;
   3516 	sdl->sdl_slen = 0;
   3517 	return (sdl->sdl_nlen);
   3518 }
   3519 
   3520 static int
   3521 loopback_kstat_update(kstat_t *ksp, int rw)
   3522 {
   3523 	kstat_named_t *kn;
   3524 	netstackid_t	stackid;
   3525 	netstack_t	*ns;
   3526 	ip_stack_t	*ipst;
   3527 
   3528 	if (ksp == NULL || ksp->ks_data == NULL)
   3529 		return (EIO);
   3530 
   3531 	if (rw == KSTAT_WRITE)
   3532 		return (EACCES);
   3533 
   3534 	kn = KSTAT_NAMED_PTR(ksp);
   3535 	stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
   3536 
   3537 	ns = netstack_find_by_stackid(stackid);
   3538 	if (ns == NULL)
   3539 		return (-1);
   3540 
   3541 	ipst = ns->netstack_ip;
   3542 	if (ipst == NULL) {
   3543 		netstack_rele(ns);
   3544 		return (-1);
   3545 	}
   3546 	kn[0].value.ui32 = ipst->ips_loopback_packets;
   3547 	kn[1].value.ui32 = ipst->ips_loopback_packets;
   3548 	netstack_rele(ns);
   3549 	return (0);
   3550 }
   3551 
   3552 /*
   3553  * Has ifindex been plumbed already?
   3554  */
   3555 static boolean_t
   3556 phyint_exists(uint_t index, ip_stack_t *ipst)
   3557 {
   3558 	ASSERT(index != 0);
   3559 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   3560 
   3561 	return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3562 	    &index, NULL) != NULL);
   3563 }
   3564 
   3565 /* Pick a unique ifindex */
   3566 boolean_t
   3567 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
   3568 {
   3569 	uint_t starting_index;
   3570 
   3571 	if (!ipst->ips_ill_index_wrap) {
   3572 		*indexp = ipst->ips_ill_index++;
   3573 		if (ipst->ips_ill_index == 0) {
   3574 			/* Reached the uint_t limit Next time wrap  */
   3575 			ipst->ips_ill_index_wrap = B_TRUE;
   3576 		}
   3577 		return (B_TRUE);
   3578 	}
   3579 
   3580 	/*
   3581 	 * Start reusing unused indexes. Note that we hold the ill_g_lock
   3582 	 * at this point and don't want to call any function that attempts
   3583 	 * to get the lock again.
   3584 	 */
   3585 	starting_index = ipst->ips_ill_index++;
   3586 	for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) {
   3587 		if (ipst->ips_ill_index != 0 &&
   3588 		    !phyint_exists(ipst->ips_ill_index, ipst)) {
   3589 			/* found unused index - use it */
   3590 			*indexp = ipst->ips_ill_index;
   3591 			return (B_TRUE);
   3592 		}
   3593 	}
   3594 
   3595 	/*
   3596 	 * all interface indicies are inuse.
   3597 	 */
   3598 	return (B_FALSE);
   3599 }
   3600 
   3601 /*
   3602  * Assign a unique interface index for the phyint.
   3603  */
   3604 static boolean_t
   3605 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
   3606 {
   3607 	ASSERT(phyi->phyint_ifindex == 0);
   3608 	return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
   3609 }
   3610 
   3611 /*
   3612  * Initialize the flags on `phyi' as per the provided mactype.
   3613  */
   3614 static void
   3615 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
   3616 {
   3617 	uint64_t flags = 0;
   3618 
   3619 	/*
   3620 	 * Initialize PHYI_RUNNING and PHYI_FAILED.  For non-IPMP interfaces,
   3621 	 * we always presume the underlying hardware is working and set
   3622 	 * PHYI_RUNNING (if it's not, the driver will subsequently send a
   3623 	 * DL_NOTE_LINK_DOWN message).  For IPMP interfaces, at initialization
   3624 	 * there are no active interfaces in the group so we set PHYI_FAILED.
   3625 	 */
   3626 	if (mactype == SUNW_DL_IPMP)
   3627 		flags |= PHYI_FAILED;
   3628 	else
   3629 		flags |= PHYI_RUNNING;
   3630 
   3631 	switch (mactype) {
   3632 	case SUNW_DL_VNI:
   3633 		flags |= PHYI_VIRTUAL;
   3634 		break;
   3635 	case SUNW_DL_IPMP:
   3636 		flags |= PHYI_IPMP;
   3637 		break;
   3638 	case DL_LOOP:
   3639 		flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
   3640 		break;
   3641 	}
   3642 
   3643 	mutex_enter(&phyi->phyint_lock);
   3644 	phyi->phyint_flags |= flags;
   3645 	mutex_exit(&phyi->phyint_lock);
   3646 }
   3647 
   3648 /*
   3649  * Return a pointer to the ill which matches the supplied name.  Note that
   3650  * the ill name length includes the null termination character.  (May be
   3651  * called as writer.)
   3652  * If do_alloc and the interface is "lo0" it will be automatically created.
   3653  * Cannot bump up reference on condemned ills. So dup detect can't be done
   3654  * using this func.
   3655  */
   3656 ill_t *
   3657 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
   3658     boolean_t *did_alloc, ip_stack_t *ipst)
   3659 {
   3660 	ill_t	*ill;
   3661 	ipif_t	*ipif;
   3662 	ipsq_t	*ipsq;
   3663 	kstat_named_t	*kn;
   3664 	boolean_t isloopback;
   3665 	in6_addr_t ov6addr;
   3666 
   3667 	isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
   3668 
   3669 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3670 	ill = ill_find_by_name(name, isv6, ipst);
   3671 	rw_exit(&ipst->ips_ill_g_lock);
   3672 	if (ill != NULL)
   3673 		return (ill);
   3674 
   3675 	/*
   3676 	 * Couldn't find it.  Does this happen to be a lookup for the
   3677 	 * loopback device and are we allowed to allocate it?
   3678 	 */
   3679 	if (!isloopback || !do_alloc)
   3680 		return (NULL);
   3681 
   3682 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
   3683 	ill = ill_find_by_name(name, isv6, ipst);
   3684 	if (ill != NULL) {
   3685 		rw_exit(&ipst->ips_ill_g_lock);
   3686 		return (ill);
   3687 	}
   3688 
   3689 	/* Create the loopback device on demand */
   3690 	ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
   3691 	    sizeof (ipif_loopback_name), BPRI_MED));
   3692 	if (ill == NULL)
   3693 		goto done;
   3694 
   3695 	*ill = ill_null;
   3696 	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
   3697 	ill->ill_ipst = ipst;
   3698 	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
   3699 	netstack_hold(ipst->ips_netstack);
   3700 	/*
   3701 	 * For exclusive stacks we set the zoneid to zero
   3702 	 * to make IP operate as if in the global zone.
   3703 	 */
   3704 	ill->ill_zoneid = GLOBAL_ZONEID;
   3705 
   3706 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
   3707 	if (ill->ill_phyint == NULL)
   3708 		goto done;
   3709 
   3710 	if (isv6)
   3711 		ill->ill_phyint->phyint_illv6 = ill;
   3712 	else
   3713 		ill->ill_phyint->phyint_illv4 = ill;
   3714 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
   3715 	phyint_flags_init(ill->ill_phyint, DL_LOOP);
   3716 
   3717 	if (isv6) {
   3718 		ill->ill_isv6 = B_TRUE;
   3719 		ill->ill_max_frag = ip_loopback_mtu_v6plus;
   3720 	} else {
   3721 		ill->ill_max_frag = ip_loopback_mtuplus;
   3722 	}
   3723 	if (!ill_allocate_mibs(ill))
   3724 		goto done;
   3725 	ill->ill_current_frag = ill->ill_max_frag;
   3726 	ill->ill_mtu = ill->ill_max_frag;	/* Initial value */
   3727 	/*
   3728 	 * ipif_loopback_name can't be pointed at directly because its used
   3729 	 * by both the ipv4 and ipv6 interfaces.  When the ill is removed
   3730 	 * from the glist, ill_glist_delete() sets the first character of
   3731 	 * ill_name to '\0'.
   3732 	 */
   3733 	ill->ill_name = (char *)ill + sizeof (*ill);
   3734 	(void) strcpy(ill->ill_name, ipif_loopback_name);
   3735 	ill->ill_name_length = sizeof (ipif_loopback_name);
   3736 	/* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
   3737 	ill->ill_dlpi_pending = DL_PRIM_INVAL;
   3738 
   3739 	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
   3740 	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
   3741 	ill->ill_global_timer = INFINITY;
   3742 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
   3743 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
   3744 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
   3745 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
   3746 
   3747 	/* No resolver here. */
   3748 	ill->ill_net_type = IRE_LOOPBACK;
   3749 
   3750 	/* Initialize the ipsq */
   3751 	if (!ipsq_init(ill, B_FALSE))
   3752 		goto done;
   3753 
   3754 	ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE);
   3755 	if (ipif == NULL)
   3756 		goto done;
   3757 
   3758 	ill->ill_flags = ILLF_MULTICAST;
   3759 
   3760 	ov6addr = ipif->ipif_v6lcl_addr;
   3761 	/* Set up default loopback address and mask. */
   3762 	if (!isv6) {
   3763 		ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
   3764 
   3765 		IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
   3766 		V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
   3767 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
   3768 		    ipif->ipif_v6subnet);
   3769 		ill->ill_flags |= ILLF_IPV4;
   3770 	} else {
   3771 		ipif->ipif_v6lcl_addr = ipv6_loopback;
   3772 		ipif->ipif_v6net_mask = ipv6_all_ones;
   3773 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
   3774 		    ipif->ipif_v6subnet);
   3775 		ill->ill_flags |= ILLF_IPV6;
   3776 	}
   3777 
   3778 	/*
   3779 	 * Chain us in at the end of the ill list. hold the ill
   3780 	 * before we make it globally visible. 1 for the lookup.
   3781 	 */
   3782 	ill->ill_refcnt = 0;
   3783 	ill_refhold(ill);
   3784 
   3785 	ill->ill_frag_count = 0;
   3786 	ill->ill_frag_free_num_pkts = 0;
   3787 	ill->ill_last_frag_clean_time = 0;
   3788 
   3789 	ipsq = ill->ill_phyint->phyint_ipsq;
   3790 
   3791 	ill_set_inputfn(ill);
   3792 
   3793 	if (ill_glist_insert(ill, "lo", isv6) != 0)
   3794 		cmn_err(CE_PANIC, "cannot insert loopback interface");
   3795 
   3796 	/* Let SCTP know so that it can add this to its list */
   3797 	sctp_update_ill(ill, SCTP_ILL_INSERT);
   3798 
   3799 	/*
   3800 	 * We have already assigned ipif_v6lcl_addr above, but we need to
   3801 	 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
   3802 	 * requires to be after ill_glist_insert() since we need the
   3803 	 * ill_index set. Pass on ipv6_loopback as the old address.
   3804 	 */
   3805 	sctp_update_ipif_addr(ipif, ov6addr);
   3806 
   3807 	ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
   3808 
   3809 	/*
   3810 	 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
   3811 	 * If so, free our original one.
   3812 	 */
   3813 	if (ipsq != ill->ill_phyint->phyint_ipsq)
   3814 		ipsq_delete(ipsq);
   3815 
   3816 	if (ipst->ips_loopback_ksp == NULL) {
   3817 		/* Export loopback interface statistics */
   3818 		ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
   3819 		    ipif_loopback_name, "net",
   3820 		    KSTAT_TYPE_NAMED, 2, 0,
   3821 		    ipst->ips_netstack->netstack_stackid);
   3822 		if (ipst->ips_loopback_ksp != NULL) {
   3823 			ipst->ips_loopback_ksp->ks_update =
   3824 			    loopback_kstat_update;
   3825 			kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
   3826 			kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
   3827 			kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
   3828 			ipst->ips_loopback_ksp->ks_private =
   3829 			    (void *)(uintptr_t)ipst->ips_netstack->
   3830 			    netstack_stackid;
   3831 			kstat_install(ipst->ips_loopback_ksp);
   3832 		}
   3833 	}
   3834 
   3835 	*did_alloc = B_TRUE;
   3836 	rw_exit(&ipst->ips_ill_g_lock);
   3837 	ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
   3838 	    NE_PLUMB, ill->ill_name, ill->ill_name_length);
   3839 	return (ill);
   3840 done:
   3841 	if (ill != NULL) {
   3842 		if (ill->ill_phyint != NULL) {
   3843 			ipsq = ill->ill_phyint->phyint_ipsq;
   3844 			if (ipsq != NULL) {
   3845 				ipsq->ipsq_phyint = NULL;
   3846 				ipsq_delete(ipsq);
   3847 			}
   3848 			mi_free(ill->ill_phyint);
   3849 		}
   3850 		ill_free_mib(ill);
   3851 		if (ill->ill_ipst != NULL)
   3852 			netstack_rele(ill->ill_ipst->ips_netstack);
   3853 		mi_free(ill);
   3854 	}
   3855 	rw_exit(&ipst->ips_ill_g_lock);
   3856 	return (NULL);
   3857 }
   3858 
   3859 /*
   3860  * For IPP calls - use the ip_stack_t for global stack.
   3861  */
   3862 ill_t *
   3863 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
   3864 {
   3865 	ip_stack_t	*ipst;
   3866 	ill_t		*ill;
   3867 
   3868 	ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip;
   3869 	if (ipst == NULL) {
   3870 		cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
   3871 		return (NULL);
   3872 	}
   3873 
   3874 	ill = ill_lookup_on_ifindex(index, isv6, ipst);
   3875 	netstack_rele(ipst->ips_netstack);
   3876 	return (ill);
   3877 }
   3878 
   3879 /*
   3880  * Return a pointer to the ill which matches the index and IP version type.
   3881  */
   3882 ill_t *
   3883 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
   3884 {
   3885 	ill_t	*ill;
   3886 	phyint_t *phyi;
   3887 
   3888 	/*
   3889 	 * Indexes are stored in the phyint - a common structure
   3890 	 * to both IPv4 and IPv6.
   3891 	 */
   3892 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3893 	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3894 	    (void *) &index, NULL);
   3895 	if (phyi != NULL) {
   3896 		ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
   3897 		if (ill != NULL) {
   3898 			mutex_enter(&ill->ill_lock);
   3899 			if (!ILL_IS_CONDEMNED(ill)) {
   3900 				ill_refhold_locked(ill);
   3901 				mutex_exit(&ill->ill_lock);
   3902 				rw_exit(&ipst->ips_ill_g_lock);
   3903 				return (ill);
   3904 			}
   3905 			mutex_exit(&ill->ill_lock);
   3906 		}
   3907 	}
   3908 	rw_exit(&ipst->ips_ill_g_lock);
   3909 	return (NULL);
   3910 }
   3911 
   3912 /*
   3913  * Verify whether or not an interface index is valid.
   3914  * It can be zero (meaning "reset") or an interface index assigned
   3915  * to a non-VNI interface. (We don't use VNI interface to send packets.)
   3916  */
   3917 boolean_t
   3918 ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
   3919 {
   3920 	ill_t		*ill;
   3921 
   3922 	if (ifindex == 0)
   3923 		return (B_TRUE);
   3924 
   3925 	ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
   3926 	if (ill == NULL)
   3927 		return (B_FALSE);
   3928 	if (IS_VNI(ill)) {
   3929 		ill_refrele(ill);
   3930 		return (B_FALSE);
   3931 	}
   3932 	ill_refrele(ill);
   3933 	return (B_TRUE);
   3934 }
   3935 
   3936 /*
   3937  * Return the ifindex next in sequence after the passed in ifindex.
   3938  * If there is no next ifindex for the given protocol, return 0.
   3939  */
   3940 uint_t
   3941 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
   3942 {
   3943 	phyint_t *phyi;
   3944 	phyint_t *phyi_initial;
   3945 	uint_t   ifindex;
   3946 
   3947 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3948 
   3949 	if (index == 0) {
   3950 		phyi = avl_first(
   3951 		    &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
   3952 	} else {
   3953 		phyi = phyi_initial = avl_find(
   3954 		    &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3955 		    (void *) &index, NULL);
   3956 	}
   3957 
   3958 	for (; phyi != NULL;
   3959 	    phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3960 	    phyi, AVL_AFTER)) {
   3961 		/*
   3962 		 * If we're not returning the first interface in the tree
   3963 		 * and we still haven't moved past the phyint_t that
   3964 		 * corresponds to index, avl_walk needs to be called again
   3965 		 */
   3966 		if (!((index != 0) && (phyi == phyi_initial))) {
   3967 			if (isv6) {
   3968 				if ((phyi->phyint_illv6) &&
   3969 				    ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
   3970 				    (phyi->phyint_illv6->ill_isv6 == 1))
   3971 					break;
   3972 			} else {
   3973 				if ((phyi->phyint_illv4) &&
   3974 				    ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
   3975 				    (phyi->phyint_illv4->ill_isv6 == 0))
   3976 					break;
   3977 			}
   3978 		}
   3979 	}
   3980 
   3981 	rw_exit(&ipst->ips_ill_g_lock);
   3982 
   3983 	if (phyi != NULL)
   3984 		ifindex = phyi->phyint_ifindex;
   3985 	else
   3986 		ifindex = 0;
   3987 
   3988 	return (ifindex);
   3989 }
   3990 
   3991 /*
   3992  * Return the ifindex for the named interface.
   3993  * If there is no next ifindex for the interface, return 0.
   3994  */
   3995 uint_t
   3996 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
   3997 {
   3998 	phyint_t	*phyi;
   3999 	avl_index_t	where = 0;
   4000 	uint_t		ifindex;
   4001 
   4002 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4003 
   4004 	if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
   4005 	    name, &where)) == NULL) {
   4006 		rw_exit(&ipst->ips_ill_g_lock);
   4007 		return (0);
   4008 	}
   4009 
   4010 	ifindex = phyi->phyint_ifindex;
   4011 
   4012 	rw_exit(&ipst->ips_ill_g_lock);
   4013 
   4014 	return (ifindex);
   4015 }
   4016 
   4017 /*
   4018  * Return the ifindex to be used by upper layer protocols for instance
   4019  * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill.
   4020  */
   4021 uint_t
   4022 ill_get_upper_ifindex(const ill_t *ill)
   4023 {
   4024 	if (IS_UNDER_IPMP(ill))
   4025 		return (ipmp_ill_get_ipmp_ifindex(ill));
   4026 	else
   4027 		return (ill->ill_phyint->phyint_ifindex);
   4028 }
   4029 
   4030 
   4031 /*
   4032  * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
   4033  * that gives a running thread a reference to the ill. This reference must be
   4034  * released by the thread when it is done accessing the ill and related
   4035  * objects. ill_refcnt can not be used to account for static references
   4036  * such as other structures pointing to an ill. Callers must generally
   4037  * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
   4038  * or be sure that the ill is not being deleted or changing state before
   4039  * calling the refhold functions. A non-zero ill_refcnt ensures that the
   4040  * ill won't change any of its critical state such as address, netmask etc.
   4041  */
   4042 void
   4043 ill_refhold(ill_t *ill)
   4044 {
   4045 	mutex_enter(&ill->ill_lock);
   4046 	ill->ill_refcnt++;
   4047 	ILL_TRACE_REF(ill);
   4048 	mutex_exit(&ill->ill_lock);
   4049 }
   4050 
   4051 void
   4052 ill_refhold_locked(ill_t *ill)
   4053 {
   4054 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4055 	ill->ill_refcnt++;
   4056 	ILL_TRACE_REF(ill);
   4057 }
   4058 
   4059 /* Returns true if we managed to get a refhold */
   4060 boolean_t
   4061 ill_check_and_refhold(ill_t *ill)
   4062 {
   4063 	mutex_enter(&ill->ill_lock);
   4064 	if (!ILL_IS_CONDEMNED(ill)) {
   4065 		ill_refhold_locked(ill);
   4066 		mutex_exit(&ill->ill_lock);
   4067 		return (B_TRUE);
   4068 	}
   4069 	mutex_exit(&ill->ill_lock);
   4070 	return (B_FALSE);
   4071 }
   4072 
   4073 /*
   4074  * Must not be called while holding any locks. Otherwise if this is
   4075  * the last reference to be released, there is a chance of recursive mutex
   4076  * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
   4077  * to restart an ioctl.
   4078  */
   4079 void
   4080 ill_refrele(ill_t *ill)
   4081 {
   4082 	mutex_enter(&ill->ill_lock);
   4083 	ASSERT(ill->ill_refcnt != 0);
   4084 	ill->ill_refcnt--;
   4085 	ILL_UNTRACE_REF(ill);
   4086 	if (ill->ill_refcnt != 0) {
   4087 		/* Every ire pointing to the ill adds 1 to ill_refcnt */
   4088 		mutex_exit(&ill->ill_lock);
   4089 		return;
   4090 	}
   4091 
   4092 	/* Drops the ill_lock */
   4093 	ipif_ill_refrele_tail(ill);
   4094 }
   4095 
   4096 /*
   4097  * Obtain a weak reference count on the ill. This reference ensures the
   4098  * ill won't be freed, but the ill may change any of its critical state
   4099  * such as netmask, address etc. Returns an error if the ill has started
   4100  * closing.
   4101  */
   4102 boolean_t
   4103 ill_waiter_inc(ill_t *ill)
   4104 {
   4105 	mutex_enter(&ill->ill_lock);
   4106 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   4107 		mutex_exit(&ill->ill_lock);
   4108 		return (B_FALSE);
   4109 	}
   4110 	ill->ill_waiters++;
   4111 	mutex_exit(&ill->ill_lock);
   4112 	return (B_TRUE);
   4113 }
   4114 
   4115 void
   4116 ill_waiter_dcr(ill_t *ill)
   4117 {
   4118 	mutex_enter(&ill->ill_lock);
   4119 	ill->ill_waiters--;
   4120 	if (ill->ill_waiters == 0)
   4121 		cv_broadcast(&ill->ill_cv);
   4122 	mutex_exit(&ill->ill_lock);
   4123 }
   4124 
   4125 /*
   4126  * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
   4127  * driver.  We construct best guess defaults for lower level information that
   4128  * we need.  If an interface is brought up without injection of any overriding
   4129  * information from outside, we have to be ready to go with these defaults.
   4130  * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
   4131  * we primarely want the dl_provider_style.
   4132  * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
   4133  * at which point we assume the other part of the information is valid.
   4134  */
   4135 void
   4136 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
   4137 {
   4138 	uchar_t		*brdcst_addr;
   4139 	uint_t		brdcst_addr_length, phys_addr_length;
   4140 	t_scalar_t	sap_length;
   4141 	dl_info_ack_t	*dlia;
   4142 	ip_m_t		*ipm;
   4143 	dl_qos_cl_sel1_t *sel1;
   4144 	int		min_mtu;
   4145 
   4146 	ASSERT(IAM_WRITER_ILL(ill));
   4147 
   4148 	/*
   4149 	 * Till the ill is fully up  the ill is not globally visible.
   4150 	 * So no need for a lock.
   4151 	 */
   4152 	dlia = (dl_info_ack_t *)mp->b_rptr;
   4153 	ill->ill_mactype = dlia->dl_mac_type;
   4154 
   4155 	ipm = ip_m_lookup(dlia->dl_mac_type);
   4156 	if (ipm == NULL) {
   4157 		ipm = ip_m_lookup(DL_OTHER);
   4158 		ASSERT(ipm != NULL);
   4159 	}
   4160 	ill->ill_media = ipm;
   4161 
   4162 	/*
   4163 	 * When the new DLPI stuff is ready we'll pull lengths
   4164 	 * from dlia.
   4165 	 */
   4166 	if (dlia->dl_version == DL_VERSION_2) {
   4167 		brdcst_addr_length = dlia->dl_brdcst_addr_length;
   4168 		brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
   4169 		    brdcst_addr_length);
   4170 		if (brdcst_addr == NULL) {
   4171 			brdcst_addr_length = 0;
   4172 		}
   4173 		sap_length = dlia->dl_sap_length;
   4174 		phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
   4175 		ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
   4176 		    brdcst_addr_length, sap_length, phys_addr_length));
   4177 	} else {
   4178 		brdcst_addr_length = 6;
   4179 		brdcst_addr = ip_six_byte_all_ones;
   4180 		sap_length = -2;
   4181 		phys_addr_length = brdcst_addr_length;
   4182 	}
   4183 
   4184 	ill->ill_bcast_addr_length = brdcst_addr_length;
   4185 	ill->ill_phys_addr_length = phys_addr_length;
   4186 	ill->ill_sap_length = sap_length;
   4187 
   4188 	/*
   4189 	 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
   4190 	 * but we must ensure a minimum IP MTU is used since other bits of
   4191 	 * IP will fly apart otherwise.
   4192 	 */
   4193 	min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
   4194 	ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
   4195 	ill->ill_current_frag = ill->ill_max_frag;
   4196 	ill->ill_mtu = ill->ill_max_frag;
   4197 
   4198 	ill->ill_type = ipm->ip_m_type;
   4199 
   4200 	if (!ill->ill_dlpi_style_set) {
   4201 		if (dlia->dl_provider_style == DL_STYLE2)
   4202 			ill->ill_needs_attach = 1;
   4203 
   4204 		phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
   4205 
   4206 		/*
   4207 		 * Allocate the first ipif on this ill.  We don't delay it
   4208 		 * further as ioctl handling assumes at least one ipif exists.
   4209 		 *
   4210 		 * At this point we don't know whether the ill is v4 or v6.
   4211 		 * We will know this whan the SIOCSLIFNAME happens and
   4212 		 * the correct value for ill_isv6 will be assigned in
   4213 		 * ipif_set_values(). We need to hold the ill lock and
   4214 		 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
   4215 		 * the wakeup.
   4216 		 */
   4217 		(void) ipif_allocate(ill, 0, IRE_LOCAL,
   4218 		    dlia->dl_provider_style != DL_STYLE2, B_TRUE);
   4219 		mutex_enter(&ill->ill_lock);
   4220 		ASSERT(ill->ill_dlpi_style_set == 0);
   4221 		ill->ill_dlpi_style_set = 1;
   4222 		ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
   4223 		cv_broadcast(&ill->ill_cv);
   4224 		mutex_exit(&ill->ill_lock);
   4225 		freemsg(mp);
   4226 		return;
   4227 	}
   4228 	ASSERT(ill->ill_ipif != NULL);
   4229 	/*
   4230 	 * We know whether it is IPv4 or IPv6 now, as this is the
   4231 	 * second DL_INFO_ACK we are recieving in response to the
   4232 	 * DL_INFO_REQ sent in ipif_set_values.
   4233 	 */
   4234 	ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
   4235 	/*
   4236 	 * Clear all the flags that were set based on ill_bcast_addr_length
   4237 	 * and ill_phys_addr_length (in ipif_set_values) as these could have
   4238 	 * changed now and we need to re-evaluate.
   4239 	 */
   4240 	ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
   4241 	ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
   4242 
   4243 	/*
   4244 	 * Free ill_bcast_mp as things could have changed now.
   4245 	 *
   4246 	 * NOTE: The IPMP meta-interface is special-cased because it starts
   4247 	 * with no underlying interfaces (and thus an unknown broadcast
   4248 	 * address length), but we enforce that an interface is broadcast-
   4249 	 * capable as part of allowing it to join a group.
   4250 	 */
   4251 	if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
   4252 		if (ill->ill_bcast_mp != NULL)
   4253 			freemsg(ill->ill_bcast_mp);
   4254 		ill->ill_net_type = IRE_IF_NORESOLVER;
   4255 
   4256 		ill->ill_bcast_mp = ill_dlur_gen(NULL,
   4257 		    ill->ill_phys_addr_length,
   4258 		    ill->ill_sap,
   4259 		    ill->ill_sap_length);
   4260 
   4261 		if (ill->ill_isv6)
   4262 			/*
   4263 			 * Note: xresolv interfaces will eventually need NOARP
   4264 			 * set here as well, but that will require those
   4265 			 * external resolvers to have some knowledge of
   4266 			 * that flag and act appropriately. Not to be changed
   4267 			 * at present.
   4268 			 */
   4269 			ill->ill_flags |= ILLF_NONUD;
   4270 		else
   4271 			ill->ill_flags |= ILLF_NOARP;
   4272 
   4273 		if (ill->ill_mactype == SUNW_DL_VNI) {
   4274 			ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
   4275 		} else if (ill->ill_phys_addr_length == 0 ||
   4276 		    ill->ill_mactype == DL_IPV4 ||
   4277 		    ill->ill_mactype == DL_IPV6) {
   4278 			/*
   4279 			 * The underying link is point-to-point, so mark the
   4280 			 * interface as such.  We can do IP multicast over
   4281 			 * such a link since it transmits all network-layer
   4282 			 * packets to the remote side the same way.
   4283 			 */
   4284 			ill->ill_flags |= ILLF_MULTICAST;
   4285 			ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
   4286 		}
   4287 	} else {
   4288 		ill->ill_net_type = IRE_IF_RESOLVER;
   4289 		if (ill->ill_bcast_mp != NULL)
   4290 			freemsg(ill->ill_bcast_mp);
   4291 		ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
   4292 		    ill->ill_bcast_addr_length, ill->ill_sap,
   4293 		    ill->ill_sap_length);
   4294 		/*
   4295 		 * Later detect lack of DLPI driver multicast
   4296 		 * capability by catching DL_ENABMULTI errors in
   4297 		 * ip_rput_dlpi.
   4298 		 */
   4299 		ill->ill_flags |= ILLF_MULTICAST;
   4300 		if (!ill->ill_isv6)
   4301 			ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
   4302 	}
   4303 
   4304 	/* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
   4305 	if (ill->ill_mactype == SUNW_DL_IPMP)
   4306 		ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
   4307 
   4308 	/* By default an interface does not support any CoS marking */
   4309 	ill->ill_flags &= ~ILLF_COS_ENABLED;
   4310 
   4311 	/*
   4312 	 * If we get QoS information in DL_INFO_ACK, the device supports
   4313 	 * some form of CoS marking, set ILLF_COS_ENABLED.
   4314 	 */
   4315 	sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
   4316 	    dlia->dl_qos_length);
   4317 	if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
   4318 		ill->ill_flags |= ILLF_COS_ENABLED;
   4319 	}
   4320 
   4321 	/* Clear any previous error indication. */
   4322 	ill->ill_error = 0;
   4323 	freemsg(mp);
   4324 }
   4325 
   4326 /*
   4327  * Perform various checks to verify that an address would make sense as a
   4328  * local, remote, or subnet interface address.
   4329  */
   4330 static boolean_t
   4331 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
   4332 {
   4333 	ipaddr_t	net_mask;
   4334 
   4335 	/*
   4336 	 * Don't allow all zeroes, or all ones, but allow
   4337 	 * all ones netmask.
   4338 	 */
   4339 	if ((net_mask = ip_net_mask(addr)) == 0)
   4340 		return (B_FALSE);
   4341 	/* A given netmask overrides the "guess" netmask */
   4342 	if (subnet_mask != 0)
   4343 		net_mask = subnet_mask;
   4344 	if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
   4345 	    (addr == (addr | ~net_mask)))) {
   4346 		return (B_FALSE);
   4347 	}
   4348 
   4349 	/*
   4350 	 * Even if the netmask is all ones, we do not allow address to be
   4351 	 * 255.255.255.255
   4352 	 */
   4353 	if (addr == INADDR_BROADCAST)
   4354 		return (B_FALSE);
   4355 
   4356 	if (CLASSD(addr))
   4357 		return (B_FALSE);
   4358 
   4359 	return (B_TRUE);
   4360 }
   4361 
   4362 #define	V6_IPIF_LINKLOCAL(p)	\
   4363 	IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
   4364 
   4365 /*
   4366  * Compare two given ipifs and check if the second one is better than
   4367  * the first one using the order of preference (not taking deprecated
   4368  * into acount) specified in ipif_lookup_multicast().
   4369  */
   4370 static boolean_t
   4371 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
   4372 {
   4373 	/* Check the least preferred first. */
   4374 	if (IS_LOOPBACK(old_ipif->ipif_ill)) {
   4375 		/* If both ipifs are the same, use the first one. */
   4376 		if (IS_LOOPBACK(new_ipif->ipif_ill))
   4377 			return (B_FALSE);
   4378 		else
   4379 			return (B_TRUE);
   4380 	}
   4381 
   4382 	/* For IPv6, check for link local address. */
   4383 	if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
   4384 		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
   4385 		    V6_IPIF_LINKLOCAL(new_ipif)) {
   4386 			/* The second one is equal or less preferred. */
   4387 			return (B_FALSE);
   4388 		} else {
   4389 			return (B_TRUE);
   4390 		}
   4391 	}
   4392 
   4393 	/* Then check for point to point interface. */
   4394 	if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
   4395 		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
   4396 		    (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
   4397 		    (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
   4398 			return (B_FALSE);
   4399 		} else {
   4400 			return (B_TRUE);
   4401 		}
   4402 	}
   4403 
   4404 	/* old_ipif is a normal interface, so no need to use the new one. */
   4405 	return (B_FALSE);
   4406 }
   4407 
/*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must be multicast-capable, not
 * condemned, not an underlying interface in an IPMP group, and
 * not a VNI interface.  Order of preference:
 *
 * 	1a. normal
 * 	1b. normal, but deprecated
 * 	2a. point to point
 * 	2b. point to point, but deprecated
 * 	3a. link local
 * 	3b. link local, but deprecated
 * 	4. loopback.
 *
 * Two candidates are tracked while walking the ills: the best
 * non-deprecated ipif (saved_ipif) and the best deprecated ipif (dep_ipif).
 * ipif_comp_multi() decides whether a newly found ipif displaces the
 * current candidate, and also arbitrates between the two at the end.
 *
 * Returns the selected ipif with a reference held (the caller must
 * ipif_refrele() it), or NULL if no ipif qualified.
 */
static ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
{
	ill_t			*ill;
	ill_walk_context_t	ctx;
	ipif_t			*ipif;
	ipif_t			*saved_ipif = NULL;	/* best non-deprecated */
	ipif_t			*dep_ipif = NULL;	/* best deprecated */

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		/* Skip ills that can never supply a usable multicast ipif. */
		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
		    ILL_IS_CONDEMNED(ill) ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/*
			 * Zone filter: reject only when the zones differ and
			 * neither the request nor the ipif is ALL_ZONES.
			 */
			if (zoneid != ipif->ipif_zoneid &&
			    zoneid != ALL_ZONES &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				continue;
			}
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    IPIF_IS_CONDEMNED(ipif)) {
				continue;
			}

			/*
			 * Found one candidate.  If it is deprecated,
			 * remember it in dep_ipif.  If it is not deprecated,
			 * remember it in saved_ipif.
			 */
			if (ipif->ipif_flags & IPIF_DEPRECATED) {
				if (dep_ipif == NULL) {
					dep_ipif = ipif;
				} else if (ipif_comp_multi(dep_ipif, ipif,
				    isv6)) {
					/*
					 * If the previous dep_ipif does not
					 * belong to the same ill, we've done
					 * a ipif_refhold() on it.  So we need
					 * to release it.
					 */
					if (dep_ipif->ipif_ill != ill)
						ipif_refrele(dep_ipif);
					dep_ipif = ipif;
				}
				continue;
			}
			if (saved_ipif == NULL) {
				saved_ipif = ipif;
			} else {
				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
					/*
					 * Same rule as for dep_ipif: only a
					 * candidate from an earlier ill has
					 * already been refheld.
					 */
					if (saved_ipif->ipif_ill != ill)
						ipif_refrele(saved_ipif);
					saved_ipif = ipif;
				}
			}
		}
		/*
		 * Before going to the next ill, do a ipif_refhold() on the
		 * saved ones.
		 */
		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
			ipif_refhold_locked(saved_ipif);
		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
			ipif_refhold_locked(dep_ipif);
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * If we have only the saved_ipif, return it.  But if we have both
	 * saved_ipif and dep_ipif, check to see which one is better.
	 * The loser's reference is dropped; the winner's is handed to the
	 * caller.
	 */
	if (saved_ipif != NULL) {
		if (dep_ipif != NULL) {
			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
				ipif_refrele(saved_ipif);
				return (dep_ipif);
			} else {
				ipif_refrele(dep_ipif);
				return (saved_ipif);
			}
		}
		return (saved_ipif);
	} else {
		return (dep_ipif);
	}
}
   4520 
   4521 ill_t *
   4522 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
   4523 {
   4524 	ipif_t *ipif;
   4525 	ill_t *ill;
   4526 
   4527 	ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
   4528 	if (ipif == NULL)
   4529 		return (NULL);
   4530 
   4531 	ill = ipif->ipif_ill;
   4532 	ill_refhold(ill);
   4533 	ipif_refrele(ipif);
   4534 	return (ill);
   4535 }
   4536 
   4537 /*
   4538  * This function is called when an application does not specify an interface
   4539  * to be used for multicast traffic (joining a group/sending data).  It
   4540  * calls ire_lookup_multi() to look for an interface route for the
   4541  * specified multicast group.  Doing this allows the administrator to add
   4542  * prefix routes for multicast to indicate which interface to be used for
   4543  * multicast traffic in the above scenario.  The route could be for all
   4544  * multicast (224.0/4), for a single multicast group (a /32 route) or
   4545  * anything in between.  If there is no such multicast route, we just find
   4546  * any multicast capable interface and return it.  The returned ipif
   4547  * is refhold'ed.
   4548  *
   4549  * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
   4550  * unicast table. This is used by CGTP.
   4551  */
   4552 ill_t *
   4553 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
   4554     boolean_t *multirtp, ipaddr_t *setsrcp)
   4555 {
   4556 	ill_t			*ill;
   4557 
   4558 	ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
   4559 	if (ill != NULL)
   4560 		return (ill);
   4561 
   4562 	return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
   4563 }
   4564 
   4565 /*
   4566  * Look for an ipif with the specified interface address and destination.
   4567  * The destination address is used only for matching point-to-point interfaces.
   4568  */
   4569 ipif_t *
   4570 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
   4571 {
   4572 	ipif_t	*ipif;
   4573 	ill_t	*ill;
   4574 	ill_walk_context_t ctx;
   4575 
   4576 	/*
   4577 	 * First match all the point-to-point interfaces
   4578 	 * before looking at non-point-to-point interfaces.
   4579 	 * This is done to avoid returning non-point-to-point
   4580 	 * ipif instead of unnumbered point-to-point ipif.
   4581 	 */
   4582 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4583 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4584 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4585 		mutex_enter(&ill->ill_lock);
   4586 		for (ipif = ill->ill_ipif; ipif != NULL;
   4587 		    ipif = ipif->ipif_next) {
   4588 			/* Allow the ipif to be down */
   4589 			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4590 			    (ipif->ipif_lcl_addr == if_addr) &&
   4591 			    (ipif->ipif_pp_dst_addr == dst)) {
   4592 				if (!IPIF_IS_CONDEMNED(ipif)) {
   4593 					ipif_refhold_locked(ipif);
   4594 					mutex_exit(&ill->ill_lock);
   4595 					rw_exit(&ipst->ips_ill_g_lock);
   4596 					return (ipif);
   4597 				}
   4598 			}
   4599 		}
   4600 		mutex_exit(&ill->ill_lock);
   4601 	}
   4602 	rw_exit(&ipst->ips_ill_g_lock);
   4603 
   4604 	/* lookup the ipif based on interface address */
   4605 	ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
   4606 	ASSERT(ipif == NULL || !ipif->ipif_isv6);
   4607 	return (ipif);
   4608 }
   4609 
   4610 /*
   4611  * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
   4612  */
   4613 static ipif_t *
   4614 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
   4615     zoneid_t zoneid, ip_stack_t *ipst)
   4616 {
   4617 	ipif_t  *ipif;
   4618 	ill_t   *ill;
   4619 	boolean_t ptp = B_FALSE;
   4620 	ill_walk_context_t	ctx;
   4621 	boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
   4622 	boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
   4623 
   4624 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4625 	/*
   4626 	 * Repeat twice, first based on local addresses and
   4627 	 * next time for pointopoint.
   4628 	 */
   4629 repeat:
   4630 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4631 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4632 		if (match_ill != NULL && ill != match_ill &&
   4633 		    (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
   4634 			continue;
   4635 		}
   4636 		mutex_enter(&ill->ill_lock);
   4637 		for (ipif = ill->ill_ipif; ipif != NULL;
   4638 		    ipif = ipif->ipif_next) {
   4639 			if (zoneid != ALL_ZONES &&
   4640 			    zoneid != ipif->ipif_zoneid &&
   4641 			    ipif->ipif_zoneid != ALL_ZONES)
   4642 				continue;
   4643 
   4644 			if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
   4645 				continue;
   4646 
   4647 			/* Allow the ipif to be down */
   4648 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
   4649 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
   4650 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4651 			    (ipif->ipif_pp_dst_addr == addr))) {
   4652 				if (!IPIF_IS_CONDEMNED(ipif)) {
   4653 					ipif_refhold_locked(ipif);
   4654 					mutex_exit(&ill->ill_lock);
   4655 					rw_exit(&ipst->ips_ill_g_lock);
   4656 					return (ipif);
   4657 				}
   4658 			}
   4659 		}
   4660 		mutex_exit(&ill->ill_lock);
   4661 	}
   4662 
   4663 	/* If we already did the ptp case, then we are done */
   4664 	if (ptp) {
   4665 		rw_exit(&ipst->ips_ill_g_lock);
   4666 		return (NULL);
   4667 	}
   4668 	ptp = B_TRUE;
   4669 	goto repeat;
   4670 }
   4671 
   4672 /*
   4673  * Lookup an ipif with the specified address.  For point-to-point links we
   4674  * look for matches on either the destination address or the local address,
   4675  * but we skip the local address check if IPIF_UNNUMBERED is set.  If the
   4676  * `match_ill' argument is non-NULL, the lookup is restricted to that ill
   4677  * (or illgrp if `match_ill' is in an IPMP group).
   4678  */
   4679 ipif_t *
   4680 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
   4681     ip_stack_t *ipst)
   4682 {
   4683 	return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
   4684 	    zoneid, ipst));
   4685 }
   4686 
   4687 /*
   4688  * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
   4689  * except that we will only return an address if it is not marked as
   4690  * IPIF_DUPLICATE
   4691  */
   4692 ipif_t *
   4693 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
   4694     ip_stack_t *ipst)
   4695 {
   4696 	return (ipif_lookup_addr_common(addr, match_ill,
   4697 	    (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
   4698 	    zoneid, ipst));
   4699 }
   4700 
   4701 /*
   4702  * Special abbreviated version of ipif_lookup_addr() that doesn't match
   4703  * `match_ill' across the IPMP group.  This function is only needed in some
   4704  * corner-cases; almost everything should use ipif_lookup_addr().
   4705  */
   4706 ipif_t *
   4707 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
   4708 {
   4709 	ASSERT(match_ill != NULL);
   4710 	return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
   4711 	    ipst));
   4712 }
   4713 
   4714 /*
   4715  * Look for an ipif with the specified address. For point-point links
   4716  * we look for matches on either the destination address and the local
   4717  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
   4718  * is set.
   4719  * If the `match_ill' argument is non-NULL, the lookup is restricted to that
   4720  * ill (or illgrp if `match_ill' is in an IPMP group).
   4721  * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
   4722  */
   4723 zoneid_t
   4724 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
   4725 {
   4726 	zoneid_t zoneid;
   4727 	ipif_t  *ipif;
   4728 	ill_t   *ill;
   4729 	boolean_t ptp = B_FALSE;
   4730 	ill_walk_context_t	ctx;
   4731 
   4732 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4733 	/*
   4734 	 * Repeat twice, first based on local addresses and
   4735 	 * next time for pointopoint.
   4736 	 */
   4737 repeat:
   4738 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4739 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4740 		if (match_ill != NULL && ill != match_ill &&
   4741 		    !IS_IN_SAME_ILLGRP(ill, match_ill)) {
   4742 			continue;
   4743 		}
   4744 		mutex_enter(&ill->ill_lock);
   4745 		for (ipif = ill->ill_ipif; ipif != NULL;
   4746 		    ipif = ipif->ipif_next) {
   4747 			/* Allow the ipif to be down */
   4748 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
   4749 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
   4750 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4751 			    (ipif->ipif_pp_dst_addr == addr)) &&
   4752 			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   4753 				zoneid = ipif->ipif_zoneid;
   4754 				mutex_exit(&ill->ill_lock);
   4755 				rw_exit(&ipst->ips_ill_g_lock);
   4756 				/*
   4757 				 * If ipif_zoneid was ALL_ZONES then we have
   4758 				 * a trusted extensions shared IP address.
   4759 				 * In that case GLOBAL_ZONEID works to send.
   4760 				 */
   4761 				if (zoneid == ALL_ZONES)
   4762 					zoneid = GLOBAL_ZONEID;
   4763 				return (zoneid);
   4764 			}
   4765 		}
   4766 		mutex_exit(&ill->ill_lock);
   4767 	}
   4768 
   4769 	/* If we already did the ptp case, then we are done */
   4770 	if (ptp) {
   4771 		rw_exit(&ipst->ips_ill_g_lock);
   4772 		return (ALL_ZONES);
   4773 	}
   4774 	ptp = B_TRUE;
   4775 	goto repeat;
   4776 }
   4777 
   4778 /*
   4779  * Look for an ipif that matches the specified remote address i.e. the
   4780  * ipif that would receive the specified packet.
   4781  * First look for directly connected interfaces and then do a recursive
   4782  * IRE lookup and pick the first ipif corresponding to the source address in the
   4783  * ire.
   4784  * Returns: held ipif
   4785  *
   4786  * This is only used for ICMP_ADDRESS_MASK_REQUESTs
   4787  */
   4788 ipif_t *
   4789 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
   4790 {
   4791 	ipif_t	*ipif;
   4792 
   4793 	ASSERT(!ill->ill_isv6);
   4794 
   4795 	/*
   4796 	 * Someone could be changing this ipif currently or change it
   4797 	 * after we return this. Thus  a few packets could use the old
   4798 	 * old values. However structure updates/creates (ire, ilg, ilm etc)
   4799 	 * will atomically be updated or cleaned up with the new value
   4800 	 * Thus we don't need a lock to check the flags or other attrs below.
   4801 	 */
   4802 	mutex_enter(&ill->ill_lock);
   4803 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4804 		if (IPIF_IS_CONDEMNED(ipif))
   4805 			continue;
   4806 		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
   4807 		    ipif->ipif_zoneid != ALL_ZONES)
   4808 			continue;
   4809 		/* Allow the ipif to be down */
   4810 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
   4811 			if ((ipif->ipif_pp_dst_addr == addr) ||
   4812 			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
   4813 			    ipif->ipif_lcl_addr == addr)) {
   4814 				ipif_refhold_locked(ipif);
   4815 				mutex_exit(&ill->ill_lock);
   4816 				return (ipif);
   4817 			}
   4818 		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
   4819 			ipif_refhold_locked(ipif);
   4820 			mutex_exit(&ill->ill_lock);
   4821 			return (ipif);
   4822 		}
   4823 	}
   4824 	mutex_exit(&ill->ill_lock);
   4825 	/*
   4826 	 * For a remote destination it isn't possible to nail down a particular
   4827 	 * ipif.
   4828 	 */
   4829 
   4830 	/* Pick the first interface */
   4831 	ipif = ipif_get_next_ipif(NULL, ill);
   4832 	return (ipif);
   4833 }
   4834 
   4835 /*
   4836  * This func does not prevent refcnt from increasing. But if
   4837  * the caller has taken steps to that effect, then this func
   4838  * can be used to determine whether the ill has become quiescent
   4839  */
   4840 static boolean_t
   4841 ill_is_quiescent(ill_t *ill)
   4842 {
   4843 	ipif_t	*ipif;
   4844 
   4845 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4846 
   4847 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4848 		if (ipif->ipif_refcnt != 0)
   4849 			return (B_FALSE);
   4850 	}
   4851 	if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
   4852 		return (B_FALSE);
   4853 	}
   4854 	return (B_TRUE);
   4855 }
   4856 
   4857 boolean_t
   4858 ill_is_freeable(ill_t *ill)
   4859 {
   4860 	ipif_t	*ipif;
   4861 
   4862 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4863 
   4864 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4865 		if (ipif->ipif_refcnt != 0) {
   4866 			return (B_FALSE);
   4867 		}
   4868 	}
   4869 	if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
   4870 		return (B_FALSE);
   4871 	}
   4872 	return (B_TRUE);
   4873 }
   4874 
   4875 /*
   4876  * This func does not prevent refcnt from increasing. But if
   4877  * the caller has taken steps to that effect, then this func
   4878  * can be used to determine whether the ipif has become quiescent
   4879  */
   4880 static boolean_t
   4881 ipif_is_quiescent(ipif_t *ipif)
   4882 {
   4883 	ill_t *ill;
   4884 
   4885 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   4886 
   4887 	if (ipif->ipif_refcnt != 0)
   4888 		return (B_FALSE);
   4889 
   4890 	ill = ipif->ipif_ill;
   4891 	if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
   4892 	    ill->ill_logical_down) {
   4893 		return (B_TRUE);
   4894 	}
   4895 
   4896 	/* This is the last ipif going down or being deleted on this ill */
   4897 	if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
   4898 		return (B_FALSE);
   4899 	}
   4900 
   4901 	return (B_TRUE);
   4902 }
   4903 
   4904 /*
   4905  * return true if the ipif can be destroyed: the ipif has to be quiescent
   4906  * with zero references from ire/ilm to it.
   4907  */
   4908 static boolean_t
   4909 ipif_is_freeable(ipif_t *ipif)
   4910 {
   4911 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   4912 	ASSERT(ipif->ipif_id != 0);
   4913 	return (ipif->ipif_refcnt == 0);
   4914 }
   4915 
/*
 * The ipif/ill/ire has been refreled. Do the tail processing.
 * Determine if the ipif or ill in question has become quiescent and if so
 * wakeup close and/or restart any queued pending ioctl that is waiting
 * for the ipif_down (or ill_down)
 *
 * Called with ill->ill_lock held; the lock is dropped on every path that
 * returns from this function.
 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
	mblk_t	*mp;
	conn_t	*connp;
	ipsq_t	*ipsq;
	ipxop_t	*ipx;
	ipif_t	*ipif;
	dl_notify_ind_t *dlindp;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
		/* ip_modclose() may be waiting */
		cv_broadcast(&ill->ill_cv);
	}

	/* Lock order used here: ill_lock, then ipsq_lock, then ipx_lock. */
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
		goto unlock;

	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);

	ipif = ipx->ipx_pending_ipif;
	if (ipif->ipif_ill != ill) 	/* wait is for another ill; bail */
		goto unlock;

	/*
	 * Check the condition the pending operation is waiting for; if it
	 * is not yet satisfied, leave the pending message queued.
	 */
	switch (ipx->ipx_waitfor) {
	case IPIF_DOWN:
		if (!ipif_is_quiescent(ipif))
			goto unlock;
		break;
	case IPIF_FREE:
		if (!ipif_is_freeable(ipif))
			goto unlock;
		break;
	case ILL_DOWN:
		if (!ill_is_quiescent(ill))
			goto unlock;
		break;
	case ILL_FREE:
		/*
		 * ILL_FREE is only for loopback; normal ill teardown waits
		 * synchronously in ip_modclose() without using ipx_waitfor,
		 * handled by the cv_broadcast() at the top of this function.
		 */
		if (!ill_is_freeable(ill))
			goto unlock;
		break;
	default:
		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
		    (void *)ipsq, ipx->ipx_waitfor);
	}

	/*
	 * The wait condition is met: take the pending message off the ipsq
	 * and drop all three locks before dispatching it.
	 */
	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
	mutex_exit(&ipx->ipx_lock);
	mp = ipsq_pending_mp_get(ipsq, &connp);
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ill->ill_lock);

	ASSERT(mp != NULL);
	/*
	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
	 * we can only get here when the current operation decides it
	 * it needs to quiesce via ipsq_pending_mp_add().
	 */
	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
	case M_PROTO:
		/*
		 * For now, only DL_NOTIFY_IND messages can use this facility.
		 */
		dlindp = (dl_notify_ind_t *)mp->b_rptr;
		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

		switch (dlindp->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
			return;
		case DL_NOTE_REPLUMB:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_replumb_tail, CUR_OP, B_TRUE);
			return;
		default:
			ASSERT(0);
			/*
			 * No qwriter_ip() call was made, so drop the hold
			 * taken for it above.
			 */
			ill_refrele(ill);
		}
		break;

	case M_ERROR:
	case M_HANGUP:
		qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
		    B_TRUE);
		return;

	case M_IOCTL:
	case M_IOCDATA:
		/* Restart the ioctl on the conn's write queue if there is one. */
		qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
		return;

	default:
		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
	}
	return;
unlock:
	/* Nothing to restart: release everything acquired above. */
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ill->ill_lock);
}
   5037 
   5038 #ifdef DEBUG
   5039 /* Reuse trace buffer from beginning (if reached the end) and record trace */
   5040 static void
   5041 th_trace_rrecord(th_trace_t *th_trace)
   5042 {
   5043 	tr_buf_t *tr_buf;
   5044 	uint_t lastref;
   5045 
   5046 	lastref = th_trace->th_trace_lastref;
   5047 	lastref++;
   5048 	if (lastref == TR_BUF_MAX)
   5049 		lastref = 0;
   5050 	th_trace->th_trace_lastref = lastref;
   5051 	tr_buf = &th_trace->th_trbuf[lastref];
   5052 	tr_buf->tr_time = ddi_get_lbolt();
   5053 	tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
   5054 }
   5055 
   5056 static void
   5057 th_trace_free(void *value)
   5058 {
   5059 	th_trace_t *th_trace = value;
   5060 
   5061 	ASSERT(th_trace->th_refcnt == 0);
   5062 	kmem_free(th_trace, sizeof (*th_trace));
   5063 }
   5064 
   5065 /*
   5066  * Find or create the per-thread hash table used to track object references.
   5067  * The ipst argument is NULL if we shouldn't allocate.
   5068  *
   5069  * Accesses per-thread data, so there's no need to lock here.
   5070  */
   5071 static mod_hash_t *
   5072 th_trace_gethash(ip_stack_t *ipst)
   5073 {
   5074 	th_hash_t *thh;
   5075 
   5076 	if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
   5077 		mod_hash_t *mh;
   5078 		char name[256];
   5079 		size_t objsize, rshift;
   5080 		int retv;
   5081 
   5082 		if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
   5083 			return (NULL);
   5084 		(void) snprintf(name, sizeof (name), "th_trace_%p",
   5085 		    (void *)curthread);
   5086 
   5087 		/*
   5088 		 * We use mod_hash_create_extended here rather than the more
   5089 		 * obvious mod_hash_create_ptrhash because the latter has a
   5090 		 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
   5091 		 * block.
   5092 		 */
   5093 		objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
   5094 		    MAX(sizeof (ire_t), sizeof (ncec_t)));
   5095 		rshift = highbit(objsize);
   5096 		mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
   5097 		    th_trace_free, mod_hash_byptr, (void *)rshift,
   5098 		    mod_hash_ptrkey_cmp, KM_NOSLEEP);
   5099 		if (mh == NULL) {
   5100 			kmem_free(thh, sizeof (*thh));
   5101 			return (NULL);
   5102 		}
   5103 		thh->thh_hash = mh;
   5104 		thh->thh_ipst = ipst;
   5105 		/*
   5106 		 * We trace ills, ipifs, ires, and nces.  All of these are
   5107 		 * per-IP-stack, so the lock on the thread list is as well.
   5108 		 */
   5109 		rw_enter(&ip_thread_rwlock, RW_WRITER);
   5110 		list_insert_tail(&ip_thread_list, thh);
   5111 		rw_exit(&ip_thread_rwlock);
   5112 		retv = tsd_set(ip_thread_data, thh);
   5113 		ASSERT(retv == 0);
   5114 	}
   5115 	return (thh != NULL ? thh->thh_hash : NULL);
   5116 }
   5117 
   5118 boolean_t
   5119 th_trace_ref(const void *obj, ip_stack_t *ipst)
   5120 {
   5121 	th_trace_t *th_trace;
   5122 	mod_hash_t *mh;
   5123 	mod_hash_val_t val;
   5124 
   5125 	if ((mh = th_trace_gethash(ipst)) == NULL)
   5126 		return (B_FALSE);
   5127 
   5128 	/*
   5129 	 * Attempt to locate the trace buffer for this obj and thread.
   5130 	 * If it does not exist, then allocate a new trace buffer and
   5131 	 * insert into the hash.
   5132 	 */
   5133 	if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
   5134 		th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
   5135 		if (th_trace == NULL)
   5136 			return (B_FALSE);
   5137 
   5138 		th_trace->th_id = curthread;
   5139 		if (mod_hash_insert(mh, (mod_hash_key_t)obj,
   5140 		    (mod_hash_val_t)th_trace) != 0) {
   5141 			kmem_free(th_trace, sizeof (th_trace_t));
   5142 			return (B_FALSE);
   5143 		}
   5144 	} else {
   5145 		th_trace = (th_trace_t *)val;
   5146 	}
   5147 
   5148 	ASSERT(th_trace->th_refcnt >= 0 &&
   5149 	    th_trace->th_refcnt < TR_BUF_MAX - 1);
   5150 
   5151 	th_trace->th_refcnt++;
   5152 	th_trace_rrecord(th_trace);
   5153 	return (B_TRUE);
   5154 }
   5155 
   5156 /*
   5157  * For the purpose of tracing a reference release, we assume that global
   5158  * tracing is always on and that the same thread initiated the reference hold
   5159  * is releasing.
   5160  */
   5161 void
   5162 th_trace_unref(const void *obj)
   5163 {
   5164 	int retv;
   5165 	mod_hash_t *mh;
   5166 	th_trace_t *th_trace;
   5167 	mod_hash_val_t val;
   5168 
   5169 	mh = th_trace_gethash(NULL);
   5170 	retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
   5171 	ASSERT(retv == 0);
   5172 	th_trace = (th_trace_t *)val;
   5173 
   5174 	ASSERT(th_trace->th_refcnt > 0);
   5175 	th_trace->th_refcnt--;
   5176 	th_trace_rrecord(th_trace);
   5177 }
   5178 
   5179 /*
   5180  * If tracing has been disabled, then we assume that the reference counts are
   5181  * now useless, and we clear them out before destroying the entries.
   5182  */
   5183 void
   5184 th_trace_cleanup(const void *obj, boolean_t trace_disable)
   5185 {
   5186 	th_hash_t	*thh;
   5187 	mod_hash_t	*mh;
   5188 	mod_hash_val_t	val;
   5189 	th_trace_t	*th_trace;
   5190 	int		retv;
   5191 
   5192 	rw_enter(&ip_thread_rwlock, RW_READER);
   5193 	for (thh = list_head(&ip_thread_list); thh != NULL;
   5194 	    thh = list_next(&ip_thread_list, thh)) {
   5195 		if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
   5196 		    &val) == 0) {
   5197 			th_trace = (th_trace_t *)val;
   5198 			if (trace_disable)
   5199 				th_trace->th_refcnt = 0;
   5200 			retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
   5201 			ASSERT(retv == 0);
   5202 		}
   5203 	}
   5204 	rw_exit(&ip_thread_rwlock);
   5205 }
   5206 
   5207 void
   5208 ipif_trace_ref(ipif_t *ipif)
   5209 {
   5210 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5211 
   5212 	if (ipif->ipif_trace_disable)
   5213 		return;
   5214 
   5215 	if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
   5216 		ipif->ipif_trace_disable = B_TRUE;
   5217 		ipif_trace_cleanup(ipif);
   5218 	}
   5219 }
   5220 
   5221 void
   5222 ipif_untrace_ref(ipif_t *ipif)
   5223 {
   5224 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5225 
   5226 	if (!ipif->ipif_trace_disable)
   5227 		th_trace_unref(ipif);
   5228 }
   5229 
   5230 void
   5231 ill_trace_ref(ill_t *ill)
   5232 {
   5233 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   5234 
   5235 	if (ill->ill_trace_disable)
   5236 		return;
   5237 
   5238 	if (!th_trace_ref(ill, ill->ill_ipst)) {
   5239 		ill->ill_trace_disable = B_TRUE;
   5240 		ill_trace_cleanup(ill);
   5241 	}
   5242 }
   5243 
   5244 void
   5245 ill_untrace_ref(ill_t *ill)
   5246 {
   5247 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   5248 
   5249 	if (!ill->ill_trace_disable)
   5250 		th_trace_unref(ill);
   5251 }
   5252 
   5253 /*
   5254  * Called when ipif is unplumbed or when memory alloc fails.  Note that on
   5255  * failure, ipif_trace_disable is set.
   5256  */
   5257 static void
   5258 ipif_trace_cleanup(const ipif_t *ipif)
   5259 {
   5260 	th_trace_cleanup(ipif, ipif->ipif_trace_disable);
   5261 }
   5262 
   5263 /*
   5264  * Called when ill is unplumbed or when memory alloc fails.  Note that on
   5265  * failure, ill_trace_disable is set.
   5266  */
   5267 static void
   5268 ill_trace_cleanup(const ill_t *ill)
   5269 {
   5270 	th_trace_cleanup(ill, ill->ill_trace_disable);
   5271 }
   5272 #endif /* DEBUG */
   5273 
   5274 void
   5275 ipif_refhold_locked(ipif_t *ipif)
   5276 {
   5277 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5278 	ipif->ipif_refcnt++;
   5279 	IPIF_TRACE_REF(ipif);
   5280 }
   5281 
   5282 void
   5283 ipif_refhold(ipif_t *ipif)
   5284 {
   5285 	ill_t	*ill;
   5286 
   5287 	ill = ipif->ipif_ill;
   5288 	mutex_enter(&ill->ill_lock);
   5289 	ipif->ipif_refcnt++;
   5290 	IPIF_TRACE_REF(ipif);
   5291 	mutex_exit(&ill->ill_lock);
   5292 }
   5293 
   5294 /*
   5295  * Must not be called while holding any locks. Otherwise if this is
   5296  * the last reference to be released there is a chance of recursive mutex
   5297  * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
   5298  * to restart an ioctl.
   5299  */
   5300 void
   5301 ipif_refrele(ipif_t *ipif)
   5302 {
   5303 	ill_t	*ill;
   5304 
   5305 	ill = ipif->ipif_ill;
   5306 
   5307 	mutex_enter(&ill->ill_lock);
   5308 	ASSERT(ipif->ipif_refcnt != 0);
   5309 	ipif->ipif_refcnt--;
   5310 	IPIF_UNTRACE_REF(ipif);
   5311 	if (ipif->ipif_refcnt != 0) {
   5312 		mutex_exit(&ill->ill_lock);
   5313 		return;
   5314 	}
   5315 
   5316 	/* Drops the ill_lock */
   5317 	ipif_ill_refrele_tail(ill);
   5318 }
   5319 
   5320 ipif_t *
   5321 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
   5322 {
   5323 	ipif_t	*ipif;
   5324 
   5325 	mutex_enter(&ill->ill_lock);
   5326 	for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
   5327 	    ipif != NULL; ipif = ipif->ipif_next) {
   5328 		if (IPIF_IS_CONDEMNED(ipif))
   5329 			continue;
   5330 		ipif_refhold_locked(ipif);
   5331 		mutex_exit(&ill->ill_lock);
   5332 		return (ipif);
   5333 	}
   5334 	mutex_exit(&ill->ill_lock);
   5335 	return (NULL);
   5336 }
   5337 
   5338 /*
   5339  * TODO: make this table extendible at run time
   5340  * Return a pointer to the mac type info for 'mac_type'
   5341  */
   5342 static ip_m_t *
   5343 ip_m_lookup(t_uscalar_t mac_type)
   5344 {
   5345 	ip_m_t	*ipm;
   5346 
   5347 	for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
   5348 		if (ipm->ip_m_mac_type == mac_type)
   5349 			return (ipm);
   5350 	return (NULL);
   5351 }
   5352 
   5353 /*
   5354  * Make a link layer address from the multicast IP address *addr.
   5355  * To form the link layer address, invoke the ip_m_v*mapping function
   5356  * associated with the link-layer type.
   5357  */
   5358 void
   5359 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
   5360 {
   5361 	ip_m_t *ipm;
   5362 
   5363 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
   5364 		return;
   5365 
   5366 	ASSERT(addr != NULL);
   5367 
   5368 	ipm = ip_m_lookup(ill->ill_mactype);
   5369 	if (ipm == NULL ||
   5370 	    (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
   5371 	    (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
   5372 		ip0dbg(("no mapping for ill %s mactype 0x%x\n",
   5373 		    ill->ill_name, ill->ill_mactype));
   5374 		return;
   5375 	}
   5376 	if (ill->ill_isv6)
   5377 		(*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
   5378 	else
   5379 		(*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
   5380 }
   5381 
   5382 /*
   5383  * ip_rt_add is called to add an IPv4 route to the forwarding table.
   5384  * ill is passed in to associate it with the correct interface.
   5385  * If ire_arg is set, then we return the held IRE in that location.
   5386  */
   5387 int
   5388 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
   5389     ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
   5390     boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
   5391 {
   5392 	ire_t	*ire, *nire;
   5393 	ire_t	*gw_ire = NULL;
   5394 	ipif_t	*ipif = NULL;
   5395 	uint_t	type;
   5396 	int	match_flags = MATCH_IRE_TYPE;
   5397 	tsol_gc_t *gc = NULL;
   5398 	tsol_gcgrp_t *gcgrp = NULL;
   5399 	boolean_t gcgrp_xtraref = B_FALSE;
   5400 	boolean_t cgtp_broadcast;
   5401 
   5402 	ip1dbg(("ip_rt_add:"));
   5403 
   5404 	if (ire_arg != NULL)
   5405 		*ire_arg = NULL;
   5406 
   5407 	/*
   5408 	 * If this is the case of RTF_HOST being set, then we set the netmask
   5409 	 * to all ones (regardless if one was supplied).
   5410 	 */
   5411 	if (flags & RTF_HOST)
   5412 		mask = IP_HOST_MASK;
   5413 
   5414 	/*
   5415 	 * Prevent routes with a zero gateway from being created (since
   5416 	 * interfaces can currently be plumbed and brought up no assigned
   5417 	 * address).
   5418 	 */
   5419 	if (gw_addr == 0)
   5420 		return (ENETUNREACH);
   5421 	/*
   5422 	 * Get the ipif, if any, corresponding to the gw_addr
   5423 	 * If -ifp was specified we restrict ourselves to the ill, otherwise
   5424 	 * we match on the gatway and destination to handle unnumbered pt-pt
   5425 	 * interfaces.
   5426 	 */
   5427 	if (ill != NULL)
   5428 		ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
   5429 	else
   5430 		ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
   5431 	if (ipif != NULL) {
   5432 		if (IS_VNI(ipif->ipif_ill)) {
   5433 			ipif_refrele(ipif);
   5434 			return (EINVAL);
   5435 		}
   5436 	}
   5437 
   5438 	/*
   5439 	 * GateD will attempt to create routes with a loopback interface
   5440 	 * address as the gateway and with RTF_GATEWAY set.  We allow
   5441 	 * these routes to be added, but create them as interface routes
   5442 	 * since the gateway is an interface address.
   5443 	 */
   5444 	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
   5445 		flags &= ~RTF_GATEWAY;
   5446 		if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
   5447 		    mask == IP_HOST_MASK) {
   5448 			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
   5449 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
   5450 			    NULL);
   5451 			if (ire != NULL) {
   5452 				ire_refrele(ire);
   5453 				ipif_refrele(ipif);
   5454 				return (EEXIST);
   5455 			}
   5456 			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
   5457 			    "for 0x%x\n", (void *)ipif,
   5458 			    ipif->ipif_ire_type,
   5459 			    ntohl(ipif->ipif_lcl_addr)));
   5460 			ire = ire_create(
   5461 			    (uchar_t *)&dst_addr,	/* dest address */
   5462 			    (uchar_t *)&mask,		/* mask */
   5463 			    NULL,			/* no gateway */
   5464 			    ipif->ipif_ire_type,	/* LOOPBACK */
   5465 			    ipif->ipif_ill,
   5466 			    zoneid,
   5467 			    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
   5468 			    NULL,
   5469 			    ipst);
   5470 
   5471 			if (ire == NULL) {
   5472 				ipif_refrele(ipif);
   5473 				return (ENOMEM);
   5474 			}
   5475 			/* src address assigned by the caller? */
   5476 			if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5477 				ire->ire_setsrc_addr = src_addr;
   5478 
   5479 			nire = ire_add(ire);
   5480 			if (nire == NULL) {
   5481 				/*
   5482 				 * In the result of failure, ire_add() will have
   5483 				 * already deleted the ire in question, so there
   5484 				 * is no need to do that here.
   5485 				 */
   5486 				ipif_refrele(ipif);
   5487 				return (ENOMEM);
   5488 			}
   5489 			/*
   5490 			 * Check if it was a duplicate entry. This handles
   5491 			 * the case of two racing route adds for the same route
   5492 			 */
   5493 			if (nire != ire) {
   5494 				ASSERT(nire->ire_identical_ref > 1);
   5495 				ire_delete(nire);
   5496 				ire_refrele(nire);
   5497 				ipif_refrele(ipif);
   5498 				return (EEXIST);
   5499 			}
   5500 			ire = nire;
   5501 			goto save_ire;
   5502 		}
   5503 	}
   5504 
   5505 	/*
   5506 	 * The routes for multicast with CGTP are quite special in that
   5507 	 * the gateway is the local interface address, yet RTF_GATEWAY
   5508 	 * is set. We turn off RTF_GATEWAY to provide compatibility with
   5509 	 * this undocumented and unusual use of multicast routes.
   5510 	 */
   5511 	if ((flags & RTF_MULTIRT) && ipif != NULL)
   5512 		flags &= ~RTF_GATEWAY;
   5513 
   5514 	/*
   5515 	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
   5516 	 * and the gateway address provided is one of the system's interface
   5517 	 * addresses.  By using the routing socket interface and supplying an
   5518 	 * RTA_IFP sockaddr with an interface index, an alternate method of
   5519 	 * specifying an interface route to be created is available which uses
   5520 	 * the interface index that specifies the outgoing interface rather than
   5521 	 * the address of an outgoing interface (which may not be able to
   5522 	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
   5523 	 * flag, routes can be specified which not only specify the next-hop to
   5524 	 * be used when routing to a certain prefix, but also which outgoing
   5525 	 * interface should be used.
   5526 	 *
   5527 	 * Previously, interfaces would have unique addresses assigned to them
   5528 	 * and so the address assigned to a particular interface could be used
   5529 	 * to identify a particular interface.  One exception to this was the
   5530 	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
   5531 	 *
   5532 	 * With the advent of IPv6 and its link-local addresses, this
   5533 	 * restriction was relaxed and interfaces could share addresses between
   5534 	 * themselves.  In fact, typically all of the link-local interfaces on
   5535 	 * an IPv6 node or router will have the same link-local address.  In
   5536 	 * order to differentiate between these interfaces, the use of an
   5537 	 * interface index is necessary and this index can be carried inside a
   5538 	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
   5539 	 * of using the interface index, however, is that all of the ipif's that
   5540 	 * are part of an ill have the same index and so the RTA_IFP sockaddr
   5541 	 * cannot be used to differentiate between ipif's (or logical
   5542 	 * interfaces) that belong to the same ill (physical interface).
   5543 	 *
   5544 	 * For example, in the following case involving IPv4 interfaces and
   5545 	 * logical interfaces
   5546 	 *
   5547 	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
   5548 	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0
   5549 	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0
   5550 	 *
   5551 	 * the ipif's corresponding to each of these interface routes can be
   5552 	 * uniquely identified by the "gateway" (actually interface address).
   5553 	 *
   5554 	 * In this case involving multiple IPv6 default routes to a particular
   5555 	 * link-local gateway, the use of RTA_IFP is necessary to specify which
   5556 	 * default route is of interest:
   5557 	 *
   5558 	 *	default		fe80::123:4567:89ab:cdef	U	if0
   5559 	 *	default		fe80::123:4567:89ab:cdef	U	if1
   5560 	 */
   5561 
   5562 	/* RTF_GATEWAY not set */
   5563 	if (!(flags & RTF_GATEWAY)) {
   5564 		if (sp != NULL) {
   5565 			ip2dbg(("ip_rt_add: gateway security attributes "
   5566 			    "cannot be set with interface route\n"));
   5567 			if (ipif != NULL)
   5568 				ipif_refrele(ipif);
   5569 			return (EINVAL);
   5570 		}
   5571 
   5572 		/*
   5573 		 * Whether or not ill (RTA_IFP) is set, we require that
   5574 		 * the gateway is one of our local addresses.
   5575 		 */
   5576 		if (ipif == NULL)
   5577 			return (ENETUNREACH);
   5578 
   5579 		/*
   5580 		 * We use MATCH_IRE_ILL here. If the caller specified an
   5581 		 * interface (from the RTA_IFP sockaddr) we use it, otherwise
   5582 		 * we use the ill derived from the gateway address.
   5583 		 * We can always match the gateway address since we record it
   5584 		 * in ire_gateway_addr.
   5585 		 * We don't allow RTA_IFP to specify a different ill than the
   5586 		 * one matching the ipif to make sure we can delete the route.
   5587 		 */
   5588 		match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
   5589 		if (ill == NULL) {
   5590 			ill = ipif->ipif_ill;
   5591 		} else if (ill != ipif->ipif_ill) {
   5592 			ipif_refrele(ipif);
   5593 			return (EINVAL);
   5594 		}
   5595 
   5596 		/*
   5597 		 * We check for an existing entry at this point.
   5598 		 *
   5599 		 * Since a netmask isn't passed in via the ioctl interface
   5600 		 * (SIOCADDRT), we don't check for a matching netmask in that
   5601 		 * case.
   5602 		 */
   5603 		if (!ioctl_msg)
   5604 			match_flags |= MATCH_IRE_MASK;
   5605 		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
   5606 		    IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
   5607 		    NULL);
   5608 		if (ire != NULL) {
   5609 			ire_refrele(ire);
   5610 			ipif_refrele(ipif);
   5611 			return (EEXIST);
   5612 		}
   5613 
   5614 		/*
   5615 		 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or
   5616 		 * IRE_IF_RESOLVER with the modified address, netmask, and
   5617 		 * gateway.
   5618 		 */
   5619 		ire = ire_create(
   5620 		    (uchar_t *)&dst_addr,
   5621 		    (uint8_t *)&mask,
   5622 		    (uint8_t *)&gw_addr,
   5623 		    ill->ill_net_type,
   5624 		    ill,
   5625 		    zoneid,
   5626 		    flags,
   5627 		    NULL,
   5628 		    ipst);
   5629 		if (ire == NULL) {
   5630 			ipif_refrele(ipif);
   5631 			return (ENOMEM);
   5632 		}
   5633 
   5634 		/*
   5635 		 * Some software (for example, GateD and Sun Cluster) attempts
   5636 		 * to create (what amount to) IRE_PREFIX routes with the
   5637 		 * loopback address as the gateway.  This is primarily done to
   5638 		 * set up prefixes with the RTF_REJECT flag set (for example,
   5639 		 * when generating aggregate routes.)
   5640 		 *
   5641 		 * If the IRE type (as defined by ill->ill_net_type) is
   5642 		 * IRE_LOOPBACK, then we map the request into a
   5643 		 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
   5644 		 * these interface routes, by definition, can only be that.
   5645 		 *
   5646 		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
   5647 		 * routine, but rather using ire_create() directly.
   5648 		 *
   5649 		 */
   5650 		if (ill->ill_net_type == IRE_LOOPBACK) {
   5651 			ire->ire_type = IRE_IF_NORESOLVER;
   5652 			ire->ire_flags |= RTF_BLACKHOLE;
   5653 		}
   5654 
   5655 		/* src address assigned by the caller? */
   5656 		if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5657 			ire->ire_setsrc_addr = src_addr;
   5658 
   5659 		nire = ire_add(ire);
   5660 		if (nire == NULL) {
   5661 			/*
   5662 			 * In the result of failure, ire_add() will have
   5663 			 * already deleted the ire in question, so there
   5664 			 * is no need to do that here.
   5665 			 */
   5666 			ipif_refrele(ipif);
   5667 			return (ENOMEM);
   5668 		}
   5669 		/*
   5670 		 * Check if it was a duplicate entry. This handles
   5671 		 * the case of two racing route adds for the same route
   5672 		 */
   5673 		if (nire != ire) {
   5674 			ire_delete(nire);
   5675 			ire_refrele(nire);
   5676 			ipif_refrele(ipif);
   5677 			return (EEXIST);
   5678 		}
   5679 		ire = nire;
   5680 		goto save_ire;
   5681 	}
   5682 
   5683 	/*
   5684 	 * Get an interface IRE for the specified gateway.
   5685 	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
   5686 	 * gateway, it is currently unreachable and we fail the request
   5687 	 * accordingly.
   5688 	 * If RTA_IFP was specified we look on that particular ill.
   5689 	 */
   5690 	if (ill != NULL)
   5691 		match_flags |= MATCH_IRE_ILL;
   5692 
   5693 	/* Check whether the gateway is reachable. */
   5694 again:
   5695 	type = IRE_INTERFACE;
   5696 	if (flags & RTF_INDIRECT)
   5697 		type |= IRE_OFFLINK;
   5698 
   5699 	gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
   5700 	    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
   5701 	if (gw_ire == NULL) {
   5702 		/*
   5703 		 * With IPMP, we allow host routes to influence in.mpathd's
   5704 		 * target selection.  However, if the test addresses are on
   5705 		 * their own network, the above lookup will fail since the
   5706 		 * underlying IRE_INTERFACEs are marked hidden.  So allow
   5707 		 * hidden test IREs to be found and try again.
   5708 		 */
   5709 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))  {
   5710 			match_flags |= MATCH_IRE_TESTHIDDEN;
   5711 			goto again;
   5712 		}
   5713 
   5714 		if (ipif != NULL)
   5715 			ipif_refrele(ipif);
   5716 		return (ENETUNREACH);
   5717 	}
   5718 
   5719 	/*
   5720 	 * We create one of three types of IREs as a result of this request
   5721 	 * based on the netmask.  A netmask of all ones (which is automatically
   5722 	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
   5723 	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
   5724 	 * created.  Otherwise, an IRE_PREFIX route is created for the
   5725 	 * destination prefix.
   5726 	 */
   5727 	if (mask == IP_HOST_MASK)
   5728 		type = IRE_HOST;
   5729 	else if (mask == 0)
   5730 		type = IRE_DEFAULT;
   5731 	else
   5732 		type = IRE_PREFIX;
   5733 
   5734 	/* check for a duplicate entry */
   5735 	ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
   5736 	    ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
   5737 	    0, ipst, NULL);
   5738 	if (ire != NULL) {
   5739 		if (ipif != NULL)
   5740 			ipif_refrele(ipif);
   5741 		ire_refrele(gw_ire);
   5742 		ire_refrele(ire);
   5743 		return (EEXIST);
   5744 	}
   5745 
   5746 	/* Security attribute exists */
   5747 	if (sp != NULL) {
   5748 		tsol_gcgrp_addr_t ga;
   5749 
   5750 		/* find or create the gateway credentials group */
   5751 		ga.ga_af = AF_INET;
   5752 		IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
   5753 
   5754 		/* we hold reference to it upon success */
   5755 		gcgrp = gcgrp_lookup(&ga, B_TRUE);
   5756 		if (gcgrp == NULL) {
   5757 			if (ipif != NULL)
   5758 				ipif_refrele(ipif);
   5759 			ire_refrele(gw_ire);
   5760 			return (ENOMEM);
   5761 		}
   5762 
   5763 		/*
   5764 		 * Create and add the security attribute to the group; a
   5765 		 * reference to the group is made upon allocating a new
   5766 		 * entry successfully.  If it finds an already-existing
   5767 		 * entry for the security attribute in the group, it simply
   5768 		 * returns it and no new reference is made to the group.
   5769 		 */
   5770 		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
   5771 		if (gc == NULL) {
   5772 			if (ipif != NULL)
   5773 				ipif_refrele(ipif);
   5774 			/* release reference held by gcgrp_lookup */
   5775 			GCGRP_REFRELE(gcgrp);
   5776 			ire_refrele(gw_ire);
   5777 			return (ENOMEM);
   5778 		}
   5779 	}
   5780 
   5781 	/* Create the IRE. */
   5782 	ire = ire_create(
   5783 	    (uchar_t *)&dst_addr,		/* dest address */
   5784 	    (uchar_t *)&mask,			/* mask */
   5785 	    (uchar_t *)&gw_addr,		/* gateway address */
   5786 	    (ushort_t)type,			/* IRE type */
   5787 	    ill,
   5788 	    zoneid,
   5789 	    flags,
   5790 	    gc,					/* security attribute */
   5791 	    ipst);
   5792 
   5793 	/*
   5794 	 * The ire holds a reference to the 'gc' and the 'gc' holds a
   5795 	 * reference to the 'gcgrp'. We can now release the extra reference
   5796 	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
   5797 	 */
   5798 	if (gcgrp_xtraref)
   5799 		GCGRP_REFRELE(gcgrp);
   5800 	if (ire == NULL) {
   5801 		if (gc != NULL)
   5802 			GC_REFRELE(gc);
   5803 		if (ipif != NULL)
   5804 			ipif_refrele(ipif);
   5805 		ire_refrele(gw_ire);
   5806 		return (ENOMEM);
   5807 	}
   5808 
   5809 	/* Before we add, check if an extra CGTP broadcast is needed */
   5810 	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
   5811 	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
   5812 
   5813 	/* src address assigned by the caller? */
   5814 	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5815 		ire->ire_setsrc_addr = src_addr;
   5816 
   5817 	/*
   5818 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
   5819 	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
   5820 	 */
   5821 
   5822 	/* Add the new IRE. */
   5823 	nire = ire_add(ire);
   5824 	if (nire == NULL) {
   5825 		/*
   5826 		 * In the result of failure, ire_add() will have
   5827 		 * already deleted the ire in question, so there
   5828 		 * is no need to do that here.
   5829 		 */
   5830 		if (ipif != NULL)
   5831 			ipif_refrele(ipif);
   5832 		ire_refrele(gw_ire);
   5833 		return (ENOMEM);
   5834 	}
   5835 	/*
   5836 	 * Check if it was a duplicate entry. This handles
   5837 	 * the case of two racing route adds for the same route
   5838 	 */
   5839 	if (nire != ire) {
   5840 		ire_delete(nire);
   5841 		ire_refrele(nire);
   5842 		if (ipif != NULL)
   5843 			ipif_refrele(ipif);
   5844 		ire_refrele(gw_ire);
   5845 		return (EEXIST);
   5846 	}
   5847 	ire = nire;
   5848 
   5849 	if (flags & RTF_MULTIRT) {
   5850 		/*
   5851 		 * Invoke the CGTP (multirouting) filtering module
   5852 		 * to add the dst address in the filtering database.
   5853 		 * Replicated inbound packets coming from that address
   5854 		 * will be filtered to discard the duplicates.
   5855 		 * It is not necessary to call the CGTP filter hook
   5856 		 * when the dst address is a broadcast or multicast,
   5857 		 * because an IP source address cannot be a broadcast
   5858 		 * or a multicast.
   5859 		 */
   5860 		if (cgtp_broadcast) {
   5861 			ip_cgtp_bcast_add(ire, ipst);
   5862 			goto save_ire;
   5863 		}
   5864 		if (ipst->ips_ip_cgtp_filter_ops != NULL &&
   5865 		    !CLASSD(ire->ire_addr)) {
   5866 			int res;
   5867 			ipif_t *src_ipif;
   5868 
   5869 			/* Find the source address corresponding to gw_ire */
   5870 			src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
   5871 			    NULL, zoneid, ipst);
   5872 			if (src_ipif != NULL) {
   5873 				res = ipst->ips_ip_cgtp_filter_ops->
   5874 				    cfo_add_dest_v4(
   5875 				    ipst->ips_netstack->netstack_stackid,
   5876 				    ire->ire_addr,
   5877 				    ire->ire_gateway_addr,
   5878 				    ire->ire_setsrc_addr,
   5879 				    src_ipif->ipif_lcl_addr);
   5880 				ipif_refrele(src_ipif);
   5881 			} else {
   5882 				res = EADDRNOTAVAIL;
   5883 			}
   5884 			if (res != 0) {
   5885 				if (ipif != NULL)
   5886 					ipif_refrele(ipif);
   5887 				ire_refrele(gw_ire);
   5888 				ire_delete(ire);
   5889 				ire_refrele(ire);	/* Held in ire_add */
   5890 				return (res);
   5891 			}
   5892 		}
   5893 	}
   5894 
   5895 save_ire:
   5896 	if (gw_ire != NULL) {
   5897 		ire_refrele(gw_ire);
   5898 		gw_ire = NULL;
   5899 	}
   5900 	if (ill != NULL) {
   5901 		/*
   5902 		 * Save enough information so that we can recreate the IRE if
   5903 		 * the interface goes down and then up.  The metrics associated
   5904 		 * with the route will be saved as well when rts_setmetrics() is
   5905 		 * called after the IRE has been created.  In the case where
   5906 		 * memory cannot be allocated, none of this information will be
   5907 		 * saved.
   5908 		 */
   5909 		ill_save_ire(ill, ire);
   5910 	}
   5911 	if (ioctl_msg)
   5912 		ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
   5913 	if (ire_arg != NULL) {
   5914 		/*
   5915 		 * Store the ire that was successfully added into where ire_arg
   5916 		 * points to so that callers don't have to look it up
   5917 		 * themselves (but they are responsible for ire_refrele()ing
   5918 		 * the ire when they are finished with it).
   5919 		 */
   5920 		*ire_arg = ire;
   5921 	} else {
   5922 		ire_refrele(ire);		/* Held in ire_add */
   5923 	}
   5924 	if (ipif != NULL)
   5925 		ipif_refrele(ipif);
   5926 	return (0);
   5927 }
   5928 
   5929 /*
   5930  * ip_rt_delete is called to delete an IPv4 route.
   5931  * ill is passed in to associate it with the correct interface.
   5932  */
   5933 /* ARGSUSED4 */
   5934 int
   5935 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
   5936     uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
   5937     ip_stack_t *ipst, zoneid_t zoneid)
   5938 {
   5939 	ire_t	*ire = NULL;
   5940 	ipif_t	*ipif;
   5941 	uint_t	type;
   5942 	uint_t	match_flags = MATCH_IRE_TYPE;
   5943 	int	err = 0;
   5944 
   5945 	ip1dbg(("ip_rt_delete:"));
   5946 	/*
   5947 	 * If this is the case of RTF_HOST being set, then we set the netmask
   5948 	 * to all ones.  Otherwise, we use the netmask if one was supplied.
   5949 	 */
   5950 	if (flags & RTF_HOST) {
   5951 		mask = IP_HOST_MASK;
   5952 		match_flags |= MATCH_IRE_MASK;
   5953 	} else if (rtm_addrs & RTA_NETMASK) {
   5954 		match_flags |= MATCH_IRE_MASK;
   5955 	}
   5956 
   5957 	/*
   5958 	 * Note that RTF_GATEWAY is never set on a delete, therefore
   5959 	 * we check if the gateway address is one of our interfaces first,
   5960 	 * and fall back on RTF_GATEWAY routes.
   5961 	 *
   5962 	 * This makes it possible to delete an original
   5963 	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
   5964 	 * However, we have RTF_KERNEL set on the ones created by ipif_up
   5965 	 * and those can not be deleted here.
   5966 	 *
   5967 	 * We use MATCH_IRE_ILL if we know the interface. If the caller
   5968 	 * specified an interface (from the RTA_IFP sockaddr) we use it,
   5969 	 * otherwise we use the ill derived from the gateway address.
   5970 	 * We can always match the gateway address since we record it
   5971 	 * in ire_gateway_addr.
   5972 	 *
   5973 	 * For more detail on specifying routes by gateway address and by
   5974 	 * interface index, see the comments in ip_rt_add().
   5975 	 */
   5976 	ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
   5977 	if (ipif != NULL) {
   5978 		ill_t	*ill_match;
   5979 
   5980 		if (ill != NULL)
   5981 			ill_match = ill;
   5982 		else
   5983 			ill_match = ipif->ipif_ill;
   5984 
   5985 		match_flags |= MATCH_IRE_ILL;
   5986 		if (ipif->ipif_ire_type == IRE_LOOPBACK) {
   5987 			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
   5988 			    ill_match, ALL_ZONES, NULL, match_flags, 0, ipst,
   5989 			    NULL);
   5990 		}
   5991 		if (ire == NULL) {
   5992 			match_flags |= MATCH_IRE_GW;
   5993 			ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
   5994 			    IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
   5995 			    match_flags, 0, ipst, NULL);
   5996 		}
   5997 		/* Avoid deleting routes created by kernel from an ipif */
   5998 		if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
   5999 			ire_refrele(ire);
   6000 			ire = NULL;
   6001 		}
   6002 
   6003 		/* Restore in case we didn't find a match */
   6004 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
   6005 	}
   6006 
   6007 	if (ire == NULL) {
   6008 		/*
   6009 		 * At this point, the gateway address is not one of our own
   6010 		 * addresses or a matching interface route was not found.  We
   6011 		 * set the IRE type to lookup based on whether
   6012 		 * this is a host route, a default route or just a prefix.
   6013 		 *
   6014 		 * If an ill was passed in, then the lookup is based on an
   6015 		 * interface index so MATCH_IRE_ILL is added to match_flags.
   6016 		 */
   6017 		match_flags |= MATCH_IRE_GW;
   6018 		if (ill != NULL)
   6019 			match_flags |= MATCH_IRE_ILL;
   6020 		if (mask == IP_HOST_MASK)
   6021 			type = IRE_HOST;
   6022 		else if (mask == 0)
   6023 			type = IRE_DEFAULT;
   6024 		else
   6025 			type = IRE_PREFIX;
   6026 		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
   6027 		    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
   6028 	}
   6029 
   6030 	if (ipif != NULL) {
   6031 		ipif_refrele(ipif);
   6032 		ipif = NULL;
   6033 	}
   6034 
   6035 	if (ire == NULL)
   6036 		return (ESRCH);
   6037 
   6038 	if (ire->ire_flags & RTF_MULTIRT) {
   6039 		/*
   6040 		 * Invoke the CGTP (multirouting) filtering module
   6041 		 * to remove the dst address from the filtering database.
   6042 		 * Packets coming from that address will no longer be
   6043 		 * filtered to remove duplicates.
   6044 		 */
   6045 		if (ipst->ips_ip_cgtp_filter_ops != NULL) {
   6046 			err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
   6047 			    ipst->ips_netstack->netstack_stackid,
   6048 			    ire->ire_addr, ire->ire_gateway_addr);
   6049 		}
   6050 		ip_cgtp_bcast_delete(ire, ipst);
   6051 	}
   6052 
   6053 	ill = ire->ire_ill;
   6054 	if (ill != NULL)
   6055 		ill_remove_saved_ire(ill, ire);
   6056 	if (ioctl_msg)
   6057 		ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
   6058 	ire_delete(ire);
   6059 	ire_refrele(ire);
   6060 	return (err);
   6061 }
   6062 
   6063 /*
   6064  * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
   6065  */
   6066 /* ARGSUSED */
   6067 int
   6068 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
   6069     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
   6070 {
   6071 	ipaddr_t dst_addr;
   6072 	ipaddr_t gw_addr;
   6073 	ipaddr_t mask;
   6074 	int error = 0;
   6075 	mblk_t *mp1;
   6076 	struct rtentry *rt;
   6077 	ipif_t *ipif = NULL;
   6078 	ip_stack_t	*ipst;
   6079 
   6080 	ASSERT(q->q_next == NULL);
   6081 	ipst = CONNQ_TO_IPST(q);
   6082 
   6083 	ip1dbg(("ip_siocaddrt:"));
   6084 	/* Existence of mp1 verified in ip_wput_nondata */
   6085 	mp1 = mp->b_cont->b_cont;
   6086 	rt = (struct rtentry *)mp1->b_rptr;
   6087 
   6088 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
   6089 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
   6090 
   6091 	/*
   6092 	 * If the RTF_HOST flag is on, this is a request to assign a gateway
   6093 	 * to a particular host address.  In this case, we set the netmask to
   6094 	 * all ones for the particular destination address.  Otherwise,
   6095 	 * determine the netmask to be used based on dst_addr and the interfaces
   6096 	 * in use.
   6097 	 */
   6098 	if (rt->rt_flags & RTF_HOST) {
   6099 		mask = IP_HOST_MASK;
   6100 	} else {
   6101 		/*
   6102 		 * Note that ip_subnet_mask returns a zero mask in the case of
   6103 		 * default (an all-zeroes address).
   6104 		 */
   6105 		mask = ip_subnet_mask(dst_addr, &ipif, ipst);
   6106 	}
   6107 
   6108 	error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
   6109 	    B_TRUE, NULL, ipst, ALL_ZONES);
   6110 	if (ipif != NULL)
   6111 		ipif_refrele(ipif);
   6112 	return (error);
   6113 }
   6114 
   6115 /*
   6116  * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
   6117  */
   6118 /* ARGSUSED */
   6119 int
   6120 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
   6121     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
   6122 {
   6123 	ipaddr_t dst_addr;
   6124 	ipaddr_t gw_addr;
   6125 	ipaddr_t mask;
   6126 	int error;
   6127 	mblk_t *mp1;
   6128 	struct rtentry *rt;
   6129 	ipif_t *ipif = NULL;
   6130 	ip_stack_t	*ipst;
   6131 
   6132 	ASSERT(q->q_next == NULL);
   6133 	ipst = CONNQ_TO_IPST(q);
   6134 
   6135 	ip1dbg(("ip_siocdelrt:"));
   6136 	/* Existence of mp1 verified in ip_wput_nondata */
   6137 	mp1 = mp->b_cont->b_cont;
   6138 	rt = (struct rtentry *)mp1->b_rptr;
   6139 
   6140 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
   6141 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
   6142 
   6143 	/*
   6144 	 * If the RTF_HOST flag is on, this is a request to delete a gateway
   6145 	 * to a particular host address.  In this case, we set the netmask to
   6146 	 * all ones for the particular destination address.  Otherwise,
   6147 	 * determine the netmask to be used based on dst_addr and the interfaces
   6148 	 * in use.
   6149 	 */
   6150 	if (rt->rt_flags & RTF_HOST) {
   6151 		mask = IP_HOST_MASK;
   6152 	} else {
   6153 		/*
   6154 		 * Note that ip_subnet_mask returns a zero mask in the case of
   6155 		 * default (an all-zeroes address).
   6156 		 */
   6157 		mask = ip_subnet_mask(dst_addr, &ipif, ipst);
   6158 	}
   6159 
   6160 	error = ip_rt_delete(dst_addr, mask, gw_addr,
   6161 	    RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
   6162 	    ipst, ALL_ZONES);
   6163 	if (ipif != NULL)
   6164 		ipif_refrele(ipif);
   6165 	return (error);
   6166 }
   6167 
   6168 /*
   6169  * Enqueue the mp onto the ipsq, chained by b_next.
   6170  * b_prev stores the function to be executed later, and b_queue the queue
   6171  * where this mp originated.
   6172  */
   6173 void
   6174 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
   6175     ill_t *pending_ill)
   6176 {
   6177 	conn_t	*connp;
   6178 	ipxop_t *ipx = ipsq->ipsq_xop;
   6179 
   6180 	ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
   6181 	ASSERT(MUTEX_HELD(&ipx->ipx_lock));
   6182 	ASSERT(func != NULL);
   6183 
   6184 	mp->b_queue = q;
   6185 	mp->b_prev = (void *)func;
   6186 	mp->b_next = NULL;
   6187 
   6188 	switch (type) {
   6189 	case CUR_OP:
   6190 		if (ipx->ipx_mptail != NULL) {
   6191 			ASSERT(ipx->ipx_mphead != NULL);
   6192 			ipx->ipx_mptail->b_next = mp;
   6193 		} else {
   6194 			ASSERT(ipx->ipx_mphead == NULL);
   6195 			ipx->ipx_mphead = mp;
   6196 		}
   6197 		ipx->ipx_mptail = mp;
   6198 		break;
   6199 
   6200 	case NEW_OP:
   6201 		if (ipsq->ipsq_xopq_mptail != NULL) {
   6202 			ASSERT(ipsq->ipsq_xopq_mphead != NULL);
   6203 			ipsq->ipsq_xopq_mptail->b_next = mp;
   6204 		} else {
   6205 			ASSERT(ipsq->ipsq_xopq_mphead == NULL);
   6206 			ipsq->ipsq_xopq_mphead = mp;
   6207 		}
   6208 		ipsq->ipsq_xopq_mptail = mp;
   6209 		ipx->ipx_ipsq_queued = B_TRUE;
   6210 		break;
   6211 
   6212 	case SWITCH_OP:
   6213 		ASSERT(ipsq->ipsq_swxop != NULL);
   6214 		/* only one switch operation is currently allowed */
   6215 		ASSERT(ipsq->ipsq_switch_mp == NULL);
   6216 		ipsq->ipsq_switch_mp = mp;
   6217 		ipx->ipx_ipsq_queued = B_TRUE;
   6218 		break;
   6219 	default:
   6220 		cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
   6221 	}
   6222 
   6223 	if (CONN_Q(q) && pending_ill != NULL) {
   6224 		connp = Q_TO_CONN(q);
   6225 		ASSERT(MUTEX_HELD(&connp->conn_lock));
   6226 		connp->conn_oper_pending_ill = pending_ill;
   6227 	}
   6228 }
   6229 
/*
 * Dequeue the next message that requested exclusive access to this IPSQ's
 * xop.  Specifically:
 *
 *  1. If we're still processing the current operation on `ipsq', then
 *     dequeue the next message for the operation (from ipx_mphead), or
 *     return NULL if there are no queued messages for the operation.
 *     These messages are queued via CUR_OP to qwriter_ip() and friends.
 *
 *  2. If the current operation on `ipsq' has completed (ipx_current_ipif is
 *     not set) see if the ipsq has requested an xop switch.  If so, switch
 *     `ipsq' to a different xop.  Xop switches only happen when joining or
 *     leaving IPMP groups and require a careful dance -- see the comments
 *     in-line below for details.  If we're leaving a group xop or if we're
 *     joining a group xop and become writer on it, then we proceed to (3).
 *     Otherwise, we return NULL and exit the xop.
 *
 *  3. For each IPSQ in the xop, return any switch operation stored on
 *     ipsq_switch_mp (set via SWITCH_OP); these must be processed before
 *     any other messages queued on the IPSQ.  Otherwise, dequeue the next
 *     exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
 *     Note that if the phyint tied to `ipsq' is not using IPMP there will
 *     only be one IPSQ in the xop.  Otherwise, there will be one IPSQ for
 *     each phyint in the group, including the IPMP meta-interface phyint.
 *
 * Return value: the dequeued message, with its b_next field overloaded to
 * point at the IPSQ the message was taken from (ipsq_exit() reads it back),
 * or NULL if nothing was dequeued.  When NULL is returned via the `empty'
 * path below, writer status on the xop has been given up and all ills in
 * the xop have been woken.
 */
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
	ill_t	*illv4, *illv6;
	mblk_t	*mp;
	ipsq_t	*xopipsq;
	ipsq_t	*leftipsq = NULL;	/* group IPSQ to ipsq_exit() at the end */
	ipxop_t *ipx;
	phyint_t *phyi = ipsq->ipsq_phyint;
	ip_stack_t *ipst = ipsq->ipsq_ipst;
	boolean_t emptied = B_FALSE;	/* set once the xop has been fully drained */

	/*
	 * Grab all the locks we need in the defined order (ill_g_lock ->
	 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
	 * It is taken as writer when a switch is pending because the switch
	 * code below rewrites the ipsq_next lists.
	 */
	rw_enter(&ipst->ips_ill_g_lock,
	    ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);

	/*
	 * Dequeue the next message associated with the current exclusive
	 * operation, if any.
	 */
	if ((mp = ipx->ipx_mphead) != NULL) {
		ipx->ipx_mphead = mp->b_next;
		if (ipx->ipx_mphead == NULL)
			ipx->ipx_mptail = NULL;
		/* Tell the caller which IPSQ this message belongs to. */
		mp->b_next = (void *)ipsq;
		goto out;
	}

	/*
	 * The current operation is still in progress but has nothing queued:
	 * give up writer status without starting any new operation.
	 */
	if (ipx->ipx_current_ipif != NULL)
		goto empty;

	if (ipsq->ipsq_swxop != NULL) {
		/*
		 * The exclusive operation that is now being completed has
		 * requested a switch to a different xop.  This happens
		 * when an interface joins or leaves an IPMP group.  Joins
		 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
		 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
		 * (phyint_free()), or interface plumb for an ill type
		 * not in the IPMP group (ip_rput_dlpi_writer()).
		 *
		 * Xop switches are not allowed on the IPMP meta-interface.
		 */
		ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
		ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
		DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);

		if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
			/*
			 * We're switching back to our own xop, so we have two
			 * xop's to drain/exit: our own, and the group xop
			 * that we are leaving.
			 *
			 * First, pull ourselves out of the group ipsq list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);

			/* Find our predecessor on the circular ipsq_next list. */
			xopipsq = ipx->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq->ipsq_next;
			ipsq->ipsq_next = ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, prepare to exit the group xop.  The actual
			 * ipsq_exit() is done at the end of this function
			 * since we cannot hold any locks across ipsq_exit().
			 * Note that although we drop the group's ipx_lock, no
			 * threads can proceed since we're still ipx_writer.
			 */
			leftipsq = xopipsq;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our own xop (which was
			 * inactive and therefore can be entered).
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			ASSERT(ipx->ipx_writer == NULL);
			ASSERT(ipx->ipx_current_ipif == NULL);
		} else {
			/*
			 * We're switching from our own xop to a group xop.
			 * The requestor of the switch must ensure that the
			 * group xop cannot go away (e.g. by ensuring the
			 * phyint associated with the xop cannot go away).
			 *
			 * If we can become writer on our new xop, then we'll
			 * do the drain.  Otherwise, the current writer of our
			 * new xop will do the drain when it exits.
			 *
			 * First, splice ourselves into the group IPSQ list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);

			/* Find the last IPSQ on the group's circular list. */
			xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq;
			ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, exit our own xop, since it's now unused.
			 * This is safe since we've got the only reference.
			 */
			ASSERT(ipx->ipx_writer == curthread);
			ipx->ipx_writer = NULL;
			VERIFY(--ipx->ipx_reentry_cnt == 0);
			ipx->ipx_ipsq_queued = B_FALSE;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our new xop, and check
			 * if we can become writer on it.  If we cannot, then
			 * the current writer will drain the IPSQ group when
			 * it exits.  Our ipsq_xop is guaranteed to be stable
			 * because we're still holding ipsq_lock.
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			if (ipx->ipx_writer != NULL ||
			    ipx->ipx_current_ipif != NULL) {
				/* mp is still NULL here, so NULL is returned. */
				goto out;
			}
		}

		/*
		 * Fourth, become writer on our new ipx before we continue
		 * with the drain.  Note that we never dropped ipsq_lock
		 * above, so no other thread could've raced with us to
		 * become writer first.  Also, we're holding ipx_lock, so
		 * no other thread can examine the ipx right now.
		 */
		ASSERT(ipx->ipx_current_ipif == NULL);
		ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
		VERIFY(ipx->ipx_reentry_cnt++ == 0);
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}

	/*
	 * Walk every IPSQ in the xop, starting with our own, looking for
	 * the next operation to start.
	 */
	xopipsq = ipsq;
	do {
		/*
		 * So that other operations operate on a consistent and
		 * complete phyint, a switch message on an IPSQ must be
		 * handled prior to any other operations on that IPSQ.
		 */
		if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
			xopipsq->ipsq_switch_mp = NULL;
			ASSERT(mp->b_next == NULL);
			mp->b_next = (void *)xopipsq;
			goto out;
		}

		if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
			xopipsq->ipsq_xopq_mphead = mp->b_next;
			if (xopipsq->ipsq_xopq_mphead == NULL)
				xopipsq->ipsq_xopq_mptail = NULL;
			mp->b_next = (void *)xopipsq;
			goto out;
		}
	} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
empty:
	/*
	 * There are no messages.  Further, we are holding ipx_lock, hence no
	 * new messages can end up on any IPSQ in the xop.
	 */
	ipx->ipx_writer = NULL;
	ipx->ipx_forced = B_FALSE;
	VERIFY(--ipx->ipx_reentry_cnt == 0);
	ipx->ipx_ipsq_queued = B_FALSE;
	emptied = B_TRUE;
#ifdef	DEBUG
	ipx->ipx_depth = 0;
#endif
out:
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ipsq->ipsq_lock);

	/*
	 * If we completely emptied the xop, then wake up any threads waiting
	 * to enter any of the IPSQ's associated with it.
	 */
	if (emptied) {
		xopipsq = ipsq;
		do {
			/*
			 * An IPSQ without a phyint has no ills to wake.  Note
			 * that `continue' in a do-while still evaluates the
			 * loop condition, so the walk advances.
			 */
			if ((phyi = xopipsq->ipsq_phyint) == NULL)
				continue;

			illv4 = phyi->phyint_illv4;
			illv6 = phyi->phyint_illv6;

			GRAB_ILL_LOCKS(illv4, illv6);
			if (illv4 != NULL)
				cv_broadcast(&illv4->ill_cv);
			if (illv6 != NULL)
				cv_broadcast(&illv6->ill_cv);
			RELEASE_ILL_LOCKS(illv4, illv6);
		} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Now that all locks are dropped, exit the IPSQ we left.
	 */
	if (leftipsq != NULL)
		ipsq_exit(leftipsq);

	return (mp);
}
   6483 
   6484 /*
   6485  * Return completion status of previously initiated DLPI operations on
   6486  * ills in the purview of an ipsq.
   6487  */
   6488 static boolean_t
   6489 ipsq_dlpi_done(ipsq_t *ipsq)
   6490 {
   6491 	ipsq_t		*ipsq_start;
   6492 	phyint_t	*phyi;
   6493 	ill_t		*ill;
   6494 
   6495 	ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
   6496 	ipsq_start = ipsq;
   6497 
   6498 	do {
   6499 		/*
   6500 		 * The only current users of this function are ipsq_try_enter
   6501 		 * and ipsq_enter which have made sure that ipsq_writer is
   6502 		 * NULL before we reach here. ill_dlpi_pending is modified
   6503 		 * only by an ipsq writer
   6504 		 */
   6505 		ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
   6506 		phyi = ipsq->ipsq_phyint;
   6507 		/*
   6508 		 * phyi could be NULL if a phyint that is part of an
   6509 		 * IPMP group is being unplumbed. A more detailed
   6510 		 * comment is in ipmp_grp_update_kstats()
   6511 		 */
   6512 		if (phyi != NULL) {
   6513 			ill = phyi->phyint_illv4;
   6514 			if (ill != NULL &&
   6515 			    (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
   6516 			    ill->ill_arl_dlpi_pending))
   6517 				return (B_FALSE);
   6518 
   6519 			ill = phyi->phyint_illv6;
   6520 			if (ill != NULL &&
   6521 			    ill->ill_dlpi_pending != DL_PRIM_INVAL)
   6522 				return (B_FALSE);
   6523 		}
   6524 
   6525 	} while ((ipsq = ipsq->ipsq_next) != ipsq_start);
   6526 
   6527 	return (B_TRUE);
   6528 }
   6529 
   6530 /*
   6531  * Enter the ipsq corresponding to ill, by waiting synchronously till
   6532  * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
   6533  * will have to drain completely before ipsq_enter returns success.
   6534  * ipx_current_ipif will be set if some exclusive op is in progress,
   6535  * and the ipsq_exit logic will start the next enqueued op after
   6536  * completion of the current op. If 'force' is used, we don't wait
   6537  * for the enqueued ops. This is needed when a conn_close wants to
   6538  * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
   6539  * of an ill can also use this option. But we dont' use it currently.
   6540  */
   6541 #define	ENTER_SQ_WAIT_TICKS 100
   6542 boolean_t
   6543 ipsq_enter(ill_t *ill, boolean_t force, int type)
   6544 {
   6545 	ipsq_t	*ipsq;
   6546 	ipxop_t *ipx;
   6547 	boolean_t waited_enough = B_FALSE;
   6548 	ip_stack_t *ipst = ill->ill_ipst;
   6549 
   6550 	/*
   6551 	 * Note that the relationship between ill and ipsq is fixed as long as
   6552 	 * the ill is not ILL_CONDEMNED.  Holding ipsq_lock ensures the
   6553 	 * relationship between the IPSQ and xop cannot change.  However,
   6554 	 * since we cannot hold ipsq_lock across the cv_wait(), it may change
   6555 	 * while we're waiting.  We wait on ill_cv and rely on ipsq_exit()
   6556 	 * waking up all ills in the xop when it becomes available.
   6557 	 */
   6558 	for (;;) {
   6559 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6560 		mutex_enter(&ill->ill_lock);
   6561 		if (ill->ill_state_flags & ILL_CONDEMNED) {
   6562 			mutex_exit(&ill->ill_lock);
   6563 			rw_exit(&ipst->ips_ill_g_lock);
   6564 			return (B_FALSE);
   6565 		}
   6566 
   6567 		ipsq = ill->ill_phyint->phyint_ipsq;
   6568 		mutex_enter(&ipsq->ipsq_lock);
   6569 		ipx = ipsq->ipsq_xop;
   6570 		mutex_enter(&ipx->ipx_lock);
   6571 
   6572 		if (ipx->ipx_writer == NULL && (type == CUR_OP ||
   6573 		    (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
   6574 		    waited_enough))
   6575 			break;
   6576 
   6577 		rw_exit(&ipst->ips_ill_g_lock);
   6578 
   6579 		if (!force || ipx->ipx_writer != NULL) {
   6580 			mutex_exit(&ipx->ipx_lock);
   6581 			mutex_exit(&ipsq->ipsq_lock);
   6582 			cv_wait(&ill->ill_cv, &ill->ill_lock);
   6583 		} else {
   6584 			mutex_exit(&ipx->ipx_lock);
   6585 			mutex_exit(&ipsq->ipsq_lock);
   6586 			(void) cv_reltimedwait(&ill->ill_cv,
   6587 			    &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
   6588 			waited_enough = B_TRUE;
   6589 		}
   6590 		mutex_exit(&ill->ill_lock);
   6591 	}
   6592 
   6593 	ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
   6594 	ASSERT(ipx->ipx_reentry_cnt == 0);
   6595 	ipx->ipx_writer = curthread;
   6596 	ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
   6597 	ipx->ipx_reentry_cnt++;
   6598 #ifdef DEBUG
   6599 	ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   6600 #endif
   6601 	mutex_exit(&ipx->ipx_lock);
   6602 	mutex_exit(&ipsq->ipsq_lock);
   6603 	mutex_exit(&ill->ill_lock);
   6604 	rw_exit(&ipst->ips_ill_g_lock);
   6605 
   6606 	return (B_TRUE);
   6607 }
   6608 
   6609 /*
   6610  * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
   6611  * across the call to the core interface ipsq_try_enter() and hence calls this
   6612  * function directly. This is explained more fully in ipif_set_values().
   6613  * In order to support the above constraint, ipsq_try_enter is implemented as
   6614  * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently
   6615  */
   6616 static ipsq_t *
   6617 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
   6618     int type, boolean_t reentry_ok)
   6619 {
   6620 	ipsq_t	*ipsq;
   6621 	ipxop_t	*ipx;
   6622 	ip_stack_t *ipst = ill->ill_ipst;
   6623 
   6624 	/*
   6625 	 * lock ordering:
   6626 	 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
   6627 	 *
   6628 	 * ipx of an ipsq can't change when ipsq_lock is held.
   6629 	 */
   6630 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   6631 	GRAB_CONN_LOCK(q);
   6632 	mutex_enter(&ill->ill_lock);
   6633 	ipsq = ill->ill_phyint->phyint_ipsq;
   6634 	mutex_enter(&ipsq->ipsq_lock);
   6635 	ipx = ipsq->ipsq_xop;
   6636 	mutex_enter(&ipx->ipx_lock);
   6637 
   6638 	/*
   6639 	 * 1. Enter the ipsq if we are already writer and reentry is ok.
   6640 	 *    (Note: If the caller does not specify reentry_ok then neither
   6641 	 *    'func' nor any of its callees must ever attempt to enter the ipsq
   6642 	 *    again. Otherwise it can lead to an infinite loop
   6643 	 * 2. Enter the ipsq if there is no current writer and this attempted
   6644 	 *    entry is part of the current operation
   6645 	 * 3. Enter the ipsq if there is no current writer and this is a new
   6646 	 *    operation and the operation queue is empty and there is no
   6647 	 *    operation currently in progress and if all previously initiated
   6648 	 *    DLPI operations have completed.
   6649 	 */
   6650 	if ((ipx->ipx_writer == curthread && reentry_ok) ||
   6651 	    (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
   6652 	    !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
   6653 	    ipsq_dlpi_done(ipsq))))) {
   6654 		/* Success. */
   6655 		ipx->ipx_reentry_cnt++;
   6656 		ipx->ipx_writer = curthread;
   6657 		ipx->ipx_forced = B_FALSE;
   6658 		mutex_exit(&ipx->ipx_lock);
   6659 		mutex_exit(&ipsq->ipsq_lock);
   6660 		mutex_exit(&ill->ill_lock);
   6661 		RELEASE_CONN_LOCK(q);
   6662 #ifdef DEBUG
   6663 		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   6664 #endif
   6665 		return (ipsq);
   6666 	}
   6667 
   6668 	if (func != NULL)
   6669 		ipsq_enq(ipsq, q, mp, func, type, ill);
   6670 
   6671 	mutex_exit(&ipx->ipx_lock);
   6672 	mutex_exit(&ipsq->ipsq_lock);
   6673 	mutex_exit(&ill->ill_lock);
   6674 	RELEASE_CONN_LOCK(q);
   6675 	return (NULL);
   6676 }
   6677 
/*
 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
 * certain critical operations like plumbing (i.e. most set ioctls), etc.
 * There is one ipsq per phyint. The ipsq serializes exclusive ioctls issued
 * by applications on a per ipsq basis in ipsq_xopq_mphead. It also protects
 * against multiple threads executing in the ipsq. Responses from the driver
 * pertain to the current ioctl (say a DL_BIND_ACK in response to a
 * DL_BIND_REQ initiated as part of bringing up the interface) and are
 * enqueued in ipx_mphead.
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (which is
 * called later, when the ipsq is empty) nor any code path starting from that
 * reentry point ever tries to enter the ipsq again. Otherwise it can lead
 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
 * the reentry point. When the list at ipx_mphead becomes empty, ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
 * all associated DLPI operations have completed.
 */
   6706 
   6707 /*
   6708  * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
   6709  * and `ill' cannot both be specified).  Returns a pointer to the entered IPSQ
   6710  * on success, or NULL on failure.  The caller ensures ipif/ill is valid by
   6711  * refholding it as necessary.  If the IPSQ cannot be entered and `func' is
   6712  * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
   6713  * can be entered.  If `func' is NULL, then `q' and `mp' are ignored.
   6714  */
   6715 ipsq_t *
   6716 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
   6717     ipsq_func_t func, int type, boolean_t reentry_ok)
   6718 {
   6719 	ip_stack_t	*ipst;
   6720 	ipsq_t		*ipsq;
   6721 
   6722 	/* Only 1 of ipif or ill can be specified */
   6723 	ASSERT((ipif != NULL) ^ (ill != NULL));
   6724 
   6725 	if (ipif != NULL)
   6726 		ill = ipif->ipif_ill;
   6727 	ipst = ill->ill_ipst;
   6728 
   6729 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6730 	ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
   6731 	rw_exit(&ipst->ips_ill_g_lock);
   6732 
   6733 	return (ipsq);
   6734 }
   6735 
   6736 /*
   6737  * Try to enter the IPSQ corresponding to `ill' as writer.  The caller ensures
   6738  * ill is valid by refholding it if necessary; we will refrele.  If the IPSQ
   6739  * cannot be entered, the mp is queued for completion.
   6740  */
   6741 void
   6742 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
   6743     boolean_t reentry_ok)
   6744 {
   6745 	ipsq_t	*ipsq;
   6746 
   6747 	ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
   6748 
   6749 	/*
   6750 	 * Drop the caller's refhold on the ill.  This is safe since we either
   6751 	 * entered the IPSQ (and thus are exclusive), or failed to enter the
   6752 	 * IPSQ, in which case we return without accessing ill anymore.  This
   6753 	 * is needed because func needs to see the correct refcount.
   6754 	 * e.g. removeif can work only then.
   6755 	 */
   6756 	ill_refrele(ill);
   6757 	if (ipsq != NULL) {
   6758 		(*func)(ipsq, q, mp, NULL);
   6759 		ipsq_exit(ipsq);
   6760 	}
   6761 }
   6762 
   6763 /*
   6764  * Exit the specified IPSQ.  If this is the final exit on it then drain it
   6765  * prior to exiting.  Caller must be writer on the specified IPSQ.
   6766  */
   6767 void
   6768 ipsq_exit(ipsq_t *ipsq)
   6769 {
   6770 	mblk_t *mp;
   6771 	ipsq_t *mp_ipsq;
   6772 	queue_t	*q;
   6773 	phyint_t *phyi;
   6774 	ipsq_func_t func;
   6775 
   6776 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6777 
   6778 	ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
   6779 	if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
   6780 		ipsq->ipsq_xop->ipx_reentry_cnt--;
   6781 		return;
   6782 	}
   6783 
   6784 	for (;;) {
   6785 		phyi = ipsq->ipsq_phyint;
   6786 		mp = ipsq_dq(ipsq);
   6787 		mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
   6788 
   6789 		/*
   6790 		 * If we've changed to a new IPSQ, and the phyint associated
   6791 		 * with the old one has gone away, free the old IPSQ.  Note
   6792 		 * that this cannot happen while the IPSQ is in a group.
   6793 		 */
   6794 		if (mp_ipsq != ipsq && phyi == NULL) {
   6795 			ASSERT(ipsq->ipsq_next == ipsq);
   6796 			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
   6797 			ipsq_delete(ipsq);
   6798 		}
   6799 
   6800 		if (mp == NULL)
   6801 			break;
   6802 
   6803 		q = mp->b_queue;
   6804 		func = (ipsq_func_t)mp->b_prev;
   6805 		ipsq = mp_ipsq;
   6806 		mp->b_next = mp->b_prev = NULL;
   6807 		mp->b_queue = NULL;
   6808 
   6809 		/*
   6810 		 * If 'q' is an conn queue, it is valid, since we did a
   6811 		 * a refhold on the conn at the start of the ioctl.
   6812 		 * If 'q' is an ill queue, it is valid, since close of an
   6813 		 * ill will clean up its IPSQ.
   6814 		 */
   6815 		(*func)(ipsq, q, mp, NULL);
   6816 	}
   6817 }
   6818 
   6819 /*
   6820  * Used to start any igmp or mld timers that could not be started
   6821  * while holding ill_mcast_lock. The timers can't be started while holding
   6822  * the lock, since mld/igmp_start_timers may need to call untimeout()
   6823  * which can't be done while holding the lock which the timeout handler
   6824  * acquires. Otherwise
   6825  * there could be a deadlock since the timeout handlers
   6826  * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
   6827  * ill_mcast_lock.
   6828  */
   6829 void
   6830 ill_mcast_timer_start(ip_stack_t *ipst)
   6831 {
   6832 	int		next;
   6833 
   6834 	mutex_enter(&ipst->ips_igmp_timer_lock);
   6835 	next = ipst->ips_igmp_deferred_next;
   6836 	ipst->ips_igmp_deferred_next = INFINITY;
   6837 	mutex_exit(&ipst->ips_igmp_timer_lock);
   6838 
   6839 	if (next != INFINITY)
   6840 		igmp_start_timers(next, ipst);
   6841 
   6842 	mutex_enter(&ipst->ips_mld_timer_lock);
   6843 	next = ipst->ips_mld_deferred_next;
   6844 	ipst->ips_mld_deferred_next = INFINITY;
   6845 	mutex_exit(&ipst->ips_mld_timer_lock);
   6846 
   6847 	if (next != INFINITY)
   6848 		mld_start_timers(next, ipst);
   6849 }
   6850 
   6851 /*
   6852  * Start the current exclusive operation on `ipsq'; associate it with `ipif'
   6853  * and `ioccmd'.
   6854  */
   6855 void
   6856 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
   6857 {
   6858 	ill_t *ill = ipif->ipif_ill;
   6859 	ipxop_t *ipx = ipsq->ipsq_xop;
   6860 
   6861 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6862 	ASSERT(ipx->ipx_current_ipif == NULL);
   6863 	ASSERT(ipx->ipx_current_ioctl == 0);
   6864 
   6865 	ipx->ipx_current_done = B_FALSE;
   6866 	ipx->ipx_current_ioctl = ioccmd;
   6867 	mutex_enter(&ipx->ipx_lock);
   6868 	ipx->ipx_current_ipif = ipif;
   6869 	mutex_exit(&ipx->ipx_lock);
   6870 
   6871 	/*
   6872 	 * Set IPIF_CHANGING on one or more ipifs associated with the
   6873 	 * current exclusive operation.  IPIF_CHANGING prevents any new
   6874 	 * references to the ipif (so that the references will eventually
   6875 	 * drop to zero) and also prevents any "get" operations (e.g.,
   6876 	 * SIOCGLIFFLAGS) from being able to access the ipif until the
   6877 	 * operation has completed and the ipif is again in a stable state.
   6878 	 *
   6879 	 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
   6880 	 * ioctl.  For internal operations (where ioccmd is zero), all ipifs
   6881 	 * on the ill are marked with IPIF_CHANGING since it's unclear which
   6882 	 * ipifs will be affected.
   6883 	 *
   6884 	 * Note that SIOCLIFREMOVEIF is a special case as it sets
   6885 	 * IPIF_CONDEMNED internally after identifying the right ipif to
   6886 	 * operate on.
   6887 	 */
   6888 	switch (ioccmd) {
   6889 	case SIOCLIFREMOVEIF:
   6890 		break;
   6891 	case 0:
   6892 		mutex_enter(&ill->ill_lock);
   6893 		ipif = ipif->ipif_ill->ill_ipif;
   6894 		for (; ipif != NULL; ipif = ipif->ipif_next)
   6895 			ipif->ipif_state_flags |= IPIF_CHANGING;
   6896 		mutex_exit(&ill->ill_lock);
   6897 		break;
   6898 	default:
   6899 		mutex_enter(&ill->ill_lock);
   6900 		ipif->ipif_state_flags |= IPIF_CHANGING;
   6901 		mutex_exit(&ill->ill_lock);
   6902 	}
   6903 }
   6904 
   6905 /*
   6906  * Finish the current exclusive operation on `ipsq'.  Usually, this will allow
   6907  * the next exclusive operation to begin once we ipsq_exit().  However, if
   6908  * pending DLPI operations remain, then we will wait for the queue to drain
   6909  * before allowing the next exclusive operation to begin.  This ensures that
   6910  * DLPI operations from one exclusive operation are never improperly processed
   6911  * as part of a subsequent exclusive operation.
   6912  */
   6913 void
   6914 ipsq_current_finish(ipsq_t *ipsq)
   6915 {
   6916 	ipxop_t	*ipx = ipsq->ipsq_xop;
   6917 	t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
   6918 	ipif_t	*ipif = ipx->ipx_current_ipif;
   6919 
   6920 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6921 
   6922 	/*
   6923 	 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
   6924 	 * (but in that case, IPIF_CHANGING will already be clear and no
   6925 	 * pending DLPI messages can remain).
   6926 	 */
   6927 	if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
   6928 		ill_t *ill = ipif->ipif_ill;
   6929 
   6930 		mutex_enter(&ill->ill_lock);
   6931 		dlpi_pending = ill->ill_dlpi_pending;
   6932 		if (ipx->ipx_current_ioctl == 0) {
   6933 			ipif = ill->ill_ipif;
   6934 			for (; ipif != NULL; ipif = ipif->ipif_next)
   6935 				ipif->ipif_state_flags &= ~IPIF_CHANGING;
   6936 		} else {
   6937 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
   6938 		}
   6939 		mutex_exit(&ill->ill_lock);
   6940 	}
   6941 
   6942 	ASSERT(!ipx->ipx_current_done);
   6943 	ipx->ipx_current_done = B_TRUE;
   6944 	ipx->ipx_current_ioctl = 0;
   6945 	if (dlpi_pending == DL_PRIM_INVAL) {
   6946 		mutex_enter(&ipx->ipx_lock);
   6947 		ipx->ipx_current_ipif = NULL;
   6948 		mutex_exit(&ipx->ipx_lock);
   6949 	}
   6950 }
   6951 
   6952 /*
   6953  * The ill is closing. Flush all messages on the ipsq that originated
   6954  * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead
   6955  * for this ill since ipsq_enter could not have entered until then.
   6956  * New messages can't be queued since the CONDEMNED flag is set.
   6957  */
   6958 static void
   6959 ipsq_flush(ill_t *ill)
   6960 {
   6961 	queue_t	*q;
   6962 	mblk_t	*prev;
   6963 	mblk_t	*mp;
   6964 	mblk_t	*mp_next;
   6965 	ipxop_t	*ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
   6966 
   6967 	ASSERT(IAM_WRITER_ILL(ill));
   6968 
   6969 	/*
   6970 	 * Flush any messages sent up by the driver.
   6971 	 */
   6972 	mutex_enter(&ipx->ipx_lock);
   6973 	for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
   6974 		mp_next = mp->b_next;
   6975 		q = mp->b_queue;
   6976 		if (q == ill->ill_rq || q == ill->ill_wq) {
   6977 			/* dequeue mp */
   6978 			if (prev == NULL)
   6979 				ipx->ipx_mphead = mp->b_next;
   6980 			else
   6981 				prev->b_next = mp->b_next;
   6982 			if (ipx->ipx_mptail == mp) {
   6983 				ASSERT(mp_next == NULL);
   6984 				ipx->ipx_mptail = prev;
   6985 			}
   6986 			inet_freemsg(mp);
   6987 		} else {
   6988 			prev = mp;
   6989 		}
   6990 	}
   6991 	mutex_exit(&ipx->ipx_lock);
   6992 	(void) ipsq_pending_mp_cleanup(ill, NULL);
   6993 	ipsq_xopq_mp_cleanup(ill, NULL);
   6994 }
   6995 
   6996 /*
   6997  * Parse an ifreq or lifreq struct coming down ioctls and refhold
   6998  * and return the associated ipif.
   6999  * Return value:
   7000  *	Non zero: An error has occurred. ci may not be filled out.
   7001  *	zero : ci is filled out with the ioctl cmd in ci.ci_name, and
   7002  *	a held ipif in ci.ci_ipif.
   7003  */
   7004 int
   7005 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
   7006     cmd_info_t *ci)
   7007 {
   7008 	char		*name;
   7009 	struct ifreq    *ifr;
   7010 	struct lifreq    *lifr;
   7011 	ipif_t		*ipif = NULL;
   7012 	ill_t		*ill;
   7013 	conn_t		*connp;
   7014 	boolean_t	isv6;
   7015 	boolean_t	exists;
   7016 	mblk_t		*mp1;
   7017 	zoneid_t	zoneid;
   7018 	ip_stack_t	*ipst;
   7019 
   7020 	if (q->q_next != NULL) {
   7021 		ill = (ill_t *)q->q_ptr;
   7022 		isv6 = ill->ill_isv6;
   7023 		connp = NULL;
   7024 		zoneid = ALL_ZONES;
   7025 		ipst = ill->ill_ipst;
   7026 	} else {
   7027 		ill = NULL;
   7028 		connp = Q_TO_CONN(q);
   7029 		isv6 = (connp->conn_family == AF_INET6);
   7030 		zoneid = connp->conn_zoneid;
   7031 		if (zoneid == GLOBAL_ZONEID) {
   7032 			/* global zone can access ipifs in all zones */
   7033 			zoneid = ALL_ZONES;
   7034 		}
   7035 		ipst = connp->conn_netstack->netstack_ip;
   7036 	}
   7037 
   7038 	/* Has been checked in ip_wput_nondata */
   7039 	mp1 = mp->b_cont->b_cont;
   7040 
   7041 	if (ipip->ipi_cmd_type == IF_CMD) {
   7042 		/* This a old style SIOC[GS]IF* command */
   7043 		ifr = (struct ifreq *)mp1->b_rptr;
   7044 		/*
   7045 		 * Null terminate the string to protect against buffer
   7046 		 * overrun. String was generated by user code and may not
   7047 		 * be trusted.
   7048 		 */
   7049 		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
   7050 		name = ifr->ifr_name;
   7051 		ci->ci_sin = (sin_t *)&ifr->ifr_addr;
   7052 		ci->ci_sin6 = NULL;
   7053 		ci->ci_lifr = (struct lifreq *)ifr;
   7054 	} else {
   7055 		/* This a new style SIOC[GS]LIF* command */
   7056 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
   7057 		lifr = (struct lifreq *)mp1->b_rptr;
   7058 		/*
   7059 		 * Null terminate the string to protect against buffer
   7060 		 * overrun. String was generated by user code and may not
   7061 		 * be trusted.
   7062 		 */
   7063 		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
   7064 		name = lifr->lifr_name;
   7065 		ci->ci_sin = (sin_t *)&lifr->lifr_addr;
   7066 		ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
   7067 		ci->ci_lifr = lifr;
   7068 	}
   7069 
   7070 	if (ipip->ipi_cmd == SIOCSLIFNAME) {
   7071 		/*
   7072 		 * The ioctl will be failed if the ioctl comes down
   7073 		 * an conn stream
   7074 		 */
   7075 		if (ill == NULL) {
   7076 			/*
   7077 			 * Not an ill queue, so the request cannot be
   7078 			 * honoured; fail it with ENXIO.
   7079 			 */
   7080 			return (ENXIO);
   7081 		}
   7082 		ipif = ill->ill_ipif;
   7083 		ipif_refhold(ipif);
   7084 	} else {
   7085 		ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
   7086 		    &exists, isv6, zoneid, ipst);
   7087 
   7088 		/*
   7089 		 * Ensure that get ioctls don't see any internal state changes
   7090 		 * caused by set ioctls by deferring them if IPIF_CHANGING is
   7091 		 * set.
   7092 		 */
   7093 		if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) &&
   7094 		    !IAM_WRITER_IPIF(ipif)) {
   7095 			ipsq_t	*ipsq;
   7096 
   7097 			if (connp != NULL)
   7098 				mutex_enter(&connp->conn_lock);
   7099 			mutex_enter(&ipif->ipif_ill->ill_lock);
   7100 			if (IPIF_IS_CHANGING(ipif) &&
   7101 			    !IPIF_IS_CONDEMNED(ipif)) {
   7102 				ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
   7103 				mutex_enter(&ipsq->ipsq_lock);
   7104 				mutex_enter(&ipsq->ipsq_xop->ipx_lock);
   7105 				mutex_exit(&ipif->ipif_ill->ill_lock);
   7106 				ipsq_enq(ipsq, q, mp, ip_process_ioctl,
   7107 				    NEW_OP, ipif->ipif_ill);
   7108 				mutex_exit(&ipsq->ipsq_xop->ipx_lock);
   7109 				mutex_exit(&ipsq->ipsq_lock);
   7110 				if (connp != NULL)
   7111 					mutex_exit(&connp->conn_lock);
   7112 				ipif_refrele(ipif);
   7113 				return (EINPROGRESS);
   7114 			}
   7115 			mutex_exit(&ipif->ipif_ill->ill_lock);
   7116 			if (connp != NULL)
   7117 				mutex_exit(&connp->conn_lock);
   7118 		}
   7119 	}
   7120 
   7121 	/*
   7122 	 * Old style [GS]IFCMD does not admit IPv6 ipif
   7123 	 */
   7124 	if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
   7125 		ipif_refrele(ipif);
   7126 		return (ENXIO);
   7127 	}
   7128 
   7129 	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
   7130 	    name[0] == '\0') {
   7131 		/*
   7132 		 * Handle a SIOC?LIF* or a SIOC?IF* with a null name
   7133 		 * during plumb (on the ill queue before the I_PLINK).
   7134 		 */
   7135 		ipif = ill->ill_ipif;
   7136 		ipif_refhold(ipif);
   7137 	}
   7138 
   7139 	if (ipif == NULL)
   7140 		return (ENXIO);
   7141 
   7142 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
   7143 	    int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
   7144 
   7145 	ci->ci_ipif = ipif;
   7146 	return (0);
   7147 }
   7148 
   7149 /*
   7150  * Return the total number of ipifs.
   7151  */
   7152 static uint_t
   7153 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
   7154 {
   7155 	uint_t numifs = 0;
   7156 	ill_t	*ill;
   7157 	ill_walk_context_t	ctx;
   7158 	ipif_t	*ipif;
   7159 
   7160 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7161 	ill = ILL_START_WALK_V4(&ctx, ipst);
   7162 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7163 		if (IS_UNDER_IPMP(ill))
   7164 			continue;
   7165 		for (ipif = ill->ill_ipif; ipif != NULL;
   7166 		    ipif = ipif->ipif_next) {
   7167 			if (ipif->ipif_zoneid == zoneid ||
   7168 			    ipif->ipif_zoneid == ALL_ZONES)
   7169 				numifs++;
   7170 		}
   7171 	}
   7172 	rw_exit(&ipst->ips_ill_g_lock);
   7173 	return (numifs);
   7174 }
   7175 
   7176 /*
   7177  * Return the total number of ipifs.
   7178  */
   7179 static uint_t
   7180 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
   7181 {
   7182 	uint_t numifs = 0;
   7183 	ill_t	*ill;
   7184 	ipif_t	*ipif;
   7185 	ill_walk_context_t	ctx;
   7186 
   7187 	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
   7188 
   7189 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7190 	if (family == AF_INET)
   7191 		ill = ILL_START_WALK_V4(&ctx, ipst);
   7192 	else if (family == AF_INET6)
   7193 		ill = ILL_START_WALK_V6(&ctx, ipst);
   7194 	else
   7195 		ill = ILL_START_WALK_ALL(&ctx, ipst);
   7196 
   7197 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7198 		if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
   7199 			continue;
   7200 
   7201 		for (ipif = ill->ill_ipif; ipif != NULL;
   7202 		    ipif = ipif->ipif_next) {
   7203 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
   7204 			    !(lifn_flags & LIFC_NOXMIT))
   7205 				continue;
   7206 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
   7207 			    !(lifn_flags & LIFC_TEMPORARY))
   7208 				continue;
   7209 			if (((ipif->ipif_flags &
   7210 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
   7211 			    IPIF_DEPRECATED)) ||
   7212 			    IS_LOOPBACK(ill) ||
   7213 			    !(ipif->ipif_flags & IPIF_UP)) &&
   7214 			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
   7215 				continue;
   7216 
   7217 			if (zoneid != ipif->ipif_zoneid &&
   7218 			    ipif->ipif_zoneid != ALL_ZONES &&
   7219 			    (zoneid != GLOBAL_ZONEID ||
   7220 			    !(lifn_flags & LIFC_ALLZONES)))
   7221 				continue;
   7222 
   7223 			numifs++;
   7224 		}
   7225 	}
   7226 	rw_exit(&ipst->ips_ill_g_lock);
   7227 	return (numifs);
   7228 }
   7229 
   7230 uint_t
   7231 ip_get_lifsrcofnum(ill_t *ill)
   7232 {
   7233 	uint_t numifs = 0;
   7234 	ill_t	*ill_head = ill;
   7235 	ip_stack_t	*ipst = ill->ill_ipst;
   7236 
   7237 	/*
   7238 	 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
   7239 	 * other thread may be trying to relink the ILLs in this usesrc group
   7240 	 * and adjusting the ill_usesrc_grp_next pointers
   7241 	 */
   7242 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
   7243 	if ((ill->ill_usesrc_ifindex == 0) &&
   7244 	    (ill->ill_usesrc_grp_next != NULL)) {
   7245 		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
   7246 		    ill = ill->ill_usesrc_grp_next)
   7247 			numifs++;
   7248 	}
   7249 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
   7250 
   7251 	return (numifs);
   7252 }
   7253 
   7254 /* Null values are passed in for ipif, sin, and ifreq */
   7255 /* ARGSUSED */
   7256 int
   7257 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7258     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7259 {
   7260 	int *nump;
   7261 	conn_t *connp = Q_TO_CONN(q);
   7262 
   7263 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
   7264 
   7265 	/* Existence of b_cont->b_cont checked in ip_wput_nondata */
   7266 	nump = (int *)mp->b_cont->b_cont->b_rptr;
   7267 
   7268 	*nump = ip_get_numifs(connp->conn_zoneid,
   7269 	    connp->conn_netstack->netstack_ip);
   7270 	ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
   7271 	return (0);
   7272 }
   7273 
   7274 /* Null values are passed in for ipif, sin, and ifreq */
   7275 /* ARGSUSED */
   7276 int
   7277 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
   7278     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7279 {
   7280 	struct lifnum *lifn;
   7281 	mblk_t	*mp1;
   7282 	conn_t *connp = Q_TO_CONN(q);
   7283 
   7284 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
   7285 
   7286 	/* Existence checked in ip_wput_nondata */
   7287 	mp1 = mp->b_cont->b_cont;
   7288 
   7289 	lifn = (struct lifnum *)mp1->b_rptr;
   7290 	switch (lifn->lifn_family) {
   7291 	case AF_UNSPEC:
   7292 	case AF_INET:
   7293 	case AF_INET6:
   7294 		break;
   7295 	default:
   7296 		return (EAFNOSUPPORT);
   7297 	}
   7298 
   7299 	lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
   7300 	    connp->conn_zoneid, connp->conn_netstack->netstack_ip);
   7301 	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
   7302 	return (0);
   7303 }
   7304 
   7305 /* ARGSUSED */
   7306 int
   7307 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7308     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7309 {
   7310 	STRUCT_HANDLE(ifconf, ifc);
   7311 	mblk_t *mp1;
   7312 	struct iocblk *iocp;
   7313 	struct ifreq *ifr;
   7314 	ill_walk_context_t	ctx;
   7315 	ill_t	*ill;
   7316 	ipif_t	*ipif;
   7317 	struct sockaddr_in *sin;
   7318 	int32_t	ifclen;
   7319 	zoneid_t zoneid;
   7320 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   7321 
   7322 	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
   7323 
   7324 	ip1dbg(("ip_sioctl_get_ifconf"));
   7325 	/* Existence verified in ip_wput_nondata */
   7326 	mp1 = mp->b_cont->b_cont;
   7327 	iocp = (struct iocblk *)mp->b_rptr;
   7328 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7329 
   7330 	/*
   7331 	 * The original SIOCGIFCONF passed in a struct ifconf which specified
   7332 	 * the user buffer address and length into which the list of struct
   7333 	 * ifreqs was to be copied.  Since AT&T Streams does not seem to
   7334 	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
   7335 	 * the SIOCGIFCONF operation was redefined to simply provide
   7336 	 * a large output buffer into which we are supposed to jam the ifreq
   7337 	 * array.  The same ioctl command code was used, despite the fact that
   7338 	 * both the applications and the kernel code had to change, thus making
   7339 	 * it impossible to support both interfaces.
   7340 	 *
   7341 	 * For reasons not good enough to try to explain, the following
   7342 	 * algorithm is used for deciding what to do with one of these:
   7343 	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
   7344 	 * form with the output buffer coming down as the continuation message.
   7345 	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
   7346 	 * and we have to copy in the ifconf structure to find out how big the
   7347 	 * output buffer is and where to copy out to.  Sure no problem...
   7348 	 *
   7349 	 */
   7350 	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
   7351 	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
   7352 		int numifs = 0;
   7353 		size_t ifc_bufsize;
   7354 
   7355 		/*
   7356 		 * Must be (better be!) continuation of a TRANSPARENT
   7357 		 * IOCTL.  We just copied in the ifconf structure.
   7358 		 */
   7359 		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
   7360 		    (struct ifconf *)mp1->b_rptr);
   7361 
   7362 		/*
   7363 		 * Allocate a buffer to hold requested information.
   7364 		 *
   7365 		 * If ifc_len is larger than what is needed, we only
   7366 		 * allocate what we will use.
   7367 		 *
   7368 		 * If ifc_len is smaller than what is needed, return
   7369 		 * EINVAL.
   7370 		 *
   7371 		 * XXX: the ill_t structure can hava 2 counters, for
   7372 		 * v4 and v6 (not just ill_ipif_up_count) to store the
   7373 		 * number of interfaces for a device, so we don't need
   7374 		 * to count them here...
   7375 		 */
   7376 		numifs = ip_get_numifs(zoneid, ipst);
   7377 
   7378 		ifclen = STRUCT_FGET(ifc, ifc_len);
   7379 		ifc_bufsize = numifs * sizeof (struct ifreq);
   7380 		if (ifc_bufsize > ifclen) {
   7381 			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
   7382 				/* old behaviour */
   7383 				return (EINVAL);
   7384 			} else {
   7385 				ifc_bufsize = ifclen;
   7386 			}
   7387 		}
   7388 
   7389 		mp1 = mi_copyout_alloc(q, mp,
   7390 		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
   7391 		if (mp1 == NULL)
   7392 			return (ENOMEM);
   7393 
   7394 		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
   7395 	}
   7396 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
   7397 	/*
   7398 	 * the SIOCGIFCONF ioctl only knows about
   7399 	 * IPv4 addresses, so don't try to tell
   7400 	 * it about interfaces with IPv6-only
   7401 	 * addresses. (Last parm 'isv6' is B_FALSE)
   7402 	 */
   7403 
   7404 	ifr = (struct ifreq *)mp1->b_rptr;
   7405 
   7406 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7407 	ill = ILL_START_WALK_V4(&ctx, ipst);
   7408 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7409 		if (IS_UNDER_IPMP(ill))
   7410 			continue;
   7411 		for (ipif = ill->ill_ipif; ipif != NULL;
   7412 		    ipif = ipif->ipif_next) {
   7413 			if (zoneid != ipif->ipif_zoneid &&
   7414 			    ipif->ipif_zoneid != ALL_ZONES)
   7415 				continue;
   7416 			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
   7417 				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
   7418 					/* old behaviour */
   7419 					rw_exit(&ipst->ips_ill_g_lock);
   7420 					return (EINVAL);
   7421 				} else {
   7422 					goto if_copydone;
   7423 				}
   7424 			}
   7425 			ipif_get_name(ipif, ifr->ifr_name,
   7426 			    sizeof (ifr->ifr_name));
   7427 			sin = (sin_t *)&ifr->ifr_addr;
   7428 			*sin = sin_null;
   7429 			sin->sin_family = AF_INET;
   7430 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
   7431 			ifr++;
   7432 		}
   7433 	}
   7434 if_copydone:
   7435 	rw_exit(&ipst->ips_ill_g_lock);
   7436 	mp1->b_wptr = (uchar_t *)ifr;
   7437 
   7438 	if (STRUCT_BUF(ifc) != NULL) {
   7439 		STRUCT_FSET(ifc, ifc_len,
   7440 		    (int)((uchar_t *)ifr - mp1->b_rptr));
   7441 	}
   7442 	return (0);
   7443 }
   7444 
   7445 /*
   7446  * Get the interfaces using the address hosted on the interface passed in,
   7447  * as a source adddress
   7448  */
   7449 /* ARGSUSED */
   7450 int
   7451 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7452     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7453 {
   7454 	mblk_t *mp1;
   7455 	ill_t	*ill, *ill_head;
   7456 	ipif_t	*ipif, *orig_ipif;
   7457 	int	numlifs = 0;
   7458 	size_t	lifs_bufsize, lifsmaxlen;
   7459 	struct	lifreq *lifr;
   7460 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7461 	uint_t	ifindex;
   7462 	zoneid_t zoneid;
   7463 	boolean_t isv6 = B_FALSE;
   7464 	struct	sockaddr_in	*sin;
   7465 	struct	sockaddr_in6	*sin6;
   7466 	STRUCT_HANDLE(lifsrcof, lifs);
   7467 	ip_stack_t		*ipst;
   7468 
   7469 	ipst = CONNQ_TO_IPST(q);
   7470 
   7471 	ASSERT(q->q_next == NULL);
   7472 
   7473 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7474 
   7475 	/* Existence verified in ip_wput_nondata */
   7476 	mp1 = mp->b_cont->b_cont;
   7477 
   7478 	/*
   7479 	 * Must be (better be!) continuation of a TRANSPARENT
   7480 	 * IOCTL.  We just copied in the lifsrcof structure.
   7481 	 */
   7482 	STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
   7483 	    (struct lifsrcof *)mp1->b_rptr);
   7484 
   7485 	if (MBLKL(mp1) != STRUCT_SIZE(lifs))
   7486 		return (EINVAL);
   7487 
   7488 	ifindex = STRUCT_FGET(lifs, lifs_ifindex);
   7489 	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
   7490 	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
   7491 	if (ipif == NULL) {
   7492 		ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
   7493 		    ifindex));
   7494 		return (ENXIO);
   7495 	}
   7496 
   7497 	/* Allocate a buffer to hold requested information */
   7498 	numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
   7499 	lifs_bufsize = numlifs * sizeof (struct lifreq);
   7500 	lifsmaxlen =  STRUCT_FGET(lifs, lifs_maxlen);
   7501 	/* The actual size needed is always returned in lifs_len */
   7502 	STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
   7503 
   7504 	/* If the amount we need is more than what is passed in, abort */
   7505 	if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
   7506 		ipif_refrele(ipif);
   7507 		return (0);
   7508 	}
   7509 
   7510 	mp1 = mi_copyout_alloc(q, mp,
   7511 	    STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
   7512 	if (mp1 == NULL) {
   7513 		ipif_refrele(ipif);
   7514 		return (ENOMEM);
   7515 	}
   7516 
   7517 	mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
   7518 	bzero(mp1->b_rptr, lifs_bufsize);
   7519 
   7520 	lifr = (struct lifreq *)mp1->b_rptr;
   7521 
   7522 	ill = ill_head = ipif->ipif_ill;
   7523 	orig_ipif = ipif;
   7524 
   7525 	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
   7526 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
   7527 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7528 
   7529 	ill = ill->ill_usesrc_grp_next; /* start from next ill */
   7530 	for (; (ill != NULL) && (ill != ill_head);
   7531 	    ill = ill->ill_usesrc_grp_next) {
   7532 
   7533 		if ((uchar_t *)&lifr[1] > mp1->b_wptr)
   7534 			break;
   7535 
   7536 		ipif = ill->ill_ipif;
   7537 		ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
   7538 		if (ipif->ipif_isv6) {
   7539 			sin6 = (sin6_t *)&lifr->lifr_addr;
   7540 			*sin6 = sin6_null;
   7541 			sin6->sin6_family = AF_INET6;
   7542 			sin6->sin6_addr = ipif->ipif_v6lcl_addr;
   7543 			lifr->lifr_addrlen = ip_mask_to_plen_v6(
   7544 			    &ipif->ipif_v6net_mask);
   7545 		} else {
   7546 			sin = (sin_t *)&lifr->lifr_addr;
   7547 			*sin = sin_null;
   7548 			sin->sin_family = AF_INET;
   7549 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
   7550 			lifr->lifr_addrlen = ip_mask_to_plen(
   7551 			    ipif->ipif_net_mask);
   7552 		}
   7553 		lifr++;
   7554 	}
   7555 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
   7556 	rw_exit(&ipst->ips_ill_g_lock);
   7557 	ipif_refrele(orig_ipif);
   7558 	mp1->b_wptr = (uchar_t *)lifr;
   7559 	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
   7560 
   7561 	return (0);
   7562 }
   7563 
   7564 /* ARGSUSED */
   7565 int
   7566 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7567     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7568 {
   7569 	mblk_t *mp1;
   7570 	int	list;
   7571 	ill_t	*ill;
   7572 	ipif_t	*ipif;
   7573 	int	flags;
   7574 	int	numlifs = 0;
   7575 	size_t	lifc_bufsize;
   7576 	struct	lifreq *lifr;
   7577 	sa_family_t	family;
   7578 	struct	sockaddr_in	*sin;
   7579 	struct	sockaddr_in6	*sin6;
   7580 	ill_walk_context_t	ctx;
   7581 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7582 	int32_t	lifclen;
   7583 	zoneid_t zoneid;
   7584 	STRUCT_HANDLE(lifconf, lifc);
   7585 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   7586 
   7587 	ip1dbg(("ip_sioctl_get_lifconf"));
   7588 
   7589 	ASSERT(q->q_next == NULL);
   7590 
   7591 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7592 
   7593 	/* Existence verified in ip_wput_nondata */
   7594 	mp1 = mp->b_cont->b_cont;
   7595 
   7596 	/*
   7597 	 * An extended version of SIOCGIFCONF that takes an
   7598 	 * additional address family and flags field.
   7599 	 * AF_UNSPEC retrieve both IPv4 and IPv6.
   7600 	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
   7601 	 * interfaces are omitted.
   7602 	 * Similarly, IPIF_TEMPORARY interfaces are omitted
   7603 	 * unless LIFC_TEMPORARY is specified.
   7604 	 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
   7605 	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
   7606 	 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
   7607 	 * has priority over LIFC_NOXMIT.
   7608 	 */
   7609 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
   7610 
   7611 	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
   7612 		return (EINVAL);
   7613 
   7614 	/*
   7615 	 * Must be (better be!) continuation of a TRANSPARENT
   7616 	 * IOCTL.  We just copied in the lifconf structure.
   7617 	 */
   7618 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
   7619 
   7620 	family = STRUCT_FGET(lifc, lifc_family);
   7621 	flags = STRUCT_FGET(lifc, lifc_flags);
   7622 
   7623 	switch (family) {
   7624 	case AF_UNSPEC:
   7625 		/*
   7626 		 * walk all ILL's.
   7627 		 */
   7628 		list = MAX_G_HEADS;
   7629 		break;
   7630 	case AF_INET:
   7631 		/*
   7632 		 * walk only IPV4 ILL's.
   7633 		 */
   7634 		list = IP_V4_G_HEAD;
   7635 		break;
   7636 	case AF_INET6:
   7637 		/*
   7638 		 * walk only IPV6 ILL's.
   7639 		 */
   7640 		list = IP_V6_G_HEAD;
   7641 		break;
   7642 	default:
   7643 		return (EAFNOSUPPORT);
   7644 	}
   7645 
   7646 	/*
   7647 	 * Allocate a buffer to hold requested information.
   7648 	 *
   7649 	 * If lifc_len is larger than what is needed, we only
   7650 	 * allocate what we will use.
   7651 	 *
   7652 	 * If lifc_len is smaller than what is needed, return
   7653 	 * EINVAL.
   7654 	 */
   7655 	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
   7656 	lifc_bufsize = numlifs * sizeof (struct lifreq);
   7657 	lifclen = STRUCT_FGET(lifc, lifc_len);
   7658 	if (lifc_bufsize > lifclen) {
   7659 		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
   7660 			return (EINVAL);
   7661 		else
   7662 			lifc_bufsize = lifclen;
   7663 	}
   7664 
   7665 	mp1 = mi_copyout_alloc(q, mp,
   7666 	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
   7667 	if (mp1 == NULL)
   7668 		return (ENOMEM);
   7669 
   7670 	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
   7671 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
   7672 
   7673 	lifr = (struct lifreq *)mp1->b_rptr;
   7674 
   7675 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7676 	ill = ill_first(list, list, &ctx, ipst);
   7677 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7678 		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
   7679 			continue;
   7680 
   7681 		for (ipif = ill->ill_ipif; ipif != NULL;
   7682 		    ipif = ipif->ipif_next) {
   7683 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
   7684 			    !(flags & LIFC_NOXMIT))
   7685 				continue;
   7686 
   7687 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
   7688 			    !(flags & LIFC_TEMPORARY))
   7689 				continue;
   7690 
   7691 			if (((ipif->ipif_flags &
   7692 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
   7693 			    IPIF_DEPRECATED)) ||
   7694 			    IS_LOOPBACK(ill) ||
   7695 			    !(ipif->ipif_flags & IPIF_UP)) &&
   7696 			    (flags & LIFC_EXTERNAL_SOURCE))
   7697 				continue;
   7698 
   7699 			if (zoneid != ipif->ipif_zoneid &&
   7700 			    ipif->ipif_zoneid != ALL_ZONES &&
   7701 			    (zoneid != GLOBAL_ZONEID ||
   7702 			    !(flags & LIFC_ALLZONES)))
   7703 				continue;
   7704 
   7705 			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
   7706 				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
   7707 					rw_exit(&ipst->ips_ill_g_lock);
   7708 					return (EINVAL);
   7709 				} else {
   7710 					goto lif_copydone;
   7711 				}
   7712 			}
   7713 
   7714 			ipif_get_name(ipif, lifr->lifr_name,
   7715 			    sizeof (lifr->lifr_name));
   7716 			lifr->lifr_type = ill->ill_type;
   7717 			if (ipif->ipif_isv6) {
   7718 				sin6 = (sin6_t *)&lifr->lifr_addr;
   7719 				*sin6 = sin6_null;
   7720 				sin6->sin6_family = AF_INET6;
   7721 				sin6->sin6_addr =
   7722 				    ipif->ipif_v6lcl_addr;
   7723 				lifr->lifr_addrlen =
   7724 				    ip_mask_to_plen_v6(
   7725 				    &ipif->ipif_v6net_mask);
   7726 			} else {
   7727 				sin = (sin_t *)&lifr->lifr_addr;
   7728 				*sin = sin_null;
   7729 				sin->sin_family = AF_INET;
   7730 				sin->sin_addr.s_addr =
   7731 				    ipif->ipif_lcl_addr;
   7732 				lifr->lifr_addrlen =
   7733 				    ip_mask_to_plen(
   7734 				    ipif->ipif_net_mask);
   7735 			}
   7736 			lifr++;
   7737 		}
   7738 	}
   7739 lif_copydone:
   7740 	rw_exit(&ipst->ips_ill_g_lock);
   7741 
   7742 	mp1->b_wptr = (uchar_t *)lifr;
   7743 	if (STRUCT_BUF(lifc) != NULL) {
   7744 		STRUCT_FSET(lifc, lifc_len,
   7745 		    (int)((uchar_t *)lifr - mp1->b_rptr));
   7746 	}
   7747 	return (0);
   7748 }
   7749 
   7750 static void
   7751 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
   7752 {
   7753 	ip6_asp_t *table;
   7754 	size_t table_size;
   7755 	mblk_t *data_mp;
   7756 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7757 	ip_stack_t	*ipst;
   7758 
   7759 	if (q->q_next == NULL)
   7760 		ipst = CONNQ_TO_IPST(q);
   7761 	else
   7762 		ipst = ILLQ_TO_IPST(q);
   7763 
   7764 	/* These two ioctls are I_STR only */
   7765 	if (iocp->ioc_count == TRANSPARENT) {
   7766 		miocnak(q, mp, 0, EINVAL);
   7767 		return;
   7768 	}
   7769 
   7770 	data_mp = mp->b_cont;
   7771 	if (data_mp == NULL) {
   7772 		/* The user passed us a NULL argument */
   7773 		table = NULL;
   7774 		table_size = iocp->ioc_count;
   7775 	} else {
   7776 		/*
   7777 		 * The user provided a table.  The stream head
   7778 		 * may have copied in the user data in chunks,
   7779 		 * so make sure everything is pulled up
   7780 		 * properly.
   7781 		 */
   7782 		if (MBLKL(data_mp) < iocp->ioc_count) {
   7783 			mblk_t *new_data_mp;
   7784 			if ((new_data_mp = msgpullup(data_mp, -1)) ==
   7785 			    NULL) {
   7786 				miocnak(q, mp, 0, ENOMEM);
   7787 				return;
   7788 			}
   7789 			freemsg(data_mp);
   7790 			data_mp = new_data_mp;
   7791 			mp->b_cont = data_mp;
   7792 		}
   7793 		table = (ip6_asp_t *)data_mp->b_rptr;
   7794 		table_size = iocp->ioc_count;
   7795 	}
   7796 
   7797 	switch (iocp->ioc_cmd) {
   7798 	case SIOCGIP6ADDRPOLICY:
   7799 		iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
   7800 		if (iocp->ioc_rval == -1)
   7801 			iocp->ioc_error = EINVAL;
   7802 #if defined(_SYSCALL32_IMPL) &&