Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 /*
     28  * This file contains the interface control functions for IP.
     29  */
     30 
     31 #include <sys/types.h>
     32 #include <sys/stream.h>
     33 #include <sys/dlpi.h>
     34 #include <sys/stropts.h>
     35 #include <sys/strsun.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/strsubr.h>
     38 #include <sys/strlog.h>
     39 #include <sys/ddi.h>
     40 #include <sys/sunddi.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/kstat.h>
     43 #include <sys/debug.h>
     44 #include <sys/zone.h>
     45 #include <sys/sunldi.h>
     46 #include <sys/file.h>
     47 #include <sys/bitmap.h>
     48 #include <sys/cpuvar.h>
     49 #include <sys/time.h>
     50 #include <sys/ctype.h>
     51 #include <sys/kmem.h>
     52 #include <sys/systm.h>
     53 #include <sys/param.h>
     54 #include <sys/socket.h>
     55 #include <sys/isa_defs.h>
     56 #include <net/if.h>
     57 #include <net/if_arp.h>
     58 #include <net/if_types.h>
     59 #include <net/if_dl.h>
     60 #include <net/route.h>
     61 #include <sys/sockio.h>
     62 #include <netinet/in.h>
     63 #include <netinet/ip6.h>
     64 #include <netinet/icmp6.h>
     65 #include <netinet/igmp_var.h>
     66 #include <sys/policy.h>
     67 #include <sys/ethernet.h>
     68 #include <sys/callb.h>
     69 #include <sys/md5.h>
     70 
     71 #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
     72 #include <inet/mi.h>
     73 #include <inet/nd.h>
     74 #include <inet/arp.h>
     75 #include <inet/ip_arp.h>
     76 #include <inet/mib2.h>
     77 #include <inet/ip.h>
     78 #include <inet/ip6.h>
     79 #include <inet/ip6_asp.h>
     80 #include <inet/tcp.h>
     81 #include <inet/ip_multi.h>
     82 #include <inet/ip_ire.h>
     83 #include <inet/ip_ftable.h>
     84 #include <inet/ip_rts.h>
     85 #include <inet/ip_ndp.h>
     86 #include <inet/ip_if.h>
     87 #include <inet/ip_impl.h>
     88 #include <inet/sctp_ip.h>
     89 #include <inet/ip_netinfo.h>
     90 #include <inet/ilb_ip.h>
     91 
     92 #include <netinet/igmp.h>
     93 #include <inet/ip_listutils.h>
     94 #include <inet/ipclassifier.h>
     95 #include <sys/mac_client.h>
     96 #include <sys/dld.h>
     97 
     98 #include <sys/systeminfo.h>
     99 #include <sys/bootconf.h>
    100 
    101 #include <sys/tsol/tndb.h>
    102 #include <sys/tsol/tnet.h>
    103 
    104 /* The character which tells where the ill_name ends */
    105 #define	IPIF_SEPARATOR_CHAR	':'
    106 
    107 /* IP ioctl function table entry */
    108 typedef struct ipft_s {
    109 	int	ipft_cmd;
    110 	pfi_t	ipft_pfi;
    111 	int	ipft_min_size;
    112 	int	ipft_flags;
    113 } ipft_t;
    114 #define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
    115 #define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
    116 
    117 static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    118 static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    119 		    char *value, caddr_t cp, cred_t *ioc_cr);
    120 
    121 static boolean_t ill_is_quiescent(ill_t *);
    122 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
    123 static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
    124 static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    125     mblk_t *mp, boolean_t need_up);
    126 static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    127     mblk_t *mp, boolean_t need_up);
    128 static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    129     queue_t *q, mblk_t *mp, boolean_t need_up);
    130 static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    131     mblk_t *mp);
    132 static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    133     mblk_t *mp);
    134 static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    135     queue_t *q, mblk_t *mp, boolean_t need_up);
    136 static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    137     int ioccmd, struct linkblk *li);
    138 static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
    139 static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
    140 static void	ipsq_flush(ill_t *ill);
    141 
    142 static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    143     queue_t *q, mblk_t *mp, boolean_t need_up);
    144 static void	ipsq_delete(ipsq_t *);
    145 
    146 static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    147     boolean_t initialize, boolean_t insert, int *errorp);
    148 static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
    149 static void	ipif_delete_bcast_ires(ipif_t *ipif);
    150 static int	ipif_add_ires_v4(ipif_t *, boolean_t);
    151 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    152 		    boolean_t isv6);
    153 static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
    154 static void	ipif_free(ipif_t *ipif);
    155 static void	ipif_free_tail(ipif_t *ipif);
    156 static void	ipif_set_default(ipif_t *ipif);
    157 static int	ipif_set_values(queue_t *q, mblk_t *mp,
    158     char *interf_name, uint_t *ppa);
    159 static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    160     queue_t *q);
    161 static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    162     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    163     ip_stack_t *);
    164 
    165 static int	ill_alloc_ppa(ill_if_t *, ill_t *);
    166 static void	ill_delete_interface_type(ill_if_t *);
    167 static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
    168 static void	ill_dl_down(ill_t *ill);
    169 static void	ill_down(ill_t *ill);
    170 static void	ill_down_ipifs(ill_t *, boolean_t);
    171 static void	ill_free_mib(ill_t *ill);
    172 static void	ill_glist_delete(ill_t *);
    173 static void	ill_phyint_reinit(ill_t *ill);
    174 static void	ill_set_nce_router_flags(ill_t *, boolean_t);
    175 static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    176 static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    177 
    178 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
    179 static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
    180 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
    181 static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
    182 static ip_v4mapinfo_func_t ip_ether_v4_mapping;
    183 static ip_v6mapinfo_func_t ip_ether_v6_mapping;
    184 static ip_v4mapinfo_func_t ip_ib_v4_mapping;
    185 static ip_v6mapinfo_func_t ip_ib_v6_mapping;
    186 static ip_v4mapinfo_func_t ip_mbcast_mapping;
    187 static void 	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
    188 static void 	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
    189 static void	phyint_free(phyint_t *);
    190 
    191 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
    192 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    193 static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    194 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    195 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
    196 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    197     dl_capability_sub_t *);
    198 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
    199 static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
    200 static void	ill_capability_dld_ack(ill_t *, mblk_t *,
    201 		    dl_capability_sub_t *);
    202 static void	ill_capability_dld_enable(ill_t *);
    203 static void	ill_capability_ack_thr(void *);
    204 static void	ill_capability_lso_enable(ill_t *);
    205 
    206 static ill_t	*ill_prev_usesrc(ill_t *);
    207 static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
    208 static void	ill_disband_usesrc_group(ill_t *);
    209 static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
    210 
    211 #ifdef DEBUG
    212 static	void	ill_trace_cleanup(const ill_t *);
    213 static	void	ipif_trace_cleanup(const ipif_t *);
    214 #endif
    215 
    216 static	void	ill_dlpi_clear_deferred(ill_t *ill);
    217 
    218 /*
    219  * if we go over the memory footprint limit more than once in this msec
    220  * interval, we'll start pruning aggressively.
    221  */
    222 int ip_min_frag_prune_time = 0;
    223 
    224 static ipft_t	ip_ioctl_ftbl[] = {
    225 	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
    226 	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
    227 		IPFT_F_NO_REPLY },
    228 	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
    229 	{ 0 }
    230 };
    231 
    232 /* Simple ICMP IP Header Template */
    233 static ipha_t icmp_ipha = {
    234 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
    235 };
    236 
    237 static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
    238 
    239 static ip_m_t   ip_m_tbl[] = {
    240 	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
    241 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
    242 	    ip_nodef_v6intfid },
    243 	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
    244 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
    245 	    ip_nodef_v6intfid },
    246 	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
    247 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
    248 	    ip_nodef_v6intfid },
    249 	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
    250 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
    251 	    ip_nodef_v6intfid },
    252 	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
    253 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
    254 	    ip_nodef_v6intfid },
    255 	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
    256 	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
    257 	    ip_nodef_v6intfid },
    258 	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
    259 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
    260 	    ip_ipv4_v6destintfid },
    261 	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
    262 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
    263 	    ip_ipv6_v6destintfid },
    264 	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
    265 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
    266 	    ip_nodef_v6intfid },
    267 	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
    268 	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
    269 	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
    270 	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
    271 	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
    272 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
    273 	    ip_nodef_v6intfid }
    274 };
    275 
    276 static ill_t	ill_null;		/* Empty ILL for init. */
    277 char	ipif_loopback_name[] = "lo0";
    278 static char *ipv4_forward_suffix = ":ip_forwarding";
    279 static char *ipv6_forward_suffix = ":ip6_forwarding";
    280 static	sin6_t	sin6_null;	/* Zero address for quick clears */
    281 static	sin_t	sin_null;	/* Zero address for quick clears */
    282 
    283 /* When set search for unused ipif_seqid */
    284 static ipif_t	ipif_zero;
    285 
    286 /*
    287  * ppa arena is created after these many
    288  * interfaces have been plumbed.
    289  */
    290 uint_t	ill_no_arena = 12;	/* Setable in /etc/system */
    291 
    292 /*
    293  * Allocate per-interface mibs.
    294  * Returns true if ok. False otherwise.
    295  *  ipsq  may not yet be allocated (loopback case ).
    296  */
    297 static boolean_t
    298 ill_allocate_mibs(ill_t *ill)
    299 {
    300 	/* Already allocated? */
    301 	if (ill->ill_ip_mib != NULL) {
    302 		if (ill->ill_isv6)
    303 			ASSERT(ill->ill_icmp6_mib != NULL);
    304 		return (B_TRUE);
    305 	}
    306 
    307 	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
    308 	    KM_NOSLEEP);
    309 	if (ill->ill_ip_mib == NULL) {
    310 		return (B_FALSE);
    311 	}
    312 
    313 	/* Setup static information */
    314 	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
    315 	    sizeof (mib2_ipIfStatsEntry_t));
    316 	if (ill->ill_isv6) {
    317 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
    318 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    319 		    sizeof (mib2_ipv6AddrEntry_t));
    320 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    321 		    sizeof (mib2_ipv6RouteEntry_t));
    322 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    323 		    sizeof (mib2_ipv6NetToMediaEntry_t));
    324 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    325 		    sizeof (ipv6_member_t));
    326 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    327 		    sizeof (ipv6_grpsrc_t));
    328 	} else {
    329 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
    330 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    331 		    sizeof (mib2_ipAddrEntry_t));
    332 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    333 		    sizeof (mib2_ipRouteEntry_t));
    334 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    335 		    sizeof (mib2_ipNetToMediaEntry_t));
    336 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    337 		    sizeof (ip_member_t));
    338 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    339 		    sizeof (ip_grpsrc_t));
    340 
    341 		/*
    342 		 * For a v4 ill, we are done at this point, because per ill
    343 		 * icmp mibs are only used for v6.
    344 		 */
    345 		return (B_TRUE);
    346 	}
    347 
    348 	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
    349 	    KM_NOSLEEP);
    350 	if (ill->ill_icmp6_mib == NULL) {
    351 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    352 		ill->ill_ip_mib = NULL;
    353 		return (B_FALSE);
    354 	}
    355 	/* static icmp info */
    356 	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
    357 	    sizeof (mib2_ipv6IfIcmpEntry_t);
    358 	/*
    359 	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
    360 	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
    361 	 * -> ill_phyint_reinit
    362 	 */
    363 	return (B_TRUE);
    364 }
    365 
    366 /*
    367  * Completely vaporize a lower level tap and all associated interfaces.
    368  * ill_delete is called only out of ip_close when the device control
    369  * stream is being closed.
    370  */
    371 void
    372 ill_delete(ill_t *ill)
    373 {
    374 	ipif_t	*ipif;
    375 	ill_t	*prev_ill;
    376 	ip_stack_t	*ipst = ill->ill_ipst;
    377 
    378 	/*
    379 	 * ill_delete may be forcibly entering the ipsq. The previous
    380 	 * ioctl may not have completed and may need to be aborted.
    381 	 * ipsq_flush takes care of it. If we don't need to enter the
    382 	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
    383 	 * ill_delete_tail is sufficient.
    384 	 */
    385 	ipsq_flush(ill);
    386 
    387 	/*
    388 	 * Nuke all interfaces.  ipif_free will take down the interface,
    389 	 * remove it from the list, and free the data structure.
    390 	 * Walk down the ipif list and remove the logical interfaces
    391 	 * first before removing the main ipif. We can't unplumb
    392 	 * zeroth interface first in the case of IPv6 as update_conn_ill
    393 	 * -> ip_ll_multireq de-references ill_ipif for checking
    394 	 * POINTOPOINT.
    395 	 *
    396 	 * If ill_ipif was not properly initialized (i.e low on memory),
    397 	 * then no interfaces to clean up. In this case just clean up the
    398 	 * ill.
    399 	 */
    400 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
    401 		ipif_free(ipif);
    402 
    403 	/*
    404 	 * clean out all the nce_t entries that depend on this
    405 	 * ill for the ill_phys_addr.
    406 	 */
    407 	nce_flush(ill, B_TRUE);
    408 
    409 	/* Clean up msgs on pending upcalls for mrouted */
    410 	reset_mrt_ill(ill);
    411 
    412 	update_conn_ill(ill, ipst);
    413 
    414 	/*
    415 	 * Remove multicast references added as a result of calls to
    416 	 * ip_join_allmulti().
    417 	 */
    418 	ip_purge_allmulti(ill);
    419 
    420 	/*
    421 	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
    422 	 */
    423 	if (IS_UNDER_IPMP(ill))
    424 		ipmp_ill_leave_illgrp(ill);
    425 
    426 	/*
    427 	 * ill_down will arrange to blow off any IRE's dependent on this
    428 	 * ILL, and shut down fragmentation reassembly.
    429 	 */
    430 	ill_down(ill);
    431 
    432 	/* Let SCTP know, so that it can remove this from its list. */
    433 	sctp_update_ill(ill, SCTP_ILL_REMOVE);
    434 
    435 	/*
    436 	 * Walk all CONNs that can have a reference on an ire or nce for this
    437 	 * ill (we actually walk all that now have stale references).
    438 	 */
    439 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
    440 
    441 	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
    442 	if (ill->ill_isv6)
    443 		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);
    444 
    445 	/*
    446 	 * If an address on this ILL is being used as a source address then
    447 	 * clear out the pointers in other ILLs that point to this ILL.
    448 	 */
    449 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
    450 	if (ill->ill_usesrc_grp_next != NULL) {
    451 		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
    452 			ill_disband_usesrc_group(ill);
    453 		} else {	/* consumer of the usesrc ILL */
    454 			prev_ill = ill_prev_usesrc(ill);
    455 			prev_ill->ill_usesrc_grp_next =
    456 			    ill->ill_usesrc_grp_next;
    457 		}
    458 	}
    459 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
    460 }
    461 
    462 static void
    463 ipif_non_duplicate(ipif_t *ipif)
    464 {
    465 	ill_t *ill = ipif->ipif_ill;
    466 	mutex_enter(&ill->ill_lock);
    467 	if (ipif->ipif_flags & IPIF_DUPLICATE) {
    468 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
    469 		ASSERT(ill->ill_ipif_dup_count > 0);
    470 		ill->ill_ipif_dup_count--;
    471 	}
    472 	mutex_exit(&ill->ill_lock);
    473 }
    474 
    475 /*
    476  * ill_delete_tail is called from ip_modclose after all references
    477  * to the closing ill are gone. The wait is done in ip_modclose
    478  */
    479 void
    480 ill_delete_tail(ill_t *ill)
    481 {
    482 	mblk_t	**mpp;
    483 	ipif_t	*ipif;
    484 	ip_stack_t	*ipst = ill->ill_ipst;
    485 
    486 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
    487 		ipif_non_duplicate(ipif);
    488 		(void) ipif_down_tail(ipif);
    489 	}
    490 
    491 	ASSERT(ill->ill_ipif_dup_count == 0);
    492 
    493 	/*
    494 	 * If polling capability is enabled (which signifies direct
    495 	 * upcall into IP and driver has ill saved as a handle),
    496 	 * we need to make sure that unbind has completed before we
    497 	 * let the ill disappear and driver no longer has any reference
    498 	 * to this ill.
    499 	 */
    500 	mutex_enter(&ill->ill_lock);
    501 	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
    502 		cv_wait(&ill->ill_cv, &ill->ill_lock);
    503 	mutex_exit(&ill->ill_lock);
    504 	ASSERT(!(ill->ill_capabilities &
    505 	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));
    506 
    507 	if (ill->ill_net_type != IRE_LOOPBACK)
    508 		qprocsoff(ill->ill_rq);
    509 
    510 	/*
    511 	 * We do an ipsq_flush once again now. New messages could have
    512 	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
    513 	 * could also have landed up if an ioctl thread had looked up
    514 	 * the ill before we set the ILL_CONDEMNED flag, but not yet
    515 	 * enqueued the ioctl when we did the ipsq_flush last time.
    516 	 */
    517 	ipsq_flush(ill);
    518 
    519 	/*
    520 	 * Free capabilities.
    521 	 */
    522 	if (ill->ill_hcksum_capab != NULL) {
    523 		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
    524 		ill->ill_hcksum_capab = NULL;
    525 	}
    526 
    527 	if (ill->ill_zerocopy_capab != NULL) {
    528 		kmem_free(ill->ill_zerocopy_capab,
    529 		    sizeof (ill_zerocopy_capab_t));
    530 		ill->ill_zerocopy_capab = NULL;
    531 	}
    532 
    533 	if (ill->ill_lso_capab != NULL) {
    534 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
    535 		ill->ill_lso_capab = NULL;
    536 	}
    537 
    538 	if (ill->ill_dld_capab != NULL) {
    539 		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
    540 		ill->ill_dld_capab = NULL;
    541 	}
    542 
    543 	while (ill->ill_ipif != NULL)
    544 		ipif_free_tail(ill->ill_ipif);
    545 
    546 	/*
    547 	 * We have removed all references to ilm from conn and the ones joined
    548 	 * within the kernel.
    549 	 *
    550 	 * We don't walk conns, mrts and ires because
    551 	 *
    552 	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
    553 	 * 2) ill_down ->ill_downi walks all the ires and cleans up
    554 	 *    ill references.
    555 	 */
    556 
    557 	/*
    558 	 * If this ill is an IPMP meta-interface, blow away the illgrp.  This
    559 	 * is safe to do because the illgrp has already been unlinked from the
    560 	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
    561 	 */
    562 	if (IS_IPMP(ill)) {
    563 		ipmp_illgrp_destroy(ill->ill_grp);
    564 		ill->ill_grp = NULL;
    565 	}
    566 
    567 	/*
    568 	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
    569 	 * could free the phyint. No more reference to the phyint after this
    570 	 * point.
    571 	 */
    572 	(void) ill_glist_delete(ill);
    573 
    574 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
    575 	if (ill->ill_ndd_name != NULL)
    576 		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
    577 	rw_exit(&ipst->ips_ip_g_nd_lock);
    578 
    579 	if (ill->ill_frag_ptr != NULL) {
    580 		uint_t count;
    581 
    582 		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
    583 			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
    584 		}
    585 		mi_free(ill->ill_frag_ptr);
    586 		ill->ill_frag_ptr = NULL;
    587 		ill->ill_frag_hash_tbl = NULL;
    588 	}
    589 
    590 	freemsg(ill->ill_nd_lla_mp);
    591 	/* Free all retained control messages. */
    592 	mpp = &ill->ill_first_mp_to_free;
    593 	do {
    594 		while (mpp[0]) {
    595 			mblk_t  *mp;
    596 			mblk_t  *mp1;
    597 
    598 			mp = mpp[0];
    599 			mpp[0] = mp->b_next;
    600 			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
    601 				mp1->b_next = NULL;
    602 				mp1->b_prev = NULL;
    603 			}
    604 			freemsg(mp);
    605 		}
    606 	} while (mpp++ != &ill->ill_last_mp_to_free);
    607 
    608 	ill_free_mib(ill);
    609 
    610 #ifdef DEBUG
    611 	ill_trace_cleanup(ill);
    612 #endif
    613 
    614 	/* The default multicast interface might have changed */
    615 	ire_increment_multicast_generation(ipst, ill->ill_isv6);
    616 
    617 	/* Drop refcnt here */
    618 	netstack_rele(ill->ill_ipst->ips_netstack);
    619 	ill->ill_ipst = NULL;
    620 }
    621 
    622 static void
    623 ill_free_mib(ill_t *ill)
    624 {
    625 	ip_stack_t *ipst = ill->ill_ipst;
    626 
    627 	/*
    628 	 * MIB statistics must not be lost, so when an interface
    629 	 * goes away the counter values will be added to the global
    630 	 * MIBs.
    631 	 */
    632 	if (ill->ill_ip_mib != NULL) {
    633 		if (ill->ill_isv6) {
    634 			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
    635 			    ill->ill_ip_mib);
    636 		} else {
    637 			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
    638 			    ill->ill_ip_mib);
    639 		}
    640 
    641 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    642 		ill->ill_ip_mib = NULL;
    643 	}
    644 	if (ill->ill_icmp6_mib != NULL) {
    645 		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
    646 		    ill->ill_icmp6_mib);
    647 		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
    648 		ill->ill_icmp6_mib = NULL;
    649 	}
    650 }
    651 
    652 /*
    653  * Concatenate together a physical address and a sap.
    654  *
    655  * Sap_lengths are interpreted as follows:
    656  *   sap_length == 0	==>	no sap
    657  *   sap_length > 0	==>	sap is at the head of the dlpi address
    658  *   sap_length < 0	==>	sap is at the tail of the dlpi address
    659  */
    660 static void
    661 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    662     t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
    663 {
    664 	uint16_t sap_addr = (uint16_t)sap_src;
    665 
    666 	if (sap_length == 0) {
    667 		if (phys_src == NULL)
    668 			bzero(dst, phys_length);
    669 		else
    670 			bcopy(phys_src, dst, phys_length);
    671 	} else if (sap_length < 0) {
    672 		if (phys_src == NULL)
    673 			bzero(dst, phys_length);
    674 		else
    675 			bcopy(phys_src, dst, phys_length);
    676 		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    677 	} else {
    678 		bcopy(&sap_addr, dst, sizeof (sap_addr));
    679 		if (phys_src == NULL)
    680 			bzero((char *)dst + sap_length, phys_length);
    681 		else
    682 			bcopy(phys_src, (char *)dst + sap_length, phys_length);
    683 	}
    684 }
    685 
    686 /*
    687  * Generate a dl_unitdata_req mblk for the device and address given.
    688  * addr_length is the length of the physical portion of the address.
    689  * If addr is NULL include an all zero address of the specified length.
    690  * TRUE? In any case, addr_length is taken to be the entire length of the
    691  * dlpi address, including the absolute value of sap_length.
    692  */
    693 mblk_t *
    694 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    695 		t_scalar_t sap_length)
    696 {
    697 	dl_unitdata_req_t *dlur;
    698 	mblk_t	*mp;
    699 	t_scalar_t	abs_sap_length;		/* absolute value */
    700 
    701 	abs_sap_length = ABS(sap_length);
    702 	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
    703 	    DL_UNITDATA_REQ);
    704 	if (mp == NULL)
    705 		return (NULL);
    706 	dlur = (dl_unitdata_req_t *)mp->b_rptr;
    707 	/* HACK: accomodate incompatible DLPI drivers */
    708 	if (addr_length == 8)
    709 		addr_length = 6;
    710 	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    711 	dlur->dl_dest_addr_offset = sizeof (*dlur);
    712 	dlur->dl_priority.dl_min = 0;
    713 	dlur->dl_priority.dl_max = 0;
    714 	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
    715 	    (uchar_t *)&dlur[1]);
    716 	return (mp);
    717 }
    718 
    719 /*
    720  * Add the pending mp to the list. There can be only 1 pending mp
    721  * in the list. Any exclusive ioctl that needs to wait for a response
    722  * from another module or driver needs to use this function to set
    723  * the ipx_pending_mp to the ioctl mblk and wait for the response from
    724  * the other module/driver. This is also used while waiting for the
    725  * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
    726  */
    727 boolean_t
    728 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    729     int waitfor)
    730 {
    731 	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
    732 
    733 	ASSERT(IAM_WRITER_IPIF(ipif));
    734 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    735 	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    736 	ASSERT(ipx->ipx_pending_mp == NULL);
    737 	/*
    738 	 * The caller may be using a different ipif than the one passed into
    739 	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
    740 	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
    741 	 * that `ipx_current_ipif == ipif'.
    742 	 */
    743 	ASSERT(ipx->ipx_current_ipif != NULL);
    744 
    745 	/*
    746 	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
    747 	 * driver.
    748 	 */
    749 	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
    750 	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
    751 	    (DB_TYPE(add_mp) == M_PCPROTO));
    752 
    753 	if (connp != NULL) {
    754 		ASSERT(MUTEX_HELD(&connp->conn_lock));
    755 		/*
    756 		 * Return error if the conn has started closing. The conn
    757 		 * could have finished cleaning up the pending mp list,
    758 		 * If so we should not add another mp to the list negating
    759 		 * the cleanup.
    760 		 */
    761 		if (connp->conn_state_flags & CONN_CLOSING)
    762 			return (B_FALSE);
    763 	}
    764 	mutex_enter(&ipx->ipx_lock);
    765 	ipx->ipx_pending_ipif = ipif;
    766 	/*
    767 	 * Note down the queue in b_queue. This will be returned by
    768 	 * ipsq_pending_mp_get. Caller will then use these values to restart
    769 	 * the processing
    770 	 */
    771 	add_mp->b_next = NULL;
    772 	add_mp->b_queue = q;
    773 	ipx->ipx_pending_mp = add_mp;
    774 	ipx->ipx_waitfor = waitfor;
    775 	mutex_exit(&ipx->ipx_lock);
    776 
    777 	if (connp != NULL)
    778 		connp->conn_oper_pending_ill = ipif->ipif_ill;
    779 
    780 	return (B_TRUE);
    781 }
    782 
    783 /*
    784  * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
    785  * queued in the list.
    786  */
    787 mblk_t *
    788 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
    789 {
    790 	mblk_t	*curr = NULL;
    791 	ipxop_t	*ipx = ipsq->ipsq_xop;
    792 
    793 	*connpp = NULL;
    794 	mutex_enter(&ipx->ipx_lock);
    795 	if (ipx->ipx_pending_mp == NULL) {
    796 		mutex_exit(&ipx->ipx_lock);
    797 		return (NULL);
    798 	}
    799 
    800 	/* There can be only 1 such excl message */
    801 	curr = ipx->ipx_pending_mp;
    802 	ASSERT(curr->b_next == NULL);
    803 	ipx->ipx_pending_ipif = NULL;
    804 	ipx->ipx_pending_mp = NULL;
    805 	ipx->ipx_waitfor = 0;
    806 	mutex_exit(&ipx->ipx_lock);
    807 
    808 	if (CONN_Q(curr->b_queue)) {
    809 		/*
    810 		 * This mp did a refhold on the conn, at the start of the ioctl.
    811 		 * So we can safely return a pointer to the conn to the caller.
    812 		 */
    813 		*connpp = Q_TO_CONN(curr->b_queue);
    814 	} else {
    815 		*connpp = NULL;
    816 	}
    817 	curr->b_next = NULL;
    818 	curr->b_prev = NULL;
    819 	return (curr);
    820 }
    821 
    822 /*
    823  * Cleanup the ioctl mp queued in ipx_pending_mp
    824  * - Called in the ill_delete path
    825  * - Called in the M_ERROR or M_HANGUP path on the ill.
    826  * - Called in the conn close path.
    827  *
    828  * Returns success on finding the pending mblk associated with the ioctl or
    829  * exclusive operation in progress, failure otherwise.
    830  */
    831 boolean_t
    832 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
    833 {
    834 	mblk_t	*mp;
    835 	ipxop_t	*ipx;
    836 	queue_t	*q;
    837 	ipif_t	*ipif;
    838 	int	cmd;
    839 
    840 	ASSERT(IAM_WRITER_ILL(ill));
    841 	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
    842 
    843 	mutex_enter(&ipx->ipx_lock);
    844 	mp = ipx->ipx_pending_mp;
    845 	if (connp != NULL) {
    846 		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
    847 			/*
    848 			 * Nothing to clean since the conn that is closing
    849 			 * does not have a matching pending mblk in
    850 			 * ipx_pending_mp.
    851 			 */
    852 			mutex_exit(&ipx->ipx_lock);
    853 			return (B_FALSE);
    854 		}
    855 	} else {
    856 		/*
    857 		 * A non-zero ill_error signifies we are called in the
    858 		 * M_ERROR or M_HANGUP path and we need to unconditionally
    859 		 * abort any current ioctl and do the corresponding cleanup.
    860 		 * A zero ill_error means we are in the ill_delete path and
    861 		 * we do the cleanup only if there is a pending mp.
    862 		 */
    863 		if (mp == NULL && ill->ill_error == 0) {
    864 			mutex_exit(&ipx->ipx_lock);
    865 			return (B_FALSE);
    866 		}
    867 	}
    868 
    869 	/* Now remove from the ipx_pending_mp */
    870 	ipx->ipx_pending_mp = NULL;
    871 	ipif = ipx->ipx_pending_ipif;
    872 	ipx->ipx_pending_ipif = NULL;
    873 	ipx->ipx_waitfor = 0;
    874 	ipx->ipx_current_ipif = NULL;
    875 	cmd = ipx->ipx_current_ioctl;
    876 	ipx->ipx_current_ioctl = 0;
    877 	ipx->ipx_current_done = B_TRUE;
    878 	mutex_exit(&ipx->ipx_lock);
    879 
    880 	if (mp == NULL)
    881 		return (B_FALSE);
    882 
    883 	q = mp->b_queue;
    884 	mp->b_next = NULL;
    885 	mp->b_prev = NULL;
    886 	mp->b_queue = NULL;
    887 
    888 	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
    889 		DTRACE_PROBE4(ipif__ioctl,
    890 		    char *, "ipsq_pending_mp_cleanup",
    891 		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
    892 		    ipif_t *, ipif);
    893 		if (connp == NULL) {
    894 			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
    895 		} else {
    896 			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
    897 			mutex_enter(&ipif->ipif_ill->ill_lock);
    898 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
    899 			mutex_exit(&ipif->ipif_ill->ill_lock);
    900 		}
    901 	} else {
    902 		inet_freemsg(mp);
    903 	}
    904 	return (B_TRUE);
    905 }
    906 
    907 /*
    908  * Called in the conn close path and ill delete path
    909  */
    910 static void
    911 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
    912 {
    913 	ipsq_t	*ipsq;
    914 	mblk_t	*prev;
    915 	mblk_t	*curr;
    916 	mblk_t	*next;
    917 	queue_t	*wq, *rq = NULL;
    918 	mblk_t	*tmp_list = NULL;
    919 
    920 	ASSERT(IAM_WRITER_ILL(ill));
    921 	if (connp != NULL)
    922 		wq = CONNP_TO_WQ(connp);
    923 	else
    924 		wq = ill->ill_wq;
    925 
    926 	/*
    927 	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
    928 	 * against this here.
    929 	 */
    930 	if (wq != NULL)
    931 		rq = RD(wq);
    932 
    933 	ipsq = ill->ill_phyint->phyint_ipsq;
    934 	/*
    935 	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
    936 	 * In the case of ioctl from a conn, there can be only 1 mp
    937 	 * queued on the ipsq. If an ill is being unplumbed, only messages
    938 	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
    939 	 * ioctls meant for this ill form conn's are not flushed. They will
    940 	 * be processed during ipsq_exit and will not find the ill and will
    941 	 * return error.
    942 	 */
    943 	mutex_enter(&ipsq->ipsq_lock);
    944 	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
    945 	    curr = next) {
    946 		next = curr->b_next;
    947 		if (curr->b_queue == wq || curr->b_queue == rq) {
    948 			/* Unlink the mblk from the pending mp list */
    949 			if (prev != NULL) {
    950 				prev->b_next = curr->b_next;
    951 			} else {
    952 				ASSERT(ipsq->ipsq_xopq_mphead == curr);
    953 				ipsq->ipsq_xopq_mphead = curr->b_next;
    954 			}
    955 			if (ipsq->ipsq_xopq_mptail == curr)
    956 				ipsq->ipsq_xopq_mptail = prev;
    957 			/*
    958 			 * Create a temporary list and release the ipsq lock
    959 			 * New elements are added to the head of the tmp_list
    960 			 */
    961 			curr->b_next = tmp_list;
    962 			tmp_list = curr;
    963 		} else {
    964 			prev = curr;
    965 		}
    966 	}
    967 	mutex_exit(&ipsq->ipsq_lock);
    968 
    969 	while (tmp_list != NULL) {
    970 		curr = tmp_list;
    971 		tmp_list = curr->b_next;
    972 		curr->b_next = NULL;
    973 		curr->b_prev = NULL;
    974 		curr->b_queue = NULL;
    975 		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
    976 			DTRACE_PROBE4(ipif__ioctl,
    977 			    char *, "ipsq_xopq_mp_cleanup",
    978 			    int, 0, ill_t *, NULL, ipif_t *, NULL);
    979 			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
    980 			    CONN_CLOSE : NO_COPYOUT, NULL);
    981 		} else {
    982 			/*
    983 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
    984 			 * this can't be just inet_freemsg. we have to
    985 			 * restart it otherwise the thread will be stuck.
    986 			 */
    987 			inet_freemsg(curr);
    988 		}
    989 	}
    990 }
    991 
    992 /*
    993  * This conn has started closing. Cleanup any pending ioctl from this conn.
    994  * STREAMS ensures that there can be at most 1 active ioctl on a stream.
    995  */
    996 void
    997 conn_ioctl_cleanup(conn_t *connp)
    998 {
    999 	ipsq_t	*ipsq;
   1000 	ill_t	*ill;
   1001 	boolean_t refheld;
   1002 
   1003 	/*
   1004 	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
   1005 	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
   1006 	 * started the mp could be present in ipx_pending_mp. Note that if
   1007 	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
   1008 	 * not yet queued anywhere. In this case, the conn close code will wait
   1009 	 * until the conn_ref is dropped. If the stream was a tcp stream, then
   1010 	 * tcp_close will wait first until all ioctls have completed for this
   1011 	 * conn.
   1012 	 */
   1013 	mutex_enter(&connp->conn_lock);
   1014 	ill = connp->conn_oper_pending_ill;
   1015 	if (ill == NULL) {
   1016 		mutex_exit(&connp->conn_lock);
   1017 		return;
   1018 	}
   1019 
   1020 	/*
   1021 	 * We may not be able to refhold the ill if the ill/ipif
   1022 	 * is changing. But we need to make sure that the ill will
   1023 	 * not vanish. So we just bump up the ill_waiter count.
   1024 	 */
   1025 	refheld = ill_waiter_inc(ill);
   1026 	mutex_exit(&connp->conn_lock);
   1027 	if (refheld) {
   1028 		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
   1029 			ill_waiter_dcr(ill);
   1030 			/*
   1031 			 * Check whether this ioctl has started and is
   1032 			 * pending. If it is not found there then check
   1033 			 * whether this ioctl has not even started and is in
   1034 			 * the ipsq_xopq list.
   1035 			 */
   1036 			if (!ipsq_pending_mp_cleanup(ill, connp))
   1037 				ipsq_xopq_mp_cleanup(ill, connp);
   1038 			ipsq = ill->ill_phyint->phyint_ipsq;
   1039 			ipsq_exit(ipsq);
   1040 			return;
   1041 		}
   1042 	}
   1043 
   1044 	/*
   1045 	 * The ill is also closing and we could not bump up the
   1046 	 * ill_waiter_count or we could not enter the ipsq. Leave
   1047 	 * the cleanup to ill_delete
   1048 	 */
   1049 	mutex_enter(&connp->conn_lock);
   1050 	while (connp->conn_oper_pending_ill != NULL)
   1051 		cv_wait(&connp->conn_refcv, &connp->conn_lock);
   1052 	mutex_exit(&connp->conn_lock);
   1053 	if (refheld)
   1054 		ill_waiter_dcr(ill);
   1055 }
   1056 
   1057 /*
   1058  * ipcl_walk function for cleaning up conn_*_ill fields.
   1059  * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
   1060  * conn_bound_if in place. We prefer dropping
   1061  * packets instead of sending them out the wrong interface, or accepting
   1062  * packets from the wrong ifindex.
   1063  */
   1064 static void
   1065 conn_cleanup_ill(conn_t *connp, caddr_t arg)
   1066 {
   1067 	ill_t	*ill = (ill_t *)arg;
   1068 
   1069 	mutex_enter(&connp->conn_lock);
   1070 	if (connp->conn_dhcpinit_ill == ill) {
   1071 		connp->conn_dhcpinit_ill = NULL;
   1072 		ASSERT(ill->ill_dhcpinit != 0);
   1073 		atomic_dec_32(&ill->ill_dhcpinit);
   1074 		ill_set_inputfn(ill);
   1075 	}
   1076 	mutex_exit(&connp->conn_lock);
   1077 }
   1078 
   1079 static int
   1080 ill_down_ipifs_tail(ill_t *ill)
   1081 {
   1082 	ipif_t	*ipif;
   1083 	int err;
   1084 
   1085 	ASSERT(IAM_WRITER_ILL(ill));
   1086 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   1087 		ipif_non_duplicate(ipif);
   1088 		/*
   1089 		 * ipif_down_tail will call arp_ll_down on the last ipif
   1090 		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
   1091 		 */
   1092 		if ((err = ipif_down_tail(ipif)) != 0)
   1093 			return (err);
   1094 	}
   1095 	return (0);
   1096 }
   1097 
   1098 /* ARGSUSED */
   1099 void
   1100 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   1101 {
   1102 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   1103 	(void) ill_down_ipifs_tail(q->q_ptr);
   1104 	freemsg(mp);
   1105 	ipsq_current_finish(ipsq);
   1106 }
   1107 
   1108 /*
   1109  * ill_down_start is called when we want to down this ill and bring it up again
   1110  * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
   1111  * all interfaces, but don't tear down any plumbing.
   1112  */
   1113 boolean_t
   1114 ill_down_start(queue_t *q, mblk_t *mp)
   1115 {
   1116 	ill_t	*ill = q->q_ptr;
   1117 	ipif_t	*ipif;
   1118 
   1119 	ASSERT(IAM_WRITER_ILL(ill));
   1120 	mutex_enter(&ill->ill_lock);
   1121 	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
   1122 	/* no more nce addition allowed */
   1123 	mutex_exit(&ill->ill_lock);
   1124 
   1125 	/*
   1126 	 * It is possible that some ioctl is already in progress while we
   1127 	 * received the M_ERROR / M_HANGUP in which case, we need to abort
   1128 	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
   1129 	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
   1130 	 * the in progress ioctl from ever completing.
   1131 	 *
   1132 	 * The thread that started the ioctl (if any) must have returned,
   1133 	 * since we are now executing as writer. After the 2 calls below,
   1134 	 * the state of the ipsq and the ill would reflect no trace of any
   1135 	 * pending operation. Subsequently if there is any response to the
   1136 	 * original ioctl from the driver, it would be discarded as an
   1137 	 * unsolicited message from the driver.
   1138 	 */
   1139 	(void) ipsq_pending_mp_cleanup(ill, NULL);
   1140 	ill_dlpi_clear_deferred(ill);
   1141 
   1142 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
   1143 		(void) ipif_down(ipif, NULL, NULL);
   1144 
   1145 	ill_down(ill);
   1146 
   1147 	/*
   1148 	 * Walk all CONNs that can have a reference on an ire or nce for this
   1149 	 * ill (we actually walk all that now have stale references).
   1150 	 */
   1151 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);
   1152 
   1153 	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
   1154 	if (ill->ill_isv6)
   1155 		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);
   1156 
   1157 	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
   1158 
   1159 	/*
   1160 	 * Atomically test and add the pending mp if references are active.
   1161 	 */
   1162 	mutex_enter(&ill->ill_lock);
   1163 	if (!ill_is_quiescent(ill)) {
   1164 		/* call cannot fail since `conn_t *' argument is NULL */
   1165 		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
   1166 		    mp, ILL_DOWN);
   1167 		mutex_exit(&ill->ill_lock);
   1168 		return (B_FALSE);
   1169 	}
   1170 	mutex_exit(&ill->ill_lock);
   1171 	return (B_TRUE);
   1172 }
   1173 
   1174 static void
   1175 ill_down(ill_t *ill)
   1176 {
   1177 	mblk_t	*mp;
   1178 	ip_stack_t	*ipst = ill->ill_ipst;
   1179 
   1180 	/*
   1181 	 * Blow off any IREs dependent on this ILL.
   1182 	 * The caller needs to handle conn_ixa_cleanup
   1183 	 */
   1184 	ill_delete_ires(ill);
   1185 
   1186 	ire_walk_ill(0, 0, ill_downi, ill, ill);
   1187 
   1188 	/* Remove any conn_*_ill depending on this ill */
   1189 	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
   1190 
   1191 	/*
   1192 	 * Free state for additional IREs.
   1193 	 */
   1194 	mutex_enter(&ill->ill_saved_ire_lock);
   1195 	mp = ill->ill_saved_ire_mp;
   1196 	ill->ill_saved_ire_mp = NULL;
   1197 	ill->ill_saved_ire_cnt = 0;
   1198 	mutex_exit(&ill->ill_saved_ire_lock);
   1199 	freemsg(mp);
   1200 }
   1201 
   1202 /*
   1203  * ire_walk routine used to delete every IRE that depends on
   1204  * 'ill'.  (Always called as writer.)
   1205  *
   1206  * Note: since the routes added by the kernel are deleted separately,
   1207  * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
   1208  *
   1209  * We also remove references on ire_nce_cache entries that refer to the ill.
   1210  */
   1211 void
   1212 ill_downi(ire_t *ire, char *ill_arg)
   1213 {
   1214 	ill_t	*ill = (ill_t *)ill_arg;
   1215 	nce_t	*nce;
   1216 
   1217 	mutex_enter(&ire->ire_lock);
   1218 	nce = ire->ire_nce_cache;
   1219 	if (nce != NULL && nce->nce_ill == ill)
   1220 		ire->ire_nce_cache = NULL;
   1221 	else
   1222 		nce = NULL;
   1223 	mutex_exit(&ire->ire_lock);
   1224 	if (nce != NULL)
   1225 		nce_refrele(nce);
   1226 	if (ire->ire_ill == ill)
   1227 		ire_delete(ire);
   1228 }
   1229 
   1230 /* Remove IRE_IF_CLONE on this ill */
   1231 void
   1232 ill_downi_if_clone(ire_t *ire, char *ill_arg)
   1233 {
   1234 	ill_t	*ill = (ill_t *)ill_arg;
   1235 
   1236 	ASSERT(ire->ire_type & IRE_IF_CLONE);
   1237 	if (ire->ire_ill == ill)
   1238 		ire_delete(ire);
   1239 }
   1240 
   1241 /* Consume an M_IOCACK of the fastpath probe. */
   1242 void
   1243 ill_fastpath_ack(ill_t *ill, mblk_t *mp)
   1244 {
   1245 	mblk_t	*mp1 = mp;
   1246 
   1247 	/*
   1248 	 * If this was the first attempt turn on the fastpath probing.
   1249 	 */
   1250 	mutex_enter(&ill->ill_lock);
   1251 	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
   1252 		ill->ill_dlpi_fastpath_state = IDS_OK;
   1253 	mutex_exit(&ill->ill_lock);
   1254 
   1255 	/* Free the M_IOCACK mblk, hold on to the data */
   1256 	mp = mp->b_cont;
   1257 	freeb(mp1);
   1258 	if (mp == NULL)
   1259 		return;
   1260 	if (mp->b_cont != NULL)
   1261 		nce_fastpath_update(ill, mp);
   1262 	else
   1263 		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
   1264 	freemsg(mp);
   1265 }
   1266 
   1267 /*
   1268  * Throw an M_IOCTL message downstream asking "do you know fastpath?"
   1269  * The data portion of the request is a dl_unitdata_req_t template for
   1270  * what we would send downstream in the absence of a fastpath confirmation.
   1271  */
   1272 int
   1273 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
   1274 {
   1275 	struct iocblk	*ioc;
   1276 	mblk_t	*mp;
   1277 
   1278 	if (dlur_mp == NULL)
   1279 		return (EINVAL);
   1280 
   1281 	mutex_enter(&ill->ill_lock);
   1282 	switch (ill->ill_dlpi_fastpath_state) {
   1283 	case IDS_FAILED:
   1284 		/*
   1285 		 * Driver NAKed the first fastpath ioctl - assume it doesn't
   1286 		 * support it.
   1287 		 */
   1288 		mutex_exit(&ill->ill_lock);
   1289 		return (ENOTSUP);
   1290 	case IDS_UNKNOWN:
   1291 		/* This is the first probe */
   1292 		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
   1293 		break;
   1294 	default:
   1295 		break;
   1296 	}
   1297 	mutex_exit(&ill->ill_lock);
   1298 
   1299 	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
   1300 		return (EAGAIN);
   1301 
   1302 	mp->b_cont = copyb(dlur_mp);
   1303 	if (mp->b_cont == NULL) {
   1304 		freeb(mp);
   1305 		return (EAGAIN);
   1306 	}
   1307 
   1308 	ioc = (struct iocblk *)mp->b_rptr;
   1309 	ioc->ioc_count = msgdsize(mp->b_cont);
   1310 
   1311 	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
   1312 	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
   1313 	putnext(ill->ill_wq, mp);
   1314 	return (0);
   1315 }
   1316 
   1317 void
   1318 ill_capability_probe(ill_t *ill)
   1319 {
   1320 	mblk_t	*mp;
   1321 
   1322 	ASSERT(IAM_WRITER_ILL(ill));
   1323 
   1324 	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
   1325 	    ill->ill_dlpi_capab_state != IDCS_FAILED)
   1326 		return;
   1327 
   1328 	/*
   1329 	 * We are starting a new cycle of capability negotiation.
   1330 	 * Free up the capab reset messages of any previous incarnation.
   1331 	 * We will do a fresh allocation when we get the response to our probe
   1332 	 */
   1333 	if (ill->ill_capab_reset_mp != NULL) {
   1334 		freemsg(ill->ill_capab_reset_mp);
   1335 		ill->ill_capab_reset_mp = NULL;
   1336 	}
   1337 
   1338 	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
   1339 
   1340 	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
   1341 	if (mp == NULL)
   1342 		return;
   1343 
   1344 	ill_capability_send(ill, mp);
   1345 	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
   1346 }
   1347 
   1348 void
   1349 ill_capability_reset(ill_t *ill, boolean_t reneg)
   1350 {
   1351 	ASSERT(IAM_WRITER_ILL(ill));
   1352 
   1353 	if (ill->ill_dlpi_capab_state != IDCS_OK)
   1354 		return;
   1355 
   1356 	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
   1357 
   1358 	ill_capability_send(ill, ill->ill_capab_reset_mp);
   1359 	ill->ill_capab_reset_mp = NULL;
   1360 	/*
   1361 	 * We turn off all capabilities except those pertaining to
   1362 	 * direct function call capabilities viz. ILL_CAPAB_DLD*
   1363 	 * which will be turned off by the corresponding reset functions.
   1364 	 */
   1365 	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM  | ILL_CAPAB_ZEROCOPY);
   1366 }
   1367 
   1368 static void
   1369 ill_capability_reset_alloc(ill_t *ill)
   1370 {
   1371 	mblk_t *mp;
   1372 	size_t	size = 0;
   1373 	int	err;
   1374 	dl_capability_req_t	*capb;
   1375 
   1376 	ASSERT(IAM_WRITER_ILL(ill));
   1377 	ASSERT(ill->ill_capab_reset_mp == NULL);
   1378 
   1379 	if (ILL_HCKSUM_CAPABLE(ill)) {
   1380 		size += sizeof (dl_capability_sub_t) +
   1381 		    sizeof (dl_capab_hcksum_t);
   1382 	}
   1383 
   1384 	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
   1385 		size += sizeof (dl_capability_sub_t) +
   1386 		    sizeof (dl_capab_zerocopy_t);
   1387 	}
   1388 
   1389 	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
   1390 		size += sizeof (dl_capability_sub_t) +
   1391 		    sizeof (dl_capab_dld_t);
   1392 	}
   1393 
   1394 	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
   1395 	    STR_NOSIG, &err);
   1396 
   1397 	mp->b_datap->db_type = M_PROTO;
   1398 	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
   1399 
   1400 	capb = (dl_capability_req_t *)mp->b_rptr;
   1401 	capb->dl_primitive = DL_CAPABILITY_REQ;
   1402 	capb->dl_sub_offset = sizeof (dl_capability_req_t);
   1403 	capb->dl_sub_length = size;
   1404 
   1405 	mp->b_wptr += sizeof (dl_capability_req_t);
   1406 
   1407 	/*
   1408 	 * Each handler fills in the corresponding dl_capability_sub_t
   1409 	 * inside the mblk,
   1410 	 */
   1411 	ill_capability_hcksum_reset_fill(ill, mp);
   1412 	ill_capability_zerocopy_reset_fill(ill, mp);
   1413 	ill_capability_dld_reset_fill(ill, mp);
   1414 
   1415 	ill->ill_capab_reset_mp = mp;
   1416 }
   1417 
   1418 static void
   1419 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
   1420 {
   1421 	dl_capab_id_t *id_ic;
   1422 	uint_t sub_dl_cap = outers->dl_cap;
   1423 	dl_capability_sub_t *inners;
   1424 	uint8_t *capend;
   1425 
   1426 	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
   1427 
   1428 	/*
   1429 	 * Note: range checks here are not absolutely sufficient to
   1430 	 * make us robust against malformed messages sent by drivers;
   1431 	 * this is in keeping with the rest of IP's dlpi handling.
   1432 	 * (Remember, it's coming from something else in the kernel
   1433 	 * address space)
   1434 	 */
   1435 
   1436 	capend = (uint8_t *)(outers + 1) + outers->dl_length;
   1437 	if (capend > mp->b_wptr) {
   1438 		cmn_err(CE_WARN, "ill_capability_id_ack: "
   1439 		    "malformed sub-capability too long for mblk");
   1440 		return;
   1441 	}
   1442 
   1443 	id_ic = (dl_capab_id_t *)(outers + 1);
   1444 
   1445 	if (outers->dl_length < sizeof (*id_ic) ||
   1446 	    (inners = &id_ic->id_subcap,
   1447 	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
   1448 		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
   1449 		    "encapsulated capab type %d too long for mblk",
   1450 		    inners->dl_cap);
   1451 		return;
   1452 	}
   1453 
   1454 	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
   1455 		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
   1456 		    "isn't as expected; pass-thru module(s) detected, "
   1457 		    "discarding capability\n", inners->dl_cap));
   1458 		return;
   1459 	}
   1460 
   1461 	/* Process the encapsulated sub-capability */
   1462 	ill_capability_dispatch(ill, mp, inners);
   1463 }
   1464 
   1465 static void
   1466 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
   1467 {
   1468 	dl_capability_sub_t *dl_subcap;
   1469 
   1470 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   1471 		return;
   1472 
   1473 	/*
   1474 	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
   1475 	 * initialized below since it is not used by DLD.
   1476 	 */
   1477 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1478 	dl_subcap->dl_cap = DL_CAPAB_DLD;
   1479 	dl_subcap->dl_length = sizeof (dl_capab_dld_t);
   1480 
   1481 	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
   1482 }
   1483 
   1484 static void
   1485 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
   1486 {
   1487 	/*
   1488 	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
   1489 	 * is only to get the VRRP capability.
   1490 	 *
   1491 	 * Note that we cannot check ill_ipif_up_count here since
   1492 	 * ill_ipif_up_count is only incremented when the resolver is setup.
   1493 	 * That is done asynchronously, and can race with this function.
   1494 	 */
   1495 	if (!ill->ill_dl_up) {
   1496 		if (subp->dl_cap == DL_CAPAB_VRRP)
   1497 			ill_capability_vrrp_ack(ill, mp, subp);
   1498 		return;
   1499 	}
   1500 
   1501 	switch (subp->dl_cap) {
   1502 	case DL_CAPAB_HCKSUM:
   1503 		ill_capability_hcksum_ack(ill, mp, subp);
   1504 		break;
   1505 	case DL_CAPAB_ZEROCOPY:
   1506 		ill_capability_zerocopy_ack(ill, mp, subp);
   1507 		break;
   1508 	case DL_CAPAB_DLD:
   1509 		ill_capability_dld_ack(ill, mp, subp);
   1510 		break;
   1511 	case DL_CAPAB_VRRP:
   1512 		break;
   1513 	default:
   1514 		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
   1515 		    subp->dl_cap));
   1516 	}
   1517 }
   1518 
   1519 /*
   1520  * Process the vrrp capability received from a DLS Provider. isub must point
   1521  * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
   1522  */
   1523 static void
   1524 ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1525 {
   1526 	dl_capab_vrrp_t	*vrrp;
   1527 	uint_t		sub_dl_cap = isub->dl_cap;
   1528 	uint8_t		*capend;
   1529 
   1530 	ASSERT(IAM_WRITER_ILL(ill));
   1531 	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);
   1532 
   1533 	/*
   1534 	 * Note: range checks here are not absolutely sufficient to
   1535 	 * make us robust against malformed messages sent by drivers;
   1536 	 * this is in keeping with the rest of IP's dlpi handling.
   1537 	 * (Remember, it's coming from something else in the kernel
   1538 	 * address space)
   1539 	 */
   1540 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1541 	if (capend > mp->b_wptr) {
   1542 		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
   1543 		    "malformed sub-capability too long for mblk");
   1544 		return;
   1545 	}
   1546 	vrrp = (dl_capab_vrrp_t *)(isub + 1);
   1547 
   1548 	/*
   1549 	 * Compare the IP address family and set ILLF_VRRP for the right ill.
   1550 	 */
   1551 	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
   1552 	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
   1553 		ill->ill_flags |= ILLF_VRRP;
   1554 	}
   1555 }
   1556 
   1557 /*
   1558  * Process a hardware checksum offload capability negotiation ack received
   1559  * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
   1560  * of a DL_CAPABILITY_ACK message.
   1561  */
   1562 static void
   1563 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1564 {
   1565 	dl_capability_req_t	*ocap;
   1566 	dl_capab_hcksum_t	*ihck, *ohck;
   1567 	ill_hcksum_capab_t	**ill_hcksum;
   1568 	mblk_t			*nmp = NULL;
   1569 	uint_t			sub_dl_cap = isub->dl_cap;
   1570 	uint8_t			*capend;
   1571 
   1572 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
   1573 
   1574 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
   1575 
   1576 	/*
   1577 	 * Note: range checks here are not absolutely sufficient to
   1578 	 * make us robust against malformed messages sent by drivers;
   1579 	 * this is in keeping with the rest of IP's dlpi handling.
   1580 	 * (Remember, it's coming from something else in the kernel
   1581 	 * address space)
   1582 	 */
   1583 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1584 	if (capend > mp->b_wptr) {
   1585 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1586 		    "malformed sub-capability too long for mblk");
   1587 		return;
   1588 	}
   1589 
   1590 	/*
   1591 	 * There are two types of acks we process here:
   1592 	 * 1. acks in reply to a (first form) generic capability req
   1593 	 *    (no ENABLE flag set)
   1594 	 * 2. acks in reply to a ENABLE capability req.
   1595 	 *    (ENABLE flag set)
   1596 	 */
   1597 	ihck = (dl_capab_hcksum_t *)(isub + 1);
   1598 
   1599 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
   1600 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
   1601 		    "unsupported hardware checksum "
   1602 		    "sub-capability (version %d, expected %d)",
   1603 		    ihck->hcksum_version, HCKSUM_VERSION_1);
   1604 		return;
   1605 	}
   1606 
   1607 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
   1608 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
   1609 		    "checksum capability isn't as expected; pass-thru "
   1610 		    "module(s) detected, discarding capability\n"));
   1611 		return;
   1612 	}
   1613 
   1614 #define	CURR_HCKSUM_CAPAB				\
   1615 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
   1616 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
   1617 
   1618 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
   1619 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
   1620 		/* do ENABLE processing */
   1621 		if (*ill_hcksum == NULL) {
   1622 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
   1623 			    KM_NOSLEEP);
   1624 
   1625 			if (*ill_hcksum == NULL) {
   1626 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1627 				    "could not enable hcksum version %d "
   1628 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
   1629 				    ill->ill_name);
   1630 				return;
   1631 			}
   1632 		}
   1633 
   1634 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
   1635 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
   1636 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
   1637 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
   1638 		    "has enabled hardware checksumming\n ",
   1639 		    ill->ill_name));
   1640 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
   1641 		/*
   1642 		 * Enabling hardware checksum offload
   1643 		 * Currently IP supports {TCP,UDP}/IPv4
   1644 		 * partial and full cksum offload and
   1645 		 * IPv4 header checksum offload.
   1646 		 * Allocate new mblk which will
   1647 		 * contain a new capability request
   1648 		 * to enable hardware checksum offload.
   1649 		 */
   1650 		uint_t	size;
   1651 		uchar_t	*rptr;
   1652 
   1653 		size = sizeof (dl_capability_req_t) +
   1654 		    sizeof (dl_capability_sub_t) + isub->dl_length;
   1655 
   1656 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1657 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1658 			    "could not enable hardware cksum for %s (ENOMEM)\n",
   1659 			    ill->ill_name);
   1660 			return;
   1661 		}
   1662 
   1663 		rptr = nmp->b_rptr;
   1664 		/* initialize dl_capability_req_t */
   1665 		ocap = (dl_capability_req_t *)nmp->b_rptr;
   1666 		ocap->dl_sub_offset =
   1667 		    sizeof (dl_capability_req_t);
   1668 		ocap->dl_sub_length =
   1669 		    sizeof (dl_capability_sub_t) +
   1670 		    isub->dl_length;
   1671 		nmp->b_rptr += sizeof (dl_capability_req_t);
   1672 
   1673 		/* initialize dl_capability_sub_t */
   1674 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
   1675 		nmp->b_rptr += sizeof (*isub);
   1676 
   1677 		/* initialize dl_capab_hcksum_t */
   1678 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
   1679 		bcopy(ihck, ohck, sizeof (*ihck));
   1680 
   1681 		nmp->b_rptr = rptr;
   1682 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
   1683 
   1684 		/* Set ENABLE flag */
   1685 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
   1686 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
   1687 
   1688 		/*
   1689 		 * nmp points to a DL_CAPABILITY_REQ message to enable
   1690 		 * hardware checksum acceleration.
   1691 		 */
   1692 		ill_capability_send(ill, nmp);
   1693 	} else {
   1694 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
   1695 		    "advertised %x hardware checksum capability flags\n",
   1696 		    ill->ill_name, ihck->hcksum_txflags));
   1697 	}
   1698 }
   1699 
   1700 static void
   1701 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
   1702 {
   1703 	dl_capab_hcksum_t *hck_subcap;
   1704 	dl_capability_sub_t *dl_subcap;
   1705 
   1706 	if (!ILL_HCKSUM_CAPABLE(ill))
   1707 		return;
   1708 
   1709 	ASSERT(ill->ill_hcksum_capab != NULL);
   1710 
   1711 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1712 	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
   1713 	dl_subcap->dl_length = sizeof (*hck_subcap);
   1714 
   1715 	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
   1716 	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
   1717 	hck_subcap->hcksum_txflags = 0;
   1718 
   1719 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
   1720 }
   1721 
   1722 static void
   1723 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1724 {
   1725 	mblk_t *nmp = NULL;
   1726 	dl_capability_req_t *oc;
   1727 	dl_capab_zerocopy_t *zc_ic, *zc_oc;
   1728 	ill_zerocopy_capab_t **ill_zerocopy_capab;
   1729 	uint_t sub_dl_cap = isub->dl_cap;
   1730 	uint8_t *capend;
   1731 
   1732 	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
   1733 
   1734 	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
   1735 
   1736 	/*
   1737 	 * Note: range checks here are not absolutely sufficient to
   1738 	 * make us robust against malformed messages sent by drivers;
   1739 	 * this is in keeping with the rest of IP's dlpi handling.
   1740 	 * (Remember, it's coming from something else in the kernel
   1741 	 * address space)
   1742 	 */
   1743 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1744 	if (capend > mp->b_wptr) {
   1745 		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1746 		    "malformed sub-capability too long for mblk");
   1747 		return;
   1748 	}
   1749 
   1750 	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
   1751 	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
   1752 		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
   1753 		    "unsupported ZEROCOPY sub-capability (version %d, "
   1754 		    "expected %d)", zc_ic->zerocopy_version,
   1755 		    ZEROCOPY_VERSION_1);
   1756 		return;
   1757 	}
   1758 
   1759 	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
   1760 		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
   1761 		    "capability isn't as expected; pass-thru module(s) "
   1762 		    "detected, discarding capability\n"));
   1763 		return;
   1764 	}
   1765 
   1766 	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
   1767 		if (*ill_zerocopy_capab == NULL) {
   1768 			*ill_zerocopy_capab =
   1769 			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
   1770 			    KM_NOSLEEP);
   1771 
   1772 			if (*ill_zerocopy_capab == NULL) {
   1773 				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1774 				    "could not enable Zero-copy version %d "
   1775 				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
   1776 				    ill->ill_name);
   1777 				return;
   1778 			}
   1779 		}
   1780 
   1781 		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
   1782 		    "supports Zero-copy version %d\n", ill->ill_name,
   1783 		    ZEROCOPY_VERSION_1));
   1784 
   1785 		(*ill_zerocopy_capab)->ill_zerocopy_version =
   1786 		    zc_ic->zerocopy_version;
   1787 		(*ill_zerocopy_capab)->ill_zerocopy_flags =
   1788 		    zc_ic->zerocopy_flags;
   1789 
   1790 		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
   1791 	} else {
   1792 		uint_t size;
   1793 		uchar_t *rptr;
   1794 
   1795 		size = sizeof (dl_capability_req_t) +
   1796 		    sizeof (dl_capability_sub_t) +
   1797 		    sizeof (dl_capab_zerocopy_t);
   1798 
   1799 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1800 			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1801 			    "could not enable zerocopy for %s (ENOMEM)\n",
   1802 			    ill->ill_name);
   1803 			return;
   1804 		}
   1805 
   1806 		rptr = nmp->b_rptr;
   1807 		/* initialize dl_capability_req_t */
   1808 		oc = (dl_capability_req_t *)rptr;
   1809 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
   1810 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
   1811 		    sizeof (dl_capab_zerocopy_t);
   1812 		rptr += sizeof (dl_capability_req_t);
   1813 
   1814 		/* initialize dl_capability_sub_t */
   1815 		bcopy(isub, rptr, sizeof (*isub));
   1816 		rptr += sizeof (*isub);
   1817 
   1818 		/* initialize dl_capab_zerocopy_t */
   1819 		zc_oc = (dl_capab_zerocopy_t *)rptr;
   1820 		*zc_oc = *zc_ic;
   1821 
   1822 		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
   1823 		    "to enable zero-copy version %d\n", ill->ill_name,
   1824 		    ZEROCOPY_VERSION_1));
   1825 
   1826 		/* set VMSAFE_MEM flag */
   1827 		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
   1828 
   1829 		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
   1830 		ill_capability_send(ill, nmp);
   1831 	}
   1832 }
   1833 
   1834 static void
   1835 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
   1836 {
   1837 	dl_capab_zerocopy_t *zerocopy_subcap;
   1838 	dl_capability_sub_t *dl_subcap;
   1839 
   1840 	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
   1841 		return;
   1842 
   1843 	ASSERT(ill->ill_zerocopy_capab != NULL);
   1844 
   1845 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1846 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
   1847 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
   1848 
   1849 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
   1850 	zerocopy_subcap->zerocopy_version =
   1851 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
   1852 	zerocopy_subcap->zerocopy_flags = 0;
   1853 
   1854 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
   1855 }
   1856 
   1857 /*
   1858  * DLD capability
   1859  * Refer to dld.h for more information regarding the purpose and usage
   1860  * of this capability.
   1861  */
   1862 static void
   1863 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1864 {
   1865 	dl_capab_dld_t		*dld_ic, dld;
   1866 	uint_t			sub_dl_cap = isub->dl_cap;
   1867 	uint8_t			*capend;
   1868 	ill_dld_capab_t		*idc;
   1869 
   1870 	ASSERT(IAM_WRITER_ILL(ill));
   1871 	ASSERT(sub_dl_cap == DL_CAPAB_DLD);
   1872 
   1873 	/*
   1874 	 * Note: range checks here are not absolutely sufficient to
   1875 	 * make us robust against malformed messages sent by drivers;
   1876 	 * this is in keeping with the rest of IP's dlpi handling.
   1877 	 * (Remember, it's coming from something else in the kernel
   1878 	 * address space)
   1879 	 */
   1880 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1881 	if (capend > mp->b_wptr) {
   1882 		cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1883 		    "malformed sub-capability too long for mblk");
   1884 		return;
   1885 	}
   1886 	dld_ic = (dl_capab_dld_t *)(isub + 1);
   1887 	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
   1888 		cmn_err(CE_CONT, "ill_capability_dld_ack: "
   1889 		    "unsupported DLD sub-capability (version %d, "
   1890 		    "expected %d)", dld_ic->dld_version,
   1891 		    DLD_CURRENT_VERSION);
   1892 		return;
   1893 	}
   1894 	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
   1895 		ip1dbg(("ill_capability_dld_ack: mid token for dld "
   1896 		    "capability isn't as expected; pass-thru module(s) "
   1897 		    "detected, discarding capability\n"));
   1898 		return;
   1899 	}
   1900 
   1901 	/*
   1902 	 * Copy locally to ensure alignment.
   1903 	 */
   1904 	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
   1905 
   1906 	if ((idc = ill->ill_dld_capab) == NULL) {
   1907 		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
   1908 		if (idc == NULL) {
   1909 			cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1910 			    "could not enable DLD version %d "
   1911 			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
   1912 			    ill->ill_name);
   1913 			return;
   1914 		}
   1915 		ill->ill_dld_capab = idc;
   1916 	}
   1917 	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
   1918 	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
   1919 	ip1dbg(("ill_capability_dld_ack: interface %s "
   1920 	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
   1921 
   1922 	ill_capability_dld_enable(ill);
   1923 }
   1924 
   1925 /*
   1926  * Typically capability negotiation between IP and the driver happens via
   1927  * DLPI message exchange. However GLD also offers a direct function call
   1928  * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
   1929  * But arbitrary function calls into IP or GLD are not permitted, since both
   1930  * of them are protected by their own perimeter mechanism. The perimeter can
   1931  * be viewed as a coarse lock or serialization mechanism. The hierarchy of
   1932  * these perimeters is IP -> MAC. Thus for example to enable the squeue
   1933  * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
   1934  * to enter the mac perimeter and then do the direct function calls into
   1935  * GLD to enable squeue polling. The ring related callbacks from the mac into
   1936  * the stack to add, bind, quiesce, restart or cleanup a ring are all
   1937  * protected by the mac perimeter.
   1938  */
   1939 static void
   1940 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
   1941 {
   1942 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1943 	int			err;
   1944 
   1945 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
   1946 	    DLD_ENABLE);
   1947 	ASSERT(err == 0);
   1948 }
   1949 
   1950 static void
   1951 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
   1952 {
   1953 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1954 	int			err;
   1955 
   1956 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
   1957 	    DLD_DISABLE);
   1958 	ASSERT(err == 0);
   1959 }
   1960 
   1961 boolean_t
   1962 ill_mac_perim_held(ill_t *ill)
   1963 {
   1964 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1965 
   1966 	return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
   1967 	    DLD_QUERY));
   1968 }
   1969 
   1970 static void
   1971 ill_capability_direct_enable(ill_t *ill)
   1972 {
   1973 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1974 	ill_dld_direct_t	*idd = &idc->idc_direct;
   1975 	dld_capab_direct_t	direct;
   1976 	int			rc;
   1977 
   1978 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   1979 
   1980 	bzero(&direct, sizeof (direct));
   1981 	direct.di_rx_cf = (uintptr_t)ip_input;
   1982 	direct.di_rx_ch = ill;
   1983 
   1984 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
   1985 	    DLD_ENABLE);
   1986 	if (rc == 0) {
   1987 		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
   1988 		idd->idd_tx_dh = direct.di_tx_dh;
   1989 		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
   1990 		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
   1991 		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
   1992 		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
   1993 		ASSERT(idd->idd_tx_cb_df != NULL);
   1994 		ASSERT(idd->idd_tx_fctl_df != NULL);
   1995 		ASSERT(idd->idd_tx_df != NULL);
   1996 		/*
   1997 		 * One time registration of flow enable callback function
   1998 		 */
   1999 		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
   2000 		    ill_flow_enable, ill);
   2001 		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
   2002 		DTRACE_PROBE1(direct_on, (ill_t *), ill);
   2003 	} else {
   2004 		cmn_err(CE_WARN, "warning: could not enable DIRECT "
   2005 		    "capability, rc = %d\n", rc);
   2006 		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
   2007 	}
   2008 }
   2009 
   2010 static void
   2011 ill_capability_poll_enable(ill_t *ill)
   2012 {
   2013 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   2014 	dld_capab_poll_t	poll;
   2015 	int			rc;
   2016 
   2017 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   2018 
   2019 	bzero(&poll, sizeof (poll));
   2020 	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
   2021 	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
   2022 	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
   2023 	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
   2024 	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
   2025 	poll.poll_ring_ch = ill;
   2026 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
   2027 	    DLD_ENABLE);
   2028 	if (rc == 0) {
   2029 		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
   2030 		DTRACE_PROBE1(poll_on, (ill_t *), ill);
   2031 	} else {
   2032 		ip1dbg(("warning: could not enable POLL "
   2033 		    "capability, rc = %d\n", rc));
   2034 		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
   2035 	}
   2036 }
   2037 
   2038 /*
   2039  * Enable the LSO capability.
   2040  */
   2041 static void
   2042 ill_capability_lso_enable(ill_t *ill)
   2043 {
   2044 	ill_dld_capab_t	*idc = ill->ill_dld_capab;
   2045 	dld_capab_lso_t	lso;
   2046 	int rc;
   2047 
   2048 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   2049 
   2050 	if (ill->ill_lso_capab == NULL) {
   2051 		ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
   2052 		    KM_NOSLEEP);
   2053 		if (ill->ill_lso_capab == NULL) {
   2054 			cmn_err(CE_WARN, "ill_capability_lso_enable: "
   2055 			    "could not enable LSO for %s (ENOMEM)\n",
   2056 			    ill->ill_name);
   2057 			return;
   2058 		}
   2059 	}
   2060 
   2061 	bzero(&lso, sizeof (lso));
   2062 	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
   2063 	    DLD_ENABLE)) == 0) {
   2064 		ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
   2065 		ill->ill_lso_capab->ill_lso_max = lso.lso_max;
   2066 		ill->ill_capabilities |= ILL_CAPAB_LSO;
   2067 		ip1dbg(("ill_capability_lso_enable: interface %s "
   2068 		    "has enabled LSO\n ", ill->ill_name));
   2069 	} else {
   2070 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
   2071 		ill->ill_lso_capab = NULL;
   2072 		DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
   2073 	}
   2074 }
   2075 
   2076 static void
   2077 ill_capability_dld_enable(ill_t *ill)
   2078 {
   2079 	mac_perim_handle_t mph;
   2080 
   2081 	ASSERT(IAM_WRITER_ILL(ill));
   2082 
   2083 	if (ill->ill_isv6)
   2084 		return;
   2085 
   2086 	ill_mac_perim_enter(ill, &mph);
   2087 	if (!ill->ill_isv6) {
   2088 		ill_capability_direct_enable(ill);
   2089 		ill_capability_poll_enable(ill);
   2090 		ill_capability_lso_enable(ill);
   2091 	}
   2092 	ill->ill_capabilities |= ILL_CAPAB_DLD;
   2093 	ill_mac_perim_exit(ill, mph);
   2094 }
   2095 
   2096 static void
   2097 ill_capability_dld_disable(ill_t *ill)
   2098 {
   2099 	ill_dld_capab_t	*idc;
   2100 	ill_dld_direct_t *idd;
   2101 	mac_perim_handle_t	mph;
   2102 
   2103 	ASSERT(IAM_WRITER_ILL(ill));
   2104 
   2105 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   2106 		return;
   2107 
   2108 	ill_mac_perim_enter(ill, &mph);
   2109 
   2110 	idc = ill->ill_dld_capab;
   2111 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
   2112 		/*
   2113 		 * For performance we avoid locks in the transmit data path
   2114 		 * and don't maintain a count of the number of threads using
   2115 		 * direct calls. Thus some threads could be using direct
   2116 		 * transmit calls to GLD, even after the capability mechanism
   2117 		 * turns it off. This is still safe since the handles used in
   2118 		 * the direct calls continue to be valid until the unplumb is
   2119 		 * completed. Remove the callback that was added (1-time) at
   2120 		 * capab enable time.
   2121 		 */
   2122 		mutex_enter(&ill->ill_lock);
   2123 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
   2124 		mutex_exit(&ill->ill_lock);
   2125 		if (ill->ill_flownotify_mh != NULL) {
   2126 			idd = &idc->idc_direct;
   2127 			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
   2128 			    ill->ill_flownotify_mh);
   2129 			ill->ill_flownotify_mh = NULL;
   2130 		}
   2131 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
   2132 		    NULL, DLD_DISABLE);
   2133 	}
   2134 
   2135 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
   2136 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
   2137 		ip_squeue_clean_all(ill);
   2138 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
   2139 		    NULL, DLD_DISABLE);
   2140 	}
   2141 
   2142 	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
   2143 		ASSERT(ill->ill_lso_capab != NULL);
   2144 		/*
   2145 		 * Clear the capability flag for LSO but retain the
   2146 		 * ill_lso_capab structure since it's possible that another
   2147 		 * thread is still referring to it.  The structure only gets
   2148 		 * deallocated when we destroy the ill.
   2149 		 */
   2150 
   2151 		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
   2152 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
   2153 		    NULL, DLD_DISABLE);
   2154 	}
   2155 
   2156 	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
   2157 	ill_mac_perim_exit(ill, mph);
   2158 }
   2159 
   2160 /*
   2161  * Capability Negotiation protocol
   2162  *
   2163  * We don't wait for DLPI capability operations to finish during interface
   2164  * bringup or teardown. Doing so would introduce more asynchrony and the
   2165  * interface up/down operations will need multiple return and restarts.
   2166  * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
   2167  * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
   2168  * exclusive operation won't start until the DLPI operations of the previous
   2169  * exclusive operation complete.
   2170  *
   2171  * The capability state machine is shown below.
   2172  *
   2173  * state		next state		event, action
   2174  *
   2175  * IDCS_UNKNOWN 	IDCS_PROBE_SENT		ill_capability_probe
   2176  * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
   2177  * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
   2178  * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
   2179  * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
   2180  * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
   2181  * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
   2182  *						    ill_capability_probe.
   2183  */
   2184 
   2185 /*
   2186  * Dedicated thread started from ip_stack_init that handles capability
   2187  * disable. This thread ensures the taskq dispatch does not fail by waiting
   2188  * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
   2189  * that direct calls to DLD are done in a cv_waitable context.
   2190  */
   2191 void
   2192 ill_taskq_dispatch(ip_stack_t *ipst)
   2193 {
   2194 	callb_cpr_t cprinfo;
   2195 	char 	name[64];
   2196 	mblk_t	*mp;
   2197 
   2198 	(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
   2199 	    ipst->ips_netstack->netstack_stackid);
   2200 	CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
   2201 	    name);
   2202 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2203 
   2204 	for (;;) {
   2205 		mp = ipst->ips_capab_taskq_head;
   2206 		while (mp != NULL) {
   2207 			ipst->ips_capab_taskq_head = mp->b_next;
   2208 			if (ipst->ips_capab_taskq_head == NULL)
   2209 				ipst->ips_capab_taskq_tail = NULL;
   2210 			mutex_exit(&ipst->ips_capab_taskq_lock);
   2211 			mp->b_next = NULL;
   2212 
   2213 			VERIFY(taskq_dispatch(system_taskq,
   2214 			    ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
   2215 			mutex_enter(&ipst->ips_capab_taskq_lock);
   2216 			mp = ipst->ips_capab_taskq_head;
   2217 		}
   2218 
   2219 		if (ipst->ips_capab_taskq_quit)
   2220 			break;
   2221 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2222 		cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
   2223 		CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
   2224 	}
   2225 	VERIFY(ipst->ips_capab_taskq_head == NULL);
   2226 	VERIFY(ipst->ips_capab_taskq_tail == NULL);
   2227 	CALLB_CPR_EXIT(&cprinfo);
   2228 	thread_exit();
   2229 }
   2230 
   2231 /*
   2232  * Consume a new-style hardware capabilities negotiation ack.
   2233  * Called via taskq on receipt of DL_CAPABILITY_ACK.
   2234  */
   2235 static void
   2236 ill_capability_ack_thr(void *arg)
   2237 {
   2238 	mblk_t	*mp = arg;
   2239 	dl_capability_ack_t *capp;
   2240 	dl_capability_sub_t *subp, *endp;
   2241 	ill_t	*ill;
   2242 	boolean_t reneg;
   2243 
   2244 	ill = (ill_t *)mp->b_prev;
   2245 	mp->b_prev = NULL;
   2246 
   2247 	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
   2248 
   2249 	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
   2250 	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
   2251 		/*
   2252 		 * We have received the ack for our DL_CAPAB reset request.
   2253 		 * There isnt' anything in the message that needs processing.
   2254 		 * All message based capabilities have been disabled, now
   2255 		 * do the function call based capability disable.
   2256 		 */
   2257 		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
   2258 		ill_capability_dld_disable(ill);
   2259 		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
   2260 		if (reneg)
   2261 			ill_capability_probe(ill);
   2262 		goto done;
   2263 	}
   2264 
   2265 	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
   2266 		ill->ill_dlpi_capab_state = IDCS_OK;
   2267 
   2268 	capp = (dl_capability_ack_t *)mp->b_rptr;
   2269 
   2270 	if (capp->dl_sub_length == 0) {
   2271 		/* no new-style capabilities */
   2272 		goto done;
   2273 	}
   2274 
   2275 	/* make sure the driver supplied correct dl_sub_length */
   2276 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
   2277 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
   2278 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
   2279 		goto done;
   2280 	}
   2281 
   2282 #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
   2283 	/*
   2284 	 * There are sub-capabilities. Process the ones we know about.
   2285 	 * Loop until we don't have room for another sub-cap header..
   2286 	 */
   2287 	for (subp = SC(capp, capp->dl_sub_offset),
   2288 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
   2289 	    subp <= endp;
   2290 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
   2291 
   2292 		switch (subp->dl_cap) {
   2293 		case DL_CAPAB_ID_WRAPPER:
   2294 			ill_capability_id_ack(ill, mp, subp);
   2295 			break;
   2296 		default:
   2297 			ill_capability_dispatch(ill, mp, subp);
   2298 			break;
   2299 		}
   2300 	}
   2301 #undef SC
   2302 done:
   2303 	inet_freemsg(mp);
   2304 	ill_capability_done(ill);
   2305 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   2306 }
   2307 
   2308 /*
   2309  * This needs to be started in a taskq thread to provide a cv_waitable
   2310  * context.
   2311  */
   2312 void
   2313 ill_capability_ack(ill_t *ill, mblk_t *mp)
   2314 {
   2315 	ip_stack_t	*ipst = ill->ill_ipst;
   2316 
   2317 	mp->b_prev = (mblk_t *)ill;
   2318 	ASSERT(mp->b_next == NULL);
   2319 
   2320 	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
   2321 	    TQ_NOSLEEP) != 0)
   2322 		return;
   2323 
   2324 	/*
   2325 	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
   2326 	 * which will do the dispatch using TQ_SLEEP to guarantee success.
   2327 	 */
   2328 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2329 	if (ipst->ips_capab_taskq_head == NULL) {
   2330 		ASSERT(ipst->ips_capab_taskq_tail == NULL);
   2331 		ipst->ips_capab_taskq_head = mp;
   2332 	} else {
   2333 		ipst->ips_capab_taskq_tail->b_next = mp;
   2334 	}
   2335 	ipst->ips_capab_taskq_tail = mp;
   2336 
   2337 	cv_signal(&ipst->ips_capab_taskq_cv);
   2338 	mutex_exit(&ipst->ips_capab_taskq_lock);
   2339 }
   2340 
   2341 /*
   2342  * This routine is called to scan the fragmentation reassembly table for
   2343  * the specified ILL for any packets that are starting to smell.
   2344  * dead_interval is the maximum time in seconds that will be tolerated.  It
   2345  * will either be the value specified in ip_g_frag_timeout, or zero if the
   2346  * ILL is shutting down and it is time to blow everything off.
   2347  *
   2348  * It returns the number of seconds (as a time_t) that the next frag timer
   2349  * should be scheduled for, 0 meaning that the timer doesn't need to be
   2350  * re-started.  Note that the method of calculating next_timeout isn't
   2351  * entirely accurate since time will flow between the time we grab
   2352  * current_time and the time we schedule the next timeout.  This isn't a
   2353  * big problem since this is the timer for sending an ICMP reassembly time
   2354  * exceeded messages, and it doesn't have to be exactly accurate.
   2355  *
   2356  * This function is
   2357  * sometimes called as writer, although this is not required.
   2358  */
   2359 time_t
   2360 ill_frag_timeout(ill_t *ill, time_t dead_interval)
   2361 {
   2362 	ipfb_t	*ipfb;
   2363 	ipfb_t	*endp;
   2364 	ipf_t	*ipf;
   2365 	ipf_t	*ipfnext;
   2366 	mblk_t	*mp;
   2367 	time_t	current_time = gethrestime_sec();
   2368 	time_t	next_timeout = 0;
   2369 	uint32_t	hdr_length;
   2370 	mblk_t	*send_icmp_head;
   2371 	mblk_t	*send_icmp_head_v6;
   2372 	ip_stack_t *ipst = ill->ill_ipst;
   2373 	ip_recv_attr_t iras;
   2374 
   2375 	bzero(&iras, sizeof (iras));
   2376 	iras.ira_flags = 0;
   2377 	iras.ira_ill = iras.ira_rill = ill;
   2378 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   2379 	iras.ira_rifindex = iras.ira_ruifindex;
   2380 
   2381 	ipfb = ill->ill_frag_hash_tbl;
   2382 	if (ipfb == NULL)
   2383 		return (B_FALSE);
   2384 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
   2385 	/* Walk the frag hash table. */
   2386 	for (; ipfb < endp; ipfb++) {
   2387 		send_icmp_head = NULL;
   2388 		send_icmp_head_v6 = NULL;
   2389 		mutex_enter(&ipfb->ipfb_lock);
   2390 		while ((ipf = ipfb->ipfb_ipf) != 0) {
   2391 			time_t frag_time = current_time - ipf->ipf_timestamp;
   2392 			time_t frag_timeout;
   2393 
   2394 			if (frag_time < dead_interval) {
   2395 				/*
   2396 				 * There are some outstanding fragments
   2397 				 * that will timeout later.  Make note of
   2398 				 * the time so that we can reschedule the
   2399 				 * next timeout appropriately.
   2400 				 */
   2401 				frag_timeout = dead_interval - frag_time;
   2402 				if (next_timeout == 0 ||
   2403 				    frag_timeout < next_timeout) {
   2404 					next_timeout = frag_timeout;
   2405 				}
   2406 				break;
   2407 			}
   2408 			/* Time's up.  Get it out of here. */
   2409 			hdr_length = ipf->ipf_nf_hdr_len;
   2410 			ipfnext = ipf->ipf_hash_next;
   2411 			if (ipfnext)
   2412 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
   2413 			*ipf->ipf_ptphn = ipfnext;
   2414 			mp = ipf->ipf_mp->b_cont;
   2415 			for (; mp; mp = mp->b_cont) {
   2416 				/* Extra points for neatness. */
   2417 				IP_REASS_SET_START(mp, 0);
   2418 				IP_REASS_SET_END(mp, 0);
   2419 			}
   2420 			mp = ipf->ipf_mp->b_cont;
   2421 			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
   2422 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
   2423 			ipfb->ipfb_count -= ipf->ipf_count;
   2424 			ASSERT(ipfb->ipfb_frag_pkts > 0);
   2425 			ipfb->ipfb_frag_pkts--;
   2426 			/*
   2427 			 * We do not send any icmp message from here because
   2428 			 * we currently are holding the ipfb_lock for this
   2429 			 * hash chain. If we try and send any icmp messages
   2430 			 * from here we may end up via a put back into ip
   2431 			 * trying to get the same lock, causing a recursive
   2432 			 * mutex panic. Instead we build a list and send all
   2433 			 * the icmp messages after we have dropped the lock.
   2434 			 */
   2435 			if (ill->ill_isv6) {
   2436 				if (hdr_length != 0) {
   2437 					mp->b_next = send_icmp_head_v6;
   2438 					send_icmp_head_v6 = mp;
   2439 				} else {
   2440 					freemsg(mp);
   2441 				}
   2442 			} else {
   2443 				if (hdr_length != 0) {
   2444 					mp->b_next = send_icmp_head;
   2445 					send_icmp_head = mp;
   2446 				} else {
   2447 					freemsg(mp);
   2448 				}
   2449 			}
   2450 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2451 			ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
   2452 			freeb(ipf->ipf_mp);
   2453 		}
   2454 		mutex_exit(&ipfb->ipfb_lock);
   2455 		/*
   2456 		 * Now need to send any icmp messages that we delayed from
   2457 		 * above.
   2458 		 */
   2459 		while (send_icmp_head_v6 != NULL) {
   2460 			ip6_t *ip6h;
   2461 
   2462 			mp = send_icmp_head_v6;
   2463 			send_icmp_head_v6 = send_icmp_head_v6->b_next;
   2464 			mp->b_next = NULL;
   2465 			ip6h = (ip6_t *)mp->b_rptr;
   2466 			iras.ira_flags = 0;
   2467 			/*
   2468 			 * This will result in an incorrect ALL_ZONES zoneid
   2469 			 * for multicast packets, but we
   2470 			 * don't send ICMP errors for those in any case.
   2471 			 */
   2472 			iras.ira_zoneid =
   2473 			    ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
   2474 			    ill, ipst);
   2475 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2476 			icmp_time_exceeded_v6(mp,
   2477 			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
   2478 			    &iras);
   2479 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2480 		}
   2481 		while (send_icmp_head != NULL) {
   2482 			ipaddr_t dst;
   2483 
   2484 			mp = send_icmp_head;
   2485 			send_icmp_head = send_icmp_head->b_next;
   2486 			mp->b_next = NULL;
   2487 
   2488 			dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
   2489 
   2490 			iras.ira_flags = IRAF_IS_IPV4;
   2491 			/*
   2492 			 * This will result in an incorrect ALL_ZONES zoneid
   2493 			 * for broadcast and multicast packets, but we
   2494 			 * don't send ICMP errors for those in any case.
   2495 			 */
   2496 			iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
   2497 			    ill, ipst);
   2498 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2499 			icmp_time_exceeded(mp,
   2500 			    ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
   2501 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2502 		}
   2503 	}
   2504 	/*
   2505 	 * A non-dying ILL will use the return value to decide whether to
   2506 	 * restart the frag timer, and for how long.
   2507 	 */
   2508 	return (next_timeout);
   2509 }
   2510 
   2511 /*
   2512  * This routine is called when the approximate count of mblk memory used
   2513  * for the specified ILL has exceeded max_count.
   2514  */
   2515 void
   2516 ill_frag_prune(ill_t *ill, uint_t max_count)
   2517 {
   2518 	ipfb_t	*ipfb;
   2519 	ipf_t	*ipf;
   2520 	size_t	count;
   2521 	clock_t now;
   2522 
   2523 	/*
   2524 	 * If we are here within ip_min_frag_prune_time msecs remove
   2525 	 * ill_frag_free_num_pkts oldest packets from each bucket and increment
   2526 	 * ill_frag_free_num_pkts.
   2527 	 */
   2528 	mutex_enter(&ill->ill_lock);
   2529 	now = ddi_get_lbolt();
   2530 	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
   2531 	    (ip_min_frag_prune_time != 0 ?
   2532 	    ip_min_frag_prune_time : msec_per_tick)) {
   2533 
   2534 		ill->ill_frag_free_num_pkts++;
   2535 
   2536 	} else {
   2537 		ill->ill_frag_free_num_pkts = 0;
   2538 	}
   2539 	ill->ill_last_frag_clean_time = now;
   2540 	mutex_exit(&ill->ill_lock);
   2541 
   2542 	/*
   2543 	 * free ill_frag_free_num_pkts oldest packets from each bucket.
   2544 	 */
   2545 	if (ill->ill_frag_free_num_pkts != 0) {
   2546 		int ix;
   2547 
   2548 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2549 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2550 			mutex_enter(&ipfb->ipfb_lock);
   2551 			if (ipfb->ipfb_ipf != NULL) {
   2552 				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
   2553 				    ill->ill_frag_free_num_pkts);
   2554 			}
   2555 			mutex_exit(&ipfb->ipfb_lock);
   2556 		}
   2557 	}
   2558 	/*
   2559 	 * While the reassembly list for this ILL is too big, prune a fragment
   2560 	 * queue by age, oldest first.
   2561 	 */
   2562 	while (ill->ill_frag_count > max_count) {
   2563 		int	ix;
   2564 		ipfb_t	*oipfb = NULL;
   2565 		uint_t	oldest = UINT_MAX;
   2566 
   2567 		count = 0;
   2568 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2569 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2570 			mutex_enter(&ipfb->ipfb_lock);
   2571 			ipf = ipfb->ipfb_ipf;
   2572 			if (ipf != NULL && ipf->ipf_gen < oldest) {
   2573 				oldest = ipf->ipf_gen;
   2574 				oipfb = ipfb;
   2575 			}
   2576 			count += ipfb->ipfb_count;
   2577 			mutex_exit(&ipfb->ipfb_lock);
   2578 		}
   2579 		if (oipfb == NULL)
   2580 			break;
   2581 
   2582 		if (count <= max_count)
   2583 			return;	/* Somebody beat us to it, nothing to do */
   2584 		mutex_enter(&oipfb->ipfb_lock);
   2585 		ipf = oipfb->ipfb_ipf;
   2586 		if (ipf != NULL) {
   2587 			ill_frag_free_pkts(ill, oipfb, ipf, 1);
   2588 		}
   2589 		mutex_exit(&oipfb->ipfb_lock);
   2590 	}
   2591 }
   2592 
   2593 /*
   2594  * free 'free_cnt' fragmented packets starting at ipf.
   2595  */
   2596 void
   2597 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
   2598 {
   2599 	size_t	count;
   2600 	mblk_t	*mp;
   2601 	mblk_t	*tmp;
   2602 	ipf_t **ipfp = ipf->ipf_ptphn;
   2603 
   2604 	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
   2605 	ASSERT(ipfp != NULL);
   2606 	ASSERT(ipf != NULL);
   2607 
   2608 	while (ipf != NULL && free_cnt-- > 0) {
   2609 		count = ipf->ipf_count;
   2610 		mp = ipf->ipf_mp;
   2611 		ipf = ipf->ipf_hash_next;
   2612 		for (tmp = mp; tmp; tmp = tmp->b_cont) {
   2613 			IP_REASS_SET_START(tmp, 0);
   2614 			IP_REASS_SET_END(tmp, 0);
   2615 		}
   2616 		atomic_add_32(&ill->ill_frag_count, -count);
   2617 		ASSERT(ipfb->ipfb_count >= count);
   2618 		ipfb->ipfb_count -= count;
   2619 		ASSERT(ipfb->ipfb_frag_pkts > 0);
   2620 		ipfb->ipfb_frag_pkts--;
   2621 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2622 		ip_drop_input("ipIfStatsReasmFails", mp, ill);
   2623 		freemsg(mp);
   2624 	}
   2625 
   2626 	if (ipf)
   2627 		ipf->ipf_ptphn = ipfp;
   2628 	ipfp[0] = ipf;
   2629 }
   2630 
   2631 #define	ND_FORWARD_WARNING	"The <if>:ip*_forwarding ndd variables are " \
   2632 	"obsolete and may be removed in a future release of Solaris.  Use " \
   2633 	"ifconfig(1M) to manipulate the forwarding status of an interface."
   2634 
   2635 /*
   2636  * For obsolete per-interface forwarding configuration;
   2637  * called in response to ND_GET.
   2638  */
   2639 /* ARGSUSED */
   2640 static int
   2641 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   2642 {
   2643 	ill_t *ill = (ill_t *)cp;
   2644 
   2645 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2646 
   2647 	(void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0);
   2648 	return (0);
   2649 }
   2650 
   2651 /*
   2652  * For obsolete per-interface forwarding configuration;
   2653  * called in response to ND_SET.
   2654  */
   2655 /* ARGSUSED */
   2656 static int
   2657 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
   2658     cred_t *ioc_cr)
   2659 {
   2660 	long value;
   2661 	int retval;
   2662 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   2663 
   2664 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2665 
   2666 	if (ddi_strtol(valuestr, NULL, 10, &value) != 0 ||
   2667 	    value < 0 || value > 1) {
   2668 		return (EINVAL);
   2669 	}
   2670 
   2671 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   2672 	retval = ill_forward_set((ill_t *)cp, (value != 0));
   2673 	rw_exit(&ipst->ips_ill_g_lock);
   2674 	return (retval);
   2675 }
   2676 
   2677 /*
   2678  * Helper function for ill_forward_set().
   2679  */
   2680 static void
   2681 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
   2682 {
   2683 	ip_stack_t	*ipst = ill->ill_ipst;
   2684 
   2685 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2686 
   2687 	ip1dbg(("ill_forward_set: %s %s forwarding on %s",
   2688 	    (enable ? "Enabling" : "Disabling"),
   2689 	    (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
   2690 	mutex_enter(&ill->ill_lock);
   2691 	if (enable)
   2692 		ill->ill_flags |= ILLF_ROUTER;
   2693 	else
   2694 		ill->ill_flags &= ~ILLF_ROUTER;
   2695 	mutex_exit(&ill->ill_lock);
   2696 	if (ill->ill_isv6)
   2697 		ill_set_nce_router_flags(ill, enable);
   2698 	/* Notify routing socket listeners of this change. */
   2699 	if (ill->ill_ipif != NULL)
   2700 		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
   2701 }
   2702 
   2703 /*
   2704  * Set an ill's ILLF_ROUTER flag appropriately.  Send up RTS_IFINFO routing
   2705  * socket messages for each interface whose flags we change.
   2706  */
   2707 int
   2708 ill_forward_set(ill_t *ill, boolean_t enable)
   2709 {
   2710 	ipmp_illgrp_t *illg;
   2711 	ip_stack_t *ipst = ill->ill_ipst;
   2712 
   2713 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2714 
   2715 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
   2716 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)))
   2717 		return (0);
   2718 
   2719 	if (IS_LOOPBACK(ill))
   2720 		return (EINVAL);
   2721 
   2722 	if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
   2723 		/*
   2724 		 * Update all of the interfaces in the group.
   2725 		 */
   2726 		illg = ill->ill_grp;
   2727 		ill = list_head(&illg->ig_if);
   2728 		for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
   2729 			ill_forward_set_on_ill(ill, enable);
   2730 
   2731 		/*
   2732 		 * Update the IPMP meta-interface.
   2733 		 */
   2734 		ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
   2735 		return (0);
   2736 	}
   2737 
   2738 	ill_forward_set_on_ill(ill, enable);
   2739 	return (0);
   2740 }
   2741 
   2742 /*
   2743  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
   2744  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
   2745  * set or clear.
   2746  */
   2747 static void
   2748 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
   2749 {
   2750 	ipif_t *ipif;
   2751 	ncec_t *ncec;
   2752 	nce_t *nce;
   2753 
   2754 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   2755 		/*
   2756 		 * NOTE: we match across the illgrp because nce's for
   2757 		 * addresses on IPMP interfaces have an nce_ill that points to
   2758 		 * the bound underlying ill.
   2759 		 */
   2760 		nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
   2761 		if (nce != NULL) {
   2762 			ncec = nce->nce_common;
   2763 			mutex_enter(&ncec->ncec_lock);
   2764 			if (enable)
   2765 				ncec->ncec_flags |= NCE_F_ISROUTER;
   2766 			else
   2767 				ncec->ncec_flags &= ~NCE_F_ISROUTER;
   2768 			mutex_exit(&ncec->ncec_lock);
   2769 			nce_refrele(nce);
   2770 		}
   2771 	}
   2772 }
   2773 
   2774 /*
   2775  * Given an ill with a _valid_ name, add the ip_forwarding ndd variable
   2776  * for this ill.  Make sure the v6/v4 question has been answered about this
   2777  * ill.  The creation of this ndd variable is only for backwards compatibility.
   2778  * The preferred way to control per-interface IP forwarding is through the
   2779  * ILLF_ROUTER interface flag.
   2780  */
   2781 static int
   2782 ill_set_ndd_name(ill_t *ill)
   2783 {
   2784 	char *suffix;
   2785 	ip_stack_t	*ipst = ill->ill_ipst;
   2786 
   2787 	ASSERT(IAM_WRITER_ILL(ill));
   2788 
   2789 	if (ill->ill_isv6)
   2790 		suffix = ipv6_forward_suffix;
   2791 	else
   2792 		suffix = ipv4_forward_suffix;
   2793 
   2794 	ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
   2795 	bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
   2796 	/*
   2797 	 * Copies over the '\0'.
   2798 	 * Note that strlen(suffix) is always bounded.
   2799 	 */
   2800 	bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
   2801 	    strlen(suffix) + 1);
   2802 
   2803 	/*
   2804 	 * Use of the nd table requires holding the reader lock.
   2805 	 * Modifying the nd table thru nd_load/nd_unload requires
   2806 	 * the writer lock.
   2807 	 */
   2808 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
   2809 	if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
   2810 	    nd_ill_forward_set, (caddr_t)ill)) {
   2811 		/*
   2812 		 * If the nd_load failed, it only meant that it could not
   2813 		 * allocate a new bunch of room for further NDD expansion.
   2814 		 * Because of that, the ill_ndd_name will be set to 0, and
   2815 		 * this interface is at the mercy of the global ip_forwarding
   2816 		 * variable.
   2817 		 */
   2818 		rw_exit(&ipst->ips_ip_g_nd_lock);
   2819 		ill->ill_ndd_name = NULL;
   2820 		return (ENOMEM);
   2821 	}
   2822 	rw_exit(&ipst->ips_ip_g_nd_lock);
   2823 	return (0);
   2824 }
   2825 
   2826 /*
   2827  * Intializes the context structure and returns the first ill in the list
   2828  * cuurently start_list and end_list can have values:
   2829  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
   2830  * IP_V4_G_HEAD		Traverse IPV4 list only.
   2831  * IP_V6_G_HEAD		Traverse IPV6 list only.
   2832  */
   2833 
   2834 /*
   2835  * We don't check for CONDEMNED ills here. Caller must do that if
   2836  * necessary under the ill lock.
   2837  */
   2838 ill_t *
   2839 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
   2840     ip_stack_t *ipst)
   2841 {
   2842 	ill_if_t *ifp;
   2843 	ill_t *ill;
   2844 	avl_tree_t *avl_tree;
   2845 
   2846 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   2847 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
   2848 
   2849 	/*
   2850 	 * setup the lists to search
   2851 	 */
   2852 	if (end_list != MAX_G_HEADS) {
   2853 		ctx->ctx_current_list = start_list;
   2854 		ctx->ctx_last_list = end_list;
   2855 	} else {
   2856 		ctx->ctx_last_list = MAX_G_HEADS - 1;
   2857 		ctx->ctx_current_list = 0;
   2858 	}
   2859 
   2860 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
   2861 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2862 		if (ifp != (ill_if_t *)
   2863 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2864 			avl_tree = &ifp->illif_avl_by_ppa;
   2865 			ill = avl_first(avl_tree);
   2866 			/*
   2867 			 * ill is guaranteed to be non NULL or ifp should have
   2868 			 * not existed.
   2869 			 */
   2870 			ASSERT(ill != NULL);
   2871 			return (ill);
   2872 		}
   2873 		ctx->ctx_current_list++;
   2874 	}
   2875 
   2876 	return (NULL);
   2877 }
   2878 
   2879 /*
   2880  * returns the next ill in the list. ill_first() must have been called
   2881  * before calling ill_next() or bad things will happen.
   2882  */
   2883 
   2884 /*
   2885  * We don't check for CONDEMNED ills here. Caller must do that if
   2886  * necessary under the ill lock.
   2887  */
   2888 ill_t *
   2889 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
   2890 {
   2891 	ill_if_t *ifp;
   2892 	ill_t *ill;
   2893 	ip_stack_t	*ipst = lastill->ill_ipst;
   2894 
   2895 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
   2896 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
   2897 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
   2898 	    AVL_AFTER)) != NULL) {
   2899 		return (ill);
   2900 	}
   2901 
   2902 	/* goto next ill_ifp in the list. */
   2903 	ifp = lastill->ill_ifptr->illif_next;
   2904 
   2905 	/* make sure not at end of circular list */
   2906 	while (ifp ==
   2907 	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2908 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
   2909 			return (NULL);
   2910 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2911 	}
   2912 
   2913 	return (avl_first(&ifp->illif_avl_by_ppa));
   2914 }
   2915 
   2916 /*
   2917  * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
   2918  * The final number (PPA) must not have any leading zeros.  Upon success, a
   2919  * pointer to the start of the PPA is returned; otherwise NULL is returned.
   2920  */
   2921 static char *
   2922 ill_get_ppa_ptr(char *name)
   2923 {
   2924 	int namelen = strlen(name);
   2925 	int end_ndx = namelen - 1;
   2926 	int ppa_ndx, i;
   2927 
   2928 	/*
   2929 	 * Check that the first character is [a-zA-Z], and that the last
   2930 	 * character is [0-9].
   2931 	 */
   2932 	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
   2933 		return (NULL);
   2934 
   2935 	/*
   2936 	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
   2937 	 */
   2938 	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
   2939 		if (!isdigit(name[ppa_ndx - 1]))
   2940 			break;
   2941 
   2942 	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
   2943 		return (NULL);
   2944 
   2945 	/*
   2946 	 * Check that the intermediate characters are [a-z0-9.]
   2947 	 */
   2948 	for (i = 1; i < ppa_ndx; i++) {
   2949 		if (!isalpha(name[i]) && !isdigit(name[i]) &&
   2950 		    name[i] != '.' && name[i] != '_') {
   2951 			return (NULL);
   2952 		}
   2953 	}
   2954 
   2955 	return (name + ppa_ndx);
   2956 }
   2957 
   2958 /*
   2959  * use avl tree to locate the ill.
   2960  */
   2961 static ill_t *
   2962 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
   2963 {
   2964 	char *ppa_ptr = NULL;
   2965 	int len;
   2966 	uint_t ppa;
   2967 	ill_t *ill = NULL;
   2968 	ill_if_t *ifp;
   2969 	int list;
   2970 
   2971 	/*
   2972 	 * get ppa ptr
   2973 	 */
   2974 	if (isv6)
   2975 		list = IP_V6_G_HEAD;
   2976 	else
   2977 		list = IP_V4_G_HEAD;
   2978 
   2979 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
   2980 		return (NULL);
   2981 	}
   2982 
   2983 	len = ppa_ptr - name + 1;
   2984 
   2985 	ppa = stoi(&ppa_ptr);
   2986 
   2987 	ifp = IP_VX_ILL_G_LIST(list, ipst);
   2988 
   2989 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   2990 		/*
   2991 		 * match is done on len - 1 as the name is not null
   2992 		 * terminated it contains ppa in addition to the interface
   2993 		 * name.
   2994 		 */
   2995 		if ((ifp->illif_name_len == len) &&
   2996 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
   2997 			break;
   2998 		} else {
   2999 			ifp = ifp->illif_next;
   3000 		}
   3001 	}
   3002 
   3003 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   3004 		/*
   3005 		 * Even the interface type does not exist.
   3006 		 */
   3007 		return (NULL);
   3008 	}
   3009 
   3010 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
   3011 	if (ill != NULL) {
   3012 		mutex_enter(&ill->ill_lock);
   3013 		if (ILL_CAN_LOOKUP(ill)) {
   3014 			ill_refhold_locked(ill);
   3015 			mutex_exit(&ill->ill_lock);
   3016 			return (ill);
   3017 		}
   3018 		mutex_exit(&ill->ill_lock);
   3019 	}
   3020 	return (NULL);
   3021 }
   3022 
   3023 /*
   3024  * comparison function for use with avl.
   3025  */
   3026 static int
   3027 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
   3028 {
   3029 	uint_t ppa;
   3030 	uint_t ill_ppa;
   3031 
   3032 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
   3033 
   3034 	ppa = *((uint_t *)ppa_ptr);
   3035 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
   3036 	/*
   3037 	 * We want the ill with the lowest ppa to be on the
   3038 	 * top.
   3039 	 */
   3040 	if (ill_ppa < ppa)
   3041 		return (1);
   3042 	if (ill_ppa > ppa)
   3043 		return (-1);
   3044 	return (0);
   3045 }
   3046 
   3047 /*
   3048  * remove an interface type from the global list.
   3049  */
   3050 static void
   3051 ill_delete_interface_type(ill_if_t *interface)
   3052 {
   3053 	ASSERT(interface != NULL);
   3054 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
   3055 
   3056 	avl_destroy(&interface->illif_avl_by_ppa);
   3057 	if (interface->illif_ppa_arena != NULL)
   3058 		vmem_destroy(interface->illif_ppa_arena);
   3059 
   3060 	remque(interface);
   3061 
   3062 	mi_free(interface);
   3063 }
   3064 
   3065 /*
   3066  * remove ill from the global list.
   3067  */
   3068 static void
   3069 ill_glist_delete(ill_t *ill)
   3070 {
   3071 	ip_stack_t	*ipst;
   3072 	phyint_t	*phyi;
   3073 
   3074 	if (ill == NULL)
   3075 		return;
   3076 	ipst = ill->ill_ipst;
   3077 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
   3078 
   3079 	/*
   3080 	 * If the ill was never inserted into the AVL tree
   3081 	 * we skip the if branch.
   3082 	 */
   3083 	if (ill->ill_ifptr != NULL) {
   3084 		/*
   3085 		 * remove from AVL tree and free ppa number
   3086 		 */
   3087 		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
   3088 
   3089 		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
   3090 			vmem_free(ill->ill_ifptr->illif_ppa_arena,
   3091 			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
   3092 		}
   3093 		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
   3094 			ill_delete_interface_type(ill->ill_ifptr);
   3095 		}
   3096 
   3097 		/*
   3098 		 * Indicate ill is no longer in the list.
   3099 		 */
   3100 		ill->ill_ifptr = NULL;
   3101 		ill->ill_name_length = 0;
   3102 		ill->ill_name[0] = '\0';
   3103 		ill->ill_ppa = UINT_MAX;
   3104 	}
   3105 
   3106 	/* Generate one last event for this ill. */
   3107 	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
   3108 	    ill->ill_name_length);
   3109 
   3110 	ASSERT(ill->ill_phyint != NULL);
   3111 	phyi = ill->ill_phyint;
   3112 	ill->ill_phyint = NULL;
   3113 
   3114 	/*
   3115 	 * ill_init allocates a phyint always to store the copy
   3116 	 * of flags relevant to phyint. At that point in time, we could
   3117 	 * not assign the name and hence phyint_illv4/v6 could not be
   3118 	 * initialized. Later in ipif_set_values, we assign the name to
   3119 	 * the ill, at which point in time we assign phyint_illv4/v6.
   3120 	 * Thus we don't rely on phyint_illv6 to be initialized always.
   3121 	 */
   3122 	if (ill->ill_flags & ILLF_IPV6)
   3123 		phyi->phyint_illv6 = NULL;
   3124 	else
   3125 		phyi->phyint_illv4 = NULL;
   3126 
   3127 	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
   3128 		rw_exit(&ipst->ips_ill_g_lock);
   3129 		return;
   3130 	}
   3131 
   3132 	/*
   3133 	 * There are no ills left on this phyint; pull it out of the phyint
   3134 	 * avl trees, and free it.
   3135 	 */
   3136 	if (phyi->phyint_ifindex > 0) {
   3137 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3138 		    phyi);
   3139 		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
   3140 		    phyi);
   3141 	}
   3142 	rw_exit(&ipst->ips_ill_g_lock);
   3143 
   3144 	phyint_free(phyi);
   3145 }
   3146 
   3147 /*
   3148  * allocate a ppa, if the number of plumbed interfaces of this type are
   3149  * less than ill_no_arena do a linear search to find a unused ppa.
   3150  * When the number goes beyond ill_no_arena switch to using an arena.
   3151  * Note: ppa value of zero cannot be allocated from vmem_arena as it
   3152  * is the return value for an error condition, so allocation starts at one
   3153  * and is decremented by one.
   3154  */
   3155 static int
   3156 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
   3157 {
   3158 	ill_t *tmp_ill;
   3159 	uint_t start, end;
   3160 	int ppa;
   3161 
   3162 	if (ifp->illif_ppa_arena == NULL &&
   3163 	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
   3164 		/*
   3165 		 * Create an arena.
   3166 		 */
   3167 		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
   3168 		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
   3169 		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
   3170 			/* allocate what has already been assigned */
   3171 		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
   3172 		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
   3173 		    tmp_ill, AVL_AFTER)) {
   3174 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
   3175 			    1,		/* size */
   3176 			    1,		/* align/quantum */
   3177 			    0,		/* phase */
   3178 			    0,		/* nocross */
   3179 			    /* minaddr */
   3180 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
   3181 			    /* maxaddr */
   3182 			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
   3183 			    VM_NOSLEEP|VM_FIRSTFIT);
   3184 			if (ppa == 0) {
   3185 				ip1dbg(("ill_alloc_ppa: ppa allocation"
   3186 				    " failed while switching"));
   3187 				vmem_destroy(ifp->illif_ppa_arena);
   3188 				ifp->illif_ppa_arena = NULL;
   3189 				break;
   3190 			}
   3191 		}
   3192 	}
   3193 
   3194 	if (ifp->illif_ppa_arena != NULL) {
   3195 		if (ill->ill_ppa == UINT_MAX) {
   3196 			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
   3197 			    1, VM_NOSLEEP|VM_FIRSTFIT);
   3198 			if (ppa == 0)
   3199 				return (EAGAIN);
   3200 			ill->ill_ppa = --ppa;
   3201 		} else {
   3202 			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
   3203 			    1, 		/* size */
   3204 			    1, 		/* align/quantum */
   3205 			    0, 		/* phase */
   3206 			    0, 		/* nocross */
   3207 			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
   3208 			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
   3209 			    VM_NOSLEEP|VM_FIRSTFIT);
   3210 			/*
   3211 			 * Most likely the allocation failed because
   3212 			 * the requested ppa was in use.
   3213 			 */
   3214 			if (ppa == 0)
   3215 				return (EEXIST);
   3216 		}
   3217 		return (0);
   3218 	}
   3219 
   3220 	/*
   3221 	 * No arena is in use and not enough (>ill_no_arena) interfaces have
   3222 	 * been plumbed to create one. Do a linear search to get a unused ppa.
   3223 	 */
   3224 	if (ill->ill_ppa == UINT_MAX) {
   3225 		end = UINT_MAX - 1;
   3226 		start = 0;
   3227 	} else {
   3228 		end = start = ill->ill_ppa;
   3229 	}
   3230 
   3231 	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
   3232 	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
   3233 		if (start++ >= end) {
   3234 			if (ill->ill_ppa == UINT_MAX)
   3235 				return (EAGAIN);
   3236 			else
   3237 				return (EEXIST);
   3238 		}
   3239 		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
   3240 	}
   3241 	ill->ill_ppa = start;
   3242 	return (0);
   3243 }
   3244 
   3245 /*
   3246  * Insert ill into the list of configured ill's. Once this function completes,
   3247  * the ill is globally visible and is available through lookups. More precisely
   3248  * this happens after the caller drops the ill_g_lock.
   3249  */
   3250 static int
   3251 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
   3252 {
   3253 	ill_if_t *ill_interface;
   3254 	avl_index_t where = 0;
   3255 	int error;
   3256 	int name_length;
   3257 	int index;
   3258 	boolean_t check_length = B_FALSE;
   3259 	ip_stack_t	*ipst = ill->ill_ipst;
   3260 
   3261 	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
   3262 
   3263 	name_length = mi_strlen(name) + 1;
   3264 
   3265 	if (isv6)
   3266 		index = IP_V6_G_HEAD;
   3267 	else
   3268 		index = IP_V4_G_HEAD;
   3269 
   3270 	ill_interface = IP_VX_ILL_G_LIST(index, ipst);
   3271 	/*
   3272 	 * Search for interface type based on name
   3273 	 */
   3274 	while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
   3275 		if ((ill_interface->illif_name_len == name_length) &&
   3276 		    (strcmp(ill_interface->illif_name, name) == 0)) {
   3277 			break;
   3278 		}
   3279 		ill_interface = ill_interface->illif_next;
   3280 	}
   3281 
   3282 	/*
   3283 	 * Interface type not found, create one.
   3284 	 */
   3285 	if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
   3286 		ill_g_head_t ghead;
   3287 
   3288 		/*
   3289 		 * allocate ill_if_t structure
   3290 		 */
   3291 		ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
   3292 		if (ill_interface == NULL) {
   3293 			return (ENOMEM);
   3294 		}
   3295 
   3296 		(void) strcpy(ill_interface->illif_name, name);
   3297 		ill_interface->illif_name_len = name_length;
   3298 
   3299 		avl_create(&ill_interface->illif_avl_by_ppa,
   3300 		    ill_compare_ppa, sizeof (ill_t),
   3301 		    offsetof(struct ill_s, ill_avl_byppa));
   3302 
   3303 		/*
   3304 		 * link the structure in the back to maintain order
   3305 		 * of configuration for ifconfig output.
   3306 		 */
   3307 		ghead = ipst->ips_ill_g_heads[index];
   3308 		insque(ill_interface, ghead.ill_g_list_tail);
   3309 	}
   3310 
   3311 	if (ill->ill_ppa == UINT_MAX)
   3312 		check_length = B_TRUE;
   3313 
   3314 	error = ill_alloc_ppa(ill_interface, ill);
   3315 	if (error != 0) {
   3316 		if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
   3317 			ill_delete_interface_type(ill->ill_ifptr);
   3318 		return (error);
   3319 	}
   3320 
   3321 	/*
   3322 	 * When the ppa is choosen by the system, check that there is
   3323 	 * enough space to insert ppa. if a specific ppa was passed in this
   3324 	 * check is not required as the interface name passed in will have
   3325 	 * the right ppa in it.
   3326 	 */
   3327 	if (check_length) {
   3328 		/*
   3329 		 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
   3330 		 */
   3331 		char buf[sizeof (uint_t) * 3];
   3332 
   3333 		/*
   3334 		 * convert ppa to string to calculate the amount of space
   3335 		 * required for it in the name.
   3336 		 */
   3337 		numtos(ill->ill_ppa, buf);
   3338 
   3339 		/* Do we have enough space to insert ppa ? */
   3340 
   3341 		if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
   3342 			/* Free ppa and interface type struct */
   3343 			if (ill_interface->illif_ppa_arena != NULL) {
   3344 				vmem_free(ill_interface->illif_ppa_arena,
   3345 				    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
   3346 			}
   3347 			if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
   3348 				ill_delete_interface_type(ill->ill_ifptr);
   3349 
   3350 			return (EINVAL);
   3351 		}
   3352 	}
   3353 
   3354 	(void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
   3355 	ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
   3356 
   3357 	(void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
   3358 	    &where);
   3359 	ill->ill_ifptr = ill_interface;
   3360 	avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
   3361 
   3362 	ill_phyint_reinit(ill);
   3363 	return (0);
   3364 }
   3365 
   3366 /* Initialize the per phyint ipsq used for serialization */
   3367 static boolean_t
   3368 ipsq_init(ill_t *ill, boolean_t enter)
   3369 {
   3370 	ipsq_t  *ipsq;
   3371 	ipxop_t	*ipx;
   3372 
   3373 	if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
   3374 		return (B_FALSE);
   3375 
   3376 	ill->ill_phyint->phyint_ipsq = ipsq;
   3377 	ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
   3378 	ipx->ipx_ipsq = ipsq;
   3379 	ipsq->ipsq_next = ipsq;
   3380 	ipsq->ipsq_phyint = ill->ill_phyint;
   3381 	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
   3382 	mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
   3383 	ipsq->ipsq_ipst = ill->ill_ipst;	/* No netstack_hold */
   3384 	if (enter) {
   3385 		ipx->ipx_writer = curthread;
   3386 		ipx->ipx_forced = B_FALSE;
   3387 		ipx->ipx_reentry_cnt = 1;
   3388 #ifdef DEBUG
   3389 		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   3390 #endif
   3391 	}
   3392 	return (B_TRUE);
   3393 }
   3394 
   3395 /*
   3396  * ill_init is called by ip_open when a device control stream is opened.
   3397  * It does a few initializations, and shoots a DL_INFO_REQ message down
   3398  * to the driver.  The response is later picked up in ip_rput_dlpi and
   3399  * used to set up default mechanisms for talking to the driver.  (Always
   3400  * called as writer.)
   3401  *
   3402  * If this function returns error, ip_open will call ip_close which in
   3403  * turn will call ill_delete to clean up any memory allocated here that
   3404  * is not yet freed.
   3405  */
   3406 int
   3407 ill_init(queue_t *q, ill_t *ill)
   3408 {
   3409 	int	count;
   3410 	dl_info_req_t	*dlir;
   3411 	mblk_t	*info_mp;
   3412 	uchar_t *frag_ptr;
   3413 
   3414 	/*
   3415 	 * The ill is initialized to zero by mi_alloc*(). In addition
   3416 	 * some fields already contain valid values, initialized in
   3417 	 * ip_open(), before we reach here.
   3418 	 */
   3419 	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
   3420 	mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
   3421 	ill->ill_saved_ire_cnt = 0;
   3422 
   3423 	ill->ill_rq = q;
   3424 	ill->ill_wq = WR(q);
   3425 
   3426 	info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
   3427 	    BPRI_HI);
   3428 	if (info_mp == NULL)
   3429 		return (ENOMEM);
   3430 
   3431 	/*
   3432 	 * Allocate sufficient space to contain our fragment hash table and
   3433 	 * the device name.
   3434 	 */
   3435 	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
   3436 	    2 * LIFNAMSIZ + strlen(ipv6_forward_suffix));
   3437 	if (frag_ptr == NULL) {
   3438 		freemsg(info_mp);
   3439 		return (ENOMEM);
   3440 	}
   3441 	ill->ill_frag_ptr = frag_ptr;
   3442 	ill->ill_frag_free_num_pkts = 0;
   3443 	ill->ill_last_frag_clean_time = 0;
   3444 	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
   3445 	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
   3446 	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
   3447 		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
   3448 		    NULL, MUTEX_DEFAULT, NULL);
   3449 	}
   3450 
   3451 	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
   3452 	if (ill->ill_phyint == NULL) {
   3453 		freemsg(info_mp);
   3454 		mi_free(frag_ptr);
   3455 		return (ENOMEM);
   3456 	}
   3457 
   3458 	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
   3459 	/*
   3460 	 * For now pretend this is a v4 ill. We need to set phyint_ill*
   3461 	 * at this point because of the following reason. If we can't
   3462 	 * enter the ipsq at some point and cv_wait, the writer that
   3463 	 * wakes us up tries to locate us using the list of all phyints
   3464 	 * in an ipsq and the ills from the phyint thru the phyint_ill*.
   3465 	 * If we don't set it now, we risk a missed wakeup.
   3466 	 */
   3467 	ill->ill_phyint->phyint_illv4 = ill;
   3468 	ill->ill_ppa = UINT_MAX;
   3469 	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
   3470 
   3471 	ill_set_inputfn(ill);
   3472 
   3473 	if (!ipsq_init(ill, B_TRUE)) {
   3474 		freemsg(info_mp);
   3475 		mi_free(frag_ptr);
   3476 		mi_free(ill->ill_phyint);
   3477 		return (ENOMEM);
   3478 	}
   3479 
   3480 	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
   3481 
   3482 	/* Frag queue limit stuff */
   3483 	ill->ill_frag_count = 0;
   3484 	ill->ill_ipf_gen = 0;
   3485 
   3486 	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
   3487 	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
   3488 	ill->ill_global_timer = INFINITY;
   3489 	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
   3490 	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
   3491 	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
   3492 	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
   3493 
   3494 	/*
   3495 	 * Initialize IPv6 configuration variables.  The IP module is always
   3496 	 * opened as an IPv4 module.  Instead tracking down the cases where
   3497 	 * it switches to do ipv6, we'll just initialize the IPv6 configuration
   3498 	 * here for convenience, this has no effect until the ill is set to do
   3499 	 * IPv6.
   3500 	 */
   3501 	ill->ill_reachable_time = ND_REACHABLE_TIME;
   3502 	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
   3503 	ill->ill_max_buf = ND_MAX_Q;
   3504 	ill->ill_refcnt = 0;
   3505 
   3506 	/* Send down the Info Request to the driver. */
   3507 	info_mp->b_datap->db_type = M_PCPROTO;
   3508 	dlir = (dl_info_req_t *)info_mp->b_rptr;
   3509 	info_mp->b_wptr = (uchar_t *)&dlir[1];
   3510 	dlir->dl_primitive = DL_INFO_REQ;
   3511 
   3512 	ill->ill_dlpi_pending = DL_PRIM_INVAL;
   3513 
   3514 	qprocson(q);
   3515 	ill_dlpi_send(ill, info_mp);
   3516 
   3517 	return (0);
   3518 }
   3519 
   3520 /*
   3521  * ill_dls_info
   3522  * creates datalink socket info from the device.
   3523  */
   3524 int
   3525 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
   3526 {
   3527 	size_t	len;
   3528 
   3529 	sdl->sdl_family = AF_LINK;
   3530 	sdl->sdl_index = ill_get_upper_ifindex(ill);
   3531 	sdl->sdl_type = ill->ill_type;
   3532 	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
   3533 	len = strlen(sdl->sdl_data);
   3534 	ASSERT(len < 256);
   3535 	sdl->sdl_nlen = (uchar_t)len;
   3536 	sdl->sdl_alen = ill->ill_phys_addr_length;
   3537 	sdl->sdl_slen = 0;
   3538 	if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
   3539 		bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
   3540 
   3541 	return (sizeof (struct sockaddr_dl));
   3542 }
   3543 
   3544 /*
   3545  * ill_xarp_info
   3546  * creates xarp info from the device.
   3547  */
   3548 static int
   3549 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
   3550 {
   3551 	sdl->sdl_family = AF_LINK;
   3552 	sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
   3553 	sdl->sdl_type = ill->ill_type;
   3554 	ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
   3555 	sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
   3556 	sdl->sdl_alen = ill->ill_phys_addr_length;
   3557 	sdl->sdl_slen = 0;
   3558 	return (sdl->sdl_nlen);
   3559 }
   3560 
   3561 static int
   3562 loopback_kstat_update(kstat_t *ksp, int rw)
   3563 {
   3564 	kstat_named_t *kn;
   3565 	netstackid_t	stackid;
   3566 	netstack_t	*ns;
   3567 	ip_stack_t	*ipst;
   3568 
   3569 	if (ksp == NULL || ksp->ks_data == NULL)
   3570 		return (EIO);
   3571 
   3572 	if (rw == KSTAT_WRITE)
   3573 		return (EACCES);
   3574 
   3575 	kn = KSTAT_NAMED_PTR(ksp);
   3576 	stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
   3577 
   3578 	ns = netstack_find_by_stackid(stackid);
   3579 	if (ns == NULL)
   3580 		return (-1);
   3581 
   3582 	ipst = ns->netstack_ip;
   3583 	if (ipst == NULL) {
   3584 		netstack_rele(ns);
   3585 		return (-1);
   3586 	}
   3587 	kn[0].value.ui32 = ipst->ips_loopback_packets;
   3588 	kn[1].value.ui32 = ipst->ips_loopback_packets;
   3589 	netstack_rele(ns);
   3590 	return (0);
   3591 }
   3592 
   3593 /*
   3594  * Has ifindex been plumbed already?
   3595  */
   3596 static boolean_t
   3597 phyint_exists(uint_t index, ip_stack_t *ipst)
   3598 {
   3599 	ASSERT(index != 0);
   3600 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   3601 
   3602 	return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3603 	    &index, NULL) != NULL);
   3604 }
   3605 
   3606 /* Pick a unique ifindex */
   3607 boolean_t
   3608 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
   3609 {
   3610 	uint_t starting_index;
   3611 
   3612 	if (!ipst->ips_ill_index_wrap) {
   3613 		*indexp = ipst->ips_ill_index++;
   3614 		if (ipst->ips_ill_index == 0) {
   3615 			/* Reached the uint_t limit Next time wrap  */
   3616 			ipst->ips_ill_index_wrap = B_TRUE;
   3617 		}
   3618 		return (B_TRUE);
   3619 	}
   3620 
   3621 	/*
   3622 	 * Start reusing unused indexes. Note that we hold the ill_g_lock
   3623 	 * at this point and don't want to call any function that attempts
   3624 	 * to get the lock again.
   3625 	 */
   3626 	starting_index = ipst->ips_ill_index++;
   3627 	for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) {
   3628 		if (ipst->ips_ill_index != 0 &&
   3629 		    !phyint_exists(ipst->ips_ill_index, ipst)) {
   3630 			/* found unused index - use it */
   3631 			*indexp = ipst->ips_ill_index;
   3632 			return (B_TRUE);
   3633 		}
   3634 	}
   3635 
   3636 	/*
   3637 	 * all interface indicies are inuse.
   3638 	 */
   3639 	return (B_FALSE);
   3640 }
   3641 
   3642 /*
   3643  * Assign a unique interface index for the phyint.
   3644  */
   3645 static boolean_t
   3646 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
   3647 {
   3648 	ASSERT(phyi->phyint_ifindex == 0);
   3649 	return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
   3650 }
   3651 
   3652 /*
   3653  * Initialize the flags on `phyi' as per the provided mactype.
   3654  */
   3655 static void
   3656 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
   3657 {
   3658 	uint64_t flags = 0;
   3659 
   3660 	/*
   3661 	 * Initialize PHYI_RUNNING and PHYI_FAILED.  For non-IPMP interfaces,
   3662 	 * we always presume the underlying hardware is working and set
   3663 	 * PHYI_RUNNING (if it's not, the driver will subsequently send a
   3664 	 * DL_NOTE_LINK_DOWN message).  For IPMP interfaces, at initialization
   3665 	 * there are no active interfaces in the group so we set PHYI_FAILED.
   3666 	 */
   3667 	if (mactype == SUNW_DL_IPMP)
   3668 		flags |= PHYI_FAILED;
   3669 	else
   3670 		flags |= PHYI_RUNNING;
   3671 
   3672 	switch (mactype) {
   3673 	case SUNW_DL_VNI:
   3674 		flags |= PHYI_VIRTUAL;
   3675 		break;
   3676 	case SUNW_DL_IPMP:
   3677 		flags |= PHYI_IPMP;
   3678 		break;
   3679 	case DL_LOOP:
   3680 		flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
   3681 		break;
   3682 	}
   3683 
   3684 	mutex_enter(&phyi->phyint_lock);
   3685 	phyi->phyint_flags |= flags;
   3686 	mutex_exit(&phyi->phyint_lock);
   3687 }
   3688 
/*
 * Return a pointer to the ill which matches the supplied name.  Note that
 * the ill name length includes the null termination character.  (May be
 * called as writer.)
 * If do_alloc and the interface is "lo0" it will be automatically created.
 * Cannot bump up reference on condemned ills. So dup detect can't be done
 * using this func.
 *
 * Parameters:
 *	name		interface name to look up.
 *	do_alloc	when B_TRUE and `name' equals ipif_loopback_name,
 *			create the loopback ill if it does not exist yet.
 *	isv6		selects the IPv6 (B_TRUE) or IPv4 (B_FALSE) ill.
 *	did_alloc	set to B_TRUE only on the path that creates the ill;
 *			it is not written on any other path.  It is
 *			dereferenced unconditionally on the creation path, so
 *			it must be non-NULL whenever creation is possible.
 *	ipst		IP stack instance to search.
 *
 * Returns the ill from ill_find_by_name() if one exists (presumably with a
 * reference held by that function -- confirm there), the newly created
 * loopback ill with one reference taken via ill_refhold(), or NULL if the
 * ill does not exist and was not (or could not be) created.
 */
ill_t *
ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
    boolean_t *did_alloc, ip_stack_t *ipst)
{
	ill_t	*ill;
	ipif_t	*ipif;
	ipsq_t	*ipsq;
	kstat_named_t	*kn;
	boolean_t isloopback;
	in6_addr_t ov6addr;

	isloopback = mi_strcmp(name, ipif_loopback_name) == 0;

	/* Fast path: look the name up with the global lock held as reader. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ill_find_by_name(name, isv6, ipst);
	rw_exit(&ipst->ips_ill_g_lock);
	if (ill != NULL)
		return (ill);

	/*
	 * Couldn't find it.  Does this happen to be a lookup for the
	 * loopback device and are we allowed to allocate it?
	 */
	if (!isloopback || !do_alloc)
		return (NULL);

	/*
	 * Retake the lock as writer and look again: the ill may have been
	 * created between dropping the reader lock and getting here.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ill = ill_find_by_name(name, isv6, ipst);
	if (ill != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (ill);
	}

	/*
	 * Create the loopback device on demand.  The allocation has room
	 * for the interface name directly after the ill_t itself.
	 */
	ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
	    sizeof (ipif_loopback_name), BPRI_MED));
	if (ill == NULL)
		goto done;

	*ill = ill_null;
	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_ipst = ipst;
	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
	/* This hold is dropped at `done' if construction fails. */
	netstack_hold(ipst->ips_netstack);
	/*
	 * For exclusive stacks we set the zoneid to zero
	 * to make IP operate as if in the global zone.
	 */
	ill->ill_zoneid = GLOBAL_ZONEID;

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL)
		goto done;

	if (isv6)
		ill->ill_phyint->phyint_illv6 = ill;
	else
		ill->ill_phyint->phyint_illv4 = ill;
	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	phyint_flags_init(ill->ill_phyint, DL_LOOP);

	if (isv6) {
		ill->ill_isv6 = B_TRUE;
		ill->ill_max_frag = ip_loopback_mtu_v6plus;
	} else {
		ill->ill_max_frag = ip_loopback_mtuplus;
	}
	if (!ill_allocate_mibs(ill))
		goto done;
	ill->ill_current_frag = ill->ill_max_frag;
	ill->ill_mtu = ill->ill_max_frag;	/* Initial value */
	/*
	 * ipif_loopback_name can't be pointed at directly because its used
	 * by both the ipv4 and ipv6 interfaces.  When the ill is removed
	 * from the glist, ill_glist_delete() sets the first character of
	 * ill_name to '\0'.
	 */
	ill->ill_name = (char *)ill + sizeof (*ill);
	(void) strcpy(ill->ill_name, ipif_loopback_name);
	ill->ill_name_length = sizeof (ipif_loopback_name);
	/* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
	ill->ill_dlpi_pending = DL_PRIM_INVAL;

	/* Multicast state: lock, serializer, timers and defaults. */
	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/* No resolver here. */
	ill->ill_net_type = IRE_LOOPBACK;

	/* Initialize the ipsq */
	if (!ipsq_init(ill, B_FALSE))
		goto done;

	ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
	if (ipif == NULL)
		goto done;

	ill->ill_flags = ILLF_MULTICAST;

	/* Remember the address the ipif had before the loopback one is set. */
	ov6addr = ipif->ipif_v6lcl_addr;
	/* Set up default loopback address and mask. */
	if (!isv6) {
		ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);

		IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
		V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
		ill->ill_flags |= ILLF_IPV4;
	} else {
		ipif->ipif_v6lcl_addr = ipv6_loopback;
		ipif->ipif_v6net_mask = ipv6_all_ones;
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
		ill->ill_flags |= ILLF_IPV6;
	}

	/*
	 * Chain us in at the end of the ill list. hold the ill
	 * before we make it globally visible. 1 for the lookup.
	 */
	ill->ill_refcnt = 0;
	ill_refhold(ill);

	ill->ill_frag_count = 0;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;

	/* Saved so a merge by ill_glist_insert() can be detected below. */
	ipsq = ill->ill_phyint->phyint_ipsq;

	ill_set_inputfn(ill);

	if (ill_glist_insert(ill, "lo", isv6) != 0)
		cmn_err(CE_PANIC, "cannot insert loopback interface");

	/* Let SCTP know so that it can add this to its list */
	sctp_update_ill(ill, SCTP_ILL_INSERT);

	/*
	 * We have already assigned ipif_v6lcl_addr above, but we need to
	 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
	 * requires to be after ill_glist_insert() since we need the
	 * ill_index set.  The old address passed on is the one the ipif
	 * held before the loopback address was assigned (ov6addr).
	 */
	sctp_update_ipif_addr(ipif, ov6addr);

	ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);

	/*
	 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
	 * If so, free our original one.
	 */
	if (ipsq != ill->ill_phyint->phyint_ipsq)
		ipsq_delete(ipsq);

	/* The loopback kstat is created once per stack, on first use. */
	if (ipst->ips_loopback_ksp == NULL) {
		/* Export loopback interface statistics */
		ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
		    ipif_loopback_name, "net",
		    KSTAT_TYPE_NAMED, 2, 0,
		    ipst->ips_netstack->netstack_stackid);
		if (ipst->ips_loopback_ksp != NULL) {
			ipst->ips_loopback_ksp->ks_update =
			    loopback_kstat_update;
			kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
			kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
			kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
			/* loopback_kstat_update() reads the stack id back. */
			ipst->ips_loopback_ksp->ks_private =
			    (void *)(uintptr_t)ipst->ips_netstack->
			    netstack_stackid;
			kstat_install(ipst->ips_loopback_ksp);
		}
	}

	*did_alloc = B_TRUE;
	rw_exit(&ipst->ips_ill_g_lock);
	/* The plumb event is dispatched after the global lock is dropped. */
	ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
	    NE_PLUMB, ill->ill_name, ill->ill_name_length);
	return (ill);
done:
	/*
	 * Failure path: undo whatever part of the construction completed.
	 * NOTE(review): the mutexes, rwlock and list initialized above are
	 * not explicitly destroyed here -- confirm whether that is intended.
	 */
	if (ill != NULL) {
		if (ill->ill_phyint != NULL) {
			ipsq = ill->ill_phyint->phyint_ipsq;
			if (ipsq != NULL) {
				ipsq->ipsq_phyint = NULL;
				ipsq_delete(ipsq);
			}
			mi_free(ill->ill_phyint);
		}
		ill_free_mib(ill);
		if (ill->ill_ipst != NULL)
			netstack_rele(ill->ill_ipst->ips_netstack);
		mi_free(ill);
	}
	rw_exit(&ipst->ips_ill_g_lock);
	return (NULL);
}
   3899 
   3900 /*
   3901  * For IPP calls - use the ip_stack_t for global stack.
   3902  */
   3903 ill_t *
   3904 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
   3905 {
   3906 	ip_stack_t	*ipst;
   3907 	ill_t		*ill;
   3908 
   3909 	ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip;
   3910 	if (ipst == NULL) {
   3911 		cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
   3912 		return (NULL);
   3913 	}
   3914 
   3915 	ill = ill_lookup_on_ifindex(index, isv6, ipst);
   3916 	netstack_rele(ipst->ips_netstack);
   3917 	return (ill);
   3918 }
   3919 
   3920 /*
   3921  * Return a pointer to the ill which matches the index and IP version type.
   3922  */
   3923 ill_t *
   3924 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
   3925 {
   3926 	ill_t	*ill;
   3927 	phyint_t *phyi;
   3928 
   3929 	/*
   3930 	 * Indexes are stored in the phyint - a common structure
   3931 	 * to both IPv4 and IPv6.
   3932 	 */
   3933 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3934 	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3935 	    (void *) &index, NULL);
   3936 	if (phyi != NULL) {
   3937 		ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
   3938 		if (ill != NULL) {
   3939 			mutex_enter(&ill->ill_lock);
   3940 			if (!ILL_IS_CONDEMNED(ill)) {
   3941 				ill_refhold_locked(ill);
   3942 				mutex_exit(&ill->ill_lock);
   3943 				rw_exit(&ipst->ips_ill_g_lock);
   3944 				return (ill);
   3945 			}
   3946 			mutex_exit(&ill->ill_lock);
   3947 		}
   3948 	}
   3949 	rw_exit(&ipst->ips_ill_g_lock);
   3950 	return (NULL);
   3951 }
   3952 
   3953 /*
   3954  * Verify whether or not an interface index is valid for the specified zoneid
   3955  * to transmit packets.
   3956  * It can be zero (meaning "reset") or an interface index assigned
   3957  * to a non-VNI interface. (We don't use VNI interface to send packets.)
   3958  */
   3959 boolean_t
   3960 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
   3961     ip_stack_t *ipst)
   3962 {
   3963 	ill_t		*ill;
   3964 
   3965 	if (ifindex == 0)
   3966 		return (B_TRUE);
   3967 
   3968 	ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
   3969 	if (ill == NULL)
   3970 		return (B_FALSE);
   3971 	if (IS_VNI(ill)) {
   3972 		ill_refrele(ill);
   3973 		return (B_FALSE);
   3974 	}
   3975 	ill_refrele(ill);
   3976 	return (B_TRUE);
   3977 }
   3978 
   3979 /*
   3980  * Return the ifindex next in sequence after the passed in ifindex.
   3981  * If there is no next ifindex for the given protocol, return 0.
   3982  */
   3983 uint_t
   3984 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
   3985 {
   3986 	phyint_t *phyi;
   3987 	phyint_t *phyi_initial;
   3988 	uint_t   ifindex;
   3989 
   3990 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3991 
   3992 	if (index == 0) {
   3993 		phyi = avl_first(
   3994 		    &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
   3995 	} else {
   3996 		phyi = phyi_initial = avl_find(
   3997 		    &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   3998 		    (void *) &index, NULL);
   3999 	}
   4000 
   4001 	for (; phyi != NULL;
   4002 	    phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
   4003 	    phyi, AVL_AFTER)) {
   4004 		/*
   4005 		 * If we're not returning the first interface in the tree
   4006 		 * and we still haven't moved past the phyint_t that
   4007 		 * corresponds to index, avl_walk needs to be called again
   4008 		 */
   4009 		if (!((index != 0) && (phyi == phyi_initial))) {
   4010 			if (isv6) {
   4011 				if ((phyi->phyint_illv6) &&
   4012 				    ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
   4013 				    (phyi->phyint_illv6->ill_isv6 == 1))
   4014 					break;
   4015 			} else {
   4016 				if ((phyi->phyint_illv4) &&
   4017 				    ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
   4018 				    (phyi->phyint_illv4->ill_isv6 == 0))
   4019 					break;
   4020 			}
   4021 		}
   4022 	}
   4023 
   4024 	rw_exit(&ipst->ips_ill_g_lock);
   4025 
   4026 	if (phyi != NULL)
   4027 		ifindex = phyi->phyint_ifindex;
   4028 	else
   4029 		ifindex = 0;
   4030 
   4031 	return (ifindex);
   4032 }
   4033 
   4034 /*
   4035  * Return the ifindex for the named interface.
   4036  * If there is no next ifindex for the interface, return 0.
   4037  */
   4038 uint_t
   4039 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
   4040 {
   4041 	phyint_t	*phyi;
   4042 	avl_index_t	where = 0;
   4043 	uint_t		ifindex;
   4044 
   4045 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4046 
   4047 	if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
   4048 	    name, &where)) == NULL) {
   4049 		rw_exit(&ipst->ips_ill_g_lock);
   4050 		return (0);
   4051 	}
   4052 
   4053 	ifindex = phyi->phyint_ifindex;
   4054 
   4055 	rw_exit(&ipst->ips_ill_g_lock);
   4056 
   4057 	return (ifindex);
   4058 }
   4059 
   4060 /*
   4061  * Return the ifindex to be used by upper layer protocols for instance
   4062  * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill.
   4063  */
   4064 uint_t
   4065 ill_get_upper_ifindex(const ill_t *ill)
   4066 {
   4067 	if (IS_UNDER_IPMP(ill))
   4068 		return (ipmp_ill_get_ipmp_ifindex(ill));
   4069 	else
   4070 		return (ill->ill_phyint->phyint_ifindex);
   4071 }
   4072 
   4073 
   4074 /*
   4075  * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
   4076  * that gives a running thread a reference to the ill. This reference must be
   4077  * released by the thread when it is done accessing the ill and related
   4078  * objects. ill_refcnt can not be used to account for static references
   4079  * such as other structures pointing to an ill. Callers must generally
   4080  * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
   4081  * or be sure that the ill is not being deleted or changing state before
   4082  * calling the refhold functions. A non-zero ill_refcnt ensures that the
   4083  * ill won't change any of its critical state such as address, netmask etc.
   4084  */
   4085 void
   4086 ill_refhold(ill_t *ill)
   4087 {
   4088 	mutex_enter(&ill->ill_lock);
   4089 	ill->ill_refcnt++;
   4090 	ILL_TRACE_REF(ill);
   4091 	mutex_exit(&ill->ill_lock);
   4092 }
   4093 
   4094 void
   4095 ill_refhold_locked(ill_t *ill)
   4096 {
   4097 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4098 	ill->ill_refcnt++;
   4099 	ILL_TRACE_REF(ill);
   4100 }
   4101 
   4102 /* Returns true if we managed to get a refhold */
   4103 boolean_t
   4104 ill_check_and_refhold(ill_t *ill)
   4105 {
   4106 	mutex_enter(&ill->ill_lock);
   4107 	if (!ILL_IS_CONDEMNED(ill)) {
   4108 		ill_refhold_locked(ill);
   4109 		mutex_exit(&ill->ill_lock);
   4110 		return (B_TRUE);
   4111 	}
   4112 	mutex_exit(&ill->ill_lock);
   4113 	return (B_FALSE);
   4114 }
   4115 
   4116 /*
   4117  * Must not be called while holding any locks. Otherwise if this is
   4118  * the last reference to be released, there is a chance of recursive mutex
   4119  * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
   4120  * to restart an ioctl.
   4121  */
   4122 void
   4123 ill_refrele(ill_t *ill)
   4124 {
   4125 	mutex_enter(&ill->ill_lock);
   4126 	ASSERT(ill->ill_refcnt != 0);
   4127 	ill->ill_refcnt--;
   4128 	ILL_UNTRACE_REF(ill);
   4129 	if (ill->ill_refcnt != 0) {
   4130 		/* Every ire pointing to the ill adds 1 to ill_refcnt */
   4131 		mutex_exit(&ill->ill_lock);
   4132 		return;
   4133 	}
   4134 
   4135 	/* Drops the ill_lock */
   4136 	ipif_ill_refrele_tail(ill);
   4137 }
   4138 
   4139 /*
   4140  * Obtain a weak reference count on the ill. This reference ensures the
   4141  * ill won't be freed, but the ill may change any of its critical state
   4142  * such as netmask, address etc. Returns an error if the ill has started
   4143  * closing.
   4144  */
   4145 boolean_t
   4146 ill_waiter_inc(ill_t *ill)
   4147 {
   4148 	mutex_enter(&ill->ill_lock);
   4149 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   4150 		mutex_exit(&ill->ill_lock);
   4151 		return (B_FALSE);
   4152 	}
   4153 	ill->ill_waiters++;
   4154 	mutex_exit(&ill->ill_lock);
   4155 	return (B_TRUE);
   4156 }
   4157 
   4158 void
   4159 ill_waiter_dcr(ill_t *ill)
   4160 {
   4161 	mutex_enter(&ill->ill_lock);
   4162 	ill->ill_waiters--;
   4163 	if (ill->ill_waiters == 0)
   4164 		cv_broadcast(&ill->ill_cv);
   4165 	mutex_exit(&ill->ill_lock);
   4166 }
   4167 
/*
 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
 * driver.  We construct best guess defaults for lower level information that
 * we need.  If an interface is brought up without injection of any overriding
 * information from outside, we have to be ready to go with these defaults.
 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
 * we primarily want the dl_provider_style.
 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
 * at which point we assume the other part of the information is valid.
 *
 * Parameters:
 *	ill	the ill the DL_INFO_ACK belongs to; the caller must be
 *		writer on it (asserted below).
 *	mp	message whose b_rptr points at the dl_info_ack_t.  It is
 *		consumed: freemsg() is called on it on both return paths.
 */
void
ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
{
	uchar_t		*brdcst_addr;
	uint_t		brdcst_addr_length, phys_addr_length;
	t_scalar_t	sap_length;
	dl_info_ack_t	*dlia;
	ip_m_t		*ipm;
	dl_qos_cl_sel1_t *sel1;
	int		min_mtu;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Till the ill is fully up  the ill is not globally visible.
	 * So no need for a lock.
	 */
	dlia = (dl_info_ack_t *)mp->b_rptr;
	ill->ill_mactype = dlia->dl_mac_type;

	/* Unknown MAC types fall back to the DL_OTHER media entry. */
	ipm = ip_m_lookup(dlia->dl_mac_type);
	if (ipm == NULL) {
		ipm = ip_m_lookup(DL_OTHER);
		ASSERT(ipm != NULL);
	}
	ill->ill_media = ipm;

	/*
	 * When the new DLPI stuff is ready we'll pull lengths
	 * from dlia.
	 */
	if (dlia->dl_version == DL_VERSION_2) {
		brdcst_addr_length = dlia->dl_brdcst_addr_length;
		brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
		    brdcst_addr_length);
		if (brdcst_addr == NULL) {
			brdcst_addr_length = 0;
		}
		sap_length = dlia->dl_sap_length;
		/*
		 * NOTE(review): this unsigned subtraction is not checked
		 * against dl_addr_length being smaller than |sap_length|.
		 */
		phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
		ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
		    brdcst_addr_length, sap_length, phys_addr_length));
	} else {
		/* Other DLPI versions: assume a 6-byte address, 2-byte SAP. */
		brdcst_addr_length = 6;
		brdcst_addr = ip_six_byte_all_ones;
		sap_length = -2;
		phys_addr_length = brdcst_addr_length;
	}

	ill->ill_bcast_addr_length = brdcst_addr_length;
	ill->ill_phys_addr_length = phys_addr_length;
	ill->ill_sap_length = sap_length;

	/*
	 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
	 * but we must ensure a minimum IP MTU is used since other bits of
	 * IP will fly apart otherwise.
	 */
	min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
	ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
	ill->ill_current_frag = ill->ill_max_frag;
	ill->ill_mtu = ill->ill_max_frag;

	ill->ill_type = ipm->ip_m_type;

	/* First DL_INFO_ACK: record the provider style and stop here. */
	if (!ill->ill_dlpi_style_set) {
		if (dlia->dl_provider_style == DL_STYLE2)
			ill->ill_needs_attach = 1;

		phyint_flags_init(ill->ill_phyint, ill->ill_mactype);

		/*
		 * Allocate the first ipif on this ill.  We don't delay it
		 * further as ioctl handling assumes at least one ipif exists.
		 *
		 * At this point we don't know whether the ill is v4 or v6.
		 * We will know this when the SIOCSLIFNAME happens and
		 * the correct value for ill_isv6 will be assigned in
		 * ipif_set_values(). We need to hold the ill lock and
		 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
		 * the wakeup.
		 *
		 * NOTE(review): the result of ipif_allocate() is discarded,
		 * so an allocation failure is not detected here.
		 */
		(void) ipif_allocate(ill, 0, IRE_LOCAL,
		    dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
		mutex_enter(&ill->ill_lock);
		ASSERT(ill->ill_dlpi_style_set == 0);
		ill->ill_dlpi_style_set = 1;
		ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
		cv_broadcast(&ill->ill_cv);
		mutex_exit(&ill->ill_lock);
		freemsg(mp);
		return;
	}
	ASSERT(ill->ill_ipif != NULL);
	/*
	 * We know whether it is IPv4 or IPv6 now, as this is the
	 * second DL_INFO_ACK we are receiving in response to the
	 * DL_INFO_REQ sent in ipif_set_values.
	 */
	ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
	/*
	 * Clear all the flags that were set based on ill_bcast_addr_length
	 * and ill_phys_addr_length (in ipif_set_values) as these could have
	 * changed now and we need to re-evaluate.
	 */
	ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
	ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);

	/*
	 * Free ill_bcast_mp as things could have changed now.
	 *
	 * NOTE: The IPMP meta-interface is special-cased because it starts
	 * with no underlying interfaces (and thus an unknown broadcast
	 * address length), but we enforce that an interface is broadcast-
	 * capable as part of allowing it to join a group.
	 */
	if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
		/* No broadcast address: treat the link as resolver-less. */
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		ill->ill_net_type = IRE_IF_NORESOLVER;

		/* NOTE(review): the ill_dlur_gen() result is not checked. */
		ill->ill_bcast_mp = ill_dlur_gen(NULL,
		    ill->ill_phys_addr_length,
		    ill->ill_sap,
		    ill->ill_sap_length);

		if (ill->ill_isv6)
			/*
			 * Note: xresolv interfaces will eventually need NOARP
			 * set here as well, but that will require those
			 * external resolvers to have some knowledge of
			 * that flag and act appropriately. Not to be changed
			 * at present.
			 */
			ill->ill_flags |= ILLF_NONUD;
		else
			ill->ill_flags |= ILLF_NOARP;

		if (ill->ill_mactype == SUNW_DL_VNI) {
			ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
		} else if (ill->ill_phys_addr_length == 0 ||
		    ill->ill_mactype == DL_IPV4 ||
		    ill->ill_mactype == DL_IPV6) {
			/*
			 * The underlying link is point-to-point, so mark the
			 * interface as such.  We can do IP multicast over
			 * such a link since it transmits all network-layer
			 * packets to the remote side the same way.
			 */
			ill->ill_flags |= ILLF_MULTICAST;
			ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
		}
	} else {
		/* Broadcast-capable (or IPMP): a resolver is needed. */
		ill->ill_net_type = IRE_IF_RESOLVER;
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
		    ill->ill_bcast_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		/*
		 * Later detect lack of DLPI driver multicast
		 * capability by catching DL_ENABMULTI errors in
		 * ip_rput_dlpi.
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ill->ill_isv6)
			ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
	}

	/* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
	if (ill->ill_mactype == SUNW_DL_IPMP)
		ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);

	/* By default an interface does not support any CoS marking */
	ill->ill_flags &= ~ILLF_COS_ENABLED;

	/*
	 * If we get QoS information in DL_INFO_ACK, the device supports
	 * some form of CoS marking, set ILLF_COS_ENABLED.
	 */
	sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
	    dlia->dl_qos_length);
	if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
		ill->ill_flags |= ILLF_COS_ENABLED;
	}

	/* Clear any previous error indication. */
	ill->ill_error = 0;
	freemsg(mp);
}
   4368 
   4369 /*
   4370  * Perform various checks to verify that an address would make sense as a
   4371  * local, remote, or subnet interface address.
   4372  */
   4373 static boolean_t
   4374 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
   4375 {
   4376 	ipaddr_t	net_mask;
   4377 
   4378 	/*
   4379 	 * Don't allow all zeroes, or all ones, but allow
   4380 	 * all ones netmask.
   4381 	 */
   4382 	if ((net_mask = ip_net_mask(addr)) == 0)
   4383 		return (B_FALSE);
   4384 	/* A given netmask overrides the "guess" netmask */
   4385 	if (subnet_mask != 0)
   4386 		net_mask = subnet_mask;
   4387 	if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
   4388 	    (addr == (addr | ~net_mask)))) {
   4389 		return (B_FALSE);
   4390 	}
   4391 
   4392 	/*
   4393 	 * Even if the netmask is all ones, we do not allow address to be
   4394 	 * 255.255.255.255
   4395 	 */
   4396 	if (addr == INADDR_BROADCAST)
   4397 		return (B_FALSE);
   4398 
   4399 	if (CLASSD(addr))
   4400 		return (B_FALSE);
   4401 
   4402 	return (B_TRUE);
   4403 }
   4404 
/* True if the ipif's local address (ipif_v6lcl_addr) is IPv6 link-local. */
#define	V6_IPIF_LINKLOCAL(p)	\
	IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
   4407 
   4408 /*
   4409  * Compare two given ipifs and check if the second one is better than
   4410  * the first one using the order of preference (not taking deprecated
   4411  * into acount) specified in ipif_lookup_multicast().
   4412  */
   4413 static boolean_t
   4414 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
   4415 {
   4416 	/* Check the least preferred first. */
   4417 	if (IS_LOOPBACK(old_ipif->ipif_ill)) {
   4418 		/* If both ipifs are the same, use the first one. */
   4419 		if (IS_LOOPBACK(new_ipif->ipif_ill))
   4420 			return (B_FALSE);
   4421 		else
   4422 			return (B_TRUE);
   4423 	}
   4424 
   4425 	/* For IPv6, check for link local address. */
   4426 	if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
   4427 		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
   4428 		    V6_IPIF_LINKLOCAL(new_ipif)) {
   4429 			/* The second one is equal or less preferred. */
   4430 			return (B_FALSE);
   4431 		} else {
   4432 			return (B_TRUE);
   4433 		}
   4434 	}
   4435 
   4436 	/* Then check for point to point interface. */
   4437 	if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
   4438 		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
   4439 		    (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
   4440 		    (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
   4441 			return (B_FALSE);
   4442 		} else {
   4443 			return (B_TRUE);
   4444 		}
   4445 	}
   4446 
   4447 	/* old_ipif is a normal interface, so no need to use the new one. */
   4448 	return (B_FALSE);
   4449 }
   4450 
   4451 /*
   4452  * Find a mulitcast-capable ipif given an IP instance and zoneid.
   4453  * The ipif must be up, and its ill must multicast-capable, not
   4454  * condemned, not an underlying interface in an IPMP group, and
   4455  * not a VNI interface.  Order of preference:
   4456  *
   4457  * 	1a. normal
   4458  * 	1b. normal, but deprecated
   4459  * 	2a. point to point
   4460  * 	2b. point to point, but deprecated
   4461  * 	3a. link local
   4462  * 	3b. link local, but deprecated
   4463  * 	4. loopback.
   4464  */
   4465 static ipif_t *
   4466 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
   4467 {
   4468 	ill_t			*ill;
   4469 	ill_walk_context_t	ctx;
   4470 	ipif_t			*ipif;
   4471 	ipif_t			*saved_ipif = NULL;
   4472 	ipif_t			*dep_ipif = NULL;
   4473 
   4474 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4475 	if (isv6)
   4476 		ill = ILL_START_WALK_V6(&ctx, ipst);
   4477 	else
   4478 		ill = ILL_START_WALK_V4(&ctx, ipst);
   4479 
   4480 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4481 		mutex_enter(&ill->ill_lock);
   4482 		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
   4483 		    ILL_IS_CONDEMNED(ill) ||
   4484 		    !(ill->ill_flags & ILLF_MULTICAST)) {
   4485 			mutex_exit(&ill->ill_lock);
   4486 			continue;
   4487 		}
   4488 		for (ipif = ill->ill_ipif; ipif != NULL;
   4489 		    ipif = ipif->ipif_next) {
   4490 			if (zoneid != ipif->ipif_zoneid &&
   4491 			    zoneid != ALL_ZONES &&
   4492 			    ipif->ipif_zoneid != ALL_ZONES) {
   4493 				continue;
   4494 			}
   4495 			if (!(ipif->ipif_flags & IPIF_UP) ||
   4496 			    IPIF_IS_CONDEMNED(ipif)) {
   4497 				continue;
   4498 			}
   4499 
   4500 			/*
   4501 			 * Found one candidate.  If it is deprecated,
   4502 			 * remember it in dep_ipif.  If it is not deprecated,
   4503 			 * remember it in saved_ipif.
   4504 			 */
   4505 			if (ipif->ipif_flags & IPIF_DEPRECATED) {
   4506 				if (dep_ipif == NULL) {
   4507 					dep_ipif = ipif;
   4508 				} else if (ipif_comp_multi(dep_ipif, ipif,
   4509 				    isv6)) {
   4510 					/*
   4511 					 * If the previous dep_ipif does not
   4512 					 * belong to the same ill, we've done
   4513 					 * a ipif_refhold() on it.  So we need
   4514 					 * to release it.
   4515 					 */
   4516 					if (dep_ipif->ipif_ill != ill)
   4517 						ipif_refrele(dep_ipif);
   4518 					dep_ipif = ipif;
   4519 				}
   4520 				continue;
   4521 			}
   4522 			if (saved_ipif == NULL) {
   4523 				saved_ipif = ipif;
   4524 			} else {
   4525 				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
   4526 					if (saved_ipif->ipif_ill != ill)
   4527 						ipif_refrele(saved_ipif);
   4528 					saved_ipif = ipif;
   4529 				}
   4530 			}
   4531 		}
   4532 		/*
   4533 		 * Before going to the next ill, do a ipif_refhold() on the
   4534 		 * saved ones.
   4535 		 */
   4536 		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
   4537 			ipif_refhold_locked(saved_ipif);
   4538 		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
   4539 			ipif_refhold_locked(dep_ipif);
   4540 		mutex_exit(&ill->ill_lock);
   4541 	}
   4542 	rw_exit(&ipst->ips_ill_g_lock);
   4543 
   4544 	/*
   4545 	 * If we have only the saved_ipif, return it.  But if we have both
   4546 	 * saved_ipif and dep_ipif, check to see which one is better.
   4547 	 */
   4548 	if (saved_ipif != NULL) {
   4549 		if (dep_ipif != NULL) {
   4550 			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
   4551 				ipif_refrele(saved_ipif);
   4552 				return (dep_ipif);
   4553 			} else {
   4554 				ipif_refrele(dep_ipif);
   4555 				return (saved_ipif);
   4556 			}
   4557 		}
   4558 		return (saved_ipif);
   4559 	} else {
   4560 		return (dep_ipif);
   4561 	}
   4562 }
   4563 
   4564 ill_t *
   4565 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
   4566 {
   4567 	ipif_t *ipif;
   4568 	ill_t *ill;
   4569 
   4570 	ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
   4571 	if (ipif == NULL)
   4572 		return (NULL);
   4573 
   4574 	ill = ipif->ipif_ill;
   4575 	ill_refhold(ill);
   4576 	ipif_refrele(ipif);
   4577 	return (ill);
   4578 }
   4579 
   4580 /*
   4581  * This function is called when an application does not specify an interface
   4582  * to be used for multicast traffic (joining a group/sending data).  It
   4583  * calls ire_lookup_multi() to look for an interface route for the
   4584  * specified multicast group.  Doing this allows the administrator to add
   4585  * prefix routes for multicast to indicate which interface to be used for
   4586  * multicast traffic in the above scenario.  The route could be for all
   4587  * multicast (224.0/4), for a single multicast group (a /32 route) or
   4588  * anything in between.  If there is no such multicast route, we just find
   4589  * any multicast capable interface and return it.  The returned ipif
   4590  * is refhold'ed.
   4591  *
   4592  * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
   4593  * unicast table. This is used by CGTP.
   4594  */
   4595 ill_t *
   4596 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
   4597     boolean_t *multirtp, ipaddr_t *setsrcp)
   4598 {
   4599 	ill_t			*ill;
   4600 
   4601 	ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
   4602 	if (ill != NULL)
   4603 		return (ill);
   4604 
   4605 	return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
   4606 }
   4607 
   4608 /*
   4609  * Look for an ipif with the specified interface address and destination.
   4610  * The destination address is used only for matching point-to-point interfaces.
   4611  */
   4612 ipif_t *
   4613 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
   4614 {
   4615 	ipif_t	*ipif;
   4616 	ill_t	*ill;
   4617 	ill_walk_context_t ctx;
   4618 
   4619 	/*
   4620 	 * First match all the point-to-point interfaces
   4621 	 * before looking at non-point-to-point interfaces.
   4622 	 * This is done to avoid returning non-point-to-point
   4623 	 * ipif instead of unnumbered point-to-point ipif.
   4624 	 */
   4625 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4626 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4627 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4628 		mutex_enter(&ill->ill_lock);
   4629 		for (ipif = ill->ill_ipif; ipif != NULL;
   4630 		    ipif = ipif->ipif_next) {
   4631 			/* Allow the ipif to be down */
   4632 			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4633 			    (ipif->ipif_lcl_addr == if_addr) &&
   4634 			    (ipif->ipif_pp_dst_addr == dst)) {
   4635 				if (!IPIF_IS_CONDEMNED(ipif)) {
   4636 					ipif_refhold_locked(ipif);
   4637 					mutex_exit(&ill->ill_lock);
   4638 					rw_exit(&ipst->ips_ill_g_lock);
   4639 					return (ipif);
   4640 				}
   4641 			}
   4642 		}
   4643 		mutex_exit(&ill->ill_lock);
   4644 	}
   4645 	rw_exit(&ipst->ips_ill_g_lock);
   4646 
   4647 	/* lookup the ipif based on interface address */
   4648 	ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
   4649 	ASSERT(ipif == NULL || !ipif->ipif_isv6);
   4650 	return (ipif);
   4651 }
   4652 
   4653 /*
   4654  * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
   4655  */
   4656 static ipif_t *
   4657 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
   4658     zoneid_t zoneid, ip_stack_t *ipst)
   4659 {
   4660 	ipif_t  *ipif;
   4661 	ill_t   *ill;
   4662 	boolean_t ptp = B_FALSE;
   4663 	ill_walk_context_t	ctx;
   4664 	boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
   4665 	boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
   4666 
   4667 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4668 	/*
   4669 	 * Repeat twice, first based on local addresses and
   4670 	 * next time for pointopoint.
   4671 	 */
   4672 repeat:
   4673 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4674 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4675 		if (match_ill != NULL && ill != match_ill &&
   4676 		    (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
   4677 			continue;
   4678 		}
   4679 		mutex_enter(&ill->ill_lock);
   4680 		for (ipif = ill->ill_ipif; ipif != NULL;
   4681 		    ipif = ipif->ipif_next) {
   4682 			if (zoneid != ALL_ZONES &&
   4683 			    zoneid != ipif->ipif_zoneid &&
   4684 			    ipif->ipif_zoneid != ALL_ZONES)
   4685 				continue;
   4686 
   4687 			if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
   4688 				continue;
   4689 
   4690 			/* Allow the ipif to be down */
   4691 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
   4692 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
   4693 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4694 			    (ipif->ipif_pp_dst_addr == addr))) {
   4695 				if (!IPIF_IS_CONDEMNED(ipif)) {
   4696 					ipif_refhold_locked(ipif);
   4697 					mutex_exit(&ill->ill_lock);
   4698 					rw_exit(&ipst->ips_ill_g_lock);
   4699 					return (ipif);
   4700 				}
   4701 			}
   4702 		}
   4703 		mutex_exit(&ill->ill_lock);
   4704 	}
   4705 
   4706 	/* If we already did the ptp case, then we are done */
   4707 	if (ptp) {
   4708 		rw_exit(&ipst->ips_ill_g_lock);
   4709 		return (NULL);
   4710 	}
   4711 	ptp = B_TRUE;
   4712 	goto repeat;
   4713 }
   4714 
   4715 /*
   4716  * Lookup an ipif with the specified address.  For point-to-point links we
   4717  * look for matches on either the destination address or the local address,
   4718  * but we skip the local address check if IPIF_UNNUMBERED is set.  If the
   4719  * `match_ill' argument is non-NULL, the lookup is restricted to that ill
   4720  * (or illgrp if `match_ill' is in an IPMP group).
   4721  */
   4722 ipif_t *
   4723 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
   4724     ip_stack_t *ipst)
   4725 {
   4726 	return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
   4727 	    zoneid, ipst));
   4728 }
   4729 
   4730 /*
   4731  * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
   4732  * except that we will only return an address if it is not marked as
   4733  * IPIF_DUPLICATE
   4734  */
   4735 ipif_t *
   4736 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
   4737     ip_stack_t *ipst)
   4738 {
   4739 	return (ipif_lookup_addr_common(addr, match_ill,
   4740 	    (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
   4741 	    zoneid, ipst));
   4742 }
   4743 
   4744 /*
   4745  * Special abbreviated version of ipif_lookup_addr() that doesn't match
   4746  * `match_ill' across the IPMP group.  This function is only needed in some
   4747  * corner-cases; almost everything should use ipif_lookup_addr().
   4748  */
   4749 ipif_t *
   4750 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
   4751 {
   4752 	ASSERT(match_ill != NULL);
   4753 	return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
   4754 	    ipst));
   4755 }
   4756 
   4757 /*
   4758  * Look for an ipif with the specified address. For point-point links
   4759  * we look for matches on either the destination address and the local
   4760  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
   4761  * is set.
   4762  * If the `match_ill' argument is non-NULL, the lookup is restricted to that
   4763  * ill (or illgrp if `match_ill' is in an IPMP group).
   4764  * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
   4765  */
   4766 zoneid_t
   4767 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
   4768 {
   4769 	zoneid_t zoneid;
   4770 	ipif_t  *ipif;
   4771 	ill_t   *ill;
   4772 	boolean_t ptp = B_FALSE;
   4773 	ill_walk_context_t	ctx;
   4774 
   4775 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4776 	/*
   4777 	 * Repeat twice, first based on local addresses and
   4778 	 * next time for pointopoint.
   4779 	 */
   4780 repeat:
   4781 	ill = ILL_START_WALK_V4(&ctx, ipst);
   4782 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   4783 		if (match_ill != NULL && ill != match_ill &&
   4784 		    !IS_IN_SAME_ILLGRP(ill, match_ill)) {
   4785 			continue;
   4786 		}
   4787 		mutex_enter(&ill->ill_lock);
   4788 		for (ipif = ill->ill_ipif; ipif != NULL;
   4789 		    ipif = ipif->ipif_next) {
   4790 			/* Allow the ipif to be down */
   4791 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
   4792 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
   4793 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
   4794 			    (ipif->ipif_pp_dst_addr == addr)) &&
   4795 			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   4796 				zoneid = ipif->ipif_zoneid;
   4797 				mutex_exit(&ill->ill_lock);
   4798 				rw_exit(&ipst->ips_ill_g_lock);
   4799 				/*
   4800 				 * If ipif_zoneid was ALL_ZONES then we have
   4801 				 * a trusted extensions shared IP address.
   4802 				 * In that case GLOBAL_ZONEID works to send.
   4803 				 */
   4804 				if (zoneid == ALL_ZONES)
   4805 					zoneid = GLOBAL_ZONEID;
   4806 				return (zoneid);
   4807 			}
   4808 		}
   4809 		mutex_exit(&ill->ill_lock);
   4810 	}
   4811 
   4812 	/* If we already did the ptp case, then we are done */
   4813 	if (ptp) {
   4814 		rw_exit(&ipst->ips_ill_g_lock);
   4815 		return (ALL_ZONES);
   4816 	}
   4817 	ptp = B_TRUE;
   4818 	goto repeat;
   4819 }
   4820 
   4821 /*
   4822  * Look for an ipif that matches the specified remote address i.e. the
   4823  * ipif that would receive the specified packet.
   4824  * First look for directly connected interfaces and then do a recursive
   4825  * IRE lookup and pick the first ipif corresponding to the source address in the
   4826  * ire.
   4827  * Returns: held ipif
   4828  *
   4829  * This is only used for ICMP_ADDRESS_MASK_REQUESTs
   4830  */
   4831 ipif_t *
   4832 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
   4833 {
   4834 	ipif_t	*ipif;
   4835 
   4836 	ASSERT(!ill->ill_isv6);
   4837 
   4838 	/*
   4839 	 * Someone could be changing this ipif currently or change it
   4840 	 * after we return this. Thus  a few packets could use the old
   4841 	 * old values. However structure updates/creates (ire, ilg, ilm etc)
   4842 	 * will atomically be updated or cleaned up with the new value
   4843 	 * Thus we don't need a lock to check the flags or other attrs below.
   4844 	 */
   4845 	mutex_enter(&ill->ill_lock);
   4846 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4847 		if (IPIF_IS_CONDEMNED(ipif))
   4848 			continue;
   4849 		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
   4850 		    ipif->ipif_zoneid != ALL_ZONES)
   4851 			continue;
   4852 		/* Allow the ipif to be down */
   4853 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
   4854 			if ((ipif->ipif_pp_dst_addr == addr) ||
   4855 			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
   4856 			    ipif->ipif_lcl_addr == addr)) {
   4857 				ipif_refhold_locked(ipif);
   4858 				mutex_exit(&ill->ill_lock);
   4859 				return (ipif);
   4860 			}
   4861 		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
   4862 			ipif_refhold_locked(ipif);
   4863 			mutex_exit(&ill->ill_lock);
   4864 			return (ipif);
   4865 		}
   4866 	}
   4867 	mutex_exit(&ill->ill_lock);
   4868 	/*
   4869 	 * For a remote destination it isn't possible to nail down a particular
   4870 	 * ipif.
   4871 	 */
   4872 
   4873 	/* Pick the first interface */
   4874 	ipif = ipif_get_next_ipif(NULL, ill);
   4875 	return (ipif);
   4876 }
   4877 
   4878 /*
   4879  * This func does not prevent refcnt from increasing. But if
   4880  * the caller has taken steps to that effect, then this func
   4881  * can be used to determine whether the ill has become quiescent
   4882  */
   4883 static boolean_t
   4884 ill_is_quiescent(ill_t *ill)
   4885 {
   4886 	ipif_t	*ipif;
   4887 
   4888 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4889 
   4890 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4891 		if (ipif->ipif_refcnt != 0)
   4892 			return (B_FALSE);
   4893 	}
   4894 	if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
   4895 		return (B_FALSE);
   4896 	}
   4897 	return (B_TRUE);
   4898 }
   4899 
   4900 boolean_t
   4901 ill_is_freeable(ill_t *ill)
   4902 {
   4903 	ipif_t	*ipif;
   4904 
   4905 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4906 
   4907 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   4908 		if (ipif->ipif_refcnt != 0) {
   4909 			return (B_FALSE);
   4910 		}
   4911 	}
   4912 	if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
   4913 		return (B_FALSE);
   4914 	}
   4915 	return (B_TRUE);
   4916 }
   4917 
   4918 /*
   4919  * This func does not prevent refcnt from increasing. But if
   4920  * the caller has taken steps to that effect, then this func
   4921  * can be used to determine whether the ipif has become quiescent
   4922  */
   4923 static boolean_t
   4924 ipif_is_quiescent(ipif_t *ipif)
   4925 {
   4926 	ill_t *ill;
   4927 
   4928 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   4929 
   4930 	if (ipif->ipif_refcnt != 0)
   4931 		return (B_FALSE);
   4932 
   4933 	ill = ipif->ipif_ill;
   4934 	if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
   4935 	    ill->ill_logical_down) {
   4936 		return (B_TRUE);
   4937 	}
   4938 
   4939 	/* This is the last ipif going down or being deleted on this ill */
   4940 	if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
   4941 		return (B_FALSE);
   4942 	}
   4943 
   4944 	return (B_TRUE);
   4945 }
   4946 
   4947 /*
   4948  * return true if the ipif can be destroyed: the ipif has to be quiescent
   4949  * with zero references from ire/ilm to it.
   4950  */
   4951 static boolean_t
   4952 ipif_is_freeable(ipif_t *ipif)
   4953 {
   4954 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   4955 	ASSERT(ipif->ipif_id != 0);
   4956 	return (ipif->ipif_refcnt == 0);
   4957 }
   4958 
   4959 /*
   4960  * The ipif/ill/ire has been refreled. Do the tail processing.
   4961  * Determine if the ipif or ill in question has become quiescent and if so
   4962  * wakeup close and/or restart any queued pending ioctl that is waiting
   4963  * for the ipif_down (or ill_down)
   4964  */
   4965 void
   4966 ipif_ill_refrele_tail(ill_t *ill)
   4967 {
   4968 	mblk_t	*mp;
   4969 	conn_t	*connp;
   4970 	ipsq_t	*ipsq;
   4971 	ipxop_t	*ipx;
   4972 	ipif_t	*ipif;
   4973 	dl_notify_ind_t *dlindp;
   4974 
   4975 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4976 
   4977 	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
   4978 		/* ip_modclose() may be waiting */
   4979 		cv_broadcast(&ill->ill_cv);
   4980 	}
   4981 
   4982 	ipsq = ill->ill_phyint->phyint_ipsq;
   4983 	mutex_enter(&ipsq->ipsq_lock);
   4984 	ipx = ipsq->ipsq_xop;
   4985 	mutex_enter(&ipx->ipx_lock);
   4986 	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
   4987 		goto unlock;
   4988 
   4989 	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
   4990 
   4991 	ipif = ipx->ipx_pending_ipif;
   4992 	if (ipif->ipif_ill != ill) 	/* wait is for another ill; bail */
   4993 		goto unlock;
   4994 
   4995 	switch (ipx->ipx_waitfor) {
   4996 	case IPIF_DOWN:
   4997 		if (!ipif_is_quiescent(ipif))
   4998 			goto unlock;
   4999 		break;
   5000 	case IPIF_FREE:
   5001 		if (!ipif_is_freeable(ipif))
   5002 			goto unlock;
   5003 		break;
   5004 	case ILL_DOWN:
   5005 		if (!ill_is_quiescent(ill))
   5006 			goto unlock;
   5007 		break;
   5008 	case ILL_FREE:
   5009 		/*
   5010 		 * ILL_FREE is only for loopback; normal ill teardown waits
   5011 		 * synchronously in ip_modclose() without using ipx_waitfor,
   5012 		 * handled by the cv_broadcast() at the top of this function.
   5013 		 */
   5014 		if (!ill_is_freeable(ill))
   5015 			goto unlock;
   5016 		break;
   5017 	default:
   5018 		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
   5019 		    (void *)ipsq, ipx->ipx_waitfor);
   5020 	}
   5021 
   5022 	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
   5023 	mutex_exit(&ipx->ipx_lock);
   5024 	mp = ipsq_pending_mp_get(ipsq, &connp);
   5025 	mutex_exit(&ipsq->ipsq_lock);
   5026 	mutex_exit(&ill->ill_lock);
   5027 
   5028 	ASSERT(mp != NULL);
   5029 	/*
   5030 	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
   5031 	 * we can only get here when the current operation decides it
   5032 	 * it needs to quiesce via ipsq_pending_mp_add().
   5033 	 */
   5034 	switch (mp->b_datap->db_type) {
   5035 	case M_PCPROTO:
   5036 	case M_PROTO:
   5037 		/*
   5038 		 * For now, only DL_NOTIFY_IND messages can use this facility.
   5039 		 */
   5040 		dlindp = (dl_notify_ind_t *)mp->b_rptr;
   5041 		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
   5042 
   5043 		switch (dlindp->dl_notification) {
   5044 		case DL_NOTE_PHYS_ADDR:
   5045 			qwriter_ip(ill, ill->ill_rq, mp,
   5046 			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
   5047 			return;
   5048 		case DL_NOTE_REPLUMB:
   5049 			qwriter_ip(ill, ill->ill_rq, mp,
   5050 			    ill_replumb_tail, CUR_OP, B_TRUE);
   5051 			return;
   5052 		default:
   5053 			ASSERT(0);
   5054 			ill_refrele(ill);
   5055 		}
   5056 		break;
   5057 
   5058 	case M_ERROR:
   5059 	case M_HANGUP:
   5060 		qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
   5061 		    B_TRUE);
   5062 		return;
   5063 
   5064 	case M_IOCTL:
   5065 	case M_IOCDATA:
   5066 		qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
   5067 		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
   5068 		return;
   5069 
   5070 	default:
   5071 		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
   5072 		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
   5073 	}
   5074 	return;
   5075 unlock:
   5076 	mutex_exit(&ipsq->ipsq_lock);
   5077 	mutex_exit(&ipx->ipx_lock);
   5078 	mutex_exit(&ill->ill_lock);
   5079 }
   5080 
   5081 #ifdef DEBUG
   5082 /* Reuse trace buffer from beginning (if reached the end) and record trace */
   5083 static void
   5084 th_trace_rrecord(th_trace_t *th_trace)
   5085 {
   5086 	tr_buf_t *tr_buf;
   5087 	uint_t lastref;
   5088 
   5089 	lastref = th_trace->th_trace_lastref;
   5090 	lastref++;
   5091 	if (lastref == TR_BUF_MAX)
   5092 		lastref = 0;
   5093 	th_trace->th_trace_lastref = lastref;
   5094 	tr_buf = &th_trace->th_trbuf[lastref];
   5095 	tr_buf->tr_time = ddi_get_lbolt();
   5096 	tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
   5097 }
   5098 
   5099 static void
   5100 th_trace_free(void *value)
   5101 {
   5102 	th_trace_t *th_trace = value;
   5103 
   5104 	ASSERT(th_trace->th_refcnt == 0);
   5105 	kmem_free(th_trace, sizeof (*th_trace));
   5106 }
   5107 
   5108 /*
   5109  * Find or create the per-thread hash table used to track object references.
   5110  * The ipst argument is NULL if we shouldn't allocate.
   5111  *
   5112  * Accesses per-thread data, so there's no need to lock here.
   5113  */
   5114 static mod_hash_t *
   5115 th_trace_gethash(ip_stack_t *ipst)
   5116 {
   5117 	th_hash_t *thh;
   5118 
   5119 	if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
   5120 		mod_hash_t *mh;
   5121 		char name[256];
   5122 		size_t objsize, rshift;
   5123 		int retv;
   5124 
   5125 		if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
   5126 			return (NULL);
   5127 		(void) snprintf(name, sizeof (name), "th_trace_%p",
   5128 		    (void *)curthread);
   5129 
   5130 		/*
   5131 		 * We use mod_hash_create_extended here rather than the more
   5132 		 * obvious mod_hash_create_ptrhash because the latter has a
   5133 		 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
   5134 		 * block.
   5135 		 */
   5136 		objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
   5137 		    MAX(sizeof (ire_t), sizeof (ncec_t)));
   5138 		rshift = highbit(objsize);
   5139 		mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
   5140 		    th_trace_free, mod_hash_byptr, (void *)rshift,
   5141 		    mod_hash_ptrkey_cmp, KM_NOSLEEP);
   5142 		if (mh == NULL) {
   5143 			kmem_free(thh, sizeof (*thh));
   5144 			return (NULL);
   5145 		}
   5146 		thh->thh_hash = mh;
   5147 		thh->thh_ipst = ipst;
   5148 		/*
   5149 		 * We trace ills, ipifs, ires, and nces.  All of these are
   5150 		 * per-IP-stack, so the lock on the thread list is as well.
   5151 		 */
   5152 		rw_enter(&ip_thread_rwlock, RW_WRITER);
   5153 		list_insert_tail(&ip_thread_list, thh);
   5154 		rw_exit(&ip_thread_rwlock);
   5155 		retv = tsd_set(ip_thread_data, thh);
   5156 		ASSERT(retv == 0);
   5157 	}
   5158 	return (thh != NULL ? thh->thh_hash : NULL);
   5159 }
   5160 
   5161 boolean_t
   5162 th_trace_ref(const void *obj, ip_stack_t *ipst)
   5163 {
   5164 	th_trace_t *th_trace;
   5165 	mod_hash_t *mh;
   5166 	mod_hash_val_t val;
   5167 
   5168 	if ((mh = th_trace_gethash(ipst)) == NULL)
   5169 		return (B_FALSE);
   5170 
   5171 	/*
   5172 	 * Attempt to locate the trace buffer for this obj and thread.
   5173 	 * If it does not exist, then allocate a new trace buffer and
   5174 	 * insert into the hash.
   5175 	 */
   5176 	if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
   5177 		th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
   5178 		if (th_trace == NULL)
   5179 			return (B_FALSE);
   5180 
   5181 		th_trace->th_id = curthread;
   5182 		if (mod_hash_insert(mh, (mod_hash_key_t)obj,
   5183 		    (mod_hash_val_t)th_trace) != 0) {
   5184 			kmem_free(th_trace, sizeof (th_trace_t));
   5185 			return (B_FALSE);
   5186 		}
   5187 	} else {
   5188 		th_trace = (th_trace_t *)val;
   5189 	}
   5190 
   5191 	ASSERT(th_trace->th_refcnt >= 0 &&
   5192 	    th_trace->th_refcnt < TR_BUF_MAX - 1);
   5193 
   5194 	th_trace->th_refcnt++;
   5195 	th_trace_rrecord(th_trace);
   5196 	return (B_TRUE);
   5197 }
   5198 
   5199 /*
   5200  * For the purpose of tracing a reference release, we assume that global
   5201  * tracing is always on and that the same thread initiated the reference hold
   5202  * is releasing.
   5203  */
   5204 void
   5205 th_trace_unref(const void *obj)
   5206 {
   5207 	int retv;
   5208 	mod_hash_t *mh;
   5209 	th_trace_t *th_trace;
   5210 	mod_hash_val_t val;
   5211 
   5212 	mh = th_trace_gethash(NULL);
   5213 	retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
   5214 	ASSERT(retv == 0);
   5215 	th_trace = (th_trace_t *)val;
   5216 
   5217 	ASSERT(th_trace->th_refcnt > 0);
   5218 	th_trace->th_refcnt--;
   5219 	th_trace_rrecord(th_trace);
   5220 }
   5221 
   5222 /*
   5223  * If tracing has been disabled, then we assume that the reference counts are
   5224  * now useless, and we clear them out before destroying the entries.
   5225  */
   5226 void
   5227 th_trace_cleanup(const void *obj, boolean_t trace_disable)
   5228 {
   5229 	th_hash_t	*thh;
   5230 	mod_hash_t	*mh;
   5231 	mod_hash_val_t	val;
   5232 	th_trace_t	*th_trace;
   5233 	int		retv;
   5234 
   5235 	rw_enter(&ip_thread_rwlock, RW_READER);
   5236 	for (thh = list_head(&ip_thread_list); thh != NULL;
   5237 	    thh = list_next(&ip_thread_list, thh)) {
   5238 		if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
   5239 		    &val) == 0) {
   5240 			th_trace = (th_trace_t *)val;
   5241 			if (trace_disable)
   5242 				th_trace->th_refcnt = 0;
   5243 			retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
   5244 			ASSERT(retv == 0);
   5245 		}
   5246 	}
   5247 	rw_exit(&ip_thread_rwlock);
   5248 }
   5249 
   5250 void
   5251 ipif_trace_ref(ipif_t *ipif)
   5252 {
   5253 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5254 
   5255 	if (ipif->ipif_trace_disable)
   5256 		return;
   5257 
   5258 	if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
   5259 		ipif->ipif_trace_disable = B_TRUE;
   5260 		ipif_trace_cleanup(ipif);
   5261 	}
   5262 }
   5263 
   5264 void
   5265 ipif_untrace_ref(ipif_t *ipif)
   5266 {
   5267 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5268 
   5269 	if (!ipif->ipif_trace_disable)
   5270 		th_trace_unref(ipif);
   5271 }
   5272 
   5273 void
   5274 ill_trace_ref(ill_t *ill)
   5275 {
   5276 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   5277 
   5278 	if (ill->ill_trace_disable)
   5279 		return;
   5280 
   5281 	if (!th_trace_ref(ill, ill->ill_ipst)) {
   5282 		ill->ill_trace_disable = B_TRUE;
   5283 		ill_trace_cleanup(ill);
   5284 	}
   5285 }
   5286 
   5287 void
   5288 ill_untrace_ref(ill_t *ill)
   5289 {
   5290 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   5291 
   5292 	if (!ill->ill_trace_disable)
   5293 		th_trace_unref(ill);
   5294 }
   5295 
   5296 /*
   5297  * Called when ipif is unplumbed or when memory alloc fails.  Note that on
   5298  * failure, ipif_trace_disable is set.
   5299  */
   5300 static void
   5301 ipif_trace_cleanup(const ipif_t *ipif)
   5302 {
   5303 	th_trace_cleanup(ipif, ipif->ipif_trace_disable);
   5304 }
   5305 
   5306 /*
   5307  * Called when ill is unplumbed or when memory alloc fails.  Note that on
   5308  * failure, ill_trace_disable is set.
   5309  */
   5310 static void
   5311 ill_trace_cleanup(const ill_t *ill)
   5312 {
   5313 	th_trace_cleanup(ill, ill->ill_trace_disable);
   5314 }
   5315 #endif /* DEBUG */
   5316 
   5317 void
   5318 ipif_refhold_locked(ipif_t *ipif)
   5319 {
   5320 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
   5321 	ipif->ipif_refcnt++;
   5322 	IPIF_TRACE_REF(ipif);
   5323 }
   5324 
   5325 void
   5326 ipif_refhold(ipif_t *ipif)
   5327 {
   5328 	ill_t	*ill;
   5329 
   5330 	ill = ipif->ipif_ill;
   5331 	mutex_enter(&ill->ill_lock);
   5332 	ipif->ipif_refcnt++;
   5333 	IPIF_TRACE_REF(ipif);
   5334 	mutex_exit(&ill->ill_lock);
   5335 }
   5336 
   5337 /*
   5338  * Must not be called while holding any locks. Otherwise if this is
   5339  * the last reference to be released there is a chance of recursive mutex
   5340  * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
   5341  * to restart an ioctl.
   5342  */
   5343 void
   5344 ipif_refrele(ipif_t *ipif)
   5345 {
   5346 	ill_t	*ill;
   5347 
   5348 	ill = ipif->ipif_ill;
   5349 
   5350 	mutex_enter(&ill->ill_lock);
   5351 	ASSERT(ipif->ipif_refcnt != 0);
   5352 	ipif->ipif_refcnt--;
   5353 	IPIF_UNTRACE_REF(ipif);
   5354 	if (ipif->ipif_refcnt != 0) {
   5355 		mutex_exit(&ill->ill_lock);
   5356 		return;
   5357 	}
   5358 
   5359 	/* Drops the ill_lock */
   5360 	ipif_ill_refrele_tail(ill);
   5361 }
   5362 
   5363 ipif_t *
   5364 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
   5365 {
   5366 	ipif_t	*ipif;
   5367 
   5368 	mutex_enter(&ill->ill_lock);
   5369 	for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
   5370 	    ipif != NULL; ipif = ipif->ipif_next) {
   5371 		if (IPIF_IS_CONDEMNED(ipif))
   5372 			continue;
   5373 		ipif_refhold_locked(ipif);
   5374 		mutex_exit(&ill->ill_lock);
   5375 		return (ipif);
   5376 	}
   5377 	mutex_exit(&ill->ill_lock);
   5378 	return (NULL);
   5379 }
   5380 
   5381 /*
   5382  * TODO: make this table extendible at run time
   5383  * Return a pointer to the mac type info for 'mac_type'
   5384  */
   5385 static ip_m_t *
   5386 ip_m_lookup(t_uscalar_t mac_type)
   5387 {
   5388 	ip_m_t	*ipm;
   5389 
   5390 	for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
   5391 		if (ipm->ip_m_mac_type == mac_type)
   5392 			return (ipm);
   5393 	return (NULL);
   5394 }
   5395 
   5396 /*
   5397  * Make a link layer address from the multicast IP address *addr.
   5398  * To form the link layer address, invoke the ip_m_v*mapping function
   5399  * associated with the link-layer type.
   5400  */
   5401 void
   5402 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
   5403 {
   5404 	ip_m_t *ipm;
   5405 
   5406 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
   5407 		return;
   5408 
   5409 	ASSERT(addr != NULL);
   5410 
   5411 	ipm = ip_m_lookup(ill->ill_mactype);
   5412 	if (ipm == NULL ||
   5413 	    (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
   5414 	    (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
   5415 		ip0dbg(("no mapping for ill %s mactype 0x%x\n",
   5416 		    ill->ill_name, ill->ill_mactype));
   5417 		return;
   5418 	}
   5419 	if (ill->ill_isv6)
   5420 		(*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
   5421 	else
   5422 		(*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
   5423 }
   5424 
   5425 /*
   5426  * ip_rt_add is called to add an IPv4 route to the forwarding table.
   5427  * ill is passed in to associate it with the correct interface.
   5428  * If ire_arg is set, then we return the held IRE in that location.
   5429  */
   5430 int
   5431 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
   5432     ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
   5433     boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
   5434 {
   5435 	ire_t	*ire, *nire;
   5436 	ire_t	*gw_ire = NULL;
   5437 	ipif_t	*ipif = NULL;
   5438 	uint_t	type;
   5439 	int	match_flags = MATCH_IRE_TYPE;
   5440 	tsol_gc_t *gc = NULL;
   5441 	tsol_gcgrp_t *gcgrp = NULL;
   5442 	boolean_t gcgrp_xtraref = B_FALSE;
   5443 	boolean_t cgtp_broadcast;
   5444 
   5445 	ip1dbg(("ip_rt_add:"));
   5446 
   5447 	if (ire_arg != NULL)
   5448 		*ire_arg = NULL;
   5449 
   5450 	/*
   5451 	 * If this is the case of RTF_HOST being set, then we set the netmask
   5452 	 * to all ones (regardless if one was supplied).
   5453 	 */
   5454 	if (flags & RTF_HOST)
   5455 		mask = IP_HOST_MASK;
   5456 
   5457 	/*
   5458 	 * Prevent routes with a zero gateway from being created (since
   5459 	 * interfaces can currently be plumbed and brought up no assigned
   5460 	 * address).
   5461 	 */
   5462 	if (gw_addr == 0)
   5463 		return (ENETUNREACH);
   5464 	/*
   5465 	 * Get the ipif, if any, corresponding to the gw_addr
   5466 	 * If -ifp was specified we restrict ourselves to the ill, otherwise
   5467 	 * we match on the gatway and destination to handle unnumbered pt-pt
   5468 	 * interfaces.
   5469 	 */
   5470 	if (ill != NULL)
   5471 		ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
   5472 	else
   5473 		ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
   5474 	if (ipif != NULL) {
   5475 		if (IS_VNI(ipif->ipif_ill)) {
   5476 			ipif_refrele(ipif);
   5477 			return (EINVAL);
   5478 		}
   5479 	}
   5480 
   5481 	/*
   5482 	 * GateD will attempt to create routes with a loopback interface
   5483 	 * address as the gateway and with RTF_GATEWAY set.  We allow
   5484 	 * these routes to be added, but create them as interface routes
   5485 	 * since the gateway is an interface address.
   5486 	 */
   5487 	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
   5488 		flags &= ~RTF_GATEWAY;
   5489 		if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
   5490 		    mask == IP_HOST_MASK) {
   5491 			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
   5492 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
   5493 			    NULL);
   5494 			if (ire != NULL) {
   5495 				ire_refrele(ire);
   5496 				ipif_refrele(ipif);
   5497 				return (EEXIST);
   5498 			}
   5499 			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
   5500 			    "for 0x%x\n", (void *)ipif,
   5501 			    ipif->ipif_ire_type,
   5502 			    ntohl(ipif->ipif_lcl_addr)));
   5503 			ire = ire_create(
   5504 			    (uchar_t *)&dst_addr,	/* dest address */
   5505 			    (uchar_t *)&mask,		/* mask */
   5506 			    NULL,			/* no gateway */
   5507 			    ipif->ipif_ire_type,	/* LOOPBACK */
   5508 			    ipif->ipif_ill,
   5509 			    zoneid,
   5510 			    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
   5511 			    NULL,
   5512 			    ipst);
   5513 
   5514 			if (ire == NULL) {
   5515 				ipif_refrele(ipif);
   5516 				return (ENOMEM);
   5517 			}
   5518 			/* src address assigned by the caller? */
   5519 			if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5520 				ire->ire_setsrc_addr = src_addr;
   5521 
   5522 			nire = ire_add(ire);
   5523 			if (nire == NULL) {
   5524 				/*
   5525 				 * In the result of failure, ire_add() will have
   5526 				 * already deleted the ire in question, so there
   5527 				 * is no need to do that here.
   5528 				 */
   5529 				ipif_refrele(ipif);
   5530 				return (ENOMEM);
   5531 			}
   5532 			/*
   5533 			 * Check if it was a duplicate entry. This handles
   5534 			 * the case of two racing route adds for the same route
   5535 			 */
   5536 			if (nire != ire) {
   5537 				ASSERT(nire->ire_identical_ref > 1);
   5538 				ire_delete(nire);
   5539 				ire_refrele(nire);
   5540 				ipif_refrele(ipif);
   5541 				return (EEXIST);
   5542 			}
   5543 			ire = nire;
   5544 			goto save_ire;
   5545 		}
   5546 	}
   5547 
   5548 	/*
   5549 	 * The routes for multicast with CGTP are quite special in that
   5550 	 * the gateway is the local interface address, yet RTF_GATEWAY
   5551 	 * is set. We turn off RTF_GATEWAY to provide compatibility with
   5552 	 * this undocumented and unusual use of multicast routes.
   5553 	 */
   5554 	if ((flags & RTF_MULTIRT) && ipif != NULL)
   5555 		flags &= ~RTF_GATEWAY;
   5556 
   5557 	/*
   5558 	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
   5559 	 * and the gateway address provided is one of the system's interface
   5560 	 * addresses.  By using the routing socket interface and supplying an
   5561 	 * RTA_IFP sockaddr with an interface index, an alternate method of
   5562 	 * specifying an interface route to be created is available which uses
   5563 	 * the interface index that specifies the outgoing interface rather than
   5564 	 * the address of an outgoing interface (which may not be able to
   5565 	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
   5566 	 * flag, routes can be specified which not only specify the next-hop to
   5567 	 * be used when routing to a certain prefix, but also which outgoing
   5568 	 * interface should be used.
   5569 	 *
   5570 	 * Previously, interfaces would have unique addresses assigned to them
   5571 	 * and so the address assigned to a particular interface could be used
   5572 	 * to identify a particular interface.  One exception to this was the
   5573 	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
   5574 	 *
   5575 	 * With the advent of IPv6 and its link-local addresses, this
   5576 	 * restriction was relaxed and interfaces could share addresses between
   5577 	 * themselves.  In fact, typically all of the link-local interfaces on
   5578 	 * an IPv6 node or router will have the same link-local address.  In
   5579 	 * order to differentiate between these interfaces, the use of an
   5580 	 * interface index is necessary and this index can be carried inside a
   5581 	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
   5582 	 * of using the interface index, however, is that all of the ipif's that
   5583 	 * are part of an ill have the same index and so the RTA_IFP sockaddr
   5584 	 * cannot be used to differentiate between ipif's (or logical
   5585 	 * interfaces) that belong to the same ill (physical interface).
   5586 	 *
   5587 	 * For example, in the following case involving IPv4 interfaces and
   5588 	 * logical interfaces
   5589 	 *
   5590 	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
   5591 	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0
   5592 	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0
   5593 	 *
   5594 	 * the ipif's corresponding to each of these interface routes can be
   5595 	 * uniquely identified by the "gateway" (actually interface address).
   5596 	 *
   5597 	 * In this case involving multiple IPv6 default routes to a particular
   5598 	 * link-local gateway, the use of RTA_IFP is necessary to specify which
   5599 	 * default route is of interest:
   5600 	 *
   5601 	 *	default		fe80::123:4567:89ab:cdef	U	if0
   5602 	 *	default		fe80::123:4567:89ab:cdef	U	if1
   5603 	 */
   5604 
   5605 	/* RTF_GATEWAY not set */
   5606 	if (!(flags & RTF_GATEWAY)) {
   5607 		if (sp != NULL) {
   5608 			ip2dbg(("ip_rt_add: gateway security attributes "
   5609 			    "cannot be set with interface route\n"));
   5610 			if (ipif != NULL)
   5611 				ipif_refrele(ipif);
   5612 			return (EINVAL);
   5613 		}
   5614 
   5615 		/*
   5616 		 * Whether or not ill (RTA_IFP) is set, we require that
   5617 		 * the gateway is one of our local addresses.
   5618 		 */
   5619 		if (ipif == NULL)
   5620 			return (ENETUNREACH);
   5621 
   5622 		/*
   5623 		 * We use MATCH_IRE_ILL here. If the caller specified an
   5624 		 * interface (from the RTA_IFP sockaddr) we use it, otherwise
   5625 		 * we use the ill derived from the gateway address.
   5626 		 * We can always match the gateway address since we record it
   5627 		 * in ire_gateway_addr.
   5628 		 * We don't allow RTA_IFP to specify a different ill than the
   5629 		 * one matching the ipif to make sure we can delete the route.
   5630 		 */
   5631 		match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
   5632 		if (ill == NULL) {
   5633 			ill = ipif->ipif_ill;
   5634 		} else if (ill != ipif->ipif_ill) {
   5635 			ipif_refrele(ipif);
   5636 			return (EINVAL);
   5637 		}
   5638 
   5639 		/*
   5640 		 * We check for an existing entry at this point.
   5641 		 *
   5642 		 * Since a netmask isn't passed in via the ioctl interface
   5643 		 * (SIOCADDRT), we don't check for a matching netmask in that
   5644 		 * case.
   5645 		 */
   5646 		if (!ioctl_msg)
   5647 			match_flags |= MATCH_IRE_MASK;
   5648 		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
   5649 		    IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
   5650 		    NULL);
   5651 		if (ire != NULL) {
   5652 			ire_refrele(ire);
   5653 			ipif_refrele(ipif);
   5654 			return (EEXIST);
   5655 		}
   5656 
   5657 		/*
   5658 		 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or
   5659 		 * IRE_IF_RESOLVER with the modified address, netmask, and
   5660 		 * gateway.
   5661 		 */
   5662 		ire = ire_create(
   5663 		    (uchar_t *)&dst_addr,
   5664 		    (uint8_t *)&mask,
   5665 		    (uint8_t *)&gw_addr,
   5666 		    ill->ill_net_type,
   5667 		    ill,
   5668 		    zoneid,
   5669 		    flags,
   5670 		    NULL,
   5671 		    ipst);
   5672 		if (ire == NULL) {
   5673 			ipif_refrele(ipif);
   5674 			return (ENOMEM);
   5675 		}
   5676 
   5677 		/*
   5678 		 * Some software (for example, GateD and Sun Cluster) attempts
   5679 		 * to create (what amount to) IRE_PREFIX routes with the
   5680 		 * loopback address as the gateway.  This is primarily done to
   5681 		 * set up prefixes with the RTF_REJECT flag set (for example,
   5682 		 * when generating aggregate routes.)
   5683 		 *
   5684 		 * If the IRE type (as defined by ill->ill_net_type) is
   5685 		 * IRE_LOOPBACK, then we map the request into a
   5686 		 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
   5687 		 * these interface routes, by definition, can only be that.
   5688 		 *
   5689 		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
   5690 		 * routine, but rather using ire_create() directly.
   5691 		 *
   5692 		 */
   5693 		if (ill->ill_net_type == IRE_LOOPBACK) {
   5694 			ire->ire_type = IRE_IF_NORESOLVER;
   5695 			ire->ire_flags |= RTF_BLACKHOLE;
   5696 		}
   5697 
   5698 		/* src address assigned by the caller? */
   5699 		if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5700 			ire->ire_setsrc_addr = src_addr;
   5701 
   5702 		nire = ire_add(ire);
   5703 		if (nire == NULL) {
   5704 			/*
   5705 			 * In the result of failure, ire_add() will have
   5706 			 * already deleted the ire in question, so there
   5707 			 * is no need to do that here.
   5708 			 */
   5709 			ipif_refrele(ipif);
   5710 			return (ENOMEM);
   5711 		}
   5712 		/*
   5713 		 * Check if it was a duplicate entry. This handles
   5714 		 * the case of two racing route adds for the same route
   5715 		 */
   5716 		if (nire != ire) {
   5717 			ire_delete(nire);
   5718 			ire_refrele(nire);
   5719 			ipif_refrele(ipif);
   5720 			return (EEXIST);
   5721 		}
   5722 		ire = nire;
   5723 		goto save_ire;
   5724 	}
   5725 
   5726 	/*
   5727 	 * Get an interface IRE for the specified gateway.
   5728 	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
   5729 	 * gateway, it is currently unreachable and we fail the request
   5730 	 * accordingly. We reject any RTF_GATEWAY routes where the gateway
   5731 	 * is an IRE_LOCAL or IRE_LOOPBACK.
   5732 	 * If RTA_IFP was specified we look on that particular ill.
   5733 	 */
   5734 	if (ill != NULL)
   5735 		match_flags |= MATCH_IRE_ILL;
   5736 
   5737 	/* Check whether the gateway is reachable. */
   5738 again:
   5739 	type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
   5740 	if (flags & RTF_INDIRECT)
   5741 		type |= IRE_OFFLINK;
   5742 
   5743 	gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
   5744 	    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
   5745 	if (gw_ire == NULL) {
   5746 		/*
   5747 		 * With IPMP, we allow host routes to influence in.mpathd's
   5748 		 * target selection.  However, if the test addresses are on
   5749 		 * their own network, the above lookup will fail since the
   5750 		 * underlying IRE_INTERFACEs are marked hidden.  So allow
   5751 		 * hidden test IREs to be found and try again.
   5752 		 */
   5753 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))  {
   5754 			match_flags |= MATCH_IRE_TESTHIDDEN;
   5755 			goto again;
   5756 		}
   5757 		if (ipif != NULL)
   5758 			ipif_refrele(ipif);
   5759 		return (ENETUNREACH);
   5760 	}
   5761 	if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
   5762 		ire_refrele(gw_ire);
   5763 		if (ipif != NULL)
   5764 			ipif_refrele(ipif);
   5765 		return (ENETUNREACH);
   5766 	}
   5767 
   5768 	/*
   5769 	 * We create one of three types of IREs as a result of this request
   5770 	 * based on the netmask.  A netmask of all ones (which is automatically
   5771 	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
   5772 	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
   5773 	 * created.  Otherwise, an IRE_PREFIX route is created for the
   5774 	 * destination prefix.
   5775 	 */
   5776 	if (mask == IP_HOST_MASK)
   5777 		type = IRE_HOST;
   5778 	else if (mask == 0)
   5779 		type = IRE_DEFAULT;
   5780 	else
   5781 		type = IRE_PREFIX;
   5782 
   5783 	/* check for a duplicate entry */
   5784 	ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
   5785 	    ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
   5786 	    0, ipst, NULL);
   5787 	if (ire != NULL) {
   5788 		if (ipif != NULL)
   5789 			ipif_refrele(ipif);
   5790 		ire_refrele(gw_ire);
   5791 		ire_refrele(ire);
   5792 		return (EEXIST);
   5793 	}
   5794 
   5795 	/* Security attribute exists */
   5796 	if (sp != NULL) {
   5797 		tsol_gcgrp_addr_t ga;
   5798 
   5799 		/* find or create the gateway credentials group */
   5800 		ga.ga_af = AF_INET;
   5801 		IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
   5802 
   5803 		/* we hold reference to it upon success */
   5804 		gcgrp = gcgrp_lookup(&ga, B_TRUE);
   5805 		if (gcgrp == NULL) {
   5806 			if (ipif != NULL)
   5807 				ipif_refrele(ipif);
   5808 			ire_refrele(gw_ire);
   5809 			return (ENOMEM);
   5810 		}
   5811 
   5812 		/*
   5813 		 * Create and add the security attribute to the group; a
   5814 		 * reference to the group is made upon allocating a new
   5815 		 * entry successfully.  If it finds an already-existing
   5816 		 * entry for the security attribute in the group, it simply
   5817 		 * returns it and no new reference is made to the group.
   5818 		 */
   5819 		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
   5820 		if (gc == NULL) {
   5821 			if (ipif != NULL)
   5822 				ipif_refrele(ipif);
   5823 			/* release reference held by gcgrp_lookup */
   5824 			GCGRP_REFRELE(gcgrp);
   5825 			ire_refrele(gw_ire);
   5826 			return (ENOMEM);
   5827 		}
   5828 	}
   5829 
   5830 	/* Create the IRE. */
   5831 	ire = ire_create(
   5832 	    (uchar_t *)&dst_addr,		/* dest address */
   5833 	    (uchar_t *)&mask,			/* mask */
   5834 	    (uchar_t *)&gw_addr,		/* gateway address */
   5835 	    (ushort_t)type,			/* IRE type */
   5836 	    ill,
   5837 	    zoneid,
   5838 	    flags,
   5839 	    gc,					/* security attribute */
   5840 	    ipst);
   5841 
   5842 	/*
   5843 	 * The ire holds a reference to the 'gc' and the 'gc' holds a
   5844 	 * reference to the 'gcgrp'. We can now release the extra reference
   5845 	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
   5846 	 */
   5847 	if (gcgrp_xtraref)
   5848 		GCGRP_REFRELE(gcgrp);
   5849 	if (ire == NULL) {
   5850 		if (gc != NULL)
   5851 			GC_REFRELE(gc);
   5852 		if (ipif != NULL)
   5853 			ipif_refrele(ipif);
   5854 		ire_refrele(gw_ire);
   5855 		return (ENOMEM);
   5856 	}
   5857 
   5858 	/* Before we add, check if an extra CGTP broadcast is needed */
   5859 	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
   5860 	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
   5861 
   5862 	/* src address assigned by the caller? */
   5863 	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
   5864 		ire->ire_setsrc_addr = src_addr;
   5865 
   5866 	/*
   5867 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
   5868 	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
   5869 	 */
   5870 
   5871 	/* Add the new IRE. */
   5872 	nire = ire_add(ire);
   5873 	if (nire == NULL) {
   5874 		/*
   5875 		 * In the result of failure, ire_add() will have
   5876 		 * already deleted the ire in question, so there
   5877 		 * is no need to do that here.
   5878 		 */
   5879 		if (ipif != NULL)
   5880 			ipif_refrele(ipif);
   5881 		ire_refrele(gw_ire);
   5882 		return (ENOMEM);
   5883 	}
   5884 	/*
   5885 	 * Check if it was a duplicate entry. This handles
   5886 	 * the case of two racing route adds for the same route
   5887 	 */
   5888 	if (nire != ire) {
   5889 		ire_delete(nire);
   5890 		ire_refrele(nire);
   5891 		if (ipif != NULL)
   5892 			ipif_refrele(ipif);
   5893 		ire_refrele(gw_ire);
   5894 		return (EEXIST);
   5895 	}
   5896 	ire = nire;
   5897 
   5898 	if (flags & RTF_MULTIRT) {
   5899 		/*
   5900 		 * Invoke the CGTP (multirouting) filtering module
   5901 		 * to add the dst address in the filtering database.
   5902 		 * Replicated inbound packets coming from that address
   5903 		 * will be filtered to discard the duplicates.
   5904 		 * It is not necessary to call the CGTP filter hook
   5905 		 * when the dst address is a broadcast or multicast,
   5906 		 * because an IP source address cannot be a broadcast
   5907 		 * or a multicast.
   5908 		 */
   5909 		if (cgtp_broadcast) {
   5910 			ip_cgtp_bcast_add(ire, ipst);
   5911 			goto save_ire;
   5912 		}
   5913 		if (ipst->ips_ip_cgtp_filter_ops != NULL &&
   5914 		    !CLASSD(ire->ire_addr)) {
   5915 			int res;
   5916 			ipif_t *src_ipif;
   5917 
   5918 			/* Find the source address corresponding to gw_ire */
   5919 			src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
   5920 			    NULL, zoneid, ipst);
   5921 			if (src_ipif != NULL) {
   5922 				res = ipst->ips_ip_cgtp_filter_ops->
   5923 				    cfo_add_dest_v4(
   5924 				    ipst->ips_netstack->netstack_stackid,
   5925 				    ire->ire_addr,
   5926 				    ire->ire_gateway_addr,
   5927 				    ire->ire_setsrc_addr,
   5928 				    src_ipif->ipif_lcl_addr);
   5929 				ipif_refrele(src_ipif);
   5930 			} else {
   5931 				res = EADDRNOTAVAIL;
   5932 			}
   5933 			if (res != 0) {
   5934 				if (ipif != NULL)
   5935 					ipif_refrele(ipif);
   5936 				ire_refrele(gw_ire);
   5937 				ire_delete(ire);
   5938 				ire_refrele(ire);	/* Held in ire_add */
   5939 				return (res);
   5940 			}
   5941 		}
   5942 	}
   5943 
   5944 save_ire:
   5945 	if (gw_ire != NULL) {
   5946 		ire_refrele(gw_ire);
   5947 		gw_ire = NULL;
   5948 	}
   5949 	if (ill != NULL) {
   5950 		/*
   5951 		 * Save enough information so that we can recreate the IRE if
   5952 		 * the interface goes down and then up.  The metrics associated
   5953 		 * with the route will be saved as well when rts_setmetrics() is
   5954 		 * called after the IRE has been created.  In the case where
   5955 		 * memory cannot be allocated, none of this information will be
   5956 		 * saved.
   5957 		 */
   5958 		ill_save_ire(ill, ire);
   5959 	}
   5960 	if (ioctl_msg)
   5961 		ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
   5962 	if (ire_arg != NULL) {
   5963 		/*
   5964 		 * Store the ire that was successfully added into where ire_arg
   5965 		 * points to so that callers don't have to look it up
   5966 		 * themselves (but they are responsible for ire_refrele()ing
   5967 		 * the ire when they are finished with it).
   5968 		 */
   5969 		*ire_arg = ire;
   5970 	} else {
   5971 		ire_refrele(ire);		/* Held in ire_add */
   5972 	}
   5973 	if (ipif != NULL)
   5974 		ipif_refrele(ipif);
   5975 	return (0);
   5976 }
   5977 
   5978 /*
   5979  * ip_rt_delete is called to delete an IPv4 route.
   5980  * ill is passed in to associate it with the correct interface.
   5981  */
   5982 /* ARGSUSED4 */
   5983 int
   5984 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
   5985     uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
   5986     ip_stack_t *ipst, zoneid_t zoneid)
   5987 {
   5988 	ire_t	*ire = NULL;
   5989 	ipif_t	*ipif;
   5990 	uint_t	type;
   5991 	uint_t	match_flags = MATCH_IRE_TYPE;
   5992 	int	err = 0;
   5993 
   5994 	ip1dbg(("ip_rt_delete:"));
   5995 	/*
   5996 	 * If this is the case of RTF_HOST being set, then we set the netmask
   5997 	 * to all ones.  Otherwise, we use the netmask if one was supplied.
   5998 	 */
   5999 	if (flags & RTF_HOST) {
   6000 		mask = IP_HOST_MASK;
   6001 		match_flags |= MATCH_IRE_MASK;
   6002 	} else if (rtm_addrs & RTA_NETMASK) {
   6003 		match_flags |= MATCH_IRE_MASK;
   6004 	}
   6005 
   6006 	/*
   6007 	 * Note that RTF_GATEWAY is never set on a delete, therefore
   6008 	 * we check if the gateway address is one of our interfaces first,
   6009 	 * and fall back on RTF_GATEWAY routes.
   6010 	 *
   6011 	 * This makes it possible to delete an original
   6012 	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
   6013 	 * However, we have RTF_KERNEL set on the ones created by ipif_up
   6014 	 * and those can not be deleted here.
   6015 	 *
   6016 	 * We use MATCH_IRE_ILL if we know the interface. If the caller
   6017 	 * specified an interface (from the RTA_IFP sockaddr) we use it,
   6018 	 * otherwise we use the ill derived from the gateway address.
   6019 	 * We can always match the gateway address since we record it
   6020 	 * in ire_gateway_addr.
   6021 	 *
   6022 	 * For more detail on specifying routes by gateway address and by
   6023 	 * interface index, see the comments in ip_rt_add().
   6024 	 */
   6025 	ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
   6026 	if (ipif != NULL) {
   6027 		ill_t	*ill_match;
   6028 
   6029 		if (ill != NULL)
   6030 			ill_match = ill;
   6031 		else
   6032 			ill_match = ipif->ipif_ill;
   6033 
   6034 		match_flags |= MATCH_IRE_ILL;
   6035 		if (ipif->ipif_ire_type == IRE_LOOPBACK) {
   6036 			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
   6037 			    ill_match, ALL_ZONES, NULL, match_flags, 0, ipst,
   6038 			    NULL);
   6039 		}
   6040 		if (ire == NULL) {
   6041 			match_flags |= MATCH_IRE_GW;
   6042 			ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
   6043 			    IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
   6044 			    match_flags, 0, ipst, NULL);
   6045 		}
   6046 		/* Avoid deleting routes created by kernel from an ipif */
   6047 		if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
   6048 			ire_refrele(ire);
   6049 			ire = NULL;
   6050 		}
   6051 
   6052 		/* Restore in case we didn't find a match */
   6053 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
   6054 	}
   6055 
   6056 	if (ire == NULL) {
   6057 		/*
   6058 		 * At this point, the gateway address is not one of our own
   6059 		 * addresses or a matching interface route was not found.  We
   6060 		 * set the IRE type to lookup based on whether
   6061 		 * this is a host route, a default route or just a prefix.
   6062 		 *
   6063 		 * If an ill was passed in, then the lookup is based on an
   6064 		 * interface index so MATCH_IRE_ILL is added to match_flags.
   6065 		 */
   6066 		match_flags |= MATCH_IRE_GW;
   6067 		if (ill != NULL)
   6068 			match_flags |= MATCH_IRE_ILL;
   6069 		if (mask == IP_HOST_MASK)
   6070 			type = IRE_HOST;
   6071 		else if (mask == 0)
   6072 			type = IRE_DEFAULT;
   6073 		else
   6074 			type = IRE_PREFIX;
   6075 		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
   6076 		    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
   6077 	}
   6078 
   6079 	if (ipif != NULL) {
   6080 		ipif_refrele(ipif);
   6081 		ipif = NULL;
   6082 	}
   6083 
   6084 	if (ire == NULL)
   6085 		return (ESRCH);
   6086 
   6087 	if (ire->ire_flags & RTF_MULTIRT) {
   6088 		/*
   6089 		 * Invoke the CGTP (multirouting) filtering module
   6090 		 * to remove the dst address from the filtering database.
   6091 		 * Packets coming from that address will no longer be
   6092 		 * filtered to remove duplicates.
   6093 		 */
   6094 		if (ipst->ips_ip_cgtp_filter_ops != NULL) {
   6095 			err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
   6096 			    ipst->ips_netstack->netstack_stackid,
   6097 			    ire->ire_addr, ire->ire_gateway_addr);
   6098 		}
   6099 		ip_cgtp_bcast_delete(ire, ipst);
   6100 	}
   6101 
   6102 	ill = ire->ire_ill;
   6103 	if (ill != NULL)
   6104 		ill_remove_saved_ire(ill, ire);
   6105 	if (ioctl_msg)
   6106 		ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
   6107 	ire_delete(ire);
   6108 	ire_refrele(ire);
   6109 	return (err);
   6110 }
   6111 
   6112 /*
   6113  * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
   6114  */
   6115 /* ARGSUSED */
   6116 int
   6117 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
   6118     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
   6119 {
   6120 	ipaddr_t dst_addr;
   6121 	ipaddr_t gw_addr;
   6122 	ipaddr_t mask;
   6123 	int error = 0;
   6124 	mblk_t *mp1;
   6125 	struct rtentry *rt;
   6126 	ipif_t *ipif = NULL;
   6127 	ip_stack_t	*ipst;
   6128 
   6129 	ASSERT(q->q_next == NULL);
   6130 	ipst = CONNQ_TO_IPST(q);
   6131 
   6132 	ip1dbg(("ip_siocaddrt:"));
   6133 	/* Existence of mp1 verified in ip_wput_nondata */
   6134 	mp1 = mp->b_cont->b_cont;
   6135 	rt = (struct rtentry *)mp1->b_rptr;
   6136 
   6137 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
   6138 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
   6139 
   6140 	/*
   6141 	 * If the RTF_HOST flag is on, this is a request to assign a gateway
   6142 	 * to a particular host address.  In this case, we set the netmask to
   6143 	 * all ones for the particular destination address.  Otherwise,
   6144 	 * determine the netmask to be used based on dst_addr and the interfaces
   6145 	 * in use.
   6146 	 */
   6147 	if (rt->rt_flags & RTF_HOST) {
   6148 		mask = IP_HOST_MASK;
   6149 	} else {
   6150 		/*
   6151 		 * Note that ip_subnet_mask returns a zero mask in the case of
   6152 		 * default (an all-zeroes address).
   6153 		 */
   6154 		mask = ip_subnet_mask(dst_addr, &ipif, ipst);
   6155 	}
   6156 
   6157 	error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
   6158 	    B_TRUE, NULL, ipst, ALL_ZONES);
   6159 	if (ipif != NULL)
   6160 		ipif_refrele(ipif);
   6161 	return (error);
   6162 }
   6163 
   6164 /*
   6165  * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
   6166  */
   6167 /* ARGSUSED */
   6168 int
   6169 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
   6170     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
   6171 {
   6172 	ipaddr_t dst_addr;
   6173 	ipaddr_t gw_addr;
   6174 	ipaddr_t mask;
   6175 	int error;
   6176 	mblk_t *mp1;
   6177 	struct rtentry *rt;
   6178 	ipif_t *ipif = NULL;
   6179 	ip_stack_t	*ipst;
   6180 
   6181 	ASSERT(q->q_next == NULL);
   6182 	ipst = CONNQ_TO_IPST(q);
   6183 
   6184 	ip1dbg(("ip_siocdelrt:"));
   6185 	/* Existence of mp1 verified in ip_wput_nondata */
   6186 	mp1 = mp->b_cont->b_cont;
   6187 	rt = (struct rtentry *)mp1->b_rptr;
   6188 
   6189 	dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
   6190 	gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
   6191 
   6192 	/*
   6193 	 * If the RTF_HOST flag is on, this is a request to delete a gateway
   6194 	 * to a particular host address.  In this case, we set the netmask to
   6195 	 * all ones for the particular destination address.  Otherwise,
   6196 	 * determine the netmask to be used based on dst_addr and the interfaces
   6197 	 * in use.
   6198 	 */
   6199 	if (rt->rt_flags & RTF_HOST) {
   6200 		mask = IP_HOST_MASK;
   6201 	} else {
   6202 		/*
   6203 		 * Note that ip_subnet_mask returns a zero mask in the case of
   6204 		 * default (an all-zeroes address).
   6205 		 */
   6206 		mask = ip_subnet_mask(dst_addr, &ipif, ipst);
   6207 	}
   6208 
   6209 	error = ip_rt_delete(dst_addr, mask, gw_addr,
   6210 	    RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
   6211 	    ipst, ALL_ZONES);
   6212 	if (ipif != NULL)
   6213 		ipif_refrele(ipif);
   6214 	return (error);
   6215 }
   6216 
   6217 /*
   6218  * Enqueue the mp onto the ipsq, chained by b_next.
   6219  * b_prev stores the function to be executed later, and b_queue the queue
   6220  * where this mp originated.
   6221  */
   6222 void
   6223 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
   6224     ill_t *pending_ill)
   6225 {
   6226 	conn_t	*connp;
   6227 	ipxop_t *ipx = ipsq->ipsq_xop;
   6228 
   6229 	ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
   6230 	ASSERT(MUTEX_HELD(&ipx->ipx_lock));
   6231 	ASSERT(func != NULL);
   6232 
   6233 	mp->b_queue = q;
   6234 	mp->b_prev = (void *)func;
   6235 	mp->b_next = NULL;
   6236 
   6237 	switch (type) {
   6238 	case CUR_OP:
   6239 		if (ipx->ipx_mptail != NULL) {
   6240 			ASSERT(ipx->ipx_mphead != NULL);
   6241 			ipx->ipx_mptail->b_next = mp;
   6242 		} else {
   6243 			ASSERT(ipx->ipx_mphead == NULL);
   6244 			ipx->ipx_mphead = mp;
   6245 		}
   6246 		ipx->ipx_mptail = mp;
   6247 		break;
   6248 
   6249 	case NEW_OP:
   6250 		if (ipsq->ipsq_xopq_mptail != NULL) {
   6251 			ASSERT(ipsq->ipsq_xopq_mphead != NULL);
   6252 			ipsq->ipsq_xopq_mptail->b_next = mp;
   6253 		} else {
   6254 			ASSERT(ipsq->ipsq_xopq_mphead == NULL);
   6255 			ipsq->ipsq_xopq_mphead = mp;
   6256 		}
   6257 		ipsq->ipsq_xopq_mptail = mp;
   6258 		ipx->ipx_ipsq_queued = B_TRUE;
   6259 		break;
   6260 
   6261 	case SWITCH_OP:
   6262 		ASSERT(ipsq->ipsq_swxop != NULL);
   6263 		/* only one switch operation is currently allowed */
   6264 		ASSERT(ipsq->ipsq_switch_mp == NULL);
   6265 		ipsq->ipsq_switch_mp = mp;
   6266 		ipx->ipx_ipsq_queued = B_TRUE;
   6267 		break;
   6268 	default:
   6269 		cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
   6270 	}
   6271 
   6272 	if (CONN_Q(q) && pending_ill != NULL) {
   6273 		connp = Q_TO_CONN(q);
   6274 		ASSERT(MUTEX_HELD(&connp->conn_lock));
   6275 		connp->conn_oper_pending_ill = pending_ill;
   6276 	}
   6277 }
   6278 
   6279 /*
   6280  * Dequeue the next message that requested exclusive access to this IPSQ's
   6281  * xop.  Specifically:
   6282  *
   6283  *  1. If we're still processing the current operation on `ipsq', then
   6284  *     dequeue the next message for the operation (from ipx_mphead), or
   6285  *     return NULL if there are no queued messages for the operation.
   6286  *     These messages are queued via CUR_OP to qwriter_ip() and friends.
   6287  *
   6288  *  2. If the current operation on `ipsq' has completed (ipx_current_ipif is
   6289  *     not set) see if the ipsq has requested an xop switch.  If so, switch
   6290  *     `ipsq' to a different xop.  Xop switches only happen when joining or
   6291  *     leaving IPMP groups and require a careful dance -- see the comments
   6292  *     in-line below for details.  If we're leaving a group xop or if we're
   6293  *     joining a group xop and become writer on it, then we proceed to (3).
   6294  *     Otherwise, we return NULL and exit the xop.
   6295  *
   6296  *  3. For each IPSQ in the xop, return any switch operation stored on
   6297  *     ipsq_switch_mp (set via SWITCH_OP); these must be processed before
   6298  *     any other messages queued on the IPSQ.  Otherwise, dequeue the next
   6299  *     exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
   6300  *     Note that if the phyint tied to `ipsq' is not using IPMP there will
   6301  *     only be one IPSQ in the xop.  Otherwise, there will be one IPSQ for
   6302  *     each phyint in the group, including the IPMP meta-interface phyint.
   6303  */
   6304 static mblk_t *
   6305 ipsq_dq(ipsq_t *ipsq)
   6306 {
   6307 	ill_t	*illv4, *illv6;
   6308 	mblk_t	*mp;
   6309 	ipsq_t	*xopipsq;
   6310 	ipsq_t	*leftipsq = NULL;
   6311 	ipxop_t *ipx;
   6312 	phyint_t *phyi = ipsq->ipsq_phyint;
   6313 	ip_stack_t *ipst = ipsq->ipsq_ipst;
   6314 	boolean_t emptied = B_FALSE;
   6315 
   6316 	/*
   6317 	 * Grab all the locks we need in the defined order (ill_g_lock ->
   6318 	 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
   6319 	 */
   6320 	rw_enter(&ipst->ips_ill_g_lock,
   6321 	    ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
   6322 	mutex_enter(&ipsq->ipsq_lock);
   6323 	ipx = ipsq->ipsq_xop;
   6324 	mutex_enter(&ipx->ipx_lock);
   6325 
   6326 	/*
   6327 	 * Dequeue the next message associated with the current exclusive
   6328 	 * operation, if any.
   6329 	 */
   6330 	if ((mp = ipx->ipx_mphead) != NULL) {
   6331 		ipx->ipx_mphead = mp->b_next;
   6332 		if (ipx->ipx_mphead == NULL)
   6333 			ipx->ipx_mptail = NULL;
   6334 		mp->b_next = (void *)ipsq;
   6335 		goto out;
   6336 	}
   6337 
   6338 	if (ipx->ipx_current_ipif != NULL)
   6339 		goto empty;
   6340 
   6341 	if (ipsq->ipsq_swxop != NULL) {
   6342 		/*
   6343 		 * The exclusive operation that is now being completed has
   6344 		 * requested a switch to a different xop.  This happens
   6345 		 * when an interface joins or leaves an IPMP group.  Joins
   6346 		 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
   6347 		 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
   6348 		 * (phyint_free()), or interface plumb for an ill type
   6349 		 * not in the IPMP group (ip_rput_dlpi_writer()).
   6350 		 *
   6351 		 * Xop switches are not allowed on the IPMP meta-interface.
   6352 		 */
   6353 		ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
   6354 		ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
   6355 		DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
   6356 
   6357 		if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
   6358 			/*
   6359 			 * We're switching back to our own xop, so we have two
   6360 			 * xop's to drain/exit: our own, and the group xop
   6361 			 * that we are leaving.
   6362 			 *
   6363 			 * First, pull ourselves out of the group ipsq list.
   6364 			 * This is safe since we're writer on ill_g_lock.
   6365 			 */
   6366 			ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
   6367 
   6368 			xopipsq = ipx->ipx_ipsq;
   6369 			while (xopipsq->ipsq_next != ipsq)
   6370 				xopipsq = xopipsq->ipsq_next;
   6371 
   6372 			xopipsq->ipsq_next = ipsq->ipsq_next;
   6373 			ipsq->ipsq_next = ipsq;
   6374 			ipsq->ipsq_xop = ipsq->ipsq_swxop;
   6375 			ipsq->ipsq_swxop = NULL;
   6376 
   6377 			/*
   6378 			 * Second, prepare to exit the group xop.  The actual
   6379 			 * ipsq_exit() is done at the end of this function
   6380 			 * since we cannot hold any locks across ipsq_exit().
   6381 			 * Note that although we drop the group's ipx_lock, no
   6382 			 * threads can proceed since we're still ipx_writer.
   6383 			 */
   6384 			leftipsq = xopipsq;
   6385 			mutex_exit(&ipx->ipx_lock);
   6386 
   6387 			/*
   6388 			 * Third, set ipx to point to our own xop (which was
   6389 			 * inactive and therefore can be entered).
   6390 			 */
   6391 			ipx = ipsq->ipsq_xop;
   6392 			mutex_enter(&ipx->ipx_lock);
   6393 			ASSERT(ipx->ipx_writer == NULL);
   6394 			ASSERT(ipx->ipx_current_ipif == NULL);
   6395 		} else {
   6396 			/*
   6397 			 * We're switching from our own xop to a group xop.
   6398 			 * The requestor of the switch must ensure that the
   6399 			 * group xop cannot go away (e.g. by ensuring the
   6400 			 * phyint associated with the xop cannot go away).
   6401 			 *
   6402 			 * If we can become writer on our new xop, then we'll
   6403 			 * do the drain.  Otherwise, the current writer of our
   6404 			 * new xop will do the drain when it exits.
   6405 			 *
   6406 			 * First, splice ourselves into the group IPSQ list.
   6407 			 * This is safe since we're writer on ill_g_lock.
   6408 			 */
   6409 			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
   6410 
   6411 			xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
   6412 			while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
   6413 				xopipsq = xopipsq->ipsq_next;
   6414 
   6415 			xopipsq->ipsq_next = ipsq;
   6416 			ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
   6417 			ipsq->ipsq_xop = ipsq->ipsq_swxop;
   6418 			ipsq->ipsq_swxop = NULL;
   6419 
   6420 			/*
   6421 			 * Second, exit our own xop, since it's now unused.
   6422 			 * This is safe since we've got the only reference.
   6423 			 */
   6424 			ASSERT(ipx->ipx_writer == curthread);
   6425 			ipx->ipx_writer = NULL;
   6426 			VERIFY(--ipx->ipx_reentry_cnt == 0);
   6427 			ipx->ipx_ipsq_queued = B_FALSE;
   6428 			mutex_exit(&ipx->ipx_lock);
   6429 
   6430 			/*
   6431 			 * Third, set ipx to point to our new xop, and check
   6432 			 * if we can become writer on it.  If we cannot, then
   6433 			 * the current writer will drain the IPSQ group when
   6434 			 * it exits.  Our ipsq_xop is guaranteed to be stable
   6435 			 * because we're still holding ipsq_lock.
   6436 			 */
   6437 			ipx = ipsq->ipsq_xop;
   6438 			mutex_enter(&ipx->ipx_lock);
   6439 			if (ipx->ipx_writer != NULL ||
   6440 			    ipx->ipx_current_ipif != NULL) {
   6441 				goto out;
   6442 			}
   6443 		}
   6444 
   6445 		/*
   6446 		 * Fourth, become writer on our new ipx before we continue
   6447 		 * with the drain.  Note that we never dropped ipsq_lock
   6448 		 * above, so no other thread could've raced with us to
   6449 		 * become writer first.  Also, we're holding ipx_lock, so
   6450 		 * no other thread can examine the ipx right now.
   6451 		 */
   6452 		ASSERT(ipx->ipx_current_ipif == NULL);
   6453 		ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
   6454 		VERIFY(ipx->ipx_reentry_cnt++ == 0);
   6455 		ipx->ipx_writer = curthread;
   6456 		ipx->ipx_forced = B_FALSE;
   6457 #ifdef DEBUG
   6458 		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   6459 #endif
   6460 	}
   6461 
   6462 	xopipsq = ipsq;
   6463 	do {
   6464 		/*
   6465 		 * So that other operations operate on a consistent and
   6466 		 * complete phyint, a switch message on an IPSQ must be
   6467 		 * handled prior to any other operations on that IPSQ.
   6468 		 */
   6469 		if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
   6470 			xopipsq->ipsq_switch_mp = NULL;
   6471 			ASSERT(mp->b_next == NULL);
   6472 			mp->b_next = (void *)xopipsq;
   6473 			goto out;
   6474 		}
   6475 
   6476 		if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
   6477 			xopipsq->ipsq_xopq_mphead = mp->b_next;
   6478 			if (xopipsq->ipsq_xopq_mphead == NULL)
   6479 				xopipsq->ipsq_xopq_mptail = NULL;
   6480 			mp->b_next = (void *)xopipsq;
   6481 			goto out;
   6482 		}
   6483 	} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
   6484 empty:
   6485 	/*
   6486 	 * There are no messages.  Further, we are holding ipx_lock, hence no
   6487 	 * new messages can end up on any IPSQ in the xop.
   6488 	 */
   6489 	ipx->ipx_writer = NULL;
   6490 	ipx->ipx_forced = B_FALSE;
   6491 	VERIFY(--ipx->ipx_reentry_cnt == 0);
   6492 	ipx->ipx_ipsq_queued = B_FALSE;
   6493 	emptied = B_TRUE;
   6494 #ifdef	DEBUG
   6495 	ipx->ipx_depth = 0;
   6496 #endif
   6497 out:
   6498 	mutex_exit(&ipx->ipx_lock);
   6499 	mutex_exit(&ipsq->ipsq_lock);
   6500 
   6501 	/*
   6502 	 * If we completely emptied the xop, then wake up any threads waiting
   6503 	 * to enter any of the IPSQ's associated with it.
   6504 	 */
   6505 	if (emptied) {
   6506 		xopipsq = ipsq;
   6507 		do {
   6508 			if ((phyi = xopipsq->ipsq_phyint) == NULL)
   6509 				continue;
   6510 
   6511 			illv4 = phyi->phyint_illv4;
   6512 			illv6 = phyi->phyint_illv6;
   6513 
   6514 			GRAB_ILL_LOCKS(illv4, illv6);
   6515 			if (illv4 != NULL)
   6516 				cv_broadcast(&illv4->ill_cv);
   6517 			if (illv6 != NULL)
   6518 				cv_broadcast(&illv6->ill_cv);
   6519 			RELEASE_ILL_LOCKS(illv4, illv6);
   6520 		} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
   6521 	}
   6522 	rw_exit(&ipst->ips_ill_g_lock);
   6523 
   6524 	/*
   6525 	 * Now that all locks are dropped, exit the IPSQ we left.
   6526 	 */
   6527 	if (leftipsq != NULL)
   6528 		ipsq_exit(leftipsq);
   6529 
   6530 	return (mp);
   6531 }
   6532 
   6533 /*
   6534  * Return completion status of previously initiated DLPI operations on
   6535  * ills in the purview of an ipsq.
   6536  */
   6537 static boolean_t
   6538 ipsq_dlpi_done(ipsq_t *ipsq)
   6539 {
   6540 	ipsq_t		*ipsq_start;
   6541 	phyint_t	*phyi;
   6542 	ill_t		*ill;
   6543 
   6544 	ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
   6545 	ipsq_start = ipsq;
   6546 
   6547 	do {
   6548 		/*
   6549 		 * The only current users of this function are ipsq_try_enter
   6550 		 * and ipsq_enter which have made sure that ipsq_writer is
   6551 		 * NULL before we reach here. ill_dlpi_pending is modified
   6552 		 * only by an ipsq writer
   6553 		 */
   6554 		ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
   6555 		phyi = ipsq->ipsq_phyint;
   6556 		/*
   6557 		 * phyi could be NULL if a phyint that is part of an
   6558 		 * IPMP group is being unplumbed. A more detailed
   6559 		 * comment is in ipmp_grp_update_kstats()
   6560 		 */
   6561 		if (phyi != NULL) {
   6562 			ill = phyi->phyint_illv4;
   6563 			if (ill != NULL &&
   6564 			    (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
   6565 			    ill->ill_arl_dlpi_pending))
   6566 				return (B_FALSE);
   6567 
   6568 			ill = phyi->phyint_illv6;
   6569 			if (ill != NULL &&
   6570 			    ill->ill_dlpi_pending != DL_PRIM_INVAL)
   6571 				return (B_FALSE);
   6572 		}
   6573 
   6574 	} while ((ipsq = ipsq->ipsq_next) != ipsq_start);
   6575 
   6576 	return (B_TRUE);
   6577 }
   6578 
   6579 /*
   6580  * Enter the ipsq corresponding to ill, by waiting synchronously till
   6581  * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
   6582  * will have to drain completely before ipsq_enter returns success.
   6583  * ipx_current_ipif will be set if some exclusive op is in progress,
   6584  * and the ipsq_exit logic will start the next enqueued op after
   6585  * completion of the current op. If 'force' is used, we don't wait
   6586  * for the enqueued ops. This is needed when a conn_close wants to
   6587  * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
   6588  * of an ill can also use this option. But we dont' use it currently.
   6589  */
   6590 #define	ENTER_SQ_WAIT_TICKS 100
   6591 boolean_t
   6592 ipsq_enter(ill_t *ill, boolean_t force, int type)
   6593 {
   6594 	ipsq_t	*ipsq;
   6595 	ipxop_t *ipx;
   6596 	boolean_t waited_enough = B_FALSE;
   6597 	ip_stack_t *ipst = ill->ill_ipst;
   6598 
   6599 	/*
   6600 	 * Note that the relationship between ill and ipsq is fixed as long as
   6601 	 * the ill is not ILL_CONDEMNED.  Holding ipsq_lock ensures the
   6602 	 * relationship between the IPSQ and xop cannot change.  However,
   6603 	 * since we cannot hold ipsq_lock across the cv_wait(), it may change
   6604 	 * while we're waiting.  We wait on ill_cv and rely on ipsq_exit()
   6605 	 * waking up all ills in the xop when it becomes available.
   6606 	 */
   6607 	for (;;) {
   6608 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6609 		mutex_enter(&ill->ill_lock);
   6610 		if (ill->ill_state_flags & ILL_CONDEMNED) {
   6611 			mutex_exit(&ill->ill_lock);
   6612 			rw_exit(&ipst->ips_ill_g_lock);
   6613 			return (B_FALSE);
   6614 		}
   6615 
   6616 		ipsq = ill->ill_phyint->phyint_ipsq;
   6617 		mutex_enter(&ipsq->ipsq_lock);
   6618 		ipx = ipsq->ipsq_xop;
   6619 		mutex_enter(&ipx->ipx_lock);
   6620 
   6621 		if (ipx->ipx_writer == NULL && (type == CUR_OP ||
   6622 		    (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
   6623 		    waited_enough))
   6624 			break;
   6625 
   6626 		rw_exit(&ipst->ips_ill_g_lock);
   6627 
   6628 		if (!force || ipx->ipx_writer != NULL) {
   6629 			mutex_exit(&ipx->ipx_lock);
   6630 			mutex_exit(&ipsq->ipsq_lock);
   6631 			cv_wait(&ill->ill_cv, &ill->ill_lock);
   6632 		} else {
   6633 			mutex_exit(&ipx->ipx_lock);
   6634 			mutex_exit(&ipsq->ipsq_lock);
   6635 			(void) cv_reltimedwait(&ill->ill_cv,
   6636 			    &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
   6637 			waited_enough = B_TRUE;
   6638 		}
   6639 		mutex_exit(&ill->ill_lock);
   6640 	}
   6641 
   6642 	ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
   6643 	ASSERT(ipx->ipx_reentry_cnt == 0);
   6644 	ipx->ipx_writer = curthread;
   6645 	ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
   6646 	ipx->ipx_reentry_cnt++;
   6647 #ifdef DEBUG
   6648 	ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   6649 #endif
   6650 	mutex_exit(&ipx->ipx_lock);
   6651 	mutex_exit(&ipsq->ipsq_lock);
   6652 	mutex_exit(&ill->ill_lock);
   6653 	rw_exit(&ipst->ips_ill_g_lock);
   6654 
   6655 	return (B_TRUE);
   6656 }
   6657 
   6658 /*
   6659  * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
   6660  * across the call to the core interface ipsq_try_enter() and hence calls this
   6661  * function directly. This is explained more fully in ipif_set_values().
   6662  * In order to support the above constraint, ipsq_try_enter is implemented as
   6663  * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently
   6664  */
   6665 static ipsq_t *
   6666 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
   6667     int type, boolean_t reentry_ok)
   6668 {
   6669 	ipsq_t	*ipsq;
   6670 	ipxop_t	*ipx;
   6671 	ip_stack_t *ipst = ill->ill_ipst;
   6672 
   6673 	/*
   6674 	 * lock ordering:
   6675 	 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
   6676 	 *
   6677 	 * ipx of an ipsq can't change when ipsq_lock is held.
   6678 	 */
   6679 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   6680 	GRAB_CONN_LOCK(q);
   6681 	mutex_enter(&ill->ill_lock);
   6682 	ipsq = ill->ill_phyint->phyint_ipsq;
   6683 	mutex_enter(&ipsq->ipsq_lock);
   6684 	ipx = ipsq->ipsq_xop;
   6685 	mutex_enter(&ipx->ipx_lock);
   6686 
   6687 	/*
   6688 	 * 1. Enter the ipsq if we are already writer and reentry is ok.
   6689 	 *    (Note: If the caller does not specify reentry_ok then neither
   6690 	 *    'func' nor any of its callees must ever attempt to enter the ipsq
   6691 	 *    again. Otherwise it can lead to an infinite loop
   6692 	 * 2. Enter the ipsq if there is no current writer and this attempted
   6693 	 *    entry is part of the current operation
   6694 	 * 3. Enter the ipsq if there is no current writer and this is a new
   6695 	 *    operation and the operation queue is empty and there is no
   6696 	 *    operation currently in progress and if all previously initiated
   6697 	 *    DLPI operations have completed.
   6698 	 */
   6699 	if ((ipx->ipx_writer == curthread && reentry_ok) ||
   6700 	    (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
   6701 	    !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
   6702 	    ipsq_dlpi_done(ipsq))))) {
   6703 		/* Success. */
   6704 		ipx->ipx_reentry_cnt++;
   6705 		ipx->ipx_writer = curthread;
   6706 		ipx->ipx_forced = B_FALSE;
   6707 		mutex_exit(&ipx->ipx_lock);
   6708 		mutex_exit(&ipsq->ipsq_lock);
   6709 		mutex_exit(&ill->ill_lock);
   6710 		RELEASE_CONN_LOCK(q);
   6711 #ifdef DEBUG
   6712 		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
   6713 #endif
   6714 		return (ipsq);
   6715 	}
   6716 
   6717 	if (func != NULL)
   6718 		ipsq_enq(ipsq, q, mp, func, type, ill);
   6719 
   6720 	mutex_exit(&ipx->ipx_lock);
   6721 	mutex_exit(&ipsq->ipsq_lock);
   6722 	mutex_exit(&ill->ill_lock);
   6723 	RELEASE_CONN_LOCK(q);
   6724 	return (NULL);
   6725 }
   6726 
/*
 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
 * certain critical operations like plumbing (i.e. most set ioctls), etc.
 * There is one ipsq per phyint.  The ipsq serializes exclusive ioctls issued
 * by applications on a per-ipsq basis in ipsq_xopq_mphead.  It also protects
 * against multiple threads executing in the ipsq.  Responses from the driver
 * pertain to the current ioctl (say a DL_BIND_ACK in response to a
 * DL_BIND_REQ initiated as part of bringing up the interface) and are
 * enqueued in ipx_mphead.
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (to be called
 * later when the ipsq is empty) nor any code path starting from that reentry
 * point ever tries to enter the ipsq again.  Otherwise it can lead to an
 * infinite loop.  The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
 * the reentry point.  When the list at ipx_mphead becomes empty, ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed.  If the current ioctl is still
 * in progress it simply returns.  The current ioctl could be waiting for
 * a response from another module (the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero.  In such a case the
 * ipx_pending_mp and ipx_pending_ipif are set.  ipx_current_ipif is set
 * throughout the execution of the ioctl and ipsq_exit does not start the
 * next ioctl unless ipx_current_ipif is NULL, which happens only once the
 * ioctl is complete and all associated DLPI operations have completed.
 */
   6755 
   6756 /*
   6757  * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
   6758  * and `ill' cannot both be specified).  Returns a pointer to the entered IPSQ
   6759  * on success, or NULL on failure.  The caller ensures ipif/ill is valid by
   6760  * refholding it as necessary.  If the IPSQ cannot be entered and `func' is
   6761  * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
   6762  * can be entered.  If `func' is NULL, then `q' and `mp' are ignored.
   6763  */
   6764 ipsq_t *
   6765 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
   6766     ipsq_func_t func, int type, boolean_t reentry_ok)
   6767 {
   6768 	ip_stack_t	*ipst;
   6769 	ipsq_t		*ipsq;
   6770 
   6771 	/* Only 1 of ipif or ill can be specified */
   6772 	ASSERT((ipif != NULL) ^ (ill != NULL));
   6773 
   6774 	if (ipif != NULL)
   6775 		ill = ipif->ipif_ill;
   6776 	ipst = ill->ill_ipst;
   6777 
   6778 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   6779 	ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
   6780 	rw_exit(&ipst->ips_ill_g_lock);
   6781 
   6782 	return (ipsq);
   6783 }
   6784 
   6785 /*
   6786  * Try to enter the IPSQ corresponding to `ill' as writer.  The caller ensures
   6787  * ill is valid by refholding it if necessary; we will refrele.  If the IPSQ
   6788  * cannot be entered, the mp is queued for completion.
   6789  */
   6790 void
   6791 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
   6792     boolean_t reentry_ok)
   6793 {
   6794 	ipsq_t	*ipsq;
   6795 
   6796 	ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
   6797 
   6798 	/*
   6799 	 * Drop the caller's refhold on the ill.  This is safe since we either
   6800 	 * entered the IPSQ (and thus are exclusive), or failed to enter the
   6801 	 * IPSQ, in which case we return without accessing ill anymore.  This
   6802 	 * is needed because func needs to see the correct refcount.
   6803 	 * e.g. removeif can work only then.
   6804 	 */
   6805 	ill_refrele(ill);
   6806 	if (ipsq != NULL) {
   6807 		(*func)(ipsq, q, mp, NULL);
   6808 		ipsq_exit(ipsq);
   6809 	}
   6810 }
   6811 
   6812 /*
   6813  * Exit the specified IPSQ.  If this is the final exit on it then drain it
   6814  * prior to exiting.  Caller must be writer on the specified IPSQ.
   6815  */
   6816 void
   6817 ipsq_exit(ipsq_t *ipsq)
   6818 {
   6819 	mblk_t *mp;
   6820 	ipsq_t *mp_ipsq;
   6821 	queue_t	*q;
   6822 	phyint_t *phyi;
   6823 	ipsq_func_t func;
   6824 
   6825 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6826 
   6827 	ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
   6828 	if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
   6829 		ipsq->ipsq_xop->ipx_reentry_cnt--;
   6830 		return;
   6831 	}
   6832 
   6833 	for (;;) {
   6834 		phyi = ipsq->ipsq_phyint;
   6835 		mp = ipsq_dq(ipsq);
   6836 		mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
   6837 
   6838 		/*
   6839 		 * If we've changed to a new IPSQ, and the phyint associated
   6840 		 * with the old one has gone away, free the old IPSQ.  Note
   6841 		 * that this cannot happen while the IPSQ is in a group.
   6842 		 */
   6843 		if (mp_ipsq != ipsq && phyi == NULL) {
   6844 			ASSERT(ipsq->ipsq_next == ipsq);
   6845 			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
   6846 			ipsq_delete(ipsq);
   6847 		}
   6848 
   6849 		if (mp == NULL)
   6850 			break;
   6851 
   6852 		q = mp->b_queue;
   6853 		func = (ipsq_func_t)mp->b_prev;
   6854 		ipsq = mp_ipsq;
   6855 		mp->b_next = mp->b_prev = NULL;
   6856 		mp->b_queue = NULL;
   6857 
   6858 		/*
   6859 		 * If 'q' is an conn queue, it is valid, since we did a
   6860 		 * a refhold on the conn at the start of the ioctl.
   6861 		 * If 'q' is an ill queue, it is valid, since close of an
   6862 		 * ill will clean up its IPSQ.
   6863 		 */
   6864 		(*func)(ipsq, q, mp, NULL);
   6865 	}
   6866 }
   6867 
   6868 /*
   6869  * Used to start any igmp or mld timers that could not be started
   6870  * while holding ill_mcast_lock. The timers can't be started while holding
   6871  * the lock, since mld/igmp_start_timers may need to call untimeout()
   6872  * which can't be done while holding the lock which the timeout handler
   6873  * acquires. Otherwise
   6874  * there could be a deadlock since the timeout handlers
   6875  * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
   6876  * ill_mcast_lock.
   6877  */
   6878 void
   6879 ill_mcast_timer_start(ip_stack_t *ipst)
   6880 {
   6881 	int		next;
   6882 
   6883 	mutex_enter(&ipst->ips_igmp_timer_lock);
   6884 	next = ipst->ips_igmp_deferred_next;
   6885 	ipst->ips_igmp_deferred_next = INFINITY;
   6886 	mutex_exit(&ipst->ips_igmp_timer_lock);
   6887 
   6888 	if (next != INFINITY)
   6889 		igmp_start_timers(next, ipst);
   6890 
   6891 	mutex_enter(&ipst->ips_mld_timer_lock);
   6892 	next = ipst->ips_mld_deferred_next;
   6893 	ipst->ips_mld_deferred_next = INFINITY;
   6894 	mutex_exit(&ipst->ips_mld_timer_lock);
   6895 
   6896 	if (next != INFINITY)
   6897 		mld_start_timers(next, ipst);
   6898 }
   6899 
   6900 /*
   6901  * Start the current exclusive operation on `ipsq'; associate it with `ipif'
   6902  * and `ioccmd'.
   6903  */
   6904 void
   6905 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
   6906 {
   6907 	ill_t *ill = ipif->ipif_ill;
   6908 	ipxop_t *ipx = ipsq->ipsq_xop;
   6909 
   6910 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6911 	ASSERT(ipx->ipx_current_ipif == NULL);
   6912 	ASSERT(ipx->ipx_current_ioctl == 0);
   6913 
   6914 	ipx->ipx_current_done = B_FALSE;
   6915 	ipx->ipx_current_ioctl = ioccmd;
   6916 	mutex_enter(&ipx->ipx_lock);
   6917 	ipx->ipx_current_ipif = ipif;
   6918 	mutex_exit(&ipx->ipx_lock);
   6919 
   6920 	/*
   6921 	 * Set IPIF_CHANGING on one or more ipifs associated with the
   6922 	 * current exclusive operation.  IPIF_CHANGING prevents any new
   6923 	 * references to the ipif (so that the references will eventually
   6924 	 * drop to zero) and also prevents any "get" operations (e.g.,
   6925 	 * SIOCGLIFFLAGS) from being able to access the ipif until the
   6926 	 * operation has completed and the ipif is again in a stable state.
   6927 	 *
   6928 	 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
   6929 	 * ioctl.  For internal operations (where ioccmd is zero), all ipifs
   6930 	 * on the ill are marked with IPIF_CHANGING since it's unclear which
   6931 	 * ipifs will be affected.
   6932 	 *
   6933 	 * Note that SIOCLIFREMOVEIF is a special case as it sets
   6934 	 * IPIF_CONDEMNED internally after identifying the right ipif to
   6935 	 * operate on.
   6936 	 */
   6937 	switch (ioccmd) {
   6938 	case SIOCLIFREMOVEIF:
   6939 		break;
   6940 	case 0:
   6941 		mutex_enter(&ill->ill_lock);
   6942 		ipif = ipif->ipif_ill->ill_ipif;
   6943 		for (; ipif != NULL; ipif = ipif->ipif_next)
   6944 			ipif->ipif_state_flags |= IPIF_CHANGING;
   6945 		mutex_exit(&ill->ill_lock);
   6946 		break;
   6947 	default:
   6948 		mutex_enter(&ill->ill_lock);
   6949 		ipif->ipif_state_flags |= IPIF_CHANGING;
   6950 		mutex_exit(&ill->ill_lock);
   6951 	}
   6952 }
   6953 
   6954 /*
   6955  * Finish the current exclusive operation on `ipsq'.  Usually, this will allow
   6956  * the next exclusive operation to begin once we ipsq_exit().  However, if
   6957  * pending DLPI operations remain, then we will wait for the queue to drain
   6958  * before allowing the next exclusive operation to begin.  This ensures that
   6959  * DLPI operations from one exclusive operation are never improperly processed
   6960  * as part of a subsequent exclusive operation.
   6961  */
   6962 void
   6963 ipsq_current_finish(ipsq_t *ipsq)
   6964 {
   6965 	ipxop_t	*ipx = ipsq->ipsq_xop;
   6966 	t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
   6967 	ipif_t	*ipif = ipx->ipx_current_ipif;
   6968 
   6969 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   6970 
   6971 	/*
   6972 	 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
   6973 	 * (but in that case, IPIF_CHANGING will already be clear and no
   6974 	 * pending DLPI messages can remain).
   6975 	 */
   6976 	if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
   6977 		ill_t *ill = ipif->ipif_ill;
   6978 
   6979 		mutex_enter(&ill->ill_lock);
   6980 		dlpi_pending = ill->ill_dlpi_pending;
   6981 		if (ipx->ipx_current_ioctl == 0) {
   6982 			ipif = ill->ill_ipif;
   6983 			for (; ipif != NULL; ipif = ipif->ipif_next)
   6984 				ipif->ipif_state_flags &= ~IPIF_CHANGING;
   6985 		} else {
   6986 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
   6987 		}
   6988 		mutex_exit(&ill->ill_lock);
   6989 	}
   6990 
   6991 	ASSERT(!ipx->ipx_current_done);
   6992 	ipx->ipx_current_done = B_TRUE;
   6993 	ipx->ipx_current_ioctl = 0;
   6994 	if (dlpi_pending == DL_PRIM_INVAL) {
   6995 		mutex_enter(&ipx->ipx_lock);
   6996 		ipx->ipx_current_ipif = NULL;
   6997 		mutex_exit(&ipx->ipx_lock);
   6998 	}
   6999 }
   7000 
   7001 /*
   7002  * The ill is closing. Flush all messages on the ipsq that originated
   7003  * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead
   7004  * for this ill since ipsq_enter could not have entered until then.
   7005  * New messages can't be queued since the CONDEMNED flag is set.
   7006  */
   7007 static void
   7008 ipsq_flush(ill_t *ill)
   7009 {
   7010 	queue_t	*q;
   7011 	mblk_t	*prev;
   7012 	mblk_t	*mp;
   7013 	mblk_t	*mp_next;
   7014 	ipxop_t	*ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
   7015 
   7016 	ASSERT(IAM_WRITER_ILL(ill));
   7017 
   7018 	/*
   7019 	 * Flush any messages sent up by the driver.
   7020 	 */
   7021 	mutex_enter(&ipx->ipx_lock);
   7022 	for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
   7023 		mp_next = mp->b_next;
   7024 		q = mp->b_queue;
   7025 		if (q == ill->ill_rq || q == ill->ill_wq) {
   7026 			/* dequeue mp */
   7027 			if (prev == NULL)
   7028 				ipx->ipx_mphead = mp->b_next;
   7029 			else
   7030 				prev->b_next = mp->b_next;
   7031 			if (ipx->ipx_mptail == mp) {
   7032 				ASSERT(mp_next == NULL);
   7033 				ipx->ipx_mptail = prev;
   7034 			}
   7035 			inet_freemsg(mp);
   7036 		} else {
   7037 			prev = mp;
   7038 		}
   7039 	}
   7040 	mutex_exit(&ipx->ipx_lock);
   7041 	(void) ipsq_pending_mp_cleanup(ill, NULL);
   7042 	ipsq_xopq_mp_cleanup(ill, NULL);
   7043 }
   7044 
   7045 /*
   7046  * Parse an ifreq or lifreq struct coming down ioctls and refhold
   7047  * and return the associated ipif.
   7048  * Return value:
   7049  *	Non zero: An error has occurred. ci may not be filled out.
   7050  *	zero : ci is filled out with the ioctl cmd in ci.ci_name, and
   7051  *	a held ipif in ci.ci_ipif.
   7052  */
   7053 int
   7054 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
   7055     cmd_info_t *ci)
   7056 {
   7057 	char		*name;
   7058 	struct ifreq    *ifr;
   7059 	struct lifreq    *lifr;
   7060 	ipif_t		*ipif = NULL;
   7061 	ill_t		*ill;
   7062 	conn_t		*connp;
   7063 	boolean_t	isv6;
   7064 	boolean_t	exists;
   7065 	mblk_t		*mp1;
   7066 	zoneid_t	zoneid;
   7067 	ip_stack_t	*ipst;
   7068 
   7069 	if (q->q_next != NULL) {
   7070 		ill = (ill_t *)q->q_ptr;
   7071 		isv6 = ill->ill_isv6;
   7072 		connp = NULL;
   7073 		zoneid = ALL_ZONES;
   7074 		ipst = ill->ill_ipst;
   7075 	} else {
   7076 		ill = NULL;
   7077 		connp = Q_TO_CONN(q);
   7078 		isv6 = (connp->conn_family == AF_INET6);
   7079 		zoneid = connp->conn_zoneid;
   7080 		if (zoneid == GLOBAL_ZONEID) {
   7081 			/* global zone can access ipifs in all zones */
   7082 			zoneid = ALL_ZONES;
   7083 		}
   7084 		ipst = connp->conn_netstack->netstack_ip;
   7085 	}
   7086 
   7087 	/* Has been checked in ip_wput_nondata */
   7088 	mp1 = mp->b_cont->b_cont;
   7089 
   7090 	if (ipip->ipi_cmd_type == IF_CMD) {
   7091 		/* This a old style SIOC[GS]IF* command */
   7092 		ifr = (struct ifreq *)mp1->b_rptr;
   7093 		/*
   7094 		 * Null terminate the string to protect against buffer
   7095 		 * overrun. String was generated by user code and may not
   7096 		 * be trusted.
   7097 		 */
   7098 		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
   7099 		name = ifr->ifr_name;
   7100 		ci->ci_sin = (sin_t *)&ifr->ifr_addr;
   7101 		ci->ci_sin6 = NULL;
   7102 		ci->ci_lifr = (struct lifreq *)ifr;
   7103 	} else {
   7104 		/* This a new style SIOC[GS]LIF* command */
   7105 		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
   7106 		lifr = (struct lifreq *)mp1->b_rptr;
   7107 		/*
   7108 		 * Null terminate the string to protect against buffer
   7109 		 * overrun. String was generated by user code and may not
   7110 		 * be trusted.
   7111 		 */
   7112 		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
   7113 		name = lifr->lifr_name;
   7114 		ci->ci_sin = (sin_t *)&lifr->lifr_addr;
   7115 		ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
   7116 		ci->ci_lifr = lifr;
   7117 	}
   7118 
   7119 	if (ipip->ipi_cmd == SIOCSLIFNAME) {
   7120 		/*
   7121 		 * The ioctl will be failed if the ioctl comes down
   7122 		 * an conn stream
   7123 		 */
   7124 		if (ill == NULL) {
   7125 			/*
   7126 			 * Not an ill queue, return EINVAL same as the
   7127 			 * old error code.
   7128 			 */
   7129 			return (ENXIO);
   7130 		}
   7131 		ipif = ill->ill_ipif;
   7132 		ipif_refhold(ipif);
   7133 	} else {
   7134 		ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
   7135 		    &exists, isv6, zoneid, ipst);
   7136 
   7137 		/*
   7138 		 * Ensure that get ioctls don't see any internal state changes
   7139 		 * caused by set ioctls by deferring them if IPIF_CHANGING is
   7140 		 * set.
   7141 		 */
   7142 		if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) &&
   7143 		    !IAM_WRITER_IPIF(ipif)) {
   7144 			ipsq_t	*ipsq;
   7145 
   7146 			if (connp != NULL)
   7147 				mutex_enter(&connp->conn_lock);
   7148 			mutex_enter(&ipif->ipif_ill->ill_lock);
   7149 			if (IPIF_IS_CHANGING(ipif) &&
   7150 			    !IPIF_IS_CONDEMNED(ipif)) {
   7151 				ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
   7152 				mutex_enter(&ipsq->ipsq_lock);
   7153 				mutex_enter(&ipsq->ipsq_xop->ipx_lock);
   7154 				mutex_exit(&ipif->ipif_ill->ill_lock);
   7155 				ipsq_enq(ipsq, q, mp, ip_process_ioctl,
   7156 				    NEW_OP, ipif->ipif_ill);
   7157 				mutex_exit(&ipsq->ipsq_xop->ipx_lock);
   7158 				mutex_exit(&ipsq->ipsq_lock);
   7159 				if (connp != NULL)
   7160 					mutex_exit(&connp->conn_lock);
   7161 				ipif_refrele(ipif);
   7162 				return (EINPROGRESS);
   7163 			}
   7164 			mutex_exit(&ipif->ipif_ill->ill_lock);
   7165 			if (connp != NULL)
   7166 				mutex_exit(&connp->conn_lock);
   7167 		}
   7168 	}
   7169 
   7170 	/*
   7171 	 * Old style [GS]IFCMD does not admit IPv6 ipif
   7172 	 */
   7173 	if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
   7174 		ipif_refrele(ipif);
   7175 		return (ENXIO);
   7176 	}
   7177 
   7178 	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
   7179 	    name[0] == '\0') {
   7180 		/*
   7181 		 * Handle a or a SIOC?IF* with a null name
   7182 		 * during plumb (on the ill queue before the I_PLINK).
   7183 		 */
   7184 		ipif = ill->ill_ipif;
   7185 		ipif_refhold(ipif);
   7186 	}
   7187 
   7188 	if (ipif == NULL)
   7189 		return (ENXIO);
   7190 
   7191 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
   7192 	    int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
   7193 
   7194 	ci->ci_ipif = ipif;
   7195 	return (0);
   7196 }
   7197 
   7198 /*
   7199  * Return the total number of ipifs.
   7200  */
   7201 static uint_t
   7202 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
   7203 {
   7204 	uint_t numifs = 0;
   7205 	ill_t	*ill;
   7206 	ill_walk_context_t	ctx;
   7207 	ipif_t	*ipif;
   7208 
   7209 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7210 	ill = ILL_START_WALK_V4(&ctx, ipst);
   7211 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7212 		if (IS_UNDER_IPMP(ill))
   7213 			continue;
   7214 		for (ipif = ill->ill_ipif; ipif != NULL;
   7215 		    ipif = ipif->ipif_next) {
   7216 			if (ipif->ipif_zoneid == zoneid ||
   7217 			    ipif->ipif_zoneid == ALL_ZONES)
   7218 				numifs++;
   7219 		}
   7220 	}
   7221 	rw_exit(&ipst->ips_ill_g_lock);
   7222 	return (numifs);
   7223 }
   7224 
   7225 /*
   7226  * Return the total number of ipifs.
   7227  */
   7228 static uint_t
   7229 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
   7230 {
   7231 	uint_t numifs = 0;
   7232 	ill_t	*ill;
   7233 	ipif_t	*ipif;
   7234 	ill_walk_context_t	ctx;
   7235 
   7236 	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
   7237 
   7238 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7239 	if (family == AF_INET)
   7240 		ill = ILL_START_WALK_V4(&ctx, ipst);
   7241 	else if (family == AF_INET6)
   7242 		ill = ILL_START_WALK_V6(&ctx, ipst);
   7243 	else
   7244 		ill = ILL_START_WALK_ALL(&ctx, ipst);
   7245 
   7246 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7247 		if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
   7248 			continue;
   7249 
   7250 		for (ipif = ill->ill_ipif; ipif != NULL;
   7251 		    ipif = ipif->ipif_next) {
   7252 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
   7253 			    !(lifn_flags & LIFC_NOXMIT))
   7254 				continue;
   7255 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
   7256 			    !(lifn_flags & LIFC_TEMPORARY))
   7257 				continue;
   7258 			if (((ipif->ipif_flags &
   7259 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
   7260 			    IPIF_DEPRECATED)) ||
   7261 			    IS_LOOPBACK(ill) ||
   7262 			    !(ipif->ipif_flags & IPIF_UP)) &&
   7263 			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
   7264 				continue;
   7265 
   7266 			if (zoneid != ipif->ipif_zoneid &&
   7267 			    ipif->ipif_zoneid != ALL_ZONES &&
   7268 			    (zoneid != GLOBAL_ZONEID ||
   7269 			    !(lifn_flags & LIFC_ALLZONES)))
   7270 				continue;
   7271 
   7272 			numifs++;
   7273 		}
   7274 	}
   7275 	rw_exit(&ipst->ips_ill_g_lock);
   7276 	return (numifs);
   7277 }
   7278 
   7279 uint_t
   7280 ip_get_lifsrcofnum(ill_t *ill)
   7281 {
   7282 	uint_t numifs = 0;
   7283 	ill_t	*ill_head = ill;
   7284 	ip_stack_t	*ipst = ill->ill_ipst;
   7285 
   7286 	/*
   7287 	 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
   7288 	 * other thread may be trying to relink the ILLs in this usesrc group
   7289 	 * and adjusting the ill_usesrc_grp_next pointers
   7290 	 */
   7291 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
   7292 	if ((ill->ill_usesrc_ifindex == 0) &&
   7293 	    (ill->ill_usesrc_grp_next != NULL)) {
   7294 		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
   7295 		    ill = ill->ill_usesrc_grp_next)
   7296 			numifs++;
   7297 	}
   7298 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
   7299 
   7300 	return (numifs);
   7301 }
   7302 
   7303 /* Null values are passed in for ipif, sin, and ifreq */
   7304 /* ARGSUSED */
   7305 int
   7306 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7307     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7308 {
   7309 	int *nump;
   7310 	conn_t *connp = Q_TO_CONN(q);
   7311 
   7312 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
   7313 
   7314 	/* Existence of b_cont->b_cont checked in ip_wput_nondata */
   7315 	nump = (int *)mp->b_cont->b_cont->b_rptr;
   7316 
   7317 	*nump = ip_get_numifs(connp->conn_zoneid,
   7318 	    connp->conn_netstack->netstack_ip);
   7319 	ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
   7320 	return (0);
   7321 }
   7322 
   7323 /* Null values are passed in for ipif, sin, and ifreq */
   7324 /* ARGSUSED */
   7325 int
   7326 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
   7327     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7328 {
   7329 	struct lifnum *lifn;
   7330 	mblk_t	*mp1;
   7331 	conn_t *connp = Q_TO_CONN(q);
   7332 
   7333 	ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
   7334 
   7335 	/* Existence checked in ip_wput_nondata */
   7336 	mp1 = mp->b_cont->b_cont;
   7337 
   7338 	lifn = (struct lifnum *)mp1->b_rptr;
   7339 	switch (lifn->lifn_family) {
   7340 	case AF_UNSPEC:
   7341 	case AF_INET:
   7342 	case AF_INET6:
   7343 		break;
   7344 	default:
   7345 		return (EAFNOSUPPORT);
   7346 	}
   7347 
   7348 	lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
   7349 	    connp->conn_zoneid, connp->conn_netstack->netstack_ip);
   7350 	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
   7351 	return (0);
   7352 }
   7353 
   7354 /* ARGSUSED */
   7355 int
   7356 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7357     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7358 {
   7359 	STRUCT_HANDLE(ifconf, ifc);
   7360 	mblk_t *mp1;
   7361 	struct iocblk *iocp;
   7362 	struct ifreq *ifr;
   7363 	ill_walk_context_t	ctx;
   7364 	ill_t	*ill;
   7365 	ipif_t	*ipif;
   7366 	struct sockaddr_in *sin;
   7367 	int32_t	ifclen;
   7368 	zoneid_t zoneid;
   7369 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   7370 
   7371 	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
   7372 
   7373 	ip1dbg(("ip_sioctl_get_ifconf"));
   7374 	/* Existence verified in ip_wput_nondata */
   7375 	mp1 = mp->b_cont->b_cont;
   7376 	iocp = (struct iocblk *)mp->b_rptr;
   7377 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7378 
   7379 	/*
   7380 	 * The original SIOCGIFCONF passed in a struct ifconf which specified
   7381 	 * the user buffer address and length into which the list of struct
   7382 	 * ifreqs was to be copied.  Since AT&T Streams does not seem to
   7383 	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
   7384 	 * the SIOCGIFCONF operation was redefined to simply provide
   7385 	 * a large output buffer into which we are supposed to jam the ifreq
   7386 	 * array.  The same ioctl command code was used, despite the fact that
   7387 	 * both the applications and the kernel code had to change, thus making
   7388 	 * it impossible to support both interfaces.
   7389 	 *
   7390 	 * For reasons not good enough to try to explain, the following
   7391 	 * algorithm is used for deciding what to do with one of these:
   7392 	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
   7393 	 * form with the output buffer coming down as the continuation message.
   7394 	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
   7395 	 * and we have to copy in the ifconf structure to find out how big the
   7396 	 * output buffer is and where to copy out to.  Sure no problem...
   7397 	 *
   7398 	 */
   7399 	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
   7400 	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
   7401 		int numifs = 0;
   7402 		size_t ifc_bufsize;
   7403 
   7404 		/*
   7405 		 * Must be (better be!) continuation of a TRANSPARENT
   7406 		 * IOCTL.  We just copied in the ifconf structure.
   7407 		 */
   7408 		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
   7409 		    (struct ifconf *)mp1->b_rptr);
   7410 
   7411 		/*
   7412 		 * Allocate a buffer to hold requested information.
   7413 		 *
   7414 		 * If ifc_len is larger than what is needed, we only
   7415 		 * allocate what we will use.
   7416 		 *
   7417 		 * If ifc_len is smaller than what is needed, return
   7418 		 * EINVAL.
   7419 		 *
   7420 		 * XXX: the ill_t structure can hava 2 counters, for
   7421 		 * v4 and v6 (not just ill_ipif_up_count) to store the
   7422 		 * number of interfaces for a device, so we don't need
   7423 		 * to count them here...
   7424 		 */
   7425 		numifs = ip_get_numifs(zoneid, ipst);
   7426 
   7427 		ifclen = STRUCT_FGET(ifc, ifc_len);
   7428 		ifc_bufsize = numifs * sizeof (struct ifreq);
   7429 		if (ifc_bufsize > ifclen) {
   7430 			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
   7431 				/* old behaviour */
   7432 				return (EINVAL);
   7433 			} else {
   7434 				ifc_bufsize = ifclen;
   7435 			}
   7436 		}
   7437 
   7438 		mp1 = mi_copyout_alloc(q, mp,
   7439 		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
   7440 		if (mp1 == NULL)
   7441 			return (ENOMEM);
   7442 
   7443 		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
   7444 	}
   7445 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
   7446 	/*
   7447 	 * the SIOCGIFCONF ioctl only knows about
   7448 	 * IPv4 addresses, so don't try to tell
   7449 	 * it about interfaces with IPv6-only
   7450 	 * addresses. (Last parm 'isv6' is B_FALSE)
   7451 	 */
   7452 
   7453 	ifr = (struct ifreq *)mp1->b_rptr;
   7454 
   7455 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7456 	ill = ILL_START_WALK_V4(&ctx, ipst);
   7457 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7458 		if (IS_UNDER_IPMP(ill))
   7459 			continue;
   7460 		for (ipif = ill->ill_ipif; ipif != NULL;
   7461 		    ipif = ipif->ipif_next) {
   7462 			if (zoneid != ipif->ipif_zoneid &&
   7463 			    ipif->ipif_zoneid != ALL_ZONES)
   7464 				continue;
   7465 			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
   7466 				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
   7467 					/* old behaviour */
   7468 					rw_exit(&ipst->ips_ill_g_lock);
   7469 					return (EINVAL);
   7470 				} else {
   7471 					goto if_copydone;
   7472 				}
   7473 			}
   7474 			ipif_get_name(ipif, ifr->ifr_name,
   7475 			    sizeof (ifr->ifr_name));
   7476 			sin = (sin_t *)&ifr->ifr_addr;
   7477 			*sin = sin_null;
   7478 			sin->sin_family = AF_INET;
   7479 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
   7480 			ifr++;
   7481 		}
   7482 	}
   7483 if_copydone:
   7484 	rw_exit(&ipst->ips_ill_g_lock);
   7485 	mp1->b_wptr = (uchar_t *)ifr;
   7486 
   7487 	if (STRUCT_BUF(ifc) != NULL) {
   7488 		STRUCT_FSET(ifc, ifc_len,
   7489 		    (int)((uchar_t *)ifr - mp1->b_rptr));
   7490 	}
   7491 	return (0);
   7492 }
   7493 
   7494 /*
   7495  * Get the interfaces using the address hosted on the interface passed in,
   7496  * as a source adddress
   7497  */
   7498 /* ARGSUSED */
   7499 int
   7500 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7501     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7502 {
   7503 	mblk_t *mp1;
   7504 	ill_t	*ill, *ill_head;
   7505 	ipif_t	*ipif, *orig_ipif;
   7506 	int	numlifs = 0;
   7507 	size_t	lifs_bufsize, lifsmaxlen;
   7508 	struct	lifreq *lifr;
   7509 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7510 	uint_t	ifindex;
   7511 	zoneid_t zoneid;
   7512 	boolean_t isv6 = B_FALSE;
   7513 	struct	sockaddr_in	*sin;
   7514 	struct	sockaddr_in6	*sin6;
   7515 	STRUCT_HANDLE(lifsrcof, lifs);
   7516 	ip_stack_t		*ipst;
   7517 
   7518 	ipst = CONNQ_TO_IPST(q);
   7519 
   7520 	ASSERT(q->q_next == NULL);
   7521 
   7522 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7523 
   7524 	/* Existence verified in ip_wput_nondata */
   7525 	mp1 = mp->b_cont->b_cont;
   7526 
   7527 	/*
   7528 	 * Must be (better be!) continuation of a TRANSPARENT
   7529 	 * IOCTL.  We just copied in the lifsrcof structure.
   7530 	 */
   7531 	STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
   7532 	    (struct lifsrcof *)mp1->b_rptr);
   7533 
   7534 	if (MBLKL(mp1) != STRUCT_SIZE(lifs))
   7535 		return (EINVAL);
   7536 
   7537 	ifindex = STRUCT_FGET(lifs, lifs_ifindex);
   7538 	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
   7539 	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
   7540 	if (ipif == NULL) {
   7541 		ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
   7542 		    ifindex));
   7543 		return (ENXIO);
   7544 	}
   7545 
   7546 	/* Allocate a buffer to hold requested information */
   7547 	numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
   7548 	lifs_bufsize = numlifs * sizeof (struct lifreq);
   7549 	lifsmaxlen =  STRUCT_FGET(lifs, lifs_maxlen);
   7550 	/* The actual size needed is always returned in lifs_len */
   7551 	STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
   7552 
   7553 	/* If the amount we need is more than what is passed in, abort */
   7554 	if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
   7555 		ipif_refrele(ipif);
   7556 		return (0);
   7557 	}
   7558 
   7559 	mp1 = mi_copyout_alloc(q, mp,
   7560 	    STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
   7561 	if (mp1 == NULL) {
   7562 		ipif_refrele(ipif);
   7563 		return (ENOMEM);
   7564 	}
   7565 
   7566 	mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
   7567 	bzero(mp1->b_rptr, lifs_bufsize);
   7568 
   7569 	lifr = (struct lifreq *)mp1->b_rptr;
   7570 
   7571 	ill = ill_head = ipif->ipif_ill;
   7572 	orig_ipif = ipif;
   7573 
   7574 	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
   7575 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
   7576 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7577 
   7578 	ill = ill->ill_usesrc_grp_next; /* start from next ill */
   7579 	for (; (ill != NULL) && (ill != ill_head);
   7580 	    ill = ill->ill_usesrc_grp_next) {
   7581 
   7582 		if ((uchar_t *)&lifr[1] > mp1->b_wptr)
   7583 			break;
   7584 
   7585 		ipif = ill->ill_ipif;
   7586 		ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
   7587 		if (ipif->ipif_isv6) {
   7588 			sin6 = (sin6_t *)&lifr->lifr_addr;
   7589 			*sin6 = sin6_null;
   7590 			sin6->sin6_family = AF_INET6;
   7591 			sin6->sin6_addr = ipif->ipif_v6lcl_addr;
   7592 			lifr->lifr_addrlen = ip_mask_to_plen_v6(
   7593 			    &ipif->ipif_v6net_mask);
   7594 		} else {
   7595 			sin = (sin_t *)&lifr->lifr_addr;
   7596 			*sin = sin_null;
   7597 			sin->sin_family = AF_INET;
   7598 			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
   7599 			lifr->lifr_addrlen = ip_mask_to_plen(
   7600 			    ipif->ipif_net_mask);
   7601 		}
   7602 		lifr++;
   7603 	}
   7604 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
   7605 	rw_exit(&ipst->ips_ill_g_lock);
   7606 	ipif_refrele(orig_ipif);
   7607 	mp1->b_wptr = (uchar_t *)lifr;
   7608 	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
   7609 
   7610 	return (0);
   7611 }
   7612 
   7613 /* ARGSUSED */
   7614 int
   7615 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
   7616     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
   7617 {
   7618 	mblk_t *mp1;
   7619 	int	list;
   7620 	ill_t	*ill;
   7621 	ipif_t	*ipif;
   7622 	int	flags;
   7623 	int	numlifs = 0;
   7624 	size_t	lifc_bufsize;
   7625 	struct	lifreq *lifr;
   7626 	sa_family_t	family;
   7627 	struct	sockaddr_in	*sin;
   7628 	struct	sockaddr_in6	*sin6;
   7629 	ill_walk_context_t	ctx;
   7630 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7631 	int32_t	lifclen;
   7632 	zoneid_t zoneid;
   7633 	STRUCT_HANDLE(lifconf, lifc);
   7634 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   7635 
   7636 	ip1dbg(("ip_sioctl_get_lifconf"));
   7637 
   7638 	ASSERT(q->q_next == NULL);
   7639 
   7640 	zoneid = Q_TO_CONN(q)->conn_zoneid;
   7641 
   7642 	/* Existence verified in ip_wput_nondata */
   7643 	mp1 = mp->b_cont->b_cont;
   7644 
   7645 	/*
   7646 	 * An extended version of SIOCGIFCONF that takes an
   7647 	 * additional address family and flags field.
   7648 	 * AF_UNSPEC retrieve both IPv4 and IPv6.
   7649 	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
   7650 	 * interfaces are omitted.
   7651 	 * Similarly, IPIF_TEMPORARY interfaces are omitted
   7652 	 * unless LIFC_TEMPORARY is specified.
   7653 	 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
   7654 	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
   7655 	 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
   7656 	 * has priority over LIFC_NOXMIT.
   7657 	 */
   7658 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
   7659 
   7660 	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
   7661 		return (EINVAL);
   7662 
   7663 	/*
   7664 	 * Must be (better be!) continuation of a TRANSPARENT
   7665 	 * IOCTL.  We just copied in the lifconf structure.
   7666 	 */
   7667 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
   7668 
   7669 	family = STRUCT_FGET(lifc, lifc_family);
   7670 	flags = STRUCT_FGET(lifc, lifc_flags);
   7671 
   7672 	switch (family) {
   7673 	case AF_UNSPEC:
   7674 		/*
   7675 		 * walk all ILL's.
   7676 		 */
   7677 		list = MAX_G_HEADS;
   7678 		break;
   7679 	case AF_INET:
   7680 		/*
   7681 		 * walk only IPV4 ILL's.
   7682 		 */
   7683 		list = IP_V4_G_HEAD;
   7684 		break;
   7685 	case AF_INET6:
   7686 		/*
   7687 		 * walk only IPV6 ILL's.
   7688 		 */
   7689 		list = IP_V6_G_HEAD;
   7690 		break;
   7691 	default:
   7692 		return (EAFNOSUPPORT);
   7693 	}
   7694 
   7695 	/*
   7696 	 * Allocate a buffer to hold requested information.
   7697 	 *
   7698 	 * If lifc_len is larger than what is needed, we only
   7699 	 * allocate what we will use.
   7700 	 *
   7701 	 * If lifc_len is smaller than what is needed, return
   7702 	 * EINVAL.
   7703 	 */
   7704 	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
   7705 	lifc_bufsize = numlifs * sizeof (struct lifreq);
   7706 	lifclen = STRUCT_FGET(lifc, lifc_len);
   7707 	if (lifc_bufsize > lifclen) {
   7708 		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
   7709 			return (EINVAL);
   7710 		else
   7711 			lifc_bufsize = lifclen;
   7712 	}
   7713 
   7714 	mp1 = mi_copyout_alloc(q, mp,
   7715 	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
   7716 	if (mp1 == NULL)
   7717 		return (ENOMEM);
   7718 
   7719 	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
   7720 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
   7721 
   7722 	lifr = (struct lifreq *)mp1->b_rptr;
   7723 
   7724 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   7725 	ill = ill_first(list, list, &ctx, ipst);
   7726 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   7727 		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
   7728 			continue;
   7729 
   7730 		for (ipif = ill->ill_ipif; ipif != NULL;
   7731 		    ipif = ipif->ipif_next) {
   7732 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
   7733 			    !(flags & LIFC_NOXMIT))
   7734 				continue;
   7735 
   7736 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
   7737 			    !(flags & LIFC_TEMPORARY))
   7738 				continue;
   7739 
   7740 			if (((ipif->ipif_flags &
   7741 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
   7742 			    IPIF_DEPRECATED)) ||
   7743 			    IS_LOOPBACK(ill) ||
   7744 			    !(ipif->ipif_flags & IPIF_UP)) &&
   7745 			    (flags & LIFC_EXTERNAL_SOURCE))
   7746 				continue;
   7747 
   7748 			if (zoneid != ipif->ipif_zoneid &&
   7749 			    ipif->ipif_zoneid != ALL_ZONES &&
   7750 			    (zoneid != GLOBAL_ZONEID ||
   7751 			    !(flags & LIFC_ALLZONES)))
   7752 				continue;
   7753 
   7754 			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
   7755 				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
   7756 					rw_exit(&ipst->ips_ill_g_lock);
   7757 					return (EINVAL);
   7758 				} else {
   7759 					goto lif_copydone;
   7760 				}
   7761 			}
   7762 
   7763 			ipif_get_name(ipif, lifr->lifr_name,
   7764 			    sizeof (lifr->lifr_name));
   7765 			lifr->lifr_type = ill->ill_type;
   7766 			if (ipif->ipif_isv6) {
   7767 				sin6 = (sin6_t *)&lifr->lifr_addr;
   7768 				*sin6 = sin6_null;
   7769 				sin6->sin6_family = AF_INET6;
   7770 				sin6->sin6_addr =
   7771 				    ipif->ipif_v6lcl_addr;
   7772 				lifr->lifr_addrlen =
   7773 				    ip_mask_to_plen_v6(
   7774 				    &ipif->ipif_v6net_mask);
   7775 			} else {
   7776 				sin = (sin_t *)&lifr->lifr_addr;
   7777 				*sin = sin_null;
   7778 				sin->sin_family = AF_INET;
   7779 				sin->sin_addr.s_addr =
   7780 				    ipif->ipif_lcl_addr;
   7781 				lifr->lifr_addrlen =
   7782 				    ip_mask_to_plen(
   7783 				    ipif->ipif_net_mask);
   7784 			}
   7785 			lifr++;
   7786 		}
   7787 	}
   7788 lif_copydone:
   7789 	rw_exit(&ipst->ips_ill_g_lock);
   7790 
   7791 	mp1->b_wptr = (uchar_t *)lifr;
   7792 	if (STRUCT_BUF(lifc) != NULL) {
   7793 		STRUCT_FSET(lifc, lifc_len,
   7794 		    (int)((uchar_t *)lifr - mp1->b_rptr));
   7795 	}
   7796 	return (0);
   7797 }
   7798 
   7799 static void
   7800 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
   7801 {
   7802 	ip6_asp_t *table;
   7803 	size_t table_size;
   7804 	mblk_t *data_mp;
   7805 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
   7806 	ip_stack_t	*ipst;
   7807