Home | History | Annotate | Download | only in ip
      1      0      stevel /*
      2      0      stevel  * CDDL HEADER START
      3      0      stevel  *
      4      0      stevel  * The contents of this file are subject to the terms of the
      5   1392     ja97890  * Common Development and Distribution License (the "License").
      6   1392     ja97890  * You may not use this file except in compliance with the License.
      7      0      stevel  *
      8      0      stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0      stevel  * or http://www.opensolaris.org/os/licensing.
     10      0      stevel  * See the License for the specific language governing permissions
     11      0      stevel  * and limitations under the License.
     12      0      stevel  *
     13      0      stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0      stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0      stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0      stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0      stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0      stevel  *
     19      0      stevel  * CDDL HEADER END
     20      0      stevel  */
     21      0      stevel /*
     22   8485       Peter  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      0      stevel  * Use is subject to license terms.
     24      0      stevel  */
     25      0      stevel /* Copyright (c) 1990 Mentat Inc. */
     26      0      stevel 
     27      0      stevel /*
     28      0      stevel  * This file contains the interface control functions for IP.
     29      0      stevel  */
     30      0      stevel 
     31      0      stevel #include <sys/types.h>
     32      0      stevel #include <sys/stream.h>
     33      0      stevel #include <sys/dlpi.h>
     34      0      stevel #include <sys/stropts.h>
     35      0      stevel #include <sys/strsun.h>
     36      0      stevel #include <sys/sysmacros.h>
     37   8778        Erik #include <sys/strsubr.h>
     38      0      stevel #include <sys/strlog.h>
     39      0      stevel #include <sys/ddi.h>
     40      0      stevel #include <sys/sunddi.h>
     41      0      stevel #include <sys/cmn_err.h>
     42      0      stevel #include <sys/kstat.h>
     43      0      stevel #include <sys/debug.h>
     44      0      stevel #include <sys/zone.h>
     45   3448    dh155122 #include <sys/sunldi.h>
     46   3448    dh155122 #include <sys/file.h>
     47   5023    carlsonj #include <sys/bitmap.h>
     48   8275        Eric #include <sys/cpuvar.h>
     49   8275        Eric #include <sys/time.h>
     50   8485       Peter #include <sys/ctype.h>
     51      0      stevel #include <sys/kmem.h>
     52      0      stevel #include <sys/systm.h>
     53      0      stevel #include <sys/param.h>
     54      0      stevel #include <sys/socket.h>
     55      0      stevel #include <sys/isa_defs.h>
     56      0      stevel #include <net/if.h>
     57      0      stevel #include <net/if_arp.h>
     58      0      stevel #include <net/if_types.h>
     59      0      stevel #include <net/if_dl.h>
     60      0      stevel #include <net/route.h>
     61      0      stevel #include <sys/sockio.h>
     62      0      stevel #include <netinet/in.h>
     63      0      stevel #include <netinet/ip6.h>
     64      0      stevel #include <netinet/icmp6.h>
     65      0      stevel #include <netinet/igmp_var.h>
     66      0      stevel #include <sys/policy.h>
     67      0      stevel #include <sys/ethernet.h>
     68   8275        Eric #include <sys/callb.h>
     69   8485       Peter #include <sys/md5.h>
     70      0      stevel 
     71      0      stevel #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
     72      0      stevel #include <inet/mi.h>
     73      0      stevel #include <inet/nd.h>
     74      0      stevel #include <inet/arp.h>
     75  11042        Erik #include <inet/ip_arp.h>
     76      0      stevel #include <inet/mib2.h>
     77      0      stevel #include <inet/ip.h>
     78      0      stevel #include <inet/ip6.h>
     79      0      stevel #include <inet/ip6_asp.h>
     80      0      stevel #include <inet/tcp.h>
     81      0      stevel #include <inet/ip_multi.h>
     82      0      stevel #include <inet/ip_ire.h>
     83   2535    sangeeta #include <inet/ip_ftable.h>
     84      0      stevel #include <inet/ip_rts.h>
     85      0      stevel #include <inet/ip_ndp.h>
     86      0      stevel #include <inet/ip_if.h>
     87    741    masputra #include <inet/ip_impl.h>
     88      0      stevel #include <inet/sctp_ip.h>
     89   2958    dr146992 #include <inet/ip_netinfo.h>
     90  10946    Sangeeta #include <inet/ilb_ip.h>
     91      0      stevel 
     92      0      stevel #include <netinet/igmp.h>
     93      0      stevel #include <inet/ip_listutils.h>
     94      0      stevel #include <inet/ipclassifier.h>
     95   8275        Eric #include <sys/mac_client.h>
     96   8275        Eric #include <sys/dld.h>
     97      0      stevel 
     98      0      stevel #include <sys/systeminfo.h>
     99      0      stevel #include <sys/bootconf.h>
    100   1676         jpk 
    101   1676         jpk #include <sys/tsol/tndb.h>
    102   1676         jpk #include <sys/tsol/tnet.h>
    103      0      stevel 
    104      0      stevel /* The character which tells where the ill_name ends */
    105      0      stevel #define	IPIF_SEPARATOR_CHAR	':'
    106      0      stevel 
     107      0      stevel /* IP ioctl function table entry */
/*
 * One row of an IP ioctl dispatch table such as ip_ioctl_ftbl[]: it pairs
 * an ioctl command with the function that services it.  The code that
 * walks the table is not in this part of the file, so the per-field notes
 * below are taken from the table initializers.
 */
     108      0      stevel typedef struct ipft_s {
     109      0      stevel 	int	ipft_cmd;	/* ioctl command, e.g. IP_IOC_IRE_DELETE */
     110      0      stevel 	pfi_t	ipft_pfi;	/* handler for ipft_cmd, e.g. ip_ire_delete */
     111      0      stevel 	int	ipft_min_size;	/* presumably min. request size; TODO confirm */
     112      0      stevel 	int	ipft_flags;	/* 0 or IPFT_F_* values defined below */
     113      0      stevel } ipft_t;
     114      0      stevel #define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
     115      0      stevel #define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
    116      0      stevel 
    117      0      stevel static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    118      0      stevel static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    119      0      stevel 		    char *value, caddr_t cp, cred_t *ioc_cr);
    120      0      stevel 
    121   6255     sowmini static boolean_t ill_is_quiescent(ill_t *);
    122      0      stevel static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
    123      0      stevel static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
    124      0      stevel static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    125      0      stevel     mblk_t *mp, boolean_t need_up);
    126      0      stevel static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    127      0      stevel     mblk_t *mp, boolean_t need_up);
    128      0      stevel static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    129      0      stevel     queue_t *q, mblk_t *mp, boolean_t need_up);
    130      0      stevel static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    131   7216        meem     mblk_t *mp);
    132      0      stevel static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    133      0      stevel     mblk_t *mp);
    134      0      stevel static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    135      0      stevel     queue_t *q, mblk_t *mp, boolean_t need_up);
    136   4770        meem static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    137  11042        Erik     int ioccmd, struct linkblk *li);
    138   3448    dh155122 static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
    139      0      stevel static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
    140      0      stevel static void	ipsq_flush(ill_t *ill);
    141   4360        meem 
    142      0      stevel static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    143      0      stevel     queue_t *q, mblk_t *mp, boolean_t need_up);
    144      0      stevel static void	ipsq_delete(ipsq_t *);
    145      0      stevel 
    146      0      stevel static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    147   8485       Peter     boolean_t initialize, boolean_t insert);
    148   4770        meem static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
    149  11042        Erik static void	ipif_delete_bcast_ires(ipif_t *ipif);
    150  11042        Erik static int	ipif_add_ires_v4(ipif_t *, boolean_t);
    151   4459      kcpoon static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    152   4459      kcpoon 		    boolean_t isv6);
    153      0      stevel static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
    154      0      stevel static void	ipif_free(ipif_t *ipif);
    155      0      stevel static void	ipif_free_tail(ipif_t *ipif);
    156      0      stevel static void	ipif_set_default(ipif_t *ipif);
    157      0      stevel static int	ipif_set_values(queue_t *q, mblk_t *mp,
    158      0      stevel     char *interf_name, uint_t *ppa);
    159      0      stevel static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    160      0      stevel     queue_t *q);
    161      0      stevel static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    162      0      stevel     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    163  11042        Erik     ip_stack_t *);
    164      0      stevel 
    165      0      stevel static int	ill_alloc_ppa(ill_if_t *, ill_t *);
    166      0      stevel static void	ill_delete_interface_type(ill_if_t *);
    167      0      stevel static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
    168   2546    carlsonj static void	ill_dl_down(ill_t *ill);
    169      0      stevel static void	ill_down(ill_t *ill);
    170  11076       Cathy static void	ill_down_ipifs(ill_t *, boolean_t);
    171      0      stevel static void	ill_free_mib(ill_t *ill);
    172      0      stevel static void	ill_glist_delete(ill_t *);
    173      0      stevel static void	ill_phyint_reinit(ill_t *ill);
    174      0      stevel static void	ill_set_nce_router_flags(ill_t *, boolean_t);
    175   3340        meem static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    176   9073       Cathy static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    177   9073       Cathy 
    178   8485       Peter static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
    179  10616   Sebastien static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
    180   8485       Peter static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
    181  10616   Sebastien static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
    182  11042        Erik static ip_v4mapinfo_func_t ip_ether_v4_mapping;
    183  11042        Erik static ip_v6mapinfo_func_t ip_ether_v6_mapping;
    184  11042        Erik static ip_v4mapinfo_func_t ip_ib_v4_mapping;
    185  11042        Erik static ip_v6mapinfo_func_t ip_ib_v6_mapping;
    186  11042        Erik static ip_v4mapinfo_func_t ip_mbcast_mapping;
    187  11042        Erik static void 	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
    188   3448    dh155122 static void 	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
    189   8485       Peter static void	phyint_free(phyint_t *);
    190      0      stevel 
    191  11042        Erik static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
    192      0      stevel static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    193  11076       Cathy static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    194      0      stevel static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    195   8275        Eric static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
    196      0      stevel static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    197      0      stevel     dl_capability_sub_t *);
    198   8275        Eric static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
    199   8275        Eric static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
    200   8275        Eric static void	ill_capability_dld_ack(ill_t *, mblk_t *,
    201   8275        Eric 		    dl_capability_sub_t *);
    202   8275        Eric static void	ill_capability_dld_enable(ill_t *);
    203   8275        Eric static void	ill_capability_ack_thr(void *);
    204   8275        Eric static void	ill_capability_lso_enable(ill_t *);
    205      0      stevel 
    206      0      stevel static ill_t	*ill_prev_usesrc(ill_t *);
    207      0      stevel static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
    208      0      stevel static void	ill_disband_usesrc_group(ill_t *);
    209  11042        Erik static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
    210   5023    carlsonj 
    211   5023    carlsonj #ifdef DEBUG
    212  11042        Erik static	void	ill_trace_cleanup(const ill_t *);
    213  11042        Erik static	void	ipif_trace_cleanup(const ipif_t *);
    214   5023    carlsonj #endif
    215      0      stevel 
    216      0      stevel /*
    217      0      stevel  * if we go over the memory footprint limit more than once in this msec
    218      0      stevel  * interval, we'll start pruning aggressively.
    219      0      stevel  */
    220      0      stevel int ip_min_frag_prune_time = 0;
    221      0      stevel 
     222      0      stevel static ipft_t	ip_ioctl_ftbl[] = {
	/*
	 * Columns follow ipft_t: command, handler, minimum size, flags.
	 * Both IRE-delete commands share the ip_ire_delete handler and
	 * differ only in IPFT_F_NO_REPLY.  The trailing all-zero row is
	 * presumably the end-of-table sentinel -- confirm against the
	 * lookup code, which is not visible here.
	 */
     223      0      stevel 	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
     224      0      stevel 	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
     225      0      stevel 		IPFT_F_NO_REPLY },
     226      0      stevel 	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
     227      0      stevel 	{ 0 }
     228      0      stevel };
    229      0      stevel 
    230      0      stevel /* Simple ICMP IP Header Template */
    231      0      stevel static ipha_t icmp_ipha = {
    232      0      stevel 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
    233      0      stevel };
    234      0      stevel 
    235      0      stevel static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
    236      0      stevel 
     237   8023        Phil static ip_m_t   ip_m_tbl[] = {
	/*
	 * One row per DLPI mac type.  The ip_m_t definition is not visible
	 * in this file, so the column meanings below are read off the
	 * values and the prototypes above -- confirm against ip_m_t:
	 *   1. DLPI mac type (DL_* / SUNW_DL_*)
	 *   2. interface type (IFT_*)
	 *   3. IPv4 SAP/protocol value (ETHERTYPE_IP or IPPROTO_ENCAP)
	 *   4. IPv6 SAP/protocol value (ETHERTYPE_IPV6 or IPPROTO_IPV6)
	 *   5. IPv4 mapping function (ip_v4mapinfo_func_t), may be NULL
	 *   6. IPv6 mapping function (ip_v6mapinfo_func_t), may be NULL
	 *   7. ip_v6intfid_func_t for the local interface id
	 *   8. ip_v6intfid_func_t, presumably for the destination id
	 *
	 * NOTE(review): the DL_IPV4, DL_IPV6 and DL_6TO4 rows put
	 * ip_mbcast_mapping, declared above as ip_v4mapinfo_func_t, in
	 * both mapping columns.  The DL_OTHER row is last; presumably it
	 * is the fallback used by ip_m_lookup() -- TODO confirm.
	 */
     238  10616   Sebastien 	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
     239  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
     240      0      stevel 	    ip_nodef_v6intfid },
     241  10616   Sebastien 	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
     242  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
     243      0      stevel 	    ip_nodef_v6intfid },
     244  10616   Sebastien 	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
     245  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
     246      0      stevel 	    ip_nodef_v6intfid },
     247  10616   Sebastien 	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
     248  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
     249  10616   Sebastien 	    ip_nodef_v6intfid },
     250  10616   Sebastien 	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
     251  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
     252  10616   Sebastien 	    ip_nodef_v6intfid },
     253  10616   Sebastien 	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
     254  11042        Erik 	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
     255  10616   Sebastien 	    ip_nodef_v6intfid },
     256  11042        Erik 	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
     257  11042        Erik 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
     258  11042        Erik 	    ip_ipv4_v6destintfid },
     259  11042        Erik 	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
     260  11042        Erik 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
     261  11042        Erik 	    ip_ipv6_v6destintfid },
     262  11042        Erik 	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
     263  11042        Erik 	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
     264  11042        Erik 	    ip_nodef_v6intfid },
     265  10616   Sebastien 	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
     266  10616   Sebastien 	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
     267  10616   Sebastien 	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
     268  10616   Sebastien 	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
     269  10616   Sebastien 	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
     270  11042        Erik 	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
     271      0      stevel 	    ip_nodef_v6intfid }
     272      0      stevel };
    273      0      stevel 
    274      0      stevel static ill_t	ill_null;		/* Empty ILL for init. */
    275      0      stevel char	ipif_loopback_name[] = "lo0";
    276      0      stevel static char *ipv4_forward_suffix = ":ip_forwarding";
    277      0      stevel static char *ipv6_forward_suffix = ":ip6_forwarding";
    278      0      stevel static	sin6_t	sin6_null;	/* Zero address for quick clears */
    279      0      stevel static	sin_t	sin_null;	/* Zero address for quick clears */
    280   3448    dh155122 
    281      0      stevel /* When set search for unused ipif_seqid */
    282      0      stevel static ipif_t	ipif_zero;
    283      0      stevel 
    284      0      stevel /*
     285      0      stevel  * The ppa arena is created after this many
     286      0      stevel  * interfaces have been plumbed.
    287      0      stevel  */
    288   3448    dh155122 uint_t	ill_no_arena = 12;	/* Setable in /etc/system */
    289      0      stevel 
    290      0      stevel /*
    291   3284    apersson  * Allocate per-interface mibs.
    292      0      stevel  * Returns true if ok. False otherwise.
    293      0      stevel  *  ipsq  may not yet be allocated (loopback case ).
    294      0      stevel  */
    295      0      stevel static boolean_t
    296      0      stevel ill_allocate_mibs(ill_t *ill)
    297      0      stevel {
    298      0      stevel 	/* Already allocated? */
    299   3284    apersson 	if (ill->ill_ip_mib != NULL) {
    300   3284    apersson 		if (ill->ill_isv6)
    301   3284    apersson 			ASSERT(ill->ill_icmp6_mib != NULL);
    302      0      stevel 		return (B_TRUE);
    303      0      stevel 	}
    304      0      stevel 
    305   3284    apersson 	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
    306      0      stevel 	    KM_NOSLEEP);
    307   3284    apersson 	if (ill->ill_ip_mib == NULL) {
    308   3284    apersson 		return (B_FALSE);
    309   3284    apersson 	}
    310   3284    apersson 
    311   3284    apersson 	/* Setup static information */
    312   3284    apersson 	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
    313   3284    apersson 	    sizeof (mib2_ipIfStatsEntry_t));
    314   3284    apersson 	if (ill->ill_isv6) {
    315   3284    apersson 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
    316   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    317   3284    apersson 		    sizeof (mib2_ipv6AddrEntry_t));
    318   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    319   3284    apersson 		    sizeof (mib2_ipv6RouteEntry_t));
    320   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    321   3284    apersson 		    sizeof (mib2_ipv6NetToMediaEntry_t));
    322   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    323   3284    apersson 		    sizeof (ipv6_member_t));
    324   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    325   3284    apersson 		    sizeof (ipv6_grpsrc_t));
    326   3284    apersson 	} else {
    327   3284    apersson 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
    328   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    329   3284    apersson 		    sizeof (mib2_ipAddrEntry_t));
    330   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    331   3284    apersson 		    sizeof (mib2_ipRouteEntry_t));
    332   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    333   3284    apersson 		    sizeof (mib2_ipNetToMediaEntry_t));
    334   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    335   3284    apersson 		    sizeof (ip_member_t));
    336   3284    apersson 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    337   3284    apersson 		    sizeof (ip_grpsrc_t));
    338   3284    apersson 
    339   3284    apersson 		/*
    340   3284    apersson 		 * For a v4 ill, we are done at this point, because per ill
    341   3284    apersson 		 * icmp mibs are only used for v6.
    342   3284    apersson 		 */
    343   3284    apersson 		return (B_TRUE);
    344   3284    apersson 	}
    345   3284    apersson 
    346      0      stevel 	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
    347      0      stevel 	    KM_NOSLEEP);
    348      0      stevel 	if (ill->ill_icmp6_mib == NULL) {
    349   3284    apersson 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    350   3284    apersson 		ill->ill_ip_mib = NULL;
    351   3284    apersson 		return (B_FALSE);
    352   3284    apersson 	}
    353   3284    apersson 	/* static icmp info */
    354   3284    apersson 	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
    355   3284    apersson 	    sizeof (mib2_ipv6IfIcmpEntry_t);
    356   3284    apersson 	/*
    357   3284    apersson 	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
    358      0      stevel 	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
    359      0      stevel 	 * -> ill_phyint_reinit
    360      0      stevel 	 */
    361      0      stevel 	return (B_TRUE);
    362      0      stevel }
    363      0      stevel 
    364      0      stevel /*
    365      0      stevel  * Completely vaporize a lower level tap and all associated interfaces.
    366      0      stevel  * ill_delete is called only out of ip_close when the device control
    367      0      stevel  * stream is being closed.
    368      0      stevel  */
    369      0      stevel void
    370      0      stevel ill_delete(ill_t *ill)
    371      0      stevel {
    372      0      stevel 	ipif_t	*ipif;
    373      0      stevel 	ill_t	*prev_ill;
    374   3448    dh155122 	ip_stack_t	*ipst = ill->ill_ipst;
    375      0      stevel 
    376      0      stevel 	/*
    377      0      stevel 	 * ill_delete may be forcibly entering the ipsq. The previous
    378      0      stevel 	 * ioctl may not have completed and may need to be aborted.
    379      0      stevel 	 * ipsq_flush takes care of it. If we don't need to enter the
    380      0      stevel 	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
    381      0      stevel 	 * ill_delete_tail is sufficient.
    382      0      stevel 	 */
    383      0      stevel 	ipsq_flush(ill);
    384      0      stevel 
    385      0      stevel 	/*
    386      0      stevel 	 * Nuke all interfaces.  ipif_free will take down the interface,
    387      0      stevel 	 * remove it from the list, and free the data structure.
    388      0      stevel 	 * Walk down the ipif list and remove the logical interfaces
    389      0      stevel 	 * first before removing the main ipif. We can't unplumb
    390  11042        Erik 	 * zeroth interface first in the case of IPv6 as update_conn_ill
    391  11042        Erik 	 * -> ip_ll_multireq de-references ill_ipif for checking
    392      0      stevel 	 * POINTOPOINT.
    393      0      stevel 	 *
    394      0      stevel 	 * If ill_ipif was not properly initialized (i.e low on memory),
    395      0      stevel 	 * then no interfaces to clean up. In this case just clean up the
    396      0      stevel 	 * ill.
    397      0      stevel 	 */
    398      0      stevel 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
    399      0      stevel 		ipif_free(ipif);
    400      0      stevel 
    401      0      stevel 	/*
    402  11042        Erik 	 * clean out all the nce_t entries that depend on this
    403  11042        Erik 	 * ill for the ill_phys_addr.
    404  11042        Erik 	 */
    405  11042        Erik 	nce_flush(ill, B_TRUE);
    406      0      stevel 
    407      0      stevel 	/* Clean up msgs on pending upcalls for mrouted */
    408      0      stevel 	reset_mrt_ill(ill);
    409      0      stevel 
    410  11042        Erik 	update_conn_ill(ill, ipst);
    411   8023        Phil 
    412   8023        Phil 	/*
    413   8023        Phil 	 * Remove multicast references added as a result of calls to
    414   8023        Phil 	 * ip_join_allmulti().
    415   8023        Phil 	 */
    416   8023        Phil 	ip_purge_allmulti(ill);
    417   8485       Peter 
    418   8485       Peter 	/*
    419   8485       Peter 	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
    420   8485       Peter 	 */
    421   8485       Peter 	if (IS_UNDER_IPMP(ill))
    422   8485       Peter 		ipmp_ill_leave_illgrp(ill);
    423      0      stevel 
    424      0      stevel 	/*
    425      0      stevel 	 * ill_down will arrange to blow off any IRE's dependent on this
    426      0      stevel 	 * ILL, and shut down fragmentation reassembly.
    427      0      stevel 	 */
    428      0      stevel 	ill_down(ill);
    429      0      stevel 
    430      0      stevel 	/* Let SCTP know, so that it can remove this from its list. */
    431      0      stevel 	sctp_update_ill(ill, SCTP_ILL_REMOVE);
    432  11042        Erik 
    433  11042        Erik 	/*
    434  11042        Erik 	 * Walk all CONNs that can have a reference on an ire or nce for this
    435  11042        Erik 	 * ill (we actually walk all that now have stale references).
    436  11042        Erik 	 */
    437  11042        Erik 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
    438  11042        Erik 
    439  11042        Erik 	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
    440  11042        Erik 	if (ill->ill_isv6)
    441  11042        Erik 		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);
    442      0      stevel 
    443      0      stevel 	/*
    444      0      stevel 	 * If an address on this ILL is being used as a source address then
    445      0      stevel 	 * clear out the pointers in other ILLs that point to this ILL.
    446      0      stevel 	 */
    447   3448    dh155122 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
    448      0      stevel 	if (ill->ill_usesrc_grp_next != NULL) {
    449      0      stevel 		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
    450      0      stevel 			ill_disband_usesrc_group(ill);
    451      0      stevel 		} else {	/* consumer of the usesrc ILL */
    452      0      stevel 			prev_ill = ill_prev_usesrc(ill);
    453      0      stevel 			prev_ill->ill_usesrc_grp_next =
    454      0      stevel 			    ill->ill_usesrc_grp_next;
    455      0      stevel 		}
    456      0      stevel 	}
    457   3448    dh155122 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
    458      0      stevel }
    459      0      stevel 
    460   2546    carlsonj static void
    461   2546    carlsonj ipif_non_duplicate(ipif_t *ipif)
    462   2546    carlsonj {
    463   2546    carlsonj 	ill_t *ill = ipif->ipif_ill;
    464   2546    carlsonj 	mutex_enter(&ill->ill_lock);
    465   2546    carlsonj 	if (ipif->ipif_flags & IPIF_DUPLICATE) {
    466   2546    carlsonj 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
    467   2546    carlsonj 		ASSERT(ill->ill_ipif_dup_count > 0);
    468   2546    carlsonj 		ill->ill_ipif_dup_count--;
    469   2546    carlsonj 	}
    470   2546    carlsonj 	mutex_exit(&ill->ill_lock);
    471   2546    carlsonj }
    472   2546    carlsonj 
    473      0      stevel /*
    474      0      stevel  * ill_delete_tail is called from ip_modclose after all references
    475      0      stevel  * to the closing ill are gone. The wait is done in ip_modclose
    476      0      stevel  */
    477      0      stevel void
    478      0      stevel ill_delete_tail(ill_t *ill)
    479      0      stevel {
    480      0      stevel 	mblk_t	**mpp;
    481      0      stevel 	ipif_t	*ipif;
    482   3448    dh155122 	ip_stack_t	*ipst = ill->ill_ipst;
    483      0      stevel 
    484   2546    carlsonj 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
    485   2546    carlsonj 		ipif_non_duplicate(ipif);
    486  11042        Erik 		(void) ipif_down_tail(ipif);
    487  11042        Erik 	}
    488  11042        Erik 
    489  11042        Erik 	ASSERT(ill->ill_ipif_dup_count == 0);
    490      0      stevel 
    491      0      stevel 	/*
    492      0      stevel 	 * If polling capability is enabled (which signifies direct
    493      0      stevel 	 * upcall into IP and driver has ill saved as a handle),
    494      0      stevel 	 * we need to make sure that unbind has completed before we
    495      0      stevel 	 * let the ill disappear and driver no longer has any reference
    496      0      stevel 	 * to this ill.
    497      0      stevel 	 */
    498      0      stevel 	mutex_enter(&ill->ill_lock);
    499   1555      krgopi 	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
    500   1555      krgopi 		cv_wait(&ill->ill_cv, &ill->ill_lock);
    501   1555      krgopi 	mutex_exit(&ill->ill_lock);
    502   8275        Eric 	ASSERT(!(ill->ill_capabilities &
    503   8275        Eric 	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));
    504      0      stevel 
    505      0      stevel 	if (ill->ill_net_type != IRE_LOOPBACK)
    506      0      stevel 		qprocsoff(ill->ill_rq);
    507      0      stevel 
    508      0      stevel 	/*
    509      0      stevel 	 * We do an ipsq_flush once again now. New messages could have
    510      0      stevel 	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
    511      0      stevel 	 * could also have landed up if an ioctl thread had looked up
    512      0      stevel 	 * the ill before we set the ILL_CONDEMNED flag, but not yet
    513      0      stevel 	 * enqueued the ioctl when we did the ipsq_flush last time.
    514      0      stevel 	 */
    515      0      stevel 	ipsq_flush(ill);
    516      0      stevel 
    517      0      stevel 	/*
    518      0      stevel 	 * Free capabilities.
    519      0      stevel 	 */
    520      0      stevel 	if (ill->ill_hcksum_capab != NULL) {
    521      0      stevel 		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
    522      0      stevel 		ill->ill_hcksum_capab = NULL;
    523      0      stevel 	}
    524      0      stevel 
    525      0      stevel 	if (ill->ill_zerocopy_capab != NULL) {
    526      0      stevel 		kmem_free(ill->ill_zerocopy_capab,
    527      0      stevel 		    sizeof (ill_zerocopy_capab_t));
    528      0      stevel 		ill->ill_zerocopy_capab = NULL;
    529      0      stevel 	}
    530   1184      krgopi 
    531   3115    yl150051 	if (ill->ill_lso_capab != NULL) {
    532   3115    yl150051 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
    533   3115    yl150051 		ill->ill_lso_capab = NULL;
    534   3115    yl150051 	}
    535   3115    yl150051 
    536   8275        Eric 	if (ill->ill_dld_capab != NULL) {
    537   8275        Eric 		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
    538   8275        Eric 		ill->ill_dld_capab = NULL;
    539   8275        Eric 	}
    540      0      stevel 
    541      0      stevel 	while (ill->ill_ipif != NULL)
    542      0      stevel 		ipif_free_tail(ill->ill_ipif);
    543      0      stevel 
    544      0      stevel 	/*
    545      0      stevel 	 * We have removed all references to ilm from conn and the ones joined
    546      0      stevel 	 * within the kernel.
    547      0      stevel 	 *
    548      0      stevel 	 * We don't walk conns, mrts and ires because
    549      0      stevel 	 *
    550  11042        Erik 	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
    551      0      stevel 	 * 2) ill_down ->ill_downi walks all the ires and cleans up
    552      0      stevel 	 *    ill references.
    553      0      stevel 	 */
    554   8485       Peter 
    555   8485       Peter 	/*
    556   8485       Peter 	 * If this ill is an IPMP meta-interface, blow away the illgrp.  This
    557   8485       Peter 	 * is safe to do because the illgrp has already been unlinked from the
    558   8485       Peter 	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
    559   8485       Peter 	 */
    560   8485       Peter 	if (IS_IPMP(ill)) {
    561   8485       Peter 		ipmp_illgrp_destroy(ill->ill_grp);
    562   8485       Peter 		ill->ill_grp = NULL;
    563   8485       Peter 	}
    564   8485       Peter 
    565   8485       Peter 	/*
    566   8485       Peter 	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
    567      0      stevel 	 * could free the phyint. No more reference to the phyint after this
    568      0      stevel 	 * point.
    569      0      stevel 	 */
    570      0      stevel 	(void) ill_glist_delete(ill);
    571      0      stevel 
    572   3448    dh155122 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
    573      0      stevel 	if (ill->ill_ndd_name != NULL)
    574   3448    dh155122 		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
    575   3448    dh155122 	rw_exit(&ipst->ips_ip_g_nd_lock);
    576      0      stevel 
    577      0      stevel 	if (ill->ill_frag_ptr != NULL) {
    578      0      stevel 		uint_t count;
    579      0      stevel 
    580      0      stevel 		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
    581      0      stevel 			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
    582      0      stevel 		}
    583      0      stevel 		mi_free(ill->ill_frag_ptr);
    584      0      stevel 		ill->ill_frag_ptr = NULL;
    585      0      stevel 		ill->ill_frag_hash_tbl = NULL;
    586      0      stevel 	}
    587   3340        meem 
    588   3340        meem 	freemsg(ill->ill_nd_lla_mp);
    589      0      stevel 	/* Free all retained control messages. */
    590      0      stevel 	mpp = &ill->ill_first_mp_to_free;
    591      0      stevel 	do {
    592      0      stevel 		while (mpp[0]) {
    593      0      stevel 			mblk_t  *mp;
    594      0      stevel 			mblk_t  *mp1;
    595      0      stevel 
    596      0      stevel 			mp = mpp[0];
    597      0      stevel 			mpp[0] = mp->b_next;
    598      0      stevel 			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
    599      0      stevel 				mp1->b_next = NULL;
    600      0      stevel 				mp1->b_prev = NULL;
    601      0      stevel 			}
    602      0      stevel 			freemsg(mp);
    603      0      stevel 		}
    604      0      stevel 	} while (mpp++ != &ill->ill_last_mp_to_free);
    605      0      stevel 
    606      0      stevel 	ill_free_mib(ill);
    607   5023    carlsonj 
    608   5023    carlsonj #ifdef DEBUG
    609   5023    carlsonj 	ill_trace_cleanup(ill);
    610   5023    carlsonj #endif
    611  11042        Erik 
    612  11042        Erik 	/* The default multicast interface might have changed */
    613  11042        Erik 	ire_increment_multicast_generation(ipst, ill->ill_isv6);
    614   5023    carlsonj 
    615   3448    dh155122 	/* Drop refcnt here */
    616   3448    dh155122 	netstack_rele(ill->ill_ipst->ips_netstack);
    617   3448    dh155122 	ill->ill_ipst = NULL;
    618      0      stevel }
    619      0      stevel 
    620      0      stevel static void
    621      0      stevel ill_free_mib(ill_t *ill)
    622      0      stevel {
    623   3448    dh155122 	ip_stack_t *ipst = ill->ill_ipst;
    624   3448    dh155122 
    625   3284    apersson 	/*
    626   3284    apersson 	 * MIB statistics must not be lost, so when an interface
    627   3284    apersson 	 * goes away the counter values will be added to the global
    628   3284    apersson 	 * MIBs.
    629   3284    apersson 	 */
    630   3284    apersson 	if (ill->ill_ip_mib != NULL) {
    631   3448    dh155122 		if (ill->ill_isv6) {
    632   3448    dh155122 			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
    633   3448    dh155122 			    ill->ill_ip_mib);
    634   3448    dh155122 		} else {
    635   3448    dh155122 			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
    636   3448    dh155122 			    ill->ill_ip_mib);
    637   3448    dh155122 		}
    638   3284    apersson 
    639   3284    apersson 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    640   3284    apersson 		ill->ill_ip_mib = NULL;
    641      0      stevel 	}
    642      0      stevel 	if (ill->ill_icmp6_mib != NULL) {
    643   3448    dh155122 		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
    644   3448    dh155122 		    ill->ill_icmp6_mib);
    645      0      stevel 		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
    646      0      stevel 		ill->ill_icmp6_mib = NULL;
    647      0      stevel 	}
    648      0      stevel }
    649      0      stevel 
    650      0      stevel /*
    651      0      stevel  * Concatenate together a physical address and a sap.
    652      0      stevel  *
    653      0      stevel  * Sap_lengths are interpreted as follows:
    654      0      stevel  *   sap_length == 0	==>	no sap
    655      0      stevel  *   sap_length > 0	==>	sap is at the head of the dlpi address
    656      0      stevel  *   sap_length < 0	==>	sap is at the tail of the dlpi address
    657      0      stevel  */
    658      0      stevel static void
    659      0      stevel ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    660      0      stevel     t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
    661      0      stevel {
    662      0      stevel 	uint16_t sap_addr = (uint16_t)sap_src;
    663      0      stevel 
    664      0      stevel 	if (sap_length == 0) {
    665      0      stevel 		if (phys_src == NULL)
    666      0      stevel 			bzero(dst, phys_length);
    667      0      stevel 		else
    668      0      stevel 			bcopy(phys_src, dst, phys_length);
    669      0      stevel 	} else if (sap_length < 0) {
    670      0      stevel 		if (phys_src == NULL)
    671      0      stevel 			bzero(dst, phys_length);
    672      0      stevel 		else
    673      0      stevel 			bcopy(phys_src, dst, phys_length);
    674      0      stevel 		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    675      0      stevel 	} else {
    676      0      stevel 		bcopy(&sap_addr, dst, sizeof (sap_addr));
    677      0      stevel 		if (phys_src == NULL)
    678      0      stevel 			bzero((char *)dst + sap_length, phys_length);
    679      0      stevel 		else
    680      0      stevel 			bcopy(phys_src, (char *)dst + sap_length, phys_length);
    681      0      stevel 	}
    682      0      stevel }
    683      0      stevel 
    684      0      stevel /*
    685      0      stevel  * Generate a dl_unitdata_req mblk for the device and address given.
    686      0      stevel  * addr_length is the length of the physical portion of the address.
    687      0      stevel  * If addr is NULL include an all zero address of the specified length.
    688      0      stevel  * TRUE? In any case, addr_length is taken to be the entire length of the
    689      0      stevel  * dlpi address, including the absolute value of sap_length.
    690      0      stevel  */
    691      0      stevel mblk_t *
    692      0      stevel ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    693      0      stevel 		t_scalar_t sap_length)
    694      0      stevel {
    695      0      stevel 	dl_unitdata_req_t *dlur;
    696      0      stevel 	mblk_t	*mp;
    697      0      stevel 	t_scalar_t	abs_sap_length;		/* absolute value */
    698      0      stevel 
    699      0      stevel 	abs_sap_length = ABS(sap_length);
    700      0      stevel 	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
    701   4459      kcpoon 	    DL_UNITDATA_REQ);
    702      0      stevel 	if (mp == NULL)
    703      0      stevel 		return (NULL);
    704      0      stevel 	dlur = (dl_unitdata_req_t *)mp->b_rptr;
    705      0      stevel 	/* HACK: accomodate incompatible DLPI drivers */
    706      0      stevel 	if (addr_length == 8)
    707      0      stevel 		addr_length = 6;
    708      0      stevel 	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    709      0      stevel 	dlur->dl_dest_addr_offset = sizeof (*dlur);
    710      0      stevel 	dlur->dl_priority.dl_min = 0;
    711      0      stevel 	dlur->dl_priority.dl_max = 0;
    712      0      stevel 	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
    713      0      stevel 	    (uchar_t *)&dlur[1]);
    714      0      stevel 	return (mp);
    715      0      stevel }
    716      0      stevel 
    717      0      stevel /*
    718      0      stevel  * Add the pending mp to the list. There can be only 1 pending mp
    719      0      stevel  * in the list. Any exclusive ioctl that needs to wait for a response
    720      0      stevel  * from another module or driver needs to use this function to set
    721   8485       Peter  * the ipx_pending_mp to the ioctl mblk and wait for the response from
    722      0      stevel  * the other module/driver. This is also used while waiting for the
    723      0      stevel  * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
    724      0      stevel  */
    725      0      stevel boolean_t
    726      0      stevel ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    727      0      stevel     int waitfor)
    728      0      stevel {
    729   8485       Peter 	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
    730      0      stevel 
    731      0      stevel 	ASSERT(IAM_WRITER_IPIF(ipif));
    732      0      stevel 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    733      0      stevel 	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    734   8485       Peter 	ASSERT(ipx->ipx_pending_mp == NULL);
    735   3340        meem 	/*
    736   3340        meem 	 * The caller may be using a different ipif than the one passed into
    737   3340        meem 	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
    738   3340        meem 	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
    739   8485       Peter 	 * that `ipx_current_ipif == ipif'.
    740   8485       Peter 	 */
    741   8485       Peter 	ASSERT(ipx->ipx_current_ipif != NULL);
    742   3340        meem 
    743      0      stevel 	/*
    744  10616   Sebastien 	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
    745  10616   Sebastien 	 * driver.
    746  10616   Sebastien 	 */
    747  10616   Sebastien 	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
    748  10616   Sebastien 	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
    749  10616   Sebastien 	    (DB_TYPE(add_mp) == M_PCPROTO));
    750   3340        meem 
    751      0      stevel 	if (connp != NULL) {
    752      0      stevel 		ASSERT(MUTEX_HELD(&connp->conn_lock));
    753      0      stevel 		/*
    754      0      stevel 		 * Return error if the conn has started closing. The conn
    755      0      stevel 		 * could have finished cleaning up the pending mp list,
    756      0      stevel 		 * If so we should not add another mp to the list negating
    757      0      stevel 		 * the cleanup.
    758      0      stevel 		 */
    759      0      stevel 		if (connp->conn_state_flags & CONN_CLOSING)
    760      0      stevel 			return (B_FALSE);
    761      0      stevel 	}
    762   8485       Peter 	mutex_enter(&ipx->ipx_lock);
    763   8485       Peter 	ipx->ipx_pending_ipif = ipif;
    764      0      stevel 	/*
    765      0      stevel 	 * Note down the queue in b_queue. This will be returned by
    766      0      stevel 	 * ipsq_pending_mp_get. Caller will then use these values to restart
    767      0      stevel 	 * the processing
    768      0      stevel 	 */
    769      0      stevel 	add_mp->b_next = NULL;
    770      0      stevel 	add_mp->b_queue = q;
    771   8485       Peter 	ipx->ipx_pending_mp = add_mp;
    772   8485       Peter 	ipx->ipx_waitfor = waitfor;
    773   8485       Peter 	mutex_exit(&ipx->ipx_lock);
    774   3340        meem 
    775      0      stevel 	if (connp != NULL)
    776      0      stevel 		connp->conn_oper_pending_ill = ipif->ipif_ill;
    777   8485       Peter 
    778   8485       Peter 	return (B_TRUE);
    779   8485       Peter }
    780   8485       Peter 
    781   8485       Peter /*
    782   8485       Peter  * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
    783      0      stevel  * queued in the list.
    784      0      stevel  */
    785      0      stevel mblk_t *
    786      0      stevel ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
    787      0      stevel {
    788      0      stevel 	mblk_t	*curr = NULL;
    789   8485       Peter 	ipxop_t	*ipx = ipsq->ipsq_xop;
    790   8485       Peter 
    791      0      stevel 	*connpp = NULL;
    792   8485       Peter 	mutex_enter(&ipx->ipx_lock);
    793   8485       Peter 	if (ipx->ipx_pending_mp == NULL) {
    794   8485       Peter 		mutex_exit(&ipx->ipx_lock);
    795      0      stevel 		return (NULL);
    796      0      stevel 	}
    797      0      stevel 
    798      0      stevel 	/* There can be only 1 such excl message */
    799   8485       Peter 	curr = ipx->ipx_pending_mp;
    800   8485       Peter 	ASSERT(curr->b_next == NULL);
    801   8485       Peter 	ipx->ipx_pending_ipif = NULL;
    802   8485       Peter 	ipx->ipx_pending_mp = NULL;
    803   8485       Peter 	ipx->ipx_waitfor = 0;
    804   8485       Peter 	mutex_exit(&ipx->ipx_lock);
    805      0      stevel 
    806      0      stevel 	if (CONN_Q(curr->b_queue)) {
    807      0      stevel 		/*
    808      0      stevel 		 * This mp did a refhold on the conn, at the start of the ioctl.
    809      0      stevel 		 * So we can safely return a pointer to the conn to the caller.
    810      0      stevel 		 */
    811      0      stevel 		*connpp = Q_TO_CONN(curr->b_queue);
    812      0      stevel 	} else {
    813      0      stevel 		*connpp = NULL;
    814      0      stevel 	}
    815      0      stevel 	curr->b_next = NULL;
    816      0      stevel 	curr->b_prev = NULL;
    817      0      stevel 	return (curr);
    818      0      stevel }
    819      0      stevel 
    820      0      stevel /*
    821   8485       Peter  * Cleanup the ioctl mp queued in ipx_pending_mp
    822      0      stevel  * - Called in the ill_delete path
    823      0      stevel  * - Called in the M_ERROR or M_HANGUP path on the ill.
    824      0      stevel  * - Called in the conn close path.
    825      0      stevel  */
    826      0      stevel boolean_t
    827      0      stevel ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
    828      0      stevel {
    829      0      stevel 	mblk_t	*mp;
    830   8485       Peter 	ipxop_t	*ipx;
    831      0      stevel 	queue_t	*q;
    832      0      stevel 	ipif_t	*ipif;
    833  11042        Erik 	int	cmd;
    834      0      stevel 
    835      0      stevel 	ASSERT(IAM_WRITER_ILL(ill));
    836   8485       Peter 	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
    837   8485       Peter 
    838   8485       Peter 	/*
    839   8485       Peter 	 * If connp is null, unconditionally clean up the ipx_pending_mp.
    840      0      stevel 	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
    841      0      stevel 	 * even if it is meant for another ill, since we have to enqueue
    842   8485       Peter 	 * a new mp now in ipx_pending_mp to complete the ipif_down.
    843      0      stevel 	 * If connp is non-null we are called from the conn close path.
    844      0      stevel 	 */
    845   8485       Peter 	mutex_enter(&ipx->ipx_lock);
    846   8485       Peter 	mp = ipx->ipx_pending_mp;
    847      0      stevel 	if (mp == NULL || (connp != NULL &&
    848      0      stevel 	    mp->b_queue != CONNP_TO_WQ(connp))) {
    849   8485       Peter 		mutex_exit(&ipx->ipx_lock);
    850   8485       Peter 		return (B_FALSE);
    851   8485       Peter 	}
    852   8485       Peter 	/* Now remove from the ipx_pending_mp */
    853   8485       Peter 	ipx->ipx_pending_mp = NULL;
    854      0      stevel 	q = mp->b_queue;
    855      0      stevel 	mp->b_next = NULL;
    856      0      stevel 	mp->b_prev = NULL;
    857      0      stevel 	mp->b_queue = NULL;
    858      0      stevel 
    859   8485       Peter 	ipif = ipx->ipx_pending_ipif;
    860   8485       Peter 	ipx->ipx_pending_ipif = NULL;
    861   8485       Peter 	ipx->ipx_waitfor = 0;
    862   8485       Peter 	ipx->ipx_current_ipif = NULL;
    863  11042        Erik 	cmd = ipx->ipx_current_ioctl;
    864   8485       Peter 	ipx->ipx_current_ioctl = 0;
    865   8485       Peter 	ipx->ipx_current_done = B_TRUE;
    866   8485       Peter 	mutex_exit(&ipx->ipx_lock);
    867      0      stevel 
    868      0      stevel 	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
    869  11042        Erik 		DTRACE_PROBE4(ipif__ioctl,
    870  11042        Erik 		    char *, "ipsq_pending_mp_cleanup",
    871  11042        Erik 		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
    872  11042        Erik 		    ipif_t *, ipif);
    873   3340        meem 		if (connp == NULL) {
    874   3340        meem 			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
    875   3340        meem 		} else {
    876   3340        meem 			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
    877   3340        meem 			mutex_enter(&ipif->ipif_ill->ill_lock);
    878   3340        meem 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
    879   3340        meem 			mutex_exit(&ipif->ipif_ill->ill_lock);
    880   3340        meem 		}
    881      0      stevel 	} else {
    882      0      stevel 		/*
    883      0      stevel 		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
    884    741    masputra 		 * be just inet_freemsg. we have to restart it
    885      0      stevel 		 * otherwise the thread will be stuck.
    886      0      stevel 		 */
    887    741    masputra 		inet_freemsg(mp);
    888      0      stevel 	}
    889      0      stevel 	return (B_TRUE);
    890      0      stevel }
    891      0      stevel 
    892      0      stevel /*
    893      0      stevel  * Called in the conn close path and ill delete path
    894      0      stevel  */
    895      0      stevel static void
    896      0      stevel ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
    897      0      stevel {
    898      0      stevel 	ipsq_t	*ipsq;
    899      0      stevel 	mblk_t	*prev;
    900      0      stevel 	mblk_t	*curr;
    901      0      stevel 	mblk_t	*next;
    902      0      stevel 	queue_t	*q;
    903      0      stevel 	mblk_t	*tmp_list = NULL;
    904      0      stevel 
    905      0      stevel 	ASSERT(IAM_WRITER_ILL(ill));
    906      0      stevel 	if (connp != NULL)
    907      0      stevel 		q = CONNP_TO_WQ(connp);
    908      0      stevel 	else
    909      0      stevel 		q = ill->ill_wq;
    910      0      stevel 
    911      0      stevel 	ipsq = ill->ill_phyint->phyint_ipsq;
    912      0      stevel 	/*
    913      0      stevel 	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
    914      0      stevel 	 * In the case of ioctl from a conn, there can be only 1 mp
    915      0      stevel 	 * queued on the ipsq. If an ill is being unplumbed, only messages
    916      0      stevel 	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
    917      0      stevel 	 * ioctls meant for this ill form conn's are not flushed. They will
    918      0      stevel 	 * be processed during ipsq_exit and will not find the ill and will
    919      0      stevel 	 * return error.
    920      0      stevel 	 */
    921      0      stevel 	mutex_enter(&ipsq->ipsq_lock);
    922      0      stevel 	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
    923      0      stevel 	    curr = next) {
    924      0      stevel 		next = curr->b_next;
    925      0      stevel 		if (curr->b_queue == q || curr->b_queue == RD(q)) {
    926      0      stevel 			/* Unlink the mblk from the pending mp list */
    927      0      stevel 			if (prev != NULL) {
    928      0      stevel 				prev->b_next = curr->b_next;
    929      0      stevel 			} else {
    930      0      stevel 				ASSERT(ipsq->ipsq_xopq_mphead == curr);
    931      0      stevel 				ipsq->ipsq_xopq_mphead = curr->b_next;
    932      0      stevel 			}
    933      0      stevel 			if (ipsq->ipsq_xopq_mptail == curr)
    934      0      stevel 				ipsq->ipsq_xopq_mptail = prev;
    935      0      stevel 			/*
    936      0      stevel 			 * Create a temporary list and release the ipsq lock
    937      0      stevel 			 * New elements are added to the head of the tmp_list
    938      0      stevel 			 */
    939      0      stevel 			curr->b_next = tmp_list;
    940      0      stevel 			tmp_list = curr;
    941      0      stevel 		} else {
    942      0      stevel 			prev = curr;
    943      0      stevel 		}
    944      0      stevel 	}
    945      0      stevel 	mutex_exit(&ipsq->ipsq_lock);
    946      0      stevel 
    947      0      stevel 	while (tmp_list != NULL) {
    948      0      stevel 		curr = tmp_list;
    949      0      stevel 		tmp_list = curr->b_next;
    950      0      stevel 		curr->b_next = NULL;
    951      0      stevel 		curr->b_prev = NULL;
    952      0      stevel 		curr->b_queue = NULL;
    953      0      stevel 		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
    954  11042        Erik 			DTRACE_PROBE4(ipif__ioctl,
    955  11042        Erik 			    char *, "ipsq_xopq_mp_cleanup",
    956  11042        Erik 			    int, 0, ill_t *, NULL, ipif_t *, NULL);
    957      0      stevel 			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
    958   3340        meem 			    CONN_CLOSE : NO_COPYOUT, NULL);
    959      0      stevel 		} else {
    960      0      stevel 			/*
    961      0      stevel 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
    962    741    masputra 			 * this can't be just inet_freemsg. we have to
    963      0      stevel 			 * restart it otherwise the thread will be stuck.
    964      0      stevel 			 */
    965    741    masputra 			inet_freemsg(curr);
    966      0      stevel 		}
    967      0      stevel 	}
    968      0      stevel }
    969      0      stevel 
    970      0      stevel /*
    971      0      stevel  * This conn has started closing. Cleanup any pending ioctl from this conn.
    972      0      stevel  * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
    973      0      stevel  */
    974      0      stevel void
    975      0      stevel conn_ioctl_cleanup(conn_t *connp)
    976      0      stevel {
    977      0      stevel 	ipsq_t	*ipsq;
    978      0      stevel 	ill_t	*ill;
    979      0      stevel 	boolean_t refheld;
    980      0      stevel 
    981      0      stevel 	/*
    982      0      stevel 	 * Is any exclusive ioctl pending ? If so clean it up. If the
    983      0      stevel 	 * ioctl has not yet started, the mp is pending in the list headed by
    984      0      stevel 	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
    985   8485       Peter 	 * ipx_pending_mp. If the ioctl timed out in the streamhead but
    986      0      stevel 	 * is currently executing now the mp is not queued anywhere but
    987      0      stevel 	 * conn_oper_pending_ill is null. The conn close will wait
    988      0      stevel 	 * till the conn_ref drops to zero.
    989      0      stevel 	 */
    990      0      stevel 	mutex_enter(&connp->conn_lock);
    991      0      stevel 	ill = connp->conn_oper_pending_ill;
    992      0      stevel 	if (ill == NULL) {
    993      0      stevel 		mutex_exit(&connp->conn_lock);
    994      0      stevel 		return;
    995      0      stevel 	}
    996      0      stevel 
    997      0      stevel 	/*
    998      0      stevel 	 * We may not be able to refhold the ill if the ill/ipif
    999      0      stevel 	 * is changing. But we need to make sure that the ill will
   1000      0      stevel 	 * not vanish. So we just bump up the ill_waiter count.
   1001      0      stevel 	 */
   1002      0      stevel 	refheld = ill_waiter_inc(ill);
   1003      0      stevel 	mutex_exit(&connp->conn_lock);
   1004      0      stevel 	if (refheld) {
   1005   8275        Eric 		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
   1006      0      stevel 			ill_waiter_dcr(ill);
   1007      0      stevel 			/*
   1008      0      stevel 			 * Check whether this ioctl has started and is
   1009   8485       Peter 			 * pending. If it is not found there then check
   1010   8485       Peter 			 * whether this ioctl has not even started and is in
   1011   8485       Peter 			 * the ipsq_xopq list.
   1012      0      stevel 			 */
   1013      0      stevel 			if (!ipsq_pending_mp_cleanup(ill, connp))
   1014      0      stevel 				ipsq_xopq_mp_cleanup(ill, connp);
   1015      0      stevel 			ipsq = ill->ill_phyint->phyint_ipsq;
   1016   7098        meem 			ipsq_exit(ipsq);
   1017      0      stevel 			return;
   1018      0      stevel 		}
   1019      0      stevel 	}
   1020      0      stevel 
   1021      0      stevel 	/*
   1022      0      stevel 	 * The ill is also closing and we could not bump up the
   1023      0      stevel 	 * ill_waiter_count or we could not enter the ipsq. Leave
   1024      0      stevel 	 * the cleanup to ill_delete
   1025      0      stevel 	 */
   1026      0      stevel 	mutex_enter(&connp->conn_lock);
   1027      0      stevel 	while (connp->conn_oper_pending_ill != NULL)
   1028      0      stevel 		cv_wait(&connp->conn_refcv, &connp->conn_lock);
   1029      0      stevel 	mutex_exit(&connp->conn_lock);
   1030      0      stevel 	if (refheld)
   1031      0      stevel 		ill_waiter_dcr(ill);
   1032      0      stevel }
   1033      0      stevel 
   1034      0      stevel /*
   1035      0      stevel  * ipcl_walk function for cleaning up conn_*_ill fields.
   1036  11042        Erik  * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
   1037  11042        Erik  * conn_bound_if in place. We prefer dropping
   1038  11042        Erik  * packets instead of sending them out the wrong interface, or accepting
   1039  11042        Erik  * packets from the wrong ifindex.
   1040      0      stevel  */
   1041      0      stevel static void
   1042      0      stevel conn_cleanup_ill(conn_t *connp, caddr_t arg)
   1043      0      stevel {
   1044      0      stevel 	ill_t	*ill = (ill_t *)arg;
   1045      0      stevel 
   1046      0      stevel 	mutex_enter(&connp->conn_lock);
   1047   5381        meem 	if (connp->conn_dhcpinit_ill == ill) {
   1048   5381        meem 		connp->conn_dhcpinit_ill = NULL;
   1049   5381        meem 		ASSERT(ill->ill_dhcpinit != 0);
   1050   5381        meem 		atomic_dec_32(&ill->ill_dhcpinit);
   1051  11042        Erik 		ill_set_inputfn(ill);
   1052      0      stevel 	}
   1053      0      stevel 	mutex_exit(&connp->conn_lock);
   1054      0      stevel }
   1055      0      stevel 
   1056  11042        Erik static int
   1057   9073       Cathy ill_down_ipifs_tail(ill_t *ill)
   1058   9073       Cathy {
   1059   9073       Cathy 	ipif_t	*ipif;
   1060  11042        Erik 	int err;
   1061   9073       Cathy 
   1062   9073       Cathy 	ASSERT(IAM_WRITER_ILL(ill));
   1063   2546    carlsonj 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   1064   2546    carlsonj 		ipif_non_duplicate(ipif);
   1065  11042        Erik 		/*
   1066  11042        Erik 		 * ipif_down_tail will call arp_ll_down on the last ipif
   1067  11042        Erik 		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
   1068  11042        Erik 		 */
   1069  11042        Erik 		if ((err = ipif_down_tail(ipif)) != 0)
   1070  11042        Erik 			return (err);
   1071  11042        Erik 	}
   1072  11042        Erik 	return (0);
   1073   9073       Cathy }
   1074   9073       Cathy 
   1075   9073       Cathy /* ARGSUSED */
   1076   9073       Cathy void
   1077   9073       Cathy ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
   1078   9073       Cathy {
   1079   9073       Cathy 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   1080  11042        Erik 	(void) ill_down_ipifs_tail(q->q_ptr);
   1081      0      stevel 	freemsg(mp);
   1082   3340        meem 	ipsq_current_finish(ipsq);
   1083      0      stevel }
   1084      0      stevel 
   1085      0      stevel /*
   1086      0      stevel  * ill_down_start is called when we want to down this ill and bring it up again
   1087      0      stevel  * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
   1088      0      stevel  * all interfaces, but don't tear down any plumbing.
   1089      0      stevel  */
   1090      0      stevel boolean_t
   1091      0      stevel ill_down_start(queue_t *q, mblk_t *mp)
   1092      0      stevel {
   1093   3340        meem 	ill_t	*ill = q->q_ptr;
   1094   3340        meem 	ipif_t	*ipif;
   1095      0      stevel 
   1096      0      stevel 	ASSERT(IAM_WRITER_ILL(ill));
   1097  11042        Erik 	mutex_enter(&ill->ill_lock);
   1098  11042        Erik 	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
   1099  11042        Erik 	/* no more nce addition allowed */
   1100  11042        Erik 	mutex_exit(&ill->ill_lock);
   1101      0      stevel 
   1102      0      stevel 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
   1103      0      stevel 		(void) ipif_down(ipif, NULL, NULL);
   1104      0      stevel 
   1105      0      stevel 	ill_down(ill);
   1106  11042        Erik 
   1107  11042        Erik 	/*
   1108  11042        Erik 	 * Walk all CONNs that can have a reference on an ire or nce for this
   1109  11042        Erik 	 * ill (we actually walk all that now have stale references).
   1110  11042        Erik 	 */
   1111  11042        Erik 	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);
   1112  11042        Erik 
   1113  11042        Erik 	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
   1114  11042        Erik 	if (ill->ill_isv6)
   1115  11042        Erik 		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);
   1116  11042        Erik 
   1117      0      stevel 
   1118      0      stevel 	(void) ipsq_pending_mp_cleanup(ill, NULL);
   1119   3340        meem 
   1120   3340        meem 	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
   1121   3340        meem 
   1122   3340        meem 	/*
   1123   3340        meem 	 * Atomically test and add the pending mp if references are active.
   1124   3340        meem 	 */
   1125   3340        meem 	mutex_enter(&ill->ill_lock);
   1126      0      stevel 	if (!ill_is_quiescent(ill)) {
   1127   3340        meem 		/* call cannot fail since `conn_t *' argument is NULL */
   1128      0      stevel 		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
   1129      0      stevel 		    mp, ILL_DOWN);
   1130      0      stevel 		mutex_exit(&ill->ill_lock);
   1131      0      stevel 		return (B_FALSE);
   1132      0      stevel 	}
   1133      0      stevel 	mutex_exit(&ill->ill_lock);
   1134      0      stevel 	return (B_TRUE);
   1135      0      stevel }
   1136      0      stevel 
   1137      0      stevel static void
   1138      0      stevel ill_down(ill_t *ill)
   1139      0      stevel {
   1140  11042        Erik 	mblk_t	*mp;
   1141  11042        Erik 	ip_stack_t	*ipst = ill->ill_ipst;
   1142  11042        Erik 
   1143  11042        Erik 	/*
   1144  11042        Erik 	 * Blow off any IREs dependent on this ILL.
   1145  11042        Erik 	 * The caller needs to handle conn_ixa_cleanup
   1146  11042        Erik 	 */
   1147  11042        Erik 	ill_delete_ires(ill);
   1148  11042        Erik 
   1149  11042        Erik 	ire_walk_ill(0, 0, ill_downi, ill, ill);
   1150   3448    dh155122 
   1151      0      stevel 	/* Remove any conn_*_ill depending on this ill */
   1152   3448    dh155122 	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
   1153  11042        Erik 
   1154  11042        Erik 	/*
   1155  11042        Erik 	 * Free state for additional IREs.
   1156  11042        Erik 	 */
   1157  11042        Erik 	mutex_enter(&ill->ill_saved_ire_lock);
   1158  11042        Erik 	mp = ill->ill_saved_ire_mp;
   1159  11042        Erik 	ill->ill_saved_ire_mp = NULL;
   1160  11042        Erik 	ill->ill_saved_ire_cnt = 0;
   1161  11042        Erik 	mutex_exit(&ill->ill_saved_ire_lock);
   1162  11042        Erik 	freemsg(mp);
   1163  11042        Erik }
   1164  11042        Erik 
   1165  11042        Erik /*
   1166  11042        Erik  * ire_walk routine used to delete every IRE that depends on
   1167  11042        Erik  * 'ill'.  (Always called as writer.)
   1168  11042        Erik  *
   1169  11042        Erik  * Note: since the routes added by the kernel are deleted separately,
   1170  11042        Erik  * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
   1171  11042        Erik  *
   1172  11042        Erik  * We also remove references on ire_nce_cache entries that refer to the ill.
   1173  11042        Erik  */
   1174  11042        Erik void
   1175      0      stevel ill_downi(ire_t *ire, char *ill_arg)
   1176      0      stevel {
   1177      0      stevel 	ill_t	*ill = (ill_t *)ill_arg;
   1178  11042        Erik 	nce_t	*nce;
   1179  11042        Erik 
   1180  11042        Erik 	mutex_enter(&ire->ire_lock);
   1181  11042        Erik 	nce = ire->ire_nce_cache;
   1182  11042        Erik 	if (nce != NULL && nce->nce_ill == ill)
   1183  11042        Erik 		ire->ire_nce_cache = NULL;
   1184  11042        Erik 	else
   1185  11042        Erik 		nce = NULL;
   1186  11042        Erik 	mutex_exit(&ire->ire_lock);
   1187  11042        Erik 	if (nce != NULL)
   1188  11042        Erik 		nce_refrele(nce);
   1189  11042        Erik 	if (ire->ire_ill == ill)
   1190      0      stevel 		ire_delete(ire);
   1191  11042        Erik }
   1192  11042        Erik 
   1193  11042        Erik /* Remove IRE_IF_CLONE on this ill */
   1194  11042        Erik void
   1195  11042        Erik ill_downi_if_clone(ire_t *ire, char *ill_arg)
   1196  11042        Erik {
   1197  11042        Erik 	ill_t	*ill = (ill_t *)ill_arg;
   1198  11042        Erik 
   1199  11042        Erik 	ASSERT(ire->ire_type & IRE_IF_CLONE);
   1200  11042        Erik 	if (ire->ire_ill == ill)
   1201  11042        Erik 		ire_delete(ire);
   1202      0      stevel }
   1203      0      stevel 
   1204      0      stevel /* Consume an M_IOCACK of the fastpath probe. */
   1205      0      stevel void
   1206      0      stevel ill_fastpath_ack(ill_t *ill, mblk_t *mp)
   1207      0      stevel {
   1208      0      stevel 	mblk_t	*mp1 = mp;
   1209      0      stevel 
   1210      0      stevel 	/*
   1211      0      stevel 	 * If this was the first attempt turn on the fastpath probing.
   1212      0      stevel 	 */
   1213      0      stevel 	mutex_enter(&ill->ill_lock);
   1214   2893     ja97890 	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
   1215   2893     ja97890 		ill->ill_dlpi_fastpath_state = IDS_OK;
   1216      0      stevel 	mutex_exit(&ill->ill_lock);
   1217      0      stevel 
   1218      0      stevel 	/* Free the M_IOCACK mblk, hold on to the data */
   1219      0      stevel 	mp = mp->b_cont;
   1220      0      stevel 	freeb(mp1);
   1221      0      stevel 	if (mp == NULL)
   1222      0      stevel 		return;
   1223  11042        Erik 	if (mp->b_cont != NULL)
   1224  11042        Erik 		nce_fastpath_update(ill, mp);
   1225  11042        Erik 	else
   1226      0      stevel 		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
   1227  11042        Erik 	freemsg(mp);
   1228      0      stevel }
   1229      0      stevel 
   1230      0      stevel /*
   1231      0      stevel  * Throw an M_IOCTL message downstream asking "do you know fastpath?"
   1232      0      stevel  * The data portion of the request is a dl_unitdata_req_t template for
   1233      0      stevel  * what we would send downstream in the absence of a fastpath confirmation.
   1234      0      stevel  */
   1235      0      stevel int
   1236      0      stevel ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
   1237      0      stevel {
   1238      0      stevel 	struct iocblk	*ioc;
   1239      0      stevel 	mblk_t	*mp;
   1240      0      stevel 
   1241      0      stevel 	if (dlur_mp == NULL)
   1242      0      stevel 		return (EINVAL);
   1243      0      stevel 
   1244      0      stevel 	mutex_enter(&ill->ill_lock);
   1245      0      stevel 	switch (ill->ill_dlpi_fastpath_state) {
   1246   2893     ja97890 	case IDS_FAILED:
   1247      0      stevel 		/*
   1248      0      stevel 		 * Driver NAKed the first fastpath ioctl - assume it doesn't
   1249      0      stevel 		 * support it.
   1250      0      stevel 		 */
   1251      0      stevel 		mutex_exit(&ill->ill_lock);
   1252      0      stevel 		return (ENOTSUP);
   1253   2893     ja97890 	case IDS_UNKNOWN:
   1254      0      stevel 		/* This is the first probe */
   1255   2893     ja97890 		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
   1256      0      stevel 		break;
   1257      0      stevel 	default:
   1258      0      stevel 		break;
   1259      0      stevel 	}
   1260      0      stevel 	mutex_exit(&ill->ill_lock);
   1261      0      stevel 
   1262      0      stevel 	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
   1263      0      stevel 		return (EAGAIN);
   1264      0      stevel 
   1265      0      stevel 	mp->b_cont = copyb(dlur_mp);
   1266      0      stevel 	if (mp->b_cont == NULL) {
   1267      0      stevel 		freeb(mp);
   1268      0      stevel 		return (EAGAIN);
   1269      0      stevel 	}
   1270      0      stevel 
   1271      0      stevel 	ioc = (struct iocblk *)mp->b_rptr;
   1272      0      stevel 	ioc->ioc_count = msgdsize(mp->b_cont);
   1273      0      stevel 
   1274  11042        Erik 	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
   1275  11042        Erik 	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
   1276      0      stevel 	putnext(ill->ill_wq, mp);
   1277      0      stevel 	return (0);
   1278      0      stevel }
   1279      0      stevel 
   1280      0      stevel void
   1281      0      stevel ill_capability_probe(ill_t *ill)
   1282      0      stevel {
   1283   8275        Eric 	mblk_t	*mp;
   1284   8275        Eric 
   1285   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   1286   8275        Eric 
   1287   8275        Eric 	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
   1288   8275        Eric 	    ill->ill_dlpi_capab_state != IDCS_FAILED)
   1289   8275        Eric 		return;
   1290   8275        Eric 
   1291   8275        Eric 	/*
   1292   8275        Eric 	 * We are starting a new cycle of capability negotiation.
   1293   8275        Eric 	 * Free up the capab reset messages of any previous incarnation.
   1294   8275        Eric 	 * We will do a fresh allocation when we get the response to our probe
   1295   8275        Eric 	 */
   1296   8275        Eric 	if (ill->ill_capab_reset_mp != NULL) {
   1297   8275        Eric 		freemsg(ill->ill_capab_reset_mp);
   1298   8275        Eric 		ill->ill_capab_reset_mp = NULL;
   1299   8275        Eric 	}
   1300   8275        Eric 
   1301      0      stevel 	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
   1302   8275        Eric 
   1303   8275        Eric 	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
   1304   8275        Eric 	if (mp == NULL)
   1305   8275        Eric 		return;
   1306   8275        Eric 
   1307   8275        Eric 	ill_capability_send(ill, mp);
   1308   8275        Eric 	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
   1309   8275        Eric }
   1310   8275        Eric 
   1311   8275        Eric void
   1312   8275        Eric ill_capability_reset(ill_t *ill, boolean_t reneg)
   1313   8275        Eric {
   1314   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   1315   8275        Eric 
   1316   8275        Eric 	if (ill->ill_dlpi_capab_state != IDCS_OK)
   1317   8275        Eric 		return;
   1318   8275        Eric 
   1319   8275        Eric 	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
   1320   8275        Eric 
   1321   8275        Eric 	ill_capability_send(ill, ill->ill_capab_reset_mp);
   1322   8275        Eric 	ill->ill_capab_reset_mp = NULL;
   1323   8275        Eric 	/*
   1324   8275        Eric 	 * We turn off all capabilities except those pertaining to
   1325   8275        Eric 	 * direct function call capabilities viz. ILL_CAPAB_DLD*
   1326   8275        Eric 	 * which will be turned off by the corresponding reset functions.
   1327   8275        Eric 	 */
   1328  11042        Erik 	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM  | ILL_CAPAB_ZEROCOPY);
   1329   8275        Eric }
   1330   8275        Eric 
   1331   8275        Eric static void
   1332   8275        Eric ill_capability_reset_alloc(ill_t *ill)
   1333      0      stevel {
   1334      0      stevel 	mblk_t *mp;
   1335   8275        Eric 	size_t	size = 0;
   1336   8275        Eric 	int	err;
   1337   8275        Eric 	dl_capability_req_t	*capb;
   1338   8275        Eric 
   1339   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   1340   8275        Eric 	ASSERT(ill->ill_capab_reset_mp == NULL);
   1341   8275        Eric 
   1342   8275        Eric 	if (ILL_HCKSUM_CAPABLE(ill)) {
   1343   8275        Eric 		size += sizeof (dl_capability_sub_t) +
   1344   8275        Eric 		    sizeof (dl_capab_hcksum_t);
   1345   8275        Eric 	}
   1346   8275        Eric 
   1347   8275        Eric 	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
   1348   8275        Eric 		size += sizeof (dl_capability_sub_t) +
   1349   8275        Eric 		    sizeof (dl_capab_zerocopy_t);
   1350   8275        Eric 	}
   1351   8275        Eric 
   1352   8275        Eric 	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
   1353   8275        Eric 		size += sizeof (dl_capability_sub_t) +
   1354   8275        Eric 		    sizeof (dl_capab_dld_t);
   1355   8275        Eric 	}
   1356   8275        Eric 
   1357   8275        Eric 	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
   1358   8275        Eric 	    STR_NOSIG, &err);
   1359   8275        Eric 
   1360   8275        Eric 	mp->b_datap->db_type = M_PROTO;
   1361   8275        Eric 	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
   1362   8275        Eric 
   1363   8275        Eric 	capb = (dl_capability_req_t *)mp->b_rptr;
   1364   8275        Eric 	capb->dl_primitive = DL_CAPABILITY_REQ;
   1365   8275        Eric 	capb->dl_sub_offset = sizeof (dl_capability_req_t);
   1366   8275        Eric 	capb->dl_sub_length = size;
   1367   8275        Eric 
   1368   8275        Eric 	mp->b_wptr += sizeof (dl_capability_req_t);
   1369   8275        Eric 
   1370   8275        Eric 	/*
   1371   8275        Eric 	 * Each handler fills in the corresponding dl_capability_sub_t
   1372   8275        Eric 	 * inside the mblk,
   1373   8275        Eric 	 */
   1374   8275        Eric 	ill_capability_hcksum_reset_fill(ill, mp);
   1375   8275        Eric 	ill_capability_zerocopy_reset_fill(ill, mp);
   1376   8275        Eric 	ill_capability_dld_reset_fill(ill, mp);
   1377   8275        Eric 
   1378   8275        Eric 	ill->ill_capab_reset_mp = mp;
   1379      0      stevel }
   1380      0      stevel 
   1381      0      stevel static void
   1382      0      stevel ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
   1383      0      stevel {
   1384      0      stevel 	dl_capab_id_t *id_ic;
   1385      0      stevel 	uint_t sub_dl_cap = outers->dl_cap;
   1386      0      stevel 	dl_capability_sub_t *inners;
   1387      0      stevel 	uint8_t *capend;
   1388      0      stevel 
   1389      0      stevel 	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
   1390      0      stevel 
   1391      0      stevel 	/*
   1392      0      stevel 	 * Note: range checks here are not absolutely sufficient to
   1393      0      stevel 	 * make us robust against malformed messages sent by drivers;
   1394      0      stevel 	 * this is in keeping with the rest of IP's dlpi handling.
   1395      0      stevel 	 * (Remember, it's coming from something else in the kernel
   1396      0      stevel 	 * address space)
   1397      0      stevel 	 */
   1398      0      stevel 
   1399      0      stevel 	capend = (uint8_t *)(outers + 1) + outers->dl_length;
   1400      0      stevel 	if (capend > mp->b_wptr) {
   1401      0      stevel 		cmn_err(CE_WARN, "ill_capability_id_ack: "
   1402      0      stevel 		    "malformed sub-capability too long for mblk");
   1403      0      stevel 		return;
   1404      0      stevel 	}
   1405      0      stevel 
   1406      0      stevel 	id_ic = (dl_capab_id_t *)(outers + 1);
   1407      0      stevel 
   1408      0      stevel 	if (outers->dl_length < sizeof (*id_ic) ||
   1409      0      stevel 	    (inners = &id_ic->id_subcap,
   1410      0      stevel 	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
   1411      0      stevel 		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
   1412      0      stevel 		    "encapsulated capab type %d too long for mblk",
   1413      0      stevel 		    inners->dl_cap);
   1414      0      stevel 		return;
   1415      0      stevel 	}
   1416      0      stevel 
   1417      0      stevel 	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
   1418      0      stevel 		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
   1419      0      stevel 		    "isn't as expected; pass-thru module(s) detected, "
   1420      0      stevel 		    "discarding capability\n", inners->dl_cap));
   1421      0      stevel 		return;
   1422      0      stevel 	}
   1423      0      stevel 
   1424      0      stevel 	/* Process the encapsulated sub-capability */
   1425  11042        Erik 	ill_capability_dispatch(ill, mp, inners);
   1426   8275        Eric }
   1427   8275        Eric 
   1428   8275        Eric static void
   1429   8275        Eric ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
   1430   8275        Eric {
   1431   8275        Eric 	dl_capability_sub_t *dl_subcap;
   1432   8275        Eric 
   1433   8275        Eric 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   1434   8275        Eric 		return;
   1435   8275        Eric 
   1436   8275        Eric 	/*
   1437   8275        Eric 	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
   1438   8275        Eric 	 * initialized below since it is not used by DLD.
   1439   8275        Eric 	 */
   1440   8275        Eric 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1441   8275        Eric 	dl_subcap->dl_cap = DL_CAPAB_DLD;
   1442   8275        Eric 	dl_subcap->dl_length = sizeof (dl_capab_dld_t);
   1443   8275        Eric 
   1444   8275        Eric 	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
   1445      0      stevel }
   1446      0      stevel 
   1447  11042        Erik static void
   1448  11042        Erik ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
   1449  11042        Erik {
   1450  11076       Cathy 	/*
   1451  11076       Cathy 	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
   1452  11076       Cathy 	 * is only to get the VRRP capability.
   1453  11076       Cathy 	 */
   1454  11076       Cathy 	if (ill->ill_ipif_up_count == 0) {
   1455  11076       Cathy 		if (subp->dl_cap == DL_CAPAB_VRRP)
   1456  11076       Cathy 			ill_capability_vrrp_ack(ill, mp, subp);
   1457  11076       Cathy 		return;
   1458  11076       Cathy 	}
   1459  11076       Cathy 
   1460      0      stevel 	switch (subp->dl_cap) {
   1461      0      stevel 	case DL_CAPAB_HCKSUM:
   1462      0      stevel 		ill_capability_hcksum_ack(ill, mp, subp);
   1463      0      stevel 		break;
   1464      0      stevel 	case DL_CAPAB_ZEROCOPY:
   1465      0      stevel 		ill_capability_zerocopy_ack(ill, mp, subp);
   1466      0      stevel 		break;
   1467   8275        Eric 	case DL_CAPAB_DLD:
   1468   8275        Eric 		ill_capability_dld_ack(ill, mp, subp);
   1469   3115    yl150051 		break;
   1470  11076       Cathy 	case DL_CAPAB_VRRP:
   1471  11076       Cathy 		break;
   1472      0      stevel 	default:
   1473      0      stevel 		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
   1474      0      stevel 		    subp->dl_cap));
   1475  11076       Cathy 	}
   1476  11076       Cathy }
   1477  11076       Cathy 
   1478  11076       Cathy /*
   1479  11076       Cathy  * Process the vrrp capability received from a DLS Provider. isub must point
   1480  11076       Cathy  * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
   1481  11076       Cathy  */
   1482  11076       Cathy static void
   1483  11076       Cathy ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1484  11076       Cathy {
   1485  11076       Cathy 	dl_capab_vrrp_t	*vrrp;
   1486  11076       Cathy 	uint_t		sub_dl_cap = isub->dl_cap;
   1487  11076       Cathy 	uint8_t		*capend;
   1488  11076       Cathy 
   1489  11076       Cathy 	ASSERT(IAM_WRITER_ILL(ill));
   1490  11076       Cathy 	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);
   1491  11076       Cathy 
   1492  11076       Cathy 	/*
   1493  11076       Cathy 	 * Note: range checks here are not absolutely sufficient to
   1494  11076       Cathy 	 * make us robust against malformed messages sent by drivers;
   1495  11076       Cathy 	 * this is in keeping with the rest of IP's dlpi handling.
   1496  11076       Cathy 	 * (Remember, it's coming from something else in the kernel
   1497  11076       Cathy 	 * address space)
   1498  11076       Cathy 	 */
   1499  11076       Cathy 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1500  11076       Cathy 	if (capend > mp->b_wptr) {
   1501  11076       Cathy 		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
   1502  11076       Cathy 		    "malformed sub-capability too long for mblk");
   1503  11076       Cathy 		return;
   1504  11076       Cathy 	}
   1505  11076       Cathy 	vrrp = (dl_capab_vrrp_t *)(isub + 1);
   1506  11076       Cathy 
   1507  11076       Cathy 	/*
   1508  11076       Cathy 	 * Compare the IP address family and set ILLF_VRRP for the right ill.
   1509  11076       Cathy 	 */
   1510  11076       Cathy 	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
   1511  11076       Cathy 	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
   1512  11076       Cathy 		ill->ill_flags |= ILLF_VRRP;
   1513   1184      krgopi 	}
   1514   1184      krgopi }
   1515      0      stevel 
   1516      0      stevel /*
   1517      0      stevel  * Process a hardware checksum offload capability negotiation ack received
   1518      0      stevel  * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
   1519      0      stevel  * of a DL_CAPABILITY_ACK message.
   1520      0      stevel  */
   1521      0      stevel static void
   1522      0      stevel ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1523      0      stevel {
   1524      0      stevel 	dl_capability_req_t	*ocap;
   1525      0      stevel 	dl_capab_hcksum_t	*ihck, *ohck;
   1526      0      stevel 	ill_hcksum_capab_t	**ill_hcksum;
   1527      0      stevel 	mblk_t			*nmp = NULL;
   1528      0      stevel 	uint_t			sub_dl_cap = isub->dl_cap;
   1529      0      stevel 	uint8_t			*capend;
   1530      0      stevel 
   1531      0      stevel 	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
   1532      0      stevel 
   1533      0      stevel 	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
   1534      0      stevel 
   1535      0      stevel 	/*
   1536      0      stevel 	 * Note: range checks here are not absolutely sufficient to
   1537      0      stevel 	 * make us robust against malformed messages sent by drivers;
   1538      0      stevel 	 * this is in keeping with the rest of IP's dlpi handling.
   1539      0      stevel 	 * (Remember, it's coming from something else in the kernel
   1540      0      stevel 	 * address space)
   1541      0      stevel 	 */
   1542      0      stevel 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1543      0      stevel 	if (capend > mp->b_wptr) {
   1544      0      stevel 		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1545      0      stevel 		    "malformed sub-capability too long for mblk");
   1546      0      stevel 		return;
   1547      0      stevel 	}
   1548      0      stevel 
   1549      0      stevel 	/*
   1550      0      stevel 	 * There are two types of acks we process here:
   1551      0      stevel 	 * 1. acks in reply to a (first form) generic capability req
   1552      0      stevel 	 *    (no ENABLE flag set)
   1553      0      stevel 	 * 2. acks in reply to a ENABLE capability req.
   1554      0      stevel 	 *    (ENABLE flag set)
   1555      0      stevel 	 */
   1556      0      stevel 	ihck = (dl_capab_hcksum_t *)(isub + 1);
   1557      0      stevel 
   1558      0      stevel 	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
   1559      0      stevel 		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
   1560      0      stevel 		    "unsupported hardware checksum "
   1561      0      stevel 		    "sub-capability (version %d, expected %d)",
   1562      0      stevel 		    ihck->hcksum_version, HCKSUM_VERSION_1);
   1563      0      stevel 		return;
   1564      0      stevel 	}
   1565      0      stevel 
   1566      0      stevel 	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
   1567      0      stevel 		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
   1568      0      stevel 		    "checksum capability isn't as expected; pass-thru "
   1569      0      stevel 		    "module(s) detected, discarding capability\n"));
   1570      0      stevel 		return;
   1571      0      stevel 	}
   1572      0      stevel 
   1573    741    masputra #define	CURR_HCKSUM_CAPAB				\
   1574    741    masputra 	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
   1575    741    masputra 	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
   1576      0      stevel 
   1577      0      stevel 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
   1578      0      stevel 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
   1579      0      stevel 		/* do ENABLE processing */
   1580      0      stevel 		if (*ill_hcksum == NULL) {
   1581      0      stevel 			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
   1582      0      stevel 			    KM_NOSLEEP);
   1583      0      stevel 
   1584      0      stevel 			if (*ill_hcksum == NULL) {
   1585      0      stevel 				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1586      0      stevel 				    "could not enable hcksum version %d "
   1587      0      stevel 				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
   1588      0      stevel 				    ill->ill_name);
   1589      0      stevel 				return;
   1590      0      stevel 			}
   1591      0      stevel 		}
   1592      0      stevel 
   1593      0      stevel 		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
   1594      0      stevel 		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
   1595      0      stevel 		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
   1596      0      stevel 		ip1dbg(("ill_capability_hcksum_ack: interface %s "
   1597      0      stevel 		    "has enabled hardware checksumming\n ",
   1598      0      stevel 		    ill->ill_name));
   1599      0      stevel 	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
   1600      0      stevel 		/*
   1601      0      stevel 		 * Enabling hardware checksum offload
   1602      0      stevel 		 * Currently IP supports {TCP,UDP}/IPv4
   1603      0      stevel 		 * partial and full cksum offload and
   1604      0      stevel 		 * IPv4 header checksum offload.
   1605      0      stevel 		 * Allocate new mblk which will
   1606      0      stevel 		 * contain a new capability request
   1607      0      stevel 		 * to enable hardware checksum offload.
   1608      0      stevel 		 */
   1609      0      stevel 		uint_t	size;
   1610      0      stevel 		uchar_t	*rptr;
   1611      0      stevel 
   1612      0      stevel 		size = sizeof (dl_capability_req_t) +
   1613      0      stevel 		    sizeof (dl_capability_sub_t) + isub->dl_length;
   1614      0      stevel 
   1615      0      stevel 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1616      0      stevel 			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
   1617      0      stevel 			    "could not enable hardware cksum for %s (ENOMEM)\n",
   1618      0      stevel 			    ill->ill_name);
   1619      0      stevel 			return;
   1620      0      stevel 		}
   1621      0      stevel 
   1622      0      stevel 		rptr = nmp->b_rptr;
   1623      0      stevel 		/* initialize dl_capability_req_t */
   1624      0      stevel 		ocap = (dl_capability_req_t *)nmp->b_rptr;
   1625      0      stevel 		ocap->dl_sub_offset =
   1626      0      stevel 		    sizeof (dl_capability_req_t);
   1627      0      stevel 		ocap->dl_sub_length =
   1628      0      stevel 		    sizeof (dl_capability_sub_t) +
   1629      0      stevel 		    isub->dl_length;
   1630      0      stevel 		nmp->b_rptr += sizeof (dl_capability_req_t);
   1631      0      stevel 
   1632      0      stevel 		/* initialize dl_capability_sub_t */
   1633      0      stevel 		bcopy(isub, nmp->b_rptr, sizeof (*isub));
   1634      0      stevel 		nmp->b_rptr += sizeof (*isub);
   1635      0      stevel 
   1636      0      stevel 		/* initialize dl_capab_hcksum_t */
   1637      0      stevel 		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
   1638      0      stevel 		bcopy(ihck, ohck, sizeof (*ihck));
   1639      0      stevel 
   1640      0      stevel 		nmp->b_rptr = rptr;
   1641      0      stevel 		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
   1642      0      stevel 
   1643      0      stevel 		/* Set ENABLE flag */
   1644      0      stevel 		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
   1645      0      stevel 		ohck->hcksum_txflags |= HCKSUM_ENABLE;
   1646      0      stevel 
   1647      0      stevel 		/*
   1648      0      stevel 		 * nmp points to a DL_CAPABILITY_REQ message to enable
   1649      0      stevel 		 * hardware checksum acceleration.
   1650      0      stevel 		 */
   1651   8275        Eric 		ill_capability_send(ill, nmp);
   1652    741    masputra 	} else {
   1653      0      stevel 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
   1654      0      stevel 		    "advertised %x hardware checksum capability flags\n",
   1655      0      stevel 		    ill->ill_name, ihck->hcksum_txflags));
   1656    741    masputra 	}
   1657      0      stevel }
   1658      0      stevel 
   1659      0      stevel static void
   1660   8275        Eric ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
   1661   8275        Eric {
   1662      0      stevel 	dl_capab_hcksum_t *hck_subcap;
   1663      0      stevel 	dl_capability_sub_t *dl_subcap;
   1664      0      stevel 
   1665    741    masputra 	if (!ILL_HCKSUM_CAPABLE(ill))
   1666      0      stevel 		return;
   1667      0      stevel 
   1668      0      stevel 	ASSERT(ill->ill_hcksum_capab != NULL);
   1669   8275        Eric 
   1670   8275        Eric 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1671      0      stevel 	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
   1672      0      stevel 	dl_subcap->dl_length = sizeof (*hck_subcap);
   1673      0      stevel 
   1674      0      stevel 	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
   1675      0      stevel 	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
   1676      0      stevel 	hck_subcap->hcksum_txflags = 0;
   1677      0      stevel 
   1678   8275        Eric 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
   1679      0      stevel }
   1680      0      stevel 
   1681      0      stevel static void
   1682      0      stevel ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1683      0      stevel {
   1684      0      stevel 	mblk_t *nmp = NULL;
   1685      0      stevel 	dl_capability_req_t *oc;
   1686      0      stevel 	dl_capab_zerocopy_t *zc_ic, *zc_oc;
   1687      0      stevel 	ill_zerocopy_capab_t **ill_zerocopy_capab;
   1688      0      stevel 	uint_t sub_dl_cap = isub->dl_cap;
   1689      0      stevel 	uint8_t *capend;
   1690      0      stevel 
   1691      0      stevel 	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
   1692      0      stevel 
   1693      0      stevel 	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
   1694      0      stevel 
   1695      0      stevel 	/*
   1696      0      stevel 	 * Note: range checks here are not absolutely sufficient to
   1697      0      stevel 	 * make us robust against malformed messages sent by drivers;
   1698      0      stevel 	 * this is in keeping with the rest of IP's dlpi handling.
   1699      0      stevel 	 * (Remember, it's coming from something else in the kernel
   1700      0      stevel 	 * address space)
   1701      0      stevel 	 */
   1702      0      stevel 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1703      0      stevel 	if (capend > mp->b_wptr) {
   1704      0      stevel 		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1705      0      stevel 		    "malformed sub-capability too long for mblk");
   1706      0      stevel 		return;
   1707      0      stevel 	}
   1708      0      stevel 
   1709      0      stevel 	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
   1710      0      stevel 	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
   1711      0      stevel 		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
   1712      0      stevel 		    "unsupported ZEROCOPY sub-capability (version %d, "
   1713      0      stevel 		    "expected %d)", zc_ic->zerocopy_version,
   1714      0      stevel 		    ZEROCOPY_VERSION_1);
   1715      0      stevel 		return;
   1716      0      stevel 	}
   1717      0      stevel 
   1718      0      stevel 	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
   1719      0      stevel 		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
   1720      0      stevel 		    "capability isn't as expected; pass-thru module(s) "
   1721      0      stevel 		    "detected, discarding capability\n"));
   1722      0      stevel 		return;
   1723      0      stevel 	}
   1724      0      stevel 
   1725      0      stevel 	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
   1726      0      stevel 		if (*ill_zerocopy_capab == NULL) {
   1727      0      stevel 			*ill_zerocopy_capab =
   1728      0      stevel 			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
   1729      0      stevel 			    KM_NOSLEEP);
   1730      0      stevel 
   1731      0      stevel 			if (*ill_zerocopy_capab == NULL) {
   1732      0      stevel 				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1733      0      stevel 				    "could not enable Zero-copy version %d "
   1734      0      stevel 				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
   1735      0      stevel 				    ill->ill_name);
   1736      0      stevel 				return;
   1737      0      stevel 			}
   1738      0      stevel 		}
   1739      0      stevel 
   1740      0      stevel 		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
   1741      0      stevel 		    "supports Zero-copy version %d\n", ill->ill_name,
   1742      0      stevel 		    ZEROCOPY_VERSION_1));
   1743      0      stevel 
   1744      0      stevel 		(*ill_zerocopy_capab)->ill_zerocopy_version =
   1745      0      stevel 		    zc_ic->zerocopy_version;
   1746      0      stevel 		(*ill_zerocopy_capab)->ill_zerocopy_flags =
   1747      0      stevel 		    zc_ic->zerocopy_flags;
   1748      0      stevel 
   1749      0      stevel 		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
   1750      0      stevel 	} else {
   1751      0      stevel 		uint_t size;
   1752      0      stevel 		uchar_t *rptr;
   1753      0      stevel 
   1754      0      stevel 		size = sizeof (dl_capability_req_t) +
   1755      0      stevel 		    sizeof (dl_capability_sub_t) +
   1756      0      stevel 		    sizeof (dl_capab_zerocopy_t);
   1757      0      stevel 
   1758      0      stevel 		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
   1759      0      stevel 			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
   1760      0      stevel 			    "could not enable zerocopy for %s (ENOMEM)\n",
   1761      0      stevel 			    ill->ill_name);
   1762      0      stevel 			return;
   1763      0      stevel 		}
   1764      0      stevel 
   1765      0      stevel 		rptr = nmp->b_rptr;
   1766      0      stevel 		/* initialize dl_capability_req_t */
   1767      0      stevel 		oc = (dl_capability_req_t *)rptr;
   1768      0      stevel 		oc->dl_sub_offset = sizeof (dl_capability_req_t);
   1769      0      stevel 		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
   1770      0      stevel 		    sizeof (dl_capab_zerocopy_t);
   1771      0      stevel 		rptr += sizeof (dl_capability_req_t);
   1772      0      stevel 
   1773      0      stevel 		/* initialize dl_capability_sub_t */
   1774      0      stevel 		bcopy(isub, rptr, sizeof (*isub));
   1775      0      stevel 		rptr += sizeof (*isub);
   1776      0      stevel 
   1777      0      stevel 		/* initialize dl_capab_zerocopy_t */
   1778      0      stevel 		zc_oc = (dl_capab_zerocopy_t *)rptr;
   1779      0      stevel 		*zc_oc = *zc_ic;
   1780      0      stevel 
   1781      0      stevel 		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
   1782      0      stevel 		    "to enable zero-copy version %d\n", ill->ill_name,
   1783      0      stevel 		    ZEROCOPY_VERSION_1));
   1784      0      stevel 
   1785      0      stevel 		/* set VMSAFE_MEM flag */
   1786      0      stevel 		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
   1787      0      stevel 
   1788      0      stevel 		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
   1789   8275        Eric 		ill_capability_send(ill, nmp);
   1790   8275        Eric 	}
   1791   8275        Eric }
   1792   8275        Eric 
   1793   8275        Eric static void
   1794   8275        Eric ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
   1795   8275        Eric {
   1796      0      stevel 	dl_capab_zerocopy_t *zerocopy_subcap;
   1797      0      stevel 	dl_capability_sub_t *dl_subcap;
   1798      0      stevel 
   1799      0      stevel 	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
   1800      0      stevel 		return;
   1801      0      stevel 
   1802      0      stevel 	ASSERT(ill->ill_zerocopy_capab != NULL);
   1803   8275        Eric 
   1804   8275        Eric 	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
   1805      0      stevel 	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
   1806      0      stevel 	dl_subcap->dl_length = sizeof (*zerocopy_subcap);
   1807      0      stevel 
   1808      0      stevel 	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
   1809      0      stevel 	zerocopy_subcap->zerocopy_version =
   1810      0      stevel 	    ill->ill_zerocopy_capab->ill_zerocopy_version;
   1811      0      stevel 	zerocopy_subcap->zerocopy_flags = 0;
   1812   3115    yl150051 
   1813   8275        Eric 	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
   1814   8275        Eric }
   1815   8275        Eric 
   1816   8275        Eric /*
   1817   8275        Eric  * DLD capability
   1818   8275        Eric  * Refer to dld.h for more information regarding the purpose and usage
   1819   8275        Eric  * of this capability.
   1820   8275        Eric  */
   1821   8275        Eric static void
   1822   8275        Eric ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
   1823   8275        Eric {
   1824   8275        Eric 	dl_capab_dld_t		*dld_ic, dld;
   1825   8275        Eric 	uint_t			sub_dl_cap = isub->dl_cap;
   1826   8275        Eric 	uint8_t			*capend;
   1827   8275        Eric 	ill_dld_capab_t		*idc;
   1828   8275        Eric 
   1829   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   1830   8275        Eric 	ASSERT(sub_dl_cap == DL_CAPAB_DLD);
   1831   3115    yl150051 
   1832   3115    yl150051 	/*
   1833   3115    yl150051 	 * Note: range checks here are not absolutely sufficient to
   1834   3115    yl150051 	 * make us robust against malformed messages sent by drivers;
   1835   3115    yl150051 	 * this is in keeping with the rest of IP's dlpi handling.
   1836   3115    yl150051 	 * (Remember, it's coming from something else in the kernel
   1837   3115    yl150051 	 * address space)
   1838   3115    yl150051 	 */
   1839   3115    yl150051 	capend = (uint8_t *)(isub + 1) + isub->dl_length;
   1840   3115    yl150051 	if (capend > mp->b_wptr) {
   1841   8275        Eric 		cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1842   3115    yl150051 		    "malformed sub-capability too long for mblk");
   1843   3115    yl150051 		return;
   1844   3115    yl150051 	}
   1845   8275        Eric 	dld_ic = (dl_capab_dld_t *)(isub + 1);
   1846   8275        Eric 	if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
   1847   8275        Eric 		cmn_err(CE_CONT, "ill_capability_dld_ack: "
   1848   8275        Eric 		    "unsupported DLD sub-capability (version %d, "
   1849   8275        Eric 		    "expected %d)", dld_ic->dld_version,
   1850   8275        Eric 		    DLD_CURRENT_VERSION);
   1851   8275        Eric 		return;
   1852   8275        Eric 	}
   1853   8275        Eric 	if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
   1854   8275        Eric 		ip1dbg(("ill_capability_dld_ack: mid token for dld "
   1855   3115    yl150051 		    "capability isn't as expected; pass-thru module(s) "
   1856   3115    yl150051 		    "detected, discarding capability\n"));
   1857   3115    yl150051 		return;
   1858   3115    yl150051 	}
   1859   3115    yl150051 
   1860   8275        Eric 	/*
   1861   8275        Eric 	 * Copy locally to ensure alignment.
   1862   8275        Eric 	 */
   1863   8275        Eric 	bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
   1864   8275        Eric 
   1865   8275        Eric 	if ((idc = ill->ill_dld_capab) == NULL) {
   1866   8275        Eric 		idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
   1867   8275        Eric 		if (idc == NULL) {
   1868   8275        Eric 			cmn_err(CE_WARN, "ill_capability_dld_ack: "
   1869   8275        Eric 			    "could not enable DLD version %d "
   1870   8275        Eric 			    "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
   1871   8275        Eric 			    ill->ill_name);
   1872   8275        Eric 			return;
   1873   8275        Eric 		}
   1874   8275        Eric 		ill->ill_dld_capab = idc;
   1875   8275        Eric 	}
   1876   9073       Cathy 	idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
   1877   9073       Cathy 	idc->idc_capab_dh = (void *)dld.dld_capab_handle;
   1878   8275        Eric 	ip1dbg(("ill_capability_dld_ack: interface %s "
   1879   8275        Eric 	    "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
   1880   8275        Eric 
   1881   8275        Eric 	ill_capability_dld_enable(ill);
   1882   8275        Eric }
   1883   8275        Eric 
   1884   8275        Eric /*
   1885   8275        Eric  * Typically capability negotiation between IP and the driver happens via
   1886   8275        Eric  * DLPI message exchange. However GLD also offers a direct function call
   1887   8275        Eric  * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
   1888   8275        Eric  * But arbitrary function calls into IP or GLD are not permitted, since both
   1889   8275        Eric  * of them are protected by their own perimeter mechanism. The perimeter can
   1890   8275        Eric  * be viewed as a coarse lock or serialization mechanism. The hierarchy of
   1891   8275        Eric  * these perimeters is IP -> MAC. Thus for example to enable the squeue
   1892   8275        Eric  * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
   1893   8275        Eric  * to enter the mac perimeter and then do the direct function calls into
   1894   8275        Eric  * GLD to enable squeue polling. The ring related callbacks from the mac into
   1895   8275        Eric  * the stack to add, bind, quiesce, restart or cleanup a ring are all
   1896   8275        Eric  * protected by the mac perimeter.
   1897   8275        Eric  */
   1898   8275        Eric static void
   1899   8275        Eric ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
   1900   8275        Eric {
   1901   8275        Eric 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1902   8275        Eric 	int			err;
   1903   8275        Eric 
   1904   8275        Eric 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
   1905   8275        Eric 	    DLD_ENABLE);
   1906   8275        Eric 	ASSERT(err == 0);
   1907   8275        Eric }
   1908   8275        Eric 
   1909   8275        Eric static void
   1910   8275        Eric ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
   1911   8275        Eric {
   1912   8275        Eric 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1913   8275        Eric 	int			err;
   1914   8275        Eric 
   1915   8275        Eric 	err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
   1916   8275        Eric 	    DLD_DISABLE);
   1917   8275        Eric 	ASSERT(err == 0);
   1918   8275        Eric }
   1919   8275        Eric 
   1920   8275        Eric boolean_t
   1921   8275        Eric ill_mac_perim_held(ill_t *ill)
   1922   8275        Eric {
   1923   8275        Eric 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1924   8275        Eric 
   1925   8275        Eric 	return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
   1926   8275        Eric 	    DLD_QUERY));
   1927   8275        Eric }
   1928   8275        Eric 
   1929   8275        Eric static void
   1930   8275        Eric ill_capability_direct_enable(ill_t *ill)
   1931   8275        Eric {
   1932   8275        Eric 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1933   8275        Eric 	ill_dld_direct_t	*idd = &idc->idc_direct;
   1934   8275        Eric 	dld_capab_direct_t	direct;
   1935   8275        Eric 	int			rc;
   1936   8275        Eric 
   1937   8275        Eric 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   1938   8275        Eric 
   1939   8275        Eric 	bzero(&direct, sizeof (direct));
   1940   8275        Eric 	direct.di_rx_cf = (uintptr_t)ip_input;
   1941   8275        Eric 	direct.di_rx_ch = ill;
   1942   8275        Eric 
   1943   8275        Eric 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
   1944   8275        Eric 	    DLD_ENABLE);
   1945   8275        Eric 	if (rc == 0) {
   1946   8275        Eric 		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
   1947   8275        Eric 		idd->idd_tx_dh = direct.di_tx_dh;
   1948   8275        Eric 		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
   1949   8275        Eric 		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
   1950   8833        Venu 		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
   1951   8833        Venu 		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
   1952   9738       Cathy 		ASSERT(idd->idd_tx_cb_df != NULL);
   1953   9738       Cathy 		ASSERT(idd->idd_tx_fctl_df != NULL);
   1954   9738       Cathy 		ASSERT(idd->idd_tx_df != NULL);
   1955   8275        Eric 		/*
   1956   8275        Eric 		 * One time registration of flow enable callback function
   1957   8275        Eric 		 */
   1958   8275        Eric 		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
   1959   8275        Eric 		    ill_flow_enable, ill);
   1960   8275        Eric 		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
   1961   8275        Eric 		DTRACE_PROBE1(direct_on, (ill_t *), ill);
   1962   8275        Eric 	} else {
   1963   8275        Eric 		cmn_err(CE_WARN, "warning: could not enable DIRECT "
   1964   8275        Eric 		    "capability, rc = %d\n", rc);
   1965   8275        Eric 		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
   1966   8275        Eric 	}
   1967   8275        Eric }
   1968   8275        Eric 
   1969   8275        Eric static void
   1970   8275        Eric ill_capability_poll_enable(ill_t *ill)
   1971   8275        Eric {
   1972   8275        Eric 	ill_dld_capab_t		*idc = ill->ill_dld_capab;
   1973   8275        Eric 	dld_capab_poll_t	poll;
   1974   8275        Eric 	int			rc;
   1975   8275        Eric 
   1976   8275        Eric 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   1977   8275        Eric 
   1978   8275        Eric 	bzero(&poll, sizeof (poll));
   1979   8275        Eric 	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
   1980   8275        Eric 	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
   1981   8275        Eric 	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
   1982   8275        Eric 	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
   1983   8275        Eric 	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
   1984   8275        Eric 	poll.poll_ring_ch = ill;
   1985   8275        Eric 	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
   1986   8275        Eric 	    DLD_ENABLE);
   1987   8275        Eric 	if (rc == 0) {
   1988   8275        Eric 		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
   1989   8275        Eric 		DTRACE_PROBE1(poll_on, (ill_t *), ill);
   1990   8275        Eric 	} else {
   1991   8275        Eric 		ip1dbg(("warning: could not enable POLL "
   1992   8275        Eric 		    "capability, rc = %d\n", rc));
   1993   8275        Eric 		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
   1994   8275        Eric 	}
   1995   8275        Eric }
   1996   8275        Eric 
   1997   8275        Eric /*
   1998   8275        Eric  * Enable the LSO capability.
   1999   8275        Eric  */
   2000   8275        Eric static void
   2001   8275        Eric ill_capability_lso_enable(ill_t *ill)
   2002   8275        Eric {
   2003   8275        Eric 	ill_dld_capab_t	*idc = ill->ill_dld_capab;
   2004   8275        Eric 	dld_capab_lso_t	lso;
   2005   8275        Eric 	int rc;
   2006   8275        Eric 
   2007   8275        Eric 	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
   2008   8275        Eric 
   2009   8275        Eric 	if (ill->ill_lso_capab == NULL) {
   2010   8275        Eric 		ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
   2011   8275        Eric 		    KM_NOSLEEP);
   2012   8275        Eric 		if (ill->ill_lso_capab == NULL) {
   2013   8275        Eric 			cmn_err(CE_WARN, "ill_capability_lso_enable: "
   2014   3115    yl150051 			    "could not enable LSO for %s (ENOMEM)\n",
   2015   3115    yl150051 			    ill->ill_name);
   2016   3115    yl150051 			return;
   2017   3115    yl150051 		}
   2018   8275        Eric 	}
   2019   8275        Eric 
   2020   8275        Eric 	bzero(&lso, sizeof (lso));
   2021   8275        Eric 	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
   2022   8275        Eric 	    DLD_ENABLE)) == 0) {
   2023   8275        Eric 		ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
   2024   8275        Eric 		ill->ill_lso_capab->ill_lso_max = lso.lso_max;
   2025  11042        Erik 		ill->ill_capabilities |= ILL_CAPAB_LSO;
   2026   8275        Eric 		ip1dbg(("ill_capability_lso_enable: interface %s "
   2027   8275        Eric 		    "has enabled LSO\n ", ill->ill_name));
   2028   8275        Eric 	} else {
   2029   8275        Eric 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
   2030   8275        Eric 		ill->ill_lso_capab = NULL;
   2031   8275        Eric 		DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
   2032   8275        Eric 	}
   2033   8275        Eric }
   2034   8275        Eric 
   2035   8275        Eric static void
   2036   8275        Eric ill_capability_dld_enable(ill_t *ill)
   2037   8275        Eric {
   2038   8275        Eric 	mac_perim_handle_t mph;
   2039   8275        Eric 
   2040   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   2041   8275        Eric 
   2042   8275        Eric 	if (ill->ill_isv6)
   2043   8275        Eric 		return;
   2044   8275        Eric 
   2045   8275        Eric 	ill_mac_perim_enter(ill, &mph);
   2046   8275        Eric 	if (!ill->ill_isv6) {
   2047   8275        Eric 		ill_capability_direct_enable(ill);
   2048   8275        Eric 		ill_capability_poll_enable(ill);
   2049   8275        Eric 		ill_capability_lso_enable(ill);
   2050   8275        Eric 	}
   2051   8275        Eric 	ill->ill_capabilities |= ILL_CAPAB_DLD;
   2052   8275        Eric 	ill_mac_perim_exit(ill, mph);
   2053   8275        Eric }
   2054   8275        Eric 
   2055   8275        Eric static void
   2056   8275        Eric ill_capability_dld_disable(ill_t *ill)
   2057   8275        Eric {
   2058   8275        Eric 	ill_dld_capab_t	*idc;
   2059   8275        Eric 	ill_dld_direct_t *idd;
   2060   8275        Eric 	mac_perim_handle_t	mph;
   2061   8275        Eric 
   2062   8275        Eric 	ASSERT(IAM_WRITER_ILL(ill));
   2063   8275        Eric 
   2064   8275        Eric 	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
   2065   8275        Eric 		return;
   2066   8275        Eric 
   2067   8275        Eric 	ill_mac_perim_enter(ill, &mph);
   2068   8275        Eric 
   2069   8275        Eric 	idc = ill->ill_dld_capab;
   2070   8275        Eric 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
   2071   8275        Eric 		/*
   2072   8275        Eric 		 * For performance we avoid locks in the transmit data path
   2073   8275        Eric 		 * and don't maintain a count of the number of threads using
   2074   8275        Eric 		 * direct calls. Thus some threads could be using direct
   2075   8275        Eric 		 * transmit calls to GLD, even after the capability mechanism
   2076   8275        Eric 		 * turns it off. This is still safe since the handles used in
   2077   8275        Eric 		 * the direct calls continue to be valid until the unplumb is
   2078   8275        Eric 		 * completed. Remove the callback that was added (1-time) at
   2079   8275        Eric 		 * capab enable time.
   2080   8275        Eric 		 */
   2081   8275        Eric 		mutex_enter(&ill->ill_lock);
   2082   8275        Eric 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
   2083   8275        Eric 		mutex_exit(&ill->ill_lock);
   2084   8275        Eric 		if (ill->ill_flownotify_mh != NULL) {
   2085   8275        Eric 			idd = &idc->idc_direct;
   2086   8275        Eric 			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
   2087   8275        Eric 			    ill->ill_flownotify_mh);
   2088   8275        Eric 			ill->ill_flownotify_mh = NULL;
   2089   8275        Eric 		}
   2090   8275        Eric 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
   2091   8275        Eric 		    NULL, DLD_DISABLE);
   2092   8275        Eric 	}
   2093   8275        Eric 
   2094   8275        Eric 	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
   2095   8275        Eric 		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
   2096   8275        Eric 		ip_squeue_clean_all(ill);
   2097   8275        Eric 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
   2098   8275        Eric 		    NULL, DLD_DISABLE);
   2099   8275        Eric 	}
   2100   8275        Eric 
   2101  11042        Erik 	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
   2102   8275        Eric 		ASSERT(ill->ill_lso_capab != NULL);
   2103   8275        Eric 		/*
   2104   8275        Eric 		 * Clear the capability flag for LSO but retain the
   2105   8275        Eric 		 * ill_lso_capab structure since it's possible that another
   2106   8275        Eric 		 * thread is still referring to it.  The structure only gets
   2107   8275        Eric 		 * deallocated when we destroy the ill.
   2108   8275        Eric 		 */
   2109   8275        Eric 
   2110  11042        Erik 		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
   2111   8275        Eric 		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
   2112   8275        Eric 		    NULL, DLD_DISABLE);
   2113   8275        Eric 	}
   2114   8275        Eric 
   2115   8275        Eric 	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
   2116   8275        Eric 	ill_mac_perim_exit(ill, mph);
   2117   8275        Eric }
   2118   8275        Eric 
   2119   8275        Eric /*
   2120   8275        Eric  * Capability Negotiation protocol
   2121   8275        Eric  *
   2122   8275        Eric  * We don't wait for DLPI capability operations to finish during interface
   2123   8275        Eric  * bringup or teardown. Doing so would introduce more asynchrony and the
   2124   8275        Eric  * interface up/down operations will need multiple return and restarts.
   2125   8275        Eric  * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
   2126   8275        Eric  * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
   2127   8275        Eric  * exclusive operation won't start until the DLPI operations of the previous
   2128   8275        Eric  * exclusive operation complete.
   2129   8275        Eric  *
   2130   8275        Eric  * The capability state machine is shown below.
   2131   8275        Eric  *
   2132   8275        Eric  * state		next state		event, action
   2133   8275        Eric  *
   2134   8275        Eric  * IDCS_UNKNOWN 	IDCS_PROBE_SENT		ill_capability_probe
   2135   8275        Eric  * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
   2136   8275        Eric  * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
   2137   8275        Eric  * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
   2138   8275        Eric  * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
   2139   8275        Eric  * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
   2140   8275        Eric  * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
   2141   8275        Eric  *						    ill_capability_probe.
   2142   8275        Eric  */
   2143   8275        Eric 
   2144   8275        Eric /*
   2145   8275        Eric  * Dedicated thread started from ip_stack_init that handles capability
   2146   8275        Eric  * disable. This thread ensures the taskq dispatch does not fail by waiting
   2147   8275        Eric  * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
   2148   8275        Eric  * that direct calls to DLD are done in a cv_waitable context.
   2149   8275        Eric  */
   2150   8275        Eric void
   2151   8275        Eric ill_taskq_dispatch(ip_stack_t *ipst)
   2152   8275        Eric {
   2153   8275        Eric 	callb_cpr_t cprinfo;
   2154   8275        Eric 	char 	name[64];
   2155   8275        Eric 	mblk_t	*mp;
   2156   8275        Eric 
   2157   8275        Eric 	(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
   2158   8275        Eric 	    ipst->ips_netstack->netstack_stackid);
   2159   8275        Eric 	CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
   2160   8275        Eric 	    name);
   2161   8275        Eric 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2162   8275        Eric 
   2163   8275        Eric 	for (;;) {
   2164   9979  Thirumalai 		mp = ipst->ips_capab_taskq_head;
   2165   8275        Eric 		while (mp != NULL) {
   2166   9979  Thirumalai 			ipst->ips_capab_taskq_head = mp->b_next;
   2167   9979  Thirumalai 			if (ipst->ips_capab_taskq_head == NULL)
   2168   9979  Thirumalai 				ipst->ips_capab_taskq_tail = NULL;
   2169   8275        Eric 			mutex_exit(&ipst->ips_capab_taskq_lock);
   2170   9979  Thirumalai 			mp->b_next = NULL;
   2171   9979  Thirumalai 
   2172   8275        Eric 			VERIFY(taskq_dispatch(system_taskq,
   2173   8275        Eric 			    ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
   2174   8275        Eric 			mutex_enter(&ipst->ips_capab_taskq_lock);
   2175   9979  Thirumalai 			mp = ipst->ips_capab_taskq_head;
   2176   8275        Eric 		}
   2177   8275        Eric 
   2178   8275        Eric 		if (ipst->ips_capab_taskq_quit)
   2179   8275        Eric 			break;
   2180   8275        Eric 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2181   8275        Eric 		cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
   2182   8275        Eric 		CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
   2183   8275        Eric 	}
   2184   9979  Thirumalai 	VERIFY(ipst->ips_capab_taskq_head == NULL);
   2185   9979  Thirumalai 	VERIFY(ipst->ips_capab_taskq_tail == NULL);
   2186   8275        Eric 	CALLB_CPR_EXIT(&cprinfo);
   2187   8275        Eric 	thread_exit();
   2188      0      stevel }
   2189      0      stevel 
   2190      0      stevel /*
   2191      0      stevel  * Consume a new-style hardware capabilities negotiation ack.
   2192  11076       Cathy  * Called via taskq on receipt of DL_CAPABILITY_ACK.
   2193   8275        Eric  */
   2194   8275        Eric static void
   2195   8275        Eric ill_capability_ack_thr(void *arg)
   2196   8275        Eric {
   2197   8275        Eric 	mblk_t	*mp = arg;
   2198      0      stevel 	dl_capability_ack_t *capp;
   2199      0      stevel 	dl_capability_sub_t *subp, *endp;
   2200   8275        Eric 	ill_t	*ill;
   2201   8275        Eric 	boolean_t reneg;
   2202   8275        Eric 
   2203   8275        Eric 	ill = (ill_t *)mp->b_prev;
   2204   9979  Thirumalai 	mp->b_prev = NULL;
   2205   9979  Thirumalai 
   2206   8275        Eric 	VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
   2207   8275        Eric 
   2208   8275        Eric 	if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
   2209   8275        Eric 	    ill->ill_dlpi_capab_state == IDCS_RENEG) {
   2210   8275        Eric 		/*
   2211   8275        Eric 		 * We have received the ack for our DL_CAPAB reset request.
   2212   8275        Eric 		 * There isnt' anything in the message that needs processing.
   2213   8275        Eric 		 * All message based capabilities have been disabled, now
   2214   8275        Eric 		 * do the function call based capability disable.
   2215   8275        Eric 		 */
   2216   8275        Eric 		reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
   2217   8275        Eric 		ill_capability_dld_disable(ill);
   2218   8275        Eric 		ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
   2219   8275        Eric 		if (reneg)
   2220   8275        Eric 			ill_capability_probe(ill);
   2221   8275        Eric 		goto done;
   2222   8275        Eric 	}
   2223   8275        Eric 
   2224   8275        Eric 	if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
   2225   8275        Eric 		ill->ill_dlpi_capab_state = IDCS_OK;
   2226      0      stevel 
   2227      0      stevel 	capp = (dl_capability_ack_t *)mp->b_rptr;
   2228      0      stevel 
   2229   8275        Eric 	if (capp->dl_sub_length == 0) {
   2230      0      stevel 		/* no new-style capabilities */
   2231   8275        Eric 		goto done;
   2232   8275        Eric 	}
   2233      0      stevel 
   2234      0      stevel 	/* make sure the driver supplied correct dl_sub_length */
   2235      0      stevel 	if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
   2236      0      stevel 		ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
   2237      0      stevel 		    "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
   2238   8275        Eric 		goto done;
   2239   8275        Eric 	}
   2240   8275        Eric 
   2241      0      stevel #define	SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
   2242      0      stevel 	/*
   2243      0      stevel 	 * There are sub-capabilities. Process the ones we know about.
   2244      0      stevel 	 * Loop until we don't have room for another sub-cap header..
   2245      0      stevel 	 */
   2246      0      stevel 	for (subp = SC(capp, capp->dl_sub_offset),
   2247      0      stevel 	    endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
   2248      0      stevel 	    subp <= endp;
   2249      0      stevel 	    subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
   2250      0      stevel 
   2251      0      stevel 		switch (subp->dl_cap) {
   2252      0      stevel 		case DL_CAPAB_ID_WRAPPER:
   2253      0      stevel 			ill_capability_id_ack(ill, mp, subp);
   2254      0      stevel 			break;
   2255      0      stevel 		default:
   2256  11042        Erik 			ill_capability_dispatch(ill, mp, subp);
   2257      0      stevel 			break;
   2258      0      stevel 		}
   2259      0      stevel 	}
   2260      0      stevel #undef SC
   2261   8275        Eric done:
   2262   8275        Eric 	inet_freemsg(mp);
   2263   8275        Eric 	ill_capability_done(ill);
   2264   8275        Eric 	ipsq_exit(ill->ill_phyint->phyint_ipsq);
   2265   8275        Eric }
   2266   8275        Eric 
   2267   8275        Eric /*
   2268   8275        Eric  * This needs to be started in a taskq thread to provide a cv_waitable
   2269   8275        Eric  * context.
   2270   8275        Eric  */
   2271   8275        Eric void
   2272   8275        Eric ill_capability_ack(ill_t *ill, mblk_t *mp)
   2273   8275        Eric {
   2274   8275        Eric 	ip_stack_t	*ipst = ill->ill_ipst;
   2275   8275        Eric 
   2276   8275        Eric 	mp->b_prev = (mblk_t *)ill;
   2277   9979  Thirumalai 	ASSERT(mp->b_next == NULL);
   2278   9979  Thirumalai 
   2279   8275        Eric 	if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
   2280   8275        Eric 	    TQ_NOSLEEP) != 0)
   2281   8275        Eric 		return;
   2282   8275        Eric 
   2283   8275        Eric 	/*
   2284   8275        Eric 	 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
   2285   8275        Eric 	 * which will do the dispatch using TQ_SLEEP to guarantee success.
   2286   8275        Eric 	 */
   2287   8275        Eric 	mutex_enter(&ipst->ips_capab_taskq_lock);
   2288   9979  Thirumalai 	if (ipst->ips_capab_taskq_head == NULL) {
   2289   9979  Thirumalai 		ASSERT(ipst->ips_capab_taskq_tail == NULL);
   2290   9979  Thirumalai 		ipst->ips_capab_taskq_head = mp;
   2291   9979  Thirumalai 	} else {
   2292   9979  Thirumalai 		ipst->ips_capab_taskq_tail->b_next = mp;
   2293   9979  Thirumalai 	}
   2294   9979  Thirumalai 	ipst->ips_capab_taskq_tail = mp;
   2295   9979  Thirumalai 
   2296   8275        Eric 	cv_signal(&ipst->ips_capab_taskq_cv);
   2297   8275        Eric 	mutex_exit(&ipst->ips_capab_taskq_lock);
   2298      0      stevel }
   2299      0      stevel 
   2300      0      stevel /*
   2301      0      stevel  * This routine is called to scan the fragmentation reassembly table for
   2302      0      stevel  * the specified ILL for any packets that are starting to smell.
   2303      0      stevel  * dead_interval is the maximum time in seconds that will be tolerated.  It
   2304      0      stevel  * will either be the value specified in ip_g_frag_timeout, or zero if the
   2305      0      stevel  * ILL is shutting down and it is time to blow everything off.
   2306      0      stevel  *
   2307      0      stevel  * It returns the number of seconds (as a time_t) that the next frag timer
   2308      0      stevel  * should be scheduled for, 0 meaning that the timer doesn't need to be
   2309      0      stevel  * re-started.  Note that the method of calculating next_timeout isn't
   2310      0      stevel  * entirely accurate since time will flow between the time we grab
   2311      0      stevel  * current_time and the time we schedule the next timeout.  This isn't a
   2312      0      stevel  * big problem since this is the timer for sending an ICMP reassembly time
   2313      0      stevel  * exceeded messages, and it doesn't have to be exactly accurate.
   2314      0      stevel  *
   2315      0      stevel  * This function is
   2316      0      stevel  * sometimes called as writer, although this is not required.
   2317      0      stevel  */
   2318      0      stevel time_t
   2319      0      stevel ill_frag_timeout(ill_t *ill, time_t dead_interval)
   2320      0      stevel {
   2321      0      stevel 	ipfb_t	*ipfb;
   2322      0      stevel 	ipfb_t	*endp;
   2323      0      stevel 	ipf_t	*ipf;
   2324      0      stevel 	ipf_t	*ipfnext;
   2325      0      stevel 	mblk_t	*mp;
   2326      0      stevel 	time_t	current_time = gethrestime_sec();
   2327      0      stevel 	time_t	next_timeout = 0;
   2328      0      stevel 	uint32_t	hdr_length;
   2329      0      stevel 	mblk_t	*send_icmp_head;
   2330      0      stevel 	mblk_t	*send_icmp_head_v6;
   2331   3448    dh155122 	ip_stack_t *ipst = ill->ill_ipst;
   2332  11042        Erik 	ip_recv_attr_t iras;
   2333  11042        Erik 
   2334  11042        Erik 	bzero(&iras, sizeof (iras));
   2335  11042        Erik 	iras.ira_flags = 0;
   2336  11042        Erik 	iras.ira_ill = iras.ira_rill = ill;
   2337  11042        Erik 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   2338  11042        Erik 	iras.ira_rifindex = iras.ira_ruifindex;
   2339      0      stevel 
   2340      0      stevel 	ipfb = ill->ill_frag_hash_tbl;
   2341      0      stevel 	if (ipfb == NULL)
   2342      0      stevel 		return (B_FALSE);
   2343      0      stevel 	endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
   2344      0      stevel 	/* Walk the frag hash table. */
   2345      0      stevel 	for (; ipfb < endp; ipfb++) {
   2346      0      stevel 		send_icmp_head = NULL;
   2347      0      stevel 		send_icmp_head_v6 = NULL;
   2348      0      stevel 		mutex_enter(&ipfb->ipfb_lock);
   2349      0      stevel 		while ((ipf = ipfb->ipfb_ipf) != 0) {
   2350      0      stevel 			time_t frag_time = current_time - ipf->ipf_timestamp;
   2351      0      stevel 			time_t frag_timeout;
   2352      0      stevel 
   2353      0      stevel 			if (frag_time < dead_interval) {
   2354      0      stevel 				/*
   2355      0      stevel 				 * There are some outstanding fragments
   2356      0      stevel 				 * that will timeout later.  Make note of
   2357      0      stevel 				 * the time so that we can reschedule the
   2358      0      stevel 				 * next timeout appropriately.
   2359      0      stevel 				 */
   2360      0      stevel 				frag_timeout = dead_interval - frag_time;
   2361      0      stevel 				if (next_timeout == 0 ||
   2362      0      stevel 				    frag_timeout < next_timeout) {
   2363      0      stevel 					next_timeout = frag_timeout;
   2364      0      stevel 				}
   2365      0      stevel 				break;
   2366      0      stevel 			}
   2367      0      stevel 			/* Time's up.  Get it out of here. */
   2368      0      stevel 			hdr_length = ipf->ipf_nf_hdr_len;
   2369      0      stevel 			ipfnext = ipf->ipf_hash_next;
   2370      0      stevel 			if (ipfnext)
   2371      0      stevel 				ipfnext->ipf_ptphn = ipf->ipf_ptphn;
   2372      0      stevel 			*ipf->ipf_ptphn = ipfnext;
   2373      0      stevel 			mp = ipf->ipf_mp->b_cont;
   2374      0      stevel 			for (; mp; mp = mp->b_cont) {
   2375      0      stevel 				/* Extra points for neatness. */
   2376      0      stevel 				IP_REASS_SET_START(mp, 0);
   2377      0      stevel 				IP_REASS_SET_END(mp, 0);
   2378      0      stevel 			}
   2379      0      stevel 			mp = ipf->ipf_mp->b_cont;
   2380   6759     georges 			atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
   2381      0      stevel 			ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
   2382      0      stevel 			ipfb->ipfb_count -= ipf->ipf_count;
   2383      0      stevel 			ASSERT(ipfb->ipfb_frag_pkts > 0);
   2384      0      stevel 			ipfb->ipfb_frag_pkts--;
   2385      0      stevel 			/*
   2386      0      stevel 			 * We do not send any icmp message from here because
   2387      0      stevel 			 * we currently are holding the ipfb_lock for this
   2388      0      stevel 			 * hash chain. If we try and send any icmp messages
   2389      0      stevel 			 * from here we may end up via a put back into ip
   2390      0      stevel 			 * trying to get the same lock, causing a recursive
   2391      0      stevel 			 * mutex panic. Instead we build a list and send all
   2392      0      stevel 			 * the icmp messages after we have dropped the lock.
   2393      0      stevel 			 */
   2394      0      stevel 			if (ill->ill_isv6) {
   2395      0      stevel 				if (hdr_length != 0) {
   2396      0      stevel 					mp->b_next = send_icmp_head_v6;
   2397      0      stevel 					send_icmp_head_v6 = mp;
   2398      0      stevel 				} else {
   2399      0      stevel 					freemsg(mp);
   2400      0      stevel 				}
   2401      0      stevel 			} else {
   2402      0      stevel 				if (hdr_length != 0) {
   2403      0      stevel 					mp->b_next = send_icmp_head;
   2404      0      stevel 					send_icmp_head = mp;
   2405      0      stevel 				} else {
   2406      0      stevel 					freemsg(mp);
   2407      0      stevel 				}
   2408      0      stevel 			}
   2409   3284    apersson 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2410  11042        Erik 			ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
   2411      0      stevel 			freeb(ipf->ipf_mp);
   2412      0      stevel 		}
   2413      0      stevel 		mutex_exit(&ipfb->ipfb_lock);
   2414      0      stevel 		/*
   2415      0      stevel 		 * Now need to send any icmp messages that we delayed from
   2416      0      stevel 		 * above.
   2417      0      stevel 		 */
   2418      0      stevel 		while (send_icmp_head_v6 != NULL) {
   2419   2733    nordmark 			ip6_t *ip6h;
   2420   2733    nordmark 
   2421      0      stevel 			mp = send_icmp_head_v6;
   2422      0      stevel 			send_icmp_head_v6 = send_icmp_head_v6->b_next;
   2423      0      stevel 			mp->b_next = NULL;
   2424  11042        Erik 			ip6h = (ip6_t *)mp->b_rptr;
   2425  11042        Erik 			iras.ira_flags = 0;
   2426  11042        Erik 			/*
   2427  11042        Erik 			 * This will result in an incorrect ALL_ZONES zoneid
   2428  11042        Erik 			 * for multicast packets, but we
   2429  11042        Erik 			 * don't send ICMP errors for those in any case.
   2430  11042        Erik 			 */
   2431  11042        Erik 			iras.ira_zoneid =
   2432  11042        Erik 			    ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
   2433   3448    dh155122 			    ill, ipst);
   2434  11042        Erik 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2435  11042        Erik 			icmp_time_exceeded_v6(mp,
   2436  11042        Erik 			    ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
   2437  11042        Erik 			    &iras);
   2438  11042        Erik 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2439      0      stevel 		}
   2440      0      stevel 		while (send_icmp_head != NULL) {
   2441   2733    nordmark 			ipaddr_t dst;
   2442   2733    nordmark 
   2443      0      stevel 			mp = send_icmp_head;
   2444      0      stevel 			send_icmp_head = send_icmp_head->b_next;
   2445      0      stevel 			mp->b_next = NULL;
   2446   2733    nordmark 
   2447  11042        Erik 			dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
   2448  11042        Erik 
   2449  11042        Erik 			iras.ira_flags = IRAF_IS_IPV4;
   2450  11042        Erik 			/*
   2451  11042        Erik 			 * This will result in an incorrect ALL_ZONES zoneid
   2452  11042        Erik 			 * for broadcast and multicast packets, but we
   2453  11042        Erik 			 * don't send ICMP errors for those in any case.
   2454  11042        Erik 			 */
   2455  11042        Erik 			iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
   2456  11042        Erik 			    ill, ipst);
   2457  11042        Erik 			ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
   2458  11042        Erik 			icmp_time_exceeded(mp,
   2459  11042        Erik 			    ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
   2460  11042        Erik 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2461      0      stevel 		}
   2462      0      stevel 	}
   2463      0      stevel 	/*
   2464      0      stevel 	 * A non-dying ILL will use the return value to decide whether to
   2465      0      stevel 	 * restart the frag timer, and for how long.
   2466      0      stevel 	 */
   2467      0      stevel 	return (next_timeout);
   2468      0      stevel }
   2469      0      stevel 
   2470      0      stevel /*
   2471      0      stevel  * This routine is called when the approximate count of mblk memory used
   2472      0      stevel  * for the specified ILL has exceeded max_count.
   2473      0      stevel  */
   2474      0      stevel void
   2475      0      stevel ill_frag_prune(ill_t *ill, uint_t max_count)
   2476      0      stevel {
   2477      0      stevel 	ipfb_t	*ipfb;
   2478      0      stevel 	ipf_t	*ipf;
   2479      0      stevel 	size_t	count;
   2480  11066      rafael 	clock_t now;
   2481      0      stevel 
   2482      0      stevel 	/*
   2483      0      stevel 	 * If we are here within ip_min_frag_prune_time msecs remove
   2484      0      stevel 	 * ill_frag_free_num_pkts oldest packets from each bucket and increment
   2485      0      stevel 	 * ill_frag_free_num_pkts.
   2486      0      stevel 	 */
   2487      0      stevel 	mutex_enter(&ill->ill_lock);
   2488  11066      rafael 	now = ddi_get_lbolt();
   2489  11066      rafael 	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
   2490      0      stevel 	    (ip_min_frag_prune_time != 0 ?
   2491      0      stevel 	    ip_min_frag_prune_time : msec_per_tick)) {
   2492      0      stevel 
   2493      0      stevel 		ill->ill_frag_free_num_pkts++;
   2494      0      stevel 
   2495      0      stevel 	} else {
   2496      0      stevel 		ill->ill_frag_free_num_pkts = 0;
   2497      0      stevel 	}
   2498  11066      rafael 	ill->ill_last_frag_clean_time = now;
   2499      0      stevel 	mutex_exit(&ill->ill_lock);
   2500      0      stevel 
   2501      0      stevel 	/*
   2502      0      stevel 	 * free ill_frag_free_num_pkts oldest packets from each bucket.
   2503      0      stevel 	 */
   2504      0      stevel 	if (ill->ill_frag_free_num_pkts != 0) {
   2505      0      stevel 		int ix;
   2506      0      stevel 
   2507      0      stevel 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2508      0      stevel 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2509      0      stevel 			mutex_enter(&ipfb->ipfb_lock);
   2510      0      stevel 			if (ipfb->ipfb_ipf != NULL) {
   2511      0      stevel 				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
   2512      0      stevel 				    ill->ill_frag_free_num_pkts);
   2513      0      stevel 			}
   2514      0      stevel 			mutex_exit(&ipfb->ipfb_lock);
   2515      0      stevel 		}
   2516      0      stevel 	}
   2517      0      stevel 	/*
   2518      0      stevel 	 * While the reassembly list for this ILL is too big, prune a fragment
   2519   6759     georges 	 * queue by age, oldest first.
   2520      0      stevel 	 */
   2521      0      stevel 	while (ill->ill_frag_count > max_count) {
   2522      0      stevel 		int	ix;
   2523      0      stevel 		ipfb_t	*oipfb = NULL;
   2524      0      stevel 		uint_t	oldest = UINT_MAX;
   2525      0      stevel 
   2526      0      stevel 		count = 0;
   2527      0      stevel 		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
   2528      0      stevel 			ipfb = &ill->ill_frag_hash_tbl[ix];
   2529      0      stevel 			mutex_enter(&ipfb->ipfb_lock);
   2530      0      stevel 			ipf = ipfb->ipfb_ipf;
   2531      0      stevel 			if (ipf != NULL && ipf->ipf_gen < oldest) {
   2532      0      stevel 				oldest = ipf->ipf_gen;
   2533      0      stevel 				oipfb = ipfb;
   2534      0      stevel 			}
   2535      0      stevel 			count += ipfb->ipfb_count;
   2536      0      stevel 			mutex_exit(&ipfb->ipfb_lock);
   2537      0      stevel 		}
   2538   6759     georges 		if (oipfb == NULL)
   2539   6759     georges 			break;
   2540   6759     georges 
   2541      0      stevel 		if (count <= max_count)
   2542      0      stevel 			return;	/* Somebody beat us to it, nothing to do */
   2543      0      stevel 		mutex_enter(&oipfb->ipfb_lock);
   2544      0      stevel 		ipf = oipfb->ipfb_ipf;
   2545      0      stevel 		if (ipf != NULL) {
   2546      0      stevel 			ill_frag_free_pkts(ill, oipfb, ipf, 1);
   2547      0      stevel 		}
   2548      0      stevel 		mutex_exit(&oipfb->ipfb_lock);
   2549      0      stevel 	}
   2550      0      stevel }
   2551      0      stevel 
   2552      0      stevel /*
   2553      0      stevel  * free 'free_cnt' fragmented packets starting at ipf.
   2554      0      stevel  */
   2555      0      stevel void
   2556      0      stevel ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
   2557      0      stevel {
   2558      0      stevel 	size_t	count;
   2559      0      stevel 	mblk_t	*mp;
   2560      0      stevel 	mblk_t	*tmp;
   2561      0      stevel 	ipf_t **ipfp = ipf->ipf_ptphn;
   2562      0      stevel 
   2563      0      stevel 	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
   2564      0      stevel 	ASSERT(ipfp != NULL);
   2565      0      stevel 	ASSERT(ipf != NULL);
   2566      0      stevel 
   2567      0      stevel 	while (ipf != NULL && free_cnt-- > 0) {
   2568      0      stevel 		count = ipf->ipf_count;
   2569      0      stevel 		mp = ipf->ipf_mp;
   2570      0      stevel 		ipf = ipf->ipf_hash_next;
   2571      0      stevel 		for (tmp = mp; tmp; tmp = tmp->b_cont) {
   2572      0      stevel 			IP_REASS_SET_START(tmp, 0);
   2573      0      stevel 			IP_REASS_SET_END(tmp, 0);
   2574      0      stevel 		}
   2575   6759     georges 		atomic_add_32(&ill->ill_frag_count, -count);
   2576      0      stevel 		ASSERT(ipfb->ipfb_count >= count);
   2577      0      stevel 		ipfb->ipfb_count -= count;
   2578      0      stevel 		ASSERT(ipfb->ipfb_frag_pkts > 0);
   2579      0      stevel 		ipfb->ipfb_frag_pkts--;
   2580  11042        Erik 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
   2581  11042        Erik 		ip_drop_input("ipIfStatsReasmFails", mp, ill);
   2582      0      stevel 		freemsg(mp);
   2583      0      stevel 	}
   2584      0      stevel 
   2585      0      stevel 	if (ipf)
   2586      0      stevel 		ipf->ipf_ptphn = ipfp;
   2587      0      stevel 	ipfp[0] = ipf;
   2588      0      stevel }
   2589      0      stevel 
   2590      0      stevel #define	ND_FORWARD_WARNING	"The <if>:ip*_forwarding ndd variables are " \
   2591      0      stevel 	"obsolete and may be removed in a future release of Solaris.  Use " \
   2592      0      stevel 	"ifconfig(1M) to manipulate the forwarding status of an interface."
   2593      0      stevel 
   2594      0      stevel /*
   2595      0      stevel  * For obsolete per-interface forwarding configuration;
   2596      0      stevel  * called in response to ND_GET.
   2597      0      stevel  */
   2598      0      stevel /* ARGSUSED */
   2599      0      stevel static int
   2600      0      stevel nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
   2601      0      stevel {
   2602      0      stevel 	ill_t *ill = (ill_t *)cp;
   2603      0      stevel 
   2604      0      stevel 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2605      0      stevel 
   2606      0      stevel 	(void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0);
   2607      0      stevel 	return (0);
   2608      0      stevel }
   2609      0      stevel 
   2610      0      stevel /*
   2611      0      stevel  * For obsolete per-interface forwarding configuration;
   2612      0      stevel  * called in response to ND_SET.
   2613      0      stevel  */
   2614      0      stevel /* ARGSUSED */
   2615      0      stevel static int
   2616      0      stevel nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
   2617      0      stevel     cred_t *ioc_cr)
   2618      0      stevel {
   2619      0      stevel 	long value;
   2620      0      stevel 	int retval;
   2621   3448    dh155122 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
   2622      0      stevel 
   2623      0      stevel 	cmn_err(CE_WARN, ND_FORWARD_WARNING);
   2624      0      stevel 
   2625      0      stevel 	if (ddi_strtol(valuestr, NULL, 10, &value) != 0 ||
   2626      0      stevel 	    value < 0 || value > 1) {
   2627      0      stevel 		return (EINVAL);
   2628      0      stevel 	}
   2629      0      stevel 
   2630   3448    dh155122 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   2631   4360        meem 	retval = ill_forward_set((ill_t *)cp, (value != 0));
   2632   3448    dh155122 	rw_exit(&ipst->ips_ill_g_lock);
   2633      0      stevel 	return (retval);
   2634      0      stevel }
   2635      0      stevel 
   2636      0      stevel /*
   2637   8485       Peter  * Helper function for ill_forward_set().
   2638   8485       Peter  */
   2639   8485       Peter static void
   2640   8485       Peter ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
   2641   8485       Peter {
   2642   8485       Peter 	ip_stack_t	*ipst = ill->ill_ipst;
   2643   8485       Peter 
   2644   8485       Peter 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2645   8485       Peter 
   2646   8485       Peter 	ip1dbg(("ill_forward_set: %s %s forwarding on %s",
   2647   8485       Peter 	    (enable ? "Enabling" : "Disabling"),
   2648   8485       Peter 	    (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
   2649   8485       Peter 	mutex_enter(&ill->ill_lock);
   2650   8485       Peter 	if (enable)
   2651   8485       Peter 		ill->ill_flags |= ILLF_ROUTER;
   2652   8485       Peter 	else
   2653   8485       Peter 		ill->ill_flags &= ~ILLF_ROUTER;
   2654   8485       Peter 	mutex_exit(&ill->ill_lock);
   2655   8485       Peter 	if (ill->ill_isv6)
   2656   8485       Peter 		ill_set_nce_router_flags(ill, enable);
   2657   8485       Peter 	/* Notify routing socket listeners of this change. */
   2658   9658     Sowmini 	if (ill->ill_ipif != NULL)
   2659   9658     Sowmini 		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
   2660   8485       Peter }
   2661   8485       Peter 
   2662   8485       Peter /*
   2663   8485       Peter  * Set an ill's ILLF_ROUTER flag appropriately.  Send up RTS_IFINFO routing
   2664   8485       Peter  * socket messages for each interface whose flags we change.
   2665      0      stevel  */
   2666   4360        meem int
   2667   4360        meem ill_forward_set(ill_t *ill, boolean_t enable)
   2668   4360        meem {
   2669   8485       Peter 	ipmp_illgrp_t *illg;
   2670   8485       Peter 	ip_stack_t *ipst = ill->ill_ipst;
   2671   3448    dh155122 
   2672   3448    dh155122 	ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
   2673      0      stevel 
   2674      0      stevel 	if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
   2675   4360        meem 	    (!enable && !(ill->ill_flags & ILLF_ROUTER)))
   2676   4360        meem 		return (0);
   2677   4360        meem 
   2678   4459      kcpoon 	if (IS_LOOPBACK(ill))
   2679      0      stevel 		return (EINVAL);
   2680      0      stevel 
   2681   8485       Peter 	if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
   2682   8485       Peter 		/*
   2683   8485       Peter 		 * Update all of the interfaces in the group.
   2684   8485       Peter 		 */
   2685   8485       Peter 		illg = ill->ill_grp;
   2686   8485       Peter 		ill = list_head(&illg->ig_if);
   2687   8485       Peter 		for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
   2688   8485       Peter 			ill_forward_set_on_ill(ill, enable);
   2689   8485       Peter 
   2690   8485       Peter 		/*
   2691   8485       Peter 		 * Update the IPMP meta-interface.
   2692   8485       Peter 		 */
   2693   8485       Peter 		ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
   2694   8485       Peter 		return (0);
   2695   8485       Peter 	}
   2696   8485       Peter 
   2697   8485       Peter 	ill_forward_set_on_ill(ill, enable);
   2698      0      stevel 	return (0);
   2699      0      stevel }
   2700      0      stevel 
   2701      0      stevel /*
   2702      0      stevel  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
   2703      0      stevel  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
   2704      0      stevel  * set or clear.
   2705      0      stevel  */
   2706      0      stevel static void
   2707      0      stevel ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
   2708      0      stevel {
   2709      0      stevel 	ipif_t *ipif;
   2710  11042        Erik 	ncec_t *ncec;
   2711      0      stevel 	nce_t *nce;
   2712      0      stevel 
   2713      0      stevel 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   2714   8485       Peter 		/*
   2715   8499   Sebastien 		 * NOTE: we match across the illgrp because nce's for
   2716   8499   Sebastien 		 * addresses on IPMP interfaces have an nce_ill that points to
   2717   8499   Sebastien 		 * the bound underlying ill.
   2718   8499   Sebastien 		 */
   2719  11042        Erik 		nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
   2720      0      stevel 		if (nce != NULL) {
   2721  11042        Erik 			ncec = nce->nce_common;
   2722  11042        Erik 			mutex_enter(&ncec->ncec_lock);
   2723      0      stevel 			if (enable)
   2724  11042        Erik 				ncec->ncec_flags |= NCE_F_ISROUTER;
   2725      0      stevel 			else
   2726  11042        Erik 				ncec->ncec_flags &= ~NCE_F_ISROUTER;
   2727  11042        Erik 			mutex_exit(&ncec->ncec_lock);
   2728  11042        Erik 			nce_refrele(nce);
   2729      0      stevel 		}
   2730      0      stevel 	}
   2731      0      stevel }
   2732      0      stevel 
   2733      0      stevel /*
   2734      0      stevel  * Given an ill with a _valid_ name, add the ip_forwarding ndd variable
   2735      0      stevel  * for this ill.  Make sure the v6/v4 question has been answered about this
   2736      0      stevel  * ill.  The creation of this ndd variable is only for backwards compatibility.
   2737      0      stevel  * The preferred way to control per-interface IP forwarding is through the
   2738      0      stevel  * ILLF_ROUTER interface flag.
   2739      0      stevel  */
   2740      0      stevel static int
   2741      0      stevel ill_set_ndd_name(ill_t *ill)
   2742      0      stevel {
   2743      0      stevel 	char *suffix;
   2744   3448    dh155122 	ip_stack_t	*ipst = ill->ill_ipst;
   2745      0      stevel 
   2746      0      stevel 	ASSERT(IAM_WRITER_ILL(ill));
   2747      0      stevel 
   2748      0      stevel 	if (ill->ill_isv6)
   2749      0      stevel 		suffix = ipv6_forward_suffix;
   2750      0      stevel 	else
   2751      0      stevel 		suffix = ipv4_forward_suffix;
   2752      0      stevel 
   2753      0      stevel 	ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
   2754      0      stevel 	bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
   2755      0      stevel 	/*
   2756      0      stevel 	 * Copies over the '\0'.
   2757      0      stevel 	 * Note that strlen(suffix) is always bounded.
   2758      0      stevel 	 */
   2759      0      stevel 	bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
   2760      0      stevel 	    strlen(suffix) + 1);
   2761      0      stevel 
   2762      0      stevel 	/*
   2763      0      stevel 	 * Use of the nd table requires holding the reader lock.
   2764      0      stevel 	 * Modifying the nd table thru nd_load/nd_unload requires
   2765      0      stevel 	 * the writer lock.
   2766      0      stevel 	 */
   2767   3448    dh155122 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
   2768   3448    dh155122 	if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
   2769      0      stevel 	    nd_ill_forward_set, (caddr_t)ill)) {
   2770      0      stevel 		/*
   2771      0      stevel 		 * If the nd_load failed, it only meant that it could not
   2772      0      stevel 		 * allocate a new bunch of room for further NDD expansion.
   2773      0      stevel 		 * Because of that, the ill_ndd_name will be set to 0, and
   2774      0      stevel 		 * this interface is at the mercy of the global ip_forwarding
   2775      0      stevel 		 * variable.
   2776      0      stevel 		 */
   2777   3448    dh155122 		rw_exit(&ipst->ips_ip_g_nd_lock);
   2778      0      stevel 		ill->ill_ndd_name = NULL;
   2779      0      stevel 		return (ENOMEM);
   2780      0      stevel 	}
   2781   3448    dh155122 	rw_exit(&ipst->ips_ip_g_nd_lock);
   2782      0      stevel 	return (0);
   2783      0      stevel }
   2784      0      stevel 
   2785      0      stevel /*
   2786      0      stevel  * Intializes the context structure and returns the first ill in the list
   2787      0      stevel  * cuurently start_list and end_list can have values:
   2788      0      stevel  * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
   2789      0      stevel  * IP_V4_G_HEAD		Traverse IPV4 list only.
   2790      0      stevel  * IP_V6_G_HEAD		Traverse IPV6 list only.
   2791      0      stevel  */
   2792      0      stevel 
   2793      0      stevel /*
   2794      0      stevel  * We don't check for CONDEMNED ills here. Caller must do that if
   2795      0      stevel  * necessary under the ill lock.
   2796      0      stevel  */
   2797      0      stevel ill_t *
   2798   3448    dh155122 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
   2799   3448    dh155122     ip_stack_t *ipst)
   2800      0      stevel {
   2801      0      stevel 	ill_if_t *ifp;
   2802      0      stevel 	ill_t *ill;
   2803      0      stevel 	avl_tree_t *avl_tree;
   2804      0      stevel 
   2805   3448    dh155122 	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
   2806      0      stevel 	ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
   2807      0      stevel 
   2808      0      stevel 	/*
   2809      0      stevel 	 * setup the lists to search
   2810      0      stevel 	 */
   2811      0      stevel 	if (end_list != MAX_G_HEADS) {
   2812      0      stevel 		ctx->ctx_current_list = start_list;
   2813      0      stevel 		ctx->ctx_last_list = end_list;
   2814      0      stevel 	} else {
   2815      0      stevel 		ctx->ctx_last_list = MAX_G_HEADS - 1;
   2816      0      stevel 		ctx->ctx_current_list = 0;
   2817      0      stevel 	}
   2818      0      stevel 
   2819      0      stevel 	while (ctx->ctx_current_list <= ctx->ctx_last_list) {
   2820   3448    dh155122 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2821      0      stevel 		if (ifp != (ill_if_t *)
   2822   3448    dh155122 		    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2823      0      stevel 			avl_tree = &ifp->illif_avl_by_ppa;
   2824      0      stevel 			ill = avl_first(avl_tree);
   2825      0      stevel 			/*
   2826      0      stevel 			 * ill is guaranteed to be non NULL or ifp should have
   2827      0      stevel 			 * not existed.
   2828      0      stevel 			 */
   2829      0      stevel 			ASSERT(ill != NULL);
   2830      0      stevel 			return (ill);
   2831      0      stevel 		}
   2832      0      stevel 		ctx->ctx_current_list++;
   2833      0      stevel 	}
   2834      0      stevel 
   2835      0      stevel 	return (NULL);
   2836      0      stevel }
   2837      0      stevel 
   2838      0      stevel /*
   2839      0      stevel  * returns the next ill in the list. ill_first() must have been called
   2840      0      stevel  * before calling ill_next() or bad things will happen.
   2841      0      stevel  */
   2842      0      stevel 
   2843      0      stevel /*
   2844      0      stevel  * We don't check for CONDEMNED ills here. Caller must do that if
   2845      0      stevel  * necessary under the ill lock.
   2846      0      stevel  */
   2847      0      stevel ill_t *
   2848      0      stevel ill_next(ill_walk_context_t *ctx, ill_t *lastill)
   2849      0      stevel {
   2850      0      stevel 	ill_if_t *ifp;
   2851      0      stevel 	ill_t *ill;
   2852   3448    dh155122 	ip_stack_t	*ipst = lastill->ill_ipst;
   2853   3448    dh155122 
   2854      0      stevel 	ASSERT(lastill->ill_ifptr != (ill_if_t *)
   2855   3448    dh155122 	    &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
   2856      0      stevel 	if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
   2857      0      stevel 	    AVL_AFTER)) != NULL) {
   2858      0      stevel 		return (ill);
   2859      0      stevel 	}
   2860      0      stevel 
   2861      0      stevel 	/* goto next ill_ifp in the list. */
   2862      0      stevel 	ifp = lastill->ill_ifptr->illif_next;
   2863      0      stevel 
   2864      0      stevel 	/* make sure not at end of circular list */
   2865   3448    dh155122 	while (ifp ==
   2866   3448    dh155122 	    (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
   2867      0      stevel 		if (++ctx->ctx_current_list > ctx->ctx_last_list)
   2868      0      stevel 			return (NULL);
   2869   3448    dh155122 		ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
   2870      0      stevel 	}
   2871      0      stevel 
   2872      0      stevel 	return (avl_first(&ifp->illif_avl_by_ppa));
   2873      0      stevel }
   2874      0      stevel 
   2875      0      stevel /*
   2876   8485       Peter  * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
   2877   8485       Peter  * The final number (PPA) must not have any leading zeros.  Upon success, a
   2878   8485       Peter  * pointer to the start of the PPA is returned; otherwise NULL is returned.
   2879      0      stevel  */
   2880      0      stevel static char *
   2881      0      stevel ill_get_ppa_ptr(char *name)
   2882      0      stevel {
   2883   8485       Peter 	int namelen = strlen(name);
   2884   8485       Peter 	int end_ndx = namelen - 1;
   2885   8485       Peter 	int ppa_ndx, i;
   2886   8485       Peter 
   2887   8485       Peter 	/*
   2888   8485       Peter 	 * Check that the first character is [a-zA-Z], and that the last
   2889   8485       Peter 	 * character is [0-9].
   2890   8485       Peter 	 */
   2891   8485       Peter 	if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
   2892   8485       Peter 		return (NULL);
   2893   8485       Peter 
   2894   8485       Peter 	/*
   2895   8485       Peter 	 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
   2896   8485       Peter 	 */
   2897   8485       Peter 	for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
   2898   8485       Peter 		if (!isdigit(name[ppa_ndx - 1]))
   2899      0      stevel 			break;
   2900   8485       Peter 
   2901   8485       Peter 	if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
   2902   8485       Peter 		return (NULL);
   2903   8485       Peter 
   2904   8485       Peter 	/*
   2905   8485       Peter 	 * Check that the intermediate characters are [a-z0-9.]
   2906   8485       Peter 	 */
   2907   8485       Peter 	for (i = 1; i < ppa_ndx; i++) {
   2908   8485       Peter 		if (!isalpha(name[i]) && !isdigit(name[i]) &&
   2909   8485       Peter 		    name[i] != '.' && name[i] != '_') {
   2910   8485       Peter 			return (NULL);
   2911   8485       Peter 		}
   2912   8485       Peter 	}
   2913   8485       Peter 
   2914   8485       Peter 	return (name + ppa_ndx);
   2915      0      stevel }
   2916      0      stevel 
   2917      0      stevel /*
   2918      0      stevel  * use avl tree to locate the ill.
   2919      0      stevel  */
   2920      0      stevel static ill_t *
   2921  11042        Erik ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
   2922      0      stevel {
   2923      0      stevel 	char *ppa_ptr = NULL;
   2924      0      stevel 	int len;
   2925      0      stevel 	uint_t ppa;
   2926      0      stevel 	ill_t *ill = NULL;
   2927      0      stevel 	ill_if_t *ifp;
   2928      0      stevel 	int list;
   2929      0      stevel 
   2930      0      stevel 	/*
   2931      0      stevel 	 * get ppa ptr
   2932      0      stevel 	 */
   2933      0      stevel 	if (isv6)
   2934      0      stevel 		list = IP_V6_G_HEAD;
   2935      0      stevel 	else
   2936      0      stevel 		list = IP_V4_G_HEAD;
   2937      0      stevel 
   2938      0      stevel 	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
   2939      0      stevel 		return (NULL);
   2940      0      stevel 	}
   2941      0      stevel 
   2942      0      stevel 	len = ppa_ptr - name + 1;
   2943      0      stevel 
   2944      0      stevel 	ppa = stoi(&ppa_ptr);
   2945      0      stevel 
   2946   3448    dh155122 	ifp = IP_VX_ILL_G_LIST(list, ipst);
   2947   3448    dh155122 
   2948   3448    dh155122 	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   2949      0      stevel 		/*
   2950      0      stevel 		 * match is done on len - 1 as the name is not null
   2951      0      stevel 		 * terminated it contains ppa in addition to the interface
   2952      0      stevel 		 * name.
   2953      0      stevel 		 */
   2954      0      stevel 		if ((ifp->illif_name_len == len) &&
   2955      0      stevel 		    bcmp(ifp->illif_name, name, len - 1) == 0) {
   2956      0      stevel 			break;
   2957      0      stevel 		} else {
   2958      0      stevel 			ifp = ifp->illif_next;
   2959      0      stevel 		}
   2960      0      stevel 	}
   2961      0      stevel 
   2962   3448    dh155122 	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
   2963      0      stevel 		/*
   2964      0      stevel 		 * Even the interface type does not exist.
   2965      0      stevel 		 */
   2966      0      stevel 		return (NULL);
   2967      0      stevel 	}
   2968      0      stevel 
   2969      0      stevel 	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
   2970      0      stevel 	if (ill != NULL) {
   2971      0      stevel 		mutex_enter(&ill->ill_lock);
   2972      0      stevel 		if (ILL_CAN_LOOKUP(ill)) {
   2973      0      stevel 			ill_refhold_locked(ill);
   2974      0      stevel 			mutex_exit(&ill->ill_lock);
   2975      0      stevel 			return (ill);
   2976  11042        Erik 		}
   2977  11042        Erik 		mutex_exit(&ill->ill_lock);
   2978  11042        Erik 	}
   2979      0      stevel 	return (NULL);
   2980      0      stevel }
   2981      0      stevel 
   2982      0      stevel /*
   2983      0      stevel  * comparison function for use with avl.
   2984      0      stevel  */
   2985      0      stevel static int
   2986      0      stevel ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
   2987      0      stevel {
   2988      0      stevel 	uint_t ppa;
   2989      0      stevel 	uint_t ill_ppa;
   2990      0      stevel 
   2991      0      stevel 	ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
   2992      0      stevel 
   2993      0      stevel 	ppa = *((uint_t *)ppa_ptr);
   2994      0      stevel 	ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
   2995      0      stevel 	/*
   2996      0      stevel 	 * We want the ill with the lowest ppa to be on the
   2997      0      stevel 	 * top.
   2998      0      stevel 	 */
   2999      0      stevel 	if (ill_ppa < ppa)
   3000      0      stevel 		return (1);
   3001      0      stevel 	if (ill_ppa > ppa)
   3002      0      stevel 		return (-1);
   3003      0      stevel 	return (0);
   3004      0      stevel }
   3005      0      stevel 
   3006      0      stevel /*
   3007      0      stevel  * remove an interface type from the global list.
   3008      0      stevel  */
   3009      0      stevel static void
   3010      0      stevel ill_delete_interface_type(ill_if_t *interface)
   3011      0      stevel {
   3012      0      stevel 	ASSERT(interface != NULL);
   3013      0      stevel 	ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
   3014      0      stevel 
   3015      0      stevel 	avl_destroy(&interface->