1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1392 ja97890 * Common Development and Distribution License (the "License"). 6 1392 ja97890 * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 0 stevel /* 22 8485 Peter * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 0 stevel * Use is subject to license terms. 24 0 stevel */ 25 0 stevel /* Copyright (c) 1990 Mentat Inc. */ 26 0 stevel 27 0 stevel /* 28 0 stevel * This file contains the interface control functions for IP. 29 0 stevel */ 30 0 stevel 31 0 stevel #include <sys/types.h> 32 0 stevel #include <sys/stream.h> 33 0 stevel #include <sys/dlpi.h> 34 0 stevel #include <sys/stropts.h> 35 0 stevel #include <sys/strsun.h> 36 0 stevel #include <sys/sysmacros.h> 37 8778 Erik #include <sys/strsubr.h> 38 0 stevel #include <sys/strlog.h> 39 0 stevel #include <sys/ddi.h> 40 0 stevel #include <sys/sunddi.h> 41 0 stevel #include <sys/cmn_err.h> 42 0 stevel #include <sys/kstat.h> 43 0 stevel #include <sys/debug.h> 44 0 stevel #include <sys/zone.h> 45 3448 dh155122 #include <sys/sunldi.h> 46 3448 dh155122 #include <sys/file.h> 47 5023 carlsonj #include <sys/bitmap.h> 48 8275 Eric #include <sys/cpuvar.h> 49 8275 Eric #include <sys/time.h> 50 8485 Peter #include <sys/ctype.h> 51 0 stevel #include <sys/kmem.h> 52 0 stevel #include <sys/systm.h> 53 0 stevel #include <sys/param.h> 54 0 stevel #include <sys/socket.h> 55 0 stevel #include <sys/isa_defs.h> 56 0 stevel #include <net/if.h> 57 0 stevel #include <net/if_arp.h> 58 0 stevel #include <net/if_types.h> 59 0 stevel #include <net/if_dl.h> 60 0 stevel #include <net/route.h> 61 0 stevel #include <sys/sockio.h> 62 0 stevel #include <netinet/in.h> 63 0 stevel #include <netinet/ip6.h> 64 0 stevel #include <netinet/icmp6.h> 65 0 stevel #include <netinet/igmp_var.h> 66 0 stevel #include <sys/policy.h> 67 0 stevel #include <sys/ethernet.h> 68 8275 Eric #include <sys/callb.h> 69 8485 Peter #include <sys/md5.h> 70 0 stevel 71 0 stevel #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 72 0 stevel #include <inet/mi.h> 73 0 stevel #include <inet/nd.h> 74 0 stevel #include <inet/arp.h> 75 11042 Erik #include <inet/ip_arp.h> 76 0 stevel #include <inet/mib2.h> 77 0 stevel #include <inet/ip.h> 78 0 stevel #include <inet/ip6.h> 79 0 stevel #include <inet/ip6_asp.h> 80 0 stevel #include <inet/tcp.h> 81 0 stevel #include <inet/ip_multi.h> 82 0 stevel #include <inet/ip_ire.h> 83 2535 sangeeta #include <inet/ip_ftable.h> 84 0 stevel #include <inet/ip_rts.h> 85 0 stevel #include <inet/ip_ndp.h> 86 0 stevel #include <inet/ip_if.h> 87 741 masputra #include <inet/ip_impl.h> 88 0 stevel #include <inet/sctp_ip.h> 89 2958 dr146992 #include <inet/ip_netinfo.h> 90 10946 Sangeeta #include <inet/ilb_ip.h> 91 0 stevel 92 0 stevel #include <netinet/igmp.h> 93 0 stevel #include <inet/ip_listutils.h> 94 0 stevel #include <inet/ipclassifier.h> 95 8275 Eric #include <sys/mac_client.h> 96 8275 Eric #include <sys/dld.h> 97 0 stevel 98 0 stevel #include <sys/systeminfo.h> 99 0 stevel #include <sys/bootconf.h> 100 1676 jpk 101 1676 jpk #include <sys/tsol/tndb.h> 102 1676 jpk #include <sys/tsol/tnet.h> 103 0 stevel 104 0 stevel /* The character which tells where the ill_name ends */ 105 0 stevel #define IPIF_SEPARATOR_CHAR ':' 106 0 stevel 107 0 stevel /* IP ioctl function table entry */ 108 0 stevel typedef struct ipft_s { 109 0 stevel int ipft_cmd; 110 0 stevel pfi_t ipft_pfi; 111 0 stevel int ipft_min_size; 112 0 stevel int ipft_flags; 113 0 stevel } ipft_t; 114 0 stevel #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 115 0 stevel #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 116 0 stevel 117 0 stevel static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 118 0 stevel static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 119 0 stevel char *value, caddr_t cp, cred_t *ioc_cr); 120 0 stevel 121 6255 sowmini static boolean_t ill_is_quiescent(ill_t *); 122 0 stevel static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 123 0 stevel static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 124 0 stevel static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 125 0 stevel mblk_t *mp, boolean_t need_up); 126 0 stevel static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 127 0 stevel mblk_t *mp, boolean_t need_up); 128 0 stevel static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 129 0 stevel queue_t *q, mblk_t *mp, boolean_t need_up); 130 0 stevel static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 131 7216 meem mblk_t *mp); 132 0 stevel static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 133 0 stevel mblk_t *mp); 134 0 stevel static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 135 0 stevel queue_t *q, mblk_t *mp, boolean_t need_up); 136 4770 meem static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, 137 11042 Erik int ioccmd, struct linkblk *li); 138 3448 dh155122 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); 139 0 stevel static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 140 0 stevel static void ipsq_flush(ill_t *ill); 141 4360 meem 142 0 stevel static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 143 0 stevel queue_t *q, mblk_t *mp, boolean_t need_up); 144 0 stevel static void ipsq_delete(ipsq_t *); 145 0 stevel 146 0 stevel static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, 147 8485 Peter boolean_t initialize, boolean_t insert); 148 4770 meem static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); 149 11042 Erik static void ipif_delete_bcast_ires(ipif_t *ipif); 150 11042 Erik static int ipif_add_ires_v4(ipif_t *, boolean_t); 151 4459 kcpoon static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, 152 4459 kcpoon boolean_t isv6); 153 0 stevel static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 154 0 stevel static void ipif_free(ipif_t *ipif); 155 0 stevel static void ipif_free_tail(ipif_t *ipif); 156 0 stevel static void ipif_set_default(ipif_t *ipif); 157 0 stevel static int ipif_set_values(queue_t *q, mblk_t *mp, 158 0 stevel char *interf_name, uint_t *ppa); 159 0 stevel static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 160 0 stevel queue_t *q); 161 0 stevel static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 162 0 stevel boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 163 11042 Erik ip_stack_t *); 164 0 stevel 165 0 stevel static int ill_alloc_ppa(ill_if_t *, ill_t *); 166 0 stevel static void ill_delete_interface_type(ill_if_t *); 167 0 stevel static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 168 2546 carlsonj static void ill_dl_down(ill_t *ill); 169 0 stevel static void ill_down(ill_t *ill); 170 11076 Cathy static void ill_down_ipifs(ill_t *, boolean_t); 171 0 stevel static void ill_free_mib(ill_t *ill); 172 0 stevel static void ill_glist_delete(ill_t *); 173 0 stevel static void ill_phyint_reinit(ill_t *ill); 174 0 stevel static void ill_set_nce_router_flags(ill_t *, boolean_t); 175 3340 meem static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); 176 9073 Cathy static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *); 177 9073 Cathy 178 8485 Peter static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; 179 10616 Sebastien static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid; 180 8485 Peter static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; 181 10616 Sebastien static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid; 182 11042 Erik static ip_v4mapinfo_func_t ip_ether_v4_mapping; 183 11042 Erik static ip_v6mapinfo_func_t ip_ether_v6_mapping; 184 11042 Erik static ip_v4mapinfo_func_t ip_ib_v4_mapping; 185 11042 Erik static ip_v6mapinfo_func_t ip_ib_v6_mapping; 186 11042 Erik static ip_v4mapinfo_func_t ip_mbcast_mapping; 187 11042 Erik static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *); 188 3448 dh155122 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); 189 8485 Peter static void phyint_free(phyint_t *); 190 0 stevel 191 11042 Erik static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *); 192 0 stevel static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 193 11076 Cathy static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 194 0 stevel static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 195 8275 Eric static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); 196 0 stevel static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 197 0 stevel dl_capability_sub_t *); 198 8275 Eric static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); 199 8275 Eric static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); 200 8275 Eric static void ill_capability_dld_ack(ill_t *, mblk_t *, 201 8275 Eric dl_capability_sub_t *); 202 8275 Eric static void ill_capability_dld_enable(ill_t *); 203 8275 Eric static void ill_capability_ack_thr(void *); 204 8275 Eric static void ill_capability_lso_enable(ill_t *); 205 0 stevel 206 0 stevel static ill_t *ill_prev_usesrc(ill_t *); 207 0 stevel static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 208 0 stevel static void ill_disband_usesrc_group(ill_t *); 209 11042 Erik static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int); 210 5023 carlsonj 211 5023 carlsonj #ifdef DEBUG 212 11042 Erik static void ill_trace_cleanup(const ill_t *); 213 11042 Erik static void ipif_trace_cleanup(const ipif_t *); 214 5023 carlsonj #endif 215 0 stevel 216 0 stevel /* 217 0 stevel * if we go over the memory footprint limit more than once in this msec 218 0 stevel * interval, we'll start pruning aggressively. 219 0 stevel */ 220 0 stevel int ip_min_frag_prune_time = 0; 221 0 stevel 222 0 stevel static ipft_t ip_ioctl_ftbl[] = { 223 0 stevel { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 224 0 stevel { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 225 0 stevel IPFT_F_NO_REPLY }, 226 0 stevel { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 227 0 stevel { 0 } 228 0 stevel }; 229 0 stevel 230 0 stevel /* Simple ICMP IP Header Template */ 231 0 stevel static ipha_t icmp_ipha = { 232 0 stevel IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 233 0 stevel }; 234 0 stevel 235 0 stevel static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 236 0 stevel 237 8023 Phil static ip_m_t ip_m_tbl[] = { 238 10616 Sebastien { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 239 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 240 0 stevel ip_nodef_v6intfid }, 241 10616 Sebastien { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, 242 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 243 0 stevel ip_nodef_v6intfid }, 244 10616 Sebastien { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, 245 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 246 0 stevel ip_nodef_v6intfid }, 247 10616 Sebastien { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, 248 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 249 10616 Sebastien ip_nodef_v6intfid }, 250 10616 Sebastien { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, 251 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, 252 10616 Sebastien ip_nodef_v6intfid }, 253 10616 Sebastien { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, 254 11042 Erik ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, 255 10616 Sebastien ip_nodef_v6intfid }, 256 11042 Erik { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, 257 11042 Erik ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 258 11042 Erik ip_ipv4_v6destintfid }, 259 11042 Erik { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, 260 11042 Erik ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, 261 11042 Erik ip_ipv6_v6destintfid }, 262 11042 Erik { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, 263 11042 Erik ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, 264 11042 Erik ip_nodef_v6intfid }, 265 10616 Sebastien { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 266 10616 Sebastien NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, 267 10616 Sebastien { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 268 10616 Sebastien NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, 269 10616 Sebastien { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, 270 11042 Erik ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, 271 0 stevel ip_nodef_v6intfid } 272 0 stevel }; 273 0 stevel 274 0 stevel static ill_t ill_null; /* Empty ILL for init. */ 275 0 stevel char ipif_loopback_name[] = "lo0"; 276 0 stevel static char *ipv4_forward_suffix = ":ip_forwarding"; 277 0 stevel static char *ipv6_forward_suffix = ":ip6_forwarding"; 278 0 stevel static sin6_t sin6_null; /* Zero address for quick clears */ 279 0 stevel static sin_t sin_null; /* Zero address for quick clears */ 280 3448 dh155122 281 0 stevel /* When set search for unused ipif_seqid */ 282 0 stevel static ipif_t ipif_zero; 283 0 stevel 284 0 stevel /* 285 0 stevel * ppa arena is created after these many 286 0 stevel * interfaces have been plumbed. 287 0 stevel */ 288 3448 dh155122 uint_t ill_no_arena = 12; /* Setable in /etc/system */ 289 0 stevel 290 0 stevel /* 291 3284 apersson * Allocate per-interface mibs. 292 0 stevel * Returns true if ok. False otherwise. 293 0 stevel * ipsq may not yet be allocated (loopback case ). 294 0 stevel */ 295 0 stevel static boolean_t 296 0 stevel ill_allocate_mibs(ill_t *ill) 297 0 stevel { 298 0 stevel /* Already allocated? */ 299 3284 apersson if (ill->ill_ip_mib != NULL) { 300 3284 apersson if (ill->ill_isv6) 301 3284 apersson ASSERT(ill->ill_icmp6_mib != NULL); 302 0 stevel return (B_TRUE); 303 0 stevel } 304 0 stevel 305 3284 apersson ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 306 0 stevel KM_NOSLEEP); 307 3284 apersson if (ill->ill_ip_mib == NULL) { 308 3284 apersson return (B_FALSE); 309 3284 apersson } 310 3284 apersson 311 3284 apersson /* Setup static information */ 312 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 313 3284 apersson sizeof (mib2_ipIfStatsEntry_t)); 314 3284 apersson if (ill->ill_isv6) { 315 3284 apersson ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 316 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 317 3284 apersson sizeof (mib2_ipv6AddrEntry_t)); 318 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 319 3284 apersson sizeof (mib2_ipv6RouteEntry_t)); 320 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 321 3284 apersson sizeof (mib2_ipv6NetToMediaEntry_t)); 322 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 323 3284 apersson sizeof (ipv6_member_t)); 324 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 325 3284 apersson sizeof (ipv6_grpsrc_t)); 326 3284 apersson } else { 327 3284 apersson ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 328 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 329 3284 apersson sizeof (mib2_ipAddrEntry_t)); 330 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 331 3284 apersson sizeof (mib2_ipRouteEntry_t)); 332 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 333 3284 apersson sizeof (mib2_ipNetToMediaEntry_t)); 334 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 335 3284 apersson sizeof (ip_member_t)); 336 3284 apersson SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 337 3284 apersson sizeof (ip_grpsrc_t)); 338 3284 apersson 339 3284 apersson /* 340 3284 apersson * For a v4 ill, we are done at this point, because per ill 341 3284 apersson * icmp mibs are only used for v6. 342 3284 apersson */ 343 3284 apersson return (B_TRUE); 344 3284 apersson } 345 3284 apersson 346 0 stevel ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 347 0 stevel KM_NOSLEEP); 348 0 stevel if (ill->ill_icmp6_mib == NULL) { 349 3284 apersson kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 350 3284 apersson ill->ill_ip_mib = NULL; 351 3284 apersson return (B_FALSE); 352 3284 apersson } 353 3284 apersson /* static icmp info */ 354 3284 apersson ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 355 3284 apersson sizeof (mib2_ipv6IfIcmpEntry_t); 356 3284 apersson /* 357 3284 apersson * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 358 0 stevel * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 359 0 stevel * -> ill_phyint_reinit 360 0 stevel */ 361 0 stevel return (B_TRUE); 362 0 stevel } 363 0 stevel 364 0 stevel /* 365 0 stevel * Completely vaporize a lower level tap and all associated interfaces. 366 0 stevel * ill_delete is called only out of ip_close when the device control 367 0 stevel * stream is being closed. 368 0 stevel */ 369 0 stevel void 370 0 stevel ill_delete(ill_t *ill) 371 0 stevel { 372 0 stevel ipif_t *ipif; 373 0 stevel ill_t *prev_ill; 374 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 375 0 stevel 376 0 stevel /* 377 0 stevel * ill_delete may be forcibly entering the ipsq. The previous 378 0 stevel * ioctl may not have completed and may need to be aborted. 379 0 stevel * ipsq_flush takes care of it. If we don't need to enter the 380 0 stevel * the ipsq forcibly, the 2nd invocation of ipsq_flush in 381 0 stevel * ill_delete_tail is sufficient. 382 0 stevel */ 383 0 stevel ipsq_flush(ill); 384 0 stevel 385 0 stevel /* 386 0 stevel * Nuke all interfaces. ipif_free will take down the interface, 387 0 stevel * remove it from the list, and free the data structure. 388 0 stevel * Walk down the ipif list and remove the logical interfaces 389 0 stevel * first before removing the main ipif. We can't unplumb 390 11042 Erik * zeroth interface first in the case of IPv6 as update_conn_ill 391 11042 Erik * -> ip_ll_multireq de-references ill_ipif for checking 392 0 stevel * POINTOPOINT. 393 0 stevel * 394 0 stevel * If ill_ipif was not properly initialized (i.e low on memory), 395 0 stevel * then no interfaces to clean up. In this case just clean up the 396 0 stevel * ill. 397 0 stevel */ 398 0 stevel for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 399 0 stevel ipif_free(ipif); 400 0 stevel 401 0 stevel /* 402 11042 Erik * clean out all the nce_t entries that depend on this 403 11042 Erik * ill for the ill_phys_addr. 404 11042 Erik */ 405 11042 Erik nce_flush(ill, B_TRUE); 406 0 stevel 407 0 stevel /* Clean up msgs on pending upcalls for mrouted */ 408 0 stevel reset_mrt_ill(ill); 409 0 stevel 410 11042 Erik update_conn_ill(ill, ipst); 411 8023 Phil 412 8023 Phil /* 413 8023 Phil * Remove multicast references added as a result of calls to 414 8023 Phil * ip_join_allmulti(). 415 8023 Phil */ 416 8023 Phil ip_purge_allmulti(ill); 417 8485 Peter 418 8485 Peter /* 419 8485 Peter * If the ill being deleted is under IPMP, boot it out of the illgrp. 420 8485 Peter */ 421 8485 Peter if (IS_UNDER_IPMP(ill)) 422 8485 Peter ipmp_ill_leave_illgrp(ill); 423 0 stevel 424 0 stevel /* 425 0 stevel * ill_down will arrange to blow off any IRE's dependent on this 426 0 stevel * ILL, and shut down fragmentation reassembly. 427 0 stevel */ 428 0 stevel ill_down(ill); 429 0 stevel 430 0 stevel /* Let SCTP know, so that it can remove this from its list. */ 431 0 stevel sctp_update_ill(ill, SCTP_ILL_REMOVE); 432 11042 Erik 433 11042 Erik /* 434 11042 Erik * Walk all CONNs that can have a reference on an ire or nce for this 435 11042 Erik * ill (we actually walk all that now have stale references). 436 11042 Erik */ 437 11042 Erik ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); 438 11042 Erik 439 11042 Erik /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 440 11042 Erik if (ill->ill_isv6) 441 11042 Erik dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); 442 0 stevel 443 0 stevel /* 444 0 stevel * If an address on this ILL is being used as a source address then 445 0 stevel * clear out the pointers in other ILLs that point to this ILL. 446 0 stevel */ 447 3448 dh155122 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 448 0 stevel if (ill->ill_usesrc_grp_next != NULL) { 449 0 stevel if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ 450 0 stevel ill_disband_usesrc_group(ill); 451 0 stevel } else { /* consumer of the usesrc ILL */ 452 0 stevel prev_ill = ill_prev_usesrc(ill); 453 0 stevel prev_ill->ill_usesrc_grp_next = 454 0 stevel ill->ill_usesrc_grp_next; 455 0 stevel } 456 0 stevel } 457 3448 dh155122 rw_exit(&ipst->ips_ill_g_usesrc_lock); 458 0 stevel } 459 0 stevel 460 2546 carlsonj static void 461 2546 carlsonj ipif_non_duplicate(ipif_t *ipif) 462 2546 carlsonj { 463 2546 carlsonj ill_t *ill = ipif->ipif_ill; 464 2546 carlsonj mutex_enter(&ill->ill_lock); 465 2546 carlsonj if (ipif->ipif_flags & IPIF_DUPLICATE) { 466 2546 carlsonj ipif->ipif_flags &= ~IPIF_DUPLICATE; 467 2546 carlsonj ASSERT(ill->ill_ipif_dup_count > 0); 468 2546 carlsonj ill->ill_ipif_dup_count--; 469 2546 carlsonj } 470 2546 carlsonj mutex_exit(&ill->ill_lock); 471 2546 carlsonj } 472 2546 carlsonj 473 0 stevel /* 474 0 stevel * ill_delete_tail is called from ip_modclose after all references 475 0 stevel * to the closing ill are gone. The wait is done in ip_modclose 476 0 stevel */ 477 0 stevel void 478 0 stevel ill_delete_tail(ill_t *ill) 479 0 stevel { 480 0 stevel mblk_t **mpp; 481 0 stevel ipif_t *ipif; 482 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 483 0 stevel 484 2546 carlsonj for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 485 2546 carlsonj ipif_non_duplicate(ipif); 486 11042 Erik (void) ipif_down_tail(ipif); 487 11042 Erik } 488 11042 Erik 489 11042 Erik ASSERT(ill->ill_ipif_dup_count == 0); 490 0 stevel 491 0 stevel /* 492 0 stevel * If polling capability is enabled (which signifies direct 493 0 stevel * upcall into IP and driver has ill saved as a handle), 494 0 stevel * we need to make sure that unbind has completed before we 495 0 stevel * let the ill disappear and driver no longer has any reference 496 0 stevel * to this ill. 497 0 stevel */ 498 0 stevel mutex_enter(&ill->ill_lock); 499 1555 krgopi while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 500 1555 krgopi cv_wait(&ill->ill_cv, &ill->ill_lock); 501 1555 krgopi mutex_exit(&ill->ill_lock); 502 8275 Eric ASSERT(!(ill->ill_capabilities & 503 8275 Eric (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); 504 0 stevel 505 0 stevel if (ill->ill_net_type != IRE_LOOPBACK) 506 0 stevel qprocsoff(ill->ill_rq); 507 0 stevel 508 0 stevel /* 509 0 stevel * We do an ipsq_flush once again now. New messages could have 510 0 stevel * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 511 0 stevel * could also have landed up if an ioctl thread had looked up 512 0 stevel * the ill before we set the ILL_CONDEMNED flag, but not yet 513 0 stevel * enqueued the ioctl when we did the ipsq_flush last time. 514 0 stevel */ 515 0 stevel ipsq_flush(ill); 516 0 stevel 517 0 stevel /* 518 0 stevel * Free capabilities. 519 0 stevel */ 520 0 stevel if (ill->ill_hcksum_capab != NULL) { 521 0 stevel kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 522 0 stevel ill->ill_hcksum_capab = NULL; 523 0 stevel } 524 0 stevel 525 0 stevel if (ill->ill_zerocopy_capab != NULL) { 526 0 stevel kmem_free(ill->ill_zerocopy_capab, 527 0 stevel sizeof (ill_zerocopy_capab_t)); 528 0 stevel ill->ill_zerocopy_capab = NULL; 529 0 stevel } 530 1184 krgopi 531 3115 yl150051 if (ill->ill_lso_capab != NULL) { 532 3115 yl150051 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 533 3115 yl150051 ill->ill_lso_capab = NULL; 534 3115 yl150051 } 535 3115 yl150051 536 8275 Eric if (ill->ill_dld_capab != NULL) { 537 8275 Eric kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); 538 8275 Eric ill->ill_dld_capab = NULL; 539 8275 Eric } 540 0 stevel 541 0 stevel while (ill->ill_ipif != NULL) 542 0 stevel ipif_free_tail(ill->ill_ipif); 543 0 stevel 544 0 stevel /* 545 0 stevel * We have removed all references to ilm from conn and the ones joined 546 0 stevel * within the kernel. 547 0 stevel * 548 0 stevel * We don't walk conns, mrts and ires because 549 0 stevel * 550 11042 Erik * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts. 551 0 stevel * 2) ill_down ->ill_downi walks all the ires and cleans up 552 0 stevel * ill references. 553 0 stevel */ 554 8485 Peter 555 8485 Peter /* 556 8485 Peter * If this ill is an IPMP meta-interface, blow away the illgrp. This 557 8485 Peter * is safe to do because the illgrp has already been unlinked from the 558 8485 Peter * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. 559 8485 Peter */ 560 8485 Peter if (IS_IPMP(ill)) { 561 8485 Peter ipmp_illgrp_destroy(ill->ill_grp); 562 8485 Peter ill->ill_grp = NULL; 563 8485 Peter } 564 8485 Peter 565 8485 Peter /* 566 8485 Peter * Take us out of the list of ILLs. ill_glist_delete -> phyint_free 567 0 stevel * could free the phyint. No more reference to the phyint after this 568 0 stevel * point. 569 0 stevel */ 570 0 stevel (void) ill_glist_delete(ill); 571 0 stevel 572 3448 dh155122 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 573 0 stevel if (ill->ill_ndd_name != NULL) 574 3448 dh155122 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 575 3448 dh155122 rw_exit(&ipst->ips_ip_g_nd_lock); 576 0 stevel 577 0 stevel if (ill->ill_frag_ptr != NULL) { 578 0 stevel uint_t count; 579 0 stevel 580 0 stevel for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 581 0 stevel mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 582 0 stevel } 583 0 stevel mi_free(ill->ill_frag_ptr); 584 0 stevel ill->ill_frag_ptr = NULL; 585 0 stevel ill->ill_frag_hash_tbl = NULL; 586 0 stevel } 587 3340 meem 588 3340 meem freemsg(ill->ill_nd_lla_mp); 589 0 stevel /* Free all retained control messages. */ 590 0 stevel mpp = &ill->ill_first_mp_to_free; 591 0 stevel do { 592 0 stevel while (mpp[0]) { 593 0 stevel mblk_t *mp; 594 0 stevel mblk_t *mp1; 595 0 stevel 596 0 stevel mp = mpp[0]; 597 0 stevel mpp[0] = mp->b_next; 598 0 stevel for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 599 0 stevel mp1->b_next = NULL; 600 0 stevel mp1->b_prev = NULL; 601 0 stevel } 602 0 stevel freemsg(mp); 603 0 stevel } 604 0 stevel } while (mpp++ != &ill->ill_last_mp_to_free); 605 0 stevel 606 0 stevel ill_free_mib(ill); 607 5023 carlsonj 608 5023 carlsonj #ifdef DEBUG 609 5023 carlsonj ill_trace_cleanup(ill); 610 5023 carlsonj #endif 611 11042 Erik 612 11042 Erik /* The default multicast interface might have changed */ 613 11042 Erik ire_increment_multicast_generation(ipst, ill->ill_isv6); 614 5023 carlsonj 615 3448 dh155122 /* Drop refcnt here */ 616 3448 dh155122 netstack_rele(ill->ill_ipst->ips_netstack); 617 3448 dh155122 ill->ill_ipst = NULL; 618 0 stevel } 619 0 stevel 620 0 stevel static void 621 0 stevel ill_free_mib(ill_t *ill) 622 0 stevel { 623 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 624 3448 dh155122 625 3284 apersson /* 626 3284 apersson * MIB statistics must not be lost, so when an interface 627 3284 apersson * goes away the counter values will be added to the global 628 3284 apersson * MIBs. 629 3284 apersson */ 630 3284 apersson if (ill->ill_ip_mib != NULL) { 631 3448 dh155122 if (ill->ill_isv6) { 632 3448 dh155122 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 633 3448 dh155122 ill->ill_ip_mib); 634 3448 dh155122 } else { 635 3448 dh155122 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 636 3448 dh155122 ill->ill_ip_mib); 637 3448 dh155122 } 638 3284 apersson 639 3284 apersson kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 640 3284 apersson ill->ill_ip_mib = NULL; 641 0 stevel } 642 0 stevel if (ill->ill_icmp6_mib != NULL) { 643 3448 dh155122 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 644 3448 dh155122 ill->ill_icmp6_mib); 645 0 stevel kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 646 0 stevel ill->ill_icmp6_mib = NULL; 647 0 stevel } 648 0 stevel } 649 0 stevel 650 0 stevel /* 651 0 stevel * Concatenate together a physical address and a sap. 652 0 stevel * 653 0 stevel * Sap_lengths are interpreted as follows: 654 0 stevel * sap_length == 0 ==> no sap 655 0 stevel * sap_length > 0 ==> sap is at the head of the dlpi address 656 0 stevel * sap_length < 0 ==> sap is at the tail of the dlpi address 657 0 stevel */ 658 0 stevel static void 659 0 stevel ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 660 0 stevel t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 661 0 stevel { 662 0 stevel uint16_t sap_addr = (uint16_t)sap_src; 663 0 stevel 664 0 stevel if (sap_length == 0) { 665 0 stevel if (phys_src == NULL) 666 0 stevel bzero(dst, phys_length); 667 0 stevel else 668 0 stevel bcopy(phys_src, dst, phys_length); 669 0 stevel } else if (sap_length < 0) { 670 0 stevel if (phys_src == NULL) 671 0 stevel bzero(dst, phys_length); 672 0 stevel else 673 0 stevel bcopy(phys_src, dst, phys_length); 674 0 stevel bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 675 0 stevel } else { 676 0 stevel bcopy(&sap_addr, dst, sizeof (sap_addr)); 677 0 stevel if (phys_src == NULL) 678 0 stevel bzero((char *)dst + sap_length, phys_length); 679 0 stevel else 680 0 stevel bcopy(phys_src, (char *)dst + sap_length, phys_length); 681 0 stevel } 682 0 stevel } 683 0 stevel 684 0 stevel /* 685 0 stevel * Generate a dl_unitdata_req mblk for the device and address given. 686 0 stevel * addr_length is the length of the physical portion of the address. 687 0 stevel * If addr is NULL include an all zero address of the specified length. 688 0 stevel * TRUE? In any case, addr_length is taken to be the entire length of the 689 0 stevel * dlpi address, including the absolute value of sap_length. 690 0 stevel */ 691 0 stevel mblk_t * 692 0 stevel ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 693 0 stevel t_scalar_t sap_length) 694 0 stevel { 695 0 stevel dl_unitdata_req_t *dlur; 696 0 stevel mblk_t *mp; 697 0 stevel t_scalar_t abs_sap_length; /* absolute value */ 698 0 stevel 699 0 stevel abs_sap_length = ABS(sap_length); 700 0 stevel mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 701 4459 kcpoon DL_UNITDATA_REQ); 702 0 stevel if (mp == NULL) 703 0 stevel return (NULL); 704 0 stevel dlur = (dl_unitdata_req_t *)mp->b_rptr; 705 0 stevel /* HACK: accomodate incompatible DLPI drivers */ 706 0 stevel if (addr_length == 8) 707 0 stevel addr_length = 6; 708 0 stevel dlur->dl_dest_addr_length = addr_length + abs_sap_length; 709 0 stevel dlur->dl_dest_addr_offset = sizeof (*dlur); 710 0 stevel dlur->dl_priority.dl_min = 0; 711 0 stevel dlur->dl_priority.dl_max = 0; 712 0 stevel ill_dlur_copy_address(addr, addr_length, sap, sap_length, 713 0 stevel (uchar_t *)&dlur[1]); 714 0 stevel return (mp); 715 0 stevel } 716 0 stevel 717 0 stevel /* 718 0 stevel * Add the pending mp to the list. There can be only 1 pending mp 719 0 stevel * in the list. Any exclusive ioctl that needs to wait for a response 720 0 stevel * from another module or driver needs to use this function to set 721 8485 Peter * the ipx_pending_mp to the ioctl mblk and wait for the response from 722 0 stevel * the other module/driver. This is also used while waiting for the 723 0 stevel * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 724 0 stevel */ 725 0 stevel boolean_t 726 0 stevel ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 727 0 stevel int waitfor) 728 0 stevel { 729 8485 Peter ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; 730 0 stevel 731 0 stevel ASSERT(IAM_WRITER_IPIF(ipif)); 732 0 stevel ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 733 0 stevel ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 734 8485 Peter ASSERT(ipx->ipx_pending_mp == NULL); 735 3340 meem /* 736 3340 meem * The caller may be using a different ipif than the one passed into 737 3340 meem * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 738 3340 meem * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 739 8485 Peter * that `ipx_current_ipif == ipif'. 740 8485 Peter */ 741 8485 Peter ASSERT(ipx->ipx_current_ipif != NULL); 742 3340 meem 743 0 stevel /* 744 10616 Sebastien * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the 745 10616 Sebastien * driver. 746 10616 Sebastien */ 747 10616 Sebastien ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) || 748 10616 Sebastien (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) || 749 10616 Sebastien (DB_TYPE(add_mp) == M_PCPROTO)); 750 3340 meem 751 0 stevel if (connp != NULL) { 752 0 stevel ASSERT(MUTEX_HELD(&connp->conn_lock)); 753 0 stevel /* 754 0 stevel * Return error if the conn has started closing. The conn 755 0 stevel * could have finished cleaning up the pending mp list, 756 0 stevel * If so we should not add another mp to the list negating 757 0 stevel * the cleanup. 758 0 stevel */ 759 0 stevel if (connp->conn_state_flags & CONN_CLOSING) 760 0 stevel return (B_FALSE); 761 0 stevel } 762 8485 Peter mutex_enter(&ipx->ipx_lock); 763 8485 Peter ipx->ipx_pending_ipif = ipif; 764 0 stevel /* 765 0 stevel * Note down the queue in b_queue. This will be returned by 766 0 stevel * ipsq_pending_mp_get. Caller will then use these values to restart 767 0 stevel * the processing 768 0 stevel */ 769 0 stevel add_mp->b_next = NULL; 770 0 stevel add_mp->b_queue = q; 771 8485 Peter ipx->ipx_pending_mp = add_mp; 772 8485 Peter ipx->ipx_waitfor = waitfor; 773 8485 Peter mutex_exit(&ipx->ipx_lock); 774 3340 meem 775 0 stevel if (connp != NULL) 776 0 stevel connp->conn_oper_pending_ill = ipif->ipif_ill; 777 8485 Peter 778 8485 Peter return (B_TRUE); 779 8485 Peter } 780 8485 Peter 781 8485 Peter /* 782 8485 Peter * Retrieve the ipx_pending_mp and return it. There can be only 1 mp 783 0 stevel * queued in the list. 784 0 stevel */ 785 0 stevel mblk_t * 786 0 stevel ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 787 0 stevel { 788 0 stevel mblk_t *curr = NULL; 789 8485 Peter ipxop_t *ipx = ipsq->ipsq_xop; 790 8485 Peter 791 0 stevel *connpp = NULL; 792 8485 Peter mutex_enter(&ipx->ipx_lock); 793 8485 Peter if (ipx->ipx_pending_mp == NULL) { 794 8485 Peter mutex_exit(&ipx->ipx_lock); 795 0 stevel return (NULL); 796 0 stevel } 797 0 stevel 798 0 stevel /* There can be only 1 such excl message */ 799 8485 Peter curr = ipx->ipx_pending_mp; 800 8485 Peter ASSERT(curr->b_next == NULL); 801 8485 Peter ipx->ipx_pending_ipif = NULL; 802 8485 Peter ipx->ipx_pending_mp = NULL; 803 8485 Peter ipx->ipx_waitfor = 0; 804 8485 Peter mutex_exit(&ipx->ipx_lock); 805 0 stevel 806 0 stevel if (CONN_Q(curr->b_queue)) { 807 0 stevel /* 808 0 stevel * This mp did a refhold on the conn, at the start of the ioctl. 809 0 stevel * So we can safely return a pointer to the conn to the caller. 810 0 stevel */ 811 0 stevel *connpp = Q_TO_CONN(curr->b_queue); 812 0 stevel } else { 813 0 stevel *connpp = NULL; 814 0 stevel } 815 0 stevel curr->b_next = NULL; 816 0 stevel curr->b_prev = NULL; 817 0 stevel return (curr); 818 0 stevel } 819 0 stevel 820 0 stevel /* 821 8485 Peter * Cleanup the ioctl mp queued in ipx_pending_mp 822 0 stevel * - Called in the ill_delete path 823 0 stevel * - Called in the M_ERROR or M_HANGUP path on the ill. 824 0 stevel * - Called in the conn close path. 825 0 stevel */ 826 0 stevel boolean_t 827 0 stevel ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 828 0 stevel { 829 0 stevel mblk_t *mp; 830 8485 Peter ipxop_t *ipx; 831 0 stevel queue_t *q; 832 0 stevel ipif_t *ipif; 833 11042 Erik int cmd; 834 0 stevel 835 0 stevel ASSERT(IAM_WRITER_ILL(ill)); 836 8485 Peter ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 837 8485 Peter 838 8485 Peter /* 839 8485 Peter * If connp is null, unconditionally clean up the ipx_pending_mp. 840 0 stevel * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 841 0 stevel * even if it is meant for another ill, since we have to enqueue 842 8485 Peter * a new mp now in ipx_pending_mp to complete the ipif_down. 843 0 stevel * If connp is non-null we are called from the conn close path. 844 0 stevel */ 845 8485 Peter mutex_enter(&ipx->ipx_lock); 846 8485 Peter mp = ipx->ipx_pending_mp; 847 0 stevel if (mp == NULL || (connp != NULL && 848 0 stevel mp->b_queue != CONNP_TO_WQ(connp))) { 849 8485 Peter mutex_exit(&ipx->ipx_lock); 850 8485 Peter return (B_FALSE); 851 8485 Peter } 852 8485 Peter /* Now remove from the ipx_pending_mp */ 853 8485 Peter ipx->ipx_pending_mp = NULL; 854 0 stevel q = mp->b_queue; 855 0 stevel mp->b_next = NULL; 856 0 stevel mp->b_prev = NULL; 857 0 stevel mp->b_queue = NULL; 858 0 stevel 859 8485 Peter ipif = ipx->ipx_pending_ipif; 860 8485 Peter ipx->ipx_pending_ipif = NULL; 861 8485 Peter ipx->ipx_waitfor = 0; 862 8485 Peter ipx->ipx_current_ipif = NULL; 863 11042 Erik cmd = ipx->ipx_current_ioctl; 864 8485 Peter ipx->ipx_current_ioctl = 0; 865 8485 Peter ipx->ipx_current_done = B_TRUE; 866 8485 Peter mutex_exit(&ipx->ipx_lock); 867 0 stevel 868 0 stevel if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 869 11042 Erik DTRACE_PROBE4(ipif__ioctl, 870 11042 Erik char *, "ipsq_pending_mp_cleanup", 871 11042 Erik int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill, 872 11042 Erik ipif_t *, ipif); 873 3340 meem if (connp == NULL) { 874 3340 meem ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 875 3340 meem } else { 876 3340 meem ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 877 3340 meem mutex_enter(&ipif->ipif_ill->ill_lock); 878 3340 meem ipif->ipif_state_flags &= ~IPIF_CHANGING; 879 3340 meem mutex_exit(&ipif->ipif_ill->ill_lock); 880 3340 meem } 881 0 stevel } else { 882 0 stevel /* 883 0 stevel * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 884 741 masputra * be just inet_freemsg. we have to restart it 885 0 stevel * otherwise the thread will be stuck. 886 0 stevel */ 887 741 masputra inet_freemsg(mp); 888 0 stevel } 889 0 stevel return (B_TRUE); 890 0 stevel } 891 0 stevel 892 0 stevel /* 893 0 stevel * Called in the conn close path and ill delete path 894 0 stevel */ 895 0 stevel static void 896 0 stevel ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 897 0 stevel { 898 0 stevel ipsq_t *ipsq; 899 0 stevel mblk_t *prev; 900 0 stevel mblk_t *curr; 901 0 stevel mblk_t *next; 902 0 stevel queue_t *q; 903 0 stevel mblk_t *tmp_list = NULL; 904 0 stevel 905 0 stevel ASSERT(IAM_WRITER_ILL(ill)); 906 0 stevel if (connp != NULL) 907 0 stevel q = CONNP_TO_WQ(connp); 908 0 stevel else 909 0 stevel q = ill->ill_wq; 910 0 stevel 911 0 stevel ipsq = ill->ill_phyint->phyint_ipsq; 912 0 stevel /* 913 0 stevel * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 914 0 stevel * In the case of ioctl from a conn, there can be only 1 mp 915 0 stevel * queued on the ipsq. If an ill is being unplumbed, only messages 916 0 stevel * related to this ill are flushed, like M_ERROR or M_HANGUP message. 917 0 stevel * ioctls meant for this ill form conn's are not flushed. They will 918 0 stevel * be processed during ipsq_exit and will not find the ill and will 919 0 stevel * return error. 920 0 stevel */ 921 0 stevel mutex_enter(&ipsq->ipsq_lock); 922 0 stevel for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 923 0 stevel curr = next) { 924 0 stevel next = curr->b_next; 925 0 stevel if (curr->b_queue == q || curr->b_queue == RD(q)) { 926 0 stevel /* Unlink the mblk from the pending mp list */ 927 0 stevel if (prev != NULL) { 928 0 stevel prev->b_next = curr->b_next; 929 0 stevel } else { 930 0 stevel ASSERT(ipsq->ipsq_xopq_mphead == curr); 931 0 stevel ipsq->ipsq_xopq_mphead = curr->b_next; 932 0 stevel } 933 0 stevel if (ipsq->ipsq_xopq_mptail == curr) 934 0 stevel ipsq->ipsq_xopq_mptail = prev; 935 0 stevel /* 936 0 stevel * Create a temporary list and release the ipsq lock 937 0 stevel * New elements are added to the head of the tmp_list 938 0 stevel */ 939 0 stevel curr->b_next = tmp_list; 940 0 stevel tmp_list = curr; 941 0 stevel } else { 942 0 stevel prev = curr; 943 0 stevel } 944 0 stevel } 945 0 stevel mutex_exit(&ipsq->ipsq_lock); 946 0 stevel 947 0 stevel while (tmp_list != NULL) { 948 0 stevel curr = tmp_list; 949 0 stevel tmp_list = curr->b_next; 950 0 stevel curr->b_next = NULL; 951 0 stevel curr->b_prev = NULL; 952 0 stevel curr->b_queue = NULL; 953 0 stevel if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 954 11042 Erik DTRACE_PROBE4(ipif__ioctl, 955 11042 Erik char *, "ipsq_xopq_mp_cleanup", 956 11042 Erik int, 0, ill_t *, NULL, ipif_t *, NULL); 957 0 stevel ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 958 3340 meem CONN_CLOSE : NO_COPYOUT, NULL); 959 0 stevel } else { 960 0 stevel /* 961 0 stevel * IP-MT XXX In the case of TLI/XTI bind / optmgmt 962 741 masputra * this can't be just inet_freemsg. we have to 963 0 stevel * restart it otherwise the thread will be stuck. 964 0 stevel */ 965 741 masputra inet_freemsg(curr); 966 0 stevel } 967 0 stevel } 968 0 stevel } 969 0 stevel 970 0 stevel /* 971 0 stevel * This conn has started closing. Cleanup any pending ioctl from this conn. 972 0 stevel * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 973 0 stevel */ 974 0 stevel void 975 0 stevel conn_ioctl_cleanup(conn_t *connp) 976 0 stevel { 977 0 stevel ipsq_t *ipsq; 978 0 stevel ill_t *ill; 979 0 stevel boolean_t refheld; 980 0 stevel 981 0 stevel /* 982 0 stevel * Is any exclusive ioctl pending ? If so clean it up. If the 983 0 stevel * ioctl has not yet started, the mp is pending in the list headed by 984 0 stevel * ipsq_xopq_head. If the ioctl has started the mp could be present in 985 8485 Peter * ipx_pending_mp. If the ioctl timed out in the streamhead but 986 0 stevel * is currently executing now the mp is not queued anywhere but 987 0 stevel * conn_oper_pending_ill is null. The conn close will wait 988 0 stevel * till the conn_ref drops to zero. 989 0 stevel */ 990 0 stevel mutex_enter(&connp->conn_lock); 991 0 stevel ill = connp->conn_oper_pending_ill; 992 0 stevel if (ill == NULL) { 993 0 stevel mutex_exit(&connp->conn_lock); 994 0 stevel return; 995 0 stevel } 996 0 stevel 997 0 stevel /* 998 0 stevel * We may not be able to refhold the ill if the ill/ipif 999 0 stevel * is changing. But we need to make sure that the ill will 1000 0 stevel * not vanish. So we just bump up the ill_waiter count. 1001 0 stevel */ 1002 0 stevel refheld = ill_waiter_inc(ill); 1003 0 stevel mutex_exit(&connp->conn_lock); 1004 0 stevel if (refheld) { 1005 8275 Eric if (ipsq_enter(ill, B_TRUE, NEW_OP)) { 1006 0 stevel ill_waiter_dcr(ill); 1007 0 stevel /* 1008 0 stevel * Check whether this ioctl has started and is 1009 8485 Peter * pending. If it is not found there then check 1010 8485 Peter * whether this ioctl has not even started and is in 1011 8485 Peter * the ipsq_xopq list. 1012 0 stevel */ 1013 0 stevel if (!ipsq_pending_mp_cleanup(ill, connp)) 1014 0 stevel ipsq_xopq_mp_cleanup(ill, connp); 1015 0 stevel ipsq = ill->ill_phyint->phyint_ipsq; 1016 7098 meem ipsq_exit(ipsq); 1017 0 stevel return; 1018 0 stevel } 1019 0 stevel } 1020 0 stevel 1021 0 stevel /* 1022 0 stevel * The ill is also closing and we could not bump up the 1023 0 stevel * ill_waiter_count or we could not enter the ipsq. Leave 1024 0 stevel * the cleanup to ill_delete 1025 0 stevel */ 1026 0 stevel mutex_enter(&connp->conn_lock); 1027 0 stevel while (connp->conn_oper_pending_ill != NULL) 1028 0 stevel cv_wait(&connp->conn_refcv, &connp->conn_lock); 1029 0 stevel mutex_exit(&connp->conn_lock); 1030 0 stevel if (refheld) 1031 0 stevel ill_waiter_dcr(ill); 1032 0 stevel } 1033 0 stevel 1034 0 stevel /* 1035 0 stevel * ipcl_walk function for cleaning up conn_*_ill fields. 1036 11042 Erik * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and 1037 11042 Erik * conn_bound_if in place. We prefer dropping 1038 11042 Erik * packets instead of sending them out the wrong interface, or accepting 1039 11042 Erik * packets from the wrong ifindex. 1040 0 stevel */ 1041 0 stevel static void 1042 0 stevel conn_cleanup_ill(conn_t *connp, caddr_t arg) 1043 0 stevel { 1044 0 stevel ill_t *ill = (ill_t *)arg; 1045 0 stevel 1046 0 stevel mutex_enter(&connp->conn_lock); 1047 5381 meem if (connp->conn_dhcpinit_ill == ill) { 1048 5381 meem connp->conn_dhcpinit_ill = NULL; 1049 5381 meem ASSERT(ill->ill_dhcpinit != 0); 1050 5381 meem atomic_dec_32(&ill->ill_dhcpinit); 1051 11042 Erik ill_set_inputfn(ill); 1052 0 stevel } 1053 0 stevel mutex_exit(&connp->conn_lock); 1054 0 stevel } 1055 0 stevel 1056 11042 Erik static int 1057 9073 Cathy ill_down_ipifs_tail(ill_t *ill) 1058 9073 Cathy { 1059 9073 Cathy ipif_t *ipif; 1060 11042 Erik int err; 1061 9073 Cathy 1062 9073 Cathy ASSERT(IAM_WRITER_ILL(ill)); 1063 2546 carlsonj for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1064 2546 carlsonj ipif_non_duplicate(ipif); 1065 11042 Erik /* 1066 11042 Erik * ipif_down_tail will call arp_ll_down on the last ipif 1067 11042 Erik * and typically return EINPROGRESS when the DL_UNBIND is sent. 1068 11042 Erik */ 1069 11042 Erik if ((err = ipif_down_tail(ipif)) != 0) 1070 11042 Erik return (err); 1071 11042 Erik } 1072 11042 Erik return (0); 1073 9073 Cathy } 1074 9073 Cathy 1075 9073 Cathy /* ARGSUSED */ 1076 9073 Cathy void 1077 9073 Cathy ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1078 9073 Cathy { 1079 9073 Cathy ASSERT(IAM_WRITER_IPSQ(ipsq)); 1080 11042 Erik (void) ill_down_ipifs_tail(q->q_ptr); 1081 0 stevel freemsg(mp); 1082 3340 meem ipsq_current_finish(ipsq); 1083 0 stevel } 1084 0 stevel 1085 0 stevel /* 1086 0 stevel * ill_down_start is called when we want to down this ill and bring it up again 1087 0 stevel * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1088 0 stevel * all interfaces, but don't tear down any plumbing. 1089 0 stevel */ 1090 0 stevel boolean_t 1091 0 stevel ill_down_start(queue_t *q, mblk_t *mp) 1092 0 stevel { 1093 3340 meem ill_t *ill = q->q_ptr; 1094 3340 meem ipif_t *ipif; 1095 0 stevel 1096 0 stevel ASSERT(IAM_WRITER_ILL(ill)); 1097 11042 Erik mutex_enter(&ill->ill_lock); 1098 11042 Erik ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 1099 11042 Erik /* no more nce addition allowed */ 1100 11042 Erik mutex_exit(&ill->ill_lock); 1101 0 stevel 1102 0 stevel for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1103 0 stevel (void) ipif_down(ipif, NULL, NULL); 1104 0 stevel 1105 0 stevel ill_down(ill); 1106 11042 Erik 1107 11042 Erik /* 1108 11042 Erik * Walk all CONNs that can have a reference on an ire or nce for this 1109 11042 Erik * ill (we actually walk all that now have stale references). 1110 11042 Erik */ 1111 11042 Erik ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); 1112 11042 Erik 1113 11042 Erik /* With IPv6 we have dce_ifindex. Cleanup for neatness */ 1114 11042 Erik if (ill->ill_isv6) 1115 11042 Erik dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); 1116 11042 Erik 1117 0 stevel 1118 0 stevel (void) ipsq_pending_mp_cleanup(ill, NULL); 1119 3340 meem 1120 3340 meem ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); 1121 3340 meem 1122 3340 meem /* 1123 3340 meem * Atomically test and add the pending mp if references are active. 1124 3340 meem */ 1125 3340 meem mutex_enter(&ill->ill_lock); 1126 0 stevel if (!ill_is_quiescent(ill)) { 1127 3340 meem /* call cannot fail since `conn_t *' argument is NULL */ 1128 0 stevel (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1129 0 stevel mp, ILL_DOWN); 1130 0 stevel mutex_exit(&ill->ill_lock); 1131 0 stevel return (B_FALSE); 1132 0 stevel } 1133 0 stevel mutex_exit(&ill->ill_lock); 1134 0 stevel return (B_TRUE); 1135 0 stevel } 1136 0 stevel 1137 0 stevel static void 1138 0 stevel ill_down(ill_t *ill) 1139 0 stevel { 1140 11042 Erik mblk_t *mp; 1141 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 1142 11042 Erik 1143 11042 Erik /* 1144 11042 Erik * Blow off any IREs dependent on this ILL. 1145 11042 Erik * The caller needs to handle conn_ixa_cleanup 1146 11042 Erik */ 1147 11042 Erik ill_delete_ires(ill); 1148 11042 Erik 1149 11042 Erik ire_walk_ill(0, 0, ill_downi, ill, ill); 1150 3448 dh155122 1151 0 stevel /* Remove any conn_*_ill depending on this ill */ 1152 3448 dh155122 ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); 1153 11042 Erik 1154 11042 Erik /* 1155 11042 Erik * Free state for additional IREs. 1156 11042 Erik */ 1157 11042 Erik mutex_enter(&ill->ill_saved_ire_lock); 1158 11042 Erik mp = ill->ill_saved_ire_mp; 1159 11042 Erik ill->ill_saved_ire_mp = NULL; 1160 11042 Erik ill->ill_saved_ire_cnt = 0; 1161 11042 Erik mutex_exit(&ill->ill_saved_ire_lock); 1162 11042 Erik freemsg(mp); 1163 11042 Erik } 1164 11042 Erik 1165 11042 Erik /* 1166 11042 Erik * ire_walk routine used to delete every IRE that depends on 1167 11042 Erik * 'ill'. (Always called as writer.) 1168 11042 Erik * 1169 11042 Erik * Note: since the routes added by the kernel are deleted separately, 1170 11042 Erik * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. 1171 11042 Erik * 1172 11042 Erik * We also remove references on ire_nce_cache entries that refer to the ill. 1173 11042 Erik */ 1174 11042 Erik void 1175 0 stevel ill_downi(ire_t *ire, char *ill_arg) 1176 0 stevel { 1177 0 stevel ill_t *ill = (ill_t *)ill_arg; 1178 11042 Erik nce_t *nce; 1179 11042 Erik 1180 11042 Erik mutex_enter(&ire->ire_lock); 1181 11042 Erik nce = ire->ire_nce_cache; 1182 11042 Erik if (nce != NULL && nce->nce_ill == ill) 1183 11042 Erik ire->ire_nce_cache = NULL; 1184 11042 Erik else 1185 11042 Erik nce = NULL; 1186 11042 Erik mutex_exit(&ire->ire_lock); 1187 11042 Erik if (nce != NULL) 1188 11042 Erik nce_refrele(nce); 1189 11042 Erik if (ire->ire_ill == ill) 1190 0 stevel ire_delete(ire); 1191 11042 Erik } 1192 11042 Erik 1193 11042 Erik /* Remove IRE_IF_CLONE on this ill */ 1194 11042 Erik void 1195 11042 Erik ill_downi_if_clone(ire_t *ire, char *ill_arg) 1196 11042 Erik { 1197 11042 Erik ill_t *ill = (ill_t *)ill_arg; 1198 11042 Erik 1199 11042 Erik ASSERT(ire->ire_type & IRE_IF_CLONE); 1200 11042 Erik if (ire->ire_ill == ill) 1201 11042 Erik ire_delete(ire); 1202 0 stevel } 1203 0 stevel 1204 0 stevel /* Consume an M_IOCACK of the fastpath probe. */ 1205 0 stevel void 1206 0 stevel ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1207 0 stevel { 1208 0 stevel mblk_t *mp1 = mp; 1209 0 stevel 1210 0 stevel /* 1211 0 stevel * If this was the first attempt turn on the fastpath probing. 1212 0 stevel */ 1213 0 stevel mutex_enter(&ill->ill_lock); 1214 2893 ja97890 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) 1215 2893 ja97890 ill->ill_dlpi_fastpath_state = IDS_OK; 1216 0 stevel mutex_exit(&ill->ill_lock); 1217 0 stevel 1218 0 stevel /* Free the M_IOCACK mblk, hold on to the data */ 1219 0 stevel mp = mp->b_cont; 1220 0 stevel freeb(mp1); 1221 0 stevel if (mp == NULL) 1222 0 stevel return; 1223 11042 Erik if (mp->b_cont != NULL) 1224 11042 Erik nce_fastpath_update(ill, mp); 1225 11042 Erik else 1226 0 stevel ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1227 11042 Erik freemsg(mp); 1228 0 stevel } 1229 0 stevel 1230 0 stevel /* 1231 0 stevel * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1232 0 stevel * The data portion of the request is a dl_unitdata_req_t template for 1233 0 stevel * what we would send downstream in the absence of a fastpath confirmation. 1234 0 stevel */ 1235 0 stevel int 1236 0 stevel ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1237 0 stevel { 1238 0 stevel struct iocblk *ioc; 1239 0 stevel mblk_t *mp; 1240 0 stevel 1241 0 stevel if (dlur_mp == NULL) 1242 0 stevel return (EINVAL); 1243 0 stevel 1244 0 stevel mutex_enter(&ill->ill_lock); 1245 0 stevel switch (ill->ill_dlpi_fastpath_state) { 1246 2893 ja97890 case IDS_FAILED: 1247 0 stevel /* 1248 0 stevel * Driver NAKed the first fastpath ioctl - assume it doesn't 1249 0 stevel * support it. 1250 0 stevel */ 1251 0 stevel mutex_exit(&ill->ill_lock); 1252 0 stevel return (ENOTSUP); 1253 2893 ja97890 case IDS_UNKNOWN: 1254 0 stevel /* This is the first probe */ 1255 2893 ja97890 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; 1256 0 stevel break; 1257 0 stevel default: 1258 0 stevel break; 1259 0 stevel } 1260 0 stevel mutex_exit(&ill->ill_lock); 1261 0 stevel 1262 0 stevel if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1263 0 stevel return (EAGAIN); 1264 0 stevel 1265 0 stevel mp->b_cont = copyb(dlur_mp); 1266 0 stevel if (mp->b_cont == NULL) { 1267 0 stevel freeb(mp); 1268 0 stevel return (EAGAIN); 1269 0 stevel } 1270 0 stevel 1271 0 stevel ioc = (struct iocblk *)mp->b_rptr; 1272 0 stevel ioc->ioc_count = msgdsize(mp->b_cont); 1273 0 stevel 1274 11042 Erik DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", 1275 11042 Erik char *, "DL_IOC_HDR_INFO", ill_t *, ill); 1276 0 stevel putnext(ill->ill_wq, mp); 1277 0 stevel return (0); 1278 0 stevel } 1279 0 stevel 1280 0 stevel void 1281 0 stevel ill_capability_probe(ill_t *ill) 1282 0 stevel { 1283 8275 Eric mblk_t *mp; 1284 8275 Eric 1285 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 1286 8275 Eric 1287 8275 Eric if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && 1288 8275 Eric ill->ill_dlpi_capab_state != IDCS_FAILED) 1289 8275 Eric return; 1290 8275 Eric 1291 8275 Eric /* 1292 8275 Eric * We are starting a new cycle of capability negotiation. 1293 8275 Eric * Free up the capab reset messages of any previous incarnation. 1294 8275 Eric * We will do a fresh allocation when we get the response to our probe 1295 8275 Eric */ 1296 8275 Eric if (ill->ill_capab_reset_mp != NULL) { 1297 8275 Eric freemsg(ill->ill_capab_reset_mp); 1298 8275 Eric ill->ill_capab_reset_mp = NULL; 1299 8275 Eric } 1300 8275 Eric 1301 0 stevel ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1302 8275 Eric 1303 8275 Eric mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1304 8275 Eric if (mp == NULL) 1305 8275 Eric return; 1306 8275 Eric 1307 8275 Eric ill_capability_send(ill, mp); 1308 8275 Eric ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1309 8275 Eric } 1310 8275 Eric 1311 8275 Eric void 1312 8275 Eric ill_capability_reset(ill_t *ill, boolean_t reneg) 1313 8275 Eric { 1314 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 1315 8275 Eric 1316 8275 Eric if (ill->ill_dlpi_capab_state != IDCS_OK) 1317 8275 Eric return; 1318 8275 Eric 1319 8275 Eric ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; 1320 8275 Eric 1321 8275 Eric ill_capability_send(ill, ill->ill_capab_reset_mp); 1322 8275 Eric ill->ill_capab_reset_mp = NULL; 1323 8275 Eric /* 1324 8275 Eric * We turn off all capabilities except those pertaining to 1325 8275 Eric * direct function call capabilities viz. ILL_CAPAB_DLD* 1326 8275 Eric * which will be turned off by the corresponding reset functions. 1327 8275 Eric */ 1328 11042 Erik ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); 1329 8275 Eric } 1330 8275 Eric 1331 8275 Eric static void 1332 8275 Eric ill_capability_reset_alloc(ill_t *ill) 1333 0 stevel { 1334 0 stevel mblk_t *mp; 1335 8275 Eric size_t size = 0; 1336 8275 Eric int err; 1337 8275 Eric dl_capability_req_t *capb; 1338 8275 Eric 1339 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 1340 8275 Eric ASSERT(ill->ill_capab_reset_mp == NULL); 1341 8275 Eric 1342 8275 Eric if (ILL_HCKSUM_CAPABLE(ill)) { 1343 8275 Eric size += sizeof (dl_capability_sub_t) + 1344 8275 Eric sizeof (dl_capab_hcksum_t); 1345 8275 Eric } 1346 8275 Eric 1347 8275 Eric if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1348 8275 Eric size += sizeof (dl_capability_sub_t) + 1349 8275 Eric sizeof (dl_capab_zerocopy_t); 1350 8275 Eric } 1351 8275 Eric 1352 8275 Eric if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1353 8275 Eric size += sizeof (dl_capability_sub_t) + 1354 8275 Eric sizeof (dl_capab_dld_t); 1355 8275 Eric } 1356 8275 Eric 1357 8275 Eric mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1358 8275 Eric STR_NOSIG, &err); 1359 8275 Eric 1360 8275 Eric mp->b_datap->db_type = M_PROTO; 1361 8275 Eric bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1362 8275 Eric 1363 8275 Eric capb = (dl_capability_req_t *)mp->b_rptr; 1364 8275 Eric capb->dl_primitive = DL_CAPABILITY_REQ; 1365 8275 Eric capb->dl_sub_offset = sizeof (dl_capability_req_t); 1366 8275 Eric capb->dl_sub_length = size; 1367 8275 Eric 1368 8275 Eric mp->b_wptr += sizeof (dl_capability_req_t); 1369 8275 Eric 1370 8275 Eric /* 1371 8275 Eric * Each handler fills in the corresponding dl_capability_sub_t 1372 8275 Eric * inside the mblk, 1373 8275 Eric */ 1374 8275 Eric ill_capability_hcksum_reset_fill(ill, mp); 1375 8275 Eric ill_capability_zerocopy_reset_fill(ill, mp); 1376 8275 Eric ill_capability_dld_reset_fill(ill, mp); 1377 8275 Eric 1378 8275 Eric ill->ill_capab_reset_mp = mp; 1379 0 stevel } 1380 0 stevel 1381 0 stevel static void 1382 0 stevel ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1383 0 stevel { 1384 0 stevel dl_capab_id_t *id_ic; 1385 0 stevel uint_t sub_dl_cap = outers->dl_cap; 1386 0 stevel dl_capability_sub_t *inners; 1387 0 stevel uint8_t *capend; 1388 0 stevel 1389 0 stevel ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1390 0 stevel 1391 0 stevel /* 1392 0 stevel * Note: range checks here are not absolutely sufficient to 1393 0 stevel * make us robust against malformed messages sent by drivers; 1394 0 stevel * this is in keeping with the rest of IP's dlpi handling. 1395 0 stevel * (Remember, it's coming from something else in the kernel 1396 0 stevel * address space) 1397 0 stevel */ 1398 0 stevel 1399 0 stevel capend = (uint8_t *)(outers + 1) + outers->dl_length; 1400 0 stevel if (capend > mp->b_wptr) { 1401 0 stevel cmn_err(CE_WARN, "ill_capability_id_ack: " 1402 0 stevel "malformed sub-capability too long for mblk"); 1403 0 stevel return; 1404 0 stevel } 1405 0 stevel 1406 0 stevel id_ic = (dl_capab_id_t *)(outers + 1); 1407 0 stevel 1408 0 stevel if (outers->dl_length < sizeof (*id_ic) || 1409 0 stevel (inners = &id_ic->id_subcap, 1410 0 stevel inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1411 0 stevel cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1412 0 stevel "encapsulated capab type %d too long for mblk", 1413 0 stevel inners->dl_cap); 1414 0 stevel return; 1415 0 stevel } 1416 0 stevel 1417 0 stevel if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1418 0 stevel ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1419 0 stevel "isn't as expected; pass-thru module(s) detected, " 1420 0 stevel "discarding capability\n", inners->dl_cap)); 1421 0 stevel return; 1422 0 stevel } 1423 0 stevel 1424 0 stevel /* Process the encapsulated sub-capability */ 1425 11042 Erik ill_capability_dispatch(ill, mp, inners); 1426 8275 Eric } 1427 8275 Eric 1428 8275 Eric static void 1429 8275 Eric ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 1430 8275 Eric { 1431 8275 Eric dl_capability_sub_t *dl_subcap; 1432 8275 Eric 1433 8275 Eric if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 1434 8275 Eric return; 1435 8275 Eric 1436 8275 Eric /* 1437 8275 Eric * The dl_capab_dld_t that follows the dl_capability_sub_t is not 1438 8275 Eric * initialized below since it is not used by DLD. 1439 8275 Eric */ 1440 8275 Eric dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1441 8275 Eric dl_subcap->dl_cap = DL_CAPAB_DLD; 1442 8275 Eric dl_subcap->dl_length = sizeof (dl_capab_dld_t); 1443 8275 Eric 1444 8275 Eric mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 1445 0 stevel } 1446 0 stevel 1447 11042 Erik static void 1448 11042 Erik ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) 1449 11042 Erik { 1450 11076 Cathy /* 1451 11076 Cathy * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK 1452 11076 Cathy * is only to get the VRRP capability. 1453 11076 Cathy */ 1454 11076 Cathy if (ill->ill_ipif_up_count == 0) { 1455 11076 Cathy if (subp->dl_cap == DL_CAPAB_VRRP) 1456 11076 Cathy ill_capability_vrrp_ack(ill, mp, subp); 1457 11076 Cathy return; 1458 11076 Cathy } 1459 11076 Cathy 1460 0 stevel switch (subp->dl_cap) { 1461 0 stevel case DL_CAPAB_HCKSUM: 1462 0 stevel ill_capability_hcksum_ack(ill, mp, subp); 1463 0 stevel break; 1464 0 stevel case DL_CAPAB_ZEROCOPY: 1465 0 stevel ill_capability_zerocopy_ack(ill, mp, subp); 1466 0 stevel break; 1467 8275 Eric case DL_CAPAB_DLD: 1468 8275 Eric ill_capability_dld_ack(ill, mp, subp); 1469 3115 yl150051 break; 1470 11076 Cathy case DL_CAPAB_VRRP: 1471 11076 Cathy break; 1472 0 stevel default: 1473 0 stevel ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 1474 0 stevel subp->dl_cap)); 1475 11076 Cathy } 1476 11076 Cathy } 1477 11076 Cathy 1478 11076 Cathy /* 1479 11076 Cathy * Process the vrrp capability received from a DLS Provider. isub must point 1480 11076 Cathy * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message. 1481 11076 Cathy */ 1482 11076 Cathy static void 1483 11076 Cathy ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1484 11076 Cathy { 1485 11076 Cathy dl_capab_vrrp_t *vrrp; 1486 11076 Cathy uint_t sub_dl_cap = isub->dl_cap; 1487 11076 Cathy uint8_t *capend; 1488 11076 Cathy 1489 11076 Cathy ASSERT(IAM_WRITER_ILL(ill)); 1490 11076 Cathy ASSERT(sub_dl_cap == DL_CAPAB_VRRP); 1491 11076 Cathy 1492 11076 Cathy /* 1493 11076 Cathy * Note: range checks here are not absolutely sufficient to 1494 11076 Cathy * make us robust against malformed messages sent by drivers; 1495 11076 Cathy * this is in keeping with the rest of IP's dlpi handling. 1496 11076 Cathy * (Remember, it's coming from something else in the kernel 1497 11076 Cathy * address space) 1498 11076 Cathy */ 1499 11076 Cathy capend = (uint8_t *)(isub + 1) + isub->dl_length; 1500 11076 Cathy if (capend > mp->b_wptr) { 1501 11076 Cathy cmn_err(CE_WARN, "ill_capability_vrrp_ack: " 1502 11076 Cathy "malformed sub-capability too long for mblk"); 1503 11076 Cathy return; 1504 11076 Cathy } 1505 11076 Cathy vrrp = (dl_capab_vrrp_t *)(isub + 1); 1506 11076 Cathy 1507 11076 Cathy /* 1508 11076 Cathy * Compare the IP address family and set ILLF_VRRP for the right ill. 1509 11076 Cathy */ 1510 11076 Cathy if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) || 1511 11076 Cathy (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) { 1512 11076 Cathy ill->ill_flags |= ILLF_VRRP; 1513 1184 krgopi } 1514 1184 krgopi } 1515 0 stevel 1516 0 stevel /* 1517 0 stevel * Process a hardware checksum offload capability negotiation ack received 1518 0 stevel * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 1519 0 stevel * of a DL_CAPABILITY_ACK message. 1520 0 stevel */ 1521 0 stevel static void 1522 0 stevel ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1523 0 stevel { 1524 0 stevel dl_capability_req_t *ocap; 1525 0 stevel dl_capab_hcksum_t *ihck, *ohck; 1526 0 stevel ill_hcksum_capab_t **ill_hcksum; 1527 0 stevel mblk_t *nmp = NULL; 1528 0 stevel uint_t sub_dl_cap = isub->dl_cap; 1529 0 stevel uint8_t *capend; 1530 0 stevel 1531 0 stevel ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 1532 0 stevel 1533 0 stevel ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 1534 0 stevel 1535 0 stevel /* 1536 0 stevel * Note: range checks here are not absolutely sufficient to 1537 0 stevel * make us robust against malformed messages sent by drivers; 1538 0 stevel * this is in keeping with the rest of IP's dlpi handling. 1539 0 stevel * (Remember, it's coming from something else in the kernel 1540 0 stevel * address space) 1541 0 stevel */ 1542 0 stevel capend = (uint8_t *)(isub + 1) + isub->dl_length; 1543 0 stevel if (capend > mp->b_wptr) { 1544 0 stevel cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1545 0 stevel "malformed sub-capability too long for mblk"); 1546 0 stevel return; 1547 0 stevel } 1548 0 stevel 1549 0 stevel /* 1550 0 stevel * There are two types of acks we process here: 1551 0 stevel * 1. acks in reply to a (first form) generic capability req 1552 0 stevel * (no ENABLE flag set) 1553 0 stevel * 2. acks in reply to a ENABLE capability req. 1554 0 stevel * (ENABLE flag set) 1555 0 stevel */ 1556 0 stevel ihck = (dl_capab_hcksum_t *)(isub + 1); 1557 0 stevel 1558 0 stevel if (ihck->hcksum_version != HCKSUM_VERSION_1) { 1559 0 stevel cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 1560 0 stevel "unsupported hardware checksum " 1561 0 stevel "sub-capability (version %d, expected %d)", 1562 0 stevel ihck->hcksum_version, HCKSUM_VERSION_1); 1563 0 stevel return; 1564 0 stevel } 1565 0 stevel 1566 0 stevel if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 1567 0 stevel ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 1568 0 stevel "checksum capability isn't as expected; pass-thru " 1569 0 stevel "module(s) detected, discarding capability\n")); 1570 0 stevel return; 1571 0 stevel } 1572 0 stevel 1573 741 masputra #define CURR_HCKSUM_CAPAB \ 1574 741 masputra (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 1575 741 masputra HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 1576 0 stevel 1577 0 stevel if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 1578 0 stevel (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 1579 0 stevel /* do ENABLE processing */ 1580 0 stevel if (*ill_hcksum == NULL) { 1581 0 stevel *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 1582 0 stevel KM_NOSLEEP); 1583 0 stevel 1584 0 stevel if (*ill_hcksum == NULL) { 1585 0 stevel cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1586 0 stevel "could not enable hcksum version %d " 1587 0 stevel "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 1588 0 stevel ill->ill_name); 1589 0 stevel return; 1590 0 stevel } 1591 0 stevel } 1592 0 stevel 1593 0 stevel (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 1594 0 stevel (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 1595 0 stevel ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 1596 0 stevel ip1dbg(("ill_capability_hcksum_ack: interface %s " 1597 0 stevel "has enabled hardware checksumming\n ", 1598 0 stevel ill->ill_name)); 1599 0 stevel } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 1600 0 stevel /* 1601 0 stevel * Enabling hardware checksum offload 1602 0 stevel * Currently IP supports {TCP,UDP}/IPv4 1603 0 stevel * partial and full cksum offload and 1604 0 stevel * IPv4 header checksum offload. 1605 0 stevel * Allocate new mblk which will 1606 0 stevel * contain a new capability request 1607 0 stevel * to enable hardware checksum offload. 1608 0 stevel */ 1609 0 stevel uint_t size; 1610 0 stevel uchar_t *rptr; 1611 0 stevel 1612 0 stevel size = sizeof (dl_capability_req_t) + 1613 0 stevel sizeof (dl_capability_sub_t) + isub->dl_length; 1614 0 stevel 1615 0 stevel if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1616 0 stevel cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 1617 0 stevel "could not enable hardware cksum for %s (ENOMEM)\n", 1618 0 stevel ill->ill_name); 1619 0 stevel return; 1620 0 stevel } 1621 0 stevel 1622 0 stevel rptr = nmp->b_rptr; 1623 0 stevel /* initialize dl_capability_req_t */ 1624 0 stevel ocap = (dl_capability_req_t *)nmp->b_rptr; 1625 0 stevel ocap->dl_sub_offset = 1626 0 stevel sizeof (dl_capability_req_t); 1627 0 stevel ocap->dl_sub_length = 1628 0 stevel sizeof (dl_capability_sub_t) + 1629 0 stevel isub->dl_length; 1630 0 stevel nmp->b_rptr += sizeof (dl_capability_req_t); 1631 0 stevel 1632 0 stevel /* initialize dl_capability_sub_t */ 1633 0 stevel bcopy(isub, nmp->b_rptr, sizeof (*isub)); 1634 0 stevel nmp->b_rptr += sizeof (*isub); 1635 0 stevel 1636 0 stevel /* initialize dl_capab_hcksum_t */ 1637 0 stevel ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 1638 0 stevel bcopy(ihck, ohck, sizeof (*ihck)); 1639 0 stevel 1640 0 stevel nmp->b_rptr = rptr; 1641 0 stevel ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 1642 0 stevel 1643 0 stevel /* Set ENABLE flag */ 1644 0 stevel ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 1645 0 stevel ohck->hcksum_txflags |= HCKSUM_ENABLE; 1646 0 stevel 1647 0 stevel /* 1648 0 stevel * nmp points to a DL_CAPABILITY_REQ message to enable 1649 0 stevel * hardware checksum acceleration. 1650 0 stevel */ 1651 8275 Eric ill_capability_send(ill, nmp); 1652 741 masputra } else { 1653 0 stevel ip1dbg(("ill_capability_hcksum_ack: interface %s has " 1654 0 stevel "advertised %x hardware checksum capability flags\n", 1655 0 stevel ill->ill_name, ihck->hcksum_txflags)); 1656 741 masputra } 1657 0 stevel } 1658 0 stevel 1659 0 stevel static void 1660 8275 Eric ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 1661 8275 Eric { 1662 0 stevel dl_capab_hcksum_t *hck_subcap; 1663 0 stevel dl_capability_sub_t *dl_subcap; 1664 0 stevel 1665 741 masputra if (!ILL_HCKSUM_CAPABLE(ill)) 1666 0 stevel return; 1667 0 stevel 1668 0 stevel ASSERT(ill->ill_hcksum_capab != NULL); 1669 8275 Eric 1670 8275 Eric dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1671 0 stevel dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 1672 0 stevel dl_subcap->dl_length = sizeof (*hck_subcap); 1673 0 stevel 1674 0 stevel hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 1675 0 stevel hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 1676 0 stevel hck_subcap->hcksum_txflags = 0; 1677 0 stevel 1678 8275 Eric mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 1679 0 stevel } 1680 0 stevel 1681 0 stevel static void 1682 0 stevel ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1683 0 stevel { 1684 0 stevel mblk_t *nmp = NULL; 1685 0 stevel dl_capability_req_t *oc; 1686 0 stevel dl_capab_zerocopy_t *zc_ic, *zc_oc; 1687 0 stevel ill_zerocopy_capab_t **ill_zerocopy_capab; 1688 0 stevel uint_t sub_dl_cap = isub->dl_cap; 1689 0 stevel uint8_t *capend; 1690 0 stevel 1691 0 stevel ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 1692 0 stevel 1693 0 stevel ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 1694 0 stevel 1695 0 stevel /* 1696 0 stevel * Note: range checks here are not absolutely sufficient to 1697 0 stevel * make us robust against malformed messages sent by drivers; 1698 0 stevel * this is in keeping with the rest of IP's dlpi handling. 1699 0 stevel * (Remember, it's coming from something else in the kernel 1700 0 stevel * address space) 1701 0 stevel */ 1702 0 stevel capend = (uint8_t *)(isub + 1) + isub->dl_length; 1703 0 stevel if (capend > mp->b_wptr) { 1704 0 stevel cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1705 0 stevel "malformed sub-capability too long for mblk"); 1706 0 stevel return; 1707 0 stevel } 1708 0 stevel 1709 0 stevel zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 1710 0 stevel if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 1711 0 stevel cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 1712 0 stevel "unsupported ZEROCOPY sub-capability (version %d, " 1713 0 stevel "expected %d)", zc_ic->zerocopy_version, 1714 0 stevel ZEROCOPY_VERSION_1); 1715 0 stevel return; 1716 0 stevel } 1717 0 stevel 1718 0 stevel if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 1719 0 stevel ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 1720 0 stevel "capability isn't as expected; pass-thru module(s) " 1721 0 stevel "detected, discarding capability\n")); 1722 0 stevel return; 1723 0 stevel } 1724 0 stevel 1725 0 stevel if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 1726 0 stevel if (*ill_zerocopy_capab == NULL) { 1727 0 stevel *ill_zerocopy_capab = 1728 0 stevel kmem_zalloc(sizeof (ill_zerocopy_capab_t), 1729 0 stevel KM_NOSLEEP); 1730 0 stevel 1731 0 stevel if (*ill_zerocopy_capab == NULL) { 1732 0 stevel cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1733 0 stevel "could not enable Zero-copy version %d " 1734 0 stevel "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 1735 0 stevel ill->ill_name); 1736 0 stevel return; 1737 0 stevel } 1738 0 stevel } 1739 0 stevel 1740 0 stevel ip1dbg(("ill_capability_zerocopy_ack: interface %s " 1741 0 stevel "supports Zero-copy version %d\n", ill->ill_name, 1742 0 stevel ZEROCOPY_VERSION_1)); 1743 0 stevel 1744 0 stevel (*ill_zerocopy_capab)->ill_zerocopy_version = 1745 0 stevel zc_ic->zerocopy_version; 1746 0 stevel (*ill_zerocopy_capab)->ill_zerocopy_flags = 1747 0 stevel zc_ic->zerocopy_flags; 1748 0 stevel 1749 0 stevel ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 1750 0 stevel } else { 1751 0 stevel uint_t size; 1752 0 stevel uchar_t *rptr; 1753 0 stevel 1754 0 stevel size = sizeof (dl_capability_req_t) + 1755 0 stevel sizeof (dl_capability_sub_t) + 1756 0 stevel sizeof (dl_capab_zerocopy_t); 1757 0 stevel 1758 0 stevel if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1759 0 stevel cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 1760 0 stevel "could not enable zerocopy for %s (ENOMEM)\n", 1761 0 stevel ill->ill_name); 1762 0 stevel return; 1763 0 stevel } 1764 0 stevel 1765 0 stevel rptr = nmp->b_rptr; 1766 0 stevel /* initialize dl_capability_req_t */ 1767 0 stevel oc = (dl_capability_req_t *)rptr; 1768 0 stevel oc->dl_sub_offset = sizeof (dl_capability_req_t); 1769 0 stevel oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1770 0 stevel sizeof (dl_capab_zerocopy_t); 1771 0 stevel rptr += sizeof (dl_capability_req_t); 1772 0 stevel 1773 0 stevel /* initialize dl_capability_sub_t */ 1774 0 stevel bcopy(isub, rptr, sizeof (*isub)); 1775 0 stevel rptr += sizeof (*isub); 1776 0 stevel 1777 0 stevel /* initialize dl_capab_zerocopy_t */ 1778 0 stevel zc_oc = (dl_capab_zerocopy_t *)rptr; 1779 0 stevel *zc_oc = *zc_ic; 1780 0 stevel 1781 0 stevel ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1782 0 stevel "to enable zero-copy version %d\n", ill->ill_name, 1783 0 stevel ZEROCOPY_VERSION_1)); 1784 0 stevel 1785 0 stevel /* set VMSAFE_MEM flag */ 1786 0 stevel zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1787 0 stevel 1788 0 stevel /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1789 8275 Eric ill_capability_send(ill, nmp); 1790 8275 Eric } 1791 8275 Eric } 1792 8275 Eric 1793 8275 Eric static void 1794 8275 Eric ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1795 8275 Eric { 1796 0 stevel dl_capab_zerocopy_t *zerocopy_subcap; 1797 0 stevel dl_capability_sub_t *dl_subcap; 1798 0 stevel 1799 0 stevel if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1800 0 stevel return; 1801 0 stevel 1802 0 stevel ASSERT(ill->ill_zerocopy_capab != NULL); 1803 8275 Eric 1804 8275 Eric dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1805 0 stevel dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1806 0 stevel dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1807 0 stevel 1808 0 stevel zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1809 0 stevel zerocopy_subcap->zerocopy_version = 1810 0 stevel ill->ill_zerocopy_capab->ill_zerocopy_version; 1811 0 stevel zerocopy_subcap->zerocopy_flags = 0; 1812 3115 yl150051 1813 8275 Eric mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1814 8275 Eric } 1815 8275 Eric 1816 8275 Eric /* 1817 8275 Eric * DLD capability 1818 8275 Eric * Refer to dld.h for more information regarding the purpose and usage 1819 8275 Eric * of this capability. 1820 8275 Eric */ 1821 8275 Eric static void 1822 8275 Eric ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1823 8275 Eric { 1824 8275 Eric dl_capab_dld_t *dld_ic, dld; 1825 8275 Eric uint_t sub_dl_cap = isub->dl_cap; 1826 8275 Eric uint8_t *capend; 1827 8275 Eric ill_dld_capab_t *idc; 1828 8275 Eric 1829 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 1830 8275 Eric ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1831 3115 yl150051 1832 3115 yl150051 /* 1833 3115 yl150051 * Note: range checks here are not absolutely sufficient to 1834 3115 yl150051 * make us robust against malformed messages sent by drivers; 1835 3115 yl150051 * this is in keeping with the rest of IP's dlpi handling. 1836 3115 yl150051 * (Remember, it's coming from something else in the kernel 1837 3115 yl150051 * address space) 1838 3115 yl150051 */ 1839 3115 yl150051 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1840 3115 yl150051 if (capend > mp->b_wptr) { 1841 8275 Eric cmn_err(CE_WARN, "ill_capability_dld_ack: " 1842 3115 yl150051 "malformed sub-capability too long for mblk"); 1843 3115 yl150051 return; 1844 3115 yl150051 } 1845 8275 Eric dld_ic = (dl_capab_dld_t *)(isub + 1); 1846 8275 Eric if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1847 8275 Eric cmn_err(CE_CONT, "ill_capability_dld_ack: " 1848 8275 Eric "unsupported DLD sub-capability (version %d, " 1849 8275 Eric "expected %d)", dld_ic->dld_version, 1850 8275 Eric DLD_CURRENT_VERSION); 1851 8275 Eric return; 1852 8275 Eric } 1853 8275 Eric if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1854 8275 Eric ip1dbg(("ill_capability_dld_ack: mid token for dld " 1855 3115 yl150051 "capability isn't as expected; pass-thru module(s) " 1856 3115 yl150051 "detected, discarding capability\n")); 1857 3115 yl150051 return; 1858 3115 yl150051 } 1859 3115 yl150051 1860 8275 Eric /* 1861 8275 Eric * Copy locally to ensure alignment. 1862 8275 Eric */ 1863 8275 Eric bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 1864 8275 Eric 1865 8275 Eric if ((idc = ill->ill_dld_capab) == NULL) { 1866 8275 Eric idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 1867 8275 Eric if (idc == NULL) { 1868 8275 Eric cmn_err(CE_WARN, "ill_capability_dld_ack: " 1869 8275 Eric "could not enable DLD version %d " 1870 8275 Eric "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 1871 8275 Eric ill->ill_name); 1872 8275 Eric return; 1873 8275 Eric } 1874 8275 Eric ill->ill_dld_capab = idc; 1875 8275 Eric } 1876 9073 Cathy idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 1877 9073 Cathy idc->idc_capab_dh = (void *)dld.dld_capab_handle; 1878 8275 Eric ip1dbg(("ill_capability_dld_ack: interface %s " 1879 8275 Eric "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 1880 8275 Eric 1881 8275 Eric ill_capability_dld_enable(ill); 1882 8275 Eric } 1883 8275 Eric 1884 8275 Eric /* 1885 8275 Eric * Typically capability negotiation between IP and the driver happens via 1886 8275 Eric * DLPI message exchange. However GLD also offers a direct function call 1887 8275 Eric * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 1888 8275 Eric * But arbitrary function calls into IP or GLD are not permitted, since both 1889 8275 Eric * of them are protected by their own perimeter mechanism. The perimeter can 1890 8275 Eric * be viewed as a coarse lock or serialization mechanism. The hierarchy of 1891 8275 Eric * these perimeters is IP -> MAC. Thus for example to enable the squeue 1892 8275 Eric * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 1893 8275 Eric * to enter the mac perimeter and then do the direct function calls into 1894 8275 Eric * GLD to enable squeue polling. The ring related callbacks from the mac into 1895 8275 Eric * the stack to add, bind, quiesce, restart or cleanup a ring are all 1896 8275 Eric * protected by the mac perimeter. 1897 8275 Eric */ 1898 8275 Eric static void 1899 8275 Eric ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 1900 8275 Eric { 1901 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 1902 8275 Eric int err; 1903 8275 Eric 1904 8275 Eric err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 1905 8275 Eric DLD_ENABLE); 1906 8275 Eric ASSERT(err == 0); 1907 8275 Eric } 1908 8275 Eric 1909 8275 Eric static void 1910 8275 Eric ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 1911 8275 Eric { 1912 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 1913 8275 Eric int err; 1914 8275 Eric 1915 8275 Eric err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 1916 8275 Eric DLD_DISABLE); 1917 8275 Eric ASSERT(err == 0); 1918 8275 Eric } 1919 8275 Eric 1920 8275 Eric boolean_t 1921 8275 Eric ill_mac_perim_held(ill_t *ill) 1922 8275 Eric { 1923 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 1924 8275 Eric 1925 8275 Eric return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 1926 8275 Eric DLD_QUERY)); 1927 8275 Eric } 1928 8275 Eric 1929 8275 Eric static void 1930 8275 Eric ill_capability_direct_enable(ill_t *ill) 1931 8275 Eric { 1932 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 1933 8275 Eric ill_dld_direct_t *idd = &idc->idc_direct; 1934 8275 Eric dld_capab_direct_t direct; 1935 8275 Eric int rc; 1936 8275 Eric 1937 8275 Eric ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1938 8275 Eric 1939 8275 Eric bzero(&direct, sizeof (direct)); 1940 8275 Eric direct.di_rx_cf = (uintptr_t)ip_input; 1941 8275 Eric direct.di_rx_ch = ill; 1942 8275 Eric 1943 8275 Eric rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 1944 8275 Eric DLD_ENABLE); 1945 8275 Eric if (rc == 0) { 1946 8275 Eric idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 1947 8275 Eric idd->idd_tx_dh = direct.di_tx_dh; 1948 8275 Eric idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 1949 8275 Eric idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 1950 8833 Venu idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 1951 8833 Venu idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 1952 9738 Cathy ASSERT(idd->idd_tx_cb_df != NULL); 1953 9738 Cathy ASSERT(idd->idd_tx_fctl_df != NULL); 1954 9738 Cathy ASSERT(idd->idd_tx_df != NULL); 1955 8275 Eric /* 1956 8275 Eric * One time registration of flow enable callback function 1957 8275 Eric */ 1958 8275 Eric ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 1959 8275 Eric ill_flow_enable, ill); 1960 8275 Eric ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 1961 8275 Eric DTRACE_PROBE1(direct_on, (ill_t *), ill); 1962 8275 Eric } else { 1963 8275 Eric cmn_err(CE_WARN, "warning: could not enable DIRECT " 1964 8275 Eric "capability, rc = %d\n", rc); 1965 8275 Eric DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 1966 8275 Eric } 1967 8275 Eric } 1968 8275 Eric 1969 8275 Eric static void 1970 8275 Eric ill_capability_poll_enable(ill_t *ill) 1971 8275 Eric { 1972 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 1973 8275 Eric dld_capab_poll_t poll; 1974 8275 Eric int rc; 1975 8275 Eric 1976 8275 Eric ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 1977 8275 Eric 1978 8275 Eric bzero(&poll, sizeof (poll)); 1979 8275 Eric poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 1980 8275 Eric poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 1981 8275 Eric poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 1982 8275 Eric poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 1983 8275 Eric poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 1984 8275 Eric poll.poll_ring_ch = ill; 1985 8275 Eric rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 1986 8275 Eric DLD_ENABLE); 1987 8275 Eric if (rc == 0) { 1988 8275 Eric ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 1989 8275 Eric DTRACE_PROBE1(poll_on, (ill_t *), ill); 1990 8275 Eric } else { 1991 8275 Eric ip1dbg(("warning: could not enable POLL " 1992 8275 Eric "capability, rc = %d\n", rc)); 1993 8275 Eric DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 1994 8275 Eric } 1995 8275 Eric } 1996 8275 Eric 1997 8275 Eric /* 1998 8275 Eric * Enable the LSO capability. 1999 8275 Eric */ 2000 8275 Eric static void 2001 8275 Eric ill_capability_lso_enable(ill_t *ill) 2002 8275 Eric { 2003 8275 Eric ill_dld_capab_t *idc = ill->ill_dld_capab; 2004 8275 Eric dld_capab_lso_t lso; 2005 8275 Eric int rc; 2006 8275 Eric 2007 8275 Eric ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2008 8275 Eric 2009 8275 Eric if (ill->ill_lso_capab == NULL) { 2010 8275 Eric ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2011 8275 Eric KM_NOSLEEP); 2012 8275 Eric if (ill->ill_lso_capab == NULL) { 2013 8275 Eric cmn_err(CE_WARN, "ill_capability_lso_enable: " 2014 3115 yl150051 "could not enable LSO for %s (ENOMEM)\n", 2015 3115 yl150051 ill->ill_name); 2016 3115 yl150051 return; 2017 3115 yl150051 } 2018 8275 Eric } 2019 8275 Eric 2020 8275 Eric bzero(&lso, sizeof (lso)); 2021 8275 Eric if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2022 8275 Eric DLD_ENABLE)) == 0) { 2023 8275 Eric ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2024 8275 Eric ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2025 11042 Erik ill->ill_capabilities |= ILL_CAPAB_LSO; 2026 8275 Eric ip1dbg(("ill_capability_lso_enable: interface %s " 2027 8275 Eric "has enabled LSO\n ", ill->ill_name)); 2028 8275 Eric } else { 2029 8275 Eric kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2030 8275 Eric ill->ill_lso_capab = NULL; 2031 8275 Eric DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2032 8275 Eric } 2033 8275 Eric } 2034 8275 Eric 2035 8275 Eric static void 2036 8275 Eric ill_capability_dld_enable(ill_t *ill) 2037 8275 Eric { 2038 8275 Eric mac_perim_handle_t mph; 2039 8275 Eric 2040 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 2041 8275 Eric 2042 8275 Eric if (ill->ill_isv6) 2043 8275 Eric return; 2044 8275 Eric 2045 8275 Eric ill_mac_perim_enter(ill, &mph); 2046 8275 Eric if (!ill->ill_isv6) { 2047 8275 Eric ill_capability_direct_enable(ill); 2048 8275 Eric ill_capability_poll_enable(ill); 2049 8275 Eric ill_capability_lso_enable(ill); 2050 8275 Eric } 2051 8275 Eric ill->ill_capabilities |= ILL_CAPAB_DLD; 2052 8275 Eric ill_mac_perim_exit(ill, mph); 2053 8275 Eric } 2054 8275 Eric 2055 8275 Eric static void 2056 8275 Eric ill_capability_dld_disable(ill_t *ill) 2057 8275 Eric { 2058 8275 Eric ill_dld_capab_t *idc; 2059 8275 Eric ill_dld_direct_t *idd; 2060 8275 Eric mac_perim_handle_t mph; 2061 8275 Eric 2062 8275 Eric ASSERT(IAM_WRITER_ILL(ill)); 2063 8275 Eric 2064 8275 Eric if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2065 8275 Eric return; 2066 8275 Eric 2067 8275 Eric ill_mac_perim_enter(ill, &mph); 2068 8275 Eric 2069 8275 Eric idc = ill->ill_dld_capab; 2070 8275 Eric if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2071 8275 Eric /* 2072 8275 Eric * For performance we avoid locks in the transmit data path 2073 8275 Eric * and don't maintain a count of the number of threads using 2074 8275 Eric * direct calls. Thus some threads could be using direct 2075 8275 Eric * transmit calls to GLD, even after the capability mechanism 2076 8275 Eric * turns it off. This is still safe since the handles used in 2077 8275 Eric * the direct calls continue to be valid until the unplumb is 2078 8275 Eric * completed. Remove the callback that was added (1-time) at 2079 8275 Eric * capab enable time. 2080 8275 Eric */ 2081 8275 Eric mutex_enter(&ill->ill_lock); 2082 8275 Eric ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 2083 8275 Eric mutex_exit(&ill->ill_lock); 2084 8275 Eric if (ill->ill_flownotify_mh != NULL) { 2085 8275 Eric idd = &idc->idc_direct; 2086 8275 Eric idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 2087 8275 Eric ill->ill_flownotify_mh); 2088 8275 Eric ill->ill_flownotify_mh = NULL; 2089 8275 Eric } 2090 8275 Eric (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 2091 8275 Eric NULL, DLD_DISABLE); 2092 8275 Eric } 2093 8275 Eric 2094 8275 Eric if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 2095 8275 Eric ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 2096 8275 Eric ip_squeue_clean_all(ill); 2097 8275 Eric (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 2098 8275 Eric NULL, DLD_DISABLE); 2099 8275 Eric } 2100 8275 Eric 2101 11042 Erik if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { 2102 8275 Eric ASSERT(ill->ill_lso_capab != NULL); 2103 8275 Eric /* 2104 8275 Eric * Clear the capability flag for LSO but retain the 2105 8275 Eric * ill_lso_capab structure since it's possible that another 2106 8275 Eric * thread is still referring to it. The structure only gets 2107 8275 Eric * deallocated when we destroy the ill. 2108 8275 Eric */ 2109 8275 Eric 2110 11042 Erik ill->ill_capabilities &= ~ILL_CAPAB_LSO; 2111 8275 Eric (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 2112 8275 Eric NULL, DLD_DISABLE); 2113 8275 Eric } 2114 8275 Eric 2115 8275 Eric ill->ill_capabilities &= ~ILL_CAPAB_DLD; 2116 8275 Eric ill_mac_perim_exit(ill, mph); 2117 8275 Eric } 2118 8275 Eric 2119 8275 Eric /* 2120 8275 Eric * Capability Negotiation protocol 2121 8275 Eric * 2122 8275 Eric * We don't wait for DLPI capability operations to finish during interface 2123 8275 Eric * bringup or teardown. Doing so would introduce more asynchrony and the 2124 8275 Eric * interface up/down operations will need multiple return and restarts. 2125 8275 Eric * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 2126 8275 Eric * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 2127 8275 Eric * exclusive operation won't start until the DLPI operations of the previous 2128 8275 Eric * exclusive operation complete. 2129 8275 Eric * 2130 8275 Eric * The capability state machine is shown below. 2131 8275 Eric * 2132 8275 Eric * state next state event, action 2133 8275 Eric * 2134 8275 Eric * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 2135 8275 Eric * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 2136 8275 Eric * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 2137 8275 Eric * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 2138 8275 Eric * IDCS_OK IDCS_RESET_SENT ill_capability_reset 2139 8275 Eric * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 2140 8275 Eric * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 2141 8275 Eric * ill_capability_probe. 2142 8275 Eric */ 2143 8275 Eric 2144 8275 Eric /* 2145 8275 Eric * Dedicated thread started from ip_stack_init that handles capability 2146 8275 Eric * disable. This thread ensures the taskq dispatch does not fail by waiting 2147 8275 Eric * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 2148 8275 Eric * that direct calls to DLD are done in a cv_waitable context. 2149 8275 Eric */ 2150 8275 Eric void 2151 8275 Eric ill_taskq_dispatch(ip_stack_t *ipst) 2152 8275 Eric { 2153 8275 Eric callb_cpr_t cprinfo; 2154 8275 Eric char name[64]; 2155 8275 Eric mblk_t *mp; 2156 8275 Eric 2157 8275 Eric (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 2158 8275 Eric ipst->ips_netstack->netstack_stackid); 2159 8275 Eric CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 2160 8275 Eric name); 2161 8275 Eric mutex_enter(&ipst->ips_capab_taskq_lock); 2162 8275 Eric 2163 8275 Eric for (;;) { 2164 9979 Thirumalai mp = ipst->ips_capab_taskq_head; 2165 8275 Eric while (mp != NULL) { 2166 9979 Thirumalai ipst->ips_capab_taskq_head = mp->b_next; 2167 9979 Thirumalai if (ipst->ips_capab_taskq_head == NULL) 2168 9979 Thirumalai ipst->ips_capab_taskq_tail = NULL; 2169 8275 Eric mutex_exit(&ipst->ips_capab_taskq_lock); 2170 9979 Thirumalai mp->b_next = NULL; 2171 9979 Thirumalai 2172 8275 Eric VERIFY(taskq_dispatch(system_taskq, 2173 8275 Eric ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 2174 8275 Eric mutex_enter(&ipst->ips_capab_taskq_lock); 2175 9979 Thirumalai mp = ipst->ips_capab_taskq_head; 2176 8275 Eric } 2177 8275 Eric 2178 8275 Eric if (ipst->ips_capab_taskq_quit) 2179 8275 Eric break; 2180 8275 Eric CALLB_CPR_SAFE_BEGIN(&cprinfo); 2181 8275 Eric cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 2182 8275 Eric CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 2183 8275 Eric } 2184 9979 Thirumalai VERIFY(ipst->ips_capab_taskq_head == NULL); 2185 9979 Thirumalai VERIFY(ipst->ips_capab_taskq_tail == NULL); 2186 8275 Eric CALLB_CPR_EXIT(&cprinfo); 2187 8275 Eric thread_exit(); 2188 0 stevel } 2189 0 stevel 2190 0 stevel /* 2191 0 stevel * Consume a new-style hardware capabilities negotiation ack. 2192 11076 Cathy * Called via taskq on receipt of DL_CAPABILITY_ACK. 2193 8275 Eric */ 2194 8275 Eric static void 2195 8275 Eric ill_capability_ack_thr(void *arg) 2196 8275 Eric { 2197 8275 Eric mblk_t *mp = arg; 2198 0 stevel dl_capability_ack_t *capp; 2199 0 stevel dl_capability_sub_t *subp, *endp; 2200 8275 Eric ill_t *ill; 2201 8275 Eric boolean_t reneg; 2202 8275 Eric 2203 8275 Eric ill = (ill_t *)mp->b_prev; 2204 9979 Thirumalai mp->b_prev = NULL; 2205 9979 Thirumalai 2206 8275 Eric VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 2207 8275 Eric 2208 8275 Eric if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 2209 8275 Eric ill->ill_dlpi_capab_state == IDCS_RENEG) { 2210 8275 Eric /* 2211 8275 Eric * We have received the ack for our DL_CAPAB reset request. 2212 8275 Eric * There isnt' anything in the message that needs processing. 2213 8275 Eric * All message based capabilities have been disabled, now 2214 8275 Eric * do the function call based capability disable. 2215 8275 Eric */ 2216 8275 Eric reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 2217 8275 Eric ill_capability_dld_disable(ill); 2218 8275 Eric ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 2219 8275 Eric if (reneg) 2220 8275 Eric ill_capability_probe(ill); 2221 8275 Eric goto done; 2222 8275 Eric } 2223 8275 Eric 2224 8275 Eric if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 2225 8275 Eric ill->ill_dlpi_capab_state = IDCS_OK; 2226 0 stevel 2227 0 stevel capp = (dl_capability_ack_t *)mp->b_rptr; 2228 0 stevel 2229 8275 Eric if (capp->dl_sub_length == 0) { 2230 0 stevel /* no new-style capabilities */ 2231 8275 Eric goto done; 2232 8275 Eric } 2233 0 stevel 2234 0 stevel /* make sure the driver supplied correct dl_sub_length */ 2235 0 stevel if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 2236 0 stevel ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 2237 0 stevel "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 2238 8275 Eric goto done; 2239 8275 Eric } 2240 8275 Eric 2241 0 stevel #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 2242 0 stevel /* 2243 0 stevel * There are sub-capabilities. Process the ones we know about. 2244 0 stevel * Loop until we don't have room for another sub-cap header.. 2245 0 stevel */ 2246 0 stevel for (subp = SC(capp, capp->dl_sub_offset), 2247 0 stevel endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 2248 0 stevel subp <= endp; 2249 0 stevel subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 2250 0 stevel 2251 0 stevel switch (subp->dl_cap) { 2252 0 stevel case DL_CAPAB_ID_WRAPPER: 2253 0 stevel ill_capability_id_ack(ill, mp, subp); 2254 0 stevel break; 2255 0 stevel default: 2256 11042 Erik ill_capability_dispatch(ill, mp, subp); 2257 0 stevel break; 2258 0 stevel } 2259 0 stevel } 2260 0 stevel #undef SC 2261 8275 Eric done: 2262 8275 Eric inet_freemsg(mp); 2263 8275 Eric ill_capability_done(ill); 2264 8275 Eric ipsq_exit(ill->ill_phyint->phyint_ipsq); 2265 8275 Eric } 2266 8275 Eric 2267 8275 Eric /* 2268 8275 Eric * This needs to be started in a taskq thread to provide a cv_waitable 2269 8275 Eric * context. 2270 8275 Eric */ 2271 8275 Eric void 2272 8275 Eric ill_capability_ack(ill_t *ill, mblk_t *mp) 2273 8275 Eric { 2274 8275 Eric ip_stack_t *ipst = ill->ill_ipst; 2275 8275 Eric 2276 8275 Eric mp->b_prev = (mblk_t *)ill; 2277 9979 Thirumalai ASSERT(mp->b_next == NULL); 2278 9979 Thirumalai 2279 8275 Eric if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 2280 8275 Eric TQ_NOSLEEP) != 0) 2281 8275 Eric return; 2282 8275 Eric 2283 8275 Eric /* 2284 8275 Eric * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 2285 8275 Eric * which will do the dispatch using TQ_SLEEP to guarantee success. 2286 8275 Eric */ 2287 8275 Eric mutex_enter(&ipst->ips_capab_taskq_lock); 2288 9979 Thirumalai if (ipst->ips_capab_taskq_head == NULL) { 2289 9979 Thirumalai ASSERT(ipst->ips_capab_taskq_tail == NULL); 2290 9979 Thirumalai ipst->ips_capab_taskq_head = mp; 2291 9979 Thirumalai } else { 2292 9979 Thirumalai ipst->ips_capab_taskq_tail->b_next = mp; 2293 9979 Thirumalai } 2294 9979 Thirumalai ipst->ips_capab_taskq_tail = mp; 2295 9979 Thirumalai 2296 8275 Eric cv_signal(&ipst->ips_capab_taskq_cv); 2297 8275 Eric mutex_exit(&ipst->ips_capab_taskq_lock); 2298 0 stevel } 2299 0 stevel 2300 0 stevel /* 2301 0 stevel * This routine is called to scan the fragmentation reassembly table for 2302 0 stevel * the specified ILL for any packets that are starting to smell. 2303 0 stevel * dead_interval is the maximum time in seconds that will be tolerated. It 2304 0 stevel * will either be the value specified in ip_g_frag_timeout, or zero if the 2305 0 stevel * ILL is shutting down and it is time to blow everything off. 2306 0 stevel * 2307 0 stevel * It returns the number of seconds (as a time_t) that the next frag timer 2308 0 stevel * should be scheduled for, 0 meaning that the timer doesn't need to be 2309 0 stevel * re-started. Note that the method of calculating next_timeout isn't 2310 0 stevel * entirely accurate since time will flow between the time we grab 2311 0 stevel * current_time and the time we schedule the next timeout. This isn't a 2312 0 stevel * big problem since this is the timer for sending an ICMP reassembly time 2313 0 stevel * exceeded messages, and it doesn't have to be exactly accurate. 2314 0 stevel * 2315 0 stevel * This function is 2316 0 stevel * sometimes called as writer, although this is not required. 2317 0 stevel */ 2318 0 stevel time_t 2319 0 stevel ill_frag_timeout(ill_t *ill, time_t dead_interval) 2320 0 stevel { 2321 0 stevel ipfb_t *ipfb; 2322 0 stevel ipfb_t *endp; 2323 0 stevel ipf_t *ipf; 2324 0 stevel ipf_t *ipfnext; 2325 0 stevel mblk_t *mp; 2326 0 stevel time_t current_time = gethrestime_sec(); 2327 0 stevel time_t next_timeout = 0; 2328 0 stevel uint32_t hdr_length; 2329 0 stevel mblk_t *send_icmp_head; 2330 0 stevel mblk_t *send_icmp_head_v6; 2331 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 2332 11042 Erik ip_recv_attr_t iras; 2333 11042 Erik 2334 11042 Erik bzero(&iras, sizeof (iras)); 2335 11042 Erik iras.ira_flags = 0; 2336 11042 Erik iras.ira_ill = iras.ira_rill = ill; 2337 11042 Erik iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2338 11042 Erik iras.ira_rifindex = iras.ira_ruifindex; 2339 0 stevel 2340 0 stevel ipfb = ill->ill_frag_hash_tbl; 2341 0 stevel if (ipfb == NULL) 2342 0 stevel return (B_FALSE); 2343 0 stevel endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 2344 0 stevel /* Walk the frag hash table. */ 2345 0 stevel for (; ipfb < endp; ipfb++) { 2346 0 stevel send_icmp_head = NULL; 2347 0 stevel send_icmp_head_v6 = NULL; 2348 0 stevel mutex_enter(&ipfb->ipfb_lock); 2349 0 stevel while ((ipf = ipfb->ipfb_ipf) != 0) { 2350 0 stevel time_t frag_time = current_time - ipf->ipf_timestamp; 2351 0 stevel time_t frag_timeout; 2352 0 stevel 2353 0 stevel if (frag_time < dead_interval) { 2354 0 stevel /* 2355 0 stevel * There are some outstanding fragments 2356 0 stevel * that will timeout later. Make note of 2357 0 stevel * the time so that we can reschedule the 2358 0 stevel * next timeout appropriately. 2359 0 stevel */ 2360 0 stevel frag_timeout = dead_interval - frag_time; 2361 0 stevel if (next_timeout == 0 || 2362 0 stevel frag_timeout < next_timeout) { 2363 0 stevel next_timeout = frag_timeout; 2364 0 stevel } 2365 0 stevel break; 2366 0 stevel } 2367 0 stevel /* Time's up. Get it out of here. */ 2368 0 stevel hdr_length = ipf->ipf_nf_hdr_len; 2369 0 stevel ipfnext = ipf->ipf_hash_next; 2370 0 stevel if (ipfnext) 2371 0 stevel ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2372 0 stevel *ipf->ipf_ptphn = ipfnext; 2373 0 stevel mp = ipf->ipf_mp->b_cont; 2374 0 stevel for (; mp; mp = mp->b_cont) { 2375 0 stevel /* Extra points for neatness. */ 2376 0 stevel IP_REASS_SET_START(mp, 0); 2377 0 stevel IP_REASS_SET_END(mp, 0); 2378 0 stevel } 2379 0 stevel mp = ipf->ipf_mp->b_cont; 2380 6759 georges atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2381 0 stevel ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2382 0 stevel ipfb->ipfb_count -= ipf->ipf_count; 2383 0 stevel ASSERT(ipfb->ipfb_frag_pkts > 0); 2384 0 stevel ipfb->ipfb_frag_pkts--; 2385 0 stevel /* 2386 0 stevel * We do not send any icmp message from here because 2387 0 stevel * we currently are holding the ipfb_lock for this 2388 0 stevel * hash chain. If we try and send any icmp messages 2389 0 stevel * from here we may end up via a put back into ip 2390 0 stevel * trying to get the same lock, causing a recursive 2391 0 stevel * mutex panic. Instead we build a list and send all 2392 0 stevel * the icmp messages after we have dropped the lock. 2393 0 stevel */ 2394 0 stevel if (ill->ill_isv6) { 2395 0 stevel if (hdr_length != 0) { 2396 0 stevel mp->b_next = send_icmp_head_v6; 2397 0 stevel send_icmp_head_v6 = mp; 2398 0 stevel } else { 2399 0 stevel freemsg(mp); 2400 0 stevel } 2401 0 stevel } else { 2402 0 stevel if (hdr_length != 0) { 2403 0 stevel mp->b_next = send_icmp_head; 2404 0 stevel send_icmp_head = mp; 2405 0 stevel } else { 2406 0 stevel freemsg(mp); 2407 0 stevel } 2408 0 stevel } 2409 3284 apersson BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2410 11042 Erik ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2411 0 stevel freeb(ipf->ipf_mp); 2412 0 stevel } 2413 0 stevel mutex_exit(&ipfb->ipfb_lock); 2414 0 stevel /* 2415 0 stevel * Now need to send any icmp messages that we delayed from 2416 0 stevel * above. 2417 0 stevel */ 2418 0 stevel while (send_icmp_head_v6 != NULL) { 2419 2733 nordmark ip6_t *ip6h; 2420 2733 nordmark 2421 0 stevel mp = send_icmp_head_v6; 2422 0 stevel send_icmp_head_v6 = send_icmp_head_v6->b_next; 2423 0 stevel mp->b_next = NULL; 2424 11042 Erik ip6h = (ip6_t *)mp->b_rptr; 2425 11042 Erik iras.ira_flags = 0; 2426 11042 Erik /* 2427 11042 Erik * This will result in an incorrect ALL_ZONES zoneid 2428 11042 Erik * for multicast packets, but we 2429 11042 Erik * don't send ICMP errors for those in any case. 2430 11042 Erik */ 2431 11042 Erik iras.ira_zoneid = 2432 11042 Erik ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2433 3448 dh155122 ill, ipst); 2434 11042 Erik ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2435 11042 Erik icmp_time_exceeded_v6(mp, 2436 11042 Erik ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2437 11042 Erik &iras); 2438 11042 Erik ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2439 0 stevel } 2440 0 stevel while (send_icmp_head != NULL) { 2441 2733 nordmark ipaddr_t dst; 2442 2733 nordmark 2443 0 stevel mp = send_icmp_head; 2444 0 stevel send_icmp_head = send_icmp_head->b_next; 2445 0 stevel mp->b_next = NULL; 2446 2733 nordmark 2447 11042 Erik dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2448 11042 Erik 2449 11042 Erik iras.ira_flags = IRAF_IS_IPV4; 2450 11042 Erik /* 2451 11042 Erik * This will result in an incorrect ALL_ZONES zoneid 2452 11042 Erik * for broadcast and multicast packets, but we 2453 11042 Erik * don't send ICMP errors for those in any case. 2454 11042 Erik */ 2455 11042 Erik iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, 2456 11042 Erik ill, ipst); 2457 11042 Erik ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2458 11042 Erik icmp_time_exceeded(mp, 2459 11042 Erik ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); 2460 11042 Erik ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2461 0 stevel } 2462 0 stevel } 2463 0 stevel /* 2464 0 stevel * A non-dying ILL will use the return value to decide whether to 2465 0 stevel * restart the frag timer, and for how long. 2466 0 stevel */ 2467 0 stevel return (next_timeout); 2468 0 stevel } 2469 0 stevel 2470 0 stevel /* 2471 0 stevel * This routine is called when the approximate count of mblk memory used 2472 0 stevel * for the specified ILL has exceeded max_count. 2473 0 stevel */ 2474 0 stevel void 2475 0 stevel ill_frag_prune(ill_t *ill, uint_t max_count) 2476 0 stevel { 2477 0 stevel ipfb_t *ipfb; 2478 0 stevel ipf_t *ipf; 2479 0 stevel size_t count; 2480 11066 rafael clock_t now; 2481 0 stevel 2482 0 stevel /* 2483 0 stevel * If we are here within ip_min_frag_prune_time msecs remove 2484 0 stevel * ill_frag_free_num_pkts oldest packets from each bucket and increment 2485 0 stevel * ill_frag_free_num_pkts. 2486 0 stevel */ 2487 0 stevel mutex_enter(&ill->ill_lock); 2488 11066 rafael now = ddi_get_lbolt(); 2489 11066 rafael if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= 2490 0 stevel (ip_min_frag_prune_time != 0 ? 2491 0 stevel ip_min_frag_prune_time : msec_per_tick)) { 2492 0 stevel 2493 0 stevel ill->ill_frag_free_num_pkts++; 2494 0 stevel 2495 0 stevel } else { 2496 0 stevel ill->ill_frag_free_num_pkts = 0; 2497 0 stevel } 2498 11066 rafael ill->ill_last_frag_clean_time = now; 2499 0 stevel mutex_exit(&ill->ill_lock); 2500 0 stevel 2501 0 stevel /* 2502 0 stevel * free ill_frag_free_num_pkts oldest packets from each bucket. 2503 0 stevel */ 2504 0 stevel if (ill->ill_frag_free_num_pkts != 0) { 2505 0 stevel int ix; 2506 0 stevel 2507 0 stevel for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2508 0 stevel ipfb = &ill->ill_frag_hash_tbl[ix]; 2509 0 stevel mutex_enter(&ipfb->ipfb_lock); 2510 0 stevel if (ipfb->ipfb_ipf != NULL) { 2511 0 stevel ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 2512 0 stevel ill->ill_frag_free_num_pkts); 2513 0 stevel } 2514 0 stevel mutex_exit(&ipfb->ipfb_lock); 2515 0 stevel } 2516 0 stevel } 2517 0 stevel /* 2518 0 stevel * While the reassembly list for this ILL is too big, prune a fragment 2519 6759 georges * queue by age, oldest first. 2520 0 stevel */ 2521 0 stevel while (ill->ill_frag_count > max_count) { 2522 0 stevel int ix; 2523 0 stevel ipfb_t *oipfb = NULL; 2524 0 stevel uint_t oldest = UINT_MAX; 2525 0 stevel 2526 0 stevel count = 0; 2527 0 stevel for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 2528 0 stevel ipfb = &ill->ill_frag_hash_tbl[ix]; 2529 0 stevel mutex_enter(&ipfb->ipfb_lock); 2530 0 stevel ipf = ipfb->ipfb_ipf; 2531 0 stevel if (ipf != NULL && ipf->ipf_gen < oldest) { 2532 0 stevel oldest = ipf->ipf_gen; 2533 0 stevel oipfb = ipfb; 2534 0 stevel } 2535 0 stevel count += ipfb->ipfb_count; 2536 0 stevel mutex_exit(&ipfb->ipfb_lock); 2537 0 stevel } 2538 6759 georges if (oipfb == NULL) 2539 6759 georges break; 2540 6759 georges 2541 0 stevel if (count <= max_count) 2542 0 stevel return; /* Somebody beat us to it, nothing to do */ 2543 0 stevel mutex_enter(&oipfb->ipfb_lock); 2544 0 stevel ipf = oipfb->ipfb_ipf; 2545 0 stevel if (ipf != NULL) { 2546 0 stevel ill_frag_free_pkts(ill, oipfb, ipf, 1); 2547 0 stevel } 2548 0 stevel mutex_exit(&oipfb->ipfb_lock); 2549 0 stevel } 2550 0 stevel } 2551 0 stevel 2552 0 stevel /* 2553 0 stevel * free 'free_cnt' fragmented packets starting at ipf. 2554 0 stevel */ 2555 0 stevel void 2556 0 stevel ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2557 0 stevel { 2558 0 stevel size_t count; 2559 0 stevel mblk_t *mp; 2560 0 stevel mblk_t *tmp; 2561 0 stevel ipf_t **ipfp = ipf->ipf_ptphn; 2562 0 stevel 2563 0 stevel ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2564 0 stevel ASSERT(ipfp != NULL); 2565 0 stevel ASSERT(ipf != NULL); 2566 0 stevel 2567 0 stevel while (ipf != NULL && free_cnt-- > 0) { 2568 0 stevel count = ipf->ipf_count; 2569 0 stevel mp = ipf->ipf_mp; 2570 0 stevel ipf = ipf->ipf_hash_next; 2571 0 stevel for (tmp = mp; tmp; tmp = tmp->b_cont) { 2572 0 stevel IP_REASS_SET_START(tmp, 0); 2573 0 stevel IP_REASS_SET_END(tmp, 0); 2574 0 stevel } 2575 6759 georges atomic_add_32(&ill->ill_frag_count, -count); 2576 0 stevel ASSERT(ipfb->ipfb_count >= count); 2577 0 stevel ipfb->ipfb_count -= count; 2578 0 stevel ASSERT(ipfb->ipfb_frag_pkts > 0); 2579 0 stevel ipfb->ipfb_frag_pkts--; 2580 11042 Erik BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2581 11042 Erik ip_drop_input("ipIfStatsReasmFails", mp, ill); 2582 0 stevel freemsg(mp); 2583 0 stevel } 2584 0 stevel 2585 0 stevel if (ipf) 2586 0 stevel ipf->ipf_ptphn = ipfp; 2587 0 stevel ipfp[0] = ipf; 2588 0 stevel } 2589 0 stevel 2590 0 stevel #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 2591 0 stevel "obsolete and may be removed in a future release of Solaris. Use " \ 2592 0 stevel "ifconfig(1M) to manipulate the forwarding status of an interface." 2593 0 stevel 2594 0 stevel /* 2595 0 stevel * For obsolete per-interface forwarding configuration; 2596 0 stevel * called in response to ND_GET. 2597 0 stevel */ 2598 0 stevel /* ARGSUSED */ 2599 0 stevel static int 2600 0 stevel nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 2601 0 stevel { 2602 0 stevel ill_t *ill = (ill_t *)cp; 2603 0 stevel 2604 0 stevel cmn_err(CE_WARN, ND_FORWARD_WARNING); 2605 0 stevel 2606 0 stevel (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 2607 0 stevel return (0); 2608 0 stevel } 2609 0 stevel 2610 0 stevel /* 2611 0 stevel * For obsolete per-interface forwarding configuration; 2612 0 stevel * called in response to ND_SET. 2613 0 stevel */ 2614 0 stevel /* ARGSUSED */ 2615 0 stevel static int 2616 0 stevel nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 2617 0 stevel cred_t *ioc_cr) 2618 0 stevel { 2619 0 stevel long value; 2620 0 stevel int retval; 2621 3448 dh155122 ip_stack_t *ipst = CONNQ_TO_IPST(q); 2622 0 stevel 2623 0 stevel cmn_err(CE_WARN, ND_FORWARD_WARNING); 2624 0 stevel 2625 0 stevel if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 2626 0 stevel value < 0 || value > 1) { 2627 0 stevel return (EINVAL); 2628 0 stevel } 2629 0 stevel 2630 3448 dh155122 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2631 4360 meem retval = ill_forward_set((ill_t *)cp, (value != 0)); 2632 3448 dh155122 rw_exit(&ipst->ips_ill_g_lock); 2633 0 stevel return (retval); 2634 0 stevel } 2635 0 stevel 2636 0 stevel /* 2637 8485 Peter * Helper function for ill_forward_set(). 2638 8485 Peter */ 2639 8485 Peter static void 2640 8485 Peter ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2641 8485 Peter { 2642 8485 Peter ip_stack_t *ipst = ill->ill_ipst; 2643 8485 Peter 2644 8485 Peter ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2645 8485 Peter 2646 8485 Peter ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2647 8485 Peter (enable ? "Enabling" : "Disabling"), 2648 8485 Peter (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2649 8485 Peter mutex_enter(&ill->ill_lock); 2650 8485 Peter if (enable) 2651 8485 Peter ill->ill_flags |= ILLF_ROUTER; 2652 8485 Peter else 2653 8485 Peter ill->ill_flags &= ~ILLF_ROUTER; 2654 8485 Peter mutex_exit(&ill->ill_lock); 2655 8485 Peter if (ill->ill_isv6) 2656 8485 Peter ill_set_nce_router_flags(ill, enable); 2657 8485 Peter /* Notify routing socket listeners of this change. */ 2658 9658 Sowmini if (ill->ill_ipif != NULL) 2659 9658 Sowmini ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2660 8485 Peter } 2661 8485 Peter 2662 8485 Peter /* 2663 8485 Peter * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2664 8485 Peter * socket messages for each interface whose flags we change. 2665 0 stevel */ 2666 4360 meem int 2667 4360 meem ill_forward_set(ill_t *ill, boolean_t enable) 2668 4360 meem { 2669 8485 Peter ipmp_illgrp_t *illg; 2670 8485 Peter ip_stack_t *ipst = ill->ill_ipst; 2671 3448 dh155122 2672 3448 dh155122 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2673 0 stevel 2674 0 stevel if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2675 4360 meem (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2676 4360 meem return (0); 2677 4360 meem 2678 4459 kcpoon if (IS_LOOPBACK(ill)) 2679 0 stevel return (EINVAL); 2680 0 stevel 2681 8485 Peter if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2682 8485 Peter /* 2683 8485 Peter * Update all of the interfaces in the group. 2684 8485 Peter */ 2685 8485 Peter illg = ill->ill_grp; 2686 8485 Peter ill = list_head(&illg->ig_if); 2687 8485 Peter for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2688 8485 Peter ill_forward_set_on_ill(ill, enable); 2689 8485 Peter 2690 8485 Peter /* 2691 8485 Peter * Update the IPMP meta-interface. 2692 8485 Peter */ 2693 8485 Peter ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2694 8485 Peter return (0); 2695 8485 Peter } 2696 8485 Peter 2697 8485 Peter ill_forward_set_on_ill(ill, enable); 2698 0 stevel return (0); 2699 0 stevel } 2700 0 stevel 2701 0 stevel /* 2702 0 stevel * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2703 0 stevel * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2704 0 stevel * set or clear. 2705 0 stevel */ 2706 0 stevel static void 2707 0 stevel ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2708 0 stevel { 2709 0 stevel ipif_t *ipif; 2710 11042 Erik ncec_t *ncec; 2711 0 stevel nce_t *nce; 2712 0 stevel 2713 0 stevel for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2714 8485 Peter /* 2715 8499 Sebastien * NOTE: we match across the illgrp because nce's for 2716 8499 Sebastien * addresses on IPMP interfaces have an nce_ill that points to 2717 8499 Sebastien * the bound underlying ill. 2718 8499 Sebastien */ 2719 11042 Erik nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); 2720 0 stevel if (nce != NULL) { 2721 11042 Erik ncec = nce->nce_common; 2722 11042 Erik mutex_enter(&ncec->ncec_lock); 2723 0 stevel if (enable) 2724 11042 Erik ncec->ncec_flags |= NCE_F_ISROUTER; 2725 0 stevel else 2726 11042 Erik ncec->ncec_flags &= ~NCE_F_ISROUTER; 2727 11042 Erik mutex_exit(&ncec->ncec_lock); 2728 11042 Erik nce_refrele(nce); 2729 0 stevel } 2730 0 stevel } 2731 0 stevel } 2732 0 stevel 2733 0 stevel /* 2734 0 stevel * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 2735 0 stevel * for this ill. Make sure the v6/v4 question has been answered about this 2736 0 stevel * ill. The creation of this ndd variable is only for backwards compatibility. 2737 0 stevel * The preferred way to control per-interface IP forwarding is through the 2738 0 stevel * ILLF_ROUTER interface flag. 2739 0 stevel */ 2740 0 stevel static int 2741 0 stevel ill_set_ndd_name(ill_t *ill) 2742 0 stevel { 2743 0 stevel char *suffix; 2744 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 2745 0 stevel 2746 0 stevel ASSERT(IAM_WRITER_ILL(ill)); 2747 0 stevel 2748 0 stevel if (ill->ill_isv6) 2749 0 stevel suffix = ipv6_forward_suffix; 2750 0 stevel else 2751 0 stevel suffix = ipv4_forward_suffix; 2752 0 stevel 2753 0 stevel ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 2754 0 stevel bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 2755 0 stevel /* 2756 0 stevel * Copies over the '\0'. 2757 0 stevel * Note that strlen(suffix) is always bounded. 2758 0 stevel */ 2759 0 stevel bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 2760 0 stevel strlen(suffix) + 1); 2761 0 stevel 2762 0 stevel /* 2763 0 stevel * Use of the nd table requires holding the reader lock. 2764 0 stevel * Modifying the nd table thru nd_load/nd_unload requires 2765 0 stevel * the writer lock. 2766 0 stevel */ 2767 3448 dh155122 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 2768 3448 dh155122 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 2769 0 stevel nd_ill_forward_set, (caddr_t)ill)) { 2770 0 stevel /* 2771 0 stevel * If the nd_load failed, it only meant that it could not 2772 0 stevel * allocate a new bunch of room for further NDD expansion. 2773 0 stevel * Because of that, the ill_ndd_name will be set to 0, and 2774 0 stevel * this interface is at the mercy of the global ip_forwarding 2775 0 stevel * variable. 2776 0 stevel */ 2777 3448 dh155122 rw_exit(&ipst->ips_ip_g_nd_lock); 2778 0 stevel ill->ill_ndd_name = NULL; 2779 0 stevel return (ENOMEM); 2780 0 stevel } 2781 3448 dh155122 rw_exit(&ipst->ips_ip_g_nd_lock); 2782 0 stevel return (0); 2783 0 stevel } 2784 0 stevel 2785 0 stevel /* 2786 0 stevel * Intializes the context structure and returns the first ill in the list 2787 0 stevel * cuurently start_list and end_list can have values: 2788 0 stevel * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 2789 0 stevel * IP_V4_G_HEAD Traverse IPV4 list only. 2790 0 stevel * IP_V6_G_HEAD Traverse IPV6 list only. 2791 0 stevel */ 2792 0 stevel 2793 0 stevel /* 2794 0 stevel * We don't check for CONDEMNED ills here. Caller must do that if 2795 0 stevel * necessary under the ill lock. 2796 0 stevel */ 2797 0 stevel ill_t * 2798 3448 dh155122 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 2799 3448 dh155122 ip_stack_t *ipst) 2800 0 stevel { 2801 0 stevel ill_if_t *ifp; 2802 0 stevel ill_t *ill; 2803 0 stevel avl_tree_t *avl_tree; 2804 0 stevel 2805 3448 dh155122 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 2806 0 stevel ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 2807 0 stevel 2808 0 stevel /* 2809 0 stevel * setup the lists to search 2810 0 stevel */ 2811 0 stevel if (end_list != MAX_G_HEADS) { 2812 0 stevel ctx->ctx_current_list = start_list; 2813 0 stevel ctx->ctx_last_list = end_list; 2814 0 stevel } else { 2815 0 stevel ctx->ctx_last_list = MAX_G_HEADS - 1; 2816 0 stevel ctx->ctx_current_list = 0; 2817 0 stevel } 2818 0 stevel 2819 0 stevel while (ctx->ctx_current_list <= ctx->ctx_last_list) { 2820 3448 dh155122 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2821 0 stevel if (ifp != (ill_if_t *) 2822 3448 dh155122 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2823 0 stevel avl_tree = &ifp->illif_avl_by_ppa; 2824 0 stevel ill = avl_first(avl_tree); 2825 0 stevel /* 2826 0 stevel * ill is guaranteed to be non NULL or ifp should have 2827 0 stevel * not existed. 2828 0 stevel */ 2829 0 stevel ASSERT(ill != NULL); 2830 0 stevel return (ill); 2831 0 stevel } 2832 0 stevel ctx->ctx_current_list++; 2833 0 stevel } 2834 0 stevel 2835 0 stevel return (NULL); 2836 0 stevel } 2837 0 stevel 2838 0 stevel /* 2839 0 stevel * returns the next ill in the list. ill_first() must have been called 2840 0 stevel * before calling ill_next() or bad things will happen. 2841 0 stevel */ 2842 0 stevel 2843 0 stevel /* 2844 0 stevel * We don't check for CONDEMNED ills here. Caller must do that if 2845 0 stevel * necessary under the ill lock. 2846 0 stevel */ 2847 0 stevel ill_t * 2848 0 stevel ill_next(ill_walk_context_t *ctx, ill_t *lastill) 2849 0 stevel { 2850 0 stevel ill_if_t *ifp; 2851 0 stevel ill_t *ill; 2852 3448 dh155122 ip_stack_t *ipst = lastill->ill_ipst; 2853 3448 dh155122 2854 0 stevel ASSERT(lastill->ill_ifptr != (ill_if_t *) 2855 3448 dh155122 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 2856 0 stevel if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 2857 0 stevel AVL_AFTER)) != NULL) { 2858 0 stevel return (ill); 2859 0 stevel } 2860 0 stevel 2861 0 stevel /* goto next ill_ifp in the list. */ 2862 0 stevel ifp = lastill->ill_ifptr->illif_next; 2863 0 stevel 2864 0 stevel /* make sure not at end of circular list */ 2865 3448 dh155122 while (ifp == 2866 3448 dh155122 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 2867 0 stevel if (++ctx->ctx_current_list > ctx->ctx_last_list) 2868 0 stevel return (NULL); 2869 3448 dh155122 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 2870 0 stevel } 2871 0 stevel 2872 0 stevel return (avl_first(&ifp->illif_avl_by_ppa)); 2873 0 stevel } 2874 0 stevel 2875 0 stevel /* 2876 8485 Peter * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 2877 8485 Peter * The final number (PPA) must not have any leading zeros. Upon success, a 2878 8485 Peter * pointer to the start of the PPA is returned; otherwise NULL is returned. 2879 0 stevel */ 2880 0 stevel static char * 2881 0 stevel ill_get_ppa_ptr(char *name) 2882 0 stevel { 2883 8485 Peter int namelen = strlen(name); 2884 8485 Peter int end_ndx = namelen - 1; 2885 8485 Peter int ppa_ndx, i; 2886 8485 Peter 2887 8485 Peter /* 2888 8485 Peter * Check that the first character is [a-zA-Z], and that the last 2889 8485 Peter * character is [0-9]. 2890 8485 Peter */ 2891 8485 Peter if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 2892 8485 Peter return (NULL); 2893 8485 Peter 2894 8485 Peter /* 2895 8485 Peter * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 2896 8485 Peter */ 2897 8485 Peter for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 2898 8485 Peter if (!isdigit(name[ppa_ndx - 1])) 2899 0 stevel break; 2900 8485 Peter 2901 8485 Peter if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 2902 8485 Peter return (NULL); 2903 8485 Peter 2904 8485 Peter /* 2905 8485 Peter * Check that the intermediate characters are [a-z0-9.] 2906 8485 Peter */ 2907 8485 Peter for (i = 1; i < ppa_ndx; i++) { 2908 8485 Peter if (!isalpha(name[i]) && !isdigit(name[i]) && 2909 8485 Peter name[i] != '.' && name[i] != '_') { 2910 8485 Peter return (NULL); 2911 8485 Peter } 2912 8485 Peter } 2913 8485 Peter 2914 8485 Peter return (name + ppa_ndx); 2915 0 stevel } 2916 0 stevel 2917 0 stevel /* 2918 0 stevel * use avl tree to locate the ill. 2919 0 stevel */ 2920 0 stevel static ill_t * 2921 11042 Erik ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) 2922 0 stevel { 2923 0 stevel char *ppa_ptr = NULL; 2924 0 stevel int len; 2925 0 stevel uint_t ppa; 2926 0 stevel ill_t *ill = NULL; 2927 0 stevel ill_if_t *ifp; 2928 0 stevel int list; 2929 0 stevel 2930 0 stevel /* 2931 0 stevel * get ppa ptr 2932 0 stevel */ 2933 0 stevel if (isv6) 2934 0 stevel list = IP_V6_G_HEAD; 2935 0 stevel else 2936 0 stevel list = IP_V4_G_HEAD; 2937 0 stevel 2938 0 stevel if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 2939 0 stevel return (NULL); 2940 0 stevel } 2941 0 stevel 2942 0 stevel len = ppa_ptr - name + 1; 2943 0 stevel 2944 0 stevel ppa = stoi(&ppa_ptr); 2945 0 stevel 2946 3448 dh155122 ifp = IP_VX_ILL_G_LIST(list, ipst); 2947 3448 dh155122 2948 3448 dh155122 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2949 0 stevel /* 2950 0 stevel * match is done on len - 1 as the name is not null 2951 0 stevel * terminated it contains ppa in addition to the interface 2952 0 stevel * name. 2953 0 stevel */ 2954 0 stevel if ((ifp->illif_name_len == len) && 2955 0 stevel bcmp(ifp->illif_name, name, len - 1) == 0) { 2956 0 stevel break; 2957 0 stevel } else { 2958 0 stevel ifp = ifp->illif_next; 2959 0 stevel } 2960 0 stevel } 2961 0 stevel 2962 3448 dh155122 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 2963 0 stevel /* 2964 0 stevel * Even the interface type does not exist. 2965 0 stevel */ 2966 0 stevel return (NULL); 2967 0 stevel } 2968 0 stevel 2969 0 stevel ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 2970 0 stevel if (ill != NULL) { 2971 0 stevel mutex_enter(&ill->ill_lock); 2972 0 stevel if (ILL_CAN_LOOKUP(ill)) { 2973 0 stevel ill_refhold_locked(ill); 2974 0 stevel mutex_exit(&ill->ill_lock); 2975 0 stevel return (ill); 2976 11042 Erik } 2977 11042 Erik mutex_exit(&ill->ill_lock); 2978 11042 Erik } 2979 0 stevel return (NULL); 2980 0 stevel } 2981 0 stevel 2982 0 stevel /* 2983 0 stevel * comparison function for use with avl. 2984 0 stevel */ 2985 0 stevel static int 2986 0 stevel ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 2987 0 stevel { 2988 0 stevel uint_t ppa; 2989 0 stevel uint_t ill_ppa; 2990 0 stevel 2991 0 stevel ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 2992 0 stevel 2993 0 stevel ppa = *((uint_t *)ppa_ptr); 2994 0 stevel ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 2995 0 stevel /* 2996 0 stevel * We want the ill with the lowest ppa to be on the 2997 0 stevel * top. 2998 0 stevel */ 2999 0 stevel if (ill_ppa < ppa) 3000 0 stevel return (1); 3001 0 stevel if (ill_ppa > ppa) 3002 0 stevel return (-1); 3003 0 stevel return (0); 3004 0 stevel } 3005 0 stevel 3006 0 stevel /* 3007 0 stevel * remove an interface type from the global list. 3008 0 stevel */ 3009 0 stevel static void 3010 0 stevel ill_delete_interface_type(ill_if_t *interface) 3011 0 stevel { 3012 0 stevel ASSERT(interface != NULL); 3013 0 stevel ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 3014 0 stevel 3015 0 stevel avl_destroy(&interface->