Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  *
     21  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     22  * Use is subject to license terms.
     23  */
     24 
     25 #include <inet/ip.h>
     26 #include <inet/ip6.h>
     27 #include <inet/ip_if.h>
     28 #include <inet/ip_ire.h>
     29 #include <inet/ip_multi.h>
     30 #include <inet/ip_ndp.h>
     31 #include <inet/ip_rts.h>
     32 #include <inet/mi.h>
     33 #include <net/if_types.h>
     34 #include <sys/dlpi.h>
     35 #include <sys/kmem.h>
     36 #include <sys/modhash.h>
     37 #include <sys/sdt.h>
     38 #include <sys/strsun.h>
     39 #include <sys/sunddi.h>
     40 #include <sys/types.h>
     41 
/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.  IPMP_GRP_TO_IPST() goes through the group's
 * cached phyint (gr_phyint), while IPMP_ILLGRP_TO_IPST() goes through the
 * illgrp's IPMP meta-interface ill (ig_ipmp_ill).  Neither macro checks for
 * NULL, so the pointer it follows must already be set when it is used.
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 * IPMP_GRP_HASH_SIZE is the size handed to mod_hash_create_extended() when
 * ipmp_init() creates the per-stack group hash.
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
     54 
     55 
/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 *
 * This table is the template that ipmp_grp_create_kstats() copies into the
 * data area of each group's kstat.  ipmp_grp_update_kstats() consults the
 * data_type of each entry to decide whether to update the 32-bit or the
 * 64-bit value, and addresses the "link_up" entry as IPMP_KSTAT_LINK_UP.
 * NOTE(review): the entry order presumably has to match the IPMP_KSTAT_*
 * indices, which are defined outside this file -- confirm before reordering
 * or adding entries.
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};
     76 
/*
 * Forward declarations of file-local (static) helpers, so that they can be
 * called ahead of their definitions.
 */
static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
     95 
     96 /*
     97  * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
     98  */
     99 void
    100 ipmp_init(ip_stack_t *ipst)
    101 {
    102 	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
    103 	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
    104 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
    105 	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
    106 }
    107 
    108 /*
    109  * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
    110  */
    111 void
    112 ipmp_destroy(ip_stack_t *ipst)
    113 {
    114 	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
    115 	rw_destroy(&ipst->ips_ipmp_lock);
    116 }
    117 
    118 /*
    119  * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
    120  * and add it to the hash.  On success, return a pointer to the created group.
    121  * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
    122  * meta-interface associated with the group also has the same name (but they
    123  * may differ later via ipmp_grp_rename()).
    124  */
    125 ipmp_grp_t *
    126 ipmp_grp_create(const char *grname, phyint_t *phyi)
    127 {
    128 	ipmp_grp_t *grp;
    129 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
    130 	mod_hash_hndl_t mh;
    131 
    132 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
    133 
    134 	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
    135 		return (NULL);
    136 
    137 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
    138 	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
    139 
    140 	/*
    141 	 * Cache the group's phyint.  This is safe since a phyint_t will
    142 	 * outlive its ipmp_grp_t.
    143 	 */
    144 	grp->gr_phyint = phyi;
    145 
    146 	/*
    147 	 * Create IPMP group kstats.
    148 	 */
    149 	if (ipmp_grp_create_kstats(grp) != 0) {
    150 		kmem_free(grp, sizeof (ipmp_grp_t));
    151 		return (NULL);
    152 	}
    153 
    154 	/*
    155 	 * Insert the group into the hash.
    156 	 */
    157 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
    158 		ipmp_grp_destroy_kstats(grp);
    159 		kmem_free(grp, sizeof (ipmp_grp_t));
    160 		return (NULL);
    161 	}
    162 	ipmp_grp_insert(grp, mh);
    163 
    164 	return (grp);
    165 }
    166 
    167 /*
    168  * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
    169  */
    170 static int
    171 ipmp_grp_create_kstats(ipmp_grp_t *grp)
    172 {
    173 	kstat_t *ksp;
    174 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
    175 
    176 	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
    177 	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
    178 	if (ksp == NULL)
    179 		return (ENOMEM);
    180 
    181 	ksp->ks_update = ipmp_grp_update_kstats;
    182 	ksp->ks_private = grp;
    183 	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
    184 
    185 	kstat_install(ksp);
    186 	grp->gr_ksp = ksp;
    187 	return (0);
    188 }
    189 
    190 /*
    191  * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
    192  */
    193 static int
    194 ipmp_grp_update_kstats(kstat_t *ksp, int rw)
    195 {
    196 	uint_t		i;
    197 	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
    198 	ipmp_grp_t	*grp = ksp->ks_private;
    199 	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
    200 	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
    201 	phyint_t	*phyi;
    202 	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];
    203 
    204 	if (rw == KSTAT_WRITE)
    205 		return (EACCES);
    206 
    207 	/*
    208 	 * Start with the group's baseline values.
    209 	 */
    210 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
    211 		if (kn[i].data_type == KSTAT_DATA_UINT32) {
    212 			kn[i].value.ui32 = grp->gr_kstats0[i];
    213 		} else {
    214 			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
    215 			kn[i].value.ui64 = grp->gr_kstats0[i];
    216 		}
    217 	}
    218 
    219 	/*
    220 	 * Add in the stats of each phyint currently in the group.  Since we
    221 	 * don't directly track the phyints in a group, we cheat by walking
    222 	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
    223 	 * ill_g_lock is held.)
    224 	 */
    225 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    226 	ipsq = grp_ipsq->ipsq_next;
    227 	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
    228 		phyi = ipsq->ipsq_phyint;
    229 
    230 		/*
    231 		 * If a phyint in a group is being unplumbed, it's possible
    232 		 * that ill_glist_delete() -> phyint_free() already freed the
    233 		 * phyint (and set ipsq_phyint to NULL), but the unplumb
    234 		 * operation has yet to complete (and thus ipsq_dq() has yet
    235 		 * to remove the phyint's IPSQ from the group IPSQ's phyint
    236 		 * list).  We skip those phyints here (note that their kstats
    237 		 * have already been added to gr_kstats0[]).
    238 		 */
    239 		if (phyi == NULL)
    240 			continue;
    241 
    242 		ipmp_phyint_get_kstats(phyi, phyi_kstats);
    243 
    244 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
    245 			phyi_kstats[i] -= phyi->phyint_kstats0[i];
    246 			if (kn[i].data_type == KSTAT_DATA_UINT32)
    247 				kn[i].value.ui32 += phyi_kstats[i];
    248 			else
    249 				kn[i].value.ui64 += phyi_kstats[i];
    250 		}
    251 	}
    252 
    253 	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
    254 	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
    255 
    256 	rw_exit(&ipst->ips_ill_g_lock);
    257 	return (0);
    258 }
    259 
    260 /*
    261  * Destroy IPMP kstat structures for `grp'.
    262  */
    263 static void
    264 ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
    265 {
    266 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
    267 
    268 	kstat_delete_netstack(grp->gr_ksp, id);
    269 	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
    270 	grp->gr_ksp = NULL;
    271 }
    272 
    273 /*
    274  * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
    275  * does not exist.
    276  */
    277 ipmp_grp_t *
    278 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
    279 {
    280 	ipmp_grp_t *grp;
    281 
    282 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
    283 
    284 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
    285 	    (mod_hash_val_t *)&grp) == 0)
    286 		return (grp);
    287 
    288 	return (NULL);
    289 }
    290 
    291 /*
    292  * Place information about group `grp' into `lifgr'.
    293  */
    294 void
    295 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
    296 {
    297 	ill_t *ill;
    298 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    299 
    300 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
    301 
    302 	lifgr->gi_v4 = (grp->gr_v4 != NULL);
    303 	lifgr->gi_v6 = (grp->gr_v6 != NULL);
    304 	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
    305 	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
    306 	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
    307 	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
    308 	lifgr->gi_m4ifname[0] = '\0';
    309 	lifgr->gi_m6ifname[0] = '\0';
    310 	lifgr->gi_bcifname[0] = '\0';
    311 
    312 	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
    313 		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
    314 		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
    315 	}
    316 
    317 	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
    318 		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
    319 }
    320 
    321 /*
    322  * Insert `grp' into the hash using the reserved hash entry `mh'.
    323  * Caller must ensure `grp' is not yet in the hash.
    324  */
    325 static void
    326 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
    327 {
    328 	int err;
    329 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    330 
    331 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
    332 
    333 	/*
    334 	 * Since grp->gr_name will exist at least as long as `grp' is in the
    335 	 * hash, we use it directly as the key.
    336 	 */
    337 	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
    338 	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
    339 	if (err != 0) {
    340 		/*
    341 		 * This should never happen since `mh' was preallocated.
    342 		 */
    343 		panic("cannot insert IPMP group \"%s\" (err %d)",
    344 		    grp->gr_name, err);
    345 	}
    346 }
    347 
    348 /*
    349  * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
    350  */
    351 static void
    352 ipmp_grp_remove(ipmp_grp_t *grp)
    353 {
    354 	int err;
    355 	mod_hash_val_t val;
    356 	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
    357 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    358 
    359 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
    360 
    361 	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
    362 	if (err != 0 || val != grp) {
    363 		panic("cannot remove IPMP group \"%s\" (err %d)",
    364 		    grp->gr_name, err);
    365 	}
    366 }
    367 
    368 /*
    369  * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
    370  * group name already exists or is invalid, or if there isn't enough memory.
    371  */
    372 int
    373 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
    374 {
    375 	mod_hash_hndl_t mh;
    376 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    377 
    378 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
    379 
    380 	if (grname[0] == '\0')
    381 		return (EINVAL);
    382 
    383 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
    384 	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
    385 		return (EEXIST);
    386 
    387 	/*
    388 	 * Before we remove the group from the hash, ensure we'll be able to
    389 	 * re-insert it by reserving space.
    390 	 */
    391 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
    392 		return (ENOMEM);
    393 
    394 	ipmp_grp_remove(grp);
    395 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
    396 	ipmp_grp_insert(grp, mh);
    397 
    398 	return (0);
    399 }
    400 
    401 /*
    402  * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
    403  * the hash, and that there are no interfaces on it.
    404  */
    405 void
    406 ipmp_grp_destroy(ipmp_grp_t *grp)
    407 {
    408 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    409 
    410 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
    411 
    412 	/*
    413 	 * If there are still interfaces using this group, panic before things
    414 	 * go really off the rails.
    415 	 */
    416 	if (grp->gr_nif != 0)
    417 		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
    418 
    419 	ipmp_grp_remove(grp);
    420 	ipmp_grp_destroy_kstats(grp);
    421 
    422 	ASSERT(grp->gr_v4 == NULL);
    423 	ASSERT(grp->gr_v6 == NULL);
    424 	ASSERT(grp->gr_nv4 == 0);
    425 	ASSERT(grp->gr_nv6 == 0);
    426 	ASSERT(grp->gr_nactif == 0);
    427 	ASSERT(grp->gr_linkdownmp == NULL);
    428 	grp->gr_phyint = NULL;
    429 
    430 	kmem_free(grp, sizeof (ipmp_grp_t));
    431 }
    432 
    433 /*
    434  * Check whether `ill' is suitable for inclusion into `grp', and return an
    435  * errno describing the problem (if any).  NOTE: many of these errno values
    436  * are interpreted by ifconfig, which will take corrective action and retry
    437  * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
    438  */
    439 static int
    440 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
    441 {
    442 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    443 
    444 	ASSERT(IAM_WRITER_ILL(ill));
    445 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
    446 
    447 	/*
    448 	 * To sidestep complicated address migration logic in the kernel and
    449 	 * to force the kernel's all-hosts multicast memberships to be blown
    450 	 * away, all addresses that had been brought up must be brought back
    451 	 * down prior to adding an interface to a group.  (This includes
    452 	 * addresses currently down due to DAD.)  Once the interface has been
    453 	 * added to the group, its addresses can then be brought back up, at
    454 	 * which point they will be moved to the IPMP meta-interface.
    455 	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
    456 	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
    457 	 */
    458 	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
    459 		return (EADDRINUSE);
    460 
    461 	/*
    462 	 * To avoid confusing applications by changing addresses that are
    463 	 * under their control, all such control must be removed prior to
    464 	 * adding an interface into a group.
    465 	 */
    466 	if (ill_appaddr_cnt(ill) != 0)
    467 		return (EADDRNOTAVAIL);
    468 
    469 	/*
    470 	 * Since PTP addresses do not share the same broadcast domain, they
    471 	 * are not allowed to be in an IPMP group.
    472 	 */
    473 	if (ill_ptpaddr_cnt(ill) != 0)
    474 		return (EINVAL);
    475 
    476 	/*
    477 	 * An ill must support multicast to be allowed into a group.
    478 	 */
    479 	if (!(ill->ill_flags & ILLF_MULTICAST))
    480 		return (ENOTSUP);
    481 
    482 	/*
    483 	 * An ill must strictly be using ARP and/or ND for address
    484 	 * resolution for it to be allowed into a group.
    485 	 */
    486 	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
    487 		return (ENOTSUP);
    488 
    489 	/*
    490 	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
    491 	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
    492 	 * all its modifications as writer.)
    493 	 */
    494 	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
    495 		return (ENOTSUP);
    496 
    497 	/*
    498 	 * All ills in a group must be the same mactype.
    499 	 */
    500 	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
    501 		return (EINVAL);
    502 
    503 	return (0);
    504 }
    505 
    506 /*
    507  * Check whether `phyi' is suitable for inclusion into `grp', and return an
    508  * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
    509  * regarding errno values.
    510  */
    511 int
    512 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
    513 {
    514 	int err = 0;
    515 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
    516 
    517 	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
    518 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
    519 
    520 	/*
    521 	 * An interface cannot have address families plumbed that are not
    522 	 * configured in the group.
    523 	 */
    524 	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
    525 	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
    526 		return (EAFNOSUPPORT);
    527 
    528 	if (phyi->phyint_illv4 != NULL)
    529 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
    530 	if (err == 0 && phyi->phyint_illv6 != NULL)
    531 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
    532 
    533 	return (err);
    534 }
    535 
    536 /*
    537  * Create a new illgrp on IPMP meta-interface `ill'.
    538  */
    539 ipmp_illgrp_t *
    540 ipmp_illgrp_create(ill_t *ill)
    541 {
    542 	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
    543 	ipmp_illgrp_t *illg;
    544 
    545 	ASSERT(IAM_WRITER_ILL(ill));
    546 	ASSERT(IS_IPMP(ill));
    547 	ASSERT(ill->ill_grp == NULL);
    548 
    549 	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
    550 		return (NULL);
    551 
    552 	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
    553 	list_create(&illg->ig_actif, sizeof (ill_t),
    554 	    offsetof(ill_t, ill_actnode));
    555 	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
    556 	    offsetof(ipmp_arpent_t, ia_node));
    557 
    558 	illg->ig_ipmp_ill = ill;
    559 	ill->ill_grp = illg;
    560 	ipmp_illgrp_set_mtu(illg, mtu);
    561 
    562 	return (illg);
    563 }
    564 
    565 /*
    566  * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
    567  */
    568 void
    569 ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
    570 {
    571 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    572 	ASSERT(IS_IPMP(illg->ig_ipmp_ill));
    573 
    574 	/*
    575 	 * Verify `illg' is empty.
    576 	 */
    577 	ASSERT(illg->ig_next_ill == NULL);
    578 	ASSERT(illg->ig_cast_ill == NULL);
    579 	ASSERT(list_is_empty(&illg->ig_arpent));
    580 	ASSERT(list_is_empty(&illg->ig_if));
    581 	ASSERT(list_is_empty(&illg->ig_actif));
    582 	ASSERT(illg->ig_nactif == 0);
    583 
    584 	/*
    585 	 * Destroy `illg'.
    586 	 */
    587 	illg->ig_ipmp_ill->ill_grp = NULL;
    588 	illg->ig_ipmp_ill = NULL;
    589 	list_destroy(&illg->ig_if);
    590 	list_destroy(&illg->ig_actif);
    591 	list_destroy(&illg->ig_arpent);
    592 	kmem_free(illg, sizeof (ipmp_illgrp_t));
    593 }
    594 
    595 /*
    596  * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
    597  * bind it to an underlying ill, while keeping an even address distribution.
    598  * If the bind is successful, return a pointer to the bound ill.
    599  */
    600 ill_t *
    601 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
    602 {
    603 	ill_t *minill;
    604 	ipmp_arpent_t *entp;
    605 
    606 	ASSERT(IAM_WRITER_IPIF(ipif));
    607 	ASSERT(ipmp_ipif_is_dataaddr(ipif));
    608 
    609 	/*
    610 	 * IPMP data address mappings are internally managed by IP itself, so
    611 	 * delete any existing ARP entries associated with the address.
    612 	 */
    613 	if (!ipif->ipif_isv6) {
    614 		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
    615 		if (entp != NULL)
    616 			ipmp_illgrp_destroy_arpent(illg, entp);
    617 	}
    618 
    619 	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
    620 		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
    621 
    622 	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
    623 }
    624 
    625 /*
    626  * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
    627  * bound, unbind it from the underlying ill while keeping an even address
    628  * distribution.
    629  */
    630 void
    631 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
    632 {
    633 	ill_t *maxill, *boundill = ipif->ipif_bound_ill;
    634 
    635 	ASSERT(IAM_WRITER_IPIF(ipif));
    636 
    637 	if (boundill != NULL) {
    638 		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
    639 
    640 		maxill = ipmp_illgrp_max_ill(illg);
    641 		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
    642 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
    643 			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
    644 		}
    645 	}
    646 }
    647 
    648 /*
    649  * Return the active ill with the greatest number of data addresses in `illg'.
    650  */
    651 static ill_t *
    652 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
    653 {
    654 	ill_t *ill, *bestill = NULL;
    655 
    656 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    657 
    658 	ill = list_head(&illg->ig_actif);
    659 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
    660 		if (bestill == NULL ||
    661 		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
    662 			bestill = ill;
    663 		}
    664 	}
    665 	return (bestill);
    666 }
    667 
    668 /*
    669  * Return the active ill with the fewest number of data addresses in `illg'.
    670  */
    671 static ill_t *
    672 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
    673 {
    674 	ill_t *ill, *bestill = NULL;
    675 
    676 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    677 
    678 	ill = list_head(&illg->ig_actif);
    679 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
    680 		if (bestill == NULL ||
    681 		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
    682 			if (ill->ill_bound_cnt == 0)
    683 				return (ill);	 /* can't get better */
    684 			bestill = ill;
    685 		}
    686 	}
    687 	return (bestill);
    688 }
    689 
    690 /*
    691  * Return a pointer to IPMP meta-interface for `illg' (which must exist).
    692  * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
    693  */
    694 ill_t *
    695 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
    696 {
    697 	return (illg->ig_ipmp_ill);
    698 }
    699 
    700 /*
    701  * Return a pointer to the next available underlying ill in `illg', or NULL if
    702  * one doesn't exist.  Caller must be inside the IPSQ.
    703  */
    704 ill_t *
    705 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
    706 {
    707 	ill_t *ill;
    708 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
    709 
    710 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    711 
    712 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
    713 	if ((ill = illg->ig_next_ill) != NULL) {
    714 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
    715 		if (illg->ig_next_ill == NULL)
    716 			illg->ig_next_ill = list_head(&illg->ig_actif);
    717 	}
    718 	rw_exit(&ipst->ips_ipmp_lock);
    719 
    720 	return (ill);
    721 }
    722 
    723 /*
    724  * Return a held pointer to the next available underlying ill in `illg', or
    725  * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
    726  */
    727 ill_t *
    728 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
    729 {
    730 	ill_t *ill;
    731 	uint_t i;
    732 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
    733 
    734 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
    735 	for (i = 0; i < illg->ig_nactif; i++) {
    736 		ill = illg->ig_next_ill;
    737 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
    738 		if (illg->ig_next_ill == NULL)
    739 			illg->ig_next_ill = list_head(&illg->ig_actif);
    740 
    741 		if (ill_check_and_refhold(ill)) {
    742 			rw_exit(&ipst->ips_ipmp_lock);
    743 			return (ill);
    744 		}
    745 	}
    746 	rw_exit(&ipst->ips_ipmp_lock);
    747 
    748 	return (NULL);
    749 }
    750 
    751 /*
    752  * Return a held pointer to the nominated multicast ill in `illg', or NULL if
    753  * one doesn't exist.  Caller need not be inside the IPSQ.
    754  */
    755 ill_t *
    756 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
    757 {
    758 	ill_t *castill;
    759 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
    760 
    761 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
    762 	castill = illg->ig_cast_ill;
    763 	if (castill != NULL && ill_check_and_refhold(castill)) {
    764 		rw_exit(&ipst->ips_ipmp_lock);
    765 		return (castill);
    766 	}
    767 	rw_exit(&ipst->ips_ipmp_lock);
    768 	return (NULL);
    769 }
    770 
    771 /*
    772  * Callback routine for ncec_walk() that deletes `nce' if it is associated with
    773  * the `(ill_t *)arg' and it is not one of the local addresses.  Caller must be
    774  * inside the IPSQ.
    775  */
    776 static void
    777 ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg)
    778 {
    779 	if ((ncec != NULL) && !NCE_MYADDR(ncec) &&
    780 	    ncec->ncec_ill == (ill_t *)arg) {
    781 		ncec_delete(ncec);
    782 	}
    783 }
    784 
    785 /*
    786  * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
    787  * any existing nomination is removed.  Caller must be inside the IPSQ.
    788  */
    789 static void
    790 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
    791 {
    792 	ill_t *ocastill = illg->ig_cast_ill;
    793 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
    794 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
    795 
    796 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
    797 
    798 	/*
    799 	 * Disable old nominated ill (if any).
    800 	 */
    801 	if (ocastill != NULL) {
    802 		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
    803 		    illg, ill_t *, ocastill);
    804 		ASSERT(ocastill->ill_nom_cast);
    805 		ocastill->ill_nom_cast = B_FALSE;
    806 		/*
    807 		 * If the IPMP meta-interface is down, we never did the join,
    808 		 * so we must not try to leave.
    809 		 */
    810 		if (ipmp_ill->ill_dl_up)
    811 			ill_leave_multicast(ipmp_ill);
    812 
    813 		/*
    814 		 * Delete any NCEs tied to the old nomination.  We must do this
    815 		 * last since ill_leave_multicast() may trigger IREs to be
    816 		 * built using ig_cast_ill.
    817 		 */
    818 		ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
    819 		    ocastill->ill_ipst);
    820 	}
    821 
    822 	/*
    823 	 * Set new nomination.
    824 	 */
    825 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
    826 	illg->ig_cast_ill = castill;
    827 	rw_exit(&ipst->ips_ipmp_lock);
    828 
    829 	/*
    830 	 * Enable new nominated ill (if any).
    831 	 */
    832 	if (castill != NULL) {
    833 		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
    834 		    illg, ill_t *, castill);
    835 		ASSERT(!castill->ill_nom_cast);
    836 		castill->ill_nom_cast = B_TRUE;
    837 		/*
    838 		 * If the IPMP meta-interface is down, the attempt to recover
    839 		 * will silently fail but ill_need_recover_multicast will be
    840 		 * erroneously cleared -- so check first.
    841 		 */
    842 		if (ipmp_ill->ill_dl_up)
    843 			ill_recover_multicast(ipmp_ill);
    844 	}
    845 }
    846 
    847 /*
    848  * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
    849  * entry for the same IP address already exists, destroy it first.  Return the
    850  * created IPMP ARP entry, or NULL on failure.
    851  */
    852 ipmp_arpent_t *
    853 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
    854     ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
    855 {
    856 	ipmp_arpent_t *entp, *oentp;
    857 
    858 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    859 
    860 	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
    861 	    KM_NOSLEEP)) == NULL)
    862 		return (NULL);
    863 
    864 	/*
    865 	 * Delete any existing ARP entry for this address.
    866 	 */
    867 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
    868 		ipmp_illgrp_destroy_arpent(illg, oentp);
    869 
    870 	/*
    871 	 * Prepend the new entry.
    872 	 */
    873 	entp->ia_ipaddr = ipaddr;
    874 	entp->ia_flags = flags;
    875 	entp->ia_lladdr_len = lladdr_len;
    876 	entp->ia_lladdr = (uchar_t *)&entp[1];
    877 	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
    878 	entp->ia_proxyarp = proxyarp;
    879 	entp->ia_notified = B_TRUE;
    880 	list_insert_head(&illg->ig_arpent, entp);
    881 	return (entp);
    882 }
    883 
    884 /*
    885  * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
    886  */
    887 void
    888 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
    889 {
    890 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    891 
    892 	list_remove(&illg->ig_arpent, entp);
    893 	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
    894 }
    895 
    896 /*
    897  * Mark that ARP has been notified about the IP address on `entp'; `illg' is
    898  * taken as a debugging aid for DTrace FBT probes.
    899  */
    900 /* ARGSUSED */
    901 void
    902 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
    903 {
    904 	entp->ia_notified = B_TRUE;
    905 }
    906 
    907 /*
    908  * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
    909  * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
    910  */
    911 ipmp_arpent_t *
    912 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
    913 {
    914 	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
    915 
    916 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
    917 
    918 	if (addrp == NULL)
    919 		return (entp);
    920 
    921 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
    922 		if (entp->ia_ipaddr == *addrp)
    923 			break;
    924 	return (entp);
    925 }
    926 
/*
 * Refresh ARP entries on `illg' to be distributed across its active
 * interfaces.  Entries that cannot be refreshed (e.g., because there are no
 * active interfaces) are marked so that subsequent calls can try again.
 *
 * Proxy ARP entries are assigned the hardware address of an active ill; the
 * ill used advances through ig_actif (wrapping back to the head) each time a
 * proxy ARP entry is successfully updated.  IPv4 only; caller must be writer
 * on the IPMP ill's IPSQ.
 */
void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
	ipmp_arpent_t *entp;
	ncec_t *ncec;
	nce_t  *nce;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));
	ASSERT(!ipmp_ill->ill_isv6);

	ill = list_head(&illg->ig_actif);
	entp = list_head(&illg->ig_arpent);
	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
		/*
		 * With no active ill, or no up addresses on the IPMP ill,
		 * nothing can be refreshed now: clear ia_notified so a later
		 * call retries this entry.
		 */
		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
			entp->ia_notified = B_FALSE;
			continue;
		}

		ASSERT(paddrlen == ill->ill_phys_addr_length);

		/*
		 * If this is a proxy ARP entry, we can skip notifying ARP if
		 * the entry is already up-to-date.  If it has changed, we
		 * update the entry's hardware address before notifying ARP.
		 */
		if (entp->ia_proxyarp) {
			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
			    paddrlen) == 0 && entp->ia_notified)
				continue;
			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
		}

		/*
		 * The return value is deliberately ignored; success is judged
		 * solely by whether an nce was handed back through `nce'.
		 */
		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
		    &nce);
		/*
		 * Non-proxy entries (and failed lookups) are done at this
		 * point: drop any reference obtained and move on without
		 * marking the entry or advancing to the next active ill.
		 */
		if (nce == NULL || !entp->ia_proxyarp) {
			if (nce != NULL)
				nce_refrele(nce);
			continue;
		}
		/*
		 * Proxy ARP entry: push the chosen ill's hardware address
		 * into the nce under ncec_lock, then record that ARP has been
		 * notified.
		 */
		ncec = nce->nce_common;
		mutex_enter(&ncec->ncec_lock);
		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
		mutex_exit(&ncec->ncec_lock);
		nce_refrele(nce);
		ipmp_illgrp_mark_arpent(illg, entp);

		/*
		 * Move to the next active ill for the next entry, wrapping
		 * around to the head of ig_actif at the end of the list.
		 */
		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
			ill = list_head(&illg->ig_actif);
	}
}
    985 
    986 /*
    987  * Return an interface in `illg' with the specified `physaddr', or NULL if one
    988  * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
    989  */
    990 ill_t *
    991 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
    992 {
    993 	ill_t *ill;
    994 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
    995 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
    996 
    997 	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
    998 
    999 	ill = list_head(&illg->ig_if);
   1000 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
   1001 		if (ill->ill_phys_addr_length == paddrlen &&
   1002 		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
   1003 			return (ill);
   1004 	}
   1005 	return (NULL);
   1006 }
   1007 
   1008 /*
   1009  * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
   1010  * Caller must be inside the IPSQ unless this is initialization.
   1011  */
   1012 static void
   1013 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
   1014 {
   1015 	ill_t *ill = illg->ig_ipmp_ill;
   1016 	mblk_t *mp;
   1017 
   1018 	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
   1019 
   1020 	/*
   1021 	 * If allocation fails, we have bigger problems than MTU.
   1022 	 */
   1023 	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
   1024 		illg->ig_mtu = mtu;
   1025 		put(ill->ill_rq, mp);
   1026 	}
   1027 }
   1028 
   1029 /*
   1030  * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
   1031  * ill MTU if necessary.
   1032  */
   1033 void
   1034 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
   1035 {
   1036 	ill_t *ill;
   1037 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
   1038 	uint_t mtu = 0;
   1039 
   1040 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
   1041 
   1042 	/*
   1043 	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
   1044 	 * for each ill as we iterate through the list.  Any changes to the
   1045 	 * ill_mtu will also trigger an update, so even if we missed it
   1046 	 * this time around, the update will catch it.
   1047 	 */
   1048 	ill = list_head(&illg->ig_if);
   1049 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
   1050 		mutex_enter(&ill->ill_lock);
   1051 		if (mtu == 0 || ill->ill_mtu < mtu)
   1052 			mtu = ill->ill_mtu;
   1053 		mutex_exit(&ill->ill_lock);
   1054 	}
   1055 
   1056 	/*
   1057 	 * MTU must be at least the minimum MTU.
   1058 	 */
   1059 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
   1060 
   1061 	if (illg->ig_mtu != mtu)
   1062 		ipmp_illgrp_set_mtu(illg, mtu);
   1063 }
   1064 
   1065 /*
   1066  * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
   1067  * allow the same link to be established more than once.
   1068  */
   1069 void
   1070 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
   1071 {
   1072 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
   1073 
   1074 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
   1075 
   1076 	if (illg->ig_ipmp_ill->ill_isv6) {
   1077 		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
   1078 		grp->gr_v6 = illg;
   1079 	} else {
   1080 		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
   1081 		grp->gr_v4 = illg;
   1082 	}
   1083 }
   1084 
   1085 /*
   1086  * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
   1087  * cannot be unlinked (e.g., because there are still interfaces using it).
   1088  */
   1089 int
   1090 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
   1091 {
   1092 	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
   1093 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
   1094 
   1095 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
   1096 
   1097 	if (illg->ig_ipmp_ill->ill_isv6) {
   1098 		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
   1099 			return (EBUSY);
   1100 		grp->gr_v6 = NULL;
   1101 	} else {
   1102 		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
   1103 			return (EBUSY);
   1104 		grp->gr_v4 = NULL;
   1105 	}
   1106 	return (0);
   1107 }
   1108 
/*
 * Place `ill' into `illg', and rebalance the data addresses on `illg'
 * to be spread evenly across the ills now in it.  Also, adjust the IPMP
 * ill as necessary to account for `ill' (e.g., MTU).
 *
 * Caller must be writer on `ill's IPSQ.  `ill' must not be an IPMP ill, must
 * not already belong to an illgrp, and its phyint must already be associated
 * with an IPMP group.
 */
void
ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_grp == NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Account for `ill' joining the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6++;
	else
		ill->ill_phyint->phyint_grp->gr_nv4++;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipmp_ill->ill_flags & ILLF_ROUTER)
		ill->ill_flags |= ILLF_ROUTER;
	else
		ill->ill_flags &= ~ILLF_ROUTER;
	mutex_exit(&ill->ill_lock);

	/*
	 * Blow away all multicast memberships that currently exist on `ill'.
	 * This may seem odd, but it's consistent with the application view
	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
	 * The ill_grp_pending bit prevents multicast group joins after
	 * update_conn_ill() and before ill_grp assignment.
	 */
	mutex_enter(&ill->ill_mcast_serializer);
	ill->ill_grp_pending = 1;
	mutex_exit(&ill->ill_mcast_serializer);
	update_conn_ill(ill, ill->ill_ipst);
	if (ill->ill_isv6) {
		reset_mrt_ill(ill);
	} else {
		ipif = ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next) {
			reset_mrt_vif_ipif(ipif);
		}
	}
	ip_purge_allmulti(ill);

	/*
	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
	 * physical address length.  All other ills must have the same value,
	 * since they are required to all be the same mactype.  Also update
	 * the IPMP ill's MTU and CoS marking, if necessary.
	 */
	if (list_is_empty(&illg->ig_if)) {
		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
		/*
		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
		 * doesn't have a physical address.  This means that code must
		 * not assume that ill_phys_addr is non-NULL just because
		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
		 */
		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
		ipmp_ill->ill_type = ill->ill_type;

		/*
		 * First member: the IPMP ill simply inherits its CoS setting
		 * and MTU.
		 */
		if (ill->ill_flags & ILLF_COS_ENABLED) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	} else {
		ASSERT(ipmp_ill->ill_phys_addr_length ==
		    ill->ill_phys_addr_length);
		ASSERT(ipmp_ill->ill_type == ill->ill_type);

		/*
		 * Subsequent member: CoS stays enabled on the IPMP ill only
		 * if the newcomer supports it too, and the group MTU can only
		 * shrink to accommodate the newcomer.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		if (illg->ig_mtu > ill->ill_mtu)
			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	}

	/*
	 * Publish the membership: add `ill' to the illgrp's interface list
	 * and set its ill_grp back-pointer, both under ill_g_lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_insert_tail(&illg->ig_if, ill);
	ill->ill_grp = illg;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * ill_grp is now set, so the pending bit set above can be cleared.
	 */
	mutex_enter(&ill->ill_mcast_serializer);
	ill->ill_grp_pending = 0;
	mutex_exit(&ill->ill_mcast_serializer);

	/*
	 * Hide the IREs on `ill' so that we don't accidentally find them when
	 * sending data traffic.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

	/*
	 * Activate `ill' now if it qualifies; if activation cannot be
	 * performed, ipmp_ill_refresh_active() schedules a retry.
	 */
	ipmp_ill_refresh_active(ill);
}
   1224 
/*
 * Remove `ill' from its illgrp, and rebalance the data addresses in that
 * illgrp to be spread evenly across the remaining ills.  Also, adjust the
 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
 *
 * Caller must be writer on `ill's IPSQ, and `ill' must currently be under
 * IPMP (so ill_grp is non-NULL).
 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ipmp_arpent_t *entp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(illg != NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Cancel IPMP-specific ill timeouts.
	 */
	(void) untimeout(ill->ill_refresh_tid);

	/*
	 * Expose any previously-hidden IREs on `ill'.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

	/*
	 * Ensure the multicast state for each ipif on `ill' is down so that
	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
	 * all eligible groups.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_down(ipif);

	/*
	 * Account for `ill' leaving the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6--;
	else
		ill->ill_phyint->phyint_grp->gr_nv4--;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Pull `ill' out of the interface lists.  If it is on the active
	 * list it is deactivated first, while ill_grp is still set.
	 */
	if (list_link_active(&ill->ill_actnode))
		ipmp_ill_deactivate(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_remove(&illg->ig_if, ill);
	ill->ill_grp = NULL;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Re-establish multicast memberships that were previously being
	 * handled by the IPMP meta-interface.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_up(ipif);

	/*
	 * Refresh the group MTU based on the new interface list.
	 */
	ipmp_illgrp_refresh_mtu(illg);

	if (list_is_empty(&illg->ig_if)) {
		/*
		 * No ills left in the illgrp; we no longer have a physical
		 * address length, nor can we support ARP, CoS, or anything
		 * else that depends on knowing the link layer type.
		 */
		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);

		ipmp_ill->ill_phys_addr_length = 0;
		ipmp_ill->ill_nd_lla_len = 0;
		ipmp_ill->ill_type = IFT_OTHER;
		mutex_enter(&ipmp_ill->ill_lock);
		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
		mutex_exit(&ipmp_ill->ill_lock);
	} else {
		/*
		 * If `ill' didn't support CoS, see if it can now be enabled.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

			/*
			 * From here on `ill' is reused as a cursor over the
			 * remaining members and no longer names the departed
			 * interface.  ig_if is non-empty in this branch, so
			 * list_head() returns a valid ill.  The loop stops at
			 * the first member lacking CoS; reaching the end
			 * (ill == NULL) means every member supports it.
			 */
			ill = list_head(&illg->ig_if);
			do {
				if (!(ill->ill_flags & ILLF_COS_ENABLED))
					break;
			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);

			if (ill == NULL) {
				mutex_enter(&ipmp_ill->ill_lock);
				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
				mutex_exit(&ipmp_ill->ill_lock);
			}
		}
	}
}
   1333 
   1334 /*
   1335  * Check if `ill' should be active, and activate or deactivate if need be.
   1336  * Return B_FALSE if a refresh was necessary but could not be performed.
   1337  */
   1338 static boolean_t
   1339 ipmp_ill_try_refresh_active(ill_t *ill)
   1340 {
   1341 	boolean_t refreshed = B_TRUE;
   1342 
   1343 	ASSERT(IAM_WRITER_ILL(ill));
   1344 	ASSERT(IS_UNDER_IPMP(ill));
   1345 
   1346 	if (ipmp_ill_is_active(ill)) {
   1347 		if (!list_link_active(&ill->ill_actnode))
   1348 			refreshed = ipmp_ill_activate(ill);
   1349 	} else {
   1350 		if (list_link_active(&ill->ill_actnode))
   1351 			ipmp_ill_deactivate(ill);
   1352 	}
   1353 
   1354 	return (refreshed);
   1355 }
   1356 
   1357 /*
   1358  * Check if `ill' should be active, and activate or deactivate if need be.
   1359  * If the refresh fails, schedule a timer to try again later.
   1360  */
   1361 void
   1362 ipmp_ill_refresh_active(ill_t *ill)
   1363 {
   1364 	if (!ipmp_ill_try_refresh_active(ill))
   1365 		ipmp_ill_refresh_active_timer_start(ill);
   1366 }
   1367 
   1368 /*
   1369  * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
   1370  */
   1371 static void
   1372 ipmp_ill_refresh_active_timer(void *ill_arg)
   1373 {
   1374 	ill_t *ill = ill_arg;
   1375 	boolean_t refreshed = B_FALSE;
   1376 
   1377 	/*
   1378 	 * Clear ill_refresh_tid to indicate that no timeout is pending
   1379 	 * (another thread could schedule a new timeout while we're still
   1380 	 * running, but that's harmless).  If the ill is going away, bail.
   1381 	 */
   1382 	mutex_enter(&ill->ill_lock);
   1383 	ill->ill_refresh_tid = 0;
   1384 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   1385 		mutex_exit(&ill->ill_lock);
   1386 		return;
   1387 	}
   1388 	mutex_exit(&ill->ill_lock);
   1389 
   1390 	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
   1391 		refreshed = ipmp_ill_try_refresh_active(ill);
   1392 		ipsq_exit(ill->ill_phyint->phyint_ipsq);
   1393 	}
   1394 
   1395 	/*
   1396 	 * If the refresh failed, schedule another attempt.
   1397 	 */
   1398 	if (!refreshed)
   1399 		ipmp_ill_refresh_active_timer_start(ill);
   1400 }
   1401 
   1402 /*
   1403  * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'.
   1404  */
   1405 static void
   1406 ipmp_ill_refresh_active_timer_start(ill_t *ill)
   1407 {
   1408 	mutex_enter(&ill->ill_lock);
   1409 
   1410 	/*
   1411 	 * If the ill is going away or a refresh is already scheduled, bail.
   1412 	 */
   1413 	if (ill->ill_refresh_tid != 0 ||
   1414 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
   1415 		mutex_exit(&ill->ill_lock);
   1416 		return;
   1417 	}
   1418 
   1419 	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
   1420 	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
   1421 
   1422 	mutex_exit(&ill->ill_lock);
   1423 }
   1424 
/*
 * Activate `ill' so it will be used to send and receive data traffic.  Return
 * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
 * needed to deactivate `ill' here as well so that deactivation cannot fail.
 *
 * The only failure path visible here is failing to allocate the link-up or
 * link-down notification when `ill' would be the group's first active
 * interface; in that case nothing has been changed yet.  Caller must be
 * writer on `ill's IPSQ.
 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
	ipif_t		*ipif;
	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t	*illg = ill->ill_grp;
	ill_t		*maxill;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If this will be the first active interface in the group, allocate
	 * the link-up and link-down messages.
	 */
	if (grp->gr_nactif == 0) {
		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
		if (linkupmp == NULL || linkdownmp == NULL)
			goto fail;
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * Now that we have an active ill, nominate it for multicast
		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
		 * since that may need to send multicast packets (e.g., IPv6
		 * neighbor discovery probes).
		 */
		ipmp_illgrp_set_cast(illg, ill);

		/*
		 * This is the first active ill in the illgrp -- add 'em all.
		 * We can access/walk ig_ipmp_ill's ipif list since we're
		 * writer on its IPSQ as well.
		 */
		ipif = illg->ig_ipmp_ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next)
			if (ipmp_ipif_is_up_dataaddr(ipif))
				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
	} else {
		/*
		 * Redistribute the addresses by moving them from the ill with
		 * the most addresses until the ill being activated is at the
		 * same level as the rest of the ills.
		 */
		for (;;) {
			maxill = ipmp_illgrp_max_ill(illg);
			ASSERT(maxill != NULL);
			/*
			 * Stop once `ill' is within one address of the
			 * most-loaded ill; moving more would just shift the
			 * imbalance.
			 */
			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
				break;
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
		}
	}

	/*
	 * Put the interface in the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_insert_tail(&illg->ig_actif, ill);
	illg->ig_nactif++;
	illg->ig_next_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Refresh static/proxy ARP entries to use `ill', if need be.
	 */
	if (!ill->ill_isv6)
		ipmp_illgrp_refresh_arpent(illg);

	/*
	 * Finally, mark the group link up, if necessary.  The post-increment
	 * both counts this ill and tests whether it is the group's first
	 * active interface.  The preallocated link-down message is stashed on
	 * the group for ipmp_ill_deactivate() to send later.
	 */
	if (grp->gr_nactif++ == 0) {
		ASSERT(grp->gr_linkdownmp == NULL);
		grp->gr_linkdownmp = linkdownmp;
		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
	}
	return (B_TRUE);
fail:
	/*
	 * Either message may be NULL here; this relies on freemsg()
	 * tolerating a NULL argument (NOTE(review): confirm).
	 */
	freemsg(linkupmp);
	freemsg(linkdownmp);
	return (B_FALSE);
}
   1517 
/*
 * Deactivate `ill' so it will not be used to send or receive data traffic.
 *
 * Addresses bound to `ill' are unbound and rebound to the remaining active
 * ills (if any), and the group is marked link-down if `ill' was its last
 * active interface.  This path has no failure return; the link-down message
 * it may need was preallocated by ipmp_ill_activate().  Caller must be writer
 * on `ill's IPSQ.
 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
	ill_t		*minill;
	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
	mblk_t		*mp;
	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t	*illg = ill->ill_grp;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Pull the interface out of the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_remove(&illg->ig_actif, ill);
	illg->ig_nactif--;
	illg->ig_next_ill = list_head(&illg->ig_actif);
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If the ill that's being deactivated had been nominated for
	 * multicast/broadcast, nominate a new one.
	 */
	if (ill == illg->ig_cast_ill)
		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

	/*
	 * Delete all nce_t entries using this ill, so that the next attempt
	 * to send data traffic will revalidate cached nce's.
	 */
	nce_flush(ill, B_TRUE);

	/*
	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
	 * we'll rebind them after we tell the resolver the ill is no longer
	 * active.  We must do things in this order or the resolver could
	 * accidentally rebind to the ill we're trying to remove if multiple
	 * ills in the group have the same hardware address (which is
	 * unsupported, but shouldn't lead to a wedged machine).
	 *
	 * The saved list is threaded through ipif_bound_next, which
	 * ipmp_ill_unbind_ipif() leaves NULL on the ipif it returns.
	 */
	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
		ipif->ipif_bound_next = ubheadipif;
		ubheadipif = ipif;
	}
	if (!ill->ill_isv6) {

		/*
		 * Refresh static/proxy ARP entries that had been using `ill'.
		 */
		ipmp_illgrp_refresh_arpent(illg);
	}

	/*
	 * Rebind each ipif from the deactivated ill to the active ill with
	 * the fewest ipifs.  If there are no active ills, the ipifs will
	 * remain unbound.
	 */
	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
		/*
		 * Save the successor and clear the link first:
		 * ipmp_ill_bind_ipif() asserts ipif_bound_next is NULL.
		 */
		ubnextipif = ipif->ipif_bound_next;
		ipif->ipif_bound_next = NULL;

		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
	}

	/*
	 * With no active ills left in the illgrp, walk the IPMP ill's ncec
	 * entries with ncec_delete_per_ill.
	 */
	if (list_is_empty(&illg->ig_actif)) {
		ill_t *ipmp_ill = illg->ig_ipmp_ill;

		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
		    (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
	}

	/*
	 * Remove any IRE_IF_CLONE for this ill since they might have
	 * an ire_nce_cache/nce_common which refers to another ill in the group.
	 */
	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
	    ill, ill);

	/*
	 * Finally, mark the group link down, if necessary.  The message sent
	 * is the one stashed on the group by ipmp_ill_activate().
	 */
	if (--grp->gr_nactif == 0) {
		mp = grp->gr_linkdownmp;
		grp->gr_linkdownmp = NULL;
		ASSERT(mp != NULL);
		put(illg->ig_ipmp_ill->ill_rq, mp);
	}
}
   1613 
   1614 /*
   1615  * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
   1616  * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
   1617  */
   1618 static void
   1619 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
   1620 {
   1621 	ipif_t *ipif;
   1622 
   1623 	ASSERT(IAM_WRITER_ILL(ill));
   1624 	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
   1625 
   1626 	/*
   1627 	 * If `ill' is truly down, there are no messages to generate since:
   1628 	 *
   1629 	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
   1630 	 *    and its addresses by bringing them down.  But that's already
   1631 	 *    true, so there's nothing to hide.
   1632 	 *
   1633 	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
   1634 	 *    indicating that any previously-hidden up addresses are again
   1635 	 *    back up (along with the interface).  But they aren't, so
   1636 	 *    there's nothing to expose.
   1637 	 */
   1638 	if (ill->ill_ipif_up_count == 0)
   1639 		return;
   1640 
   1641 	if (cmd == RTM_ADD)
   1642 		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
   1643 
   1644 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
   1645 		if (ipif->ipif_flags & IPIF_UP)
   1646 			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
   1647 
   1648 	if (cmd == RTM_DELETE)
   1649 		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
   1650 }
   1651 
   1652 /*
   1653  * Bind the address named by `ipif' to the underlying ill named by `ill'.
   1654  * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
   1655  * will indicate to the resolver whether this is an initial bringup of
   1656  * `ipif', or just a rebind to another ill.
   1657  */
   1658 static void
   1659 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
   1660 {
   1661 	int err = 0;
   1662 	ip_stack_t *ipst = ill->ill_ipst;
   1663 
   1664 	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
   1665 	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
   1666 	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
   1667 	ASSERT(ipif->ipif_bound_ill == NULL);
   1668 	ASSERT(ipif->ipif_bound_next == NULL);
   1669 
   1670 	ipif->ipif_bound_next = ill->ill_bound_ipif;
   1671 	ill->ill_bound_ipif = ipif;
   1672 	ill->ill_bound_cnt++;
   1673 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
   1674 	ipif->ipif_bound_ill = ill;
   1675 	rw_exit(&ipst->ips_ipmp_lock);
   1676 
   1677 	/*
   1678 	 * If necessary, tell ARP/NDP about the new mapping.  Note that
   1679 	 * ipif_resolver_up() cannot fail for IPv6 ills.
   1680 	 */
   1681 	if (act != Res_act_none) {
   1682 		if (ill->ill_isv6) {
   1683 			VERIFY(ipif_resolver_up(ipif, act) == 0);
   1684 			err = ipif_ndp_up(ipif, act == Res_act_initial);
   1685 		} else {
   1686 			err = ipif_resolver_up(ipif, act);
   1687 		}
   1688 
   1689 		/*
   1690 		 * Since ipif_ndp_up() never returns EINPROGRESS and
   1691 		 * ipif_resolver_up() only returns EINPROGRESS when the
   1692 		 * associated ill is not up, we should never be here with
   1693 		 * EINPROGRESS.  We rely on this to simplify the design.
   1694 		 */
   1695 		ASSERT(err != EINPROGRESS);
   1696 	}
   1697 	/* TODO: retry binding on failure? when? */
   1698 	ipif->ipif_bound = (err == 0);
   1699 }
   1700 
   1701 /*
   1702  * Unbind the address named by `ipif' from the underlying ill named by `ill'.
   1703  * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
   1704  * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
   1705  * B_TRUE, notify the resolver about the change.
   1706  */
   1707 static ipif_t *
   1708 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
   1709 {
   1710 	ipif_t *previpif;
   1711 	ip_stack_t *ipst = ill->ill_ipst;
   1712 
   1713 	ASSERT(IAM_WRITER_ILL(ill));
   1714 	ASSERT(IS_UNDER_IPMP(ill));
   1715 
   1716 	/*
   1717 	 * If necessary, find an ipif to unbind.
   1718 	 */
   1719 	if (ipif == NULL) {
   1720 		if ((ipif = ill->ill_bound_ipif) == NULL) {
   1721 			ASSERT(ill->ill_bound_cnt == 0);
   1722 			return (NULL);
   1723 		}
   1724 	}
   1725 
   1726 	ASSERT(IAM_WRITER_IPIF(ipif));
   1727 	ASSERT(IS_IPMP(ipif->ipif_ill));
   1728 	ASSERT(ipif->ipif_bound_ill == ill);
   1729 	ASSERT(ill->ill_bound_cnt > 0);
   1730 
   1731 	/*
   1732 	 * Unbind it.
   1733 	 */
   1734 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
   1735 	ipif->ipif_bound_ill = NULL;
   1736 	rw_exit(&ipst->ips_ipmp_lock);
   1737 	ill->ill_bound_cnt--;
   1738 
   1739 	if (ill->ill_bound_ipif == ipif) {
   1740 		ill->ill_bound_ipif = ipif->ipif_bound_next;
   1741 	} else {
   1742 		previpif = ill->ill_bound_ipif;
   1743 		while (previpif->ipif_bound_next != ipif)
   1744 			previpif = previpif->ipif_bound_next;
   1745 
   1746 		previpif->ipif_bound_next = ipif->ipif_bound_next;
   1747 	}
   1748 	ipif->ipif_bound_next = NULL;
   1749 
   1750 	/*
   1751 	 * If requested, notify the resolvers (provided we're bound).
   1752 	 */
   1753 	if (notifyres && ipif->ipif_bound) {
   1754 		if (ill->ill_isv6)
   1755 			ipif_ndp_down(ipif);
   1756 		else
   1757 			(void) ipif_arp_down(ipif);
   1758 	}
   1759 	ipif->ipif_bound = B_FALSE;
   1760 
   1761 	return (ipif);
   1762 }
   1763 
   1764 /*
   1765  * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
   1766  * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
   1767  * to determine whether an ill should be considered active, other consumers
   1768  * may race and learn about an ill that should be deactivated/activated before
   1769  * IPMP has performed the activation/deactivation.  This should be safe though
   1770  * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
   1771  * would've been cleaned up by ipmp_ill_deactivate().
   1772  */
   1773 boolean_t
   1774 ipmp_ill_is_active(ill_t *ill)
   1775 {
   1776 	phyint_t *phyi = ill->ill_phyint;
   1777 
   1778 	ASSERT(IS_UNDER_IPMP(ill));
   1779 	ASSERT(IAM_WRITER_ILL(ill) ||
   1780 	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
   1781 
   1782 	/*
   1783 	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
   1784 	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
   1785 	 * link flapping logic to be just in in.mpathd and allows us to ignore
   1786 	 * changes to PHYI_RUNNING.
   1787 	 */
   1788 	return (!(ill->ill_ipif_up_count == 0 ||
   1789 	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
   1790 }
   1791 
   1792 /*
   1793  * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
   1794  * with `ill_arg'.
   1795  */
   1796 static void
   1797 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
   1798 {
   1799 	ill_t *ill = (ill_t *)ill_arg;
   1800 
   1801 	ASSERT(IAM_WRITER_ILL(ill));
   1802 	ASSERT(!IS_IPMP(ill));
   1803 
   1804 	if (ire->ire_ill != ill)
   1805 		return;
   1806 
   1807 	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
   1808 		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
   1809 		ire->ire_testhidden = B_TRUE;
   1810 	}
   1811 }
   1812 
   1813 /*
   1814  * IRE walker callback: clear ire_testhidden if the IRE has a source address
   1815  * on `ill_arg'.
   1816  */
   1817 static void
   1818 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
   1819 {
   1820 	ill_t *ill = (ill_t *)ill_arg;
   1821 
   1822 	ASSERT(IAM_WRITER_ILL(ill));
   1823 	ASSERT(!IS_IPMP(ill));
   1824 
   1825 	if (ire->ire_ill == ill) {
   1826 		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
   1827 		ire->ire_testhidden = B_FALSE;
   1828 	}
   1829 }
   1830 
   1831 /*
   1832  * Return a held pointer to the IPMP ill for underlying interface `ill', or
   1833  * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
   1834  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
   1835  * ill_grp pointer may become stale when not inside an IPSQ and not holding
   1836  * ipmp_lock.)  Caller need not be inside the IPSQ.
   1837  */
   1838 ill_t *
   1839 ipmp_ill_hold_ipmp_ill(ill_t *ill)
   1840 {
   1841 	ip_stack_t *ipst = ill->ill_ipst;
   1842 	ipmp_illgrp_t *illg;
   1843 
   1844 	ASSERT(!IS_IPMP(ill));
   1845 
   1846 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
   1847 	illg = ill->ill_grp;
   1848 	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
   1849 		rw_exit(&ipst->ips_ipmp_lock);
   1850 		return (illg->ig_ipmp_ill);
   1851 	}
   1852 	/*
   1853 	 * Assume `ill' was removed from the illgrp in the meantime.
   1854 	 */
   1855 	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
   1856 	return (NULL);
   1857 }
   1858 
   1859 /*
   1860  * Return the interface index for the IPMP ill tied to underlying interface
   1861  * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
   1862  */
   1863 uint_t
   1864 ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
   1865 {
   1866 	uint_t ifindex = 0;
   1867 	ip_stack_t *ipst = ill->ill_ipst;
   1868 	ipmp_grp_t *grp;
   1869 
   1870 	ASSERT(!IS_IPMP(ill));
   1871 
   1872 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
   1873 	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
   1874 		ifindex = grp->gr_phyint->phyint_ifindex;
   1875 	rw_exit(&ipst->ips_ipmp_lock);
   1876 	return (ifindex);
   1877 }
   1878 
   1879 /*
   1880  * Place phyint `phyi' into IPMP group `grp'.
   1881  */
   1882 void
   1883 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
   1884 {
   1885 	ill_t *ill;
   1886 	ipsq_t *ipsq = phyi->phyint_ipsq;
   1887 	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
   1888 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
   1889 
   1890 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   1891 	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
   1892 
   1893 	/*
   1894 	 * Send routing socket messages indicating that the phyint's ills
   1895 	 * and ipifs vanished.
   1896 	 */
   1897 	if (phyi->phyint_illv4 != NULL) {
   1898 		ill = phyi->phyint_illv4;
   1899 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
   1900 	}
   1901 
   1902 	if (phyi->phyint_illv6 != NULL) {
   1903 		ill = phyi->phyint_illv6;
   1904 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
   1905 	}
   1906 
   1907 	/*
   1908 	 * Snapshot the phyint's initial kstats as a baseline.
   1909 	 */
   1910 	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
   1911 
   1912 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
   1913 
   1914 	phyi->phyint_grp = grp;
   1915 	if (++grp->gr_nif == 1)
   1916 		grp->gr_mactype = ill->ill_mactype;
   1917 	else
   1918 		ASSERT(grp->gr_mactype == ill->ill_mactype);
   1919 
   1920 	/*
   1921 	 * Now that we're in the group, request a switch to the group's xop
   1922 	 * when we ipsq_exit().  All future operations will be exclusive on
   1923 	 * the group xop until ipmp_phyint_leave_grp() is called.
   1924 	 */
   1925 	ASSERT(ipsq->ipsq_swxop == NULL);
   1926 	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
   1927 	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
   1928 
   1929 	rw_exit(&ipst->ips_ipmp_lock);
   1930 }
   1931 
   1932 /*
   1933  * Remove phyint `phyi' from its current IPMP group.
   1934  */
   1935 void
   1936 ipmp_phyint_leave_grp(phyint_t *phyi)
   1937 {
   1938 	uint_t i;
   1939 	ipsq_t *ipsq = phyi->phyint_ipsq;
   1940 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
   1941 	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
   1942 
   1943 	ASSERT(IAM_WRITER_IPSQ(ipsq));
   1944 
   1945 	/*
   1946 	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
   1947 	 */
   1948 	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
   1949 		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
   1950 	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
   1951 		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
   1952 
   1953 	/*
   1954 	 * Send routing socket messages indicating that the phyint's ills
   1955 	 * and ipifs have reappeared.
   1956 	 */
   1957 	if (phyi->phyint_illv4 != NULL)
   1958 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
   1959 	if (phyi->phyint_illv6 != NULL)
   1960 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
   1961 
   1962 	/*
   1963 	 * Calculate the phyint's cumulative kstats while it was in the group,
   1964 	 * and add that to the group's baseline.
   1965 	 */
   1966 	ipmp_phyint_get_kstats(phyi, phyi_kstats);
   1967 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
   1968 		phyi_kstats[i] -= phyi->phyint_kstats0[i];
   1969 		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
   1970 	}
   1971 
   1972 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
   1973 
   1974 	phyi->phyint_grp->gr_nif--;
   1975 	phyi->phyint_grp = NULL;
   1976 
   1977 	/*
   1978 	 * As our final act in leaving the group, request a switch back to our
   1979 	 * IPSQ's own xop when we ipsq_exit().
   1980 	 */
   1981 	ASSERT(ipsq->ipsq_swxop == NULL);
   1982 	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
   1983 
   1984 	rw_exit(&ipst->ips_ipmp_lock);
   1985 }
   1986 
   1987 /*
   1988  * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
   1989  * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
   1990  */
   1991 static void
   1992 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
   1993 {
   1994 	uint_t		i, j;
   1995 	const char	*name;
   1996 	kstat_t		*ksp;
   1997 	kstat_named_t	*kn;
   1998 	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
   1999 	zoneid_t	zoneid;
   2000 
   2001 	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
   2002 	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
   2003 	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
   2004 	if (ksp == NULL)
   2005 		return;
   2006 
   2007 	KSTAT_ENTER(ksp);
   2008 
   2009 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
   2010 		/*
   2011 		 * Bring kstats up-to-date before recording.
   2012 		 */
   2013 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
   2014 
   2015 		kn = KSTAT_NAMED_PTR(ksp);
   2016 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
   2017 			name = ipmp_kstats[i].name;
   2018 			kstats[i] = 0;
   2019 			for (j = 0; j < ksp->ks_ndata; j++) {
   2020 				if (strcmp(kn[j].name, name) != 0)
   2021 					continue;
   2022 
   2023 				switch (kn[j].data_type) {
   2024 				case KSTAT_DATA_INT32:
   2025 				case KSTAT_DATA_UINT32:
   2026 					kstats[i] = kn[j].value.ui32;
   2027 					break;
   2028 #ifdef	_LP64
   2029 				case KSTAT_DATA_LONG:
   2030 				case KSTAT_DATA_ULONG:
   2031 					kstats[i] = kn[j].value.ul;
   2032 					break;
   2033 #endif
   2034 				case KSTAT_DATA_INT64:
   2035 				case KSTAT_DATA_UINT64:
   2036 					kstats[i] = kn[j].value.ui64;
   2037 					break;
   2038 				}
   2039 				break;
   2040 			}
   2041 		}
   2042 	}
   2043 
   2044 	KSTAT_EXIT(ksp);
   2045 	kstat_rele(ksp);
   2046 }
   2047 
   2048 /*
   2049  * Refresh the active state of all ills on `phyi'.
   2050  */
   2051 void
   2052 ipmp_phyint_refresh_active(phyint_t *phyi)
   2053 {
   2054 	if (phyi->phyint_illv4 != NULL)
   2055 		ipmp_ill_refresh_active(phyi->phyint_illv4);
   2056 	if (phyi->phyint_illv6 != NULL)
   2057 		ipmp_ill_refresh_active(phyi->phyint_illv6);
   2058 }
   2059 
   2060 /*
   2061  * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
   2062  * doesn't exist.  Caller need not be inside the IPSQ.
   2063  */
   2064 ill_t *
   2065 ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
   2066 {
   2067 	ill_t *boundill;
   2068 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
   2069 
   2070 	ASSERT(IS_IPMP(ipif->ipif_ill));
   2071 
   2072 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
   2073 	boundill = ipif->ipif_bound_ill;
   2074 	if (boundill != NULL && ill_check_and_refhold(boundill)) {
   2075 		rw_exit(&ipst->ips_ipmp_lock);
   2076 		return (boundill);
   2077 	}
   2078 	rw_exit(&ipst->ips_ipmp_lock);
   2079 	return (NULL);
   2080 }
   2081 
   2082 /*
   2083  * Return a pointer to the underlying ill bound to `ipif', or NULL if one
   2084  * doesn't exist.  Caller must be inside the IPSQ.
   2085  */
   2086 ill_t *
   2087 ipmp_ipif_bound_ill(const ipif_t *ipif)
   2088 {
   2089 	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
   2090 	ASSERT(IS_IPMP(ipif->ipif_ill));
   2091 
   2092 	return (ipif->ipif_bound_ill);
   2093 }
   2094 
   2095 /*
   2096  * Check if `ipif' is a "stub" (placeholder address not being used).
   2097  */
   2098 boolean_t
   2099 ipmp_ipif_is_stubaddr(const ipif_t *ipif)
   2100 {
   2101 	if (ipif->ipif_flags & IPIF_UP)
   2102 		return (B_FALSE);
   2103 	if (ipif->ipif_ill->ill_isv6)
   2104 		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
   2105 	else
   2106 		return (ipif->ipif_lcl_addr == INADDR_ANY);
   2107 }
   2108 
   2109 /*
   2110  * Check if `ipif' is an IPMP data address.
   2111  */
   2112 boolean_t
   2113 ipmp_ipif_is_dataaddr(const ipif_t *ipif)
   2114 {
   2115 	if (ipif->ipif_flags & IPIF_NOFAILOVER)
   2116 		return (B_FALSE);
   2117 	if (ipif->ipif_ill->ill_isv6)
   2118 		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
   2119 	else
   2120 		return (ipif->ipif_lcl_addr != INADDR_ANY);
   2121 }
   2122 
   2123 /*
   2124  * Check if `ipif' is an IPIF_UP IPMP data address.
   2125  */
   2126 static boolean_t
   2127 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
   2128 {
   2129 	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
   2130 }
   2131 
   2132 /*
   2133  * Check if `mp' contains a probe packet by verifying if the IP source address
   2134  * is a test address on an underlying interface `ill'. Caller need not be inside
   2135  * the IPSQ.
   2136  */
   2137 boolean_t
   2138 ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
   2139 {
   2140 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
   2141 	ipha_t *ipha = (ipha_t *)mp->b_rptr;
   2142 
   2143 	ASSERT(DB_TYPE(mp) != M_CTL);
   2144 
   2145 	if (!IS_UNDER_IPMP(ill))
   2146 		return (B_FALSE);
   2147 
   2148 	if (ill->ill_isv6) {
   2149 		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
   2150 		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
   2151 			return (B_TRUE);
   2152 	} else {
   2153 		if ((ipha->ipha_src != INADDR_ANY) &&
   2154 		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
   2155 			return (B_TRUE);
   2156 	}
   2157 	return (B_FALSE);
   2158 }
   2159 
   2160 /*
   2161  * Pick out an appropriate underlying interface for packet transmit.  This
   2162  * function may be called from the data path, so we need to verify that the
   2163  * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
   2164  * Caller need not be inside the IPSQ.
   2165  */
   2166 ill_t *
   2167 ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
   2168 {
   2169 	ill_t *xmit_ill;
   2170 	ip_stack_t *ipst = ill->ill_ipst;
   2171 
   2172 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   2173 	if (ill->ill_grp == NULL) {
   2174 		/*
   2175 		 * The interface was taken out of the group. Return ill itself,
   2176 		 * but take a ref so that callers will always be able to do
   2177 		 * ill_refrele(ill);
   2178 		 */
   2179 		rw_exit(&ipst->ips_ill_g_lock);
   2180 		ill_refhold(ill);
   2181 		return (ill);
   2182 	}
   2183 	if (!is_unicast)
   2184 		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
   2185 	else
   2186 		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
   2187 	rw_exit(&ipst->ips_ill_g_lock);
   2188 	return (xmit_ill);
   2189 }
   2190 
   2191 /*
   2192  * Flush out any nce that points at `ncec' from an underlying interface
   2193  */
   2194 void
   2195 ipmp_ncec_flush_nce(ncec_t *ncec)
   2196 {
   2197 	ill_t		*ncec_ill = ncec->ncec_ill;
   2198 	ill_t		*ill;
   2199 	ipmp_illgrp_t	*illg;
   2200 	ip_stack_t	*ipst = ncec_ill->ill_ipst;
   2201 	list_t		dead;
   2202 	nce_t		*nce;
   2203 
   2204 	if (!IS_IPMP(ncec_ill))
   2205 		return;
   2206 
   2207 	illg = ncec_ill->ill_grp;
   2208 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
   2209 
   2210 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   2211 	ill = list_head(&illg->ig_if);
   2212 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
   2213 		nce_fastpath_list_delete(ill, ncec, &dead);
   2214 	}
   2215 	rw_exit(&ipst->ips_ill_g_lock);
   2216 
   2217 	/*
   2218 	 * we may now nce_refrele() all dead entries since all locks have been
   2219 	 * dropped.
   2220 	 */
   2221 	while ((nce = list_head(&dead)) != NULL) {
   2222 		list_remove(&dead, nce);
   2223 		nce_refrele(nce);
   2224 	}
   2225 	ASSERT(list_is_empty(&dead));
   2226 	list_destroy(&dead);
   2227 }
   2228 
   2229 /*
   2230  * For each interface in the IPMP group, if there are nce_t entries for the IP
   2231  * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
   2232  * information must be updated to match the link-layer address information in
   2233  * `ncec'.
   2234  */
   2235 void
   2236 ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
   2237 {
   2238 	ill_t		*ill;
   2239 	ipmp_illgrp_t	*illg = ipmp_ill->ill_grp;
   2240 	ip_stack_t	*ipst = ipmp_ill->ill_ipst;
   2241 	nce_t		*nce, *nce_next;
   2242 	list_t		replace;
   2243 
   2244 	ASSERT(IS_IPMP(ipmp_ill));
   2245 
   2246 	/*
   2247 	 * if ncec itself is not reachable, there is no use in creating nce_t
   2248 	 * entries on the underlying interfaces in the group.
   2249 	 */
   2250 	if (!NCE_ISREACHABLE(ncec))
   2251 		return;
   2252 
   2253 	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
   2254 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
   2255 	ill = list_head(&illg->ig_actif);
   2256 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
   2257 		/*
   2258 		 * For each underlying interface, we first check if there is an
   2259 		 * nce_t for the address in ncec->ncec_addr. If one exists,
   2260 		 * we should trigger nce_fastpath for that nce_t. However, the
   2261 		 * catch is that we are holding the ips_ipmp_lock to prevent
   2262 		 * changes to the IPMP group membership, so that we cannot
   2263 		 * putnext() to the driver.  So we nce_delete the
   2264 		 * list nce_t entries that need to be updated into the
   2265 		 * `replace' list, and then process the `replace' list
   2266 		 * after dropping the ips_ipmp_lock.
   2267 		 */
   2268 		mutex_enter(&ill->ill_lock);
   2269 		for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
   2270 			nce_next = list_next(&ill->ill_nce, nce);
   2271 			if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
   2272 			    &ncec->ncec_addr)) {
   2273 				nce = nce_next;
   2274 				continue;
   2275 			}
   2276 			nce_refhold(nce);
   2277 			nce_delete(nce);
   2278 			list_insert_tail(&replace, nce);
   2279 			nce = nce_next;
   2280 		}
   2281 		mutex_exit(&ill->ill_lock);
   2282 	}
   2283 	rw_exit(&ipst->ips_ipmp_lock);
   2284 	/*
   2285 	 * `replace' now has the list of nce's on which we should be triggering
   2286 	 * nce_fastpath(). We now retrigger fastpath by setting up the nce
   2287 	 * again. The code in nce_lookup_then_add_v* ensures that nce->nce_ill
   2288 	 * is still in the group for ncec->ncec_ill
   2289 	 */
   2290 	while ((nce = list_head(&replace)) != NULL) {
   2291 		list_remove(&replace, nce);
   2292 		if (ncec->ncec_ill->ill_isv6) {
   2293 			(void) nce_lookup_then_add_v6(nce->nce_ill,
   2294 			    ncec->ncec_lladdr,  ncec->ncec_lladdr_length,
   2295 			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
   2296 			    NULL);
   2297 		} else {
   2298 			ipaddr_t ipaddr;
   2299 
   2300 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
   2301 			(void) nce_lookup_then_add_v4(nce->nce_ill,
   2302 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
   2303 			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
   2304 		}
   2305 		nce_refrele(nce);
   2306 	}
   2307 	ASSERT(list_is_empty(&replace));
   2308 	list_destroy(&replace);
   2309 }
   2310