Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/stream.h>
     28 #include <sys/stropts.h>
     29 #include <sys/strsun.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/errno.h>
     32 #include <sys/dlpi.h>
     33 #include <sys/socket.h>
     34 #include <sys/ddi.h>
     35 #include <sys/sunddi.h>
     36 #include <sys/cmn_err.h>
     37 #include <sys/debug.h>
     38 #include <sys/vtrace.h>
     39 #include <sys/kmem.h>
     40 #include <sys/zone.h>
     41 #include <sys/ethernet.h>
     42 #include <sys/sdt.h>
     43 #include <sys/mac.h>
     44 
     45 #include <net/if.h>
     46 #include <net/if_types.h>
     47 #include <net/if_dl.h>
     48 #include <net/route.h>
     49 #include <netinet/in.h>
     50 #include <netinet/ip6.h>
     51 #include <netinet/icmp6.h>
     52 
     53 #include <inet/common.h>
     54 #include <inet/mi.h>
     55 #include <inet/mib2.h>
     56 #include <inet/nd.h>
     57 #include <inet/ip.h>
     58 #include <inet/ip_impl.h>
     59 #include <inet/ipclassifier.h>
     60 #include <inet/ip_if.h>
     61 #include <inet/ip_ire.h>
     62 #include <inet/ip_rts.h>
     63 #include <inet/ip6.h>
     64 #include <inet/ip_ndp.h>
     65 #include <inet/sctp_ip.h>
     66 #include <inet/ip_arp.h>
     67 #include <inet/ip2mac_impl.h>
     68 
     69 #define	ANNOUNCE_INTERVAL(isv6) \
     70 	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
     71 	ipst->ips_ip_arp_publish_interval)
     72 
     73 #define	DEFENSE_INTERVAL(isv6) \
     74 	(isv6 ? ipst->ips_ndp_defend_interval : \
     75 	ipst->ips_arp_defend_interval)
     76 
     77 /* Non-tunable probe interval, based on link capabilities */
     78 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
     79 
     80 /*
     81  * The IPv4 Link Local address space is special; we do extra duplicate checking
     82  * there, as the entire assignment mechanism rests on random numbers.
     83  */
     84 #define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
     85 				((uchar_t *)ptr)[1] == 254)
     86 
     87 /*
     88  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
     89  * in to the ncec*add* functions.
     90  *
     91  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
     92  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
     93  * that we will respond to requests for the protocol address.
     94  */
     95 #define	NCE_EXTERNAL_FLAGS_MASK \
     96 	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
     97 	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
     98 	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
     99 
    100 /*
    101  * Lock ordering:
    102  *
    103  *	ndp_g_lock -> ill_lock -> ncec_lock
    104  *
    105  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
    106  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
    107  * ncec_refcnt).
    108  */
    109 
    110 static	void	nce_cleanup_list(ncec_t *ncec);
    111 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
    112 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    113     ncec_t *);
    114 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
    115 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    116     uint16_t ncec_flags, nce_t **newnce);
    117 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    118     uint16_t ncec_flags, nce_t **newnce);
    119 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
    120     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    121     const in6_addr_t *target, int flag);
    122 static void	ncec_refhold_locked(ncec_t *);
    123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
    124 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
    125 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    126     uint16_t, uint16_t, nce_t **);
    127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
    128 static nce_t *nce_add(ill_t *, ncec_t *);
    129 static void nce_inactive(nce_t *);
    130 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
    131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
    132 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    133     uint16_t, uint16_t, nce_t **);
    134 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    135     uint16_t, uint16_t, nce_t **);
    136 static int  nce_add_v6_postprocess(nce_t *);
    137 static int  nce_add_v4_postprocess(nce_t *);
    138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
    139 static clock_t nce_fuzz_interval(clock_t, boolean_t);
    140 static void nce_resolv_ipmp_ok(ncec_t *);
    141 static void nce_walk_common(ill_t *, pfi_t, void *);
    142 static void nce_start_timer(ncec_t *, uint_t);
    143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
    144 static void nce_fastpath_trigger(nce_t *);
    145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
    146 
    147 #ifdef DEBUG
    148 static void	ncec_trace_cleanup(const ncec_t *);
    149 #endif
    150 
    151 #define	NCE_HASH_PTR_V4(ipst, addr)					\
    152 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
    153 
    154 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
    155 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
    156 		NCE_TABLE_SIZE)]))
    157 
    158 extern kmem_cache_t *ncec_cache;
    159 extern kmem_cache_t *nce_cache;
    160 
    161 /*
    162  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
    163  * If src_ill is not null, the ncec_addr is bound to src_ill. The
    164  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
    165  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
    166  * IPMP cast_ill (in the IPMP case).
    167  *
    168  * Note that the probe interval is based on ncec->ncec_ill which
    169  * may be the ipmp_ill.
    170  */
    171 static void
    172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
    173 {
    174 	boolean_t dropped;
    175 	uint32_t probe_interval;
    176 
    177 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
    178 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
    179 	if (ncec->ncec_ipversion == IPV6_VERSION) {
    180 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
    181 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
    182 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
    183 		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
    184 	} else {
    185 		/* IPv4 DAD delay the initial probe. */
    186 		if (send_probe)
    187 			dropped = arp_probe(ncec);
    188 		else
    189 			dropped = B_TRUE;
    190 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
    191 		    !send_probe);
    192 	}
    193 	if (!dropped) {
    194 		mutex_enter(&ncec->ncec_lock);
    195 		ncec->ncec_pcnt--;
    196 		mutex_exit(&ncec->ncec_lock);
    197 	}
    198 	nce_restart_timer(ncec, probe_interval);
    199 }
    200 
    201 /*
    202  * Compute default flags to use for an advertisement of this ncec's address.
    203  */
    204 static int
    205 nce_advert_flags(const ncec_t *ncec)
    206 {
    207 	int flag = 0;
    208 
    209 	if (ncec->ncec_flags & NCE_F_ISROUTER)
    210 		flag |= NDP_ISROUTER;
    211 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
    212 		flag |= NDP_ORIDE;
    213 
    214 	return (flag);
    215 }
    216 
    217 /*
    218  * NDP Cache Entry creation routine.
    219  * This routine must always be called with ndp6->ndp_g_lock held.
    220  */
    221 int
    222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    223     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
    224 {
    225 	int		err;
    226 	nce_t		*nce;
    227 
    228 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
    229 	ASSERT(ill != NULL && ill->ill_isv6);
    230 
    231 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
    232 	    &nce);
    233 	if (err != 0)
    234 		return (err);
    235 	ASSERT(newnce != NULL);
    236 	*newnce = nce;
    237 	return (err);
    238 }
    239 
    240 /*
    241  * Post-processing routine to be executed after nce_add_v6(). This function
    242  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
    243  * and must be called without any locks held.
    244  */
    245 int
    246 nce_add_v6_postprocess(nce_t *nce)
    247 {
    248 	ncec_t		*ncec = nce->nce_common;
    249 	boolean_t	dropped = B_FALSE;
    250 	uchar_t		*hw_addr = ncec->ncec_lladdr;
    251 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
    252 	ill_t		*ill = ncec->ncec_ill;
    253 	int		err = 0;
    254 	uint16_t	flags = ncec->ncec_flags;
    255 	ip_stack_t	*ipst = ill->ill_ipst;
    256 	boolean_t	trigger_fastpath = B_TRUE;
    257 
    258 	/*
    259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
    260 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
    261 	 * We call nce_fastpath from nce_update if the link layer address of
    262 	 * the peer changes from nce_update
    263 	 */
    264 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
    265 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
    266 		trigger_fastpath = B_FALSE;
    267 
    268 	if (trigger_fastpath)
    269 		nce_fastpath_trigger(nce);
    270 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
    271 		ill_t *hwaddr_ill;
    272 		/*
    273 		 * Unicast entry that needs DAD.
    274 		 */
    275 		if (IS_IPMP(ill)) {
    276 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
    277 			    hw_addr, hw_addr_len);
    278 		} else {
    279 			hwaddr_ill = ill;
    280 		}
    281 		nce_dad(ncec, hwaddr_ill, B_TRUE);
    282 		err = EINPROGRESS;
    283 	} else if (flags & NCE_F_UNSOL_ADV) {
    284 		/*
    285 		 * We account for the transmit below by assigning one
    286 		 * less than the ndd variable. Subsequent decrements
    287 		 * are done in nce_timer.
    288 		 */
    289 		mutex_enter(&ncec->ncec_lock);
    290 		ncec->ncec_unsolicit_count =
    291 		    ipst->ips_ip_ndp_unsolicit_count - 1;
    292 		mutex_exit(&ncec->ncec_lock);
    293 		dropped = ndp_xmit(ill,
    294 		    ND_NEIGHBOR_ADVERT,
    295 		    hw_addr,
    296 		    hw_addr_len,
    297 		    &ncec->ncec_addr,	/* Source and target of the adv */
    298 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
    299 		    nce_advert_flags(ncec));
    300 		mutex_enter(&ncec->ncec_lock);
    301 		if (dropped)
    302 			ncec->ncec_unsolicit_count++;
    303 		else
    304 			ncec->ncec_last_time_defended = ddi_get_lbolt();
    305 		if (ncec->ncec_unsolicit_count != 0) {
    306 			nce_start_timer(ncec,
    307 			    ipst->ips_ip_ndp_unsolicit_interval);
    308 		}
    309 		mutex_exit(&ncec->ncec_lock);
    310 	}
    311 	return (err);
    312 }
    313 
    314 /*
    315  * Atomically lookup and add (if needed) Neighbor Cache information for
    316  * an address.
    317  *
    318  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
    319  * are always added pointing at the ipmp_ill. Thus, when the ill passed
    320  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
    321  * entries will be created, both pointing at the same ncec_t. The nce_t
    322  * entries will have their nce_ill set to the ipmp_ill and the under_ill
    323  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
    324  * Local addresses are always created on the ill passed to nce_add_v6.
    325  */
    326 int
    327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    328     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
    329 {
    330 	int		err = 0;
    331 	ip_stack_t	*ipst = ill->ill_ipst;
    332 	nce_t		*nce, *upper_nce = NULL;
    333 	ill_t		*in_ill = ill;
    334 	boolean_t	need_ill_refrele = B_FALSE;
    335 
    336 	if (flags & NCE_F_MCAST) {
    337 		/*
    338 		 * hw_addr will be figured out in nce_set_multicast_v6;
    339 		 * caller has to select the cast_ill
    340 		 */
    341 		ASSERT(hw_addr == NULL);
    342 		ASSERT(!IS_IPMP(ill));
    343 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
    344 		return (err);
    345 	}
    346 	ASSERT(ill->ill_isv6);
    347 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
    348 		ill = ipmp_ill_hold_ipmp_ill(ill);
    349 		if (ill == NULL)
    350 			return (ENXIO);
    351 		need_ill_refrele = B_TRUE;
    352 	}
    353 
    354 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    355 	nce = nce_lookup_addr(ill, addr);
    356 	if (nce == NULL) {
    357 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
    358 		    &nce);
    359 	} else {
    360 		err = EEXIST;
    361 	}
    362 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    363 	if (err == 0)
    364 		err = nce_add_v6_postprocess(nce);
    365 	if (in_ill != ill && nce != NULL) {
    366 		nce_t *under_nce = NULL;
    367 
    368 		/*
    369 		 * in_ill was the under_ill. Try to create the under_nce.
    370 		 * Hold the ill_g_lock to prevent changes to group membership
    371 		 * until we are done.
    372 		 */
    373 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    374 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
    375 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
    376 			    ill_t *, ill);
    377 			rw_exit(&ipst->ips_ill_g_lock);
    378 			err = ENXIO;
    379 			nce_refrele(nce);
    380 			nce = NULL;
    381 			goto bail;
    382 		}
    383 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
    384 		if (under_nce == NULL) {
    385 			rw_exit(&ipst->ips_ill_g_lock);
    386 			err = EINVAL;
    387 			nce_refrele(nce);
    388 			nce = NULL;
    389 			goto bail;
    390 		}
    391 		rw_exit(&ipst->ips_ill_g_lock);
    392 		upper_nce = nce;
    393 		nce = under_nce; /* will be returned to caller */
    394 		if (NCE_ISREACHABLE(nce->nce_common))
    395 			nce_fastpath_trigger(under_nce);
    396 	}
    397 	/* nce_refrele is deferred until the lock is dropped  */
    398 	if (nce != NULL) {
    399 		if (newnce != NULL)
    400 			*newnce = nce;
    401 		else
    402 			nce_refrele(nce);
    403 	}
    404 bail:
    405 	if (upper_nce != NULL)
    406 		nce_refrele(upper_nce);
    407 	if (need_ill_refrele)
    408 		ill_refrele(ill);
    409 	return (err);
    410 }
    411 
    412 /*
    413  * Remove all the CONDEMNED nces from the appropriate hash table.
    414  * We create a private list of NCEs, these may have ires pointing
    415  * to them, so the list will be passed through to clean up dependent
    416  * ires and only then we can do ncec_refrele() which can make NCE inactive.
    417  */
    418 static void
    419 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
    420 {
    421 	ncec_t *ncec1;
    422 	ncec_t **ptpn;
    423 
    424 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    425 	ASSERT(ndp->ndp_g_walker == 0);
    426 	for (; ncec; ncec = ncec1) {
    427 		ncec1 = ncec->ncec_next;
    428 		mutex_enter(&ncec->ncec_lock);
    429 		if (NCE_ISCONDEMNED(ncec)) {
    430 			ptpn = ncec->ncec_ptpn;
    431 			ncec1 = ncec->ncec_next;
    432 			if (ncec1 != NULL)
    433 				ncec1->ncec_ptpn = ptpn;
    434 			*ptpn = ncec1;
    435 			ncec->ncec_ptpn = NULL;
    436 			ncec->ncec_next = NULL;
    437 			ncec->ncec_next = *free_nce_list;
    438 			*free_nce_list = ncec;
    439 		}
    440 		mutex_exit(&ncec->ncec_lock);
    441 	}
    442 }
    443 
    444 /*
    445  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
    446  *    will return this NCE. Also no new timeouts will
    447  *    be started (See nce_restart_timer).
    448  * 2. Cancel any currently running timeouts.
    449  * 3. If there is an ndp walker, return. The walker will do the cleanup.
    450  *    This ensures that walkers see a consistent list of NCEs while walking.
    451  * 4. Otherwise remove the NCE from the list of NCEs
    452  */
    453 void
    454 ncec_delete(ncec_t *ncec)
    455 {
    456 	ncec_t	**ptpn;
    457 	ncec_t	*ncec1;
    458 	int	ipversion = ncec->ncec_ipversion;
    459 	ndp_g_t *ndp;
    460 	ip_stack_t	*ipst = ncec->ncec_ipst;
    461 
    462 	if (ipversion == IPV4_VERSION)
    463 		ndp = ipst->ips_ndp4;
    464 	else
    465 		ndp = ipst->ips_ndp6;
    466 
    467 	/* Serialize deletes */
    468 	mutex_enter(&ncec->ncec_lock);
    469 	if (NCE_ISCONDEMNED(ncec)) {
    470 		/* Some other thread is doing the delete */
    471 		mutex_exit(&ncec->ncec_lock);
    472 		return;
    473 	}
    474 	/*
    475 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
    476 	 * refcnt has to be >= 2
    477 	 */
    478 	ASSERT(ncec->ncec_refcnt >= 2);
    479 	ncec->ncec_flags |= NCE_F_CONDEMNED;
    480 	mutex_exit(&ncec->ncec_lock);
    481 
    482 	/* Count how many condemned ires for kmem_cache callback */
    483 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
    484 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
    485 
    486 	/* Complete any waiting callbacks */
    487 	ncec_cb_dispatch(ncec);
    488 
    489 	/*
    490 	 * Cancel any running timer. Timeout can't be restarted
    491 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
    492 	 * Passing invalid timeout id is fine.
    493 	 */
    494 	if (ncec->ncec_timeout_id != 0) {
    495 		(void) untimeout(ncec->ncec_timeout_id);
    496 		ncec->ncec_timeout_id = 0;
    497 	}
    498 
    499 	mutex_enter(&ndp->ndp_g_lock);
    500 	if (ncec->ncec_ptpn == NULL) {
    501 		/*
    502 		 * The last ndp walker has already removed this ncec from
    503 		 * the list after we marked the ncec CONDEMNED and before
    504 		 * we grabbed the global lock.
    505 		 */
    506 		mutex_exit(&ndp->ndp_g_lock);
    507 		return;
    508 	}
    509 	if (ndp->ndp_g_walker > 0) {
    510 		/*
    511 		 * Can't unlink. The walker will clean up
    512 		 */
    513 		ndp->ndp_g_walker_cleanup = B_TRUE;
    514 		mutex_exit(&ndp->ndp_g_lock);
    515 		return;
    516 	}
    517 
    518 	/*
    519 	 * Now remove the ncec from the list. nce_restart_timer won't restart
    520 	 * the timer since it is marked CONDEMNED.
    521 	 */
    522 	ptpn = ncec->ncec_ptpn;
    523 	ncec1 = ncec->ncec_next;
    524 	if (ncec1 != NULL)
    525 		ncec1->ncec_ptpn = ptpn;
    526 	*ptpn = ncec1;
    527 	ncec->ncec_ptpn = NULL;
    528 	ncec->ncec_next = NULL;
    529 	mutex_exit(&ndp->ndp_g_lock);
    530 
    531 	/* Removed from ncec_ptpn/ncec_next list */
    532 	ncec_refrele_notr(ncec);
    533 }
    534 
    535 void
    536 ncec_inactive(ncec_t *ncec)
    537 {
    538 	mblk_t		**mpp;
    539 	ill_t		*ill = ncec->ncec_ill;
    540 	ip_stack_t	*ipst = ncec->ncec_ipst;
    541 
    542 	ASSERT(ncec->ncec_refcnt == 0);
    543 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
    544 
    545 	/* Count how many condemned nces for kmem_cache callback */
    546 	if (NCE_ISCONDEMNED(ncec))
    547 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
    548 
    549 	/* Free all allocated messages */
    550 	mpp = &ncec->ncec_qd_mp;
    551 	while (*mpp != NULL) {
    552 		mblk_t  *mp;
    553 
    554 		mp = *mpp;
    555 		*mpp = mp->b_next;
    556 
    557 		inet_freemsg(mp);
    558 	}
    559 	/*
    560 	 * must have been cleaned up in ncec_delete
    561 	 */
    562 	ASSERT(list_is_empty(&ncec->ncec_cb));
    563 	list_destroy(&ncec->ncec_cb);
    564 	/*
    565 	 * free the ncec_lladdr if one was allocated in nce_add_common()
    566 	 */
    567 	if (ncec->ncec_lladdr_length > 0)
    568 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
    569 
    570 #ifdef DEBUG
    571 	ncec_trace_cleanup(ncec);
    572 #endif
    573 
    574 	mutex_enter(&ill->ill_lock);
    575 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
    576 	    (char *), "ncec", (void *), ncec);
    577 	ill->ill_ncec_cnt--;
    578 	ncec->ncec_ill = NULL;
    579 	/*
    580 	 * If the number of ncec's associated with this ill have dropped
    581 	 * to zero, check whether we need to restart any operation that
    582 	 * is waiting for this to happen.
    583 	 */
    584 	if (ILL_DOWN_OK(ill)) {
    585 		/* ipif_ill_refrele_tail drops the ill_lock */
    586 		ipif_ill_refrele_tail(ill);
    587 	} else {
    588 		mutex_exit(&ill->ill_lock);
    589 	}
    590 
    591 	mutex_destroy(&ncec->ncec_lock);
    592 	kmem_cache_free(ncec_cache, ncec);
    593 }
    594 
    595 /*
    596  * ncec_walk routine.  Delete the ncec if it is associated with the ill
    597  * that is going away.  Always called as a writer.
    598  */
    599 void
    600 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
    601 {
    602 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
    603 		ncec_delete(ncec);
    604 	}
    605 }
    606 
    607 /*
    608  * Neighbor Cache cleanup logic for a list of ncec_t entries.
    609  */
    610 static void
    611 nce_cleanup_list(ncec_t *ncec)
    612 {
    613 	ncec_t *ncec_next;
    614 
    615 	ASSERT(ncec != NULL);
    616 	while (ncec != NULL) {
    617 		ncec_next = ncec->ncec_next;
    618 		ncec->ncec_next = NULL;
    619 
    620 		/*
    621 		 * It is possible for the last ndp walker (this thread)
    622 		 * to come here after ncec_delete has marked the ncec CONDEMNED
    623 		 * and before it has removed the ncec from the fastpath list
    624 		 * or called untimeout. So we need to do it here. It is safe
    625 		 * for both ncec_delete and this thread to do it twice or
    626 		 * even simultaneously since each of the threads has a
    627 		 * reference on the ncec.
    628 		 */
    629 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
    630 		/*
    631 		 * Cancel any running timer. Timeout can't be restarted
    632 		 * since CONDEMNED is set. The ncec_lock can't be
    633 		 * held across untimeout though passing invalid timeout
    634 		 * id is fine.
    635 		 */
    636 		if (ncec->ncec_timeout_id != 0) {
    637 			(void) untimeout(ncec->ncec_timeout_id);
    638 			ncec->ncec_timeout_id = 0;
    639 		}
    640 		/* Removed from ncec_ptpn/ncec_next list */
    641 		ncec_refrele_notr(ncec);
    642 		ncec = ncec_next;
    643 	}
    644 }
    645 
    646 /*
    647  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
    648  */
    649 boolean_t
    650 nce_restart_dad(ncec_t *ncec)
    651 {
    652 	boolean_t started;
    653 	ill_t *ill, *hwaddr_ill;
    654 
    655 	if (ncec == NULL)
    656 		return (B_FALSE);
    657 	ill = ncec->ncec_ill;
    658 	mutex_enter(&ncec->ncec_lock);
    659 	if (ncec->ncec_state == ND_PROBE) {
    660 		mutex_exit(&ncec->ncec_lock);
    661 		started = B_TRUE;
    662 	} else if (ncec->ncec_state == ND_REACHABLE) {
    663 		ASSERT(ncec->ncec_lladdr != NULL);
    664 		ncec->ncec_state = ND_PROBE;
    665 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
    666 		/*
    667 		 * Slight cheat here: we don't use the initial probe delay
    668 		 * for IPv4 in this obscure case.
    669 		 */
    670 		mutex_exit(&ncec->ncec_lock);
    671 		if (IS_IPMP(ill)) {
    672 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
    673 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
    674 		} else {
    675 			hwaddr_ill = ill;
    676 		}
    677 		nce_dad(ncec, hwaddr_ill, B_TRUE);
    678 		started = B_TRUE;
    679 	} else {
    680 		mutex_exit(&ncec->ncec_lock);
    681 		started = B_FALSE;
    682 	}
    683 	return (started);
    684 }
    685 
    686 /*
    687  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
    688  * If one is found, the refcnt on the ncec will be incremented.
    689  */
    690 ncec_t *
    691 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
    692 {
    693 	ncec_t		*ncec;
    694 	ip_stack_t	*ipst = ill->ill_ipst;
    695 
    696 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    697 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    698 
    699 	/* Get head of v6 hash table */
    700 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
    701 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
    702 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    703 	rw_exit(&ipst->ips_ill_g_lock);
    704 	return (ncec);
    705 }
    706 /*
    707  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
    708  * If one is found, the refcnt on the ncec will be incremented.
    709  */
    710 ncec_t *
    711 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
    712 {
    713 	ncec_t	*ncec = NULL;
    714 	in6_addr_t addr6;
    715 	ip_stack_t *ipst = ill->ill_ipst;
    716 
    717 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    718 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
    719 
    720 	/* Get head of v4 hash table */
    721 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
    722 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
    723 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
    724 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
    725 	rw_exit(&ipst->ips_ill_g_lock);
    726 	return (ncec);
    727 }
    728 
    729 /*
    730  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
    731  * If an ncec is found, increment the hold count on that ncec.
    732  * The caller passes in the start of the appropriate hash table, and must
    733  * be holding the appropriate global lock (ndp_g_lock). In addition, since
    734  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
    735  * must be held as reader.
    736  *
    737  * This function always matches across the ipmp group.
    738  */
    739 ncec_t *
    740 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
    741 {
    742 	ndp_g_t		*ndp;
    743 	ip_stack_t	*ipst = ill->ill_ipst;
    744 
    745 	if (ill->ill_isv6)
    746 		ndp = ipst->ips_ndp6;
    747 	else
    748 		ndp = ipst->ips_ndp4;
    749 
    750 	ASSERT(ill != NULL);
    751 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    752 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
    753 		return (NULL);
    754 	for (; ncec != NULL; ncec = ncec->ncec_next) {
    755 		if (ncec->ncec_ill == ill ||
    756 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
    757 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
    758 				mutex_enter(&ncec->ncec_lock);
    759 				if (!NCE_ISCONDEMNED(ncec)) {
    760 					ncec_refhold_locked(ncec);
    761 					mutex_exit(&ncec->ncec_lock);
    762 					break;
    763 				}
    764 				mutex_exit(&ncec->ncec_lock);
    765 			}
    766 		}
    767 	}
    768 	return (ncec);
    769 }
    770 
    771 /*
    772  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
    773  * entries for ill only, i.e., when ill is part of an ipmp group,
    774  * nce_lookup_v4 will never try to match across the group.
    775  */
    776 nce_t *
    777 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
    778 {
    779 	nce_t *nce;
    780 	in6_addr_t addr6;
    781 	ip_stack_t *ipst = ill->ill_ipst;
    782 
    783 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
    784 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
    785 	nce = nce_lookup_addr(ill, &addr6);
    786 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
    787 	return (nce);
    788 }
    789 
    790 /*
    791  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
    792  * entries for ill only, i.e., when ill is part of an ipmp group,
    793  * nce_lookup_v6 will never try to match across the group.
    794  */
    795 nce_t *
    796 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
    797 {
    798 	nce_t *nce;
    799 	ip_stack_t *ipst = ill->ill_ipst;
    800 
    801 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    802 	nce = nce_lookup_addr(ill, addr6);
    803 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    804 	return (nce);
    805 }
    806 
    807 static nce_t *
    808 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
    809 {
    810 	nce_t *nce;
    811 
    812 	ASSERT(ill != NULL);
    813 #ifdef DEBUG
    814 	if (ill->ill_isv6)
    815 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
    816 	else
    817 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
    818 #endif
    819 	mutex_enter(&ill->ill_lock);
    820 	nce = nce_lookup(ill, addr);
    821 	mutex_exit(&ill->ill_lock);
    822 	return (nce);
    823 }
    824 
    825 
    826 /*
    827  * Router turned to host.  We need to make sure that cached copies of the ncec
    828  * are not used for forwarding packets if they were derived from the default
    829  * route, and that the default route itself is removed, as  required by
    830  * section 7.2.5 of RFC 2461.
    831  *
    832  * Note that the ncec itself probably has valid link-layer information for the
    833  * nexthop, so that there is no reason to delete the ncec, as long as the
    834  * ISROUTER flag is turned off.
    835  */
    836 static void
    837 ncec_router_to_host(ncec_t *ncec)
    838 {
    839 	ire_t		*ire;
    840 	ip_stack_t	*ipst = ncec->ncec_ipst;
    841 
    842 	mutex_enter(&ncec->ncec_lock);
    843 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
    844 	mutex_exit(&ncec->ncec_lock);
    845 
    846 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
    847 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
    848 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
    849 	if (ire != NULL) {
    850 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
    851 		ire_delete(ire);
    852 		ire_refrele(ire);
    853 	}
    854 }
    855 
    856 /*
    857  * Process passed in parameters either from an incoming packet or via
    858  * user ioctl.
    859  */
    860 void
    861 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
    862 {
    863 	ill_t	*ill = ncec->ncec_ill;
    864 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
    865 	boolean_t ll_updated = B_FALSE;
    866 	boolean_t ll_changed;
    867 	nce_t	*nce;
    868 
    869 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
    870 	/*
    871 	 * No updates of link layer address or the neighbor state is
    872 	 * allowed, when the cache is in NONUD state.  This still
    873 	 * allows for responding to reachability solicitation.
    874 	 */
    875 	mutex_enter(&ncec->ncec_lock);
    876 	if (ncec->ncec_state == ND_INCOMPLETE) {
    877 		if (hw_addr == NULL) {
    878 			mutex_exit(&ncec->ncec_lock);
    879 			return;
    880 		}
    881 		nce_set_ll(ncec, hw_addr);
    882 		/*
    883 		 * Update ncec state and send the queued packets
    884 		 * back to ip this time ire will be added.
    885 		 */
    886 		if (flag & ND_NA_FLAG_SOLICITED) {
    887 			nce_update(ncec, ND_REACHABLE, NULL);
    888 		} else {
    889 			nce_update(ncec, ND_STALE, NULL);
    890 		}
    891 		mutex_exit(&ncec->ncec_lock);
    892 		nce = nce_fastpath(ncec, B_TRUE, NULL);
    893 		nce_resolv_ok(ncec);
    894 		if (nce != NULL)
    895 			nce_refrele(nce);
    896 		return;
    897 	}
    898 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
    899 	if (!is_adv) {
    900 		/* If this is a SOLICITATION request only */
    901 		if (ll_changed)
    902 			nce_update(ncec, ND_STALE, hw_addr);
    903 		mutex_exit(&ncec->ncec_lock);
    904 		ncec_cb_dispatch(ncec);
    905 		return;
    906 	}
    907 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
    908 		/* If in any other state than REACHABLE, ignore */
    909 		if (ncec->ncec_state == ND_REACHABLE) {
    910 			nce_update(ncec, ND_STALE, NULL);
    911 		}
    912 		mutex_exit(&ncec->ncec_lock);
    913 		ncec_cb_dispatch(ncec);
    914 		return;
    915 	} else {
    916 		if (ll_changed) {
    917 			nce_update(ncec, ND_UNCHANGED, hw_addr);
    918 			ll_updated = B_TRUE;
    919 		}
    920 		if (flag & ND_NA_FLAG_SOLICITED) {
    921 			nce_update(ncec, ND_REACHABLE, NULL);
    922 		} else {
    923 			if (ll_updated) {
    924 				nce_update(ncec, ND_STALE, NULL);
    925 			}
    926 		}
    927 		mutex_exit(&ncec->ncec_lock);
    928 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
    929 		    NCE_F_ISROUTER)) {
    930 			ncec_router_to_host(ncec);
    931 		} else {
    932 			ncec_cb_dispatch(ncec);
    933 		}
    934 	}
    935 }
    936 
    937 /*
    938  * Pass arg1 to the pfi supplied, along with each ncec in existence.
    939  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
    940  * walking the hash list.
    941  */
    942 void
    943 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    944     boolean_t trace)
    945 {
    946 	ncec_t	*ncec;
    947 	ncec_t	*ncec1;
    948 	ncec_t	**ncep;
    949 	ncec_t	*free_nce_list = NULL;
    950 
    951 	mutex_enter(&ndp->ndp_g_lock);
    952 	/* Prevent ncec_delete from unlink and free of NCE */
    953 	ndp->ndp_g_walker++;
    954 	mutex_exit(&ndp->ndp_g_lock);
    955 	for (ncep = ndp->nce_hash_tbl;
    956 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    957 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
    958 			ncec1 = ncec->ncec_next;
    959 			if (ill == NULL || ncec->ncec_ill == ill) {
    960 				if (trace) {
    961 					ncec_refhold(ncec);
    962 					(*pfi)(ncec, arg1);
    963 					ncec_refrele(ncec);
    964 				} else {
    965 					ncec_refhold_notr(ncec);
    966 					(*pfi)(ncec, arg1);
    967 					ncec_refrele_notr(ncec);
    968 				}
    969 			}
    970 		}
    971 	}
    972 	mutex_enter(&ndp->ndp_g_lock);
    973 	ndp->ndp_g_walker--;
    974 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
    975 		/* Time to delete condemned entries */
    976 		for (ncep = ndp->nce_hash_tbl;
    977 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    978 			ncec = *ncep;
    979 			if (ncec != NULL) {
    980 				nce_remove(ndp, ncec, &free_nce_list);
    981 			}
    982 		}
    983 		ndp->ndp_g_walker_cleanup = B_FALSE;
    984 	}
    985 
    986 	mutex_exit(&ndp->ndp_g_lock);
    987 
    988 	if (free_nce_list != NULL) {
    989 		nce_cleanup_list(free_nce_list);
    990 	}
    991 }
    992 
    993 /*
    994  * Walk everything.
    995  * Note that ill can be NULL hence can't derive the ipst from it.
    996  */
    997 void
    998 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
    999 {
   1000 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
   1001 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
   1002 }
   1003 
   1004 /*
   1005  * For each interface an entry is added for the unspecified multicast group.
   1006  * Here that mapping is used to form the multicast cache entry for a particular
   1007  * multicast destination.
   1008  */
   1009 static int
   1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
   1011     uint16_t flags, nce_t **newnce)
   1012 {
   1013 	uchar_t		*hw_addr;
   1014 	int		err = 0;
   1015 	ip_stack_t	*ipst = ill->ill_ipst;
   1016 	nce_t		*nce;
   1017 
   1018 	ASSERT(ill != NULL);
   1019 	ASSERT(ill->ill_isv6);
   1020 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
   1021 
   1022 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
   1023 	nce = nce_lookup_addr(ill, dst);
   1024 	if (nce != NULL) {
   1025 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1026 		goto done;
   1027 	}
   1028 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
   1029 		/*
   1030 		 * For IRE_IF_RESOLVER a hardware mapping can be
   1031 		 * generated.
   1032 		 */
   1033 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
   1034 		if (hw_addr == NULL) {
   1035 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1036 			return (ENOMEM);
   1037 		}
   1038 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
   1039 	} else {
   1040 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
   1041 		hw_addr = NULL;
   1042 	}
   1043 	ASSERT((flags & NCE_F_MCAST) != 0);
   1044 	ASSERT((flags & NCE_F_NONUD) != 0);
   1045 	/* nce_state will be computed by nce_add_common() */
   1046 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
   1047 	    ND_UNCHANGED, &nce);
   1048 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1049 	if (err == 0)
   1050 		err = nce_add_v6_postprocess(nce);
   1051 	if (hw_addr != NULL)
   1052 		kmem_free(hw_addr, ill->ill_nd_lla_len);
   1053 	if (err != 0) {
   1054 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
   1055 		return (err);
   1056 	}
   1057 done:
   1058 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
   1059 	if (newnce != NULL)
   1060 		*newnce = nce;
   1061 	else
   1062 		nce_refrele(nce);
   1063 	return (0);
   1064 }
   1065 
   1066 /*
   1067  * Return the link layer address, and any flags of a ncec.
   1068  */
   1069 int
   1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
   1071 {
   1072 	ncec_t		*ncec;
   1073 	in6_addr_t	*addr;
   1074 	sin6_t		*sin6;
   1075 
   1076 	ASSERT(ill != NULL && ill->ill_isv6);
   1077 	sin6 = (sin6_t *)&lnr->lnr_addr;
   1078 	addr =  &sin6->sin6_addr;
   1079 
   1080 	/*
   1081 	 * NOTE: if the ill is an IPMP interface, then match against the whole
   1082 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
   1083 	 * addresses for the data addresses on an IPMP interface even though
   1084 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
   1085 	 */
   1086 	ncec = ncec_lookup_illgrp_v6(ill, addr);
   1087 	if (ncec == NULL)
   1088 		return (ESRCH);
   1089 	/* If no link layer address is available yet, return ESRCH */
   1090 	if (!NCE_ISREACHABLE(ncec)) {
   1091 		ncec_refrele(ncec);
   1092 		return (ESRCH);
   1093 	}
   1094 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
   1095 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
   1096 	    lnr->lnr_hdw_len);
   1097 	if (ncec->ncec_flags & NCE_F_ISROUTER)
   1098 		lnr->lnr_flags = NDF_ISROUTER_ON;
   1099 	if (ncec->ncec_flags & NCE_F_ANYCAST)
   1100 		lnr->lnr_flags |= NDF_ANYCAST_ON;
   1101 	ncec_refrele(ncec);
   1102 	return (0);
   1103 }
   1104 
   1105 /*
   1106  * Finish setting up the Enable/Disable multicast for the driver.
   1107  */
   1108 mblk_t *
   1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
   1110     uint32_t hw_addr_offset, mblk_t *mp)
   1111 {
   1112 	uchar_t		*hw_addr;
   1113 	ipaddr_t	v4group;
   1114 	uchar_t		*addr;
   1115 
   1116 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
   1117 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
   1118 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
   1119 
   1120 		ASSERT(CLASSD(v4group));
   1121 		ASSERT(!(ill->ill_isv6));
   1122 
   1123 		addr = (uchar_t *)&v4group;
   1124 	} else {
   1125 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
   1126 		ASSERT(ill->ill_isv6);
   1127 
   1128 		addr = (uchar_t *)v6group;
   1129 	}
   1130 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
   1131 	if (hw_addr == NULL) {
   1132 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
   1133 		freemsg(mp);
   1134 		return (NULL);
   1135 	}
   1136 
   1137 	ip_mcast_mapping(ill, addr, hw_addr);
   1138 	return (mp);
   1139 }
   1140 
   1141 void
   1142 ip_ndp_resolve(ncec_t *ncec)
   1143 {
   1144 	in_addr_t	sender4 = INADDR_ANY;
   1145 	in6_addr_t	sender6 = ipv6_all_zeros;
   1146 	ill_t		*src_ill;
   1147 	uint32_t	ms;
   1148 
   1149 	src_ill = nce_resolve_src(ncec, &sender6);
   1150 	if (src_ill == NULL) {
   1151 		/* Make sure we try again later */
   1152 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
   1153 		nce_restart_timer(ncec, (clock_t)ms);
   1154 		return;
   1155 	}
   1156 	if (ncec->ncec_ipversion == IPV4_VERSION)
   1157 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
   1158 	mutex_enter(&ncec->ncec_lock);
   1159 	if (ncec->ncec_ipversion == IPV6_VERSION)
   1160 		ms = ndp_solicit(ncec, sender6, src_ill);
   1161 	else
   1162 		ms = arp_request(ncec, sender4, src_ill);
   1163 	mutex_exit(&ncec->ncec_lock);
   1164 	if (ms == 0) {
   1165 		if (ncec->ncec_state != ND_REACHABLE) {
   1166 			if (ncec->ncec_ipversion == IPV6_VERSION)
   1167 				ndp_resolv_failed(ncec);
   1168 			else
   1169 				arp_resolv_failed(ncec);
   1170 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
   1171 			nce_make_unreachable(ncec);
   1172 			ncec_delete(ncec);
   1173 		}
   1174 	} else {
   1175 		nce_restart_timer(ncec, (clock_t)ms);
   1176 	}
   1177 done:
   1178 	ill_refrele(src_ill);
   1179 }
   1180 
   1181 /*
   1182  * Send an IPv6 neighbor solicitation.
   1183  * Returns number of milliseconds after which we should either rexmit or abort.
   1184  * Return of zero means we should abort.
   1185  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
   1186  * The optional source address is used as a hint to ndp_solicit for
   1187  * which source to use in the packet.
   1188  *
   1189  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
   1190  * the packet.
   1191  */
   1192 uint32_t
   1193 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
   1194 {
   1195 	in6_addr_t	dst;
   1196 	boolean_t	dropped = B_FALSE;
   1197 
   1198 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
   1199 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   1200 
   1201 	if (ncec->ncec_rcnt == 0)
   1202 		return (0);
   1203 
   1204 	dst = ncec->ncec_addr;
   1205 	ncec->ncec_rcnt--;
   1206 	mutex_exit(&ncec->ncec_lock);
   1207 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
   1208 	    ill->ill_phys_addr_length, &src, &dst, 0);
   1209 	mutex_enter(&ncec->ncec_lock);
   1210 	if (dropped)
   1211 		ncec->ncec_rcnt++;
   1212 	return (ncec->ncec_ill->ill_reachable_retrans_time);
   1213 }
   1214 
   1215 /*
   1216  * Attempt to recover an address on an interface that's been marked as a
   1217  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
   1218  * no easy way to just probe the address and have the right thing happen if
   1219  * it's no longer in use.  Instead, we just bring it up normally and allow the
   1220  * regular interface start-up logic to probe for a remaining duplicate and take
   1221  * us back down if necessary.
   1222  * Neither DHCP nor temporary addresses arrive here; they're excluded by
   1223  * ip_ndp_excl.
   1224  */
   1225 /* ARGSUSED */
   1226 void
   1227 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
   1228 {
   1229 	ill_t	*ill = rq->q_ptr;
   1230 	ipif_t	*ipif;
   1231 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
   1232 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
   1233 	boolean_t addr_equal;
   1234 
   1235 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   1236 		/*
   1237 		 * We do not support recovery of proxy ARP'd interfaces,
   1238 		 * because the system lacks a complete proxy ARP mechanism.
   1239 		 */
   1240 		if (ill->ill_isv6) {
   1241 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
   1242 			    addr6);
   1243 		} else {
   1244 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
   1245 		}
   1246 
   1247 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
   1248 			continue;
   1249 
   1250 		/*
   1251 		 * If we have already recovered or if the interface is going
   1252 		 * away, then ignore.
   1253 		 */
   1254 		mutex_enter(&ill->ill_lock);
   1255 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
   1256 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   1257 			mutex_exit(&ill->ill_lock);
   1258 			continue;
   1259 		}
   1260 
   1261 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
   1262 		ill->ill_ipif_dup_count--;
   1263 		mutex_exit(&ill->ill_lock);
   1264 		ipif->ipif_was_dup = B_TRUE;
   1265 
   1266 		if (ill->ill_isv6) {
   1267 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
   1268 			(void) ipif_up_done_v6(ipif);
   1269 		} else {
   1270 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
   1271 			    EINPROGRESS);
   1272 			(void) ipif_up_done(ipif);
   1273 		}
   1274 	}
   1275 	freeb(mp);
   1276 }
   1277 
   1278 /*
   1279  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
   1280  * As long as someone else holds the address, the interface will stay down.
   1281  * When that conflict goes away, the interface is brought back up.  This is
   1282  * done so that accidental shutdowns of addresses aren't made permanent.  Your
   1283  * server will recover from a failure.
   1284  *
   1285  * For DHCP and temporary addresses, recovery is not done in the kernel.
   1286  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
   1287  *
   1288  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
   1289  */
   1290 void
   1291 ipif_dup_recovery(void *arg)
   1292 {
   1293 	ipif_t *ipif = arg;
   1294 
   1295 	ipif->ipif_recovery_id = 0;
   1296 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
   1297 		return;
   1298 
   1299 	/*
   1300 	 * No lock, because this is just an optimization.
   1301 	 */
   1302 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
   1303 		return;
   1304 
   1305 	/* If the link is down, we'll retry this later */
   1306 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
   1307 		return;
   1308 
   1309 	ipif_do_recovery(ipif);
   1310 }
   1311 
   1312 /*
   1313  * Perform interface recovery by forcing the duplicate interfaces up and
   1314  * allowing the system to determine which ones should stay up.
   1315  *
   1316  * Called both by recovery timer expiry and link-up notification.
   1317  */
   1318 void
   1319 ipif_do_recovery(ipif_t *ipif)
   1320 {
   1321 	ill_t *ill = ipif->ipif_ill;
   1322 	mblk_t *mp;
   1323 	ip_stack_t *ipst = ill->ill_ipst;
   1324 	size_t mp_size;
   1325 
   1326 	if (ipif->ipif_isv6)
   1327 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
   1328 	else
   1329 		mp_size = sizeof (ipif->ipif_lcl_addr);
   1330 	mp = allocb(mp_size, BPRI_MED);
   1331 	if (mp == NULL) {
   1332 		mutex_enter(&ill->ill_lock);
   1333 		if (ipst->ips_ip_dup_recovery > 0 &&
   1334 		    ipif->ipif_recovery_id == 0 &&
   1335 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   1336 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
   1337 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
   1338 		}
   1339 		mutex_exit(&ill->ill_lock);
   1340 	} else {
   1341 		/*
   1342 		 * A recovery timer may still be running if we got here from
   1343 		 * ill_restart_dad(); cancel that timer.
   1344 		 */
   1345 		if (ipif->ipif_recovery_id != 0)
   1346 			(void) untimeout(ipif->ipif_recovery_id);
   1347 		ipif->ipif_recovery_id = 0;
   1348 
   1349 		if (ipif->ipif_isv6) {
   1350 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
   1351 			    sizeof (ipif->ipif_v6lcl_addr));
   1352 		} else  {
   1353 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
   1354 			    sizeof (ipif->ipif_lcl_addr));
   1355 		}
   1356 		ill_refhold(ill);
   1357 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
   1358 		    B_FALSE);
   1359 	}
   1360 }
   1361 
   1362 /*
   1363  * Find the MAC and IP addresses in an NA/NS message.
   1364  */
   1365 static void
   1366 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
   1367     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
   1368 {
   1369 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1370 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
   1371 	uchar_t *addr;
   1372 	int alen;
   1373 
   1374 	/* icmp_inbound_v6 ensures this */
   1375 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   1376 
   1377 	addr = ira->ira_l2src;
   1378 	alen = ill->ill_phys_addr_length;
   1379 	if (alen > 0) {
   1380 		*haddr = addr;
   1381 		*haddrlenp = alen;
   1382 	} else {
   1383 		*haddr = NULL;
   1384 		*haddrlenp = 0;
   1385 	}
   1386 
   1387 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
   1388 	*targp = ns->nd_ns_target;
   1389 }
   1390 
/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 *
 * Queued by ndp_failure() through qwriter_ip().  mp is the receive
 * attribute mblk with a copy of the offending NA/NS packet chained on
 * b_cont.  If the conflict is genuine, the ipif owning the target address
 * is marked IPIF_DUPLICATE and brought down, and (for addresses that are
 * recovered in the kernel) a recovery timer is started.  Both the packet
 * and the attributes are freed before returning.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	uchar_t	*haddr;
	uint_t	haddrlen;
	ip_stack_t *ipst = ill->ill_ipst;
	in6_addr_t targ;
	ip_recv_attr_t iras;
	mblk_t	*attrmp;

	/* Split the attribute mblk from the packet that follows it. */
	attrmp = mp;
	mp = mp->b_cont;
	attrmp->b_cont = NULL;
	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
		/* The ill or ip_stack_t disappeared on us */
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
		freemsg(mp);
		ira_cleanup(&iras, B_TRUE);
		return;
	}

	ASSERT(ill == iras.ira_rill);

	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
		/*
		 * Ignore conflicts generated by misbehaving switches that
		 * just reflect our own messages back to us.  For IPMP, we may
		 * see reflections across any ill in the illgrp.
		 *
		 * RFC2462 and revisions tried to detect both the case
		 * when a statically configured IPv6 address is a duplicate,
		 * and the case when the L2 address itself is a duplicate. The
		 * latter is important because, with stateless address
		 * autoconf, if the L2 address is a duplicate, the resulting
		 * IPv6 address(es) would also be duplicates. We rely on DAD
		 * of the IPv6 address itself to detect the latter case.
		 */
		/*
		 * The ill_grp of an under-ill can change, so ips_ill_g_lock
		 * is held as reader while it is examined.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		/*
		 * Note the precedence: the sender is "us" if its address
		 * equals our own, or if (we are under IPMP and it belongs to
		 * some ill of our group).
		 */
		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
		    IS_UNDER_IPMP(ill) &&
		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
		    haddrlen) != NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			goto ignore_conflict;
		}
		rw_exit(&ipst->ips_ill_g_lock);
	}

	/*
	 * Look up the appropriate ipif.  A non-NULL result is released
	 * with ipif_refrele() on every path below.
	 */
	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
	if (ipif == NULL)
		goto ignore_conflict;

	/* Reload the ill to match the ipif */
	ill = ipif->ipif_ill;

	/* If it's already duplicate or ineligible, then don't do anything. */
	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
		ipif_refrele(ipif);
		goto ignore_conflict;
	}

	/*
	 * If this is a failure during duplicate recovery, then don't
	 * complain.  It may take a long time to recover.
	 */
	if (!ipif->ipif_was_dup) {
		char ibuf[LIFNAMSIZ];
		char hbuf[MAC_STR_LEN];
		char sbuf[INET6_ADDRSTRLEN];

		ipif_get_name(ipif, ibuf, sizeof (ibuf));
		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
		    " disabled", ibuf,
		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
	}
	/* Mark the ipif duplicate and account for it, under ill_lock. */
	mutex_enter(&ill->ill_lock);
	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
	ipif->ipif_flags |= IPIF_DUPLICATE;
	ill->ill_ipif_dup_count++;
	mutex_exit(&ill->ill_lock);
	/* ill_lock is not held across the calls that take the ipif down. */
	(void) ipif_down(ipif, NULL, NULL);
	(void) ipif_down_tail(ipif);
	mutex_enter(&ill->ill_lock);
	/*
	 * Start the in-kernel recovery timer, except for DHCP-managed and
	 * temporary addresses, non-resolver interfaces, condemned ipifs, or
	 * when recovery is disabled (ip_dup_recovery of zero).
	 */
	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
	    ill->ill_net_type == IRE_IF_RESOLVER &&
	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
	    ipst->ips_ip_dup_recovery > 0) {
		ASSERT(ipif->ipif_recovery_id == 0);
		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
	}
	mutex_exit(&ill->ill_lock);
	ipif_refrele(ipif);

ignore_conflict:
	/* Common exit: the packet and the attributes are always released. */
	freemsg(mp);
	ira_cleanup(&iras, B_TRUE);
}
   1503 
   1504 /*
   1505  * Handle failure by tearing down the ipifs with the specified address.  Note
   1506  * that tearing down the ipif also means deleting the ncec through ipif_down, so
   1507  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
   1508  * we start a timer on the ipif.
   1509  * Caller has to free mp;
   1510  */
   1511 static void
   1512 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
   1513 {
   1514 	const uchar_t	*haddr;
   1515 	ill_t		*ill = ira->ira_rill;
   1516 
   1517 	/*
   1518 	 * Ignore conflicts generated by misbehaving switches that just
   1519 	 * reflect our own messages back to us.
   1520 	 */
   1521 
   1522 	/* icmp_inbound_v6 ensures this */
   1523 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   1524 	haddr = ira->ira_l2src;
   1525 	if (haddr != NULL &&
   1526 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
   1527 		return;
   1528 	}
   1529 
   1530 	if ((mp = copymsg(mp)) != NULL) {
   1531 		mblk_t	*attrmp;
   1532 
   1533 		attrmp = ip_recv_attr_to_mblk(ira);
   1534 		if (attrmp == NULL) {
   1535 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1536 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   1537 			freemsg(mp);
   1538 		} else {
   1539 			ASSERT(attrmp->b_cont == NULL);
   1540 			attrmp->b_cont = mp;
   1541 			mp = attrmp;
   1542 			ill_refhold(ill);
   1543 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
   1544 			    B_FALSE);
   1545 		}
   1546 	}
   1547 }
   1548 
   1549 /*
   1550  * Handle a discovered conflict: some other system is advertising that it owns
   1551  * one of our IP addresses.  We need to defend ourselves, or just shut down the
   1552  * interface.
   1553  *
   1554  * Handles both IPv4 and IPv6
   1555  */
   1556 boolean_t
   1557 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
   1558 {
   1559 	ipif_t		*ipif;
   1560 	clock_t		now;
   1561 	uint_t		maxdefense;
   1562 	uint_t		defs;
   1563 	ill_t		*ill = ira->ira_ill;
   1564 	ip_stack_t	*ipst = ill->ill_ipst;
   1565 	uint32_t	elapsed;
   1566 	boolean_t	isv6 = ill->ill_isv6;
   1567 	ipaddr_t	ncec_addr;
   1568 
   1569 	if (isv6) {
   1570 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
   1571 		    ipst);
   1572 	} else {
   1573 		if (arp_no_defense) {
   1574 			/*
   1575 			 * Yes, there is a conflict, but no, we do not
   1576 			 * defend ourself.
   1577 			 */
   1578 			return (B_TRUE);
   1579 		}
   1580 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
   1581 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
   1582 		    ipst);
   1583 	}
   1584 	if (ipif == NULL)
   1585 		return (B_FALSE);
   1586 
   1587 	/*
   1588 	 * First, figure out if this address is disposable.
   1589 	 */
   1590 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
   1591 		maxdefense = ipst->ips_ip_max_temp_defend;
   1592 	else
   1593 		maxdefense = ipst->ips_ip_max_defend;
   1594 
   1595 	/*
   1596 	 * Now figure out how many times we've defended ourselves.  Ignore
   1597 	 * defenses that happened long in the past.
   1598 	 */
   1599 	now = ddi_get_lbolt();
   1600 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
   1601 	mutex_enter(&ncec->ncec_lock);
   1602 	if ((defs = ncec->ncec_defense_count) > 0 &&
   1603 	    elapsed > ipst->ips_ip_defend_interval) {
   1604 		/*
   1605 		 * ip_defend_interval has elapsed.
   1606 		 * reset the defense count.
   1607 		 */
   1608 		ncec->ncec_defense_count = defs = 0;
   1609 	}
   1610 	ncec->ncec_defense_count++;
   1611 	ncec->ncec_last_time_defended = now;
   1612 	mutex_exit(&ncec->ncec_lock);
   1613 	ipif_refrele(ipif);
   1614 
   1615 	/*
   1616 	 * If we've defended ourselves too many times already, then give up and
   1617 	 * tear down the interface(s) using this address.
   1618 	 * Otherwise, caller has to defend by sending out an announce.
   1619 	 */
   1620 	if (defs >= maxdefense) {
   1621 		if (isv6)
   1622 			ndp_failure(mp, ira);
   1623 		else
   1624 			arp_failure(mp, ira);
   1625 	} else {
   1626 		return (B_TRUE); /* caller must defend this address */
   1627 	}
   1628 	return (B_FALSE);
   1629 }
   1630 
   1631 /*
   1632  * Handle reception of Neighbor Solicitation messages.
   1633  */
   1634 static void
   1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
   1636 {
   1637 	ill_t		*ill = ira->ira_ill, *under_ill;
   1638 	nd_neighbor_solicit_t *ns;
   1639 	uint32_t	hlen = ill->ill_phys_addr_length;
   1640 	uchar_t		*haddr = NULL;
   1641 	icmp6_t		*icmp_nd;
   1642 	ip6_t		*ip6h;
   1643 	ncec_t		*our_ncec = NULL;
   1644 	in6_addr_t	target;
   1645 	in6_addr_t	src;
   1646 	int		len;
   1647 	int		flag = 0;
   1648 	nd_opt_hdr_t	*opt = NULL;
   1649 	boolean_t	bad_solicit = B_FALSE;
   1650 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   1651 	boolean_t	need_ill_refrele = B_FALSE;
   1652 
   1653 	ip6h = (ip6_t *)mp->b_rptr;
   1654 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1655 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   1656 	src = ip6h->ip6_src;
   1657 	ns = (nd_neighbor_solicit_t *)icmp_nd;
   1658 	target = ns->nd_ns_target;
   1659 	if (IN6_IS_ADDR_MULTICAST(&target)) {
   1660 		if (ip_debug > 2) {
   1661 			/* ip1dbg */
   1662 			pr_addr_dbg("ndp_input_solicit: Target is"
   1663 			    " multicast! %s\n", AF_INET6, &target);
   1664 		}
   1665 		bad_solicit = B_TRUE;
   1666 		goto done;
   1667 	}
   1668 	if (len > sizeof (nd_neighbor_solicit_t)) {
   1669 		/* Options present */
   1670 		opt = (nd_opt_hdr_t *)&ns[1];
   1671 		len -= sizeof (nd_neighbor_solicit_t);
   1672 		if (!ndp_verify_optlen(opt, len)) {
   1673 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
   1674 			bad_solicit = B_TRUE;
   1675 			goto done;
   1676 		}
   1677 	}
   1678 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
   1679 		/* Check to see if this is a valid DAD solicitation */
   1680 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
   1681 			if (ip_debug > 2) {
   1682 				/* ip1dbg */
   1683 				pr_addr_dbg("ndp_input_solicit: IPv6 "
   1684 				    "Destination is not solicited node "
   1685 				    "multicast %s\n", AF_INET6,
   1686 				    &ip6h->ip6_dst);
   1687 			}
   1688 			bad_solicit = B_TRUE;
   1689 			goto done;
   1690 		}
   1691 	}
   1692 
   1693 	/*
   1694 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
   1695 	 * received this packet if it's multicast) is not the ill tied to
   1696 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
   1697 	 * to ensure we find the associated NCE.
   1698 	 */
   1699 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
   1700 	/*
   1701 	 * If this is a valid Solicitation for an address we are publishing,
   1702 	 * then a PUBLISH entry should exist in the cache
   1703 	 */
   1704 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
   1705 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
   1706 		    "ifname=%s ", ill->ill_name));
   1707 		if (ip_debug > 2) {
   1708 			/* ip1dbg */
   1709 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
   1710 		}
   1711 		if (our_ncec == NULL)
   1712 			bad_solicit = B_TRUE;
   1713 		goto done;
   1714 	}
   1715 
   1716 	/* At this point we should have a verified NS per spec */
   1717 	if (opt != NULL) {
   1718 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
   1719 		if (opt != NULL) {
   1720 			haddr = (uchar_t *)&opt[1];
   1721 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
   1722 			    hlen == 0) {
   1723 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
   1724 				bad_solicit = B_TRUE;
   1725 				goto done;
   1726 			}
   1727 		}
   1728 	}
   1729 
   1730 	/* If sending directly to peer, set the unicast flag */
   1731 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
   1732 		flag |= NDP_UNICAST;
   1733 
   1734 	/*
   1735 	 * Create/update the entry for the soliciting node on the ipmp_ill.
   1736 	 * or respond to outstanding queries, don't if
   1737 	 * the source is unspecified address.
   1738 	 */
   1739 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
   1740 		int	err;
   1741 		nce_t	*nnce;
   1742 
   1743 		ASSERT(ill->ill_isv6);
   1744 		/*
   1745 		 * Regular solicitations *must* include the Source Link-Layer
   1746 		 * Address option.  Ignore messages that do not.
   1747 		 */
   1748 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
   1749 			ip1dbg(("ndp_input_solicit: source link-layer address "
   1750 			    "option missing with a specified source.\n"));
   1751 			bad_solicit = B_TRUE;
   1752 			goto done;
   1753 		}
   1754 
   1755 		/*
   1756 		 * This is a regular solicitation.  If we're still in the
   1757 		 * process of verifying the address, then don't respond at all
   1758 		 * and don't keep track of the sender.
   1759 		 */
   1760 		if (our_ncec->ncec_state == ND_PROBE)
   1761 			goto done;
   1762 
   1763 		/*
   1764 		 * If the solicitation doesn't have sender hardware address
   1765 		 * (legal for unicast solicitation), then process without
   1766 		 * installing the return NCE.  Either we already know it, or
   1767 		 * we'll be forced to look it up when (and if) we reply to the
   1768 		 * packet.
   1769 		 */
   1770 		if (haddr == NULL)
   1771 			goto no_source;
   1772 
   1773 		under_ill = ill;
   1774 		if (IS_UNDER_IPMP(under_ill)) {
   1775 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
   1776 			if (ill == NULL)
   1777 				ill = under_ill;
   1778 			else
   1779 				need_ill_refrele = B_TRUE;
   1780 		}
   1781 		err = nce_lookup_then_add_v6(ill,
   1782 		    haddr, hlen,
   1783 		    &src,	/* Soliciting nodes address */
   1784 		    0,
   1785 		    ND_STALE,
   1786 		    &nnce);
   1787 
   1788 		if (need_ill_refrele) {
   1789 			ill_refrele(ill);
   1790 			ill = under_ill;
   1791 			need_ill_refrele =  B_FALSE;
   1792 		}
   1793 		switch (err) {
   1794 		case 0:
   1795 			/* done with this entry */
   1796 			nce_refrele(nnce);
   1797 			break;
   1798 		case EEXIST:
   1799 			/*
   1800 			 * B_FALSE indicates this is not an an advertisement.
   1801 			 */
   1802 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
   1803 			nce_refrele(nnce);
   1804 			break;
   1805 		default:
   1806 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
   1807 			    err));
   1808 			goto done;
   1809 		}
   1810 no_source:
   1811 		flag |= NDP_SOLICITED;
   1812 	} else {
   1813 		/*
   1814 		 * No source link layer address option should be present in a
   1815 		 * valid DAD request.
   1816 		 */
   1817 		if (haddr != NULL) {
   1818 			ip1dbg(("ndp_input_solicit: source link-layer address "
   1819 			    "option present with an unspecified source.\n"));
   1820 			bad_solicit = B_TRUE;
   1821 			goto done;
   1822 		}
   1823 		if (our_ncec->ncec_state == ND_PROBE) {
   1824 			/*
   1825 			 * Internally looped-back probes will have
   1826 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
   1827 			 * transmissions.
   1828 			 */
   1829 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
   1830 				/*
   1831 				 * If someone else is probing our address, then
   1832 				 * we've crossed wires.  Declare failure.
   1833 				 */
   1834 				ndp_failure(mp, ira);
   1835 			}
   1836 			goto done;
   1837 		}
   1838 		/*
   1839 		 * This is a DAD probe.  Multicast the advertisement to the
   1840 		 * all-nodes address.
   1841 		 */
   1842 		src = ipv6_all_hosts_mcast;
   1843 	}
   1844 	flag |= nce_advert_flags(our_ncec);
   1845 	(void) ndp_xmit(ill,
   1846 	    ND_NEIGHBOR_ADVERT,
   1847 	    our_ncec->ncec_lladdr,
   1848 	    our_ncec->ncec_lladdr_length,
   1849 	    &target,	/* Source and target of the advertisement pkt */
   1850 	    &src,	/* IP Destination (source of original pkt) */
   1851 	    flag);
   1852 done:
   1853 	if (bad_solicit)
   1854 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
   1855 	if (our_ncec != NULL)
   1856 		ncec_refrele(our_ncec);
   1857 }
   1858 
   1859 /*
   1860  * Handle reception of Neighbor Solicitation messages
   1861  */
   1862 void
   1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
   1864 {
   1865 	ill_t		*ill = ira->ira_ill;
   1866 	nd_neighbor_advert_t *na;
   1867 	uint32_t	hlen = ill->ill_phys_addr_length;
   1868 	uchar_t		*haddr = NULL;
   1869 	icmp6_t		*icmp_nd;
   1870 	ip6_t		*ip6h;
   1871 	ncec_t		*dst_ncec = NULL;
   1872 	in6_addr_t	target;
   1873 	nd_opt_hdr_t	*opt = NULL;
   1874 	int		len;
   1875 	ip_stack_t	*ipst = ill->ill_ipst;
   1876 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   1877 
   1878 	ip6h = (ip6_t *)mp->b_rptr;
   1879 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1880 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   1881 	na = (nd_neighbor_advert_t *)icmp_nd;
   1882 
   1883 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
   1884 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
   1885 		ip1dbg(("ndp_input_advert: Target is multicast but the "
   1886 		    "solicited flag is not zero\n"));
   1887 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1888 		return;
   1889 	}
   1890 	target = na->nd_na_target;
   1891 	if (IN6_IS_ADDR_MULTICAST(&target)) {
   1892 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
   1893 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1894 		return;
   1895 	}
   1896 	if (len > sizeof (nd_neighbor_advert_t)) {
   1897 		opt = (nd_opt_hdr_t *)&na[1];
   1898 		if (!ndp_verify_optlen(opt,
   1899 		    len - sizeof (nd_neighbor_advert_t))) {
   1900 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
   1901 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1902 			return;
   1903 		}
   1904 		/* At this point we have a verified NA per spec */
   1905 		len -= sizeof (nd_neighbor_advert_t);
   1906 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
   1907 		if (opt != NULL) {
   1908 			haddr = (uchar_t *)&opt[1];
   1909 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
   1910 			    hlen == 0) {
   1911 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
   1912 				BUMP_MIB(mib,
   1913 				    ipv6IfIcmpInBadNeighborAdvertisements);
   1914 				return;
   1915 			}
   1916 		}
   1917 	}
   1918 
   1919 	/*
   1920 	 * NOTE: we match across the illgrp since we need to do DAD for all of
   1921 	 * our local addresses, and those are spread across all the active
   1922 	 * ills in the group.
   1923 	 */
   1924 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
   1925 		return;
   1926 
   1927 	if (NCE_PUBLISH(dst_ncec)) {
   1928 		/*
   1929 		 * Someone just advertised an addresses that we publish. First,
   1930 		 * check it it was us -- if so, we can safely ignore it.
   1931 		 * We don't get the haddr from the ira_l2src because, in the
   1932 		 * case that the packet originated from us, on an IPMP group,
   1933 		 * the ira_l2src may would be the link-layer address of the
   1934 		 * cast_ill used to send the packet, which may not be the same
   1935 		 * as the dst_ncec->ncec_lladdr of the address.
   1936 		 */
   1937 		if (haddr != NULL) {
   1938 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
   1939 				goto out;
   1940 
   1941 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
   1942 				goto out;   /* from us -- no conflict */
   1943 
   1944 			/*
   1945 			 * If we're in an IPMP group, check if this is an echo
   1946 			 * from another ill in the group.  Use the double-
   1947 			 * checked locking pattern to avoid grabbing
   1948 			 * ill_g_lock in the non-IPMP case.
   1949 			 */
   1950 			if (IS_UNDER_IPMP(ill)) {
   1951 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1952 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
   1953 				    ill->ill_grp, haddr, hlen) != NULL) {
   1954 					rw_exit(&ipst->ips_ill_g_lock);
   1955 					goto out;
   1956 				}
   1957 				rw_exit(&ipst->ips_ill_g_lock);
   1958 			}
   1959 		}
   1960 
   1961 		/*
   1962 		 * This appears to be a real conflict.  If we're trying to
   1963 		 * configure this NCE (ND_PROBE), then shut it down.
   1964 		 * Otherwise, handle the discovered conflict.
   1965 		 */
   1966 		if (dst_ncec->ncec_state == ND_PROBE) {
   1967 			ndp_failure(mp, ira);
   1968 		} else {
   1969 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
   1970 				char hbuf[MAC_STR_LEN];
   1971 				char sbuf[INET6_ADDRSTRLEN];
   1972 
   1973 				cmn_err(CE_WARN,
   1974 				    "node '%s' is using %s on %s",
   1975 				    inet_ntop(AF_INET6, &target, sbuf,
   1976 				    sizeof (sbuf)),
   1977 				    haddr == NULL ? "<none>" :
   1978 				    mac_colon_addr(haddr, hlen, hbuf,
   1979 				    sizeof (hbuf)), ill->ill_name);
   1980 				/*
   1981 				 * RFC 4862, Section 5.4.4 does not mandate
   1982 				 * any specific behavior when an NA matches
   1983 				 * a non-tentative address assigned to the
   1984 				 * receiver. We make the choice of defending
   1985 				 * our address, based on the assumption that
   1986 				 * the sender has not detected the Duplicate.
   1987 				 *
   1988 				 * ncec_last_time_defended has been adjusted
   1989 				 * in ip_nce_conflict()
   1990 				 */
   1991 				(void) ndp_announce(dst_ncec);
   1992 			}
   1993 		}
   1994 	} else {
   1995 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
   1996 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
   1997 
   1998 		/* B_TRUE indicates this an advertisement */
   1999 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
   2000 	}
   2001 out:
   2002 	ncec_refrele(dst_ncec);
   2003 }
   2004 
   2005 /*
   2006  * Process NDP neighbor solicitation/advertisement messages.
   2007  * The checksum has already checked o.k before reaching here.
   2008  * Information about the datalink header is contained in ira_l2src, but
   2009  * that should be ignored for loopback packets.
   2010  */
   2011 void
   2012 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
   2013 {
   2014 	ill_t		*ill = ira->ira_rill;
   2015 	icmp6_t		*icmp_nd;
   2016 	ip6_t		*ip6h;
   2017 	int		len;
   2018 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   2019 	ill_t		*orig_ill = NULL;
   2020 
   2021 	/*
   2022 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
   2023 	 * and make it be the IPMP upper so avoid being confused by a packet
   2024 	 * addressed to a unicast address on a different ill.
   2025 	 */
   2026 	if (IS_UNDER_IPMP(ill)) {
   2027 		orig_ill = ill;
   2028 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
   2029 		if (ill == NULL) {
   2030 			ill = orig_ill;
   2031 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2032 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
   2033 			    mp, ill);
   2034 			freemsg(mp);
   2035 			return;
   2036 		}
   2037 		ASSERT(ill != orig_ill);
   2038 		orig_ill = ira->ira_ill;
   2039 		ira->ira_ill = ill;
   2040 		mib = ill->ill_icmp6_mib;
   2041 	}
   2042 	if (!pullupmsg(mp, -1)) {
   2043 		ip1dbg(("ndp_input: pullupmsg failed\n"));
   2044 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2045 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
   2046 		goto done;
   2047 	}
   2048 	ip6h = (ip6_t *)mp->b_rptr;
   2049 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
   2050 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
   2051 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
   2052 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
   2053 		goto done;
   2054 	}
   2055 	/*
   2056 	 * NDP does not accept any extension headers between the
   2057 	 * IP header and the ICMP header since e.g. a routing
   2058 	 * header could be dangerous.
   2059 	 * This assumes that any AH or ESP headers are removed
   2060 	 * by ip prior to passing the packet to ndp_input.
   2061 	 */
   2062 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
   2063 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
   2064 		    ip6h->ip6_nxt));
   2065 		ip_drop_input("Wrong next header", mp, ill);
   2066 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2067 		goto done;
   2068 	}
   2069 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   2070 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
   2071 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
   2072 	if (icmp_nd->icmp6_code != 0) {
   2073 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
   2074 		ip_drop_input("code non-zero", mp, ill);
   2075 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2076 		goto done;
   2077 	}
   2078 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   2079 	/*
   2080 	 * Make sure packet length is large enough for either
   2081 	 * a NS or a NA icmp packet.
   2082 	 */
   2083 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
   2084 		ip1dbg(("ndp_input: packet too short\n"));
   2085 		ip_drop_input("packet too short", mp, ill);
   2086 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2087 		goto done;
   2088 	}
   2089 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
   2090 		ndp_input_solicit(mp, ira);
   2091 	} else {
   2092 		ndp_input_advert(mp, ira);
   2093 	}
   2094 done:
   2095 	freemsg(mp);
   2096 	if (orig_ill != NULL) {
   2097 		ill_refrele(ill);
   2098 		ira->ira_ill = orig_ill;
   2099 	}
   2100 }
   2101 
   2102 /*
   2103  * ndp_xmit is called to form and transmit a ND solicitation or
   2104  * advertisement ICMP packet.
   2105  *
   2106  * If the source address is unspecified and this isn't a probe (used for
   2107  * duplicate address detection), an appropriate source address and link layer
   2108  * address will be chosen here.  The link layer address option is included if
   2109  * the source is specified (i.e., all non-probe packets), and omitted (per the
   2110  * specification) otherwise.
   2111  *
   2112  * It returns B_FALSE only if it does a successful put() to the
   2113  * corresponding ill's ill_wq otherwise returns B_TRUE.
   2114  */
   2115 static boolean_t
   2116 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
   2117     const in6_addr_t *sender, const in6_addr_t *target, int flag)
   2118 {
   2119 	uint32_t	len;
   2120 	icmp6_t 	*icmp6;
   2121 	mblk_t		*mp;
   2122 	ip6_t		*ip6h;
   2123 	nd_opt_hdr_t	*opt;
   2124 	uint_t		plen;
   2125 	zoneid_t	zoneid = GLOBAL_ZONEID;
   2126 	ill_t		*hwaddr_ill = ill;
   2127 	ip_xmit_attr_t	ixas;
   2128 	ip_stack_t	*ipst = ill->ill_ipst;
   2129 	boolean_t	need_refrele = B_FALSE;
   2130 	boolean_t	probe = B_FALSE;
   2131 
   2132 	if (IS_UNDER_IPMP(ill)) {
   2133 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
   2134 		/*
   2135 		 * We send non-probe packets on the upper IPMP interface.
   2136 		 * ip_output_simple() will use cast_ill for sending any
   2137 		 * multicast packets. Note that we can't follow the same
   2138 		 * logic for probe packets because all interfaces in the ipmp
   2139 		 * group may have failed, so that we really want to only try
   2140 		 * to send the ND packet on the ill corresponding to the src
   2141 		 * address.
   2142 		 */
   2143 		if (!probe) {
   2144 			ill = ipmp_ill_hold_ipmp_ill(ill);
   2145 			if (ill != NULL)
   2146 				need_refrele = B_TRUE;
   2147 			else
   2148 				ill = hwaddr_ill;
   2149 		}
   2150 	}
   2151 
   2152 	/*
   2153 	 * If we have a unspecified source(sender) address, select a
   2154 	 * proper source address for the solicitation here itself so
   2155 	 * that we can initialize the h/w address correctly.
   2156 	 *
   2157 	 * If the sender is specified then we use this address in order
   2158 	 * to lookup the zoneid before calling ip_output_v6(). This is to
   2159 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
   2160 	 * by IP (we cannot guarantee that the global zone has an interface
   2161 	 * route to the destination).
   2162 	 *
   2163 	 * Note that the NA never comes here with the unspecified source
   2164 	 * address.
   2165 	 */
   2166 
   2167 	/*
   2168 	 * Probes will have unspec src at this point.
   2169 	 */
   2170 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
   2171 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
   2172 		/*
   2173 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
   2174 		 * ALL_ZONES if it cannot find a matching ipif for the address
   2175 		 * we are trying to use. In this case we err on the side of
   2176 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
   2177 		 */
   2178 		if (zoneid == ALL_ZONES)
   2179 			zoneid = GLOBAL_ZONEID;
   2180 	}
   2181 
   2182 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
   2183 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
   2184 	mp = allocb(len,  BPRI_LO);
   2185 	if (mp == NULL) {
   2186 		if (need_refrele)
   2187 			ill_refrele(ill);
   2188 		return (B_TRUE);
   2189 	}
   2190 
   2191 	bzero((char *)mp->b_rptr, len);
   2192 	mp->b_wptr = mp->b_rptr + len;
   2193 
   2194 	bzero(&ixas, sizeof (ixas));
   2195 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
   2196 
   2197 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
   2198 	ixas.ixa_ipst = ipst;
   2199 	ixas.ixa_cred = kcred;
   2200 	ixas.ixa_cpid = NOPID;
   2201 	ixas.ixa_tsl = NULL;
   2202 	ixas.ixa_zoneid = zoneid;
   2203 
   2204 	ip6h = (ip6_t *)mp->b_rptr;
   2205 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
   2206 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
   2207 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
   2208 	ip6h->ip6_hops = IPV6_MAX_HOPS;
   2209 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
   2210 	ip6h->ip6_dst = *target;
   2211 	icmp6 = (icmp6_t *)&ip6h[1];
   2212 
   2213 	if (hw_addr_len != 0) {
   2214 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
   2215 		    sizeof (nd_neighbor_advert_t));
   2216 	} else {
   2217 		opt = NULL;
   2218 	}
   2219 	if (operation == ND_NEIGHBOR_SOLICIT) {
   2220 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
   2221 
   2222 		if (opt != NULL && !(flag & NDP_PROBE)) {
   2223 			/*
   2224 			 * Note that we don't send out SLLA for ND probes
   2225 			 * per RFC 4862, even though we do send out the src
   2226 			 * haddr for IPv4 DAD probes, even though both IPv4
   2227 			 * and IPv6 go out with the unspecified/INADDR_ANY
   2228 			 * src IP addr.
   2229 			 */
   2230 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
   2231 		}
   2232 		ip6h->ip6_src = *sender;
   2233 		ns->nd_ns_target = *target;
   2234 		if (!(flag & NDP_UNICAST)) {
   2235 			/* Form multicast address of the target */
   2236 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
   2237 			ip6h->ip6_dst.s6_addr32[3] |=
   2238 			    ns->nd_ns_target.s6_addr32[3];
   2239 		}
   2240 	} else {
   2241 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
   2242 
   2243 		ASSERT(!(flag & NDP_PROBE));
   2244 		if (opt != NULL)
   2245 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
   2246 		ip6h->ip6_src = *sender;
   2247 		na->nd_na_target = *sender;
   2248 		if (flag & NDP_ISROUTER)
   2249 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
   2250 		if (flag & NDP_SOLICITED)
   2251 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
   2252 		if (flag & NDP_ORIDE)
   2253 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
   2254 	}
   2255 
   2256 	if (!(flag & NDP_PROBE)) {
   2257 		if (hw_addr != NULL && opt != NULL) {
   2258 			/* Fill in link layer address and option len */
   2259 			opt->nd_opt_len = (uint8_t)plen;
   2260 			bcopy(hw_addr, &opt[1], hw_addr_len);
   2261 		}
   2262 	}
   2263 	if (opt != NULL && opt->nd_opt_type == 0) {
   2264 		/* If there's no link layer address option, then strip it. */
   2265 		len -= plen * 8;
   2266 		mp->b_wptr = mp->b_rptr + len;
   2267 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
   2268 	}
   2269 
   2270 	icmp6->icmp6_type = (uint8_t)operation;
   2271 	icmp6->icmp6_code = 0;
   2272 	/*
   2273 	 * Prepare for checksum by putting icmp length in the icmp
   2274 	 * checksum field. The checksum is calculated in ip_output.c.
   2275 	 */
   2276 	icmp6->icmp6_cksum = ip6h->ip6_plen;
   2277 
   2278 	(void) ip_output_simple(mp, &ixas);
   2279 	ixa_cleanup(&ixas);
   2280 	if (need_refrele)
   2281 		ill_refrele(ill);
   2282 	return (B_FALSE);
   2283 }
   2284 
   2285 /*
   2286  * Used to set ND_UNREACHABLE before ncec_delete sets NCE_F_CONDEMNED.
   2287  * The datapath uses this as an indication that there is a problem
   2288  * (as opposed to an NCE that was just reclaimed due to lack of memory).
   2289  * The state is changed while holding ncec_lock.
   2290  * Note that static ARP entries never become unreachable.
   2291  */
   2292 void
   2293 nce_make_unreachable(ncec_t *ncec)
   2294 {
   2295 	mutex_enter(&ncec->ncec_lock);
   2296 	ncec->ncec_state = ND_UNREACHABLE;
   2297 	mutex_exit(&ncec->ncec_lock);
   2298 }
   2299 
   2300 /*
   2301  * NCE retransmit timer. Common to IPv4 and IPv6.
   2302  * This timer goes off when:
   2303  * a. It is time to retransmit a resolution for resolver.
   2304  * b. It is time to send reachability probes.
   2305  */
   2306 void
   2307 nce_timer(void *arg)
   2308 {
   2309 	ncec_t		*ncec = arg;
   2310 	ill_t		*ill = ncec->ncec_ill, *src_ill;
   2311 	char		addrbuf[INET6_ADDRSTRLEN];
   2312 	boolean_t	dropped = B_FALSE;
   2313 	ip_stack_t	*ipst = ncec->ncec_ipst;
   2314 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   2315 	in_addr_t	sender4 = INADDR_ANY;
   2316 	in6_addr_t	sender6 = ipv6_all_zeros;
   2317 
   2318 	/*
   2319 	 * The timer has to be cancelled by ncec_delete before doing the final
   2320 	 * refrele. So the NCE is guaranteed to exist when the timer runs
   2321 	 * until it clears the timeout_id. Before clearing the timeout_id
   2322 	 * bump up the refcnt so that we can continue to use the ncec
   2323 	 */
   2324 	ASSERT(ncec != NULL);
   2325 	mutex_enter(&ncec->ncec_lock);
   2326 	ncec_refhold_locked(ncec);
   2327 	ncec->ncec_timeout_id = 0;
   2328 	mutex_exit(&ncec->ncec_lock);
   2329 
   2330 	src_ill = nce_resolve_src(ncec, &sender6);
   2331 	/* if we could not find a sender address, return */
   2332 	if (src_ill == NULL) {
   2333 		if (!isv6) {
   2334 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
   2335 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
   2336 			    &sender4, addrbuf, sizeof (addrbuf))));
   2337 		} else {
   2338 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
   2339 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2340 		}
   2341 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
   2342 		ncec_refrele(ncec);
   2343 		return;
   2344 	}
   2345 	if (!isv6)
   2346 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
   2347 
   2348 	mutex_enter(&ncec->ncec_lock);
   2349 	/*
   2350 	 * Check the reachability state.
   2351 	 */
   2352 	switch (ncec->ncec_state) {
   2353 	case ND_DELAY:
   2354 		ASSERT(ncec->ncec_lladdr != NULL);
   2355 		ncec->ncec_state = ND_PROBE;
   2356 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   2357 		if (isv6) {
   2358 			mutex_exit(&ncec->ncec_lock);
   2359 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
   2360 			    src_ill->ill_phys_addr,
   2361 			    src_ill->ill_phys_addr_length,
   2362 			    &sender6, &ncec->ncec_addr,
   2363 			    NDP_UNICAST);
   2364 		} else {
   2365 			dropped = arp_request(ncec, sender4, src_ill);
   2366 			mutex_exit(&ncec->ncec_lock);
   2367 		}
   2368 		if (!dropped) {
   2369 			mutex_enter(&ncec->ncec_lock);
   2370 			ncec->ncec_pcnt--;
   2371 			mutex_exit(&ncec->ncec_lock);
   2372 		}
   2373 		if (ip_debug > 3) {
   2374 			/* ip2dbg */
   2375 			pr_addr_dbg("nce_timer: state for %s changed "
   2376 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
   2377 		}
   2378 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
   2379 		break;
   2380 	case ND_PROBE:
   2381 		/* must be retransmit timer */
   2382 		ASSERT(ncec->ncec_pcnt >= -1);
   2383 		if (ncec->ncec_pcnt > 0) {
   2384 			/*
   2385 			 * As per RFC2461, the ncec gets deleted after
   2386 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
   2387 			 * Note that the first unicast solicitation is sent
   2388 			 * during the DELAY state.
   2389 			 */
   2390 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
   2391 			    ncec->ncec_pcnt,
   2392 			    inet_ntop((isv6? AF_INET6 : AF_INET),
   2393 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2394 			if (NCE_PUBLISH(ncec)) {
   2395 				mutex_exit(&ncec->ncec_lock);
   2396 				/*
   2397 				 * send out a probe; note that src_ill
   2398 				 * is ignored by nce_dad() for all
   2399 				 * DAD message types other than IPv6
   2400 				 * unicast probes
   2401 				 */
   2402 				nce_dad(ncec, src_ill, B_TRUE);
   2403 			} else {
   2404 				ASSERT(src_ill != NULL);
   2405 				if (isv6) {
   2406 					mutex_exit(&ncec->ncec_lock);
   2407 					dropped = ndp_xmit(src_ill,
   2408 					    ND_NEIGHBOR_SOLICIT,
   2409 					    src_ill->ill_phys_addr,
   2410 					    src_ill->ill_phys_addr_length,
   2411 					    &sender6, &ncec->ncec_addr,
   2412 					    NDP_UNICAST);
   2413 				} else {
   2414 					/*
   2415 					 * since the nce is REACHABLE,
   2416 					 * the ARP request will be sent out
   2417 					 * as a link-layer unicast.
   2418 					 */
   2419 					dropped = arp_request(ncec, sender4,
   2420 					    src_ill);
   2421 					mutex_exit(&ncec->ncec_lock);
   2422 				}
   2423 				if (!dropped) {
   2424 					mutex_enter(&ncec->ncec_lock);
   2425 					ncec->ncec_pcnt--;
   2426 					mutex_exit(&ncec->ncec_lock);
   2427 				}
   2428 				nce_restart_timer(ncec,
   2429 				    ill->ill_reachable_retrans_time);
   2430 			}
   2431 		} else if (ncec->ncec_pcnt < 0) {
   2432 			/* No hope, delete the ncec */
   2433 			/* Tell datapath it went bad */
   2434 			ncec->ncec_state = ND_UNREACHABLE;
   2435 			mutex_exit(&ncec->ncec_lock);
   2436 			if (ip_debug > 2) {
   2437 				/* ip1dbg */
   2438 				pr_addr_dbg("nce_timer: Delete NCE for"
   2439 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
   2440 				    &ncec->ncec_addr);
   2441 			}
   2442 			/* if static ARP can't delete. */
   2443 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
   2444 				ncec_delete(ncec);
   2445 
   2446 		} else if (!NCE_PUBLISH(ncec)) {
   2447 			/*
   2448 			 * Probe count is 0 for a dynamic entry (one that we
   2449 			 * ourselves are not publishing). We should never get
   2450 			 * here if NONUD was requested, hence the ASSERT below.
   2451 			 */
   2452 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
   2453 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
   2454 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
   2455 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2456 			ncec->ncec_pcnt--;
   2457 			mutex_exit(&ncec->ncec_lock);
   2458 			/* Wait one interval before killing */
   2459 			nce_restart_timer(ncec,
   2460 			    ill->ill_reachable_retrans_time);
   2461 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
   2462 			ipif_t *ipif;
   2463 			ipaddr_t ncec_addr;
   2464 
   2465 			/*
   2466 			 * We're done probing, and we can now declare this
   2467 			 * address to be usable.  Let IP know that it's ok to
   2468 			 * use.
   2469 			 */
   2470 			ncec->ncec_state = ND_REACHABLE;
   2471 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
   2472 			mutex_exit(&ncec->ncec_lock);
   2473 			if (isv6) {
   2474 				ipif = ipif_lookup_addr_exact_v6(
   2475 				    &ncec->ncec_addr, ill, ipst);
   2476 			} else {
   2477 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
   2478 				    ncec_addr);
   2479 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
   2480 				    ipst);
   2481 			}
   2482 			if (ipif != NULL) {
   2483 				if (ipif->ipif_was_dup) {
   2484 					char ibuf[LIFNAMSIZ];
   2485 					char sbuf[INET6_ADDRSTRLEN];
   2486 
   2487 					ipif->ipif_was_dup = B_FALSE;
   2488 					(void) inet_ntop(AF_INET6,
   2489 					    &ipif->ipif_v6lcl_addr,
   2490 					    sbuf, sizeof (sbuf));
   2491 					ipif_get_name(ipif, ibuf,
   2492 					    sizeof (ibuf));
   2493 					cmn_err(CE_NOTE, "recovered address "
   2494 					    "%s on %s", sbuf, ibuf);
   2495 				}
   2496 				if ((ipif->ipif_flags & IPIF_UP) &&
   2497 				    !ipif->ipif_addr_ready)
   2498 					ipif_up_notify(ipif);
   2499 				ipif->ipif_addr_ready = 1;
   2500 				ipif_refrele(ipif);
   2501 			}
   2502 			if (!isv6 && arp_no_defense)
   2503 				break;
   2504 			/* Begin defending our new address */
   2505 			if (ncec->ncec_unsolicit_count > 0) {
   2506 				ncec->ncec_unsolicit_count--;
   2507 				if (isv6) {
   2508 					dropped = ndp_announce(ncec);
   2509 				} else {
   2510 					dropped = arp_announce(ncec);
   2511 				}
   2512 
   2513 				if (dropped)
   2514 					ncec->ncec_unsolicit_count++;
   2515 				else
   2516 					ncec->ncec_last_time_defended =
   2517 					    ddi_get_lbolt();
   2518 			}
   2519 			if (ncec->ncec_unsolicit_count > 0) {
   2520 				nce_restart_timer(ncec,
   2521 				    ANNOUNCE_INTERVAL(isv6));
   2522 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
   2523 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
   2524 			}
   2525 		} else {
   2526 			/*
   2527 			 * This is an address we're probing to be our own, but
   2528 			 * the ill is down.  Wait until it comes back before
   2529 			 * doing anything, but switch to reachable state so
   2530 			 * that the restart will work.
   2531 			 */
   2532 			ncec->ncec_state = ND_REACHABLE;
   2533 			mutex_exit(&ncec->ncec_lock);
   2534 		}
   2535 		break;
   2536 	case ND_INCOMPLETE: {
   2537 		mblk_t	*mp, *nextmp;
   2538 		mblk_t	**prevmpp;
   2539 
   2540 		/*
   2541 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
   2542 		 * for any IPMP probe packets, and toss them.  IPMP probe
   2543 		 * packets will always be at the head of ncec_qd_mp, so that
   2544 		 * we can stop at the first queued ND packet that is
   2545 		 * not a probe packet.
   2546 		 */
   2547 		prevmpp = &ncec->ncec_qd_mp;
   2548 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
   2549 			nextmp = mp->b_next;
   2550 
   2551 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
   2552 				inet_freemsg(mp);
   2553 				ncec->ncec_nprobes--;
   2554 				*prevmpp = nextmp;
   2555 			} else {
   2556 				prevmpp = &mp->b_next;
   2557 			}
   2558 		}
   2559 
   2560 		/*
   2561 		 * Must be resolver's retransmit timer.
   2562 		 */
   2563 		mutex_exit(&ncec->ncec_lock);
   2564 		ip_ndp_resolve(ncec);
   2565 		break;
   2566 	}
   2567 	case ND_REACHABLE:
   2568 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
   2569 		    ncec->ncec_unsolicit_count != 0) ||
   2570 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
   2571 			if (ncec->ncec_unsolicit_count > 0) {
   2572 				ncec->ncec_unsolicit_count--;
   2573 				mutex_exit(&ncec->ncec_lock);
   2574 				/*
   2575 				 * When we get to zero announcements left,
   2576 				 * switch to address defense
   2577 				 */
   2578 			} else {
   2579 				boolean_t rate_limit;
   2580 
   2581 				mutex_exit(&ncec->ncec_lock);
   2582 				rate_limit = ill_defend_rate_limit(ill, ncec);
   2583 				if (rate_limit) {
   2584 					nce_restart_timer(ncec,
   2585 					    DEFENSE_INTERVAL(isv6));
   2586 					break;
   2587 				}
   2588 			}
   2589 			if (isv6) {
   2590 				dropped = ndp_announce(ncec);
   2591 			} else {
   2592 				dropped = arp_announce(ncec);
   2593 			}
   2594 			mutex_enter(&ncec->ncec_lock);
   2595 			if (dropped) {
   2596 				ncec->ncec_unsolicit_count++;
   2597 			} else {
   2598 				ncec->ncec_last_time_defended =
   2599 				    ddi_get_lbolt();
   2600 			}
   2601 			mutex_exit(&ncec->ncec_lock);
   2602 			if (ncec->ncec_unsolicit_count != 0) {
   2603 				nce_restart_timer(ncec,
   2604 				    ANNOUNCE_INTERVAL(isv6));
   2605 			} else {
   2606 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
   2607 			}
   2608 		} else {
   2609 			mutex_exit(&ncec->ncec_lock);
   2610 		}
   2611 		break;
   2612 	default:
   2613 		mutex_exit(&ncec->ncec_lock);
   2614 		break;
   2615 	}
   2616 done:
   2617 	ncec_refrele(ncec);
   2618 	ill_refrele(src_ill);
   2619 }
   2620 
   2621 /*
   2622  * Set a link layer address from the ll_addr passed in.
   2623  * Copy SAP from ill.
   2624  */
   2625 static void
   2626 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
   2627 {
   2628 	ill_t	*ill = ncec->ncec_ill;
   2629 
   2630 	ASSERT(ll_addr != NULL);
   2631 	if (ill->ill_phys_addr_length > 0) {
   2632 		/*
   2633 		 * The bcopy() below used to be called for the physical address
   2634 		 * length rather than the link layer address length. For
   2635 		 * ethernet and many other media, the phys_addr and lla are
   2636 		 * identical.
   2637 		 *
   2638 		 * The phys_addr and lla may not be the same for devices that
   2639 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
   2640 		 * no known instances of these.
   2641 		 *
   2642 		 * For PPP or other interfaces with a zero length
   2643 		 * physical address, don't do anything here.
   2644 		 * The bcopy() with a zero phys_addr length was previously
   2645 		 * a no-op for interfaces with a zero-length physical address.
   2646 		 * Using the lla for them would change the way they operate.
   2647 		 * Doing nothing in such cases preserves expected behavior.
   2648 		 */
   2649 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
   2650 	}
   2651 }
   2652 
   2653 boolean_t
   2654 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
   2655     uint32_t ll_addr_len)
   2656 {
   2657 	ASSERT(ncec->ncec_lladdr != NULL);
   2658 	if (ll_addr == NULL)
   2659 		return (B_FALSE);
   2660 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
   2661 		return (B_TRUE);
   2662 	return (B_FALSE);
   2663 }
   2664 
   2665 /*
   2666  * Updates the link layer address or the reachability state of
   2667  * a cache entry.  Reset probe counter if needed.
   2668  */
   2669 void
   2670 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
   2671 {
   2672 	ill_t	*ill = ncec->ncec_ill;
   2673 	boolean_t need_stop_timer = B_FALSE;
   2674 	boolean_t need_fastpath_update = B_FALSE;
   2675 	nce_t	*nce = NULL;
   2676 	timeout_id_t tid;
   2677 
   2678 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2679 	/*
   2680 	 * If this interface does not do NUD, there is no point
   2681 	 * in allowing an update to the cache entry.  Although
   2682 	 * we will respond to NS.
   2683 	 * The only time we accept an update for a resolver when
   2684 	 * NUD is turned off is when it has just been created.
   2685 	 * Non-Resolvers will always be created as REACHABLE.
   2686 	 */
   2687 	if (new_state != ND_UNCHANGED) {
   2688 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
   2689 		    (ncec->ncec_state != ND_INCOMPLETE))
   2690 			return;
   2691 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
   2692 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
   2693 		need_stop_timer = B_TRUE;
   2694 		if (new_state == ND_REACHABLE)
   2695 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
   2696 		else {
   2697 			/* We force NUD in this case */
   2698 			ncec->ncec_last = 0;
   2699 		}
   2700 		ncec->ncec_state = new_state;
   2701 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   2702 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
   2703 		    new_state == ND_INCOMPLETE);
   2704 	}
   2705 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
   2706 		tid = ncec->ncec_timeout_id;
   2707 		ncec->ncec_timeout_id = 0;
   2708 	}
   2709 	/*
   2710 	 * Re-trigger fastpath probe and
   2711 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
   2712 	 * whatever packets that happens to be transmitting at the time.
   2713 	 */
   2714 	if (new_ll_addr != NULL) {
   2715 		bcopy(new_ll_addr, ncec->ncec_lladdr,
   2716 		    ill->ill_phys_addr_length);
   2717 		need_fastpath_update = B_TRUE;
   2718 	}
   2719 	mutex_exit(&ncec->ncec_lock);
   2720 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
   2721 		if (tid != 0)
   2722 			(void) untimeout(tid);
   2723 	}
   2724 	if (need_fastpath_update) {
   2725 		/*
   2726 		 * Delete any existing existing dlur_mp and fp_mp information.
   2727 		 * For IPMP interfaces, all underlying ill's must be checked
   2728 		 * and purged.
   2729 		 */
   2730 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
   2731 		/*
   2732 		 * add the new dlur_mp and fp_mp
   2733 		 */
   2734 		nce = nce_fastpath(ncec, B_TRUE, NULL);
   2735 		if (nce != NULL)
   2736 			nce_refrele(nce);
   2737 	}
   2738 	mutex_enter(&ncec->ncec_lock);
   2739 }
   2740 
   2741 static void
   2742 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
   2743 {
   2744 	uint_t	count = 0;
   2745 	mblk_t  **mpp, *tmp;
   2746 
   2747 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2748 
   2749 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
   2750 		if (++count > ncec->ncec_ill->ill_max_buf) {
   2751 			tmp = ncec->ncec_qd_mp->b_next;
   2752 			ncec->ncec_qd_mp->b_next = NULL;
   2753 			/*
   2754 			 * if we never create data addrs on the under_ill
   2755 			 * does this matter?
   2756 			 */
   2757 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
   2758 			    ipIfStatsOutDiscards);
   2759 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
   2760 			    ncec->ncec_ill);
   2761 			freemsg(ncec->ncec_qd_mp);
   2762 			ncec->ncec_qd_mp = tmp;
   2763 		}
   2764 	}
   2765 
   2766 	if (head_insert) {
   2767 		ncec->ncec_nprobes++;
   2768 		mp->b_next = ncec->ncec_qd_mp;
   2769 		ncec->ncec_qd_mp = mp;
   2770 	} else {
   2771 		*mpp = mp;
   2772 	}
   2773 }
   2774 
   2775 /*
   2776  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
   2777  * queued at the head or tail of the queue based on the input argument
   2778  * 'head_insert'. The caller should specify this argument as B_TRUE if this
   2779  * packet is an IPMP probe packet, in which case the following happens:
   2780  *
   2781  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
   2782  *	(non-ipmp_probe) load-speading case where the source address of the ND
   2783  *	packet is not tied to ncec_ill. If the ill bound to the source address
   2784  *	cannot receive, the response to the ND packet will not be received.
   2785  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
   2786  *	packet, those probes will also fail to be sent, and thus in.mpathd will
   2787  *	 erroneously conclude that ncec_ill has also failed.
   2788  *
   2789  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
   2790  *	the first attempt.  This ensures that ND problems do not manifest as
   2791  *	probe RTT spikes.
   2792  *
   2793  * We achieve this by inserting ipmp_probe() packets at the head of the
   2794  * nce_queue.
   2795  *
   2796  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
   2797  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
   2798  */
   2799 void
   2800 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
   2801 {
   2802 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2803 	nce_queue_mp_common(ncec, mp, head_insert);
   2804 }
   2805 
   2806 /*
   2807  * Called when address resolution failed due to a timeout.
   2808  * Send an ICMP unreachable in response to all queued packets.
   2809  */
   2810 void
   2811 ndp_resolv_failed(ncec_t *ncec)
   2812 {
   2813 	mblk_t	*mp, *nxt_mp;
   2814 	char	buf[INET6_ADDRSTRLEN];
   2815 	ill_t *ill = ncec->ncec_ill;
   2816 	ip_recv_attr_t	iras;
   2817 
   2818 	bzero(&iras, sizeof (iras));
   2819 	iras.ira_flags = 0;
   2820 	/*
   2821 	 * we are setting the ira_rill to the ipmp_ill (instead of
   2822 	 * the actual ill on which the packet was received), but this
   2823 	 * is ok because we don't actually need the real ira_rill.
   2824 	 * to send the icmp unreachable to the sender.
   2825 	 */
   2826 	iras.ira_ill = iras.ira_rill = ill;
   2827 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   2828 	iras.ira_rifindex = iras.ira_ruifindex;
   2829 
   2830 	ip1dbg(("ndp_resolv_failed: dst %s\n",
   2831 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
   2832 	mutex_enter(&ncec->ncec_lock);
   2833 	mp = ncec->ncec_qd_mp;
   2834 	ncec->ncec_qd_mp = NULL;
   2835 	ncec->ncec_nprobes = 0;
   2836 	mutex_exit(&ncec->ncec_lock);
   2837 	while (mp != NULL) {
   2838 		nxt_mp = mp->b_next;
   2839 		mp->b_next = NULL;
   2840 
   2841 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   2842 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
   2843 		    mp, ill);
   2844 		icmp_unreachable_v6(mp,
   2845 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
   2846 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2847 		mp = nxt_mp;
   2848 	}
   2849 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
   2850 }
   2851 
   2852 /*
   2853  * Handle the completion of NDP and ARP resolution.
   2854  */
   2855 void
   2856 nce_resolv_ok(ncec_t *ncec)
   2857 {
   2858 	mblk_t *mp;
   2859 	uint_t pkt_len;
   2860 	iaflags_t ixaflags = IXAF_NO_TRACE;
   2861 	nce_t *nce;
   2862 	ill_t	*ill = ncec->ncec_ill;
   2863 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   2864 	ip_stack_t *ipst = ill->ill_ipst;
   2865 
   2866 	if (IS_IPMP(ncec->ncec_ill)) {
   2867 		nce_resolv_ipmp_ok(ncec);
   2868 		return;
   2869 	}
   2870 	/* non IPMP case */
   2871 
   2872 	mutex_enter(&ncec->ncec_lock);
   2873 	ASSERT(ncec->ncec_nprobes == 0);
   2874 	mp = ncec->ncec_qd_mp;
   2875 	ncec->ncec_qd_mp = NULL;
   2876 	mutex_exit(&ncec->ncec_lock);
   2877 
   2878 	while (mp != NULL) {
   2879 		mblk_t *nxt_mp;
   2880 
   2881 		if (ill->ill_isv6) {
   2882 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
   2883 
   2884 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   2885 		} else {
   2886 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
   2887 
   2888 			ixaflags |= IXAF_IS_IPV4;
   2889 			pkt_len = ntohs(ipha->ipha_length);
   2890 		}
   2891 		nxt_mp = mp->b_next;
   2892 		mp->b_next = NULL;
   2893 		/*
   2894 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
   2895 		 * longer available, but it's ok to drop this flag because TCP
   2896 		 * has its own flow-control in effect, so TCP packets
   2897 		 * are not likely to get here when flow-control is in effect.
   2898 		 */
   2899 		mutex_enter(&ill->ill_lock);
   2900 		nce = nce_lookup(ill, &ncec->ncec_addr);
   2901 		mutex_exit(&ill->ill_lock);
   2902 
   2903 		if (nce == NULL) {
   2904 			if (isv6) {
   2905 				BUMP_MIB(&ipst->ips_ip6_mib,
   2906 				    ipIfStatsOutDiscards);
   2907 			} else {
   2908 				BUMP_MIB(&ipst->ips_ip_mib,
   2909 				    ipIfStatsOutDiscards);
   2910 			}
   2911 			ip_drop_output("ipIfStatsOutDiscards - no nce",
   2912 			    mp, NULL);
   2913 			freemsg(mp);
   2914 		} else {
   2915 			/*
   2916 			 * We don't know the zoneid, but
   2917 			 * ip_xmit does not care since IXAF_NO_TRACE
   2918 			 * is set. (We traced the packet the first
   2919 			 * time through ip_xmit.)
   2920 			 */
   2921 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
   2922 			    ALL_ZONES, 0, NULL);
   2923 			nce_refrele(nce);
   2924 		}
   2925 		mp = nxt_mp;
   2926 	}
   2927 
   2928 	ncec_cb_dispatch(ncec); /* complete callbacks */
   2929 }
   2930 
   2931 /*
   2932  * Called by SIOCSNDP* ioctl to add/change an ncec entry
   2933  * and the corresponding attributes.
   2934  * Disallow states other than ND_REACHABLE or ND_STALE.
   2935  */
   2936 int
   2937 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
   2938 {
   2939 	sin6_t		*sin6;
   2940 	in6_addr_t	*addr;
   2941 	ncec_t		*ncec;
   2942 	nce_t		*nce;
   2943 	int		err = 0;
   2944 	uint16_t	new_flags = 0;
   2945 	uint16_t	old_flags = 0;
   2946 	int		inflags = lnr->lnr_flags;
   2947 	ip_stack_t	*ipst = ill->ill_ipst;
   2948 	boolean_t	do_postprocess = B_FALSE;
   2949 
   2950 	ASSERT(ill->ill_isv6);
   2951 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
   2952 	    (lnr->lnr_state_create != ND_STALE))
   2953 		return (EINVAL);
   2954 
   2955 	sin6 = (sin6_t *)&lnr->lnr_addr;
   2956 	addr = &sin6->sin6_addr;
   2957 
   2958 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
   2959 	ASSERT(!IS_UNDER_IPMP(ill));
   2960 	nce = nce_lookup_addr(ill, addr);
   2961 	if (nce != NULL)
   2962 		new_flags = nce->nce_common->ncec_flags;
   2963 
   2964 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
   2965 	case NDF_ISROUTER_ON:
   2966 		new_flags |= NCE_F_ISROUTER;
   2967 		break;
   2968 	case NDF_ISROUTER_OFF:
   2969 		new_flags &= ~NCE_F_ISROUTER;
   2970 		break;
   2971 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
   2972 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   2973 		if (nce != NULL)
   2974 			nce_refrele(nce);
   2975 		return (EINVAL);
   2976 	}
   2977 
   2978 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
   2979 	case NDF_ANYCAST_ON:
   2980 		new_flags |= NCE_F_ANYCAST;
   2981 		break;
   2982 	case NDF_ANYCAST_OFF:
   2983 		new_flags &= ~NCE_F_ANYCAST;
   2984 		break;
   2985 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
   2986 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   2987 		if (nce != NULL)
   2988 			nce_refrele(nce);
   2989 		return (EINVAL);
   2990 	}
   2991 
   2992 	if (nce == NULL) {
   2993 		err = nce_add_v6(ill,
   2994 		    (uchar_t *)lnr->lnr_hdw_addr,
   2995 		    ill->ill_phys_addr_length,
   2996 		    addr,
   2997 		    new_flags,
   2998 		    lnr->lnr_state_create,
   2999 		    &nce);
   3000 		if (err != 0) {
   3001 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   3002 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
   3003 			return (err);
   3004 		} else {
   3005 			do_postprocess = B_TRUE;
   3006 		}
   3007 	}
   3008 	ncec = nce->nce_common;
   3009 	old_flags = ncec->ncec_flags;
   3010 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
   3011 		ncec_router_to_host(ncec);
   3012 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   3013 		if (do_postprocess)
   3014 			err = nce_add_v6_postprocess(nce);
   3015 		nce_refrele(nce);
   3016 		return (0);
   3017 	}
   3018 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   3019 
   3020 	if (do_postprocess)
   3021 		err = nce_add_v6_postprocess(nce);
   3022 	/*
   3023 	 * err cannot be anything other than 0 because we don't support
   3024 	 * proxy arp of static addresses.
   3025 	 */
   3026 	ASSERT(err == 0);
   3027 
   3028 	mutex_enter(&ncec->ncec_lock);
   3029 	ncec->ncec_flags = new_flags;
   3030 	mutex_exit(&ncec->ncec_lock);
   3031 	/*
   3032 	 * Note that we ignore the state at this point, which
   3033 	 * should be either STALE or REACHABLE.  Instead we let
   3034 	 * the link layer address passed in to determine the state
   3035 	 * much like incoming packets.
   3036 	 */
   3037 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
   3038 	nce_refrele(nce);
   3039 	return (0);
   3040 }
   3041 
   3042 /*
   3043  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
   3044  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
   3045  * be held to ensure that they are in the same group.
   3046  */
   3047 static nce_t *
   3048 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
   3049 {
   3050 
   3051 	nce_t *nce;
   3052 
   3053 	nce = nce_ill_lookup_then_add(ill, ncec);
   3054 
   3055 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
   3056 		return (nce);
   3057 
   3058 	/*
   3059 	 * hold the ncec_lock to synchronize with nce_update() so that,
   3060 	 * at the end of this function, the contents of nce_dlur_mp are
   3061 	 * consistent with ncec->ncec_lladdr, even though some intermediate
   3062 	 * packet may have been sent out with a mangled address, which would
   3063 	 * only be a transient condition.
   3064 	 */
   3065 	mutex_enter(&ncec->ncec_lock);
   3066 	if (ncec->ncec_lladdr != NULL) {
   3067 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
   3068 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
   3069 	} else {
   3070 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
   3071 		    ill->ill_sap_length);
   3072 	}
   3073 	mutex_exit(&ncec->ncec_lock);
   3074 	return (nce);
   3075 }
   3076 
   3077 /*
   3078  * we make nce_fp_mp to have an M_DATA prepend.
   3079  * The caller ensures there is hold on ncec for this function.
   3080  * Note that since ill_fastpath_probe() copies the mblk there is
   3081  * no need to hold the nce or ncec beyond this function.
   3082  *
   3083  * If the caller has passed in a non-null ncec_nce to nce_faspath() that
   3084  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
   3085  * and will be returned back by this function, so that no extra nce_refrele
   3086  * is required for the caller. The calls from nce_add_common() use this
   3087  * method. All other callers (that pass in NULL ncec_nce) will have to do a
   3088  * nce_refrele of the returned nce (when it is non-null).
   3089  */
   3090 nce_t *
   3091 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
   3092 {
   3093 	nce_t *nce;
   3094 	ill_t *ill = ncec->ncec_ill;
   3095 
   3096 	ASSERT(ill != NULL);
   3097 
   3098 	if (IS_IPMP(ill) && trigger_fp_req) {
   3099 		trigger_fp_req = B_FALSE;
   3100 		ipmp_ncec_fastpath(ncec, ill);
   3101 
   3102 	}
   3103 	/*
   3104 	 * If the caller already has the nce corresponding to the ill, use
   3105 	 * that one. Otherwise we have to lookup/add the nce. Calls from
   3106 	 * nce_add_common() fall in the former category, and have just done
   3107 	 * the nce lookup/add that can be reused.
   3108 	 */
   3109 	if (ncec_nce == NULL)
   3110 		nce = nce_fastpath_create(ill, ncec);
   3111 	else
   3112 		nce = ncec_nce;
   3113 
   3114 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
   3115 		return (nce);
   3116 
   3117 	if (trigger_fp_req)
   3118 		nce_fastpath_trigger(nce);
   3119 	return (nce);
   3120 }
   3121 
   3122 /*
   3123  * Trigger fastpath on nce. No locks may be held.
   3124  */
   3125 static void
   3126 nce_fastpath_trigger(nce_t *nce)
   3127 {
   3128 	int res;
   3129 	ill_t *ill = nce->nce_ill;
   3130 	ncec_t *ncec = nce->nce_common;
   3131 
   3132 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
   3133 	/*
   3134 	 * EAGAIN is an indication of a transient error
   3135 	 * i.e. allocation failure etc. leave the ncec in the list it
   3136 	 * will be updated when another probe happens for another ire
   3137 	 * if not it will be taken out of the list when the ire is
   3138 	 * deleted.
   3139 	 */
   3140 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
   3141 		nce_fastpath_list_delete(ill, ncec, NULL);
   3142 }
   3143 
   3144 /*
   3145  * Add ncec to the nce fastpath list on ill.
   3146  */
   3147 static nce_t *
   3148 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
   3149 {
   3150 	nce_t *nce = NULL;
   3151 
   3152 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   3153 	/*
   3154 	 * Atomically ensure that the ill is not CONDEMNED and is not going
   3155 	 * down, before adding the NCE.
   3156 	 */
   3157 	if (ill->ill_state_flags & ILL_CONDEMNED)
   3158 		return (NULL);
   3159 	mutex_enter(&ncec->ncec_lock);
   3160 	/*
   3161 	 * if ncec has not been deleted and
   3162 	 * is not already in the list add it.
   3163 	 */
   3164 	if (!NCE_ISCONDEMNED(ncec)) {
   3165 		nce = nce_lookup(ill, &ncec->ncec_addr);
   3166 		if (nce != NULL)
   3167 			goto done;
   3168 		nce = nce_add(ill, ncec);
   3169 	}
   3170 done:
   3171 	mutex_exit(&ncec->ncec_lock);
   3172 	return (nce);
   3173 }
   3174 
   3175 nce_t *
   3176 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
   3177 {
   3178 	nce_t *nce;
   3179 
   3180 	mutex_enter(&ill->ill_lock);
   3181 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
   3182 	mutex_exit(&ill->ill_lock);
   3183 	return (nce);
   3184 }
   3185 
   3186 
   3187 /*
   3188  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
   3189  * nce is added to the 'dead' list, and the caller must nce_refrele() the
   3190  * entry after all locks have been dropped.
   3191  */
   3192 void
   3193 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
   3194 {
   3195 	nce_t *nce;
   3196 
   3197 	ASSERT(ill != NULL);
   3198 
   3199 	/* first clean out any nce pointers in the under_ills */
   3200 	if (IS_IPMP(ill))
   3201 		ipmp_ncec_flush_nce(ncec);
   3202 
   3203 	/* now the ill itself */
   3204 	mutex_enter(&ill->ill_lock);
   3205 	for (nce = list_head(&ill->ill_nce); nce != NULL;
   3206 	    nce = list_next(&ill->ill_nce, nce)) {
   3207 		if (nce->nce_common == ncec) {
   3208 			nce_refhold(nce);
   3209 			nce_delete(nce);
   3210 			break;
   3211 		}
   3212 	}
   3213 	mutex_exit(&ill->ill_lock);
   3214 	if (nce != NULL) {
   3215 		if (dead == NULL)
   3216 			nce_refrele(nce);
   3217 		else
   3218 			list_insert_tail(dead, nce);
   3219 	}
   3220 }
   3221 
   3222 /*
   3223  * when the fastpath response does not fit in the datab
   3224  * associated with the existing nce_fp_mp, we delete and
   3225  * add the nce to retrigger fastpath based on the information
   3226  * in the ncec_t.
   3227  */
   3228 static nce_t *
   3229 nce_delete_then_add(nce_t *nce)
   3230 {
   3231 	ill_t		*ill = nce->nce_ill;
   3232 	nce_t		*newnce = NULL;
   3233 
   3234 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
   3235 	    (void *)nce, ill->ill_name));
   3236 	mutex_enter(&ill->ill_lock);
   3237 	mutex_enter(&nce->nce_common->ncec_lock);
   3238 	nce_delete(nce);
   3239 	/*
   3240 	 * Make sure that ncec is not condemned before adding. We hold the
   3241 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
   3242 	 * ipmp_ncec_flush_nce()
   3243 	 */
   3244 	if (!NCE_ISCONDEMNED(nce->nce_common))
   3245 		newnce = nce_add(ill, nce->nce_common);
   3246 	mutex_exit(&nce->nce_common->ncec_lock);
   3247 	mutex_exit(&ill->ill_lock);
   3248 	nce_refrele(nce);
   3249 	return (newnce); /* could be null if nomem */
   3250 }
   3251 
/*
 * Argument block passed through nce_walk() to nce_fastpath_match_dlur()
 * when looking for the nce that a fastpath ack belongs to.
 */
typedef struct nce_fp_match_s {
	nce_t	*nce_fp_match_res;	/* out: matching nce, NULL if none */
	mblk_t	*nce_fp_match_ack_mp;	/* in: the fastpath ack message */
} nce_fp_match_t;
   3256 
   3257 /* ARGSUSED */
   3258 static int
   3259 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
   3260 {
   3261 	nce_fp_match_t	*nce_fp_marg = arg;
   3262 	ncec_t		*ncec = nce->nce_common;
   3263 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
   3264 	uchar_t	*mp_rptr, *ud_mp_rptr;
   3265 	mblk_t		*ud_mp = nce->nce_dlur_mp;
   3266 	ptrdiff_t	cmplen;
   3267 
   3268 	/*
   3269 	 * mp is the mp associated with the fastpath ack.
   3270 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
   3271 	 * under consideration. If the contents match, then the
   3272 	 * fastpath ack is used to update the nce.
   3273 	 */
   3274 	if (ud_mp == NULL)
   3275 		return (0);
   3276 	mp_rptr = mp->b_rptr;
   3277 	cmplen = mp->b_wptr - mp_rptr;
   3278 	ASSERT(cmplen >= 0);
   3279 
   3280 	ud_mp_rptr = ud_mp->b_rptr;
   3281 	/*
   3282 	 * The ncec is locked here to prevent any other threads from accessing
   3283 	 * and changing nce_dlur_mp when the address becomes resolved to an
   3284 	 * lla while we're in the middle of looking at and comparing the
   3285 	 * hardware address (lla). It is also locked to prevent multiple
   3286 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
   3287 	 * time.
   3288 	 */
   3289 	mutex_enter(&ncec->ncec_lock);
   3290 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
   3291 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
   3292 		nce_fp_marg->nce_fp_match_res = nce;
   3293 		mutex_exit(&ncec->ncec_lock);
   3294 		nce_refhold(nce);
   3295 		return (1);
   3296 	}
   3297 	mutex_exit(&ncec->ncec_lock);
   3298 	return (0);
   3299 }
   3300 
   3301 /*
   3302  * Update all NCE's that are not in fastpath mode and
   3303  * have an nce_fp_mp that matches mp. mp->b_cont contains
   3304  * the fastpath header.
   3305  *
   3306  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
   3307  */
   3308 void
   3309 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
   3310 {
   3311 	nce_fp_match_t nce_fp_marg;
   3312 	nce_t *nce;
   3313 	mblk_t *nce_fp_mp, *fp_mp;
   3314 
   3315 	nce_fp_marg.nce_fp_match_res = NULL;
   3316 	nce_fp_marg.nce_fp_match_ack_mp = mp;
   3317 
   3318 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
   3319 
   3320 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
   3321 		return;
   3322 
   3323 	mutex_enter(&nce->nce_lock);
   3324 	nce_fp_mp = nce->nce_fp_mp;
   3325 
   3326 	if (nce_fp_mp != NULL) {
   3327 		fp_mp = mp->b_cont;
   3328 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
   3329 		    nce_fp_mp->b_datap->db_lim) {
   3330 			mutex_exit(&nce->nce_lock);
   3331 			nce = nce_delete_then_add(nce);
   3332 			if (nce == NULL) {
   3333 				return;
   3334 			}
   3335 			mutex_enter(&nce->nce_lock);
   3336 			nce_fp_mp = nce->nce_fp_mp;
   3337 		}
   3338 	}
   3339 
   3340 	/* Matched - install mp as the fastpath mp */
   3341 	if (nce_fp_mp == NULL) {
   3342 		fp_mp = dupb(mp->b_cont);
   3343 		nce->nce_fp_mp = fp_mp;
   3344 	} else {
   3345 		fp_mp = mp->b_cont;
   3346 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
   3347 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
   3348 		    + MBLKL(fp_mp);
   3349 	}
   3350 	mutex_exit(&nce->nce_lock);
   3351 	nce_refrele(nce);
   3352 }
   3353 
   3354 /*
   3355  * Return a pointer to a given option in the packet.
   3356  * Assumes that option part of the packet have already been validated.
   3357  */
   3358 nd_opt_hdr_t *
   3359 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
   3360 {
   3361 	while (optlen > 0) {
   3362 		if (opt->nd_opt_type == opt_type)
   3363 			return (opt);
   3364 		optlen -= 8 * opt->nd_opt_len;
   3365 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
   3366 	}
   3367 	return (NULL);
   3368 }
   3369 
   3370 /*
   3371  * Verify all option lengths present are > 0, also check to see
   3372  * if the option lengths and packet length are consistent.
   3373  */
   3374 boolean_t
   3375 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
   3376 {
   3377 	ASSERT(opt != NULL);
   3378 	while (optlen > 0) {
   3379 		if (opt->nd_opt_len == 0)
   3380 			return (B_FALSE);
   3381 		optlen -= 8 * opt->nd_opt_len;
   3382 		if (optlen < 0)
   3383 			return (B_FALSE);
   3384 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
   3385 	}
   3386 	return (B_TRUE);
   3387 }
   3388 
   3389 /*
   3390  * ncec_walk function.
   3391  * Free a fraction of the NCE cache entries.
   3392  *
   3393  * A possible optimization here would be to use ncec_last where possible, and
   3394  * delete the least-frequently used entry, which would require more complex
   3395  * computation as we walk through the ncec's (e.g., track ncec entries by
   3396  * order of ncec_last and/or maintain state)
   3397  */
   3398 static void
   3399 ncec_cache_reclaim(ncec_t *ncec, char *arg)
   3400 {
   3401 	ip_stack_t	*ipst = ncec->ncec_ipst;
   3402 	uint_t		fraction = *(uint_t *)arg;
   3403 	uint_t		rand;
   3404 
   3405 	if ((ncec->ncec_flags &
   3406 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
   3407 		return;
   3408 	}
   3409 
   3410 	rand = (uint_t)ddi_get_lbolt() +
   3411 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
   3412 	if ((rand/fraction)*fraction == rand) {
   3413 		IP_STAT(ipst, ip_nce_reclaim_deleted);
   3414 		ncec_delete(ncec);
   3415 	}
   3416 }
   3417 
   3418 /*
   3419  * kmem_cache callback to free up memory.
   3420  *
   3421  * For now we just delete a fixed fraction.
   3422  */
   3423 static void
   3424 ip_nce_reclaim_stack(ip_stack_t *ipst)
   3425 {
   3426 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
   3427 
   3428 	IP_STAT(ipst, ip_nce_reclaim_calls);
   3429 
   3430 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
   3431 
   3432 	/*
   3433 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
   3434 	 * Get them to update any stale references to drop any refholds they
   3435 	 * have.
   3436 	 */
   3437 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
   3438 }
   3439 
   3440 /*
   3441  * Called by the memory allocator subsystem directly, when the system
   3442  * is running low on memory.
   3443  */
   3444 /* ARGSUSED */
   3445 void
   3446 ip_nce_reclaim(void *args)
   3447 {
   3448 	netstack_handle_t nh;
   3449 	netstack_t *ns;
   3450 
   3451 	netstack_next_init(&nh);
   3452 	while ((ns = netstack_next(&nh)) != NULL) {
   3453 		ip_nce_reclaim_stack(ns->netstack_ip);
   3454 		netstack_rele(ns);
   3455 	}
   3456 	netstack_next_fini(&nh);
   3457 }
   3458 
   3459 #ifdef DEBUG
   3460 void
   3461 ncec_trace_ref(ncec_t *ncec)
   3462 {
   3463 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3464 
   3465 	if (ncec->ncec_trace_disable)
   3466 		return;
   3467 
   3468 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
   3469 		ncec->ncec_trace_disable = B_TRUE;
   3470 		ncec_trace_cleanup(ncec);
   3471 	}
   3472 }
   3473 
   3474 void
   3475 ncec_untrace_ref(ncec_t *ncec)
   3476 {
   3477 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3478 
   3479 	if (!ncec->ncec_trace_disable)
   3480 		th_trace_unref(ncec);
   3481 }
   3482 
   3483 static void
   3484 ncec_trace_cleanup(const ncec_t *ncec)
   3485 {
   3486 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
   3487 }
   3488 #endif
   3489 
   3490 /*
   3491  * Called when address resolution fails due to a timeout.
   3492  * Send an ICMP unreachable in response to all queued packets.
   3493  */
   3494 void
   3495 arp_resolv_failed(ncec_t *ncec)
   3496 {
   3497 	mblk_t	*mp, *nxt_mp;
   3498 	char	buf[INET6_ADDRSTRLEN];
   3499 	struct in_addr ipv4addr;
   3500 	ill_t *ill = ncec->ncec_ill;
   3501 	ip_stack_t *ipst = ncec->ncec_ipst;
   3502 	ip_recv_attr_t	iras;
   3503 
   3504 	bzero(&iras, sizeof (iras));
   3505 	iras.ira_flags = IRAF_IS_IPV4;
   3506 	/*
   3507 	 * we are setting the ira_rill to the ipmp_ill (instead of
   3508 	 * the actual ill on which the packet was received), but this
   3509 	 * is ok because we don't actually need the real ira_rill.
   3510 	 * to send the icmp unreachable to the sender.
   3511 	 */
   3512 	iras.ira_ill = iras.ira_rill = ill;
   3513 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   3514 	iras.ira_rifindex = iras.ira_ruifindex;
   3515 
   3516 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
   3517 	ip3dbg(("arp_resolv_failed: dst %s\n",
   3518 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
   3519 	mutex_enter(&ncec->ncec_lock);
   3520 	mp = ncec->ncec_qd_mp;
   3521 	ncec->ncec_qd_mp = NULL;
   3522 	ncec->ncec_nprobes = 0;
   3523 	mutex_exit(&ncec->ncec_lock);
   3524 	while (mp != NULL) {
   3525 		nxt_mp = mp->b_next;
   3526 		mp->b_next = NULL;
   3527 
   3528 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   3529 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
   3530 		    mp, ill);
   3531 		if (ipst->ips_ip_arp_icmp_error) {
   3532 			ip3dbg(("arp_resolv_failed: "
   3533 			    "Calling icmp_unreachable\n"));
   3534 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
   3535 		} else {
   3536 			freemsg(mp);
   3537 		}
   3538 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   3539 		mp = nxt_mp;
   3540 	}
   3541 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
   3542 }
   3543 
   3544 /*
   3545  * if ill is an under_ill, translate it to the ipmp_ill and add the
   3546  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
   3547  * one on the underlying in_ill) will be created for the
   3548  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
   3549  */
   3550 int
   3551 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   3552     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
   3553 {
   3554 	int	err;
   3555 	in6_addr_t addr6;
   3556 	ip_stack_t *ipst = ill->ill_ipst;
   3557 	nce_t	*nce, *upper_nce = NULL;
   3558 	ill_t	*in_ill = ill, *under = NULL;
   3559 	boolean_t need_ill_refrele = B_FALSE;
   3560 
   3561 	if (flags & NCE_F_MCAST) {
   3562 		/*
   3563 		 * hw_addr will be figured out in nce_set_multicast_v4;
   3564 		 * caller needs to pass in the cast_ill for ipmp
   3565 		 */
   3566 		ASSERT(hw_addr == NULL);
   3567 		ASSERT(!IS_IPMP(ill));
   3568 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
   3569 		return (err);
   3570 	}
   3571 
   3572 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
   3573 		ill = ipmp_ill_hold_ipmp_ill(ill);
   3574 		if (ill == NULL)
   3575 			return (ENXIO);
   3576 		need_ill_refrele = B_TRUE;
   3577 	}
   3578 	if ((flags & NCE_F_BCAST) != 0) {
   3579 		/*
   3580 		 * IPv4 broadcast ncec: compute the hwaddr.
   3581 		 */
   3582 		if (IS_IPMP(ill)) {
   3583 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
   3584 			if (under == NULL)  {
   3585 				if (need_ill_refrele)
   3586 					ill_refrele(ill);
   3587 				return (ENETDOWN);
   3588 			}
   3589 			hw_addr = under->ill_bcast_mp->b_rptr +
   3590 			    NCE_LL_ADDR_OFFSET(under);
   3591 			hw_addr_len = under->ill_phys_addr_length;
   3592 		} else {
   3593 			hw_addr = ill->ill_bcast_mp->b_rptr +
   3594 			    NCE_LL_ADDR_OFFSET(ill),
   3595 			    hw_addr_len = ill->ill_phys_addr_length;
   3596 		}
   3597 	}
   3598 
   3599 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
   3600 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
   3601 	nce = nce_lookup_addr(ill, &addr6);
   3602 	if (nce == NULL) {
   3603 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
   3604 		    state, &nce);
   3605 	} else {
   3606 		err = EEXIST;
   3607 	}
   3608 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3609 	if (err == 0)
   3610 		err = nce_add_v4_postprocess(nce);
   3611 
   3612 	if (in_ill != ill && nce != NULL) {
   3613 		nce_t *under_nce = NULL;
   3614 
   3615 		/*
   3616 		 * in_ill was the under_ill. Try to create the under_nce.
   3617 		 * Hold the ill_g_lock to prevent changes to group membership
   3618 		 * until we are done.
   3619 		 */
   3620 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3621 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
   3622 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
   3623 			    ill_t *, ill);
   3624 			rw_exit(&ipst->ips_ill_g_lock);
   3625 			err = ENXIO;
   3626 			nce_refrele(nce);
   3627 			nce = NULL;
   3628 			goto bail;
   3629 		}
   3630 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
   3631 		if (under_nce == NULL) {
   3632 			rw_exit(&ipst->ips_ill_g_lock);
   3633 			err = EINVAL;
   3634 			nce_refrele(nce);
   3635 			nce = NULL;
   3636 			goto bail;
   3637 		}
   3638 		rw_exit(&ipst->ips_ill_g_lock);
   3639 		upper_nce = nce;
   3640 		nce = under_nce; /* will be returned to caller */
   3641 		if (NCE_ISREACHABLE(nce->nce_common))
   3642 			nce_fastpath_trigger(under_nce);
   3643 	}
   3644 	if (nce != NULL) {
   3645 		if (newnce != NULL)
   3646 			*newnce = nce;
   3647 		else
   3648 			nce_refrele(nce);
   3649 	}
   3650 bail:
   3651 	if (under != NULL)
   3652 		ill_refrele(under);
   3653 	if (upper_nce != NULL)
   3654 		nce_refrele(upper_nce);
   3655 	if (need_ill_refrele)
   3656 		ill_refrele(ill);
   3657 
   3658 	return (err);
   3659 }
   3660 
   3661 /*
   3662  * NDP Cache Entry creation routine for IPv4.
   3663  * This routine must always be called with ndp4->ndp_g_lock held.
   3664  * Prior to return, ncec_refcnt is incremented.
   3665  *
   3666  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
   3667  * are always added pointing at the ipmp_ill. Thus, when the ill passed
   3668  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
   3669  * entries will be created, both pointing at the same ncec_t. The nce_t
   3670  * entries will have their nce_ill set to the ipmp_ill and the under_ill
   3671  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
   3672  * Local addresses are always created on the ill passed to nce_add_v4.
   3673  */
   3674 int
   3675 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   3676     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
   3677 {
   3678 	int		err;
   3679 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
   3680 	struct in6_addr	addr6;
   3681 	nce_t		*nce;
   3682 
   3683 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
   3684 	ASSERT(!ill->ill_isv6);
   3685 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
   3686 
   3687 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
   3688 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
   3689 	    &nce);
   3690 	ASSERT(newnce != NULL);
   3691 	*newnce = nce;
   3692 	return (err);
   3693 }
   3694 
   3695 /*
   3696  * Post-processing routine to be executed after nce_add_v4(). This function
   3697  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
   3698  * and must be called without any locks held.
   3699  *
   3700  * Always returns 0, but we return an int to keep this symmetric with the
   3701  * IPv6 counter-part.
   3702  */
   3703 int
   3704 nce_add_v4_postprocess(nce_t *nce)
   3705 {
   3706 	ncec_t		*ncec = nce->nce_common;
   3707 	uint16_t	flags = ncec->ncec_flags;
   3708 	boolean_t	ndp_need_dad = B_FALSE;
   3709 	boolean_t	dropped;
   3710 	clock_t		delay;
   3711 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
   3712 	uchar_t		*hw_addr = ncec->ncec_lladdr;
   3713 	boolean_t	trigger_fastpath = B_TRUE;
   3714 
   3715 	/*
   3716 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
   3717 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
   3718 	 * We call nce_fastpath from nce_update if the link layer address of
   3719 	 * the peer changes from nce_update
   3720 	 */
   3721 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
   3722 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
   3723 		trigger_fastpath = B_FALSE;
   3724 
   3725 	if (trigger_fastpath)
   3726 		nce_fastpath_trigger(nce);
   3727 
   3728 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
   3729 		/*
   3730 		 * Either the caller (by passing in ND_PROBE)
   3731 		 * or nce_add_common() (by the internally computed state
   3732 		 * based on ncec_addr and ill_net_type) has determined
   3733 		 * that this unicast entry needs DAD. Trigger DAD.
   3734 		 */
   3735 		ndp_need_dad = B_TRUE;
   3736 	} else if (flags & NCE_F_UNSOL_ADV) {
   3737 		/*
   3738 		 * We account for the transmit below by assigning one
   3739 		 * less than the ndd variable. Subsequent decrements
   3740 		 * are done in nce_timer.
   3741 		 */
   3742 		mutex_enter(&ncec->ncec_lock);
   3743 		ncec->ncec_unsolicit_count =
   3744 		    ipst->ips_ip_arp_publish_count - 1;
   3745 		mutex_exit(&ncec->ncec_lock);
   3746 		dropped = arp_announce(ncec);
   3747 		mutex_enter(&ncec->ncec_lock);
   3748 		if (dropped)
   3749 			ncec->ncec_unsolicit_count++;
   3750 		else
   3751 			ncec->ncec_last_time_defended = ddi_get_lbolt();
   3752 		if (ncec->ncec_unsolicit_count != 0) {
   3753 			nce_start_timer(ncec,
   3754 			    ipst->ips_ip_arp_publish_interval);
   3755 		}
   3756 		mutex_exit(&ncec->ncec_lock);
   3757 	}
   3758 
   3759 	/*
   3760 	 * If ncec_xmit_interval is 0, user has configured us to send the first
   3761 	 * probe right away.  Do so, and set up for the subsequent probes.
   3762 	 */
   3763 	if (ndp_need_dad) {
   3764 		mutex_enter(&ncec->ncec_lock);
   3765 		if (ncec->ncec_pcnt == 0) {
   3766 			/*
   3767 			 * DAD probes and announce can be
   3768 			 * administratively disabled by setting the
   3769 			 * probe_count to zero. Restart the timer in
   3770 			 * this case to mark the ipif as ready.
   3771 			 */
   3772 			ncec->ncec_unsolicit_count = 0;
   3773 			mutex_exit(&ncec->ncec_lock);
   3774 			nce_restart_timer(ncec, 0);
   3775 		} else {
   3776 			mutex_exit(&ncec->ncec_lock);
   3777 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
   3778 			    ipst->ips_arp_probe_delay :
   3779 			    ipst->ips_arp_fastprobe_delay);
   3780 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
   3781 		}
   3782 	}
   3783 	return (0);
   3784 }
   3785 
   3786 /*
   3787  * ncec_walk routine to update all entries that have a given destination or
   3788  * gateway address and cached link layer (MAC) address.  This is used when ARP
   3789  * informs us that a network-to-link-layer mapping may have changed.
   3790  */
   3791 void
   3792 nce_update_hw_changed(ncec_t *ncec, void *arg)
   3793 {
   3794 	nce_hw_map_t *hwm = arg;
   3795 	ipaddr_t ncec_addr;
   3796 
   3797 	if (ncec->ncec_state != ND_REACHABLE)
   3798 		return;
   3799 
   3800 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
   3801 	if (ncec_addr != hwm->hwm_addr)
   3802 		return;
   3803 
   3804 	mutex_enter(&ncec->ncec_lock);
   3805 	if (hwm->hwm_flags != 0)
   3806 		ncec->ncec_flags = hwm->hwm_flags;
   3807 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
   3808 	mutex_exit(&ncec->ncec_lock);
   3809 }
   3810 
   3811 void
   3812 ncec_refhold(ncec_t *ncec)
   3813 {
   3814 	mutex_enter(&(ncec)->ncec_lock);
   3815 	(ncec)->ncec_refcnt++;
   3816 	ASSERT((ncec)->ncec_refcnt != 0);
   3817 #ifdef DEBUG
   3818 	ncec_trace_ref(ncec);
   3819 #endif
   3820 	mutex_exit(&(ncec)->ncec_lock);
   3821 }
   3822 
   3823 void
   3824 ncec_refhold_notr(ncec_t *ncec)
   3825 {
   3826 	mutex_enter(&(ncec)->ncec_lock);
   3827 	(ncec)->ncec_refcnt++;
   3828 	ASSERT((ncec)->ncec_refcnt != 0);
   3829 	mutex_exit(&(ncec)->ncec_lock);
   3830 }
   3831 
   3832 static void
   3833 ncec_refhold_locked(ncec_t *ncec)
   3834 {
   3835 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
   3836 	(ncec)->ncec_refcnt++;
   3837 #ifdef DEBUG
   3838 	ncec_trace_ref(ncec);
   3839 #endif
   3840 }
   3841 
   3842 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
   3843 void
   3844 ncec_refrele(ncec_t *ncec)
   3845 {
   3846 	mutex_enter(&(ncec)->ncec_lock);
   3847 #ifdef DEBUG
   3848 	ncec_untrace_ref(ncec);
   3849 #endif
   3850 	ASSERT((ncec)->ncec_refcnt != 0);
   3851 	if (--(ncec)->ncec_refcnt == 0) {
   3852 		ncec_inactive(ncec);
   3853 	} else {
   3854 		mutex_exit(&(ncec)->ncec_lock);
   3855 	}
   3856 }
   3857 
   3858 void
   3859 ncec_refrele_notr(ncec_t *ncec)
   3860 {
   3861 	mutex_enter(&(ncec)->ncec_lock);
   3862 	ASSERT((ncec)->ncec_refcnt != 0);
   3863 	if (--(ncec)->ncec_refcnt == 0) {
   3864 		ncec_inactive(ncec);
   3865 	} else {
   3866 		mutex_exit(&(ncec)->ncec_lock);
   3867 	}
   3868 }
   3869 
   3870 /*
   3871  * Common to IPv4 and IPv6.
   3872  */
   3873 void
   3874 nce_restart_timer(ncec_t *ncec, uint_t ms)
   3875 {
   3876 	timeout_id_t tid;
   3877 
   3878 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
   3879 
   3880 	/* First cancel any running timer */
   3881 	mutex_enter(&ncec->ncec_lock);
   3882 	tid = ncec->ncec_timeout_id;
   3883 	ncec->ncec_timeout_id = 0;
   3884 	if (tid != 0) {
   3885 		mutex_exit(&ncec->ncec_lock);
   3886 		(void) untimeout(tid);
   3887 		mutex_enter(&ncec->ncec_lock);
   3888 	}
   3889 
   3890 	/* Restart timer */
   3891 	nce_start_timer(ncec, ms);
   3892 	mutex_exit(&ncec->ncec_lock);
   3893 }
   3894 
   3895 static void
   3896 nce_start_timer(ncec_t *ncec, uint_t ms)
   3897 {
   3898 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3899 	/*
   3900 	 * Don't start the timer if the ncec has been deleted, or if the timer
   3901 	 * is already running
   3902 	 */
   3903 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
   3904 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
   3905 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
   3906 	}
   3907 }
   3908 
   3909 int
   3910 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
   3911     uint16_t flags, nce_t **newnce)
   3912 {
   3913 	uchar_t		*hw_addr;
   3914 	int		err = 0;
   3915 	ip_stack_t	*ipst = ill->ill_ipst;
   3916 	in6_addr_t	dst6;
   3917 	nce_t		*nce;
   3918 
   3919 	ASSERT(!ill->ill_isv6);
   3920 
   3921 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
   3922 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
   3923 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
   3924 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3925 		goto done;
   3926 	}
   3927 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
   3928 		/*
   3929 		 * For IRE_IF_RESOLVER a hardware mapping can be
   3930 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
   3931 		 * in the ill is copied in nce_add_v4().
   3932 		 */
   3933 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
   3934 		if (hw_addr == NULL) {
   3935 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3936 			return (ENOMEM);
   3937 		}
   3938 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
   3939 	} else {
   3940 		/*
   3941 		 * IRE_IF_NORESOLVER type simply copies the resolution
   3942 		 * cookie passed in.  So no hw_addr is needed.
   3943 		 */
   3944 		hw_addr = NULL;
   3945 	}
   3946 	ASSERT(flags & NCE_F_MCAST);
   3947 	ASSERT(flags & NCE_F_NONUD);
   3948 	/* nce_state will be computed by nce_add_common() */
   3949 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
   3950 	    ND_UNCHANGED, &nce);
   3951 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3952 	if (err == 0)
   3953 		err = nce_add_v4_postprocess(nce);
   3954 	if (hw_addr != NULL)
   3955 		kmem_free(hw_addr, ill->ill_phys_addr_length);
   3956 	if (err != 0) {
   3957 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
   3958 		return (err);
   3959 	}
   3960 done:
   3961 	if (newnce != NULL)
   3962 		*newnce = nce;
   3963 	else
   3964 		nce_refrele(nce);
   3965 	return (0);
   3966 }
   3967 
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 *
 * NCE_RESCHED_LIST_LEN is the maximum number of entries collected in one
 * batch.
 */
#define	NCE_RESCHED_LIST_LEN	8

/*
 * Batch state shared between nce_ill_reschedule() and its walker callback
 * ncec_reschedule().
 */
typedef struct {
	ill_t	*ncert_ill;	/* ill whose entries are being walked */
	uint_t	ncert_num;	/* number of valid slots in ncert_nces[] */
	/* collected entries; each one is held via ncec_refhold() */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
   3980 
   3981 /*
   3982  * Pick the longest waiting NCEs for defense.
   3983  */
   3984 /* ARGSUSED */
   3985 static int
   3986 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
   3987 {
   3988 	nce_resched_t *ncert = arg;
   3989 	ncec_t **ncecs;
   3990 	ncec_t **ncec_max;
   3991 	ncec_t *ncec_temp;
   3992 	ncec_t *ncec = nce->nce_common;
   3993 
   3994 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
   3995 	/*
   3996 	 * Only reachable entries that are ready for announcement are eligible.
   3997 	 */
   3998 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
   3999 		return (0);
   4000 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
   4001 		ncec_refhold(ncec);
   4002 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
   4003 	} else {
   4004 		ncecs = ncert->ncert_nces;
   4005 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
   4006 		ncec_refhold(ncec);
   4007 		for (; ncecs < ncec_max; ncecs++) {
   4008 			ASSERT(ncec != NULL);
   4009 			if ((*ncecs)->ncec_last_time_defended >
   4010 			    ncec->ncec_last_time_defended) {
   4011 				ncec_temp = *ncecs;
   4012 				*ncecs = ncec;
   4013 				ncec = ncec_temp;
   4014 			}
   4015 		}
   4016 		ncec_refrele(ncec);
   4017 	}
   4018 	return (0);
   4019 }
   4020 
   4021 /*
   4022  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
   4023  * doesn't happen very often (if at all), and thus it needn't be highly
   4024  * optimized.  (Note, though, that it's actually O(N) complexity, because the
   4025  * outer loop is bounded by a constant rather than by the length of the list.)
   4026  */
   4027 static void
   4028 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
   4029 {
   4030 	ncec_t		*ncec;
   4031 	ip_stack_t	*ipst = ill->ill_ipst;
   4032 	uint_t		i, defend_rate;
   4033 
   4034 	i = ill->ill_defend_count;
   4035 	ill->ill_defend_count = 0;
   4036 	if (ill->ill_isv6)
   4037 		defend_rate = ipst->ips_ndp_defend_rate;
   4038 	else
   4039 		defend_rate = ipst->ips_arp_defend_rate;
   4040 	/* If none could be sitting around, then don't reschedule */
   4041 	if (i < defend_rate) {
   4042 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
   4043 		return;
   4044 	}
   4045 	ncert->ncert_ill = ill;
   4046 	while (ill->ill_defend_count < defend_rate) {
   4047 		nce_walk_common(ill, ncec_reschedule, ncert);
   4048 		for (i = 0; i < ncert->ncert_num; i++) {
   4049 
   4050 			ncec = ncert->ncert_nces[i];
   4051 			mutex_enter(&ncec->ncec_lock);
   4052 			ncec->ncec_flags |= NCE_F_DELAYED;
   4053 			mutex_exit(&ncec->ncec_lock);
   4054 			/*
   4055 			 * we plan to schedule this ncec, so incr the
   4056 			 * defend_count in anticipation.
   4057 			 */
   4058 			if (++ill->ill_defend_count >= defend_rate)
   4059 				break;
   4060 		}
   4061 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
   4062 			break;
   4063 	}
   4064 }
   4065 
   4066 /*
   4067  * Check if the current rate-limiting parameters permit the sending
   4068  * of another address defense announcement for both IPv4 and IPv6.
   4069  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
   4070  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
   4071  * determines how many address defense announcements are permitted
   4072  * in any `defense_perio' interval.
   4073  */
   4074 static boolean_t
   4075 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
   4076 {
   4077 	clock_t		now = ddi_get_lbolt();
   4078 	ip_stack_t	*ipst = ill->ill_ipst;
   4079 	clock_t		start = ill->ill_defend_start;
   4080 	uint32_t	elapsed, defend_period, defend_rate;
   4081 	nce_resched_t	ncert;
   4082 	boolean_t	ret;
   4083 	int		i;
   4084 
   4085 	if (ill->ill_isv6) {
   4086 		defend_period = ipst->ips_ndp_defend_period;
   4087 		defend_rate = ipst->ips_ndp_defend_rate;
   4088 	} else {
   4089 		defend_period = ipst->ips_arp_defend_period;
   4090 		defend_rate = ipst->ips_arp_defend_rate;
   4091 	}
   4092 	if (defend_rate == 0)
   4093 		return (B_TRUE);
   4094 	bzero(&ncert, sizeof (ncert));
   4095 	mutex_enter(&ill->ill_lock);
   4096 	if (start > 0) {
   4097 		elapsed = now - start;
   4098 		if (elapsed > SEC_TO_TICK(defend_period)) {
   4099 			ill->ill_defend_start = now;
   4100 			/*
   4101 			 * nce_ill_reschedule will attempt to
   4102 			 * prevent starvation by reschduling the
   4103 			 * oldest entries, which are marked with
   4104 			 * the NCE_F_DELAYED flag.
   4105 			 */
   4106 			nce_ill_reschedule(ill, &ncert);
   4107 		}
   4108 	} else {
   4109 		ill->ill_defend_start = now;
   4110 	}
   4111 	ASSERT(ill->ill_defend_count <= defend_rate);
   4112 	mutex_enter(&ncec->ncec_lock);
   4113 	if (ncec->ncec_flags & NCE_F_DELAYED) {
   4114 		/*
   4115 		 * This ncec was rescheduled as one of the really old
   4116 		 * entries needing on-going defense. The
   4117 		 * ill_defend_count was already incremented in
   4118 		 * nce_ill_reschedule. Go ahead and send the announce.
   4119 		 */
   4120 		ncec->ncec_flags &= ~NCE_F_DELAYED;
   4121 		mutex_exit(&ncec->ncec_lock);
   4122 		ret = B_FALSE;
   4123 		goto done;
   4124 	}
   4125 	mutex_exit(&ncec->ncec_lock);
   4126 	if (ill->ill_defend_count < defend_rate)
   4127 		ill->ill_defend_count++;
   4128 	if (ill->ill_defend_count == defend_rate) {
   4129 		/*
   4130 		 * we are no longer allowed to send unbidden defense
   4131 		 * messages. Wait for rescheduling.
   4132 		 */
   4133 		ret = B_TRUE;
   4134 	} else {
   4135 		ret = B_FALSE;
   4136 	}
   4137 done:
   4138 	mutex_exit(&ill->ill_lock);
   4139 	/*
   4140 	 * After all the locks have been dropped we can restart nce timer,
   4141 	 * and refrele the delayed ncecs
   4142 	 */
   4143 	for (i = 0; i < ncert.ncert_num; i++) {
   4144 		clock_t	xmit_interval;
   4145 		ncec_t	*tmp;
   4146 
   4147 		tmp = ncert.ncert_nces[i];
   4148 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
   4149 		    B_FALSE);
   4150 		nce_restart_timer(tmp, xmit_interval);
   4151 		ncec_refrele(tmp);
   4152 	}
   4153 	return (ret);
   4154 }
   4155 
   4156 boolean_t
   4157 ndp_announce(ncec_t *ncec)
   4158 {
   4159 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
   4160 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
   4161 	    nce_advert_flags(ncec)));
   4162 }
   4163 
/*
 * Choose the source address, and the ill associated with it, for a
 * message sent on behalf of `ncec'.
 *
 * `src' must point at an unspecified (all-zero) address on entry. The
 * candidate source is, in order of preference:
 *  - the ncec's own address, when the ncec is one of our addresses;
 *  - the source address of the packet queued on the ncec, provided it is
 *    an address assigned to one of our interfaces;
 *  - an address picked by ipif_select_source_v[4,6]() on the ncec's ill,
 *    or failing that (IPMP only) on the group's current transmit ill.
 *
 * On success *src is set to the chosen address and the ill derived from
 * the selected ipif is returned with a reference held, which the caller
 * is responsible for dropping. NULL is returned when the candidate ipif
 * has not finished DAD (and this is not a DAD request itself) or when no
 * source ipif can be found; *src is left untouched in those two cases.
 * NULL is also returned, with *src written, when no ipif matched an
 * address of our own, or when the IPMP ipif has no bound ill to hold.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* our own address: it is its own source */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		/* src4 has been set on every IPv4 path that gets here */
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6]  below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/*
		 * Nothing usable on the IPMP ill itself: retry the selection
		 * on the group's current transmit ill. Here src4 is reused as
		 * scratch space for the IPv4 form of the ncec's address.
		 */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/*
		 * Trade the ipif reference for a held ill: the bound ill for
		 * an ipif on an IPMP ill, the ipif's own ill otherwise.
		 */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
   4306 
   4307 void
   4308 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
   4309     uchar_t *hwaddr, int hwaddr_len, int flags)
   4310 {
   4311 	ill_t	*ill;
   4312 	ncec_t	*ncec;
   4313 	nce_t	*nce;
   4314 	uint16_t new_state;
   4315 
   4316 	ill = (ipif ? ipif->ipif_ill : NULL);
   4317 	if (ill != NULL) {
   4318 		/*
   4319 		 * only one ncec is possible
   4320 		 */
   4321 		nce = nce_lookup_v4(ill, addr);
   4322 		if (nce != NULL) {
   4323 			ncec = nce->nce_common;
   4324 			mutex_enter(&ncec->ncec_lock);
   4325 			if (NCE_ISREACHABLE(ncec))
   4326 				new_state = ND_UNCHANGED;
   4327 			else
   4328 				new_state = ND_STALE;
   4329 			ncec->ncec_flags = flags;
   4330 			nce_update(ncec, new_state, hwaddr);
   4331 			mutex_exit(&ncec->ncec_lock);
   4332 			nce_refrele(nce);
   4333 			return;
   4334 		}
   4335 	} else {
   4336 		/*
   4337 		 * ill is wildcard; clean up all ncec's and ire's
   4338 		 * that match on addr.
   4339 		 */
   4340 		nce_hw_map_t hwm;
   4341 
   4342 		hwm.hwm_addr = *addr;
   4343 		hwm.hwm_hwlen = hwaddr_len;
   4344 		hwm.hwm_hwaddr = hwaddr;
   4345 		hwm.hwm_flags = flags;
   4346 
   4347 		ncec_walk_common(ipst->ips_ndp4, NULL,
   4348 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
   4349 	}
   4350 }
   4351 
   4352 /*
   4353  * Common function to add ncec entries.
   4354  * we always add the ncec with ncec_ill == ill, and always create
   4355  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
   4356  * ncec is !reachable.
   4357  *
   4358  * When the caller passes in an nce_state of ND_UNCHANGED,
   4359  * nce_add_common() will determine the state of the created nce based
   4360  * on the ill_net_type and nce_flags used. Otherwise, the nce will
   4361  * be created with state set to the passed in nce_state.
   4362  */
   4363 static int
   4364 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   4365     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
   4366 {
   4367 	static	ncec_t		nce_nil;
   4368 	uchar_t			*template = NULL;
   4369 	int			err;
   4370 	ncec_t			*ncec;
   4371 	ncec_t			**ncep;
   4372 	ip_stack_t		*ipst = ill->ill_ipst;
   4373 	uint16_t		state;
   4374 	boolean_t		fastprobe = B_FALSE;
   4375 	struct ndp_g_s		*ndp;
   4376 	nce_t			*nce = NULL;
   4377 	mblk_t			*dlur_mp = NULL;
   4378 
   4379 	if (ill->ill_isv6)
   4380 		ndp = ill->ill_ipst->ips_ndp6;
   4381 	else
   4382 		ndp = ill->ill_ipst->ips_ndp4;
   4383 
   4384 	*retnce = NULL;
   4385 
   4386 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
   4387 
   4388 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
   4389 		ip0dbg(("nce_add_common: no addr\n"));
   4390 		return (EINVAL);
   4391 	}
   4392 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
   4393 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
   4394 		return (EINVAL);
   4395 	}
   4396 
   4397 	if (ill->ill_isv6) {
   4398 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
   4399 	} else {
   4400 		ipaddr_t v4addr;
   4401 
   4402 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
   4403 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
   4404 	}
   4405 
   4406 	/*
   4407 	 * The caller has ensured that there is no nce on ill, but there could
   4408 	 * still be an nce_common_t for the address, so that we find exisiting
   4409 	 * ncec_t strucutures first, and atomically add a new nce_t if
   4410 	 * one is found. The ndp_g_lock ensures that we don't cross threads
   4411 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
   4412 	 * compare for matches across the illgrp because this function is
   4413 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
   4414 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
   4415 	 * appropriate.
   4416 	 */
   4417 	ncec = *ncep;
   4418 	for (; ncec != NULL; ncec = ncec->ncec_next) {
   4419 		if (ncec->ncec_ill == ill) {
   4420 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
   4421 				/*
   4422 				 * We should never find *retnce to be
   4423 				 * MYADDR, since the caller may then
   4424 				 * incorrectly restart a DAD timer that's
   4425 				 * already running.  However, if we are in
   4426 				 * forwarding mode, and the interface is
   4427 				 * moving in/out of groups, the data
   4428 				 * path ire lookup (e.g., ire_revalidate_nce)
   4429 				 * may  have determined that some destination
   4430 				 * is offlink while the control path is adding
   4431 				 * that address as a local address.
   4432 				 * Recover from  this case by failing the
   4433 				 * lookup
   4434 				 */
   4435 				if (NCE_MYADDR(ncec))
   4436 					return (ENXIO);
   4437 				*retnce = nce_ill_lookup_then_add(ill, ncec);
   4438 				if (*retnce != NULL)
   4439 					break;
   4440 			}
   4441 		}
   4442 	}
   4443 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
   4444 		return (0);
   4445 
   4446 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
   4447 	if (ncec == NULL)
   4448 		return (ENOMEM);
   4449 	*ncec = nce_nil;
   4450 	ncec->ncec_ill = ill;
   4451 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
   4452 	ncec->ncec_flags = flags;
   4453 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
   4454 
   4455 	if (!ill->ill_isv6) {
   4456 		ipaddr_t addr4;
   4457 
   4458 		/*
   4459 		 * DAD probe interval and probe count are set based on
   4460 		 * fast/slow probe settings. If the underlying link doesn't
   4461 		 * have reliably up/down notifications or if we're working
   4462 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
   4463 		 * don't use the fast timers.  Otherwise, use them.
   4464 		 */
   4465 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
   4466 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
   4467 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
   4468 			fastprobe = B_TRUE;
   4469 		if (fastprobe) {
   4470 			ncec->ncec_xmit_interval =
   4471 			    ipst->ips_arp_fastprobe_interval;
   4472 			ncec->ncec_pcnt =
   4473 			    ipst->ips_arp_fastprobe_count;
   4474 			ncec->ncec_flags |= NCE_F_FAST;
   4475 		} else {
   4476 			ncec->ncec_xmit_interval =
   4477 			    ipst->ips_arp_probe_interval;
   4478 			ncec->ncec_pcnt =
   4479 			    ipst->ips_arp_probe_count;
   4480 		}
   4481 		if (NCE_PUBLISH(ncec)) {
   4482 			ncec->ncec_unsolicit_count =
   4483 			    ipst->ips_ip_arp_publish_count;
   4484 		}
   4485 	} else {
   4486 		/*
   4487 		 * probe interval is constant: ILL_PROBE_INTERVAL
   4488 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
   4489 		 */
   4490 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   4491 		if (NCE_PUBLISH(ncec)) {
   4492 			ncec->ncec_unsolicit_count =
   4493 			    ipst->ips_ip_ndp_unsolicit_count;
   4494 		}
   4495 	}
   4496 	ncec->ncec_rcnt = ill->ill_xmit_count;
   4497 	ncec->ncec_addr = *addr;
   4498 	ncec->ncec_qd_mp = NULL;
   4499 	ncec->ncec_refcnt = 1; /* for ncec getting created */
   4500 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
   4501 	ncec->ncec_trace_disable = B_FALSE;
   4502 
   4503 	/*
   4504 	 * ncec_lladdr holds link layer address
   4505 	 */
   4506 	if (hw_addr_len > 0) {
   4507 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
   4508 		if (template == NULL) {
   4509 			err = ENOMEM;
   4510 			goto err_ret;
   4511 		}
   4512 		ncec->ncec_lladdr = template;
   4513 		ncec->ncec_lladdr_length = hw_addr_len;
   4514 		bzero(ncec->ncec_lladdr, hw_addr_len);
   4515 	}
   4516 	if ((flags & NCE_F_BCAST) != 0) {
   4517 		state = ND_REACHABLE;
   4518 		ASSERT(hw_addr_len > 0);
   4519 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
   4520 		state = ND_INITIAL;
   4521 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
   4522 		/*
   4523 		 * NORESOLVER entries are always created in the REACHABLE
   4524 		 * state.
   4525 		 */
   4526 		state = ND_REACHABLE;
   4527 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
   4528 		    ill->ill_mactype != DL_IPV4 &&
   4529 		    ill->ill_mactype != DL_6TO4) {
   4530 			/*
   4531 			 * We create a nce_res_mp with the IP nexthop address
   4532 			 * as the destination address if the physical length
   4533 			 * is exactly 4 bytes for point-to-multipoint links
   4534 			 * that do their own resolution from IP to link-layer
   4535 			 * address (e.g. IP over X.25).
   4536 			 */
   4537 			bcopy((uchar_t *)addr,
   4538 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
   4539 		}
   4540 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
   4541 		    ill->ill_mactype != DL_IPV6) {
   4542 			/*
   4543 			 * We create a nce_res_mp with the IP nexthop address
   4544 			 * as the destination address if the physical legnth
   4545 			 * is exactly 16 bytes for point-to-multipoint links
   4546 			 * that do their own resolution from IP to link-layer
   4547 			 * address.
   4548 			 */
   4549 			bcopy((uchar_t *)addr,
   4550 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
   4551 		}
   4552 		/*
   4553 		 * Since NUD is not part of the base IPv4 protocol definition,
   4554 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
   4555 		 * age, and are marked NCE_F_NONUD.
   4556 		 */
   4557 		if (!ill->ill_isv6)
   4558 			ncec->ncec_flags |= NCE_F_NONUD;
   4559 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
   4560 		state = ND_REACHABLE;
   4561 	}
   4562 
   4563 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
   4564 		/*
   4565 		 * We are adding an ncec with a deterministic hw_addr,
   4566 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
   4567 		 *
   4568 		 * if we are adding a unicast ncec for the local address
   4569 		 * it would be REACHABLE; we would be adding a ND_STALE entry
   4570 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
   4571 		 * addresses are added in PROBE to trigger DAD.
   4572 		 */
   4573 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
   4574 		    ill->ill_net_type == IRE_IF_NORESOLVER)
   4575 			state = ND_REACHABLE;
   4576 		else if (!NCE_PUBLISH(ncec))
   4577 			state = ND_STALE;
   4578 		else
   4579 			state = ND_PROBE;
   4580 		if (hw_addr != NULL)
   4581 			nce_set_ll(ncec, hw_addr);
   4582 	}
   4583 	/* caller overrides internally computed state */
   4584 	if (nce_state != ND_UNCHANGED)
   4585 		state = nce_state;
   4586 
   4587 	if (state == ND_PROBE)
   4588 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
   4589 
   4590 	ncec->ncec_state = state;
   4591 
   4592 	if (state == ND_REACHABLE) {
   4593 		ncec->ncec_last = ncec->ncec_init_time =
   4594 		    TICK_TO_MSEC(ddi_get_lbolt64());
   4595 	} else {
   4596 		ncec->ncec_last = 0;
   4597 		if (state == ND_INITIAL)
   4598 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
   4599 	}
   4600 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
   4601 	    offsetof(ncec_cb_t, ncec_cb_node));
   4602 	/*
   4603 	 * have all the memory allocations out of the way before taking locks
   4604 	 * and adding the nce.
   4605 	 */
   4606 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
   4607 	if (nce == NULL) {
   4608 		err = ENOMEM;
   4609 		goto err_ret;
   4610 	}
   4611 	if (ncec->ncec_lladdr != NULL ||
   4612 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
   4613 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
   4614 		    ill->ill_phys_addr_length, ill->ill_sap,
   4615 		    ill->ill_sap_length);
   4616 		if (dlur_mp == NULL) {
   4617 			err = ENOMEM;
   4618 			goto err_ret;
   4619 		}
   4620 	}
   4621 
   4622 	/*
   4623 	 * Atomically ensure that the ill is not CONDEMNED, before
   4624 	 * adding the NCE.
   4625 	 */
   4626 	mutex_enter(&ill->ill_lock);
   4627 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   4628 		mutex_exit(&ill->ill_lock);
   4629 		err = EINVAL;
   4630 		goto err_ret;
   4631 	}
   4632 	if (!NCE_MYADDR(ncec) &&
   4633 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
   4634 		mutex_exit(&ill->ill_lock);
   4635 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
   4636 		err = EINVAL;
   4637 		goto err_ret;
   4638 	}
   4639 	/*
   4640 	 * Acquire the ncec_lock even before adding the ncec to the list
   4641 	 * so that it cannot get deleted after the ncec is added, but
   4642 	 * before we add the nce.
   4643 	 */
   4644 	mutex_enter(&ncec->ncec_lock);
   4645 	if ((ncec->ncec_next = *ncep) != NULL)
   4646 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
   4647 	*ncep = ncec;
   4648 	ncec->ncec_ptpn = ncep;
   4649 
   4650 	/* Bump up the number of ncec's referencing this ill */
   4651 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
   4652 	    (char *), "ncec", (void *), ncec);
   4653 	ill->ill_ncec_cnt++;
   4654 	/*
   4655 	 * Since we hold the ncec_lock at this time, the ncec cannot be
   4656 	 * condemned, and we can safely add the nce.
   4657 	 */
   4658 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
   4659 	mutex_exit(&ncec->ncec_lock);
   4660 	mutex_exit(&ill->ill_lock);
   4661 
   4662 	/* caller must trigger fastpath on *retnce */
   4663 	return (0);
   4664 
   4665 err_ret:
   4666 	if (ncec != NULL)
   4667 		kmem_cache_free(ncec_cache, ncec);
   4668 	if (nce != NULL)
   4669 		kmem_cache_free(nce_cache, nce);
   4670 	freemsg(dlur_mp);
   4671 	if (template != NULL)
   4672 		kmem_free(template, ill->ill_phys_addr_length);
   4673 	return (err);
   4674 }
   4675 
   4676 /*
   4677  * take a ref on the nce
   4678  */
   4679 void
   4680 nce_refhold(nce_t *nce)
   4681 {
   4682 	mutex_enter(&nce->nce_lock);
   4683 	nce->nce_refcnt++;
   4684 	ASSERT((nce)->nce_refcnt != 0);
   4685 	mutex_exit(&nce->nce_lock);
   4686 }
   4687 
   4688 /*
   4689  * release a ref on the nce; In general, this
   4690  * cannot be called with locks held because nce_inactive
   4691  * may result in nce_inactive which will take the ill_lock,
   4692  * do ipif_ill_refrele_tail etc. Thus the one exception
   4693  * where this can be called with locks held is when the caller
   4694  * is certain that the nce_refcnt is sufficient to prevent
   4695  * the invocation of nce_inactive.
   4696  */
   4697 void
   4698 nce_refrele(nce_t *nce)
   4699 {
   4700 	ASSERT((nce)->nce_refcnt != 0);
   4701 	mutex_enter(&nce->nce_lock);
   4702 	if (--nce->nce_refcnt == 0)
   4703 		nce_inactive(nce); /* destroys the mutex */
   4704 	else
   4705 		mutex_exit(&nce->nce_lock);
   4706 }
   4707 
   4708 /*
   4709  * free the nce after all refs have gone away.
   4710  */
   4711 static void
   4712 nce_inactive(nce_t *nce)
   4713 {
   4714 	ill_t *ill = nce->nce_ill;
   4715 
   4716 	ASSERT(nce->nce_refcnt == 0);
   4717 
   4718 	ncec_refrele_notr(nce->nce_common);
   4719 	nce->nce_common = NULL;
   4720 	freemsg(nce->nce_fp_mp);
   4721 	freemsg(nce->nce_dlur_mp);
   4722 
   4723 	mutex_enter(&ill->ill_lock);
   4724 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
   4725 	    (char *), "nce", (void *), nce);
   4726 	ill->ill_nce_cnt--;
   4727 	nce->nce_ill = NULL;
   4728 	/*
   4729 	 * If the number of ncec's associated with this ill have dropped
   4730 	 * to zero, check whether we need to restart any operation that
   4731 	 * is waiting for this to happen.
   4732 	 */
   4733 	if (ILL_DOWN_OK(ill)) {
   4734 		/* ipif_ill_refrele_tail drops the ill_lock */
   4735 		ipif_ill_refrele_tail(ill);
   4736 	} else {
   4737 		mutex_exit(&ill->ill_lock);
   4738 	}
   4739 
   4740 	mutex_destroy(&nce->nce_lock);
   4741 	kmem_cache_free(nce_cache, nce);
   4742 }
   4743 
   4744 /*
   4745  * Add an nce to the ill_nce list.
   4746  */
   4747 static nce_t *
   4748 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
   4749 {
   4750 	bzero(nce, sizeof (*nce));
   4751 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
   4752 	nce->nce_common = ncec;
   4753 	nce->nce_addr = ncec->ncec_addr;
   4754 	nce->nce_ill = ill;
   4755 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
   4756 	    (char *), "nce", (void *), nce);
   4757 	ill->ill_nce_cnt++;
   4758 
   4759 	nce->nce_refcnt = 1; /* for the thread */
   4760 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
   4761 	nce->nce_dlur_mp = dlur_mp;
   4762 
   4763 	/* add nce to the ill's fastpath list.  */
   4764 	nce->nce_refcnt++; /* for the list */
   4765 	list_insert_head(&ill->ill_nce, nce);
   4766 	return (nce);
   4767 }
   4768 
   4769 static nce_t *
   4770 nce_add(ill_t *ill, ncec_t *ncec)
   4771 {
   4772 	nce_t	*nce;
   4773 	mblk_t	*dlur_mp = NULL;
   4774 
   4775 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4776 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   4777 
   4778 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
   4779 	if (nce == NULL)
   4780 		return (NULL);
   4781 	if (ncec->ncec_lladdr != NULL ||
   4782 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
   4783 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
   4784 		    ill->ill_phys_addr_length, ill->ill_sap,
   4785 		    ill->ill_sap_length);
   4786 		if (dlur_mp == NULL) {
   4787 			kmem_cache_free(nce_cache, nce);
   4788 			return (NULL);
   4789 		}
   4790 	}
   4791 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
   4792 }
   4793 
   4794 /*
   4795  * remove the nce from the ill_faspath list
   4796  */
   4797 void
   4798 nce_delete(nce_t *nce)
   4799 {
   4800 	ill_t	*ill = nce->nce_ill;
   4801 
   4802 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4803 
   4804 	mutex_enter(&nce->nce_lock);
   4805 	if (nce->nce_is_condemned) {
   4806 		/*
   4807 		 * some other thread has removed this nce from the ill_nce list
   4808 		 */
   4809 		mutex_exit(&nce->nce_lock);
   4810 		return;
   4811 	}
   4812 	nce->nce_is_condemned = B_TRUE;
   4813 	mutex_exit(&nce->nce_lock);
   4814 
   4815 	list_remove(&ill->ill_nce, nce);
   4816 	/*
   4817 	 * even though we are holding the ill_lock, it is ok to
   4818 	 * call nce_refrele here because we know that we should have
   4819 	 * at least 2 refs on the nce: one for the thread, and one
   4820 	 * for the list. The refrele below will release the one for
   4821 	 * the list.
   4822 	 */
   4823 	nce_refrele(nce);
   4824 }
   4825 
   4826 nce_t *
   4827 nce_lookup(ill_t *ill, const in6_addr_t *addr)
   4828 {
   4829 	nce_t *nce = NULL;
   4830 
   4831 	ASSERT(ill != NULL);
   4832 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4833 
   4834 	for (nce = list_head(&ill->ill_nce); nce != NULL;
   4835 	    nce = list_next(&ill->ill_nce, nce)) {
   4836 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
   4837 			break;
   4838 	}
   4839 
   4840 	/*
   4841 	 * if we found the nce on the ill_nce list while holding
   4842 	 * the ill_lock, then it cannot be condemned yet.
   4843 	 */
   4844 	if (nce != NULL) {
   4845 		ASSERT(!nce->nce_is_condemned);
   4846 		nce_refhold(nce);
   4847 	}
   4848 	return (nce);
   4849 }
   4850 
   4851 /*
   4852  * Walk the ill_nce list on ill. The callback function func() cannot perform
   4853  * any destructive actions.
   4854  */
   4855 static void
   4856 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
   4857 {
   4858 	nce_t *nce = NULL, *nce_next;
   4859 
   4860 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4861 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
   4862 		nce_next = list_next(&ill->ill_nce, nce);
   4863 		if (func(ill, nce, arg) != 0)
   4864 			break;
   4865 		nce = nce_next;
   4866 	}
   4867 }
   4868 
   4869 void
   4870 nce_walk(ill_t *ill, pfi_t func, void *arg)
   4871 {
   4872 	mutex_enter(&ill->ill_lock);
   4873 	nce_walk_common(ill, func, arg);
   4874 	mutex_exit(&ill->ill_lock);
   4875 }
   4876 
   4877 void
   4878 nce_flush(ill_t *ill, boolean_t flushall)
   4879 {
   4880 	nce_t *nce, *nce_next;
   4881 	list_t dead;
   4882 
   4883 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
   4884 	mutex_enter(&ill->ill_lock);
   4885 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
   4886 		nce_next = list_next(&ill->ill_nce, nce);
   4887 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
   4888 			nce = nce_next;
   4889 			continue;
   4890 		}
   4891 		/*
   4892 		 * nce_delete requires that the caller should either not
   4893 		 * be holding locks, or should hold a ref to ensure that
   4894 		 * we wont hit ncec_inactive. So take a ref and clean up
   4895 		 * after the list is flushed.
   4896 		 */
   4897 		nce_refhold(nce);
   4898 		nce_delete(nce);
   4899 		list_insert_tail(&dead, nce);
   4900 		nce = nce_next;
   4901 	}
   4902 	mutex_exit(&ill->ill_lock);
   4903 	while ((nce = list_head(&dead)) != NULL) {
   4904 		list_remove(&dead, nce);
   4905 		nce_refrele(nce);
   4906 	}
   4907 	ASSERT(list_is_empty(&dead));
   4908 	list_destroy(&dead);
   4909 }
   4910 
   4911 /* Return an interval that is anywhere in the [1 .. intv] range */
   4912 static clock_t
   4913 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
   4914 {
   4915 	clock_t rnd, frac;
   4916 
   4917 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
   4918 	/* Note that clock_t is signed; must chop off bits */
   4919 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
   4920 	if (initial_time) {
   4921 		if (intv <= 0)
   4922 			intv = 1;
   4923 		else
   4924 			intv = (rnd % intv) + 1;
   4925 	} else {
   4926 		/* Compute 'frac' as 20% of the configured interval */
   4927 		if ((frac = intv / 5) <= 1)
   4928 			frac = 2;
   4929 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
   4930 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
   4931 		intv = 1;
   4932 	}
   4933 	return (intv);
   4934 }
   4935 
   4936 void
   4937 nce_resolv_ipmp_ok(ncec_t *ncec)
   4938 {
   4939 	mblk_t *mp;
   4940 	uint_t pkt_len;
   4941 	iaflags_t ixaflags = IXAF_NO_TRACE;
   4942 	nce_t *under_nce;
   4943 	ill_t	*ill = ncec->ncec_ill;
   4944 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   4945 	ipif_t *src_ipif = NULL;
   4946 	ip_stack_t *ipst = ill->ill_ipst;
   4947 	ill_t *send_ill;
   4948 	uint_t nprobes;
   4949 
   4950 	ASSERT(IS_IPMP(ill));
   4951 
   4952 	mutex_enter(&ncec->ncec_lock);
   4953 	nprobes = ncec->ncec_nprobes;
   4954 	mp = ncec->ncec_qd_mp;
   4955 	ncec->ncec_qd_mp = NULL;
   4956 	ncec->ncec_nprobes = 0;
   4957 	mutex_exit(&ncec->ncec_lock);
   4958 
   4959 	while (mp != NULL) {
   4960 		mblk_t *nxt_mp;
   4961 
   4962 		nxt_mp = mp->b_next;
   4963 		mp->b_next = NULL;
   4964 		if (isv6) {
   4965 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
   4966 
   4967 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   4968 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
   4969 			    ill, ALL_ZONES, ipst);
   4970 		} else {
   4971 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
   4972 
   4973 			ixaflags |= IXAF_IS_IPV4;
   4974 			pkt_len = ntohs(ipha->ipha_length);
   4975 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
   4976 			    ill, ALL_ZONES, ipst);
   4977 		}
   4978 
   4979 		/*
   4980 		 * find a new nce based on an under_ill. The first IPMP probe
   4981 		 * packet gets queued, so we could still find a src_ipif that
   4982 		 * matches an IPMP test address.
   4983 		 */
   4984 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
   4985 			/*
   4986 			 * if src_ipif is null, this could be either a
   4987 			 * forwarded packet or a probe whose src got deleted.
   4988 			 * We identify the former case by looking for the
   4989 			 * ncec_nprobes: the first ncec_nprobes packets are
   4990 			 * probes;
   4991 			 */
   4992 			if (src_ipif == NULL && nprobes > 0)
   4993 				goto drop_pkt;
   4994 
   4995 			/*
   4996 			 * For forwarded packets, we use the ipmp rotor
   4997 			 * to find send_ill.
   4998 			 */
   4999 			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
   5000 			    B_TRUE);
   5001 		} else {
   5002 			send_ill = src_ipif->ipif_ill;
   5003 			ill_refhold(send_ill);
   5004 		}
   5005 
   5006 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
   5007 		    (ncec_t *), ncec, (ipif_t *),
   5008 		    src_ipif, (ill_t *), send_ill);
   5009 
   5010 		if (send_ill == NULL) {
   5011 			if (src_ipif != NULL)
   5012 				ipif_refrele(src_ipif);
   5013 			goto drop_pkt;
   5014 		}
   5015 		/* create an under_nce on send_ill */
   5016 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   5017 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
   5018 			under_nce = nce_fastpath_create(send_ill, ncec);
   5019 		else
   5020 			under_nce = NULL;
   5021 		rw_exit(&ipst->ips_ill_g_lock);
   5022 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
   5023 			nce_fastpath_trigger(under_nce);
   5024 
   5025 		ill_refrele(send_ill);
   5026 		if (src_ipif != NULL)
   5027 			ipif_refrele(src_ipif);
   5028 
   5029 		if (under_nce != NULL) {
   5030 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
   5031 			    ALL_ZONES, 0, NULL);
   5032 			nce_refrele(under_nce);
   5033 			if (nprobes > 0)
   5034 				nprobes--;
   5035 			mp = nxt_mp;
   5036 			continue;
   5037 		}
   5038 drop_pkt:
   5039 		if (isv6) {
   5040 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
   5041 		} else {
   5042 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   5043 		}
   5044 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
   5045 		freemsg(mp);
   5046 		if (nprobes > 0)
   5047 			nprobes--;
   5048 		mp = nxt_mp;
   5049 	}
   5050 	ncec_cb_dispatch(ncec); /* complete callbacks */
   5051 }
   5052