Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/stream.h>
     28 #include <sys/stropts.h>
     29 #include <sys/strsun.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/errno.h>
     32 #include <sys/dlpi.h>
     33 #include <sys/socket.h>
     34 #include <sys/ddi.h>
     35 #include <sys/sunddi.h>
     36 #include <sys/cmn_err.h>
     37 #include <sys/debug.h>
     38 #include <sys/vtrace.h>
     39 #include <sys/kmem.h>
     40 #include <sys/zone.h>
     41 #include <sys/ethernet.h>
     42 #include <sys/sdt.h>
     43 #include <sys/mac.h>
     44 
     45 #include <net/if.h>
     46 #include <net/if_types.h>
     47 #include <net/if_dl.h>
     48 #include <net/route.h>
     49 #include <netinet/in.h>
     50 #include <netinet/ip6.h>
     51 #include <netinet/icmp6.h>
     52 
     53 #include <inet/common.h>
     54 #include <inet/mi.h>
     55 #include <inet/mib2.h>
     56 #include <inet/nd.h>
     57 #include <inet/ip.h>
     58 #include <inet/ip_impl.h>
     59 #include <inet/ipclassifier.h>
     60 #include <inet/ip_if.h>
     61 #include <inet/ip_ire.h>
     62 #include <inet/ip_rts.h>
     63 #include <inet/ip6.h>
     64 #include <inet/ip_ndp.h>
     65 #include <inet/sctp_ip.h>
     66 #include <inet/ip_arp.h>
     67 #include <inet/ip2mac_impl.h>
     68 
     69 #define	ANNOUNCE_INTERVAL(isv6) \
     70 	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
     71 	ipst->ips_ip_arp_publish_interval)
     72 
     73 #define	DEFENSE_INTERVAL(isv6) \
     74 	(isv6 ? ipst->ips_ndp_defend_interval : \
     75 	ipst->ips_arp_defend_interval)
     76 
     77 /* Non-tunable probe interval, based on link capabilities */
     78 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
     79 
     80 /*
     81  * The IPv4 Link Local address space is special; we do extra duplicate checking
     82  * there, as the entire assignment mechanism rests on random numbers.
     83  */
     84 #define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
     85 				((uchar_t *)ptr)[1] == 254)
     86 
     87 /*
     88  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
     89  * in to the ncec*add* functions.
     90  *
     91  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
     92  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
     93  * that we will respond to requests for the protocol address.
     94  */
     95 #define	NCE_EXTERNAL_FLAGS_MASK \
     96 	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
     97 	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
     98 	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
     99 
    100 /*
    101  * Lock ordering:
    102  *
    103  *	ndp_g_lock -> ill_lock -> ncec_lock
    104  *
    105  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
    106  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
    107  * ncec_refcnt).
    108  */
    109 
    110 static	void	nce_cleanup_list(ncec_t *ncec);
    111 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
    112 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    113     ncec_t *);
    114 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
    115 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    116     uint16_t ncec_flags, nce_t **newnce);
    117 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    118     uint16_t ncec_flags, nce_t **newnce);
    119 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
    120     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    121     const in6_addr_t *target, int flag);
    122 static void	ncec_refhold_locked(ncec_t *);
    123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
    124 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
    125 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    126     uint16_t, uint16_t, nce_t **);
    127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
    128 static nce_t *nce_add(ill_t *, ncec_t *);
    129 static void nce_inactive(nce_t *);
    130 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
    131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
    132 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    133     uint16_t, uint16_t, nce_t **);
    134 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    135     uint16_t, uint16_t, nce_t **);
    136 static int  nce_add_v6_postprocess(nce_t *);
    137 static int  nce_add_v4_postprocess(nce_t *);
    138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
    139 static clock_t nce_fuzz_interval(clock_t, boolean_t);
    140 static void nce_resolv_ipmp_ok(ncec_t *);
    141 static void nce_walk_common(ill_t *, pfi_t, void *);
    142 static void nce_start_timer(ncec_t *, uint_t);
    143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
    144 static void nce_fastpath_trigger(nce_t *);
    145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
    146 
    147 #ifdef DEBUG
    148 static void	ncec_trace_cleanup(const ncec_t *);
    149 #endif
    150 
    151 #define	NCE_HASH_PTR_V4(ipst, addr)					\
    152 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
    153 
    154 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
    155 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
    156 		NCE_TABLE_SIZE)]))
    157 
    158 extern kmem_cache_t *ncec_cache;
    159 extern kmem_cache_t *nce_cache;
    160 
    161 /*
    162  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
    163  * If src_ill is not null, the ncec_addr is bound to src_ill. The
    164  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
    165  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
    166  * IPMP cast_ill (in the IPMP case).
    167  *
    168  * Note that the probe interval is based on ncec->ncec_ill which
    169  * may be the ipmp_ill.
    170  */
    171 static void
    172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
    173 {
    174 	boolean_t dropped;
    175 	uint32_t probe_interval;
    176 
    177 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
    178 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
    179 	if (ncec->ncec_ipversion == IPV6_VERSION) {
    180 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
    181 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
    182 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
    183 		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
    184 	} else {
    185 		/* IPv4 DAD delay the initial probe. */
    186 		if (send_probe)
    187 			dropped = arp_probe(ncec);
    188 		else
    189 			dropped = B_TRUE;
    190 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
    191 		    !send_probe);
    192 	}
    193 	if (!dropped) {
    194 		mutex_enter(&ncec->ncec_lock);
    195 		ncec->ncec_pcnt--;
    196 		mutex_exit(&ncec->ncec_lock);
    197 	}
    198 	nce_restart_timer(ncec, probe_interval);
    199 }
    200 
    201 /*
    202  * Compute default flags to use for an advertisement of this ncec's address.
    203  */
    204 static int
    205 nce_advert_flags(const ncec_t *ncec)
    206 {
    207 	int flag = 0;
    208 
    209 	if (ncec->ncec_flags & NCE_F_ISROUTER)
    210 		flag |= NDP_ISROUTER;
    211 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
    212 		flag |= NDP_ORIDE;
    213 
    214 	return (flag);
    215 }
    216 
    217 /*
    218  * NDP Cache Entry creation routine.
    219  * This routine must always be called with ndp6->ndp_g_lock held.
    220  */
    221 int
    222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    223     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
    224 {
    225 	int		err;
    226 	nce_t		*nce;
    227 
    228 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
    229 	ASSERT(ill != NULL && ill->ill_isv6);
    230 
    231 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
    232 	    &nce);
    233 	if (err != 0)
    234 		return (err);
    235 	ASSERT(newnce != NULL);
    236 	*newnce = nce;
    237 	return (err);
    238 }
    239 
    240 /*
    241  * Post-processing routine to be executed after nce_add_v6(). This function
    242  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
    243  * and must be called without any locks held.
    244  */
    245 int
    246 nce_add_v6_postprocess(nce_t *nce)
    247 {
    248 	ncec_t		*ncec = nce->nce_common;
    249 	boolean_t	dropped = B_FALSE;
    250 	uchar_t		*hw_addr = ncec->ncec_lladdr;
    251 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
    252 	ill_t		*ill = ncec->ncec_ill;
    253 	int		err = 0;
    254 	uint16_t	flags = ncec->ncec_flags;
    255 	ip_stack_t	*ipst = ill->ill_ipst;
    256 	boolean_t	trigger_fastpath = B_TRUE;
    257 
    258 	/*
    259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
    260 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
    261 	 * We call nce_fastpath from nce_update if the link layer address of
    262 	 * the peer changes from nce_update
    263 	 */
    264 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
    265 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
    266 		trigger_fastpath = B_FALSE;
    267 
    268 	if (trigger_fastpath)
    269 		nce_fastpath_trigger(nce);
    270 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
    271 		ill_t *hwaddr_ill;
    272 		/*
    273 		 * Unicast entry that needs DAD.
    274 		 */
    275 		if (IS_IPMP(ill)) {
    276 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
    277 			    hw_addr, hw_addr_len);
    278 		} else {
    279 			hwaddr_ill = ill;
    280 		}
    281 		nce_dad(ncec, hwaddr_ill, B_TRUE);
    282 		err = EINPROGRESS;
    283 	} else if (flags & NCE_F_UNSOL_ADV) {
    284 		/*
    285 		 * We account for the transmit below by assigning one
    286 		 * less than the ndd variable. Subsequent decrements
    287 		 * are done in nce_timer.
    288 		 */
    289 		mutex_enter(&ncec->ncec_lock);
    290 		ncec->ncec_unsolicit_count =
    291 		    ipst->ips_ip_ndp_unsolicit_count - 1;
    292 		mutex_exit(&ncec->ncec_lock);
    293 		dropped = ndp_xmit(ill,
    294 		    ND_NEIGHBOR_ADVERT,
    295 		    hw_addr,
    296 		    hw_addr_len,
    297 		    &ncec->ncec_addr,	/* Source and target of the adv */
    298 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
    299 		    nce_advert_flags(ncec));
    300 		mutex_enter(&ncec->ncec_lock);
    301 		if (dropped)
    302 			ncec->ncec_unsolicit_count++;
    303 		else
    304 			ncec->ncec_last_time_defended = ddi_get_lbolt();
    305 		if (ncec->ncec_unsolicit_count != 0) {
    306 			nce_start_timer(ncec,
    307 			    ipst->ips_ip_ndp_unsolicit_interval);
    308 		}
    309 		mutex_exit(&ncec->ncec_lock);
    310 	}
    311 	return (err);
    312 }
    313 
    314 /*
    315  * Atomically lookup and add (if needed) Neighbor Cache information for
    316  * an address.
    317  *
    318  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
    319  * are always added pointing at the ipmp_ill. Thus, when the ill passed
    320  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
    321  * entries will be created, both pointing at the same ncec_t. The nce_t
    322  * entries will have their nce_ill set to the ipmp_ill and the under_ill
    323  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
    324  * Local addresses are always created on the ill passed to nce_add_v6.
    325  */
    326 int
    327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    328     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
    329 {
    330 	int		err = 0;
    331 	ip_stack_t	*ipst = ill->ill_ipst;
    332 	nce_t		*nce, *upper_nce = NULL;
    333 	ill_t		*in_ill = ill;
    334 	boolean_t	need_ill_refrele = B_FALSE;
    335 
    336 	if (flags & NCE_F_MCAST) {
    337 		/*
    338 		 * hw_addr will be figured out in nce_set_multicast_v6;
    339 		 * caller has to select the cast_ill
    340 		 */
    341 		ASSERT(hw_addr == NULL);
    342 		ASSERT(!IS_IPMP(ill));
    343 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
    344 		return (err);
    345 	}
    346 	ASSERT(ill->ill_isv6);
    347 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
    348 		ill = ipmp_ill_hold_ipmp_ill(ill);
    349 		if (ill == NULL)
    350 			return (ENXIO);
    351 		need_ill_refrele = B_TRUE;
    352 	}
    353 
    354 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    355 	nce = nce_lookup_addr(ill, addr);
    356 	if (nce == NULL) {
    357 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
    358 		    &nce);
    359 	} else {
    360 		err = EEXIST;
    361 	}
    362 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    363 	if (err == 0)
    364 		err = nce_add_v6_postprocess(nce);
    365 	if (in_ill != ill && nce != NULL) {
    366 		nce_t *under_nce;
    367 
    368 		/*
    369 		 * in_ill was the under_ill. Try to create the under_nce.
    370 		 * Hold the ill_g_lock to prevent changes to group membership
    371 		 * until we are done.
    372 		 */
    373 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    374 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
    375 			under_nce = nce_fastpath_create(in_ill,
    376 			    nce->nce_common);
    377 			upper_nce = nce;
    378 			if ((nce = under_nce) == NULL)
    379 				err = EINVAL;
    380 		}
    381 		rw_exit(&ipst->ips_ill_g_lock);
    382 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
    383 			nce_fastpath_trigger(under_nce);
    384 	}
    385 	if (nce != NULL) {
    386 		if (newnce != NULL)
    387 			*newnce = nce;
    388 		else
    389 			nce_refrele(nce);
    390 	}
    391 	/* nce_refrele is deferred until the lock is dropped  */
    392 	if (upper_nce != NULL)
    393 		nce_refrele(upper_nce);
    394 	if (need_ill_refrele)
    395 		ill_refrele(ill);
    396 	return (err);
    397 }
    398 
    399 /*
    400  * Remove all the CONDEMNED nces from the appropriate hash table.
    401  * We create a private list of NCEs, these may have ires pointing
    402  * to them, so the list will be passed through to clean up dependent
    403  * ires and only then we can do ncec_refrele() which can make NCE inactive.
    404  */
    405 static void
    406 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
    407 {
    408 	ncec_t *ncec1;
    409 	ncec_t **ptpn;
    410 
    411 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    412 	ASSERT(ndp->ndp_g_walker == 0);
    413 	for (; ncec; ncec = ncec1) {
    414 		ncec1 = ncec->ncec_next;
    415 		mutex_enter(&ncec->ncec_lock);
    416 		if (NCE_ISCONDEMNED(ncec)) {
    417 			ptpn = ncec->ncec_ptpn;
    418 			ncec1 = ncec->ncec_next;
    419 			if (ncec1 != NULL)
    420 				ncec1->ncec_ptpn = ptpn;
    421 			*ptpn = ncec1;
    422 			ncec->ncec_ptpn = NULL;
    423 			ncec->ncec_next = NULL;
    424 			ncec->ncec_next = *free_nce_list;
    425 			*free_nce_list = ncec;
    426 		}
    427 		mutex_exit(&ncec->ncec_lock);
    428 	}
    429 }
    430 
    431 /*
    432  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
    433  *    will return this NCE. Also no new timeouts will
    434  *    be started (See nce_restart_timer).
    435  * 2. Cancel any currently running timeouts.
    436  * 3. If there is an ndp walker, return. The walker will do the cleanup.
    437  *    This ensures that walkers see a consistent list of NCEs while walking.
    438  * 4. Otherwise remove the NCE from the list of NCEs
    439  */
    440 void
    441 ncec_delete(ncec_t *ncec)
    442 {
    443 	ncec_t	**ptpn;
    444 	ncec_t	*ncec1;
    445 	int	ipversion = ncec->ncec_ipversion;
    446 	ndp_g_t *ndp;
    447 	ip_stack_t	*ipst = ncec->ncec_ipst;
    448 
    449 	if (ipversion == IPV4_VERSION)
    450 		ndp = ipst->ips_ndp4;
    451 	else
    452 		ndp = ipst->ips_ndp6;
    453 
    454 	/* Serialize deletes */
    455 	mutex_enter(&ncec->ncec_lock);
    456 	if (NCE_ISCONDEMNED(ncec)) {
    457 		/* Some other thread is doing the delete */
    458 		mutex_exit(&ncec->ncec_lock);
    459 		return;
    460 	}
    461 	/*
    462 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
    463 	 * refcnt has to be >= 2
    464 	 */
    465 	ASSERT(ncec->ncec_refcnt >= 2);
    466 	ncec->ncec_flags |= NCE_F_CONDEMNED;
    467 	mutex_exit(&ncec->ncec_lock);
    468 
    469 	/* Count how many condemned ires for kmem_cache callback */
    470 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
    471 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
    472 
    473 	/* Complete any waiting callbacks */
    474 	ncec_cb_dispatch(ncec);
    475 
    476 	/*
    477 	 * Cancel any running timer. Timeout can't be restarted
    478 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
    479 	 * Passing invalid timeout id is fine.
    480 	 */
    481 	if (ncec->ncec_timeout_id != 0) {
    482 		(void) untimeout(ncec->ncec_timeout_id);
    483 		ncec->ncec_timeout_id = 0;
    484 	}
    485 
    486 	mutex_enter(&ndp->ndp_g_lock);
    487 	if (ncec->ncec_ptpn == NULL) {
    488 		/*
    489 		 * The last ndp walker has already removed this ncec from
    490 		 * the list after we marked the ncec CONDEMNED and before
    491 		 * we grabbed the global lock.
    492 		 */
    493 		mutex_exit(&ndp->ndp_g_lock);
    494 		return;
    495 	}
    496 	if (ndp->ndp_g_walker > 0) {
    497 		/*
    498 		 * Can't unlink. The walker will clean up
    499 		 */
    500 		ndp->ndp_g_walker_cleanup = B_TRUE;
    501 		mutex_exit(&ndp->ndp_g_lock);
    502 		return;
    503 	}
    504 
    505 	/*
    506 	 * Now remove the ncec from the list. nce_restart_timer won't restart
    507 	 * the timer since it is marked CONDEMNED.
    508 	 */
    509 	ptpn = ncec->ncec_ptpn;
    510 	ncec1 = ncec->ncec_next;
    511 	if (ncec1 != NULL)
    512 		ncec1->ncec_ptpn = ptpn;
    513 	*ptpn = ncec1;
    514 	ncec->ncec_ptpn = NULL;
    515 	ncec->ncec_next = NULL;
    516 	mutex_exit(&ndp->ndp_g_lock);
    517 
    518 	/* Removed from ncec_ptpn/ncec_next list */
    519 	ncec_refrele_notr(ncec);
    520 }
    521 
    522 void
    523 ncec_inactive(ncec_t *ncec)
    524 {
    525 	mblk_t		**mpp;
    526 	ill_t		*ill = ncec->ncec_ill;
    527 	ip_stack_t	*ipst = ncec->ncec_ipst;
    528 
    529 	ASSERT(ncec->ncec_refcnt == 0);
    530 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
    531 
    532 	/* Count how many condemned nces for kmem_cache callback */
    533 	if (NCE_ISCONDEMNED(ncec))
    534 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
    535 
    536 	/* Free all allocated messages */
    537 	mpp = &ncec->ncec_qd_mp;
    538 	while (*mpp != NULL) {
    539 		mblk_t  *mp;
    540 
    541 		mp = *mpp;
    542 		*mpp = mp->b_next;
    543 
    544 		inet_freemsg(mp);
    545 	}
    546 	/*
    547 	 * must have been cleaned up in ncec_delete
    548 	 */
    549 	ASSERT(list_is_empty(&ncec->ncec_cb));
    550 	list_destroy(&ncec->ncec_cb);
    551 	/*
    552 	 * free the ncec_lladdr if one was allocated in nce_add_common()
    553 	 */
    554 	if (ncec->ncec_lladdr_length > 0)
    555 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
    556 
    557 #ifdef DEBUG
    558 	ncec_trace_cleanup(ncec);
    559 #endif
    560 
    561 	mutex_enter(&ill->ill_lock);
    562 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
    563 	    (char *), "ncec", (void *), ncec);
    564 	ill->ill_ncec_cnt--;
    565 	ncec->ncec_ill = NULL;
    566 	/*
    567 	 * If the number of ncec's associated with this ill have dropped
    568 	 * to zero, check whether we need to restart any operation that
    569 	 * is waiting for this to happen.
    570 	 */
    571 	if (ILL_DOWN_OK(ill)) {
    572 		/* ipif_ill_refrele_tail drops the ill_lock */
    573 		ipif_ill_refrele_tail(ill);
    574 	} else {
    575 		mutex_exit(&ill->ill_lock);
    576 	}
    577 
    578 	mutex_destroy(&ncec->ncec_lock);
    579 	kmem_cache_free(ncec_cache, ncec);
    580 }
    581 
    582 /*
    583  * ncec_walk routine.  Delete the ncec if it is associated with the ill
    584  * that is going away.  Always called as a writer.
    585  */
    586 void
    587 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
    588 {
    589 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
    590 		ncec_delete(ncec);
    591 	}
    592 }
    593 
    594 /*
    595  * Neighbor Cache cleanup logic for a list of ncec_t entries.
    596  */
    597 static void
    598 nce_cleanup_list(ncec_t *ncec)
    599 {
    600 	ncec_t *ncec_next;
    601 
    602 	ASSERT(ncec != NULL);
    603 	while (ncec != NULL) {
    604 		ncec_next = ncec->ncec_next;
    605 		ncec->ncec_next = NULL;
    606 
    607 		/*
    608 		 * It is possible for the last ndp walker (this thread)
    609 		 * to come here after ncec_delete has marked the ncec CONDEMNED
    610 		 * and before it has removed the ncec from the fastpath list
    611 		 * or called untimeout. So we need to do it here. It is safe
    612 		 * for both ncec_delete and this thread to do it twice or
    613 		 * even simultaneously since each of the threads has a
    614 		 * reference on the ncec.
    615 		 */
    616 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
    617 		/*
    618 		 * Cancel any running timer. Timeout can't be restarted
    619 		 * since CONDEMNED is set. The ncec_lock can't be
    620 		 * held across untimeout though passing invalid timeout
    621 		 * id is fine.
    622 		 */
    623 		if (ncec->ncec_timeout_id != 0) {
    624 			(void) untimeout(ncec->ncec_timeout_id);
    625 			ncec->ncec_timeout_id = 0;
    626 		}
    627 		/* Removed from ncec_ptpn/ncec_next list */
    628 		ncec_refrele_notr(ncec);
    629 		ncec = ncec_next;
    630 	}
    631 }
    632 
    633 /*
    634  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
    635  */
    636 boolean_t
    637 nce_restart_dad(ncec_t *ncec)
    638 {
    639 	boolean_t started;
    640 	ill_t *ill, *hwaddr_ill;
    641 
    642 	if (ncec == NULL)
    643 		return (B_FALSE);
    644 	ill = ncec->ncec_ill;
    645 	mutex_enter(&ncec->ncec_lock);
    646 	if (ncec->ncec_state == ND_PROBE) {
    647 		mutex_exit(&ncec->ncec_lock);
    648 		started = B_TRUE;
    649 	} else if (ncec->ncec_state == ND_REACHABLE) {
    650 		ASSERT(ncec->ncec_lladdr != NULL);
    651 		ncec->ncec_state = ND_PROBE;
    652 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
    653 		/*
    654 		 * Slight cheat here: we don't use the initial probe delay
    655 		 * for IPv4 in this obscure case.
    656 		 */
    657 		mutex_exit(&ncec->ncec_lock);
    658 		if (IS_IPMP(ill)) {
    659 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
    660 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
    661 		} else {
    662 			hwaddr_ill = ill;
    663 		}
    664 		nce_dad(ncec, hwaddr_ill, B_TRUE);
    665 		started = B_TRUE;
    666 	} else {
    667 		mutex_exit(&ncec->ncec_lock);
    668 		started = B_FALSE;
    669 	}
    670 	return (started);
    671 }
    672 
    673 /*
    674  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
    675  * If one is found, the refcnt on the ncec will be incremented.
    676  */
    677 ncec_t *
    678 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
    679 {
    680 	ncec_t		*ncec;
    681 	ip_stack_t	*ipst = ill->ill_ipst;
    682 
    683 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    684 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    685 
    686 	/* Get head of v6 hash table */
    687 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
    688 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
    689 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    690 	rw_exit(&ipst->ips_ill_g_lock);
    691 	return (ncec);
    692 }
    693 /*
    694  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
    695  * If one is found, the refcnt on the ncec will be incremented.
    696  */
    697 ncec_t *
    698 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
    699 {
    700 	ncec_t	*ncec = NULL;
    701 	in6_addr_t addr6;
    702 	ip_stack_t *ipst = ill->ill_ipst;
    703 
    704 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
    705 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
    706 
    707 	/* Get head of v4 hash table */
    708 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
    709 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
    710 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
    711 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
    712 	rw_exit(&ipst->ips_ill_g_lock);
    713 	return (ncec);
    714 }
    715 
    716 /*
    717  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
    718  * If an ncec is found, increment the hold count on that ncec.
    719  * The caller passes in the start of the appropriate hash table, and must
    720  * be holding the appropriate global lock (ndp_g_lock). In addition, since
    721  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
    722  * must be held as reader.
    723  *
    724  * This function always matches across the ipmp group.
    725  */
    726 ncec_t *
    727 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
    728 {
    729 	ndp_g_t		*ndp;
    730 	ip_stack_t	*ipst = ill->ill_ipst;
    731 
    732 	if (ill->ill_isv6)
    733 		ndp = ipst->ips_ndp6;
    734 	else
    735 		ndp = ipst->ips_ndp4;
    736 
    737 	ASSERT(ill != NULL);
    738 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    739 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
    740 		return (NULL);
    741 	for (; ncec != NULL; ncec = ncec->ncec_next) {
    742 		if (ncec->ncec_ill == ill ||
    743 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
    744 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
    745 				mutex_enter(&ncec->ncec_lock);
    746 				if (!NCE_ISCONDEMNED(ncec)) {
    747 					ncec_refhold_locked(ncec);
    748 					mutex_exit(&ncec->ncec_lock);
    749 					break;
    750 				}
    751 				mutex_exit(&ncec->ncec_lock);
    752 			}
    753 		}
    754 	}
    755 	return (ncec);
    756 }
    757 
    758 /*
    759  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
    760  * entries for ill only, i.e., when ill is part of an ipmp group,
    761  * nce_lookup_v4 will never try to match across the group.
    762  */
    763 nce_t *
    764 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
    765 {
    766 	nce_t *nce;
    767 	in6_addr_t addr6;
    768 	ip_stack_t *ipst = ill->ill_ipst;
    769 
    770 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
    771 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
    772 	nce = nce_lookup_addr(ill, &addr6);
    773 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
    774 	return (nce);
    775 }
    776 
    777 /*
    778  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
    779  * entries for ill only, i.e., when ill is part of an ipmp group,
    780  * nce_lookup_v6 will never try to match across the group.
    781  */
    782 nce_t *
    783 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
    784 {
    785 	nce_t *nce;
    786 	ip_stack_t *ipst = ill->ill_ipst;
    787 
    788 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    789 	nce = nce_lookup_addr(ill, addr6);
    790 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    791 	return (nce);
    792 }
    793 
    794 static nce_t *
    795 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
    796 {
    797 	nce_t *nce;
    798 
    799 	ASSERT(ill != NULL);
    800 #ifdef DEBUG
    801 	if (ill->ill_isv6)
    802 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
    803 	else
    804 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
    805 #endif
    806 	mutex_enter(&ill->ill_lock);
    807 	nce = nce_lookup(ill, addr);
    808 	mutex_exit(&ill->ill_lock);
    809 	return (nce);
    810 }
    811 
    812 
    813 /*
    814  * Router turned to host.  We need to make sure that cached copies of the ncec
    815  * are not used for forwarding packets if they were derived from the default
    816  * route, and that the default route itself is removed, as  required by
    817  * section 7.2.5 of RFC 2461.
    818  *
    819  * Note that the ncec itself probably has valid link-layer information for the
    820  * nexthop, so that there is no reason to delete the ncec, as long as the
    821  * ISROUTER flag is turned off.
    822  */
    823 static void
    824 ncec_router_to_host(ncec_t *ncec)
    825 {
    826 	ire_t		*ire;
    827 	ip_stack_t	*ipst = ncec->ncec_ipst;
    828 
    829 	mutex_enter(&ncec->ncec_lock);
    830 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
    831 	mutex_exit(&ncec->ncec_lock);
    832 
    833 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
    834 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
    835 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
    836 	if (ire != NULL) {
    837 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
    838 		ire_delete(ire);
    839 		ire_refrele(ire);
    840 	}
    841 }
    842 
    843 /*
    844  * Process passed in parameters either from an incoming packet or via
    845  * user ioctl.
    846  */
    847 void
    848 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
    849 {
    850 	ill_t	*ill = ncec->ncec_ill;
    851 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
    852 	boolean_t ll_updated = B_FALSE;
    853 	boolean_t ll_changed;
    854 	nce_t	*nce;
    855 
    856 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
    857 	/*
    858 	 * No updates of link layer address or the neighbor state is
    859 	 * allowed, when the cache is in NONUD state.  This still
    860 	 * allows for responding to reachability solicitation.
    861 	 */
    862 	mutex_enter(&ncec->ncec_lock);
    863 	if (ncec->ncec_state == ND_INCOMPLETE) {
    864 		if (hw_addr == NULL) {
    865 			mutex_exit(&ncec->ncec_lock);
    866 			return;
    867 		}
    868 		nce_set_ll(ncec, hw_addr);
    869 		/*
    870 		 * Update ncec state and send the queued packets
    871 		 * back to ip this time ire will be added.
    872 		 */
    873 		if (flag & ND_NA_FLAG_SOLICITED) {
    874 			nce_update(ncec, ND_REACHABLE, NULL);
    875 		} else {
    876 			nce_update(ncec, ND_STALE, NULL);
    877 		}
    878 		mutex_exit(&ncec->ncec_lock);
    879 		nce = nce_fastpath(ncec, B_TRUE, NULL);
    880 		nce_resolv_ok(ncec);
    881 		if (nce != NULL)
    882 			nce_refrele(nce);
    883 		return;
    884 	}
    885 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
    886 	if (!is_adv) {
    887 		/* If this is a SOLICITATION request only */
    888 		if (ll_changed)
    889 			nce_update(ncec, ND_STALE, hw_addr);
    890 		mutex_exit(&ncec->ncec_lock);
    891 		ncec_cb_dispatch(ncec);
    892 		return;
    893 	}
    894 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
    895 		/* If in any other state than REACHABLE, ignore */
    896 		if (ncec->ncec_state == ND_REACHABLE) {
    897 			nce_update(ncec, ND_STALE, NULL);
    898 		}
    899 		mutex_exit(&ncec->ncec_lock);
    900 		ncec_cb_dispatch(ncec);
    901 		return;
    902 	} else {
    903 		if (ll_changed) {
    904 			nce_update(ncec, ND_UNCHANGED, hw_addr);
    905 			ll_updated = B_TRUE;
    906 		}
    907 		if (flag & ND_NA_FLAG_SOLICITED) {
    908 			nce_update(ncec, ND_REACHABLE, NULL);
    909 		} else {
    910 			if (ll_updated) {
    911 				nce_update(ncec, ND_STALE, NULL);
    912 			}
    913 		}
    914 		mutex_exit(&ncec->ncec_lock);
    915 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
    916 		    NCE_F_ISROUTER)) {
    917 			ncec_router_to_host(ncec);
    918 		} else {
    919 			ncec_cb_dispatch(ncec);
    920 		}
    921 	}
    922 }
    923 
    924 /*
    925  * Pass arg1 to the pfi supplied, along with each ncec in existence.
    926  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
    927  * walking the hash list.
    928  */
    929 void
    930 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    931     boolean_t trace)
    932 {
    933 	ncec_t	*ncec;
    934 	ncec_t	*ncec1;
    935 	ncec_t	**ncep;
    936 	ncec_t	*free_nce_list = NULL;
    937 
    938 	mutex_enter(&ndp->ndp_g_lock);
    939 	/* Prevent ncec_delete from unlink and free of NCE */
    940 	ndp->ndp_g_walker++;
    941 	mutex_exit(&ndp->ndp_g_lock);
    942 	for (ncep = ndp->nce_hash_tbl;
    943 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    944 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
    945 			ncec1 = ncec->ncec_next;
    946 			if (ill == NULL || ncec->ncec_ill == ill) {
    947 				if (trace) {
    948 					ncec_refhold(ncec);
    949 					(*pfi)(ncec, arg1);
    950 					ncec_refrele(ncec);
    951 				} else {
    952 					ncec_refhold_notr(ncec);
    953 					(*pfi)(ncec, arg1);
    954 					ncec_refrele_notr(ncec);
    955 				}
    956 			}
    957 		}
    958 	}
    959 	mutex_enter(&ndp->ndp_g_lock);
    960 	ndp->ndp_g_walker--;
    961 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
    962 		/* Time to delete condemned entries */
    963 		for (ncep = ndp->nce_hash_tbl;
    964 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    965 			ncec = *ncep;
    966 			if (ncec != NULL) {
    967 				nce_remove(ndp, ncec, &free_nce_list);
    968 			}
    969 		}
    970 		ndp->ndp_g_walker_cleanup = B_FALSE;
    971 	}
    972 
    973 	mutex_exit(&ndp->ndp_g_lock);
    974 
    975 	if (free_nce_list != NULL) {
    976 		nce_cleanup_list(free_nce_list);
    977 	}
    978 }
    979 
    980 /*
    981  * Walk everything.
    982  * Note that ill can be NULL hence can't derive the ipst from it.
    983  */
    984 void
    985 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
    986 {
    987 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
    988 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
    989 }
    990 
    991 /*
    992  * For each interface an entry is added for the unspecified multicast group.
    993  * Here that mapping is used to form the multicast cache entry for a particular
    994  * multicast destination.
    995  */
    996 static int
    997 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
    998     uint16_t flags, nce_t **newnce)
    999 {
   1000 	uchar_t		*hw_addr;
   1001 	int		err = 0;
   1002 	ip_stack_t	*ipst = ill->ill_ipst;
   1003 	nce_t		*nce;
   1004 
   1005 	ASSERT(ill != NULL);
   1006 	ASSERT(ill->ill_isv6);
   1007 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
   1008 
   1009 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
   1010 	nce = nce_lookup_addr(ill, dst);
   1011 	if (nce != NULL) {
   1012 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1013 		goto done;
   1014 	}
   1015 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
   1016 		/*
   1017 		 * For IRE_IF_RESOLVER a hardware mapping can be
   1018 		 * generated.
   1019 		 */
   1020 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
   1021 		if (hw_addr == NULL) {
   1022 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1023 			return (ENOMEM);
   1024 		}
   1025 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
   1026 	} else {
   1027 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
   1028 		hw_addr = NULL;
   1029 	}
   1030 	ASSERT((flags & NCE_F_MCAST) != 0);
   1031 	ASSERT((flags & NCE_F_NONUD) != 0);
   1032 	/* nce_state will be computed by nce_add_common() */
   1033 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
   1034 	    ND_UNCHANGED, &nce);
   1035 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   1036 	if (err == 0)
   1037 		err = nce_add_v6_postprocess(nce);
   1038 	if (hw_addr != NULL)
   1039 		kmem_free(hw_addr, ill->ill_nd_lla_len);
   1040 	if (err != 0) {
   1041 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
   1042 		return (err);
   1043 	}
   1044 done:
   1045 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
   1046 	if (newnce != NULL)
   1047 		*newnce = nce;
   1048 	else
   1049 		nce_refrele(nce);
   1050 	return (0);
   1051 }
   1052 
   1053 /*
   1054  * Return the link layer address, and any flags of a ncec.
   1055  */
   1056 int
   1057 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
   1058 {
   1059 	ncec_t		*ncec;
   1060 	in6_addr_t	*addr;
   1061 	sin6_t		*sin6;
   1062 
   1063 	ASSERT(ill != NULL && ill->ill_isv6);
   1064 	sin6 = (sin6_t *)&lnr->lnr_addr;
   1065 	addr =  &sin6->sin6_addr;
   1066 
   1067 	/*
   1068 	 * NOTE: if the ill is an IPMP interface, then match against the whole
   1069 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
   1070 	 * addresses for the data addresses on an IPMP interface even though
   1071 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
   1072 	 */
   1073 	ncec = ncec_lookup_illgrp_v6(ill, addr);
   1074 	if (ncec == NULL)
   1075 		return (ESRCH);
   1076 	/* If no link layer address is available yet, return ESRCH */
   1077 	if (!NCE_ISREACHABLE(ncec)) {
   1078 		ncec_refrele(ncec);
   1079 		return (ESRCH);
   1080 	}
   1081 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
   1082 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
   1083 	    lnr->lnr_hdw_len);
   1084 	if (ncec->ncec_flags & NCE_F_ISROUTER)
   1085 		lnr->lnr_flags = NDF_ISROUTER_ON;
   1086 	if (ncec->ncec_flags & NCE_F_ANYCAST)
   1087 		lnr->lnr_flags |= NDF_ANYCAST_ON;
   1088 	ncec_refrele(ncec);
   1089 	return (0);
   1090 }
   1091 
   1092 /*
   1093  * Finish setting up the Enable/Disable multicast for the driver.
   1094  */
   1095 mblk_t *
   1096 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
   1097     uint32_t hw_addr_offset, mblk_t *mp)
   1098 {
   1099 	uchar_t		*hw_addr;
   1100 	ipaddr_t	v4group;
   1101 	uchar_t		*addr;
   1102 
   1103 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
   1104 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
   1105 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
   1106 
   1107 		ASSERT(CLASSD(v4group));
   1108 		ASSERT(!(ill->ill_isv6));
   1109 
   1110 		addr = (uchar_t *)&v4group;
   1111 	} else {
   1112 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
   1113 		ASSERT(ill->ill_isv6);
   1114 
   1115 		addr = (uchar_t *)v6group;
   1116 	}
   1117 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
   1118 	if (hw_addr == NULL) {
   1119 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
   1120 		freemsg(mp);
   1121 		return (NULL);
   1122 	}
   1123 
   1124 	ip_mcast_mapping(ill, addr, hw_addr);
   1125 	return (mp);
   1126 }
   1127 
   1128 void
   1129 ip_ndp_resolve(ncec_t *ncec)
   1130 {
   1131 	in_addr_t	sender4 = INADDR_ANY;
   1132 	in6_addr_t	sender6 = ipv6_all_zeros;
   1133 	ill_t		*src_ill;
   1134 	uint32_t	ms;
   1135 
   1136 	src_ill = nce_resolve_src(ncec, &sender6);
   1137 	if (src_ill == NULL) {
   1138 		/* Make sure we try again later */
   1139 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
   1140 		nce_restart_timer(ncec, (clock_t)ms);
   1141 		return;
   1142 	}
   1143 	if (ncec->ncec_ipversion == IPV4_VERSION)
   1144 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
   1145 	mutex_enter(&ncec->ncec_lock);
   1146 	if (ncec->ncec_ipversion == IPV6_VERSION)
   1147 		ms = ndp_solicit(ncec, sender6, src_ill);
   1148 	else
   1149 		ms = arp_request(ncec, sender4, src_ill);
   1150 	mutex_exit(&ncec->ncec_lock);
   1151 	if (ms == 0) {
   1152 		if (ncec->ncec_state != ND_REACHABLE) {
   1153 			if (ncec->ncec_ipversion == IPV6_VERSION)
   1154 				ndp_resolv_failed(ncec);
   1155 			else
   1156 				arp_resolv_failed(ncec);
   1157 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
   1158 			nce_make_unreachable(ncec);
   1159 			ncec_delete(ncec);
   1160 		}
   1161 	} else {
   1162 		nce_restart_timer(ncec, (clock_t)ms);
   1163 	}
   1164 done:
   1165 	ill_refrele(src_ill);
   1166 }
   1167 
   1168 /*
   1169  * Send an IPv6 neighbor solicitation.
   1170  * Returns number of milliseconds after which we should either rexmit or abort.
   1171  * Return of zero means we should abort.
   1172  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
   1173  * The optional source address is used as a hint to ndp_solicit for
   1174  * which source to use in the packet.
   1175  *
   1176  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
   1177  * the packet.
   1178  */
   1179 uint32_t
   1180 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
   1181 {
   1182 	in6_addr_t	dst;
   1183 	boolean_t	dropped = B_FALSE;
   1184 
   1185 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
   1186 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   1187 
   1188 	if (ncec->ncec_rcnt == 0)
   1189 		return (0);
   1190 
   1191 	dst = ncec->ncec_addr;
   1192 	ncec->ncec_rcnt--;
   1193 	mutex_exit(&ncec->ncec_lock);
   1194 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
   1195 	    ill->ill_phys_addr_length, &src, &dst, 0);
   1196 	mutex_enter(&ncec->ncec_lock);
   1197 	if (dropped)
   1198 		ncec->ncec_rcnt++;
   1199 	return (ncec->ncec_ill->ill_reachable_retrans_time);
   1200 }
   1201 
   1202 /*
   1203  * Attempt to recover an address on an interface that's been marked as a
   1204  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
   1205  * no easy way to just probe the address and have the right thing happen if
   1206  * it's no longer in use.  Instead, we just bring it up normally and allow the
   1207  * regular interface start-up logic to probe for a remaining duplicate and take
   1208  * us back down if necessary.
   1209  * Neither DHCP nor temporary addresses arrive here; they're excluded by
   1210  * ip_ndp_excl.
   1211  */
   1212 /* ARGSUSED */
   1213 void
   1214 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
   1215 {
   1216 	ill_t	*ill = rq->q_ptr;
   1217 	ipif_t	*ipif;
   1218 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
   1219 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
   1220 	boolean_t addr_equal;
   1221 
   1222 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
   1223 		/*
   1224 		 * We do not support recovery of proxy ARP'd interfaces,
   1225 		 * because the system lacks a complete proxy ARP mechanism.
   1226 		 */
   1227 		if (ill->ill_isv6) {
   1228 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
   1229 			    addr6);
   1230 		} else {
   1231 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
   1232 		}
   1233 
   1234 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
   1235 			continue;
   1236 
   1237 		/*
   1238 		 * If we have already recovered or if the interface is going
   1239 		 * away, then ignore.
   1240 		 */
   1241 		mutex_enter(&ill->ill_lock);
   1242 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
   1243 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   1244 			mutex_exit(&ill->ill_lock);
   1245 			continue;
   1246 		}
   1247 
   1248 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
   1249 		ill->ill_ipif_dup_count--;
   1250 		mutex_exit(&ill->ill_lock);
   1251 		ipif->ipif_was_dup = B_TRUE;
   1252 
   1253 		if (ill->ill_isv6) {
   1254 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
   1255 			(void) ipif_up_done_v6(ipif);
   1256 		} else {
   1257 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
   1258 			    EINPROGRESS);
   1259 			(void) ipif_up_done(ipif);
   1260 		}
   1261 	}
   1262 	freeb(mp);
   1263 }
   1264 
   1265 /*
   1266  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
   1267  * As long as someone else holds the address, the interface will stay down.
   1268  * When that conflict goes away, the interface is brought back up.  This is
   1269  * done so that accidental shutdowns of addresses aren't made permanent.  Your
   1270  * server will recover from a failure.
   1271  *
   1272  * For DHCP and temporary addresses, recovery is not done in the kernel.
   1273  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
   1274  *
   1275  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
   1276  */
   1277 void
   1278 ipif_dup_recovery(void *arg)
   1279 {
   1280 	ipif_t *ipif = arg;
   1281 
   1282 	ipif->ipif_recovery_id = 0;
   1283 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
   1284 		return;
   1285 
   1286 	/*
   1287 	 * No lock, because this is just an optimization.
   1288 	 */
   1289 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
   1290 		return;
   1291 
   1292 	/* If the link is down, we'll retry this later */
   1293 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
   1294 		return;
   1295 
   1296 	ipif_do_recovery(ipif);
   1297 }
   1298 
   1299 /*
   1300  * Perform interface recovery by forcing the duplicate interfaces up and
   1301  * allowing the system to determine which ones should stay up.
   1302  *
   1303  * Called both by recovery timer expiry and link-up notification.
   1304  */
   1305 void
   1306 ipif_do_recovery(ipif_t *ipif)
   1307 {
   1308 	ill_t *ill = ipif->ipif_ill;
   1309 	mblk_t *mp;
   1310 	ip_stack_t *ipst = ill->ill_ipst;
   1311 	size_t mp_size;
   1312 
   1313 	if (ipif->ipif_isv6)
   1314 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
   1315 	else
   1316 		mp_size = sizeof (ipif->ipif_lcl_addr);
   1317 	mp = allocb(mp_size, BPRI_MED);
   1318 	if (mp == NULL) {
   1319 		mutex_enter(&ill->ill_lock);
   1320 		if (ipst->ips_ip_dup_recovery > 0 &&
   1321 		    ipif->ipif_recovery_id == 0 &&
   1322 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
   1323 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
   1324 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
   1325 		}
   1326 		mutex_exit(&ill->ill_lock);
   1327 	} else {
   1328 		/*
   1329 		 * A recovery timer may still be running if we got here from
   1330 		 * ill_restart_dad(); cancel that timer.
   1331 		 */
   1332 		if (ipif->ipif_recovery_id != 0)
   1333 			(void) untimeout(ipif->ipif_recovery_id);
   1334 		ipif->ipif_recovery_id = 0;
   1335 
   1336 		if (ipif->ipif_isv6) {
   1337 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
   1338 			    sizeof (ipif->ipif_v6lcl_addr));
   1339 		} else  {
   1340 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
   1341 			    sizeof (ipif->ipif_lcl_addr));
   1342 		}
   1343 		ill_refhold(ill);
   1344 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
   1345 		    B_FALSE);
   1346 	}
   1347 }
   1348 
   1349 /*
   1350  * Find the MAC and IP addresses in an NA/NS message.
   1351  */
   1352 static void
   1353 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
   1354     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
   1355 {
   1356 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1357 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
   1358 	uchar_t *addr;
   1359 	int alen;
   1360 
   1361 	/* icmp_inbound_v6 ensures this */
   1362 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   1363 
   1364 	addr = ira->ira_l2src;
   1365 	alen = ill->ill_phys_addr_length;
   1366 	if (alen > 0) {
   1367 		*haddr = addr;
   1368 		*haddrlenp = alen;
   1369 	} else {
   1370 		*haddr = NULL;
   1371 		*haddrlenp = 0;
   1372 	}
   1373 
   1374 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
   1375 	*targp = ns->nd_ns_target;
   1376 }
   1377 
   1378 /*
   1379  * This is for exclusive changes due to NDP duplicate address detection
   1380  * failure.
   1381  */
   1382 /* ARGSUSED */
   1383 static void
   1384 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
   1385 {
   1386 	ill_t	*ill = rq->q_ptr;
   1387 	ipif_t	*ipif;
   1388 	uchar_t	*haddr;
   1389 	uint_t	haddrlen;
   1390 	ip_stack_t *ipst = ill->ill_ipst;
   1391 	in6_addr_t targ;
   1392 	ip_recv_attr_t iras;
   1393 	mblk_t	*attrmp;
   1394 
   1395 	attrmp = mp;
   1396 	mp = mp->b_cont;
   1397 	attrmp->b_cont = NULL;
   1398 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
   1399 		/* The ill or ip_stack_t disappeared on us */
   1400 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1401 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
   1402 		freemsg(mp);
   1403 		ira_cleanup(&iras, B_TRUE);
   1404 		return;
   1405 	}
   1406 
   1407 	ASSERT(ill == iras.ira_rill);
   1408 
   1409 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
   1410 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
   1411 		/*
   1412 		 * Ignore conflicts generated by misbehaving switches that
   1413 		 * just reflect our own messages back to us.  For IPMP, we may
   1414 		 * see reflections across any ill in the illgrp.
   1415 		 *
   1416 		 * RFC2462 and revisions tried to detect both the case
   1417 		 * when a statically configured IPv6 address is a duplicate,
   1418 		 * and the case when the L2 address itself is a duplicate. The
   1419 		 * later is important because, with stateles address autoconf,
   1420 		 * if the L2 address is a duplicate, the resulting IPv6
   1421 		 * address(es) would also be duplicates. We rely on DAD of the
   1422 		 * IPv6 address itself to detect the latter case.
   1423 		 */
   1424 		/* For an under ill_grp can change under lock */
   1425 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1426 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
   1427 		    IS_UNDER_IPMP(ill) &&
   1428 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
   1429 		    haddrlen) != NULL) {
   1430 			rw_exit(&ipst->ips_ill_g_lock);
   1431 			goto ignore_conflict;
   1432 		}
   1433 		rw_exit(&ipst->ips_ill_g_lock);
   1434 	}
   1435 
   1436 	/*
   1437 	 * Look up the appropriate ipif.
   1438 	 */
   1439 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
   1440 	if (ipif == NULL)
   1441 		goto ignore_conflict;
   1442 
   1443 	/* Reload the ill to match the ipif */
   1444 	ill = ipif->ipif_ill;
   1445 
   1446 	/* If it's already duplicate or ineligible, then don't do anything. */
   1447 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
   1448 		ipif_refrele(ipif);
   1449 		goto ignore_conflict;
   1450 	}
   1451 
   1452 	/*
   1453 	 * If this is a failure during duplicate recovery, then don't
   1454 	 * complain.  It may take a long time to recover.
   1455 	 */
   1456 	if (!ipif->ipif_was_dup) {
   1457 		char ibuf[LIFNAMSIZ];
   1458 		char hbuf[MAC_STR_LEN];
   1459 		char sbuf[INET6_ADDRSTRLEN];
   1460 
   1461 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
   1462 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
   1463 		    " disabled", ibuf,
   1464 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
   1465 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
   1466 	}
   1467 	mutex_enter(&ill->ill_lock);
   1468 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
   1469 	ipif->ipif_flags |= IPIF_DUPLICATE;
   1470 	ill->ill_ipif_dup_count++;
   1471 	mutex_exit(&ill->ill_lock);
   1472 	(void) ipif_down(ipif, NULL, NULL);
   1473 	(void) ipif_down_tail(ipif);
   1474 	mutex_enter(&ill->ill_lock);
   1475 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
   1476 	    ill->ill_net_type == IRE_IF_RESOLVER &&
   1477 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
   1478 	    ipst->ips_ip_dup_recovery > 0) {
   1479 		ASSERT(ipif->ipif_recovery_id == 0);
   1480 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
   1481 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
   1482 	}
   1483 	mutex_exit(&ill->ill_lock);
   1484 	ipif_refrele(ipif);
   1485 
   1486 ignore_conflict:
   1487 	freemsg(mp);
   1488 	ira_cleanup(&iras, B_TRUE);
   1489 }
   1490 
   1491 /*
   1492  * Handle failure by tearing down the ipifs with the specified address.  Note
   1493  * that tearing down the ipif also means deleting the ncec through ipif_down, so
   1494  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
   1495  * we start a timer on the ipif.
   1496  * Caller has to free mp;
   1497  */
   1498 static void
   1499 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
   1500 {
   1501 	const uchar_t	*haddr;
   1502 	ill_t		*ill = ira->ira_rill;
   1503 
   1504 	/*
   1505 	 * Ignore conflicts generated by misbehaving switches that just
   1506 	 * reflect our own messages back to us.
   1507 	 */
   1508 
   1509 	/* icmp_inbound_v6 ensures this */
   1510 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
   1511 	haddr = ira->ira_l2src;
   1512 	if (haddr != NULL &&
   1513 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
   1514 		return;
   1515 	}
   1516 
   1517 	if ((mp = copymsg(mp)) != NULL) {
   1518 		mblk_t	*attrmp;
   1519 
   1520 		attrmp = ip_recv_attr_to_mblk(ira);
   1521 		if (attrmp == NULL) {
   1522 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1523 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
   1524 			freemsg(mp);
   1525 		} else {
   1526 			ASSERT(attrmp->b_cont == NULL);
   1527 			attrmp->b_cont = mp;
   1528 			mp = attrmp;
   1529 			ill_refhold(ill);
   1530 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
   1531 			    B_FALSE);
   1532 		}
   1533 	}
   1534 }
   1535 
   1536 /*
   1537  * Handle a discovered conflict: some other system is advertising that it owns
   1538  * one of our IP addresses.  We need to defend ourselves, or just shut down the
   1539  * interface.
   1540  *
   1541  * Handles both IPv4 and IPv6
   1542  */
   1543 boolean_t
   1544 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
   1545 {
   1546 	ipif_t		*ipif;
   1547 	clock_t		now;
   1548 	uint_t		maxdefense;
   1549 	uint_t		defs;
   1550 	ill_t		*ill = ira->ira_ill;
   1551 	ip_stack_t	*ipst = ill->ill_ipst;
   1552 	uint32_t	elapsed;
   1553 	boolean_t	isv6 = ill->ill_isv6;
   1554 	ipaddr_t	ncec_addr;
   1555 
   1556 	if (isv6) {
   1557 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
   1558 		    ipst);
   1559 	} else {
   1560 		if (arp_no_defense) {
   1561 			/*
   1562 			 * Yes, there is a conflict, but no, we do not
   1563 			 * defend ourself.
   1564 			 */
   1565 			return (B_TRUE);
   1566 		}
   1567 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
   1568 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
   1569 		    ipst);
   1570 	}
   1571 	if (ipif == NULL)
   1572 		return (B_FALSE);
   1573 
   1574 	/*
   1575 	 * First, figure out if this address is disposable.
   1576 	 */
   1577 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
   1578 		maxdefense = ipst->ips_ip_max_temp_defend;
   1579 	else
   1580 		maxdefense = ipst->ips_ip_max_defend;
   1581 
   1582 	/*
   1583 	 * Now figure out how many times we've defended ourselves.  Ignore
   1584 	 * defenses that happened long in the past.
   1585 	 */
   1586 	now = ddi_get_lbolt();
   1587 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
   1588 	mutex_enter(&ncec->ncec_lock);
   1589 	if ((defs = ncec->ncec_defense_count) > 0 &&
   1590 	    elapsed > ipst->ips_ip_defend_interval) {
   1591 		/*
   1592 		 * ip_defend_interval has elapsed.
   1593 		 * reset the defense count.
   1594 		 */
   1595 		ncec->ncec_defense_count = defs = 0;
   1596 	}
   1597 	ncec->ncec_defense_count++;
   1598 	ncec->ncec_last_time_defended = now;
   1599 	mutex_exit(&ncec->ncec_lock);
   1600 	ipif_refrele(ipif);
   1601 
   1602 	/*
   1603 	 * If we've defended ourselves too many times already, then give up and
   1604 	 * tear down the interface(s) using this address.
   1605 	 * Otherwise, caller has to defend by sending out an announce.
   1606 	 */
   1607 	if (defs >= maxdefense) {
   1608 		if (isv6)
   1609 			ndp_failure(mp, ira);
   1610 		else
   1611 			arp_failure(mp, ira);
   1612 	} else {
   1613 		return (B_TRUE); /* caller must defend this address */
   1614 	}
   1615 	return (B_FALSE);
   1616 }
   1617 
   1618 /*
   1619  * Handle reception of Neighbor Solicitation messages.
   1620  */
   1621 static void
   1622 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
   1623 {
   1624 	ill_t		*ill = ira->ira_ill, *under_ill;
   1625 	nd_neighbor_solicit_t *ns;
   1626 	uint32_t	hlen = ill->ill_phys_addr_length;
   1627 	uchar_t		*haddr = NULL;
   1628 	icmp6_t		*icmp_nd;
   1629 	ip6_t		*ip6h;
   1630 	ncec_t		*our_ncec = NULL;
   1631 	in6_addr_t	target;
   1632 	in6_addr_t	src;
   1633 	int		len;
   1634 	int		flag = 0;
   1635 	nd_opt_hdr_t	*opt = NULL;
   1636 	boolean_t	bad_solicit = B_FALSE;
   1637 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   1638 	boolean_t	need_ill_refrele = B_FALSE;
   1639 
   1640 	ip6h = (ip6_t *)mp->b_rptr;
   1641 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1642 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   1643 	src = ip6h->ip6_src;
   1644 	ns = (nd_neighbor_solicit_t *)icmp_nd;
   1645 	target = ns->nd_ns_target;
   1646 	if (IN6_IS_ADDR_MULTICAST(&target)) {
   1647 		if (ip_debug > 2) {
   1648 			/* ip1dbg */
   1649 			pr_addr_dbg("ndp_input_solicit: Target is"
   1650 			    " multicast! %s\n", AF_INET6, &target);
   1651 		}
   1652 		bad_solicit = B_TRUE;
   1653 		goto done;
   1654 	}
   1655 	if (len > sizeof (nd_neighbor_solicit_t)) {
   1656 		/* Options present */
   1657 		opt = (nd_opt_hdr_t *)&ns[1];
   1658 		len -= sizeof (nd_neighbor_solicit_t);
   1659 		if (!ndp_verify_optlen(opt, len)) {
   1660 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
   1661 			bad_solicit = B_TRUE;
   1662 			goto done;
   1663 		}
   1664 	}
   1665 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
   1666 		/* Check to see if this is a valid DAD solicitation */
   1667 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
   1668 			if (ip_debug > 2) {
   1669 				/* ip1dbg */
   1670 				pr_addr_dbg("ndp_input_solicit: IPv6 "
   1671 				    "Destination is not solicited node "
   1672 				    "multicast %s\n", AF_INET6,
   1673 				    &ip6h->ip6_dst);
   1674 			}
   1675 			bad_solicit = B_TRUE;
   1676 			goto done;
   1677 		}
   1678 	}
   1679 
   1680 	/*
   1681 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
   1682 	 * received this packet if it's multicast) is not the ill tied to
   1683 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
   1684 	 * to ensure we find the associated NCE.
   1685 	 */
   1686 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
   1687 	/*
   1688 	 * If this is a valid Solicitation for an address we are publishing,
   1689 	 * then a PUBLISH entry should exist in the cache
   1690 	 */
   1691 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
   1692 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
   1693 		    "ifname=%s ", ill->ill_name));
   1694 		if (ip_debug > 2) {
   1695 			/* ip1dbg */
   1696 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
   1697 		}
   1698 		if (our_ncec == NULL)
   1699 			bad_solicit = B_TRUE;
   1700 		goto done;
   1701 	}
   1702 
   1703 	/* At this point we should have a verified NS per spec */
   1704 	if (opt != NULL) {
   1705 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
   1706 		if (opt != NULL) {
   1707 			haddr = (uchar_t *)&opt[1];
   1708 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
   1709 			    hlen == 0) {
   1710 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
   1711 				bad_solicit = B_TRUE;
   1712 				goto done;
   1713 			}
   1714 		}
   1715 	}
   1716 
   1717 	/* If sending directly to peer, set the unicast flag */
   1718 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
   1719 		flag |= NDP_UNICAST;
   1720 
   1721 	/*
   1722 	 * Create/update the entry for the soliciting node on the ipmp_ill.
   1723 	 * or respond to outstanding queries, don't if
   1724 	 * the source is unspecified address.
   1725 	 */
   1726 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
   1727 		int	err;
   1728 		nce_t	*nnce;
   1729 
   1730 		ASSERT(ill->ill_isv6);
   1731 		/*
   1732 		 * Regular solicitations *must* include the Source Link-Layer
   1733 		 * Address option.  Ignore messages that do not.
   1734 		 */
   1735 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
   1736 			ip1dbg(("ndp_input_solicit: source link-layer address "
   1737 			    "option missing with a specified source.\n"));
   1738 			bad_solicit = B_TRUE;
   1739 			goto done;
   1740 		}
   1741 
   1742 		/*
   1743 		 * This is a regular solicitation.  If we're still in the
   1744 		 * process of verifying the address, then don't respond at all
   1745 		 * and don't keep track of the sender.
   1746 		 */
   1747 		if (our_ncec->ncec_state == ND_PROBE)
   1748 			goto done;
   1749 
   1750 		/*
   1751 		 * If the solicitation doesn't have sender hardware address
   1752 		 * (legal for unicast solicitation), then process without
   1753 		 * installing the return NCE.  Either we already know it, or
   1754 		 * we'll be forced to look it up when (and if) we reply to the
   1755 		 * packet.
   1756 		 */
   1757 		if (haddr == NULL)
   1758 			goto no_source;
   1759 
   1760 		under_ill = ill;
   1761 		if (IS_UNDER_IPMP(under_ill)) {
   1762 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
   1763 			if (ill == NULL)
   1764 				ill = under_ill;
   1765 			else
   1766 				need_ill_refrele = B_TRUE;
   1767 		}
   1768 		err = nce_lookup_then_add_v6(ill,
   1769 		    haddr, hlen,
   1770 		    &src,	/* Soliciting nodes address */
   1771 		    0,
   1772 		    ND_STALE,
   1773 		    &nnce);
   1774 
   1775 		if (need_ill_refrele) {
   1776 			ill_refrele(ill);
   1777 			ill = under_ill;
   1778 			need_ill_refrele =  B_FALSE;
   1779 		}
   1780 		switch (err) {
   1781 		case 0:
   1782 			/* done with this entry */
   1783 			nce_refrele(nnce);
   1784 			break;
   1785 		case EEXIST:
   1786 			/*
   1787 			 * B_FALSE indicates this is not an an advertisement.
   1788 			 */
   1789 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
   1790 			nce_refrele(nnce);
   1791 			break;
   1792 		default:
   1793 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
   1794 			    err));
   1795 			goto done;
   1796 		}
   1797 no_source:
   1798 		flag |= NDP_SOLICITED;
   1799 	} else {
   1800 		/*
   1801 		 * No source link layer address option should be present in a
   1802 		 * valid DAD request.
   1803 		 */
   1804 		if (haddr != NULL) {
   1805 			ip1dbg(("ndp_input_solicit: source link-layer address "
   1806 			    "option present with an unspecified source.\n"));
   1807 			bad_solicit = B_TRUE;
   1808 			goto done;
   1809 		}
   1810 		if (our_ncec->ncec_state == ND_PROBE) {
   1811 			/*
   1812 			 * Internally looped-back probes will have
   1813 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
   1814 			 * transmissions.
   1815 			 */
   1816 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
   1817 				/*
   1818 				 * If someone else is probing our address, then
   1819 				 * we've crossed wires.  Declare failure.
   1820 				 */
   1821 				ndp_failure(mp, ira);
   1822 			}
   1823 			goto done;
   1824 		}
   1825 		/*
   1826 		 * This is a DAD probe.  Multicast the advertisement to the
   1827 		 * all-nodes address.
   1828 		 */
   1829 		src = ipv6_all_hosts_mcast;
   1830 	}
   1831 	flag |= nce_advert_flags(our_ncec);
   1832 	(void) ndp_xmit(ill,
   1833 	    ND_NEIGHBOR_ADVERT,
   1834 	    our_ncec->ncec_lladdr,
   1835 	    our_ncec->ncec_lladdr_length,
   1836 	    &target,	/* Source and target of the advertisement pkt */
   1837 	    &src,	/* IP Destination (source of original pkt) */
   1838 	    flag);
   1839 done:
   1840 	if (bad_solicit)
   1841 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
   1842 	if (our_ncec != NULL)
   1843 		ncec_refrele(our_ncec);
   1844 }
   1845 
   1846 /*
   1847  * Handle reception of Neighbor Solicitation messages
   1848  */
   1849 void
   1850 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
   1851 {
   1852 	ill_t		*ill = ira->ira_ill;
   1853 	nd_neighbor_advert_t *na;
   1854 	uint32_t	hlen = ill->ill_phys_addr_length;
   1855 	uchar_t		*haddr = NULL;
   1856 	icmp6_t		*icmp_nd;
   1857 	ip6_t		*ip6h;
   1858 	ncec_t		*dst_ncec = NULL;
   1859 	in6_addr_t	target;
   1860 	nd_opt_hdr_t	*opt = NULL;
   1861 	int		len;
   1862 	ip_stack_t	*ipst = ill->ill_ipst;
   1863 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   1864 
   1865 	ip6h = (ip6_t *)mp->b_rptr;
   1866 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   1867 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   1868 	na = (nd_neighbor_advert_t *)icmp_nd;
   1869 
   1870 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
   1871 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
   1872 		ip1dbg(("ndp_input_advert: Target is multicast but the "
   1873 		    "solicited flag is not zero\n"));
   1874 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1875 		return;
   1876 	}
   1877 	target = na->nd_na_target;
   1878 	if (IN6_IS_ADDR_MULTICAST(&target)) {
   1879 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
   1880 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1881 		return;
   1882 	}
   1883 	if (len > sizeof (nd_neighbor_advert_t)) {
   1884 		opt = (nd_opt_hdr_t *)&na[1];
   1885 		if (!ndp_verify_optlen(opt,
   1886 		    len - sizeof (nd_neighbor_advert_t))) {
   1887 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
   1888 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
   1889 			return;
   1890 		}
   1891 		/* At this point we have a verified NA per spec */
   1892 		len -= sizeof (nd_neighbor_advert_t);
   1893 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
   1894 		if (opt != NULL) {
   1895 			haddr = (uchar_t *)&opt[1];
   1896 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
   1897 			    hlen == 0) {
   1898 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
   1899 				BUMP_MIB(mib,
   1900 				    ipv6IfIcmpInBadNeighborAdvertisements);
   1901 				return;
   1902 			}
   1903 		}
   1904 	}
   1905 
   1906 	/*
   1907 	 * NOTE: we match across the illgrp since we need to do DAD for all of
   1908 	 * our local addresses, and those are spread across all the active
   1909 	 * ills in the group.
   1910 	 */
   1911 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
   1912 		return;
   1913 
   1914 	if (NCE_PUBLISH(dst_ncec)) {
   1915 		/*
   1916 		 * Someone just advertised an addresses that we publish. First,
   1917 		 * check it it was us -- if so, we can safely ignore it.
   1918 		 * We don't get the haddr from the ira_l2src because, in the
   1919 		 * case that the packet originated from us, on an IPMP group,
   1920 		 * the ira_l2src may would be the link-layer address of the
   1921 		 * cast_ill used to send the packet, which may not be the same
   1922 		 * as the dst_ncec->ncec_lladdr of the address.
   1923 		 */
   1924 		if (haddr != NULL) {
   1925 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
   1926 				goto out;
   1927 
   1928 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
   1929 				goto out;   /* from us -- no conflict */
   1930 
   1931 			/*
   1932 			 * If we're in an IPMP group, check if this is an echo
   1933 			 * from another ill in the group.  Use the double-
   1934 			 * checked locking pattern to avoid grabbing
   1935 			 * ill_g_lock in the non-IPMP case.
   1936 			 */
   1937 			if (IS_UNDER_IPMP(ill)) {
   1938 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1939 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
   1940 				    ill->ill_grp, haddr, hlen) != NULL) {
   1941 					rw_exit(&ipst->ips_ill_g_lock);
   1942 					goto out;
   1943 				}
   1944 				rw_exit(&ipst->ips_ill_g_lock);
   1945 			}
   1946 		}
   1947 
   1948 		/*
   1949 		 * This appears to be a real conflict.  If we're trying to
   1950 		 * configure this NCE (ND_PROBE), then shut it down.
   1951 		 * Otherwise, handle the discovered conflict.
   1952 		 */
   1953 		if (dst_ncec->ncec_state == ND_PROBE) {
   1954 			ndp_failure(mp, ira);
   1955 		} else {
   1956 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
   1957 				char hbuf[MAC_STR_LEN];
   1958 				char sbuf[INET6_ADDRSTRLEN];
   1959 
   1960 				cmn_err(CE_WARN,
   1961 				    "node '%s' is using %s on %s",
   1962 				    inet_ntop(AF_INET6, &target, sbuf,
   1963 				    sizeof (sbuf)),
   1964 				    haddr == NULL ? "<none>" :
   1965 				    mac_colon_addr(haddr, hlen, hbuf,
   1966 				    sizeof (hbuf)), ill->ill_name);
   1967 				/*
   1968 				 * RFC 4862, Section 5.4.4 does not mandate
   1969 				 * any specific behavior when an NA matches
   1970 				 * a non-tentative address assigned to the
   1971 				 * receiver. We make the choice of defending
   1972 				 * our address, based on the assumption that
   1973 				 * the sender has not detected the Duplicate.
   1974 				 *
   1975 				 * ncec_last_time_defended has been adjusted
   1976 				 * in ip_nce_conflict()
   1977 				 */
   1978 				(void) ndp_announce(dst_ncec);
   1979 			}
   1980 		}
   1981 	} else {
   1982 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
   1983 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
   1984 
   1985 		/* B_TRUE indicates this an advertisement */
   1986 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
   1987 	}
   1988 out:
   1989 	ncec_refrele(dst_ncec);
   1990 }
   1991 
   1992 /*
   1993  * Process NDP neighbor solicitation/advertisement messages.
   1994  * The checksum has already checked o.k before reaching here.
   1995  * Information about the datalink header is contained in ira_l2src, but
   1996  * that should be ignored for loopback packets.
   1997  */
   1998 void
   1999 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
   2000 {
   2001 	ill_t		*ill = ira->ira_rill;
   2002 	icmp6_t		*icmp_nd;
   2003 	ip6_t		*ip6h;
   2004 	int		len;
   2005 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
   2006 	ill_t		*orig_ill = NULL;
   2007 
   2008 	/*
   2009 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
   2010 	 * and make it be the IPMP upper so avoid being confused by a packet
   2011 	 * addressed to a unicast address on a different ill.
   2012 	 */
   2013 	if (IS_UNDER_IPMP(ill)) {
   2014 		orig_ill = ill;
   2015 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
   2016 		if (ill == NULL) {
   2017 			ill = orig_ill;
   2018 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2019 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
   2020 			    mp, ill);
   2021 			freemsg(mp);
   2022 			return;
   2023 		}
   2024 		ASSERT(ill != orig_ill);
   2025 		orig_ill = ira->ira_ill;
   2026 		ira->ira_ill = ill;
   2027 		mib = ill->ill_icmp6_mib;
   2028 	}
   2029 	if (!pullupmsg(mp, -1)) {
   2030 		ip1dbg(("ndp_input: pullupmsg failed\n"));
   2031 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2032 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
   2033 		goto done;
   2034 	}
   2035 	ip6h = (ip6_t *)mp->b_rptr;
   2036 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
   2037 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
   2038 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
   2039 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
   2040 		goto done;
   2041 	}
   2042 	/*
   2043 	 * NDP does not accept any extension headers between the
   2044 	 * IP header and the ICMP header since e.g. a routing
   2045 	 * header could be dangerous.
   2046 	 * This assumes that any AH or ESP headers are removed
   2047 	 * by ip prior to passing the packet to ndp_input.
   2048 	 */
   2049 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
   2050 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
   2051 		    ip6h->ip6_nxt));
   2052 		ip_drop_input("Wrong next header", mp, ill);
   2053 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2054 		goto done;
   2055 	}
   2056 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
   2057 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
   2058 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
   2059 	if (icmp_nd->icmp6_code != 0) {
   2060 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
   2061 		ip_drop_input("code non-zero", mp, ill);
   2062 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2063 		goto done;
   2064 	}
   2065 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
   2066 	/*
   2067 	 * Make sure packet length is large enough for either
   2068 	 * a NS or a NA icmp packet.
   2069 	 */
   2070 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
   2071 		ip1dbg(("ndp_input: packet too short\n"));
   2072 		ip_drop_input("packet too short", mp, ill);
   2073 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
   2074 		goto done;
   2075 	}
   2076 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
   2077 		ndp_input_solicit(mp, ira);
   2078 	} else {
   2079 		ndp_input_advert(mp, ira);
   2080 	}
   2081 done:
   2082 	freemsg(mp);
   2083 	if (orig_ill != NULL) {
   2084 		ill_refrele(ill);
   2085 		ira->ira_ill = orig_ill;
   2086 	}
   2087 }
   2088 
   2089 /*
   2090  * ndp_xmit is called to form and transmit a ND solicitation or
   2091  * advertisement ICMP packet.
   2092  *
   2093  * If the source address is unspecified and this isn't a probe (used for
   2094  * duplicate address detection), an appropriate source address and link layer
   2095  * address will be chosen here.  The link layer address option is included if
   2096  * the source is specified (i.e., all non-probe packets), and omitted (per the
   2097  * specification) otherwise.
   2098  *
   2099  * It returns B_FALSE only if it does a successful put() to the
   2100  * corresponding ill's ill_wq otherwise returns B_TRUE.
   2101  */
   2102 static boolean_t
   2103 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
   2104     const in6_addr_t *sender, const in6_addr_t *target, int flag)
   2105 {
   2106 	uint32_t	len;
   2107 	icmp6_t 	*icmp6;
   2108 	mblk_t		*mp;
   2109 	ip6_t		*ip6h;
   2110 	nd_opt_hdr_t	*opt;
   2111 	uint_t		plen;
   2112 	zoneid_t	zoneid = GLOBAL_ZONEID;
   2113 	ill_t		*hwaddr_ill = ill;
   2114 	ip_xmit_attr_t	ixas;
   2115 	ip_stack_t	*ipst = ill->ill_ipst;
   2116 	boolean_t	need_refrele = B_FALSE;
   2117 	boolean_t	probe = B_FALSE;
   2118 
   2119 	if (IS_UNDER_IPMP(ill)) {
   2120 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
   2121 		/*
   2122 		 * We send non-probe packets on the upper IPMP interface.
   2123 		 * ip_output_simple() will use cast_ill for sending any
   2124 		 * multicast packets. Note that we can't follow the same
   2125 		 * logic for probe packets because all interfaces in the ipmp
   2126 		 * group may have failed, so that we really want to only try
   2127 		 * to send the ND packet on the ill corresponding to the src
   2128 		 * address.
   2129 		 */
   2130 		if (!probe) {
   2131 			ill = ipmp_ill_hold_ipmp_ill(ill);
   2132 			if (ill != NULL)
   2133 				need_refrele = B_TRUE;
   2134 			else
   2135 				ill = hwaddr_ill;
   2136 		}
   2137 	}
   2138 
   2139 	/*
   2140 	 * If we have a unspecified source(sender) address, select a
   2141 	 * proper source address for the solicitation here itself so
   2142 	 * that we can initialize the h/w address correctly.
   2143 	 *
   2144 	 * If the sender is specified then we use this address in order
   2145 	 * to lookup the zoneid before calling ip_output_v6(). This is to
   2146 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
   2147 	 * by IP (we cannot guarantee that the global zone has an interface
   2148 	 * route to the destination).
   2149 	 *
   2150 	 * Note that the NA never comes here with the unspecified source
   2151 	 * address.
   2152 	 */
   2153 
   2154 	/*
   2155 	 * Probes will have unspec src at this point.
   2156 	 */
   2157 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
   2158 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
   2159 		/*
   2160 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
   2161 		 * ALL_ZONES if it cannot find a matching ipif for the address
   2162 		 * we are trying to use. In this case we err on the side of
   2163 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
   2164 		 */
   2165 		if (zoneid == ALL_ZONES)
   2166 			zoneid = GLOBAL_ZONEID;
   2167 	}
   2168 
   2169 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
   2170 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
   2171 	mp = allocb(len,  BPRI_LO);
   2172 	if (mp == NULL) {
   2173 		if (need_refrele)
   2174 			ill_refrele(ill);
   2175 		return (B_TRUE);
   2176 	}
   2177 
   2178 	bzero((char *)mp->b_rptr, len);
   2179 	mp->b_wptr = mp->b_rptr + len;
   2180 
   2181 	bzero(&ixas, sizeof (ixas));
   2182 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
   2183 
   2184 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
   2185 	ixas.ixa_ipst = ipst;
   2186 	ixas.ixa_cred = kcred;
   2187 	ixas.ixa_cpid = NOPID;
   2188 	ixas.ixa_tsl = NULL;
   2189 	ixas.ixa_zoneid = zoneid;
   2190 
   2191 	ip6h = (ip6_t *)mp->b_rptr;
   2192 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
   2193 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
   2194 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
   2195 	ip6h->ip6_hops = IPV6_MAX_HOPS;
   2196 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
   2197 	ip6h->ip6_dst = *target;
   2198 	icmp6 = (icmp6_t *)&ip6h[1];
   2199 
   2200 	if (hw_addr_len != 0) {
   2201 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
   2202 		    sizeof (nd_neighbor_advert_t));
   2203 	} else {
   2204 		opt = NULL;
   2205 	}
   2206 	if (operation == ND_NEIGHBOR_SOLICIT) {
   2207 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
   2208 
   2209 		if (opt != NULL && !(flag & NDP_PROBE)) {
   2210 			/*
   2211 			 * Note that we don't send out SLLA for ND probes
   2212 			 * per RFC 4862, even though we do send out the src
   2213 			 * haddr for IPv4 DAD probes, even though both IPv4
   2214 			 * and IPv6 go out with the unspecified/INADDR_ANY
   2215 			 * src IP addr.
   2216 			 */
   2217 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
   2218 		}
   2219 		ip6h->ip6_src = *sender;
   2220 		ns->nd_ns_target = *target;
   2221 		if (!(flag & NDP_UNICAST)) {
   2222 			/* Form multicast address of the target */
   2223 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
   2224 			ip6h->ip6_dst.s6_addr32[3] |=
   2225 			    ns->nd_ns_target.s6_addr32[3];
   2226 		}
   2227 	} else {
   2228 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
   2229 
   2230 		ASSERT(!(flag & NDP_PROBE));
   2231 		if (opt != NULL)
   2232 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
   2233 		ip6h->ip6_src = *sender;
   2234 		na->nd_na_target = *sender;
   2235 		if (flag & NDP_ISROUTER)
   2236 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
   2237 		if (flag & NDP_SOLICITED)
   2238 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
   2239 		if (flag & NDP_ORIDE)
   2240 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
   2241 	}
   2242 
   2243 	if (!(flag & NDP_PROBE)) {
   2244 		if (hw_addr != NULL && opt != NULL) {
   2245 			/* Fill in link layer address and option len */
   2246 			opt->nd_opt_len = (uint8_t)plen;
   2247 			bcopy(hw_addr, &opt[1], hw_addr_len);
   2248 		}
   2249 	}
   2250 	if (opt != NULL && opt->nd_opt_type == 0) {
   2251 		/* If there's no link layer address option, then strip it. */
   2252 		len -= plen * 8;
   2253 		mp->b_wptr = mp->b_rptr + len;
   2254 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
   2255 	}
   2256 
   2257 	icmp6->icmp6_type = (uint8_t)operation;
   2258 	icmp6->icmp6_code = 0;
   2259 	/*
   2260 	 * Prepare for checksum by putting icmp length in the icmp
   2261 	 * checksum field. The checksum is calculated in ip_output.c.
   2262 	 */
   2263 	icmp6->icmp6_cksum = ip6h->ip6_plen;
   2264 
   2265 	(void) ip_output_simple(mp, &ixas);
   2266 	ixa_cleanup(&ixas);
   2267 	if (need_refrele)
   2268 		ill_refrele(ill);
   2269 	return (B_FALSE);
   2270 }
   2271 
   2272 /*
   2273  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
   2274  * The datapath uses this as an indication that there
   2275  * is a problem (as opposed to a NCE that was just
   2276  * reclaimed due to lack of memory.
   2277  * Note that static ARP entries never become unreachable.
   2278  */
   2279 void
   2280 nce_make_unreachable(ncec_t *ncec)
   2281 {
   2282 	mutex_enter(&ncec->ncec_lock);
   2283 	ncec->ncec_state = ND_UNREACHABLE;
   2284 	mutex_exit(&ncec->ncec_lock);
   2285 }
   2286 
   2287 /*
   2288  * NCE retransmit timer. Common to IPv4 and IPv6.
   2289  * This timer goes off when:
   2290  * a. It is time to retransmit a resolution for resolver.
   2291  * b. It is time to send reachability probes.
   2292  */
   2293 void
   2294 nce_timer(void *arg)
   2295 {
   2296 	ncec_t		*ncec = arg;
   2297 	ill_t		*ill = ncec->ncec_ill, *src_ill;
   2298 	char		addrbuf[INET6_ADDRSTRLEN];
   2299 	boolean_t	dropped = B_FALSE;
   2300 	ip_stack_t	*ipst = ncec->ncec_ipst;
   2301 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   2302 	in_addr_t	sender4 = INADDR_ANY;
   2303 	in6_addr_t	sender6 = ipv6_all_zeros;
   2304 
   2305 	/*
   2306 	 * The timer has to be cancelled by ncec_delete before doing the final
   2307 	 * refrele. So the NCE is guaranteed to exist when the timer runs
   2308 	 * until it clears the timeout_id. Before clearing the timeout_id
   2309 	 * bump up the refcnt so that we can continue to use the ncec
   2310 	 */
   2311 	ASSERT(ncec != NULL);
   2312 	mutex_enter(&ncec->ncec_lock);
   2313 	ncec_refhold_locked(ncec);
   2314 	ncec->ncec_timeout_id = 0;
   2315 	mutex_exit(&ncec->ncec_lock);
   2316 
   2317 	src_ill = nce_resolve_src(ncec, &sender6);
   2318 	/* if we could not find a sender address, return */
   2319 	if (src_ill == NULL) {
   2320 		if (!isv6) {
   2321 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
   2322 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
   2323 			    &sender4, addrbuf, sizeof (addrbuf))));
   2324 		} else {
   2325 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
   2326 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2327 		}
   2328 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
   2329 		ncec_refrele(ncec);
   2330 		return;
   2331 	}
   2332 	if (!isv6)
   2333 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
   2334 
   2335 	mutex_enter(&ncec->ncec_lock);
   2336 	/*
   2337 	 * Check the reachability state.
   2338 	 */
   2339 	switch (ncec->ncec_state) {
   2340 	case ND_DELAY:
   2341 		ASSERT(ncec->ncec_lladdr != NULL);
   2342 		ncec->ncec_state = ND_PROBE;
   2343 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   2344 		if (isv6) {
   2345 			mutex_exit(&ncec->ncec_lock);
   2346 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
   2347 			    src_ill->ill_phys_addr,
   2348 			    src_ill->ill_phys_addr_length,
   2349 			    &sender6, &ncec->ncec_addr,
   2350 			    NDP_UNICAST);
   2351 		} else {
   2352 			dropped = arp_request(ncec, sender4, src_ill);
   2353 			mutex_exit(&ncec->ncec_lock);
   2354 		}
   2355 		if (!dropped) {
   2356 			mutex_enter(&ncec->ncec_lock);
   2357 			ncec->ncec_pcnt--;
   2358 			mutex_exit(&ncec->ncec_lock);
   2359 		}
   2360 		if (ip_debug > 3) {
   2361 			/* ip2dbg */
   2362 			pr_addr_dbg("nce_timer: state for %s changed "
   2363 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
   2364 		}
   2365 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
   2366 		break;
   2367 	case ND_PROBE:
   2368 		/* must be retransmit timer */
   2369 		ASSERT(ncec->ncec_pcnt >= -1);
   2370 		if (ncec->ncec_pcnt > 0) {
   2371 			/*
   2372 			 * As per RFC2461, the ncec gets deleted after
   2373 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
   2374 			 * Note that the first unicast solicitation is sent
   2375 			 * during the DELAY state.
   2376 			 */
   2377 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
   2378 			    ncec->ncec_pcnt,
   2379 			    inet_ntop((isv6? AF_INET6 : AF_INET),
   2380 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2381 			if (NCE_PUBLISH(ncec)) {
   2382 				mutex_exit(&ncec->ncec_lock);
   2383 				/*
   2384 				 * send out a probe; note that src_ill
   2385 				 * is ignored by nce_dad() for all
   2386 				 * DAD message types other than IPv6
   2387 				 * unicast probes
   2388 				 */
   2389 				nce_dad(ncec, src_ill, B_TRUE);
   2390 			} else {
   2391 				ASSERT(src_ill != NULL);
   2392 				if (isv6) {
   2393 					mutex_exit(&ncec->ncec_lock);
   2394 					dropped = ndp_xmit(src_ill,
   2395 					    ND_NEIGHBOR_SOLICIT,
   2396 					    src_ill->ill_phys_addr,
   2397 					    src_ill->ill_phys_addr_length,
   2398 					    &sender6, &ncec->ncec_addr,
   2399 					    NDP_UNICAST);
   2400 				} else {
   2401 					/*
   2402 					 * since the nce is REACHABLE,
   2403 					 * the ARP request will be sent out
   2404 					 * as a link-layer unicast.
   2405 					 */
   2406 					dropped = arp_request(ncec, sender4,
   2407 					    src_ill);
   2408 					mutex_exit(&ncec->ncec_lock);
   2409 				}
   2410 				if (!dropped) {
   2411 					mutex_enter(&ncec->ncec_lock);
   2412 					ncec->ncec_pcnt--;
   2413 					mutex_exit(&ncec->ncec_lock);
   2414 				}
   2415 				nce_restart_timer(ncec,
   2416 				    ill->ill_reachable_retrans_time);
   2417 			}
   2418 		} else if (ncec->ncec_pcnt < 0) {
   2419 			/* No hope, delete the ncec */
   2420 			/* Tell datapath it went bad */
   2421 			ncec->ncec_state = ND_UNREACHABLE;
   2422 			mutex_exit(&ncec->ncec_lock);
   2423 			if (ip_debug > 2) {
   2424 				/* ip1dbg */
   2425 				pr_addr_dbg("nce_timer: Delete NCE for"
   2426 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
   2427 				    &ncec->ncec_addr);
   2428 			}
   2429 			/* if static ARP can't delete. */
   2430 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
   2431 				ncec_delete(ncec);
   2432 
   2433 		} else if (!NCE_PUBLISH(ncec)) {
   2434 			/*
   2435 			 * Probe count is 0 for a dynamic entry (one that we
   2436 			 * ourselves are not publishing). We should never get
   2437 			 * here if NONUD was requested, hence the ASSERT below.
   2438 			 */
   2439 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
   2440 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
   2441 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
   2442 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
   2443 			ncec->ncec_pcnt--;
   2444 			mutex_exit(&ncec->ncec_lock);
   2445 			/* Wait one interval before killing */
   2446 			nce_restart_timer(ncec,
   2447 			    ill->ill_reachable_retrans_time);
   2448 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
   2449 			ipif_t *ipif;
   2450 			ipaddr_t ncec_addr;
   2451 
   2452 			/*
   2453 			 * We're done probing, and we can now declare this
   2454 			 * address to be usable.  Let IP know that it's ok to
   2455 			 * use.
   2456 			 */
   2457 			ncec->ncec_state = ND_REACHABLE;
   2458 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
   2459 			mutex_exit(&ncec->ncec_lock);
   2460 			if (isv6) {
   2461 				ipif = ipif_lookup_addr_exact_v6(
   2462 				    &ncec->ncec_addr, ill, ipst);
   2463 			} else {
   2464 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
   2465 				    ncec_addr);
   2466 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
   2467 				    ipst);
   2468 			}
   2469 			if (ipif != NULL) {
   2470 				if (ipif->ipif_was_dup) {
   2471 					char ibuf[LIFNAMSIZ + 10];
   2472 					char sbuf[INET6_ADDRSTRLEN];
   2473 
   2474 					ipif->ipif_was_dup = B_FALSE;
   2475 					(void) inet_ntop(AF_INET6,
   2476 					    &ipif->ipif_v6lcl_addr,
   2477 					    sbuf, sizeof (sbuf));
   2478 					ipif_get_name(ipif, ibuf,
   2479 					    sizeof (ibuf));
   2480 					cmn_err(CE_NOTE, "recovered address "
   2481 					    "%s on %s", sbuf, ibuf);
   2482 				}
   2483 				if ((ipif->ipif_flags & IPIF_UP) &&
   2484 				    !ipif->ipif_addr_ready)
   2485 					ipif_up_notify(ipif);
   2486 				ipif->ipif_addr_ready = 1;
   2487 				ipif_refrele(ipif);
   2488 			}
   2489 			if (!isv6 && arp_no_defense)
   2490 				break;
   2491 			/* Begin defending our new address */
   2492 			if (ncec->ncec_unsolicit_count > 0) {
   2493 				ncec->ncec_unsolicit_count--;
   2494 				if (isv6) {
   2495 					dropped = ndp_announce(ncec);
   2496 				} else {
   2497 					dropped = arp_announce(ncec);
   2498 				}
   2499 
   2500 				if (dropped)
   2501 					ncec->ncec_unsolicit_count++;
   2502 				else
   2503 					ncec->ncec_last_time_defended =
   2504 					    ddi_get_lbolt();
   2505 			}
   2506 			if (ncec->ncec_unsolicit_count > 0) {
   2507 				nce_restart_timer(ncec,
   2508 				    ANNOUNCE_INTERVAL(isv6));
   2509 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
   2510 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
   2511 			}
   2512 		} else {
   2513 			/*
   2514 			 * This is an address we're probing to be our own, but
   2515 			 * the ill is down.  Wait until it comes back before
   2516 			 * doing anything, but switch to reachable state so
   2517 			 * that the restart will work.
   2518 			 */
   2519 			ncec->ncec_state = ND_REACHABLE;
   2520 			mutex_exit(&ncec->ncec_lock);
   2521 		}
   2522 		break;
   2523 	case ND_INCOMPLETE: {
   2524 		mblk_t	*mp, *nextmp;
   2525 		mblk_t	**prevmpp;
   2526 
   2527 		/*
   2528 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
   2529 		 * for any IPMP probe packets, and toss them.  IPMP probe
   2530 		 * packets will always be at the head of ncec_qd_mp, so that
   2531 		 * we can stop at the first queued ND packet that is
   2532 		 * not a probe packet.
   2533 		 */
   2534 		prevmpp = &ncec->ncec_qd_mp;
   2535 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
   2536 			nextmp = mp->b_next;
   2537 
   2538 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
   2539 				inet_freemsg(mp);
   2540 				ncec->ncec_nprobes--;
   2541 				*prevmpp = nextmp;
   2542 			} else {
   2543 				prevmpp = &mp->b_next;
   2544 			}
   2545 		}
   2546 
   2547 		/*
   2548 		 * Must be resolver's retransmit timer.
   2549 		 */
   2550 		mutex_exit(&ncec->ncec_lock);
   2551 		ip_ndp_resolve(ncec);
   2552 		break;
   2553 	}
   2554 	case ND_REACHABLE:
   2555 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
   2556 		    ncec->ncec_unsolicit_count != 0) ||
   2557 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
   2558 			if (ncec->ncec_unsolicit_count > 0) {
   2559 				ncec->ncec_unsolicit_count--;
   2560 				mutex_exit(&ncec->ncec_lock);
   2561 				/*
   2562 				 * When we get to zero announcements left,
   2563 				 * switch to address defense
   2564 				 */
   2565 			} else {
   2566 				boolean_t rate_limit;
   2567 
   2568 				mutex_exit(&ncec->ncec_lock);
   2569 				rate_limit = ill_defend_rate_limit(ill, ncec);
   2570 				if (rate_limit) {
   2571 					nce_restart_timer(ncec,
   2572 					    DEFENSE_INTERVAL(isv6));
   2573 					break;
   2574 				}
   2575 			}
   2576 			if (isv6) {
   2577 				dropped = ndp_announce(ncec);
   2578 			} else {
   2579 				dropped = arp_announce(ncec);
   2580 			}
   2581 			mutex_enter(&ncec->ncec_lock);
   2582 			if (dropped) {
   2583 				ncec->ncec_unsolicit_count++;
   2584 			} else {
   2585 				ncec->ncec_last_time_defended =
   2586 				    ddi_get_lbolt();
   2587 			}
   2588 			mutex_exit(&ncec->ncec_lock);
   2589 			if (ncec->ncec_unsolicit_count != 0) {
   2590 				nce_restart_timer(ncec,
   2591 				    ANNOUNCE_INTERVAL(isv6));
   2592 			} else {
   2593 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
   2594 			}
   2595 		} else {
   2596 			mutex_exit(&ncec->ncec_lock);
   2597 		}
   2598 		break;
   2599 	default:
   2600 		mutex_exit(&ncec->ncec_lock);
   2601 		break;
   2602 	}
   2603 done:
   2604 	ncec_refrele(ncec);
   2605 	ill_refrele(src_ill);
   2606 }
   2607 
   2608 /*
   2609  * Set a link layer address from the ll_addr passed in.
   2610  * Copy SAP from ill.
   2611  */
   2612 static void
   2613 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
   2614 {
   2615 	ill_t	*ill = ncec->ncec_ill;
   2616 
   2617 	ASSERT(ll_addr != NULL);
   2618 	if (ill->ill_phys_addr_length > 0) {
   2619 		/*
   2620 		 * The bcopy() below used to be called for the physical address
   2621 		 * length rather than the link layer address length. For
   2622 		 * ethernet and many other media, the phys_addr and lla are
   2623 		 * identical.
   2624 		 *
   2625 		 * The phys_addr and lla may not be the same for devices that
   2626 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
   2627 		 * no known instances of these.
   2628 		 *
   2629 		 * For PPP or other interfaces with a zero length
   2630 		 * physical address, don't do anything here.
   2631 		 * The bcopy() with a zero phys_addr length was previously
   2632 		 * a no-op for interfaces with a zero-length physical address.
   2633 		 * Using the lla for them would change the way they operate.
   2634 		 * Doing nothing in such cases preserves expected behavior.
   2635 		 */
   2636 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
   2637 	}
   2638 }
   2639 
   2640 boolean_t
   2641 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
   2642     uint32_t ll_addr_len)
   2643 {
   2644 	ASSERT(ncec->ncec_lladdr != NULL);
   2645 	if (ll_addr == NULL)
   2646 		return (B_FALSE);
   2647 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
   2648 		return (B_TRUE);
   2649 	return (B_FALSE);
   2650 }
   2651 
   2652 /*
   2653  * Updates the link layer address or the reachability state of
   2654  * a cache entry.  Reset probe counter if needed.
   2655  */
   2656 void
   2657 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
   2658 {
   2659 	ill_t	*ill = ncec->ncec_ill;
   2660 	boolean_t need_stop_timer = B_FALSE;
   2661 	boolean_t need_fastpath_update = B_FALSE;
   2662 	nce_t	*nce = NULL;
   2663 	timeout_id_t tid;
   2664 
   2665 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2666 	/*
   2667 	 * If this interface does not do NUD, there is no point
   2668 	 * in allowing an update to the cache entry.  Although
   2669 	 * we will respond to NS.
   2670 	 * The only time we accept an update for a resolver when
   2671 	 * NUD is turned off is when it has just been created.
   2672 	 * Non-Resolvers will always be created as REACHABLE.
   2673 	 */
   2674 	if (new_state != ND_UNCHANGED) {
   2675 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
   2676 		    (ncec->ncec_state != ND_INCOMPLETE))
   2677 			return;
   2678 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
   2679 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
   2680 		need_stop_timer = B_TRUE;
   2681 		if (new_state == ND_REACHABLE)
   2682 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
   2683 		else {
   2684 			/* We force NUD in this case */
   2685 			ncec->ncec_last = 0;
   2686 		}
   2687 		ncec->ncec_state = new_state;
   2688 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   2689 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
   2690 		    new_state == ND_INCOMPLETE);
   2691 	}
   2692 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
   2693 		tid = ncec->ncec_timeout_id;
   2694 		ncec->ncec_timeout_id = 0;
   2695 	}
   2696 	/*
   2697 	 * Re-trigger fastpath probe and
   2698 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
   2699 	 * whatever packets that happens to be transmitting at the time.
   2700 	 */
   2701 	if (new_ll_addr != NULL) {
   2702 		bcopy(new_ll_addr, ncec->ncec_lladdr,
   2703 		    ill->ill_phys_addr_length);
   2704 		need_fastpath_update = B_TRUE;
   2705 	}
   2706 	mutex_exit(&ncec->ncec_lock);
   2707 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
   2708 		if (tid != 0)
   2709 			(void) untimeout(tid);
   2710 	}
   2711 	if (need_fastpath_update) {
   2712 		/*
   2713 		 * Delete any existing existing dlur_mp and fp_mp information.
   2714 		 * For IPMP interfaces, all underlying ill's must be checked
   2715 		 * and purged.
   2716 		 */
   2717 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
   2718 		/*
   2719 		 * add the new dlur_mp and fp_mp
   2720 		 */
   2721 		nce = nce_fastpath(ncec, B_TRUE, NULL);
   2722 		if (nce != NULL)
   2723 			nce_refrele(nce);
   2724 	}
   2725 	mutex_enter(&ncec->ncec_lock);
   2726 }
   2727 
   2728 static void
   2729 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
   2730 {
   2731 	uint_t	count = 0;
   2732 	mblk_t  **mpp, *tmp;
   2733 
   2734 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2735 
   2736 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
   2737 		if (++count > ncec->ncec_ill->ill_max_buf) {
   2738 			tmp = ncec->ncec_qd_mp->b_next;
   2739 			ncec->ncec_qd_mp->b_next = NULL;
   2740 			/*
   2741 			 * if we never create data addrs on the under_ill
   2742 			 * does this matter?
   2743 			 */
   2744 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
   2745 			    ipIfStatsOutDiscards);
   2746 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
   2747 			    ncec->ncec_ill);
   2748 			freemsg(ncec->ncec_qd_mp);
   2749 			ncec->ncec_qd_mp = tmp;
   2750 		}
   2751 	}
   2752 
   2753 	if (head_insert) {
   2754 		ncec->ncec_nprobes++;
   2755 		mp->b_next = ncec->ncec_qd_mp;
   2756 		ncec->ncec_qd_mp = mp;
   2757 	} else {
   2758 		*mpp = mp;
   2759 	}
   2760 }
   2761 
   2762 /*
   2763  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
   2764  * queued at the head or tail of the queue based on the input argument
   2765  * 'head_insert'. The caller should specify this argument as B_TRUE if this
   2766  * packet is an IPMP probe packet, in which case the following happens:
   2767  *
   2768  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
   2769  *	(non-ipmp_probe) load-speading case where the source address of the ND
   2770  *	packet is not tied to ncec_ill. If the ill bound to the source address
   2771  *	cannot receive, the response to the ND packet will not be received.
   2772  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
   2773  *	packet, those probes will also fail to be sent, and thus in.mpathd will
   2774  *	 erroneously conclude that ncec_ill has also failed.
   2775  *
   2776  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
   2777  *	the first attempt.  This ensures that ND problems do not manifest as
   2778  *	probe RTT spikes.
   2779  *
   2780  * We achieve this by inserting ipmp_probe() packets at the head of the
   2781  * nce_queue.
   2782  *
   2783  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
   2784  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
   2785  */
   2786 void
   2787 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
   2788 {
   2789 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   2790 	nce_queue_mp_common(ncec, mp, head_insert);
   2791 }
   2792 
   2793 /*
   2794  * Called when address resolution failed due to a timeout.
   2795  * Send an ICMP unreachable in response to all queued packets.
   2796  */
   2797 void
   2798 ndp_resolv_failed(ncec_t *ncec)
   2799 {
   2800 	mblk_t	*mp, *nxt_mp;
   2801 	char	buf[INET6_ADDRSTRLEN];
   2802 	ill_t *ill = ncec->ncec_ill;
   2803 	ip_recv_attr_t	iras;
   2804 
   2805 	bzero(&iras, sizeof (iras));
   2806 	iras.ira_flags = 0;
   2807 	/*
   2808 	 * we are setting the ira_rill to the ipmp_ill (instead of
   2809 	 * the actual ill on which the packet was received), but this
   2810 	 * is ok because we don't actually need the real ira_rill.
   2811 	 * to send the icmp unreachable to the sender.
   2812 	 */
   2813 	iras.ira_ill = iras.ira_rill = ill;
   2814 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   2815 	iras.ira_rifindex = iras.ira_ruifindex;
   2816 
   2817 	ip1dbg(("ndp_resolv_failed: dst %s\n",
   2818 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
   2819 	mutex_enter(&ncec->ncec_lock);
   2820 	mp = ncec->ncec_qd_mp;
   2821 	ncec->ncec_qd_mp = NULL;
   2822 	ncec->ncec_nprobes = 0;
   2823 	mutex_exit(&ncec->ncec_lock);
   2824 	while (mp != NULL) {
   2825 		nxt_mp = mp->b_next;
   2826 		mp->b_next = NULL;
   2827 
   2828 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   2829 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
   2830 		    mp, ill);
   2831 		icmp_unreachable_v6(mp,
   2832 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
   2833 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2834 		mp = nxt_mp;
   2835 	}
   2836 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
   2837 }
   2838 
   2839 /*
   2840  * Handle the completion of NDP and ARP resolution.
   2841  */
   2842 void
   2843 nce_resolv_ok(ncec_t *ncec)
   2844 {
   2845 	mblk_t *mp;
   2846 	uint_t pkt_len;
   2847 	iaflags_t ixaflags = IXAF_NO_TRACE;
   2848 	nce_t *nce;
   2849 	ill_t	*ill = ncec->ncec_ill;
   2850 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   2851 	ip_stack_t *ipst = ill->ill_ipst;
   2852 
   2853 	if (IS_IPMP(ncec->ncec_ill)) {
   2854 		nce_resolv_ipmp_ok(ncec);
   2855 		return;
   2856 	}
   2857 	/* non IPMP case */
   2858 
   2859 	mutex_enter(&ncec->ncec_lock);
   2860 	ASSERT(ncec->ncec_nprobes == 0);
   2861 	mp = ncec->ncec_qd_mp;
   2862 	ncec->ncec_qd_mp = NULL;
   2863 	mutex_exit(&ncec->ncec_lock);
   2864 
   2865 	while (mp != NULL) {
   2866 		mblk_t *nxt_mp;
   2867 
   2868 		if (ill->ill_isv6) {
   2869 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
   2870 
   2871 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   2872 		} else {
   2873 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
   2874 
   2875 			ixaflags |= IXAF_IS_IPV4;
   2876 			pkt_len = ntohs(ipha->ipha_length);
   2877 		}
   2878 		nxt_mp = mp->b_next;
   2879 		mp->b_next = NULL;
   2880 		/*
   2881 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
   2882 		 * longer available, but it's ok to drop this flag because TCP
   2883 		 * has its own flow-control in effect, so TCP packets
   2884 		 * are not likely to get here when flow-control is in effect.
   2885 		 */
   2886 		mutex_enter(&ill->ill_lock);
   2887 		nce = nce_lookup(ill, &ncec->ncec_addr);
   2888 		mutex_exit(&ill->ill_lock);
   2889 
   2890 		if (nce == NULL) {
   2891 			if (isv6) {
   2892 				BUMP_MIB(&ipst->ips_ip6_mib,
   2893 				    ipIfStatsOutDiscards);
   2894 			} else {
   2895 				BUMP_MIB(&ipst->ips_ip_mib,
   2896 				    ipIfStatsOutDiscards);
   2897 			}
   2898 			ip_drop_output("ipIfStatsOutDiscards - no nce",
   2899 			    mp, NULL);
   2900 			freemsg(mp);
   2901 		} else {
   2902 			/*
   2903 			 * We don't know the zoneid, but
   2904 			 * ip_xmit does not care since IXAF_NO_TRACE
   2905 			 * is set. (We traced the packet the first
   2906 			 * time through ip_xmit.)
   2907 			 */
   2908 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
   2909 			    ALL_ZONES, 0, NULL);
   2910 			nce_refrele(nce);
   2911 		}
   2912 		mp = nxt_mp;
   2913 	}
   2914 
   2915 	ncec_cb_dispatch(ncec); /* complete callbacks */
   2916 }
   2917 
   2918 /*
   2919  * Called by SIOCSNDP* ioctl to add/change an ncec entry
   2920  * and the corresponding attributes.
   2921  * Disallow states other than ND_REACHABLE or ND_STALE.
   2922  */
   2923 int
   2924 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
   2925 {
   2926 	sin6_t		*sin6;
   2927 	in6_addr_t	*addr;
   2928 	ncec_t		*ncec;
   2929 	nce_t		*nce;
   2930 	int		err = 0;
   2931 	uint16_t	new_flags = 0;
   2932 	uint16_t	old_flags = 0;
   2933 	int		inflags = lnr->lnr_flags;
   2934 	ip_stack_t	*ipst = ill->ill_ipst;
   2935 	boolean_t	do_postprocess = B_FALSE;
   2936 
   2937 	ASSERT(ill->ill_isv6);
   2938 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
   2939 	    (lnr->lnr_state_create != ND_STALE))
   2940 		return (EINVAL);
   2941 
   2942 	sin6 = (sin6_t *)&lnr->lnr_addr;
   2943 	addr = &sin6->sin6_addr;
   2944 
   2945 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
   2946 	ASSERT(!IS_UNDER_IPMP(ill));
   2947 	nce = nce_lookup_addr(ill, addr);
   2948 	if (nce != NULL)
   2949 		new_flags = nce->nce_common->ncec_flags;
   2950 
   2951 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
   2952 	case NDF_ISROUTER_ON:
   2953 		new_flags |= NCE_F_ISROUTER;
   2954 		break;
   2955 	case NDF_ISROUTER_OFF:
   2956 		new_flags &= ~NCE_F_ISROUTER;
   2957 		break;
   2958 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
   2959 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   2960 		if (nce != NULL)
   2961 			nce_refrele(nce);
   2962 		return (EINVAL);
   2963 	}
   2964 
   2965 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
   2966 	case NDF_ANYCAST_ON:
   2967 		new_flags |= NCE_F_ANYCAST;
   2968 		break;
   2969 	case NDF_ANYCAST_OFF:
   2970 		new_flags &= ~NCE_F_ANYCAST;
   2971 		break;
   2972 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
   2973 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   2974 		if (nce != NULL)
   2975 			nce_refrele(nce);
   2976 		return (EINVAL);
   2977 	}
   2978 
   2979 	if (nce == NULL) {
   2980 		err = nce_add_v6(ill,
   2981 		    (uchar_t *)lnr->lnr_hdw_addr,
   2982 		    ill->ill_phys_addr_length,
   2983 		    addr,
   2984 		    new_flags,
   2985 		    lnr->lnr_state_create,
   2986 		    &nce);
   2987 		if (err != 0) {
   2988 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   2989 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
   2990 			return (err);
   2991 		} else {
   2992 			do_postprocess = B_TRUE;
   2993 		}
   2994 	}
   2995 	ncec = nce->nce_common;
   2996 	old_flags = ncec->ncec_flags;
   2997 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
   2998 		ncec_router_to_host(ncec);
   2999 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   3000 		if (do_postprocess)
   3001 			err = nce_add_v6_postprocess(nce);
   3002 		nce_refrele(nce);
   3003 		return (0);
   3004 	}
   3005 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
   3006 
   3007 	if (do_postprocess)
   3008 		err = nce_add_v6_postprocess(nce);
   3009 	/*
   3010 	 * err cannot be anything other than 0 because we don't support
   3011 	 * proxy arp of static addresses.
   3012 	 */
   3013 	ASSERT(err == 0);
   3014 
   3015 	mutex_enter(&ncec->ncec_lock);
   3016 	ncec->ncec_flags = new_flags;
   3017 	mutex_exit(&ncec->ncec_lock);
   3018 	/*
   3019 	 * Note that we ignore the state at this point, which
   3020 	 * should be either STALE or REACHABLE.  Instead we let
   3021 	 * the link layer address passed in to determine the state
   3022 	 * much like incoming packets.
   3023 	 */
   3024 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
   3025 	nce_refrele(nce);
   3026 	return (0);
   3027 }
   3028 
   3029 /*
   3030  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
   3031  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
   3032  * be held to ensure that they are in the same group.
   3033  */
   3034 static nce_t *
   3035 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
   3036 {
   3037 
   3038 	nce_t *nce;
   3039 
   3040 	nce = nce_ill_lookup_then_add(ill, ncec);
   3041 
   3042 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
   3043 		return (nce);
   3044 
   3045 	/*
   3046 	 * hold the ncec_lock to synchronize with nce_update() so that,
   3047 	 * at the end of this function, the contents of nce_dlur_mp are
   3048 	 * consistent with ncec->ncec_lladdr, even though some intermediate
   3049 	 * packet may have been sent out with a mangled address, which would
   3050 	 * only be a transient condition.
   3051 	 */
   3052 	mutex_enter(&ncec->ncec_lock);
   3053 	if (ncec->ncec_lladdr != NULL) {
   3054 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
   3055 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
   3056 	} else {
   3057 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
   3058 		    ill->ill_sap_length);
   3059 	}
   3060 	mutex_exit(&ncec->ncec_lock);
   3061 	return (nce);
   3062 }
   3063 
   3064 /*
   3065  * we make nce_fp_mp to have an M_DATA prepend.
   3066  * The caller ensures there is hold on ncec for this function.
   3067  * Note that since ill_fastpath_probe() copies the mblk there is
   3068  * no need to hold the nce or ncec beyond this function.
   3069  *
   3070  * If the caller has passed in a non-null ncec_nce to nce_faspath() that
   3071  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
   3072  * and will be returned back by this function, so that no extra nce_refrele
   3073  * is required for the caller. The calls from nce_add_common() use this
   3074  * method. All other callers (that pass in NULL ncec_nce) will have to do a
   3075  * nce_refrele of the returned nce (when it is non-null).
   3076  */
   3077 nce_t *
   3078 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
   3079 {
   3080 	nce_t *nce;
   3081 	ill_t *ill = ncec->ncec_ill;
   3082 
   3083 	ASSERT(ill != NULL);
   3084 
   3085 	if (IS_IPMP(ill) && trigger_fp_req) {
   3086 		trigger_fp_req = B_FALSE;
   3087 		ipmp_ncec_fastpath(ncec, ill);
   3088 
   3089 	}
   3090 	/*
   3091 	 * If the caller already has the nce corresponding to the ill, use
   3092 	 * that one. Otherwise we have to lookup/add the nce. Calls from
   3093 	 * nce_add_common() fall in the former category, and have just done
   3094 	 * the nce lookup/add that can be reused.
   3095 	 */
   3096 	if (ncec_nce == NULL)
   3097 		nce = nce_fastpath_create(ill, ncec);
   3098 	else
   3099 		nce = ncec_nce;
   3100 
   3101 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
   3102 		return (nce);
   3103 
   3104 	if (trigger_fp_req)
   3105 		nce_fastpath_trigger(nce);
   3106 	return (nce);
   3107 }
   3108 
   3109 /*
   3110  * Trigger fastpath on nce. No locks may be held.
   3111  */
   3112 static void
   3113 nce_fastpath_trigger(nce_t *nce)
   3114 {
   3115 	int res;
   3116 	ill_t *ill = nce->nce_ill;
   3117 	ncec_t *ncec = nce->nce_common;
   3118 
   3119 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
   3120 	/*
   3121 	 * EAGAIN is an indication of a transient error
   3122 	 * i.e. allocation failure etc. leave the ncec in the list it
   3123 	 * will be updated when another probe happens for another ire
   3124 	 * if not it will be taken out of the list when the ire is
   3125 	 * deleted.
   3126 	 */
   3127 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
   3128 		nce_fastpath_list_delete(ill, ncec, NULL);
   3129 }
   3130 
   3131 /*
   3132  * Add ncec to the nce fastpath list on ill.
   3133  */
   3134 static nce_t *
   3135 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
   3136 {
   3137 	nce_t *nce = NULL;
   3138 
   3139 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   3140 	/*
   3141 	 * Atomically ensure that the ill is not CONDEMNED and is not going
   3142 	 * down, before adding the NCE.
   3143 	 */
   3144 	if (ill->ill_state_flags & ILL_CONDEMNED)
   3145 		return (NULL);
   3146 	mutex_enter(&ncec->ncec_lock);
   3147 	/*
   3148 	 * if ncec has not been deleted and
   3149 	 * is not already in the list add it.
   3150 	 */
   3151 	if (!NCE_ISCONDEMNED(ncec)) {
   3152 		nce = nce_lookup(ill, &ncec->ncec_addr);
   3153 		if (nce != NULL)
   3154 			goto done;
   3155 		nce = nce_add(ill, ncec);
   3156 	}
   3157 done:
   3158 	mutex_exit(&ncec->ncec_lock);
   3159 	return (nce);
   3160 }
   3161 
   3162 nce_t *
   3163 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
   3164 {
   3165 	nce_t *nce;
   3166 
   3167 	mutex_enter(&ill->ill_lock);
   3168 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
   3169 	mutex_exit(&ill->ill_lock);
   3170 	return (nce);
   3171 }
   3172 
   3173 
   3174 /*
   3175  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
   3176  * nce is added to the 'dead' list, and the caller must nce_refrele() the
   3177  * entry after all locks have been dropped.
   3178  */
   3179 void
   3180 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
   3181 {
   3182 	nce_t *nce;
   3183 
   3184 	ASSERT(ill != NULL);
   3185 
   3186 	/* first clean out any nce pointers in the under_ills */
   3187 	if (IS_IPMP(ill))
   3188 		ipmp_ncec_flush_nce(ncec);
   3189 
   3190 	/* now the ill itself */
   3191 	mutex_enter(&ill->ill_lock);
   3192 	for (nce = list_head(&ill->ill_nce); nce != NULL;
   3193 	    nce = list_next(&ill->ill_nce, nce)) {
   3194 		if (nce->nce_common == ncec) {
   3195 			nce_refhold(nce);
   3196 			nce_delete(nce);
   3197 			break;
   3198 		}
   3199 	}
   3200 	mutex_exit(&ill->ill_lock);
   3201 	if (nce != NULL) {
   3202 		if (dead == NULL)
   3203 			nce_refrele(nce);
   3204 		else
   3205 			list_insert_tail(dead, nce);
   3206 	}
   3207 }
   3208 
   3209 /*
   3210  * when the fastpath response does not fit in the datab
   3211  * associated with the existing nce_fp_mp, we delete and
   3212  * add the nce to retrigger fastpath based on the information
   3213  * in the ncec_t.
   3214  */
   3215 static nce_t *
   3216 nce_delete_then_add(nce_t *nce)
   3217 {
   3218 	ill_t		*ill = nce->nce_ill;
   3219 	nce_t		*newnce = NULL;
   3220 
   3221 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
   3222 	    (void *)nce, ill->ill_name));
   3223 	mutex_enter(&ill->ill_lock);
   3224 	mutex_enter(&nce->nce_common->ncec_lock);
   3225 	nce_delete(nce);
   3226 	/*
   3227 	 * Make sure that ncec is not condemned before adding. We hold the
   3228 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
   3229 	 * ipmp_ncec_flush_nce()
   3230 	 */
   3231 	if (!NCE_ISCONDEMNED(nce->nce_common))
   3232 		newnce = nce_add(ill, nce->nce_common);
   3233 	mutex_exit(&nce->nce_common->ncec_lock);
   3234 	mutex_exit(&ill->ill_lock);
   3235 	nce_refrele(nce);
   3236 	return (newnce); /* could be null if nomem */
   3237 }
   3238 
   3239 typedef struct nce_fp_match_s {
   3240 	nce_t	*nce_fp_match_res;
   3241 	mblk_t	*nce_fp_match_ack_mp;
   3242 } nce_fp_match_t;
   3243 
   3244 /* ARGSUSED */
   3245 static int
   3246 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
   3247 {
   3248 	nce_fp_match_t	*nce_fp_marg = arg;
   3249 	ncec_t		*ncec = nce->nce_common;
   3250 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
   3251 	uchar_t	*mp_rptr, *ud_mp_rptr;
   3252 	mblk_t		*ud_mp = nce->nce_dlur_mp;
   3253 	ptrdiff_t	cmplen;
   3254 
   3255 	/*
   3256 	 * mp is the mp associated with the fastpath ack.
   3257 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
   3258 	 * under consideration. If the contents match, then the
   3259 	 * fastpath ack is used to update the nce.
   3260 	 */
   3261 	if (ud_mp == NULL)
   3262 		return (0);
   3263 	mp_rptr = mp->b_rptr;
   3264 	cmplen = mp->b_wptr - mp_rptr;
   3265 	ASSERT(cmplen >= 0);
   3266 
   3267 	ud_mp_rptr = ud_mp->b_rptr;
   3268 	/*
   3269 	 * The ncec is locked here to prevent any other threads from accessing
   3270 	 * and changing nce_dlur_mp when the address becomes resolved to an
   3271 	 * lla while we're in the middle of looking at and comparing the
   3272 	 * hardware address (lla). It is also locked to prevent multiple
   3273 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
   3274 	 * time.
   3275 	 */
   3276 	mutex_enter(&ncec->ncec_lock);
   3277 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
   3278 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
   3279 		nce_fp_marg->nce_fp_match_res = nce;
   3280 		mutex_exit(&ncec->ncec_lock);
   3281 		nce_refhold(nce);
   3282 		return (1);
   3283 	}
   3284 	mutex_exit(&ncec->ncec_lock);
   3285 	return (0);
   3286 }
   3287 
   3288 /*
   3289  * Update all NCE's that are not in fastpath mode and
   3290  * have an nce_fp_mp that matches mp. mp->b_cont contains
   3291  * the fastpath header.
   3292  *
   3293  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
   3294  */
   3295 void
   3296 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
   3297 {
   3298 	nce_fp_match_t nce_fp_marg;
   3299 	nce_t *nce;
   3300 	mblk_t *nce_fp_mp, *fp_mp;
   3301 
   3302 	nce_fp_marg.nce_fp_match_res = NULL;
   3303 	nce_fp_marg.nce_fp_match_ack_mp = mp;
   3304 
   3305 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
   3306 
   3307 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
   3308 		return;
   3309 
   3310 	mutex_enter(&nce->nce_lock);
   3311 	nce_fp_mp = nce->nce_fp_mp;
   3312 
   3313 	if (nce_fp_mp != NULL) {
   3314 		fp_mp = mp->b_cont;
   3315 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
   3316 		    nce_fp_mp->b_datap->db_lim) {
   3317 			mutex_exit(&nce->nce_lock);
   3318 			nce = nce_delete_then_add(nce);
   3319 			if (nce == NULL) {
   3320 				return;
   3321 			}
   3322 			mutex_enter(&nce->nce_lock);
   3323 			nce_fp_mp = nce->nce_fp_mp;
   3324 		}
   3325 	}
   3326 
   3327 	/* Matched - install mp as the fastpath mp */
   3328 	if (nce_fp_mp == NULL) {
   3329 		fp_mp = dupb(mp->b_cont);
   3330 		nce->nce_fp_mp = fp_mp;
   3331 	} else {
   3332 		fp_mp = mp->b_cont;
   3333 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
   3334 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
   3335 		    + MBLKL(fp_mp);
   3336 	}
   3337 	mutex_exit(&nce->nce_lock);
   3338 	nce_refrele(nce);
   3339 }
   3340 
   3341 /*
   3342  * Return a pointer to a given option in the packet.
   3343  * Assumes that option part of the packet have already been validated.
   3344  */
   3345 nd_opt_hdr_t *
   3346 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
   3347 {
   3348 	while (optlen > 0) {
   3349 		if (opt->nd_opt_type == opt_type)
   3350 			return (opt);
   3351 		optlen -= 8 * opt->nd_opt_len;
   3352 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
   3353 	}
   3354 	return (NULL);
   3355 }
   3356 
   3357 /*
   3358  * Verify all option lengths present are > 0, also check to see
   3359  * if the option lengths and packet length are consistent.
   3360  */
   3361 boolean_t
   3362 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
   3363 {
   3364 	ASSERT(opt != NULL);
   3365 	while (optlen > 0) {
   3366 		if (opt->nd_opt_len == 0)
   3367 			return (B_FALSE);
   3368 		optlen -= 8 * opt->nd_opt_len;
   3369 		if (optlen < 0)
   3370 			return (B_FALSE);
   3371 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
   3372 	}
   3373 	return (B_TRUE);
   3374 }
   3375 
   3376 /*
   3377  * ncec_walk function.
   3378  * Free a fraction of the NCE cache entries.
   3379  *
   3380  * A possible optimization here would be to use ncec_last where possible, and
   3381  * delete the least-frequently used entry, which would require more complex
   3382  * computation as we walk through the ncec's (e.g., track ncec entries by
   3383  * order of ncec_last and/or maintain state)
   3384  */
   3385 static void
   3386 ncec_cache_reclaim(ncec_t *ncec, char *arg)
   3387 {
   3388 	ip_stack_t	*ipst = ncec->ncec_ipst;
   3389 	uint_t		fraction = *(uint_t *)arg;
   3390 	uint_t		rand;
   3391 
   3392 	if ((ncec->ncec_flags &
   3393 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
   3394 		return;
   3395 	}
   3396 
   3397 	rand = (uint_t)ddi_get_lbolt() +
   3398 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
   3399 	if ((rand/fraction)*fraction == rand) {
   3400 		IP_STAT(ipst, ip_nce_reclaim_deleted);
   3401 		ncec_delete(ncec);
   3402 	}
   3403 }
   3404 
   3405 /*
   3406  * kmem_cache callback to free up memory.
   3407  *
   3408  * For now we just delete a fixed fraction.
   3409  */
   3410 static void
   3411 ip_nce_reclaim_stack(ip_stack_t *ipst)
   3412 {
   3413 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
   3414 
   3415 	IP_STAT(ipst, ip_nce_reclaim_calls);
   3416 
   3417 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
   3418 
   3419 	/*
   3420 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
   3421 	 * Get them to update any stale references to drop any refholds they
   3422 	 * have.
   3423 	 */
   3424 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
   3425 }
   3426 
   3427 /*
   3428  * Called by the memory allocator subsystem directly, when the system
   3429  * is running low on memory.
   3430  */
   3431 /* ARGSUSED */
   3432 void
   3433 ip_nce_reclaim(void *args)
   3434 {
   3435 	netstack_handle_t nh;
   3436 	netstack_t *ns;
   3437 
   3438 	netstack_next_init(&nh);
   3439 	while ((ns = netstack_next(&nh)) != NULL) {
   3440 		ip_nce_reclaim_stack(ns->netstack_ip);
   3441 		netstack_rele(ns);
   3442 	}
   3443 	netstack_next_fini(&nh);
   3444 }
   3445 
   3446 #ifdef DEBUG
   3447 void
   3448 ncec_trace_ref(ncec_t *ncec)
   3449 {
   3450 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3451 
   3452 	if (ncec->ncec_trace_disable)
   3453 		return;
   3454 
   3455 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
   3456 		ncec->ncec_trace_disable = B_TRUE;
   3457 		ncec_trace_cleanup(ncec);
   3458 	}
   3459 }
   3460 
   3461 void
   3462 ncec_untrace_ref(ncec_t *ncec)
   3463 {
   3464 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3465 
   3466 	if (!ncec->ncec_trace_disable)
   3467 		th_trace_unref(ncec);
   3468 }
   3469 
   3470 static void
   3471 ncec_trace_cleanup(const ncec_t *ncec)
   3472 {
   3473 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
   3474 }
   3475 #endif
   3476 
   3477 /*
   3478  * Called when address resolution fails due to a timeout.
   3479  * Send an ICMP unreachable in response to all queued packets.
   3480  */
   3481 void
   3482 arp_resolv_failed(ncec_t *ncec)
   3483 {
   3484 	mblk_t	*mp, *nxt_mp;
   3485 	char	buf[INET6_ADDRSTRLEN];
   3486 	struct in_addr ipv4addr;
   3487 	ill_t *ill = ncec->ncec_ill;
   3488 	ip_stack_t *ipst = ncec->ncec_ipst;
   3489 	ip_recv_attr_t	iras;
   3490 
   3491 	bzero(&iras, sizeof (iras));
   3492 	iras.ira_flags = IRAF_IS_IPV4;
   3493 	/*
   3494 	 * we are setting the ira_rill to the ipmp_ill (instead of
   3495 	 * the actual ill on which the packet was received), but this
   3496 	 * is ok because we don't actually need the real ira_rill.
   3497 	 * to send the icmp unreachable to the sender.
   3498 	 */
   3499 	iras.ira_ill = iras.ira_rill = ill;
   3500 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   3501 	iras.ira_rifindex = iras.ira_ruifindex;
   3502 
   3503 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
   3504 	ip3dbg(("arp_resolv_failed: dst %s\n",
   3505 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
   3506 	mutex_enter(&ncec->ncec_lock);
   3507 	mp = ncec->ncec_qd_mp;
   3508 	ncec->ncec_qd_mp = NULL;
   3509 	ncec->ncec_nprobes = 0;
   3510 	mutex_exit(&ncec->ncec_lock);
   3511 	while (mp != NULL) {
   3512 		nxt_mp = mp->b_next;
   3513 		mp->b_next = NULL;
   3514 
   3515 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   3516 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
   3517 		    mp, ill);
   3518 		if (ipst->ips_ip_arp_icmp_error) {
   3519 			ip3dbg(("arp_resolv_failed: "
   3520 			    "Calling icmp_unreachable\n"));
   3521 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
   3522 		} else {
   3523 			freemsg(mp);
   3524 		}
   3525 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   3526 		mp = nxt_mp;
   3527 	}
   3528 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
   3529 }
   3530 
   3531 /*
   3532  * if ill is an under_ill, translate it to the ipmp_ill and add the
   3533  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
   3534  * one on the underlying in_ill) will be created for the
   3535  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
   3536  */
   3537 int
   3538 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   3539     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
   3540 {
   3541 	int	err;
   3542 	in6_addr_t addr6;
   3543 	ip_stack_t *ipst = ill->ill_ipst;
   3544 	nce_t	*nce, *upper_nce = NULL;
   3545 	ill_t	*in_ill = ill, *under = NULL;
   3546 	boolean_t need_ill_refrele = B_FALSE;
   3547 
   3548 	if (flags & NCE_F_MCAST) {
   3549 		/*
   3550 		 * hw_addr will be figured out in nce_set_multicast_v4;
   3551 		 * caller needs to pass in the cast_ill for ipmp
   3552 		 */
   3553 		ASSERT(hw_addr == NULL);
   3554 		ASSERT(!IS_IPMP(ill));
   3555 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
   3556 		return (err);
   3557 	}
   3558 
   3559 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
   3560 		ill = ipmp_ill_hold_ipmp_ill(ill);
   3561 		if (ill == NULL)
   3562 			return (ENXIO);
   3563 		need_ill_refrele = B_TRUE;
   3564 	}
   3565 	if ((flags & NCE_F_BCAST) != 0) {
   3566 		/*
   3567 		 * IPv4 broadcast ncec: compute the hwaddr.
   3568 		 */
   3569 		if (IS_IPMP(ill)) {
   3570 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
   3571 			if (under == NULL)  {
   3572 				if (need_ill_refrele)
   3573 					ill_refrele(ill);
   3574 				return (ENETDOWN);
   3575 			}
   3576 			hw_addr = under->ill_bcast_mp->b_rptr +
   3577 			    NCE_LL_ADDR_OFFSET(under);
   3578 			hw_addr_len = under->ill_phys_addr_length;
   3579 		} else {
   3580 			hw_addr = ill->ill_bcast_mp->b_rptr +
   3581 			    NCE_LL_ADDR_OFFSET(ill),
   3582 			    hw_addr_len = ill->ill_phys_addr_length;
   3583 		}
   3584 	}
   3585 
   3586 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
   3587 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
   3588 	nce = nce_lookup_addr(ill, &addr6);
   3589 	if (nce == NULL) {
   3590 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
   3591 		    state, &nce);
   3592 	} else {
   3593 		err = EEXIST;
   3594 	}
   3595 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3596 	if (err == 0)
   3597 		err = nce_add_v4_postprocess(nce);
   3598 
   3599 	if (in_ill != ill && nce != NULL) {
   3600 		nce_t *under_nce;
   3601 
   3602 		/*
   3603 		 * in_ill was the under_ill. Try to create the under_nce.
   3604 		 * Hold the ill_g_lock to prevent changes to group membership
   3605 		 * until we are done.
   3606 		 */
   3607 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   3608 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
   3609 			under_nce = nce_fastpath_create(in_ill,
   3610 			    nce->nce_common);
   3611 			upper_nce = nce;
   3612 			if ((nce = under_nce) == NULL)
   3613 				err = EINVAL;
   3614 		}
   3615 		rw_exit(&ipst->ips_ill_g_lock);
   3616 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
   3617 			nce_fastpath_trigger(under_nce);
   3618 	}
   3619 	if (nce != NULL) {
   3620 		if (newnce != NULL)
   3621 			*newnce = nce;
   3622 		else
   3623 			nce_refrele(nce);
   3624 	}
   3625 
   3626 	if (under != NULL)
   3627 		ill_refrele(under);
   3628 
   3629 	if (upper_nce != NULL)
   3630 		nce_refrele(upper_nce);
   3631 
   3632 	if (need_ill_refrele)
   3633 		ill_refrele(ill);
   3634 
   3635 	return (err);
   3636 }
   3637 
   3638 /*
   3639  * NDP Cache Entry creation routine for IPv4.
   3640  * This routine must always be called with ndp4->ndp_g_lock held.
   3641  * Prior to return, ncec_refcnt is incremented.
   3642  *
   3643  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
   3644  * are always added pointing at the ipmp_ill. Thus, when the ill passed
   3645  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
   3646  * entries will be created, both pointing at the same ncec_t. The nce_t
   3647  * entries will have their nce_ill set to the ipmp_ill and the under_ill
   3648  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
   3649  * Local addresses are always created on the ill passed to nce_add_v4.
   3650  */
   3651 int
   3652 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   3653     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
   3654 {
   3655 	int		err;
   3656 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
   3657 	struct in6_addr	addr6;
   3658 	nce_t		*nce;
   3659 
   3660 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
   3661 	ASSERT(!ill->ill_isv6);
   3662 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
   3663 
   3664 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
   3665 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
   3666 	    &nce);
   3667 	ASSERT(newnce != NULL);
   3668 	*newnce = nce;
   3669 	return (err);
   3670 }
   3671 
   3672 /*
   3673  * Post-processing routine to be executed after nce_add_v4(). This function
   3674  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
   3675  * and must be called without any locks held.
   3676  *
   3677  * Always returns 0, but we return an int to keep this symmetric with the
   3678  * IPv6 counter-part.
   3679  */
   3680 int
   3681 nce_add_v4_postprocess(nce_t *nce)
   3682 {
   3683 	ncec_t		*ncec = nce->nce_common;
   3684 	uint16_t	flags = ncec->ncec_flags;
   3685 	boolean_t	ndp_need_dad = B_FALSE;
   3686 	boolean_t	dropped;
   3687 	clock_t		delay;
   3688 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
   3689 	uchar_t		*hw_addr = ncec->ncec_lladdr;
   3690 	boolean_t	trigger_fastpath = B_TRUE;
   3691 
   3692 	/*
   3693 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
   3694 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
   3695 	 * We call nce_fastpath from nce_update if the link layer address of
   3696 	 * the peer changes from nce_update
   3697 	 */
   3698 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
   3699 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
   3700 		trigger_fastpath = B_FALSE;
   3701 
   3702 	if (trigger_fastpath)
   3703 		nce_fastpath_trigger(nce);
   3704 
   3705 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
   3706 		/*
   3707 		 * Either the caller (by passing in ND_PROBE)
   3708 		 * or nce_add_common() (by the internally computed state
   3709 		 * based on ncec_addr and ill_net_type) has determined
   3710 		 * that this unicast entry needs DAD. Trigger DAD.
   3711 		 */
   3712 		ndp_need_dad = B_TRUE;
   3713 	} else if (flags & NCE_F_UNSOL_ADV) {
   3714 		/*
   3715 		 * We account for the transmit below by assigning one
   3716 		 * less than the ndd variable. Subsequent decrements
   3717 		 * are done in nce_timer.
   3718 		 */
   3719 		mutex_enter(&ncec->ncec_lock);
   3720 		ncec->ncec_unsolicit_count =
   3721 		    ipst->ips_ip_arp_publish_count - 1;
   3722 		mutex_exit(&ncec->ncec_lock);
   3723 		dropped = arp_announce(ncec);
   3724 		mutex_enter(&ncec->ncec_lock);
   3725 		if (dropped)
   3726 			ncec->ncec_unsolicit_count++;
   3727 		else
   3728 			ncec->ncec_last_time_defended = ddi_get_lbolt();
   3729 		if (ncec->ncec_unsolicit_count != 0) {
   3730 			nce_start_timer(ncec,
   3731 			    ipst->ips_ip_arp_publish_interval);
   3732 		}
   3733 		mutex_exit(&ncec->ncec_lock);
   3734 	}
   3735 
   3736 	/*
   3737 	 * If ncec_xmit_interval is 0, user has configured us to send the first
   3738 	 * probe right away.  Do so, and set up for the subsequent probes.
   3739 	 */
   3740 	if (ndp_need_dad) {
   3741 		mutex_enter(&ncec->ncec_lock);
   3742 		if (ncec->ncec_pcnt == 0) {
   3743 			/*
   3744 			 * DAD probes and announce can be
   3745 			 * administratively disabled by setting the
   3746 			 * probe_count to zero. Restart the timer in
   3747 			 * this case to mark the ipif as ready.
   3748 			 */
   3749 			ncec->ncec_unsolicit_count = 0;
   3750 			mutex_exit(&ncec->ncec_lock);
   3751 			nce_restart_timer(ncec, 0);
   3752 		} else {
   3753 			mutex_exit(&ncec->ncec_lock);
   3754 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
   3755 			    ipst->ips_arp_probe_delay :
   3756 			    ipst->ips_arp_fastprobe_delay);
   3757 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
   3758 		}
   3759 	}
   3760 	return (0);
   3761 }
   3762 
   3763 /*
   3764  * ncec_walk routine to update all entries that have a given destination or
   3765  * gateway address and cached link layer (MAC) address.  This is used when ARP
   3766  * informs us that a network-to-link-layer mapping may have changed.
   3767  */
   3768 void
   3769 nce_update_hw_changed(ncec_t *ncec, void *arg)
   3770 {
   3771 	nce_hw_map_t *hwm = arg;
   3772 	ipaddr_t ncec_addr;
   3773 
   3774 	if (ncec->ncec_state != ND_REACHABLE)
   3775 		return;
   3776 
   3777 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
   3778 	if (ncec_addr != hwm->hwm_addr)
   3779 		return;
   3780 
   3781 	mutex_enter(&ncec->ncec_lock);
   3782 	if (hwm->hwm_flags != 0)
   3783 		ncec->ncec_flags = hwm->hwm_flags;
   3784 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
   3785 	mutex_exit(&ncec->ncec_lock);
   3786 }
   3787 
   3788 void
   3789 ncec_refhold(ncec_t *ncec)
   3790 {
   3791 	mutex_enter(&(ncec)->ncec_lock);
   3792 	(ncec)->ncec_refcnt++;
   3793 	ASSERT((ncec)->ncec_refcnt != 0);
   3794 #ifdef DEBUG
   3795 	ncec_trace_ref(ncec);
   3796 #endif
   3797 	mutex_exit(&(ncec)->ncec_lock);
   3798 }
   3799 
   3800 void
   3801 ncec_refhold_notr(ncec_t *ncec)
   3802 {
   3803 	mutex_enter(&(ncec)->ncec_lock);
   3804 	(ncec)->ncec_refcnt++;
   3805 	ASSERT((ncec)->ncec_refcnt != 0);
   3806 	mutex_exit(&(ncec)->ncec_lock);
   3807 }
   3808 
   3809 static void
   3810 ncec_refhold_locked(ncec_t *ncec)
   3811 {
   3812 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
   3813 	(ncec)->ncec_refcnt++;
   3814 #ifdef DEBUG
   3815 	ncec_trace_ref(ncec);
   3816 #endif
   3817 }
   3818 
   3819 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
   3820 void
   3821 ncec_refrele(ncec_t *ncec)
   3822 {
   3823 	mutex_enter(&(ncec)->ncec_lock);
   3824 #ifdef DEBUG
   3825 	ncec_untrace_ref(ncec);
   3826 #endif
   3827 	ASSERT((ncec)->ncec_refcnt != 0);
   3828 	if (--(ncec)->ncec_refcnt == 0) {
   3829 		ncec_inactive(ncec);
   3830 	} else {
   3831 		mutex_exit(&(ncec)->ncec_lock);
   3832 	}
   3833 }
   3834 
   3835 void
   3836 ncec_refrele_notr(ncec_t *ncec)
   3837 {
   3838 	mutex_enter(&(ncec)->ncec_lock);
   3839 	ASSERT((ncec)->ncec_refcnt != 0);
   3840 	if (--(ncec)->ncec_refcnt == 0) {
   3841 		ncec_inactive(ncec);
   3842 	} else {
   3843 		mutex_exit(&(ncec)->ncec_lock);
   3844 	}
   3845 }
   3846 
   3847 /*
   3848  * Common to IPv4 and IPv6.
   3849  */
   3850 void
   3851 nce_restart_timer(ncec_t *ncec, uint_t ms)
   3852 {
   3853 	timeout_id_t tid;
   3854 
   3855 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
   3856 
   3857 	/* First cancel any running timer */
   3858 	mutex_enter(&ncec->ncec_lock);
   3859 	tid = ncec->ncec_timeout_id;
   3860 	ncec->ncec_timeout_id = 0;
   3861 	if (tid != 0) {
   3862 		mutex_exit(&ncec->ncec_lock);
   3863 		(void) untimeout(tid);
   3864 		mutex_enter(&ncec->ncec_lock);
   3865 	}
   3866 
   3867 	/* Restart timer */
   3868 	nce_start_timer(ncec, ms);
   3869 	mutex_exit(&ncec->ncec_lock);
   3870 }
   3871 
   3872 static void
   3873 nce_start_timer(ncec_t *ncec, uint_t ms)
   3874 {
   3875 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   3876 	/*
   3877 	 * Don't start the timer if the ncec has been deleted, or if the timer
   3878 	 * is already running
   3879 	 */
   3880 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
   3881 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
   3882 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
   3883 	}
   3884 }
   3885 
   3886 int
   3887 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
   3888     uint16_t flags, nce_t **newnce)
   3889 {
   3890 	uchar_t		*hw_addr;
   3891 	int		err = 0;
   3892 	ip_stack_t	*ipst = ill->ill_ipst;
   3893 	in6_addr_t	dst6;
   3894 	nce_t		*nce;
   3895 
   3896 	ASSERT(!ill->ill_isv6);
   3897 
   3898 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
   3899 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
   3900 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
   3901 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3902 		goto done;
   3903 	}
   3904 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
   3905 		/*
   3906 		 * For IRE_IF_RESOLVER a hardware mapping can be
   3907 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
   3908 		 * in the ill is copied in nce_add_v4().
   3909 		 */
   3910 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
   3911 		if (hw_addr == NULL) {
   3912 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3913 			return (ENOMEM);
   3914 		}
   3915 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
   3916 	} else {
   3917 		/*
   3918 		 * IRE_IF_NORESOLVER type simply copies the resolution
   3919 		 * cookie passed in.  So no hw_addr is needed.
   3920 		 */
   3921 		hw_addr = NULL;
   3922 	}
   3923 	ASSERT(flags & NCE_F_MCAST);
   3924 	ASSERT(flags & NCE_F_NONUD);
   3925 	/* nce_state will be computed by nce_add_common() */
   3926 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
   3927 	    ND_UNCHANGED, &nce);
   3928 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
   3929 	if (err == 0)
   3930 		err = nce_add_v4_postprocess(nce);
   3931 	if (hw_addr != NULL)
   3932 		kmem_free(hw_addr, ill->ill_phys_addr_length);
   3933 	if (err != 0) {
   3934 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
   3935 		return (err);
   3936 	}
   3937 done:
   3938 	if (newnce != NULL)
   3939 		*newnce = nce;
   3940 	else
   3941 		nce_refrele(nce);
   3942 	return (0);
   3943 }
   3944 
   3945 /*
   3946  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
   3947  * don't want to have to walk the list for every single one, so we gather up
   3948  * batches at a time.
   3949  */
   3950 #define	NCE_RESCHED_LIST_LEN	8
   3951 
   3952 typedef struct {
   3953 	ill_t	*ncert_ill;
   3954 	uint_t	ncert_num;
   3955 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
   3956 } nce_resched_t;
   3957 
   3958 /*
   3959  * Pick the longest waiting NCEs for defense.
   3960  */
   3961 /* ARGSUSED */
   3962 static int
   3963 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
   3964 {
   3965 	nce_resched_t *ncert = arg;
   3966 	ncec_t **ncecs;
   3967 	ncec_t **ncec_max;
   3968 	ncec_t *ncec_temp;
   3969 	ncec_t *ncec = nce->nce_common;
   3970 
   3971 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
   3972 	/*
   3973 	 * Only reachable entries that are ready for announcement are eligible.
   3974 	 */
   3975 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
   3976 		return (0);
   3977 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
   3978 		ncec_refhold(ncec);
   3979 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
   3980 	} else {
   3981 		ncecs = ncert->ncert_nces;
   3982 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
   3983 		ncec_refhold(ncec);
   3984 		for (; ncecs < ncec_max; ncecs++) {
   3985 			ASSERT(ncec != NULL);
   3986 			if ((*ncecs)->ncec_last_time_defended >
   3987 			    ncec->ncec_last_time_defended) {
   3988 				ncec_temp = *ncecs;
   3989 				*ncecs = ncec;
   3990 				ncec = ncec_temp;
   3991 			}
   3992 		}
   3993 		ncec_refrele(ncec);
   3994 	}
   3995 	return (0);
   3996 }
   3997 
   3998 /*
   3999  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
   4000  * doesn't happen very often (if at all), and thus it needn't be highly
   4001  * optimized.  (Note, though, that it's actually O(N) complexity, because the
   4002  * outer loop is bounded by a constant rather than by the length of the list.)
   4003  */
   4004 static void
   4005 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
   4006 {
   4007 	ncec_t		*ncec;
   4008 	ip_stack_t	*ipst = ill->ill_ipst;
   4009 	uint_t		i, defend_rate;
   4010 
   4011 	i = ill->ill_defend_count;
   4012 	ill->ill_defend_count = 0;
   4013 	if (ill->ill_isv6)
   4014 		defend_rate = ipst->ips_ndp_defend_rate;
   4015 	else
   4016 		defend_rate = ipst->ips_arp_defend_rate;
   4017 	/* If none could be sitting around, then don't reschedule */
   4018 	if (i < defend_rate) {
   4019 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
   4020 		return;
   4021 	}
   4022 	ncert->ncert_ill = ill;
   4023 	while (ill->ill_defend_count < defend_rate) {
   4024 		nce_walk_common(ill, ncec_reschedule, ncert);
   4025 		for (i = 0; i < ncert->ncert_num; i++) {
   4026 
   4027 			ncec = ncert->ncert_nces[i];
   4028 			mutex_enter(&ncec->ncec_lock);
   4029 			ncec->ncec_flags |= NCE_F_DELAYED;
   4030 			mutex_exit(&ncec->ncec_lock);
   4031 			/*
   4032 			 * we plan to schedule this ncec, so incr the
   4033 			 * defend_count in anticipation.
   4034 			 */
   4035 			if (++ill->ill_defend_count >= defend_rate)
   4036 				break;
   4037 		}
   4038 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
   4039 			break;
   4040 	}
   4041 }
   4042 
   4043 /*
   4044  * Check if the current rate-limiting parameters permit the sending
   4045  * of another address defense announcement for both IPv4 and IPv6.
   4046  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
   4047  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
   4048  * determines how many address defense announcements are permitted
   4049  * in any `defense_perio' interval.
   4050  */
   4051 static boolean_t
   4052 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
   4053 {
   4054 	clock_t		now = ddi_get_lbolt();
   4055 	ip_stack_t	*ipst = ill->ill_ipst;
   4056 	clock_t		start = ill->ill_defend_start;
   4057 	uint32_t	elapsed, defend_period, defend_rate;
   4058 	nce_resched_t	ncert;
   4059 	boolean_t	ret;
   4060 	int		i;
   4061 
   4062 	if (ill->ill_isv6) {
   4063 		defend_period = ipst->ips_ndp_defend_period;
   4064 		defend_rate = ipst->ips_ndp_defend_rate;
   4065 	} else {
   4066 		defend_period = ipst->ips_arp_defend_period;
   4067 		defend_rate = ipst->ips_arp_defend_rate;
   4068 	}
   4069 	if (defend_rate == 0)
   4070 		return (B_TRUE);
   4071 	bzero(&ncert, sizeof (ncert));
   4072 	mutex_enter(&ill->ill_lock);
   4073 	if (start > 0) {
   4074 		elapsed = now - start;
   4075 		if (elapsed > SEC_TO_TICK(defend_period)) {
   4076 			ill->ill_defend_start = now;
   4077 			/*
   4078 			 * nce_ill_reschedule will attempt to
   4079 			 * prevent starvation by reschduling the
   4080 			 * oldest entries, which are marked with
   4081 			 * the NCE_F_DELAYED flag.
   4082 			 */
   4083 			nce_ill_reschedule(ill, &ncert);
   4084 		}
   4085 	} else {
   4086 		ill->ill_defend_start = now;
   4087 	}
   4088 	ASSERT(ill->ill_defend_count <= defend_rate);
   4089 	mutex_enter(&ncec->ncec_lock);
   4090 	if (ncec->ncec_flags & NCE_F_DELAYED) {
   4091 		/*
   4092 		 * This ncec was rescheduled as one of the really old
   4093 		 * entries needing on-going defense. The
   4094 		 * ill_defend_count was already incremented in
   4095 		 * nce_ill_reschedule. Go ahead and send the announce.
   4096 		 */
   4097 		ncec->ncec_flags &= ~NCE_F_DELAYED;
   4098 		mutex_exit(&ncec->ncec_lock);
   4099 		ret = B_FALSE;
   4100 		goto done;
   4101 	}
   4102 	mutex_exit(&ncec->ncec_lock);
   4103 	if (ill->ill_defend_count < defend_rate)
   4104 		ill->ill_defend_count++;
   4105 	if (ill->ill_defend_count == defend_rate) {
   4106 		/*
   4107 		 * we are no longer allowed to send unbidden defense
   4108 		 * messages. Wait for rescheduling.
   4109 		 */
   4110 		ret = B_TRUE;
   4111 	} else {
   4112 		ret = B_FALSE;
   4113 	}
   4114 done:
   4115 	mutex_exit(&ill->ill_lock);
   4116 	/*
   4117 	 * After all the locks have been dropped we can restart nce timer,
   4118 	 * and refrele the delayed ncecs
   4119 	 */
   4120 	for (i = 0; i < ncert.ncert_num; i++) {
   4121 		clock_t	xmit_interval;
   4122 		ncec_t	*tmp;
   4123 
   4124 		tmp = ncert.ncert_nces[i];
   4125 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
   4126 		    B_FALSE);
   4127 		nce_restart_timer(tmp, xmit_interval);
   4128 		ncec_refrele(tmp);
   4129 	}
   4130 	return (ret);
   4131 }
   4132 
   4133 boolean_t
   4134 ndp_announce(ncec_t *ncec)
   4135 {
   4136 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
   4137 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
   4138 	    nce_advert_flags(ncec)));
   4139 }
   4140 
   4141 ill_t *
   4142 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
   4143 {
   4144 	mblk_t		*mp;
   4145 	in6_addr_t	src6;
   4146 	ipaddr_t	src4;
   4147 	ill_t		*ill = ncec->ncec_ill;
   4148 	ill_t		*src_ill = NULL;
   4149 	ipif_t		*ipif = NULL;
   4150 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
   4151 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   4152 
   4153 	ASSERT(src != NULL);
   4154 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
   4155 	src6 = *src;
   4156 	if (is_myaddr) {
   4157 		src6 = ncec->ncec_addr;
   4158 		if (!isv6)
   4159 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
   4160 	} else {
   4161 		/*
   4162 		 * try to find one from the outgoing packet.
   4163 		 */
   4164 		mutex_enter(&ncec->ncec_lock);
   4165 		mp = ncec->ncec_qd_mp;
   4166 		if (mp != NULL) {
   4167 			if (isv6) {
   4168 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
   4169 
   4170 				src6 = ip6h->ip6_src;
   4171 			} else {
   4172 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
   4173 
   4174 				src4 = ipha->ipha_src;
   4175 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
   4176 			}
   4177 		}
   4178 		mutex_exit(&ncec->ncec_lock);
   4179 	}
   4180 
   4181 	/*
   4182 	 * For outgoing packets, if the src of outgoing packet is one
   4183 	 * of the assigned interface addresses use it, otherwise we
   4184 	 * will pick the source address below.
   4185 	 * For local addresses (is_myaddr) doing DAD, NDP announce
   4186 	 * messages are mcast. So we use the (IPMP) cast_ill or the
   4187 	 * (non-IPMP) ncec_ill for these message types. The only case
   4188 	 * of unicast DAD messages are for IPv6 ND probes, for which
   4189 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
   4190 	 */
   4191 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
   4192 		if (isv6) {
   4193 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
   4194 			    ill->ill_ipst);
   4195 		} else {
   4196 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
   4197 			    ill->ill_ipst);
   4198 		}
   4199 
   4200 		/*
   4201 		 * If no relevant ipif can be found, then it's not one of our
   4202 		 * addresses.  Reset to :: and try to find a src for the NS or
   4203 		 * ARP request using ipif_select_source_v[4,6]  below.
   4204 		 * If an ipif can be found, but it's not yet done with
   4205 		 * DAD verification, and we are not being invoked for
   4206 		 * DAD (i.e., !is_myaddr), then just postpone this
   4207 		 * transmission until later.
   4208 		 */
   4209 		if (ipif == NULL) {
   4210 			src6 = ipv6_all_zeros;
   4211 			src4 = INADDR_ANY;
   4212 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
   4213 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
   4214 			    ncec_t *, ncec, ipif_t *, ipif);
   4215 			ipif_refrele(ipif);
   4216 			return (NULL);
   4217 		}
   4218 	}
   4219 
   4220 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
   4221 		/*
   4222 		 * Pick a source address for this solicitation, but
   4223 		 * restrict the selection to addresses assigned to the
   4224 		 * output interface.  We do this because the destination will
   4225 		 * create a neighbor cache entry for the source address of
   4226 		 * this packet, so the source address had better be a valid
   4227 		 * neighbor.
   4228 		 */
   4229 		if (isv6) {
   4230 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
   4231 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
   4232 			    B_FALSE, NULL);
   4233 		} else {
   4234 			ipaddr_t nce_addr;
   4235 
   4236 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
   4237 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
   4238 			    B_FALSE, NULL);
   4239 		}
   4240 		if (ipif == NULL && IS_IPMP(ill)) {
   4241 			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);
   4242 
   4243 			if (send_ill != NULL) {
   4244 				if (isv6) {
   4245 					ipif = ipif_select_source_v6(send_ill,
   4246 					    &ncec->ncec_addr, B_TRUE,
   4247 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
   4248 					    B_FALSE, NULL);
   4249 				} else {
   4250 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
   4251 					    src4);
   4252 					ipif = ipif_select_source_v4(send_ill,
   4253 					    src4, ALL_ZONES, B_TRUE, NULL);
   4254 				}
   4255 				ill_refrele(send_ill);
   4256 			}
   4257 		}
   4258 
   4259 		if (ipif == NULL) {
   4260 			char buf[INET6_ADDRSTRLEN];
   4261 
   4262 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
   4263 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
   4264 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
   4265 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
   4266 			return (NULL);
   4267 		}
   4268 		src6 = ipif->ipif_v6lcl_addr;
   4269 	}
   4270 	*src = src6;
   4271 	if (ipif != NULL) {
   4272 		src_ill = ipif->ipif_ill;
   4273 		if (IS_IPMP(src_ill))
   4274 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
   4275 		else
   4276 			ill_refhold(src_ill);
   4277 		ipif_refrele(ipif);
   4278 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
   4279 		    ill_t *, src_ill);
   4280 	}
   4281 	return (src_ill);
   4282 }
   4283 
   4284 void
   4285 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
   4286     uchar_t *hwaddr, int hwaddr_len, int flags)
   4287 {
   4288 	ill_t	*ill;
   4289 	ncec_t	*ncec;
   4290 	nce_t	*nce;
   4291 	uint16_t new_state;
   4292 
   4293 	ill = (ipif ? ipif->ipif_ill : NULL);
   4294 	if (ill != NULL) {
   4295 		/*
   4296 		 * only one ncec is possible
   4297 		 */
   4298 		nce = nce_lookup_v4(ill, addr);
   4299 		if (nce != NULL) {
   4300 			ncec = nce->nce_common;
   4301 			mutex_enter(&ncec->ncec_lock);
   4302 			if (NCE_ISREACHABLE(ncec))
   4303 				new_state = ND_UNCHANGED;
   4304 			else
   4305 				new_state = ND_STALE;
   4306 			ncec->ncec_flags = flags;
   4307 			nce_update(ncec, new_state, hwaddr);
   4308 			mutex_exit(&ncec->ncec_lock);
   4309 			nce_refrele(nce);
   4310 			return;
   4311 		}
   4312 	} else {
   4313 		/*
   4314 		 * ill is wildcard; clean up all ncec's and ire's
   4315 		 * that match on addr.
   4316 		 */
   4317 		nce_hw_map_t hwm;
   4318 
   4319 		hwm.hwm_addr = *addr;
   4320 		hwm.hwm_hwlen = hwaddr_len;
   4321 		hwm.hwm_hwaddr = hwaddr;
   4322 		hwm.hwm_flags = flags;
   4323 
   4324 		ncec_walk_common(ipst->ips_ndp4, NULL,
   4325 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
   4326 	}
   4327 }
   4328 
   4329 /*
   4330  * Common function to add ncec entries.
   4331  * we always add the ncec with ncec_ill == ill, and always create
   4332  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
   4333  * ncec is !reachable.
   4334  *
   4335  * When the caller passes in an nce_state of ND_UNCHANGED,
   4336  * nce_add_common() will determine the state of the created nce based
   4337  * on the ill_net_type and nce_flags used. Otherwise, the nce will
   4338  * be created with state set to the passed in nce_state.
   4339  */
   4340 static int
   4341 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
   4342     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
   4343 {
   4344 	static	ncec_t		nce_nil;
   4345 	uchar_t			*template = NULL;
   4346 	int			err;
   4347 	ncec_t			*ncec;
   4348 	ncec_t			**ncep;
   4349 	ip_stack_t		*ipst = ill->ill_ipst;
   4350 	uint16_t		state;
   4351 	boolean_t		fastprobe = B_FALSE;
   4352 	struct ndp_g_s		*ndp;
   4353 	nce_t			*nce = NULL;
   4354 	mblk_t			*dlur_mp = NULL;
   4355 
   4356 	if (ill->ill_isv6)
   4357 		ndp = ill->ill_ipst->ips_ndp6;
   4358 	else
   4359 		ndp = ill->ill_ipst->ips_ndp4;
   4360 
   4361 	*retnce = NULL;
   4362 
   4363 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
   4364 
   4365 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
   4366 		ip0dbg(("nce_add_common: no addr\n"));
   4367 		return (EINVAL);
   4368 	}
   4369 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
   4370 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
   4371 		return (EINVAL);
   4372 	}
   4373 
   4374 	if (ill->ill_isv6) {
   4375 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
   4376 	} else {
   4377 		ipaddr_t v4addr;
   4378 
   4379 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
   4380 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
   4381 	}
   4382 
   4383 	/*
   4384 	 * The caller has ensured that there is no nce on ill, but there could
   4385 	 * still be an nce_common_t for the address, so that we find existing
   4386 	 * ncec_t structures first, and atomically add a new nce_t if
   4387 	 * one is found. The ndp_g_lock ensures that we don't cross threads
   4388 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
   4389 	 * compare for matches across the illgrp because this function is
   4390 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
   4391 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
   4392 	 * appropriate.
   4393 	 */
   4394 	ncec = *ncep;
   4395 	for (; ncec != NULL; ncec = ncec->ncec_next) {
   4396 		if (ncec->ncec_ill == ill) {
   4397 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
   4398 				*retnce = nce_ill_lookup_then_add(ill, ncec);
   4399 				if (*retnce != NULL)
   4400 					break;
   4401 			}
   4402 		}
   4403 	}
   4404 	if (*retnce != NULL) {
   4405 		/*
   4406 		 * We should never find *retnce to be MYADDR, since the caller
   4407 		 * may then incorrectly restart a DAD timer that's already
   4408 		 * running.
   4409 		 */
   4410 		ASSERT(!NCE_MYADDR(ncec));
   4411 		/* caller must trigger fastpath on nce */
   4412 		return (0);
   4413 	}
   4414 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
   4415 	if (ncec == NULL)
   4416 		return (ENOMEM);
   4417 	*ncec = nce_nil;
   4418 	ncec->ncec_ill = ill;
   4419 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
   4420 	ncec->ncec_flags = flags;
   4421 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
   4422 
   4423 	if (!ill->ill_isv6) {
   4424 		ipaddr_t addr4;
   4425 
   4426 		/*
   4427 		 * DAD probe interval and probe count are set based on
   4428 		 * fast/slow probe settings. If the underlying link doesn't
   4429 		 * have reliably up/down notifications or if we're working
   4430 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
   4431 		 * don't use the fast timers.  Otherwise, use them.
   4432 		 */
   4433 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
   4434 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
   4435 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
   4436 			fastprobe = B_TRUE;
   4437 		if (fastprobe) {
   4438 			ncec->ncec_xmit_interval =
   4439 			    ipst->ips_arp_fastprobe_interval;
   4440 			ncec->ncec_pcnt =
   4441 			    ipst->ips_arp_fastprobe_count;
   4442 			ncec->ncec_flags |= NCE_F_FAST;
   4443 		} else {
   4444 			ncec->ncec_xmit_interval =
   4445 			    ipst->ips_arp_probe_interval;
   4446 			ncec->ncec_pcnt =
   4447 			    ipst->ips_arp_probe_count;
   4448 		}
   4449 		if (NCE_PUBLISH(ncec)) {
   4450 			ncec->ncec_unsolicit_count =
   4451 			    ipst->ips_ip_arp_publish_count;
   4452 		}
   4453 	} else {
   4454 		/*
   4455 		 * probe interval is constant: ILL_PROBE_INTERVAL
   4456 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
   4457 		 */
   4458 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
   4459 		if (NCE_PUBLISH(ncec)) {
   4460 			ncec->ncec_unsolicit_count =
   4461 			    ipst->ips_ip_ndp_unsolicit_count;
   4462 		}
   4463 	}
   4464 	ncec->ncec_rcnt = ill->ill_xmit_count;
   4465 	ncec->ncec_addr = *addr;
   4466 	ncec->ncec_qd_mp = NULL;
   4467 	ncec->ncec_refcnt = 1; /* for ncec getting created */
   4468 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
   4469 	ncec->ncec_trace_disable = B_FALSE;
   4470 
   4471 	/*
   4472 	 * ncec_lladdr holds link layer address
   4473 	 */
   4474 	if (hw_addr_len > 0) {
   4475 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
   4476 		if (template == NULL) {
   4477 			err = ENOMEM;
   4478 			goto err_ret;
   4479 		}
   4480 		ncec->ncec_lladdr = template;
   4481 		ncec->ncec_lladdr_length = hw_addr_len;
   4482 		bzero(ncec->ncec_lladdr, hw_addr_len);
   4483 	}
   4484 	if ((flags & NCE_F_BCAST) != 0) {
   4485 		state = ND_REACHABLE;
   4486 		ASSERT(hw_addr_len > 0);
   4487 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
   4488 		state = ND_INITIAL;
   4489 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
   4490 		/*
   4491 		 * NORESOLVER entries are always created in the REACHABLE
   4492 		 * state.
   4493 		 */
   4494 		state = ND_REACHABLE;
   4495 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
   4496 		    ill->ill_mactype != DL_IPV4 &&
   4497 		    ill->ill_mactype != DL_6TO4) {
   4498 			/*
   4499 			 * We create a nce_res_mp with the IP nexthop address
   4500 			 * as the destination address if the physical length
   4501 			 * is exactly 4 bytes for point-to-multipoint links
   4502 			 * that do their own resolution from IP to link-layer
   4503 			 * address (e.g. IP over X.25).
   4504 			 */
   4505 			bcopy((uchar_t *)addr,
   4506 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
   4507 		}
   4508 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
   4509 		    ill->ill_mactype != DL_IPV6) {
   4510 			/*
   4511 			 * We create a nce_res_mp with the IP nexthop address
   4512 			 * as the destination address if the physical legnth
   4513 			 * is exactly 16 bytes for point-to-multipoint links
   4514 			 * that do their own resolution from IP to link-layer
   4515 			 * address.
   4516 			 */
   4517 			bcopy((uchar_t *)addr,
   4518 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
   4519 		}
   4520 		/*
   4521 		 * Since NUD is not part of the base IPv4 protocol definition,
   4522 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
   4523 		 * age, and are marked NCE_F_NONUD.
   4524 		 */
   4525 		if (!ill->ill_isv6)
   4526 			ncec->ncec_flags |= NCE_F_NONUD;
   4527 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
   4528 		state = ND_REACHABLE;
   4529 	}
   4530 
   4531 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
   4532 		/*
   4533 		 * We are adding an ncec with a deterministic hw_addr,
   4534 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
   4535 		 *
   4536 		 * if we are adding a unicast ncec for the local address
   4537 		 * it would be REACHABLE; we would be adding a ND_STALE entry
   4538 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
   4539 		 * addresses are added in PROBE to trigger DAD.
   4540 		 */
   4541 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
   4542 		    ill->ill_net_type == IRE_IF_NORESOLVER)
   4543 			state = ND_REACHABLE;
   4544 		else if (!NCE_PUBLISH(ncec))
   4545 			state = ND_STALE;
   4546 		else
   4547 			state = ND_PROBE;
   4548 		if (hw_addr != NULL)
   4549 			nce_set_ll(ncec, hw_addr);
   4550 	}
   4551 	/* caller overrides internally computed state */
   4552 	if (nce_state != ND_UNCHANGED)
   4553 		state = nce_state;
   4554 
   4555 	if (state == ND_PROBE)
   4556 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
   4557 
   4558 	ncec->ncec_state = state;
   4559 
   4560 	if (state == ND_REACHABLE) {
   4561 		ncec->ncec_last = ncec->ncec_init_time =
   4562 		    TICK_TO_MSEC(ddi_get_lbolt64());
   4563 	} else {
   4564 		ncec->ncec_last = 0;
   4565 		if (state == ND_INITIAL)
   4566 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
   4567 	}
   4568 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
   4569 	    offsetof(ncec_cb_t, ncec_cb_node));
   4570 	/*
   4571 	 * have all the memory allocations out of the way before taking locks
   4572 	 * and adding the nce.
   4573 	 */
   4574 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
   4575 	if (nce == NULL) {
   4576 		err = ENOMEM;
   4577 		goto err_ret;
   4578 	}
   4579 	if (ncec->ncec_lladdr != NULL ||
   4580 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
   4581 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
   4582 		    ill->ill_phys_addr_length, ill->ill_sap,
   4583 		    ill->ill_sap_length);
   4584 		if (dlur_mp == NULL) {
   4585 			err = ENOMEM;
   4586 			goto err_ret;
   4587 		}
   4588 	}
   4589 
   4590 	/*
   4591 	 * Atomically ensure that the ill is not CONDEMNED, before
   4592 	 * adding the NCE.
   4593 	 */
   4594 	mutex_enter(&ill->ill_lock);
   4595 	if (ill->ill_state_flags & ILL_CONDEMNED) {
   4596 		mutex_exit(&ill->ill_lock);
   4597 		err = EINVAL;
   4598 		goto err_ret;
   4599 	}
   4600 	if (!NCE_MYADDR(ncec) &&
   4601 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
   4602 		mutex_exit(&ill->ill_lock);
   4603 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
   4604 		err = EINVAL;
   4605 		goto err_ret;
   4606 	}
   4607 	/*
   4608 	 * Acquire the ncec_lock even before adding the ncec to the list
   4609 	 * so that it cannot get deleted after the ncec is added, but
   4610 	 * before we add the nce.
   4611 	 */
   4612 	mutex_enter(&ncec->ncec_lock);
   4613 	if ((ncec->ncec_next = *ncep) != NULL)
   4614 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
   4615 	*ncep = ncec;
   4616 	ncec->ncec_ptpn = ncep;
   4617 
   4618 	/* Bump up the number of ncec's referencing this ill */
   4619 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
   4620 	    (char *), "ncec", (void *), ncec);
   4621 	ill->ill_ncec_cnt++;
   4622 	/*
   4623 	 * Since we hold the ncec_lock at this time, the ncec cannot be
   4624 	 * condemned, and we can safely add the nce.
   4625 	 */
   4626 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
   4627 	mutex_exit(&ncec->ncec_lock);
   4628 	mutex_exit(&ill->ill_lock);
   4629 
   4630 	/* caller must trigger fastpath on *retnce */
   4631 	return (0);
   4632 
   4633 err_ret:
   4634 	if (ncec != NULL)
   4635 		kmem_cache_free(ncec_cache, ncec);
   4636 	if (nce != NULL)
   4637 		kmem_cache_free(nce_cache, nce);
   4638 	freemsg(dlur_mp);
   4639 	if (template != NULL)
   4640 		kmem_free(template, ill->ill_phys_addr_length);
   4641 	return (err);
   4642 }
   4643 
   4644 /*
   4645  * take a ref on the nce
   4646  */
   4647 void
   4648 nce_refhold(nce_t *nce)
   4649 {
   4650 	mutex_enter(&nce->nce_lock);
   4651 	nce->nce_refcnt++;
   4652 	ASSERT((nce)->nce_refcnt != 0);
   4653 	mutex_exit(&nce->nce_lock);
   4654 }
   4655 
   4656 /*
   4657  * release a ref on the nce; In general, this
   4658  * cannot be called with locks held because nce_inactive
   4659  * may result in nce_inactive which will take the ill_lock,
   4660  * do ipif_ill_refrele_tail etc. Thus the one exception
   4661  * where this can be called with locks held is when the caller
   4662  * is certain that the nce_refcnt is sufficient to prevent
   4663  * the invocation of nce_inactive.
   4664  */
   4665 void
   4666 nce_refrele(nce_t *nce)
   4667 {
   4668 	ASSERT((nce)->nce_refcnt != 0);
   4669 	mutex_enter(&nce->nce_lock);
   4670 	if (--nce->nce_refcnt == 0)
   4671 		nce_inactive(nce); /* destroys the mutex */
   4672 	else
   4673 		mutex_exit(&nce->nce_lock);
   4674 }
   4675 
   4676 /*
   4677  * free the nce after all refs have gone away.
   4678  */
   4679 static void
   4680 nce_inactive(nce_t *nce)
   4681 {
   4682 	ill_t *ill = nce->nce_ill;
   4683 
   4684 	ASSERT(nce->nce_refcnt == 0);
   4685 
   4686 	ncec_refrele_notr(nce->nce_common);
   4687 	nce->nce_common = NULL;
   4688 	freemsg(nce->nce_fp_mp);
   4689 	freemsg(nce->nce_dlur_mp);
   4690 
   4691 	mutex_enter(&ill->ill_lock);
   4692 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
   4693 	    (char *), "nce", (void *), nce);
   4694 	ill->ill_nce_cnt--;
   4695 	nce->nce_ill = NULL;
   4696 	/*
   4697 	 * If the number of ncec's associated with this ill have dropped
   4698 	 * to zero, check whether we need to restart any operation that
   4699 	 * is waiting for this to happen.
   4700 	 */
   4701 	if (ILL_DOWN_OK(ill)) {
   4702 		/* ipif_ill_refrele_tail drops the ill_lock */
   4703 		ipif_ill_refrele_tail(ill);
   4704 	} else {
   4705 		mutex_exit(&ill->ill_lock);
   4706 	}
   4707 
   4708 	mutex_destroy(&nce->nce_lock);
   4709 	kmem_cache_free(nce_cache, nce);
   4710 }
   4711 
   4712 /*
   4713  * Add an nce to the ill_nce list.
   4714  */
   4715 static nce_t *
   4716 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
   4717 {
   4718 	bzero(nce, sizeof (*nce));
   4719 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
   4720 	nce->nce_common = ncec;
   4721 	nce->nce_addr = ncec->ncec_addr;
   4722 	nce->nce_ill = ill;
   4723 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
   4724 	    (char *), "nce", (void *), nce);
   4725 	ill->ill_nce_cnt++;
   4726 
   4727 	nce->nce_refcnt = 1; /* for the thread */
   4728 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
   4729 	nce->nce_dlur_mp = dlur_mp;
   4730 
   4731 	/* add nce to the ill's fastpath list.  */
   4732 	nce->nce_refcnt++; /* for the list */
   4733 	list_insert_head(&ill->ill_nce, nce);
   4734 	return (nce);
   4735 }
   4736 
   4737 static nce_t *
   4738 nce_add(ill_t *ill, ncec_t *ncec)
   4739 {
   4740 	nce_t	*nce;
   4741 	mblk_t	*dlur_mp = NULL;
   4742 
   4743 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4744 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
   4745 
   4746 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
   4747 	if (nce == NULL)
   4748 		return (NULL);
   4749 	if (ncec->ncec_lladdr != NULL ||
   4750 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
   4751 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
   4752 		    ill->ill_phys_addr_length, ill->ill_sap,
   4753 		    ill->ill_sap_length);
   4754 		if (dlur_mp == NULL) {
   4755 			kmem_cache_free(nce_cache, nce);
   4756 			return (NULL);
   4757 		}
   4758 	}
   4759 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
   4760 }
   4761 
   4762 /*
   4763  * remove the nce from the ill_faspath list
   4764  */
   4765 void
   4766 nce_delete(nce_t *nce)
   4767 {
   4768 	ill_t	*ill = nce->nce_ill;
   4769 
   4770 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4771 
   4772 	mutex_enter(&nce->nce_lock);
   4773 	if (nce->nce_is_condemned) {
   4774 		/*
   4775 		 * some other thread has removed this nce from the ill_nce list
   4776 		 */
   4777 		mutex_exit(&nce->nce_lock);
   4778 		return;
   4779 	}
   4780 	nce->nce_is_condemned = B_TRUE;
   4781 	mutex_exit(&nce->nce_lock);
   4782 
   4783 	list_remove(&ill->ill_nce, nce);
   4784 	/*
   4785 	 * even though we are holding the ill_lock, it is ok to
   4786 	 * call nce_refrele here because we know that we should have
   4787 	 * at least 2 refs on the nce: one for the thread, and one
   4788 	 * for the list. The refrele below will release the one for
   4789 	 * the list.
   4790 	 */
   4791 	nce_refrele(nce);
   4792 }
   4793 
   4794 nce_t *
   4795 nce_lookup(ill_t *ill, const in6_addr_t *addr)
   4796 {
   4797 	nce_t *nce = NULL;
   4798 
   4799 	ASSERT(ill != NULL);
   4800 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4801 
   4802 	for (nce = list_head(&ill->ill_nce); nce != NULL;
   4803 	    nce = list_next(&ill->ill_nce, nce)) {
   4804 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
   4805 			break;
   4806 	}
   4807 
   4808 	/*
   4809 	 * if we found the nce on the ill_nce list while holding
   4810 	 * the ill_lock, then it cannot be condemned yet.
   4811 	 */
   4812 	if (nce != NULL) {
   4813 		ASSERT(!nce->nce_is_condemned);
   4814 		nce_refhold(nce);
   4815 	}
   4816 	return (nce);
   4817 }
   4818 
   4819 /*
   4820  * Walk the ill_nce list on ill. The callback function func() cannot perform
   4821  * any destructive actions.
   4822  */
   4823 static void
   4824 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
   4825 {
   4826 	nce_t *nce = NULL, *nce_next;
   4827 
   4828 	ASSERT(MUTEX_HELD(&ill->ill_lock));
   4829 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
   4830 		nce_next = list_next(&ill->ill_nce, nce);
   4831 		if (func(ill, nce, arg) != 0)
   4832 			break;
   4833 		nce = nce_next;
   4834 	}
   4835 }
   4836 
   4837 void
   4838 nce_walk(ill_t *ill, pfi_t func, void *arg)
   4839 {
   4840 	mutex_enter(&ill->ill_lock);
   4841 	nce_walk_common(ill, func, arg);
   4842 	mutex_exit(&ill->ill_lock);
   4843 }
   4844 
   4845 void
   4846 nce_flush(ill_t *ill, boolean_t flushall)
   4847 {
   4848 	nce_t *nce, *nce_next;
   4849 	list_t dead;
   4850 
   4851 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
   4852 	mutex_enter(&ill->ill_lock);
   4853 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
   4854 		nce_next = list_next(&ill->ill_nce, nce);
   4855 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
   4856 			nce = nce_next;
   4857 			continue;
   4858 		}
   4859 		/*
   4860 		 * nce_delete requires that the caller should either not
   4861 		 * be holding locks, or should hold a ref to ensure that
   4862 		 * we wont hit ncec_inactive. So take a ref and clean up
   4863 		 * after the list is flushed.
   4864 		 */
   4865 		nce_refhold(nce);
   4866 		nce_delete(nce);
   4867 		list_insert_tail(&dead, nce);
   4868 		nce = nce_next;
   4869 	}
   4870 	mutex_exit(&ill->ill_lock);
   4871 	while ((nce = list_head(&dead)) != NULL) {
   4872 		list_remove(&dead, nce);
   4873 		nce_refrele(nce);
   4874 	}
   4875 	ASSERT(list_is_empty(&dead));
   4876 	list_destroy(&dead);
   4877 }
   4878 
   4879 /* Return an interval that is anywhere in the [1 .. intv] range */
   4880 static clock_t
   4881 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
   4882 {
   4883 	clock_t rnd, frac;
   4884 
   4885 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
   4886 	/* Note that clock_t is signed; must chop off bits */
   4887 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
   4888 	if (initial_time) {
   4889 		if (intv <= 0)
   4890 			intv = 1;
   4891 		else
   4892 			intv = (rnd % intv) + 1;
   4893 	} else {
   4894 		/* Compute 'frac' as 20% of the configured interval */
   4895 		if ((frac = intv / 5) <= 1)
   4896 			frac = 2;
   4897 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
   4898 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
   4899 		intv = 1;
   4900 	}
   4901 	return (intv);
   4902 }
   4903 
   4904 void
   4905 nce_resolv_ipmp_ok(ncec_t *ncec)
   4906 {
   4907 	mblk_t *mp;
   4908 	uint_t pkt_len;
   4909 	iaflags_t ixaflags = IXAF_NO_TRACE;
   4910 	nce_t *under_nce;
   4911 	ill_t	*ill = ncec->ncec_ill;
   4912 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
   4913 	ipif_t *src_ipif = NULL;
   4914 	ip_stack_t *ipst = ill->ill_ipst;
   4915 	ill_t *send_ill;
   4916 	uint_t nprobes;
   4917 
   4918 	ASSERT(IS_IPMP(ill));
   4919 
   4920 	mutex_enter(&ncec->ncec_lock);
   4921 	nprobes = ncec->ncec_nprobes;
   4922 	mp = ncec->ncec_qd_mp;
   4923 	ncec->ncec_qd_mp = NULL;
   4924 	ncec->ncec_nprobes = 0;
   4925 	mutex_exit(&ncec->ncec_lock);
   4926 
   4927 	while (mp != NULL) {
   4928 		mblk_t *nxt_mp;
   4929 
   4930 		nxt_mp = mp->b_next;
   4931 		mp->b_next = NULL;
   4932 		if (isv6) {
   4933 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
   4934 
   4935 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
   4936 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
   4937 			    ill, ALL_ZONES, ipst);
   4938 		} else {
   4939 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
   4940 
   4941 			ixaflags |= IXAF_IS_IPV4;
   4942 			pkt_len = ntohs(ipha->ipha_length);
   4943 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
   4944 			    ill, ALL_ZONES, ipst);
   4945 		}
   4946 
   4947 		/*
   4948 		 * find a new nce based on an under_ill. The first IPMP probe
   4949 		 * packet gets queued, so we could still find a src_ipif that
   4950 		 * matches an IPMP test address.
   4951 		 */
   4952 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
   4953 			/*
   4954 			 * if src_ipif is null, this could be either a
   4955 			 * forwarded packet or a probe whose src got deleted.
   4956 			 * We identify the former case by looking for the
   4957 			 * ncec_nprobes: the first ncec_nprobes packets are
   4958 			 * probes;
   4959 			 */
   4960 			if (src_ipif == NULL && nprobes > 0)
   4961 				goto drop_pkt;
   4962 
   4963 			/*
   4964 			 * For forwarded packets, we use the ipmp rotor
   4965 			 * to find send_ill.
   4966 			 */
   4967 			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
   4968 			    B_TRUE);
   4969 		} else {
   4970 			send_ill = src_ipif->ipif_ill;
   4971 			ill_refhold(send_ill);
   4972 		}
   4973 
   4974 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
   4975 		    (ncec_t *), ncec, (ipif_t *),
   4976 		    src_ipif, (ill_t *), send_ill);
   4977 
   4978 		if (send_ill == NULL) {
   4979 			if (src_ipif != NULL)
   4980 				ipif_refrele(src_ipif);
   4981 			goto drop_pkt;
   4982 		}
   4983 		/* create an under_nce on send_ill */
   4984 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   4985 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
   4986 			under_nce = nce_fastpath_create(send_ill, ncec);
   4987 		else
   4988 			under_nce = NULL;
   4989 		rw_exit(&ipst->ips_ill_g_lock);
   4990 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
   4991 			nce_fastpath_trigger(under_nce);
   4992 
   4993 		ill_refrele(send_ill);
   4994 		if (src_ipif != NULL)
   4995 			ipif_refrele(src_ipif);
   4996 
   4997 		if (under_nce != NULL) {
   4998 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
   4999 			    ALL_ZONES, 0, NULL);
   5000 			nce_refrele(under_nce);
   5001 			if (nprobes > 0)
   5002 				nprobes--;
   5003 			mp = nxt_mp;
   5004 			continue;
   5005 		}
   5006 drop_pkt:
   5007 		if (isv6) {
   5008 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
   5009 		} else {
   5010 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   5011 		}
   5012 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
   5013 		freemsg(mp);
   5014 		if (nprobes > 0)
   5015 			nprobes--;
   5016 		mp = nxt_mp;
   5017 	}
   5018 	ncec_cb_dispatch(ncec); /* complete callbacks */
   5019 }
   5020