Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 1990 Mentat Inc.
     27  */
     28 
     29 /*
     30  * This file contains routines that manipulate Internet Routing Entries (IREs).
     31  */
     32 #include <sys/types.h>
     33 #include <sys/stream.h>
     34 #include <sys/stropts.h>
     35 #include <sys/ddi.h>
     36 #include <sys/cmn_err.h>
     37 
     38 #include <sys/systm.h>
     39 #include <sys/param.h>
     40 #include <sys/socket.h>
     41 #include <net/if.h>
     42 #include <net/route.h>
     43 #include <netinet/in.h>
     44 #include <net/if_dl.h>
     45 #include <netinet/ip6.h>
     46 #include <netinet/icmp6.h>
     47 
     48 #include <inet/common.h>
     49 #include <inet/mi.h>
     50 #include <inet/ip.h>
     51 #include <inet/ip6.h>
     52 #include <inet/ip_ndp.h>
     53 #include <inet/ip_if.h>
     54 #include <inet/ip_ire.h>
     55 #include <inet/ipclassifier.h>
     56 #include <inet/nd.h>
     57 #include <sys/kmem.h>
     58 #include <sys/zone.h>
     59 
     60 #include <sys/tsol/label.h>
     61 #include <sys/tsol/tnet.h>
     62 
     63 #define	IS_DEFAULT_ROUTE_V6(ire)	\
     64 	(((ire)->ire_type & IRE_DEFAULT) || \
     65 	    (((ire)->ire_type & IRE_INTERFACE) && \
     66 	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
     67 
     68 static	ire_t	ire_null;
     69 
     70 static ire_t *
     71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
     72     const in6_addr_t *gateway, int type, const ill_t *ill,
     73     zoneid_t zoneid, const ts_label_t *tsl, int flags,
     74     ip_stack_t *ipst);
     75 
     76 /*
     77  * Initialize the ire that is specific to IPv6 part and call
     78  * ire_init_common to finish it.
     79  * Returns zero or errno.
     80  */
     81 int
     82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
     83     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
     84     zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
     85 {
     86 	int error;
     87 
     88 	/*
     89 	 * Reject IRE security attmakeribute creation/initialization
     90 	 * if system is not running in Trusted mode.
     91 	 */
     92 	if (gc != NULL && !is_system_labeled())
     93 		return (EINVAL);
     94 
     95 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
     96 	if (v6addr != NULL)
     97 		ire->ire_addr_v6 = *v6addr;
     98 	if (v6gateway != NULL)
     99 		ire->ire_gateway_addr_v6 = *v6gateway;
    100 
    101 	/* Make sure we don't have stray values in some fields */
    102 	switch (type) {
    103 	case IRE_LOOPBACK:
    104 	case IRE_HOST:
    105 	case IRE_LOCAL:
    106 	case IRE_IF_CLONE:
    107 		ire->ire_mask_v6 = ipv6_all_ones;
    108 		ire->ire_masklen = IPV6_ABITS;
    109 		break;
    110 	case IRE_PREFIX:
    111 	case IRE_DEFAULT:
    112 	case IRE_IF_RESOLVER:
    113 	case IRE_IF_NORESOLVER:
    114 		if (v6mask != NULL) {
    115 			ire->ire_mask_v6 = *v6mask;
    116 			ire->ire_masklen =
    117 			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
    118 		}
    119 		break;
    120 	case IRE_MULTICAST:
    121 	case IRE_NOROUTE:
    122 		ASSERT(v6mask == NULL);
    123 		break;
    124 	default:
    125 		ASSERT(0);
    126 		return (EINVAL);
    127 	}
    128 
    129 	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
    130 	    gc, ipst);
    131 	if (error != NULL)
    132 		return (error);
    133 
    134 	/* Determine which function pointers to use */
    135 	ire->ire_postfragfn = ip_xmit;		/* Common case */
    136 
    137 	switch (ire->ire_type) {
    138 	case IRE_LOCAL:
    139 		ire->ire_sendfn = ire_send_local_v6;
    140 		ire->ire_recvfn = ire_recv_local_v6;
    141 		ASSERT(ire->ire_ill != NULL);
    142 		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
    143 			ire->ire_recvfn = ire_recv_noaccept_v6;
    144 		break;
    145 	case IRE_LOOPBACK:
    146 		ire->ire_sendfn = ire_send_local_v6;
    147 		ire->ire_recvfn = ire_recv_loopback_v6;
    148 		break;
    149 	case IRE_MULTICAST:
    150 		ire->ire_postfragfn = ip_postfrag_loopcheck;
    151 		ire->ire_sendfn = ire_send_multicast_v6;
    152 		ire->ire_recvfn = ire_recv_multicast_v6;
    153 		break;
    154 	default:
    155 		/*
    156 		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
    157 		 * packets by default.
    158 		 */
    159 		ire->ire_sendfn = ire_send_wire_v6;
    160 		ire->ire_recvfn = ire_recv_forward_v6;
    161 		break;
    162 	}
    163 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    164 		ire->ire_sendfn = ire_send_noroute_v6;
    165 		ire->ire_recvfn = ire_recv_noroute_v6;
    166 	} else if (ire->ire_flags & RTF_MULTIRT) {
    167 		ire->ire_postfragfn = ip_postfrag_multirt_v6;
    168 		ire->ire_sendfn = ire_send_multirt_v6;
    169 		ire->ire_recvfn = ire_recv_multirt_v6;
    170 	}
    171 	ire->ire_nce_capable = ire_determine_nce_capable(ire);
    172 	return (0);
    173 }
    174 
    175 /*
    176  * ire_create_v6 is called to allocate and initialize a new IRE.
    177  *
    178  * NOTE : This is called as writer sometimes though not required
    179  * by this function.
    180  */
    181 /* ARGSUSED */
    182 ire_t *
    183 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
    184     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
    185     uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
    186 {
    187 	ire_t	*ire;
    188 	int	error;
    189 
    190 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
    191 
    192 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
    193 	if (ire == NULL) {
    194 		DTRACE_PROBE(kmem__cache__alloc);
    195 		return (NULL);
    196 	}
    197 	*ire = ire_null;
    198 
    199 	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
    200 	    type, ill, zoneid, flags, gc, ipst);
    201 
    202 	if (error != 0) {
    203 		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
    204 		kmem_cache_free(ire_cache, ire);
    205 		return (NULL);
    206 	}
    207 	return (ire);
    208 }
    209 
    210 /*
    211  * Find the ill matching a multicast group.
    212  * Allows different routes for multicast addresses
    213  * in the unicast routing table (akin to FF::0/8 but could be more specific)
    214  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
    215  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
    216  * specify the interface to join on.
    217  *
    218  * Supports link-local addresses by using ire_route_recursive which follows
    219  * the ill when recursing.
    220  *
    221  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
    222  * and the MULTIRT property can be different for different groups, we
    223  * extract RTF_MULTIRT from the special unicast route added for a group
    224  * with CGTP and pass that back in the multirtp argument.
    225  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
    226  * We have a setsrcp argument for the same reason.
    227  */
    228 ill_t *
    229 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
    230     ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
    231 {
    232 	ire_t	*ire;
    233 	ill_t	*ill;
    234 
    235 	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
    236 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
    237 	ASSERT(ire != NULL);
    238 
    239 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    240 		ire_refrele(ire);
    241 		return (NULL);
    242 	}
    243 
    244 	if (multirtp != NULL)
    245 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
    246 
    247 	ill = ire_nexthop_ill(ire);
    248 	ire_refrele(ire);
    249 	return (ill);
    250 }
    251 
    252 /*
    253  * This function takes a mask and returns number of bits set in the
    254  * mask (the represented prefix length).  Assumes a contiguous mask.
    255  */
    256 int
    257 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
    258 {
    259 	int		bits;
    260 	int		plen = IPV6_ABITS;
    261 	int		i;
    262 
    263 	for (i = 3; i >= 0; i--) {
    264 		if (v6mask->s6_addr32[i] == 0) {
    265 			plen -= 32;
    266 			continue;
    267 		}
    268 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
    269 		if (bits == 0)
    270 			break;
    271 		plen -= bits;
    272 	}
    273 
    274 	return (plen);
    275 }
    276 
    277 /*
    278  * Convert a prefix length to the mask for that prefix.
    279  * Returns the argument bitmask.
    280  */
    281 in6_addr_t *
    282 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
    283 {
    284 	uint32_t *ptr;
    285 
    286 	if (plen < 0 || plen > IPV6_ABITS)
    287 		return (NULL);
    288 	*bitmask = ipv6_all_zeros;
    289 	if (plen == 0)
    290 		return (bitmask);
    291 
    292 	ptr = (uint32_t *)bitmask;
    293 	while (plen > 32) {
    294 		*ptr++ = 0xffffffffU;
    295 		plen -= 32;
    296 	}
    297 	*ptr = htonl(0xffffffffU << (32 - plen));
    298 	return (bitmask);
    299 }
    300 
    301 /*
    302  * Add a fully initialized IPv6 IRE to the forwarding table.
    303  * This returns NULL on failure, or a held IRE on success.
    304  * Normally the returned IRE is the same as the argument. But a different
    305  * IRE will be returned if the added IRE is deemed identical to an existing
    306  * one. In that case ire_identical_ref will be increased.
    307  * The caller always needs to do an ire_refrele() on the returned IRE.
    308  */
    309 ire_t *
    310 ire_add_v6(ire_t *ire)
    311 {
    312 	ire_t	*ire1;
    313 	int	mask_table_index;
    314 	irb_t	*irb_ptr;
    315 	ire_t	**irep;
    316 	int	match_flags;
    317 	int	error;
    318 	ip_stack_t	*ipst = ire->ire_ipst;
    319 
    320 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    321 
    322 	/* Make sure the address is properly masked. */
    323 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
    324 
    325 	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
    326 	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
    327 		irb_t *ptr;
    328 		int i;
    329 
    330 		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
    331 		    sizeof (irb_t)));
    332 		if (ptr == NULL) {
    333 			ire_delete(ire);
    334 			return (NULL);
    335 		}
    336 		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
    337 			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
    338 			ptr[i].irb_ipst = ipst;
    339 		}
    340 		mutex_enter(&ipst->ips_ire_ft_init_lock);
    341 		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
    342 		    NULL) {
    343 			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
    344 			    ptr;
    345 			mutex_exit(&ipst->ips_ire_ft_init_lock);
    346 		} else {
    347 			/*
    348 			 * Some other thread won the race in
    349 			 * initializing the forwarding table at the
    350 			 * same index.
    351 			 */
    352 			mutex_exit(&ipst->ips_ire_ft_init_lock);
    353 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
    354 				rw_destroy(&ptr[i].irb_lock);
    355 			}
    356 			mi_free(ptr);
    357 		}
    358 	}
    359 	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
    360 	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
    361 	    ipst->ips_ip6_ftable_hash_size)]);
    362 
    363 	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
    364 	if (ire->ire_ill != NULL)
    365 		match_flags |= MATCH_IRE_ILL;
    366 	/*
    367 	 * Start the atomic add of the ire. Grab the bucket lock and the
    368 	 * ill lock. Check for condemned.
    369 	 */
    370 	error = ire_atomic_start(irb_ptr, ire);
    371 	if (error != 0) {
    372 		ire_delete(ire);
    373 		return (NULL);
    374 	}
    375 
    376 	/*
    377 	 * If we are creating a hidden IRE, make sure we search for
    378 	 * hidden IREs when searching for duplicates below.
    379 	 * Otherwise, we might find an IRE on some other interface
    380 	 * that's not marked hidden.
    381 	 */
    382 	if (ire->ire_testhidden)
    383 		match_flags |= MATCH_IRE_TESTHIDDEN;
    384 
    385 	/*
    386 	 * Atomically check for duplicate and insert in the table.
    387 	 */
    388 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
    389 		if (IRE_IS_CONDEMNED(ire1))
    390 			continue;
    391 		/*
    392 		 * Here we need an exact match on zoneid, i.e.,
    393 		 * ire_match_args doesn't fit.
    394 		 */
    395 		if (ire1->ire_zoneid != ire->ire_zoneid)
    396 			continue;
    397 
    398 		if (ire1->ire_type != ire->ire_type)
    399 			continue;
    400 
    401 		/*
    402 		 * Note: We do not allow multiple routes that differ only
    403 		 * in the gateway security attributes; such routes are
    404 		 * considered duplicates.
    405 		 * To change that we explicitly have to treat them as
    406 		 * different here.
    407 		 */
    408 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
    409 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
    410 		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
    411 		    match_flags)) {
    412 			/*
    413 			 * Return the old ire after doing a REFHOLD.
    414 			 * As most of the callers continue to use the IRE
    415 			 * after adding, we return a held ire. This will
    416 			 * avoid a lookup in the caller again. If the callers
    417 			 * don't want to use it, they need to do a REFRELE.
    418 			 */
    419 			ip1dbg(("found dup ire existing %p new %p",
    420 			    (void *)ire1, (void *)ire));
    421 			ire_refhold(ire1);
    422 			atomic_add_32(&ire1->ire_identical_ref, 1);
    423 			ire_atomic_end(irb_ptr, ire);
    424 			ire_delete(ire);
    425 			return (ire1);
    426 		}
    427 	}
    428 
    429 	/*
    430 	 * Normally we do head insertion since most things do not care about
    431 	 * the order of the IREs in the bucket.
    432 	 * However, due to shared-IP zones (and restrict_interzone_loopback)
    433 	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
    434 	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
    435 	 */
    436 	irep = (ire_t **)irb_ptr;
    437 	if (ire->ire_type & IRE_IF_CLONE) {
    438 		while ((ire1 = *irep) != NULL)
    439 			irep = &ire1->ire_next;
    440 	}
    441 	/* Insert at *irep */
    442 	ire1 = *irep;
    443 	if (ire1 != NULL)
    444 		ire1->ire_ptpn = &ire->ire_next;
    445 	ire->ire_next = ire1;
    446 	/* Link the new one in. */
    447 	ire->ire_ptpn = irep;
    448 	/*
    449 	 * ire_walk routines de-reference ire_next without holding
    450 	 * a lock. Before we point to the new ire, we want to make
    451 	 * sure the store that sets the ire_next of the new ire
    452 	 * reaches global visibility, so that ire_walk routines
    453 	 * don't see a truncated list of ires i.e if the ire_next
    454 	 * of the new ire gets set after we do "*irep = ire" due
    455 	 * to re-ordering, the ire_walk thread will see a NULL
    456 	 * once it accesses the ire_next of the new ire.
    457 	 * membar_producer() makes sure that the following store
    458 	 * happens *after* all of the above stores.
    459 	 */
    460 	membar_producer();
    461 	*irep = ire;
    462 	ire->ire_bucket = irb_ptr;
    463 	/*
    464 	 * We return a bumped up IRE above. Keep it symmetrical
    465 	 * so that the callers will always have to release. This
    466 	 * helps the callers of this function because they continue
    467 	 * to use the IRE after adding and hence they don't have to
    468 	 * lookup again after we return the IRE.
    469 	 *
    470 	 * NOTE : We don't have to use atomics as this is appearing
    471 	 * in the list for the first time and no one else can bump
    472 	 * up the reference count on this yet.
    473 	 */
    474 	ire_refhold_locked(ire);
    475 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
    476 	irb_ptr->irb_ire_cnt++;
    477 
    478 	if (ire->ire_ill != NULL) {
    479 		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
    480 		    (char *), "ire", (void *), ire);
    481 		ire->ire_ill->ill_ire_cnt++;
    482 		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
    483 	}
    484 	ire_atomic_end(irb_ptr, ire);
    485 
    486 	/* Make any caching of the IREs be notified or updated */
    487 	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
    488 
    489 	return (ire);
    490 }
    491 
    492 /*
    493  * Search for all HOST REDIRECT routes that are
    494  * pointing at the specified gateway and
    495  * delete them. This routine is called only
    496  * when a default gateway is going away.
    497  */
    498 static void
    499 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
    500 {
    501 	irb_t *irb_ptr;
    502 	irb_t *irb;
    503 	ire_t *ire;
    504 	in6_addr_t gw_addr_v6;
    505 	int i;
    506 
    507 	/* get the hash table for HOST routes */
    508 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
    509 	if (irb_ptr == NULL)
    510 		return;
    511 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
    512 		irb = &irb_ptr[i];
    513 		irb_refhold(irb);
    514 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
    515 			if (!(ire->ire_flags & RTF_DYNAMIC))
    516 				continue;
    517 			mutex_enter(&ire->ire_lock);
    518 			gw_addr_v6 = ire->ire_gateway_addr_v6;
    519 			mutex_exit(&ire->ire_lock);
    520 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
    521 				ire_delete(ire);
    522 		}
    523 		irb_refrele(irb);
    524 	}
    525 }
    526 
    527 /*
    528  * Delete the specified IRE.
    529  * All calls should use ire_delete().
    530  * Sometimes called as writer though not required by this function.
    531  *
    532  * NOTE : This function is called only if the ire was added
    533  * in the list.
    534  */
    535 void
    536 ire_delete_v6(ire_t *ire)
    537 {
    538 	in6_addr_t gw_addr_v6;
    539 	ip_stack_t	*ipst = ire->ire_ipst;
    540 
    541 	/*
    542 	 * Make sure ire_generation increases from ire_flush_cache happen
    543 	 * after any lookup/reader has read ire_generation.
    544 	 * Since the rw_enter makes us wait until any lookup/reader has
    545 	 * completed we can exit the lock immediately.
    546 	 */
    547 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
    548 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    549 
    550 	ASSERT(ire->ire_refcnt >= 1);
    551 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    552 
    553 	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
    554 
    555 	if (ire->ire_type == IRE_DEFAULT) {
    556 		/*
    557 		 * when a default gateway is going away
    558 		 * delete all the host redirects pointing at that
    559 		 * gateway.
    560 		 */
    561 		mutex_enter(&ire->ire_lock);
    562 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    563 		mutex_exit(&ire->ire_lock);
    564 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
    565 	}
    566 
    567 	/*
    568 	 * If we are deleting an IRE_INTERFACE then we make sure we also
    569 	 * delete any IRE_IF_CLONE that has been created from it.
    570 	 * Those are always in ire_dep_children.
    571 	 */
    572 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
    573 		ire_dep_delete_if_clone(ire);
    574 
    575 	/* Remove from parent dependencies and child */
    576 	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
    577 	if (ire->ire_dep_parent != NULL) {
    578 		ire_dep_remove(ire);
    579 	}
    580 	while (ire->ire_dep_children != NULL)
    581 		ire_dep_remove(ire->ire_dep_children);
    582 	rw_exit(&ipst->ips_ire_dep_lock);
    583 }
    584 
    585 /*
    586  * When an IRE is added or deleted this routine is called to make sure
    587  * any caching of IRE information is notified or updated.
    588  *
    589  * The flag argument indicates if the flush request is due to addition
    590  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
    591  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
    592  */
    593 void
    594 ire_flush_cache_v6(ire_t *ire, int flag)
    595 {
    596 	ip_stack_t *ipst = ire->ire_ipst;
    597 
    598 	/*
    599 	 * IRE_IF_CLONE ire's don't provide any new information
    600 	 * than the parent from which they are cloned, so don't
    601 	 * perturb the generation numbers.
    602 	 */
    603 	if (ire->ire_type & IRE_IF_CLONE)
    604 		return;
    605 
    606 	/*
    607 	 * Ensure that an ire_add during a lookup serializes the updates of
    608 	 * the generation numbers under ire_head_lock so that the lookup gets
    609 	 * either the old ire and old generation number, or a new ire and new
    610 	 * generation number.
    611 	 */
    612 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
    613 
    614 	/*
    615 	 * If a route was just added, we need to notify everybody that
    616 	 * has cached an IRE_NOROUTE since there might now be a better
    617 	 * route for them.
    618 	 */
    619 	if (flag == IRE_FLUSH_ADD) {
    620 		ire_increment_generation(ipst->ips_ire_reject_v6);
    621 		ire_increment_generation(ipst->ips_ire_blackhole_v6);
    622 	}
    623 
    624 	/* Adding a default can't otherwise provide a better route */
    625 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
    626 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    627 		return;
    628 	}
    629 
    630 	switch (flag) {
    631 	case IRE_FLUSH_DELETE:
    632 	case IRE_FLUSH_GWCHANGE:
    633 		/*
    634 		 * Update ire_generation for all ire_dep_children chains
    635 		 * starting with this IRE
    636 		 */
    637 		ire_dep_incr_generation(ire);
    638 		break;
    639 	case IRE_FLUSH_ADD: {
    640 		in6_addr_t	addr;
    641 		in6_addr_t	mask;
    642 		ip_stack_t	*ipst = ire->ire_ipst;
    643 		uint_t		masklen;
    644 
    645 		/*
    646 		 * Find an IRE which is a shorter match than the ire to be added
    647 		 * For any such IRE (which we repeat) we update the
    648 		 * ire_generation the same way as in the delete case.
    649 		 */
    650 		addr = ire->ire_addr_v6;
    651 		mask = ire->ire_mask_v6;
    652 		masklen = ip_mask_to_plen_v6(&mask);
    653 
    654 		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
    655 		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
    656 		while (ire != NULL) {
    657 			/* We need to handle all in the same bucket */
    658 			irb_increment_generation(ire->ire_bucket);
    659 
    660 			mask = ire->ire_mask_v6;
    661 			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
    662 			masklen = ip_mask_to_plen_v6(&mask);
    663 			ire_refrele(ire);
    664 			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
    665 			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
    666 		}
    667 		}
    668 		break;
    669 	}
    670 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    671 }
    672 
    673 /*
    674  * Matches the arguments passed with the values in the ire.
    675  *
    676  * Note: for match types that match using "ill" passed in, ill
    677  * must be checked for non-NULL before calling this routine.
    678  */
    679 boolean_t
    680 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
    681     const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
    682     const ts_label_t *tsl, int match_flags)
    683 {
    684 	in6_addr_t masked_addr;
    685 	in6_addr_t gw_addr_v6;
    686 	ill_t *ire_ill = NULL, *dst_ill;
    687 	ip_stack_t *ipst = ire->ire_ipst;
    688 
    689 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    690 	ASSERT(addr != NULL);
    691 	ASSERT(mask != NULL);
    692 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
    693 	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
    694 	    (ill != NULL && ill->ill_isv6));
    695 
    696 	/*
    697 	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
    698 	 * is in fact hidden, to ensure the caller gets the right one.
    699 	 */
    700 	if (ire->ire_testhidden) {
    701 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
    702 			return (B_FALSE);
    703 	}
    704 
    705 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
    706 	    ire->ire_zoneid != ALL_ZONES) {
    707 		/*
    708 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
    709 		 * does not match that of ire_zoneid, a failure to
    710 		 * match is reported at this point. Otherwise, since some IREs
    711 		 * that are available in the global zone can be used in local
    712 		 * zones, additional checks need to be performed:
    713 		 *
    714 		 * IRE_LOOPBACK
    715 		 *	entries should never be matched in this situation.
    716 		 *	Each zone has its own IRE_LOOPBACK.
    717 		 *
    718 		 * IRE_LOCAL
    719 		 *	We allow them for any zoneid. ire_route_recursive
    720 		 *	does additional checks when
    721 		 *	ip_restrict_interzone_loopback is set.
    722 		 *
    723 		 * If ill_usesrc_ifindex is set
    724 		 *	Then we check if the zone has a valid source address
    725 		 *	on the usesrc ill.
    726 		 *
    727 		 * If ire_ill is set, then check that the zone has an ipif
    728 		 *	on that ill.
    729 		 *
    730 		 * Outside of this function (in ire_round_robin) we check
    731 		 * that any IRE_OFFLINK has a gateway that reachable from the
    732 		 * zone when we have multiple choices (ECMP).
    733 		 */
    734 		if (match_flags & MATCH_IRE_ZONEONLY)
    735 			return (B_FALSE);
    736 		if (ire->ire_type & IRE_LOOPBACK)
    737 			return (B_FALSE);
    738 
    739 		if (ire->ire_type & IRE_LOCAL)
    740 			goto matchit;
    741 
    742 		/*
    743 		 * The normal case of IRE_ONLINK has a matching zoneid.
    744 		 * Here we handle the case when shared-IP zones have been
    745 		 * configured with IP addresses on vniN. In that case it
    746 		 * is ok for traffic from a zone to use IRE_ONLINK routes
    747 		 * if the ill has a usesrc pointing at vniN
    748 		 * Applies to IRE_INTERFACE.
    749 		 */
    750 		dst_ill = ire->ire_ill;
    751 		if (ire->ire_type & IRE_ONLINK) {
    752 			uint_t	ifindex;
    753 
    754 			/*
    755 			 * Note there is no IRE_INTERFACE on vniN thus
    756 			 * can't do an IRE lookup for a matching route.
    757 			 */
    758 			ifindex = dst_ill->ill_usesrc_ifindex;
    759 			if (ifindex == 0)
    760 				return (B_FALSE);
    761 
    762 			/*
    763 			 * If there is a usable source address in the
    764 			 * zone, then it's ok to return this IRE_INTERFACE
    765 			 */
    766 			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
    767 			    zoneid, ipst)) {
    768 				ip3dbg(("ire_match_args: no usrsrc for zone"
    769 				    " dst_ill %p\n", (void *)dst_ill));
    770 				return (B_FALSE);
    771 			}
    772 		}
    773 		/*
    774 		 * For exampe, with
    775 		 * route add 11.0.0.0 gw1 -ifp bge0
    776 		 * route add 11.0.0.0 gw2 -ifp bge1
    777 		 * this code would differentiate based on
    778 		 * where the sending zone has addresses.
    779 		 * Only if the zone has an address on bge0 can it use the first
    780 		 * route. It isn't clear if this behavior is documented
    781 		 * anywhere.
    782 		 */
    783 		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
    784 			ipif_t	*tipif;
    785 
    786 			mutex_enter(&dst_ill->ill_lock);
    787 			for (tipif = dst_ill->ill_ipif;
    788 			    tipif != NULL; tipif = tipif->ipif_next) {
    789 				if (!IPIF_IS_CONDEMNED(tipif) &&
    790 				    (tipif->ipif_flags & IPIF_UP) &&
    791 				    (tipif->ipif_zoneid == zoneid ||
    792 				    tipif->ipif_zoneid == ALL_ZONES))
    793 					break;
    794 			}
    795 			mutex_exit(&dst_ill->ill_lock);
    796 			if (tipif == NULL)
    797 				return (B_FALSE);
    798 		}
    799 	}
    800 
    801 matchit:
    802 	if (match_flags & MATCH_IRE_GW) {
    803 		mutex_enter(&ire->ire_lock);
    804 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    805 		mutex_exit(&ire->ire_lock);
    806 	}
    807 	if (match_flags & MATCH_IRE_ILL) {
    808 		ire_ill = ire->ire_ill;
    809 
    810 		/*
    811 		 * If asked to match an ill, we *must* match
    812 		 * on the ire_ill for ipmp test addresses, or
    813 		 * any of the ill in the group for data addresses.
    814 		 * If we don't, we may as well fail.
    815 		 * However, we need an exception for IRE_LOCALs to ensure
    816 		 * we loopback packets even sent to test addresses on different
    817 		 * interfaces in the group.
    818 		 */
    819 		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
    820 		    !(ire->ire_type & IRE_LOCAL)) {
    821 			if (ire->ire_ill != ill)
    822 				return (B_FALSE);
    823 		} else  {
    824 			match_flags &= ~MATCH_IRE_TESTHIDDEN;
    825 			/*
    826 			 * We know that ill is not NULL, but ire_ill could be
    827 			 * NULL
    828 			 */
    829 			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
    830 				return (B_FALSE);
    831 		}
    832 	}
    833 	/* No ire_addr_v6 bits set past the mask */
    834 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
    835 	    ire->ire_addr_v6));
    836 	V6_MASK_COPY(*addr, *mask, masked_addr);
    837 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
    838 	    ((!(match_flags & MATCH_IRE_GW)) ||
    839 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
    840 	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
    841 	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
    842 	    ((!(match_flags & MATCH_IRE_MASK)) ||
    843 	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
    844 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
    845 	    (!is_system_labeled()) ||
    846 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
    847 		/* We found the matched IRE */
    848 		return (B_TRUE);
    849 	}
    850 	return (B_FALSE);
    851 }
    852 
    853 /*
    854  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
    855  * gateway address. If ill is non-NULL we also match on it.
    856  * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
    857  */
    858 boolean_t
    859 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
    860     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
    861 {
    862 	ire_t	*ire;
    863 	uint_t	match_flags;
    864 
    865 	if (lock_held)
    866 		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
    867 	else
    868 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    869 
    870 	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
    871 	if (ill != NULL)
    872 		match_flags |= MATCH_IRE_ILL;
    873 
    874 	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
    875 	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
    876 	    ipst);
    877 
    878 	if (!lock_held)
    879 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    880 	if (ire != NULL) {
    881 		ire_refrele(ire);
    882 		return (B_TRUE);
    883 	} else {
    884 		return (B_FALSE);
    885 	}
    886 }
    887 
    888 /*
    889  * Lookup a route in forwarding table.
    890  * specific lookup is indicated by passing the
    891  * required parameters and indicating the
    892  * match required in flag field.
    893  *
    894  * Supports link-local addresses by following the ipif/ill when recursing.
    895  */
    896 ire_t *
    897 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    898     const in6_addr_t *gateway, int type, const ill_t *ill,
    899     zoneid_t zoneid, const ts_label_t *tsl, int flags,
    900     uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
    901 {
    902 	ire_t *ire = NULL;
    903 
    904 	ASSERT(addr != NULL);
    905 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
    906 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
    907 	ASSERT(ill == NULL || ill->ill_isv6);
    908 
    909 	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
    910 
    911 	/*
    912 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
    913 	 * is set.
    914 	 */
    915 	if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
    916 		return (NULL);
    917 
    918 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    919 	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
    920 	    tsl, flags, ipst);
    921 	if (ire == NULL) {
    922 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    923 		return (NULL);
    924 	}
    925 
    926 	/*
    927 	 * round-robin only if we have more than one route in the bucket.
    928 	 * ips_ip_ecmp_behavior controls when we do ECMP
    929 	 *	2:	always
    930 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    931 	 *	0:	never
    932 	 *
    933 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
    934 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
    935 	 * and the IRE_INTERFACESs are likely to be shorter matches.
    936 	 */
    937 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
    938 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    939 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    940 		    IS_DEFAULT_ROUTE_V6(ire))) {
    941 			ire_t	*next_ire;
    942 			ire_ftable_args_t margs;
    943 
    944 			bzero(&margs, sizeof (margs));
    945 			margs.ift_addr_v6 = *addr;
    946 			if (mask != NULL)
    947 				margs.ift_mask_v6 = *mask;
    948 			if (gateway != NULL)
    949 				margs.ift_gateway_v6 = *gateway;
    950 			margs.ift_type = type;
    951 			margs.ift_ill = ill;
    952 			margs.ift_zoneid = zoneid;
    953 			margs.ift_tsl = tsl;
    954 			margs.ift_flags = flags;
    955 
    956 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    957 			    xmit_hint, ire, ipst);
    958 			if (next_ire == NULL) {
    959 				/* keep ire if next_ire is null */
    960 				goto done;
    961 			}
    962 			ire_refrele(ire);
    963 			ire = next_ire;
    964 		}
    965 	}
    966 
    967 done:
    968 	/* Return generation before dropping lock */
    969 	if (generationp != NULL)
    970 		*generationp = ire->ire_generation;
    971 
    972 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    973 
    974 	/*
    975 	 * For shared-IP zones we need additional checks to what was
    976 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
    977 	 *
    978 	 * When ip_restrict_interzone_loopback is set, then
    979 	 * we ensure that IRE_LOCAL are only used for loopback
    980 	 * between zones when the logical "Ethernet" would
    981 	 * have looped them back. That is, if in the absense of
    982 	 * the IRE_LOCAL we would have sent to packet out the
    983 	 * same ill.
    984 	 */
    985 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
    986 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
    987 	    ipst->ips_ip_restrict_interzone_loopback) {
    988 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
    989 		ASSERT(ire != NULL);
    990 	}
    991 
    992 	return (ire);
    993 }
    994 
    995 /*
    996  * Look up a single ire. The caller holds either the read or write lock.
    997  */
    998 ire_t *
    999 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
   1000     const in6_addr_t *gateway, int type, const ill_t *ill,
   1001     zoneid_t zoneid, const ts_label_t *tsl, int flags,
   1002     ip_stack_t *ipst)
   1003 {
   1004 	irb_t *irb_ptr;
   1005 	ire_t *ire = NULL;
   1006 	int i;
   1007 
   1008 	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
   1009 
   1010 	/*
   1011 	 * If the mask is known, the lookup
   1012 	 * is simple, if the mask is not known
   1013 	 * we need to search.
   1014 	 */
   1015 	if (flags & MATCH_IRE_MASK) {
   1016 		uint_t masklen;
   1017 
   1018 		masklen = ip_mask_to_plen_v6(mask);
   1019 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
   1020 			return (NULL);
   1021 		}
   1022 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
   1023 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
   1024 		    ipst->ips_ip6_ftable_hash_size)]);
   1025 		rw_enter(&irb_ptr->irb_lock, RW_READER);
   1026 		for (ire = irb_ptr->irb_ire; ire != NULL;
   1027 		    ire = ire->ire_next) {
   1028 			if (IRE_IS_CONDEMNED(ire))
   1029 				continue;
   1030 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
   1031 			    ill, zoneid, tsl, flags))
   1032 				goto found_ire;
   1033 		}
   1034 		rw_exit(&irb_ptr->irb_lock);
   1035 	} else {
   1036 		uint_t masklen;
   1037 
   1038 		/*
   1039 		 * In this case we don't know the mask, we need to
   1040 		 * search the table assuming different mask sizes.
   1041 		 */
   1042 		if (flags & MATCH_IRE_SHORTERMASK) {
   1043 			masklen = ip_mask_to_plen_v6(mask);
   1044 			if (masklen == 0) {
   1045 				/* Nothing shorter than zero */
   1046 				return (NULL);
   1047 			}
   1048 			masklen--;
   1049 		} else {
   1050 			masklen = IP6_MASK_TABLE_SIZE - 1;
   1051 		}
   1052 
   1053 		for (i = masklen; i >= 0; i--) {
   1054 			in6_addr_t tmpmask;
   1055 
   1056 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
   1057 				continue;
   1058 			(void) ip_plen_to_mask_v6(i, &tmpmask);
   1059 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
   1060 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
   1061 			    ipst->ips_ip6_ftable_hash_size)];
   1062 			rw_enter(&irb_ptr->irb_lock, RW_READER);
   1063 			for (ire = irb_ptr->irb_ire; ire != NULL;
   1064 			    ire = ire->ire_next) {
   1065 				if (IRE_IS_CONDEMNED(ire))
   1066 					continue;
   1067 				if (ire_match_args_v6(ire, addr,
   1068 				    &ire->ire_mask_v6, gateway, type, ill,
   1069 				    zoneid, tsl, flags))
   1070 					goto found_ire;
   1071 			}
   1072 			rw_exit(&irb_ptr->irb_lock);
   1073 		}
   1074 	}
   1075 	ASSERT(ire == NULL);
   1076 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
   1077 	return (NULL);
   1078 
   1079 found_ire:
   1080 	ire_refhold(ire);
   1081 	rw_exit(&irb_ptr->irb_lock);
   1082 	return (ire);
   1083 }
   1084 
   1085 
   1086 /*
   1087  * This function is called by
   1088  * ip_input/ire_route_recursive when doing a route lookup on only the
   1089  * destination address.
   1090  *
   1091  * The optimizations of this function over ire_ftable_lookup are:
   1092  *	o removing unnecessary flag matching
   1093  *	o doing longest prefix match instead of overloading it further
   1094  *	  with the unnecessary "best_prefix_match"
   1095  *
   1096  * If no route is found we return IRE_NOROUTE.
   1097  */
   1098 ire_t *
   1099 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
   1100     ip_stack_t *ipst, uint_t *generationp)
   1101 {
   1102 	ire_t	*ire;
   1103 
   1104 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
   1105 	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
   1106 	if (ire == NULL) {
   1107 		ire = ire_reject(ipst, B_TRUE);
   1108 		if (generationp != NULL)
   1109 			*generationp = IRE_GENERATION_VERIFY;
   1110 	}
   1111 	/* ftable_lookup did round robin */
   1112 	return (ire);
   1113 }
   1114 
   1115 ire_t *
   1116 ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
   1117     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
   1118 {
   1119 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
   1120 
   1121 	return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
   1122 	    multirtp));
   1123 }
   1124 
   1125 /*
   1126  * Recursively look for a route to the destination. Can also match on
   1127  * the zoneid, ill, and label. Used for the data paths. See also
   1128  * ire_route_recursive_dstonly.
   1129  *
   1130  * If ill is set this means we will match it by adding MATCH_IRE_ILL.
   1131  *
   1132  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
   1133  * create an IRE_IF_CLONE. This is used on the receive side when we are not
   1134  * forwarding.
   1135  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
   1136  * resolve the gateway.
   1137  *
   1138  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1139  * instead.
   1140  *
   1141  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1142  * is an error.
   1143  * Allow at most one RTF_INDIRECT.
   1144  */
   1145 ire_t *
   1146 ire_route_recursive_impl_v6(ire_t *ire,
   1147     const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
   1148     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1149     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
   1150     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1151 {
   1152 	int		i, j;
   1153 	in6_addr_t	v6nexthop = *nexthop;
   1154 	ire_t		*ires[MAX_IRE_RECURSION];
   1155 	uint_t		generation;
   1156 	uint_t		generations[MAX_IRE_RECURSION];
   1157 	boolean_t	need_refrele = B_FALSE;
   1158 	boolean_t	invalidate = B_FALSE;
   1159 	int		prefs[MAX_IRE_RECURSION];
   1160 	ill_t		*ill = NULL;
   1161 
   1162 	if (setsrcp != NULL)
   1163 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
   1164 	if (gwattrp != NULL)
   1165 		ASSERT(*gwattrp == NULL);
   1166 
   1167 	if (ill_arg != NULL)
   1168 		match_args |= MATCH_IRE_ILL;
   1169 
   1170 	/*
   1171 	 * We iterate up to three times to resolve a route, even though
   1172 	 * we have four slots in the array. The extra slot is for an
   1173 	 * IRE_IF_CLONE we might need to create.
   1174 	 */
   1175 	i = 0;
   1176 	while (i < MAX_IRE_RECURSION - 1) {
   1177 		/* ire_ftable_lookup handles round-robin/ECMP */
   1178 		if (ire == NULL) {
   1179 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
   1180 			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
   1181 			    match_args, xmit_hint, ipst, &generation);
   1182 		} else {
   1183 			/* Caller passed it; extra hold since we will rele */
   1184 			ire_refhold(ire);
   1185 			if (generationp != NULL)
   1186 				generation = *generationp;
   1187 			else
   1188 				generation = IRE_GENERATION_VERIFY;
   1189 		}
   1190 
   1191 		if (ire == NULL)
   1192 			ire = ire_reject(ipst, B_TRUE);
   1193 
   1194 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
   1195 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   1196 			goto error;
   1197 
   1198 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
   1199 
   1200 		if (i != 0) {
   1201 			prefs[i] = ire_pref(ire);
   1202 			/*
   1203 			 * Don't allow anything unusual past the first
   1204 			 * iteration.
   1205 			 */
   1206 			if ((ire->ire_type &
   1207 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
   1208 			    prefs[i] <= prefs[i-1]) {
   1209 				ire_refrele(ire);
   1210 				if (irr_flags & IRR_INCOMPLETE) {
   1211 					ire = ires[0];
   1212 					ire_refhold(ire);
   1213 				} else {
   1214 					ire = ire_reject(ipst, B_TRUE);
   1215 				}
   1216 				goto error;
   1217 			}
   1218 		}
   1219 		/* We have a usable IRE */
   1220 		ires[i] = ire;
   1221 		generations[i] = generation;
   1222 		i++;
   1223 
   1224 		/* The first RTF_SETSRC address is passed back if setsrcp */
   1225 		if ((ire->ire_flags & RTF_SETSRC) &&
   1226 		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
   1227 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
   1228 			    &ire->ire_setsrc_addr_v6));
   1229 			*setsrcp = ire->ire_setsrc_addr_v6;
   1230 		}
   1231 
   1232 		/* The first ire_gw_secattr is passed back if gwattrp */
   1233 		if (ire->ire_gw_secattr != NULL &&
   1234 		    gwattrp != NULL && *gwattrp == NULL)
   1235 			*gwattrp = ire->ire_gw_secattr;
   1236 
   1237 		/*
   1238 		 * Check if we have a short-cut pointer to an IRE for this
   1239 		 * destination, and that the cached dependency isn't stale.
   1240 		 * In that case we've rejoined an existing tree towards a
   1241 		 * parent, thus we don't need to continue the loop to
   1242 		 * discover the rest of the tree.
   1243 		 */
   1244 		mutex_enter(&ire->ire_lock);
   1245 		if (ire->ire_dep_parent != NULL &&
   1246 		    ire->ire_dep_parent->ire_generation ==
   1247 		    ire->ire_dep_parent_generation) {
   1248 			mutex_exit(&ire->ire_lock);
   1249 			ire = NULL;
   1250 			goto done;
   1251 		}
   1252 		mutex_exit(&ire->ire_lock);
   1253 
   1254 		/*
   1255 		 * If this type should have an ire_nce_cache (even if it
   1256 		 * doesn't yet have one) then we are done. Includes
   1257 		 * IRE_INTERFACE with a full 128 bit mask.
   1258 		 */
   1259 		if (ire->ire_nce_capable) {
   1260 			ire = NULL;
   1261 			goto done;
   1262 		}
   1263 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
   1264 		/*
   1265 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
   1266 		 * particular destination
   1267 		 */
   1268 		if (ire->ire_type & IRE_INTERFACE) {
   1269 			ire_t		*clone;
   1270 
   1271 			ASSERT(ire->ire_masklen != IPV6_ABITS);
   1272 
   1273 			/*
   1274 			 * In the case of ip_input and ILLF_FORWARDING not
   1275 			 * being set, and in the case of RTM_GET, there is
   1276 			 * no point in allocating an IRE_IF_CLONE. We return
   1277 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
   1278 			 * result in a ire_dep_parent which is IRE_IF_*
   1279 			 * without an IRE_IF_CLONE.
   1280 			 * We recover from that when we need to send packets
   1281 			 * by ensuring that the generations become
   1282 			 * IRE_GENERATION_VERIFY in this case.
   1283 			 */
   1284 			if (!(irr_flags & IRR_ALLOCATE)) {
   1285 				invalidate = B_TRUE;
   1286 				ire = NULL;
   1287 				goto done;
   1288 			}
   1289 
   1290 			clone = ire_create_if_clone(ire, &v6nexthop,
   1291 			    &generation);
   1292 			if (clone == NULL) {
   1293 				/*
   1294 				 * Temporary failure - no memory.
   1295 				 * Don't want caller to cache IRE_NOROUTE.
   1296 				 */
   1297 				invalidate = B_TRUE;
   1298 				ire = ire_blackhole(ipst, B_TRUE);
   1299 				goto error;
   1300 			}
   1301 			/*
   1302 			 * Make clone next to last entry and the
   1303 			 * IRE_INTERFACE the last in the dependency
   1304 			 * chain since the clone depends on the
   1305 			 * IRE_INTERFACE.
   1306 			 */
   1307 			ASSERT(i >= 1);
   1308 			ASSERT(i < MAX_IRE_RECURSION);
   1309 
   1310 			ires[i] = ires[i-1];
   1311 			generations[i] = generations[i-1];
   1312 			ires[i-1] = clone;
   1313 			generations[i-1] = generation;
   1314 			i++;
   1315 
   1316 			ire = NULL;
   1317 			goto done;
   1318 		}
   1319 
   1320 		/*
   1321 		 * We only match on the type and optionally ILL when
   1322 		 * recursing. The type match is used by some callers
   1323 		 * to exclude certain types (such as IRE_IF_CLONE or
   1324 		 * IRE_LOCAL|IRE_LOOPBACK).
   1325 		 */
   1326 		match_args &= MATCH_IRE_TYPE;
   1327 		v6nexthop = ire->ire_gateway_addr_v6;
   1328 		if (ill == NULL && ire->ire_ill != NULL) {
   1329 			ill = ire->ire_ill;
   1330 			need_refrele = B_TRUE;
   1331 			ill_refhold(ill);
   1332 			match_args |= MATCH_IRE_ILL;
   1333 		}
   1334 		/*
   1335 		 * We set the prefs[i] value above if i > 0. We've already
   1336 		 * done i++ so i is one in the case of the first time around.
   1337 		 */
   1338 		if (i == 1)
   1339 			prefs[0] = ire_pref(ire);
   1340 		ire = NULL;
   1341 	}
   1342 	ASSERT(ire == NULL);
   1343 	ire = ire_reject(ipst, B_TRUE);
   1344 
   1345 error:
   1346 	ASSERT(ire != NULL);
   1347 	if (need_refrele)
   1348 		ill_refrele(ill);
   1349 
   1350 	/*
   1351 	 * In the case of MULTIRT we want to try a different IRE the next
   1352 	 * time. We let the next packet retry in that case.
   1353 	 */
   1354 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
   1355 		(void) ire_no_good(ires[0]);
   1356 
   1357 cleanup:
   1358 	/* cleanup ires[i] */
   1359 	ire_dep_unbuild(ires, i);
   1360 	for (j = 0; j < i; j++)
   1361 		ire_refrele(ires[j]);
   1362 
   1363 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   1364 	    (irr_flags & IRR_INCOMPLETE));
   1365 	/*
   1366 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
   1367 	 * ip_select_route since the reject or lack of memory might be gone.
   1368 	 */
   1369 	if (generationp != NULL)
   1370 		*generationp = IRE_GENERATION_VERIFY;
   1371 	return (ire);
   1372 
   1373 done:
   1374 	ASSERT(ire == NULL);
   1375 	if (need_refrele)
   1376 		ill_refrele(ill);
   1377 
   1378 	/* Build dependencies */
   1379 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
   1380 		/* Something in chain was condemned; tear it apart */
   1381 		ire = ire_blackhole(ipst, B_TRUE);
   1382 		goto cleanup;
   1383 	}
   1384 
   1385 	/*
   1386 	 * Release all refholds except the one for ires[0] that we
   1387 	 * will return to the caller.
   1388 	 */
   1389 	for (j = 1; j < i; j++)
   1390 		ire_refrele(ires[j]);
   1391 
   1392 	if (invalidate) {
   1393 		/*
   1394 		 * Since we needed to allocate but couldn't we need to make
   1395 		 * sure that the dependency chain is rebuilt the next time.
   1396 		 */
   1397 		ire_dep_invalidate_generations(ires[0]);
   1398 		generation = IRE_GENERATION_VERIFY;
   1399 	} else {
   1400 		/*
   1401 		 * IREs can have been added or deleted while we did the
   1402 		 * recursive lookup and we can't catch those until we've built
   1403 		 * the dependencies. We verify the stored
   1404 		 * ire_dep_parent_generation to catch any such changes and
   1405 		 * return IRE_GENERATION_VERIFY (which will cause
   1406 		 * ip_select_route to be called again so we can redo the
   1407 		 * recursive lookup next time we send a packet.
   1408 		 */
   1409 		if (ires[0]->ire_dep_parent == NULL)
   1410 			generation = ires[0]->ire_generation;
   1411 		else
   1412 			generation = ire_dep_validate_generations(ires[0]);
   1413 		if (generations[0] != ires[0]->ire_generation) {
   1414 			/* Something changed at the top */
   1415 			generation = IRE_GENERATION_VERIFY;
   1416 		}
   1417 	}
   1418 	if (generationp != NULL)
   1419 		*generationp = generation;
   1420 
   1421 	return (ires[0]);
   1422 }
   1423 
   1424 ire_t *
   1425 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
   1426     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1427     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
   1428     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1429 {
   1430 	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
   1431 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
   1432 	    gwattrp, generationp));
   1433 }
   1434 
   1435 /*
   1436  * Recursively look for a route to the destination.
   1437  * We only handle a destination match here, yet we have the same arguments
   1438  * as the full match to allow function pointers to select between the two.
   1439  *
   1440  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1441  * instead.
   1442  *
   1443  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1444  * is an error.
   1445  * Allow at most one RTF_INDIRECT.
   1446  */
   1447 ire_t *
   1448 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
   1449     uint32_t xmit_hint, ip_stack_t *ipst)
   1450 {
   1451 	ire_t	*ire;
   1452 	ire_t	*ire1;
   1453 	uint_t	generation;
   1454 
   1455 	/* ire_ftable_lookup handles round-robin/ECMP */
   1456 	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
   1457 	    &generation);
   1458 	ASSERT(ire != NULL);
   1459 
   1460 	/*
   1461 	 * If this type should have an ire_nce_cache (even if it
   1462 	 * doesn't yet have one) then we are done. Includes
   1463 	 * IRE_INTERFACE with a full 128 bit mask.
   1464 	 */
   1465 	if (ire->ire_nce_capable)
   1466 		return (ire);
   1467 
   1468 	/*
   1469 	 * If the IRE has a current cached parent we know that the whole
   1470 	 * parent chain is current, hence we don't need to discover and
   1471 	 * build any dependencies by doing a recursive lookup.
   1472 	 */
   1473 	mutex_enter(&ire->ire_lock);
   1474 	if (ire->ire_dep_parent != NULL &&
   1475 	    ire->ire_dep_parent->ire_generation ==
   1476 	    ire->ire_dep_parent_generation) {
   1477 		mutex_exit(&ire->ire_lock);
   1478 		return (ire);
   1479 	}
   1480 	mutex_exit(&ire->ire_lock);
   1481 
   1482 	/*
   1483 	 * Fallback to loop in the normal code starting with the ire
   1484 	 * we found. Normally this would return the same ire.
   1485 	 */
   1486 	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
   1487 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
   1488 	    &generation);
   1489 	ire_refrele(ire);
   1490 	return (ire1);
   1491 }
   1492