Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * This file contains consumer routines of the IPv4 forwarding engine
     28  */
     29 
     30 #include <sys/types.h>
     31 #include <sys/stream.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #include <sys/dlpi.h>
     35 #include <sys/ddi.h>
     36 #include <sys/cmn_err.h>
     37 #include <sys/policy.h>
     38 
     39 #include <sys/systm.h>
     40 #include <sys/strsun.h>
     41 #include <sys/kmem.h>
     42 #include <sys/param.h>
     43 #include <sys/socket.h>
     44 #include <sys/strsubr.h>
     45 #include <net/if.h>
     46 #include <net/route.h>
     47 #include <netinet/in.h>
     48 #include <net/if_dl.h>
     49 #include <netinet/ip6.h>
     50 #include <netinet/icmp6.h>
     51 
     52 #include <inet/ipsec_impl.h>
     53 #include <inet/common.h>
     54 #include <inet/mi.h>
     55 #include <inet/mib2.h>
     56 #include <inet/ip.h>
     57 #include <inet/ip_impl.h>
     58 #include <inet/ip6.h>
     59 #include <inet/ip_ndp.h>
     60 #include <inet/arp.h>
     61 #include <inet/ip_if.h>
     62 #include <inet/ip_ire.h>
     63 #include <inet/ip_ftable.h>
     64 #include <inet/ip_rts.h>
     65 #include <inet/nd.h>
     66 
     67 #include <net/pfkeyv2.h>
     68 #include <inet/sadb.h>
     69 #include <inet/tcp.h>
     70 #include <inet/ipclassifier.h>
     71 #include <sys/zone.h>
     72 #include <net/radix.h>
     73 #include <sys/tsol/label.h>
     74 #include <sys/tsol/tnet.h>
     75 
     76 #define	IS_DEFAULT_ROUTE(ire)	\
     77 	(((ire)->ire_type & IRE_DEFAULT) || \
     78 	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
     79 
     80 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
     81 static void	ire_del_host_redir(ire_t *, char *);
     82 static boolean_t ire_find_best_route(struct radix_node *, void *);
     83 
     84 /*
     85  * Lookup a route in forwarding table. A specific lookup is indicated by
     86  * passing the required parameters and indicating the match required in the
     87  * flag field.
     88  *
     89  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
     90  */
     91 ire_t *
     92 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
     93     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
     94     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
     95 {
     96 	ire_t *ire;
     97 	struct rt_sockaddr rdst, rmask;
     98 	struct rt_entry *rt;
     99 	ire_ftable_args_t margs;
    100 
    101 	ASSERT(ill == NULL || !ill->ill_isv6);
    102 
    103 	/*
    104 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
    105 	 * is set.
    106 	 */
    107 	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
    108 		return (NULL);
    109 
    110 	bzero(&rdst, sizeof (rdst));
    111 	rdst.rt_sin_len = sizeof (rdst);
    112 	rdst.rt_sin_family = AF_INET;
    113 	rdst.rt_sin_addr.s_addr = addr;
    114 
    115 	bzero(&rmask, sizeof (rmask));
    116 	rmask.rt_sin_len = sizeof (rmask);
    117 	rmask.rt_sin_family = AF_INET;
    118 	rmask.rt_sin_addr.s_addr = mask;
    119 
    120 	bzero(&margs, sizeof (margs));
    121 	margs.ift_addr = addr;
    122 	margs.ift_mask = mask;
    123 	margs.ift_gateway = gateway;
    124 	margs.ift_type = type;
    125 	margs.ift_ill = ill;
    126 	margs.ift_zoneid = zoneid;
    127 	margs.ift_tsl = tsl;
    128 	margs.ift_flags = flags;
    129 
    130 	/*
    131 	 * The flags argument passed to ire_ftable_lookup may cause the
    132 	 * search to return, not the longest matching prefix, but the
    133 	 * "best matching prefix", i.e., the longest prefix that also
    134 	 * satisfies constraints imposed via the permutation of flags
    135 	 * passed in. To achieve this, we invoke ire_match_args() on
    136 	 * each matching leaf in the  radix tree. ire_match_args is
    137 	 * invoked by the callback function ire_find_best_route()
    138 	 * We hold the global tree lock in read mode when calling
    139 	 * rn_match_args. Before dropping the global tree lock, ensure
    140 	 * that the radix node can't be deleted by incrementing ire_refcnt.
    141 	 */
    142 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    143 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
    144 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
    145 	ire = margs.ift_best_ire;
    146 	if (rt == NULL) {
    147 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    148 		return (NULL);
    149 	}
    150 	ASSERT(ire != NULL);
    151 
    152 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
    153 
    154 	/*
    155 	 * round-robin only if we have more than one route in the bucket.
    156 	 * ips_ip_ecmp_behavior controls when we do ECMP
    157 	 *	2:	always
    158 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    159 	 *	0:	never
    160 	 */
    161 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
    162 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    163 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    164 		    IS_DEFAULT_ROUTE(ire))) {
    165 			ire_t	*next_ire;
    166 
    167 			margs.ift_best_ire = NULL;
    168 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    169 			    xmit_hint, ire, ipst);
    170 			if (next_ire == NULL) {
    171 				/* keep ire if next_ire is null */
    172 				goto done;
    173 			}
    174 			ire_refrele(ire);
    175 			ire = next_ire;
    176 		}
    177 	}
    178 
    179 done:
    180 	/* Return generation before dropping lock */
    181 	if (generationp != NULL)
    182 		*generationp = ire->ire_generation;
    183 
    184 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    185 
    186 	/*
    187 	 * For shared-IP zones we need additional checks to what was
    188 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
    189 	 *
    190 	 * When ip_restrict_interzone_loopback is set, then
    191 	 * we ensure that IRE_LOCAL are only used for loopback
    192 	 * between zones when the logical "Ethernet" would
    193 	 * have looped them back. That is, if in the absense of
    194 	 * the IRE_LOCAL we would have sent to packet out the
    195 	 * same ill.
    196 	 */
    197 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
    198 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
    199 	    ipst->ips_ip_restrict_interzone_loopback) {
    200 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
    201 		ASSERT(ire != NULL);
    202 	}
    203 	return (ire);
    204 }
    205 
    206 /*
    207  * This function is called by
    208  * ip_input/ire_route_recursive when doing a route lookup on only the
    209  * destination address.
    210  *
    211  * The optimizations of this function over ire_ftable_lookup are:
    212  *	o removing unnecessary flag matching
    213  *	o doing longest prefix match instead of overloading it further
    214  *	  with the unnecessary "best_prefix_match"
    215  *
    216  * If no route is found we return IRE_NOROUTE.
    217  */
    218 ire_t *
    219 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
    220     uint_t *generationp)
    221 {
    222 	ire_t *ire;
    223 	struct rt_sockaddr rdst;
    224 	struct rt_entry *rt;
    225 	irb_t *irb;
    226 
    227 	rdst.rt_sin_len = sizeof (rdst);
    228 	rdst.rt_sin_family = AF_INET;
    229 	rdst.rt_sin_addr.s_addr = addr;
    230 
    231 	/*
    232 	 * This is basically inlining  a simpler version of ire_match_args
    233 	 */
    234 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    235 
    236 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
    237 	    ipst->ips_ip_ftable, NULL, NULL);
    238 
    239 	if (rt == NULL)
    240 		goto bad;
    241 
    242 	irb = &rt->rt_irb;
    243 	if (irb->irb_ire_cnt == 0)
    244 		goto bad;
    245 
    246 	rw_enter(&irb->irb_lock, RW_READER);
    247 	ire = irb->irb_ire;
    248 	if (ire == NULL) {
    249 		rw_exit(&irb->irb_lock);
    250 		goto bad;
    251 	}
    252 	while (IRE_IS_CONDEMNED(ire)) {
    253 		ire = ire->ire_next;
    254 		if (ire == NULL) {
    255 			rw_exit(&irb->irb_lock);
    256 			goto bad;
    257 		}
    258 	}
    259 
    260 	/* we have a ire that matches */
    261 	ire_refhold(ire);
    262 	rw_exit(&irb->irb_lock);
    263 
    264 	/*
    265 	 * round-robin only if we have more than one route in the bucket.
    266 	 * ips_ip_ecmp_behavior controls when we do ECMP
    267 	 *	2:	always
    268 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    269 	 *	0:	never
    270 	 *
    271 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
    272 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
    273 	 * and the IRE_INTERFACESs are likely to be shorter matches.
    274 	 */
    275 	if (ire->ire_bucket->irb_ire_cnt > 1) {
    276 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    277 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    278 		    IS_DEFAULT_ROUTE(ire))) {
    279 			ire_t	*next_ire;
    280 			ire_ftable_args_t margs;
    281 
    282 			bzero(&margs, sizeof (margs));
    283 			margs.ift_addr = addr;
    284 			margs.ift_zoneid = ALL_ZONES;
    285 
    286 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    287 			    xmit_hint, ire, ipst);
    288 			if (next_ire == NULL) {
    289 				/* keep ire if next_ire is null */
    290 				if (generationp != NULL)
    291 					*generationp = ire->ire_generation;
    292 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    293 				return (ire);
    294 			}
    295 			ire_refrele(ire);
    296 			ire = next_ire;
    297 		}
    298 	}
    299 	/* Return generation before dropping lock */
    300 	if (generationp != NULL)
    301 		*generationp = ire->ire_generation;
    302 
    303 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    304 
    305 	/*
    306 	 * Since we only did ALL_ZONES matches there is no special handling
    307 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
    308 	 */
    309 	return (ire);
    310 
    311 bad:
    312 	if (generationp != NULL)
    313 		*generationp = IRE_GENERATION_VERIFY;
    314 
    315 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    316 	return (ire_reject(ipst, B_FALSE));
    317 }
    318 
    319 /*
    320  * Find the ill matching a multicast group.
    321  * Allows different routes for multicast addresses
    322  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
    323  * which point at different interfaces. This is used when IP_MULTICAST_IF
    324  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
    325  * specify the interface to join on.
    326  *
    327  * Supports link-local addresses by using ire_route_recursive which follows
    328  * the ill when recursing.
    329  *
    330  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
    331  * and the MULTIRT property can be different for different groups, we
    332  * extract RTF_MULTIRT from the special unicast route added for a group
    333  * with CGTP and pass that back in the multirtp argument.
    334  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
    335  * We have a setsrcp argument for the same reason.
    336  */
    337 ill_t *
    338 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
    339     boolean_t *multirtp, ipaddr_t *setsrcp)
    340 {
    341 	ire_t	*ire;
    342 	ill_t	*ill;
    343 
    344 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
    345 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
    346 	ASSERT(ire != NULL);
    347 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    348 		ire_refrele(ire);
    349 		return (NULL);
    350 	}
    351 
    352 	if (multirtp != NULL)
    353 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
    354 
    355 	ill = ire_nexthop_ill(ire);
    356 	ire_refrele(ire);
    357 	return (ill);
    358 }
    359 
    360 /*
    361  * Delete the passed in ire if the gateway addr matches
    362  */
    363 void
    364 ire_del_host_redir(ire_t *ire, char *gateway)
    365 {
    366 	if ((ire->ire_flags & RTF_DYNAMIC) &&
    367 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
    368 		ire_delete(ire);
    369 }
    370 
    371 /*
    372  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
    373  * pointing at the specified gateway and
    374  * delete them. This routine is called only
    375  * when a default gateway is going away.
    376  */
    377 void
    378 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
    379 {
    380 	struct rtfuncarg rtfarg;
    381 
    382 	bzero(&rtfarg, sizeof (rtfarg));
    383 	rtfarg.rt_func = ire_del_host_redir;
    384 	rtfarg.rt_arg = (void *)&gateway;
    385 	rtfarg.rt_zoneid = ALL_ZONES;
    386 	rtfarg.rt_ipst = ipst;
    387 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
    388 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
    389 }
    390 
    391 /*
    392  * Obtain the rt_entry and rt_irb for the route to be added to
    393  * the ips_ip_ftable.
    394  * First attempt to add a node to the radix tree via rn_addroute. If the
    395  * route already exists, return the bucket for the existing route.
    396  *
    397  * Locking notes: Need to hold the global radix tree lock in write mode to
    398  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
    399  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
    400  * while holding the irb_lock, but not the radix tree lock.
    401  */
    402 irb_t *
    403 ire_get_bucket(ire_t *ire)
    404 {
    405 	struct radix_node *rn;
    406 	struct rt_entry *rt;
    407 	struct rt_sockaddr rmask, rdst;
    408 	irb_t *irb = NULL;
    409 	ip_stack_t *ipst = ire->ire_ipst;
    410 
    411 	ASSERT(ipst->ips_ip_ftable != NULL);
    412 
    413 	/* first try to see if route exists (based on rtalloc1) */
    414 	bzero(&rdst, sizeof (rdst));
    415 	rdst.rt_sin_len = sizeof (rdst);
    416 	rdst.rt_sin_family = AF_INET;
    417 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
    418 
    419 	bzero(&rmask, sizeof (rmask));
    420 	rmask.rt_sin_len = sizeof (rmask);
    421 	rmask.rt_sin_family = AF_INET;
    422 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
    423 
    424 	/*
    425 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
    426 	 */
    427 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
    428 	/* kmem_alloc failed */
    429 	if (rt == NULL)
    430 		return (NULL);
    431 
    432 	bzero(rt, sizeof (*rt));
    433 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
    434 	rt->rt_dst = rdst;
    435 	irb = &rt->rt_irb;
    436 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
    437 	irb->irb_ipst = ipst;
    438 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
    439 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
    440 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
    441 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
    442 	if (rn == NULL) {
    443 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    444 		Free(rt, rt_entry_cache);
    445 		rt = NULL;
    446 		irb = NULL;
    447 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    448 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
    449 		    ipst->ips_ip_ftable);
    450 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
    451 			/* found a non-root match */
    452 			rt = (struct rt_entry *)rn;
    453 		}
    454 	}
    455 	if (rt != NULL) {
    456 		irb = &rt->rt_irb;
    457 		irb_refhold(irb);
    458 	}
    459 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    460 	return (irb);
    461 }
    462 
    463 /*
    464  * This function is used when the caller wants to know the outbound
    465  * interface for a packet given only the address.
    466  * If this is a offlink IP address and there are multiple
    467  * routes to this destination, this routine will utilise the
    468  * first route it finds to IP address
    469  * Return values:
    470  * 	0	- FAILURE
    471  *	nonzero	- ifindex
    472  */
    473 uint_t
    474 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
    475 {
    476 	uint_t ifindex = 0;
    477 	ire_t *ire;
    478 	ill_t *ill;
    479 	netstack_t *ns;
    480 	ip_stack_t *ipst;
    481 
    482 	if (zoneid == ALL_ZONES)
    483 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
    484 	else
    485 		ns = netstack_find_by_zoneid(zoneid);
    486 	ASSERT(ns != NULL);
    487 
    488 	/*
    489 	 * For exclusive stacks we set the zoneid to zero
    490 	 * since IP uses the global zoneid in the exclusive stacks.
    491 	 */
    492 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    493 		zoneid = GLOBAL_ZONEID;
    494 	ipst = ns->netstack_ip;
    495 
    496 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
    497 
    498 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
    499 		ill = ire_nexthop_ill(ire);
    500 		if (ill != NULL) {
    501 			ifindex = ill->ill_phyint->phyint_ifindex;
    502 			ill_refrele(ill);
    503 		}
    504 		ire_refrele(ire);
    505 	}
    506 	netstack_rele(ns);
    507 	return (ifindex);
    508 }
    509 
    510 /*
    511  * Routine to find the route to a destination. If a ifindex is supplied
    512  * it tries to match the route to the corresponding ipif for the ifindex
    513  */
    514 static	ire_t *
    515 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
    516 {
    517 	ire_t *ire = NULL;
    518 	int match_flags;
    519 
    520 	match_flags = MATCH_IRE_DSTONLY;
    521 
    522 	/* XXX pass NULL tsl for now */
    523 
    524 	if (dst_addr->sa_family == AF_INET) {
    525 		ire = ire_route_recursive_v4(
    526 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
    527 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
    528 		    NULL, NULL);
    529 	} else {
    530 		ire = ire_route_recursive_v6(
    531 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
    532 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
    533 		    NULL, NULL);
    534 	}
    535 	ASSERT(ire != NULL);
    536 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    537 		ire_refrele(ire);
    538 		return (NULL);
    539 	}
    540 	return (ire);
    541 }
    542 
    543 /*
    544  * This routine is called by IP Filter to send a packet out on the wire
    545  * to a specified dstination (which may be onlink or offlink). The ifindex may
    546  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
    547  * an outgoing interface and requires the nexthop to be on that interface.
    548  * IP WILL NOT DO the following to the data packet before sending it out:
    549  *	a. manipulate ttl
    550  *	b. ipsec work
    551  *	c. fragmentation
    552  *
    553  * If the packet has been prepared for hardware checksum then it will be
    554  * passed off to ip_send_align_cksum() to check that the flags set on the
    555  * packet are in alignment with the capabilities of the new outgoing NIC.
    556  *
    557  * Return values:
    558  *	0:		IP was able to send of the data pkt
    559  *	ECOMM:		Could not send packet
    560  *	ENONET		No route to dst. It is up to the caller
    561  *			to send icmp unreachable error message,
    562  *	EINPROGRESS	The macaddr of the onlink dst or that
    563  *			of the offlink dst's nexthop needs to get
    564  *			resolved before packet can be sent to dst.
    565  *			Thus transmission is not guaranteed.
    566  *			Note: No longer have visibility to the ARP queue
    567  *			hence no EINPROGRESS.
    568  */
    569 int
    570 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
    571     zoneid_t zoneid)
    572 {
    573 	ipaddr_t nexthop;
    574 	netstack_t *ns;
    575 	ip_stack_t *ipst;
    576 	ip_xmit_attr_t ixas;
    577 	int error;
    578 
    579 	ASSERT(mp != NULL);
    580 
    581 	if (zoneid == ALL_ZONES)
    582 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
    583 	else
    584 		ns = netstack_find_by_zoneid(zoneid);
    585 	ASSERT(ns != NULL);
    586 
    587 	/*
    588 	 * For exclusive stacks we set the zoneid to zero
    589 	 * since IP uses the global zoneid in the exclusive stacks.
    590 	 */
    591 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    592 		zoneid = GLOBAL_ZONEID;
    593 	ipst = ns->netstack_ip;
    594 
    595 	ASSERT(dst_addr->sa_family == AF_INET ||
    596 	    dst_addr->sa_family == AF_INET6);
    597 
    598 	bzero(&ixas, sizeof (ixas));
    599 	/*
    600 	 * No IPsec, no fragmentation, and don't let any hooks see
    601 	 * the packet.
    602 	 */
    603 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
    604 	ixas.ixa_cred = kcred;
    605 	ixas.ixa_cpid = NOPID;
    606 	ixas.ixa_tsl = NULL;
    607 	ixas.ixa_ipst = ipst;
    608 	ixas.ixa_ifindex = ifindex;
    609 
    610 	if (dst_addr->sa_family == AF_INET) {
    611 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
    612 
    613 		ixas.ixa_flags |= IXAF_IS_IPV4;
    614 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
    615 		if (nexthop != ipha->ipha_dst) {
    616 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
    617 			ixas.ixa_nexthop_v4 = nexthop;
    618 		}
    619 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
    620 	} else {
    621 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
    622 		in6_addr_t *nexthop6;
    623 
    624 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
    625 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
    626 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
    627 			ixas.ixa_nexthop_v6 = *nexthop6;
    628 		}
    629 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
    630 	}
    631 	error = ip_output_simple(mp, &ixas);
    632 	ixa_cleanup(&ixas);
    633 
    634 	netstack_rele(ns);
    635 	switch (error) {
    636 	case 0:
    637 		break;
    638 
    639 	case EHOSTUNREACH:
    640 	case ENETUNREACH:
    641 		error = ENONET;
    642 		break;
    643 
    644 	default:
    645 		error = ECOMM;
    646 		break;
    647 	}
    648 	return (error);
    649 }
    650 
    651 /*
    652  * callback function provided by ire_ftable_lookup when calling
    653  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
    654  * the radix tree.
    655  */
    656 boolean_t
    657 ire_find_best_route(struct radix_node *rn, void *arg)
    658 {
    659 	struct rt_entry *rt = (struct rt_entry *)rn;
    660 	irb_t *irb_ptr;
    661 	ire_t *ire;
    662 	ire_ftable_args_t *margs = arg;
    663 	ipaddr_t match_mask;
    664 
    665 	ASSERT(rt != NULL);
    666 
    667 	irb_ptr = &rt->rt_irb;
    668 
    669 	if (irb_ptr->irb_ire_cnt == 0)
    670 		return (B_FALSE);
    671 
    672 	rw_enter(&irb_ptr->irb_lock, RW_READER);
    673 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
    674 		if (IRE_IS_CONDEMNED(ire))
    675 			continue;
    676 		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
    677 			match_mask = margs->ift_mask;
    678 		else
    679 			match_mask = ire->ire_mask;
    680 
    681 		if (ire_match_args(ire, margs->ift_addr, match_mask,
    682 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
    683 		    margs->ift_zoneid, margs->ift_tsl,
    684 		    margs->ift_flags)) {
    685 			ire_refhold(ire);
    686 			rw_exit(&irb_ptr->irb_lock);
    687 			margs->ift_best_ire = ire;
    688 			return (B_TRUE);
    689 		}
    690 	}
    691 	rw_exit(&irb_ptr->irb_lock);
    692 	return (B_FALSE);
    693 }
    694 
    695 /*
    696  * ftable irb_t structures are dynamically allocated, and we need to
    697  * check if the irb_t (and associated ftable tree attachment) needs to
    698  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
    699  * be verified are:
    700  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
    701  * - no other threads holding references to ire's in the bucket,
    702  *   i.e., irb_nire == 0
    703  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
    704  * - need to hold the global tree lock and irb_lock in write mode.
    705  */
    706 void
    707 irb_refrele_ftable(irb_t *irb)
    708 {
    709 	for (;;) {
    710 		rw_enter(&irb->irb_lock, RW_WRITER);
    711 		ASSERT(irb->irb_refcnt != 0);
    712 		if (irb->irb_refcnt != 1) {
    713 			/*
    714 			 * Someone has a reference to this radix node
    715 			 * or there is some bucket walker.
    716 			 */
    717 			irb->irb_refcnt--;
    718 			rw_exit(&irb->irb_lock);
    719 			return;
    720 		} else {
    721 			/*
    722 			 * There is no other walker, nor is there any
    723 			 * other thread that holds a direct ref to this
    724 			 * radix node. Do the clean up if needed. Call
    725 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
    726 			 */
    727 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
    728 				ire_t *ire_list;
    729 
    730 				ire_list = ire_unlink(irb);
    731 				rw_exit(&irb->irb_lock);
    732 
    733 				if (ire_list != NULL)
    734 					ire_cleanup(ire_list);
    735 				/*
    736 				 * more CONDEMNED entries could have
    737 				 * been added while we dropped the lock,
    738 				 * so we have to re-check.
    739 				 */
    740 				continue;
    741 			}
    742 
    743 			/*
    744 			 * Now check if there are still any ires
    745 			 * associated with this radix node.
    746 			 */
    747 			if (irb->irb_nire != 0) {
    748 				/*
    749 				 * someone is still holding on
    750 				 * to ires in this bucket
    751 				 */
    752 				irb->irb_refcnt--;
    753 				rw_exit(&irb->irb_lock);
    754 				return;
    755 			} else {
    756 				/*
    757 				 * Everything is clear. Zero walkers,
    758 				 * Zero threads with a ref to this
    759 				 * radix node, Zero ires associated with
    760 				 * this radix node. Due to lock order,
    761 				 * check the above conditions again
    762 				 * after grabbing all locks in the right order
    763 				 */
    764 				rw_exit(&irb->irb_lock);
    765 				if (irb_inactive(irb))
    766 					return;
    767 				/*
    768 				 * irb_inactive could not free the irb.
    769 				 * See if there are any walkers, if not
    770 				 * try to clean up again.
    771 				 */
    772 			}
    773 		}
    774 	}
    775 }
    776 
    777 /*
    778  * IRE iterator used by ire_ftable_lookup to process multiple equal
    779  * routes. Given a starting point in the hash list (hash), walk the IREs
    780  * in the bucket skipping deleted entries. We treat the bucket as a circular
    781  * list for the purposes of walking it.
    782  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
    783  * not applicable (ire_match_args failed) then it returns a subsequent one.
    784  * If we fail to find an IRE we return NULL.
    785  *
    786  * Assumes that the caller holds a reference on the IRE bucket and a read lock
    787  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
    788  *
    789  * Applies to IPv4 and IPv6.
    790  *
    791  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
    792  * address and bucket, we compare against ire_type for the orig_ire. We also
    793  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
    794  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
    795  *
    796  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
    797  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
    798  * in which the zone has an IP address. We check this for the global zone
    799  * even if no shared-IP zones are configured.
    800  */
    801 ire_t *
    802 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
    803     ire_t *orig_ire, ip_stack_t *ipst)
    804 {
    805 	ire_t		*ire, *maybe_ire = NULL;
    806 	uint_t		maybe_badcnt;
    807 	uint_t		maxwalk;
    808 
    809 	/* Fold in more bits from the hint/hash */
    810 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
    811 
    812 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
    813 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
    814 	hash %= maxwalk;
    815 	irb_refhold_locked(irb_ptr);
    816 	rw_exit(&irb_ptr->irb_lock);
    817 
    818 	/*
    819 	 * Round-robin the routers list looking for a route that
    820 	 * matches the passed in parameters.
    821 	 * First we skip "hash" number of non-condemned IREs.
    822 	 * Then we match the IRE.
    823 	 * If we find an ire which has a non-zero ire_badcnt then we remember
    824 	 * it and keep on looking for a lower ire_badcnt.
    825 	 * If we come to the end of the list we continue (treat the
    826 	 * bucket list as a circular list) but we match less than "max"
    827 	 * entries.
    828 	 */
    829 	ire = irb_ptr->irb_ire;
    830 	while (maxwalk > 0) {
    831 		if (IRE_IS_CONDEMNED(ire))
    832 			goto next_ire_skip;
    833 
    834 		/* Skip the first "hash" entries to do ECMP */
    835 		if (hash != 0) {
    836 			hash--;
    837 			goto next_ire_skip;
    838 		}
    839 
    840 		/* See CGTP comment above */
    841 		if (ire->ire_type != orig_ire->ire_type ||
    842 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
    843 			goto next_ire;
    844 
    845 		/*
    846 		 * Note: Since IPv6 has hash buckets instead of radix
    847 		 * buckers we need to explicitly compare the addresses.
    848 		 * That makes this less efficient since we will be called
    849 		 * even if there is no alternatives just because the
    850 		 * bucket has multiple IREs for different addresses.
    851 		 */
    852 		if (ire->ire_ipversion == IPV6_VERSION) {
    853 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
    854 			    &ire->ire_addr_v6))
    855 				goto next_ire;
    856 		}
    857 
    858 		/*
    859 		 * For some reason find_best_route uses ire_mask. We do
    860 		 * the same.
    861 		 */
    862 		if (ire->ire_ipversion == IPV4_VERSION ?
    863 		    !ire_match_args(ire, margs->ift_addr,
    864 		    ire->ire_mask, margs->ift_gateway,
    865 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
    866 		    margs->ift_tsl, margs->ift_flags) :
    867 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
    868 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
    869 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
    870 		    margs->ift_tsl, margs->ift_flags))
    871 			goto next_ire;
    872 
    873 		if (margs->ift_zoneid != ALL_ZONES &&
    874 		    (ire->ire_type & IRE_OFFLINK)) {
    875 			/*
    876 			 * When we're in a zone, we're only
    877 			 * interested in routers that are
    878 			 * reachable through ipifs within our zone.
    879 			 */
    880 			if (ire->ire_ipversion == IPV4_VERSION) {
    881 				if (!ire_gateway_ok_zone_v4(
    882 				    ire->ire_gateway_addr, margs->ift_zoneid,
    883 				    ire->ire_ill, margs->ift_tsl, ipst,
    884 				    B_TRUE))
    885 					goto next_ire;
    886 			} else {
    887 				if (!ire_gateway_ok_zone_v6(
    888 				    &ire->ire_gateway_addr_v6,
    889 				    margs->ift_zoneid, ire->ire_ill,
    890 				    margs->ift_tsl, ipst, B_TRUE))
    891 					goto next_ire;
    892 			}
    893 		}
    894 		mutex_enter(&ire->ire_lock);
    895 		/* Look for stale ire_badcnt and clear */
    896 		if (ire->ire_badcnt != 0 &&
    897 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
    898 		    ipst->ips_ip_ire_badcnt_lifetime))
    899 			ire->ire_badcnt = 0;
    900 		mutex_exit(&ire->ire_lock);
    901 
    902 		if (ire->ire_badcnt == 0) {
    903 			/* We found one with a zero badcnt; done */
    904 			ire_refhold(ire);
    905 			/*
    906 			 * Care needed since irb_refrele grabs WLOCK to free
    907 			 * the irb_t.
    908 			 */
    909 			if (ire->ire_ipversion == IPV4_VERSION) {
    910 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    911 				irb_refrele(irb_ptr);
    912 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    913 			} else {
    914 				rw_exit(&ipst->ips_ip6_ire_head_lock);
    915 				irb_refrele(irb_ptr);
    916 				rw_enter(&ipst->ips_ip6_ire_head_lock,
    917 				    RW_READER);
    918 			}
    919 			return (ire);
    920 		}
    921 		/*
    922 		 * keep looking to see if there is a better (lower
    923 		 * badcnt) matching IRE, but save this one as a last resort.
    924 		 * If we find a lower badcnt pick that one as the last* resort.
    925 		 */
    926 		if (maybe_ire == NULL) {
    927 			maybe_ire = ire;
    928 			maybe_badcnt = ire->ire_badcnt;
    929 		} else if (ire->ire_badcnt < maybe_badcnt) {
    930 			maybe_ire = ire;
    931 			maybe_badcnt = ire->ire_badcnt;
    932 		}
    933 
    934 next_ire:
    935 		maxwalk--;
    936 next_ire_skip:
    937 		ire = ire->ire_next;
    938 		if (ire == NULL)
    939 			ire = irb_ptr->irb_ire;
    940 	}
    941 	if (maybe_ire != NULL)
    942 		ire_refhold(maybe_ire);
    943 
    944 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
    945 	if (ire->ire_ipversion == IPV4_VERSION) {
    946 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    947 		irb_refrele(irb_ptr);
    948 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    949 	} else {
    950 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    951 		irb_refrele(irb_ptr);
    952 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    953 	}
    954 	return (maybe_ire);
    955 }
    956 
    957 void
    958 irb_refhold_rn(struct radix_node *rn)
    959 {
    960 	if ((rn->rn_flags & RNF_ROOT) == 0)
    961 		irb_refhold(&((rt_t *)(rn))->rt_irb);
    962 }
    963 
    964 void
    965 irb_refrele_rn(struct radix_node *rn)
    966 {
    967 	if ((rn->rn_flags & RNF_ROOT) == 0)
    968 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
    969 }
    970 
    971 /*
    972  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
    973  * routes this routine sets up a ire_nce_cache as well. The caller needs to
    974  * lookup an nce for the multicast case.
    975  */
    976 ire_t *
    977 ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
    978     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
    979 {
    980 	uint_t		match_args;
    981 	uint_t		ire_type;
    982 	ill_t		*ill;
    983 	ire_t		*ire;
    984 	ip_stack_t	*ipst = ixa->ixa_ipst;
    985 	ipaddr_t	v4dst;
    986 	in6_addr_t	v6nexthop;
    987 	iaflags_t	ixaflags = ixa->ixa_flags;
    988 	nce_t		*nce;
    989 
    990 	match_args = MATCH_IRE_SECATTR;
    991 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
    992 	if (setsrcp != NULL)
    993 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
    994 	if (errorp != NULL)
    995 		ASSERT(*errorp == 0);
    996 
    997 	/*
    998 	 * The content of the ixa will be different if IP_NEXTHOP,
    999 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
   1000 	 */
   1001 
   1002 	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
   1003 	    IN6_IS_ADDR_MULTICAST(v6dst)) {
   1004 		/* Pick up the IRE_MULTICAST for the ill */
   1005 		if (ixa->ixa_multicast_ifindex != 0) {
   1006 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
   1007 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1008 		} else if (ixaflags & IXAF_SCOPEID_SET) {
   1009 			/* sin6_scope_id takes precedence over ixa_ifindex */
   1010 			ASSERT(ixa->ixa_scopeid != 0);
   1011 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
   1012 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1013 		} else if (ixa->ixa_ifindex != 0) {
   1014 			/*
   1015 			 * In the ipmp case, the ixa_ifindex is set to
   1016 			 * point at an under_ill and we would return the
   1017 			 * ire_multicast() corresponding to that under_ill.
   1018 			 */
   1019 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
   1020 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1021 		} else if (ixaflags & IXAF_IS_IPV4) {
   1022 			ipaddr_t	v4setsrc = INADDR_ANY;
   1023 
   1024 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
   1025 			    multirtp, &v4setsrc);
   1026 			if (setsrcp != NULL)
   1027 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
   1028 		} else {
   1029 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
   1030 			    multirtp, setsrcp);
   1031 		}
   1032 		if (ill != NULL && IS_VNI(ill)) {
   1033 			ill_refrele(ill);
   1034 			ill = NULL;
   1035 		}
   1036 		if (ill == NULL) {
   1037 			if (errorp != NULL)
   1038 				*errorp = ENXIO;
   1039 			/* Get a hold on the IRE_NOROUTE */
   1040 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1041 			return (ire);
   1042 		}
   1043 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
   1044 			ill_refrele(ill);
   1045 			if (errorp != NULL)
   1046 				*errorp = EHOSTUNREACH;
   1047 			/* Get a hold on the IRE_NOROUTE */
   1048 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1049 			return (ire);
   1050 		}
   1051 		/* Get a refcnt on the single IRE_MULTICAST per ill */
   1052 		ire = ire_multicast(ill);
   1053 		ill_refrele(ill);
   1054 		if (generationp != NULL)
   1055 			*generationp = ire->ire_generation;
   1056 		if (errorp != NULL &&
   1057 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   1058 			*errorp = EHOSTUNREACH;
   1059 		}
   1060 		return (ire);
   1061 	}
   1062 
   1063 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
   1064 		if (ixaflags & IXAF_SCOPEID_SET) {
   1065 			/* sin6_scope_id takes precedence over ixa_ifindex */
   1066 			ASSERT(ixa->ixa_scopeid != 0);
   1067 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
   1068 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1069 		} else {
   1070 			ASSERT(ixa->ixa_ifindex != 0);
   1071 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
   1072 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1073 		}
   1074 		if (ill != NULL && IS_VNI(ill)) {
   1075 			ill_refrele(ill);
   1076 			ill = NULL;
   1077 		}
   1078 		if (ill == NULL) {
   1079 			if (errorp != NULL)
   1080 				*errorp = ENXIO;
   1081 			/* Get a hold on the IRE_NOROUTE */
   1082 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1083 			return (ire);
   1084 		}
   1085 		/*
   1086 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
   1087 		 * so for both of them we need to be able look for an under
   1088 		 * interface.
   1089 		 */
   1090 		if (IS_UNDER_IPMP(ill))
   1091 			match_args |= MATCH_IRE_TESTHIDDEN;
   1092 	} else {
   1093 		ill = NULL;
   1094 	}
   1095 
   1096 	if (ixaflags & IXAF_NEXTHOP_SET) {
   1097 		/* IP_NEXTHOP was set */
   1098 		v6nexthop = ixa->ixa_nexthop_v6;
   1099 	} else {
   1100 		v6nexthop = *v6dst;
   1101 	}
   1102 
   1103 	ire_type = 0;
   1104 	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
   1105 
   1106 	/*
   1107 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
   1108 	 * we only look for an onlink IRE.
   1109 	 */
   1110 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
   1111 		match_args |= MATCH_IRE_TYPE;
   1112 		ire_type = IRE_ONLINK;
   1113 	}
   1114 
   1115 	if (ixaflags & IXAF_IS_IPV4) {
   1116 		ipaddr_t	v4nexthop;
   1117 		ipaddr_t	v4setsrc = INADDR_ANY;
   1118 
   1119 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
   1120 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
   1121 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
   1122 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
   1123 		if (setsrcp != NULL)
   1124 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
   1125 	} else {
   1126 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
   1127 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
   1128 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
   1129 	}
   1130 
   1131 #ifdef DEBUG
   1132 	if (match_args & MATCH_IRE_TESTHIDDEN) {
   1133 		ip3dbg(("looking for hidden; dst %x ire %p\n",
   1134 		    v4dst, (void *)ire));
   1135 	}
   1136 #endif
   1137 
   1138 	if (ill != NULL)
   1139 		ill_refrele(ill);
   1140 
   1141 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   1142 	    (ire->ire_type & IRE_MULTICAST)) {
   1143 		/* No ire_nce_cache */
   1144 		return (ire);
   1145 	}
   1146 
   1147 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
   1148 	mutex_enter(&ire->ire_lock);
   1149 	nce = ire->ire_nce_cache;
   1150 	if (nce == NULL || nce->nce_is_condemned) {
   1151 		mutex_exit(&ire->ire_lock);
   1152 		(void) ire_revalidate_nce(ire);
   1153 	} else {
   1154 		mutex_exit(&ire->ire_lock);
   1155 	}
   1156 	return (ire);
   1157 }
   1158 
   1159 /*
   1160  * Find a route given some xmit attributes and a packet.
   1161  * Generic for IPv4 and IPv6
   1162  *
   1163  * This never returns NULL. But when it returns the IRE_NOROUTE
   1164  * it might set errorp.
   1165  */
   1166 ire_t *
   1167 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
   1168     int *errorp, boolean_t *multirtp)
   1169 {
   1170 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
   1171 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   1172 		in6_addr_t	v6dst;
   1173 
   1174 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
   1175 
   1176 		return (ip_select_route(&v6dst, ixa, generationp,
   1177 		    NULL, errorp, multirtp));
   1178 	} else {
   1179 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
   1180 
   1181 		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
   1182 		    NULL, errorp, multirtp));
   1183 	}
   1184 }
   1185 
   1186 ire_t *
   1187 ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
   1188     ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
   1189 {
   1190 	in6_addr_t	v6dst;
   1191 	ire_t		*ire;
   1192 	in6_addr_t	setsrc;
   1193 
   1194 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
   1195 
   1196 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
   1197 
   1198 	setsrc = ipv6_all_zeros;
   1199 	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
   1200 	    multirtp);
   1201 	if (v4setsrcp != NULL)
   1202 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
   1203 	return (ire);
   1204 }
   1205 
/*
 * Recursively look for a route to the destination. Can also match on
 * the zoneid, ill, and label. Used for the data paths. See also
 * ire_route_recursive.
 *
 * If ill is set this means we will match it by adding MATCH_IRE_ILL.
 *
 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
 * create an IRE_IF_CLONE. This is used on the receive side when we are not
 * forwarding.
 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
 * resolve the gateway.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * Arguments:
 *	ire		optional starting point. When non-NULL it is used in
 *			place of the first ire_ftable_lookup_v4() and this
 *			function takes its own additional hold on it; in that
 *			case *generationp (if generationp is non-NULL) is read
 *			as the generation of that IRE.
 *	nexthop		address to look up in the first iteration; replaced by
 *			ire_gateway_addr of the IRE found for later iterations.
 *	setsrcp		optional; must point at INADDR_ANY on entry and
 *			receives the first RTF_SETSRC address encountered.
 *	gwattrp		optional; must point at NULL on entry and receives the
 *			first non-NULL ire_gw_secattr encountered.
 *	generationp	optional; on return holds the generation for the
 *			caller to store, which is IRE_GENERATION_VERIFY on all
 *			paths that go through the cleanup label.
 *
 * On success the first IRE of the chain (ires[0]) is returned with a hold;
 * the holds on the remaining entries are released before returning.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		/* Nothing found: substitute the reject IRE */
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		if (i != 0) {
			prefs[i] = ire_pref(ire);
			/*
			 * Don't allow anything unusual past the first
			 * iteration.
			 * The ire_pref() value also has to be strictly
			 * greater than that of the previous entry in the
			 * chain.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				if (irr_flags & IRR_INCOMPLETE) {
					/*
					 * Return the first IRE; it needs a
					 * hold of its own because cleanup
					 * releases every ires[] entry.
					 */
					ire = ires[0];
					ire_refhold(ire);
				} else {
					ire = ire_reject(ipst, B_FALSE);
				}
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* The done label expects ire == NULL */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			/*
			 * Shift the IRE_INTERFACE (stored at i-1 above) up
			 * one slot and put the clone in its place.
			 */
			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			/*
			 * Remember (and hold) the ill of this IRE so that
			 * the following lookups are restricted to it, unless
			 * the caller supplied ill_arg, which takes precedence
			 * in the lookup call above.
			 */
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/*
		 * We set the prefs[i] value above if i > 0. We've already
		 * done i++ so i is one in the case of the first time around.
		 */
		if (i == 1)
			prefs[0] = ire_pref(ire);
		/* Force a fresh ftable lookup in the next iteration */
		ire = NULL;
	}
	/* Ran out of iterations without resolving the route */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/*
	 * Failure exit: 'ire' is the IRE to return (reject, blackhole, or
	 * ires[0] for IRR_INCOMPLETE) and carries its own hold.
	 */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/*
	 * Reached by falling through from error, or from done when
	 * ire_dep_build() fails (the ill hold has been dropped by then).
	 */
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/*
	 * Success exit: ires[0..i-1] hold the chain, each entry with a
	 * hold, and ire has been set to NULL by whoever jumped here.
	 */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}
   1506 
   1507 ire_t *
   1508 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
   1509     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1510     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
   1511     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1512 {
   1513 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
   1514 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
   1515 	    gwattrp, generationp));
   1516 }
   1517 
   1518 /*
   1519  * Recursively look for a route to the destination.
   1520  * We only handle a destination match here, yet we have the same arguments
   1521  * as the full match to allow function pointers to select between the two.
   1522  *
   1523  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1524  * instead.
   1525  *
   1526  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1527  * is an error.
   1528  * Allow at most one RTF_INDIRECT.
   1529  */
   1530 ire_t *
   1531 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
   1532     uint32_t xmit_hint, ip_stack_t *ipst)
   1533 {
   1534 	ire_t	*ire;
   1535 	ire_t	*ire1;
   1536 	uint_t	generation;
   1537 
   1538 	/* ire_ftable_lookup handles round-robin/ECMP */
   1539 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
   1540 	    &generation);
   1541 	ASSERT(ire != NULL);
   1542 
   1543 	/*
   1544 	 * If this type should have an ire_nce_cache (even if it
   1545 	 * doesn't yet have one) then we are done. Includes
   1546 	 * IRE_INTERFACE with a full 32 bit mask.
   1547 	 */
   1548 	if (ire->ire_nce_capable)
   1549 		return (ire);
   1550 
   1551 	/*
   1552 	 * If the IRE has a current cached parent we know that the whole
   1553 	 * parent chain is current, hence we don't need to discover and
   1554 	 * build any dependencies by doing a recursive lookup.
   1555 	 */
   1556 	mutex_enter(&ire->ire_lock);
   1557 	if (ire->ire_dep_parent != NULL &&
   1558 	    ire->ire_dep_parent->ire_generation ==
   1559 	    ire->ire_dep_parent_generation) {
   1560 		mutex_exit(&ire->ire_lock);
   1561 		return (ire);
   1562 	}
   1563 	mutex_exit(&ire->ire_lock);
   1564 
   1565 	/*
   1566 	 * Fallback to loop in the normal code starting with the ire
   1567 	 * we found. Normally this would return the same ire.
   1568 	 */
   1569 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
   1570 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
   1571 	    &generation);
   1572 	ire_refrele(ire);
   1573 	return (ire1);
   1574 }
   1575