Home | History | Annotate | Download | only in ip
      1   2535  sangeeta /*
      2   2535  sangeeta  * CDDL HEADER START
      3   2535  sangeeta  *
      4   2535  sangeeta  * The contents of this file are subject to the terms of the
      5   2535  sangeeta  * Common Development and Distribution License (the "License").
      6   2535  sangeeta  * You may not use this file except in compliance with the License.
      7   2535  sangeeta  *
      8   2535  sangeeta  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   2535  sangeeta  * or http://www.opensolaris.org/os/licensing.
     10   2535  sangeeta  * See the License for the specific language governing permissions
     11   2535  sangeeta  * and limitations under the License.
     12   2535  sangeeta  *
     13   2535  sangeeta  * When distributing Covered Code, include this CDDL HEADER in each
     14   2535  sangeeta  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   2535  sangeeta  * If applicable, add the following below this CDDL HEADER, with the
     16   2535  sangeeta  * fields enclosed by brackets "[]" replaced with your own identifying
     17   2535  sangeeta  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   2535  sangeeta  *
     19   2535  sangeeta  * CDDL HEADER END
     20   2535  sangeeta  */
     21   2535  sangeeta /*
     22   8485     Peter  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23   2535  sangeeta  * Use is subject to license terms.
     24   2535  sangeeta  */
     25   2535  sangeeta 
     26   2535  sangeeta /*
     27   2535  sangeeta  * This file contains consumer routines of the IPv4 forwarding engine
     28   2535  sangeeta  */
     29   2535  sangeeta 
     30   2535  sangeeta #include <sys/types.h>
     31   2535  sangeeta #include <sys/stream.h>
     32   2535  sangeeta #include <sys/stropts.h>
     33   2535  sangeeta #include <sys/strlog.h>
     34   2535  sangeeta #include <sys/dlpi.h>
     35   2535  sangeeta #include <sys/ddi.h>
     36   2535  sangeeta #include <sys/cmn_err.h>
     37   2535  sangeeta #include <sys/policy.h>
     38   2535  sangeeta 
     39   2535  sangeeta #include <sys/systm.h>
     40   2535  sangeeta #include <sys/strsun.h>
     41   2535  sangeeta #include <sys/kmem.h>
     42   2535  sangeeta #include <sys/param.h>
     43   2535  sangeeta #include <sys/socket.h>
     44   4482  dr146992 #include <sys/strsubr.h>
     45   2535  sangeeta #include <net/if.h>
     46   2535  sangeeta #include <net/route.h>
     47   2535  sangeeta #include <netinet/in.h>
     48   2535  sangeeta #include <net/if_dl.h>
     49   2535  sangeeta #include <netinet/ip6.h>
     50   2535  sangeeta #include <netinet/icmp6.h>
     51   2535  sangeeta 
     52  11042      Erik #include <inet/ipsec_impl.h>
     53   2535  sangeeta #include <inet/common.h>
     54   2535  sangeeta #include <inet/mi.h>
     55   2535  sangeeta #include <inet/mib2.h>
     56   2535  sangeeta #include <inet/ip.h>
     57   4482  dr146992 #include <inet/ip_impl.h>
     58   2535  sangeeta #include <inet/ip6.h>
     59   2535  sangeeta #include <inet/ip_ndp.h>
     60   2535  sangeeta #include <inet/arp.h>
     61   2535  sangeeta #include <inet/ip_if.h>
     62   2535  sangeeta #include <inet/ip_ire.h>
     63   2535  sangeeta #include <inet/ip_ftable.h>
     64   2535  sangeeta #include <inet/ip_rts.h>
     65   2535  sangeeta #include <inet/nd.h>
     66   2535  sangeeta 
     67   2535  sangeeta #include <net/pfkeyv2.h>
     68   2535  sangeeta #include <inet/sadb.h>
     69   2535  sangeeta #include <inet/tcp.h>
     70   2535  sangeeta #include <inet/ipclassifier.h>
     71   2535  sangeeta #include <sys/zone.h>
     72   2535  sangeeta #include <net/radix.h>
     73   2535  sangeeta #include <sys/tsol/label.h>
     74   2535  sangeeta #include <sys/tsol/tnet.h>
     75   2535  sangeeta 
     76   2535  sangeeta #define	IS_DEFAULT_ROUTE(ire)	\
     77   2535  sangeeta 	(((ire)->ire_type & IRE_DEFAULT) || \
     78   2535  sangeeta 	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
     79   2535  sangeeta 
/*
 * Forward declarations of helpers with file-local (static) linkage.
 */
static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);
     83   2535  sangeeta 
     84   2535  sangeeta /*
     85   2535  sangeeta  * Lookup a route in forwarding table. A specific lookup is indicated by
     86   2535  sangeeta  * passing the required parameters and indicating the match required in the
     87   2535  sangeeta  * flag field.
     88   2535  sangeeta  *
     89   2535  sangeeta  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
     90   2535  sangeeta  */
     91   2535  sangeeta ire_t *
     92  11042      Erik ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
     93  11042      Erik     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
     94  11042      Erik     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
     95   2535  sangeeta {
     96  11042      Erik 	ire_t *ire;
     97   2535  sangeeta 	struct rt_sockaddr rdst, rmask;
     98   2535  sangeeta 	struct rt_entry *rt;
     99   2535  sangeeta 	ire_ftable_args_t margs;
    100   2535  sangeeta 
    101  11042      Erik 	ASSERT(ill == NULL || !ill->ill_isv6);
    102   2535  sangeeta 
    103   2535  sangeeta 	/*
    104  11042      Erik 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
    105  11042      Erik 	 * is set.
    106   2535  sangeeta 	 */
    107  11042      Erik 	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
    108   2535  sangeeta 		return (NULL);
    109   2535  sangeeta 
    110  11131      Erik 	bzero(&rdst, sizeof (rdst));
    111   2535  sangeeta 	rdst.rt_sin_len = sizeof (rdst);
    112   2535  sangeeta 	rdst.rt_sin_family = AF_INET;
    113   2535  sangeeta 	rdst.rt_sin_addr.s_addr = addr;
    114   2535  sangeeta 
    115  11131      Erik 	bzero(&rmask, sizeof (rmask));
    116   2535  sangeeta 	rmask.rt_sin_len = sizeof (rmask);
    117   2535  sangeeta 	rmask.rt_sin_family = AF_INET;
    118   2535  sangeeta 	rmask.rt_sin_addr.s_addr = mask;
    119   2535  sangeeta 
    120  11131      Erik 	bzero(&margs, sizeof (margs));
    121   2535  sangeeta 	margs.ift_addr = addr;
    122   2535  sangeeta 	margs.ift_mask = mask;
    123   2535  sangeeta 	margs.ift_gateway = gateway;
    124   2535  sangeeta 	margs.ift_type = type;
    125  11042      Erik 	margs.ift_ill = ill;
    126   2535  sangeeta 	margs.ift_zoneid = zoneid;
    127   2535  sangeeta 	margs.ift_tsl = tsl;
    128   2535  sangeeta 	margs.ift_flags = flags;
    129   2535  sangeeta 
    130   2535  sangeeta 	/*
    131   2535  sangeeta 	 * The flags argument passed to ire_ftable_lookup may cause the
    132   2535  sangeeta 	 * search to return, not the longest matching prefix, but the
    133   2535  sangeeta 	 * "best matching prefix", i.e., the longest prefix that also
    134   2535  sangeeta 	 * satisfies constraints imposed via the permutation of flags
    135   2535  sangeeta 	 * passed in. To achieve this, we invoke ire_match_args() on
    136   2535  sangeeta 	 * each matching leaf in the  radix tree. ire_match_args is
    137   2535  sangeeta 	 * invoked by the callback function ire_find_best_route()
    138   2535  sangeeta 	 * We hold the global tree lock in read mode when calling
    139  11042      Erik 	 * rn_match_args. Before dropping the global tree lock, ensure
    140   2535  sangeeta 	 * that the radix node can't be deleted by incrementing ire_refcnt.
    141   2535  sangeeta 	 */
    142   3448  dh155122 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    143   3448  dh155122 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
    144   3448  dh155122 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
    145   2535  sangeeta 	ire = margs.ift_best_ire;
    146   2535  sangeeta 	if (rt == NULL) {
    147  11042      Erik 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    148   2535  sangeeta 		return (NULL);
    149   2535  sangeeta 	}
    150  11042      Erik 	ASSERT(ire != NULL);
    151   2535  sangeeta 
    152   2535  sangeeta 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
    153   2535  sangeeta 
    154   2535  sangeeta 	/*
    155   2535  sangeeta 	 * round-robin only if we have more than one route in the bucket.
    156  11042      Erik 	 * ips_ip_ecmp_behavior controls when we do ECMP
    157  11042      Erik 	 *	2:	always
    158  11042      Erik 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    159  11042      Erik 	 *	0:	never
    160   2535  sangeeta 	 */
    161  11042      Erik 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
    162  11042      Erik 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    163  11042      Erik 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    164  11042      Erik 		    IS_DEFAULT_ROUTE(ire))) {
    165  11042      Erik 			ire_t	*next_ire;
    166   2535  sangeeta 
    167  11042      Erik 			margs.ift_best_ire = NULL;
    168  11042      Erik 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    169  11042      Erik 			    xmit_hint, ire, ipst);
    170  11042      Erik 			if (next_ire == NULL) {
    171  11042      Erik 				/* keep ire if next_ire is null */
    172  11042      Erik 				goto done;
    173  11042      Erik 			}
    174  11042      Erik 			ire_refrele(ire);
    175   2535  sangeeta 			ire = next_ire;
    176   2535  sangeeta 		}
    177   2535  sangeeta 	}
    178  11042      Erik 
    179  11042      Erik done:
    180  11042      Erik 	/* Return generation before dropping lock */
    181  11042      Erik 	if (generationp != NULL)
    182  11042      Erik 		*generationp = ire->ire_generation;
    183  11042      Erik 
    184  11042      Erik 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    185  11042      Erik 
    186  11042      Erik 	/*
    187  11042      Erik 	 * For shared-IP zones we need additional checks to what was
    188  11042      Erik 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
    189  11042      Erik 	 *
    190  11042      Erik 	 * When ip_restrict_interzone_loopback is set, then
    191  11042      Erik 	 * we ensure that IRE_LOCAL are only used for loopback
    192  11042      Erik 	 * between zones when the logical "Ethernet" would
    193  11042      Erik 	 * have looped them back. That is, if in the absense of
    194  11042      Erik 	 * the IRE_LOCAL we would have sent to packet out the
    195  11042      Erik 	 * same ill.
    196  11042      Erik 	 */
    197  11042      Erik 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
    198  11042      Erik 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
    199  11042      Erik 	    ipst->ips_ip_restrict_interzone_loopback) {
    200  11042      Erik 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
    201  11042      Erik 		ASSERT(ire != NULL);
    202   2535  sangeeta 	}
    203   2535  sangeeta 	return (ire);
    204   2535  sangeeta }
    205   2535  sangeeta 
    206   8275      Eric /*
    207   8275      Eric  * This function is called by
    208  11042      Erik  * ip_input/ire_route_recursive when doing a route lookup on only the
    209  11042      Erik  * destination address.
    210  11042      Erik  *
    211   8275      Eric  * The optimizations of this function over ire_ftable_lookup are:
    212   8275      Eric  *	o removing unnecessary flag matching
    213   8275      Eric  *	o doing longest prefix match instead of overloading it further
    214   8275      Eric  *	  with the unnecessary "best_prefix_match"
    215  11042      Erik  *
    216  11042      Erik  * If no route is found we return IRE_NOROUTE.
    217   8275      Eric  */
    218  11042      Erik ire_t *
    219  11042      Erik ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
    220  11042      Erik     uint_t *generationp)
    221   8275      Eric {
    222  11042      Erik 	ire_t *ire;
    223   8275      Eric 	struct rt_sockaddr rdst;
    224   8275      Eric 	struct rt_entry *rt;
    225  11042      Erik 	irb_t *irb;
    226   8275      Eric 
    227   8275      Eric 	rdst.rt_sin_len = sizeof (rdst);
    228   8275      Eric 	rdst.rt_sin_family = AF_INET;
    229   8275      Eric 	rdst.rt_sin_addr.s_addr = addr;
    230   8275      Eric 
    231   8275      Eric 	/*
    232   8275      Eric 	 * This is basically inlining  a simpler version of ire_match_args
    233   8275      Eric 	 */
    234   8275      Eric 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    235   8275      Eric 
    236   8275      Eric 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
    237   8275      Eric 	    ipst->ips_ip_ftable, NULL, NULL);
    238   8275      Eric 
    239  11042      Erik 	if (rt == NULL)
    240  11042      Erik 		goto bad;
    241  11042      Erik 
    242  11042      Erik 	irb = &rt->rt_irb;
    243  11042      Erik 	if (irb->irb_ire_cnt == 0)
    244  11042      Erik 		goto bad;
    245  11042      Erik 
    246  11042      Erik 	rw_enter(&irb->irb_lock, RW_READER);
    247  11042      Erik 	ire = irb->irb_ire;
    248  11042      Erik 	if (ire == NULL) {
    249  11042      Erik 		rw_exit(&irb->irb_lock);
    250  11042      Erik 		goto bad;
    251   8275      Eric 	}
    252  11042      Erik 	while (IRE_IS_CONDEMNED(ire)) {
    253  11042      Erik 		ire = ire->ire_next;
    254  11042      Erik 		if (ire == NULL) {
    255  11042      Erik 			rw_exit(&irb->irb_lock);
    256  11042      Erik 			goto bad;
    257  11042      Erik 		}
    258   8275      Eric 	}
    259   8275      Eric 
    260  11042      Erik 	/* we have a ire that matches */
    261  11042      Erik 	ire_refhold(ire);
    262  11042      Erik 	rw_exit(&irb->irb_lock);
    263  11042      Erik 
    264  11042      Erik 	/*
    265  11042      Erik 	 * round-robin only if we have more than one route in the bucket.
    266  11042      Erik 	 * ips_ip_ecmp_behavior controls when we do ECMP
    267  11042      Erik 	 *	2:	always
    268  11042      Erik 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    269  11042      Erik 	 *	0:	never
    270  11042      Erik 	 *
    271  11042      Erik 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
    272  11042      Erik 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
    273  11042      Erik 	 * and the IRE_INTERFACESs are likely to be shorter matches.
    274  11042      Erik 	 */
    275  11042      Erik 	if (ire->ire_bucket->irb_ire_cnt > 1) {
    276  11042      Erik 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    277  11042      Erik 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    278  11042      Erik 		    IS_DEFAULT_ROUTE(ire))) {
    279  11042      Erik 			ire_t	*next_ire;
    280  11042      Erik 			ire_ftable_args_t margs;
    281  11042      Erik 
    282  11131      Erik 			bzero(&margs, sizeof (margs));
    283  11042      Erik 			margs.ift_addr = addr;
    284  11042      Erik 			margs.ift_zoneid = ALL_ZONES;
    285  11042      Erik 
    286  11042      Erik 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    287  11042      Erik 			    xmit_hint, ire, ipst);
    288  11042      Erik 			if (next_ire == NULL) {
    289  11042      Erik 				/* keep ire if next_ire is null */
    290  11042      Erik 				if (generationp != NULL)
    291  11042      Erik 					*generationp = ire->ire_generation;
    292  11042      Erik 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    293  11042      Erik 				return (ire);
    294  11042      Erik 			}
    295  11042      Erik 			ire_refrele(ire);
    296  11042      Erik 			ire = next_ire;
    297  11042      Erik 		}
    298   8275      Eric 	}
    299  11042      Erik 	/* Return generation before dropping lock */
    300  11042      Erik 	if (generationp != NULL)
    301  11042      Erik 		*generationp = ire->ire_generation;
    302   8275      Eric 
    303   8275      Eric 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    304   8275      Eric 
    305   8275      Eric 	/*
    306  11042      Erik 	 * Since we only did ALL_ZONES matches there is no special handling
    307  11042      Erik 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
    308   8275      Eric 	 */
    309  11042      Erik 	return (ire);
    310   8275      Eric 
    311  11042      Erik bad:
    312  11042      Erik 	if (generationp != NULL)
    313  11042      Erik 		*generationp = IRE_GENERATION_VERIFY;
    314   8275      Eric 
    315  11042      Erik 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    316  11042      Erik 	return (ire_reject(ipst, B_FALSE));
    317   8275      Eric }
    318   2535  sangeeta 
    319   2535  sangeeta /*
    320  11042      Erik  * Find the ill matching a multicast group.
    321   2535  sangeeta  * Allows different routes for multicast addresses
    322   2535  sangeeta  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
    323   2535  sangeeta  * which point at different interfaces. This is used when IP_MULTICAST_IF
    324   2535  sangeeta  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
    325   2535  sangeeta  * specify the interface to join on.
    326   2535  sangeeta  *
    327  11042      Erik  * Supports link-local addresses by using ire_route_recursive which follows
    328  11042      Erik  * the ill when recursing.
    329  11042      Erik  *
    330  11042      Erik  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
    331  11042      Erik  * and the MULTIRT property can be different for different groups, we
    332  11042      Erik  * extract RTF_MULTIRT from the special unicast route added for a group
    333  11042      Erik  * with CGTP and pass that back in the multirtp argument.
    334  11042      Erik  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
    335  11042      Erik  * We have a setsrcp argument for the same reason.
    336   2535  sangeeta  */
    337  11042      Erik ill_t *
    338  11042      Erik ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
    339  11042      Erik     boolean_t *multirtp, ipaddr_t *setsrcp)
    340   2535  sangeeta {
    341   2535  sangeeta 	ire_t	*ire;
    342  11042      Erik 	ill_t	*ill;
    343   2535  sangeeta 
    344  11042      Erik 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
    345  11042      Erik 	    MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
    346  11042      Erik 	ASSERT(ire != NULL);
    347  11042      Erik 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    348   2535  sangeeta 		ire_refrele(ire);
    349   2535  sangeeta 		return (NULL);
    350   2535  sangeeta 	}
    351  11042      Erik 
    352  11042      Erik 	if (multirtp != NULL)
    353  11042      Erik 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
    354  11042      Erik 
    355  11042      Erik 	ill = ire_nexthop_ill(ire);
    356  11042      Erik 	ire_refrele(ire);
    357  11042      Erik 	return (ill);
    358   2535  sangeeta }
    359   2535  sangeeta 
    360   2535  sangeeta /*
    361   2535  sangeeta  * Delete the passed in ire if the gateway addr matches
    362   2535  sangeeta  */
    363   2535  sangeeta void
    364   2535  sangeeta ire_del_host_redir(ire_t *ire, char *gateway)
    365   2535  sangeeta {
    366   3004  dd193516 	if ((ire->ire_flags & RTF_DYNAMIC) &&
    367   2535  sangeeta 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
    368   2535  sangeeta 		ire_delete(ire);
    369   2535  sangeeta }
    370   2535  sangeeta 
    371   2535  sangeeta /*
    372  11042      Erik  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
    373   2535  sangeeta  * pointing at the specified gateway and
    374   2535  sangeeta  * delete them. This routine is called only
    375   2535  sangeeta  * when a default gateway is going away.
    376   2535  sangeeta  */
    377   2535  sangeeta void
    378   3448  dh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
    379   2535  sangeeta {
    380   2535  sangeeta 	struct rtfuncarg rtfarg;
    381   2535  sangeeta 
    382  11131      Erik 	bzero(&rtfarg, sizeof (rtfarg));
    383   2535  sangeeta 	rtfarg.rt_func = ire_del_host_redir;
    384   2535  sangeeta 	rtfarg.rt_arg = (void *)&gateway;
    385  11131      Erik 	rtfarg.rt_zoneid = ALL_ZONES;
    386  11131      Erik 	rtfarg.rt_ipst = ipst;
    387   3448  dh155122 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
    388   3448  dh155122 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
    389   2535  sangeeta }
    390   2535  sangeeta 
    391   2535  sangeeta /*
    392   3448  dh155122  * Obtain the rt_entry and rt_irb for the route to be added to
    393   3448  dh155122  * the ips_ip_ftable.
    394   2535  sangeeta  * First attempt to add a node to the radix tree via rn_addroute. If the
    395   2535  sangeeta  * route already exists, return the bucket for the existing route.
    396   2535  sangeeta  *
    397   2535  sangeeta  * Locking notes: Need to hold the global radix tree lock in write mode to
    398   2535  sangeeta  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
    399   2535  sangeeta  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
    400   2535  sangeeta  * while holding the irb_lock, but not the radix tree lock.
    401   2535  sangeeta  */
    402   2535  sangeeta irb_t *
    403   2535  sangeeta ire_get_bucket(ire_t *ire)
    404   2535  sangeeta {
    405   2535  sangeeta 	struct radix_node *rn;
    406   2535  sangeeta 	struct rt_entry *rt;
    407   2535  sangeeta 	struct rt_sockaddr rmask, rdst;
    408   2535  sangeeta 	irb_t *irb = NULL;
    409   3448  dh155122 	ip_stack_t *ipst = ire->ire_ipst;
    410   2535  sangeeta 
    411   3448  dh155122 	ASSERT(ipst->ips_ip_ftable != NULL);
    412   2535  sangeeta 
    413   2535  sangeeta 	/* first try to see if route exists (based on rtalloc1) */
    414  11131      Erik 	bzero(&rdst, sizeof (rdst));
    415   2535  sangeeta 	rdst.rt_sin_len = sizeof (rdst);
    416   2535  sangeeta 	rdst.rt_sin_family = AF_INET;
    417   2535  sangeeta 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
    418   2535  sangeeta 
    419  11131      Erik 	bzero(&rmask, sizeof (rmask));
    420   2535  sangeeta 	rmask.rt_sin_len = sizeof (rmask);
    421   2535  sangeeta 	rmask.rt_sin_family = AF_INET;
    422   2535  sangeeta 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
    423   2535  sangeeta 
    424   2535  sangeeta 	/*
    425   2535  sangeeta 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
    426   2535  sangeeta 	 */
    427   2535  sangeeta 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
    428   5090  sangeeta 	/* kmem_alloc failed */
    429   5090  sangeeta 	if (rt == NULL)
    430   5090  sangeeta 		return (NULL);
    431   5090  sangeeta 
    432  11131      Erik 	bzero(rt, sizeof (*rt));
    433   2535  sangeeta 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
    434   2535  sangeeta 	rt->rt_dst = rdst;
    435   2535  sangeeta 	irb = &rt->rt_irb;
    436  11042      Erik 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
    437   3448  dh155122 	irb->irb_ipst = ipst;
    438   2535  sangeeta 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
    439   3448  dh155122 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
    440   3448  dh155122 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
    441   3448  dh155122 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
    442   2535  sangeeta 	if (rn == NULL) {
    443   3448  dh155122 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    444   2535  sangeeta 		Free(rt, rt_entry_cache);
    445   2535  sangeeta 		rt = NULL;
    446   2535  sangeeta 		irb = NULL;
    447   3448  dh155122 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    448   3448  dh155122 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
    449   3448  dh155122 		    ipst->ips_ip_ftable);
    450   3448  dh155122 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
    451   2535  sangeeta 			/* found a non-root match */
    452   2535  sangeeta 			rt = (struct rt_entry *)rn;
    453   2535  sangeeta 		}
    454   2535  sangeeta 	}
    455   2535  sangeeta 	if (rt != NULL) {
    456   2535  sangeeta 		irb = &rt->rt_irb;
    457  11042      Erik 		irb_refhold(irb);
    458   2535  sangeeta 	}
    459   3448  dh155122 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    460   2535  sangeeta 	return (irb);
    461   2535  sangeeta }
    462   2535  sangeeta 
    463   2535  sangeeta /*
    464   2535  sangeeta  * This function is used when the caller wants to know the outbound
    465   2535  sangeeta  * interface for a packet given only the address.
    466   2535  sangeeta  * If this is a offlink IP address and there are multiple
    467   2535  sangeeta  * routes to this destination, this routine will utilise the
    468   2535  sangeeta  * first route it finds to IP address
    469   2535  sangeeta  * Return values:
    470   2535  sangeeta  * 	0	- FAILURE
    471   2535  sangeeta  *	nonzero	- ifindex
    472   2535  sangeeta  */
    473   2535  sangeeta uint_t
    474   2535  sangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
    475   2535  sangeeta {
    476   2535  sangeeta 	uint_t ifindex = 0;
    477   2535  sangeeta 	ire_t *ire;
    478   2535  sangeeta 	ill_t *ill;
    479   3448  dh155122 	netstack_t *ns;
    480   3448  dh155122 	ip_stack_t *ipst;
    481   2535  sangeeta 
    482   3448  dh155122 	if (zoneid == ALL_ZONES)
    483   3448  dh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
    484   3448  dh155122 	else
    485   3448  dh155122 		ns = netstack_find_by_zoneid(zoneid);
    486   3448  dh155122 	ASSERT(ns != NULL);
    487   3448  dh155122 
    488   3448  dh155122 	/*
    489   3448  dh155122 	 * For exclusive stacks we set the zoneid to zero
    490   3448  dh155122 	 * since IP uses the global zoneid in the exclusive stacks.
    491   3448  dh155122 	 */
    492   3448  dh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    493   3448  dh155122 		zoneid = GLOBAL_ZONEID;
    494   3448  dh155122 	ipst = ns->netstack_ip;
    495   2535  sangeeta 
    496   2535  sangeeta 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
    497   2535  sangeeta 
    498  11042      Erik 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
    499  11042      Erik 		ill = ire_nexthop_ill(ire);
    500  11042      Erik 		if (ill != NULL) {
    501   2535  sangeeta 			ifindex = ill->ill_phyint->phyint_ifindex;
    502  11042      Erik 			ill_refrele(ill);
    503  11042      Erik 		}
    504   2535  sangeeta 		ire_refrele(ire);
    505   2535  sangeeta 	}
    506   3448  dh155122 	netstack_rele(ns);
    507   2535  sangeeta 	return (ifindex);
    508   2535  sangeeta }
    509   2535  sangeeta 
    510   2535  sangeeta /*
    511   2535  sangeeta  * Routine to find the route to a destination. If a ifindex is supplied
    512  11042      Erik  * it tries to match the route to the corresponding ipif for the ifindex
    513   2535  sangeeta  */
    514   2535  sangeeta static	ire_t *
    515   3448  dh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
    516   2535  sangeeta {
    517   2535  sangeeta 	ire_t *ire = NULL;
    518   2535  sangeeta 	int match_flags;
    519   2535  sangeeta 
    520  11042      Erik 	match_flags = MATCH_IRE_DSTONLY;
    521   2535  sangeeta 
    522   2535  sangeeta 	/* XXX pass NULL tsl for now */
    523   2535  sangeeta 
    524   2535  sangeeta 	if (dst_addr->sa_family == AF_INET) {
    525  11042      Erik 		ire = ire_route_recursive_v4(
    526  11042      Erik 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
    527  11042      Erik 		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
    528  11042      Erik 		    NULL);
    529   2535  sangeeta 	} else {
    530  11042      Erik 		ire = ire_route_recursive_v6(
    531  11042      Erik 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
    532  11042      Erik 		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
    533  11042      Erik 		    NULL);
    534  11042      Erik 	}
    535  11042      Erik 	ASSERT(ire != NULL);
    536  11042      Erik 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    537  11042      Erik 		ire_refrele(ire);
    538  11042      Erik 		return (NULL);
    539   2535  sangeeta 	}
    540   2535  sangeeta 	return (ire);
    541   2535  sangeeta }
    542   2535  sangeeta 
    543   2535  sangeeta /*
    544   2535  sangeeta  * This routine is called by IP Filter to send a packet out on the wire
     545  11042      Erik  * to a specified destination (which may be onlink or offlink). The ifindex
     546  11042      Erik  * may or may not be 0. A nonzero ifindex indicates IP Filter has stipulated
    547   2535  sangeeta  * an outgoing interface and requires the nexthop to be on that interface.
    548   4482  dr146992  * IP WILL NOT DO the following to the data packet before sending it out:
    549   2535  sangeeta  *	a. manipulate ttl
    550   4482  dr146992  *	b. ipsec work
    551   4482  dr146992  *	c. fragmentation
    552   4482  dr146992  *
    553   4482  dr146992  * If the packet has been prepared for hardware checksum then it will be
    554   4482  dr146992  * passed off to ip_send_align_cksum() to check that the flags set on the
    555   4482  dr146992  * packet are in alignment with the capabilities of the new outgoing NIC.
    556   2535  sangeeta  *
    557   2535  sangeeta  * Return values:
     558   2535  sangeeta  *	0:		IP was able to send off the data pkt
    559   2535  sangeeta  *	ECOMM:		Could not send packet
    560   2535  sangeeta  *	ENONET		No route to dst. It is up to the caller
    561   2535  sangeeta  *			to send icmp unreachable error message,
    562   2535  sangeeta  *	EINPROGRESS	The macaddr of the onlink dst or that
    563   2535  sangeeta  *			of the offlink dst's nexthop needs to get
    564   2535  sangeeta  *			resolved before packet can be sent to dst.
    565   2535  sangeeta  *			Thus transmission is not guaranteed.
    566  11042      Erik  *			Note: No longer have visibility to the ARP queue
    567  11042      Erik  *			hence no EINPROGRESS.
    568   2535  sangeeta  */
    569   2535  sangeeta int
    570   2535  sangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
    571   2535  sangeeta     zoneid_t zoneid)
    572   2535  sangeeta {
    573  11042      Erik 	ipaddr_t nexthop;
    574   3448  dh155122 	netstack_t *ns;
    575   3448  dh155122 	ip_stack_t *ipst;
    576  11042      Erik 	ip_xmit_attr_t ixas;
    577  11042      Erik 	int error;
    578   2535  sangeeta 
    579   2535  sangeeta 	ASSERT(mp != NULL);
    580   3448  dh155122 
    581   3448  dh155122 	if (zoneid == ALL_ZONES)
    582   3448  dh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
    583   3448  dh155122 	else
    584   3448  dh155122 		ns = netstack_find_by_zoneid(zoneid);
    585   3448  dh155122 	ASSERT(ns != NULL);
    586   3448  dh155122 
    587   3448  dh155122 	/*
    588   3448  dh155122 	 * For exclusive stacks we set the zoneid to zero
    589   3448  dh155122 	 * since IP uses the global zoneid in the exclusive stacks.
    590   3448  dh155122 	 */
    591   3448  dh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    592   3448  dh155122 		zoneid = GLOBAL_ZONEID;
    593   3448  dh155122 	ipst = ns->netstack_ip;
    594   2535  sangeeta 
    595   2535  sangeeta 	ASSERT(dst_addr->sa_family == AF_INET ||
    596   2535  sangeeta 	    dst_addr->sa_family == AF_INET6);
    597   2535  sangeeta 
    598  11042      Erik 	bzero(&ixas, sizeof (ixas));
    599  11042      Erik 	/*
    600  11042      Erik 	 * No IPsec, no fragmentation, and don't let any hooks see
    601  11042      Erik 	 * the packet.
    602  11042      Erik 	 */
    603  11042      Erik 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
    604  11042      Erik 	ixas.ixa_cred = kcred;
    605  11042      Erik 	ixas.ixa_cpid = NOPID;
    606  11042      Erik 	ixas.ixa_tsl = NULL;
    607  11042      Erik 	ixas.ixa_ipst = ipst;
    608  11042      Erik 	ixas.ixa_ifindex = ifindex;
    609  11042      Erik 
    610   2535  sangeeta 	if (dst_addr->sa_family == AF_INET) {
    611  11042      Erik 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
    612  11042      Erik 
    613  11042      Erik 		ixas.ixa_flags |= IXAF_IS_IPV4;
    614  11042      Erik 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
    615  11042      Erik 		if (nexthop != ipha->ipha_dst) {
    616  11042      Erik 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
    617  11042      Erik 			ixas.ixa_nexthop_v4 = nexthop;
    618  11042      Erik 		}
    619  11042      Erik 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
    620   2535  sangeeta 	} else {
    621  11042      Erik 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
    622  11042      Erik 		in6_addr_t *nexthop6;
    623  11042      Erik 
    624  11042      Erik 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
    625  11042      Erik 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
    626  11042      Erik 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
    627  11042      Erik 			ixas.ixa_nexthop_v6 = *nexthop6;
    628  11042      Erik 		}
    629  11042      Erik 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
    630   2535  sangeeta 	}
    631  11042      Erik 	error = ip_output_simple(mp, &ixas);
    632  11042      Erik 	ixa_cleanup(&ixas);
    633   2535  sangeeta 
    634  11042      Erik 	netstack_rele(ns);
    635  11042      Erik 	switch (error) {
    636  11042      Erik 	case 0:
    637  11042      Erik 		break;
    638   2535  sangeeta 
    639  11042      Erik 	case EHOSTUNREACH:
    640  11042      Erik 	case ENETUNREACH:
    641  11042      Erik 		error = ENONET;
    642  11042      Erik 		break;
    643   2535  sangeeta 
    644  11042      Erik 	default:
    645  11042      Erik 		error = ECOMM;
    646   2535  sangeeta 		break;
    647   2535  sangeeta 	}
    648  11042      Erik 	return (error);
    649   4482  dr146992 }
    650   4482  dr146992 
    651   2535  sangeeta /*
    652   2535  sangeeta  * callback function provided by ire_ftable_lookup when calling
    653   2535  sangeeta  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
    654   2535  sangeeta  * the radix tree.
    655   2535  sangeeta  */
    656   2535  sangeeta boolean_t
    657   2535  sangeeta ire_find_best_route(struct radix_node *rn, void *arg)
    658   2535  sangeeta {
    659   2535  sangeeta 	struct rt_entry *rt = (struct rt_entry *)rn;
    660   2535  sangeeta 	irb_t *irb_ptr;
    661   2535  sangeeta 	ire_t *ire;
    662   2535  sangeeta 	ire_ftable_args_t *margs = arg;
    663   2535  sangeeta 	ipaddr_t match_mask;
    664   2535  sangeeta 
    665   2535  sangeeta 	ASSERT(rt != NULL);
    666   2535  sangeeta 
    667   2535  sangeeta 	irb_ptr = &rt->rt_irb;
    668   2535  sangeeta 
    669   2535  sangeeta 	if (irb_ptr->irb_ire_cnt == 0)
    670   2535  sangeeta 		return (B_FALSE);
    671   2535  sangeeta 
    672   2535  sangeeta 	rw_enter(&irb_ptr->irb_lock, RW_READER);
    673   2535  sangeeta 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
    674  11042      Erik 		if (IRE_IS_CONDEMNED(ire))
    675   2535  sangeeta 			continue;
    676  11042      Erik 		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
    677   2535  sangeeta 			match_mask = margs->ift_mask;
    678   2535  sangeeta 		else
    679   2535  sangeeta 			match_mask = ire->ire_mask;
    680   2535  sangeeta 
    681   2535  sangeeta 		if (ire_match_args(ire, margs->ift_addr, match_mask,
    682  11042      Erik 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
    683  11042      Erik 		    margs->ift_zoneid, margs->ift_tsl,
    684  11042      Erik 		    margs->ift_flags)) {
    685  11042      Erik 			ire_refhold(ire);
    686   2535  sangeeta 			rw_exit(&irb_ptr->irb_lock);
    687   2535  sangeeta 			margs->ift_best_ire = ire;
    688   2535  sangeeta 			return (B_TRUE);
    689   2535  sangeeta 		}
    690   2535  sangeeta 	}
    691   2535  sangeeta 	rw_exit(&irb_ptr->irb_lock);
    692   2535  sangeeta 	return (B_FALSE);
    693   2535  sangeeta }
    694   2535  sangeeta 
    695   2535  sangeeta /*
    696   2535  sangeeta  * ftable irb_t structures are dynamically allocated, and we need to
    697   2535  sangeeta  * check if the irb_t (and associated ftable tree attachment) needs to
    698   2535  sangeeta  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
    699   2535  sangeeta  * be verified are:
    700   2535  sangeeta  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
    701   2535  sangeeta  * - no other threads holding references to ire's in the bucket,
    702   2535  sangeeta  *   i.e., irb_nire == 0
    703   2535  sangeeta  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
    704   2535  sangeeta  * - need to hold the global tree lock and irb_lock in write mode.
    705   2535  sangeeta  */
    706   2535  sangeeta void
    707   2535  sangeeta irb_refrele_ftable(irb_t *irb)
    708   2535  sangeeta {
    709   2535  sangeeta 	for (;;) {
    710   2535  sangeeta 		rw_enter(&irb->irb_lock, RW_WRITER);
    711   2535  sangeeta 		ASSERT(irb->irb_refcnt != 0);
    712   2535  sangeeta 		if (irb->irb_refcnt != 1) {
    713   2535  sangeeta 			/*
    714   2535  sangeeta 			 * Someone has a reference to this radix node
    715   2535  sangeeta 			 * or there is some bucket walker.
    716   2535  sangeeta 			 */
    717   2535  sangeeta 			irb->irb_refcnt--;
    718   2535  sangeeta 			rw_exit(&irb->irb_lock);
    719   2535  sangeeta 			return;
    720   2535  sangeeta 		} else {
    721   2535  sangeeta 			/*
    722   2535  sangeeta 			 * There is no other walker, nor is there any
    723   2535  sangeeta 			 * other thread that holds a direct ref to this
    724   2535  sangeeta 			 * radix node. Do the clean up if needed. Call
    725   2535  sangeeta 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
    726   2535  sangeeta 			 */
    727   2535  sangeeta 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
    728   2535  sangeeta 				ire_t *ire_list;
    729   2535  sangeeta 
    730   2535  sangeeta 				ire_list = ire_unlink(irb);
    731   2535  sangeeta 				rw_exit(&irb->irb_lock);
    732   2535  sangeeta 
    733   2535  sangeeta 				if (ire_list != NULL)
    734   2535  sangeeta 					ire_cleanup(ire_list);
    735   2535  sangeeta 				/*
    736   2535  sangeeta 				 * more CONDEMNED entries could have
    737   2535  sangeeta 				 * been added while we dropped the lock,
    738   2535  sangeeta 				 * so we have to re-check.
    739   2535  sangeeta 				 */
    740   2535  sangeeta 				continue;
    741   2535  sangeeta 			}
    742   2535  sangeeta 
    743   2535  sangeeta 			/*
    744   2535  sangeeta 			 * Now check if there are still any ires
    745   2535  sangeeta 			 * associated with this radix node.
    746   2535  sangeeta 			 */
    747   2535  sangeeta 			if (irb->irb_nire != 0) {
    748   2535  sangeeta 				/*
    749   2535  sangeeta 				 * someone is still holding on
    750   2535  sangeeta 				 * to ires in this bucket
    751   2535  sangeeta 				 */
    752   2535  sangeeta 				irb->irb_refcnt--;
    753   2535  sangeeta 				rw_exit(&irb->irb_lock);
    754   2535  sangeeta 				return;
    755   2535  sangeeta 			} else {
    756   2535  sangeeta 				/*
    757   2535  sangeeta 				 * Everything is clear. Zero walkers,
    758   2535  sangeeta 				 * Zero threads with a ref to this
    759   2535  sangeeta 				 * radix node, Zero ires associated with
    760   2535  sangeeta 				 * this radix node. Due to lock order,
    761   2535  sangeeta 				 * check the above conditions again
    762   2535  sangeeta 				 * after grabbing all locks in the right order
    763   2535  sangeeta 				 */
    764   2535  sangeeta 				rw_exit(&irb->irb_lock);
    765   2535  sangeeta 				if (irb_inactive(irb))
    766   2535  sangeeta 					return;
    767   2535  sangeeta 				/*
    768   2535  sangeeta 				 * irb_inactive could not free the irb.
    769   2535  sangeeta 				 * See if there are any walkers, if not
    770   2535  sangeeta 				 * try to clean up again.
    771   2535  sangeeta 				 */
    772   2535  sangeeta 			}
    773   2535  sangeeta 		}
    774   2535  sangeeta 	}
    775   2535  sangeeta }
    776   2535  sangeeta 
    777   2535  sangeeta /*
    778  11042      Erik  * IRE iterator used by ire_ftable_lookup to process multiple equal
    779  11042      Erik  * routes. Given a starting point in the hash list (hash), walk the IREs
    780  11042      Erik  * in the bucket skipping deleted entries. We treat the bucket as a circular
    781  11042      Erik  * list for the purposes of walking it.
    782  11042      Erik  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
    783  11042      Erik  * not applicable (ire_match_args failed) then it returns a subsequent one.
    784  11042      Erik  * If we fail to find an IRE we return NULL.
    785   2535  sangeeta  *
    786  11042      Erik  * Assumes that the caller holds a reference on the IRE bucket and a read lock
    787  11042      Erik  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
    788  11042      Erik  *
    789  11042      Erik  * Applies to IPv4 and IPv6.
    790  11042      Erik  *
    791  11042      Erik  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
    792  11042      Erik  * address and bucket, we compare against ire_type for the orig_ire. We also
    793  11042      Erik  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
    794  11131      Erik  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
    795  11042      Erik  *
    796  11042      Erik  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
    797  11042      Erik  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
    798  11042      Erik  * in which the zone has an IP address. We check this for the global zone
    799  11042      Erik  * even if no shared-IP zones are configured.
    800   2535  sangeeta  */
    801   2535  sangeeta ire_t *
    802  11042      Erik ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
    803  11042      Erik     ire_t *orig_ire, ip_stack_t *ipst)
    804   2535  sangeeta {
    805  11042      Erik 	ire_t		*ire, *maybe_ire = NULL;
    806  11042      Erik 	uint_t		maybe_badcnt;
    807  11042      Erik 	uint_t		maxwalk;
    808  11042      Erik 
    809  11042      Erik 	/* Fold in more bits from the hint/hash */
    810  11042      Erik 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
    811   2535  sangeeta 
    812   2535  sangeeta 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
    813  11042      Erik 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
    814  11042      Erik 	hash %= maxwalk;
    815  11042      Erik 	irb_refhold_locked(irb_ptr);
    816   2535  sangeeta 	rw_exit(&irb_ptr->irb_lock);
    817   2535  sangeeta 
    818   2535  sangeeta 	/*
    819   2535  sangeeta 	 * Round-robin the routers list looking for a route that
    820   2535  sangeeta 	 * matches the passed in parameters.
    821  11042      Erik 	 * First we skip "hash" number of non-condemned IREs.
    822  11042      Erik 	 * Then we match the IRE.
    823  11042      Erik 	 * If we find an ire which has a non-zero ire_badcnt then we remember
    824  11042      Erik 	 * it and keep on looking for a lower ire_badcnt.
    825  11042      Erik 	 * If we come to the end of the list we continue (treat the
    826  11042      Erik 	 * bucket list as a circular list) but we match less than "max"
    827  11042      Erik 	 * entries.
    828   2535  sangeeta 	 */
    829  11042      Erik 	ire = irb_ptr->irb_ire;
    830  11042      Erik 	while (maxwalk > 0) {
    831  11042      Erik 		if (IRE_IS_CONDEMNED(ire))
    832  11042      Erik 			goto next_ire_skip;
    833   2535  sangeeta 
    834  11042      Erik 		/* Skip the first "hash" entries to do ECMP */
    835  11042      Erik 		if (hash != 0) {
    836  11042      Erik 			hash--;
    837  11042      Erik 			goto next_ire_skip;
    838  11042      Erik 		}
    839  11042      Erik 
    840  11042      Erik 		/* See CGTP comment above */
    841  11042      Erik 		if (ire->ire_type != orig_ire->ire_type ||
    842  11131      Erik 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
    843   2535  sangeeta 			goto next_ire;
    844   2535  sangeeta 
    845  11042      Erik 		/*
    846  11042      Erik 		 * Note: Since IPv6 has hash buckets instead of radix
    847  11042      Erik 		 * buckers we need to explicitly compare the addresses.
    848  11042      Erik 		 * That makes this less efficient since we will be called
    849  11042      Erik 		 * even if there is no alternatives just because the
    850  11042      Erik 		 * bucket has multiple IREs for different addresses.
    851  11042      Erik 		 */
    852  11042      Erik 		if (ire->ire_ipversion == IPV6_VERSION) {
    853  11042      Erik 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
    854  11042      Erik 			    &ire->ire_addr_v6))
    855  11042      Erik 				goto next_ire;
    856  11042      Erik 		}
    857  11042      Erik 
    858  11042      Erik 		/*
    859  11042      Erik 		 * For some reason find_best_route uses ire_mask. We do
    860  11042      Erik 		 * the same.
    861  11042      Erik 		 */
    862  11042      Erik 		if (ire->ire_ipversion == IPV4_VERSION ?
    863  11042      Erik 		    !ire_match_args(ire, margs->ift_addr,
    864  11042      Erik 		    ire->ire_mask, margs->ift_gateway,
    865  11042      Erik 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
    866  11042      Erik 		    margs->ift_tsl, margs->ift_flags) :
    867  11042      Erik 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
    868  11042      Erik 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
    869  11042      Erik 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
    870  11042      Erik 		    margs->ift_tsl, margs->ift_flags))
    871   2535  sangeeta 			goto next_ire;
    872   2535  sangeeta 
    873  11042      Erik 		if (margs->ift_zoneid != ALL_ZONES &&
    874  11042      Erik 		    (ire->ire_type & IRE_OFFLINK)) {
    875   2535  sangeeta 			/*
    876  11042      Erik 			 * When we're in a zone, we're only
    877  11042      Erik 			 * interested in routers that are
    878  11042      Erik 			 * reachable through ipifs within our zone.
    879   2535  sangeeta 			 */
    880  11042      Erik 			if (ire->ire_ipversion == IPV4_VERSION) {
    881  11042      Erik 				if (!ire_gateway_ok_zone_v4(
    882  11042      Erik 				    ire->ire_gateway_addr, margs->ift_zoneid,
    883  11042      Erik 				    ire->ire_ill, margs->ift_tsl, ipst,
    884  11042      Erik 				    B_TRUE))
    885  11042      Erik 					goto next_ire;
    886  11042      Erik 			} else {
    887  11042      Erik 				if (!ire_gateway_ok_zone_v6(
    888  11042      Erik 				    &ire->ire_gateway_addr_v6,
    889  11042      Erik 				    margs->ift_zoneid, ire->ire_ill,
    890  11042      Erik 				    margs->ift_tsl, ipst, B_TRUE))
    891  11042      Erik 					goto next_ire;
    892  11042      Erik 			}
    893   2535  sangeeta 		}
    894  11042      Erik 		mutex_enter(&ire->ire_lock);
    895  11042      Erik 		/* Look for stale ire_badcnt and clear */
    896  11042      Erik 		if (ire->ire_badcnt != 0 &&
    897  11066    rafael 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
    898  11042      Erik 		    ipst->ips_ip_ire_badcnt_lifetime))
    899  11042      Erik 			ire->ire_badcnt = 0;
    900  11042      Erik 		mutex_exit(&ire->ire_lock);
    901   2535  sangeeta 
    902  11042      Erik 		if (ire->ire_badcnt == 0) {
    903  11042      Erik 			/* We found one with a zero badcnt; done */
    904  11042      Erik 			ire_refhold(ire);
    905  11042      Erik 			/*
    906  11042      Erik 			 * Care needed since irb_refrele grabs WLOCK to free
    907  11042      Erik 			 * the irb_t.
    908  11042      Erik 			 */
    909  11042      Erik 			if (ire->ire_ipversion == IPV4_VERSION) {
    910  11042      Erik 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    911  11042      Erik 				irb_refrele(irb_ptr);
    912  11042      Erik 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    913  11042      Erik 			} else {
    914  11042      Erik 				rw_exit(&ipst->ips_ip6_ire_head_lock);
    915  11042      Erik 				irb_refrele(irb_ptr);
    916  11042      Erik 				rw_enter(&ipst->ips_ip6_ire_head_lock,
    917  11042      Erik 				    RW_READER);
    918  11042      Erik 			}
    919   2535  sangeeta 			return (ire);
    920   2535  sangeeta 		}
    921   2535  sangeeta 		/*
    922  11042      Erik 		 * keep looking to see if there is a better (lower
    923  11042      Erik 		 * badcnt) matching IRE, but save this one as a last resort.
    924  11042      Erik 		 * If we find a lower badcnt pick that one as the last* resort.
    925   2535  sangeeta 		 */
    926  11042      Erik 		if (maybe_ire == NULL) {
    927  11042      Erik 			maybe_ire = ire;
    928  11042      Erik 			maybe_badcnt = ire->ire_badcnt;
    929  11042      Erik 		} else if (ire->ire_badcnt < maybe_badcnt) {
    930  11042      Erik 			maybe_ire = ire;
    931  11042      Erik 			maybe_badcnt = ire->ire_badcnt;
    932  11042      Erik 		}
    933   8485     Peter 
    934   2535  sangeeta next_ire:
    935  11042      Erik 		maxwalk--;
    936  11042      Erik next_ire_skip:
    937  11042      Erik 		ire = ire->ire_next;
    938  11042      Erik 		if (ire == NULL)
    939  11042      Erik 			ire = irb_ptr->irb_ire;
    940   2535  sangeeta 	}
    941   2535  sangeeta 	if (maybe_ire != NULL)
    942  11042      Erik 		ire_refhold(maybe_ire);
    943  11042      Erik 
    944  11042      Erik 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
    945  11042      Erik 	if (ire->ire_ipversion == IPV4_VERSION) {
    946  11042      Erik 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    947  11042      Erik 		irb_refrele(irb_ptr);
    948  11042      Erik 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    949  11042      Erik 	} else {
    950  11042      Erik 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    951  11042      Erik 		irb_refrele(irb_ptr);
    952  11042      Erik 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    953  11042      Erik 	}
    954   2535  sangeeta 	return (maybe_ire);
    955   2535  sangeeta }
    956   2783   sowmini 
    957   2783   sowmini void
    958   2783   sowmini irb_refhold_rn(struct radix_node *rn)
    959   2783   sowmini {
    960   2783   sowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
    961  11042      Erik 		irb_refhold(&((rt_t *)(rn))->rt_irb);
    962   2783   sowmini }
    963   2783   sowmini 
    964   2783   sowmini void
    965   2783   sowmini irb_refrele_rn(struct radix_node *rn)
    966   2783   sowmini {
    967   2783   sowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
    968   2783   sowmini 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
    969   2783   sowmini }
    970  11042      Erik 
    971  11042      Erik /*
    972  11042      Erik  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
    973  11042      Erik  * routes this routine sets up a ire_nce_cache as well. The caller needs to
    974  11042      Erik  * lookup an nce for the multicast case.
    975  11042      Erik  */
    976  11042      Erik ire_t *
    977  11042      Erik ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
    978  11042      Erik     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
    979  11042      Erik {
    980  11042      Erik 	uint_t		match_args;
    981  11042      Erik 	uint_t		ire_type;
    982  11042      Erik 	ill_t		*ill;
    983  11042      Erik 	ire_t		*ire;
    984  11042      Erik 	ip_stack_t	*ipst = ixa->ixa_ipst;
    985  11042      Erik 	ipaddr_t	v4dst;
    986  11042      Erik 	in6_addr_t	v6nexthop;
    987  11042      Erik 	iaflags_t	ixaflags = ixa->ixa_flags;
    988  11042      Erik 	nce_t		*nce;
    989  11042      Erik 
    990  11042      Erik 	match_args = MATCH_IRE_SECATTR;
    991  11042      Erik 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
    992  11042      Erik 	if (setsrcp != NULL)
    993  11042      Erik 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
    994  11042      Erik 	if (errorp != NULL)
    995  11042      Erik 		ASSERT(*errorp == 0);
    996  11042      Erik 
    997  11042      Erik 	/*
    998  11042      Erik 	 * The content of the ixa will be different if IP_NEXTHOP,
    999  11042      Erik 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
   1000  11042      Erik 	 */
   1001  11042      Erik 
   1002  11042      Erik 	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
   1003  11042      Erik 	    IN6_IS_ADDR_MULTICAST(v6dst)) {
   1004  11042      Erik 		/* Pick up the IRE_MULTICAST for the ill */
   1005  11042      Erik 		if (ixa->ixa_multicast_ifindex != 0) {
   1006  11042      Erik 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
   1007  11042      Erik 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1008  11042      Erik 		} else if (ixaflags & IXAF_SCOPEID_SET) {
   1009  11042      Erik 			/* sin6_scope_id takes precedence over ixa_ifindex */
   1010  11042      Erik 			ASSERT(ixa->ixa_scopeid != 0);
   1011  11042      Erik 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
   1012  11042      Erik 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1013  11042      Erik 		} else if (ixa->ixa_ifindex != 0) {
   1014  11042      Erik 			/*
   1015  11042      Erik 			 * In the ipmp case, the ixa_ifindex is set to
   1016  11042      Erik 			 * point at an under_ill and we would return the
   1017  11042      Erik 			 * ire_multicast() corresponding to that under_ill.
   1018  11042      Erik 			 */
   1019  11042      Erik 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
   1020  11042      Erik 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1021  11042      Erik 		} else if (ixaflags & IXAF_IS_IPV4) {
   1022  11042      Erik 			ipaddr_t	v4setsrc = INADDR_ANY;
   1023  11042      Erik 
   1024  11042      Erik 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
   1025  11042      Erik 			    multirtp, &v4setsrc);
   1026  11042      Erik 			if (setsrcp != NULL)
   1027  11042      Erik 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
   1028  11042      Erik 		} else {
   1029  11042      Erik 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
   1030  11042      Erik 			    multirtp, setsrcp);
   1031  11042      Erik 		}
   1032  11042      Erik 		if (ill != NULL && IS_VNI(ill)) {
   1033  11042      Erik 			ill_refrele(ill);
   1034  11042      Erik 			ill = NULL;
   1035  11042      Erik 		}
   1036  11042      Erik 		if (ill == NULL) {
   1037  11042      Erik 			if (errorp != NULL)
   1038  11042      Erik 				*errorp = ENXIO;
   1039  11042      Erik 			/* Get a hold on the IRE_NOROUTE */
   1040  11042      Erik 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1041  11042      Erik 			return (ire);
   1042  11042      Erik 		}
   1043  11042      Erik 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
   1044  11042      Erik 			ill_refrele(ill);
   1045  11042      Erik 			if (errorp != NULL)
   1046  11042      Erik 				*errorp = EHOSTUNREACH;
   1047  11042      Erik 			/* Get a hold on the IRE_NOROUTE */
   1048  11042      Erik 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1049  11042      Erik 			return (ire);
   1050  11042      Erik 		}
   1051  11042      Erik 		/* Get a refcnt on the single IRE_MULTICAST per ill */
   1052  11042      Erik 		ire = ire_multicast(ill);
   1053  11042      Erik 		ill_refrele(ill);
   1054  11042      Erik 		if (generationp != NULL)
   1055  11042      Erik 			*generationp = ire->ire_generation;
   1056  11042      Erik 		if (errorp != NULL &&
   1057  11042      Erik 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   1058  11042      Erik 			*errorp = EHOSTUNREACH;
   1059  11042      Erik 		}
   1060  11042      Erik 		return (ire);
   1061  11042      Erik 	}
   1062  11042      Erik 
   1063  11042      Erik 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
   1064  11042      Erik 		if (ixaflags & IXAF_SCOPEID_SET) {
   1065  11042      Erik 			/* sin6_scope_id takes precedence over ixa_ifindex */
   1066  11042      Erik 			ASSERT(ixa->ixa_scopeid != 0);
   1067  11042      Erik 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
   1068  11042      Erik 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1069  11042      Erik 		} else {
   1070  11042      Erik 			ASSERT(ixa->ixa_ifindex != 0);
   1071  11042      Erik 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
   1072  11042      Erik 			    !(ixaflags & IXAF_IS_IPV4), ipst);
   1073  11042      Erik 		}
   1074  11042      Erik 		if (ill != NULL && IS_VNI(ill)) {
   1075  11042      Erik 			ill_refrele(ill);
   1076  11042      Erik 			ill = NULL;
   1077  11042      Erik 		}
   1078  11042      Erik 		if (ill == NULL) {
   1079  11042      Erik 			if (errorp != NULL)
   1080  11042      Erik 				*errorp = ENXIO;
   1081  11042      Erik 			/* Get a hold on the IRE_NOROUTE */
   1082  11042      Erik 			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
   1083  11042      Erik 			return (ire);
   1084  11042      Erik 		}
   1085  11042      Erik 		/*
   1086  11042      Erik 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
   1087  11042      Erik 		 * so for both of them we need to be able look for an under
   1088  11042      Erik 		 * interface.
   1089  11042      Erik 		 */
   1090  11042      Erik 		if (IS_UNDER_IPMP(ill))
   1091  11042      Erik 			match_args |= MATCH_IRE_TESTHIDDEN;
   1092  11042      Erik 	} else {
   1093  11042      Erik 		ill = NULL;
   1094  11042      Erik 	}
   1095  11042      Erik 
   1096  11042      Erik 	if (ixaflags & IXAF_NEXTHOP_SET) {
   1097  11042      Erik 		/* IP_NEXTHOP was set */
   1098  11042      Erik 		v6nexthop = ixa->ixa_nexthop_v6;
   1099  11042      Erik 	} else {
   1100  11042      Erik 		v6nexthop = *v6dst;
   1101  11042      Erik 	}
   1102  11042      Erik 
   1103  11042      Erik 	ire_type = 0;
   1104  11042      Erik 	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
   1105  11042      Erik 
   1106  11042      Erik 	/*
   1107  11042      Erik 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
   1108  11042      Erik 	 * we only look for an onlink IRE.
   1109  11042      Erik 	 */
   1110  11042      Erik 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
   1111  11042      Erik 		match_args |= MATCH_IRE_TYPE;
   1112  11042      Erik 		ire_type = IRE_ONLINK;
   1113  11042      Erik 	}
   1114  11042      Erik 
   1115  11042      Erik 	if (ixaflags & IXAF_IS_IPV4) {
   1116  11042      Erik 		ipaddr_t	v4nexthop;
   1117  11042      Erik 		ipaddr_t	v4setsrc = INADDR_ANY;
   1118  11042      Erik 
   1119  11042      Erik 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
   1120  11042      Erik 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
   1121  11042      Erik 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
   1122  11042      Erik 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
   1123  11042      Erik 		if (setsrcp != NULL)
   1124  11042      Erik 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
   1125  11042      Erik 	} else {
   1126  11042      Erik 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
   1127  11042      Erik 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
   1128  11042      Erik 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
   1129  11042      Erik 	}
   1130  11042      Erik 
   1131  11042      Erik #ifdef DEBUG
   1132  11042      Erik 	if (match_args & MATCH_IRE_TESTHIDDEN) {
   1133  11042      Erik 		ip3dbg(("looking for hidden; dst %x ire %p\n",
   1134  11042      Erik 		    v4dst, (void *)ire));
   1135  11042      Erik 	}
   1136  11042      Erik #endif
   1137  11042      Erik 
   1138  11042      Erik 	if (ill != NULL)
   1139  11042      Erik 		ill_refrele(ill);
   1140  11042      Erik 
   1141  11042      Erik 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   1142  11042      Erik 	    (ire->ire_type & IRE_MULTICAST)) {
   1143  11042      Erik 		/* No ire_nce_cache */
   1144  11042      Erik 		return (ire);
   1145  11042      Erik 	}
   1146  11042      Erik 
   1147  11042      Erik 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
   1148  11042      Erik 	mutex_enter(&ire->ire_lock);
   1149  11042      Erik 	nce = ire->ire_nce_cache;
   1150  11042      Erik 	if (nce == NULL || nce->nce_is_condemned) {
   1151  11042      Erik 		mutex_exit(&ire->ire_lock);
   1152  11042      Erik 		(void) ire_revalidate_nce(ire);
   1153  11042      Erik 	} else {
   1154  11042      Erik 		mutex_exit(&ire->ire_lock);
   1155  11042      Erik 	}
   1156  11042      Erik 	return (ire);
   1157  11042      Erik }
   1158  11042      Erik 
   1159  11042      Erik /*
   1160  11042      Erik  * Find a route given some xmit attributes and a packet.
   1161  11042      Erik  * Generic for IPv4 and IPv6
   1162  11042      Erik  *
   1163  11042      Erik  * This never returns NULL. But when it returns the IRE_NOROUTE
   1164  11042      Erik  * it might set errorp.
   1165  11042      Erik  */
   1166  11042      Erik ire_t *
   1167  11042      Erik ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
   1168  11042      Erik     int *errorp, boolean_t *multirtp)
   1169  11042      Erik {
   1170  11042      Erik 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
   1171  11042      Erik 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   1172  11042      Erik 		in6_addr_t	v6dst;
   1173  11042      Erik 
   1174  11042      Erik 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
   1175  11042      Erik 
   1176  11042      Erik 		return (ip_select_route(&v6dst, ixa, generationp,
   1177  11042      Erik 		    NULL, errorp, multirtp));
   1178  11042      Erik 	} else {
   1179  11042      Erik 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
   1180  11042      Erik 
   1181  11042      Erik 		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
   1182  11042      Erik 		    NULL, errorp, multirtp));
   1183  11042      Erik 	}
   1184  11042      Erik }
   1185  11042      Erik 
   1186  11042      Erik ire_t *
   1187  11042      Erik ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
   1188  11042      Erik     ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
   1189  11042      Erik {
   1190  11042      Erik 	in6_addr_t	v6dst;
   1191  11042      Erik 	ire_t		*ire;
   1192  11042      Erik 	in6_addr_t	setsrc;
   1193  11042      Erik 
   1194  11042      Erik 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
   1195  11042      Erik 
   1196  11042      Erik 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
   1197  11042      Erik 
   1198  11042      Erik 	setsrc = ipv6_all_zeros;
   1199  11042      Erik 	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
   1200  11042      Erik 	    multirtp);
   1201  11042      Erik 	if (v4setsrcp != NULL)
   1202  11042      Erik 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
   1203  11042      Erik 	return (ire);
   1204  11042      Erik }
   1205  11042      Erik 
   1206  11042      Erik /*
   1207  11042      Erik  * Recursively look for a route to the destination. Can also match on
   1208  11042      Erik  * the zoneid, ill, and label. Used for the data paths. See also
   1209  11042      Erik  * ire_route_recursive.
   1210  11042      Erik  *
   1211  11042      Erik  * If ill is set this means we will match it by adding MATCH_IRE_ILL.
   1212  11042      Erik  *
   1213  11042      Erik  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1214  11042      Erik  * instead.
   1215  11042      Erik  *
   1216  11042      Erik  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1217  11042      Erik  * is an error.
   1218  11042      Erik  * Allow at most one RTF_INDIRECT.
   1219  11042      Erik  */
   1220  11042      Erik ire_t *
   1221  11042      Erik ire_route_recursive_impl_v4(ire_t *ire,
   1222  11042      Erik     ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
   1223  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1224  11042      Erik     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
   1225  11042      Erik     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1226  11042      Erik {
   1227  11042      Erik 	int		i, j;
   1228  11042      Erik 	ire_t		*ires[MAX_IRE_RECURSION];
   1229  11042      Erik 	uint_t		generation;
   1230  11042      Erik 	uint_t		generations[MAX_IRE_RECURSION];
   1231  11042      Erik 	boolean_t	need_refrele = B_FALSE;
   1232  11042      Erik 	boolean_t	invalidate = B_FALSE;
   1233  11042      Erik 	int		prefs[MAX_IRE_RECURSION];
   1234  11042      Erik 	ill_t		*ill = NULL;
   1235  11042      Erik 
   1236  11042      Erik 	if (setsrcp != NULL)
   1237  11042      Erik 		ASSERT(*setsrcp == INADDR_ANY);
   1238  11042      Erik 	if (gwattrp != NULL)
   1239  11042      Erik 		ASSERT(*gwattrp == NULL);
   1240  11042      Erik 
   1241  11042      Erik 	if (ill_arg != NULL)
   1242  11042      Erik 		match_args |= MATCH_IRE_ILL;
   1243  11042      Erik 
   1244  11042      Erik 	/*
   1245  11042      Erik 	 * We iterate up to three times to resolve a route, even though
   1246  11042      Erik 	 * we have four slots in the array. The extra slot is for an
   1247  11042      Erik 	 * IRE_IF_CLONE we might need to create.
   1248  11042      Erik 	 */
   1249  11042      Erik 	i = 0;
   1250  11042      Erik 	while (i < MAX_IRE_RECURSION - 1) {
   1251  11042      Erik 		/* ire_ftable_lookup handles round-robin/ECMP */
   1252  11042      Erik 		if (ire == NULL) {
   1253  11042      Erik 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
   1254  11042      Erik 			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
   1255  11042      Erik 			    match_args, xmit_hint, ipst, &generation);
   1256  11042      Erik 		} else {
   1257  11042      Erik 			/* Caller passed it; extra hold since we will rele */
   1258  11042      Erik 			ire_refhold(ire);
   1259  11042      Erik 			if (generationp != NULL)
   1260  11042      Erik 				generation = *generationp;
   1261  11042      Erik 			else
   1262  11042      Erik 				generation = IRE_GENERATION_VERIFY;
   1263  11042      Erik 		}
   1264  11042      Erik 		if (ire == NULL)
   1265  11042      Erik 			ire = ire_reject(ipst, B_FALSE);
   1266  11042      Erik 
   1267  11042      Erik 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
   1268  11042      Erik 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   1269  11042      Erik 			goto error;
   1270  11042      Erik 
   1271  11042      Erik 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
   1272  11042      Erik 
   1273  11042      Erik 		if (i != 0) {
   1274  11131      Erik 			prefs[i] = ire_pref(ire);
   1275  11042      Erik 			/*
   1276  11042      Erik 			 * Don't allow anything unusual past the first
   1277  11042      Erik 			 * iteration.
   1278  11042      Erik 			 */
   1279  11042      Erik 			if ((ire->ire_type &
   1280  11042      Erik 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
   1281  11042      Erik 			    prefs[i] <= prefs[i-1]) {
   1282  11042      Erik 				ire_refrele(ire);
   1283  11042      Erik 				ire = ire_reject(ipst, B_FALSE);
   1284  11042      Erik 				goto error;
   1285  11042      Erik 			}
   1286  11042      Erik 		}
   1287  11042      Erik 		/* We have a usable IRE */
   1288  11042      Erik 		ires[i] = ire;
   1289  11042      Erik 		generations[i] = generation;
   1290  11042      Erik 		i++;
   1291  11042      Erik 
   1292  11042      Erik 		/* The first RTF_SETSRC address is passed back if setsrcp */
   1293  11042      Erik 		if ((ire->ire_flags & RTF_SETSRC) &&
   1294  11042      Erik 		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
   1295  11042      Erik 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
   1296  11042      Erik 			*setsrcp = ire->ire_setsrc_addr;
   1297  11042      Erik 		}
   1298  11042      Erik 
   1299  11042      Erik 		/* The first ire_gw_secattr is passed back if gwattrp */
   1300  11042      Erik 		if (ire->ire_gw_secattr != NULL &&
   1301  11042      Erik 		    gwattrp != NULL && *gwattrp == NULL)
   1302  11042      Erik 			*gwattrp = ire->ire_gw_secattr;
   1303  11042      Erik 
   1304  11042      Erik 		/*
   1305  11042      Erik 		 * Check if we have a short-cut pointer to an IRE for this
   1306  11042      Erik 		 * destination, and that the cached dependency isn't stale.
   1307  11042      Erik 		 * In that case we've rejoined an existing tree towards a
   1308  11042      Erik 		 * parent, thus we don't need to continue the loop to
   1309  11042      Erik 		 * discover the rest of the tree.
   1310  11042      Erik 		 */
   1311  11042      Erik 		mutex_enter(&ire->ire_lock);
   1312  11042      Erik 		if (ire->ire_dep_parent != NULL &&
   1313  11042      Erik 		    ire->ire_dep_parent->ire_generation ==
   1314  11042      Erik 		    ire->ire_dep_parent_generation) {
   1315  11042      Erik 			mutex_exit(&ire->ire_lock);
   1316  11042      Erik 			ire = NULL;
   1317  11042      Erik 			goto done;
   1318  11042      Erik 		}
   1319  11042      Erik 		mutex_exit(&ire->ire_lock);
   1320  11042      Erik 
   1321  11042      Erik 		/*
   1322  11042      Erik 		 * If this type should have an ire_nce_cache (even if it
   1323  11042      Erik 		 * doesn't yet have one) then we are done. Includes
   1324  11042      Erik 		 * IRE_INTERFACE with a full 32 bit mask.
   1325  11042      Erik 		 */
   1326  11042      Erik 		if (ire->ire_nce_capable) {
   1327  11042      Erik 			ire = NULL;
   1328  11042      Erik 			goto done;
   1329  11042      Erik 		}
   1330  11042      Erik 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
   1331  11042      Erik 		/*
   1332  11042      Erik 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
   1333  11042      Erik 		 * particular destination
   1334  11042      Erik 		 */
   1335  11042      Erik 		if (ire->ire_type & IRE_INTERFACE) {
   1336  11042      Erik 			in6_addr_t	v6nexthop;
   1337  11042      Erik 			ire_t		*clone;
   1338  11042      Erik 
   1339  11042      Erik 			ASSERT(ire->ire_masklen != IPV4_ABITS);
   1340  11042      Erik 
   1341  11042      Erik 			/*
   1342  11042      Erik 			 * In the case of ip_input and ILLF_FORWARDING not
   1343  11042      Erik 			 * being set, and in the case of RTM_GET,
   1344  11042      Erik 			 * there is no point in allocating
   1345  11042      Erik 			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
   1346  11042      Erik 			 * Note that !allocate can result in a ire_dep_parent
   1347  11042      Erik 			 * which is IRE_IF_* without an IRE_IF_CLONE.
   1348  11042      Erik 			 * We recover from that when we need to send packets
   1349  11042      Erik 			 * by ensuring that the generations become
   1350  11042      Erik 			 * IRE_GENERATION_VERIFY in this case.
   1351  11042      Erik 			 */
   1352  11042      Erik 			if (!allocate) {
   1353  11042      Erik 				invalidate = B_TRUE;
   1354  11042      Erik 				ire = NULL;
   1355  11042      Erik 				goto done;
   1356  11042      Erik 			}
   1357  11042      Erik 
   1358  11042      Erik 			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
   1359  11042      Erik 
   1360  11042      Erik 			clone = ire_create_if_clone(ire, &v6nexthop,
   1361  11042      Erik 			    &generation);
   1362  11042      Erik 			if (clone == NULL) {
   1363  11042      Erik 				/*
   1364  11042      Erik 				 * Temporary failure - no memory.
   1365  11042      Erik 				 * Don't want caller to cache IRE_NOROUTE.
   1366  11042      Erik 				 */
   1367  11042      Erik 				invalidate = B_TRUE;
   1368  11042      Erik 				ire = ire_blackhole(ipst, B_FALSE);
   1369  11042      Erik 				goto error;
   1370  11042      Erik 			}
   1371  11042      Erik 			/*
   1372  11042      Erik 			 * Make clone next to last entry and the
   1373  11042      Erik 			 * IRE_INTERFACE the last in the dependency
   1374  11042      Erik 			 * chain since the clone depends on the
   1375  11042      Erik 			 * IRE_INTERFACE.
   1376  11042      Erik 			 */
   1377  11042      Erik 			ASSERT(i >= 1);
   1378  11042      Erik 			ASSERT(i < MAX_IRE_RECURSION);
   1379  11042      Erik 
   1380  11042      Erik 			ires[i] = ires[i-1];
   1381  11042      Erik 			generations[i] = generations[i-1];
   1382  11042      Erik 			ires[i-1] = clone;
   1383  11042      Erik 			generations[i-1] = generation;
   1384  11042      Erik 			i++;
   1385  11042      Erik 
   1386  11042      Erik 			ire = NULL;
   1387  11042      Erik 			goto done;
   1388  11042      Erik 		}
   1389  11042      Erik 
   1390  11042      Erik 		/*
   1391  11042      Erik 		 * We only match on the type and optionally ILL when
   1392  11042      Erik 		 * recursing. The type match is used by some callers
   1393  11042      Erik 		 * to exclude certain types (such as IRE_IF_CLONE or
   1394  11042      Erik 		 * IRE_LOCAL|IRE_LOOPBACK).
   1395  11042      Erik 		 */
   1396  11042      Erik 		match_args &= MATCH_IRE_TYPE;
   1397  11042      Erik 		nexthop = ire->ire_gateway_addr;
   1398  11042      Erik 		if (ill == NULL && ire->ire_ill != NULL) {
   1399  11042      Erik 			ill = ire->ire_ill;
   1400  11042      Erik 			need_refrele = B_TRUE;
   1401  11042      Erik 			ill_refhold(ill);
   1402  11042      Erik 			match_args |= MATCH_IRE_ILL;
   1403  11042      Erik 		}
   1404  11131      Erik 		/*
   1405  11131      Erik 		 * We set the prefs[i] value above if i > 0. We've already
   1406  11131      Erik 		 * done i++ so i is one in the case of the first time around.
   1407  11131      Erik 		 */
   1408  11131      Erik 		if (i == 1)
   1409  11131      Erik 			prefs[0] = ire_pref(ire);
   1410  11042      Erik 		ire = NULL;
   1411  11042      Erik 	}
   1412  11042      Erik 	ASSERT(ire == NULL);
   1413  11042      Erik 	ire = ire_reject(ipst, B_FALSE);
   1414  11042      Erik 
   1415  11042      Erik error:
   1416  11042      Erik 	ASSERT(ire != NULL);
   1417  11042      Erik 	if (need_refrele)
   1418  11042      Erik 		ill_refrele(ill);
   1419  11042      Erik 
   1420  11042      Erik 	/*
   1421  11042      Erik 	 * In the case of MULTIRT we want to try a different IRE the next
   1422  11042      Erik 	 * time. We let the next packet retry in that case.
   1423  11042      Erik 	 */
   1424  11042      Erik 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
   1425  11042      Erik 		(void) ire_no_good(ires[0]);
   1426  11042      Erik 
   1427  11042      Erik cleanup:
   1428  11042      Erik 	/* cleanup ires[i] */
   1429  11042      Erik 	ire_dep_unbuild(ires, i);
   1430  11042      Erik 	for (j = 0; j < i; j++)
   1431  11042      Erik 		ire_refrele(ires[j]);
   1432  11042      Erik 
   1433  11042      Erik 	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
   1434  11042      Erik 	/*
   1435  11042      Erik 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
   1436  11042      Erik 	 * ip_select_route since the reject or lack of memory might be gone.
   1437  11042      Erik 	 */
   1438  11042      Erik 	if (generationp != NULL)
   1439  11042      Erik 		*generationp = IRE_GENERATION_VERIFY;
   1440  11042      Erik 	return (ire);
   1441  11042      Erik 
   1442  11042      Erik done:
   1443  11042      Erik 	ASSERT(ire == NULL);
   1444  11042      Erik 	if (need_refrele) {
   1445  11042      Erik 		ill_refrele(ill);
   1446  11042      Erik 		ill = NULL;
   1447  11042      Erik 	}
   1448  11042      Erik 
   1449  11042      Erik 	/* Build dependencies */
   1450  11131      Erik 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
   1451  11042      Erik 		/* Something in chain was condemned; tear it apart */
   1452  11042      Erik 		ire = ire_reject(ipst, B_FALSE);
   1453  11042      Erik 		goto cleanup;
   1454  11042      Erik 	}
   1455  11042      Erik 
   1456  11042      Erik 	/*
   1457  11042      Erik 	 * Release all refholds except the one for ires[0] that we
   1458  11042      Erik 	 * will return to the caller.
   1459  11042      Erik 	 */
   1460  11042      Erik 	for (j = 1; j < i; j++)
   1461  11042      Erik 		ire_refrele(ires[j]);
   1462  11042      Erik 
   1463  11042      Erik 	if (invalidate) {
   1464  11042      Erik 		/*
   1465  11042      Erik 		 * Since we needed to allocate but couldn't we need to make
   1466  11042      Erik 		 * sure that the dependency chain is rebuilt the next time.
   1467  11042      Erik 		 */
   1468  11042      Erik 		ire_dep_invalidate_generations(ires[0]);
   1469  11042      Erik 		generation = IRE_GENERATION_VERIFY;
   1470  11042      Erik 	} else {
   1471  11042      Erik 		/*
   1472  11042      Erik 		 * IREs can have been added or deleted while we did the
   1473  11042      Erik 		 * recursive lookup and we can't catch those until we've built
   1474  11042      Erik 		 * the dependencies. We verify the stored
   1475  11042      Erik 		 * ire_dep_parent_generation to catch any such changes and
   1476  11042      Erik 		 * return IRE_GENERATION_VERIFY (which will cause
   1477  11042      Erik 		 * ip_select_route to be called again so we can redo the
   1478  11042      Erik 		 * recursive lookup next time we send a packet.
   1479  11042      Erik 		 */
   1480  11131      Erik 		if (ires[0]->ire_dep_parent == NULL)
   1481  11131      Erik 			generation = ires[0]->ire_generation;
   1482  11131      Erik 		else
   1483  11131      Erik 			generation = ire_dep_validate_generations(ires[0]);
   1484  11042      Erik 		if (generations[0] != ires[0]->ire_generation) {
   1485  11042      Erik 			/* Something changed at the top */
   1486  11042      Erik 			generation = IRE_GENERATION_VERIFY;
   1487  11042      Erik 		}
   1488  11042      Erik 	}
   1489  11042      Erik 	if (generationp != NULL)
   1490  11042      Erik 		*generationp = generation;
   1491  11042      Erik 
   1492  11042      Erik 	return (ires[0]);
   1493  11042      Erik }
   1494  11042      Erik 
   1495  11042      Erik ire_t *
   1496  11042      Erik ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
   1497  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1498  11042      Erik     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
   1499  11042      Erik     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1500  11042      Erik {
   1501  11042      Erik 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
   1502  11042      Erik 	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
   1503  11042      Erik 	    gwattrp, generationp));
   1504  11042      Erik }
   1505  11042      Erik 
   1506  11042      Erik /*
   1507  11042      Erik  * Recursively look for a route to the destination.
   1508  11042      Erik  * We only handle a destination match here, yet we have the same arguments
   1509  11042      Erik  * as the full match to allow function pointers to select between the two.
   1510  11042      Erik  *
   1511  11042      Erik  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1512  11042      Erik  * instead.
   1513  11042      Erik  *
   1514  11042      Erik  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1515  11042      Erik  * is an error.
   1516  11042      Erik  * Allow at most one RTF_INDIRECT.
   1517  11042      Erik  */
   1518  11042      Erik ire_t *
   1519  11042      Erik ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate,
   1520  11042      Erik     uint32_t xmit_hint, ip_stack_t *ipst)
   1521  11042      Erik {
   1522  11042      Erik 	ire_t	*ire;
   1523  11042      Erik 	ire_t	*ire1;
   1524  11042      Erik 	uint_t	generation;
   1525  11042      Erik 
   1526  11042      Erik 	/* ire_ftable_lookup handles round-robin/ECMP */
   1527  11042      Erik 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
   1528  11042      Erik 	    &generation);
   1529  11042      Erik 	ASSERT(ire != NULL);
   1530  11042      Erik 
   1531  11042      Erik 	/*
   1532  11042      Erik 	 * If this type should have an ire_nce_cache (even if it
   1533  11042      Erik 	 * doesn't yet have one) then we are done. Includes
   1534  11042      Erik 	 * IRE_INTERFACE with a full 32 bit mask.
   1535  11042      Erik 	 */
   1536  11042      Erik 	if (ire->ire_nce_capable)
   1537  11042      Erik 		return (ire);
   1538  11042      Erik 
   1539  11042      Erik 	/*
   1540  11042      Erik 	 * If the IRE has a current cached parent we know that the whole
   1541  11042      Erik 	 * parent chain is current, hence we don't need to discover and
   1542  11042      Erik 	 * build any dependencies by doing a recursive lookup.
   1543  11042      Erik 	 */
   1544  11042      Erik 	mutex_enter(&ire->ire_lock);
   1545  11042      Erik 	if (ire->ire_dep_parent != NULL &&
   1546  11042      Erik 	    ire->ire_dep_parent->ire_generation ==
   1547  11042      Erik 	    ire->ire_dep_parent_generation) {
   1548  11042      Erik 		mutex_exit(&ire->ire_lock);
   1549  11042      Erik 		return (ire);
   1550  11042      Erik 	}
   1551  11042      Erik 	mutex_exit(&ire->ire_lock);
   1552  11042      Erik 
   1553  11042      Erik 	/*
   1554  11042      Erik 	 * Fallback to loop in the normal code starting with the ire
   1555  11042      Erik 	 * we found. Normally this would return the same ire.
   1556  11042      Erik 	 */
   1557  11042      Erik 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
   1558  11042      Erik 	    NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
   1559  11042      Erik 	    &generation);
   1560  11042      Erik 	ire_refrele(ire);
   1561  11042      Erik 	return (ire1);
   1562  11042      Erik }
   1563