1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1676 jpk * Common Development and Distribution License (the "License"). 6 1676 jpk * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 0 stevel /* 22 8485 Peter * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 0 stevel * Use is subject to license terms. 24 0 stevel */ 25 0 stevel /* Copyright (c) 1990 Mentat Inc. */ 26 0 stevel 27 0 stevel /* 28 0 stevel * This file contains routines that manipulate Internet Routing Entries (IREs). 
29 0 stevel */ 30 0 stevel 31 0 stevel #include <sys/types.h> 32 0 stevel #include <sys/stream.h> 33 0 stevel #include <sys/stropts.h> 34 8485 Peter #include <sys/strsun.h> 35 8778 Erik #include <sys/strsubr.h> 36 0 stevel #include <sys/ddi.h> 37 0 stevel #include <sys/cmn_err.h> 38 0 stevel #include <sys/policy.h> 39 0 stevel 40 0 stevel #include <sys/systm.h> 41 0 stevel #include <sys/kmem.h> 42 0 stevel #include <sys/param.h> 43 0 stevel #include <sys/socket.h> 44 0 stevel #include <net/if.h> 45 0 stevel #include <net/route.h> 46 0 stevel #include <netinet/in.h> 47 0 stevel #include <net/if_dl.h> 48 0 stevel #include <netinet/ip6.h> 49 0 stevel #include <netinet/icmp6.h> 50 0 stevel 51 0 stevel #include <inet/common.h> 52 0 stevel #include <inet/mi.h> 53 0 stevel #include <inet/ip.h> 54 0 stevel #include <inet/ip6.h> 55 0 stevel #include <inet/ip_ndp.h> 56 2535 sangeeta #include <inet/arp.h> 57 0 stevel #include <inet/ip_if.h> 58 0 stevel #include <inet/ip_ire.h> 59 2535 sangeeta #include <inet/ip_ftable.h> 60 0 stevel #include <inet/ip_rts.h> 61 0 stevel #include <inet/nd.h> 62 0 stevel 63 0 stevel #include <inet/tcp.h> 64 0 stevel #include <inet/ipclassifier.h> 65 0 stevel #include <sys/zone.h> 66 3448 dh155122 #include <sys/cpuvar.h> 67 3448 dh155122 68 1676 jpk #include <sys/tsol/label.h> 69 1676 jpk #include <sys/tsol/tnet.h> 70 0 stevel 71 2535 sangeeta struct kmem_cache *rt_entry_cache; 72 2535 sangeeta 73 11042 Erik typedef struct nce_clookup_s { 74 11042 Erik ipaddr_t ncecl_addr; 75 11042 Erik boolean_t ncecl_found; 76 11042 Erik } nce_clookup_t; 77 11042 Erik 78 0 stevel /* 79 0 stevel * Synchronization notes: 80 0 stevel * 81 0 stevel * The fields of the ire_t struct are protected in the following way : 82 0 stevel * 83 0 stevel * ire_next/ire_ptpn 84 0 stevel * 85 11042 Erik * - bucket lock of the forwarding table in which is ire stored. 
 *
 * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask,
 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags,
 * ire_bucket
 *
 *	- Set in ire_create_v4/v6 and never changes after that. Thus,
 *	  we don't need a lock whenever these fields are accessed.
 *
 *	- ire_bucket and ire_masklen (also set in ire_create) are set in
 *	  ire_add before inserting in the bucket and never
 *	  change after that. Thus we don't need a lock whenever these
 *	  fields are accessed.
 *
 * ire_gateway_addr_v4[v6]
 *
 *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *	  and hence any access to it uses ire_lock to get/set the right value.
 *
 * ire_refcnt, ire_identical_ref
 *
 *	- Updated atomically using atomic_add_32
 *
 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
 *
 *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
 *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
 *
 * ire_generation
 *	- Under ire_lock
 *
 * ire_nce_cache
 *	- Under ire_lock
 *
 * ire_dep_parent (To next IRE in recursive lookup chain)
 *	- Under ips_ire_dep_lock. Write held when modifying. Read held when
 *	  walking. We also hold ire_lock when modifying to allow the data path
 *	  to only acquire ire_lock.
 *
 * ire_dep_parent_generation (Generation number from ire_dep_parent)
 *	- Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock
 *	  and ire_lock held when modifying)
 *
 * ire_dep_children (From parent to first child)
 * ire_dep_sib_next (linked list of siblings)
 * ire_dep_sib_ptpn (linked list of siblings)
 *	- Under ips_ire_dep_lock. Write held when modifying. Read held when
 *	  walking.
 *
 * As we always hold the bucket locks in all the places while accessing
 * the above values, it is natural to use them for protecting them.
 *
 * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * deallocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt which is bumped up
 * using the irb_refhold function.
 * The flags irb_marks can be
 * set to IRB_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are IRE_IS_CONDEMNED and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the irb_refrele function which is used to decrement
 * the reference count on a bucket. See comments above irb_t structure
 * definition in ip.h for further details.
 *
 * The ire_refhold/ire_refrele functions operate on the ire which increments/
 * decrements the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using those functions. Operations on the IRE
 * could be described as follows :
 *
 *	CREATE an ire with reference count initialized to 1.
 *
 *	ADDITION of an ire holds the bucket lock, checks for duplicates
 *	and then adds the ire. ire_add returns the ire after
 *	bumping up once more i.e the reference count is 2. This is to avoid
 *	an extra lookup in the functions calling ire_add which want to
 *	work with the ire after adding.
 *
 *	LOOKUP of an ire bumps up the reference count using ire_refhold
 *	function. It is valid to bump up the reference count of the IRE,
 *	after the lookup has returned an ire. Following are the lookup
 *	functions that return a HELD ire :
 *
 *	ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6]
 *
 *	DELETION of an ire holds the bucket lock, removes it from the list
 *	and then decrements the reference count for having removed from the list
 *	by using the ire_refrele function.
If some other thread has looked up 187 0 stevel * the ire, the reference count would have been bumped up and hence 188 0 stevel * this ire will not be freed once deleted. It will be freed once the 189 0 stevel * reference count drops to zero. 190 0 stevel * 191 0 stevel * Add and Delete acquires the bucket lock as RW_WRITER, while all the 192 0 stevel * lookups acquire the bucket lock as RW_READER. 193 0 stevel * 194 11042 Erik * The general rule is to do the ire_refrele in the function 195 0 stevel * that is passing the ire as an argument. 196 0 stevel * 197 0 stevel * In trying to locate ires the following points are to be noted. 198 0 stevel * 199 11042 Erik * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is 200 0 stevel * to be ignored when walking the ires using ire_next. 201 0 stevel * 202 0 stevel * Zones note: 203 0 stevel * Walking IREs within a given zone also walks certain ires in other 204 0 stevel * zones. This is done intentionally. IRE walks with a specified 205 0 stevel * zoneid are used only when doing informational reports, and 206 0 stevel * zone users want to see things that they can access. See block 207 0 stevel * comment in ire_walk_ill_match(). 208 0 stevel */ 209 0 stevel 210 0 stevel /* 211 0 stevel * The size of the forwarding table. We will make sure that it is a 212 0 stevel * power of 2 in ip_ire_init(). 
213 3448 dh155122 * Setable in /etc/system 214 0 stevel */ 215 0 stevel uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; 216 0 stevel 217 0 stevel struct kmem_cache *ire_cache; 218 11042 Erik struct kmem_cache *ncec_cache; 219 11042 Erik struct kmem_cache *nce_cache; 220 11042 Erik 221 0 stevel static ire_t ire_null; 222 0 stevel 223 11042 Erik static ire_t *ire_add_v4(ire_t *ire); 224 0 stevel static void ire_delete_v4(ire_t *ire); 225 11042 Erik static void ire_dep_invalidate_children(ire_t *child); 226 1676 jpk static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 227 3448 dh155122 zoneid_t zoneid, ip_stack_t *); 228 0 stevel static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 229 1676 jpk pfv_t func, void *arg, uchar_t vers, ill_t *ill); 230 5023 carlsonj #ifdef DEBUG 231 5023 carlsonj static void ire_trace_cleanup(const ire_t *); 232 0 stevel #endif 233 0 stevel 234 0 stevel /* 235 11042 Erik * Following are the functions to increment/decrement the reference 236 11042 Erik * count of the IREs and IRBs (ire bucket). 237 11042 Erik * 238 11042 Erik * 1) We bump up the reference count of an IRE to make sure that 239 11042 Erik * it does not get deleted and freed while we are using it. 240 11042 Erik * Typically all the lookup functions hold the bucket lock, 241 11042 Erik * and look for the IRE. If it finds an IRE, it bumps up the 242 11042 Erik * reference count before dropping the lock. Sometimes we *may* want 243 11042 Erik * to bump up the reference count after we *looked* up i.e without 244 11042 Erik * holding the bucket lock. So, the ire_refhold function does not assert 245 11042 Erik * on the bucket lock being held. Any thread trying to delete from 246 11042 Erik * the hash bucket can still do so but cannot free the IRE if 247 11042 Erik * ire_refcnt is not 0. 
248 11042 Erik * 249 11042 Erik * 2) We bump up the reference count on the bucket where the IRE resides 250 11042 Erik * (IRB), when we want to prevent the IREs getting deleted from a given 251 11042 Erik * hash bucket. This makes life easier for ire_walk type functions which 252 11042 Erik * wants to walk the IRE list, call a function, but needs to drop 253 11042 Erik * the bucket lock to prevent recursive rw_enters. While the 254 11042 Erik * lock is dropped, the list could be changed by other threads or 255 11042 Erik * the same thread could end up deleting the ire or the ire pointed by 256 11042 Erik * ire_next. ire_refholding the ire or ire_next is not sufficient as 257 11042 Erik * a delete will still remove the ire from the bucket while we have 258 11042 Erik * dropped the lock and hence the ire_next would be NULL. Thus, we 259 11042 Erik * need a mechanism to prevent deletions from a given bucket. 260 11042 Erik * 261 11042 Erik * To prevent deletions, we bump up the reference count on the 262 11042 Erik * bucket. If the bucket is held, ire_delete just marks both 263 11042 Erik * the ire and irb as CONDEMNED. When the 264 11042 Erik * reference count on the bucket drops to zero, all the CONDEMNED ires 265 11042 Erik * are deleted. We don't have to bump up the reference count on the 266 11042 Erik * bucket if we are walking the bucket and never have to drop the bucket 267 11042 Erik * lock. Note that irb_refhold does not prevent addition of new ires 268 11042 Erik * in the list. It is okay because addition of new ires will not cause 269 11042 Erik * ire_next to point to freed memory. We do irb_refhold only when 270 11042 Erik * all of the 3 conditions are true : 271 11042 Erik * 272 11042 Erik * 1) The code needs to walk the IRE bucket from start to end. 273 11042 Erik * 2) It may have to drop the bucket lock sometimes while doing (1) 274 11042 Erik * 3) It does not want any ires to be deleted meanwhile. 
275 11042 Erik */ 276 11042 Erik 277 11042 Erik /* 278 11042 Erik * Bump up the reference count on the hash bucket - IRB to 279 11042 Erik * prevent ires from being deleted in this bucket. 280 11042 Erik */ 281 11042 Erik void 282 11042 Erik irb_refhold(irb_t *irb) 283 11042 Erik { 284 11042 Erik rw_enter(&irb->irb_lock, RW_WRITER); 285 11042 Erik irb->irb_refcnt++; 286 11042 Erik ASSERT(irb->irb_refcnt != 0); 287 11042 Erik rw_exit(&irb->irb_lock); 288 11042 Erik } 289 11042 Erik 290 11042 Erik void 291 11042 Erik irb_refhold_locked(irb_t *irb) 292 11042 Erik { 293 11042 Erik ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 294 11042 Erik irb->irb_refcnt++; 295 11042 Erik ASSERT(irb->irb_refcnt != 0); 296 11042 Erik } 297 11042 Erik 298 11042 Erik /* 299 11042 Erik * Note: when IRB_MARK_DYNAMIC is not set the irb_t 300 11042 Erik * is statically allocated, so that when the irb_refcnt goes to 0, 301 11042 Erik * we simply clean up the ire list and continue. 302 11042 Erik */ 303 11042 Erik void 304 11042 Erik irb_refrele(irb_t *irb) 305 11042 Erik { 306 11042 Erik if (irb->irb_marks & IRB_MARK_DYNAMIC) { 307 11042 Erik irb_refrele_ftable(irb); 308 11042 Erik } else { 309 11042 Erik rw_enter(&irb->irb_lock, RW_WRITER); 310 11042 Erik ASSERT(irb->irb_refcnt != 0); 311 11042 Erik if (--irb->irb_refcnt == 0 && 312 11042 Erik (irb->irb_marks & IRB_MARK_CONDEMNED)) { 313 11042 Erik ire_t *ire_list; 314 11042 Erik 315 11042 Erik ire_list = ire_unlink(irb); 316 11042 Erik rw_exit(&irb->irb_lock); 317 11042 Erik ASSERT(ire_list != NULL); 318 11042 Erik ire_cleanup(ire_list); 319 11042 Erik } else { 320 11042 Erik rw_exit(&irb->irb_lock); 321 11042 Erik } 322 11042 Erik } 323 11042 Erik } 324 11042 Erik 325 11042 Erik 326 11042 Erik /* 327 11042 Erik * Bump up the reference count on the IRE. 
We cannot assert that the 328 11042 Erik * bucket lock is being held as it is legal to bump up the reference 329 11042 Erik * count after the first lookup has returned the IRE without 330 11042 Erik * holding the lock. 331 11042 Erik */ 332 11042 Erik void 333 11042 Erik ire_refhold(ire_t *ire) 334 11042 Erik { 335 11042 Erik atomic_add_32(&(ire)->ire_refcnt, 1); 336 11042 Erik ASSERT((ire)->ire_refcnt != 0); 337 11042 Erik #ifdef DEBUG 338 11042 Erik ire_trace_ref(ire); 339 11042 Erik #endif 340 11042 Erik } 341 11042 Erik 342 11042 Erik void 343 11042 Erik ire_refhold_notr(ire_t *ire) 344 11042 Erik { 345 11042 Erik atomic_add_32(&(ire)->ire_refcnt, 1); 346 11042 Erik ASSERT((ire)->ire_refcnt != 0); 347 11042 Erik } 348 11042 Erik 349 11042 Erik void 350 11042 Erik ire_refhold_locked(ire_t *ire) 351 11042 Erik { 352 11042 Erik #ifdef DEBUG 353 11042 Erik ire_trace_ref(ire); 354 11042 Erik #endif 355 11042 Erik ire->ire_refcnt++; 356 11042 Erik } 357 11042 Erik 358 11042 Erik /* 359 11042 Erik * Release a ref on an IRE. 360 0 stevel * 361 0 stevel * Must not be called while holding any locks. Otherwise if this is 362 0 stevel * the last reference to be released there is a chance of recursive mutex 363 0 stevel * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 364 0 stevel * to restart an ioctl. The one exception is when the caller is sure that 365 0 stevel * this is not the last reference to be released. Eg. if the caller is 366 0 stevel * sure that the ire has not been deleted and won't be deleted. 367 11042 Erik * 368 11042 Erik * In architectures e.g sun4u, where atomic_add_32_nv is just 369 11042 Erik * a cas, we need to maintain the right memory barrier semantics 370 11042 Erik * as that of mutex_exit i.e all the loads and stores should complete 371 11042 Erik * before the cas is executed. membar_exit() does that here. 
372 0 stevel */ 373 0 stevel void 374 0 stevel ire_refrele(ire_t *ire) 375 0 stevel { 376 11042 Erik #ifdef DEBUG 377 11042 Erik ire_untrace_ref(ire); 378 11042 Erik #endif 379 11042 Erik ASSERT((ire)->ire_refcnt != 0); 380 11042 Erik membar_exit(); 381 11042 Erik if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 382 11042 Erik ire_inactive(ire); 383 0 stevel } 384 0 stevel 385 0 stevel void 386 0 stevel ire_refrele_notr(ire_t *ire) 387 0 stevel { 388 11042 Erik ASSERT((ire)->ire_refcnt != 0); 389 11042 Erik membar_exit(); 390 11042 Erik if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 391 11042 Erik ire_inactive(ire); 392 0 stevel } 393 0 stevel 394 0 stevel /* 395 0 stevel * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 396 11042 Erik * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is 397 11042 Erik * having problems reaching a particular destination. 398 11042 Erik * This will make IP consider alternate routes (e.g., when there are 399 11042 Erik * muliple default routes), and it will also make IP discard any (potentially) 400 11042 Erik * stale redirect. 401 11042 Erik * Management processes may want to use the version that generates a reply. 402 0 stevel * 403 11042 Erik * With the use of NUD like behavior for IPv4/ARP in addition to IPv6 404 11042 Erik * this function shouldn't be necessary for IP to recover from a bad redirect, 405 11042 Erik * a bad default router (when there are multiple default routers), or 406 11042 Erik * a stale ND/ARP entry. But we retain it in any case. 407 11042 Erik * For instance, this is helpful when TCP suspects a failure before NUD does. 
408 0 stevel */ 409 0 stevel int 410 0 stevel ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 411 0 stevel { 412 2535 sangeeta uchar_t *addr_ucp; 413 11042 Erik uint_t ipversion; 414 11042 Erik sin_t *sin; 415 11042 Erik sin6_t *sin6; 416 11042 Erik ipaddr_t v4addr; 417 11042 Erik in6_addr_t v6addr; 418 2535 sangeeta ire_t *ire; 419 2535 sangeeta ipid_t *ipid; 420 0 stevel zoneid_t zoneid; 421 3448 dh155122 ip_stack_t *ipst; 422 0 stevel 423 0 stevel ASSERT(q->q_next == NULL); 424 11042 Erik zoneid = IPCL_ZONEID(Q_TO_CONN(q)); 425 3448 dh155122 ipst = CONNQ_TO_IPST(q); 426 0 stevel 427 0 stevel /* 428 0 stevel * Check privilege using the ioctl credential; if it is NULL 429 0 stevel * then this is a kernel message and therefor privileged. 430 0 stevel */ 431 3448 dh155122 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 432 0 stevel return (EPERM); 433 0 stevel 434 0 stevel ipid = (ipid_t *)mp->b_rptr; 435 0 stevel 436 0 stevel addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, 437 4714 sowmini ipid->ipid_addr_length); 438 0 stevel if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) 439 0 stevel return (EINVAL); 440 0 stevel switch (ipid->ipid_addr_length) { 441 11042 Erik case sizeof (sin_t): 442 0 stevel /* 443 0 stevel * got complete (sockaddr) address - increment addr_ucp to point 444 0 stevel * at the ip_addr field. 445 0 stevel */ 446 0 stevel sin = (sin_t *)addr_ucp; 447 0 stevel addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; 448 11042 Erik ipversion = IPV4_VERSION; 449 0 stevel break; 450 11042 Erik case sizeof (sin6_t): 451 11042 Erik /* 452 11042 Erik * got complete (sockaddr) address - increment addr_ucp to point 453 11042 Erik * at the ip_addr field. 
454 11042 Erik */ 455 11042 Erik sin6 = (sin6_t *)addr_ucp; 456 11042 Erik addr_ucp = (uchar_t *)&sin6->sin6_addr; 457 11042 Erik ipversion = IPV6_VERSION; 458 11042 Erik break; 459 0 stevel default: 460 0 stevel return (EINVAL); 461 0 stevel } 462 11042 Erik if (ipversion == IPV4_VERSION) { 463 11042 Erik /* Extract the destination address. */ 464 11042 Erik bcopy(addr_ucp, &v4addr, IP_ADDR_LEN); 465 0 stevel 466 11042 Erik ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 467 11042 Erik zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 468 11042 Erik } else { 469 11042 Erik /* Extract the destination address. */ 470 11042 Erik bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN); 471 0 stevel 472 11042 Erik ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL, 473 11042 Erik zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 474 11042 Erik } 475 11042 Erik if (ire != NULL) { 476 11042 Erik if (ipversion == IPV4_VERSION) { 477 11042 Erik ip_rts_change(RTM_LOSING, ire->ire_addr, 478 11042 Erik ire->ire_gateway_addr, ire->ire_mask, 479 11042 Erik (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0, 480 11042 Erik (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 481 11042 Erik ire->ire_ipst); 482 0 stevel } 483 11042 Erik (void) ire_no_good(ire); 484 4714 sowmini ire_refrele(ire); 485 0 stevel } 486 0 stevel return (0); 487 0 stevel } 488 0 stevel 489 0 stevel /* 490 0 stevel * Initialize the ire that is specific to IPv4 part and call 491 0 stevel * ire_init_common to finish it. 492 11042 Erik * Returns zero or errno. 493 0 stevel */ 494 11042 Erik int 495 11042 Erik ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway, 496 11042 Erik ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, 497 11042 Erik tsol_gc_t *gc, ip_stack_t *ipst) 498 1676 jpk { 499 11042 Erik int error; 500 11042 Erik 501 1676 jpk /* 502 1676 jpk * Reject IRE security attribute creation/initialization 503 1676 jpk * if system is not running in Trusted mode. 
504 1676 jpk */ 505 11042 Erik if (gc != NULL && !is_system_labeled()) 506 11042 Erik return (EINVAL); 507 1676 jpk 508 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); 509 0 stevel 510 0 stevel if (addr != NULL) 511 0 stevel bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); 512 11042 Erik if (gateway != NULL) 513 0 stevel bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); 514 11042 Erik 515 11042 Erik /* Make sure we don't have stray values in some fields */ 516 11042 Erik switch (type) { 517 11042 Erik case IRE_LOOPBACK: 518 11042 Erik bcopy(&ire->ire_addr, &ire->ire_gateway_addr, IP_ADDR_LEN); 519 11042 Erik /* FALLTHRU */ 520 11042 Erik case IRE_HOST: 521 11042 Erik case IRE_BROADCAST: 522 11042 Erik case IRE_LOCAL: 523 11042 Erik case IRE_IF_CLONE: 524 11042 Erik ire->ire_mask = IP_HOST_MASK; 525 11042 Erik ire->ire_masklen = IPV4_ABITS; 526 11042 Erik break; 527 11042 Erik case IRE_PREFIX: 528 11042 Erik case IRE_DEFAULT: 529 11042 Erik case IRE_IF_RESOLVER: 530 11042 Erik case IRE_IF_NORESOLVER: 531 11042 Erik if (mask != NULL) { 532 11042 Erik bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); 533 11042 Erik ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); 534 11042 Erik } 535 11042 Erik break; 536 11042 Erik case IRE_MULTICAST: 537 11042 Erik case IRE_NOROUTE: 538 11042 Erik ASSERT(mask == NULL); 539 11042 Erik break; 540 11042 Erik default: 541 11042 Erik ASSERT(0); 542 11042 Erik return (EINVAL); 543 0 stevel } 544 0 stevel 545 11042 Erik error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION, 546 11042 Erik gc, ipst); 547 11042 Erik if (error != NULL) 548 11042 Erik return (error); 549 0 stevel 550 11042 Erik /* Determine which function pointers to use */ 551 11042 Erik ire->ire_postfragfn = ip_xmit; /* Common case */ 552 0 stevel 553 11042 Erik switch (ire->ire_type) { 554 11042 Erik case IRE_LOCAL: 555 11042 Erik ire->ire_sendfn = ire_send_local_v4; 556 11042 Erik ire->ire_recvfn = ire_recv_local_v4; 557 11042 Erik 
ASSERT(ire->ire_ill != NULL); 558 11076 Cathy if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 559 11042 Erik ire->ire_recvfn = ire_recv_noaccept_v6; 560 11042 Erik break; 561 11042 Erik case IRE_LOOPBACK: 562 11042 Erik ire->ire_sendfn = ire_send_local_v4; 563 11042 Erik ire->ire_recvfn = ire_recv_loopback_v4; 564 11042 Erik break; 565 11042 Erik case IRE_BROADCAST: 566 11042 Erik ire->ire_postfragfn = ip_postfrag_loopcheck; 567 11042 Erik ire->ire_sendfn = ire_send_broadcast_v4; 568 11042 Erik ire->ire_recvfn = ire_recv_broadcast_v4; 569 11042 Erik break; 570 11042 Erik case IRE_MULTICAST: 571 11042 Erik ire->ire_postfragfn = ip_postfrag_loopcheck; 572 11042 Erik ire->ire_sendfn = ire_send_multicast_v4; 573 11042 Erik ire->ire_recvfn = ire_recv_multicast_v4; 574 11042 Erik break; 575 11042 Erik default: 576 11042 Erik /* 577 11042 Erik * For IRE_IF_ALL and IRE_OFFLINK we forward received 578 11042 Erik * packets by default. 579 11042 Erik */ 580 11042 Erik ire->ire_sendfn = ire_send_wire_v4; 581 11042 Erik ire->ire_recvfn = ire_recv_forward_v4; 582 11042 Erik break; 583 11042 Erik } 584 11042 Erik if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 585 11042 Erik ire->ire_sendfn = ire_send_noroute_v4; 586 11042 Erik ire->ire_recvfn = ire_recv_noroute_v4; 587 11042 Erik } else if (ire->ire_flags & RTF_MULTIRT) { 588 11042 Erik ire->ire_postfragfn = ip_postfrag_multirt_v4; 589 11042 Erik ire->ire_sendfn = ire_send_multirt_v4; 590 11042 Erik /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */ 591 11042 Erik if (ire->ire_type != IRE_BROADCAST) 592 11042 Erik ire->ire_recvfn = ire_recv_multirt_v4; 593 11042 Erik } 594 11042 Erik ire->ire_nce_capable = ire_determine_nce_capable(ire); 595 11042 Erik return (0); 596 0 stevel } 597 0 stevel 598 0 stevel /* 599 11042 Erik * Determine ire_nce_capable 600 0 stevel */ 601 11042 Erik boolean_t 602 11042 Erik ire_determine_nce_capable(ire_t *ire) 603 0 stevel { 604 11042 Erik int max_masklen; 605 2535 sangeeta 606 11042 
Erik if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 607 11042 Erik (ire->ire_type & IRE_MULTICAST)) 608 11042 Erik return (B_TRUE); 609 2535 sangeeta 610 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) 611 11042 Erik max_masklen = IPV4_ABITS; 612 11042 Erik else 613 11042 Erik max_masklen = IPV6_ABITS; 614 0 stevel 615 11042 Erik if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen) 616 11042 Erik return (B_TRUE); 617 11042 Erik return (B_FALSE); 618 0 stevel } 619 0 stevel 620 0 stevel /* 621 0 stevel * ire_create is called to allocate and initialize a new IRE. 622 0 stevel * 623 0 stevel * NOTE : This is called as writer sometimes though not required 624 0 stevel * by this function. 625 0 stevel */ 626 0 stevel ire_t * 627 11042 Erik ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway, 628 11042 Erik ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, 629 11042 Erik ip_stack_t *ipst) 630 0 stevel { 631 0 stevel ire_t *ire; 632 11042 Erik int error; 633 0 stevel 634 0 stevel ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 635 0 stevel if (ire == NULL) { 636 11042 Erik DTRACE_PROBE(kmem__cache__alloc); 637 0 stevel return (NULL); 638 0 stevel } 639 0 stevel *ire = ire_null; 640 0 stevel 641 11042 Erik error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags, 642 11042 Erik gc, ipst); 643 11042 Erik if (error != 0) { 644 11042 Erik DTRACE_PROBE2(ire__init, ire_t *, ire, int, error); 645 0 stevel kmem_cache_free(ire_cache, ire); 646 0 stevel return (NULL); 647 0 stevel } 648 0 stevel return (ire); 649 0 stevel } 650 0 stevel 651 0 stevel /* 652 0 stevel * Common to IPv4 and IPv6 653 11042 Erik * Returns zero or errno. 
654 0 stevel */ 655 11042 Erik int 656 11042 Erik ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid, 657 11042 Erik uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst) 658 0 stevel { 659 11042 Erik int error; 660 0 stevel 661 1676 jpk #ifdef DEBUG 662 11042 Erik if (ill != NULL) { 663 11042 Erik if (ill->ill_isv6) 664 0 stevel ASSERT(ipversion == IPV6_VERSION); 665 0 stevel else 666 0 stevel ASSERT(ipversion == IPV4_VERSION); 667 1676 jpk } 668 1676 jpk #endif /* DEBUG */ 669 1676 jpk 670 1676 jpk /* 671 1676 jpk * Create/initialize IRE security attribute only in Trusted mode; 672 11042 Erik * if the passed in gc is non-NULL, we expect that the caller 673 1676 jpk * has held a reference to it and will release it when this routine 674 1676 jpk * returns a failure, otherwise we own the reference. We do this 675 1676 jpk * prior to initializing the rest IRE fields. 676 1676 jpk */ 677 1676 jpk if (is_system_labeled()) { 678 1676 jpk if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 679 11042 Erik IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) { 680 1676 jpk /* release references on behalf of caller */ 681 1676 jpk if (gc != NULL) 682 1676 jpk GC_REFRELE(gc); 683 11042 Erik } else { 684 11042 Erik error = tsol_ire_init_gwattr(ire, ipversion, gc); 685 11042 Erik if (error != 0) 686 11042 Erik return (error); 687 1676 jpk } 688 0 stevel } 689 0 stevel 690 0 stevel ire->ire_type = type; 691 0 stevel ire->ire_flags = RTF_UP | flags; 692 0 stevel ire->ire_create_time = (uint32_t)gethrestime_sec(); 693 11042 Erik ire->ire_generation = IRE_GENERATION_INITIAL; 694 0 stevel 695 0 stevel /* 696 11042 Erik * The ill_ire_cnt isn't increased until 697 11042 Erik * the IRE is added to ensure that a walker will find 698 11042 Erik * all IREs that hold a reference on an ill. 699 0 stevel * 700 11042 Erik * Note that ill_ire_multicast doesn't hold a ref on the ill since 701 11042 Erik * ire_add() is not called for the IRE_MULTICAST. 
702 0 stevel */ 703 11042 Erik ire->ire_ill = ill; 704 11042 Erik ire->ire_zoneid = zoneid; 705 0 stevel ire->ire_ipversion = ipversion; 706 11042 Erik 707 2535 sangeeta mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 708 0 stevel ire->ire_refcnt = 1; 709 11042 Erik ire->ire_identical_ref = 1; /* Number of ire_delete's needed */ 710 3448 dh155122 ire->ire_ipst = ipst; /* No netstack_hold */ 711 5023 carlsonj ire->ire_trace_disable = B_FALSE; 712 1676 jpk 713 11042 Erik return (0); 714 0 stevel } 715 0 stevel 716 0 stevel /* 717 11042 Erik * This creates an IRE_BROADCAST based on the arguments. 718 11042 Erik * A mirror is ire_lookup_bcast(). 719 0 stevel * 720 11042 Erik * Any supression of unneeded ones is done in ire_add_v4. 721 11042 Erik * We add one IRE_BROADCAST per address. ire_send_broadcast_v4() 722 11042 Erik * takes care of generating a loopback copy of the packet. 723 0 stevel */ 724 0 stevel ire_t ** 725 11042 Erik ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep) 726 0 stevel { 727 11042 Erik ip_stack_t *ipst = ill->ill_ipst; 728 0 stevel 729 11042 Erik ASSERT(IAM_WRITER_ILL(ill)); 730 3448 dh155122 731 0 stevel *irep++ = ire_create( 732 0 stevel (uchar_t *)&addr, /* dest addr */ 733 0 stevel (uchar_t *)&ip_g_all_ones, /* mask */ 734 0 stevel NULL, /* no gateway */ 735 0 stevel IRE_BROADCAST, 736 11042 Erik ill, 737 11042 Erik zoneid, 738 11042 Erik RTF_KERNEL, 739 4714 sowmini NULL, 740 4714 sowmini ipst); 741 0 stevel 742 0 stevel return (irep); 743 0 stevel } 744 0 stevel 745 0 stevel /* 746 11042 Erik * This looks up an IRE_BROADCAST based on the arguments. 747 11042 Erik * Mirrors ire_create_bcast(). 
748 0 stevel */ 749 0 stevel ire_t * 750 11042 Erik ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 751 0 stevel { 752 11042 Erik ire_t *ire; 753 11042 Erik int match_args; 754 0 stevel 755 11042 Erik match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW | 756 11042 Erik MATCH_IRE_MASK | MATCH_IRE_ZONEONLY; 757 0 stevel 758 11042 Erik if (IS_UNDER_IPMP(ill)) 759 11042 Erik match_args |= MATCH_IRE_TESTHIDDEN; 760 0 stevel 761 11042 Erik ire = ire_ftable_lookup_v4( 762 11042 Erik addr, /* dest addr */ 763 11042 Erik ip_g_all_ones, /* mask */ 764 11042 Erik 0, /* no gateway */ 765 11042 Erik IRE_BROADCAST, 766 11042 Erik ill, 767 11042 Erik zoneid, 768 11042 Erik NULL, 769 11042 Erik match_args, 770 11042 Erik 0, 771 11042 Erik ill->ill_ipst, 772 11042 Erik NULL); 773 11042 Erik return (ire); 774 0 stevel } 775 0 stevel 776 0 stevel /* Arrange to call the specified function for every IRE in the world. */ 777 0 stevel void 778 3448 dh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 779 3448 dh155122 { 780 3448 dh155122 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 781 3448 dh155122 } 782 3448 dh155122 783 3448 dh155122 void 784 3448 dh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 785 3448 dh155122 { 786 3448 dh155122 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 787 3448 dh155122 } 788 3448 dh155122 789 3448 dh155122 void 790 3448 dh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 791 3448 dh155122 { 792 3448 dh155122 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 793 0 stevel } 794 0 stevel 795 0 stevel /* 796 0 stevel * Walk a particular version. version == 0 means both v4 and v6. 
797 0 stevel */ 798 0 stevel static void 799 3448 dh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 800 3448 dh155122 ip_stack_t *ipst) 801 0 stevel { 802 0 stevel if (vers != IPV6_VERSION) { 803 2535 sangeeta /* 804 2535 sangeeta * ip_forwarding_table variable doesn't matter for IPv4 since 805 3448 dh155122 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 806 2535 sangeeta */ 807 0 stevel ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 808 2535 sangeeta 0, NULL, 809 3448 dh155122 NULL, zoneid, ipst); 810 0 stevel } 811 0 stevel if (vers != IPV4_VERSION) { 812 0 stevel ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 813 3448 dh155122 ipst->ips_ip6_ftable_hash_size, 814 3448 dh155122 ipst->ips_ip_forwarding_table_v6, 815 11042 Erik NULL, zoneid, ipst); 816 0 stevel } 817 0 stevel } 818 0 stevel 819 0 stevel /* 820 7216 meem * Arrange to call the specified function for every IRE that matches the ill. 821 0 stevel */ 822 0 stevel void 823 1676 jpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 824 0 stevel ill_t *ill) 825 0 stevel { 826 7216 meem uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 827 7216 meem 828 7216 meem ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 829 0 stevel } 830 0 stevel 831 0 stevel /* 832 7216 meem * Walk a particular ill and version. 
833 0 stevel */ 834 0 stevel static void 835 0 stevel ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 836 1676 jpk void *arg, uchar_t vers, ill_t *ill) 837 0 stevel { 838 3448 dh155122 ip_stack_t *ipst = ill->ill_ipst; 839 3448 dh155122 840 7216 meem if (vers == IPV4_VERSION) { 841 0 stevel ire_walk_ill_tables(match_flags, ire_type, func, arg, 842 11042 Erik IP_MASK_TABLE_SIZE, 843 11042 Erik 0, NULL, 844 11042 Erik ill, ALL_ZONES, ipst); 845 11042 Erik } 846 11042 Erik if (vers != IPV4_VERSION) { 847 0 stevel ire_walk_ill_tables(match_flags, ire_type, func, arg, 848 3448 dh155122 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 849 3448 dh155122 ipst->ips_ip_forwarding_table_v6, 850 11042 Erik ill, ALL_ZONES, ipst); 851 0 stevel } 852 0 stevel } 853 0 stevel 854 11042 Erik /* 855 11042 Erik * Do the specific matching of IREs to shared-IP zones. 856 11042 Erik * 857 11042 Erik * We have the same logic as in ire_match_args but implemented slightly 858 11042 Erik * differently. 859 11042 Erik */ 860 2535 sangeeta boolean_t 861 0 stevel ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 862 3448 dh155122 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 863 0 stevel { 864 11131 Erik ill_t *dst_ill = ire->ire_ill; 865 0 stevel 866 0 stevel ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 867 0 stevel 868 11042 Erik if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 869 11042 Erik ire->ire_zoneid != ALL_ZONES) { 870 0 stevel /* 871 0 stevel * We're walking the IREs for a specific zone. The only relevant 872 0 stevel * IREs are: 873 0 stevel * - all IREs with a matching ire_zoneid 874 11042 Erik * - IRE_IF_ALL IREs for interfaces with a usable source addr 875 0 stevel * with a matching zone 876 11042 Erik * - IRE_OFFLINK with a gateway reachable from the zone 877 11042 Erik * Note that ealier we only did the IRE_OFFLINK check for 878 11042 Erik * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs). 
879 0 stevel */ 880 11042 Erik if (ire->ire_type & IRE_ONLINK) { 881 11042 Erik uint_t ifindex; 882 11042 Erik 883 0 stevel /* 884 11042 Erik * Note there is no IRE_INTERFACE on vniN thus 885 11042 Erik * can't do an IRE lookup for a matching route. 886 0 stevel */ 887 11042 Erik ifindex = dst_ill->ill_usesrc_ifindex; 888 11042 Erik if (ifindex == 0) 889 11042 Erik return (B_FALSE); 890 0 stevel 891 11042 Erik /* 892 11042 Erik * If there is a usable source address in the 893 11042 Erik * zone, then it's ok to return an 894 11042 Erik * IRE_INTERFACE 895 11042 Erik */ 896 11042 Erik if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 897 11042 Erik zoneid, ipst)) { 898 11042 Erik return (B_FALSE); 899 11042 Erik } 900 11042 Erik } 901 11042 Erik if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 902 11042 Erik ipif_t *tipif; 903 11042 Erik 904 11042 Erik mutex_enter(&dst_ill->ill_lock); 905 11042 Erik for (tipif = dst_ill->ill_ipif; 906 11042 Erik tipif != NULL; tipif = tipif->ipif_next) { 907 11042 Erik if (!IPIF_IS_CONDEMNED(tipif) && 908 11042 Erik (tipif->ipif_flags & IPIF_UP) && 909 11042 Erik (tipif->ipif_zoneid == zoneid || 910 11042 Erik tipif->ipif_zoneid == ALL_ZONES)) 911 11042 Erik break; 912 11042 Erik } 913 11042 Erik mutex_exit(&dst_ill->ill_lock); 914 11042 Erik if (tipif == NULL) { 915 0 stevel return (B_FALSE); 916 0 stevel } 917 0 stevel } 918 11131 Erik } 919 11131 Erik /* 920 11131 Erik * Except for ALL_ZONES, we only match the offlink routes 921 11131 Erik * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. 
922 11131 Erik */ 923 11131 Erik if ((ire->ire_type & IRE_OFFLINK) && zoneid != ALL_ZONES) { 924 11131 Erik in6_addr_t gw_addr_v6; 925 0 stevel 926 11131 Erik if (ire->ire_ipversion == IPV4_VERSION) { 927 11131 Erik if (!ire_gateway_ok_zone_v4(ire->ire_gateway_addr, 928 11131 Erik zoneid, dst_ill, NULL, ipst, B_FALSE)) 929 11131 Erik return (B_FALSE); 930 11131 Erik } else { 931 11131 Erik ASSERT(ire->ire_ipversion == IPV6_VERSION); 932 11131 Erik mutex_enter(&ire->ire_lock); 933 11131 Erik gw_addr_v6 = ire->ire_gateway_addr_v6; 934 11131 Erik mutex_exit(&ire->ire_lock); 935 8485 Peter 936 11131 Erik if (!ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid, 937 11131 Erik dst_ill, NULL, ipst, B_FALSE)) 938 11131 Erik return (B_FALSE); 939 0 stevel } 940 0 stevel } 941 0 stevel 942 0 stevel if (((!(match_flags & MATCH_IRE_TYPE)) || 943 4714 sowmini (ire->ire_type & ire_type)) && 944 0 stevel ((!(match_flags & MATCH_IRE_ILL)) || 945 11042 Erik (dst_ill == ill || 946 11042 Erik dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) { 947 0 stevel return (B_TRUE); 948 0 stevel } 949 0 stevel return (B_FALSE); 950 0 stevel } 951 0 stevel 952 2535 sangeeta int 953 2535 sangeeta rtfunc(struct radix_node *rn, void *arg) 954 2535 sangeeta { 955 2535 sangeeta struct rtfuncarg *rtf = arg; 956 2535 sangeeta struct rt_entry *rt; 957 2535 sangeeta irb_t *irb; 958 2535 sangeeta ire_t *ire; 959 2535 sangeeta boolean_t ret; 960 2535 sangeeta 961 2535 sangeeta rt = (struct rt_entry *)rn; 962 2535 sangeeta ASSERT(rt != NULL); 963 2535 sangeeta irb = &rt->rt_irb; 964 2535 sangeeta for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 965 2535 sangeeta if ((rtf->rt_match_flags != 0) || 966 2535 sangeeta (rtf->rt_zoneid != ALL_ZONES)) { 967 2535 sangeeta ret = ire_walk_ill_match(rtf->rt_match_flags, 968 2535 sangeeta rtf->rt_ire_type, ire, 969 3448 dh155122 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 970 11042 Erik } else { 971 2535 sangeeta ret = B_TRUE; 972 11042 Erik } 973 2535 sangeeta 
if (ret) 974 2535 sangeeta (*rtf->rt_func)(ire, rtf->rt_arg); 975 2535 sangeeta } 976 2535 sangeeta return (0); 977 2535 sangeeta } 978 2535 sangeeta 979 0 stevel /* 980 11042 Erik * Walk the ftable entries that match the ill. 981 0 stevel */ 982 2535 sangeeta void 983 0 stevel ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 984 1676 jpk void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 985 11042 Erik ill_t *ill, zoneid_t zoneid, 986 3448 dh155122 ip_stack_t *ipst) 987 0 stevel { 988 0 stevel irb_t *irb_ptr; 989 0 stevel irb_t *irb; 990 0 stevel ire_t *ire; 991 0 stevel int i, j; 992 0 stevel boolean_t ret; 993 2535 sangeeta struct rtfuncarg rtfarg; 994 0 stevel 995 8485 Peter ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 996 0 stevel ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 997 11042 Erik 998 11042 Erik /* knobs such that routine is called only for v6 case */ 999 11042 Erik if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 1000 11042 Erik for (i = (ftbl_sz - 1); i >= 0; i--) { 1001 11042 Erik if ((irb_ptr = ipftbl[i]) == NULL) 1002 11042 Erik continue; 1003 11042 Erik for (j = 0; j < htbl_sz; j++) { 1004 11042 Erik irb = &irb_ptr[j]; 1005 11042 Erik if (irb->irb_ire == NULL) 1006 0 stevel continue; 1007 2535 sangeeta 1008 11042 Erik irb_refhold(irb); 1009 11042 Erik for (ire = irb->irb_ire; ire != NULL; 1010 11042 Erik ire = ire->ire_next) { 1011 11042 Erik if (match_flags == 0 && 1012 11042 Erik zoneid == ALL_ZONES) { 1013 11042 Erik ret = B_TRUE; 1014 11042 Erik } else { 1015 11042 Erik ret = 1016 11042 Erik ire_walk_ill_match( 1017 11042 Erik match_flags, 1018 11042 Erik ire_type, ire, ill, 1019 11042 Erik zoneid, ipst); 1020 0 stevel } 1021 11042 Erik if (ret) 1022 11042 Erik (*func)(ire, arg); 1023 2535 sangeeta } 1024 11042 Erik irb_refrele(irb); 1025 2535 sangeeta } 1026 0 stevel } 1027 11042 Erik } else { 1028 11131 Erik bzero(&rtfarg, sizeof (rtfarg)); 1029 11042 Erik rtfarg.rt_func = func; 
1030 11042 Erik rtfarg.rt_arg = arg; 1031 11042 Erik if (match_flags != 0) { 1032 11042 Erik rtfarg.rt_match_flags = match_flags; 1033 0 stevel } 1034 11042 Erik rtfarg.rt_ire_type = ire_type; 1035 11042 Erik rtfarg.rt_ill = ill; 1036 11042 Erik rtfarg.rt_zoneid = zoneid; 1037 11042 Erik rtfarg.rt_ipst = ipst; /* No netstack_hold */ 1038 11042 Erik (void) ipst->ips_ip_ftable->rnh_walktree_mt( 1039 11042 Erik ipst->ips_ip_ftable, 1040 11042 Erik rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 1041 0 stevel } 1042 0 stevel } 1043 0 stevel 1044 0 stevel /* 1045 0 stevel * This function takes a mask and returns 1046 0 stevel * number of bits set in the mask. If no 1047 0 stevel * bit is set it returns 0. 1048 0 stevel * Assumes a contiguous mask. 1049 0 stevel */ 1050 0 stevel int 1051 0 stevel ip_mask_to_plen(ipaddr_t mask) 1052 0 stevel { 1053 0 stevel return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 1054 0 stevel } 1055 0 stevel 1056 0 stevel /* 1057 0 stevel * Convert length for a mask to the mask. 1058 0 stevel */ 1059 0 stevel ipaddr_t 1060 0 stevel ip_plen_to_mask(uint_t masklen) 1061 0 stevel { 1062 11042 Erik if (masklen == 0) 1063 11042 Erik return (0); 1064 11042 Erik 1065 0 stevel return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 1066 0 stevel } 1067 0 stevel 1068 0 stevel void 1069 0 stevel ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 1070 0 stevel { 1071 11042 Erik ill_t *ill; 1072 8564 Peter 1073 11042 Erik ill = ire->ire_ill; 1074 11042 Erik if (ill != NULL) 1075 11042 Erik mutex_exit(&ill->ill_lock); 1076 0 stevel rw_exit(&irb_ptr->irb_lock); 1077 0 stevel } 1078 0 stevel 1079 0 stevel /* 1080 11042 Erik * ire_add_v[46] atomically make sure that the ill associated 1081 11042 Erik * with the new ire is not going away i.e., we check ILL_CONDEMNED. 
1082 0 stevel */ 1083 0 stevel int 1084 11042 Erik ire_atomic_start(irb_t *irb_ptr, ire_t *ire) 1085 0 stevel { 1086 11042 Erik ill_t *ill; 1087 0 stevel 1088 11042 Erik ill = ire->ire_ill; 1089 0 stevel 1090 0 stevel rw_enter(&irb_ptr->irb_lock, RW_WRITER); 1091 11042 Erik if (ill != NULL) { 1092 11042 Erik mutex_enter(&ill->ill_lock); 1093 0 stevel 1094 11042 Erik /* 1095 11042 Erik * Don't allow IRE's to be created on dying ills. 1096 11042 Erik */ 1097 11042 Erik if (ill->ill_state_flags & ILL_CONDEMNED) { 1098 11042 Erik ire_atomic_end(irb_ptr, ire); 1099 11042 Erik return (ENXIO); 1100 0 stevel } 1101 0 stevel 1102 11042 Erik if (IS_UNDER_IPMP(ill)) { 1103 11042 Erik int error = 0; 1104 11042 Erik mutex_enter(&ill->ill_phyint->phyint_lock); 1105 11042 Erik if (!ipmp_ill_is_active(ill) && 1106 11042 Erik IRE_HIDDEN_TYPE(ire->ire_type) && 1107 11042 Erik !ire->ire_testhidden) { 1108 8485 Peter error = EINVAL; 1109 8485 Peter } 1110 11042 Erik mutex_exit(&ill->ill_phyint->phyint_lock); 1111 11042 Erik if (error != 0) { 1112 11042 Erik ire_atomic_end(irb_ptr, ire); 1113 11042 Erik return (error); 1114 11042 Erik } 1115 8485 Peter } 1116 11042 Erik 1117 8485 Peter } 1118 11042 Erik return (0); 1119 0 stevel } 1120 0 stevel 1121 0 stevel /* 1122 11042 Erik * Add a fully initialized IRE to the forwarding table. 1123 11042 Erik * This returns NULL on failure, or a held IRE on success. 1124 11042 Erik * Normally the returned IRE is the same as the argument. But a different 1125 11042 Erik * IRE will be returned if the added IRE is deemed identical to an existing 1126 11042 Erik * one. In that case ire_identical_ref will be increased. 1127 11042 Erik * The caller always needs to do an ire_refrele() on the returned IRE. 
1128 2535 sangeeta */ 1129 11042 Erik ire_t * 1130 11042 Erik ire_add(ire_t *ire) 1131 0 stevel { 1132 11042 Erik if (IRE_HIDDEN_TYPE(ire->ire_type) && 1133 11042 Erik ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) { 1134 2416 jarrett /* 1135 11042 Erik * IREs hosted on interfaces that are under IPMP 1136 11042 Erik * should be hidden so that applications don't 1137 11042 Erik * accidentally end up sending packets with test 1138 11042 Erik * addresses as their source addresses, or 1139 11042 Erik * sending out interfaces that are e.g. IFF_INACTIVE. 1140 11042 Erik * Hide them here. 1141 2416 jarrett */ 1142 11042 Erik ire->ire_testhidden = B_TRUE; 1143 0 stevel } 1144 0 stevel 1145 4823 seb if (ire->ire_ipversion == IPV6_VERSION) 1146 11042 Erik return (ire_add_v6(ire)); 1147 4823 seb else 1148 11042 Erik return (ire_add_v4(ire)); 1149 0 stevel } 1150 0 stevel 1151 0 stevel /* 1152 11042 Erik * Add a fully initialized IPv4 IRE to the forwarding table. 1153 11042 Erik * This returns NULL on failure, or a held IRE on success. 1154 11042 Erik * Normally the returned IRE is the same as the argument. But a different 1155 11042 Erik * IRE will be returned if the added IRE is deemed identical to an existing 1156 11042 Erik * one. In that case ire_identical_ref will be increased. 1157 11042 Erik * The caller always needs to do an ire_refrele() on the returned IRE. 1158 0 stevel */ 1159 11042 Erik static ire_t * 1160 11042 Erik ire_add_v4(ire_t *ire) 1161 0 stevel { 1162 0 stevel ire_t *ire1; 1163 0 stevel irb_t *irb_ptr; 1164 0 stevel ire_t **irep; 1165 11042 Erik int match_flags; 1166 0 stevel int error; 1167 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 1168 8485 Peter 1169 11042 Erik if (ire->ire_ill != NULL) 1170 11042 Erik ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 1171 0 stevel ASSERT(ire->ire_ipversion == IPV4_VERSION); 1172 0 stevel 1173 0 stevel /* Make sure the address is properly masked. 
*/ 1174 0 stevel ire->ire_addr &= ire->ire_mask; 1175 0 stevel 1176 11042 Erik match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 1177 2535 sangeeta 1178 11042 Erik if (ire->ire_ill != NULL) { 1179 11042 Erik match_flags |= MATCH_IRE_ILL; 1180 0 stevel } 1181 11042 Erik irb_ptr = ire_get_bucket(ire); 1182 11042 Erik if (irb_ptr == NULL) { 1183 11042 Erik printf("no bucket for %p\n", (void *)ire); 1184 11042 Erik ire_delete(ire); 1185 11042 Erik return (NULL); 1186 2535 sangeeta } 1187 0 stevel 1188 0 stevel /* 1189 11042 Erik * Start the atomic add of the ire. Grab the ill lock, 1190 11042 Erik * the bucket lock. Check for condemned. 1191 3448 dh155122 */ 1192 11042 Erik error = ire_atomic_start(irb_ptr, ire); 1193 0 stevel if (error != 0) { 1194 11042 Erik printf("no ire_atomic_start for %p\n", (void *)ire); 1195 0 stevel ire_delete(ire); 1196 11042 Erik irb_refrele(irb_ptr); 1197 11042 Erik return (NULL); 1198 0 stevel } 1199 0 stevel /* 1200 11042 Erik * If we are creating a hidden IRE, make sure we search for 1201 11042 Erik * hidden IREs when searching for duplicates below. 1202 11042 Erik * Otherwise, we might find an IRE on some other interface 1203 11042 Erik * that's not marked hidden. 1204 0 stevel */ 1205 11042 Erik if (ire->ire_testhidden) 1206 11042 Erik match_flags |= MATCH_IRE_TESTHIDDEN; 1207 0 stevel 1208 0 stevel /* 1209 0 stevel * Atomically check for duplicate and insert in the table. 1210 0 stevel */ 1211 0 stevel for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1212 11042 Erik if (IRE_IS_CONDEMNED(ire1)) 1213 0 stevel continue; 1214 11042 Erik /* 1215 11042 Erik * Here we need an exact match on zoneid, i.e., 1216 11042 Erik * ire_match_args doesn't fit. 
1217 11042 Erik */ 1218 0 stevel if (ire1->ire_zoneid != ire->ire_zoneid) 1219 0 stevel continue; 1220 11042 Erik 1221 11042 Erik if (ire1->ire_type != ire->ire_type) 1222 11042 Erik continue; 1223 11042 Erik 1224 11042 Erik /* 1225 11042 Erik * Note: We do not allow multiple routes that differ only 1226 11042 Erik * in the gateway security attributes; such routes are 1227 11042 Erik * considered duplicates. 1228 11042 Erik * To change that we explicitly have to treat them as 1229 11042 Erik * different here. 1230 11042 Erik */ 1231 0 stevel if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 1232 11042 Erik ire->ire_gateway_addr, ire->ire_type, ire->ire_ill, 1233 11042 Erik ire->ire_zoneid, NULL, match_flags)) { 1234 0 stevel /* 1235 0 stevel * Return the old ire after doing a REFHOLD. 1236 0 stevel * As most of the callers continue to use the IRE 1237 0 stevel * after adding, we return a held ire. This will 1238 0 stevel * avoid a lookup in the caller again. If the callers 1239 0 stevel * don't want to use it, they need to do a REFRELE. 1240 0 stevel */ 1241 11042 Erik atomic_add_32(&ire1->ire_identical_ref, 1); 1242 11042 Erik DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, 1243 11042 Erik ire_t *, ire); 1244 11042 Erik ire_refhold(ire1); 1245 0 stevel ire_atomic_end(irb_ptr, ire); 1246 0 stevel ire_delete(ire); 1247 11042 Erik irb_refrele(irb_ptr); 1248 11042 Erik return (ire1); 1249 0 stevel } 1250 0 stevel } 1251 8485 Peter 1252 0 stevel /* 1253 11042 Erik * Normally we do head insertion since most things do not care about 1254 11042 Erik * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add 1255 11042 Erik * assumes we at least do head insertion so that its IRE_BROADCAST 1256 11042 Erik * arrive ahead of existing IRE_HOST for the same address. 1257 11042 Erik * However, due to shared-IP zones (and restrict_interzone_loopback) 1258 11042 Erik * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 1259 11042 Erik * address. 
For that reason we do tail insertion for IRE_IF_CLONE. 1260 11042 Erik * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket, 1261 11042 Erik * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT 1262 11042 Erik * set. 1263 0 stevel */ 1264 0 stevel irep = (ire_t **)irb_ptr; 1265 11042 Erik if ((ire->ire_type & IRE_IF_CLONE) || 1266 11042 Erik ((ire->ire_type & IRE_BROADCAST) && 1267 11042 Erik !(ire->ire_flags & RTF_MULTIRT))) { 1268 11042 Erik while ((ire1 = *irep) != NULL) 1269 0 stevel irep = &ire1->ire_next; 1270 0 stevel } 1271 0 stevel /* Insert at *irep */ 1272 0 stevel ire1 = *irep; 1273 0 stevel if (ire1 != NULL) 1274 0 stevel ire1->ire_ptpn = &ire->ire_next; 1275 0 stevel ire->ire_next = ire1; 1276 0 stevel /* Link the new one in. */ 1277 0 stevel ire->ire_ptpn = irep; 1278 0 stevel 1279 0 stevel /* 1280 0 stevel * ire_walk routines de-reference ire_next without holding 1281 0 stevel * a lock. Before we point to the new ire, we want to make 1282 0 stevel * sure the store that sets the ire_next of the new ire 1283 0 stevel * reaches global visibility, so that ire_walk routines 1284 0 stevel * don't see a truncated list of ires i.e if the ire_next 1285 0 stevel * of the new ire gets set after we do "*irep = ire" due 1286 0 stevel * to re-ordering, the ire_walk thread will see a NULL 1287 0 stevel * once it accesses the ire_next of the new ire. 1288 0 stevel * membar_producer() makes sure that the following store 1289 0 stevel * happens *after* all of the above stores. 1290 0 stevel */ 1291 0 stevel membar_producer(); 1292 0 stevel *irep = ire; 1293 0 stevel ire->ire_bucket = irb_ptr; 1294 0 stevel /* 1295 0 stevel * We return a bumped up IRE above. Keep it symmetrical 1296 0 stevel * so that the callers will always have to release. 
This 1297 0 stevel * helps the callers of this function because they continue 1298 0 stevel * to use the IRE after adding and hence they don't have to 1299 0 stevel * lookup again after we return the IRE. 1300 0 stevel * 1301 0 stevel * NOTE : We don't have to use atomics as this is appearing 1302 0 stevel * in the list for the first time and no one else can bump 1303 0 stevel * up the reference count on this yet. 1304 0 stevel */ 1305 11042 Erik ire_refhold_locked(ire); 1306 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 1307 2535 sangeeta 1308 0 stevel irb_ptr->irb_ire_cnt++; 1309 11042 Erik if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC) 1310 2535 sangeeta irb_ptr->irb_nire++; 1311 2535 sangeeta 1312 11042 Erik if (ire->ire_ill != NULL) { 1313 11042 Erik ire->ire_ill->ill_ire_cnt++; 1314 11042 Erik ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 1315 0 stevel } 1316 0 stevel 1317 0 stevel ire_atomic_end(irb_ptr, ire); 1318 0 stevel 1319 11042 Erik /* Make any caching of the IREs be notified or updated */ 1320 11042 Erik ire_flush_cache_v4(ire, IRE_FLUSH_ADD); 1321 0 stevel 1322 11042 Erik if (ire->ire_ill != NULL) 1323 11042 Erik ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 1324 11042 Erik irb_refrele(irb_ptr); 1325 11042 Erik return (ire); 1326 0 stevel } 1327 0 stevel 1328 0 stevel /* 1329 11042 Erik * irb_refrele is the only caller of the function. ire_unlink calls to 1330 0 stevel * do the final cleanup for this ire. 
1331 0 stevel */ 1332 0 stevel void 1333 0 stevel ire_cleanup(ire_t *ire) 1334 0 stevel { 1335 0 stevel ire_t *ire_next; 1336 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 1337 0 stevel 1338 0 stevel ASSERT(ire != NULL); 1339 0 stevel 1340 0 stevel while (ire != NULL) { 1341 0 stevel ire_next = ire->ire_next; 1342 0 stevel if (ire->ire_ipversion == IPV4_VERSION) { 1343 0 stevel ire_delete_v4(ire); 1344 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 1345 3448 dh155122 ire_stats_deleted); 1346 0 stevel } else { 1347 0 stevel ASSERT(ire->ire_ipversion == IPV6_VERSION); 1348 0 stevel ire_delete_v6(ire); 1349 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, 1350 3448 dh155122 ire_stats_deleted); 1351 0 stevel } 1352 0 stevel /* 1353 0 stevel * Now it's really out of the list. Before doing the 1354 0 stevel * REFRELE, set ire_next to NULL as ire_inactive asserts 1355 0 stevel * so. 1356 0 stevel */ 1357 0 stevel ire->ire_next = NULL; 1358 11042 Erik ire_refrele_notr(ire); 1359 0 stevel ire = ire_next; 1360 0 stevel } 1361 0 stevel } 1362 0 stevel 1363 0 stevel /* 1364 11042 Erik * irb_refrele is the only caller of the function. It calls to unlink 1365 0 stevel * all the CONDEMNED ires from this bucket. 
1366 0 stevel */ 1367 0 stevel ire_t * 1368 0 stevel ire_unlink(irb_t *irb) 1369 0 stevel { 1370 0 stevel ire_t *ire; 1371 0 stevel ire_t *ire1; 1372 0 stevel ire_t **ptpn; 1373 0 stevel ire_t *ire_list = NULL; 1374 0 stevel 1375 0 stevel ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 1376 11042 Erik ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) || 1377 2535 sangeeta (irb->irb_refcnt == 0)); 1378 2535 sangeeta ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); 1379 0 stevel ASSERT(irb->irb_ire != NULL); 1380 0 stevel 1381 0 stevel for (ire = irb->irb_ire; ire != NULL; ire = ire1) { 1382 0 stevel ire1 = ire->ire_next; 1383 11042 Erik if (IRE_IS_CONDEMNED(ire)) { 1384 0 stevel ptpn = ire->ire_ptpn; 1385 0 stevel ire1 = ire->ire_next; 1386 0 stevel if (ire1) 1387 0 stevel ire1->ire_ptpn = ptpn; 1388 0 stevel *ptpn = ire1; 1389 0 stevel ire->ire_ptpn = NULL; 1390 0 stevel ire->ire_next = NULL; 1391 11042 Erik 1392 0 stevel /* 1393 11042 Erik * We need to call ire_delete_v4 or ire_delete_v6 to 1394 11042 Erik * clean up dependents and the redirects pointing at 1395 0 stevel * the default gateway. We need to drop the lock 1396 0 stevel * as ire_flush_cache/ire_delete_host_redircts require 1397 0 stevel * so. But we can't drop the lock, as ire_unlink needs 1398 0 stevel * to atomically remove the ires from the list. 1399 0 stevel * So, create a temporary list of CONDEMNED ires 1400 0 stevel * for doing ire_delete_v4/ire_delete_v6 operations 1401 0 stevel * later on. 1402 0 stevel */ 1403 0 stevel ire->ire_next = ire_list; 1404 0 stevel ire_list = ire; 1405 0 stevel } 1406 0 stevel } 1407 2535 sangeeta irb->irb_marks &= ~IRB_MARK_CONDEMNED; 1408 0 stevel return (ire_list); 1409 0 stevel } 1410 0 stevel 1411 0 stevel /* 1412 11042 Erik * Clean up the radix node for this ire. Must be called by irb_refrele 1413 2535 sangeeta * when there are no ire's left in the bucket. Returns TRUE if the bucket 1414 2535 sangeeta * is deleted and freed. 
1415 2535 sangeeta */ 1416 2535 sangeeta boolean_t 1417 2535 sangeeta irb_inactive(irb_t *irb) 1418 2535 sangeeta { 1419 2535 sangeeta struct rt_entry *rt; 1420 2535 sangeeta struct radix_node *rn; 1421 3448 dh155122 ip_stack_t *ipst = irb->irb_ipst; 1422 3448 dh155122 1423 3448 dh155122 ASSERT(irb->irb_ipst != NULL); 1424 2535 sangeeta 1425 2535 sangeeta rt = IRB2RT(irb); 1426 2535 sangeeta rn = (struct radix_node *)rt; 1427 2535 sangeeta 1428 2535 sangeeta /* first remove it from the radix tree. */ 1429 3448 dh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 1430 2535 sangeeta rw_enter(&irb->irb_lock, RW_WRITER); 1431 2535 sangeeta if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 1432 3448 dh155122 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 1433 3448 dh155122 ipst->ips_ip_ftable); 1434 2535 sangeeta DTRACE_PROBE1(irb__free, rt_t *, rt); 1435 2535 sangeeta ASSERT((void *)rn == (void *)rt); 1436 2535 sangeeta Free(rt, rt_entry_cache); 1437 2535 sangeeta /* irb_lock is freed */ 1438 3448 dh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1439 2535 sangeeta return (B_TRUE); 1440 2535 sangeeta } 1441 2535 sangeeta rw_exit(&irb->irb_lock); 1442 3448 dh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1443 2535 sangeeta return (B_FALSE); 1444 0 stevel } 1445 0 stevel 1446 0 stevel /* 1447 0 stevel * Delete the specified IRE. 1448 11042 Erik * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was 1449 11042 Erik * not incremented i.e., that the insertion in the bucket and the increment 1450 11042 Erik * of that counter is done atomically. 
1451 0 stevel */ 1452 0 stevel void 1453 0 stevel ire_delete(ire_t *ire) 1454 0 stevel { 1455 0 stevel ire_t *ire1; 1456 0 stevel ire_t **ptpn; 1457 11042 Erik irb_t *irb; 1458 11042 Erik nce_t *nce; 1459 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 1460 11042 Erik 1461 11042 Erik /* We can clear ire_nce_cache under ire_lock even if the IRE is used */ 1462 11042 Erik mutex_enter(&ire->ire_lock); 1463 11042 Erik nce = ire->ire_nce_cache; 1464 11042 Erik ire->ire_nce_cache = NULL; 1465 11042 Erik mutex_exit(&ire->ire_lock); 1466 11042 Erik if (nce != NULL) 1467 11042 Erik nce_refrele(nce); 1468 0 stevel 1469 0 stevel if ((irb = ire->ire_bucket) == NULL) { 1470 2535 sangeeta /* 1471 2535 sangeeta * It was never inserted in the list. Should call REFRELE 1472 2535 sangeeta * to free this IRE. 1473 2535 sangeeta */ 1474 11042 Erik ire_refrele_notr(ire); 1475 0 stevel return; 1476 0 stevel } 1477 0 stevel 1478 11042 Erik /* 1479 11042 Erik * Move the use counts from an IRE_IF_CLONE to its parent 1480 11042 Erik * IRE_INTERFACE. 1481 11042 Erik * We need to do this before acquiring irb_lock. 1482 11042 Erik */ 1483 11042 Erik if (ire->ire_type & IRE_IF_CLONE) { 1484 11042 Erik ire_t *parent; 1485 0 stevel 1486 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 1487 11042 Erik if ((parent = ire->ire_dep_parent) != NULL) { 1488 11042 Erik parent->ire_ob_pkt_count += ire->ire_ob_pkt_count; 1489 11042 Erik parent->ire_ib_pkt_count += ire->ire_ib_pkt_count; 1490 11042 Erik ire->ire_ob_pkt_count = 0; 1491 11042 Erik ire->ire_ib_pkt_count = 0; 1492 11042 Erik } 1493 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 1494 2535 sangeeta } 1495 2535 sangeeta 1496 11042 Erik rw_enter(&irb->irb_lock, RW_WRITER); 1497 0 stevel if (ire->ire_ptpn == NULL) { 1498 0 stevel /* 1499 0 stevel * Some other thread has removed us from the list. 1500 0 stevel * It should have done the REFRELE for us. 
1501 0 stevel */ 1502 0 stevel rw_exit(&irb->irb_lock); 1503 0 stevel return; 1504 0 stevel } 1505 0 stevel 1506 11042 Erik if (!IRE_IS_CONDEMNED(ire)) { 1507 11042 Erik /* Is this an IRE representing multiple duplicate entries? */ 1508 11042 Erik ASSERT(ire->ire_identical_ref >= 1); 1509 11042 Erik if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) { 1510 11042 Erik /* Removed one of the identical parties */ 1511 11042 Erik rw_exit(&irb->irb_lock); 1512 11042 Erik return; 1513 11042 Erik } 1514 11042 Erik 1515 5388 ja97890 irb->irb_ire_cnt--; 1516 11042 Erik ire_make_condemned(ire); 1517 5388 ja97890 } 1518 5388 ja97890 1519 0 stevel if (irb->irb_refcnt != 0) { 1520 0 stevel /* 1521 0 stevel * The last thread to leave this bucket will 1522 0 stevel * delete this ire. 1523 0 stevel */ 1524 2535 sangeeta irb->irb_marks |= IRB_MARK_CONDEMNED; 1525 0 stevel rw_exit(&irb->irb_lock); 1526 0 stevel return; 1527 0 stevel } 1528 0 stevel 1529 0 stevel /* 1530 0 stevel * Normally to delete an ire, we walk the bucket. While we 1531 0 stevel * walk the bucket, we normally bump up irb_refcnt and hence 1532 0 stevel * we return from above where we mark CONDEMNED and the ire 1533 0 stevel * gets deleted from ire_unlink. This case is where somebody 1534 0 stevel * knows the ire e.g by doing a lookup, and wants to delete the 1535 0 stevel * IRE. irb_refcnt would be 0 in this case if nobody is walking 1536 0 stevel * the bucket. 
1537 0 stevel */ 1538 0 stevel ptpn = ire->ire_ptpn; 1539 0 stevel ire1 = ire->ire_next; 1540 0 stevel if (ire1 != NULL) 1541 0 stevel ire1->ire_ptpn = ptpn; 1542 0 stevel ASSERT(ptpn != NULL); 1543 0 stevel *ptpn = ire1; 1544 0 stevel ire->ire_ptpn = NULL; 1545 0 stevel ire->ire_next = NULL; 1546 0 stevel if (ire->ire_ipversion == IPV6_VERSION) { 1547 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted); 1548 3448 dh155122 } else { 1549 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); 1550 0 stevel } 1551 0 stevel rw_exit(&irb->irb_lock); 1552 0 stevel 1553 11042 Erik /* Cleanup dependents and related stuff */ 1554 0 stevel if (ire->ire_ipversion == IPV6_VERSION) { 1555 0 stevel ire_delete_v6(ire); 1556 0 stevel } else { 1557 0 stevel ire_delete_v4(ire); 1558 0 stevel } 1559 0 stevel /* 1560 0 stevel * We removed it from the list. Decrement the 1561 0 stevel * reference count. 1562 0 stevel */ 1563 11042 Erik ire_refrele_notr(ire); 1564 0 stevel } 1565 0 stevel 1566 0 stevel /* 1567 0 stevel * Delete the specified IRE. 1568 0 stevel * All calls should use ire_delete(). 1569 0 stevel * Sometimes called as writer though not required by this function. 1570 0 stevel * 1571 0 stevel * NOTE : This function is called only if the ire was added 1572 0 stevel * in the list. 1573 0 stevel */ 1574 0 stevel static void 1575 0 stevel ire_delete_v4(ire_t *ire) 1576 0 stevel { 1577 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 1578 3448 dh155122 1579 0 stevel ASSERT(ire->ire_refcnt >= 1); 1580 0 stevel ASSERT(ire->ire_ipversion == IPV4_VERSION); 1581 0 stevel 1582 11042 Erik ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); 1583 0 stevel if (ire->ire_type == IRE_DEFAULT) { 1584 0 stevel /* 1585 0 stevel * when a default gateway is going away 1586 0 stevel * delete all the host redirects pointing at that 1587 0 stevel * gateway. 
1588 0 stevel */ 1589 3448 dh155122 ire_delete_host_redirects(ire->ire_gateway_addr, ipst); 1590 0 stevel } 1591 11042 Erik 1592 11042 Erik /* 1593 11042 Erik * If we are deleting an IRE_INTERFACE then we make sure we also 1594 11042 Erik * delete any IRE_IF_CLONE that has been created from it. 1595 11042 Erik * Those are always in ire_dep_children. 1596 11042 Erik */ 1597 11042 Erik if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL) 1598 11042 Erik ire_dep_delete_if_clone(ire); 1599 11042 Erik 1600 11042 Erik /* Remove from parent dependencies and child */ 1601 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 1602 11042 Erik if (ire->ire_dep_parent != NULL) 1603 11042 Erik ire_dep_remove(ire); 1604 11042 Erik 1605 11042 Erik while (ire->ire_dep_children != NULL) 1606 11042 Erik ire_dep_remove(ire->ire_dep_children); 1607 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 1608 0 stevel } 1609 0 stevel 1610 0 stevel /* 1611 11042 Erik * ire_refrele is the only caller of the function. It calls 1612 0 stevel * to free the ire when the reference count goes to zero. 
1613 0 stevel */ 1614 0 stevel void 1615 0 stevel ire_inactive(ire_t *ire) 1616 0 stevel { 1617 11042 Erik ill_t *ill; 1618 2535 sangeeta irb_t *irb; 1619 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 1620 0 stevel 1621 0 stevel ASSERT(ire->ire_refcnt == 0); 1622 0 stevel ASSERT(ire->ire_ptpn == NULL); 1623 0 stevel ASSERT(ire->ire_next == NULL); 1624 0 stevel 1625 11042 Erik /* Count how many condemned ires for kmem_cache callback */ 1626 11042 Erik if (IRE_IS_CONDEMNED(ire)) 1627 11042 Erik atomic_add_32(&ipst->ips_num_ire_condemned, -1); 1628 11042 Erik 1629 2535 sangeeta if (ire->ire_gw_secattr != NULL) { 1630 2535 sangeeta ire_gw_secattr_free(ire->ire_gw_secattr); 1631 2535 sangeeta ire->ire_gw_secattr = NULL; 1632 2535 sangeeta } 1633 2535 sangeeta 1634 11042 Erik /* 1635 11042 Erik * ire_nce_cache is cleared in ire_delete, and we make sure we don't 1636 11042 Erik * set it once the ire is marked condemned. 1637 11042 Erik */ 1638 11042 Erik ASSERT(ire->ire_nce_cache == NULL); 1639 0 stevel 1640 0 stevel /* 1641 11042 Erik * Since any parent would have a refhold on us they would already 1642 11042 Erik * have been removed. 1643 0 stevel */ 1644 11042 Erik ASSERT(ire->ire_dep_parent == NULL); 1645 11042 Erik ASSERT(ire->ire_dep_sib_next == NULL); 1646 11042 Erik ASSERT(ire->ire_dep_sib_ptpn == NULL); 1647 4823 seb 1648 11042 Erik /* 1649 11042 Erik * Since any children would have a refhold on us they should have 1650 11042 Erik * already been removed. 1651 11042 Erik */ 1652 11042 Erik ASSERT(ire->ire_dep_children == NULL); 1653 11042 Erik 1654 11042 Erik /* 1655 11042 Erik * ill_ire_ref is increased when the IRE is inserted in the 1656 11042 Erik * bucket - not when the IRE is created. 
1657 11042 Erik */ 1658 11042 Erik irb = ire->ire_bucket; 1659 11042 Erik ill = ire->ire_ill; 1660 11042 Erik if (irb != NULL && ill != NULL) { 1661 0 stevel mutex_enter(&ill->ill_lock); 1662 11042 Erik ASSERT(ill->ill_ire_cnt != 0); 1663 11042 Erik DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 1664 6255 sowmini (char *), "ire", (void *), ire); 1665 11042 Erik ill->ill_ire_cnt--; 1666 11042 Erik if (ILL_DOWN_OK(ill)) { 1667 0 stevel /* Drops the ill lock */ 1668 0 stevel ipif_ill_refrele_tail(ill); 1669 0 stevel } else { 1670 0 stevel mutex_exit(&ill->ill_lock); 1671 0 stevel } 1672 0 stevel } 1673 11042 Erik ire->ire_ill = NULL; 1674 11042 Erik 1675 0 stevel /* This should be true for both V4 and V6 */ 1676 11042 Erik if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) { 1677 2535 sangeeta rw_enter(&irb->irb_lock, RW_WRITER); 1678 2535 sangeeta irb->irb_nire--; 1679 2535 sangeeta /* 1680 2535 sangeeta * Instead of examining the conditions for freeing 1681 2535 sangeeta * the radix node here, we do it by calling 1682 11042 Erik * irb_refrele which is a single point in the code 1683 2535 sangeeta * that embeds that logic. 
Bump up the refcnt to 1684 11042 Erik * be able to call irb_refrele 1685 2535 sangeeta */ 1686 11042 Erik irb_refhold_locked(irb); 1687 2535 sangeeta rw_exit(&irb->irb_lock); 1688 11042 Erik irb_refrele(irb); 1689 2535 sangeeta } 1690 0 stevel 1691 5023 carlsonj #ifdef DEBUG 1692 5023 carlsonj ire_trace_cleanup(ire); 1693 0 stevel #endif 1694 0 stevel mutex_destroy(&ire->ire_lock); 1695 0 stevel if (ire->ire_ipversion == IPV6_VERSION) { 1696 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 1697 3448 dh155122 } else { 1698 3448 dh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 1699 0 stevel } 1700 2535 sangeeta kmem_cache_free(ire_cache, ire); 1701 0 stevel } 1702 0 stevel 1703 0 stevel /* 1704 11042 Erik * ire_update_generation is the callback function provided by 1705 11042 Erik * ire_get_bucket() to update the generation number of any 1706 11042 Erik * matching shorter route when a new route is added. 1707 11042 Erik * 1708 11042 Erik * This fucntion always returns a failure return (B_FALSE) 1709 11042 Erik * to force the caller (rn_matchaddr_args) 1710 11042 Erik * to back-track up the tree looking for shorter matches. 1711 0 stevel */ 1712 11042 Erik /* ARGSUSED */ 1713 11042 Erik static boolean_t 1714 11042 Erik ire_update_generation(struct radix_node *rn, void *arg) 1715 0 stevel { 1716 11042 Erik struct rt_entry *rt = (struct rt_entry *)rn; 1717 0 stevel 1718 11042 Erik /* We need to handle all in the same bucket */ 1719 11042 Erik irb_increment_generation(&rt->rt_irb); 1720 11042 Erik return (B_FALSE); 1721 0 stevel } 1722 0 stevel 1723 0 stevel /* 1724 11042 Erik * Take care of all the generation numbers in the bucket. 
1725 11042 Erik */ 1726 11042 Erik void 1727 11042 Erik irb_increment_generation(irb_t *irb) 1728 11042 Erik { 1729 11042 Erik ire_t *ire; 1730 11042 Erik 1731 11042 Erik if (irb == NULL || irb->irb_ire_cnt == 0) 1732 11042 Erik return; 1733 11042 Erik 1734 11042 Erik irb_refhold(irb); 1735 11042 Erik for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 1736 11042 Erik if (!IRE_IS_CONDEMNED(ire)) 1737 11042 Erik ire_increment_generation(ire); /* Ourselves */ 1738 11042 Erik ire_dep_incr_generation(ire); /* Dependants */ 1739 11042 Erik } 1740 11042 Erik irb_refrele(irb); 1741 11042 Erik } 1742 11042 Erik 1743 11042 Erik /* 1744 11042 Erik * When an IRE is added or deleted this routine is called to make sure 1745 11042 Erik * any caching of IRE information is notified or updated. 1746 0 stevel * 1747 0 stevel * The flag argument indicates if the flush request is due to addition 1748 11042 Erik * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 1749 11042 Erik * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 1750 0 stevel */ 1751 0 stevel void 1752 0 stevel ire_flush_cache_v4(ire_t *ire, int flag) 1753 0 stevel { 1754 11042 Erik irb_t *irb = ire->ire_bucket; 1755 11042 Erik struct rt_entry *rt = IRB2RT(irb); 1756 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 1757 0 stevel 1758 11042 Erik /* 1759 11042 Erik * IRE_IF_CLONE ire's don't provide any new information 1760 11042 Erik * than the parent from which they are cloned, so don't 1761 11042 Erik * perturb the generation numbers. 1762 11042 Erik */ 1763 11042 Erik if (ire->ire_type & IRE_IF_CLONE) 1764 4714 sowmini return; 1765 0 stevel 1766 0 stevel /* 1767 11042 Erik * Ensure that an ire_add during a lookup serializes the updates of the 1768 11042 Erik * generation numbers under the radix head lock so that the lookup gets 1769 11042 Erik * either the old ire and old generation number, or a new ire and new 1770 11042 Erik * generation number. 
1771 0 stevel */ 1772 11042 Erik RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 1773 11042 Erik 1774 11042 Erik /* 1775 11042 Erik * If a route was just added, we need to notify everybody that 1776 11042 Erik * has cached an IRE_NOROUTE since there might now be a better 1777 11042 Erik * route for them. 1778 11042 Erik */ 1779 11042 Erik if (flag == IRE_FLUSH_ADD) { 1780 11042 Erik ire_increment_generation(ipst->ips_ire_reject_v4); 1781 11042 Erik ire_increment_generation(ipst->ips_ire_blackhole_v4); 1782 11042 Erik } 1783 11042 Erik 1784 11042 Erik /* Adding a default can't otherwise provide a better route */ 1785 11042 Erik if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 1786 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1787 0 stevel return; 1788 11042 Erik } 1789 11042 Erik 1790 11042 Erik switch (flag) { 1791 11042 Erik case IRE_FLUSH_DELETE: 1792 11042 Erik case IRE_FLUSH_GWCHANGE: 1793 0 stevel /* 1794 11042 Erik * Update ire_generation for all ire_dep_children chains 1795 11042 Erik * starting with this IRE 1796 0 stevel */ 1797 11042 Erik ire_dep_incr_generation(ire); 1798 11042 Erik break; 1799 11042 Erik case IRE_FLUSH_ADD: 1800 0 stevel /* 1801 11042 Erik * Update the generation numbers of all shorter matching routes. 1802 11042 Erik * ire_update_generation takes care of the dependants by 1803 11042 Erik * using ire_dep_incr_generation. 1804 0 stevel */ 1805 11042 Erik (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst, 1806 11042 Erik ipst->ips_ip_ftable, ire_update_generation, NULL); 1807 11042 Erik break; 1808 0 stevel } 1809 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1810 0 stevel } 1811 0 stevel 1812 0 stevel /* 1813 0 stevel * Matches the arguments passed with the values in the ire. 1814 0 stevel * 1815 11042 Erik * Note: for match types that match using "ill" passed in, ill 1816 0 stevel * must be checked for non-NULL before calling this routine. 
1817 0 stevel */ 1818 2535 sangeeta boolean_t 1819 0 stevel ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 1820 11042 Erik int type, const ill_t *ill, zoneid_t zoneid, 1821 11042 Erik const ts_label_t *tsl, int match_flags) 1822 0 stevel { 1823 0 stevel ill_t *ire_ill = NULL, *dst_ill; 1824 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 1825 0 stevel 1826 0 stevel ASSERT(ire->ire_ipversion == IPV4_VERSION); 1827 0 stevel ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 1828 8485 Peter ASSERT((!(match_flags & MATCH_IRE_ILL)) || 1829 11042 Erik (ill != NULL && !ill->ill_isv6)); 1830 0 stevel 1831 0 stevel /* 1832 11042 Erik * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is 1833 11042 Erik * in fact hidden, to ensure the caller gets the right one. 1834 8485 Peter */ 1835 11042 Erik if (ire->ire_testhidden) { 1836 11042 Erik if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 1837 8485 Peter return (B_FALSE); 1838 8485 Peter } 1839 1095 priyanka 1840 1676 jpk if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 1841 1676 jpk ire->ire_zoneid != ALL_ZONES) { 1842 0 stevel /* 1843 11042 Erik * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 1844 11042 Erik * does not match that of ire_zoneid, a failure to 1845 0 stevel * match is reported at this point. Otherwise, since some IREs 1846 0 stevel * that are available in the global zone can be used in local 1847 0 stevel * zones, additional checks need to be performed: 1848 0 stevel * 1849 11042 Erik * IRE_LOOPBACK 1850 0 stevel * entries should never be matched in this situation. 1851 11042 Erik * Each zone has its own IRE_LOOPBACK. 1852 0 stevel * 1853 11042 Erik * IRE_LOCAL 1854 11042 Erik * We allow them for any zoneid. ire_route_recursive 1855 11042 Erik * does additional checks when 1856 11042 Erik * ip_restrict_interzone_loopback is set. 
1857 0 stevel * 1858 11042 Erik * If ill_usesrc_ifindex is set 1859 11042 Erik * Then we check if the zone has a valid source address 1860 11042 Erik * on the usesrc ill. 1861 0 stevel * 1862 11042 Erik * If ire_ill is set, then check that the zone has an ipif 1863 11042 Erik * on that ill. 1864 11042 Erik * 1865 11042 Erik * Outside of this function (in ire_round_robin) we check 1866 11042 Erik * that any IRE_OFFLINK has a gateway that reachable from the 1867 11042 Erik * zone when we have multiple choices (ECMP). 1868 0 stevel */ 1869 0 stevel if (match_flags & MATCH_IRE_ZONEONLY) 1870 0 stevel return (B_FALSE); 1871 11042 Erik if (ire->ire_type & IRE_LOOPBACK) 1872 0 stevel return (B_FALSE); 1873 11042 Erik 1874 11042 Erik if (ire->ire_type & IRE_LOCAL) 1875 11042 Erik goto matchit; 1876 11042 Erik 1877 0 stevel /* 1878 11042 Erik * The normal case of IRE_ONLINK has a matching zoneid. 1879 11042 Erik * Here we handle the case when shared-IP zones have been 1880 11042 Erik * configured with IP addresses on vniN. In that case it 1881 11042 Erik * is ok for traffic from a zone to use IRE_ONLINK routes 1882 11042 Erik * if the ill has a usesrc pointing at vniN 1883 0 stevel */ 1884 11042 Erik dst_ill = ire->ire_ill; 1885 11042 Erik if (ire->ire_type & IRE_ONLINK) { 1886 11042 Erik uint_t ifindex; 1887 11042 Erik 1888 11042 Erik /* 1889 11042 Erik * Note there is no IRE_INTERFACE on vniN thus 1890 11042 Erik * can't do an IRE lookup for a matching route. 
1891 11042 Erik */ 1892 11042 Erik ifindex = dst_ill->ill_usesrc_ifindex; 1893 11042 Erik if (ifindex == 0) 1894 11042 Erik return (B_FALSE); 1895 11042 Erik 1896 0 stevel /* 1897 0 stevel * If there is a usable source address in the 1898 11042 Erik * zone, then it's ok to return this IRE_INTERFACE 1899 0 stevel */ 1900 11042 Erik if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 1901 11042 Erik zoneid, ipst)) { 1902 11042 Erik ip3dbg(("ire_match_args: no usrsrc for zone" 1903 0 stevel " dst_ill %p\n", (void *)dst_ill)); 1904 0 stevel return (B_FALSE); 1905 0 stevel } 1906 0 stevel } 1907 11042 Erik /* 1908 11042 Erik * For exampe, with 1909 11042 Erik * route add 11.0.0.0 gw1 -ifp bge0 1910 11042 Erik * route add 11.0.0.0 gw2 -ifp bge1 1911 11042 Erik * this code would differentiate based on 1912 11042 Erik * where the sending zone has addresses. 1913 11042 Erik * Only if the zone has an address on bge0 can it use the first 1914 11042 Erik * route. It isn't clear if this behavior is documented 1915 11042 Erik * anywhere. 
1916 11042 Erik */ 1917 11042 Erik if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 1918 0 stevel ipif_t *tipif; 1919 0 stevel 1920 11042 Erik mutex_enter(&dst_ill->ill_lock); 1921 11042 Erik for (tipif = dst_ill->ill_ipif; 1922 0 stevel tipif != NULL; tipif = tipif->ipif_next) { 1923 11042 Erik if (!IPIF_IS_CONDEMNED(tipif) && 1924 0 stevel (tipif->ipif_flags & IPIF_UP) && 1925 1676 jpk (tipif->ipif_zoneid == zoneid || 1926 1676 jpk tipif->ipif_zoneid == ALL_ZONES)) 1927 0 stevel break; 1928 0 stevel } 1929 11042 Erik mutex_exit(&dst_ill->ill_lock); 1930 0 stevel if (tipif == NULL) { 1931 0 stevel return (B_FALSE); 1932 0 stevel } 1933 0 stevel } 1934 0 stevel } 1935 0 stevel 1936 11042 Erik matchit: 1937 8485 Peter if (match_flags & MATCH_IRE_ILL) { 1938 11042 Erik ire_ill = ire->ire_ill; 1939 11042 Erik 1940 11042 Erik /* 1941 11042 Erik * If asked to match an ill, we *must* match 1942 11042 Erik * on the ire_ill for ipmp test addresses, or 1943 11042 Erik * any of the ill in the group for data addresses. 1944 11042 Erik * If we don't, we may as well fail. 1945 11042 Erik * However, we need an exception for IRE_LOCALs to ensure 1946 11042 Erik * we loopback packets even sent to test addresses on different 1947 11042 Erik * interfaces in the group. 
1948 11042 Erik */ 1949 11042 Erik if ((match_flags & MATCH_IRE_TESTHIDDEN) && 1950 11042 Erik !(ire->ire_type & IRE_LOCAL)) { 1951 11042 Erik if (ire->ire_ill != ill) 1952 11042 Erik return (B_FALSE); 1953 11042 Erik } else { 1954 11042 Erik match_flags &= ~MATCH_IRE_TESTHIDDEN; 1955 11042 Erik /* 1956 11042 Erik * We know that ill is not NULL, but ire_ill could be 1957 11042 Erik * NULL 1958 11042 Erik */ 1959 11042 Erik if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 1960 11042 Erik return (B_FALSE); 1961 11042 Erik } 1962 0 stevel } 1963 0 stevel 1964 0 stevel if ((ire->ire_addr == (addr & mask)) && 1965 0 stevel ((!(match_flags & MATCH_IRE_GW)) || 1966 4714 sowmini (ire->ire_gateway_addr == gateway)) && 1967 11042 Erik ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 1968 11042 Erik ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 1969 11042 Erik ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && 1970 1676 jpk ((!(match_flags & MATCH_IRE_SECATTR)) || 1971 4714 sowmini (!is_system_labeled()) || 1972 4714 sowmini (tsol_ire_match_gwattr(ire, tsl) == 0))) { 1973 0 stevel /* We found the matched IRE */ 1974 0 stevel return (B_TRUE); 1975 0 stevel } 1976 0 stevel return (B_FALSE); 1977 0 stevel } 1978 0 stevel 1979 0 stevel /* 1980 11042 Erik * Check if the IRE_LOCAL uses the same ill as another route would use. 1981 11042 Erik * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 1982 11042 Erik * then we don't allow this IRE_LOCAL to be used. 1983 11042 Erik * We always return an IRE; will be RTF_REJECT if no route available. 
1984 0 stevel */ 1985 0 stevel ire_t * 1986 11042 Erik ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl, 1987 11042 Erik const ill_t *ill, uint_t *generationp) 1988 0 stevel { 1989 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 1990 11042 Erik ire_t *alt_ire; 1991 11042 Erik uint_t ire_type; 1992 11042 Erik uint_t generation; 1993 11042 Erik uint_t match_flags; 1994 11042 Erik 1995 11042 Erik ASSERT(ire->ire_type & IRE_LOCAL); 1996 11042 Erik ASSERT(ire->ire_ill != NULL); 1997 0 stevel 1998 0 stevel /* 1999 11042 Erik * Need to match on everything but local. 2000 11042 Erik * This might result in the creation of a IRE_IF_CLONE for the 2001 11042 Erik * same address as the IRE_LOCAL when restrict_interzone_loopback is 2002 11042 Erik * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted 2003 11042 Erik * to make sure the IRE_LOCAL is always found first. 2004 0 stevel */ 2005 11042 Erik ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK); 2006 11042 Erik match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 2007 11042 Erik if (ill != NULL) 2008 11042 Erik match_flags |= MATCH_IRE_ILL; 2009 0 stevel 2010 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) { 2011 11042 Erik alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type, 2012 11042 Erik ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, 2013 11042 Erik &generation); 2014 11042 Erik } else { 2015 11042 Erik alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type, 2016 11042 Erik ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, 2017 11042 Erik &generation); 2018 0 stevel } 2019 11042 Erik ASSERT(alt_ire != NULL); 2020 11042 Erik 2021 11042 Erik if (alt_ire->ire_ill == ire->ire_ill) { 2022 11042 Erik /* Going out the same ILL - ok to send to IRE_LOCAL */ 2023 11042 Erik ire_refrele(alt_ire); 2024 11042 Erik } else { 2025 11042 Erik /* Different ill - ignore IRE_LOCAL */ 2026 11042 Erik ire_refrele(ire); 2027 11042 Erik ire = alt_ire; 2028 11042 Erik if 
(generationp != NULL) 2029 11042 Erik *generationp = generation; 2030 0 stevel } 2031 0 stevel return (ire); 2032 0 stevel } 2033 1676 jpk 2034 11042 Erik boolean_t 2035 11042 Erik ire_find_zoneid(struct radix_node *rn, void *arg) 2036 1676 jpk { 2037 11042 Erik struct rt_entry *rt = (struct rt_entry *)rn; 2038 1676 jpk irb_t *irb; 2039 1676 jpk ire_t *ire; 2040 11042 Erik ire_ftable_args_t *margs = arg; 2041 1676 jpk 2042 11042 Erik ASSERT(rt != NULL); 2043 11042 Erik 2044 11042 Erik irb = &rt->rt_irb; 2045 11042 Erik 2046 11042 Erik if (irb->irb_ire_cnt == 0) 2047 11042 Erik return (B_FALSE); 2048 11042 Erik 2049 11042 Erik rw_enter(&irb->irb_lock, RW_READER); 2050 1676 jpk for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 2051 11042 Erik if (IRE_IS_CONDEMNED(ire)) 2052 1676 jpk continue; 2053 1676 jpk 2054 11131 Erik if (!(ire->ire_type & IRE_INTERFACE)) 2055 11131 Erik continue; 2056 11131 Erik 2057 11042 Erik if (ire->ire_zoneid != ALL_ZONES && 2058 11042 Erik ire->ire_zoneid != margs->ift_zoneid) 2059 11042 Erik continue; 2060 11042 Erik 2061 11042 Erik if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill) 2062 11042 Erik continue; 2063 11042 Erik 2064 11042 Erik if (is_system_labeled() && 2065 11042 Erik tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0) 2066 11042 Erik continue; 2067 11042 Erik 2068 11042 Erik rw_exit(&irb->irb_lock); 2069 11042 Erik return (B_TRUE); 2070 11042 Erik } 2071 11042 Erik rw_exit(&irb->irb_lock); 2072 11042 Erik return (B_FALSE); 2073 11042 Erik } 2074 11042 Erik 2075 11042 Erik /* 2076 11042 Erik * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 2077 11042 Erik * gateway address. If ill is non-NULL we also match on it. 2078 11042 Erik * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
2079 11042 Erik */ 2080 11042 Erik boolean_t 2081 11042 Erik ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill, 2082 11042 Erik const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 2083 11042 Erik { 2084 11042 Erik struct rt_sockaddr rdst; 2085 11042 Erik struct rt_entry *rt; 2086 11042 Erik ire_ftable_args_t margs; 2087 11042 Erik 2088 11042 Erik ASSERT(ill == NULL || !ill->ill_isv6); 2089 11042 Erik if (lock_held) 2090 11042 Erik ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock)); 2091 11042 Erik else 2092 11042 Erik RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 2093 11042 Erik 2094 11131 Erik bzero(&rdst, sizeof (rdst)); 2095 11042 Erik rdst.rt_sin_len = sizeof (rdst); 2096 11042 Erik rdst.rt_sin_family = AF_INET; 2097 11042 Erik rdst.rt_sin_addr.s_addr = gateway; 2098 11042 Erik 2099 11042 Erik /* 2100 11042 Erik * We only use margs for ill, zoneid, and tsl matching in 2101 11042 Erik * ire_find_zoneid 2102 11042 Erik */ 2103 11131 Erik bzero(&margs, sizeof (margs)); 2104 11042 Erik margs.ift_ill = ill; 2105 11042 Erik margs.ift_zoneid = zoneid; 2106 11042 Erik margs.ift_tsl = tsl; 2107 11042 Erik rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 2108 11042 Erik ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs); 2109 11042 Erik 2110 11042 Erik if (!lock_held) 2111 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 2112 11042 Erik 2113 11042 Erik return (rt != NULL); 2114 11042 Erik } 2115 11042 Erik 2116 11042 Erik /* 2117 11042 Erik * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs. 2118 11042 Erik * The fraction argument tells us what fraction of the IREs to delete. 2119 11042 Erik * Common for IPv4 and IPv6. 2120 11042 Erik * Used when memory backpressure. 
2121 11042 Erik */ 2122 11042 Erik static void 2123 11042 Erik ire_delete_reclaim(ire_t *ire, char *arg) 2124 11042 Erik { 2125 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 2126 11042 Erik uint_t fraction = *(uint_t *)arg; 2127 11042 Erik uint_t rand; 2128 11042 Erik 2129 11042 Erik if ((ire->ire_flags & RTF_DYNAMIC) || 2130 11042 Erik (ire->ire_type & IRE_IF_CLONE)) { 2131 11042 Erik 2132 11042 Erik /* Pick a random number */ 2133 11066 rafael rand = (uint_t)ddi_get_lbolt() + 2134 11042 Erik IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256); 2135 11042 Erik 2136 11042 Erik /* Use truncation */ 2137 11042 Erik if ((rand/fraction)*fraction == rand) { 2138 11042 Erik IP_STAT(ipst, ip_ire_reclaim_deleted); 2139 1676 jpk ire_delete(ire); 2140 1676 jpk } 2141 1676 jpk } 2142 1676 jpk 2143 0 stevel } 2144 0 stevel 2145 0 stevel /* 2146 11042 Erik * kmem_cache callback to free up memory. 2147 11042 Erik * 2148 11042 Erik * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically 2149 11042 Erik * (RTF_DYNAMIC and IRE_IF_CLONE). 2150 0 stevel */ 2151 11042 Erik static void 2152 11042 Erik ip_ire_reclaim_stack(ip_stack_t *ipst) 2153 0 stevel { 2154 11042 Erik uint_t fraction = ipst->ips_ip_ire_reclaim_fraction; 2155 7880 Jonathan 2156 11042 Erik IP_STAT(ipst, ip_ire_reclaim_calls); 2157 7880 Jonathan 2158 11042 Erik ire_walk(ire_delete_reclaim, &fraction, ipst); 2159 11042 Erik 2160 11042 Erik /* 2161 11042 Erik * Walk all CONNs that can have a reference on an ire, nce or dce. 2162 11042 Erik * Get them to update any stale references to drop any refholds they 2163 11042 Erik * have. 2164 11042 Erik */ 2165 11042 Erik ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 2166 0 stevel } 2167 0 stevel 2168 0 stevel /* 2169 11042 Erik * Called by the memory allocator subsystem directly, when the system 2170 11042 Erik * is running low on memory. 
2171 8485 Peter */ 2172 11042 Erik /* ARGSUSED */ 2173 11042 Erik void 2174 11042 Erik ip_ire_reclaim(void *args) 2175 8485 Peter { 2176 11042 Erik netstack_handle_t nh; 2177 11042 Erik netstack_t *ns; 2178 2733 nordmark 2179 11042 Erik netstack_next_init(&nh); 2180 11042 Erik while ((ns = netstack_next(&nh)) != NULL) { 2181 11042 Erik ip_ire_reclaim_stack(ns->netstack_ip); 2182 11042 Erik netstack_rele(ns); 2183 2733 nordmark } 2184 11042 Erik netstack_next_fini(&nh); 2185 0 stevel } 2186 0 stevel 2187 0 stevel static void 2188 0 stevel power2_roundup(uint32_t *value) 2189 0 stevel { 2190 0 stevel int i; 2191 0 stevel 2192 0 stevel for (i = 1; i < 31; i++) { 2193 0 stevel if (*value <= (1 << i)) 2194 0 stevel break; 2195 0 stevel } 2196 0 stevel *value = (1 << i); 2197 0 stevel } 2198 0 stevel 2199 3448 dh155122 /* Global init for all zones */ 2200 3448 dh155122 void 2201 3448 dh155122 ip_ire_g_init() 2202 3448 dh155122 { 2203 0 stevel /* 2204 11042 Erik * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim() 2205 11042 Erik * will give disposable IREs back to system when needed. 2206 0 stevel * This needs to be done here before anything else, since 2207 0 stevel * ire_add() expects the cache to be created. 2208 0 stevel */ 2209 0 stevel ire_cache = kmem_cache_create("ire_cache", 2210 11042 Erik sizeof (ire_t), 0, NULL, NULL, 2211 11042 Erik ip_ire_reclaim, NULL, NULL, 0); 2212 11042 Erik 2213 11042 Erik ncec_cache = kmem_cache_create("ncec_cache", 2214 11042 Erik sizeof (ncec_t), 0, NULL, NULL, 2215 11042 Erik ip_nce_reclaim, NULL, NULL, 0); 2216 11042 Erik nce_cache = kmem_cache_create("nce_cache", 2217 11042 Erik sizeof (nce_t), 0, NULL, NULL, 2218 11042 Erik NULL, NULL, NULL, 0); 2219 0 stevel 2220 3448 dh155122 rt_entry_cache = kmem_cache_create("rt_entry", 2221 3448 dh155122 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 2222 3448 dh155122 2223 3448 dh155122 /* 2224 3448 dh155122 * Have radix code setup kmem caches etc. 
2225 3448 dh155122 */ 2226 3448 dh155122 rn_init(); 2227 3448 dh155122 } 2228 3448 dh155122 2229 3448 dh155122 void 2230 3448 dh155122 ip_ire_init(ip_stack_t *ipst) 2231 3448 dh155122 { 2232 11042 Erik ire_t *ire; 2233 11042 Erik int error; 2234 3448 dh155122 2235 3448 dh155122 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 2236 3448 dh155122 2237 3448 dh155122 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 2238 3448 dh155122 2239 0 stevel /* 2240 0 stevel * Make sure that the forwarding table size is a power of 2. 2241 0 stevel * The IRE*_ADDR_HASH() macroes depend on that. 2242 0 stevel */ 2243 3448 dh155122 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 2244 3448 dh155122 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 2245 3448 dh155122 2246 11042 Erik /* 2247 11042 Erik * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6. 2248 11042 Erik * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has 2249 11042 Erik * RTF_BLACKHOLE set. We use the latter for transient errors such 2250 11042 Erik * as memory allocation failures and tripping on IRE_IS_CONDEMNED 2251 11042 Erik * entries. 
2252 11042 Erik */ 2253 11042 Erik ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 2254 11042 Erik *ire = ire_null; 2255 11042 Erik error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 2256 11042 Erik RTF_REJECT|RTF_UP, NULL, ipst); 2257 11042 Erik ASSERT(error == 0); 2258 11042 Erik ipst->ips_ire_reject_v4 = ire; 2259 11042 Erik 2260 11042 Erik ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 2261 11042 Erik *ire = ire_null; 2262 11042 Erik error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 2263 11042 Erik RTF_REJECT|RTF_UP, NULL, ipst); 2264 11042 Erik ASSERT(error == 0); 2265 11042 Erik ipst->ips_ire_reject_v6 = ire; 2266 11042 Erik 2267 11042 Erik ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 2268 11042 Erik *ire = ire_null; 2269 11042 Erik error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 2270 11042 Erik RTF_BLACKHOLE|RTF_UP, NULL, ipst); 2271 11042 Erik ASSERT(error == 0); 2272 11042 Erik ipst->ips_ire_blackhole_v4 = ire; 2273 11042 Erik 2274 11042 Erik ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 2275 11042 Erik *ire = ire_null; 2276 11042 Erik error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 2277 11042 Erik RTF_BLACKHOLE|RTF_UP, NULL, ipst); 2278 11042 Erik ASSERT(error == 0); 2279 11042 Erik ipst->ips_ire_blackhole_v6 = ire; 2280 11042 Erik 2281 11042 Erik rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL); 2282 11042 Erik rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL); 2283 3448 dh155122 } 2284 3448 dh155122 2285 3448 dh155122 void 2286 3448 dh155122 ip_ire_g_fini(void) 2287 3448 dh155122 { 2288 3448 dh155122 kmem_cache_destroy(ire_cache); 2289 11042 Erik kmem_cache_destroy(ncec_cache); 2290 11042 Erik kmem_cache_destroy(nce_cache); 2291 3448 dh155122 kmem_cache_destroy(rt_entry_cache); 2292 3448 dh155122 2293 3448 dh155122 rn_fini(); 2294 3448 dh155122 } 2295 3448 dh155122 2296 3448 dh155122 void 2297 3448 dh155122 ip_ire_fini(ip_stack_t *ipst) 2298 0 stevel { 2299 0 stevel int i; 2300 0 
stevel 2301 11042 Erik rw_destroy(&ipst->ips_ire_dep_lock); 2302 11042 Erik rw_destroy(&ipst->ips_ip6_ire_head_lock); 2303 11042 Erik 2304 11042 Erik ire_refrele_notr(ipst->ips_ire_reject_v6); 2305 11042 Erik ipst->ips_ire_reject_v6 = NULL; 2306 11042 Erik ire_refrele_notr(ipst->ips_ire_reject_v4); 2307 11042 Erik ipst->ips_ire_reject_v4 = NULL; 2308 11042 Erik ire_refrele_notr(ipst->ips_ire_blackhole_v6); 2309 11042 Erik ipst->ips_ire_blackhole_v6 = NULL; 2310 11042 Erik ire_refrele_notr(ipst->ips_ire_blackhole_v4); 2311 11042 Erik ipst->ips_ire_blackhole_v4 = NULL; 2312 11042 Erik 2313 3448 dh155122 /* 2314 3448 dh155122 * Delete all IREs - assumes that the ill/ipifs have 2315 11042 Erik * been removed so what remains are just the ftable to handle. 2316 3448 dh155122 */ 2317 3448 dh155122 ire_walk(ire_delete, NULL, ipst); 2318 3448 dh155122 2319 3448 dh155122 rn_freehead(ipst->ips_ip_ftable); 2320 3448 dh155122 ipst->ips_ip_ftable = NULL; 2321 3448 dh155122 2322 3448 dh155122 mutex_destroy(&ipst->ips_ire_ft_init_lock); 2323 3448 dh155122 2324 3448 dh155122 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 2325 3448 dh155122 irb_t *ptr; 2326 3448 dh155122 int j; 2327 3448 dh155122 2328 3448 dh155122 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 2329 3448 dh155122 continue; 2330 3448 dh155122 2331 3448 dh155122 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 2332 3448 dh155122 ASSERT(ptr[j].irb_ire == NULL); 2333 3448 dh155122 rw_destroy(&ptr[j].irb_lock); 2334 3448 dh155122 } 2335 3448 dh155122 mi_free(ptr); 2336 3448 dh155122 ipst->ips_ip_forwarding_table_v6[i] = NULL; 2337 3448 dh155122 } 2338 0 stevel } 2339 0 stevel 2340 5023 carlsonj #ifdef DEBUG 2341 0 stevel void 2342 0 stevel ire_trace_ref(ire_t *ire) 2343 0 stevel { 2344 0 stevel mutex_enter(&ire->ire_lock); 2345 5023 carlsonj if (ire->ire_trace_disable) { 2346 5023 carlsonj mutex_exit(&ire->ire_lock); 2347 5023 carlsonj return; 2348 5023 carlsonj } 2349 5023 carlsonj 2350 5023 carlsonj if 
(th_trace_ref(ire, ire->ire_ipst)) { 2351 5023 carlsonj mutex_exit(&ire->ire_lock); 2352 5023 carlsonj } else { 2353 5023 carlsonj ire->ire_trace_disable = B_TRUE; 2354 5023 carlsonj mutex_exit(&ire->ire_lock); 2355 5023 carlsonj ire_trace_cleanup(ire); 2356 5023 carlsonj } 2357 5023 carlsonj } 2358 5023 carlsonj 2359 5023 carlsonj void 2360 5023 carlsonj ire_untrace_ref(ire_t *ire) 2361 5023 carlsonj { 2362 5023 carlsonj mutex_enter(&ire->ire_lock); 2363 5023 carlsonj if (!ire->ire_trace_disable) 2364 5023 carlsonj th_trace_unref(ire); 2365 0 stevel mutex_exit(&ire->ire_lock); 2366 0 stevel } 2367 0 stevel 2368 5023 carlsonj static void 2369 5023 carlsonj ire_trace_cleanup(const ire_t *ire) 2370 5023 carlsonj { 2371 5023 carlsonj th_trace_cleanup(ire, ire->ire_trace_disable); 2372 5023 carlsonj } 2373 5023 carlsonj #endif /* DEBUG */ 2374 2535 sangeeta 2375 2535 sangeeta /* 2376 11042 Erik * Find, or create if needed, the nce_t pointer to the neighbor cache 2377 11042 Erik * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t 2378 11042 Erik * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or 2379 11042 Erik * on the next available under-ill (selected by the IPMP rotor) in the 2380 11042 Erik * unicast IPMP case. 2381 11042 Erik * 2382 11042 Erik * If a neighbor-cache entry has to be created (i.e., one does not already 2383 11042 Erik * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache 2384 11042 Erik * entry are initialized in nce_add_v4(). The broadcast, multicast, and 2385 11042 Erik * link-layer type determine the contents of {ncec_state, ncec_lladdr} of 2386 11042 Erik * the ncec_t created. 
The ncec_lladdr is non-null for all link types with 2387 11042 Erik * non-zero ill_phys_addr_length, though the contents may be zero in cases 2388 11042 Erik * where the link-layer type is not known at the time of creation 2389 11042 Erik * (e.g., IRE_IF_RESOLVER links) 2390 11042 Erik * 2391 11042 Erik * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the ncec_lladdr 2392 11042 Erik * has the physical broadcast address of the outgoing interface. 2393 11042 Erik * For unicast ire entries, 2394 11042 Erik * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created 2395 11042 Erik * ncec_t has 0 ncec_lladdr contents, and will be in the ND_INITIAL state. 2396 11042 Erik * - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link 2397 11042 Erik * layer resolution is necessary, so that the ncec_t will be in the 2398 11042 Erik * ND_REACHABLE state 2399 11042 Erik * 2400 11042 Erik * The link layer information needed for broadcast addresses, and for 2401 11042 Erik * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that 2402 11042 Erik * never needs re-verification for the lifetime of the ncec_t. These are 2403 11042 Erik * therefore marked NCE_F_NONUD. 2404 11042 Erik * 2405 11042 Erik * The nce returned will be created such that the nce_ill == ill that 2406 11042 Erik * is passed in. Note that the nce itself may not have ncec_ill == ill 2407 11042 Erik * where IPMP links are involved.
2408 11042 Erik */ 2409 11042 Erik static nce_t * 2410 11042 Erik ire_nce_init(ill_t *ill, const void *addr, int ire_type) 2411 11042 Erik { 2412 11042 Erik int err; 2413 11042 Erik nce_t *nce = NULL; 2414 11042 Erik uint16_t ncec_flags; 2415 11042 Erik uchar_t *hwaddr; 2416 11042 Erik boolean_t need_refrele = B_FALSE; 2417 11042 Erik ill_t *in_ill = ill; 2418 11042 Erik boolean_t is_unicast; 2419 11042 Erik uint_t hwaddr_len; 2420 11042 Erik 2421 11042 Erik is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0); 2422 11042 Erik if (IS_IPMP(ill) || 2423 11042 Erik ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) { 2424 11042 Erik if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL) 2425 11042 Erik return (NULL); 2426 11042 Erik need_refrele = B_TRUE; 2427 11042 Erik } 2428 11042 Erik ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; 2429 11042 Erik 2430 11042 Erik switch (ire_type) { 2431 11042 Erik case IRE_BROADCAST: 2432 11042 Erik ASSERT(!ill->ill_isv6); 2433 11042 Erik ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD); 2434 11042 Erik break; 2435 11042 Erik case IRE_MULTICAST: 2436 11042 Erik ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD); 2437 11042 Erik break; 2438 11042 Erik } 2439 11042 Erik 2440 11042 Erik if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) { 2441 11042 Erik hwaddr = ill->ill_dest_addr; 2442 11042 Erik } else { 2443 11042 Erik hwaddr = NULL; 2444 11042 Erik } 2445 11042 Erik hwaddr_len = ill->ill_phys_addr_length; 2446 11042 Erik 2447 11042 Erik retry: 2448 11042 Erik /* nce_state will be computed by nce_add_common() */ 2449 11042 Erik if (!ill->ill_isv6) { 2450 11042 Erik err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr, 2451 11042 Erik ncec_flags, ND_UNCHANGED, &nce); 2452 11042 Erik } else { 2453 11042 Erik err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr, 2454 11042 Erik ncec_flags, ND_UNCHANGED, &nce); 2455 11042 Erik } 2456 11042 Erik 2457 11042 Erik switch (err) { 2458 11042 Erik case 0: 2459 
11042 Erik break; 2460 11042 Erik case EEXIST: 2461 11042 Erik /* 2462 11042 Erik * When subnets change or partially overlap what was once 2463 11042 Erik * a broadcast address could now be a unicast, or vice versa. 2464 11042 Erik */ 2465 11042 Erik if (((ncec_flags ^ nce->nce_common->ncec_flags) & 2466 11042 Erik NCE_F_BCAST) != 0) { 2467 11042 Erik ASSERT(!ill->ill_isv6); 2468 11042 Erik ncec_delete(nce->nce_common); 2469 11042 Erik nce_refrele(nce); 2470 11042 Erik goto retry; 2471 11042 Erik } 2472 11042 Erik break; 2473 11042 Erik default: 2474 11042 Erik DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err); 2475 11042 Erik if (need_refrele) 2476 11042 Erik ill_refrele(ill); 2477 11042 Erik return (NULL); 2478 11042 Erik } 2479 11042 Erik /* 2480 11042 Erik * If the ill was an under-ill of an IPMP group, we need to verify 2481 11042 Erik * that it is still active so that we select an active interface in 2482 11042 Erik * the group. However, since ipmp_ill_is_active ASSERTs for 2483 11042 Erik * IS_UNDER_IPMP(), we first need to verify that the ill is an 2484 11042 Erik * under-ill, and since this is being done in the data path, the 2485 11042 Erik * only way to ascertain this is by holding the ill_g_lock. 2486 11042 Erik */ 2487 11042 Erik rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER); 2488 11042 Erik mutex_enter(&ill->ill_lock); 2489 11042 Erik mutex_enter(&ill->ill_phyint->phyint_lock); 2490 11042 Erik if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 2491 11042 Erik /* 2492 11042 Erik * need_refrele implies that the under ill was selected by 2493 11042 Erik * ipmp_ill_get_xmit_ill() because either the in_ill was an 2494 11042 Erik * ipmp_ill, or we are sending a non-unicast packet on 2495 11042 Erik * an under_ill. 
However, when we get here, the ill selected by 2496 11042 Erik * ipmp_ill_get_xmit_ill was pulled out of the active set 2497 11042 Erik * (for unicast) or cast_ill nomination (for 2498 11042 Erik * !unicast) after it was picked as the outgoing ill. 2499 11042 Erik * We have to pick an active interface and/or cast_ill in the 2500 11042 Erik * group. 2501 11042 Erik */ 2502 11042 Erik mutex_exit(&ill->ill_phyint->phyint_lock); 2503 11042 Erik nce_delete(nce); 2504 11042 Erik mutex_exit(&ill->ill_lock); 2505 11042 Erik rw_exit(&ill->ill_ipst->ips_ill_g_lock); 2506 11042 Erik nce_refrele(nce); 2507 11042 Erik ill_refrele(ill); 2508 11042 Erik if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL) 2509 11042 Erik return (NULL); 2510 11042 Erik goto retry; 2511 11042 Erik } else { 2512 11042 Erik mutex_exit(&ill->ill_phyint->phyint_lock); 2513 11042 Erik mutex_exit(&ill->ill_lock); 2514 11042 Erik rw_exit(&ill->ill_ipst->ips_ill_g_lock); 2515 11042 Erik } 2516 11042 Erik done: 2517 11042 Erik ASSERT(nce->nce_ill == ill); 2518 11042 Erik if (need_refrele) 2519 11042 Erik ill_refrele(ill); 2520 11042 Erik return (nce); 2521 11042 Erik } 2522 11042 Erik 2523 11042 Erik nce_t * 2524 11042 Erik arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type) 2525 11042 Erik { 2526 11042 Erik return (ire_nce_init(ill, &addr4, ire_type)); 2527 11042 Erik } 2528 11042 Erik 2529 11042 Erik nce_t * 2530 11042 Erik ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type) 2531 11042 Erik { 2532 11042 Erik ASSERT((ire_type & IRE_BROADCAST) == 0); 2533 11042 Erik return (ire_nce_init(ill, addr6, ire_type)); 2534 11042 Erik } 2535 11042 Erik 2536 11042 Erik /* 2537 11042 Erik * The caller should hold irb_lock as a writer if the ire is in a bucket. 
2538 2535 sangeeta */ 2539 2535 sangeeta void 2540 11042 Erik ire_make_condemned(ire_t *ire) 2541 2535 sangeeta { 2542 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 2543 8485 Peter 2544 11042 Erik mutex_enter(&ire->ire_lock); 2545 11042 Erik ASSERT(ire->ire_bucket == NULL || 2546 11042 Erik RW_WRITE_HELD(&ire->ire_bucket->irb_lock)); 2547 11042 Erik ASSERT(!IRE_IS_CONDEMNED(ire)); 2548 11042 Erik ire->ire_generation = IRE_GENERATION_CONDEMNED; 2549 11042 Erik /* Count how many condemned ires for kmem_cache callback */ 2550 11042 Erik atomic_add_32(&ipst->ips_num_ire_condemned, 1); 2551 11042 Erik mutex_exit(&ire->ire_lock); 2552 11042 Erik } 2553 8485 Peter 2554 11042 Erik /* 2555 11042 Erik * Increment the generation avoiding the special condemned value 2556 11042 Erik */ 2557 11042 Erik void 2558 11042 Erik ire_increment_generation(ire_t *ire) 2559 11042 Erik { 2560 11042 Erik uint_t generation; 2561 11042 Erik 2562 11042 Erik mutex_enter(&ire->ire_lock); 2563 11042 Erik /* 2564 11042 Erik * Even though the caller has a hold it can't prevent a concurrent 2565 11042 Erik * ire_delete marking the IRE condemned 2566 11042 Erik */ 2567 11042 Erik if (!IRE_IS_CONDEMNED(ire)) { 2568 11042 Erik generation = ire->ire_generation + 1; 2569 11042 Erik if (generation == IRE_GENERATION_CONDEMNED) 2570 11042 Erik generation = IRE_GENERATION_INITIAL; 2571 11042 Erik ASSERT(generation != IRE_GENERATION_VERIFY); 2572 11042 Erik ire->ire_generation = generation; 2573 11042 Erik } 2574 11042 Erik mutex_exit(&ire->ire_lock); 2575 11042 Erik } 2576 11042 Erik 2577 11042 Erik /* 2578 11042 Erik * Increment ire_generation on all the IRE_MULTICASTs 2579 11042 Erik * Used when the default multicast interface (as determined by 2580 11042 Erik * ill_lookup_multicast) might have changed. 2581 11042 Erik * 2582 11042 Erik * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and 2583 11042 Erik * ill unplumb. 
2584 11042 Erik */ 2585 11042 Erik void 2586 11042 Erik ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6) 2587 11042 Erik { 2588 11042 Erik ill_t *ill; 2589 11042 Erik ill_walk_context_t ctx; 2590 11042 Erik 2591 11042 Erik rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2592 11042 Erik if (isv6) 2593 11042 Erik ill = ILL_START_WALK_V6(&ctx, ipst); 2594 11042 Erik else 2595 11042 Erik ill = ILL_START_WALK_V4(&ctx, ipst); 2596 11042 Erik for (; ill != NULL; ill = ill_next(&ctx, ill)) { 2597 11042 Erik if (ILL_IS_CONDEMNED(ill)) 2598 11042 Erik continue; 2599 11042 Erik if (ill->ill_ire_multicast != NULL) 2600 11042 Erik ire_increment_generation(ill->ill_ire_multicast); 2601 11042 Erik } 2602 11042 Erik rw_exit(&ipst->ips_ill_g_lock); 2603 11042 Erik } 2604 11042 Erik 2605 11042 Erik /* 2606 11042 Erik * Return a held IRE_NOROUTE with RTF_REJECT set 2607 11042 Erik */ 2608 11042 Erik ire_t * 2609 11042 Erik ire_reject(ip_stack_t *ipst, boolean_t isv6) 2610 11042 Erik { 2611 11042 Erik ire_t *ire; 2612 11042 Erik 2613 11042 Erik if (isv6) 2614 11042 Erik ire = ipst->ips_ire_reject_v6; 2615 11042 Erik else 2616 11042 Erik ire = ipst->ips_ire_reject_v4; 2617 11042 Erik 2618 11042 Erik ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 2619 11042 Erik ire_refhold(ire); 2620 11042 Erik return (ire); 2621 11042 Erik } 2622 11042 Erik 2623 11042 Erik /* 2624 11042 Erik * Return a held IRE_NOROUTE with RTF_BLACKHOLE set 2625 11042 Erik */ 2626 11042 Erik ire_t * 2627 11042 Erik ire_blackhole(ip_stack_t *ipst, boolean_t isv6) 2628 11042 Erik { 2629 11042 Erik ire_t *ire; 2630 11042 Erik 2631 11042 Erik if (isv6) 2632 11042 Erik ire = ipst->ips_ire_blackhole_v6; 2633 11042 Erik else 2634 11042 Erik ire = ipst->ips_ire_blackhole_v4; 2635 11042 Erik 2636 11042 Erik ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 2637 11042 Erik ire_refhold(ire); 2638 11042 Erik return (ire); 2639 11042 Erik } 2640 11042 Erik 2641 11042 Erik /* 2642 11042 Erik * 
Return a held IRE_MULTICAST. 2643 11042 Erik */ 2644 11042 Erik ire_t * 2645 11042 Erik ire_multicast(ill_t *ill) 2646 11042 Erik { 2647 11042 Erik ire_t *ire = ill->ill_ire_multicast; 2648 11042 Erik 2649 11042 Erik ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED); 2650 11042 Erik if (ire == NULL) 2651 11042 Erik ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6); 2652 11042 Erik else 2653 11042 Erik ire_refhold(ire); 2654 11042 Erik return (ire); 2655 11042 Erik } 2656 11042 Erik 2657 11042 Erik /* 2658 11042 Erik * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK 2659 11042 Erik * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6). 2660 11042 Erik * This can return an RTF_REJECT|RTF_BLACKHOLE. 2661 11042 Erik * The returned IRE is held. 2662 11042 Erik * The assumption is that ip_select_route() has been called and returned the 2663 11042 Erik * IRE (thus ip_select_route would have set up the ire_dep* information.) 2664 11042 Erik * If some IRE is deleteted then ire_dep_remove() will have been called and 2665 11042 Erik * we might not find a nexthop IRE, in which case we return NULL. 2666 11042 Erik */ 2667 11042 Erik ire_t * 2668 11042 Erik ire_nexthop(ire_t *ire) 2669 11042 Erik { 2670 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 2671 11042 Erik 2672 11042 Erik /* Acquire lock to walk ire_dep_parent */ 2673 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 2674 11042 Erik while (ire != NULL) { 2675 11042 Erik if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 2676 11042 Erik goto done; 2677 11042 Erik } 2678 11042 Erik /* 2679 11042 Erik * If we find an IRE_ONLINK we are done. This includes 2680 11042 Erik * the case of IRE_MULTICAST. 2681 11042 Erik * Note that in order to send packets we need a host-specific 2682 11042 Erik * IRE_IF_ALL first in the ire_dep_parent chain. 
Normally this 2683 11042 Erik * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE 2684 11042 Erik * was not host specific. 2685 11042 Erik * However, ip_rts_request doesn't want to send packets 2686 11042 Erik * hence doesn't want to allocate an IRE_IF_CLONE. Yet 2687 11042 Erik * it needs an IRE_IF_ALL to get to the ill. Thus 2688 11042 Erik * we return IRE_IF_ALL that are not host specific here. 2689 11042 Erik */ 2690 11042 Erik if (ire->ire_type & IRE_ONLINK) 2691 11042 Erik goto done; 2692 11042 Erik ire = ire->ire_dep_parent; 2693 11042 Erik } 2694 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 2695 11042 Erik return (NULL); 2696 11042 Erik 2697 11042 Erik done: 2698 11042 Erik ire_refhold(ire); 2699 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 2700 11042 Erik return (ire); 2701 11042 Erik } 2702 11042 Erik 2703 11042 Erik /* 2704 11042 Erik * Find the ill used to send packets. This will be NULL in case 2705 11042 Erik * of a reject or blackhole. 2706 11042 Erik * The returned ill is held; caller needs to do ill_refrele when done. 
2707 11042 Erik */ 2708 11042 Erik ill_t * 2709 11042 Erik ire_nexthop_ill(ire_t *ire) 2710 11042 Erik { 2711 11042 Erik ill_t *ill; 2712 11042 Erik 2713 11042 Erik ire = ire_nexthop(ire); 2714 11042 Erik if (ire == NULL) 2715 11042 Erik return (NULL); 2716 11042 Erik 2717 11042 Erik /* ire_ill can not change for an existing ire */ 2718 11042 Erik ill = ire->ire_ill; 2719 11042 Erik if (ill != NULL) 2720 11042 Erik ill_refhold(ill); 2721 11042 Erik ire_refrele(ire); 2722 11042 Erik return (ill); 2723 11042 Erik } 2724 11042 Erik 2725 11042 Erik #ifdef DEBUG 2726 11042 Erik static boolean_t 2727 11042 Erik parent_has_child(ire_t *parent, ire_t *child) 2728 11042 Erik { 2729 11042 Erik ire_t *ire; 2730 11042 Erik ire_t *prev; 2731 11042 Erik 2732 11042 Erik ire = parent->ire_dep_children; 2733 11042 Erik prev = NULL; 2734 11042 Erik while (ire != NULL) { 2735 11042 Erik if (prev == NULL) { 2736 11042 Erik ASSERT(ire->ire_dep_sib_ptpn == 2737 11042 Erik &(parent->ire_dep_children)); 2738 11042 Erik } else { 2739 11042 Erik ASSERT(ire->ire_dep_sib_ptpn == 2740 11042 Erik &(prev->ire_dep_sib_next)); 2741 11042 Erik } 2742 11042 Erik if (ire == child) 2743 11042 Erik return (B_TRUE); 2744 11042 Erik prev = ire; 2745 11042 Erik ire = ire->ire_dep_sib_next; 2746 11042 Erik } 2747 11042 Erik return (B_FALSE); 2748 11042 Erik } 2749 11042 Erik 2750 11042 Erik static void 2751 11042 Erik ire_dep_verify(ire_t *ire) 2752 11042 Erik { 2753 11042 Erik ire_t *parent = ire->ire_dep_parent; 2754 11042 Erik ire_t *child = ire->ire_dep_children; 2755 11042 Erik 2756 11042 Erik ASSERT(ire->ire_ipversion == IPV4_VERSION || 2757 11042 Erik ire->ire_ipversion == IPV6_VERSION); 2758 11042 Erik if (parent != NULL) { 2759 11042 Erik ASSERT(parent->ire_ipversion == IPV4_VERSION || 2760 11042 Erik parent->ire_ipversion == IPV6_VERSION); 2761 11042 Erik ASSERT(parent->ire_refcnt >= 1); 2762 11042 Erik ASSERT(parent_has_child(parent, ire)); 2763 11042 Erik } 2764 11042 Erik if (child != NULL) { 
2765 11042 Erik ASSERT(child->ire_ipversion == IPV4_VERSION || 2766 11042 Erik child->ire_ipversion == IPV6_VERSION); 2767 11042 Erik ASSERT(child->ire_dep_parent == ire); 2768 11042 Erik ASSERT(child->ire_dep_sib_ptpn != NULL); 2769 11042 Erik ASSERT(parent_has_child(ire, child)); 2770 11042 Erik } 2771 11042 Erik } 2772 11042 Erik #endif /* DEBUG */ 2773 11042 Erik 2774 11042 Erik /* 2775 11042 Erik * Assumes ire_dep_parent is set. Remove this child from its parent's linkage. 2776 11042 Erik */ 2777 11042 Erik void 2778 11042 Erik ire_dep_remove(ire_t *ire) 2779 11042 Erik { 2780 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 2781 11042 Erik ire_t *parent = ire->ire_dep_parent; 2782 11042 Erik ire_t *next; 2783 11042 Erik nce_t *nce; 2784 11042 Erik 2785 11042 Erik ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 2786 11042 Erik ASSERT(ire->ire_dep_parent != NULL); 2787 11042 Erik ASSERT(ire->ire_dep_sib_ptpn != NULL); 2788 11042 Erik 2789 11042 Erik #ifdef DEBUG 2790 11042 Erik ire_dep_verify(ire); 2791 11042 Erik ire_dep_verify(parent); 2792 11042 Erik #endif 2793 11042 Erik 2794 11042 Erik next = ire->ire_dep_sib_next; 2795 11042 Erik if (next != NULL) 2796 11042 Erik next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn; 2797 11042 Erik 2798 11042 Erik ASSERT(*(ire->ire_dep_sib_ptpn) == ire); 2799 11042 Erik *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next; 2800 11042 Erik 2801 11042 Erik ire->ire_dep_sib_ptpn = NULL; 2802 11042 Erik ire->ire_dep_sib_next = NULL; 2803 11042 Erik 2804 11042 Erik mutex_enter(&ire->ire_lock); 2805 11042 Erik parent = ire->ire_dep_parent; 2806 11042 Erik ire->ire_dep_parent = NULL; 2807 11042 Erik mutex_exit(&ire->ire_lock); 2808 2535 sangeeta 2809 2535 sangeeta /* 2810 11042 Erik * Make sure all our children, grandchildren, etc set 2811 11042 Erik * ire_dep_parent_generation to IRE_GENERATION_VERIFY since 2812 11042 Erik * we can no longer guarantee than the children have a current 2813 11042 Erik * ire_nce_cache and ire_nexthop_ill(). 
2814 2535 sangeeta */ 2815 11042 Erik if (ire->ire_dep_children != NULL) 2816 11042 Erik ire_dep_invalidate_children(ire->ire_dep_children); 2817 2535 sangeeta 2818 2535 sangeeta /* 2819 11042 Erik * Since the parent is gone we make sure we clear ire_nce_cache. 2820 11042 Erik * We can clear it under ire_lock even if the IRE is used 2821 2535 sangeeta */ 2822 11042 Erik mutex_enter(&ire->ire_lock); 2823 11042 Erik nce = ire->ire_nce_cache; 2824 11042 Erik ire->ire_nce_cache = NULL; 2825 11042 Erik mutex_exit(&ire->ire_lock); 2826 11042 Erik if (nce != NULL) 2827 11042 Erik nce_refrele(nce); 2828 2535 sangeeta 2829 11042 Erik #ifdef DEBUG 2830 11042 Erik ire_dep_verify(ire); 2831 11042 Erik ire_dep_verify(parent); 2832 11042 Erik #endif 2833 11042 Erik 2834 11042 Erik ire_refrele_notr(parent); 2835 11042 Erik ire_refrele_notr(ire); 2836 11042 Erik } 2837 11042 Erik 2838 11042 Erik /* 2839 11042 Erik * Insert the child in the linkage of the parent 2840 11042 Erik */ 2841 11042 Erik static void 2842 11042 Erik ire_dep_parent_insert(ire_t *child, ire_t *parent) 2843 11042 Erik { 2844 11042 Erik ip_stack_t *ipst = child->ire_ipst; 2845 11042 Erik ire_t *next; 2846 11042 Erik 2847 11042 Erik ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 2848 11042 Erik ASSERT(child->ire_dep_parent == NULL); 2849 11042 Erik 2850 11042 Erik #ifdef DEBUG 2851 11042 Erik ire_dep_verify(child); 2852 11042 Erik ire_dep_verify(parent); 2853 11042 Erik #endif 2854 11042 Erik /* No parents => no siblings */ 2855 11042 Erik ASSERT(child->ire_dep_sib_ptpn == NULL); 2856 11042 Erik ASSERT(child->ire_dep_sib_next == NULL); 2857 11042 Erik 2858 11042 Erik ire_refhold_notr(parent); 2859 11042 Erik ire_refhold_notr(child); 2860 11042 Erik 2861 11042 Erik /* Head insertion */ 2862 11042 Erik next = parent->ire_dep_children; 2863 11042 Erik if (next != NULL) { 2864 11042 Erik ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children)); 2865 11042 Erik child->ire_dep_sib_next = next; 2866 11042 Erik 
next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next); 2867 11042 Erik } 2868 11042 Erik parent->ire_dep_children = child; 2869 11042 Erik child->ire_dep_sib_ptpn = &(parent->ire_dep_children); 2870 11042 Erik 2871 11042 Erik mutex_enter(&child->ire_lock); 2872 11042 Erik child->ire_dep_parent = parent; 2873 11042 Erik mutex_exit(&child->ire_lock); 2874 11042 Erik 2875 11042 Erik #ifdef DEBUG 2876 11042 Erik ire_dep_verify(child); 2877 11042 Erik ire_dep_verify(parent); 2878 11042 Erik #endif 2879 11042 Erik } 2880 11042 Erik 2881 11042 Erik 2882 11042 Erik /* 2883 11042 Erik * Given count worth of ires and generations, build ire_dep_* relationships 2884 11042 Erik * from ires[0] to ires[count-1]. Record generations[i+1] in 2885 11042 Erik * ire_dep_parent_generation for ires[i]. 2886 11042 Erik * We graft onto an existing parent chain by making sure that we don't 2887 11042 Erik * touch ire_dep_parent for ires[count-1]. 2888 11042 Erik * 2889 11042 Erik * We check for any condemned ire_generation count and return B_FALSE in 2890 11042 Erik * that case so that the caller can tear it apart. 2891 11042 Erik * 2892 11042 Erik * Note that generations[0] is not used. Caller handles that. 2893 11042 Erik */ 2894 11042 Erik boolean_t 2895 11042 Erik ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count) 2896 11042 Erik { 2897 11042 Erik ire_t *ire = ires[0]; 2898 11042 Erik ip_stack_t *ipst; 2899 11042 Erik uint_t i; 2900 11042 Erik 2901 11042 Erik ASSERT(count > 0); 2902 11042 Erik if (count == 1) { 2903 11042 Erik /* No work to do */ 2904 11042 Erik return (B_TRUE); 2905 11042 Erik } 2906 11042 Erik ipst = ire->ire_ipst; 2907 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 2908 11042 Erik /* 2909 11042 Erik * Do not remove the linkage for any existing parent chain i.e., 2910 11042 Erik * ires[count-1] is left alone. 
2911 11042 Erik */ 2912 11042 Erik for (i = 0; i < count-1; i++) { 2913 11042 Erik /* Remove existing parent if we need to change it */ 2914 11042 Erik if (ires[i]->ire_dep_parent != NULL && 2915 11042 Erik ires[i]->ire_dep_parent != ires[i+1]) 2916 11042 Erik ire_dep_remove(ires[i]); 2917 11042 Erik } 2918 11042 Erik 2919 11042 Erik for (i = 0; i < count - 1; i++) { 2920 11042 Erik ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 2921 11042 Erik ires[i]->ire_ipversion == IPV6_VERSION); 2922 11042 Erik /* Does it need to change? */ 2923 11042 Erik if (ires[i]->ire_dep_parent != ires[i+1]) 2924 11042 Erik ire_dep_parent_insert(ires[i], ires[i+1]); 2925 11042 Erik 2926 11042 Erik mutex_enter(&ires[i+1]->ire_lock); 2927 11042 Erik if (IRE_IS_CONDEMNED(ires[i+1])) { 2928 11042 Erik mutex_exit(&ires[i+1]->ire_lock); 2929 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 2930 11042 Erik return (B_FALSE); 2931 11042 Erik } 2932 11042 Erik mutex_exit(&ires[i+1]->ire_lock); 2933 11042 Erik 2934 11042 Erik mutex_enter(&ires[i]->ire_lock); 2935 11042 Erik ires[i]->ire_dep_parent_generation = generations[i+1]; 2936 11042 Erik mutex_exit(&ires[i]->ire_lock); 2937 11042 Erik } 2938 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 2939 11042 Erik return (B_TRUE); 2940 11042 Erik } 2941 11042 Erik 2942 11042 Erik /* 2943 11042 Erik * Given count worth of ires, unbuild ire_dep_* relationships 2944 11042 Erik * from ires[0] to ires[count-1]. 
2945 11042 Erik */ 2946 11042 Erik void 2947 11042 Erik ire_dep_unbuild(ire_t *ires[], uint_t count) 2948 11042 Erik { 2949 11042 Erik ip_stack_t *ipst; 2950 11042 Erik uint_t i; 2951 11042 Erik 2952 11042 Erik if (count == 0) { 2953 11042 Erik /* No work to do */ 2954 2535 sangeeta return; 2955 2535 sangeeta } 2956 11042 Erik ipst = ires[0]->ire_ipst; 2957 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 2958 11042 Erik for (i = 0; i < count; i++) { 2959 11042 Erik ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 2960 11042 Erik ires[i]->ire_ipversion == IPV6_VERSION); 2961 11042 Erik if (ires[i]->ire_dep_parent != NULL) 2962 11042 Erik ire_dep_remove(ires[i]); 2963 11042 Erik mutex_enter(&ires[i]->ire_lock); 2964 11042 Erik ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 2965 11042 Erik mutex_exit(&ires[i]->ire_lock); 2966 11042 Erik } 2967 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 2968 11042 Erik } 2969 2535 sangeeta 2970 11042 Erik /* 2971 11042 Erik * Both the forwarding and the outbound code paths can trip on 2972 11042 Erik * a condemned NCE, in which case we call this function. 2973 11042 Erik * We have two different behaviors: if the NCE was UNREACHABLE 2974 11042 Erik * it is an indication that something failed. In that case 2975 11042 Erik * we see if we should look for a different IRE (for example, 2976 11042 Erik * delete any matching redirect IRE, or try a different 2977 11042 Erik * IRE_DEFAULT (ECMP)). We mark the ire as bad so a hopefully 2978 11042 Erik * different IRE will be picked next time we send/forward. 2979 11042 Erik * 2980 11042 Erik * If we are called by the output path then fail_if_better is set 2981 11042 Erik * and we return NULL if there could be a better IRE. This is because the 2982 11042 Erik * output path retries the IRE lookup. (The input/forward path can not retry.) 
2983 11042 Erik * 2984 11042 Erik * If the NCE was not unreachable then we pick/allocate a 2985 11042 Erik * new (most likely ND_INITIAL) NCE and proceed with it. 2986 11042 Erik * 2987 11042 Erik * ipha/ip6h are needed for multicast packets; ipha needs to be 2988 11042 Erik * set for IPv4 and ip6h needs to be set for IPv6 packets. 2989 11042 Erik */ 2990 11042 Erik nce_t * 2991 11042 Erik ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h, 2992 11042 Erik boolean_t fail_if_better) 2993 11042 Erik { 2994 11042 Erik if (nce->nce_common->ncec_state == ND_UNREACHABLE) { 2995 11042 Erik if (ire_no_good(ire) && fail_if_better) { 2996 11042 Erik /* 2997 11042 Erik * Did some changes, or ECMP likely to exist. 2998 11042 Erik * Make ip_output look for a different IRE 2999 11042 Erik */ 3000 11042 Erik return (NULL); 3001 11042 Erik } 3002 11042 Erik } 3003 11042 Erik if (ire_revalidate_nce(ire) == ENETUNREACH) { 3004 11042 Erik /* The ire_dep_parent chain went bad, or no memory? */ 3005 11042 Erik (void) ire_no_good(ire); 3006 11042 Erik return (NULL); 3007 11042 Erik } 3008 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) { 3009 11042 Erik ASSERT(ipha != NULL); 3010 11042 Erik nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 3011 11042 Erik } else { 3012 11042 Erik ASSERT(ip6h != NULL); 3013 11042 Erik nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); 3014 2535 sangeeta } 3015 8485 Peter 3016 11042 Erik if (nce == NULL) 3017 11042 Erik return (NULL); 3018 11042 Erik if (nce->nce_is_condemned) { 3019 11042 Erik nce_refrele(nce); 3020 11042 Erik return (NULL); 3021 2535 sangeeta } 3022 11042 Erik return (nce); 3023 11042 Erik } 3024 2535 sangeeta 3025 11042 Erik /* 3026 11042 Erik * The caller has found that the ire is bad, either due to a reference to an NCE 3027 11042 Erik * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved. 
3028 11042 Erik * We update things so a subsequent attempt to send to the destination 3029 11042 Erik * is likely to find a different IRE, or that a new NCE would be created. 3030 11042 Erik * 3031 11042 Erik * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would 3032 11042 Erik * find a different route (either due to having deleted a redirect, or there 3033 11042 Erik * being ECMP routes.) 3034 11042 Erik * 3035 11042 Erik * If we have a redirect (RTF_DYNAMIC) we delete it. 3036 11042 Erik * Otherwise we increment ire_badcnt and increment the generation number so 3037 11042 Erik * that a cached ixa_ire will redo the route selection. ire_badcnt is taken 3038 11042 Erik * into account in the route selection when we have multiple choices (multiple 3039 11042 Erik * default routes or ECMP in general). 3040 11042 Erik * Any time ip_select_route finds an ire with a condemned ire_nce_cache 3041 11042 Erik * (e.g., if no equal cost route to the bad one) ip_select_route will make 3042 11042 Erik * sure the NCE is revalidated to avoid getting stuck on a 3043 11042 Erik * NCE_F_CONDEMNED ncec that caused ire_no_good to be called.
3044 11042 Erik */ 3045 11042 Erik boolean_t 3046 11042 Erik ire_no_good(ire_t *ire) 3047 11042 Erik { 3048 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 3049 11042 Erik ire_t *ire2; 3050 11042 Erik nce_t *nce; 3051 11042 Erik 3052 11042 Erik if (ire->ire_flags & RTF_DYNAMIC) { 3053 11042 Erik ire_delete(ire); 3054 11042 Erik return (B_TRUE); 3055 11042 Erik } 3056 11042 Erik if (ire->ire_flags & RTF_INDIRECT) { 3057 11042 Erik /* Check if next IRE is a redirect */ 3058 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 3059 11042 Erik if (ire->ire_dep_parent != NULL && 3060 11042 Erik (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) { 3061 11042 Erik ire2 = ire->ire_dep_parent; 3062 11042 Erik ire_refhold(ire2); 3063 11042 Erik } else { 3064 11042 Erik ire2 = NULL; 3065 11042 Erik } 3066 11042 Erik rw_exit(&ipst->ips_ire_dep_lock); 3067 11042 Erik if (ire2 != NULL) { 3068 11042 Erik ire_delete(ire2); 3069 11042 Erik ire_refrele(ire2); 3070 11042 Erik return (B_TRUE); 3071 11042 Erik } 3072 11042 Erik } 3073 2535 sangeeta /* 3074 11042 Erik * No redirect involved. Increment badcnt so that if we have ECMP 3075 11042 Erik * routes we are likely to pick a different one for the next packet. 3076 11042 Erik * 3077 11042 Erik * If the NCE is unreachable and condemned we should drop the reference 3078 11042 Erik * to it so that a new NCE can be created. 3079 11042 Erik * 3080 11042 Erik * Finally we increment the generation number so that any ixa_ire 3081 11042 Erik * cache will be revalidated. 
3082 2535 sangeeta */ 3083 11042 Erik mutex_enter(&ire->ire_lock); 3084 11042 Erik ire->ire_badcnt++; 3085 11066 rafael ire->ire_last_badcnt = TICK_TO_SEC(ddi_get_lbolt64()); 3086 11042 Erik nce = ire->ire_nce_cache; 3087 11042 Erik if (nce != NULL && nce->nce_is_condemned && 3088 11042 Erik nce->nce_common->ncec_state == ND_UNREACHABLE) 3089 11042 Erik ire->ire_nce_cache = NULL; 3090 11042 Erik else 3091 11042 Erik nce = NULL; 3092 11042 Erik mutex_exit(&ire->ire_lock); 3093 11042 Erik if (nce != NULL) 3094 11042 Erik nce_refrele(nce); 3095 3448 dh155122 3096 11042 Erik ire_increment_generation(ire); 3097 11042 Erik ire_dep_incr_generation(ire); 3098 2535 sangeeta 3099 11042 Erik return (ire->ire_bucket->irb_ire_cnt > 1); 3100 11042 Erik } 3101 2535 sangeeta 3102 11042 Erik /* 3103 11042 Erik * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation == 3104 11042 Erik * ire_dep_parent_generation. 3105 11042 Erik * If they all match we just return ire_generation from the topmost IRE. 3106 11042 Erik * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation 3107 11042 Erik * above the mismatch to IRE_GENERATION_VERIFY and also returning 3108 11042 Erik * IRE_GENERATION_VERIFY. 
3109 11042 Erik */ 3110 11042 Erik uint_t 3111 11042 Erik ire_dep_validate_generations(ire_t *ire) 3112 11042 Erik { 3113 11042 Erik ip_stack_t *ipst = ire->ire_ipst; 3114 11042 Erik uint_t generation; 3115 11042 Erik ire_t *ire1; 3116 11042 Erik 3117 11042 Erik rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 3118 11042 Erik generation = ire->ire_generation; /* Assuming things match */ 3119 11042 Erik for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) { 3120 11042 Erik ASSERT(ire1->ire_ipversion == IPV4_VERSION || 3121 11042 Erik ire1->ire_ipversion == IPV6_VERSION); 3122 11042 Erik if (ire1->ire_dep_parent == NULL) 3123 11042 Erik break; 3124 11042 Erik if (ire1->ire_dep_parent_generation != 3125 11042 Erik ire1->ire_dep_parent->ire_generation) 3126 11042 Erik goto mismat