1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1205 kcpoon * Common Development and Distribution License (the "License"). 6 1205 kcpoon * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 1205 kcpoon 22 1205 kcpoon /* 23 8477 Rao * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 0 stevel * Use is subject to license terms. 25 0 stevel */ 26 0 stevel /* Copyright (c) 1990 Mentat Inc. 
*/ 27 0 stevel 28 0 stevel #include <sys/types.h> 29 0 stevel #include <sys/stream.h> 30 0 stevel #include <sys/strsun.h> 31 0 stevel #include <sys/strsubr.h> 32 0 stevel #include <sys/stropts.h> 33 0 stevel #include <sys/strlog.h> 34 0 stevel #define _SUN_TPI_VERSION 2 35 0 stevel #include <sys/tihdr.h> 36 0 stevel #include <sys/timod.h> 37 0 stevel #include <sys/ddi.h> 38 0 stevel #include <sys/sunddi.h> 39 0 stevel #include <sys/suntpi.h> 40 0 stevel #include <sys/xti_inet.h> 41 0 stevel #include <sys/cmn_err.h> 42 0 stevel #include <sys/debug.h> 43 2958 dr146992 #include <sys/sdt.h> 44 0 stevel #include <sys/vtrace.h> 45 0 stevel #include <sys/kmem.h> 46 0 stevel #include <sys/ethernet.h> 47 0 stevel #include <sys/cpuvar.h> 48 0 stevel #include <sys/dlpi.h> 49 0 stevel #include <sys/pattr.h> 50 0 stevel #include <sys/policy.h> 51 1676 jpk #include <sys/priv.h> 52 0 stevel #include <sys/zone.h> 53 3448 dh155122 #include <sys/sunldi.h> 54 0 stevel 55 0 stevel #include <sys/errno.h> 56 0 stevel #include <sys/signal.h> 57 0 stevel #include <sys/socket.h> 58 8348 Eric #include <sys/socketvar.h> 59 0 stevel #include <sys/sockio.h> 60 0 stevel #include <sys/isa_defs.h> 61 0 stevel #include <sys/md5.h> 62 0 stevel #include <sys/random.h> 63 6707 brutus #include <sys/uio.h> 64 8048 Madhavan #include <sys/systm.h> 65 0 stevel #include <netinet/in.h> 66 0 stevel #include <netinet/tcp.h> 67 0 stevel #include <netinet/ip6.h> 68 0 stevel #include <netinet/icmp6.h> 69 0 stevel #include <net/if.h> 70 0 stevel #include <net/route.h> 71 0 stevel #include <inet/ipsec_impl.h> 72 0 stevel 73 0 stevel #include <inet/common.h> 74 0 stevel #include <inet/ip.h> 75 741 masputra #include <inet/ip_impl.h> 76 0 stevel #include <inet/ip6.h> 77 0 stevel #include <inet/ip_ndp.h> 78 8348 Eric #include <inet/proto_set.h> 79 0 stevel #include <inet/mib2.h> 80 0 stevel #include <inet/nd.h> 81 0 stevel #include <inet/optcom.h> 82 0 stevel #include <inet/snmpcom.h> 83 0 stevel #include 
<inet/kstatcom.h> 84 0 stevel #include <inet/tcp.h> 85 741 masputra #include <inet/tcp_impl.h> 86 8833 Venu #include <inet/udp_impl.h> 87 0 stevel #include <net/pfkeyv2.h> 88 0 stevel #include <inet/ipdrop.h> 89 0 stevel 90 0 stevel #include <inet/ipclassifier.h> 91 0 stevel #include <inet/ip_ire.h> 92 2535 sangeeta #include <inet/ip_ftable.h> 93 0 stevel #include <inet/ip_if.h> 94 0 stevel #include <inet/ipp_common.h> 95 11042 Erik #include <inet/ip_rts.h> 96 2958 dr146992 #include <inet/ip_netinfo.h> 97 8275 Eric #include <sys/squeue_impl.h> 98 0 stevel #include <sys/squeue.h> 99 898 kais #include <inet/kssl/ksslapi.h> 100 1676 jpk #include <sys/tsol/label.h> 101 1676 jpk #include <sys/tsol/tnet.h> 102 1676 jpk #include <rpc/pmap_prot.h> 103 8048 Madhavan #include <sys/callo.h> 104 11066 rafael 105 11110 Erik #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 106 0 stevel 107 0 stevel /* 108 0 stevel * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 109 0 stevel * 110 0 stevel * (Read the detailed design doc in PSARC case directory) 111 0 stevel * 112 0 stevel * The entire tcp state is contained in tcp_t and conn_t structure 113 0 stevel * which are allocated in tandem using ipcl_conn_create() and passing 114 11042 Erik * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect 115 0 stevel * the references on the tcp_t. The tcp_t structure is never compressed 116 0 stevel * and packets always land on the correct TCP perimeter from the time 117 0 stevel * eager is created till the time tcp_t dies (as such the old mentat 118 0 stevel * TCP global queue is not used for detached state and no IPSEC checking 119 0 stevel * is required). The global queue is still allocated to send out resets 120 0 stevel * for connection which have no listeners and IP directly calls 121 0 stevel * tcp_xmit_listeners_reset() which does any policy check. 
122 0 stevel * 123 0 stevel * Protection and Synchronisation mechanism: 124 0 stevel * 125 0 stevel * The tcp data structure does not use any kind of lock for protecting 126 0 stevel * its state but instead uses 'squeues' for mutual exclusion from various 127 0 stevel * read and write side threads. To access a tcp member, the thread should 128 8275 Eric * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS, 129 8275 Eric * or SQ_NODRAIN). Since the squeues allow a direct function call, caller 130 0 stevel * can pass any tcp function having prototype of edesc_t as argument 131 0 stevel * (different from traditional STREAMs model where packets come in only 132 0 stevel * designated entry points). The list of functions that can be directly 133 0 stevel * called via squeue are listed before the usual function prototype. 134 0 stevel * 135 0 stevel * Referencing: 136 0 stevel * 137 0 stevel * TCP is MT-Hot and we use a reference based scheme to make sure that the 138 0 stevel * tcp structure doesn't disappear when its needed. When the application 139 0 stevel * creates an outgoing connection or accepts an incoming connection, we 140 0 stevel * start out with 2 references on 'conn_ref'. One for TCP and one for IP. 141 0 stevel * The IP reference is just a symbolic reference since ip_tcpclose() 142 0 stevel * looks at tcp structure after tcp_close_output() returns which could 143 0 stevel * have dropped the last TCP reference. So as long as the connection is 144 0 stevel * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the 145 0 stevel * conn_t. The classifier puts its own reference when the connection is 146 0 stevel * inserted in listen or connected hash. Anytime a thread needs to enter 147 0 stevel * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr 148 0 stevel * on write side or by doing a classify on read side and then puts a 149 0 stevel * reference on the conn before doing squeue_enter/tryenter/fill. 
For 150 0 stevel * read side, the classifier itself puts the reference under fanout lock 151 0 stevel * to make sure that tcp can't disappear before it gets processed. The 152 0 stevel * squeue will drop this reference automatically so the called function 153 0 stevel * doesn't have to do a DEC_REF. 154 0 stevel * 155 0 stevel * Opening a new connection: 156 0 stevel * 157 3448 dh155122 * The outgoing connection open is pretty simple. tcp_open() does the 158 0 stevel * work in creating the conn/tcp structure and initializing it. The 159 0 stevel * squeue assignment is done based on the CPU the application 160 0 stevel * is running on. So for outbound connections, processing is always done 161 0 stevel * on application CPU which might be different from the incoming CPU 162 0 stevel * being interrupted by the NIC. An optimal way would be to figure out 163 0 stevel * the NIC <-> CPU binding at listen time, and assign the outgoing 164 0 stevel * connection to the squeue attached to the CPU that will be interrupted 165 0 stevel * for incoming packets (we know the NIC based on the bind IP address). 166 0 stevel * This might seem like a problem if more data is going out but the 167 0 stevel * fact is that in most cases the transmit is ACK driven transmit where 168 0 stevel * the outgoing data normally sits on TCP's xmit queue waiting to be 169 0 stevel * transmitted. 170 0 stevel * 171 0 stevel * Accepting a connection: 172 0 stevel * 173 0 stevel * This is a more interesting case because of various races involved in 174 0 stevel * establishing a eager in its own perimeter. Read the meta comment on 175 11042 Erik * top of tcp_input_listener(). But briefly, the squeue is picked by 176 11042 Erik * ip_fanout based on the ring or the sender (if loopback). 177 0 stevel * 178 0 stevel * Closing a connection: 179 0 stevel * 180 0 stevel * The close is fairly straight forward. 
 * tcp_close() calls tcp_close_output() via squeue to do the close and mark
 * the tcp as detached if the connection was in state TCPS_ESTABLISHED or
 * greater. In the latter case, TCP keeps its reference but tcp_close()
 * always drops IP's reference. So if tcp was not killed, it is sitting in
 * time_wait list with 2 references - 1 for TCP and 1 because it is in
 * classifier's connected hash. This is the condition we use to determine
 * that it's OK to clean up the tcp outside of squeue when time wait expires
 * (check the ref under fanout and conn_lock and if it is 2, remove it from
 * fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know the tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in acceptor hash since a sockfs listener can never
 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
 * since eager has already been allocated and the accept now happens
 * on acceptor STREAM. There is a big blob of comment on top of
 * tcp_input_listener explaining the new accept.
When socket is POP'd, 208 0 stevel * sockfs sends us an ioctl to mark the fact and we go back to old 209 0 stevel * behaviour. Once tcp_issocket is unset, its never set for the 210 0 stevel * life of that connection. 211 0 stevel * 212 0 stevel * IPsec notes : 213 0 stevel * 214 0 stevel * Since a packet is always executed on the correct TCP perimeter 215 0 stevel * all IPsec processing is defered to IP including checking new 216 0 stevel * connections and setting IPSEC policies for new connection. The 217 0 stevel * only exception is tcp_xmit_listeners_reset() which is called 218 0 stevel * directly from IP and needs to policy check to see if TH_RST 219 0 stevel * can be sent out. 220 0 stevel */ 221 0 stevel 222 0 stevel /* 223 0 stevel * Values for squeue switch: 224 8275 Eric * 1: SQ_NODRAIN 225 8275 Eric * 2: SQ_PROCESS 226 8275 Eric * 3: SQ_FILL 227 8275 Eric */ 228 8275 Eric int tcp_squeue_wput = 2; /* /etc/systems */ 229 8275 Eric int tcp_squeue_flag; 230 0 stevel 231 0 stevel /* 232 0 stevel * This controls how tiny a write must be before we try to copy it 233 11042 Erik * into the mblk on the tail of the transmit queue. Not much 234 0 stevel * speedup is observed for values larger than sixteen. Zero will 235 0 stevel * disable the optimisation. 236 0 stevel */ 237 0 stevel int tcp_tx_pull_len = 16; 238 0 stevel 239 0 stevel /* 240 0 stevel * TCP Statistics. 241 0 stevel * 242 0 stevel * How TCP statistics work. 243 0 stevel * 244 0 stevel * There are two types of statistics invoked by two macros. 245 0 stevel * 246 0 stevel * TCP_STAT(name) does non-atomic increment of a named stat counter. It is 247 0 stevel * supposed to be used in non MT-hot paths of the code. 248 0 stevel * 249 0 stevel * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is 250 0 stevel * supposed to be used for DEBUG purposes and may be used on a hot path. 
251 0 stevel * 252 0 stevel * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat 253 0 stevel * (use "kstat tcp" to get them). 254 0 stevel * 255 0 stevel * There is also additional debugging facility that marks tcp_clean_death() 256 0 stevel * instances and saves them in tcp_t structure. It is triggered by 257 0 stevel * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for 258 0 stevel * tcp_clean_death() calls that counts the number of times each tag was hit. It 259 0 stevel * is triggered by TCP_CLD_COUNTERS define. 260 0 stevel * 261 0 stevel * How to add new counters. 262 0 stevel * 263 0 stevel * 1) Add a field in the tcp_stat structure describing your counter. 264 3448 dh155122 * 2) Add a line in the template in tcp_kstat2_init() with the name 265 3448 dh155122 * of the counter. 266 0 stevel * 267 0 stevel * IMPORTANT!! - make sure that both are in sync !! 268 0 stevel * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. 269 0 stevel * 270 0 stevel * Please avoid using private counters which are not kstat-exported. 271 0 stevel * 272 0 stevel * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances 273 0 stevel * in tcp_t structure. 274 0 stevel * 275 0 stevel * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 
276 0 stevel */ 277 0 stevel 278 0 stevel #ifndef TCP_DEBUG_COUNTER 279 0 stevel #ifdef DEBUG 280 0 stevel #define TCP_DEBUG_COUNTER 1 281 0 stevel #else 282 0 stevel #define TCP_DEBUG_COUNTER 0 283 0 stevel #endif 284 0 stevel #endif 285 0 stevel 286 741 masputra #define TCP_CLD_COUNTERS 0 287 0 stevel 288 0 stevel #define TCP_TAG_CLEAN_DEATH 1 289 0 stevel #define TCP_MAX_CLEAN_DEATH_TAG 32 290 0 stevel 291 0 stevel #ifdef lint 292 0 stevel static int _lint_dummy_; 293 0 stevel #endif 294 0 stevel 295 0 stevel #if TCP_CLD_COUNTERS 296 0 stevel static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; 297 0 stevel #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ 298 0 stevel #elif defined(lint) 299 0 stevel #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); 300 0 stevel #else 301 0 stevel #define TCP_CLD_STAT(x) 302 0 stevel #endif 303 0 stevel 304 0 stevel #if TCP_DEBUG_COUNTER 305 3448 dh155122 #define TCP_DBGSTAT(tcps, x) \ 306 3448 dh155122 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) 307 3448 dh155122 #define TCP_G_DBGSTAT(x) \ 308 3448 dh155122 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) 309 0 stevel #elif defined(lint) 310 3448 dh155122 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); 311 3448 dh155122 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); 312 0 stevel #else 313 3448 dh155122 #define TCP_DBGSTAT(tcps, x) 314 3448 dh155122 #define TCP_G_DBGSTAT(x) 315 0 stevel #endif 316 0 stevel 317 3448 dh155122 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) 318 3448 dh155122 319 3448 dh155122 tcp_g_stat_t tcp_g_statistics; 320 3448 dh155122 kstat_t *tcp_g_kstat; 321 0 stevel 322 0 stevel /* Macros for timestamp comparisons */ 323 0 stevel #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 324 0 stevel #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 325 0 stevel 326 0 stevel /* 327 0 stevel * Parameters for TCP Initial Send Sequence number (ISS) generation. 
When 328 0 stevel * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 329 0 stevel * by adding three components: a time component which grows by 1 every 4096 330 0 stevel * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 331 0 stevel * a per-connection component which grows by 125000 for every new connection; 332 0 stevel * and an "extra" component that grows by a random amount centered 333 11042 Erik * approximately on 64000. This causes the ISS generator to cycle every 334 0 stevel * 4.89 hours if no TCP connections are made, and faster if connections are 335 0 stevel * made. 336 0 stevel * 337 0 stevel * When tcp_strong_iss is set to 0, ISS is calculated by adding two 338 0 stevel * components: a time component which grows by 250000 every second; and 339 0 stevel * a per-connection component which grows by 125000 for every new connections. 340 0 stevel * 341 0 stevel * A third method, when tcp_strong_iss is set to 2, for generating ISS is 342 0 stevel * prescribed by Steve Bellovin. This involves adding time, the 125000 per 343 0 stevel * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 344 0 stevel * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 345 0 stevel * password. 346 0 stevel */ 347 0 stevel #define ISS_INCR 250000 348 0 stevel #define ISS_NSEC_SHT 12 349 0 stevel 350 0 stevel static sin_t sin_null; /* Zero address for quick clears */ 351 0 stevel static sin6_t sin6_null; /* Zero address for quick clears */ 352 0 stevel 353 0 stevel /* 354 0 stevel * This implementation follows the 4.3BSD interpretation of the urgent 355 0 stevel * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause 356 0 stevel * incompatible changes in protocols like telnet and rlogin. 
357 0 stevel */ 358 0 stevel #define TCP_OLD_URP_INTERPRETATION 1 359 0 stevel 360 11042 Erik /* 361 11042 Erik * Since tcp_listener is not cleared atomically with tcp_detached 362 11042 Erik * being cleared we need this extra bit to tell a detached connection 363 11042 Erik * apart from one that is in the process of being accepted. 364 11042 Erik */ 365 0 stevel #define TCP_IS_DETACHED_NONEAGER(tcp) \ 366 11042 Erik (TCP_IS_DETACHED(tcp) && \ 367 0 stevel (!(tcp)->tcp_hard_binding)) 368 0 stevel 369 0 stevel /* 370 0 stevel * TCP reassembly macros. We hide starting and ending sequence numbers in 371 0 stevel * b_next and b_prev of messages on the reassembly queue. The messages are 372 0 stevel * chained using b_cont. These macros are used in tcp_reass() so we don't 373 0 stevel * have to see the ugly casts and assignments. 374 0 stevel */ 375 0 stevel #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 376 0 stevel #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 377 0 stevel (mblk_t *)(uintptr_t)(u)) 378 0 stevel #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 379 0 stevel #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 380 0 stevel (mblk_t *)(uintptr_t)(u)) 381 0 stevel 382 0 stevel /* 383 0 stevel * Implementation of TCP Timers. 384 0 stevel * ============================= 385 0 stevel * 386 0 stevel * INTERFACE: 387 0 stevel * 388 0 stevel * There are two basic functions dealing with tcp timers: 389 0 stevel * 390 0 stevel * timeout_id_t tcp_timeout(connp, func, time) 391 0 stevel * clock_t tcp_timeout_cancel(connp, timeout_id) 392 0 stevel * TCP_TIMER_RESTART(tcp, intvl) 393 0 stevel * 394 0 stevel * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 395 0 stevel * after 'time' ticks passed. The function called by timeout() must adhere to 396 0 stevel * the same restrictions as a driver soft interrupt handler - it must not sleep 397 0 stevel * or call other functions that might sleep. 
The value returned is the opaque 398 0 stevel * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 399 0 stevel * cancel the request. The call to tcp_timeout() may fail in which case it 400 0 stevel * returns zero. This is different from the timeout(9F) function which never 401 0 stevel * fails. 402 0 stevel * 403 0 stevel * The call-back function 'func' always receives 'connp' as its single 404 0 stevel * argument. It is always executed in the squeue corresponding to the tcp 405 0 stevel * structure. The tcp structure is guaranteed to be present at the time the 406 0 stevel * call-back is called. 407 0 stevel * 408 0 stevel * NOTE: The call-back function 'func' is never called if tcp is in 409 0 stevel * the TCPS_CLOSED state. 410 0 stevel * 411 0 stevel * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 412 0 stevel * request. locks acquired by the call-back routine should not be held across 413 0 stevel * the call to tcp_timeout_cancel() or a deadlock may result. 414 0 stevel * 415 0 stevel * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 416 0 stevel * Otherwise, it returns an integer value greater than or equal to 0. In 417 0 stevel * particular, if the call-back function is already placed on the squeue, it can 418 0 stevel * not be canceled. 419 0 stevel * 420 0 stevel * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 421 0 stevel * within squeue context corresponding to the tcp instance. Since the 422 0 stevel * call-back is also called via the same squeue, there are no race 423 0 stevel * conditions described in untimeout(9F) manual page since all calls are 424 0 stevel * strictly serialized. 425 0 stevel * 426 0 stevel * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout 427 0 stevel * stored in tcp_timer_tid and starts a new one using 428 0 stevel * MSEC_TO_TICK(intvl). 
It always uses tcp_timer() function as a call-back 429 0 stevel * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid 430 0 stevel * field. 431 0 stevel * 432 0 stevel * NOTE: since the timeout cancellation is not guaranteed, the cancelled 433 0 stevel * call-back may still be called, so it is possible tcp_timer() will be 434 0 stevel * called several times. This should not be a problem since tcp_timer() 435 0 stevel * should always check the tcp instance state. 436 0 stevel * 437 0 stevel * 438 0 stevel * IMPLEMENTATION: 439 0 stevel * 440 0 stevel * TCP timers are implemented using three-stage process. The call to 441 0 stevel * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function 442 0 stevel * when the timer expires. The tcp_timer_callback() arranges the call of the 443 0 stevel * tcp_timer_handler() function via squeue corresponding to the tcp 444 0 stevel * instance. The tcp_timer_handler() calls actual requested timeout call-back 445 0 stevel * and passes tcp instance as an argument to it. Information is passed between 446 0 stevel * stages using the tcp_timer_t structure which contains the connp pointer, the 447 0 stevel * tcp call-back to call and the timeout id returned by the timeout(9F). 448 0 stevel * 449 0 stevel * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - 450 0 stevel * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo 451 0 stevel * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() 452 0 stevel * returns the pointer to this mblk. 453 0 stevel * 454 0 stevel * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It 455 0 stevel * looks like a normal mblk without actual dblk attached to it. 456 0 stevel * 457 0 stevel * To optimize performance each tcp instance holds a small cache of timer 458 0 stevel * mblocks. 
In the current implementation it caches up to two timer mblocks per 459 0 stevel * tcp instance. The cache is preserved over tcp frees and is only freed when 460 0 stevel * the whole tcp structure is destroyed by its kmem destructor. Since all tcp 461 0 stevel * timer processing happens on a corresponding squeue, the cache manipulation 462 0 stevel * does not require any locks. Experiments show that majority of timer mblocks 463 0 stevel * allocations are satisfied from the tcp cache and do not involve kmem calls. 464 0 stevel * 465 0 stevel * The tcp_timeout() places a refhold on the connp instance which guarantees 466 0 stevel * that it will be present at the time the call-back function fires. The 467 0 stevel * tcp_timer_handler() drops the reference after calling the call-back, so the 468 0 stevel * call-back function does not need to manipulate the references explicitly. 469 0 stevel */ 470 0 stevel 471 0 stevel typedef struct tcp_timer_s { 472 0 stevel conn_t *connp; 473 0 stevel void (*tcpt_proc)(void *); 474 8048 Madhavan callout_id_t tcpt_tid; 475 0 stevel } tcp_timer_t; 476 0 stevel 477 0 stevel static kmem_cache_t *tcp_timercache; 478 0 stevel kmem_cache_t *tcp_sack_info_cache; 479 0 stevel 480 0 stevel /* 481 0 stevel * For scalability, we must not run a timer for every TCP connection 482 0 stevel * in TIME_WAIT state. To see why, consider (for time wait interval of 483 0 stevel * 4 minutes): 484 0 stevel * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's 485 0 stevel * 486 0 stevel * This list is ordered by time, so you need only delete from the head 487 0 stevel * until you get to entries which aren't old enough to delete yet. 488 0 stevel * The list consists of only the detached TIME_WAIT connections. 
489 0 stevel * 490 0 stevel * Note that the timer (tcp_time_wait_expire) is started when the tcp_t 491 0 stevel * becomes detached TIME_WAIT (either by changing the state and already 492 0 stevel * being detached or the other way around). This means that the TIME_WAIT 493 0 stevel * state can be extended (up to doubled) if the connection doesn't become 494 0 stevel * detached for a long time. 495 0 stevel * 496 0 stevel * The list manipulations (including tcp_time_wait_next/prev) 497 0 stevel * are protected by the tcp_time_wait_lock. The content of the 498 0 stevel * detached TIME_WAIT connections is protected by the normal perimeters. 499 3448 dh155122 * 500 3448 dh155122 * This list is per squeue and squeues are shared across the tcp_stack_t's. 501 3448 dh155122 * Things on tcp_time_wait_head remain associated with the tcp_stack_t 502 3448 dh155122 * and conn_netstack. 503 3448 dh155122 * The tcp_t's that are added to tcp_free_list are disassociated and 504 3448 dh155122 * have NULL tcp_tcps and conn_netstack pointers. 505 3448 dh155122 */ 506 0 stevel typedef struct tcp_squeue_priv_s { 507 0 stevel kmutex_t tcp_time_wait_lock; 508 8048 Madhavan callout_id_t tcp_time_wait_tid; 509 0 stevel tcp_t *tcp_time_wait_head; 510 0 stevel tcp_t *tcp_time_wait_tail; 511 0 stevel tcp_t *tcp_free_list; 512 1023 ethindra uint_t tcp_free_list_cnt; 513 0 stevel } tcp_squeue_priv_t; 514 0 stevel 515 0 stevel /* 516 0 stevel * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. 517 0 stevel * Running it every 5 seconds seems to give the best results. 
518 0 stevel */ 519 0 stevel #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) 520 0 stevel 521 1023 ethindra /* 522 1023 ethindra * To prevent memory hog, limit the number of entries in tcp_free_list 523 1023 ethindra * to 1% of available memory / number of cpus 524 1023 ethindra */ 525 1023 ethindra uint_t tcp_free_list_max_cnt = 0; 526 0 stevel 527 0 stevel #define TCP_XMIT_LOWATER 4096 528 0 stevel #define TCP_XMIT_HIWATER 49152 529 0 stevel #define TCP_RECV_LOWATER 2048 530 11055 Kacheong #define TCP_RECV_HIWATER 128000 531 0 stevel 532 0 stevel /* 533 0 stevel * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 534 0 stevel */ 535 0 stevel #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 536 0 stevel 537 0 stevel #define TIDUSZ 4096 /* transport interface data unit size */ 538 0 stevel 539 0 stevel /* 540 0 stevel * Bind hash list size and has function. It has to be a power of 2 for 541 0 stevel * hashing. 542 0 stevel */ 543 0 stevel #define TCP_BIND_FANOUT_SIZE 512 544 0 stevel #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) 545 0 stevel /* 546 0 stevel * Size of listen and acceptor hash list. It has to be a power of 2 for 547 0 stevel * hashing. 548 0 stevel */ 549 0 stevel #define TCP_FANOUT_SIZE 256 550 0 stevel 551 0 stevel #ifdef _ILP32 552 0 stevel #define TCP_ACCEPTOR_HASH(accid) \ 553 0 stevel (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) 554 0 stevel #else 555 0 stevel #define TCP_ACCEPTOR_HASH(accid) \ 556 0 stevel ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) 557 0 stevel #endif /* _ILP32 */ 558 0 stevel 559 0 stevel #define IP_ADDR_CACHE_SIZE 2048 560 0 stevel #define IP_ADDR_CACHE_HASH(faddr) \ 561 0 stevel (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) 562 0 stevel 563 0 stevel /* 564 0 stevel * TCP options struct returned from tcp_parse_options. 
565 0 stevel */ 566 0 stevel typedef struct tcp_opt_s { 567 0 stevel uint32_t tcp_opt_mss; 568 0 stevel uint32_t tcp_opt_wscale; 569 0 stevel uint32_t tcp_opt_ts_val; 570 0 stevel uint32_t tcp_opt_ts_ecr; 571 0 stevel tcp_t *tcp; 572 0 stevel } tcp_opt_t; 573 0 stevel 574 0 stevel /* 575 0 stevel * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 576 0 stevel */ 577 0 stevel 578 0 stevel #ifdef _BIG_ENDIAN 579 0 stevel #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 580 0 stevel (TCPOPT_TSTAMP << 8) | 10) 581 0 stevel #else 582 0 stevel #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 583 0 stevel (TCPOPT_NOP << 8) | TCPOPT_NOP) 584 0 stevel #endif 585 0 stevel 586 0 stevel /* 587 0 stevel * Flags returned from tcp_parse_options. 588 0 stevel */ 589 0 stevel #define TCP_OPT_MSS_PRESENT 1 590 0 stevel #define TCP_OPT_WSCALE_PRESENT 2 591 0 stevel #define TCP_OPT_TSTAMP_PRESENT 4 592 0 stevel #define TCP_OPT_SACK_OK_PRESENT 8 593 0 stevel #define TCP_OPT_SACK_PRESENT 16 594 0 stevel 595 0 stevel /* TCP option length */ 596 0 stevel #define TCPOPT_NOP_LEN 1 597 0 stevel #define TCPOPT_MAXSEG_LEN 4 598 0 stevel #define TCPOPT_WS_LEN 3 599 0 stevel #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 600 0 stevel #define TCPOPT_TSTAMP_LEN 10 601 0 stevel #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 602 0 stevel #define TCPOPT_SACK_OK_LEN 2 603 0 stevel #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 604 0 stevel #define TCPOPT_REAL_SACK_LEN 4 605 0 stevel #define TCPOPT_MAX_SACK_LEN 36 606 0 stevel #define TCPOPT_HEADER_LEN 2 607 0 stevel 608 0 stevel /* TCP cwnd burst factor. */ 609 0 stevel #define TCP_CWND_INFINITE 65535 610 0 stevel #define TCP_CWND_SS 3 611 0 stevel #define TCP_CWND_NORMAL 5 612 0 stevel 613 0 stevel /* Maximum TCP initial cwin (start/restart). */ 614 0 stevel #define TCP_MAX_INIT_CWND 8 615 0 stevel 616 0 stevel /* 617 0 stevel * Initialize cwnd according to RFC 3390. 
def_max_init_cwnd is 618 0 stevel * either tcp_slow_start_initial or tcp_slow_start_after idle 619 0 stevel * depending on the caller. If the upper layer has not used the 620 0 stevel * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd 621 0 stevel * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. 622 0 stevel * If the upper layer has changed set the tcp_init_cwnd, just use 623 0 stevel * it to calculate the tcp_cwnd. 624 0 stevel */ 625 0 stevel #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ 626 0 stevel { \ 627 0 stevel if ((tcp)->tcp_init_cwnd == 0) { \ 628 0 stevel (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ 629 0 stevel MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ 630 0 stevel } else { \ 631 0 stevel (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ 632 0 stevel } \ 633 0 stevel tcp->tcp_cwnd_cnt = 0; \ 634 0 stevel } 635 0 stevel 636 0 stevel /* TCP Timer control structure */ 637 0 stevel typedef struct tcpt_s { 638 0 stevel pfv_t tcpt_pfv; /* The routine we are to call */ 639 0 stevel tcp_t *tcpt_tcp; /* The parameter we are to pass in */ 640 0 stevel } tcpt_t; 641 0 stevel 642 0 stevel /* 643 0 stevel * Functions called directly via squeue having a prototype of edesc_t. 
*/
/*
 * All of the handlers below share the same parameter list:
 * (void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *).  The ones declared
 * without "static" have external linkage.
 */
void		tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *ira);
static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *ira);
static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
void		tcp_output(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);


/* Prototypes for TCP functions */
static void	tcp_random_init(void);
int		tcp_random(void);
static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
		    tcp_t *eager);
static int	tcp_set_destination(tcp_t *tcp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    boolean_t user_specified);
static void	tcp_closei_local(tcp_t *tcp);
static void	tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
		    mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo,
		    uint_t srcid, uint32_t scope_id);
static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char	*tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void	tcp_eager_unlink(tcp_t *tcp);
static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
		    int unixerr);
static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
		    int tlierr, int unixerr);
/* The three tcp_extra_priv_ports_* routines share the (q, mp, ..., cp, cr) form. */
static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_tpistate(tcp_t *tcp);
static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    int caller_holds_lock);
static void	tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void	tcp_acceptor_hash_remove(tcp_t *tcp);
static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_init_values(tcp_t *tcp);
static void	tcp_ip_notify(tcp_t *tcp);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_keepalive_killer(void *arg);
static int	tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
/* Tunable get/set handlers; all use the (q, mp, [value,] cp, cr) form. */
static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    tcp_stack_t *);
static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);

static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
static uint_t	tcp_rcv_drain(tcp_t *tcp);
static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void	tcp_ss_rexmit(tcp_t *tcp);
static mblk_t	*tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
    ip_recv_attr_t *);
static void	tcp_process_options(tcp_t *, tcpha_t *);
static void	tcp_rsrv(queue_t *q);
static int	tcp_snmp_state(tcp_t *tcp);
static void	tcp_timer(void *arg);
static void	tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
/* Write-side routines taking a queue_t; tcp_tpi_accept has external linkage. */
static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
/* usable, snxt, tail_unsent and xmit_tail are passed by pointer (in/out). */
static int	tcp_send(tcp_t *tcp, const int mss,
		    const int total_hdr_len, const int tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
		    int num_sack_blk);
static void	tcp_wsrv(queue_t *q);
static int	tcp_xmit_end(tcp_t *tcp);
static void	tcp_ack_timer(void *arg);
static mblk_t	*tcp_ack_mp(tcp_t *tcp);
static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
		    uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
		    ip_stack_t *, conn_t *);
static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
		    uint32_t ack, int ctl);
static void	tcp_set_rto(tcp_t *, time_t);
static void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    ip_recv_attr_t *);
static int	tcp_build_hdrs(tcp_t *);
static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
    ip_recv_attr_t *ira);
boolean_t	tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
static boolean_t tcp_zcopy_check(tcp_t *);
static void	tcp_zcopy_notify(tcp_t *);
static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
static void	tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
static void	tcp_update_zcopy(tcp_t *tcp);
static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    ixa_notify_arg_t);
static void	tcp_rexmit_after_error(tcp_t *tcp);
static void	tcp_send_data(tcp_t *, mblk_t *);
/*
 * NOTE(review): the two declarations below are marked extern; presumably
 * they are defined in another translation unit -- confirm.
 */
extern mblk_t	*tcp_timermp_alloc(int);
extern void	tcp_timermp_free(tcp_t *);
static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void	tcp_stop_lingering(tcp_t *tcp);
static void	tcp_close_linger_timeout(void *arg);
/* Per-netstack setup and teardown; both are keyed by netstackid_t. */
static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	tcp_stack_fini(netstackid_t stackid, void *arg);
/*
 * kstat setup and teardown.
 * NOTE(review): the *_kstat_init prototypes return void * while the matching
 * *_kstat_fini routines take a kstat_t * -- confirm whether the return type
 * is intentionally untyped.
 */
static void	*tcp_g_kstat_init(tcp_g_stat_t *);
static void	tcp_g_kstat_fini(kstat_t *);
static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
static void	tcp_kstat_fini(netstackid_t, kstat_t *);
static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
static int	tcp_kstat_update(kstat_t *kp, int rw);
static mblk_t	*tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *ira);
static mblk_t	*tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *ira);
static int	tcp_squeue_switch(int);

/*
 * Open and close routines.  tcp_openv4, tcp_openv6, tcp_tpi_close and
 * tcp_tpi_close_accept are installed in the qinit tables defined later in
 * this file; tcp_open takes one extra boolean_t argument.
 */
static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_tpi_close(queue_t *, int);
static int	tcp_tpi_close_accept(queue_t *);

static void	tcp_squeue_add(squeue_t *);
static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);

extern void	tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);

/* Same (arg, mp, arg2, ip_recv_attr_t *) parameter list as the squeue handlers. */
void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);

/*
 * Routines operating on sock_lower_handle_t or conn_t rather than queue_t.
 * NOTE(review): presumably the socket (non-STREAMS) entry points and their
 * shared helpers -- confirm against sock_tcp_downcalls.
 */
static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
	    sock_upper_handle_t, cred_t *);
static int tcp_listen(sock_lower_handle_t, int, cred_t *);
static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    boolean_t);
static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);
static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    boolean_t);
static int tcp_do_unbind(conn_t *);
static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    boolean_t);

static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);

/*
 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
 *
 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
 * (defined in tcp.h) needs to be filled in and passed into the kernel
 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
 * structure contains the four-tuple of a TCP connection and a range of TCP
 * states (specified by ac_start and ac_end). The use of wildcard addresses
 * and ports is allowed. Connections with a matching four tuple and a state
 * within the specified range will be aborted. The valid states for the
 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
 * inclusive.
 *
 * An application which has its connection aborted by this ioctl will receive
 * an error that is dependent on the connection state at the time of the abort.
 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
 * though a RST packet has been received.  If the connection state is equal to
 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
 * and all resources associated with the connection will be freed.
*/
static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy);
static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    boolean_t, tcp_stack_t *);

/*
 * Module information for the read side and the write side.  The two differ
 * only in the last two values: the read side uses TCP_RECV_HIWATER and
 * TCP_RECV_LOWATER, the write side uses 127 and 16.
 * NOTE(review): positional initializers; presumably ordered id, name,
 * min packet size, max packet size, high water, low water as in
 * struct module_info -- confirm against <sys/stream.h>.
 */
static struct module_info tcp_rinfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};

static struct module_info tcp_winfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};

/*
 * Entry points for TCP as a device. The normal case which supports
 * the TCP functionality.
 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
 *
 * The v4 and v6 read-side tables differ only in their open routine
 * (tcp_openv4 vs. tcp_openv6).
 * NOTE(review): positional initializers; presumably ordered put, service,
 * open, close, admin, module_info as in struct qinit -- confirm against
 * <sys/stream.h>.
 */
struct qinit tcp_rinitv4 = {
	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
};

struct qinit tcp_rinitv6 = {
	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
};

struct qinit tcp_winit = {
	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* Initial entry point for TCP in socket mode.
*/
struct qinit tcp_sock_winit = {
	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* TCP entry point during fallback */
struct qinit tcp_fallback_sock_winit = {
	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since eager has already
 * been created.
 *
 * NOTE(review): tcp_acceptor_rinit is a read-side table but points at
 * tcp_winfo rather than tcp_rinfo -- confirm this is intentional.
 */
struct qinit tcp_acceptor_rinit = {
	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
};

struct qinit tcp_acceptor_winit = {
	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};

/* For AF_INET aka /dev/tcp */
struct streamtab tcpinfov4 = {
	&tcp_rinitv4, &tcp_winit
};

/* For AF_INET6 aka /dev/tcp6 */
struct streamtab tcpinfov6 = {
	&tcp_rinitv6, &tcp_winit
};

/* Not initialized here; zero-filled as a file-scope object. */
sock_downcalls_t sock_tcp_downcalls;

/* Settable only in /etc/system.  Move to ndd? */
boolean_t tcp_icmp_source_quench = B_FALSE;

/*
 * Following assumes TPI alignment requirements stay along 32 bit
 * boundaries.  Rounds x up to the next multiple of sizeof (int32_t).
 */
#define	ROUNDUP32(x) \
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))

/* Template for response to info request.
*/
/* IPv4 template; ADDR_size is sizeof (sin_t). */
static struct T_info_ack tcp_g_t_info_ack = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin_t),		/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

/* IPv6 template; identical to the IPv4 one except ADDR_size is sizeof (sin6_t). */
static struct T_info_ack tcp_g_t_info_ack_v6 = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin6_t),	/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

/*
 * Time unit multipliers.  MS is the base unit (1), so every value built
 * from these is a count of milliseconds, of type long.
 */
#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

/* All bits set in a uint32_t, i.e. the largest uint32_t value. */
#define	PARAM_MAX (~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
/* Max of the above (assumes sizeof (ipha_t) <= sizeof (ip6_t)) */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)

/*
 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
 * layer header.  It has to be a multiple of 4.
*/
/* Fields are { min, max, value, name }, as in lcl_tcp_param_arr below. */
static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
/*
 * Expands to a member-access chain, so it is only meaningful after a
 * pointer dereference, e.g. ptr->tcps_wroff_xtra.
 */
#define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val

/*
 * All of these are alterable, within the min/max values given, at run time.
 * Note that the default value of "tcp_time_wait_interval" in this table is
 * one minute, not the four minutes (2 * MSL) suggested by the TCP spec.
 *
 * NOTE(review): entries are presumably referenced by position elsewhere;
 * append new entries rather than reordering -- confirm against the users
 * of this table.
 */
/* BEGIN CSTYLED */
static tcpparam_t	lcl_tcp_param_arr[] = {
 /*min		max		value		name */
 { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
 { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
 { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
 { 1,		1024,		1,		"tcp_conn_req_min" },
 { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
 { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
 { 0,		10,		0,		"tcp_debug" },
 { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
 { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
 { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
 { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
 { 1,		255,		64,		"tcp_ipv4_ttl"},
 { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
 { 0,		100,		10,		"tcp_maxpsz_multiplier" },
 { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
 { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
 { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
 { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
 { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
 { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
 { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
 { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
 { 0,		16,		0,		"tcp_snd_lowat_fraction" },
 { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
 { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
 { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
 { 0,		1,		0,		"tcp_ignore_path_mtu" },
 { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
 { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
 { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
 { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
 { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
/*
 * Question:  What default value should I set for tcp_strong_iss?
 */
 { 0,		2,		1,		"tcp_strong_iss"},
 { 0,		65536,		20,		"tcp_rtt_updates"},
 { 0,		1,		1,		"tcp_wscale_always"},
 { 0,		1,		0,		"tcp_tstamp_always"},
 { 0,		1,		1,		"tcp_tstamp_if_wscale"},
 { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
 { 0,		16,		2,		"tcp_deferred_acks_max"},
 { 1,		16384,		4,		"tcp_slow_start_after_idle"},
 { 1,		4,		4,		"tcp_slow_start_initial"},
 { 0,		2,		2,		"tcp_sack_permitted"},
 { 0,		1,		1,		"tcp_compression_enabled"},
 { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
 { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
 { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
 { 0,		1,		0,		"tcp_rev_src_routes"},
 { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
 { 0,		16,		8,		"tcp_local_dacks_max"},
 { 0,		2,		1,		"tcp_ecn_permitted"},
 { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
 { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
 { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
 { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
 { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
 { 0,		1,		0,		"tcp_dev_flow_ctl"},
};
/* END CSTYLED */

/*
 * Round up the value to the nearest mss.  A value that is already a
 * multiple of mss is returned unchanged.  mss must be non-zero, and the
 * arguments are evaluated more than once.
 */
#define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))

/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
1087 0 stevel */ 1088 0 stevel #define SET_ECT(tcp, iph) \ 1089 11042 Erik if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ 1090 0 stevel /* We need to clear the code point first. */ \ 1091 0 stevel ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ 1092 0 stevel ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ 1093 0 stevel } else { \ 1094 0 stevel ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ 1095 0 stevel ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ 1096 0 stevel } 1097 0 stevel 1098 0 stevel /* 1099 0 stevel * The format argument to pass to tcp_display(). 1100 0 stevel * DISP_PORT_ONLY means that the returned string has only port info. 1101 0 stevel * DISP_ADDR_AND_PORT means that the returned string also contains the 1102 0 stevel * remote and local IP address. 1103 0 stevel */ 1104 0 stevel #define DISP_PORT_ONLY 1 1105 0 stevel #define DISP_ADDR_AND_PORT 2 1106 0 stevel 1107 0 stevel #define IS_VMLOANED_MBLK(mp) \ 1108 0 stevel (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1109 0 stevel 1110 0 stevel uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1111 0 stevel 1112 0 stevel /* 1113 3448 dh155122 * Forces all connections to obey the value of the tcps_maxpsz_multiplier 1114 0 stevel * tunable settable via NDD. Otherwise, the per-connection behavior is 1115 11042 Erik * determined dynamically during tcp_set_destination(), which is the default. 1116 0 stevel */ 1117 0 stevel boolean_t tcp_static_maxpsz = B_FALSE; 1118 0 stevel 1119 3448 dh155122 /* Setable in /etc/system */ 1120 0 stevel /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1121 0 stevel uint32_t tcp_random_anon_port = 1; 1122 0 stevel 1123 0 stevel /* 1124 3104 jprakash * To reach to an eager in Q0 which can be dropped due to an incoming 1125 3104 jprakash * new SYN request when Q0 is full, a new doubly linked list is 1126 3104 jprakash * introduced. This list allows to select an eager from Q0 in O(1) time. 
1127 3104 jprakash * This is needed to avoid spending too much time walking through the 1128 3104 jprakash * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of 1129 3104 jprakash * this new list has to be a member of Q0. 1130 3104 jprakash * This list is headed by listener's tcp_t. When the list is empty, 1131 3104 jprakash * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, 1132 3104 jprakash * of listener's tcp_t point to listener's tcp_t itself. 1133 3104 jprakash * 1134 3104 jprakash * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager 1135 3104 jprakash * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. 1136 3104 jprakash * These macros do not affect the eager's membership to Q0. 1137 3104 jprakash */ 1138 3104 jprakash 1139 3104 jprakash 1140 3104 jprakash #define MAKE_DROPPABLE(listener, eager) \ 1141 3104 jprakash if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ 1142 3104 jprakash (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ 1143 3104 jprakash = (eager); \ 1144 3104 jprakash (eager)->tcp_eager_prev_drop_q0 = (listener); \ 1145 3104 jprakash (eager)->tcp_eager_next_drop_q0 = \ 1146 3104 jprakash (listener)->tcp_eager_next_drop_q0; \ 1147 3104 jprakash (listener)->tcp_eager_next_drop_q0 = (eager); \ 1148 3104 jprakash } 1149 3104 jprakash 1150 3104 jprakash #define MAKE_UNDROPPABLE(eager) \ 1151 3104 jprakash if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ 1152 3104 jprakash (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ 1153 3104 jprakash = (eager)->tcp_eager_prev_drop_q0; \ 1154 3104 jprakash (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ 1155 3104 jprakash = (eager)->tcp_eager_next_drop_q0; \ 1156 3104 jprakash (eager)->tcp_eager_prev_drop_q0 = NULL; \ 1157 3104 jprakash (eager)->tcp_eager_next_drop_q0 = NULL; \ 1158 3104 jprakash } 1159 3104 jprakash 1160 3104 jprakash /* 1161 0 stevel * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP 
receives more
 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
 * data, TCP will not respond with an ACK. RFC 793 requires that
 * TCP responds with an ACK for such a bogus ACK. By not following
 * the RFC, we prevent TCP from getting into an ACK storm if somehow
 * an attacker successfully spoofs an acceptable segment to our
 * peer; or when our peer is "confused."
 */
uint32_t tcp_drop_ack_unsent_cnt = 10;

/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
 */

void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
			    sa_family_t addr_family, uint8_t *laddrp,
			    in_port_t lport, void *args) = NULL;
void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
			    sa_family_t addr_family, uint8_t *laddrp,
			    in_port_t lport, void *args) = NULL;

int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
			    boolean_t is_outgoing,
			    sa_family_t addr_family,
			    uint8_t *laddrp, in_port_t lport,
			    uint8_t *faddrp, in_port_t fport,
			    void *args) = NULL;
void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
			    sa_family_t addr_family, uint8_t *laddrp,
			    in_port_t lport, uint8_t *faddrp,
			    in_port_t fport, void *args) = NULL;


/*
 * CL_INET_CONNECT(conn_t *connp, boolean_t is_outgoing, int err)
 *
 * Statement macro. Sets err to 0; then, if the cl_inet_connect2 hook is
 * installed and the connection's local address is not unspecified, calls
 * the hook with the connection's local/foreign addresses and ports and
 * stores the hook's return value in err. The address family passed to
 * the hook follows conn_ipversion. Note that connp is evaluated more
 * than once.
 */
#define	CL_INET_CONNECT(connp, is_outgoing, err) {		\
	(err) = 0;						\
	if (cl_inet_connect2 != NULL) {				\
		/*						\
		 * Running in cluster mode - register active connection	\
		 * information					\
		 */						\
		if ((connp)->conn_ipversion == IPV4_VERSION) {	\
			if ((connp)->conn_laddr_v4 != 0) {	\
				(err) = (*cl_inet_connect2)(	\
				    (connp)->conn_netstack->netstack_stackid,\
				    IPPROTO_TCP, is_outgoing, AF_INET,	\
				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
				    (in_port_t)(connp)->conn_lport,	\
				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
				    (in_port_t)(connp)->conn_fport, NULL); \
			}					\
		} else {					\
			if (!IN6_IS_ADDR_UNSPECIFIED(		\
			    &(connp)->conn_laddr_v6)) {		\
				(err) = (*cl_inet_connect2)(	\
				    (connp)->conn_netstack->netstack_stackid,\
				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
				    (in_port_t)(connp)->conn_lport,	\
				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
				    (in_port_t)(connp)->conn_fport, NULL); \
			}					\
		}						\
	}							\
}

/*
 * CL_INET_DISCONNECT(conn_t *connp)
 *
 * Statement macro. If the cl_inet_disconnect hook is installed and the
 * connection's local address is not unspecified, calls the hook with the
 * connection's local/foreign addresses and ports. The hook's result is
 * not captured. connp is evaluated more than once.
 */
#define	CL_INET_DISCONNECT(connp)	{			\
	if (cl_inet_disconnect != NULL) {			\
		/*						\
		 * Running in cluster mode - deregister active	\
		 * connection information			\
		 */						\
		if ((connp)->conn_ipversion == IPV4_VERSION) {	\
			if ((connp)->conn_laddr_v4 != 0) {	\
				(*cl_inet_disconnect)(		\
				    (connp)->conn_netstack->netstack_stackid,\
				    IPPROTO_TCP, AF_INET,	\
				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
				    (in_port_t)(connp)->conn_lport,	\
				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
				    (in_port_t)(connp)->conn_fport, NULL); \
			}					\
		} else {					\
			if (!IN6_IS_ADDR_UNSPECIFIED(		\
			    &(connp)->conn_laddr_v6)) {		\
				(*cl_inet_disconnect)(		\
				    (connp)->conn_netstack->netstack_stackid,\
				    IPPROTO_TCP, AF_INET6,	\
				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
				    (in_port_t)(connp)->conn_lport,	\
				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
				    (in_port_t)(connp)->conn_fport, NULL); \
			}					\
		}						\
	}							\
}

/*
 * Cluster networking hook for traversing current connection list.
 * This routine is used to extract the current list of live connections
 * which must continue to be dispatched to this node.
 */
int cl_tcp_walk_list(netstackid_t stack_id,
    int (*callback)(cl_tcp_info_t *, void *), void *arg);

static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
    void *arg, tcp_stack_t *tcps);

/*
 * Push a new receive threshold to the upper layer of a non-STREAMS
 * connection (IPCL_IS_NONSTR) through the su_set_proto_props upcall.
 * The requested value is capped at SOCKET_RECVHIWATER >> 3. For any
 * other kind of connection this function does nothing.
 */
static void
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
{
	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;

	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
		conn_t *connp = tcp->tcp_connp;
		struct sock_proto_props sopp;

		/*
		 * only increase rcvthresh up to default_threshold
		 */
		if (new_rcvthresh > default_threshold)
			new_rcvthresh = default_threshold;

		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
		sopp.sopp_rcvthresh = new_rcvthresh;

		(*connp->conn_upcalls->su_set_proto_props)
		    (connp->conn_upper_handle, &sopp);
	}
}
/*
 * Figure out the value of window scale option. Note that the rwnd is
 * ASSUMED to be rounded up to the nearest MSS before the calculation.
 * We cannot find the scale value and then do a round up of tcp_rwnd
 * because the scale value may not be correct after that.
 *
 * The result, stored in tcp_rcv_ws, is the smallest shift count (at most
 * TCP_MAX_WINSHIFT) for which tcp_rwnd shifted right by that count no
 * longer exceeds TCP_MAXWIN.
 *
 * Set the compiler flag to make this function inline.
 */
static void
tcp_set_ws_value(tcp_t *tcp)
{
	int i;
	uint32_t rwnd = tcp->tcp_rwnd;

	/* Halve rwnd until it fits or the maximum shift is reached. */
	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
	    i++, rwnd >>= 1)
		;
	tcp->tcp_rcv_ws = i;
}

/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 * It returns B_FALSE if it can't remove the connection from the list
 * as the connection has already been removed from the list due to an
 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
1319 3104 jprakash */ 1320 3104 jprakash static boolean_t 1321 0 stevel tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1322 0 stevel { 1323 0 stevel boolean_t locked = B_FALSE; 1324 0 stevel 1325 0 stevel if (tcp_time_wait == NULL) { 1326 0 stevel tcp_time_wait = *((tcp_squeue_priv_t **) 1327 0 stevel squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1328 0 stevel mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1329 0 stevel locked = B_TRUE; 1330 3448 dh155122 } else { 1331 3448 dh155122 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); 1332 0 stevel } 1333 0 stevel 1334 0 stevel if (tcp->tcp_time_wait_expire == 0) { 1335 0 stevel ASSERT(tcp->tcp_time_wait_next == NULL); 1336 0 stevel ASSERT(tcp->tcp_time_wait_prev == NULL); 1337 0 stevel if (locked) 1338 0 stevel mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1339 3104 jprakash return (B_FALSE); 1340 0 stevel } 1341 0 stevel ASSERT(TCP_IS_DETACHED(tcp)); 1342 0 stevel ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1343 0 stevel 1344 0 stevel if (tcp == tcp_time_wait->tcp_time_wait_head) { 1345 0 stevel ASSERT(tcp->tcp_time_wait_prev == NULL); 1346 0 stevel tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1347 0 stevel if (tcp_time_wait->tcp_time_wait_head != NULL) { 1348 0 stevel tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1349 0 stevel NULL; 1350 0 stevel } else { 1351 0 stevel tcp_time_wait->tcp_time_wait_tail = NULL; 1352 0 stevel } 1353 0 stevel } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1354 0 stevel ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1355 0 stevel ASSERT(tcp->tcp_time_wait_next == NULL); 1356 0 stevel tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1357 0 stevel ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1358 0 stevel tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1359 0 stevel } else { 1360 0 stevel ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1361 0 stevel 
ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1362 0 stevel tcp->tcp_time_wait_prev->tcp_time_wait_next = 1363 0 stevel tcp->tcp_time_wait_next; 1364 0 stevel tcp->tcp_time_wait_next->tcp_time_wait_prev = 1365 0 stevel tcp->tcp_time_wait_prev; 1366 0 stevel } 1367 0 stevel tcp->tcp_time_wait_next = NULL; 1368 0 stevel tcp->tcp_time_wait_prev = NULL; 1369 0 stevel tcp->tcp_time_wait_expire = 0; 1370 0 stevel 1371 0 stevel if (locked) 1372 0 stevel mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1373 3104 jprakash return (B_TRUE); 1374 0 stevel } 1375 0 stevel 1376 0 stevel /* 1377 0 stevel * Add a connection to the list of detached TIME_WAIT connections 1378 0 stevel * and set its time to expire. 1379 0 stevel */ 1380 0 stevel static void 1381 0 stevel tcp_time_wait_append(tcp_t *tcp) 1382 0 stevel { 1383 3448 dh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 1384 0 stevel tcp_squeue_priv_t *tcp_time_wait = 1385 0 stevel *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1386 5031 rs200217 SQPRIVATE_TCP)); 1387 0 stevel 1388 0 stevel tcp_timers_stop(tcp); 1389 0 stevel 1390 0 stevel /* Freed above */ 1391 0 stevel ASSERT(tcp->tcp_timer_tid == 0); 1392 0 stevel ASSERT(tcp->tcp_ack_tid == 0); 1393 0 stevel 1394 0 stevel /* must have happened at the time of detaching the tcp */ 1395 0 stevel ASSERT(tcp->tcp_ptpahn == NULL); 1396 0 stevel ASSERT(tcp->tcp_flow_stopped == 0); 1397 0 stevel ASSERT(tcp->tcp_time_wait_next == NULL); 1398 0 stevel ASSERT(tcp->tcp_time_wait_prev == NULL); 1399 0 stevel ASSERT(tcp->tcp_time_wait_expire == NULL); 1400 0 stevel ASSERT(tcp->tcp_listener == NULL); 1401 0 stevel 1402 0 stevel tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1403 0 stevel /* 1404 0 stevel * The value computed below in tcp->tcp_time_wait_expire may 1405 0 stevel * appear negative or wrap around. That is ok since our 1406 0 stevel * interest is only in the difference between the current lbolt 1407 0 stevel * value and tcp->tcp_time_wait_expire. 
But the value should not 1408 0 stevel * be zero, since it means the tcp is not in the TIME_WAIT list. 1409 0 stevel * The corresponding comparison in tcp_time_wait_collector() uses 1410 0 stevel * modular arithmetic. 1411 0 stevel */ 1412 0 stevel tcp->tcp_time_wait_expire += 1413 3448 dh155122 drv_usectohz(tcps->tcps_time_wait_interval * 1000); 1414 0 stevel if (tcp->tcp_time_wait_expire == 0) 1415 0 stevel tcp->tcp_time_wait_expire = 1; 1416 0 stevel 1417 0 stevel ASSERT(TCP_IS_DETACHED(tcp)); 1418 0 stevel ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1419 0 stevel ASSERT(tcp->tcp_time_wait_next == NULL); 1420 0 stevel ASSERT(tcp->tcp_time_wait_prev == NULL); 1421 3448 dh155122 TCP_DBGSTAT(tcps, tcp_time_wait); 1422 3448 dh155122 1423 0 stevel mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1424 0 stevel if (tcp_time_wait->tcp_time_wait_head == NULL) { 1425 0 stevel ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1426 0 stevel tcp_time_wait->tcp_time_wait_head = tcp; 1427 0 stevel } else { 1428 0 stevel ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1429 0 stevel ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1430 0 stevel TCPS_TIME_WAIT); 1431 0 stevel tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1432 0 stevel tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1433 0 stevel } 1434 0 stevel tcp_time_wait->tcp_time_wait_tail = tcp; 1435 0 stevel mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1436 0 stevel } 1437 0 stevel 1438 0 stevel /* ARGSUSED */ 1439 0 stevel void 1440 11042 Erik tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1441 0 stevel { 1442 0 stevel conn_t *connp = (conn_t *)arg; 1443 0 stevel tcp_t *tcp = connp->conn_tcp; 1444 3448 dh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 1445 0 stevel 1446 0 stevel ASSERT(tcp != NULL); 1447 0 stevel if (tcp->tcp_state == TCPS_CLOSED) { 1448 0 stevel return; 1449 0 stevel } 1450 0 stevel 1451 11042 Erik ASSERT((connp->conn_family == AF_INET && 
1452 11042 Erik connp->conn_ipversion == IPV4_VERSION) || 1453 11042 Erik (connp->conn_family == AF_INET6 && 1454 11042 Erik (connp->conn_ipversion == IPV4_VERSION || 1455 11042 Erik connp->conn_ipversion == IPV6_VERSION))); 1456 0 stevel ASSERT(!tcp->tcp_listener); 1457 0 stevel 1458 3448 dh155122 TCP_STAT(tcps, tcp_time_wait_reap); 1459 0 stevel ASSERT(TCP_IS_DETACHED(tcp)); 1460 0 stevel 1461 0 stevel /* 1462 0 stevel * Because they have no upstream client to rebind or tcp_close() 1463 0 stevel * them later, we axe the connection here and now. 1464 0 stevel */ 1465 0 stevel tcp_close_detached(tcp); 1466 0 stevel } 1467 0 stevel 1468 3448 dh155122 /* 1469 3448 dh155122 * Remove cached/latched IPsec references. 1470 3448 dh155122 */ 1471 3448 dh155122 void 1472 3448 dh155122 tcp_ipsec_cleanup(tcp_t *tcp) 1473 3448 dh155122 { 1474 3448 dh155122 conn_t *connp = tcp->tcp_connp; 1475 3448 dh155122 1476 5240 nordmark ASSERT(connp->conn_flags & IPCL_TCPCONN); 1477 5240 nordmark 1478 5240 nordmark if (connp->conn_latch != NULL) { 1479 11042 Erik IPLATCH_REFRELE(connp->conn_latch); 1480 5240 nordmark connp->conn_latch = NULL; 1481 11042 Erik } 1482 11042 Erik if (connp->conn_latch_in_policy != NULL) { 1483 11042 Erik IPPOL_REFRELE(connp->conn_latch_in_policy); 1484 11042 Erik connp->conn_latch_in_policy = NULL; 1485 11042 Erik } 1486 11042 Erik if (connp->conn_latch_in_action != NULL) { 1487 11042 Erik IPACT_REFRELE(connp->conn_latch_in_action); 1488 11042 Erik connp->conn_latch_in_action = NULL; 1489 5240 nordmark } 1490 5240 nordmark if (connp->conn_policy != NULL) { 1491 5240 nordmark IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 1492 5240 nordmark connp->conn_policy = NULL; 1493 3448 dh155122 } 1494 3448 dh155122 } 1495 3448 dh155122 1496 3448 dh155122 /* 1497 3448 dh155122 * Cleaup before placing on free list. 1498 3448 dh155122 * Disassociate from the netstack/tcp_stack_t since the freelist 1499 3448 dh155122 * is per squeue and not per netstack. 
 *
 * The tcp_t is zeroed here; only tcp_timercache, tcp_sack_info,
 * tcp_rsrv_mp and the tcp_connp back pointer are carried across the
 * bzero(). On return the conn_t is in the CONN_INCIPIENT state with
 * conn_flags reset to IPCL_TCPCONN and no netstack attached.
 */
void
tcp_cleanup(tcp_t *tcp)
{
	mblk_t		*mp;
	tcp_sack_info_t	*tcp_sack_info;
	conn_t		*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	netstack_t	*ns = tcps->tcps_netstack;
	mblk_t		*tcp_rsrv_mp;

	tcp_bind_hash_remove(tcp);

	/* Cleanup that which needs the netstack first */
	tcp_ipsec_cleanup(tcp);
	ixa_cleanup(connp->conn_ixa);

	/* Free the header template and drop every pointer into it. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
		tcp->tcp_ipha = NULL;
		tcp->tcp_ip6h = NULL;
		tcp->tcp_tcpha = NULL;
	}

	/* We clear any IP_OPTIONS and extension headers */
	ip_pkt_free(&connp->conn_xmit_ipp);

	tcp_free(tcp);

	/* Release any SSL context */
	if (tcp->tcp_kssl_ent != NULL) {
		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
		tcp->tcp_kssl_ent = NULL;
	}

	if (tcp->tcp_kssl_ctx != NULL) {
		kssl_release_ctx(tcp->tcp_kssl_ctx);
		tcp->tcp_kssl_ctx = NULL;
	}
	tcp->tcp_kssl_pending = B_FALSE;

	/*
	 * Since we will bzero the entire structure, we need to
	 * remove it and reinsert it in global hash list. We
	 * know the walkers can't get to this conn because we
	 * had set CONDEMNED flag earlier and checked reference
	 * under conn_lock so walker won't pick it and when we
	 * go the ipcl_globalhash_remove() below, no walker
	 * can get to it.
	 */
	ipcl_globalhash_remove(connp);

	/* Save some state */
	mp = tcp->tcp_timercache;

	tcp_sack_info = tcp->tcp_sack_info;
	tcp_rsrv_mp = tcp->tcp_rsrv_mp;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}
	ipcl_conn_cleanup(connp);
	connp->conn_flags = IPCL_TCPCONN;

	/*
	 * Now it is safe to decrement the reference counts.
	 * This might be the last reference on the netstack
	 * in which case it will cause the freeing of the IP Instance.
	 */
	connp->conn_netstack = NULL;
	connp->conn_ixa->ixa_ipst = NULL;
	netstack_rele(ns);
	ASSERT(tcps != NULL);
	tcp->tcp_tcps = NULL;

	bzero(tcp, sizeof (tcp_t));

	/* restore the state */
	tcp->tcp_timercache = mp;

	tcp->tcp_sack_info = tcp_sack_info;
	tcp->tcp_rsrv_mp = tcp_rsrv_mp;

	tcp->tcp_connp = connp;

	ASSERT(connp->conn_tcp == tcp);
	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	connp->conn_state_flags = CONN_INCIPIENT;
	ASSERT(connp->conn_proto == IPPROTO_TCP);
	ASSERT(connp->conn_ref == 1);
}

/*
 * Blows away all tcps whose TIME_WAIT has expired. List traversal
 * is done forwards from the head.
 * This walks all stack instances since
 * tcp_time_wait remains global across all stacks.
 *
 * arg is the squeue_t whose per-squeue TIME_WAIT state is processed.
 * The function re-arms itself with timeout_generic() before returning.
 * tcp_time_wait_lock is held on entry to each loop iteration and is
 * dropped around any call that cleans up or hands off a connection.
 */
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
	tcp_t *tcp;
	clock_t now;
	mblk_t *mp;
	conn_t *connp;
	kmutex_t *lock;
	boolean_t removed;

	squeue_t *sqp = (squeue_t *)arg;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	tcp_time_wait->tcp_time_wait_tid = 0;

	/*
	 * The head of the free list is flagged tcp_in_free_list at the
	 * bottom of this function. If that flag is still set on the
	 * current head, release every cached tcp_t on the list.
	 */
	if (tcp_time_wait->tcp_free_list != NULL &&
	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
		TCP_G_STAT(tcp_freelist_cleanup);
		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
			tcp->tcp_time_wait_next = NULL;
			tcp_time_wait->tcp_free_list_cnt--;
			ASSERT(tcp->tcp_tcps == NULL);
			CONN_DEC_REF(tcp->tcp_connp);
		}
		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
	}

	/*
	 * In order to reap time waits reliably, we should use a
	 * source of time that is not adjustable by the user -- hence
	 * the call to ddi_get_lbolt().
	 */
	now = ddi_get_lbolt();
	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
		/*
		 * Compare times using modular arithmetic, since
		 * lbolt can wrapover.
		 */
		if ((now - tcp->tcp_time_wait_expire) < 0) {
			break;
		}

		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
		ASSERT(removed);

		connp = tcp->tcp_connp;
		ASSERT(connp->conn_fanout != NULL);
		lock = &connp->conn_fanout->connf_lock;
		/*
		 * This is essentially a TW reclaim fast path optimization for
		 * performance where the timewait collector checks under the
		 * fanout lock (so that no one else can get access to the
		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
		 * the classifier hash list. If ref count is indeed 2, we can
		 * just remove the conn under the fanout lock and avoid
		 * cleaning up the conn under the squeue, provided that
		 * clustering callbacks are not enabled. If clustering is
		 * enabled, we need to make the clustering callback before
		 * setting the CONDEMNED flag and after dropping all locks and
		 * so we forego this optimization and fall back to the slow
		 * path. Also please see the comments in tcp_closei_local
		 * regarding the refcnt logic.
		 *
		 * Since we are holding the tcp_time_wait_lock, its better
		 * not to block on the fanout_lock because other connections
		 * can't add themselves to time_wait list. So we do a
		 * tryenter instead of mutex_enter.
		 */
		if (mutex_tryenter(lock)) {
			mutex_enter(&connp->conn_lock);
			if ((connp->conn_ref == 2) &&
			    (cl_inet_disconnect == NULL)) {
				ipcl_hash_remove_locked(connp,
				    connp->conn_fanout);
				/*
				 * Set the CONDEMNED flag now itself so that
				 * the refcnt cannot increase due to any
				 * walker.
				 */
				connp->conn_state_flags |= CONN_CONDEMNED;
				mutex_exit(lock);
				mutex_exit(&connp->conn_lock);
				if (tcp_time_wait->tcp_free_list_cnt <
				    tcp_free_list_max_cnt) {
					/* Add to head of tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_cleanup(tcp);
					ASSERT(connp->conn_latch == NULL);
					ASSERT(connp->conn_policy == NULL);
					ASSERT(tcp->tcp_tcps == NULL);
					ASSERT(connp->conn_netstack == NULL);

					mutex_enter(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp->tcp_time_wait_next =
					    tcp_time_wait->tcp_free_list;
					tcp_time_wait->tcp_free_list = tcp;
					tcp_time_wait->tcp_free_list_cnt++;
					/* Lock is already re-held here. */
					continue;
				} else {
					/* Do not add to tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_bind_hash_remove(tcp);
					ixa_cleanup(tcp->tcp_connp->conn_ixa);
					tcp_ipsec_cleanup(tcp);
					CONN_DEC_REF(tcp->tcp_connp);
				}
			} else {
				/*
				 * Slow path: take a reference and hand the
				 * conn to its squeue for tcp_timewait_output.
				 */
				CONN_INC_REF_LOCKED(connp);
				mutex_exit(lock);
				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
				mutex_exit(&connp->conn_lock);
				/*
				 * We can reuse the closemp here since conn has
				 * detached (otherwise we wouldn't even be in
				 * time_wait list). tcp_closemp_used can safely
				 * be changed without taking a lock as no other
				 * thread can concurrently access it at this
				 * point in the connection lifecycle.
				 */

				if (tcp->tcp_closemp.b_prev == NULL)
					tcp->tcp_closemp_used = B_TRUE;
				else
					cmn_err(CE_PANIC,
					    "tcp_timewait_collector: "
					    "concurrent use of tcp_closemp: "
					    "connp %p tcp %p\n", (void *)connp,
					    (void *)tcp);

				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
				mp = &tcp->tcp_closemp;
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
				    tcp_timewait_output, connp, NULL,
				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
			}
		} else {
			/*
			 * Could not get the fanout lock without blocking:
			 * take a reference and use the squeue path.
			 */
			mutex_enter(&connp->conn_lock);
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
			mutex_exit(&connp->conn_lock);
			/*
			 * We can reuse the closemp here since conn has
			 * detached (otherwise we wouldn't even be in
			 * time_wait list). tcp_closemp_used can safely
			 * be changed without taking a lock as no other
			 * thread can concurrently access it at this
			 * point in the connection lifecycle.
			 */

			if (tcp->tcp_closemp.b_prev == NULL)
				tcp->tcp_closemp_used = B_TRUE;
			else
				cmn_err(CE_PANIC, "tcp_timewait_collector: "
				    "concurrent use of tcp_closemp: "
				    "connp %p tcp %p\n", (void *)connp,
				    (void *)tcp);

			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
			mp = &tcp->tcp_closemp;
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    tcp_timewait_output, connp, NULL,
			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
		}
		/* Every non-continue path above dropped the list lock. */
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	}

	/* Flag the current free-list head for the check at the top. */
	if (tcp_time_wait->tcp_free_list != NULL)
		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;

	tcp_time_wait->tcp_time_wait_tid =
	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
	    CALLOUT_FLAG_ROUNDUP);
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
 * on the acceptor STREAM and processed in tcp_accept_common().
 * Read the block comment on top of tcp_input_listener().
1792 0 stevel */ 1793 0 stevel static void 1794 8348 Eric tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1795 0 stevel { 1796 11042 Erik tcp_t *acceptor; 1797 11042 Erik tcp_t *eager; 1798 11042 Erik tcp_t *tcp; 1799 0 stevel struct T_conn_res *tcr; 1800 0 stevel t_uscalar_t acceptor_id; 1801 0 stevel t_scalar_t seqnum; 1802 11042 Erik mblk_t *discon_mp = NULL; 1803 11042 Erik mblk_t *ok_mp; 1804 11042 Erik mblk_t *mp1; 1805 3448 dh155122 tcp_stack_t *tcps = listener->tcp_tcps; 1806 11042 Erik conn_t *econnp; 1807 0 stevel 1808 0 stevel if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1809 0 stevel tcp_err_ack(listener, mp, TPROTO, 0); 1810 0 stevel return; 1811 0 stevel } 1812 0 stevel tcr = (struct T_conn_res *)mp->b_rptr; 1813 0 stevel 1814 0 stevel /* 1815 0 stevel * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1816 0 stevel * read side queue of the streams device underneath us i.e. the 1817 0 stevel * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1818 0 stevel * look it up in the queue_hash. Under LP64 it sends down the 1819 0 stevel * minor_t of the accepting endpoint. 1820 0 stevel * 1821 0 stevel * Once the acceptor/eager are modified (in tcp_accept_swap) the 1822 0 stevel * fanout hash lock is held. 1823 0 stevel * This prevents any thread from entering the acceptor queue from 1824 0 stevel * below (since it has not been hard bound yet i.e. any inbound 1825 11042 Erik * packets will arrive on the listener conn_t and 1826 11042 Erik * go through the classifier). 1827 0 stevel * The CONN_INC_REF will prevent the acceptor from closing. 1828 0 stevel * 1829 0 stevel * XXX It is still possible for a tli application to send down data 1830 0 stevel * on the accepting stream while another thread calls t_accept. 1831 0 stevel * This should not be a problem for well-behaved applications since 1832 0 stevel * the T_OK_ACK is sent after the queue swapping is completed. 
1833 0 stevel * 1834 0 stevel * If the accepting fd is the same as the listening fd, avoid 1835 0 stevel * queue hash lookup since that will return an eager listener in a 1836 0 stevel * already established state. 1837 0 stevel */ 1838 0 stevel acceptor_id = tcr->ACCEPTOR_id; 1839 0 stevel mutex_enter(&listener->tcp_eager_lock); 1840 0 stevel if (listener->tcp_acceptor_id == acceptor_id) { 1841 0 stevel eager = listener->tcp_eager_next_q; 1842 0 stevel /* only count how many T_CONN_INDs so don't count q0 */ 1843 0 stevel if ((listener->tcp_conn_req_cnt_q != 1) || 1844 0 stevel (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1845 0 stevel mutex_exit(&listener->tcp_eager_lock); 1846 0 stevel tcp_err_ack(listener, mp, TBADF, 0); 1847 0 stevel return; 1848 0 stevel } 1849 0 stevel if (listener->tcp_conn_req_cnt_q0 != 0) { 1850 0 stevel /* Throw away all the eagers on q0. */ 1851 0 stevel tcp_eager_cleanup(listener, 1); 1852 0 stevel } 1853 0 stevel if (listener->tcp_syn_defense) { 1854 0 stevel listener->tcp_syn_defense = B_FALSE; 1855 0 stevel if (listener->tcp_ip_addr_cache != NULL) { 1856 0 stevel kmem_free(listener->tcp_ip_addr_cache, 1857 0 stevel IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1858 0 stevel listener->tcp_ip_addr_cache = NULL; 1859 0 stevel } 1860 0 stevel } 1861 0 stevel /* 1862 0 stevel * Transfer tcp_conn_req_max to the eager so that when 1863 0 stevel * a disconnect occurs we can revert the endpoint to the 1864 0 stevel * listen state. 1865 0 stevel */ 1866 0 stevel eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1867 0 stevel ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1868 0 stevel /* 1869 0 stevel * Get a reference on the acceptor just like the 1870 0 stevel * tcp_acceptor_hash_lookup below. 
1871 0 stevel */ 1872 0 stevel acceptor = listener; 1873 0 stevel CONN_INC_REF(acceptor->tcp_connp); 1874 0 stevel } else { 1875 3448 dh155122 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1876 0 stevel if (acceptor == NULL) { 1877 11042 Erik if (listener->tcp_connp->conn_debug) { 1878 741 masputra (void) strlog(TCP_MOD_ID, 0, 1, 1879 0 stevel SL_ERROR|SL_TRACE, 1880 0 stevel "tcp_accept: did not find acceptor 0x%x\n", 1881 0 stevel acceptor_id); 1882 0 stevel } 1883 0 stevel mutex_exit(&listener->tcp_eager_lock); 1884 0 stevel tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1885 0 stevel return; 1886 0 stevel } 1887 0 stevel /* 1888 0 stevel * Verify acceptor state. The acceptable states for an acceptor 1889 0 stevel * include TCPS_IDLE and TCPS_BOUND. 1890 0 stevel */ 1891 0 stevel switch (acceptor->tcp_state) { 1892 0 stevel case TCPS_IDLE: 1893 0 stevel /* FALLTHRU */ 1894 0 stevel case TCPS_BOUND: 1895 0 stevel break; 1896 0 stevel default: 1897 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1898 0 stevel mutex_exit(&listener->tcp_eager_lock); 1899 0 stevel tcp_err_ack(listener, mp, TOUTSTATE, 0); 1900 0 stevel return; 1901 0 stevel } 1902 0 stevel } 1903 0 stevel 1904 0 stevel /* The listener must be in TCPS_LISTEN */ 1905 0 stevel if (listener->tcp_state != TCPS_LISTEN) { 1906 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1907 0 stevel mutex_exit(&listener->tcp_eager_lock); 1908 0 stevel tcp_err_ack(listener, mp, TOUTSTATE, 0); 1909 0 stevel return; 1910 0 stevel } 1911 0 stevel 1912 0 stevel /* 1913 0 stevel * Rendezvous with an eager connection request packet hanging off 1914 0 stevel * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1915 0 stevel * tcp structure when the connection packet arrived in 1916 11042 Erik * tcp_input_listener(). 
1917 0 stevel */ 1918 0 stevel seqnum = tcr->SEQ_number; 1919 0 stevel eager = listener; 1920 0 stevel do { 1921 0 stevel eager = eager->tcp_eager_next_q; 1922 0 stevel if (eager == NULL) { 1923 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1924 0 stevel mutex_exit(&listener->tcp_eager_lock); 1925 0 stevel tcp_err_ack(listener, mp, TBADSEQ, 0); 1926 0 stevel return; 1927 0 stevel } 1928 0 stevel } while (eager->tcp_conn_req_seqnum != seqnum); 1929 0 stevel mutex_exit(&listener->tcp_eager_lock); 1930 0 stevel 1931 0 stevel /* 1932 0 stevel * At this point, both acceptor and listener have 2 ref 1933 0 stevel * that they begin with. Acceptor has one additional ref 1934 0 stevel * we placed in lookup while listener has 3 additional 1935 0 stevel * ref for being behind the squeue (tcp_accept() is 1936 0 stevel * done on listener's squeue); being in classifier hash; 1937 0 stevel * and eager's ref on listener. 1938 0 stevel */ 1939 0 stevel ASSERT(listener->tcp_connp->conn_ref >= 5); 1940 0 stevel ASSERT(acceptor->tcp_connp->conn_ref >= 3); 1941 0 stevel 1942 0 stevel /* 1943 0 stevel * The eager at this point is set in its own squeue and 1944 0 stevel * could easily have been killed (tcp_accept_finish will 1945 0 stevel * deal with that) because of a TH_RST so we can only 1946 0 stevel * ASSERT for a single ref. 1947 0 stevel */ 1948 0 stevel ASSERT(eager->tcp_connp->conn_ref >= 1); 1949 0 stevel 1950 11042 Erik /* 1951 11042 Erik * Pre allocate the discon_ind mblk also. tcp_accept_finish will 1952 11042 Erik * use it if something failed. 
1953 11042 Erik */ 1954 11042 Erik discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 1955 11042 Erik sizeof (struct stroptions)), BPRI_HI); 1956 11042 Erik if (discon_mp == NULL) { 1957 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1958 0 stevel CONN_DEC_REF(eager->tcp_connp); 1959 0 stevel tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1960 0 stevel return; 1961 0 stevel } 1962 11042 Erik 1963 11042 Erik econnp = eager->tcp_connp; 1964 11042 Erik 1965 11042 Erik /* Hold a copy of mp, in case reallocb fails */ 1966 0 stevel if ((mp1 = copymsg(mp)) == NULL) { 1967 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1968 0 stevel CONN_DEC_REF(eager->tcp_connp); 1969 11042 Erik freemsg(discon_mp); 1970 0 stevel tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 1971 0 stevel return; 1972 0 stevel } 1973 0 stevel 1974 0 stevel tcr = (struct T_conn_res *)mp1->b_rptr; 1975 0 stevel 1976 0 stevel /* 1977 0 stevel * This is an expanded version of mi_tpi_ok_ack_alloc() 1978 0 stevel * which allocates a larger mblk and appends the new 1979 0 stevel * local address to the ok_ack. The address is copied by 1980 0 stevel * soaccept() for getsockname(). 1981 0 stevel */ 1982 0 stevel { 1983 0 stevel int extra; 1984 0 stevel 1985 11042 Erik extra = (econnp->conn_family == AF_INET) ? 1986 0 stevel sizeof (sin_t) : sizeof (sin6_t); 1987 0 stevel 1988 0 stevel /* 1989 0 stevel * Try to re-use mp, if possible. Otherwise, allocate 1990 0 stevel * an mblk and return it as ok_mp. In any case, mp 1991 0 stevel * is no longer usable upon return. 
1992 0 stevel */ 1993 0 stevel if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 1994 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 1995 0 stevel CONN_DEC_REF(eager->tcp_connp); 1996 11042 Erik freemsg(discon_mp); 1997 0 stevel /* Original mp has been freed by now, so use mp1 */ 1998 0 stevel tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 1999 0 stevel return; 2000 0 stevel } 2001 0 stevel 2002 0 stevel mp = NULL; /* We should never use mp after this point */ 2003 0 stevel 2004 0 stevel switch (extra) { 2005 0 stevel case sizeof (sin_t): { 2006 11042 Erik sin_t *sin = (sin_t *)ok_mp->b_wptr; 2007 11042 Erik 2008 11042 Erik ok_mp->b_wptr += extra; 2009 11042 Erik sin->sin_family = AF_INET; 2010 11042 Erik sin->sin_port = econnp->conn_lport; 2011 11042 Erik sin->sin_addr.s_addr = econnp->conn_laddr_v4; 2012 11042 Erik break; 2013 11042 Erik } 2014 0 stevel case sizeof (sin6_t): { 2015 11042 Erik sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2016 11042 Erik 2017 11042 Erik ok_mp->b_wptr += extra; 2018 11042 Erik sin6->sin6_family = AF_INET6; 2019 11042 Erik sin6->sin6_port = econnp->conn_lport; 2020 11042 Erik sin6->sin6_addr = econnp->conn_laddr_v6; 2021 11042 Erik sin6->sin6_flowinfo = econnp->conn_flowinfo; 2022 11042 Erik if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 2023 11042 Erik (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 2024 11042 Erik sin6->sin6_scope_id = 2025 11042 Erik econnp->conn_ixa->ixa_scopeid; 2026 11042 Erik } else { 2027 4379 ja97890 sin6->sin6_scope_id = 0; 2028 11042 Erik } 2029 11042 Erik sin6->__sin6_src_id = 0; 2030 11042 Erik break; 2031 11042 Erik } 2032 0 stevel default: 2033 0 stevel break; 2034 0 stevel } 2035 0 stevel ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2036 0 stevel } 2037 0 stevel 2038 0 stevel /* 2039 0 stevel * If there are no options we know that the T_CONN_RES will 2040 0 stevel * succeed. 
However, we can't send the T_OK_ACK upstream until 2041 0 stevel * the tcp_accept_swap is done since it would be dangerous to 2042 0 stevel * let the application start using the new fd prior to the swap. 2043 0 stevel */ 2044 11042 Erik tcp_accept_swap(listener, acceptor, eager); 2045 0 stevel 2046 0 stevel /* 2047 0 stevel * tcp_accept_swap unlinks eager from listener but does not drop 2048 0 stevel * the eager's reference on the listener. 2049 0 stevel */ 2050 0 stevel ASSERT(eager->tcp_listener == NULL); 2051 0 stevel ASSERT(listener->tcp_connp->conn_ref >= 5); 2052 0 stevel 2053 0 stevel /* 2054 0 stevel * The eager is now associated with its own queue. Insert in 2055 0 stevel * the hash so that the connection can be reused for a future 2056 0 stevel * T_CONN_RES. 2057 0 stevel */ 2058 0 stevel tcp_acceptor_hash_insert(acceptor_id, eager); 2059 0 stevel 2060 0 stevel /* 2061 0 stevel * We now do the processing of options with T_CONN_RES. 2062 0 stevel * We delay till now since we wanted to have queue to pass to 2063 0 stevel * option processing routines that points back to the right 2064 0 stevel * instance structure which does not happen until after 2065 0 stevel * tcp_accept_swap(). 2066 0 stevel * 2067 0 stevel * Note: 2068 0 stevel * The sanity of the logic here assumes that whatever options 2069 0 stevel * are appropriate to inherit from listner=>eager are done 2070 0 stevel * before this point, and whatever were to be overridden (or not) 2071 0 stevel * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2072 0 stevel * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2073 0 stevel * before its ACCEPTOR_id comes down in T_CONN_RES ] 2074 0 stevel * This may not be true at this point in time but can be fixed 2075 0 stevel * independently. This option processing code starts with 2076 0 stevel * the instantiated acceptor instance and the final queue at 2077 0 stevel * this point. 
2078 0 stevel */ 2079 0 stevel 2080 0 stevel if (tcr->OPT_length != 0) { 2081 0 stevel /* Options to process */ 2082 0 stevel int t_error = 0; 2083 0 stevel int sys_error = 0; 2084 0 stevel int do_disconnect = 0; 2085 0 stevel 2086 0 stevel if (tcp_conprim_opt_process(eager, mp1, 2087 0 stevel &do_disconnect, &t_error, &sys_error) < 0) { 2088 0 stevel eager->tcp_accept_error = 1; 2089 0 stevel if (do_disconnect) { 2090 0 stevel /* 2091 0 stevel * An option failed which does not allow 2092 0 stevel * connection to be accepted. 2093 0 stevel * 2094 0 stevel * We allow T_CONN_RES to succeed and 2095 0 stevel * put a T_DISCON_IND on the eager queue. 2096 0 stevel */ 2097 0 stevel ASSERT(t_error == 0 && sys_error == 0); 2098 0 stevel eager->tcp_send_discon_ind = 1; 2099 0 stevel } else { 2100 0 stevel ASSERT(t_error != 0); 2101 0 stevel freemsg(ok_mp); 2102 0 stevel /* 2103 0 stevel * Original mp was either freed or set 2104 0 stevel * to ok_mp above, so use mp1 instead. 2105 0 stevel */ 2106 0 stevel tcp_err_ack(listener, mp1, t_error, sys_error); 2107 0 stevel goto finish; 2108 0 stevel } 2109 0 stevel } 2110 0 stevel /* 2111 0 stevel * Most likely success in setting options (except if 2112 0 stevel * eager->tcp_send_discon_ind set). 2113 0 stevel * mp1 option buffer represented by OPT_length/offset 2114 0 stevel * potentially modified and contains results of setting 2115 0 stevel * options at this point 2116 0 stevel */ 2117 0 stevel } 2118 0 stevel 2119 0 stevel /* We no longer need mp1, since all options processing has passed */ 2120 0 stevel freemsg(mp1); 2121 0 stevel 2122 11042 Erik putnext(listener->tcp_connp->conn_rq, ok_mp); 2123 0 stevel 2124 0 stevel mutex_enter(&listener->tcp_eager_lock); 2125 0 stevel if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2126 0 stevel tcp_t *tail; 2127 0 stevel mblk_t *conn_ind; 2128 0 stevel 2129 0 stevel /* 2130 0 stevel * This path should not be executed if listener and 2131 0 stevel * acceptor streams are the same. 
2132 0 stevel */ 2133 0 stevel ASSERT(listener != acceptor); 2134 0 stevel 2135 0 stevel tcp = listener->tcp_eager_prev_q0; 2136 0 stevel /* 2137 0 stevel * listener->tcp_eager_prev_q0 points to the TAIL of the 2138 0 stevel * deferred T_conn_ind queue. We need to get to the head of 2139 0 stevel * the queue in order to send up T_conn_ind the same order as 2140 0 stevel * how the 3WHS is completed. 2141 0 stevel */ 2142 0 stevel while (tcp != listener) { 2143 0 stevel if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2144 0 stevel break; 2145 0 stevel else 2146 0 stevel tcp = tcp->tcp_eager_prev_q0; 2147 0 stevel } 2148 0 stevel ASSERT(tcp != listener); 2149 0 stevel conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2150 0 stevel ASSERT(conn_ind != NULL); 2151 0 stevel tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2152 0 stevel 2153 0 stevel /* Move from q0 to q */ 2154 0 stevel ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2155 0 stevel listener->tcp_conn_req_cnt_q0--; 2156 0 stevel listener->tcp_conn_req_cnt_q++; 2157 0 stevel tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2158 0 stevel tcp->tcp_eager_prev_q0; 2159 0 stevel tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2160 0 stevel tcp->tcp_eager_next_q0; 2161 0 stevel tcp->tcp_eager_prev_q0 = NULL; 2162 0 stevel tcp->tcp_eager_next_q0 = NULL; 2163 0 stevel tcp->tcp_conn_def_q0 = B_FALSE; 2164 3104 jprakash 2165 3104 jprakash /* Make sure the tcp isn't in the list of droppables */ 2166 3104 jprakash ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 2167 3104 jprakash tcp->tcp_eager_prev_drop_q0 == NULL); 2168 0 stevel 2169 0 stevel /* 2170 0 stevel * Insert at end of the queue because sockfs sends 2171 0 stevel * down T_CONN_RES in chronological order. Leaving 2172 0 stevel * the older conn indications at front of the queue 2173 0 stevel * helps reducing search time. 
2174 0 stevel */ 2175 0 stevel tail = listener->tcp_eager_last_q; 2176 0 stevel if (tail != NULL) 2177 0 stevel tail->tcp_eager_next_q = tcp; 2178 0 stevel else 2179 0 stevel listener->tcp_eager_next_q = tcp; 2180 0 stevel listener->tcp_eager_last_q = tcp; 2181 0 stevel tcp->tcp_eager_next_q = NULL; 2182 0 stevel mutex_exit(&listener->tcp_eager_lock); 2183 11042 Erik putnext(tcp->tcp_connp->conn_rq, conn_ind); 2184 0 stevel } else { 2185 0 stevel mutex_exit(&listener->tcp_eager_lock); 2186 0 stevel } 2187 0 stevel 2188 0 stevel /* 2189 0 stevel * Done with the acceptor - free it 2190 0 stevel * 2191 0 stevel * Note: from this point on, no access to listener should be made 2192 0 stevel * as listener can be equal to acceptor. 2193 0 stevel */ 2194 0 stevel finish: 2195 0 stevel ASSERT(acceptor->tcp_detached); 2196 11042 Erik acceptor->tcp_connp->conn_rq = NULL; 2197 8348 Eric ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 2198 11042 Erik acceptor->tcp_connp->conn_wq = NULL; 2199 0 stevel (void) tcp_clean_death(acceptor, 0, 2); 2200 0 stevel CONN_DEC_REF(acceptor->tcp_connp); 2201 0 stevel 2202 0 stevel /* 2203 11042 Erik * We pass discon_mp to tcp_accept_finish to get on the right squeue. 2204 11042 Erik * 2205 11042 Erik * It will update the setting for sockfs/stream head and also take 2206 11042 Erik * care of any data that arrived before accept() wad called. 2207 11042 Erik * In case we already received a FIN then tcp_accept_finish will send up 2208 11042 Erik * the ordrel. It will also send up a window update if the window 2209 0 stevel * has opened up. 2210 0 stevel */ 2211 0 stevel 2212 0 stevel /* 2213 0 stevel * XXX: we currently have a problem if XTI application closes the 2214 0 stevel * acceptor stream in between. This problem exists in on10-gate also 2215 0 stevel * and is well know but nothing can be done short of major rewrite 2216 0 stevel * to fix it. 
Now it is possible to take care of it by assigning TLI/XTI
	 * eager same squeue as listener (we can distinguish non socket
	 * listeners at the time of handling a SYN in tcp_input_listener)
	 * and do most of the work that tcp_accept_finish does here itself
	 * and then get behind the acceptor squeue to access the acceptor
	 * queue.
	 */
	/*
	 * We already have a ref on tcp so no need to do one before squeue_enter
	 */
	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
	    SQTAG_TCP_ACCEPT_FINISH);
}

/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_tli_accept().
 *
 * Parameters:
 *	listener - endpoint whose eager list currently links `eager'; its
 *		   tcp_eager_lock is taken here around the unlink.
 *	acceptor - non-detached, non-socket endpoint whose read/write queues,
 *		   device, minor arena, credential, creator pid, zone and MAC
 *		   mode settings are handed over to the eager.  It is marked
 *		   detached by this routine.
 *	eager	 - detached, non-socket connection that takes over the
 *		   acceptor's stream and is marked non-detached on return.
 *
 * On return the eager's conn_t has gained one reference and the acceptor's
 * conn_t has lost one.
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t	*econnp, *aconnp;

	/* Until the swap the eager still shares the listener's read queue. */
	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
	 * acceptor stream refer to econnp we automatically get that label.
	 */

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * Point the eager's conn_t at the acceptor's queue pair and make
	 * both queues' q_ptr refer back to the eager's conn_t.
	 */
	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;
	econnp->conn_rq = aconnp->conn_rq;
	econnp->conn_wq = aconnp->conn_wq;
	econnp->conn_rq->q_ptr = econnp;
	econnp->conn_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/*
	 * Release whatever credential the eager held and take over the
	 * acceptor's credential pointer; the acceptor's field is cleared so
	 * only econnp refers to it through conn_cred afterwards.
	 */
	if (econnp->conn_cred != NULL)
		crfree(econnp->conn_cred);
	econnp->conn_cred = aconnp->conn_cred;
	aconnp->conn_cred = NULL;
	econnp->conn_cpid = aconnp->conn_cpid;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	/* The eager adopts the acceptor's zone identity... */
	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;

	/* ...and its MAC mode; the acceptor is reset to the default mode. */
	econnp->conn_mac_mode = aconnp->conn_mac_mode;
	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}


/*
 * Adapt to the information, such as rtt and rtt_sd, provided from the
 * DCE and IRE maintained by IP.
 *
 * Checks for multicast and broadcast destination address.
 * Returns zero if ok; an errno on failure.
 *
 * Note that the MSS calculation here is based on the info given in
 * the DCE and IRE. We do not do any calculation based on TCP options.
They 2332 11042 Erik * will be handled in tcp_input_data() when TCP knows which options to use. 2333 0 stevel * 2334 0 stevel * Note on how TCP gets its parameters for a connection. 2335 0 stevel * 2336 0 stevel * When a tcp_t structure is allocated, it gets all the default parameters. 2337 11042 Erik * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, 2338 0 stevel * spipe, rpipe, ... from the route metrics. Route metric overrides the 2339 7502 aruna * default. 2340 0 stevel * 2341 11042 Erik * An incoming SYN with a multicast or broadcast destination address is dropped 2342 11042 Erik * in ip_fanout_v4/v6. 2343 0 stevel * 2344 0 stevel * An incoming SYN with a multicast or broadcast source address is always 2345 11042 Erik * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in 2346 11042 Erik * conn_connect. 2347 11042 Erik * The same logic in tcp_set_destination also serves to 2348 0 stevel * reject an attempt to connect to a broadcast or multicast (destination) 2349 0 stevel * address. 2350 0 stevel */ 2351 0 stevel static int 2352 11042 Erik tcp_set_destination(tcp_t *tcp) 2353 11042 Erik { 2354 0 stevel uint32_t mss_max; 2355 0 stevel uint32_t mss; 2356 0 stevel boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2357 0 stevel conn_t *connp = tcp->tcp_connp; 2358 11042 Erik tcp_stack_t *tcps = tcp->tcp_tcps; 2359 11042 Erik iulp_t uinfo; 2360 11042 Erik int error; 2361 11042 Erik uint32_t flags; 2362 11042 Erik 2363 11042 Erik flags = IPDF_LSO | IPDF_ZCOPY; 2364 11042 Erik /* 2365 11042 Erik * Make sure we have a dce for the destination to avoid dce_ident 2366 11042 Erik * contention for connected sockets. 
2367 11042 Erik */ 2368 11042 Erik flags |= IPDF_UNIQUE_DCE; 2369 11042 Erik 2370 11042 Erik if (!tcps->tcps_ignore_path_mtu) 2371 11042 Erik connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 2372 11042 Erik 2373 11042 Erik /* Use conn_lock to satify ASSERT; tcp is already serialized */ 2374 11042 Erik mutex_enter(&connp->conn_lock); 2375 11042 Erik error = conn_connect(connp, &uinfo, flags); 2376 11042 Erik mutex_exit(&connp->conn_lock); 2377 11042 Erik if (error != 0) 2378 11042 Erik return (error); 2379 11042 Erik 2380 11042 Erik error = tcp_build_hdrs(tcp); 2381 11042 Erik if (error != 0) 2382 11042 Erik return (error); 2383 11042 Erik 2384 11042 Erik tcp->tcp_localnet = uinfo.iulp_localnet; 2385 11042 Erik 2386 11042 Erik if (uinfo.iulp_rtt != 0) { 2387 11042 Erik clock_t rto; 2388 11042 Erik 2389 11042 Erik tcp->tcp_rtt_sa = uinfo.iulp_rtt; 2390 11042 Erik tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; 2391 11042 Erik rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2392 11042 Erik tcps->tcps_rexmit_interval_extra + 2393 11042 Erik (tcp->tcp_rtt_sa >> 5); 2394 11042 Erik 2395 11042 Erik if (rto > tcps->tcps_rexmit_interval_max) { 2396 11042 Erik tcp->tcp_rto = tcps->tcps_rexmit_interval_max; 2397 11042 Erik } else if (rto < tcps->tcps_rexmit_interval_min) { 2398 11042 Erik tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 2399 11042 Erik } else { 2400 11042 Erik tcp->tcp_rto = rto; 2401 11042 Erik } 2402 11042 Erik } 2403 11042 Erik if (uinfo.iulp_ssthresh != 0) 2404 11042 Erik tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; 2405 11042 Erik else 2406 11042 Erik tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2407 11042 Erik if (uinfo.iulp_spipe > 0) { 2408 11042 Erik connp->conn_sndbuf = MIN(uinfo.iulp_spipe, 2409 11042 Erik tcps->tcps_max_buf); 2410 11042 Erik if (tcps->tcps_snd_lowat_fraction != 0) { 2411 11042 Erik connp->conn_sndlowat = connp->conn_sndbuf / 2412 11042 Erik tcps->tcps_snd_lowat_fraction; 2413 11042 Erik } 2414 11042 Erik (void) tcp_maxpsz_set(tcp, B_TRUE); 
2415 11042 Erik } 2416 11042 Erik /* 2417 11042 Erik * Note that up till now, acceptor always inherits receive 2418 11042 Erik * window from the listener. But if there is a metrics 2419 11042 Erik * associated with a host, we should use that instead of 2420 11042 Erik * inheriting it from listener. Thus we need to pass this 2421 11042 Erik * info back to the caller. 2422 11042 Erik */ 2423 11042 Erik if (uinfo.iulp_rpipe > 0) { 2424 11042 Erik tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, 2425 11042 Erik tcps->tcps_max_buf); 2426 11042 Erik } 2427 11042 Erik 2428 11042 Erik if (uinfo.iulp_rtomax > 0) { 2429 11042 Erik tcp->tcp_second_timer_threshold = 2430 11042 Erik uinfo.iulp_rtomax; 2431 11042 Erik } 2432 11042 Erik 2433 11042 Erik /* 2434 11042 Erik * Use the metric option settings, iulp_tstamp_ok and 2435 11042 Erik * iulp_wscale_ok, only for active open. What this means 2436 11042 Erik * is that if the other side uses timestamp or window 2437 11042 Erik * scale option, TCP will also use those options. That 2438 11042 Erik * is for passive open. If the application sets a 2439 11042 Erik * large window, window scale is enabled regardless of 2440 11042 Erik * the value in iulp_wscale_ok. This is the behavior 2441 11042 Erik * since 2.6. So we keep it. 2442 11042 Erik * The only case left in passive open processing is the 2443 11042 Erik * check for SACK. 2444 11042 Erik * For ECN, it should probably be like SACK. But the 2445 11042 Erik * current value is binary, so we treat it like the other 2446 11042 Erik * cases. The metric only controls active open.For passive 2447 11042 Erik * open, the ndd param, tcp_ecn_permitted, controls the 2448 11042 Erik * behavior. 2449 11042 Erik */ 2450 11042 Erik if (!tcp_detached) { 2451 11042 Erik /* 2452 11042 Erik * The if check means that the following can only 2453 11042 Erik * be turned on by the metrics only IRE, but not off. 
2454 11042 Erik */ 2455 11042 Erik if (uinfo.iulp_tstamp_ok) 2456 11042 Erik tcp->tcp_snd_ts_ok = B_TRUE; 2457 11042 Erik if (uinfo.iulp_wscale_ok) 2458 11042 Erik tcp->tcp_snd_ws_ok = B_TRUE; 2459 11042 Erik if (uinfo.iulp_sack == 2) 2460 11042 Erik tcp->tcp_snd_sack_ok = B_TRUE; 2461 11042 Erik if (uinfo.iulp_ecn_ok) 2462 11042 Erik tcp->tcp_ecn_ok = B_TRUE; 2463 11042 Erik } else { 2464 11042 Erik /* 2465 11042 Erik * Passive open. 2466 11042 Erik * 2467 11042 Erik * As above, the if check means that SACK can only be 2468 11042 Erik * turned on by the metric only IRE. 2469 11042 Erik */ 2470 11042 Erik if (uinfo.iulp_sack > 0) { 2471 11042 Erik tcp->tcp_snd_sack_ok = B_TRUE; 2472 11042 Erik } 2473 11042 Erik } 2474 11042 Erik 2475 11042 Erik /* 2476 11042 Erik * XXX Note that currently, iulp_mtu can be as small as 68 2477 0 stevel * because of PMTUd. So tcp_mss may go to negative if combined 2478 0 stevel * length of all those options exceeds 28 bytes. But because 2479 0 stevel * of the tcp_mss_min check below, we may not have a problem if 2480 0 stevel * tcp_mss_min is of a reasonable value. The default is 1 so 2481 0 stevel * the negative problem still exists. And the check defeats PMTUd. 2482 0 stevel * In fact, if PMTUd finds that the MSS should be smaller than 2483 0 stevel * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2484 0 stevel * value. 2485 0 stevel * 2486 0 stevel * We do not deal with that now. All those problems related to 2487 0 stevel * PMTUd will be fixed later. 2488 0 stevel */ 2489 11042 Erik ASSERT(uinfo.iulp_mtu != 0); 2490 11042 Erik mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; 2491 0 stevel 2492 0 stevel /* Sanity check for MSS value. 
*/ 2493 11042 Erik if (connp->conn_ipversion == IPV4_VERSION) 2494 3448 dh155122 mss_max = tcps->tcps_mss_max_ipv4; 2495 0 stevel else 2496 3448 dh155122 mss_max = tcps->tcps_mss_max_ipv6; 2497 0 stevel 2498 0 stevel if (tcp->tcp_ipsec_overhead == 0) 2499 0 stevel tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2500 0 stevel 2501 0 stevel mss -= tcp->tcp_ipsec_overhead; 2502 0 stevel 2503 3448 dh155122 if (mss < tcps->tcps_mss_min) 2504 3448 dh155122 mss = tcps->tcps_mss_min; 2505 0 stevel if (mss > mss_max) 2506 0 stevel mss = mss_max; 2507 0 stevel 2508 0 stevel /* Note that this is the maximum MSS, excluding all options. */ 2509 0 stevel tcp->tcp_mss = mss; 2510 0 stevel 2511 0 stevel /* 2512 11042 Erik * Update the tcp connection with LSO capability. 2513 11042 Erik */ 2514 11042 Erik tcp_update_lso(tcp, connp->conn_ixa); 2515 11042 Erik 2516 11042 Erik /* 2517 0 stevel * Initialize the ISS here now that we have the full connection ID. 2518 0 stevel * The RFC 1948 method of initial sequence number generation requires 2519 0 stevel * knowledge of the full connection ID before setting the ISS. 2520 0 stevel */ 2521 0 stevel tcp_iss_init(tcp); 2522 0 stevel 2523 11042 Erik tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); 2524 11042 Erik 2525 0 stevel /* 2526 0 stevel * Make sure that conn is not marked incipient 2527 0 stevel * for incoming connections. A blind 2528 0 stevel * removal of incipient flag is cheaper than 2529 0 stevel * check and removal. 
2530 0 stevel */ 2531 11042 Erik mutex_enter(&connp->conn_lock); 2532 0 stevel connp->conn_state_flags &= ~CONN_INCIPIENT; 2533 0 stevel mutex_exit(&connp->conn_lock); 2534 0 stevel return (0); 2535 0 stevel } 2536 0 stevel 2537 8348 Eric static void 2538 8348 Eric tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 2539 8348 Eric { 2540 8348 Eric int error; 2541 8348 Eric conn_t *connp = tcp->tcp_connp; 2542 8348 Eric struct sockaddr *sa; 2543 8348 Eric mblk_t *mp1; 2544 8348 Eric struct T_bind_req *tbr; 2545 8348 Eric int backlog; 2546 8348 Eric socklen_t len; 2547 0 stevel sin_t *sin; 2548 0 stevel sin6_t *sin6; 2549 8778 Erik cred_t *cr; 2550 8778 Erik 2551 8778 Erik /* 2552 8778 Erik * All Solaris components should pass a db_credp 2553 8778 Erik * for this TPI message, hence we ASSERT. 2554 8778 Erik * But in case there is some other M_PROTO that looks 2555 8778 Erik * like a TPI message sent by some other kernel 2556 8778 Erik * component, we check and return an error. 2557 8778 Erik */ 2558 8778 Erik cr = msg_getcred(mp, NULL); 2559 8778 Erik ASSERT(cr != NULL); 2560 8778 Erik if (cr == NULL) { 2561 8778 Erik tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 2562 8778 Erik return; 2563 8778 Erik } 2564 0 stevel 2565 0 stevel ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2566 0 stevel if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2567 11042 Erik if (connp->conn_debug) { 2568 741 masputra (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2569 8348 Eric "tcp_tpi_bind: bad req, len %u", 2570 0 stevel (uint_t)(mp->b_wptr - mp->b_rptr)); 2571 0 stevel } 2572 0 stevel tcp_err_ack(tcp, mp, TPROTO, 0); 2573 0 stevel return; 2574 0 stevel } 2575 0 stevel /* Make sure the largest address fits */ 2576 11042 Erik mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 2577 0 stevel if (mp1 == NULL) { 2578 0 stevel tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 2579 0 stevel return; 2580 0 stevel } 2581 0 stevel mp = mp1; 2582 0 stevel tbr = (struct T_bind_req 
*)mp->b_rptr; 2583 8348 Eric 2584 8348 Eric backlog = tbr->CONIND_number; 2585 8348 Eric len = tbr->ADDR_length; 2586 8348 Eric 2587 8348 Eric switch (len) { 2588 8348 Eric case 0: /* request for a generic port */ 2589 0 stevel tbr->ADDR_offset = sizeof (struct T_bind_req); 2590 11042 Erik if (connp->conn_family == AF_INET) { 2591 0 stevel tbr->ADDR_length = sizeof (sin_t); 2592 0 stevel sin = (sin_t *)&tbr[1]; 2593 0 stevel *sin = sin_null; 2594 0 stevel sin->sin_family = AF_INET; 2595 8348 Eric sa = (struct sockaddr *)sin; 2596 8348 Eric len = sizeof (sin_t); 2597 0 stevel mp->b_wptr = (uchar_t *)&sin[1]; 2598 0 stevel } else { 2599 11042 Erik ASSERT(connp->conn_family == AF_INET6); 2600 0 stevel tbr->ADDR_length = sizeof (sin6_t); 2601 0 stevel sin6 = (sin6_t *)&tbr[1]; 2602 0 stevel *sin6 = sin6_null; 2603 0 stevel sin6->sin6_family = AF_INET6; 2604 8348 Eric sa = (struct sockaddr *)sin6; 2605 8348 Eric len = sizeof (sin6_t); 2606 0 stevel mp->b_wptr = (uchar_t *)&sin6[1]; 2607 8348 Eric } 2608 8348 Eric break; 2609 8348 Eric 2610 8348 Eric case sizeof (sin_t): /* Complete IPv4 address */ 2611 8348 Eric sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 2612 0 stevel sizeof (sin_t)); 2613 0 stevel break; 2614 0 stevel 2615 0 stevel case sizeof (sin6_t): /* Complete IPv6 address */ 2616 8348 Eric sa = (struct sockaddr *)mi_offset_param(mp, 2617 0 stevel tbr->ADDR_offset, sizeof (sin6_t)); 2618 0 stevel break; 2619 0 stevel 2620 0 stevel default: 2621 11042 Erik if (connp->conn_debug) { 2622 741 masputra (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2623 8348 Eric "tcp_tpi_bind: bad address length, %d", 2624 0 stevel tbr->ADDR_length); 2625 0 stevel } 2626 0 stevel tcp_err_ack(tcp, mp, TBADADDR, 0); 2627 0 stevel return; 2628 0 stevel } 2629 8348 Eric 2630 9395 Rao if (backlog > 0) { 2631 9395 Rao error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 2632 9395 Rao tbr->PRIM_type != O_T_BIND_REQ); 2633 9395 Rao } else { 2634 9395 Rao error = 
tcp_do_bind(connp, sa, len, DB_CRED(mp), 2635 9395 Rao tbr->PRIM_type != O_T_BIND_REQ); 2636 8348 Eric } 2637 8348 Eric done: 2638 8348 Eric if (error > 0) { 2639 8348 Eric tcp_err_ack(tcp, mp, TSYSERR, error); 2640 8348 Eric } else if (error < 0) { 2641 8348 Eric tcp_err_ack(tcp, mp, -error, 0); 2642 8348 Eric } else { 2643 9395 Rao /* 2644 9395 Rao * Update port information as sockfs/tpi needs it for checking 2645 9395 Rao */ 2646 11042 Erik if (connp->conn_family == AF_INET) { 2647 9395 Rao sin = (sin_t *)sa; 2648 11042 Erik sin->sin_port = connp->conn_lport; 2649 9395 Rao } else { 2650 9395 Rao sin6 = (sin6_t *)sa; 2651 11042 Erik sin6->sin6_port = connp->conn_lport; 2652 9395 Rao } 2653 8348 Eric mp->b_datap->db_type = M_PCPROTO; 2654 8348 Eric tbr->PRIM_type = T_BIND_ACK; 2655 11042 Erik putnext(connp->conn_rq, mp); 2656 8348 Eric } 2657 8348 Eric } 2658 0 stevel 2659 0 stevel /* 2660 0 stevel * If the "bind_to_req_port_only" parameter is set, if the requested port 2661 0 stevel * number is available, return it, If not return 0 2662 0 stevel * 2663 0 stevel * If "bind_to_req_port_only" parameter is not set and 2664 0 stevel * If the requested port number is available, return it. If not, return 2665 0 stevel * the first anonymous port we happen across. If no anonymous ports are 2666 0 stevel * available, return 0. addr is the requested local address, if any. 2667 0 stevel * 2668 0 stevel * In either case, when succeeding update the tcp_t to record the port number 2669 0 stevel * and insert it in the bind hash table. 2670 0 stevel * 2671 0 stevel * Note that TCP over IPv4 and IPv6 sockets can use the same port number 2672 0 stevel * without setting SO_REUSEADDR. This is needed so that they 2673 0 stevel * can be viewed as two independent transport protocols. 
2674 0 stevel */ 2675 0 stevel static in_port_t 2676 646 gt145670 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 2677 646 gt145670 int reuseaddr, boolean_t quick_connect, 2678 0 stevel boolean_t bind_to_req_port_only, boolean_t user_specified) 2679 0 stevel { 2680 0 stevel /* number of times we have run around the loop */ 2681 0 stevel int count = 0; 2682 0 stevel /* maximum number of times to run around the loop */ 2683 0 stevel int loopmax; 2684 1676 jpk conn_t *connp = tcp->tcp_connp; 2685 3448 dh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 2686 0 stevel 2687 0 stevel /* 2688 0 stevel * Lookup for free addresses is done in a loop and "loopmax" 2689 0 stevel * influences how long we spin in the loop 2690 0 stevel */ 2691 0 stevel if (bind_to_req_port_only) { 2692 0 stevel /* 2693 0 stevel * If the requested port is busy, don't bother to look 2694 0 stevel * for a new one. Setting loop maximum count to 1 has 2695 0 stevel * that effect. 2696 0 stevel */ 2697 0 stevel loopmax = 1; 2698 0 stevel } else { 2699 0 stevel /* 2700 0 stevel * If the requested port is busy, look for a free one 2701 0 stevel * in the anonymous port range. 2702 0 stevel * Set loopmax appropriately so that one does not look 2703 0 stevel * forever in the case all of the anonymous ports are in use. 
2704 0 stevel */ 2705 11042 Erik if (connp->conn_anon_priv_bind) { 2706 0 stevel /* 2707 0 stevel * loopmax = 2708 0 stevel * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 2709 0 stevel */ 2710 3448 dh155122 loopmax = IPPORT_RESERVED - 2711 3448 dh155122 tcps->tcps_min_anonpriv_port; 2712 3448 dh155122 } else { 2713 3448 dh155122 loopmax = (tcps->tcps_largest_anon_port - 2714 3448 dh155122 tcps->tcps_smallest_anon_port + 1); 2715 0 stevel } 2716 0 stevel } 2717 0 stevel do { 2718 0 stevel uint16_t lport; 2719 0 stevel tf_t *tbf; 2720 0 stevel tcp_t *ltcp; 2721 1676 jpk conn_t *lconnp; 2722 0 stevel 2723 0 stevel lport = htons(port); 2724 0 stevel 2725 0 stevel /* 2726 0 stevel * Ensure that the tcp_t is not currently in the bind hash. 2727 0 stevel * Hold the lock on the hash bucket to ensure that 2728 0 stevel * the duplicate check plus the insertion is an atomic 2729 0 stevel * operation. 2730 0 stevel * 2731 0 stevel * This function does an inline lookup on the bind hash list 2732 0 stevel * Make sure that we access only members of tcp_t 2733 0 stevel * and that we don't look at tcp_tcp, since we are not 2734 0 stevel * doing a CONN_INC_REF. 
2735 0 stevel */ 2736 0 stevel tcp_bind_hash_remove(tcp); 2737 3448 dh155122 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 2738 0 stevel mutex_enter(&tbf->tf_lock); 2739 0 stevel for (ltcp = tbf->tf_tcp; ltcp != NULL; 2740 0 stevel ltcp = ltcp->tcp_bind_hash) { 2741 11042 Erik if (lport == ltcp->tcp_connp->conn_lport) 2742 8348 Eric break; 2743 8348 Eric } 2744 8348 Eric 2745 8348 Eric for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 2746 2429 kcpoon boolean_t not_socket; 2747 2429 kcpoon boolean_t exclbind; 2748 1676 jpk 2749 1676 jpk lconnp = ltcp->tcp_connp; 2750 1676 jpk 2751 1676 jpk /* 2752 1676 jpk * On a labeled system, we must treat bindings to ports 2753 1676 jpk * on shared IP addresses by sockets with MAC exemption 2754 1676 jpk * privilege as being in all zones, as there's 2755 1676 jpk * otherwise no way to identify the right receiver. 2756 1676 jpk */ 2757 11042 Erik if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) 2758 1676 jpk continue; 2759 0 stevel 2760 0 stevel /* 2761 0 stevel * If TCP_EXCLBIND is set for either the bound or 2762 0 stevel * binding endpoint, the semantics of bind 2763 0 stevel * is changed according to the following. 2764 0 stevel * 2765 0 stevel * spec = specified address (v4 or v6) 2766 0 stevel * unspec = unspecified address (v4 or v6) 2767 0 stevel * A = specified addresses are different for endpoints 2768 0 stevel * 2769 0 stevel * bound bind to allowed 2770 0 stevel * ------------------------------------- 2771 0 stevel * unspec unspec no 2772 0 stevel * unspec spec no 2773 0 stevel * spec unspec no 2774 0 stevel * spec spec yes if A 2775 0 stevel * 2776 1676 jpk * For labeled systems, SO_MAC_EXEMPT behaves the same 2777 2429 kcpoon * as TCP_EXCLBIND, except that zoneid is ignored. 2778 1676 jpk * 2779 0 stevel * Note: 2780 0 stevel * 2781 0 stevel * 1. 
Because of TLI semantics, an endpoint can go 2782 0 stevel * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 2783 0 stevel * TCPS_BOUND, depending on whether it is originally 2784 0 stevel * a listener or not. That is why we need to check 2785 0 stevel * for states greater than or equal to TCPS_BOUND 2786 0 stevel * here. 2787 0 stevel * 2788 0 stevel * 2. Ideally, we should only check for state equals 2789 0 stevel * to TCPS_LISTEN. And the following check should be 2790 0 stevel * added. 2791 0 stevel * 2792 0 stevel * if (ltcp->tcp_state == TCPS_LISTEN || 2793 11042 Erik * !reuseaddr || !lconnp->conn_reuseaddr) { 2794 0 stevel * ... 2795 0 stevel * } 2796 0 stevel * 2797 0 stevel * The semantics will be changed to this. If the 2798 0 stevel * endpoint on the list is in state not equal to 2799 0 stevel * TCPS_LISTEN and both endpoints have SO_REUSEADDR 2800 0 stevel * set, let the bind succeed. 2801 0 stevel * 2802 2429 kcpoon * Because of (1), we cannot do that for TLI 2803 2429 kcpoon * endpoints. But we can do that for socket endpoints. 2804 2429 kcpoon * If in future, we can change this going back 2805 2429 kcpoon * semantics, we can use the above check for TLI also. 
2806 2429 kcpoon */ 2807 2429 kcpoon not_socket = !(TCP_IS_SOCKET(ltcp) && 2808 2429 kcpoon TCP_IS_SOCKET(tcp)); 2809 11042 Erik exclbind = lconnp->conn_exclbind || 2810 11042 Erik connp->conn_exclbind; 2811 2429 kcpoon 2812 10934 sommerfeld if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || 2813 10934 sommerfeld (connp->conn_mac_mode != CONN_MAC_DEFAULT) || 2814 2429 kcpoon (exclbind && (not_socket || 2815 2429 kcpoon ltcp->tcp_state <= TCPS_ESTABLISHED))) { 2816 0 stevel if (V6_OR_V4_INADDR_ANY( 2817 11042 Erik lconnp->conn_bound_addr_v6) || 2818 0 stevel V6_OR_V4_INADDR_ANY(*laddr) || 2819 0 stevel IN6_ARE_ADDR_EQUAL(laddr, 2820 11042 Erik &lconnp->conn_bound_addr_v6)) { 2821 0 stevel break; 2822 0 stevel } 2823 0 stevel continue; 2824 0 stevel } 2825 0 stevel 2826 0 stevel /* 2827 0 stevel * Check ipversion to allow IPv4 and IPv6 sockets to 2828 0 stevel * have disjoint port number spaces, if *_EXCLBIND 2829 0 stevel * is not set and only if the application binds to a 2830 0 stevel * specific port. We use the same autoassigned port 2831 0 stevel * number space for IPv4 and IPv6 sockets. 2832 0 stevel */ 2833 11042 Erik if (connp->conn_ipversion != lconnp->conn_ipversion && 2834 0 stevel bind_to_req_port_only) 2835 0 stevel continue; 2836 0 stevel 2837 646 gt145670 /* 2838 646 gt145670 * Ideally, we should make sure that the source 2839 646 gt145670 * address, remote address, and remote port in the 2840 646 gt145670 * four tuple for this tcp-connection is unique. 2841 646 gt145670 * However, trying to find out the local source 2842 646 gt145670 * address would require too much code duplication 2843 646 gt145670 * with IP, since IP needs needs to have that code 2844 646 gt145670 * to support userland TCP implementations. 
2845 646 gt145670 */ 2846 646 gt145670 if (quick_connect && 2847 646 gt145670 (ltcp->tcp_state > TCPS_LISTEN) && 2848 11042 Erik ((connp->conn_fport != lconnp->conn_fport) || 2849 11042 Erik !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 2850 11042 Erik &lconnp->conn_faddr_v6))) 2851 646 gt145670 continue; 2852 646 gt145670 2853 0 stevel if (!reuseaddr) { 2854 0 stevel /* 2855 0 stevel * No socket option SO_REUSEADDR. 2856 0 stevel * If existing port is bound to 2857 0 stevel * a non-wildcard IP address 2858 0 stevel * and the requesting stream is 2859 0 stevel * bound to a distinct 2860 0 stevel * different IP addresses 2861 0 stevel * (non-wildcard, also), keep 2862 0 stevel * going. 2863 0 stevel */ 2864 0 stevel if (!V6_OR_V4_INADDR_ANY(*laddr) && 2865 0 stevel !V6_OR_V4_INADDR_ANY( 2866 11042 Erik lconnp->conn_bound_addr_v6) && 2867 0 stevel !IN6_ARE_ADDR_EQUAL(laddr, 2868 11042 Erik &lconnp->conn_bound_addr_v6)) 2869 0 stevel continue; 2870 0 stevel if (ltcp->tcp_state >= TCPS_BOUND) { 2871 0 stevel /* 2872 0 stevel * This port is being used and 2873 0 stevel * its state is >= TCPS_BOUND, 2874 0 stevel * so we can't bind to it. 2875 0 stevel */ 2876 0 stevel break; 2877 0 stevel } 2878 0 stevel } else { 2879 0 stevel /* 2880 0 stevel * socket option SO_REUSEADDR is set on the 2881 0 stevel * binding tcp_t. 2882 0 stevel * 2883 0 stevel * If two streams are bound to 2884 0 stevel * same IP address or both addr 2885 0 stevel * and bound source are wildcards 2886 0 stevel * (INADDR_ANY), we want to stop 2887 0 stevel * searching. 2888 0 stevel * We have found a match of IP source 2889 0 stevel * address and source port, which is 2890 0 stevel * refused regardless of the 2891 0 stevel * SO_REUSEADDR setting, so we break. 
2892 0 stevel */ 2893 0 stevel if (IN6_ARE_ADDR_EQUAL(laddr, 2894 11042 Erik &lconnp->conn_bound_addr_v6) && 2895 0 stevel (ltcp->tcp_state == TCPS_LISTEN || 2896 5031 rs200217 ltcp->tcp_state == TCPS_BOUND)) 2897 0 stevel break; 2898 0 stevel } 2899 0 stevel } 2900 0 stevel if (ltcp != NULL) { 2901 0 stevel /* The port number is busy */ 2902 0 stevel mutex_exit(&tbf->tf_lock); 2903 0 stevel } else { 2904 0 stevel /* 2905 0 stevel * This port is ours. Insert in fanout and mark as 2906 0 stevel * bound to prevent others from getting the port 2907 0 stevel * number. 2908 0 stevel */ 2909 0 stevel tcp->tcp_state = TCPS_BOUND; 2910 11042 Erik connp->conn_lport = htons(port); 2911 0 stevel 2912 3448 dh155122 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 2913 11042 Erik connp->conn_lport)] == tbf); 2914 0 stevel tcp_bind_hash_insert(tbf, tcp, 1); 2915 0 stevel 2916 0 stevel mutex_exit(&tbf->tf_lock); 2917 0 stevel 2918 0 stevel /* 2919 0 stevel * We don't want tcp_next_port_to_try to "inherit" 2920 0 stevel * a port number supplied by the user in a bind. 2921 0 stevel */ 2922 0 stevel if (user_specified) 2923 0 stevel return (port); 2924 0 stevel 2925 0 stevel /* 2926 0 stevel * This is the only place where tcp_next_port_to_try 2927 0 stevel * is updated. After the update, it may or may not 2928 0 stevel * be in the valid range. 2929 0 stevel */ 2930 11042 Erik if (!connp->conn_anon_priv_bind) 2931 3448 dh155122 tcps->tcps_next_port_to_try = port + 1; 2932 0 stevel return (port); 2933 0 stevel } 2934 0 stevel 2935 11042 Erik if (connp->conn_anon_priv_bind) { 2936 1676 jpk port = tcp_get_next_priv_port(tcp); 2937 0 stevel } else { 2938 0 stevel if (count == 0 && user_specified) { 2939 0 stevel /* 2940 0 stevel * We may have to return an anonymous port. So 2941 0 stevel * get one to start with. 
2942 0 stevel */ 2943 0 stevel port = 2944 3448 dh155122 tcp_update_next_port( 2945 5031 rs200217 tcps->tcps_next_port_to_try, 2946 5031 rs200217 tcp, B_TRUE); 2947 0 stevel user_specified = B_FALSE; 2948 0 stevel } else { 2949 1676 jpk port = tcp_update_next_port(port + 1, tcp, 2950 1676 jpk B_FALSE); 2951 1676 jpk } 2952 1676 jpk } 2953 1676 jpk if (port == 0) 2954 1676 jpk break; 2955 0 stevel 2956 0 stevel /* 2957 0 stevel * Don't let this loop run forever in the case where 2958 0 stevel * all of the anonymous ports are in use. 2959 0 stevel */ 2960 0 stevel } while (++count < loopmax); 2961 0 stevel return (0); 2962 0 stevel } 2963 0 stevel 2964 0 stevel /* 2965 3104 jprakash * tcp_clean_death / tcp_close_detached must not be called more than once 2966 3104 jprakash * on a tcp. Thus every function that potentially calls tcp_clean_death 2967 3104 jprakash * must check for the tcp state before calling tcp_clean_death. 2968 11042 Erik * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, 2969 3104 jprakash * tcp_timer_handler, all check for the tcp state. 2970 3104 jprakash */ 2971 3104 jprakash /* ARGSUSED */ 2972 3104 jprakash void 2973 11042 Erik tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, 2974 11042 Erik ip_recv_attr_t *dummy) 2975 3104 jprakash { 2976 3104 jprakash tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 2977 3104 jprakash 2978 3104 jprakash freemsg(mp); 2979 3104 jprakash if (tcp->tcp_state > TCPS_BOUND) 2980 5031 rs200217 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, 2981 5031 rs200217 ETIMEDOUT, 5); 2982 3104 jprakash } 2983 3104 jprakash 2984 3104 jprakash /* 2985 0 stevel * We are dying for some reason. Try to do it gracefully. (May be called 2986 0 stevel * as writer.) 2987 0 stevel * 2988 0 stevel * Return -1 if the structure was not cleaned up (if the cleanup had to be 2989 0 stevel * done by a service procedure). 
2990 0 stevel * TBD - Should the return value distinguish between the tcp_t being 2991 0 stevel * freed and it being reinitialized? 2992 0 stevel */ 2993 0 stevel static int 2994 0 stevel tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 2995 0 stevel { 2996 0 stevel mblk_t *mp; 2997 0 stevel queue_t *q; 2998 8348 Eric conn_t *connp = tcp->tcp_connp; 2999 3448 dh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 3000 0 stevel 3001 0 stevel TCP_CLD_STAT(tag); 3002 0 stevel 3003 0 stevel #if TCP_TAG_CLEAN_DEATH 3004 0 stevel tcp->tcp_cleandeathtag = tag; 3005 0 stevel #endif 3006 0 stevel 3007 2323 ethindra if (tcp->tcp_fused) 3008 2323 ethindra tcp_unfuse(tcp); 3009 2323 ethindra 3010 0 stevel if (tcp->tcp_linger_tid != 0 && 3011 0 stevel TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3012 0 stevel tcp_stop_lingering(tcp); 3013 0 stevel } 3014 0 stevel 3015 0 stevel ASSERT(tcp != NULL); 3016 11042 Erik ASSERT((connp->conn_family == AF_INET && 3017 11042 Erik connp->conn_ipversion == IPV4_VERSION) || 3018 11042 Erik (connp->conn_family == AF_INET6 && 3019 11042 Erik (connp->conn_ipversion == IPV4_VERSION || 3020 11042 Erik connp->conn_ipversion == IPV6_VERSION))); 3021 0 stevel 3022 0 stevel if (TCP_IS_DETACHED(tcp)) { 3023 0 stevel if (tcp->tcp_hard_binding) { 3024 0 stevel /* 3025 0 stevel * Its an eager that we are dealing with. We close the 3026 0 stevel * eager but in case a conn_ind has already gone to the 3027 0 stevel * listener, let tcp_accept_finish() send a discon_ind 3028 0 stevel * to the listener and drop the last reference. If the 3029 0 stevel * listener doesn't even know about the eager i.e. the 3030 0 stevel * conn_ind hasn't gone up, blow away the eager and drop 3031 0 stevel * the last reference as well. If the conn_ind has gone 3032 0 stevel * up, state should be BOUND. tcp_accept_finish 3033 0 stevel * will figure out that the connection has received a 3034 0 stevel * RST and will send a DISCON_IND to the application. 
3035 0 stevel */ 3036 0 stevel tcp_closei_local(tcp); 3037 3104 jprakash if (!tcp->tcp_tconnind_started) { 3038 8348 Eric CONN_DEC_REF(connp); 3039 0 stevel } else { 3040 0 stevel tcp->tcp_state = TCPS_BOUND; 3041 0 stevel } 3042 0 stevel } else { 3043 0 stevel tcp_close_detached(tcp); 3044 0 stevel } 3045 0 stevel return (0); 3046 0 stevel } 3047 0 stevel 3048 3448 dh155122 TCP_STAT(tcps, tcp_clean_death_nondetached); 3049 0 stevel 3050 11042