Home | History | Annotate | Download | only in tcp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/strsun.h>
     31 #include <sys/strsubr.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #define	_SUN_TPI_VERSION 2
     35 #include <sys/tihdr.h>
     36 #include <sys/timod.h>
     37 #include <sys/ddi.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/suntpi.h>
     40 #include <sys/xti_inet.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/sdt.h>
     44 #include <sys/vtrace.h>
     45 #include <sys/kmem.h>
     46 #include <sys/ethernet.h>
     47 #include <sys/cpuvar.h>
     48 #include <sys/dlpi.h>
     49 #include <sys/pattr.h>
     50 #include <sys/policy.h>
     51 #include <sys/priv.h>
     52 #include <sys/zone.h>
     53 #include <sys/sunldi.h>
     54 
     55 #include <sys/errno.h>
     56 #include <sys/signal.h>
     57 #include <sys/socket.h>
     58 #include <sys/socketvar.h>
     59 #include <sys/sockio.h>
     60 #include <sys/isa_defs.h>
     61 #include <sys/md5.h>
     62 #include <sys/random.h>
     63 #include <sys/uio.h>
     64 #include <sys/systm.h>
     65 #include <netinet/in.h>
     66 #include <netinet/tcp.h>
     67 #include <netinet/ip6.h>
     68 #include <netinet/icmp6.h>
     69 #include <net/if.h>
     70 #include <net/route.h>
     71 #include <inet/ipsec_impl.h>
     72 
     73 #include <inet/common.h>
     74 #include <inet/ip.h>
     75 #include <inet/ip_impl.h>
     76 #include <inet/ip6.h>
     77 #include <inet/ip_ndp.h>
     78 #include <inet/proto_set.h>
     79 #include <inet/mib2.h>
     80 #include <inet/nd.h>
     81 #include <inet/optcom.h>
     82 #include <inet/snmpcom.h>
     83 #include <inet/kstatcom.h>
     84 #include <inet/tcp.h>
     85 #include <inet/tcp_impl.h>
     86 #include <inet/udp_impl.h>
     87 #include <net/pfkeyv2.h>
     88 #include <inet/ipdrop.h>
     89 
     90 #include <inet/ipclassifier.h>
     91 #include <inet/ip_ire.h>
     92 #include <inet/ip_ftable.h>
     93 #include <inet/ip_if.h>
     94 #include <inet/ipp_common.h>
     95 #include <inet/ip_rts.h>
     96 #include <inet/ip_netinfo.h>
     97 #include <sys/squeue_impl.h>
     98 #include <sys/squeue.h>
     99 #include <inet/kssl/ksslapi.h>
    100 #include <sys/tsol/label.h>
    101 #include <sys/tsol/tnet.h>
    102 #include <rpc/pmap_prot.h>
    103 #include <sys/callo.h>
    104 
    105 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    106 
    107 /*
    108  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    109  *
    110  * (Read the detailed design doc in PSARC case directory)
    111  *
    112  * The entire tcp state is contained in tcp_t and conn_t structure
    113  * which are allocated in tandem using ipcl_conn_create() and passing
    114  * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
    115  * the references on the tcp_t. The tcp_t structure is never compressed
    116  * and packets always land on the correct TCP perimeter from the time
    117  * eager is created till the time tcp_t dies (as such the old mentat
    118  * TCP global queue is not used for detached state and no IPSEC checking
    119  * is required). The global queue is still allocated to send out resets
    120  * for connection which have no listeners and IP directly calls
    121  * tcp_xmit_listeners_reset() which does any policy check.
    122  *
    123  * Protection and Synchronisation mechanism:
    124  *
    125  * The tcp data structure does not use any kind of lock for protecting
    126  * its state but instead uses 'squeues' for mutual exclusion from various
    127  * read and write side threads. To access a tcp member, the thread should
    128  * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
    129  * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
    130  * can pass any tcp function having prototype of edesc_t as argument
    131  * (different from traditional STREAMs model where packets come in only
    132  * designated entry points). The list of functions that can be directly
    133  * called via squeue are listed before the usual function prototype.
    134  *
    135  * Referencing:
    136  *
    137  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    138  * tcp structure doesn't disappear when it's needed. When the application
    139  * creates an outgoing connection or accepts an incoming connection, we
    140  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    141  * The IP reference is just a symbolic reference since ip_tcpclose()
    142  * looks at tcp structure after tcp_close_output() returns which could
    143  * have dropped the last TCP reference. So as long as the connection is
    144  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    145  * conn_t. The classifier puts its own reference when the connection is
    146  * inserted in listen or connected hash. Anytime a thread needs to enter
    147  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    148  * on write side or by doing a classify on read side and then puts a
    149  * reference on the conn before doing squeue_enter/tryenter/fill. For
    150  * read side, the classifier itself puts the reference under fanout lock
    151  * to make sure that tcp can't disappear before it gets processed. The
    152  * squeue will drop this reference automatically so the called function
    153  * doesn't have to do a DEC_REF.
    154  *
    155  * Opening a new connection:
    156  *
    157  * The outgoing connection open is pretty simple. tcp_open() does the
    158  * work in creating the conn/tcp structure and initializing it. The
    159  * squeue assignment is done based on the CPU the application
    160  * is running on. So for outbound connections, processing is always done
    161  * on application CPU which might be different from the incoming CPU
    162  * being interrupted by the NIC. An optimal way would be to figure out
    163  * the NIC <-> CPU binding at listen time, and assign the outgoing
    164  * connection to the squeue attached to the CPU that will be interrupted
    165  * for incoming packets (we know the NIC based on the bind IP address).
    166  * This might seem like a problem if more data is going out but the
    167  * fact is that in most cases the transmit is ACK driven transmit where
    168  * the outgoing data normally sits on TCP's xmit queue waiting to be
    169  * transmitted.
    170  *
    171  * Accepting a connection:
    172  *
    173  * This is a more interesting case because of various races involved in
    174  * establishing an eager in its own perimeter. Read the meta comment on
    175  * top of tcp_input_listener(). But briefly, the squeue is picked by
    176  * ip_fanout based on the ring or the sender (if loopback).
    177  *
    178  * Closing a connection:
    179  *
    180  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    181  * via squeue to do the close and mark the tcp as detached if the connection
    182  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    183  * reference but tcp_close() always drops IP's reference. So if tcp was
    184  * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
    185  * and 1 because it is in classifier's connected hash. This is the condition
    186  * we use to determine that it's OK to clean up the tcp outside of squeue
    187  * when time wait expires (check the ref under fanout and conn_lock and
    188  * if it is 2, remove it from fanout hash and kill it).
    189  *
    190  * Although close just drops the necessary references and marks the
    191  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    192  * set (under squeue) before letting the STREAM go away (because an
    193  * inbound packet might attempt to go up the STREAM while the close
    194  * has happened and tcp_detached is not set). So a special lock and
    195  * flag is used along with a condition variable (tcp_closelock, tcp_closed,
    196  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    197  * tcp_detached.
    198  *
    199  * Special provisions and fast paths:
    200  *
    201  * We make special provisions for sockfs by marking tcp_issocket
    202  * whenever we have only sockfs on top of TCP. This allows us to skip
    203  * putting the tcp in acceptor hash since a sockfs listener can never
    204  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    205  * since eager has already been allocated and the accept now happens
    206  * on acceptor STREAM. There is a big blob of comment on top of
    207  * tcp_input_listener explaining the new accept. When socket is POP'd,
    208  * sockfs sends us an ioctl to mark the fact and we go back to old
    209  * behaviour. Once tcp_issocket is unset, it's never set again for the
    210  * life of that connection.
    211  *
    212  * IPsec notes :
    213  *
    214  * Since a packet is always executed on the correct TCP perimeter
    215  * all IPsec processing is deferred to IP including checking new
    216  * connections and setting IPSEC policies for new connections. The
    217  * only exception is tcp_xmit_listeners_reset() which is called
    218  * directly from IP and needs to do a policy check to see if TH_RST
    219  * can be sent out.
    220  */
    221 
    222 /*
    223  * Values for squeue switch:
    224  * 1: SQ_NODRAIN
    225  * 2: SQ_PROCESS
    226  * 3: SQ_FILL
    227  */
    228 int tcp_squeue_wput = 2;	/* squeue switch, default 2 (SQ_PROCESS); settable from /etc/system */
    229 int tcp_squeue_flag;	/* NOTE(review): presumably the SQ_* flag derived from tcp_squeue_wput -- confirm */
    230 
    231 /*
    232  * This controls how tiny a write must be before we try to copy it
    233  * into the mblk on the tail of the transmit queue.  Not much
    234  * speedup is observed for values larger than sixteen.  Zero will
    235  * disable the optimisation.
    236  */
    237 int tcp_tx_pull_len = 16;	/* size threshold for the tail-copy optimisation; 0 disables it */
    238 
    239 /*
    240  * TCP Statistics.
    241  *
    242  * How TCP statistics work.
    243  *
    244  * There are two types of statistics invoked by two macros.
    245  *
    246  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    247  * supposed to be used in non MT-hot paths of the code.
    248  *
    249  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    250  * supposed to be used for DEBUG purposes and may be used on a hot path.
    251  *
    252  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    253  * (use "kstat tcp" to get them).
    254  *
    255  * There is also additional debugging facility that marks tcp_clean_death()
    256  * instances and saves them in tcp_t structure. It is triggered by
    257  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    258  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    259  * is triggered by TCP_CLD_COUNTERS define.
    260  *
    261  * How to add new counters.
    262  *
    263  * 1) Add a field in the tcp_stat structure describing your counter.
    264  * 2) Add a line in the template in tcp_kstat2_init() with the name
    265  *    of the counter.
    266  *
    267  *    IMPORTANT!! - make sure that both are in sync !!
    268  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    269  *
    270  * Please avoid using private counters which are not kstat-exported.
    271  *
    272  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    273  * in tcp_t structure.
    274  *
    275  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    276  */
    277 
    278 #ifndef TCP_DEBUG_COUNTER	/* a value supplied before this point takes precedence */
    279 #ifdef DEBUG
    280 #define	TCP_DEBUG_COUNTER 1	/* DEBUG build: TCP_DBGSTAT/TCP_G_DBGSTAT do atomic increments */
    281 #else
    282 #define	TCP_DEBUG_COUNTER 0	/* otherwise the DBGSTAT macros do no counting */
    283 #endif	/* DEBUG */
    284 #endif	/* !TCP_DEBUG_COUNTER */
    285 
    286 #define	TCP_CLD_COUNTERS 0	/* non-zero compiles in the tcp_clean_death_stat[] hit counters */
    287 
    288 #define	TCP_TAG_CLEAN_DEATH 1	/* 1 enables tagging of tcp_clean_death() instances in tcp_t */
    289 #define	TCP_MAX_CLEAN_DEATH_TAG 32	/* maximum number of possible clean death tags */
    290 
    291 #ifdef lint
    292 static int _lint_dummy_;	/* referenced by the lint-only variants of the stat macros */
    293 #endif	/* lint */
    294 
    295 #if TCP_CLD_COUNTERS
    296 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];	/* hit count per clean death tag */
    297 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++	/* non-atomic increment; x is not range-checked here */
    298 #elif defined(lint)
    299 #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);	/* lint placeholder; note it carries its own ';' */
    300 #else
    301 #define	TCP_CLD_STAT(x)	/* counters disabled: expands to nothing */
    302 #endif	/* TCP_CLD_COUNTERS */
    303 
    304 #if TCP_DEBUG_COUNTER
    305 #define	TCP_DBGSTAT(tcps, x)	\
    306 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)	/* atomic +1 on counter x in tcps->tcps_statistics */
    307 #define	TCP_G_DBGSTAT(x)	\
    308 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)	/* atomic +1 on counter x in tcp_g_statistics */
    309 #elif defined(lint)
    310 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);	/* lint placeholder; carries its own ';' */
    311 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);	/* lint placeholder; carries its own ';' */
    312 #else
    313 #define	TCP_DBGSTAT(tcps, x)	/* debug counters disabled: expands to nothing */
    314 #define	TCP_G_DBGSTAT(x)	/* debug counters disabled: expands to nothing */
    315 #endif	/* TCP_DEBUG_COUNTER */
    316 
    317 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)	/* non-atomic +1 on counter x in tcp_g_statistics */
    318 
    319 tcp_g_stat_t	tcp_g_statistics;	/* counters updated by TCP_G_STAT() and TCP_G_DBGSTAT() */
    320 kstat_t		*tcp_g_kstat;	/* NOTE(review): presumably the kstat that exports tcp_g_statistics -- confirm */
    321 
    322 /* Macros for timestamp comparisons */
    323 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
    324 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
    325 
    326 /*
    327  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    328  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    329  * by adding three components: a time component which grows by 1 every 4096
    330  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    331  * a per-connection component which grows by 125000 for every new connection;
    332  * and an "extra" component that grows by a random amount centered
    333  * approximately on 64000.  This causes the ISS generator to cycle every
    334  * 4.89 hours if no TCP connections are made, and faster if connections are
    335  * made.
    336  *
    337  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    338  * components: a time component which grows by 250000 every second; and
    339  * a per-connection component which grows by 125000 for every new connection.
    340  *
    341  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    342  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    343  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    344  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    345  * password.
    346  */
    347 #define	ISS_INCR	250000	/* ISS time component growth per second (tcp_strong_iss == 0) */
    348 #define	ISS_NSEC_SHT	12	/* 2^12 = 4096: nanoseconds per unit of the strong-ISS time component */
    349 
    350 static sin_t	sin_null;	/* Zero address for quick clears (static storage, so all-zero) */
    351 static sin6_t	sin6_null;	/* Zero address for quick clears, sin6_t flavour (also all-zero) */
    352 
    353 /*
    354  * This implementation follows the 4.3BSD interpretation of the urgent
    355  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    356  * incompatible changes in protocols like telnet and rlogin.
    357  */
    358 #define	TCP_OLD_URP_INTERPRETATION	1	/* keep the 4.3BSD urgent pointer interpretation */
    359 
    360 /*
    361  * Since tcp_listener is not cleared atomically with tcp_detached
    362  * being cleared we need this extra bit to tell a detached connection
    363  * apart from one that is in the process of being accepted.
    364  */
    365 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    366 	(TCP_IS_DETACHED(tcp) &&	\
    367 	    (!(tcp)->tcp_hard_binding))	/* true when detached and tcp_hard_binding is clear */
    368 
    369 /*
    370  * TCP reassembly macros.  We hide starting and ending sequence numbers in
    371  * b_next and b_prev of messages on the reassembly queue.  The messages are
    372  * chained using b_cont.  These macros are used in tcp_reass() so we don't
    373  * have to see the ugly casts and assignments.
    374  */
    375 #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))	/* read the starting seq hidden in b_next */
    376 #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    377 					(mblk_t *)(uintptr_t)(u))	/* hide starting seq u in b_next */
    378 #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))	/* read the ending seq hidden in b_prev */
    379 #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    380 					(mblk_t *)(uintptr_t)(u))	/* hide ending seq u in b_prev */
    381 
    382 /*
    383  * Implementation of TCP Timers.
    384  * =============================
    385  *
    386  * INTERFACE:
    387  *
    388  * There are two basic functions dealing with tcp timers:
    389  *
    390  *	timeout_id_t	tcp_timeout(connp, func, time)
    391  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    392  *	TCP_TIMER_RESTART(tcp, intvl)
    393  *
    394  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    395  * after 'time' ticks passed. The function called by timeout() must adhere to
    396  * the same restrictions as a driver soft interrupt handler - it must not sleep
    397  * or call other functions that might sleep. The value returned is the opaque
    398  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    399  * cancel the request. The call to tcp_timeout() may fail in which case it
    400  * returns zero. This is different from the timeout(9F) function which never
    401  * fails.
    402  *
    403  * The call-back function 'func' always receives 'connp' as its single
    404  * argument. It is always executed in the squeue corresponding to the tcp
    405  * structure. The tcp structure is guaranteed to be present at the time the
    406  * call-back is called.
    407  *
    408  * NOTE: The call-back function 'func' is never called if tcp is in
    409  * 	the TCPS_CLOSED state.
    410  *
    411  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    412  * request. Locks acquired by the call-back routine should not be held across
    413  * the call to tcp_timeout_cancel() or a deadlock may result.
    414  *
    415  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    416  * Otherwise, it returns an integer value greater than or equal to 0. In
    417  * particular, if the call-back function is already placed on the squeue, it can
    418  * not be canceled.
    419  *
    420  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    421  * 	within squeue context corresponding to the tcp instance. Since the
    422  *	call-back is also called via the same squeue, there are no race
    423  *	conditions described in untimeout(9F) manual page since all calls are
    424  *	strictly serialized.
    425  *
    426  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    427  *	stored in tcp_timer_tid and starts a new one using
    428  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    429  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    430  *	field.
    431  *
    432  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    433  *	call-back may still be called, so it is possible tcp_timer() will be
    434  *	called several times. This should not be a problem since tcp_timer()
    435  *	should always check the tcp instance state.
    436  *
    437  *
    438  * IMPLEMENTATION:
    439  *
    440  * TCP timers are implemented using a three-stage process. The call to
    441  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    442  * when the timer expires. The tcp_timer_callback() arranges the call of the
    443  * tcp_timer_handler() function via squeue corresponding to the tcp
    444  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    445  * and passes tcp instance as an argument to it. Information is passed between
    446  * stages using the tcp_timer_t structure which contains the connp pointer, the
    447  * tcp call-back to call and the timeout id returned by the timeout(9F).
    448  *
    449  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    450  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    451  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    452  * returns the pointer to this mblk.
    453  *
    454  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
    455  * looks like a normal mblk without actual dblk attached to it.
    456  *
    457  * To optimize performance each tcp instance holds a small cache of timer
    458  * mblocks. In the current implementation it caches up to two timer mblocks per
    459  * tcp instance. The cache is preserved over tcp frees and is only freed when
    460  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    461  * timer processing happens on a corresponding squeue, the cache manipulation
    462  * does not require any locks. Experiments show that majority of timer mblocks
    463  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    464  *
    465  * The tcp_timeout() places a refhold on the connp instance which guarantees
    466  * that it will be present at the time the call-back function fires. The
    467  * tcp_timer_handler() drops the reference after calling the call-back, so the
    468  * call-back function does not need to manipulate the references explicitly.
    469  */
    470 
    471 typedef struct tcp_timer_s {	/* state handed between the three timer stages */
    472 	conn_t	*connp;	/* connection the timer was started for */
    473 	void 	(*tcpt_proc)(void *);	/* the tcp call-back to call */
    474 	callout_id_t   tcpt_tid;	/* timeout id returned by timeout(9F) */
    475 } tcp_timer_t;
    476 
    477 static kmem_cache_t *tcp_timercache;	/* NOTE(review): presumably the kmem cache for the timer pseudo mblks -- confirm */
    478 kmem_cache_t	*tcp_sack_info_cache;	/* NOTE(review): presumably the kmem cache for SACK state -- confirm */
    479 
    480 /*
    481  * For scalability, we must not run a timer for every TCP connection
    482  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    483  * 4 minutes):
    484  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    485  *
    486  * This list is ordered by time, so you need only delete from the head
    487  * until you get to entries which aren't old enough to delete yet.
    488  * The list consists of only the detached TIME_WAIT connections.
    489  *
    490  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    491  * becomes detached TIME_WAIT (either by changing the state and already
    492  * being detached or the other way around). This means that the TIME_WAIT
    493  * state can be extended (up to doubled) if the connection doesn't become
    494  * detached for a long time.
    495  *
    496  * The list manipulations (including tcp_time_wait_next/prev)
    497  * are protected by the tcp_time_wait_lock. The content of the
    498  * detached TIME_WAIT connections is protected by the normal perimeters.
    499  *
    500  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    501  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    502  * and conn_netstack.
    503  * The tcp_t's that are added to tcp_free_list are disassociated and
    504  * have NULL tcp_tcps and conn_netstack pointers.
    505  */
    506 typedef struct tcp_squeue_priv_s {	/* per-squeue TIME_WAIT and free-list state */
    507 	kmutex_t	tcp_time_wait_lock;	/* protects the time wait list manipulations */
    508 	callout_id_t	tcp_time_wait_tid;	/* NOTE(review): presumably the id of the pending time wait timer -- confirm */
    509 	tcp_t		*tcp_time_wait_head;	/* head of the time-ordered list; deletion starts here */
    510 	tcp_t		*tcp_time_wait_tail;	/* tail of the time-ordered list */
    511 	tcp_t		*tcp_free_list;	/* tcp_t's disassociated from their tcp_stack_t */
    512 	uint_t		tcp_free_list_cnt;	/* NOTE(review): presumably the number of entries on tcp_free_list -- confirm */
    513 } tcp_squeue_priv_t;
    514 
    515 /*
    516  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    517  * Running it every 5 seconds seems to give the best results.
    518  */
    519 #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)	/* 5000000 usec = 5 seconds, expressed in clock ticks */
    520 
    521 /*
    522  * To prevent memory hog, limit the number of entries in tcp_free_list
    523  * to 1% of available memory / number of cpus
    524  */
    525 uint_t tcp_free_list_max_cnt = 0;	/* limit on tcp_free_list entries; NOTE(review): 0 presumably means "not computed yet" -- confirm */
    526 
    527 #define	TCP_XMIT_LOWATER	4096	/* NOTE(review): presumably the default transmit low water mark, in bytes -- confirm */
    528 #define	TCP_XMIT_HIWATER	49152	/* NOTE(review): presumably the default transmit high water mark, in bytes -- confirm */
    529 #define	TCP_RECV_LOWATER	2048	/* NOTE(review): presumably the default receive low water mark, in bytes -- confirm */
    530 #define	TCP_RECV_HIWATER	128000	/* NOTE(review): presumably the default receive high water mark, in bytes -- confirm */
    531 
    532 /*
    533  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    534  */
    535 #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    536 
    537 #define	TIDUSZ	4096	/* transport interface data unit (TIDU) size */
    538 
    539 /*
    540  * Bind hash list size and hash function.  The size has to be a power of 2
    541  * because TCP_BIND_HASH() masks the host-order port with (size - 1).
    542  */
    543 #define	TCP_BIND_FANOUT_SIZE	512
    544 #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    545 
    546 /*
    547  * Size of acceptor hash list.  It has to be a power of 2 for hashing.
    548  */
    549 #define	TCP_ACCEPTOR_FANOUT_SIZE		256
    550 
    551 #ifdef	_ILP32
    552 #define	TCP_ACCEPTOR_HASH(accid)					\
    553 		(((uint_t)(accid) >> 8) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))	/* 32-bit: drop the low 8 bits of accid, then mask */
    554 #else
    555 #define	TCP_ACCEPTOR_HASH(accid)					\
    556 		((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))	/* otherwise: mask the low bits of accid directly */
    557 #endif	/* _ILP32 */
    558 
    559 #define	IP_ADDR_CACHE_SIZE	2048	/* power of 2, so the hash below can mask with (size - 1) */
    560 #define	IP_ADDR_CACHE_HASH(faddr)					\
    561 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))	/* low bits of the host-order address */
    562 
    563 /*
    564  * If there is a limit set on the number of connections allowed per each
    565  * listener, the following struct is used to store that counter.  This needs
    566  * to be separated from the listener since the listener can go away before
    567  * all the connections are gone.  When the struct is allocated, tlc_cnt is set
    568  * to 1.  When the listener goes away, tlc_cnt is decremented  by one.  And
    569  * the last connection (or the listener) which decrements tlc_cnt to zero
    570  * frees the struct.
    571  *
    572  * tlc_max is the threshold value tcps_conn_listen_port.  It is set when the
    573  * tcp_listen_cnt_t is allocated.
    574  *
    575  * tlc_report_time stores the time when cmn_err() is called to report that the
    576  * max has been exceeded.  Report is done at most once every
    577  * TCP_TLC_REPORT_INTERVAL mins for a listener.
    578  *
    579  * tlc_drop stores the number of connection attempts dropped because the
    580  * limit has been reached.
    581  */
    582 typedef struct tcp_listen_cnt_s {
    583 	uint32_t	tlc_max;	/* connection limit; set when the struct is allocated */
    584 	uint32_t	tlc_cnt;	/* starts at 1; whoever decrements it to 0 frees the struct */
    585 	int64_t		tlc_report_time;	/* time when cmn_err() last reported the limit being exceeded */
    586 	uint32_t	tlc_drop;	/* connection attempts dropped because of the limit */
    587 } tcp_listen_cnt_t;
    588 
    589 #define	TCP_TLC_REPORT_INTERVAL	(1 * MINUTES)	/* minimum interval between limit-exceeded reports for a listener */
    590 
    591 #define	TCP_DECR_LISTEN_CNT(tcp)					\
    592 {									\
    593 	ASSERT((tcp)->tcp_listen_cnt->tlc_cnt > 0);			\
    594 	if (atomic_add_32_nv(&(tcp)->tcp_listen_cnt->tlc_cnt, -1) == 0) \
    595 		kmem_free((tcp)->tcp_listen_cnt, sizeof (tcp_listen_cnt_t)); \
    596 	(tcp)->tcp_listen_cnt = NULL;					\
    597 }	/* atomically drops tcp's count; frees the struct when it reaches 0; always clears tcp_listen_cnt.  Bare brace block that evaluates tcp several times. */
    598 
    599 /* Minimum number of connections per listener. */
    600 uint32_t tcp_min_conn_listener = 2;	/* NOTE(review): presumably a floor applied to the computed per-listener limit -- confirm */
    601 
    602 /*
    603  * Linked list struct to store listener connection limit configuration per
    604  * IP stack.
    605  */
    606 typedef struct tcp_listener_s {
    607 	in_port_t	tl_port;	/* listener port this entry configures */
    608 	uint32_t	tl_ratio;	/* NOTE(review): presumably the ratio used to derive the connection limit -- confirm */
    609 	list_node_t	tl_link;	/* linkage on the per-IP-stack configuration list */
    610 } tcp_listener_t;
    611 
    612 /*
    613  * The shift factor applied to tcp_mss to decide if the peer sends us a
    614  * valid initial receive window.  By default, if the peer receive window
    615  * is smaller than 1 MSS (shift factor is 0), it is considered as invalid.
    616  */
    617 uint32_t tcp_init_wnd_shft = 0;	/* 0: a peer window smaller than 1 MSS is considered invalid */
    618 
    619 /* Control whether TCP can enter defensive mode when under memory pressure. */
    620 boolean_t tcp_do_reclaim = B_TRUE;	/* B_TRUE: defensive mode is allowed */
    621 
    622 /*
    623  * When the system is under memory pressure, stack variable tcps_reclaim is
    624  * true, we shorten the connection timeout abort interval to tcp_early_abort
    625  * seconds.
    626  */
    627 uint32_t tcp_early_abort = 30;	/* in seconds */
    628 
    629 /*
    630  * TCP options struct returned from tcp_parse_options.
    631  */
    632 typedef struct tcp_opt_s {
    633 	uint32_t	tcp_opt_mss;	/* MSS option value */
    634 	uint32_t	tcp_opt_wscale;	/* window scale option value */
    635 	uint32_t	tcp_opt_ts_val;	/* timestamp option: value field */
    636 	uint32_t	tcp_opt_ts_ecr;	/* timestamp option: echo reply field */
    637 	tcp_t		*tcp;	/* NOTE(review): presumably the connection the options were parsed for -- confirm */
    638 } tcp_opt_t;
    639 
    640 /*
    641  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    642  */
    643 
    644 #ifdef _BIG_ENDIAN	/* same four bytes <NOP, NOP, TSTAMP, 10> packed per host byte order */
    645 #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    646 	(TCPOPT_TSTAMP << 8) | 10)	/* 10 is the option length byte (cf. TCPOPT_TSTAMP_LEN) */
    647 #else
    648 #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    649 	(TCPOPT_NOP << 8) | TCPOPT_NOP)	/* byte-swapped form of the same 32-bit word */
    650 #endif	/* _BIG_ENDIAN */
    651 
    652 /*
    653  * Flags returned from tcp_parse_options.
    654  */
    655 #define	TCP_OPT_MSS_PRESENT	1	/* distinct bit (0x01); NOTE(review): flags presumably OR'd together -- confirm */
    656 #define	TCP_OPT_WSCALE_PRESENT	2	/* 0x02 */
    657 #define	TCP_OPT_TSTAMP_PRESENT	4	/* 0x04 */
    658 #define	TCP_OPT_SACK_OK_PRESENT	8	/* 0x08 */
    659 #define	TCP_OPT_SACK_PRESENT	16	/* 0x10 */
    660 
    661 /* TCP option length */
    662 #define	TCPOPT_NOP_LEN		1
    663 #define	TCPOPT_MAXSEG_LEN	4
    664 #define	TCPOPT_WS_LEN		3
    665 #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)	/* 4; NOTE(review): presumably WS option plus one pad byte -- confirm */
    666 #define	TCPOPT_TSTAMP_LEN	10
    667 #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)	/* 12; NOTE(review): presumably TSTAMP option plus two pad bytes -- confirm */
    668 #define	TCPOPT_SACK_OK_LEN	2
    669 #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)	/* 4; NOTE(review): presumably SACK_OK option plus two pad bytes -- confirm */
    670 #define	TCPOPT_REAL_SACK_LEN	4
    671 #define	TCPOPT_MAX_SACK_LEN	36
    672 #define	TCPOPT_HEADER_LEN	2
    673 
    674 /* TCP cwnd burst factor. */
    675 #define	TCP_CWND_INFINITE	65535	/* NOTE(review): presumably "no burst limit" -- confirm */
    676 #define	TCP_CWND_SS		3	/* NOTE(review): presumably the burst factor during slow start -- confirm */
    677 #define	TCP_CWND_NORMAL		5	/* NOTE(review): presumably the burst factor otherwise -- confirm */
    678 
    679 /* Maximum TCP initial cwin (start/restart). */
    680 #define	TCP_MAX_INIT_CWND	8	/* NOTE(review): presumably counted in MSS-sized segments -- confirm */
    681 
    682 /*
    683  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    684  * either tcp_slow_start_initial or tcp_slow_start_after idle
    685  * depending on the caller.  If the upper layer has not used the
    686  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    687  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    688  * If the upper layer has changed set the tcp_init_cwnd, just use
    689  * it to calculate the tcp_cwnd.
    690  */
    691 #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    692 {									\
    693 	if ((tcp)->tcp_init_cwnd == 0) {				\
    694 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    695 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    696 	} else {							\
    697 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    698 	}								\
    699 	tcp->tcp_cwnd_cnt = 0;						\
    700 }
    701 
    702 /* TCP Timer control structure */
    703 typedef struct tcpt_s {	/* pairs a routine with the tcp_t to pass to it */
    704 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    705 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    706 } tcpt_t;
    707 
    708 /*
    709  * Functions called directly via squeue having a prototype of edesc_t.
    710  */
    711 void		tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
    712     ip_recv_attr_t *ira);
    713 static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
    714     ip_recv_attr_t *dummy);
    715 void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
    716     ip_recv_attr_t *dummy);
    717 static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
    718     ip_recv_attr_t *dummy);
    719 static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
    720     ip_recv_attr_t *dummy);
    721 void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
    722     ip_recv_attr_t *ira);
    723 static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2,
    724     ip_recv_attr_t *dummy);
    725 void		tcp_output(void *arg, mblk_t *mp, void *arg2,
    726     ip_recv_attr_t *dummy);
    727 void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
    728     ip_recv_attr_t *dummy);
    729 static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
    730     ip_recv_attr_t *dummy);
    731 static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
    732     ip_recv_attr_t *dummy);
    733 static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
    734     ip_recv_attr_t *dummy);
    735 static void	tcp_send_synack(void *arg, mblk_t *mp, void *arg2,
    736     ip_recv_attr_t *dummy);
    737 
    738 
    739 /* Prototype for TCP functions */
    740 static void	tcp_random_init(void);
    741 int		tcp_random(void);
    742 static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
    743 static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    744 		    tcp_t *eager);
    745 static int	tcp_set_destination(tcp_t *tcp);
    746 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    747     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    748     boolean_t user_specified);
    749 static void	tcp_closei_local(tcp_t *tcp);
    750 static void	tcp_close_detached(tcp_t *tcp);
    751 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
    752 		    mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
    753 static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
    754 static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
    755 		    in_port_t dstport, uint_t srcid);
    756 static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
    757 		    in_port_t dstport, uint32_t flowinfo,
    758 		    uint_t srcid, uint32_t scope_id);
    759 static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    760 static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    761 static char	*tcp_display(tcp_t *tcp, char *, char);
    762 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    763 static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    764 static void	tcp_eager_unlink(tcp_t *tcp);
    765 static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    766 		    int unixerr);
    767 static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    768 		    int tlierr, int unixerr);
    769 static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    770 		    cred_t *cr);
    771 static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    772 		    char *value, caddr_t cp, cred_t *cr);
    773 static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    774 		    char *value, caddr_t cp, cred_t *cr);
    775 static int	tcp_tpistate(tcp_t *tcp);
    776 static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    777     int caller_holds_lock);
    778 static void	tcp_bind_hash_remove(tcp_t *tcp);
    779 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    780 void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    781 static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    782 static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    783 static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    784 static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    785 static void	tcp_init_values(tcp_t *tcp);
    786 static void	tcp_ip_notify(tcp_t *tcp);
    787 static void	tcp_iss_init(tcp_t *tcp);
    788 static void	tcp_keepalive_killer(void *arg);
    789 static int	tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
    790 static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
    791 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    792 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    793 static boolean_t tcp_allow_connopt_set(int level, int name);
    794 int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    795 static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    796 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    797     tcp_stack_t *);
    798 static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    799 		    caddr_t cp, cred_t *cr);
    800 static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    801 		    caddr_t cp, cred_t *cr);
    802 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    803 static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    804 		    caddr_t cp, cred_t *cr);
    805 static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    806 static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
    807 static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    808 static void	tcp_reass_timer(void *arg);
    809 static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    810 static void	tcp_reinit(tcp_t *tcp);
    811 static void	tcp_reinit_values(tcp_t *tcp);
    812 
    813 static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
    814 static uint_t	tcp_rcv_drain(tcp_t *tcp);
    815 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    816 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    817 static void	tcp_ss_rexmit(tcp_t *tcp);
    818 static mblk_t	*tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
    819     ip_recv_attr_t *);
    820 static void	tcp_process_options(tcp_t *, tcpha_t *);
    821 static void	tcp_rsrv(queue_t *q);
    822 static int	tcp_snmp_state(tcp_t *tcp);
    823 static void	tcp_timer(void *arg);
    824 static void	tcp_timer_callback(void *);
    825 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    826     boolean_t random);
    827 static in_port_t tcp_get_next_priv_port(const tcp_t *);
    828 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    829 static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
    830 void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
    831 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    832 static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    833 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    834 static int	tcp_send(tcp_t *tcp, const int mss,
    835 		    const int total_hdr_len, const int tcp_hdr_len,
    836 		    const int num_sack_blk, int *usable, uint_t *snxt,
    837 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
    838 static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    839 		    int num_sack_blk);
    840 static void	tcp_wsrv(queue_t *q);
    841 static int	tcp_xmit_end(tcp_t *tcp);
    842 static void	tcp_ack_timer(void *arg);
    843 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    844 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    845 		    uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
    846 		    ip_stack_t *, conn_t *);
    847 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    848 		    uint32_t ack, int ctl);
    849 static void	tcp_set_rto(tcp_t *, time_t);
    850 static void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
    851 static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
    852 static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    853     ip_recv_attr_t *);
    854 static int	tcp_build_hdrs(tcp_t *);
    855 static void	tcp_time_wait_append(tcp_t *tcp);
    856 static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    857     uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
    858     ip_recv_attr_t *ira);
    859 boolean_t	tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
    860 static boolean_t tcp_zcopy_check(tcp_t *);
    861 static void	tcp_zcopy_notify(tcp_t *);
    862 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
    863 static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
    864 static void	tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
    865 static void	tcp_update_zcopy(tcp_t *tcp);
    866 static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    867     ixa_notify_arg_t);
    868 static void	tcp_rexmit_after_error(tcp_t *tcp);
    869 static void	tcp_send_data(tcp_t *, mblk_t *);
    870 extern mblk_t	*tcp_timermp_alloc(int);
    871 extern void	tcp_timermp_free(tcp_t *);
    872 static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
    873 static void	tcp_stop_lingering(tcp_t *tcp);
    874 static void	tcp_close_linger_timeout(void *arg);
    875 static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
    876 static void	tcp_stack_fini(netstackid_t stackid, void *arg);
    877 static void	*tcp_g_kstat_init(tcp_g_stat_t *);
    878 static void	tcp_g_kstat_fini(kstat_t *);
    879 static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
    880 static void	tcp_kstat_fini(netstackid_t, kstat_t *);
    881 static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
    882 static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
    883 static int	tcp_kstat_update(kstat_t *kp, int rw);
    884 static mblk_t	*tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    885     ip_recv_attr_t *ira);
    886 static mblk_t	*tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    887     ip_recv_attr_t *ira);
    888 static int	tcp_squeue_switch(int);
    889 
    890 static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
    891 static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
    892 static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
    893 static int	tcp_tpi_close(queue_t *, int);
    894 static int	tcp_tpi_close_accept(queue_t *);
    895 
    896 static void	tcp_squeue_add(squeue_t *);
    897 static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
    898 
    899 extern void	tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
    900 
    901 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
    902 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
    903     ip_recv_attr_t *dummy);
    904 
    905 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
    906 	    sock_upper_handle_t, cred_t *);
    907 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
    908 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    909     boolean_t);
    910 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    911     cred_t *, pid_t);
    912 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    913     boolean_t);
    914 static int tcp_do_unbind(conn_t *);
    915 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    916     boolean_t);
    917 
    918 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
    919 
    920 static uint32_t tcp_find_listener_conf(tcp_stack_t *, in_port_t);
    921 static int tcp_listener_conf_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    922 static int tcp_listener_conf_add(queue_t *, mblk_t *, char *, caddr_t,
    923     cred_t *);
    924 static int tcp_listener_conf_del(queue_t *, mblk_t *, char *, caddr_t,
    925     cred_t *);
    926 static void tcp_listener_conf_cleanup(tcp_stack_t *);
    927 
    928 /*
    929  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
    930  *
    931  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
    932  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
    933  * (defined in tcp.h) needs to be filled in and passed into the kernel
    934  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
    935  * structure contains the four-tuple of a TCP connection and a range of TCP
    936  * states (specified by ac_start and ac_end). The use of wildcard addresses
    937  * and ports is allowed. Connections with a matching four tuple and a state
    938  * within the specified range will be aborted. The valid states for the
    939  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
    940  * inclusive.
    941  *
    942  * An application which has its connection aborted by this ioctl will receive
    943  * an error that is dependent on the connection state at the time of the abort.
    944  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
    945  * though a RST packet has been received.  If the connection state is equal to
    946  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
    947  * and all resources associated with the connection will be freed.
    948  */
    949 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
    950 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
    951 static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
    952     ip_recv_attr_t *dummy);
    953 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
    954 static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
    955 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    956     boolean_t, tcp_stack_t *);
    957 
    958 static struct module_info tcp_rinfo =  {
    959 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
    960 };
    961 
    962 static struct module_info tcp_winfo =  {
    963 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
    964 };
    965 
    966 /*
    967  * Entry points for TCP as a device. The normal case which supports
    968  * the TCP functionality.
    969  * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
    970  */
    971 struct qinit tcp_rinitv4 = {
    972 	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
    973 };
    974 
    975 struct qinit tcp_rinitv6 = {
    976 	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
    977 };
    978 
    979 struct qinit tcp_winit = {
    980 	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    981 };
    982 
    983 /* Initial entry point for TCP in socket mode. */
    984 struct qinit tcp_sock_winit = {
    985 	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    986 };
    987 
    988 /* TCP entry point during fallback */
    989 struct qinit tcp_fallback_sock_winit = {
    990 	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
    991 };
    992 
    993 /*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
    995  * an accept. Avoid allocating data structures since eager has already
    996  * been created.
    997  */
    998 struct qinit tcp_acceptor_rinit = {
    999 	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
   1000 };
   1001 
   1002 struct qinit tcp_acceptor_winit = {
   1003 	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
   1004 };
   1005 
   1006 /* For AF_INET aka /dev/tcp */
   1007 struct streamtab tcpinfov4 = {
   1008 	&tcp_rinitv4, &tcp_winit
   1009 };
   1010 
   1011 /* For AF_INET6 aka /dev/tcp6 */
   1012 struct streamtab tcpinfov6 = {
   1013 	&tcp_rinitv6, &tcp_winit
   1014 };
   1015 
   1016 sock_downcalls_t sock_tcp_downcalls;
   1017 
/* Settable only in /etc/system. Move to ndd? */
   1019 boolean_t tcp_icmp_source_quench = B_FALSE;
   1020 
   1021 /*
   1022  * Following assumes TPI alignment requirements stay along 32 bit
   1023  * boundaries
   1024  */
   1025 #define	ROUNDUP32(x) \
   1026 	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
   1027 
   1028 /* Template for response to info request. */
   1029 static struct T_info_ack tcp_g_t_info_ack = {
   1030 	T_INFO_ACK,		/* PRIM_type */
   1031 	0,			/* TSDU_size */
   1032 	T_INFINITE,		/* ETSDU_size */
   1033 	T_INVALID,		/* CDATA_size */
   1034 	T_INVALID,		/* DDATA_size */
   1035 	sizeof (sin_t),		/* ADDR_size */
   1036 	0,			/* OPT_size - not initialized here */
   1037 	TIDUSZ,			/* TIDU_size */
   1038 	T_COTS_ORD,		/* SERV_type */
   1039 	TCPS_IDLE,		/* CURRENT_state */
   1040 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
   1041 };
   1042 
   1043 static struct T_info_ack tcp_g_t_info_ack_v6 = {
   1044 	T_INFO_ACK,		/* PRIM_type */
   1045 	0,			/* TSDU_size */
   1046 	T_INFINITE,		/* ETSDU_size */
   1047 	T_INVALID,		/* CDATA_size */
   1048 	T_INVALID,		/* DDATA_size */
   1049 	sizeof (sin6_t),	/* ADDR_size */
   1050 	0,			/* OPT_size - not initialized here */
   1051 	TIDUSZ,		/* TIDU_size */
   1052 	T_COTS_ORD,		/* SERV_type */
   1053 	TCPS_IDLE,		/* CURRENT_state */
   1054 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
   1055 };
   1056 
   1057 #define	MS	1L
   1058 #define	SECONDS	(1000 * MS)
   1059 #define	MINUTES	(60 * SECONDS)
   1060 #define	HOURS	(60 * MINUTES)
   1061 #define	DAYS	(24 * HOURS)
   1062 
   1063 #define	PARAM_MAX (~(uint32_t)0)
   1064 
   1065 /* Max size IP datagram is 64k - 1 */
   1066 #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
   1067 #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
   1068 /* Max of the above */
   1069 #define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
   1070 
   1071 /* Largest TCP port number */
   1072 #define	TCP_MAX_PORT	(64 * 1024 - 1)
   1073 
   1074 /*
   1075  * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
   1076  * layer header.  It has to be a multiple of 4.
   1077  */
   1078 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
   1079 #define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
   1080 
   1081 #define	MB	(1024 * 1024)
   1082 
   1083 /*
   1084  * All of these are alterable, within the min/max values given, at run time.
 * Note that the TCP spec calls for a four minute TIME-WAIT interval, but
 * the default value of "tcp_time_wait_interval" in the table below is
 * one minute.
   1087  */
   1088 /* BEGIN CSTYLED */
   1089 static tcpparam_t	lcl_tcp_param_arr[] = {
   1090  /*min		max		value		name */
   1091  { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
   1092  { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
   1093  { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
   1094  { 1,		1024,		1,		"tcp_conn_req_min" },
   1095  { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
   1096  { 128,		(1<<30),	1*MB,		"tcp_cwnd_max" },
   1097  { 0,		10,		0,		"tcp_debug" },
   1098  { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
   1099  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
   1100  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
   1101  { 500*MS,	PARAM_MAX,	5*MINUTES,	"tcp_ip_abort_interval"},
   1102  { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
   1103  { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
   1104  { 1,		255,		64,		"tcp_ipv4_ttl"},
   1105  { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
   1106  { 0,		100,		10,		"tcp_maxpsz_multiplier" },
   1107  { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
   1108  { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
   1109  { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
   1110  { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
   1111  { 1*MS,	20*SECONDS,	1*SECONDS,	"tcp_rexmit_interval_initial"},
   1112  { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
   1113  { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
   1114  { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
   1115  { 0,		16,		0,		"tcp_snd_lowat_fraction" },
   1116  { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
   1117  { 0,		1,		0,		"tcp_ignore_path_mtu" },
   1118  { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
   1119  { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
   1120  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
   1121  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
   1122  { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
   1123  { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
   1124  { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
   1125  { 8192,	(1<<30),	1*MB,		"tcp_max_buf"},
   1126 /*
   1127  * Question:  What default value should I set for tcp_strong_iss?
   1128  */
   1129  { 0,		2,		1,		"tcp_strong_iss"},
   1130  { 0,		65536,		20,		"tcp_rtt_updates"},
   1131  { 0,		1,		1,		"tcp_wscale_always"},
   1132  { 0,		1,		0,		"tcp_tstamp_always"},
   1133  { 0,		1,		1,		"tcp_tstamp_if_wscale"},
   1134  { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
   1135  { 0,		16,		2,		"tcp_deferred_acks_max"},
   1136  { 1,		16384,		4,		"tcp_slow_start_after_idle"},
   1137  { 1,		4,		4,		"tcp_slow_start_initial"},
   1138  { 0,		2,		2,		"tcp_sack_permitted"},
   1139  { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
   1140  { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
   1141  { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
   1142  { 0,		1,		0,		"tcp_rev_src_routes"},
   1143  { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
   1144  { 0,		16,		8,		"tcp_local_dacks_max"},
   1145  { 0,		2,		1,		"tcp_ecn_permitted"},
   1146  { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
   1147  { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
   1148  { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
   1149  { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
   1150  { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
   1151  { 0,		1,		0,		"tcp_dev_flow_ctl"},
   1152  { 0,		PARAM_MAX,	100*SECONDS,	"tcp_reass_timeout"}
   1153 };
   1154 /* END CSTYLED */
   1155 
   1156 /* Round up the value to the nearest mss. */
   1157 #define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
   1158 
   1159 /*
   1160  * Set ECN capable transport (ECT) code point in IP header.
   1161  *
   1162  * Note that there are 2 ECT code points '01' and '10', which are called
   1163  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
   1164  * point ECT(0) for TCP as described in RFC 2481.
   1165  */
   1166 #define	SET_ECT(tcp, iph) \
   1167 	if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
   1168 		/* We need to clear the code point first. */ \
   1169 		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
   1170 		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
   1171 	} else { \
   1172 		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
   1173 		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
   1174 	}
   1175 
   1176 /*
   1177  * The format argument to pass to tcp_display().
   1178  * DISP_PORT_ONLY means that the returned string has only port info.
   1179  * DISP_ADDR_AND_PORT means that the returned string also contains the
   1180  * remote and local IP address.
   1181  */
   1182 #define	DISP_PORT_ONLY		1
   1183 #define	DISP_ADDR_AND_PORT	2
   1184 
   1185 #define	IS_VMLOANED_MBLK(mp) \
   1186 	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
   1187 
   1188 uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */
   1189 
   1190 /*
   1191  * Forces all connections to obey the value of the tcps_maxpsz_multiplier
   1192  * tunable settable via NDD.  Otherwise, the per-connection behavior is
   1193  * determined dynamically during tcp_set_destination(), which is the default.
   1194  */
   1195 boolean_t tcp_static_maxpsz = B_FALSE;
   1196 
/* Settable in /etc/system */
   1198 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
   1199 uint32_t tcp_random_anon_port = 1;
   1200 
   1201 /*
   1202  * To reach to an eager in Q0 which can be dropped due to an incoming
   1203  * new SYN request when Q0 is full, a new doubly linked list is
   1204  * introduced. This list allows to select an eager from Q0 in O(1) time.
   1205  * This is needed to avoid spending too much time walking through the
   1206  * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
   1207  * this new list has to be a member of Q0.
   1208  * This list is headed by listener's tcp_t. When the list is empty,
   1209  * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
   1210  * of listener's tcp_t point to listener's tcp_t itself.
   1211  *
   1212  * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
   1213  * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
   1214  * These macros do not affect the eager's membership to Q0.
   1215  */
   1216 
   1217 
   1218 #define	MAKE_DROPPABLE(listener, eager)					\
   1219 	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
   1220 		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
   1221 		    = (eager);						\
   1222 		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
   1223 		(eager)->tcp_eager_next_drop_q0 =			\
   1224 		    (listener)->tcp_eager_next_drop_q0;			\
   1225 		(listener)->tcp_eager_next_drop_q0 = (eager);		\
   1226 	}
   1227 
   1228 #define	MAKE_UNDROPPABLE(eager)						\
   1229 	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
   1230 		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
   1231 		    = (eager)->tcp_eager_prev_drop_q0;			\
   1232 		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
   1233 		    = (eager)->tcp_eager_next_drop_q0;			\
   1234 		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
   1235 		(eager)->tcp_eager_next_drop_q0 = NULL;			\
   1236 	}
   1237 
   1238 /*
   1239  * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
   1240  * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
   1241  * data, TCP will not respond with an ACK.  RFC 793 requires that
   1242  * TCP responds with an ACK for such a bogus ACK.  By not following
   1243  * the RFC, we prevent TCP from getting into an ACK storm if somehow
   1244  * an attacker successfully spoofs an acceptable segment to our
   1245  * peer; or when our peer is "confused."
   1246  */
   1247 uint32_t tcp_drop_ack_unsent_cnt = 10;
   1248 
   1249 /*
   1250  * Hook functions to enable cluster networking
   1251  * On non-clustered systems these vectors must always be NULL.
   1252  */
   1253 
   1254 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
   1255 			    sa_family_t addr_family, uint8_t *laddrp,
   1256 			    in_port_t lport, void *args) = NULL;
   1257 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
   1258 			    sa_family_t addr_family, uint8_t *laddrp,
   1259 			    in_port_t lport, void *args) = NULL;
   1260 
   1261 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
   1262 			    boolean_t is_outgoing,
   1263 			    sa_family_t addr_family,
   1264 			    uint8_t *laddrp, in_port_t lport,
   1265 			    uint8_t *faddrp, in_port_t fport,
   1266 			    void *args) = NULL;
   1267 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
   1268 			    sa_family_t addr_family, uint8_t *laddrp,
   1269 			    in_port_t lport, uint8_t *faddrp,
   1270 			    in_port_t fport, void *args) = NULL;
   1271 
   1272 
   1273 /*
   1274  * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err)
   1275  */
   1276 #define	CL_INET_CONNECT(connp, is_outgoing, err) {		\
   1277 	(err) = 0;						\
   1278 	if (cl_inet_connect2 != NULL) {				\
   1279 		/*						\
   1280 		 * Running in cluster mode - register active connection	\
   1281 		 * information						\
   1282 		 */							\
   1283 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1284 			if ((connp)->conn_laddr_v4 != 0) {		\
   1285 				(err) = (*cl_inet_connect2)(		\
   1286 				    (connp)->conn_netstack->netstack_stackid,\
   1287 				    IPPROTO_TCP, is_outgoing, AF_INET,	\
   1288 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1289 				    (in_port_t)(connp)->conn_lport,	\
   1290 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1291 				    (in_port_t)(connp)->conn_fport, NULL); \
   1292 			}						\
   1293 		} else {						\
   1294 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1295 			    &(connp)->conn_laddr_v6)) {			\
   1296 				(err) = (*cl_inet_connect2)(		\
   1297 				    (connp)->conn_netstack->netstack_stackid,\
   1298 				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
   1299 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1300 				    (in_port_t)(connp)->conn_lport,	\
   1301 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1302 				    (in_port_t)(connp)->conn_fport, NULL); \
   1303 			}						\
   1304 		}							\
   1305 	}								\
   1306 }
   1307 
   1308 #define	CL_INET_DISCONNECT(connp)	{				\
   1309 	if (cl_inet_disconnect != NULL) {				\
   1310 		/*							\
   1311 		 * Running in cluster mode - deregister active		\
   1312 		 * connection information				\
   1313 		 */							\
   1314 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1315 			if ((connp)->conn_laddr_v4 != 0) {		\
   1316 				(*cl_inet_disconnect)(			\
   1317 				    (connp)->conn_netstack->netstack_stackid,\
   1318 				    IPPROTO_TCP, AF_INET,		\
   1319 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1320 				    (in_port_t)(connp)->conn_lport,	\
   1321 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1322 				    (in_port_t)(connp)->conn_fport, NULL); \
   1323 			}						\
   1324 		} else {						\
   1325 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1326 			    &(connp)->conn_laddr_v6)) {			\
   1327 				(*cl_inet_disconnect)(			\
   1328 				    (connp)->conn_netstack->netstack_stackid,\
   1329 				    IPPROTO_TCP, AF_INET6,		\
   1330 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1331 				    (in_port_t)(connp)->conn_lport,	\
   1332 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1333 				    (in_port_t)(connp)->conn_fport, NULL); \
   1334 			}						\
   1335 		}							\
   1336 	}								\
   1337 }
   1338 
   1339 /*
   1340  * Steps to do when a tcp_t moves to TIME-WAIT state.
   1341  *
   1342  * This connection is done, we don't need to account for it.  Decrement
   1343  * the listener connection counter if needed.
   1344  *
   1345  * Unconditionally clear the exclusive binding bit so this TIME-WAIT
   1346  * connection won't interfere with new ones.
   1347  *
   1348  * Start the TIME-WAIT timer.  If upper layer has not closed the connection,
   1349  * the timer is handled within the context of this tcp_t.  When the timer
   1350  * fires, tcp_clean_death() is called.  If upper layer closes the connection
   1351  * during this period, tcp_time_wait_append() will be called to add this
   1352  * tcp_t to the global TIME-WAIT list.  Note that this means that the
   1353  * actual wait time in TIME-WAIT state will be longer than the
   1354  * tcps_time_wait_interval since the period before upper layer closes the
   1355  * connection is not accounted for when tcp_time_wait_append() is called.
   1356  *
   1357  * If uppser layer has closed the connection, call tcp_time_wait_append()
   1358  * directly.
   1359  */
   1360 #define	SET_TIME_WAIT(tcps, tcp, connp)				\
   1361 {								\
   1362 	(tcp)->tcp_state = TCPS_TIME_WAIT;			\
   1363 	if ((tcp)->tcp_listen_cnt != NULL)			\
   1364 		TCP_DECR_LISTEN_CNT(tcp);			\
   1365 	(connp)->conn_exclbind = 0;				\
   1366 	if (!TCP_IS_DETACHED(tcp)) {				\
   1367 		TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \
   1368 	} else {						\
   1369 		tcp_time_wait_append(tcp);			\
   1370 		TCP_DBGSTAT(tcps, tcp_rput_time_wait);		\
   1371 	}							\
   1372 }
   1373 
   1374 /*
   1375  * Cluster networking hook for traversing current connection list.
   1376  * This routine is used to extract the current list of live connections
   1377  * which must continue to be dispatched to this node.
        *
        * The caller names the stack by id and supplies a callback that is
        * handed a cl_tcp_info_t together with the caller's opaque arg.
   1378  */
   1379 int cl_tcp_walk_list(netstackid_t stack_id,
   1380     int (*callback)(cl_tcp_info_t *, void *), void *arg);
   1381 
        /*
         * Forward declaration of the file-local variant that takes the
         * tcp_stack_t directly instead of a stack id.
         */
   1382 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
   1383     void *arg, tcp_stack_t *tcps);
   1384 
   1385 static void
   1386 tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
   1387 {
   1388 	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
   1389 
   1390 	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
   1391 		conn_t *connp = tcp->tcp_connp;
   1392 		struct sock_proto_props sopp;
   1393 
   1394 		/*
   1395 		 * only increase rcvthresh upto default_threshold
   1396 		 */
   1397 		if (new_rcvthresh > default_threshold)
   1398 			new_rcvthresh = default_threshold;
   1399 
   1400 		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
   1401 		sopp.sopp_rcvthresh = new_rcvthresh;
   1402 
   1403 		(*connp->conn_upcalls->su_set_proto_props)
   1404 		    (connp->conn_upper_handle, &sopp);
   1405 	}
   1406 }
   1407 /*
   1408  * Figure out the value of window scale opton.  Note that the rwnd is
   1409  * ASSUMED to be rounded up to the nearest MSS before the calculation.
   1410  * We cannot find the scale value and then do a round up of tcp_rwnd
   1411  * because the scale value may not be correct after that.
   1412  *
   1413  * Set the compiler flag to make this function inline.
   1414  */
   1415 static void
   1416 tcp_set_ws_value(tcp_t *tcp)
   1417 {
   1418 	int i;
   1419 	uint32_t rwnd = tcp->tcp_rwnd;
   1420 
   1421 	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
   1422 	    i++, rwnd >>= 1)
   1423 		;
   1424 	tcp->tcp_rcv_ws = i;
   1425 }
   1426 
   1427 /*
   1428  * Remove a connection from the list of detached TIME_WAIT connections.
   1429  * It returns B_FALSE if it can't remove the connection from the list
   1430  * as the connection has already been removed from the list due to an
   1431  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
   1432  */
   1433 static boolean_t
   1434 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
   1435 {
   1436 	boolean_t	locked = B_FALSE;
   1437 
   1438 	if (tcp_time_wait == NULL) {
   1439 		tcp_time_wait = *((tcp_squeue_priv_t **)
   1440 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
   1441 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1442 		locked = B_TRUE;
   1443 	} else {
   1444 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
   1445 	}
   1446 
   1447 	if (tcp->tcp_time_wait_expire == 0) {
   1448 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1449 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1450 		if (locked)
   1451 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1452 		return (B_FALSE);
   1453 	}
   1454 	ASSERT(TCP_IS_DETACHED(tcp));
   1455 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1456 
   1457 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
   1458 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1459 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
   1460 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
   1461 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
   1462 			    NULL;
   1463 		} else {
   1464 			tcp_time_wait->tcp_time_wait_tail = NULL;
   1465 		}
   1466 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
   1467 		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
   1468 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1469 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
   1470 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1471 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
   1472 	} else {
   1473 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
   1474 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
   1475 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
   1476 		    tcp->tcp_time_wait_next;
   1477 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
   1478 		    tcp->tcp_time_wait_prev;
   1479 	}
   1480 	tcp->tcp_time_wait_next = NULL;
   1481 	tcp->tcp_time_wait_prev = NULL;
   1482 	tcp->tcp_time_wait_expire = 0;
   1483 
   1484 	if (locked)
   1485 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1486 	return (B_TRUE);
   1487 }
   1488 
   1489 /*
   1490  * Add a connection to the list of detached TIME_WAIT connections
   1491  * and set its time to expire.
   1492  */
   1493 static void
   1494 tcp_time_wait_append(tcp_t *tcp)
   1495 {
   1496 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1497 	tcp_squeue_priv_t *tcp_time_wait =
   1498 	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
   1499 	    SQPRIVATE_TCP));
   1500 
   1501 	tcp_timers_stop(tcp);
   1502 
   1503 	/* Freed above */
   1504 	ASSERT(tcp->tcp_timer_tid == 0);
   1505 	ASSERT(tcp->tcp_ack_tid == 0);
   1506 
   1507 	/* must have happened at the time of detaching the tcp */
   1508 	ASSERT(tcp->tcp_ptpahn == NULL);
   1509 	ASSERT(tcp->tcp_flow_stopped == 0);
   1510 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1511 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1512 	ASSERT(tcp->tcp_time_wait_expire == NULL);
   1513 	ASSERT(tcp->tcp_listener == NULL);
   1514 
   1515 	tcp->tcp_time_wait_expire = ddi_get_lbolt();
   1516 	/*
   1517 	 * The value computed below in tcp->tcp_time_wait_expire may
   1518 	 * appear negative or wrap around. That is ok since our
   1519 	 * interest is only in the difference between the current lbolt
   1520 	 * value and tcp->tcp_time_wait_expire. But the value should not
   1521 	 * be zero, since it means the tcp is not in the TIME_WAIT list.
   1522 	 * The corresponding comparison in tcp_time_wait_collector() uses
   1523 	 * modular arithmetic.
   1524 	 */
   1525 	tcp->tcp_time_wait_expire +=
   1526 	    drv_usectohz(tcps->tcps_time_wait_interval * 1000);
   1527 	if (tcp->tcp_time_wait_expire == 0)
   1528 		tcp->tcp_time_wait_expire = 1;
   1529 
   1530 	ASSERT(TCP_IS_DETACHED(tcp));
   1531 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1532 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1533 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1534 	TCP_DBGSTAT(tcps, tcp_time_wait);
   1535 
   1536 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1537 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
   1538 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
   1539 		tcp_time_wait->tcp_time_wait_head = tcp;
   1540 	} else {
   1541 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1542 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
   1543 		    TCPS_TIME_WAIT);
   1544 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
   1545 		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
   1546 	}
   1547 	tcp_time_wait->tcp_time_wait_tail = tcp;
   1548 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1549 }
   1550 
   1551 /* ARGSUSED */
   1552 void
   1553 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   1554 {
   1555 	conn_t	*connp = (conn_t *)arg;
   1556 	tcp_t	*tcp = connp->conn_tcp;
   1557 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1558 
   1559 	ASSERT(tcp != NULL);
   1560 	if (tcp->tcp_state == TCPS_CLOSED) {
   1561 		return;
   1562 	}
   1563 
   1564 	ASSERT((connp->conn_family == AF_INET &&
   1565 	    connp->conn_ipversion == IPV4_VERSION) ||
   1566 	    (connp->conn_family == AF_INET6 &&
   1567 	    (connp->conn_ipversion == IPV4_VERSION ||
   1568 	    connp->conn_ipversion == IPV6_VERSION)));
   1569 	ASSERT(!tcp->tcp_listener);
   1570 
   1571 	TCP_STAT(tcps, tcp_time_wait_reap);
   1572 	ASSERT(TCP_IS_DETACHED(tcp));
   1573 
   1574 	/*
   1575 	 * Because they have no upstream client to rebind or tcp_close()
   1576 	 * them later, we axe the connection here and now.
   1577 	 */
   1578 	tcp_close_detached(tcp);
   1579 }
   1580 
   1581 /*
   1582  * Remove cached/latched IPsec references.
   1583  */
   1584 void
   1585 tcp_ipsec_cleanup(tcp_t *tcp)
   1586 {
   1587 	conn_t		*connp = tcp->tcp_connp;
   1588 
   1589 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1590 
   1591 	if (connp->conn_latch != NULL) {
   1592 		IPLATCH_REFRELE(connp->conn_latch);
   1593 		connp->conn_latch = NULL;
   1594 	}
   1595 	if (connp->conn_latch_in_policy != NULL) {
   1596 		IPPOL_REFRELE(connp->conn_latch_in_policy);
   1597 		connp->conn_latch_in_policy = NULL;
   1598 	}
   1599 	if (connp->conn_latch_in_action != NULL) {
   1600 		IPACT_REFRELE(connp->conn_latch_in_action);
   1601 		connp->conn_latch_in_action = NULL;
   1602 	}
   1603 	if (connp->conn_policy != NULL) {
   1604 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
   1605 		connp->conn_policy = NULL;
   1606 	}
   1607 }
   1608 
   1609 /*
   1610  * Cleanup before placing on free list.
   1611  * Disassociate from the netstack/tcp_stack_t since the freelist
   1612  * is per squeue and not per netstack.
        *
        * The tcp_t is zeroed; only tcp_timercache, tcp_sack_info, tcp_rsrv_mp
        * and the tcp_connp back pointer are carried across the bzero().  The
        * conn_t is left with conn_flags == IPCL_TCPCONN, conn_state_flags ==
        * CONN_INCIPIENT and a NULL conn_netstack, and netstack_rele() is
        * called on the netstack the tcp_t belonged to.  On return exactly one
        * reference is expected to remain on the conn_t (asserted).
   1613  */
   1614 void
   1615 tcp_cleanup(tcp_t *tcp)
   1616 {
   1617 	mblk_t		*mp;
   1618 	tcp_sack_info_t	*tcp_sack_info;
   1619 	conn_t		*connp = tcp->tcp_connp;
   1620 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1621 	netstack_t	*ns = tcps->tcps_netstack;
   1622 	mblk_t		*tcp_rsrv_mp;
   1623 
   1624 	tcp_bind_hash_remove(tcp);
   1625 
   1626 	/* Cleanup that which needs the netstack first */
   1627 	tcp_ipsec_cleanup(tcp);
   1628 	ixa_cleanup(connp->conn_ixa);
   1629 
        	/* Free the header template buffer and every pointer into it. */
   1630 	if (connp->conn_ht_iphc != NULL) {
   1631 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   1632 		connp->conn_ht_iphc = NULL;
   1633 		connp->conn_ht_iphc_allocated = 0;
   1634 		connp->conn_ht_iphc_len = 0;
   1635 		connp->conn_ht_ulp = NULL;
   1636 		connp->conn_ht_ulp_len = 0;
   1637 		tcp->tcp_ipha = NULL;
   1638 		tcp->tcp_ip6h = NULL;
   1639 		tcp->tcp_tcpha = NULL;
   1640 	}
   1641 
   1642 	/* We clear any IP_OPTIONS and extension headers */
   1643 	ip_pkt_free(&connp->conn_xmit_ipp);
   1644 
   1645 	tcp_free(tcp);
   1646 
   1647 	/* Release any SSL context */
   1648 	if (tcp->tcp_kssl_ent != NULL) {
   1649 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   1650 		tcp->tcp_kssl_ent = NULL;
   1651 	}
   1652 
   1653 	if (tcp->tcp_kssl_ctx != NULL) {
   1654 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   1655 		tcp->tcp_kssl_ctx = NULL;
   1656 	}
   1657 	tcp->tcp_kssl_pending = B_FALSE;
   1658 
   1659 	/*
   1660 	 * Since we will bzero the entire structure, we need to
   1661 	 * remove it and reinsert it in global hash list. We
   1662 	 * know the walkers can't get to this conn because we
   1663 	 * had set CONDEMNED flag earlier and checked reference
   1664 	 * under conn_lock so walker won't pick it and when we
   1665 	 * go the ipcl_globalhash_remove() below, no walker
   1666 	 * can get to it.
        	 *
        	 * NOTE(review): only the removal is done in this function;
        	 * the reinsertion presumably happens when the conn is
        	 * reused -- confirm against the free list consumer.
   1667 	 */
   1668 	ipcl_globalhash_remove(connp);
   1669 
   1670 	/* Save some state */
        	/* These three pointers are put back after the bzero() below. */
   1671 	mp = tcp->tcp_timercache;
   1672 
   1673 	tcp_sack_info = tcp->tcp_sack_info;
   1674 	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
   1675 
   1676 	if (connp->conn_cred != NULL) {
   1677 		crfree(connp->conn_cred);
   1678 		connp->conn_cred = NULL;
   1679 	}
   1680 	ipcl_conn_cleanup(connp);
   1681 	connp->conn_flags = IPCL_TCPCONN;
   1682 
   1683 	/*
   1684 	 * Now it is safe to decrement the reference counts.
   1685 	 * This might be the last reference on the netstack
   1686 	 * in which case it will cause the freeing of the IP Instance.
   1687 	 */
   1688 	connp->conn_netstack = NULL;
   1689 	connp->conn_ixa->ixa_ipst = NULL;
   1690 	netstack_rele(ns);
        	/* tcps is the local copy taken at entry; it is not used again. */
   1691 	ASSERT(tcps != NULL);
   1692 	tcp->tcp_tcps = NULL;
   1693 
   1694 	bzero(tcp, sizeof (tcp_t));
   1695 
   1696 	/* restore the state */
   1697 	tcp->tcp_timercache = mp;
   1698 
   1699 	tcp->tcp_sack_info = tcp_sack_info;
   1700 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   1701 
        	/* The back pointer to the conn_t was wiped by the bzero() too. */
   1702 	tcp->tcp_connp = connp;
   1703 
   1704 	ASSERT(connp->conn_tcp == tcp);
   1705 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1706 	connp->conn_state_flags = CONN_INCIPIENT;
   1707 	ASSERT(connp->conn_proto == IPPROTO_TCP);
   1708 	ASSERT(connp->conn_ref == 1);
   1709 }
   1710 
   1711 /*
   1712  * Blows away all tcps whose TIME_WAIT has expired. List traversal
   1713  * is done forwards from the head.
   1714  * This walks all stack instances since
   1715  * tcp_time_wait remains global across all stacks.
   1716  */
   1717 /* ARGSUSED */
   1718 void
   1719 tcp_time_wait_collector(void *arg)
   1720 {
   1721 	tcp_t *tcp;
   1722 	clock_t now;
   1723 	mblk_t *mp;
   1724 	conn_t *connp;
   1725 	kmutex_t *lock;
   1726 	boolean_t removed;
   1727 
   1728 	squeue_t *sqp = (squeue_t *)arg;
   1729 	tcp_squeue_priv_t *tcp_time_wait =
   1730 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   1731 
   1732 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1733 	tcp_time_wait->tcp_time_wait_tid = 0;
   1734 
   1735 	if (tcp_time_wait->tcp_free_list != NULL &&
   1736 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
   1737 		TCP_G_STAT(tcp_freelist_cleanup);
   1738 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
   1739 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   1740 			tcp->tcp_time_wait_next = NULL;
   1741 			tcp_time_wait->tcp_free_list_cnt--;
   1742 			ASSERT(tcp->tcp_tcps == NULL);
   1743 			CONN_DEC_REF(tcp->tcp_connp);
   1744 		}
   1745 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
   1746 	}
   1747 
   1748 	/*
   1749 	 * In order to reap time waits reliably, we should use a
   1750 	 * source of time that is not adjustable by the user -- hence
   1751 	 * the call to ddi_get_lbolt().
   1752 	 */
   1753 	now = ddi_get_lbolt();
   1754 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
   1755 		/*
   1756 		 * Compare times using modular arithmetic, since
   1757 		 * lbolt can wrapover.
   1758 		 */
   1759 		if ((now - tcp->tcp_time_wait_expire) < 0) {
   1760 			break;
   1761 		}
   1762 
   1763 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
   1764 		ASSERT(removed);
   1765 
   1766 		connp = tcp->tcp_connp;
   1767 		ASSERT(connp->conn_fanout != NULL);
   1768 		lock = &connp->conn_fanout->connf_lock;
   1769 		/*
   1770 		 * This is essentially a TW reclaim fast path optimization for
   1771 		 * performance where the timewait collector checks under the
   1772 		 * fanout lock (so that no one else can get access to the
   1773 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
   1774 		 * the classifier hash list. If ref count is indeed 2, we can
   1775 		 * just remove the conn under the fanout lock and avoid
   1776 		 * cleaning up the conn under the squeue, provided that
   1777 		 * clustering callbacks are not enabled. If clustering is
   1778 		 * enabled, we need to make the clustering callback before
   1779 		 * setting the CONDEMNED flag and after dropping all locks and
   1780 		 * so we forego this optimization and fall back to the slow
   1781 		 * path. Also please see the comments in tcp_closei_local
   1782 		 * regarding the refcnt logic.
   1783 		 *
   1784 		 * Since we are holding the tcp_time_wait_lock, its better
   1785 		 * not to block on the fanout_lock because other connections
   1786 		 * can't add themselves to time_wait list. So we do a
   1787 		 * tryenter instead of mutex_enter.
   1788 		 */
   1789 		if (mutex_tryenter(lock)) {
   1790 			mutex_enter(&connp->conn_lock);
   1791 			if ((connp->conn_ref == 2) &&
   1792 			    (cl_inet_disconnect == NULL)) {
   1793 				ipcl_hash_remove_locked(connp,
   1794 				    connp->conn_fanout);
   1795 				/*
   1796 				 * Set the CONDEMNED flag now itself so that
   1797 				 * the refcnt cannot increase due to any
   1798 				 * walker.
   1799 				 */
   1800 				connp->conn_state_flags |= CONN_CONDEMNED;
   1801 				mutex_exit(lock);
   1802 				mutex_exit(&connp->conn_lock);
   1803 				if (tcp_time_wait->tcp_free_list_cnt <
   1804 				    tcp_free_list_max_cnt) {
   1805 					/* Add to head of tcp_free_list */
   1806 					mutex_exit(
   1807 					    &tcp_time_wait->tcp_time_wait_lock);
   1808 					tcp_cleanup(tcp);
   1809 					ASSERT(connp->conn_latch == NULL);
   1810 					ASSERT(connp->conn_policy == NULL);
   1811 					ASSERT(tcp->tcp_tcps == NULL);
   1812 					ASSERT(connp->conn_netstack == NULL);
   1813 
   1814 					mutex_enter(
   1815 					    &tcp_time_wait->tcp_time_wait_lock);
   1816 					tcp->tcp_time_wait_next =
   1817 					    tcp_time_wait->tcp_free_list;
   1818 					tcp_time_wait->tcp_free_list = tcp;
   1819 					tcp_time_wait->tcp_free_list_cnt++;
   1820 					continue;
   1821 				} else {
   1822 					/* Do not add to tcp_free_list */
   1823 					mutex_exit(
   1824 					    &tcp_time_wait->tcp_time_wait_lock);
   1825 					tcp_bind_hash_remove(tcp);
   1826 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
   1827 					tcp_ipsec_cleanup(tcp);
   1828 					CONN_DEC_REF(tcp->tcp_connp);
   1829 				}
   1830 			} else {
   1831 				CONN_INC_REF_LOCKED(connp);
   1832 				mutex_exit(lock);
   1833 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1834 				mutex_exit(&connp->conn_lock);
   1835 				/*
   1836 				 * We can reuse the closemp here since conn has
   1837 				 * detached (otherwise we wouldn't even be in
   1838 				 * time_wait list). tcp_closemp_used can safely
   1839 				 * be changed without taking a lock as no other
   1840 				 * thread can concurrently access it at this
   1841 				 * point in the connection lifecycle.
   1842 				 */
   1843 
   1844 				if (tcp->tcp_closemp.b_prev == NULL)
   1845 					tcp->tcp_closemp_used = B_TRUE;
   1846 				else
   1847 					cmn_err(CE_PANIC,
   1848 					    "tcp_timewait_collector: "
   1849 					    "concurrent use of tcp_closemp: "
   1850 					    "connp %p tcp %p\n", (void *)connp,
   1851 					    (void *)tcp);
   1852 
   1853 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1854 				mp = &tcp->tcp_closemp;
   1855 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1856 				    tcp_timewait_output, connp, NULL,
   1857 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1858 			}
   1859 		} else {
   1860 			mutex_enter(&connp->conn_lock);
   1861 			CONN_INC_REF_LOCKED(connp);
   1862 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1863 			mutex_exit(&connp->conn_lock);
   1864 			/*
   1865 			 * We can reuse the closemp here since conn has
   1866 			 * detached (otherwise we wouldn't even be in
   1867 			 * time_wait list). tcp_closemp_used can safely
   1868 			 * be changed without taking a lock as no other
   1869 			 * thread can concurrently access it at this
   1870 			 * point in the connection lifecycle.
   1871 			 */
   1872 
   1873 			if (tcp->tcp_closemp.b_prev == NULL)
   1874 				tcp->tcp_closemp_used = B_TRUE;
   1875 			else
   1876 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
   1877 				    "concurrent use of tcp_closemp: "
   1878 				    "connp %p tcp %p\n", (void *)connp,
   1879 				    (void *)tcp);
   1880 
   1881 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1882 			mp = &tcp->tcp_closemp;
   1883 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1884 			    tcp_timewait_output, connp, NULL,
   1885 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1886 		}
   1887 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1888 	}
   1889 
   1890 	if (tcp_time_wait->tcp_free_list != NULL)
   1891 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
   1892 
   1893 	tcp_time_wait->tcp_time_wait_tid =
   1894 	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
   1895 	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
   1896 	    CALLOUT_FLAG_ROUNDUP);
   1897 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1898 }
   1899 
   1900 /*
   1901  * Reply to a client's T_CONN_RES TPI message. This function
   1902  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
   1903  * on the acceptor STREAM and processed in tcp_accept_common().
   1904  * Read the block comment on top of tcp_input_listener().
   1905  */
   1906 static void
   1907 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
   1908 {
   1909 	tcp_t		*acceptor;
   1910 	tcp_t		*eager;
   1911 	tcp_t   	*tcp;
   1912 	struct T_conn_res	*tcr;
   1913 	t_uscalar_t	acceptor_id;
   1914 	t_scalar_t	seqnum;
   1915 	mblk_t		*discon_mp = NULL;
   1916 	mblk_t		*ok_mp;
   1917 	mblk_t		*mp1;
   1918 	tcp_stack_t	*tcps = listener->tcp_tcps;
   1919 	conn_t		*econnp;
   1920 
   1921 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   1922 		tcp_err_ack(listener, mp, TPROTO, 0);
   1923 		return;
   1924 	}
   1925 	tcr = (struct T_conn_res *)mp->b_rptr;
   1926 
   1927 	/*
   1928 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
   1929 	 * read side queue of the streams device underneath us i.e. the
   1930 	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
   1931 	 * look it up in the queue_hash.  Under LP64 it sends down the
   1932 	 * minor_t of the accepting endpoint.
   1933 	 *
   1934 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
   1935 	 * fanout hash lock is held.
   1936 	 * This prevents any thread from entering the acceptor queue from
   1937 	 * below (since it has not been hard bound yet i.e. any inbound
   1938 	 * packets will arrive on the listener conn_t and
   1939 	 * go through the classifier).
   1940 	 * The CONN_INC_REF will prevent the acceptor from closing.
   1941 	 *
   1942 	 * XXX It is still possible for a tli application to send down data
   1943 	 * on the accepting stream while another thread calls t_accept.
   1944 	 * This should not be a problem for well-behaved applications since
   1945 	 * the T_OK_ACK is sent after the queue swapping is completed.
   1946 	 *
   1947 	 * If the accepting fd is the same as the listening fd, avoid
   1948 	 * queue hash lookup since that will return an eager listener in a
   1949 	 * already established state.
   1950 	 */
   1951 	acceptor_id = tcr->ACCEPTOR_id;
   1952 	mutex_enter(&listener->tcp_eager_lock);
   1953 	if (listener->tcp_acceptor_id == acceptor_id) {
   1954 		eager = listener->tcp_eager_next_q;
   1955 		/* only count how many T_CONN_INDs so don't count q0 */
   1956 		if ((listener->tcp_conn_req_cnt_q != 1) ||
   1957 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
   1958 			mutex_exit(&listener->tcp_eager_lock);
   1959 			tcp_err_ack(listener, mp, TBADF, 0);
   1960 			return;
   1961 		}
   1962 		if (listener->tcp_conn_req_cnt_q0 != 0) {
   1963 			/* Throw away all the eagers on q0. */
   1964 			tcp_eager_cleanup(listener, 1);
   1965 		}
   1966 		if (listener->tcp_syn_defense) {
   1967 			listener->tcp_syn_defense = B_FALSE;
   1968 			if (listener->tcp_ip_addr_cache != NULL) {
   1969 				kmem_free(listener->tcp_ip_addr_cache,
   1970 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   1971 				listener->tcp_ip_addr_cache = NULL;
   1972 			}
   1973 		}
   1974 		/*
   1975 		 * Transfer tcp_conn_req_max to the eager so that when
   1976 		 * a disconnect occurs we can revert the endpoint to the
   1977 		 * listen state.
   1978 		 */
   1979 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
   1980 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
   1981 		/*
   1982 		 * Get a reference on the acceptor just like the
   1983 		 * tcp_acceptor_hash_lookup below.
   1984 		 */
   1985 		acceptor = listener;
   1986 		CONN_INC_REF(acceptor->tcp_connp);
   1987 	} else {
   1988 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
   1989 		if (acceptor == NULL) {
   1990 			if (listener->tcp_connp->conn_debug) {
   1991 				(void) strlog(TCP_MOD_ID, 0, 1,
   1992 				    SL_ERROR|SL_TRACE,
   1993 				    "tcp_accept: did not find acceptor 0x%x\n",
   1994 				    acceptor_id);
   1995 			}
   1996 			mutex_exit(&listener->tcp_eager_lock);
   1997 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
   1998 			return;
   1999 		}
   2000 		/*
   2001 		 * Verify acceptor state. The acceptable states for an acceptor
   2002 		 * include TCPS_IDLE and TCPS_BOUND.
   2003 		 */
   2004 		switch (acceptor->tcp_state) {
   2005 		case TCPS_IDLE:
   2006 			/* FALLTHRU */
   2007 		case TCPS_BOUND:
   2008 			break;
   2009 		default:
   2010 			CONN_DEC_REF(acceptor->tcp_connp);
   2011 			mutex_exit(&listener->tcp_eager_lock);
   2012 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
   2013 			return;
   2014 		}
   2015 	}
   2016 
   2017 	/* The listener must be in TCPS_LISTEN */
   2018 	if (listener->tcp_state != TCPS_LISTEN) {
   2019 		CONN_DEC_REF(acceptor->tcp_connp);
   2020 		mutex_exit(&listener->tcp_eager_lock);
   2021 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
   2022 		return;
   2023 	}
   2024 
   2025 	/*
   2026 	 * Rendezvous with an eager connection request packet hanging off
   2027 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
   2028 	 * tcp structure when the connection packet arrived in
   2029 	 * tcp_input_listener().
   2030 	 */
   2031 	seqnum = tcr->SEQ_number;
   2032 	eager = listener;
   2033 	do {
   2034 		eager = eager->tcp_eager_next_q;
   2035 		if (eager == NULL) {
   2036 			CONN_DEC_REF(acceptor->tcp_connp);
   2037 			mutex_exit(&listener->tcp_eager_lock);
   2038 			tcp_err_ack(listener, mp, TBADSEQ, 0);
   2039 			return;
   2040 		}
   2041 	} while (eager->tcp_conn_req_seqnum != seqnum);
   2042 	mutex_exit(&listener->tcp_eager_lock);
   2043 
   2044 	/*
   2045 	 * At this point, both acceptor and listener have 2 ref
   2046 	 * that they begin with. Acceptor has one additional ref
   2047 	 * we placed in lookup while listener has 3 additional
   2048 	 * ref for being behind the squeue (tcp_accept() is
   2049 	 * done on listener's squeue); being in classifier hash;
   2050 	 * and eager's ref on listener.
   2051 	 */
   2052 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2053 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
   2054 
   2055 	/*
   2056 	 * The eager at this point is set in its own squeue and
   2057 	 * could easily have been killed (tcp_accept_finish will
   2058 	 * deal with that) because of a TH_RST so we can only
   2059 	 * ASSERT for a single ref.
   2060 	 */
   2061 	ASSERT(eager->tcp_connp->conn_ref >= 1);
   2062 
   2063 	/*
   2064 	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
   2065 	 * use it if something failed.
   2066 	 */
   2067 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
   2068 	    sizeof (struct stroptions)), BPRI_HI);
   2069 	if (discon_mp == NULL) {
   2070 		CONN_DEC_REF(acceptor->tcp_connp);
   2071 		CONN_DEC_REF(eager->tcp_connp);
   2072 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   2073 		return;
   2074 	}
   2075 
   2076 	econnp = eager->tcp_connp;
   2077 
   2078 	/* Hold a copy of mp, in case reallocb fails */
   2079 	if ((mp1 = copymsg(mp)) == NULL) {
   2080 		CONN_DEC_REF(acceptor->tcp_connp);
   2081 		CONN_DEC_REF(eager->tcp_connp);
   2082 		freemsg(discon_mp);
   2083 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   2084 		return;
   2085 	}
   2086 
   2087 	tcr = (struct T_conn_res *)mp1->b_rptr;
   2088 
   2089 	/*
   2090 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
   2091 	 * which allocates a larger mblk and appends the new
   2092 	 * local address to the ok_ack.  The address is copied by
   2093 	 * soaccept() for getsockname().
   2094 	 */
   2095 	{
   2096 		int extra;
   2097 
   2098 		extra = (econnp->conn_family == AF_INET) ?
   2099 		    sizeof (sin_t) : sizeof (sin6_t);
   2100 
   2101 		/*
   2102 		 * Try to re-use mp, if possible.  Otherwise, allocate
   2103 		 * an mblk and return it as ok_mp.  In any case, mp
   2104 		 * is no longer usable upon return.
   2105 		 */
   2106 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
   2107 			CONN_DEC_REF(acceptor->tcp_connp);
   2108 			CONN_DEC_REF(eager->tcp_connp);
   2109 			freemsg(discon_mp);
   2110 			/* Original mp has been freed by now, so use mp1 */
   2111 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
   2112 			return;
   2113 		}
   2114 
   2115 		mp = NULL;	/* We should never use mp after this point */
   2116 
   2117 		switch (extra) {
   2118 		case sizeof (sin_t): {
   2119 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
   2120 
   2121 			ok_mp->b_wptr += extra;
   2122 			sin->sin_family = AF_INET;
   2123 			sin->sin_port = econnp->conn_lport;
   2124 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
   2125 			break;
   2126 		}
   2127 		case sizeof (sin6_t): {
   2128 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
   2129 
   2130 			ok_mp->b_wptr += extra;
   2131 			sin6->sin6_family = AF_INET6;
   2132 			sin6->sin6_port = econnp->conn_lport;
   2133 			sin6->sin6_addr = econnp->conn_laddr_v6;
   2134 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
   2135 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
   2136 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
   2137 				sin6->sin6_scope_id =
   2138 				    econnp->conn_ixa->ixa_scopeid;
   2139 			} else {
   2140 				sin6->sin6_scope_id = 0;
   2141 			}
   2142 			sin6->__sin6_src_id = 0;
   2143 			break;
   2144 		}
   2145 		default:
   2146 			break;
   2147 		}
   2148 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
   2149 	}
   2150 
   2151 	/*
   2152 	 * If there are no options we know that the T_CONN_RES will
   2153 	 * succeed. However, we can't send the T_OK_ACK upstream until
   2154 	 * the tcp_accept_swap is done since it would be dangerous to
   2155 	 * let the application start using the new fd prior to the swap.
   2156 	 */
   2157 	tcp_accept_swap(listener, acceptor, eager);
   2158 
   2159 	/*
   2160 	 * tcp_accept_swap unlinks eager from listener but does not drop
   2161 	 * the eager's reference on the listener.
   2162 	 */
   2163 	ASSERT(eager->tcp_listener == NULL);
   2164 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2165 
   2166 	/*
   2167 	 * The eager is now associated with its own queue. Insert in
   2168 	 * the hash so that the connection can be reused for a future
   2169 	 * T_CONN_RES.
   2170 	 */
   2171 	tcp_acceptor_hash_insert(acceptor_id, eager);
   2172 
   2173 	/*
   2174 	 * We now do the processing of options with T_CONN_RES.
   2175 	 * We delay till now since we wanted to have queue to pass to
   2176 	 * option processing routines that points back to the right
   2177 	 * instance structure which does not happen until after
   2178 	 * tcp_accept_swap().
   2179 	 *
   2180 	 * Note:
   2181 	 * The sanity of the logic here assumes that whatever options
   2182 	 * are appropriate to inherit from listener=>eager are done
   2183 	 * before this point, and whatever were to be overridden (or not)
   2184 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
   2185 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
   2186 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
   2187 	 * This may not be true at this point in time but can be fixed
   2188 	 * independently. This option processing code starts with
   2189 	 * the instantiated acceptor instance and the final queue at
   2190 	 * this point.
   2191 	 */
   2192 
   2193 	if (tcr->OPT_length != 0) {
   2194 		/* Options to process */
   2195 		int t_error = 0;
   2196 		int sys_error = 0;
   2197 		int do_disconnect = 0;
   2198 
   2199 		if (tcp_conprim_opt_process(eager, mp1,
   2200 		    &do_disconnect, &t_error, &sys_error) < 0) {
   2201 			eager->tcp_accept_error = 1;
   2202 			if (do_disconnect) {
   2203 				/*
   2204 				 * An option failed which does not allow
   2205 				 * connection to be accepted.
   2206 				 *
   2207 				 * We allow T_CONN_RES to succeed and
   2208 				 * put a T_DISCON_IND on the eager queue.
   2209 				 */
   2210 				ASSERT(t_error == 0 && sys_error == 0);
   2211 				eager->tcp_send_discon_ind = 1;
   2212 			} else {
   2213 				ASSERT(t_error != 0);
   2214 				freemsg(ok_mp);
   2215 				/*
   2216 				 * Original mp was either freed or set
   2217 				 * to ok_mp above, so use mp1 instead.
   2218 				 */
   2219 				tcp_err_ack(listener, mp1, t_error, sys_error);
   2220 				goto finish;
   2221 			}
   2222 		}
   2223 		/*
   2224 		 * Most likely success in setting options (except if
   2225 		 * eager->tcp_send_discon_ind set).
   2226 		 * mp1 option buffer represented by OPT_length/offset
   2227 		 * potentially modified and contains results of setting
   2228 		 * options at this point
   2229 		 */
   2230 	}
   2231 
   2232 	/* We no longer need mp1, since all options processing has passed */
   2233 	freemsg(mp1);
   2234 
   2235 	putnext(listener->tcp_connp->conn_rq, ok_mp);
   2236 
   2237 	mutex_enter(&listener->tcp_eager_lock);
   2238 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
   2239 		tcp_t	*tail;
   2240 		mblk_t	*conn_ind;
   2241 
   2242 		/*
   2243 		 * This path should not be executed if listener and
   2244 		 * acceptor streams are the same.
   2245 		 */
   2246 		ASSERT(listener != acceptor);
   2247 
   2248 		tcp = listener->tcp_eager_prev_q0;
   2249 		/*
   2250 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
   2251 		 * deferred T_conn_ind queue. We need to get to the head of
   2252 		 * the queue in order to send up T_conn_ind the same order as
   2253 		 * how the 3WHS is completed.
   2254 		 */
   2255 		while (tcp != listener) {
   2256 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
   2257 				break;
   2258 			else
   2259 				tcp = tcp->tcp_eager_prev_q0;
   2260 		}
   2261 		ASSERT(tcp != listener);
   2262 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
   2263 		ASSERT(conn_ind != NULL);
   2264 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
   2265 
   2266 		/* Move from q0 to q */
   2267 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   2268 		listener->tcp_conn_req_cnt_q0--;
   2269 		listener->tcp_conn_req_cnt_q++;
   2270 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   2271 		    tcp->tcp_eager_prev_q0;
   2272 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   2273 		    tcp->tcp_eager_next_q0;
   2274 		tcp->tcp_eager_prev_q0 = NULL;
   2275 		tcp->tcp_eager_next_q0 = NULL;
   2276 		tcp->tcp_conn_def_q0 = B_FALSE;
   2277 
   2278 		/* Make sure the tcp isn't in the list of droppables */
   2279 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
   2280 		    tcp->tcp_eager_prev_drop_q0 == NULL);
   2281 
   2282 		/*
   2283 		 * Insert at end of the queue because sockfs sends
   2284 		 * down T_CONN_RES in chronological order. Leaving
   2285 		 * the older conn indications at front of the queue
   2286 		 * helps reducing search time.
   2287 		 */
   2288 		tail = listener->tcp_eager_last_q;
   2289 		if (tail != NULL)
   2290 			tail->tcp_eager_next_q = tcp;
   2291 		else
   2292 			listener->tcp_eager_next_q = tcp;
   2293 		listener->tcp_eager_last_q = tcp;
   2294 		tcp->tcp_eager_next_q = NULL;
   2295 		mutex_exit(&listener->tcp_eager_lock);
   2296 		putnext(tcp->tcp_connp->conn_rq, conn_ind);
   2297 	} else {
   2298 		mutex_exit(&listener->tcp_eager_lock);
   2299 	}
   2300 
   2301 	/*
   2302 	 * Done with the acceptor - free it
   2303 	 *
   2304 	 * Note: from this point on, no access to listener should be made
   2305 	 * as listener can be equal to acceptor.
   2306 	 */
   2307 finish:
   2308 	ASSERT(acceptor->tcp_detached);
   2309 	acceptor->tcp_connp->conn_rq = NULL;
   2310 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
   2311 	acceptor->tcp_connp->conn_wq = NULL;
   2312 	(void) tcp_clean_death(acceptor, 0, 2);
   2313 	CONN_DEC_REF(acceptor->tcp_connp);
   2314 
   2315 	/*
   2316 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
   2317 	 *
   2318 	 * It will update the setting for sockfs/stream head and also take
   2319 	 * care of any data that arrived before accept() was called.
   2320 	 * In case we already received a FIN then tcp_accept_finish will send up
   2321 	 * the ordrel. It will also send up a window update if the window
   2322 	 * has opened up.
   2323 	 */
   2324 
   2325 	/*
   2326 	 * XXX: we currently have a problem if XTI application closes the
   2327 	 * acceptor stream in between. This problem exists in on10-gate also
   2328 	 * and is well known but nothing can be done short of major rewrite
   2329 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
   2330 	 * eager same squeue as listener (we can distinguish non socket
   2331 	 * listeners at the time of handling a SYN in tcp_input_listener)
   2332 	 * and do most of the work that tcp_accept_finish does here itself
   2333 	 * and then get behind the acceptor squeue to access the acceptor
   2334 	 * queue.
   2335 	 */
   2336 	/*
   2337 	 * We already have a ref on tcp so no need to do one before squeue_enter
   2338 	 */
   2339 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
   2340 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
   2341 	    SQTAG_TCP_ACCEPT_FINISH);
   2342 }
   2343 
   2344 /*
   2345  * Swap information between the eager and acceptor for a TLI/XTI client.
   2346  * The sockfs accept is done on the acceptor stream and control goes
   2347  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
   2348  * called. In either case, both the eager and listener are in their own
   2349  * perimeter (squeue) and the code has to deal with potential race.
   2350  *
   2351  * See the block comment on top of tcp_accept() and tcp_tli_accept().
   2352  */
   2353 static void
   2354 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
   2355 {
   2356 	conn_t	*econnp, *aconnp;
   2357 
   2358 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
   2359 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
   2360 	ASSERT(!TCP_IS_SOCKET(acceptor));
   2361 	ASSERT(!TCP_IS_SOCKET(eager));
   2362 	ASSERT(!TCP_IS_SOCKET(listener));
   2363 
   2364 	/*
   2365 	 * Trusted Extensions may need to use a security label that is
   2366 	 * different from the acceptor's label on MLP and MAC-Exempt
   2367 	 * sockets. If this is the case, the required security label
   2368 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
   2369 	 * acceptor stream refer to econnp we atomatically get that label.
   2370 	 */
   2371 
   2372 	acceptor->tcp_detached = B_TRUE;
   2373 	/*
   2374 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
   2375 	 * the acceptor id.
   2376 	 */
   2377 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
   2378 
   2379 	/* remove eager from listen list... */
   2380 	mutex_enter(&listener->tcp_eager_lock);
   2381 	tcp_eager_unlink(eager);
   2382 	ASSERT(eager->tcp_eager_next_q == NULL &&
   2383 	    eager->tcp_eager_last_q == NULL);
   2384 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
   2385 	    eager->tcp_eager_prev_q0 == NULL);
   2386 	mutex_exit(&listener->tcp_eager_lock);
   2387 
   2388 	econnp = eager->tcp_connp;
   2389 	aconnp = acceptor->tcp_connp;
   2390 	econnp->conn_rq = aconnp->conn_rq;
   2391 	econnp->conn_wq = aconnp->conn_wq;
   2392 	econnp->conn_rq->q_ptr = econnp;
   2393 	econnp->conn_wq->q_ptr = econnp;
   2394 
   2395 	/*
   2396 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
   2397 	 * which might be a different squeue from our peer TCP instance.
   2398 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
   2399 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
   2400 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
   2401 	 * above reach global visibility prior to the clearing of tcp_detached.
   2402 	 */
   2403 	membar_producer();
   2404 	eager->tcp_detached = B_FALSE;
   2405 
   2406 	ASSERT(eager->tcp_ack_tid == 0);
   2407 
   2408 	econnp->conn_dev = aconnp->conn_dev;
   2409 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
   2410 
   2411 	ASSERT(econnp->conn_minor_arena != NULL);
   2412 	if (econnp->conn_cred != NULL)
   2413 		crfree(econnp->conn_cred);
   2414 	econnp->conn_cred = aconnp->conn_cred;
   2415 	aconnp->conn_cred = NULL;
   2416 	econnp->conn_cpid = aconnp->conn_cpid;
   2417 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
   2418 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
   2419 
   2420 	econnp->conn_zoneid = aconnp->conn_zoneid;
   2421 	econnp->conn_allzones = aconnp->conn_allzones;
   2422 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
   2423 
   2424 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
   2425 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
   2426 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
   2427 
   2428 	/* Do the IPC initialization */
   2429 	CONN_INC_REF(econnp);
   2430 
   2431 	/* Done with old IPC. Drop its ref on its connp */
   2432 	CONN_DEC_REF(aconnp);
   2433 }
   2434 
   2435 
   2436 /*
   2437  * Adapt to the information, such as rtt and rtt_sd, provided from the
   2438  * DCE and IRE maintained by IP.
   2439  *
   2440  * Checks for multicast and broadcast destination address.
   2441  * Returns zero if ok; an errno on failure.
   2442  *
   2443  * Note that the MSS calculation here is based on the info given in
   2444  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
   2445  * will be handled in tcp_input_data() when TCP knows which options to use.
   2446  *
   2447  * Note on how TCP gets its parameters for a connection.
   2448  *
   2449  * When a tcp_t structure is allocated, it gets all the default parameters.
   2450  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
   2451  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
   2452  * default.
   2453  *
   2454  * An incoming SYN with a multicast or broadcast destination address is dropped
   2455  * in ip_fanout_v4/v6.
   2456  *
   2457  * An incoming SYN with a multicast or broadcast source address is always
   2458  * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
   2459  * conn_connect.
   2460  * The same logic in tcp_set_destination also serves to
   2461  * reject an attempt to connect to a broadcast or multicast (destination)
   2462  * address.
   2463  */
   2464 static int
   2465 tcp_set_destination(tcp_t *tcp)
   2466 {
   2467 	uint32_t	mss_max;
   2468 	uint32_t	mss;
   2469 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
   2470 	conn_t		*connp = tcp->tcp_connp;
   2471 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2472 	iulp_t		uinfo;
   2473 	int		error;
   2474 	uint32_t	flags;
   2475 
   2476 	flags = IPDF_LSO | IPDF_ZCOPY;
   2477 	/*
   2478 	 * Make sure we have a dce for the destination to avoid dce_ident
   2479 	 * contention for connected sockets.
   2480 	 */
   2481 	flags |= IPDF_UNIQUE_DCE;
   2482 
   2483 	if (!tcps->tcps_ignore_path_mtu)
   2484 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
   2485 
   2486 	/* Use conn_lock to satify ASSERT; tcp is already serialized */
   2487 	mutex_enter(&connp->conn_lock);
   2488 	error = conn_connect(connp, &uinfo, flags);
   2489 	mutex_exit(&connp->conn_lock);
   2490 	if (error != 0)
   2491 		return (error);
   2492 
   2493 	error = tcp_build_hdrs(tcp);
   2494 	if (error != 0)
   2495 		return (error);
   2496 
   2497 	tcp->tcp_localnet = uinfo.iulp_localnet;
   2498 
   2499 	if (uinfo.iulp_rtt != 0) {
   2500 		clock_t	rto;
   2501 
   2502 		tcp->tcp_rtt_sa = uinfo.iulp_rtt;
   2503 		tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
   2504 		rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   2505 		    tcps->tcps_rexmit_interval_extra +
   2506 		    (tcp->tcp_rtt_sa >> 5);
   2507 
   2508 		if (rto > tcps->tcps_rexmit_interval_max) {
   2509 			tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
   2510 		} else if (rto < tcps->tcps_rexmit_interval_min) {
   2511 			tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   2512 		} else {
   2513 			tcp->tcp_rto = rto;
   2514 		}
   2515 	}
   2516 	if (uinfo.iulp_ssthresh != 0)
   2517 		tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
   2518 	else
   2519 		tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   2520 	if (uinfo.iulp_spipe > 0) {
   2521 		connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
   2522 		    tcps->tcps_max_buf);
   2523 		if (tcps->tcps_snd_lowat_fraction != 0) {
   2524 			connp->conn_sndlowat = connp->conn_sndbuf /
   2525 			    tcps->tcps_snd_lowat_fraction;
   2526 		}
   2527 		(void) tcp_maxpsz_set(tcp, B_TRUE);
   2528 	}
   2529 	/*
   2530 	 * Note that up till now, acceptor always inherits receive
   2531 	 * window from the listener.  But if there is a metrics
   2532 	 * associated with a host, we should use that instead of
   2533 	 * inheriting it from listener. Thus we need to pass this
   2534 	 * info back to the caller.
   2535 	 */
   2536 	if (uinfo.iulp_rpipe > 0) {
   2537 		tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
   2538 		    tcps->tcps_max_buf);
   2539 	}
   2540 
   2541 	if (uinfo.iulp_rtomax > 0) {
   2542 		tcp->tcp_second_timer_threshold =
   2543 		    uinfo.iulp_rtomax;
   2544 	}
   2545 
   2546 	/*
   2547 	 * Use the metric option settings, iulp_tstamp_ok and
   2548 	 * iulp_wscale_ok, only for active open. What this means
   2549 	 * is that if the other side uses timestamp or window
   2550 	 * scale option, TCP will also use those options. That
   2551 	 * is for passive open.  If the application sets a
   2552 	 * large window, window scale is enabled regardless of
   2553 	 * the value in iulp_wscale_ok.  This is the behavior
   2554 	 * since 2.6.  So we keep it.
   2555 	 * The only case left in passive open processing is the
   2556 	 * check for SACK.
   2557 	 * For ECN, it should probably be like SACK.  But the
   2558 	 * current value is binary, so we treat it like the other
   2559 	 * cases.  The metric only controls active open.For passive
   2560 	 * open, the ndd param, tcp_ecn_permitted, controls the
   2561 	 * behavior.
   2562 	 */
   2563 	if (!tcp_detached) {
   2564 		/*
   2565 		 * The if check means that the following can only
   2566 		 * be turned on by the metrics only IRE, but not off.
   2567 		 */
   2568 		if (uinfo.iulp_tstamp_ok)
   2569 			tcp->tcp_snd_ts_ok = B_TRUE;
   2570 		if (uinfo.iulp_wscale_ok)
   2571 			tcp->tcp_snd_ws_ok = B_TRUE;
   2572 		if (uinfo.iulp_sack == 2)
   2573 			tcp->tcp_snd_sack_ok = B_TRUE;
   2574 		if (uinfo.iulp_ecn_ok)
   2575 			tcp->tcp_ecn_ok = B_TRUE;
   2576 	} else {
   2577 		/*
   2578 		 * Passive open.
   2579 		 *
   2580 		 * As above, the if check means that SACK can only be
   2581 		 * turned on by the metric only IRE.
   2582 		 */
   2583 		if (uinfo.iulp_sack > 0) {
   2584 			tcp->tcp_snd_sack_ok = B_TRUE;
   2585 		}
   2586 	}
   2587 
   2588 	/*
   2589 	 * XXX Note that currently, iulp_mtu can be as small as 68
   2590 	 * because of PMTUd.  So tcp_mss may go to negative if combined
   2591 	 * length of all those options exceeds 28 bytes.  But because
   2592 	 * of the tcp_mss_min check below, we may not have a problem if
   2593 	 * tcp_mss_min is of a reasonable value.  The default is 1 so
   2594 	 * the negative problem still exists.  And the check defeats PMTUd.
   2595 	 * In fact, if PMTUd finds that the MSS should be smaller than
   2596 	 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
   2597 	 * value.
   2598 	 *
   2599 	 * We do not deal with that now.  All those problems related to
   2600 	 * PMTUd will be fixed later.
   2601 	 */
   2602 	ASSERT(uinfo.iulp_mtu != 0);
   2603 	mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
   2604 
   2605 	/* Sanity check for MSS value. */
   2606 	if (connp->conn_ipversion == IPV4_VERSION)
   2607 		mss_max = tcps->tcps_mss_max_ipv4;
   2608 	else
   2609 		mss_max = tcps->tcps_mss_max_ipv6;
   2610 
   2611 	if (tcp->tcp_ipsec_overhead == 0)
   2612 		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
   2613 
   2614 	mss -= tcp->tcp_ipsec_overhead;
   2615 
   2616 	if (mss < tcps->tcps_mss_min)
   2617 		mss = tcps->tcps_mss_min;
   2618 	if (mss > mss_max)
   2619 		mss = mss_max;
   2620 
   2621 	/* Note that this is the maximum MSS, excluding all options. */
   2622 	tcp->tcp_mss = mss;
   2623 
   2624 	/*
   2625 	 * Update the tcp connection with LSO capability.
   2626 	 */
   2627 	tcp_update_lso(tcp, connp->conn_ixa);
   2628 
   2629 	/*
   2630 	 * Initialize the ISS here now that we have the full connection ID.
   2631 	 * The RFC 1948 method of initial sequence number generation requires
   2632 	 * knowledge of the full connection ID before setting the ISS.
   2633 	 */
   2634 	tcp_iss_init(tcp);
   2635 
   2636 	tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
   2637 
   2638 	/*
   2639 	 * Make sure that conn is not marked incipient
   2640 	 * for incoming connections. A blind
   2641 	 * removal of incipient flag is cheaper than
   2642 	 * check and removal.
   2643 	 */
   2644 	mutex_enter(&connp->conn_lock);
   2645 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   2646 	mutex_exit(&connp->conn_lock);
   2647 	return (0);
   2648 }
   2649 
   2650 static void
   2651 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
   2652 {
   2653 	int	error;
   2654 	conn_t	*connp = tcp->tcp_connp;
   2655 	struct sockaddr	*sa;
   2656 	mblk_t  *mp1;
   2657 	struct T_bind_req *tbr;
   2658 	int	backlog;
   2659 	socklen_t	len;
   2660 	sin_t	*sin;
   2661 	sin6_t	*sin6;
   2662 	cred_t		*cr;
   2663 
   2664 	/*
   2665 	 * All Solaris components should pass a db_credp
   2666 	 * for this TPI message, hence we ASSERT.
   2667 	 * But in case there is some other M_PROTO that looks
   2668 	 * like a TPI message sent by some other kernel
   2669 	 * component, we check and return an error.
   2670 	 */
   2671 	cr = msg_getcred(mp, NULL);
   2672 	ASSERT(cr != NULL);
   2673 	if (cr == NULL) {
   2674 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   2675 		return;
   2676 	}
   2677 
   2678 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   2679 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
   2680 		if (connp->conn_debug) {
   2681 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2682 			    "tcp_tpi_bind: bad req, len %u",
   2683 			    (uint_t)(mp->b_wptr - mp->b_rptr));
   2684 		}
   2685 		tcp_err_ack(tcp, mp, TPROTO, 0);
   2686 		return;
   2687 	}
   2688 	/* Make sure the largest address fits */
   2689 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
   2690 	if (mp1 == NULL) {
   2691 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   2692 		return;
   2693 	}
   2694 	mp = mp1;
   2695 	tbr = (struct T_bind_req *)mp->b_rptr;
   2696 
   2697 	backlog = tbr->CONIND_number;
   2698 	len = tbr->ADDR_length;
   2699 
   2700 	switch (len) {
   2701 	case 0:		/* request for a generic port */
   2702 		tbr->ADDR_offset = sizeof (struct T_bind_req);
   2703 		if (connp->conn_family == AF_INET) {
   2704 			tbr->ADDR_length = sizeof (sin_t);
   2705 			sin = (sin_t *)&tbr[1];
   2706 			*sin = sin_null;
   2707 			sin->sin_family = AF_INET;
   2708 			sa = (struct sockaddr *)sin;
   2709 			len = sizeof (sin_t);
   2710 			mp->b_wptr = (uchar_t *)&sin[1];
   2711 		} else {
   2712 			ASSERT(connp->conn_family == AF_INET6);
   2713 			tbr->ADDR_length = sizeof (sin6_t);
   2714 			sin6 = (sin6_t *)&tbr[1];
   2715 			*sin6 = sin6_null;
   2716 			sin6->sin6_family = AF_INET6;
   2717 			sa = (struct sockaddr *)sin6;
   2718 			len = sizeof (sin6_t);
   2719 			mp->b_wptr = (uchar_t *)&sin6[1];
   2720 		}
   2721 		break;
   2722 
   2723 	case sizeof (sin_t):    /* Complete IPv4 address */
   2724 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
   2725 		    sizeof (sin_t));
   2726 		break;
   2727 
   2728 	case sizeof (sin6_t): /* Complete IPv6 address */
   2729 		sa = (struct sockaddr *)mi_offset_param(mp,
   2730 		    tbr->ADDR_offset, sizeof (sin6_t));
   2731 		break;
   2732 
   2733 	default:
   2734 		if (connp->conn_debug) {
   2735 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2736 			    "tcp_tpi_bind: bad address length, %d",
   2737 			    tbr->ADDR_length);
   2738 		}
   2739 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   2740 		return;
   2741 	}
   2742 
   2743 	if (backlog > 0) {
   2744 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
   2745 		    tbr->PRIM_type != O_T_BIND_REQ);
   2746 	} else {
   2747 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
   2748 		    tbr->PRIM_type != O_T_BIND_REQ);
   2749 	}
   2750 done:
   2751 	if (error > 0) {
   2752 		tcp_err_ack(tcp, mp, TSYSERR, error);
   2753 	} else if (error < 0) {
   2754 		tcp_err_ack(tcp, mp, -error, 0);
   2755 	} else {
   2756 		/*
   2757 		 * Update port information as sockfs/tpi needs it for checking
   2758 		 */
   2759 		if (connp->conn_family == AF_INET) {
   2760 			sin = (sin_t *)sa;
   2761 			sin->sin_port = connp->conn_lport;
   2762 		} else {
   2763 			sin6 = (sin6_t *)sa;
   2764 			sin6->sin6_port = connp->conn_lport;
   2765 		}
   2766 		mp->b_datap->db_type = M_PCPROTO;
   2767 		tbr->PRIM_type = T_BIND_ACK;
   2768 		putnext(connp->conn_rq, mp);
   2769 	}
   2770 }
   2771 
   2772 /*
   2773  * If the "bind_to_req_port_only" parameter is set, if the requested port
   2774  * number is available, return it, If not return 0
   2775  *
   2776  * If "bind_to_req_port_only" parameter is not set and
   2777  * If the requested port number is available, return it.  If not, return
   2778  * the first anonymous port we happen across.  If no anonymous ports are
   2779  * available, return 0. addr is the requested local address, if any.
   2780  *
   2781  * In either case, when succeeding update the tcp_t to record the port number
   2782  * and insert it in the bind hash table.
   2783  *
   2784  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
   2785  * without setting SO_REUSEADDR. This is needed so that they
   2786  * can be viewed as two independent transport protocols.
   2787  */
   2788 static in_port_t
   2789 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
   2790     int reuseaddr, boolean_t quick_connect,
   2791     boolean_t bind_to_req_port_only, boolean_t user_specified)
   2792 {
   2793 	/* number of times we have run around the loop */
   2794 	int count = 0;
   2795 	/* maximum number of times to run around the loop */
   2796 	int loopmax;
   2797 	conn_t *connp = tcp->tcp_connp;
   2798 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2799 
   2800 	/*
   2801 	 * Lookup for free addresses is done in a loop and "loopmax"
   2802 	 * influences how long we spin in the loop
   2803 	 */
   2804 	if (bind_to_req_port_only) {
   2805 		/*
   2806 		 * If the requested port is busy, don't bother to look
   2807 		 * for a new one. Setting loop maximum count to 1 has
   2808 		 * that effect.
   2809 		 */
   2810 		loopmax = 1;
   2811 	} else {
   2812 		/*
   2813 		 * If the requested port is busy, look for a free one
   2814 		 * in the anonymous port range.
   2815 		 * Set loopmax appropriately so that one does not look
   2816 		 * forever in the case all of the anonymous ports are in use.
   2817 		 */
   2818 		if (connp->conn_anon_priv_bind) {
   2819 			/*
   2820 			 * loopmax =
   2821 			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
   2822 			 */
   2823 			loopmax = IPPORT_RESERVED -
   2824 			    tcps->tcps_min_anonpriv_port;
   2825 		} else {
   2826 			loopmax = (tcps->tcps_largest_anon_port -
   2827 			    tcps->tcps_smallest_anon_port + 1);
   2828 		}
   2829 	}
   2830 	do {
   2831 		uint16_t	lport;
   2832 		tf_t		*tbf;
   2833 		tcp_t		*ltcp;
   2834 		conn_t		*lconnp;
   2835 
   2836 		lport = htons(port);
   2837 
   2838 		/*
   2839 		 * Ensure that the tcp_t is not currently in the bind hash.
   2840 		 * Hold the lock on the hash bucket to ensure that
   2841 		 * the duplicate check plus the insertion is an atomic
   2842 		 * operation.
   2843 		 *
   2844 		 * This function does an inline lookup on the bind hash list
   2845 		 * Make sure that we access only members of tcp_t
   2846 		 * and that we don't look at tcp_tcp, since we are not
   2847 		 * doing a CONN_INC_REF.
   2848 		 */
   2849 		tcp_bind_hash_remove(tcp);
   2850 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
   2851 		mutex_enter(&tbf->tf_lock);
   2852 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
   2853 		    ltcp = ltcp->tcp_bind_hash) {
   2854 			if (lport == ltcp->tcp_connp->conn_lport)
   2855 				break;
   2856 		}
   2857 
   2858 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
   2859 			boolean_t not_socket;
   2860 			boolean_t exclbind;
   2861 
   2862 			lconnp = ltcp->tcp_connp;
   2863 
   2864 			/*
   2865 			 * On a labeled system, we must treat bindings to ports
   2866 			 * on shared IP addresses by sockets with MAC exemption
   2867 			 * privilege as being in all zones, as there's
   2868 			 * otherwise no way to identify the right receiver.
   2869 			 */
   2870 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
   2871 				continue;
   2872 
   2873 			/*
   2874 			 * If TCP_EXCLBIND is set for either the bound or
   2875 			 * binding endpoint, the semantics of bind
   2876 			 * is changed according to the following.
   2877 			 *
   2878 			 * spec = specified address (v4 or v6)
   2879 			 * unspec = unspecified address (v4 or v6)
   2880 			 * A = specified addresses are different for endpoints
   2881 			 *
   2882 			 * bound	bind to		allowed
   2883 			 * -------------------------------------
   2884 			 * unspec	unspec		no
   2885 			 * unspec	spec		no
   2886 			 * spec		unspec		no
   2887 			 * spec		spec		yes if A
   2888 			 *
   2889 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
   2890 			 * as TCP_EXCLBIND, except that zoneid is ignored.
   2891 			 *
   2892 			 * Note:
   2893 			 *
   2894 			 * 1. Because of TLI semantics, an endpoint can go
   2895 			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
   2896 			 * TCPS_BOUND, depending on whether it is originally
   2897 			 * a listener or not.  That is why we need to check
   2898 			 * for states greater than or equal to TCPS_BOUND
   2899 			 * here.
   2900 			 *
   2901 			 * 2. Ideally, we should only check for state equals
   2902 			 * to TCPS_LISTEN. And the following check should be
   2903 			 * added.
   2904 			 *
   2905 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
   2906 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
   2907 			 *		...
   2908 			 * }
   2909 			 *
   2910 			 * The semantics will be changed to this.  If the
   2911 			 * endpoint on the list is in state not equal to
   2912 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
   2913 			 * set, let the bind succeed.
   2914 			 *
   2915 			 * Because of (1), we cannot do that for TLI
   2916 			 * endpoints.  But we can do that for socket endpoints.
   2917 			 * If in future, we can change this going back
   2918 			 * semantics, we can use the above check for TLI also.
   2919 			 */
   2920 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
   2921 			    TCP_IS_SOCKET(tcp));
   2922 			exclbind = lconnp->conn_exclbind ||
   2923 			    connp->conn_exclbind;
   2924 
   2925 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
   2926 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
   2927 			    (exclbind && (not_socket ||
   2928 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
   2929 				if (V6_OR_V4_INADDR_ANY(
   2930 				    lconnp->conn_bound_addr_v6) ||
   2931 				    V6_OR_V4_INADDR_ANY(*laddr) ||
   2932 				    IN6_ARE_ADDR_EQUAL(laddr,
   2933 				    &lconnp->conn_bound_addr_v6)) {
   2934 					break;
   2935 				}
   2936 				continue;
   2937 			}
   2938 
   2939 			/*
   2940 			 * Check ipversion to allow IPv4 and IPv6 sockets to
   2941 			 * have disjoint port number spaces, if *_EXCLBIND
   2942 			 * is not set and only if the application binds to a
   2943 			 * specific port. We use the same autoassigned port
   2944 			 * number space for IPv4 and IPv6 sockets.
   2945 			 */
   2946 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
   2947 			    bind_to_req_port_only)
   2948 				continue;
   2949 
   2950 			/*
   2951 			 * Ideally, we should make sure that the source
   2952 			 * address, remote address, and remote port in the
   2953 			 * four tuple for this tcp-connection is unique.
   2954 			 * However, trying to find out the local source
   2955 			 * address would require too much code duplication
   2956 			 * with IP, since IP needs needs to have that code
   2957 			 * to support userland TCP implementations.
   2958 			 */
   2959 			if (quick_connect &&
   2960 			    (ltcp->tcp_state > TCPS_LISTEN) &&
   2961 			    ((connp->conn_fport != lconnp->conn_fport) ||
   2962 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
   2963 			    &lconnp->conn_faddr_v6)))
   2964 				continue;
   2965 
   2966 			if (!reuseaddr) {
   2967 				/*
   2968 				 * No socket option SO_REUSEADDR.
   2969 				 * If existing port is bound to
   2970 				 * a non-wildcard IP address
   2971 				 * and the requesting stream is
   2972 				 * bound to a distinct
   2973 				 * different IP addresses
   2974 				 * (non-wildcard, also), keep
   2975 				 * going.
   2976 				 */
   2977 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
   2978 				    !V6_OR_V4_INADDR_ANY(
   2979 				    lconnp->conn_bound_addr_v6) &&
   2980 				    !IN6_ARE_ADDR_EQUAL(laddr,
   2981 				    &lconnp->conn_bound_addr_v6))
   2982 					continue;
   2983 				if (ltcp->tcp_state >= TCPS_BOUND) {
   2984 					/*
   2985 					 * This port is being used and
   2986 					 * its state is >= TCPS_BOUND,
   2987 					 * so we can't bind to it.
   2988 					 */
   2989 					break;
   2990 				}
   2991 			} else {
   2992 				/*
   2993 				 * socket option SO_REUSEADDR is set on the
   2994 				 * binding tcp_t.
   2995 				 *
   2996 				 * If two streams are bound to
   2997 				 * same IP address or both addr
   2998 				 * and bound source are wildcards
   2999 				 * (INADDR_ANY), we want to stop
   3000 				 * searching.
   3001 				 * We have found a match of IP source
   3002 				 * address and source port, which is
   3003 				 * refused regardless of the
   3004 				 * SO_REUSEADDR setting, so we break.
   3005 				 */
   3006 				if (IN6_ARE_ADDR_EQUAL(laddr,
   3007 				    &lconnp->conn_bound_addr_v6) &&
   3008 				    (ltcp->tcp_state == TCPS_LISTEN ||
   3009 				    ltcp->tcp_state == TCPS_BOUND))
   3010 					break;
   3011 			}
   3012 		}
   3013 		if (ltcp != NULL) {
   3014 			/* The port number is busy */
   3015 			mutex_exit(&tbf->tf_lock);
   3016 		} else {
   3017 			/*
   3018 			 * This port is ours. Insert in fanout and mark as
   3019 			 * bound to prevent others from getting the port
   3020 			 * number.
   3021 			 */
   3022 			tcp->tcp_state = TCPS_BOUND;
   3023 			connp->conn_lport = htons(port);
   3024 
   3025 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
   3026 			    connp->conn_lport)] == tbf);
   3027 			tcp_bind_hash_insert(tbf, tcp, 1);
   3028 
   3029 			mutex_exit(&tbf->tf_lock);
   3030 
   3031 			/*
   3032 			 * We don't want tcp_next_port_to_try to "inherit"
   3033 			 * a port number supplied by the user in a bind.
   3034 			 */
   3035 			if (user_specified)
   3036 				return (port);
   3037 
   3038 			/*
   3039 			 * This is the only place where tcp_next_port_to_try
   3040 			 * is updated. After the update, it may or may not
   3041 			 * be in the valid range.
   3042 			 */
   3043 			if (!connp->conn_anon_priv_bind)
   3044 				tcps->tcps_next_port_to_try = port + 1;
   3045 			return (port);
   3046 		}
   3047 
   3048 		if (connp->conn_anon_priv_bind) {
   3049 			port = tcp_get_next_priv_port(tcp);
   3050 		} else {
   3051 			if (count == 0 && user_specified) {
   3052 				/*
   3053 				 * We may have to return an anonymous port. So
   3054 				 * get one to start with.
   3055 				 */
   3056 				port =
   3057 				    tcp_update_next_port(
   3058 				    tcps->tcps_next_port_to_try,
   3059 				    tcp, B_TRUE);
   3060 				user_specified = B_FALSE;
   3061 			} else {
   3062 				port = tcp_update_next_port(port + 1, tcp,
   3063 				    B_FALSE);
   3064 			}
   3065 		}
   3066 		if (port == 0)
   3067 			break;
   3068 
   3069 		/*
   3070 		 * Don't let this loop run forever in the case where
   3071 		 * all of the anonymous ports are in use.
   3072 		 */
   3073 	} while (++count < loopmax);
   3074 	return (0);
   3075 }
   3076 
   3077 /*
   3078  * Handler with the (arg, mp, arg2, ira) signature used by the squeue
   3079  * callbacks in this file.  It frees mp and, when the tcp is past
   3080  * TCPS_BOUND, kills it with ETIMEDOUT.  tcp_clean_death and
   3081  * tcp_close_detached must not run more than once on a tcp, so callers
   3082  * like tcp_input_data, tcp_eager_kill, tcp_timer_handler check it too.
   3083  */
   3084 /* ARGSUSED */
   3085 void
   3086 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
   3087     ip_recv_attr_t *dummy)
   3088 {
   3089 	conn_t	*connp = (conn_t *)arg;
   3090 	tcp_t	*tcp = connp->conn_tcp;
   3091 
   3092 	freemsg(mp);
   3093 	if (tcp->tcp_state > TCPS_BOUND)
   3094 		(void) tcp_clean_death(tcp, ETIMEDOUT, 5);
   3095 }
   3096 
   3097 /*
   3098  * We are dying for some reason.  Try to do it gracefully.  (May be called
   3099  * as writer.)
   3100  *
         * err is the error to report to the client; 0 means no error and
         * suppresses the notification.  tag identifies the call site: it is
         * passed to TCP_CLD_STAT() and, when TCP_TAG_CLEAN_DEATH is enabled,
         * saved in tcp_cleandeathtag.
         *
         * As implemented here, a detached tcp is closed and 0 is returned.
         * Otherwise the client is notified (if err is set and the state is at
         * least TCPS_SYN_SENT), the tcp is reinitialized by tcp_reinit() and
         * -1 is returned.
         *
   3101  * Return -1 if the structure was not cleaned up (if the cleanup had to be
   3102  * done by a service procedure).
   3103  * TBD - Should the return value distinguish between the tcp_t being
   3104  * freed and it being reinitialized?
   3105  */
   3106 static int
   3107 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
   3108 {
   3109 	mblk_t	*mp;
   3110 	queue_t	*q;
   3111 	conn_t	*connp = tcp->tcp_connp;
   3112 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3113 
   3114 	TCP_CLD_STAT(tag);
   3115 
   3116 #if TCP_TAG_CLEAN_DEATH
   3117 	tcp->tcp_cleandeathtag = tag;
   3118 #endif
   3119 
   3120 	if (tcp->tcp_fused)
   3121 		tcp_unfuse(tcp);
   3122 
        	/*
        	 * A pending linger timer that can still be cancelled means a
        	 * closing thread is waiting; finish that close now.
        	 */
   3123 	if (tcp->tcp_linger_tid != 0 &&
   3124 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3125 		tcp_stop_lingering(tcp);
   3126 	}
   3127 
        	/*
        	 * NOTE(review): tcp has already been dereferenced above, so this
        	 * assertion comes too late to catch a NULL tcp.
        	 */
   3128 	ASSERT(tcp != NULL);
   3129 	ASSERT((connp->conn_family == AF_INET &&
   3130 	    connp->conn_ipversion == IPV4_VERSION) ||
   3131 	    (connp->conn_family == AF_INET6 &&
   3132 	    (connp->conn_ipversion == IPV4_VERSION ||
   3133 	    connp->conn_ipversion == IPV6_VERSION)));
   3134 
   3135 	if (TCP_IS_DETACHED(tcp)) {
   3136 		if (tcp->tcp_hard_binding) {
   3137 			/*
   3138 			 * Its an eager that we are dealing with. We close the
   3139 			 * eager but in case a conn_ind has already gone to the
   3140 			 * listener, let tcp_accept_finish() send a discon_ind
   3141 			 * to the listener and drop the last reference. If the
   3142 			 * listener doesn't even know about the eager i.e. the
   3143 			 * conn_ind hasn't gone up, blow away the eager and drop
   3144 			 * the last reference as well. If the conn_ind has gone
   3145 			 * up, state should be BOUND. tcp_accept_finish
   3146 			 * will figure out that the connection has received a
   3147 			 * RST and will send a DISCON_IND to the application.
   3148 			 */
   3149 			tcp_closei_local(tcp);
   3150 			if (!tcp->tcp_tconnind_started) {
   3151 				CONN_DEC_REF(connp);
   3152 			} else {
        				/*
        				 * tcp_closei_local() left the state at
        				 * TCPS_CLOSED; put it back to BOUND.
        				 */
   3153 				tcp->tcp_state = TCPS_BOUND;
   3154 			}
   3155 		} else {
   3156 			tcp_close_detached(tcp);
   3157 		}
   3158 		return (0);
   3159 	}
   3160 
   3161 	TCP_STAT(tcps, tcp_clean_death_nondetached);
   3162 
   3163 	/*
   3164 	 * The connection is dead.  Decrement listener connection counter if
   3165 	 * necessary.
   3166 	 */
   3167 	if (tcp->tcp_listen_cnt != NULL)
   3168 		TCP_DECR_LISTEN_CNT(tcp);
   3169 
   3170 	q = connp->conn_rq;
   3171 
   3172 	/* Trash all inbound data */
   3173 	if (!IPCL_IS_NONSTR(connp)) {
   3174 		ASSERT(q != NULL);
   3175 		flushq(q, FLUSHALL);
   3176 	}
   3177 
   3178 	/*
   3179 	 * If we are at least part way open and there is error
   3180 	 * (err==0 implies no error)
   3181 	 * notify our client by a T_DISCON_IND.
   3182 	 */
   3183 	if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
   3184 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
   3185 		    !TCP_IS_SOCKET(tcp)) {
   3186 			/*
   3187 			 * Send M_FLUSH according to TPI. Because sockets will
   3188 			 * (and must) ignore FLUSHR we do that only for TPI
   3189 			 * endpoints and sockets in STREAMS mode.
   3190 			 */
   3191 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
   3192 		}
   3193 		if (connp->conn_debug) {
   3194 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   3195 			    "tcp_clean_death: discon err %d", err);
   3196 		}
   3197 		if (IPCL_IS_NONSTR(connp)) {
   3198 			/* Direct socket, use upcall */
   3199 			(*connp->conn_upcalls->su_disconnected)(
   3200 			    connp->conn_upper_handle, tcp->tcp_connid, err);
   3201 		} else {
        			/*
        			 * STREAMS endpoint: send a T_DISCON_IND upstream, or
        			 * an M_ERROR (EPROTO) if it cannot be allocated.
        			 */
   3202 			mp = mi_tpi_discon_ind(NULL, err, 0);
   3203 			if (mp != NULL) {
   3204 				putnext(q, mp);
   3205 			} else {
   3206 				if (connp->conn_debug) {
   3207 					(void) strlog(TCP_MOD_ID, 0, 1,
   3208 					    SL_ERROR|SL_TRACE,
   3209 					    "tcp_clean_death, sending M_ERROR");
   3210 				}
   3211 				(void) putnextctl1(q, M_ERROR, EPROTO);
   3212 			}
   3213 		}
        		/* Account for the failure in the MIB, based on the state. */
   3214 		if (tcp->tcp_state <= TCPS_SYN_RCVD) {
   3215 			/* SYN_SENT or SYN_RCVD */
   3216 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3217 		} else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
   3218 			/* ESTABLISHED or CLOSE_WAIT */
   3219 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3220 		}
   3221 	}
   3222 
        	/*
        	 * Reinitialize the tcp; direct (non-STREAMS) sockets are unbound as
        	 * well, ignoring the result of tcp_do_unbind().
        	 */
   3223 	tcp_reinit(tcp);
   3224 	if (IPCL_IS_NONSTR(connp))
   3225 		(void) tcp_do_unbind(connp);
   3226 
   3227 	return (-1);
   3228 }
   3229 
   3230 /*
   3231  * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
   3232  * to expire, stop the wait and finish the close.
         *
         * tcp_linger_tid is cleared first.  A tcp past TCPS_LISTEN is detached
         * from its queues: in TIME_WAIT it is appended to the time wait list,
         * otherwise tcp_timer is restarted.  A tcp at or below TCPS_LISTEN is
         * closed locally and a reference on its conn is dropped.  In every case
         * tcp_closed is set and tcp_closecv is signalled under tcp_closelock so
         * that the thread waiting in tcp_close_common() can continue.
   3233  */
   3234 static void
   3235 tcp_stop_lingering(tcp_t *tcp)
   3236 {
   3237 	clock_t	delta = 0;
   3238 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3239 	conn_t		*connp = tcp->tcp_connp;
   3240 
   3241 	tcp->tcp_linger_tid = 0;
   3242 	if (tcp->tcp_state > TCPS_LISTEN) {
   3243 		tcp_acceptor_hash_remove(tcp);
   3244 		mutex_enter(&tcp->tcp_non_sq_lock);
   3245 		if (tcp->tcp_flow_stopped) {
   3246 			tcp_clrqfull(tcp);
   3247 		}
   3248 		mutex_exit(&tcp->tcp_non_sq_lock);
   3249 
        		/* Remember the cancel result; it decides the restart below. */
   3250 		if (tcp->tcp_timer_tid != 0) {
   3251 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3252 			tcp->tcp_timer_tid = 0;
   3253 		}
   3254 		/*
   3255 		 * Need to cancel those timers which will not be used when
   3256 		 * TCP is detached.  This has to be done before the conn_wq
   3257 		 * is cleared.
   3258 		 */
   3259 		tcp_timers_stop(tcp);
   3260 
   3261 		tcp->tcp_detached = B_TRUE;
   3262 		connp->conn_rq = NULL;
   3263 		connp->conn_wq = NULL;
   3264 
   3265 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3266 			tcp_time_wait_append(tcp);
   3267 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3268 			goto finish;
   3269 		}
   3270 
   3271 		/*
   3272 		 * If delta is zero the timer event wasn't executed and was
   3273 		 * successfully canceled. In this case we need to restart it
   3274 		 * with the minimal delta possible.
   3275 		 */
   3276 		if (delta >= 0) {
   3277 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3278 			    delta ? delta : 1);
   3279 		}
   3280 	} else {
   3281 		tcp_closei_local(tcp);
   3282 		CONN_DEC_REF(connp);
        		/*
        		 * NOTE(review): tcp and connp are still used below, so the
        		 * caller presumably holds another reference - confirm.
        		 */
   3283 	}
   3284 finish:
   3285 	/* Signal closing thread that it can complete close */
   3286 	mutex_enter(&tcp->tcp_closelock);
        	/* Repeated here so that every path leaves the tcp detached. */
   3287 	tcp->tcp_detached = B_TRUE;
   3288 	connp->conn_rq = NULL;
   3289 	connp->conn_wq = NULL;
   3290 
   3291 	tcp->tcp_closed = 1;
   3292 	cv_signal(&tcp->tcp_closecv);
   3293 	mutex_exit(&tcp->tcp_closelock);
   3294 }
   3295 
   3296 /*
   3297  * SO_LINGER timer callback, armed by tcp_close_output().  Records ETIMEDOUT
   3298  * as the client error and lets tcp_stop_lingering() finish the close.
   3299  */
   3300 static void
   3301 tcp_close_linger_timeout(void *arg)
   3302 {
   3303 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   3304 
   3305 	/* The error is recorded before the closing thread is woken up. */
   3306 	tcp->tcp_client_errno = ETIMEDOUT;
   3307 	tcp_stop_lingering(tcp);
   3308 }
   3309 
        /*
         * Close processing shared by the close entry points (tcp_tpi_close()
         * calls it for /dev/tcp and /dev/tcp6 streams).
         *
         * Marks connp as CONN_CLOSING and takes an extra reference on it, saves
         * flags (truncated to uint8_t) in tcp_closeflags, cleans up or waits
         * for outstanding ioctls, and then runs tcp_close_output() in the
         * conn's squeue using the preallocated tcp_closemp.  The calling thread
         * blocks until tcp_closed is set.  If that wait is interrupted by a
         * signal while SO_LINGER with a positive linger time is in effect,
         * tcp_linger_interrupted() is queued to cut the linger short; after the
         * first interruption the wait is no longer interruptible.  For a
         * listener with eagers, it also waits until conn_ref drops to 1.
         * Panics if tcp_closemp is already in use.
         */
   3310 static void
   3311 tcp_close_common(conn_t *connp, int flags)
   3312 {
   3313 	tcp_t		*tcp = connp->conn_tcp;
   3314 	mblk_t 		*mp = &tcp->tcp_closemp;
   3315 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
   3316 	mblk_t		*bp;
   3317 
   3318 	ASSERT(connp->conn_ref >= 2);
   3319 
   3320 	/*
   3321 	 * Mark the conn as closing. ipsq_pending_mp_add will not
   3322 	 * add any mp to the pending mp list, after this conn has
   3323 	 * started closing.
   3324 	 */
   3325 	mutex_enter(&connp->conn_lock);
   3326 	connp->conn_state_flags |= CONN_CLOSING;
   3327 	if (connp->conn_oper_pending_ill != NULL)
   3328 		conn_ioctl_cleanup_reqd = B_TRUE;
        	/* Reference for the tcp_close_output() squeue entry below. */
   3329 	CONN_INC_REF_LOCKED(connp);
   3330 	mutex_exit(&connp->conn_lock);
   3331 	tcp->tcp_closeflags = (uint8_t)flags;
   3332 	ASSERT(connp->conn_ref >= 3);
   3333 
   3334 	/*
   3335 	 * tcp_closemp_used is used below without any protection of a lock
   3336 	 * as we don't expect any one else to use it concurrently at this
   3337 	 * point otherwise it would be a major defect.
   3338 	 */
   3339 
   3340 	if (mp->b_prev == NULL)
   3341 		tcp->tcp_closemp_used = B_TRUE;
   3342 	else
   3343 		cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
   3344 		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
   3345 
   3346 	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   3347 
   3348 	/*
   3349 	 * Cleanup any queued ioctls here. This must be done before the wq/rq
   3350 	 * are re-written by tcp_close_output().
   3351 	 */
   3352 	if (conn_ioctl_cleanup_reqd)
   3353 		conn_ioctl_cleanup(connp);
   3354 
   3355 	/*
   3356 	 * As CONN_CLOSING is set, no further ioctls should be passed down to
   3357 	 * IP for this conn (see the guards in tcp_ioctl, tcp_wput_ioctl and
   3358 	 * tcp_wput_iocdata). If the ioctl was queued on an ipsq,
   3359 	 * conn_ioctl_cleanup should have found it and removed it. If the ioctl
   3360 	 * was still in flight at the time, we wait for it here. See comments
   3361 	 * for CONN_INC_IOCTLREF in ip.h for details.
   3362 	 */
   3363 	mutex_enter(&connp->conn_lock);
   3364 	while (connp->conn_ioctlref > 0)
   3365 		cv_wait(&connp->conn_cv, &connp->conn_lock);
   3366 	ASSERT(connp->conn_ioctlref == 0);
   3367 	ASSERT(connp->conn_oper_pending_ill == NULL);
   3368 	mutex_exit(&connp->conn_lock);
   3369 
        	/* Do the protocol side of the close inside the squeue. */
   3370 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
   3371 	    NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3372 
        	/* Wait for tcp_closed, allowing one interruption by a signal. */
   3373 	mutex_enter(&tcp->tcp_closelock);
   3374 	while (!tcp->tcp_closed) {
   3375 		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
   3376 			/*
   3377 			 * The cv_wait_sig() was interrupted. We now do the
   3378 			 * following:
   3379 			 *
   3380 			 * 1) If the endpoint was lingering, we allow this
   3381 			 * to be interrupted by cancelling the linger timeout
   3382 			 * and closing normally.
   3383 			 *
   3384 			 * 2) Revert to calling cv_wait()
   3385 			 *
   3386 			 * We revert to using cv_wait() to avoid an
   3387 			 * infinite loop which can occur if the calling
   3388 			 * thread is higher priority than the squeue worker
   3389 			 * thread and is bound to the same cpu.
   3390 			 */
   3391 			if (connp->conn_linger && connp->conn_lingertime > 0) {
   3392 				mutex_exit(&tcp->tcp_closelock);
   3393 				/* Entering squeue, bump ref count. */
   3394 				CONN_INC_REF(connp);
        				/* bp is freed by tcp_linger_interrupted(). */
   3395 				bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
   3396 				SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
   3397 				    tcp_linger_interrupted, connp, NULL,
   3398 				    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3399 				mutex_enter(&tcp->tcp_closelock);
   3400 			}
   3401 			break;
   3402 		}
   3403 	}
        	/* Uninterruptible wait for whatever is left of the close. */
   3404 	while (!tcp->tcp_closed)
   3405 		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
   3406 	mutex_exit(&tcp->tcp_closelock);
   3407 
   3408 	/*
   3409 	 * In the case of listener streams that have eagers in the q or q0
   3410 	 * we wait for the eagers to drop their reference to us. conn_rq and
   3411 	 * conn_wq of the eagers point to our queues. By waiting for the
   3412 	 * refcnt to drop to 1, we are sure that the eagers have cleaned
   3413 	 * up their queue pointers and also dropped their references to us.
   3414 	 */
   3415 	if (tcp->tcp_wait_for_eagers) {
   3416 		mutex_enter(&connp->conn_lock);
   3417 		while (connp->conn_ref != 1) {
   3418 			cv_wait(&connp->conn_cv, &connp->conn_lock);
   3419 		}
   3420 		mutex_exit(&connp->conn_lock);
   3421 	}
   3422 
   3423 	connp->conn_cpid = NOPID;
   3424 }
   3425 
   3426 /*
   3427  * Close routine for a TCP stream opened as /dev/tcp or /dev/tcp6.
   3428  *
   3429  * A stream closed while in fallback (SO_FALLBACK set in flags) only
   3430  * frees its minor number.  Otherwise the conn is closed through
   3431  * tcp_close_common() and IP's reference on it is dropped.  Returns 0.
   3432  */
   3433 static int
   3434 tcp_tpi_close(queue_t *q, int flags)
   3435 {
   3436 	conn_t	*connp;
   3437 
   3438 	ASSERT(WR(q)->q_next == NULL);
   3439 
   3440 	if ((flags & SO_FALLBACK) != 0) {
   3441 		/*
   3442 		 * Closed while in fallback: simply free the resources
   3443 		 * that were allocated.
   3444 		 */
   3445 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
   3446 		qprocsoff(q);
   3447 	} else {
   3448 		connp = Q_TO_CONN(q);
   3449 		tcp_close_common(connp, flags);
   3450 
   3451 		qprocsoff(q);
   3452 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   3453 
   3454 		/*
   3455 		 * Drop IP's reference on the conn.  It is the last one if
   3456 		 * the state was less than established.  A connection that
   3457 		 * went into timewait still has one ref for the TCP and one
   3458 		 * for the classifier connected hash list, where it stays
   3459 		 * until closed.  Walkers or packets queued in the squeue
   3460 		 * may hold transient references as well, which is why the
   3461 		 * reference count is not asserted here.
   3462 		 */
   3463 		CONN_DEC_REF(connp);
   3464 	}
   3465 
   3466 	q->q_ptr = WR(q)->q_ptr = NULL;
   3467 	return (0);
   3468 }
   3469 
   3470 /*
   3471  * Close an acceptor STREAM that had been opened for sockfs and is now
   3472  * being closed due to some error.  Always returns 0.
   3473  */
   3474 static int
   3475 tcp_tpi_close_accept(queue_t *q)
   3476 {
   3477 	queue_t	*wq = WR(q);
   3478 	vmem_t	*minor_arena;
   3479 	dev_t	conn_dev;
   3480 
   3481 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
   3482 	qprocsoff(q);
   3483 	minor_arena = (vmem_t *)wq->q_ptr;
   3484 	conn_dev = (dev_t)RD(q)->q_ptr;
   3485 	ASSERT(minor_arena != NULL);
   3486 	ASSERT(conn_dev != 0);
   3487 	inet_minor_free(minor_arena, conn_dev);
   3488 	wq->q_ptr = NULL;
   3489 	q->q_ptr = NULL;
   3490 	return (0);
   3491 }
   3492 
   3493 /*
   3494  * Entered via the squeue from tcp_close_common() when the closing thread
   3495  * is interrupted by a signal while lingering.  Frees mp and, if the linger
   3496  * timer could still be cancelled, finishes the close with EINTR.
   3497  */
   3498 /* ARGSUSED */
   3499 static void
   3500 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   3501 {
   3502 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   3503 
   3504 	freeb(mp);
   3505 	if (tcp->tcp_linger_tid != 0 &&
   3506 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3507 		/* Set the error before tcp_stop_lingering() wakes the closer */
   3508 		tcp->tcp_client_errno = EINTR;
   3509 		tcp_stop_lingering(tcp);
   3510 	}
   3511 }
   3512 
   3513 /*
   3514  * Called by streams close routine via squeues when our client blows off her
   3515  * descriptor, we take this to mean: "close the stream state NOW, close the tcp
   3516  * connection politely" When SO_LINGER is set (with a non-zero linger time and
   3517  * it is not a nonblocking socket) then this routine sleeps until the FIN is
   3518  * acked.
   3519  *
   3520  * NOTE: tcp_close potentially returns error when lingering.
   3521  * However, the stream head currently does not pass these errors
   3522  * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
   3523  * errors to the application (from tsleep()) and not errors
   3524  * like ECONNRESET caused by receiving a reset packet.
         *
         * arg is the conn_t being closed.  mp is the tcp_closemp that
         * tcp_close_common() queued; it is not freed here.  Unless a linger
         * timer is started, the function ends by setting tcp_closed and
         * signalling tcp_closecv, which releases the thread waiting in
         * tcp_close_common().
   3525  */
   3526 
   3527 /* ARGSUSED */
   3528 static void
   3529 tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   3530 {
   3531 	char	*msg;
   3532 	conn_t	*connp = (conn_t *)arg;
   3533 	tcp_t	*tcp = connp->conn_tcp;
   3534 	clock_t	delta = 0;
   3535 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3536 
   3537 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
   3538 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
   3539 
   3540 	mutex_enter(&tcp->tcp_eager_lock);
   3541 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
   3542 		/* Cleanup for listener */
   3543 		tcp_eager_cleanup(tcp, 0);
   3544 		tcp->tcp_wait_for_eagers = 1;
   3545 	}
   3546 	mutex_exit(&tcp->tcp_eager_lock);
   3547 
   3548 	tcp->tcp_lso = B_FALSE;
   3549 
        	/*
        	 * A non-NULL msg after the switch means the connection is aborted
        	 * with a RST carrying that debug text.  Breaking out of the switch
        	 * closes the tcp locally; "goto finish" leaves it detached instead.
        	 */
   3550 	msg = NULL;
   3551 	switch (tcp->tcp_state) {
   3552 	case TCPS_CLOSED:
   3553 	case TCPS_IDLE:
   3554 	case TCPS_BOUND:
   3555 	case TCPS_LISTEN:
   3556 		break;
   3557 	case TCPS_SYN_SENT:
   3558 		msg = "tcp_close, during connect";
   3559 		break;
   3560 	case TCPS_SYN_RCVD:
   3561 		/*
   3562 		 * Close during the connect 3-way handshake
   3563 		 * but here there may or may not be pending data
   3564 		 * already on queue. Process almost same as in
   3565 		 * the ESTABLISHED state.
   3566 		 */
   3567 		/* FALLTHRU */
   3568 	default:
   3569 		if (tcp->tcp_fused)
   3570 			tcp_unfuse(tcp);
   3571 
   3572 		/*
   3573 		 * If SO_LINGER has set a zero linger time, abort the
   3574 		 * connection with a reset.
   3575 		 */
   3576 		if (connp->conn_linger && connp->conn_lingertime == 0) {
   3577 			msg = "tcp_close, zero lingertime";
   3578 			break;
   3579 		}
   3580 
   3581 		/*
   3582 		 * Abort connection if there is unread data queued.
   3583 		 */
   3584 		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
   3585 			msg = "tcp_close, unread data";
   3586 			break;
   3587 		}
   3588 		/*
   3589 		 * We have done a qwait() above which could have possibly
   3590 		 * drained more messages in turn causing transition to a
   3591 		 * different state. Check whether we have to do the rest
   3592 		 * of the processing or not.
        		 *
        		 * NOTE(review): no qwait() call is visible in this
        		 * function; this comment may be stale - confirm.
   3593 		 */
   3594 		if (tcp->tcp_state <= TCPS_LISTEN)
   3595 			break;
   3596 
   3597 		/*
   3598 		 * Transmit the FIN before detaching the tcp_t.
   3599 		 * After tcp_detach returns this queue/perimeter
   3600 		 * no longer owns the tcp_t thus others can modify it.
   3601 		 */
   3602 		(void) tcp_xmit_end(tcp);
   3603 
   3604 		/*
   3605 		 * If lingering on close then wait until the fin is acked,
   3606 		 * the SO_LINGER time passes, or a reset is sent/received.
   3607 		 */
   3608 		if (connp->conn_linger && connp->conn_lingertime > 0 &&
   3609 		    !(tcp->tcp_fin_acked) &&
   3610 		    tcp->tcp_state >= TCPS_ESTABLISHED) {
   3611 			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
   3612 				tcp->tcp_client_errno = EWOULDBLOCK;
   3613 			} else if (tcp->tcp_client_errno == 0) {
   3614 
   3615 				ASSERT(tcp->tcp_linger_tid == 0);
   3616 
   3617 				tcp->tcp_linger_tid = TCP_TIMER(tcp,
   3618 				    tcp_close_linger_timeout,
   3619 				    connp->conn_lingertime * hz);
   3620 
   3621 				/* tcp_close_linger_timeout will finish close */
        				/*
        				 * On return the closing thread stays blocked
        				 * until tcp_stop_lingering() signals it.
        				 */
   3622 				if (tcp->tcp_linger_tid == 0)
   3623 					tcp->tcp_client_errno = ENOSR;
   3624 				else
   3625 					return;
   3626 			}
   3627 
   3628 			/*
   3629 			 * Check if we need to detach or just close
   3630 			 * the instance.
   3631 			 */
   3632 			if (tcp->tcp_state <= TCPS_LISTEN)
   3633 				break;
   3634 		}
   3635 
   3636 		/*
   3637 		 * Make sure that no other thread will access the conn_rq of
   3638 		 * this instance (through lookups etc.) as conn_rq will go
   3639 		 * away shortly.
   3640 		 */
   3641 		tcp_acceptor_hash_remove(tcp);
   3642 
   3643 		mutex_enter(&tcp->tcp_non_sq_lock);
   3644 		if (tcp->tcp_flow_stopped) {
   3645 			tcp_clrqfull(tcp);
   3646 		}
   3647 		mutex_exit(&tcp->tcp_non_sq_lock);
   3648 
        		/* Remember the cancel result; it decides the restart below. */
   3649 		if (tcp->tcp_timer_tid != 0) {
   3650 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3651 			tcp->tcp_timer_tid = 0;
   3652 		}
   3653 		/*
   3654 		 * Need to cancel those timers which will not be used when
   3655 		 * TCP is detached.  This has to be done before the conn_wq
   3656 		 * is set to NULL.
   3657 		 */
   3658 		tcp_timers_stop(tcp);
   3659 
   3660 		tcp->tcp_detached = B_TRUE;
   3661 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3662 			tcp_time_wait_append(tcp);
   3663 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3664 			ASSERT(connp->conn_ref >= 3);
   3665 			goto finish;
   3666 		}
   3667 
   3668 		/*
   3669 		 * If delta is zero the timer event wasn't executed and was
   3670 		 * successfully canceled. In this case we need to restart it
   3671 		 * with the minimal delta possible.
   3672 		 */
   3673 		if (delta >= 0)
   3674 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3675 			    delta ? delta : 1);
   3676 
   3677 		ASSERT(connp->conn_ref >= 3);
   3678 		goto finish;
   3679 	}
   3680 
   3681 	/* Detach did not complete. Still need to remove q from stream. */
   3682 	if (msg) {
        		/* Aborting: count the reset in the MIB, then send the RST. */
   3683 		if (tcp->tcp_state == TCPS_ESTABLISHED ||
   3684 		    tcp->tcp_state == TCPS_CLOSE_WAIT)
   3685 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3686 		if (tcp->tcp_state == TCPS_SYN_SENT ||
   3687 		    tcp->tcp_state == TCPS_SYN_RCVD)
   3688 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3689 		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
   3690 	}
   3691 
   3692 	tcp_closei_local(tcp);
   3693 	CONN_DEC_REF(connp);
   3694 	ASSERT(connp->conn_ref >= 2);
   3695 
   3696 finish:
   3697 	mutex_enter(&tcp->tcp_closelock);
   3698 	/*
   3699 	 * Don't change the queues in the case of a listener that has
   3700 	 * eagers in its q or q0. It could surprise the eagers.
   3701 	 * Instead wait for the eagers outside the squeue.
   3702 	 */
   3703 	if (!tcp->tcp_wait_for_eagers) {
   3704 		tcp->tcp_detached = B_TRUE;
   3705 		connp->conn_rq = NULL;
   3706 		connp->conn_wq = NULL;
   3707 	}
   3708 
   3709 	/* Signal tcp_close() to finish closing. */
   3710 	tcp->tcp_closed = 1;
   3711 	cv_signal(&tcp->tcp_closecv);
   3712 	mutex_exit(&tcp->tcp_closelock);
   3713 }
   3714 
   3715 /*
   3716  * Free the message at *mpp and reset *mpp to NULL.  First the b_next and
   3717  * b_prev fields of every mblk on its b_cont chain are cleared, as some
   3718  * stream heads get upset if they later see them as anything but NULL.
   3719  */
   3720 static void
   3721 tcp_close_mpp(mblk_t **mpp)
   3722 {
   3723 	mblk_t	*head = *mpp;
   3724 	mblk_t	*bp;
   3725 
   3726 	if (head == NULL)
   3727 		return;
   3728 	for (bp = head; bp != NULL; bp = bp->b_cont) {
   3729 		bp->b_next = NULL;
   3730 		bp->b_prev = NULL;
   3731 	}
   3732 	*mpp = NULL;
   3733 	freemsg(head);
   3734 }
   3735 
   3736 /* Close a detached tcp and drop a reference on its conn. */
   3737 static void
   3738 tcp_close_detached(tcp_t *tcp)
   3739 {
   3740 	conn_t	*connp = tcp->tcp_connp;
   3741 
   3742 	if (tcp->tcp_fused)
   3743 		tcp_unfuse(tcp);
   3744 
   3745 	/*
   3746 	 * Clustering code serializes the TCP disconnect callback (made from
   3747 	 * tcp_closei_local()) with its tcp list walks by blocking the callback
   3748 	 * while a walk is in progress; this keeps its accounting accurate.
   3749 	 */
   3750 	tcp_closei_local(tcp);
   3751 	CONN_DEC_REF(connp);
   3752 }
   3753 
   3754 /*
   3755  * Cancel the timer, ka, ack, push and reass timers and clear their ids.
   3756  */
   3757 void
   3758 tcp_timers_stop(tcp_t *tcp)
   3759 {
   3760 	if (tcp->tcp_timer_tid) {
   3761 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3762 		tcp->tcp_timer_tid = 0;
   3763 	}
   3764 	if (tcp->tcp_ka_tid) {
   3765 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
   3766 		tcp->tcp_ka_tid = 0;
   3767 	}
   3768 	if (tcp->tcp_ack_tid) {
   3769 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
   3770 		tcp->tcp_ack_tid = 0;
   3771 	}
   3772 	if (tcp->tcp_push_tid) {
   3773 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
   3774 		tcp->tcp_push_tid = 0;
   3775 	}
   3776 	if (tcp->tcp_reass_tid) {
   3777 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
   3778 		tcp->tcp_reass_tid = 0;
   3779 	}
   3780 }
   3781 
   3782 /*
   3783  * The tcp_t is going away. Remove it from all lists and set it
   3784  * to TCPS_CLOSED. The freeing up of memory is deferred until
   3785  * tcp_inactive. This is needed since a thread in tcp_rput might have
   3786  * done a CONN_INC_REF on this structure before it was removed from the
   3787  * hashes.
         *
         * The conn is also flagged CONN_CONDEMNED.  No reference on the conn
         * itself is dropped here; the callers in this file do their own
         * CONN_DEC_REF after calling this function.
   3788  */
   3789 static void
   3790 tcp_closei_local(tcp_t *tcp)
   3791 {
   3792 	conn_t		*connp = tcp->tcp_connp;
   3793 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3794 
   3795 	if (!TCP_IS_SOCKET(tcp))
   3796 		tcp_acceptor_hash_remove(tcp);
   3797 
        	/* Fold the per-connection segment counters into the MIB. */
   3798 	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
   3799 	tcp->tcp_ibsegs = 0;
   3800 	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
   3801 	tcp->tcp_obsegs = 0;
   3802 
   3803 	/*
   3804 	 * If we are an eager connection hanging off a listener that
   3805 	 * hasn't formally accepted the connection yet, get off his
   3806 	 * list and blow off any data that we have accumulated.
   3807 	 */
   3808 	if (tcp->tcp_listener != NULL) {
   3809 		tcp_t	*listener = tcp->tcp_listener;
   3810 		mutex_enter(&listener->tcp_eager_lock);
   3811 		/*
   3812 		 * tcp_tconnind_started == B_TRUE means that the
   3813 		 * conn_ind has already gone to listener. At
   3814 		 * this point, eager will be closed but we
   3815 		 * leave it in listeners eager list so that
   3816 		 * if listener decides to close without doing
   3817 		 * accept, we can clean this up. In tcp_tli_accept
   3818 		 * we take care of the case of accept on closed
   3819 		 * eager.
   3820 		 */
   3821 		if (!tcp->tcp_tconnind_started) {
   3822 			tcp_eager_unlink(tcp);
   3823 			mutex_exit(&listener->tcp_eager_lock);
   3824 			/*
   3825 			 * We don't want to have any pointers to the
   3826 			 * listener queue, after we have released our
   3827 			 * reference on the listener
   3828 			 */
   3829 			ASSERT(tcp->tcp_detached);
   3830 			connp->conn_rq = NULL;
   3831 			connp->conn_wq = NULL;
   3832 			CONN_DEC_REF(listener->tcp_connp);
   3833 		} else {
   3834 			mutex_exit(&listener->tcp_eager_lock);
   3835 		}
   3836 	}
   3837 
   3838 	/* Stop all the timers */
   3839 	tcp_timers_stop(tcp);
   3840 
        	/* A listener may own an IP address cache; release it. */
   3841 	if (tcp->tcp_state == TCPS_LISTEN) {
   3842 		if (tcp->tcp_ip_addr_cache) {
   3843 			kmem_free((void *)tcp->tcp_ip_addr_cache,
   3844 			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   3845 			tcp->tcp_ip_addr_cache = NULL;
   3846 		}
   3847 	}
   3848 
   3849 	/* Decrement listener connection counter if necessary. */
   3850 	if (tcp->tcp_listen_cnt != NULL)
   3851 		TCP_DECR_LISTEN_CNT(tcp);
   3852 
   3853 	mutex_enter(&tcp->tcp_non_sq_lock);
   3854 	if (tcp->tcp_flow_stopped)
   3855 		tcp_clrqfull(tcp);
   3856 	mutex_exit(&tcp->tcp_non_sq_lock);
   3857 
   3858 	tcp_bind_hash_remove(tcp);
   3859 	/*
   3860 	 * If the tcp_time_wait_collector (which runs outside the squeue)
   3861 	 * is trying to remove this tcp from the time wait list, we will
   3862 	 * block in tcp_time_wait_remove while trying to acquire the
   3863 	 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
   3864 	 * requires the ipcl_hash_remove to be ordered after the
   3865 	 * tcp_time_wait_remove for the refcnt checks to work correctly.
   3866 	 */
   3867 	if (tcp->tcp_state == TCPS_TIME_WAIT)
   3868 		(void) tcp_time_wait_remove(tcp, NULL);
   3869 	CL_INET_DISCONNECT(connp);
   3870 	ipcl_hash_remove(connp);
   3871 	ixa_cleanup(connp->conn_ixa);
   3872 
   3873 	/*
   3874 	 * Mark the conn as CONDEMNED
   3875 	 */
   3876 	mutex_enter(&connp->conn_lock);
   3877 	connp->conn_state_flags |= CONN_CONDEMNED;
   3878 	mutex_exit(&connp->conn_lock);
   3879 
        	/* The tcp must be off the time wait list by now. */
   3880 	ASSERT(tcp->tcp_time_wait_next == NULL);
   3881 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   3882 	ASSERT(tcp->tcp_time_wait_expire == 0);
   3883 	tcp->tcp_state = TCPS_CLOSED;
   3884 
   3885 	/* Release any SSL context */
   3886 	if (tcp->tcp_kssl_ent != NULL) {
   3887 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   3888 		tcp->tcp_kssl_ent = NULL;
   3889 	}
   3890 	if (tcp->tcp_kssl_ctx != NULL) {
   3891 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   3892 		tcp->tcp_kssl_ctx = NULL;
   3893 	}
   3894 	tcp->tcp_kssl_pending = B_FALSE;
   3895 
   3896 	tcp_ipsec_cleanup(tcp);
   3897 }
   3898 
   3899 /*
   3900  * tcp is dying (called from ipcl_conn_destroy and error cases).
   3901  * Free the tcp_t in either case.
         *
         * What is released here is the memory hanging off the tcp_t: queued
         * messages (transmit, reassembly, receive, urgent data, preallocated
         * sigurg and ordrel mblks, the eager conn_ind), the SACK state and the
         * saved IPv6 option buffers.  The queue pointers of the conn are
         * cleared as well.  The tcp_t structure itself is not freed in this
         * function.
   3902  */
   3903 void
   3904 tcp_free(tcp_t *tcp)
   3905 {
   3906 	mblk_t		*mp;
   3907 	conn_t		*connp = tcp->tcp_connp;
   3908 
        	/*
        	 * NOTE(review): tcp was already dereferenced in the declaration
        	 * above, so this assertion cannot catch a NULL tcp.
        	 */
   3909 	ASSERT(tcp != NULL);
   3910 	ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
   3911 
   3912 	connp->conn_rq = NULL;
   3913 	connp->conn_wq = NULL;
   3914 
   3915 	tcp_close_mpp(&tcp->tcp_xmit_head);
   3916 	tcp_close_mpp(&tcp->tcp_reass_head);
   3917 	if (tcp->tcp_rcv_list != NULL) {
   3918 		/* Free b_next chain */
        		/*
        		 * NOTE(review): tcp_close_mpp() walks b_cont only and
        		 * clears b_next, so messages linked through b_next would
        		 * not be freed by this call - confirm.
        		 */
   3919 		tcp_close_mpp(&tcp->tcp_rcv_list);
   3920 	}
        	/*
        	 * NOTE(review): the two urgent data pointers below are freed but
        	 * not reset to NULL, unlike the other fields in this function.
        	 */
   3921 	if ((mp = tcp->tcp_urp_mp) != NULL) {
   3922 		freemsg(mp);
   3923 	}
   3924 	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
   3925 		freemsg(mp);
   3926 	}
   3927 
   3928 	if (tcp->tcp_fused_sigurg_mp != NULL) {
   3929 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3930 		freeb(tcp->tcp_fused_sigurg_mp);
   3931 		tcp->tcp_fused_sigurg_mp = NULL;
   3932 	}
   3933 
   3934 	if (tcp->tcp_ordrel_mp != NULL) {
   3935 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3936 		freeb(tcp->tcp_ordrel_mp);
   3937 		tcp->tcp_ordrel_mp = NULL;
   3938 	}
   3939 
        	/* Drop the not-sacked list and zero the SACK info in place. */
   3940 	if (tcp->tcp_sack_info != NULL) {
   3941 		if (tcp->tcp_notsack_list != NULL) {
   3942 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   3943 			    tcp);
   3944 		}
   3945 		bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
   3946 	}
   3947 
        	/*
        	 * Free the saved option buffers.  Each length must be zero once
        	 * its buffer pointer is NULL, which the assertions check.
        	 */
   3948 	if (tcp->tcp_hopopts != NULL) {
   3949 		mi_free(tcp->tcp_hopopts);
   3950 		tcp->tcp_hopopts = NULL;
   3951 		tcp->tcp_hopoptslen = 0;
   3952 	}
   3953 	ASSERT(tcp->tcp_hopoptslen == 0);
   3954 	if (tcp->tcp_dstopts != NULL) {
   3955 		mi_free(tcp->tcp_dstopts);
   3956 		tcp->tcp_dstopts = NULL;
   3957 		tcp->tcp_dstoptslen = 0;
   3958 	}
   3959 	ASSERT(tcp->tcp_dstoptslen == 0);
   3960 	if (tcp->tcp_rthdrdstopts != NULL) {
   3961 		mi_free(tcp->tcp_rthdrdstopts);
   3962 		tcp->tcp_rthdrdstopts = NULL;
   3963 		tcp->tcp_rthdrdstoptslen = 0;
   3964 	}
   3965 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   3966 	if (tcp->tcp_rthdr != NULL) {
   3967 		mi_free(tcp->tcp_rthdr);
   3968 		tcp->tcp_rthdr = NULL;
   3969 		tcp->tcp_rthdrlen = 0;
   3970 	}
   3971 	ASSERT(tcp->tcp_rthdrlen == 0);
   3972 
   3973 	/*
   3974 	 * The following really blows away a union.  Since the union
   3975 	 * happens to have exactly two members of identical size,
   3976 	 * the following code is enough.
   3977 	 */
   3978 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
   3979 }
   3980 
   3981 
   3982 /*
   3983  * Put a connection confirmation message upstream built from the
   3984  * address/flowid information with the conn and iph. Report our success or
   3985  * failure.
   3986  */
   3987 static boolean_t
   3988 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
   3989     mblk_t **defermp, ip_recv_attr_t *ira)
   3990 {
   3991 	sin_t	sin;
   3992 	sin6_t	sin6;
   3993 	mblk_t	*mp;
   3994 	char	*optp = NULL;
   3995 	int	optlen = 0;
   3996 	conn_t	*connp = tcp->tcp_connp;
   3997 
   3998 	if (defermp != NULL)
   3999 		*defermp = NULL;
   4000 
   4001 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
   4002 		/*
   4003 		 * Return in T_CONN_CON results of option negotiation through
   4004 		 * the T_CONN_REQ. Note: If there is an real end-to-end option
   4005 		 * negotiation, then what is received from remote end needs
   4006 		 * to be taken into account but there is no such thing (yet?)
   4007 		 * in our TCP/IP.
   4008 		 * Note: We do not use mi_offset_param() here as
   4009 		 * tcp_opts_conn_req contents do not directly come from
   4010 		 * an application and are either generated in kernel or
   4011 		 * from user input that was already verified.
   4012 		 */
   4013 		mp = tcp->tcp_conn.tcp_opts_conn_req;
   4014 		optp = (char *)(mp->b_rptr +
   4015 		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
   4016 		optlen = (int)
   4017 		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
   4018 	}
   4019 
   4020 	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
   4021 
   4022 		/* packet is IPv4 */
   4023 		if (connp->conn_family == AF_INET) {
   4024 			sin = sin_null;
   4025 			sin.sin_addr.s_addr = connp->conn_faddr_v4;
   4026 			sin.sin_port = connp->conn_fport;
   4027 			sin.sin_family = AF_INET;
   4028 			mp = mi_tpi_conn_con(NULL, (char *)&sin,
   4029 			    (int)sizeof (sin_t), optp, optlen);
   4030 		} else {
   4031 			sin6 = sin6_null;
   4032 			sin6.sin6_addr = connp->conn_faddr_v6;
   4033 			sin6.sin6_port = connp->conn_fport;
   4034 			sin6.sin6_family = AF_INET6;
   4035 			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
   4036 			    (int)sizeof (sin6_t), optp, optlen);
   4037 
   4038 		}
   4039 	} else {
   4040 		ip6_t	*ip6h = (ip6_t *)iphdr;
   4041 
   4042 		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
   4043 		ASSERT(connp->conn_family == AF_INET6);
   4044 		sin6 = sin6_null;
   4045 		sin6.sin6_addr = connp->conn_faddr_v6;
   4046 		sin6.sin6_port = connp->conn_fport;
   4047 		sin6.sin6_family = AF_INET6;
   4048 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
   4049 		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
   4050 		    (int)sizeof (sin6_t), optp, optlen);
   4051 	}
   4052 
   4053 	if (!mp)
   4054 		return (B_FALSE);
   4055 
   4056 	mblk_copycred(mp, idmp);
   4057 
   4058 	if (defermp == NULL) {
   4059 		conn_t *connp = tcp->tcp_connp;
   4060 		if (IPCL_IS_NONSTR(connp)) {
   4061 			(*connp->conn_upcalls->su_connected)
   4062 			    (connp->conn_upper_handle, tcp->tcp_connid,
   4063 			    ira->ira_cred, ira->ira_cpid);
   4064 			freemsg(mp);
   4065 		} else {
   4066 			if (ira->ira_cred != NULL) {
   4067 				/* So that getpeerucred works for TPI sockfs */
   4068 				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
   4069 			}
   4070 			putnext(connp->conn_rq, mp);
   4071 		}
   4072 	} else {
   4073 		*defermp = mp;
   4074 	}
   4075 
   4076 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
   4077 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
   4078 	return (B_TRUE);
   4079 }
   4080 
   4081 /*
   4082  * Defense for the SYN attack -
   4083  * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
   4084  *    one from the list of droppable eagers. This list is a subset of q0.
   4085  *    see comments before the definition of MAKE_DROPPABLE().
   4086  * 2. Don't drop a SYN request before its first timeout. This gives every
   4087  *    request at least til the first timeout to complete its 3-way handshake.
   4088  * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
   4089  *    requests currently on the queue that has timed out. This will be used
   4090  *    as an indicator of whether an attack is under way, so that appropriate
   4091  *    actions can be taken. (It's incremented in tcp_timer() and decremented
   4092  *    either when eager goes into ESTABLISHED, or gets freed up.)
   4093  * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
   4094  *    # of timeout drops back to <= q0len/32 => SYN alert off
   4095  */
   4096 static boolean_t
   4097 tcp_drop_q0(tcp_t *tcp)
   4098 {
   4099 	tcp_t	*eager;
   4100 	mblk_t	*mp;
   4101 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4102 
   4103 	ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
   4104 	ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
   4105 
   4106 	/* Pick oldest eager from the list of droppable eagers */
   4107 	eager = tcp->tcp_eager_prev_drop_q0;
   4108 
   4109 	/* If list is empty. return B_FALSE */
   4110 	if (eager == tcp) {
   4111 		return (B_FALSE);
   4112 	}
   4113 
   4114 	/* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
   4115 	if ((mp = allocb(0, BPRI_HI)) == NULL)
   4116 		return (B_FALSE);
   4117 
   4118 	/*
   4119 	 * Take this eager out from the list of droppable eagers since we are
   4120 	 * going to drop it.
   4121 	 */
   4122 	MAKE_UNDROPPABLE(eager);
   4123 
   4124 	if (tcp->tcp_connp->conn_debug) {
   4125 		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   4126 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
   4127 		    " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
   4128 		    tcp->tcp_conn_req_cnt_q0,
   4129 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
   4130 	}
   4131 
   4132 	BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
   4133 
   4134 	/* Put a reference on the conn as we are enqueueing it in the sqeue */
   4135 	CONN_INC_REF(eager->tcp_connp);
   4136 
   4137 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   4138 	    tcp_clean_death_wrapper, eager->tcp_connp, NULL,
   4139 	    SQ_FILL, SQTAG_TCP_DROP_Q0);
   4140 
   4141 	return (B_TRUE);
   4142 }
   4143 
   4144 /*
   4145  * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
   4146  */
   4147 static mblk_t *
   4148 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4149     ip_recv_attr_t *ira)
   4150 {
   4151 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4152 	tcp_t		*tcp = connp->conn_tcp;
   4153 	mblk_t		*tpi_mp;
   4154 	ipha_t		*ipha;
   4155 	ip6_t		*ip6h;
   4156 	sin6_t 		sin6;
   4157 	uint_t		ifindex = ira->ira_ruifindex;
   4158 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4159 
   4160 	if (ira->ira_flags & IRAF_IS_IPV4) {
   4161 		ipha = (ipha_t *)mp->b_rptr;
   4162 
   4163 		connp->conn_ipversion = IPV4_VERSION;
   4164 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4165 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4166 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4167 
   4168 		sin6 = sin6_null;
   4169 		sin6.sin6_addr = connp->conn_faddr_v6;
   4170 		sin6.sin6_port = connp->conn_fport;
   4171 		sin6.sin6_family = AF_INET6;
   4172 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4173 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4174 
   4175 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4176 			sin6_t	sin6d;
   4177 
   4178 			sin6d = sin6_null;
   4179 			sin6d.sin6_addr = connp->conn_laddr_v6;
   4180 			sin6d.sin6_port = connp->conn_lport;
   4181 			sin6d.sin6_family = AF_INET;
   4182 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4183 			    (char *)&sin6d, sizeof (sin6_t),
   4184 			    (char *)&tcp,
   4185 			    (t_scalar_t)sizeof (intptr_t),
   4186 			    (char *)&sin6d, sizeof (sin6_t),
   4187 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4188 		} else {
   4189 			tpi_mp = mi_tpi_conn_ind(NULL,
   4190 			    (char *)&sin6, sizeof (sin6_t),
   4191 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4192 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4193 		}
   4194 	} else {
   4195 		ip6h = (ip6_t *)mp->b_rptr;
   4196 
   4197 		connp->conn_ipversion = IPV6_VERSION;
   4198 		connp->conn_laddr_v6 = ip6h->ip6_dst;
   4199 		connp->conn_faddr_v6 = ip6h->ip6_src;
   4200 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4201 
   4202 		sin6 = sin6_null;
   4203 		sin6.sin6_addr = connp->conn_faddr_v6;
   4204 		sin6.sin6_port = connp->conn_fport;
   4205 		sin6.sin6_family = AF_INET6;
   4206 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
   4207 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4208 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4209 
   4210 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
   4211 			/* Pass up the scope_id of remote addr */
   4212 			sin6.sin6_scope_id = ifindex;
   4213 		} else {
   4214 			sin6.sin6_scope_id = 0;
   4215 		}
   4216 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4217 			sin6_t	sin6d;
   4218 
   4219 			sin6d = sin6_null;
   4220 			sin6.sin6_addr = connp->conn_laddr_v6;
   4221 			sin6d.sin6_port = connp->conn_lport;
   4222 			sin6d.sin6_family = AF_INET6;
   4223 			if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
   4224 				sin6d.sin6_scope_id = ifindex;
   4225 
   4226 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4227 			    (char *)&sin6d, sizeof (sin6_t),
   4228 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4229 			    (char *)&sin6d, sizeof (sin6_t),
   4230 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4231 		} else {
   4232 			tpi_mp = mi_tpi_conn_ind(NULL,
   4233 			    (char *)&sin6, sizeof (sin6_t),
   4234 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4235 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4236 		}
   4237 	}
   4238 
   4239 	tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   4240 	return (tpi_mp);
   4241 }
   4242 
   4243 /* Handle a SYN on an AF_INET socket */
   4244 mblk_t *
   4245 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4246     ip_recv_attr_t *ira)
   4247 {
   4248 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4249 	tcp_t		*tcp = connp->conn_tcp;
   4250 	sin_t		sin;
   4251 	mblk_t		*tpi_mp = NULL;
   4252 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4253 	ipha_t		*ipha;
   4254 
   4255 	ASSERT(ira->ira_flags & IRAF_IS_IPV4);
   4256 	ipha = (ipha_t *)mp->b_rptr;
   4257 
   4258 	connp->conn_ipversion = IPV4_VERSION;
   4259 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4260 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4261 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4262 
   4263 	sin = sin_null;
   4264 	sin.sin_addr.s_addr = connp->conn_faddr_v4;
   4265 	sin.sin_port = connp->conn_fport;
   4266 	sin.sin_family = AF_INET;
   4267 	if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
   4268 		sin_t	sind;
   4269 
   4270 		sind = sin_null;
   4271 		sind.sin_addr.s_addr = connp->conn_laddr_v4;
   4272 		sind.sin_port = connp->conn_lport;
   4273 		sind.sin_family = AF_INET;
   4274 		tpi_mp = mi_tpi_extconn_ind(NULL,
   4275 		    (char *)&sind, sizeof (sin_t), (char *)&tcp,
   4276 		    (t_scalar_t)sizeof (intptr_t), (char *)&sind,
   4277 		    sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4278 	} else {
   4279 		tpi_mp = mi_tpi_conn_ind(NULL,
   4280 		    (char *)&sin, sizeof (sin_t),
   4281 		    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4282 		    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4283 	}
   4284 
   4285 	tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   4286 	return (tpi_mp);
   4287 }
   4288 
   4289 /*
   4290  * tcp_get_conn/tcp_free_conn
   4291  *
   4292  * tcp_get_conn is used to get a clean tcp connection structure.
   4293  * It tries to reuse the connections put on the freelist by the
   4294  * time_wait_collector failing which it goes to kmem_cache. This
   4295  * way has two benefits compared to just allocating from and
   4296  * freeing to kmem_cache.
   4297  * 1) The time_wait_collector can free (which includes the cleanup)
   4298  * outside the squeue. So when the interrupt comes, we have a clean
   4299  * connection sitting in the freelist. Obviously, this buys us
   4300  * performance.
   4301  *
   4302  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
   4303  * has multiple disadvantages - tying up the squeue during alloc.
   4304  * But allocating the conn/tcp in IP land is also not the best since
   4305  * we can't check the 'q' and 'q0' which are protected by squeue and
   4306  * blindly allocate memory which might have to be freed here if we are
   4307  * not allowed to accept the connection. By using the freelist and
   4308  * putting the conn/tcp back in freelist, we don't pay a penalty for
   4309  * allocating memory without checking 'q/q0' and freeing it if we can't
   4310  * accept the connection.
   4311  *
   4312  * Care should be taken to put the conn back in the same squeue's freelist
   4313  * from which it was allocated. Best results are obtained if conn is
   4314  * allocated from listener's squeue and freed to the same. Time wait
   4315  * collector will free up the freelist is the connection ends up sitting
   4316  * there for too long.
   4317  */
   4318 void *
   4319 tcp_get_conn(void *arg, tcp_stack_t *tcps)
   4320 {
   4321 	tcp_t			*tcp = NULL;
   4322 	conn_t			*connp = NULL;
   4323 	squeue_t		*sqp = (squeue_t *)arg;
   4324 	tcp_squeue_priv_t 	*tcp_time_wait;
   4325 	netstack_t		*ns;
   4326 	mblk_t			*tcp_rsrv_mp = NULL;
   4327 
   4328 	tcp_time_wait =
   4329 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   4330 
   4331 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   4332 	tcp = tcp_time_wait->tcp_free_list;
   4333 	ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
   4334 	if (tcp != NULL) {
   4335 		tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   4336 		tcp_time_wait->tcp_free_list_cnt--;
   4337 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4338 		tcp->tcp_time_wait_next = NULL;
   4339 		connp = tcp->tcp_connp;
   4340 		connp->conn_flags |= IPCL_REUSED;
   4341 
   4342 		ASSERT(tcp->tcp_tcps == NULL);
   4343 		ASSERT(connp->conn_netstack == NULL);
   4344 		ASSERT(tcp->tcp_rsrv_mp != NULL);
   4345 		ns = tcps->tcps_netstack;
   4346 		netstack_hold(ns);
   4347 		connp->conn_netstack = ns;
   4348 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
   4349 		tcp->tcp_tcps = tcps;
   4350 		ipcl_globalhash_insert(connp);
   4351 
   4352 		connp->conn_ixa->ixa_notify_cookie = tcp;
   4353 		ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
   4354 		connp->conn_recv = tcp_input_data;
   4355 		ASSERT(connp->conn_recvicmp == tcp_icmp_input);
   4356 		ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
   4357 		return ((void *)connp);
   4358 	}
   4359 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4360 	/*
   4361 	 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
   4362 	 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
   4363 	 */
   4364 	tcp_rsrv_mp = allocb(0, BPRI_HI);
   4365 	if (tcp_rsrv_mp == NULL)
   4366 		return (NULL);
   4367 
   4368 	if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
   4369 	    tcps->tcps_netstack)) == NULL) {
   4370 		freeb(tcp_rsrv_mp);
   4371 		return (NULL);
   4372 	}
   4373 
   4374 	tcp = connp->conn_tcp;
   4375 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   4376 	mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
   4377 
   4378 	tcp->tcp_tcps = tcps;
   4379 
   4380 	connp->conn_recv = tcp_input_data;
   4381 	connp->conn_recvicmp = tcp_icmp_input;
   4382 	connp->conn_verifyicmp = tcp_verifyicmp;
   4383 
   4384 	/*
   4385 	 * Register tcp_notify to listen to capability changes detected by IP.
   4386 	 * This upcall is made in the context of the call to conn_ip_output
   4387 	 * thus it is inside the squeue.
   4388 	 */
   4389 	connp->conn_ixa->ixa_notify = tcp_notify;
   4390 	connp->conn_ixa->ixa_notify_cookie = tcp;
   4391 
   4392 	return ((void *)connp);
   4393 }
   4394 
   4395 /* BEGIN CSTYLED */
   4396 /*
   4397  *
   4398  * The sockfs ACCEPT path:
   4399  * =======================
   4400  *
   4401  * The eager is now established in its own perimeter as soon as SYN is
   4402  * received in tcp_input_listener(). When sockfs receives conn_ind, it
   4403  * completes the accept processing on the acceptor STREAM. The sending
   4404  * of conn_ind part is common for both sockfs listener and a TLI/XTI
   4405  * listener but a TLI/XTI listener completes the accept processing
   4406  * on the listener perimeter.
   4407  *
   4408  * Common control flow for 3 way handshake:
   4409  * ----------------------------------------
   4410  *
   4411  * incoming SYN (listener perimeter)	-> tcp_input_listener()
   4412  *
   4413  * incoming SYN-ACK-ACK (eager perim) 	-> tcp_input_data()
   4414  * send T_CONN_IND (listener perim)	-> tcp_send_conn_ind()
   4415  *
   4416  * Sockfs ACCEPT Path:
   4417  * -------------------
   4418  *
   4419  * open acceptor stream (tcp_open allocates tcp_tli_accept()
   4420  * as STREAM entry point)
   4421  *
   4422  * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
   4423  *
 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind eager's squeue but sockfs is protecting us
 * and no one knows about this stream yet). The STREAMS entry point q->q_info
 * is changed to point at tcp_wput().
   4428  *
   4429  * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
   4430  * listener (done on listener's perimeter).
   4431  *
   4432  * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish
   4433  * accept.
   4434  *
   4435  * TLI/XTI client ACCEPT path:
   4436  * ---------------------------
   4437  *
   4438  * soaccept() sends T_CONN_RES on the listener STREAM.
   4439  *
 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
 * an M_SETOPTS mblk to eager perimeter to finish accept (tcp_accept_finish()).
   4442  *
   4443  * Locks:
   4444  * ======
   4445  *
 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0
 * and listeners->tcp_eager_next_q.
   4448  *
   4449  * Referencing:
   4450  * ============
   4451  *
   4452  * 1) We start out in tcp_input_listener by eager placing a ref on
   4453  * listener and listener adding eager to listeners->tcp_eager_next_q0.
   4454  *
   4455  * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
   4456  * doing so we place a ref on the eager. This ref is finally dropped at the
   4457  * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
   4458  * reference is dropped by the squeue framework.
   4459  *
   4460  * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
   4461  *
   4462  * The reference must be released by the same entity that added the reference
   4463  * In the above scheme, the eager is the entity that adds and releases the
   4464  * references. Note that tcp_accept_finish executes in the squeue of the eager
   4465  * (albeit after it is attached to the acceptor stream). Though 1. executes
   4466  * in the listener's squeue, the eager is nascent at this point and the
   4467  * reference can be considered to have been added on behalf of the eager.
   4468  *
   4469  * Eager getting a Reset or listener closing:
   4470  * ==========================================
   4471  *
   4472  * Once the listener and eager are linked, the listener never does the unlink.
   4473  * If the listener needs to close, tcp_eager_cleanup() is called which queues
   4474  * a message on all eager perimeter. The eager then does the unlink, clears
   4475  * any pointers to the listener's queue and drops the reference to the
   4476  * listener. The listener waits in tcp_close outside the squeue until its
   4477  * refcount has dropped to 1. This ensures that the listener has waited for
   4478  * all eagers to clear their association with the listener.
   4479  *
   4480  * Similarly, if eager decides to go away, it can unlink itself and close.
   4481  * When the T_CONN_RES comes down, we check if eager has closed. Note that
   4482  * the reference to eager is still valid because of the extra ref we put
   4483  * in tcp_send_conn_ind.
   4484  *
   4485  * Listener can always locate the eager under the protection
   4486  * of the listener->tcp_eager_lock, and then do a refhold
   4487  * on the eager during the accept processing.
   4488  *
   4489  * The acceptor stream accesses the eager in the accept processing
   4490  * based on the ref placed on eager before sending T_conn_ind.
   4491  * The only entity that can negate this refhold is a listener close
   4492  * which is mutually exclusive with an active acceptor stream.
   4493  *
   4494  * Eager's reference on the listener
   4495  * ===================================
   4496  *
   4497  * If the accept happens (even on a closed eager) the eager drops its
   4498  * reference on the listener at the start of tcp_accept_finish. If the
   4499  * eager is killed due to an incoming RST before the T_conn_ind is sent up,
   4500  * the reference is dropped in tcp_closei_local. If the listener closes,
   4501  * the reference is dropped in tcp_eager_kill. In all cases the reference
   4502  * is dropped while executing in the eager's context (squeue).
   4503  */
   4504 /* END CSTYLED */
   4505 
   4506 /* Process the SYN packet, mp, directed at the listener 'tcp' */
   4507 
   4508 /*
   4509  * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
   4510  * tcp_input_data will not see any packets for listeners since the listener
   4511  * has conn_recv set to tcp_input_listener.
   4512  */
   4513 /* ARGSUSED */
   4514 void
   4515 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4516 {
   4517 	tcpha_t		*tcpha;
   4518 	uint32_t	seg_seq;
   4519 	tcp_t		*eager;
   4520 	int		err;
   4521 	conn_t		*econnp = NULL;
   4522 	squeue_t	*new_sqp;
   4523 	mblk_t		*mp1;
   4524 	uint_t 		ip_hdr_len;
   4525 	conn_t		*lconnp = (conn_t *)arg;
   4526 	tcp_t		*listener = lconnp->conn_tcp;
   4527 	tcp_stack_t	*tcps = listener->tcp_tcps;
   4528 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
   4529 	uint_t		flags;
   4530 	mblk_t		*tpi_mp;
   4531 	uint_t		ifindex = ira->ira_ruifindex;
   4532 	boolean_t	tlc_set = B_FALSE;
   4533 
   4534 	ip_hdr_len = ira->ira_ip_hdr_length;
   4535 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
   4536 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
   4537 
   4538 	if (!(flags & TH_SYN)) {
   4539 		if ((flags & TH_RST) || (flags & TH_URG)) {
   4540 			freemsg(mp);
   4541 			return;
   4542 		}
   4543 		if (flags & TH_ACK) {
   4544 			/* Note this executes in listener's squeue */
   4545 			tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
   4546 			return;
   4547 		}
   4548 
   4549 		freemsg(mp);
   4550 		return;
   4551 	}
   4552 
   4553 	if (listener->tcp_state != TCPS_LISTEN)
   4554 		goto error2;
   4555 
   4556 	ASSERT(IPCL_IS_BOUND(lconnp));
   4557 
   4558 	mutex_enter(&listener->tcp_eager_lock);
   4559 
   4560 	/*
   4561 	 * The system is under memory pressure, so we need to do our part
   4562 	 * to relieve the pressure.  So we only accept new request if there
   4563 	 * is nothing waiting to be accepted or waiting to complete the 3-way
   4564 	 * handshake.  This means that busy listener will not get too many
   4565 	 * new requests which they cannot handle in time while non-busy
   4566 	 * listener is still functioning properly.
   4567 	 */
   4568 	if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 ||
   4569 	    listener->tcp_conn_req_cnt_q0 > 0)) {
   4570 		mutex_exit(&listener->tcp_eager_lock);
   4571 		TCP_STAT(tcps, tcp_listen_mem_drop);
   4572 		goto error2;
   4573 	}
   4574 
   4575 	if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
   4576 		mutex_exit(&listener->tcp_eager_lock);
   4577 		TCP_STAT(tcps, tcp_listendrop);
   4578 		BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
   4579 		if (lconnp->conn_debug) {
   4580 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   4581 			    "tcp_input_listener: listen backlog (max=%d) "
   4582 			    "overflow (%d pending) on %s",
   4583 			    listener->tcp_conn_req_max,
   4584 			    listener->tcp_conn_req_cnt_q,
   4585 			    tcp_display(listener, NULL, DISP_PORT_ONLY));
   4586 		}
   4587 		goto error2;
   4588 	}
   4589 
   4590 	if (listener->tcp_conn_req_cnt_q0 >=
   4591 	    listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
   4592 		/*
   4593 		 * Q0 is full. Drop a pending half-open req from the queue
   4594 		 * to make room for the new SYN req. Also mark the time we
   4595 		 * drop a SYN.
   4596 		 *
   4597 		 * A more aggressive defense against SYN attack will
   4598 		 * be to set the "tcp_syn_defense" flag now.
   4599 		 */
   4600 		TCP_STAT(tcps, tcp_listendropq0);
   4601 		listener->tcp_last_rcv_lbolt = ddi_get_lbolt64();
   4602 		if (!tcp_drop_q0(listener)) {
   4603 			mutex_exit(&listener->tcp_eager_lock);
   4604 			BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
   4605 			if (lconnp->conn_debug) {
   4606 				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   4607 				    "tcp_input_listener: listen half-open "
   4608 				    "queue (max=%d) full (%d pending) on %s",
   4609 				    tcps->tcps_conn_req_max_q0,
   4610 				    listener->tcp_conn_req_cnt_q0,
   4611 				    tcp_display(listener, NULL,
   4612 				    DISP_PORT_ONLY));
   4613 			}
   4614 			goto error2;
   4615 		}
   4616 	}
   4617 
   4618 	/*
   4619 	 * Enforce the limit set on the number of connections per listener.
   4620 	 * Note that tlc_cnt starts with 1.  So need to add 1 to tlc_max
   4621 	 * for comparison.
   4622 	 */
   4623 	if (listener->tcp_listen_cnt != NULL) {
   4624 		tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt;
   4625 		int64_t now;
   4626 
   4627 		if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) {
   4628 			mutex_exit(&listener->tcp_eager_lock);
   4629 			now = ddi_get_lbolt64();
   4630 			atomic_add_32(&tlc->tlc_cnt, -1);
   4631 			TCP_STAT(tcps, tcp_listen_cnt_drop);
   4632 			tlc->tlc_drop++;
   4633 			if (now - tlc->tlc_report_time >
   4634 			    MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) {
   4635 				zcmn_err(lconnp->conn_zoneid, CE_WARN,
   4636 				    "Listener (port %d) connection max (%u) "
   4637 				    "reached: %u attempts dropped total\n",
   4638 				    ntohs(listener->tcp_connp->conn_lport),
   4639 				    tlc->tlc_max, tlc->tlc_drop);
   4640 				tlc->tlc_report_time = now;
   4641 			}
   4642 			goto error2;
   4643 		}
   4644 		tlc_set = B_TRUE;
   4645 	}
   4646 
   4647 	mutex_exit(&listener->tcp_eager_lock);
   4648 
   4649 	/*
   4650 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   4651 	 * or based on the ring (for packets from GLD). Otherwise it is
   4652 	 * set based on lbolt i.e., a somewhat random number.
   4653 	 */
   4654 	ASSERT(ira->ira_sqp != NULL);
   4655 	new_sqp = ira->ira_sqp;
   4656 
   4657 	econnp = (conn_t *)tcp_get_conn(arg2, tcps);
   4658 	if (econnp == NULL)
   4659 		goto error2;
   4660 
   4661 	ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
   4662 	econnp->conn_sqp = new_sqp;
   4663 	econnp->conn_initial_sqp = new_sqp;
   4664 	econnp->conn_ixa->ixa_sqp = new_sqp;
   4665 
   4666 	econnp->conn_fport = tcpha->tha_lport;
   4667 	econnp->conn_lport = tcpha->tha_fport;
   4668 
   4669 	err = conn_inherit_parent(lconnp, econnp);
   4670 	if (err != 0)
   4671 		goto error3;
   4672 
   4673 	/* We already know the laddr of the new connection is ours */
   4674 	econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
   4675 
   4676 	ASSERT(OK_32PTR(mp->b_rptr));
   4677 	ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
   4678 	    IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
   4679 
   4680 	if (lconnp->conn_family == AF_INET) {
   4681 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
   4682 		tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
   4683 	} else {
   4684 		tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
   4685 	}
   4686 
   4687 	if (tpi_mp == NULL)
   4688 		goto error3;
   4689 
   4690 	eager = econnp->conn_tcp;
   4691 	eager->tcp_detached = B_TRUE;
   4692 	SOCK_CONNID_INIT(eager->tcp_connid);
   4693 
   4694 	tcp_init_values(eager);
   4695 
   4696 	ASSERT((econnp->conn_ixa->ixa_flags &
   4697 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4698 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
   4699 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4700 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
   4701 
   4702 	if (!tcps->tcps_dev_flow_ctl)
   4703 		econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
   4704 
   4705 	/* Prepare for diffing against previous packets */
   4706 	eager->tcp_recvifindex = 0;
   4707 	eager->tcp_recvhops = 0xffffffffU;
   4708 
   4709 	if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
   4710 		if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
   4711 		    IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
   4712 			econnp->conn_incoming_ifindex = ifindex;
   4713 			econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   4714 			econnp->conn_ixa->ixa_scopeid = ifindex;
   4715 		}
   4716 	}
   4717 
   4718 	if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
   4719 	    (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
   4720 	    tcps->tcps_rev_src_routes) {
   4721 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
   4722 		ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
   4723 
   4724 		/* Source routing option copyover (reverse it) */
   4725 		err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
   4726 		if (err != 0) {
   4727 			freemsg(tpi_mp);
   4728 			goto error3;
   4729 		}
   4730 		ip_pkt_source_route_reverse_v4(ipp);
   4731 	}
   4732 
   4733 	ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
   4734 	ASSERT(!eager->tcp_tconnind_started);
   4735 	/*
   4736 	 * If the SYN came with a credential, it's a loopback packet or a
   4737 	 * labeled packet; attach the credential to the TPI message.
   4738 	 */
   4739 	if (ira->ira_cred != NULL)
   4740 		mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
   4741 
   4742 	eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
   4743 
   4744 	/* Inherit the listener's SSL protection state */
   4745 	if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) {
   4746 		kssl_hold_ent(eager->tcp_kssl_ent);
   4747 		eager->tcp_kssl_pending = B_TRUE;
   4748 	}
   4749 
   4750 	/* Inherit the listener's non-STREAMS flag */
   4751 	if (IPCL_IS_NONSTR(lconnp)) {
   4752 		econnp->conn_flags |= IPCL_NONSTR;
   4753 	}
   4754 
   4755 	ASSERT(eager->tcp_ordrel_mp == NULL);
   4756 
   4757 	if (!IPCL_IS_NONSTR(econnp)) {
   4758 		/*
   4759 		 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
   4760 		 * at close time, we will always have that to send up.
   4761 		 * Otherwise, we need to do special handling in case the
   4762 		 * allocation fails at that time.
   4763 		 */
   4764 		if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
   4765 			goto error3;
   4766 	}
   4767 	/*
   4768 	 * Now that the IP addresses and ports are setup in econnp we
   4769 	 * can do the IPsec policy work.
   4770 	 */
   4771 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   4772 		if (lconnp->conn_policy != NULL) {
   4773 			/*
   4774 			 * Inherit the policy from the listener; use
   4775 			 * actions from ira
   4776 			 */
   4777 			if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
   4778 				CONN_DEC_REF(econnp);
   4779 				freemsg(mp);
   4780 				goto error3;
   4781 			}
   4782 		}
   4783 	}
   4784 
   4785 	/* Inherit various TCP parameters from the listener */
   4786 	eager->tcp_naglim = listener->tcp_naglim;
   4787 	eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold;
   4788 	eager->tcp_second_timer_threshold =
   4789 	    listener->tcp_second_timer_threshold;
   4790 	eager->tcp_first_ctimer_threshold =
   4791 	    listener->tcp_first_ctimer_threshold;
   4792 	eager->tcp_second_ctimer_threshold =
   4793 	    listener->tcp_second_ctimer_threshold;
   4794 
   4795 	/*
   4796 	 * tcp_set_destination() may set tcp_rwnd according to the route
   4797 	 * metrics. If it does not, the eager's receive window will be set
   4798 	 * to the listener's receive window later in this function.
   4799 	 */
   4800 	eager->tcp_rwnd = 0;
   4801 
   4802 	/*
   4803 	 * Inherit listener's tcp_init_cwnd.  Need to do this before
   4804 	 * calling tcp_process_options() which set the initial cwnd.
   4805 	 */
   4806 	eager->tcp_init_cwnd = listener->tcp_init_cwnd;
   4807 
   4808 	if (is_system_labeled()) {
   4809 		ip_xmit_attr_t *ixa = econnp->conn_ixa;
   4810 
   4811 		ASSERT(ira->ira_tsl != NULL);
   4812 		/* Discard any old label */
   4813 		if (ixa->ixa_free_flags & IXA_FREE_TSL) {
   4814 			ASSERT(ixa->ixa_tsl != NULL);
   4815 			label_rele(ixa->ixa_tsl);
   4816 			ixa->ixa_free_flags &= ~IXA_FREE_TSL;
   4817 			ixa->ixa_tsl = NULL;
   4818 		}
   4819 		if ((lconnp->conn_mlp_type != mlptSingle ||
   4820 		    lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
   4821 		    ira->ira_tsl != NULL) {
   4822 			/*
   4823 			 * If this is an MLP connection or a MAC-Exempt
   4824 			 * connection with an unlabeled node, packets are to be
   4825 			 * exchanged using the security label of the received
   4826 			 * SYN packet instead of the server application's label.
   4827 			 * tsol_check_dest called from ip_set_destination
   4828 			 * might later update TSF_UNLABELED by replacing
   4829 			 * ixa_tsl with a new label.
   4830 			 */
   4831 			label_hold(ira->ira_tsl);
   4832 			ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
   4833 			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
   4834 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4835 		} else {
   4836 			ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
   4837 			DTRACE_PROBE2(syn_accept, conn_t *,
   4838 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4839 		}
   4840 		/*
   4841 		 * conn_connect() called from tcp_set_destination will verify
   4842 		 * the destination is allowed to receive packets at the
   4843 		 * security label of the SYN-ACK we are generating. As part of
   4844 		 * that, tsol_check_dest() may create a new effective label for
   4845 		 * this connection.
   4846 		 * Finally conn_connect() will call conn_update_label.
   4847 		 * All that remains for TCP to do is to call
   4848 		 * conn_build_hdr_template which is done as part of
   4849 		 * tcp_set_destination.
   4850 		 */
   4851 	}
   4852 
   4853 	/*
   4854 	 * Since we will clear tcp_listener before we clear tcp_detached
   4855 	 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
   4856 	 * so we can tell a TCP_DETACHED_NONEAGER apart.
   4857 	 */
   4858 	eager->tcp_hard_binding = B_TRUE;
   4859 
   4860 	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
   4861 	    TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
   4862 
   4863 	CL_INET_CONNECT(econnp, B_FALSE, err);
   4864 	if (err != 0) {
   4865 		tcp_bind_hash_remove(eager);
   4866 		goto error3;
   4867 	}
   4868 
   4869 	/*
   4870 	 * No need to check for multicast destination since ip will only pass
   4871 	 * up multicasts to those that have expressed interest
   4872 	 * TODO: what about rejecting broadcasts?
   4873 	 * Also check that source is not a multicast or broadcast address.
   4874 	 */
   4875 	eager->tcp_state = TCPS_SYN_RCVD;
   4876 	SOCK_CONNID_BUMP(eager->tcp_connid);
   4877 
   4878 	/*
   4879 	 * Adapt our mss, ttl, ... based on the remote address.
   4880 	 */
   4881 
   4882 	if (tcp_set_destination(eager) != 0) {
   4883 		BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   4884 		/* Undo the bind_hash_insert */
   4885 		tcp_bind_hash_remove(eager);
   4886 		goto error3;
   4887 	}
   4888 
   4889 	/* Process all TCP options. */
   4890 	tcp_process_options(eager, tcpha);
   4891 
   4892 	/* Is the other end ECN capable? */
   4893 	if (tcps->tcps_ecn_permitted >= 1 &&
   4894 	    (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
   4895 		eager->tcp_ecn_ok = B_TRUE;
   4896 	}
   4897 
   4898 	/*
   4899 	 * The listener's conn_rcvbuf should be the default window size or a
   4900 	 * window size changed via SO_RCVBUF option. First round up the
   4901 	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
   4902 	 * scale option value if needed. Call tcp_rwnd_set() to finish the
   4903 	 * setting.
   4904 	 *
   4905 	 * Note if there is a rpipe metric associated with the remote host,
   4906 	 * we should not inherit receive window size from listener.
   4907 	 */
   4908 	eager->tcp_rwnd = MSS_ROUNDUP(
   4909 	    (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
   4910 	    eager->tcp_rwnd), eager->tcp_mss);
   4911 	if (eager->tcp_snd_ws_ok)
   4912 		tcp_set_ws_value(eager);
   4913 	/*
   4914 	 * Note that this is the only place tcp_rwnd_set() is called for
   4915 	 * accepting a connection.  We need to call it here instead of
   4916 	 * after the 3-way handshake because we need to tell the other
   4917 	 * side our rwnd in the SYN-ACK segment.
   4918 	 */
   4919 	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
   4920 
   4921 	ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
   4922 	    eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
   4923 
   4924 	ASSERT(econnp->conn_rcvbuf != 0 &&
   4925 	    econnp->conn_rcvbuf == eager->tcp_rwnd);
   4926 
   4927 	/* Put a ref on the listener for the eager. */
   4928 	CONN_INC_REF(lconnp);
   4929 	mutex_enter(&listener->tcp_eager_lock);
   4930 	listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
   4931 	eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
   4932 	listener->tcp_eager_next_q0 = eager;
   4933 	eager->tcp_eager_prev_q0 = listener;
   4934 
   4935 	/* Set tcp_listener before adding it to tcp_conn_fanout */
   4936 	eager->tcp_listener = listener;
   4937 	eager->tcp_saved_listener = listener;
   4938 
   4939 	/*
   4940 	 * Set tcp_listen_cnt so that when the connection is done, the counter
   4941 	 * is decremented.
   4942 	 */
   4943 	eager->tcp_listen_cnt = listener->tcp_listen_cnt;
   4944 
   4945 	/*
   4946 	 * Tag this detached tcp vector for later retrieval
   4947 	 * by our listener client in tcp_accept().
   4948 	 */
   4949 	eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
   4950 	listener->tcp_conn_req_cnt_q0++;
   4951 	if (++listener->tcp_conn_req_seqnum == -1) {
   4952 		/*
   4953 		 * -1 is "special" and defined in TPI as something
   4954 		 * that should never be used in T_CONN_IND
   4955 		 */
   4956 		++listener->tcp_conn_req_seqnum;
   4957 	}
   4958 	mutex_exit(&listener->tcp_eager_lock);
   4959 
   4960 	if (listener->tcp_syn_defense) {
   4961 		/* Don't drop the SYN that comes from a good IP source */
   4962 		ipaddr_t *addr_cache;
   4963 
   4964 		addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
   4965 		if (addr_cache != NULL && econnp->conn_faddr_v4 ==
   4966 		    addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
   4967 			eager->tcp_dontdrop = B_TRUE;
   4968 		}
   4969 	}
   4970 
   4971 	/*
   4972 	 * We need to insert the eager in its own perimeter but as soon
   4973 	 * as we do that, we expose the eager to the classifier and
   4974 	 * should not touch any field outside the eager's perimeter.
   4975 	 * So do all the work necessary before inserting the eager
   4976 	 * in its own perimeter. Be optimistic that conn_connect()
   4977 	 * will succeed but undo everything if it fails.
   4978 	 */
   4979 	seg_seq = ntohl(tcpha->tha_seq);
   4980 	eager->tcp_irs = seg_seq;
   4981 	eager->tcp_rack = seg_seq;
   4982 	eager->tcp_rnxt = seg_seq + 1;
   4983 	eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
   4984 	BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
   4985 	eager->tcp_state = TCPS_SYN_RCVD;
   4986 	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
   4987 	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
   4988 	if (mp1 == NULL) {
   4989 		/*
   4990 		 * Increment the ref count as we are going to
   4991 		 * enqueueing an mp in squeue
   4992 		 */
   4993 		CONN_INC_REF(econnp);
   4994 		goto error;
   4995 	}
   4996 
   4997 	/*
   4998 	 * We need to start the rto timer. In normal case, we start
   4999 	 * the timer after sending the packet on the wire (or at
   5000 	 * least believing that packet was sent by waiting for
   5001 	 * conn_ip_output() to return). Since this is the first packet
   5002 	 * being sent on the wire for the eager, our initial tcp_rto
   5003 	 * is at least tcp_rexmit_interval_min which is a fairly
   5004 	 * large value to allow the algorithm to adjust slowly to large
   5005 	 * fluctuations of RTT during first few transmissions.
   5006 	 *
   5007 	 * Starting the timer first and then sending the packet in this
   5008 	 * case shouldn't make much difference since tcp_rexmit_interval_min
   5009 	 * is of the order of several 100ms and starting the timer
   5010 	 * first and then sending the packet will result in difference
   5011 	 * of few micro seconds.
   5012 	 *
   5013 	 * Without this optimization, we are forced to hold the fanout
   5014 	 * lock across the ipcl_bind_insert() and sending the packet
   5015 	 * so that we don't race against an incoming packet (maybe RST)
   5016 	 * for this eager.
   5017 	 *
   5018 	 * It is necessary to acquire an extra reference on the eager
   5019 	 * at this point and hold it until after tcp_send_data() to
   5020 	 * ensure against an eager close race.
   5021 	 */
   5022 
   5023 	CONN_INC_REF(econnp);
   5024 
   5025 	TCP_TIMER_RESTART(eager, eager->tcp_rto);
   5026 
   5027 	/*
   5028 	 * Insert the eager in its own perimeter now. We are ready to deal
   5029 	 * with any packets on eager.
   5030 	 */
   5031 	if (ipcl_conn_insert(econnp) != 0)
   5032 		goto error;
   5033 
   5034 	ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
   5035 	freemsg(mp);
   5036 	/*
   5037 	 * Send the SYN-ACK. Use the right squeue so that conn_ixa is
   5038 	 * only used by one thread at a time.
   5039 	 */
   5040 	if (econnp->conn_sqp == lconnp->conn_sqp) {
   5041 		(void) conn_ip_output(mp1, econnp->conn_ixa);
   5042 		CONN_DEC_REF(econnp);
   5043 	} else {
   5044 		SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack,
   5045 		    econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK);
   5046 	}
   5047 	return;
   5048 error:
   5049 	freemsg(mp1);
   5050 	eager->tcp_closemp_used = B_TRUE;
   5051 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5052 	mp1 = &eager->tcp_closemp;
   5053 	SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
   5054 	    econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
   5055 
   5056 	/*
   5057 	 * If a connection already exists, send the mp to that connections so
   5058 	 * that it can be appropriately dealt with.
   5059 	 */
   5060 	ipst = tcps->tcps_netstack->netstack_ip;
   5061 
   5062 	if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
   5063 		if (!IPCL_IS_CONNECTED(econnp)) {
   5064 			/*
   5065 			 * Something bad happened. ipcl_conn_insert()
   5066 			 * failed because a connection already existed
   5067 			 * in connected hash but we can't find it
   5068 			 * anymore (someone blew it away). Just
   5069 			 * free this message and hopefully remote
   5070 			 * will retransmit at which time the SYN can be
   5071 			 * treated as a new connection or dealth with
   5072 			 * a TH_RST if a connection already exists.
   5073 			 */
   5074 			CONN_DEC_REF(econnp);
   5075 			freemsg(mp);
   5076 		} else {
   5077 			SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
   5078 			    econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
   5079 		}
   5080 	} else {
   5081 		/* Nobody wants this packet */
   5082 		freemsg(mp);
   5083 	}
   5084 	return;
   5085 error3:
   5086 	CONN_DEC_REF(econnp);
   5087 error2:
   5088 	freemsg(mp);
   5089 	if (tlc_set)
   5090 		atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1);
   5091 }
   5092 
   5093 /* ARGSUSED2 */
   5094 void
   5095 tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   5096 {
   5097 	conn_t	*econnp = (conn_t *)arg;
   5098 	tcp_t	*tcp = econnp->conn_tcp;
   5099 
   5100 	/* Guard against a RST having blown it away while on the squeue */
   5101 	if (tcp->tcp_state == TCPS_CLOSED) {
   5102 		freemsg(mp);
   5103 		return;
   5104 	}
   5105 
   5106 	(void) conn_ip_output(mp, econnp->conn_ixa);
   5107 }
   5108 
   5109 /*
   5110  * In an ideal case of vertical partition in NUMA architecture, its
   5111  * beneficial to have the listener and all the incoming connections
   5112  * tied to the same squeue. The other constraint is that incoming
   5113  * connections should be tied to the squeue attached to interrupted
   5114  * CPU for obvious locality reason so this leaves the listener to
   5115  * be tied to the same squeue. Our only problem is that when listener
   5116  * is binding, the CPU that will get interrupted by the NIC whose
   5117  * IP address the listener is binding to is not even known. So
   5118  * the code below allows us to change that binding at the time the
   5119  * CPU is interrupted by virtue of incoming connection's squeue.
   5120  *
   5121  * This is usefull only in case of a listener bound to a specific IP
   5122  * address. For other kind of listeners, they get bound the
   5123  * very first time and there is no attempt to rebind them.
   5124  */
   5125 void
   5126 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
   5127     ip_recv_attr_t *ira)
   5128 {
   5129 	conn_t		*connp = (conn_t *)arg;
   5130 	squeue_t	*sqp = (squeue_t *)arg2;
   5131 	squeue_t	*new_sqp;
   5132 	uint32_t	conn_flags;
   5133 
   5134 	/*
   5135 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   5136 	 * or based on the ring (for packets from GLD). Otherwise it is
   5137 	 * set based on lbolt i.e., a somewhat random number.
   5138 	 */
   5139 	ASSERT(ira->ira_sqp != NULL);
   5140 	new_sqp = ira->ira_sqp;
   5141 
   5142 	if (connp->conn_fanout == NULL)
   5143 		goto done;
   5144 
   5145 	if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
   5146 		mutex_enter(&connp->conn_fanout->connf_lock);
   5147 		mutex_enter(&connp->conn_lock);
   5148 		/*
   5149 		 * No one from read or write side can access us now
   5150 		 * except for already queued packets on this squeue.
   5151 		 * But since we haven't changed the squeue yet, they
   5152 		 * can't execute. If they are processed after we have
   5153 		 * changed the squeue, they are sent back to the
   5154 		 * correct squeue down below.
   5155 		 * But a listner close can race with processing of
   5156 		 * incoming SYN. If incoming SYN processing changes
   5157 		 * the squeue then the listener close which is waiting
   5158 		 * to enter the squeue would operate on the wrong
   5159 		 * squeue. Hence we don't change the squeue here unless
   5160 		 * the refcount is exactly the minimum refcount. The
   5161 		 * minimum refcount of 4 is counted as - 1 each for
   5162 		 * TCP and IP, 1 for being in the classifier hash, and
   5163 		 * 1 for the mblk being processed.
   5164 		 */
   5165 
   5166 		if (connp->conn_ref != 4 ||
   5167 		    connp->conn_tcp->tcp_state != TCPS_LISTEN) {
   5168 			mutex_exit(&connp->conn_lock);
   5169 			mutex_exit(&connp->conn_fanout->connf_lock);
   5170 			goto done;
   5171 		}
   5172 		if (connp->conn_sqp != new_sqp) {
   5173 			while (connp->conn_sqp != new_sqp)
   5174 				(void) casptr(&connp->conn_sqp, sqp, new_sqp);
   5175 			/* No special MT issues for outbound ixa_sqp hint */
   5176 			connp->conn_ixa->ixa_sqp = new_sqp;
   5177 		}
   5178 
   5179 		do {
   5180 			conn_flags = connp->conn_flags;
   5181 			conn_flags |= IPCL_FULLY_BOUND;
   5182 			(void) cas32(&connp->conn_flags, connp->conn_flags,
   5183 			    conn_flags);
   5184 		} while (!(connp->conn_flags & IPCL_FULLY_BOUND));
   5185 
   5186 		mutex_exit(&connp->conn_fanout->connf_lock);
   5187 		mutex_exit(&connp->conn_lock);
   5188 
   5189 		/*
   5190 		 * Assume we have picked a good squeue for the listener. Make
   5191 		 * subsequent SYNs not try to change the squeue.
   5192 		 */
   5193 		connp->conn_recv = tcp_input_listener;
   5194 	}
   5195 
   5196 done:
   5197 	if (connp->conn_sqp != sqp) {
   5198 		CONN_INC_REF(connp);
   5199 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
   5200 		    ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
   5201 	} else {
   5202 		tcp_input_listener(connp, mp, sqp, ira);
   5203 	}
   5204 }
   5205 
   5206 /*
   5207  * Successful connect request processing begins when our client passes
   5208  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
   5209  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
   5210  *
   5211  * After various error checks are completed, tcp_tpi_connect() lays
   5212  * the target address and port into the composite header template.
   5213  * Then we ask IP for information, including a source address if we didn't
   5214  * already have one. Finally we prepare to send the SYN packet, and then
   5215  * send up the T_OK_ACK reply message.
   5216  */
   5217 static void
   5218 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
   5219 {
   5220 	sin_t		*sin;
   5221 	struct T_conn_req	*tcr;
   5222 	struct sockaddr	*sa;
   5223 	socklen_t	len;
   5224 	int		error;
   5225 	cred_t		*cr;
   5226 	pid_t		cpid;
   5227 	conn_t		*connp = tcp->tcp_connp;
   5228 	queue_t		*q = connp->conn_wq;
   5229 
   5230 	/*
   5231 	 * All Solaris components should pass a db_credp
   5232 	 * for this TPI message, hence we ASSERT.
   5233 	 * But in case there is some other M_PROTO that looks
   5234 	 * like a TPI message sent by some other kernel
   5235 	 * component, we check and return an error.
   5236 	 */
   5237 	cr = msg_getcred(mp, &cpid);
   5238 	ASSERT(cr != NULL);
   5239 	if (cr == NULL) {
   5240 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5241 		return;
   5242 	}
   5243 
   5244 	tcr = (struct T_conn_req *)mp->b_rptr;
   5245 
   5246 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5247 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   5248 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5249 		return;
   5250 	}
   5251 
   5252 	/*
   5253 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
   5254 	 * will always have that to send up.  Otherwise, we need to do
   5255 	 * special handling in case the allocation fails at that time.
   5256 	 * If the end point is TPI, the tcp_t can be reused and the
   5257 	 * tcp_ordrel_mp may be allocated already.
   5258 	 */
   5259 	if (tcp->tcp_ordrel_mp == NULL) {
   5260 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
   5261 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5262 			return;
   5263 		}
   5264 	}
   5265 
   5266 	/*
   5267 	 * Determine packet type based on type of address passed in
   5268 	 * the request should contain an IPv4 or IPv6 address.
   5269 	 * Make sure that address family matches the type of
   5270 	 * family of the address passed down.
   5271 	 */
   5272 	switch (tcr->DEST_length) {
   5273 	default:
   5274 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   5275 		return;
   5276 
   5277 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
   5278 		/*
   5279 		 * XXX: The check for valid DEST_length was not there
   5280 		 * in earlier releases and some buggy
   5281 		 * TLI apps (e.g Sybase) got away with not feeding
   5282 		 * in sin_zero part of address.
   5283 		 * We allow that bug to keep those buggy apps humming.
   5284 		 * Test suites require the check on DEST_length.
   5285 		 * We construct a new mblk with valid DEST_length
   5286 		 * free the original so the rest of the code does
   5287 		 * not have to keep track of this special shorter
   5288 		 * length address case.
   5289 		 */
   5290 		mblk_t *nmp;
   5291 		struct T_conn_req *ntcr;
   5292 		sin_t *nsin;
   5293 
   5294 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
   5295 		    tcr->OPT_length, BPRI_HI);
   5296 		if (nmp == NULL) {
   5297 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5298 			return;
   5299 		}
   5300 		ntcr = (struct T_conn_req *)nmp->b_rptr;
   5301 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
   5302 		ntcr->PRIM_type = T_CONN_REQ;
   5303 		ntcr->DEST_length = sizeof (sin_t);
   5304 		ntcr->DEST_offset = sizeof (struct T_conn_req);
   5305 
   5306 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
   5307 		*nsin = sin_null;
   5308 		/* Get pointer to shorter address to copy from original mp */
   5309 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
   5310 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
   5311 		if (sin == NULL || !OK_32PTR((char *)sin)) {
   5312 			freemsg(nmp);
   5313 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5314 			return;
   5315 		}
   5316 		nsin->sin_family = sin->sin_family;
   5317 		nsin->sin_port = sin->sin_port;
   5318 		nsin->sin_addr = sin->sin_addr;
   5319 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
   5320 		nmp->b_wptr = (uchar_t *)&nsin[1];
   5321 		if (tcr->OPT_length != 0) {
   5322 			ntcr->OPT_length = tcr->OPT_length;
   5323 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
   5324 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
   5325 			    (uchar_t *)ntcr + ntcr->OPT_offset,
   5326 			    tcr->OPT_length);
   5327 			nmp->b_wptr += tcr->OPT_length;
   5328 		}
   5329 		freemsg(mp);	/* original mp freed */
   5330 		mp = nmp;	/* re-initialize original variables */
   5331 		tcr = ntcr;
   5332 	}
   5333 	/* FALLTHRU */
   5334 
   5335 	case sizeof (sin_t):
   5336 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5337 		    sizeof (sin_t));
   5338 		len = sizeof (sin_t);
   5339 		break;
   5340 
   5341 	case sizeof (sin6_t):
   5342 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5343 		    sizeof (sin6_t));
   5344 		len = sizeof (sin6_t);
   5345 		break;
   5346 	}
   5347 
   5348 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
   5349 	if (error != 0) {
   5350 		tcp_err_ack(tcp, mp, TSYSERR, error);
   5351 		return;
   5352 	}
   5353 
   5354 	/*
   5355 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
   5356 	 * should key on their sequence number and cut them loose.
   5357 	 */
   5358 
   5359 	/*
   5360 	 * If options passed in, feed it for verification and handling
   5361 	 */
   5362 	if (tcr->OPT_length != 0) {
   5363 		mblk_t	*ok_mp;
   5364 		mblk_t	*discon_mp;
   5365 		mblk_t  *conn_opts_mp;
   5366 		int t_error, sys_error, do_disconnect;
   5367 
   5368 		conn_opts_mp = NULL;
   5369 
   5370 		if (tcp_conprim_opt_process(tcp, mp,
   5371 		    &do_disconnect, &t_error, &sys_error) < 0) {
   5372 			if (do_disconnect) {
   5373 				ASSERT(t_error == 0 && sys_error == 0);
   5374 				discon_mp = mi_tpi_discon_ind(NULL,
   5375 				    ECONNREFUSED, 0);
   5376 				if (!discon_mp) {
   5377 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5378 					    TSYSERR, ENOMEM);
   5379 					return;
   5380 				}
   5381 				ok_mp = mi_tpi_ok_ack_alloc(mp);
   5382 				if (!ok_mp) {
   5383 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5384 					    TSYSERR, ENOMEM);
   5385 					return;
   5386 				}
   5387 				qreply(q, ok_mp);
   5388 				qreply(q, discon_mp); /* no flush! */
   5389 			} else {
   5390 				ASSERT(t_error != 0);
   5391 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
   5392 				    sys_error);
   5393 			}
   5394 			return;
   5395 		}
   5396 		/*
   5397 		 * Success in setting options, the mp option buffer represented
   5398 		 * by OPT_length/offset has been potentially modified and
   5399 		 * contains results of option processing. We copy it in
   5400 		 * another mp to save it for potentially influencing returning
   5401 		 * it in T_CONN_CONN.
   5402 		 */
   5403 		if (tcr->OPT_length != 0) { /* there are resulting options */
   5404 			conn_opts_mp = copyb(mp);
   5405 			if (!conn_opts_mp) {
   5406 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5407 				    TSYSERR, ENOMEM);
   5408 				return;
   5409 			}
   5410 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
   5411 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
   5412 			/*
   5413 			 * Note:
   5414 			 * These resulting option negotiation can include any
   5415 			 * end-to-end negotiation options but there no such
   5416 			 * thing (yet?) in our TCP/IP.
   5417 			 */
   5418 		}
   5419 	}
   5420 
   5421 	/* call the non-TPI version */
   5422 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
   5423 	if (error < 0) {
   5424 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
   5425 	} else if (error > 0) {
   5426 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
   5427 	} else {
   5428 		mp = mi_tpi_ok_ack_alloc(mp);
   5429 	}
   5430 
   5431 	/*
   5432 	 * Note: Code below is the "failure" case
   5433 	 */
   5434 	/* return error ack and blow away saved option results if any */
   5435 connect_failed:
   5436 	if (mp != NULL)
   5437 		putnext(connp->conn_rq, mp);
   5438 	else {
   5439 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5440 		    TSYSERR, ENOMEM);
   5441 	}
   5442 }
   5443 
   5444 /*
   5445  * Handle connect to IPv4 destinations, including connections for AF_INET6
   5446  * sockets connecting to IPv4 mapped IPv6 destinations.
   5447  * Returns zero if OK, a positive errno, or a negative TLI error.
   5448  */
   5449 static int
   5450 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
   5451     uint_t srcid)
   5452 {
   5453 	ipaddr_t 	dstaddr = *dstaddrp;
   5454 	uint16_t 	lport;
   5455 	conn_t		*connp = tcp->tcp_connp;
   5456 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5457 	int		error;
   5458 
   5459 	ASSERT(connp->conn_ipversion == IPV4_VERSION);
   5460 
   5461 	/* Check for attempt to connect to INADDR_ANY */
   5462 	if (dstaddr == INADDR_ANY)  {
   5463 		/*
   5464 		 * SunOS 4.x and 4.3 BSD allow an application
   5465 		 * to connect a TCP socket to INADDR_ANY.
   5466 		 * When they do this, the kernel picks the
   5467 		 * address of one interface and uses it
   5468 		 * instead.  The kernel usually ends up
   5469 		 * picking the address of the loopback
   5470 		 * interface.  This is an undocumented feature.
   5471 		 * However, we provide the same thing here
   5472 		 * in order to have source and binary
   5473 		 * compatibility with SunOS 4.x.
   5474 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5475 		 * generate the T_CONN_CON.
   5476 		 */
   5477 		dstaddr = htonl(INADDR_LOOPBACK);
   5478 		*dstaddrp = dstaddr;
   5479 	}
   5480 
   5481 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5482 	if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) {
   5483 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5484 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5485 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5486 	}
   5487 
   5488 	IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6);
   5489 	connp->conn_fport = dstport;
   5490 
   5491 	/*
   5492 	 * At this point the remote destination address and remote port fields
   5493 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5494 	 * have to see which state tcp was in so we can take appropriate action.
   5495 	 */
   5496 	if (tcp->tcp_state == TCPS_IDLE) {
   5497 		/*
   5498 		 * We support a quick connect capability here, allowing
   5499 		 * clients to transition directly from IDLE to SYN_SENT
   5500 		 * tcp_bindi will pick an unused port, insert the connection
   5501 		 * in the bind hash and transition to BOUND state.
   5502 		 */
   5503 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5504 		    tcp, B_TRUE);
   5505 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5506 		    B_FALSE, B_FALSE);
   5507 		if (lport == 0)
   5508 			return (-TNOADDR);
   5509 	}
   5510 
   5511 	/*
   5512 	 * Lookup the route to determine a source address and the uinfo.
   5513 	 * Setup TCP parameters based on the metrics/DCE.
   5514 	 */
   5515 	error = tcp_set_destination(tcp);
   5516 	if (error != 0)
   5517 		return (error);
   5518 
   5519 	/*
   5520 	 * Don't let an endpoint connect to itself.
   5521 	 */
   5522 	if (connp->conn_faddr_v4 == connp->conn_laddr_v4 &&
   5523 	    connp->conn_fport == connp->conn_lport)
   5524 		return (-TBADADDR);
   5525 
   5526 	tcp->tcp_state = TCPS_SYN_SENT;
   5527 
   5528 	return (ipcl_conn_insert_v4(connp));
   5529 }
   5530 
   5531 /*
   5532  * Handle connect to IPv6 destinations.
   5533  * Returns zero if OK, a positive errno, or a negative TLI error.
   5534  */
   5535 static int
   5536 tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
   5537     uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
   5538 {
   5539 	uint16_t 	lport;
   5540 	conn_t		*connp = tcp->tcp_connp;
   5541 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5542 	int		error;
   5543 
   5544 	ASSERT(connp->conn_family == AF_INET6);
   5545 
   5546 	/*
   5547 	 * If we're here, it means that the destination address is a native
   5548 	 * IPv6 address.  Return an error if conn_ipversion is not IPv6.  A
   5549 	 * reason why it might not be IPv6 is if the socket was bound to an
   5550 	 * IPv4-mapped IPv6 address.
   5551 	 */
   5552 	if (connp->conn_ipversion != IPV6_VERSION)
   5553 		return (-TBADADDR);
   5554 
   5555 	/*
   5556 	 * Interpret a zero destination to mean loopback.
   5557 	 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5558 	 * generate the T_CONN_CON.
   5559 	 */
   5560 	if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp))
   5561 		*dstaddrp = ipv6_loopback;
   5562 
   5563 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5564 	if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
   5565 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5566 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5567 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5568 	}
   5569 
   5570 	/*
   5571 	 * Take care of the scope_id now.
   5572 	 */
   5573 	if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
   5574 		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   5575 		connp->conn_ixa->ixa_scopeid = scope_id;
   5576 	} else {
   5577 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   5578 	}
   5579 
   5580 	connp->conn_flowinfo = flowinfo;
   5581 	connp->conn_faddr_v6 = *dstaddrp;
   5582 	connp->conn_fport = dstport;
   5583 
   5584 	/*
   5585 	 * At this point the remote destination address and remote port fields
   5586 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5587 	 * have to see which state tcp was in so we can take appropriate action.
   5588 	 */
   5589 	if (tcp->tcp_state == TCPS_IDLE) {
   5590 		/*
   5591 		 * We support a quick connect capability here, allowing
   5592 		 * clients to transition directly from IDLE to SYN_SENT
   5593 		 * tcp_bindi will pick an unused port, insert the connection
   5594 		 * in the bind hash and transition to BOUND state.
   5595 		 */
   5596 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5597 		    tcp, B_TRUE);
   5598 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5599 		    B_FALSE, B_FALSE);
   5600 		if (lport == 0)
   5601 			return (-TNOADDR);
   5602 	}
   5603 
   5604 	/*
   5605 	 * Lookup the route to determine a source address and the uinfo.
   5606 	 * Setup TCP parameters based on the metrics/DCE.
   5607 	 */
   5608 	error = tcp_set_destination(tcp);
   5609 	if (error != 0)
   5610 		return (error);
   5611 
   5612 	/*
   5613 	 * Don't let an endpoint connect to itself.
   5614 	 */
   5615 	if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) &&
   5616 	    connp->conn_fport == connp->conn_lport)
   5617 		return (-TBADADDR);
   5618 
   5619 	tcp->tcp_state = TCPS_SYN_SENT;
   5620 
   5621 	return (ipcl_conn_insert_v6(connp));
   5622 }
   5623 
   5624 /*
   5625  * Disconnect
   5626  * Note that unlike other functions this returns a positive tli error
   5627  * when it fails; it never returns an errno.
   5628  */
   5629 static int
   5630 tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
   5631 {
   5632 	conn_t		*lconnp;
   5633 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5634 	conn_t		*connp = tcp->tcp_connp;
   5635 
   5636 	/*
   5637 	 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
   5638 	 * when the stream is in BOUND state. Do not send a reset,
   5639 	 * since the destination IP address is not valid, and it can
   5640 	 * be the initialized value of all zeros (broadcast address).
   5641 	 */
   5642 	if (tcp->tcp_state <= TCPS_BOUND) {
   5643 		if (connp->conn_debug) {
   5644 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   5645 			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
   5646 		}
   5647 		return (TOUTSTATE);
   5648 	}
   5649 
   5650 
   5651 	if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
   5652 
   5653 		/*
   5654 		 * According to TPI, for non-listeners, ignore seqnum
   5655 		 * and disconnect.
   5656 		 * Following interpretation of -1 seqnum is historical
   5657 		 * and implied TPI ? (TPI only states that for T_CONN_IND,
   5658 		 * a valid seqnum should not be -1).
   5659 		 *
   5660 		 *	-1 means disconnect everything
   5661 		 *	regardless even on a listener.
   5662 		 */
   5663 
   5664 		int old_state = tcp->tcp_state;
   5665 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
   5666 
   5667 		/*
   5668 		 * The connection can't be on the tcp_time_wait_head list
   5669 		 * since it is not detached.
   5670 		 */
   5671 		ASSERT(tcp->tcp_time_wait_next == NULL);
   5672 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   5673 		ASSERT(tcp->tcp_time_wait_expire == 0);
   5674 		/*
   5675 		 * If it used to be a listener, check to make sure no one else
   5676 		 * has taken the port before switching back to LISTEN state.
   5677 		 */
   5678 		if (connp->conn_ipversion == IPV4_VERSION) {
   5679 			lconnp = ipcl_lookup_listener_v4(connp->conn_lport,
   5680 			    connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst);
   5681 		} else {
   5682 			uint_t ifindex = 0;
   5683 
   5684 			if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)
   5685 				ifindex = connp->conn_ixa->ixa_scopeid;
   5686 
   5687 			/* Allow conn_bound_if listeners? */
   5688 			lconnp = ipcl_lookup_listener_v6(connp->conn_lport,
   5689 			    &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp),
   5690 			    ipst);
   5691 		}
   5692 		if (tcp->tcp_conn_req_max && lconnp == NULL) {
   5693 			tcp->tcp_state = TCPS_LISTEN;
   5694 		} else if (old_state > TCPS_BOUND) {
   5695 			tcp->tcp_conn_req_max = 0;
   5696 			tcp->tcp_state = TCPS_BOUND;
   5697 
   5698 			/*
   5699 			 * If this end point is not going to become a listener,
   5700 			 * decrement the listener connection count if
   5701 			 * necessary.  Note that we do not do this if it is
   5702 			 * going to be a listner (the above if case) since
   5703 			 * then it may remove the counter struct.
   5704 			 */
   5705 			if (tcp->tcp_listen_cnt != NULL)
   5706 				TCP_DECR_LISTEN_CNT(tcp);
   5707 		}
   5708 		if (lconnp != NULL)
   5709 			CONN_DEC_REF(lconnp);
   5710 		if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
   5711 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   5712 		} else if (old_state == TCPS_ESTABLISHED ||
   5713 		    old_state == TCPS_CLOSE_WAIT) {
   5714 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   5715 		}
   5716 
   5717 		if (tcp->tcp_fused)
   5718 			tcp_unfuse(tcp);
   5719 
   5720 		mutex_enter(&tcp->tcp_eager_lock);
   5721 		if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
   5722 		    (tcp->tcp_conn_req_cnt_q != 0)) {
   5723 			tcp_eager_cleanup(tcp, 0);
   5724 		}
   5725 		mutex_exit(&tcp->tcp_eager_lock);
   5726 
   5727 		tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
   5728 		    tcp->tcp_rnxt, TH_RST | TH_ACK);
   5729 
   5730 		tcp_reinit(tcp);
   5731 
   5732 		return (0);
   5733 	} else if (!tcp_eager_blowoff(tcp, seqnum)) {
   5734 		return (TBADSEQ);
   5735 	}
   5736 	return (0);
   5737 }
   5738 
   5739 /*
   5740  * Our client hereby directs us to reject the connection request
   5741  * that tcp_input_listener() marked with 'seqnum'.  Rejection consists
   5742  * of sending the appropriate RST, not an ICMP error.
   5743  */
   5744 static void
   5745 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
   5746 {
   5747 	t_scalar_t seqnum;
   5748 	int	error;
   5749 	conn_t	*connp = tcp->tcp_connp;
   5750 
   5751 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5752 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
   5753 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5754 		return;
   5755 	}
   5756 	seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
   5757 	error = tcp_disconnect_common(tcp, seqnum);
   5758 	if (error != 0)
   5759 		tcp_err_ack(tcp, mp, error, 0);
   5760 	else {
   5761 		if (tcp->tcp_state >= TCPS_ESTABLISHED) {
   5762 			/* Send M_FLUSH according to TPI */
   5763 			(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
   5764 		}
   5765 		mp = mi_tpi_ok_ack_alloc(mp);
   5766 		if (mp != NULL)
   5767 			putnext(connp->conn_rq, mp);
   5768 	}
   5769 }
   5770 
   5771 /*
   5772  * Diagnostic routine used to return a string associated with the tcp state.
   5773  * Note that if the caller does not supply a buffer, it will use an internal
   5774  * static string.  This means that if multiple threads call this function at
   5775  * the same time, output can be corrupted...  Note also that this function
   5776  * does not check the size of the supplied buffer.  The caller has to make
   5777  * sure that it is big enough.
   5778  */
   5779 static char *
   5780 tcp_display(tcp_t *tcp, char *sup_buf, char format)
   5781 {
   5782 	char		buf1[30];
   5783 	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
   5784 	char		*buf;
   5785 	char		*cp;
   5786 	in6_addr_t	local, remote;
   5787 	char		local_addrbuf[INET6_ADDRSTRLEN];
   5788 	char		remote_addrbuf[INET6_ADDRSTRLEN];
   5789 	conn_t		*connp;
   5790 
   5791 	if (sup_buf != NULL)
   5792 		buf = sup_buf;
   5793 	else
   5794 		buf = priv_buf;
   5795 
   5796 	if (tcp == NULL)
   5797 		return ("NULL_TCP");
   5798 
   5799 	connp = tcp->tcp_connp;
   5800 	switch (tcp->tcp_state) {
   5801 	case TCPS_CLOSED:
   5802 		cp = "TCP_CLOSED";
   5803 		break;
   5804 	case TCPS_IDLE:
   5805 		cp = "TCP_IDLE";
   5806 		break;
   5807 	case TCPS_BOUND:
   5808 		cp = "TCP_BOUND";
   5809 		break;
   5810 	case TCPS_LISTEN:
   5811 		cp = "TCP_LISTEN";
   5812 		break;
   5813 	case TCPS_SYN_SENT:
   5814 		cp = "TCP_SYN_SENT";
   5815 		break;
   5816 	case TCPS_SYN_RCVD:
   5817 		cp = "TCP_SYN_RCVD";
   5818 		break;
   5819 	case TCPS_ESTABLISHED:
   5820 		cp = "TCP_ESTABLISHED";
   5821 		break;
   5822 	case TCPS_CLOSE_WAIT:
   5823 		cp = "TCP_CLOSE_WAIT";
   5824 		break;
   5825 	case TCPS_FIN_WAIT_1:
   5826 		cp = "TCP_FIN_WAIT_1";
   5827 		break;
   5828 	case TCPS_CLOSING:
   5829 		cp = "TCP_CLOSING";
   5830 		break;
   5831 	case TCPS_LAST_ACK:
   5832 		cp = "TCP_LAST_ACK";
   5833 		break;
   5834 	case TCPS_FIN_WAIT_2:
   5835 		cp = "TCP_FIN_WAIT_2";
   5836 		break;
   5837 	case TCPS_TIME_WAIT:
   5838 		cp = "TCP_TIME_WAIT";
   5839 		break;
   5840 	default:
   5841 		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
   5842 		cp = buf1;
   5843 		break;
   5844 	}
   5845 	switch (format) {
   5846 	case DISP_ADDR_AND_PORT:
   5847 		if (connp->conn_ipversion == IPV4_VERSION) {
   5848 			/*
   5849 			 * Note that we use the remote address in the tcp_b
   5850 			 * structure.  This means that it will print out
   5851 			 * the real destination address, not the next hop's
   5852 			 * address if source routing is used.
   5853 			 */
   5854 			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
   5855 			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
   5856 
   5857 		} else {
   5858 			local = connp->conn_laddr_v6;
   5859 			remote = connp->conn_faddr_v6;
   5860 		}
   5861 		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
   5862 		    sizeof (local_addrbuf));
   5863 		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
   5864 		    sizeof (remote_addrbuf));
   5865 		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
   5866 		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
   5867 		    ntohs(connp->conn_fport), cp);
   5868 		break;
   5869 	case DISP_PORT_ONLY:
   5870 	default:
   5871 		(void) mi_sprintf(buf, "[%u, %u] %s",
   5872 		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
   5873 		break;
   5874 	}
   5875 
   5876 	return (buf);
   5877 }
   5878 
   5879 /*
   5880  * Called via squeue to get on to eager's perimeter. It sends a
   5881  * TH_RST if eager is in the fanout table. The listener wants the
   5882  * eager to disappear either by means of tcp_eager_blowoff() or
   5883  * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
   5884  * called (via squeue) if the eager cannot be inserted in the
   5885  * fanout table in tcp_input_listener().
   5886  */
   5887 /* ARGSUSED */
   5888 void
   5889 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   5890 {
   5891 	conn_t	*econnp = (conn_t *)arg;
   5892 	tcp_t	*eager = econnp->conn_tcp;
   5893 	tcp_t	*listener = eager->tcp_listener;
   5894 
   5895 	/*
   5896 	 * We could be called because listener is closing. Since
   5897 	 * the eager was using listener's queue's, we avoid
   5898 	 * using the listeners queues from now on.
   5899 	 */
   5900 	ASSERT(eager->tcp_detached);
   5901 	econnp->conn_rq = NULL;
   5902 	econnp->conn_wq = NULL;
   5903 
   5904 	/*
   5905 	 * An eager's conn_fanout will be NULL if it's a duplicate
   5906 	 * for an existing 4-tuples in the conn fanout table.
   5907 	 * We don't want to send an RST out in such case.
   5908 	 */
   5909 	if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
   5910 		tcp_xmit_ctl("tcp_eager_kill, can't wait",
   5911 		    eager, eager->tcp_snxt, 0, TH_RST);
   5912 	}
   5913 
   5914 	/* We are here because listener wants this eager gone */
   5915 	if (listener != NULL) {
   5916 		mutex_enter(&listener->tcp_eager_lock);
   5917 		tcp_eager_unlink(eager);
   5918 		if (eager->tcp_tconnind_started) {
   5919 			/*
   5920 			 * The eager has sent a conn_ind up to the
   5921 			 * listener but listener decides to close
   5922 			 * instead. We need to drop the extra ref
   5923 			 * placed on eager in tcp_input_data() before
   5924 			 * sending the conn_ind to listener.
   5925 			 */
   5926 			CONN_DEC_REF(econnp);
   5927 		}
   5928 		mutex_exit(&listener->tcp_eager_lock);
   5929 		CONN_DEC_REF(listener->tcp_connp);
   5930 	}
   5931 
   5932 	if (eager->tcp_state != TCPS_CLOSED)
   5933 		tcp_close_detached(eager);
   5934 }
   5935 
   5936 /*
   5937  * Reset any eager connection hanging off this listener marked
   5938  * with 'seqnum' and then reclaim it's resources.
   5939  */
   5940 static boolean_t
   5941 tcp_eager_blowoff(tcp_t	*listener, t_scalar_t seqnum)
   5942 {
   5943 	tcp_t	*eager;
   5944 	mblk_t 	*mp;
   5945 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5946 
   5947 	TCP_STAT(tcps, tcp_eager_blowoff_calls);
   5948 	eager = listener;
   5949 	mutex_enter(&listener->tcp_eager_lock);
   5950 	do {
   5951 		eager = eager->tcp_eager_next_q;
   5952 		if (eager == NULL) {
   5953 			mutex_exit(&listener->tcp_eager_lock);
   5954 			return (B_FALSE);
   5955 		}
   5956 	} while (eager->tcp_conn_req_seqnum != seqnum);
   5957 
   5958 	if (eager->tcp_closemp_used) {
   5959 		mutex_exit(&listener->tcp_eager_lock);
   5960 		return (B_TRUE);
   5961 	}
   5962 	eager->tcp_closemp_used = B_TRUE;
   5963 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5964 	CONN_INC_REF(eager->tcp_connp);
   5965 	mutex_exit(&listener->tcp_eager_lock);
   5966 	mp = &eager->tcp_closemp;
   5967 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
   5968 	    eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
   5969 	return (B_TRUE);
   5970 }
   5971 
   5972 /*
   5973  * Reset any eager connection hanging off this listener
   5974  * and then reclaim it's resources.
   5975  */
   5976 static void
   5977 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
   5978 {
   5979 	tcp_t	*eager;
   5980 	mblk_t	*mp;
   5981 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5982 
   5983 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   5984 
   5985 	if (!q0_only) {
   5986 		/* First cleanup q */
   5987 		TCP_STAT(tcps, tcp_eager_blowoff_q);
   5988 		eager = listener->tcp_eager_next_q;
   5989 		while (eager != NULL) {
   5990 			if (!eager->tcp_closemp_used) {
   5991 				eager->tcp_closemp_used = B_TRUE;
   5992 				TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5993 				CONN_INC_REF(eager->tcp_connp);
   5994 				mp = &eager->tcp_closemp;
   5995 				SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   5996 				    tcp_eager_kill, eager->tcp_connp, NULL,
   5997 				    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
   5998 			}
   5999 			eager = eager->tcp_eager_next_q;
   6000 		}
   6001 	}
   6002 	/* Then cleanup q0 */
   6003 	TCP_STAT(tcps, tcp_eager_blowoff_q0);
   6004 	eager = listener->tcp_eager_next_q0;
   6005 	while (eager != listener) {
   6006 		if (!eager->tcp_closemp_used) {
   6007 			eager->tcp_closemp_used = B_TRUE;
   6008 			TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   6009 			CONN_INC_REF(eager->tcp_connp);
   6010 			mp = &eager->tcp_closemp;
   6011 			SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   6012 			    tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
   6013 			    SQTAG_TCP_EAGER_CLEANUP_Q0);
   6014 		}
   6015 		eager = eager->tcp_eager_next_q0;
   6016 	}
   6017 }
   6018 
   6019 /*
   6020  * If we are an eager connection hanging off a listener that hasn't
   6021  * formally accepted the connection yet, get off his list and blow off
   6022  * any data that we have accumulated.
   6023  */
   6024 static void
   6025 tcp_eager_unlink(tcp_t *tcp)
   6026 {
   6027 	tcp_t	*listener = tcp->tcp_listener;
   6028 
   6029 	ASSERT(listener != NULL);
   6030 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   6031 	if (tcp->tcp_eager_next_q0 != NULL) {
   6032 		ASSERT(tcp->tcp_eager_prev_q0 != NULL);
   6033 
   6034 		/* Remove the eager tcp from q0 */
   6035 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   6036 		    tcp->tcp_eager_prev_q0;
   6037 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   6038 		    tcp->tcp_eager_next_q0;
   6039 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   6040 		listener->tcp_conn_req_cnt_q0--;
   6041 
   6042 		tcp->tcp_eager_next_q0 = NULL;
   6043 		tcp->tcp_eager_prev_q0 = NULL;
   6044 
   6045 		/*
   6046 		 * Take the eager out, if it is in the list of droppable
   6047 		 * eagers.
   6048 		 */
   6049 		MAKE_UNDROPPABLE(tcp);
   6050 
   6051 		if (tcp->tcp_syn_rcvd_timeout != 0) {
   6052 			/* we have timed out before */
   6053 			ASSERT(listener->tcp_syn_rcvd_timeout > 0);
   6054 			listener->tcp_syn_rcvd_timeout--;
   6055 		}
   6056 	} else {
   6057 		tcp_t   **tcpp = &listener->tcp_eager_next_q;
   6058 		tcp_t	*prev = NULL;
   6059 
   6060 		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
   6061 			if (tcpp[0] == tcp) {
   6062 				if (listener->tcp_eager_last_q == tcp) {
   6063 					/*
   6064 					 * If we are unlinking the last
   6065 					 * element on the list, adjust
   6066 					 * tail pointer. Set tail pointer
   6067 					 * to nil when list is empty.
   6068 					 */
   6069 					ASSERT(tcp->tcp_eager_next_q == NULL);
   6070 					if (listener->tcp_eager_last_q ==
   6071 					    listener->tcp_eager_next_q) {
   6072 						listener->tcp_eager_last_q =
   6073 						    NULL;
   6074 					} else {
   6075 						/*
   6076 						 * We won't get here if there
   6077 						 * is only one eager in the
   6078 						 * list.
   6079 						 */
   6080 						ASSERT(prev != NULL);
   6081 						listener->tcp_eager_last_q =
   6082 						    prev;
   6083 					}
   6084 				}
   6085 				tcpp[0] = tcp->tcp_eager_next_q;
   6086 				tcp->tcp_eager_next_q = NULL;
   6087 				tcp->tcp_eager_last_q = NULL;
   6088 				ASSERT(listener->tcp_conn_req_cnt_q > 0);
   6089 				listener->tcp_conn_req_cnt_q--;
   6090 				break;
   6091 			}
   6092 			prev = tcpp[0];
   6093 		}
   6094 	}
   6095 	tcp->tcp_listener = NULL;
   6096 }
   6097 
   6098 /* Shorthand to generate and send TPI error acks to our client */
   6099 static void
   6100 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
   6101 {
   6102 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
   6103 		putnext(tcp->tcp_connp->conn_rq, mp);
   6104 }
   6105 
   6106 /* Shorthand to generate and send TPI error acks to our client */
   6107 static void
   6108 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
   6109     int t_error, int sys_error)
   6110 {
   6111 	struct T_error_ack	*teackp;
   6112 
   6113 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
   6114 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
   6115 		teackp = (struct T_error_ack *)mp->b_rptr;
   6116 		teackp->ERROR_prim = primitive;
   6117 		teackp->TLI_error = t_error;
   6118 		teackp->UNIX_error = sys_error;
   6119 		putnext(tcp->tcp_connp->conn_rq, mp);
   6120 	}
   6121 }
   6122 
   6123 /*
   6124  * Note: No locks are held when inspecting tcp_g_*epriv_ports
   6125  * but instead the code relies on:
   6126  * - the fact that the address of the array and its size never changes
   6127  * - the atomic assignment of the elements of the array
   6128  */
   6129 /* ARGSUSED */
   6130 static int
   6131 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
   6132 {
   6133 	int i;
   6134 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   6135 
   6136 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   6137 		if (tcps->tcps_g_epriv_ports[i] != 0)
   6138 			(void) mi_mpprintf(mp, "%d ",
   6139 			    tcps->tcps_g_epriv_ports[i]);
   6140 	}
   6141 	return (0);
   6142 }
   6143 
   6144 /*
   6145  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   6146  * threads from changing it at the same time.
   6147  */
   6148 /* ARGSUSED */
   6149 static int
   6150 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   6151     cred_t *cr)
   6152 {
   6153 	long	new_value;
   6154 	int	i;
   6155 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   6156 
   6157 	/*
   6158 	 * Fail the request if the new value does not lie within the
   6159 	 * port number limits.
   6160 	 */
   6161 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   6162 	    new_value <= 0 || new_value >= 65536) {
   6163 		return (EINVAL);
   6164 	}
   6165 
   6166 	mutex_enter(&tcps->tcps_epriv_port_lock);
   6167 	/* Check if the value is already in the list */
   6168 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   6169 		if (new_value == tcps->tcps_g_epriv_ports[i]) {
   6170 			mutex_exit(&tcps->tcps_epriv_port_lock);
   6171 			return (EEXIST);
   6172 		}
   6173 	}
   6174 	/* Find an empty slot */
   6175 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   6176 		if (tcps->tcps_g_epriv_ports[i] == 0)
   6177 			break;
   6178 	}
   6179 	if (i == tcps->tcps_g_num_epriv_ports) {
   6180 		mutex_exit(&tcps->tcps_epriv_port_lock);
   6181 		return (EOVERFLOW);
   6182 	}
   6183 	/* Set the new value */
   6184 	tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
   6185 	mutex_exit(&tcps->tcps_epriv_port_lock);
   6186 	return (0);
   6187 }
   6188 
   6189 /*
   6190  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   6191  * threads from changing it at the same time.
   6192  */
   6193 /* ARGSUSED */
   6194 static int
   6195 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   6196     cred_t *cr)
   6197 {
   6198 	long	new_value;
   6199 	int	i;
   6200 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   6201 
   6202 	/*
   6203 	 * Fail the request if the new value does not lie within the
   6204 	 * port number limits.
   6205 	 */
   6206 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
   6207 	    new_value >= 65536) {
   6208 		return (EINVAL);
   6209 	}
   6210 
   6211 	mutex_enter(&tcps->tcps_epriv_port_lock);
   6212 	/* Check that the value is already in the list */
   6213 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   6214 		if (tcps->tcps_g_epriv_ports[i] == new_value)
   6215 			break;
   6216 	}
   6217 	if (i == tcps->tcps_g_num_epriv_ports) {
   6218 		mutex_exit(&tcps->tcps_epriv_port_lock);
   6219 		return (ESRCH);
   6220 	}
   6221 	/* Clear the value */
   6222 	tcps->tcps_g_epriv_ports[i] = 0;
   6223 	mutex_exit(&tcps->tcps_epriv_port_lock);
   6224 	return (0);
   6225 }
   6226 
   6227 /* Return the TPI/TLI equivalent of our current tcp_state */
   6228 static int
   6229 tcp_tpistate(tcp_t *tcp)
   6230 {
   6231 	switch (tcp->tcp_state) {
   6232 	case TCPS_IDLE:
   6233 		return (TS_UNBND);
   6234 	case TCPS_LISTEN:
   6235 		/*
   6236 		 * Return whether there are outstanding T_CONN_IND waiting
   6237 		 * for the matching T_CONN_RES. Therefore don't count q0.
   6238 		 */
   6239 		if (tcp->tcp_conn_req_cnt_q > 0)
   6240 			return (TS_WRES_CIND);
   6241 		else
   6242 			return (TS_IDLE);
   6243 	case TCPS_BOUND:
   6244 		return (TS_IDLE);
   6245 	case TCPS_SYN_SENT:
   6246 		return (TS_WCON_CREQ);
   6247 	case TCPS_SYN_RCVD:
   6248 		/*
   6249 		 * Note: assumption: this has to the active open SYN_RCVD.
   6250 		 * The passive instance is detached in SYN_RCVD stage of
   6251 		 * incoming connection processing so we cannot get request
   6252 		 * for T_info_ack on it.
   6253 		 */
   6254 		return (TS_WACK_CRES);
   6255 	case TCPS_ESTABLISHED:
   6256 		return (TS_DATA_XFER);
   6257 	case TCPS_CLOSE_WAIT:
   6258 		return (TS_WREQ_ORDREL);
   6259 	case TCPS_FIN_WAIT_1:
   6260 		return (TS_WIND_ORDREL);
   6261 	case TCPS_FIN_WAIT_2:
   6262 		return (TS_WIND_ORDREL);
   6263 
   6264 	case TCPS_CLOSING:
   6265 	case TCPS_LAST_ACK:
   6266 	case TCPS_TIME_WAIT:
   6267 	case TCPS_CLOSED:
   6268 		/*
   6269 		 * Following TS_WACK_DREQ7 is a rendition of "not
   6270 		 * yet TS_IDLE" TPI state. There is no best match to any
   6271 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
   6272 		 * choose a value chosen that will map to TLI/XTI level
   6273 		 * state of TSTATECHNG (state is process of changing) which
   6274 		 * captures what this dummy state represents.
   6275 		 */
   6276 		return (TS_WACK_DREQ7);
   6277 	default:
   6278 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
   6279 		    tcp->tcp_state, tcp_display(tcp, NULL,
   6280 		    DISP_PORT_ONLY));
   6281 		return (TS_UNBND);
   6282 	}
   6283 }
   6284 
   6285 static void
   6286 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
   6287 {
   6288 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6289 	conn_t		*connp = tcp->tcp_connp;
   6290 
   6291 	if (connp->conn_family == AF_INET6)
   6292 		*tia = tcp_g_t_info_ack_v6;
   6293 	else
   6294 		*tia = tcp_g_t_info_ack;
   6295 	tia->CURRENT_state = tcp_tpistate(tcp);
   6296 	tia->OPT_size = tcp_max_optsize;
   6297 	if (tcp->tcp_mss == 0) {
   6298 		/* Not yet set - tcp_open does not set mss */
   6299 		if (connp->conn_ipversion == IPV4_VERSION)
   6300 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
   6301 		else
   6302 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
   6303 	} else {
   6304 		tia->TIDU_size = tcp->tcp_mss;
   6305 	}
   6306 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
   6307 }
   6308 
   6309 static void
   6310 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
   6311     t_uscalar_t cap_bits1)
   6312 {
   6313 	tcap->CAP_bits1 = 0;
   6314 
   6315 	if (cap_bits1 & TC1_INFO) {
   6316 		tcp_copy_info(&tcap->INFO_ack, tcp);
   6317 		tcap->CAP_bits1 |= TC1_INFO;
   6318 	}
   6319 
   6320 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
   6321 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
   6322 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
   6323 	}
   6324 
   6325 }
   6326 
   6327 /*
   6328  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
   6329  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
   6330  * tcp_g_t_info_ack.  The current state of the stream is copied from
   6331  * tcp_state.
   6332  */
   6333 static void
   6334 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
   6335 {
   6336 	t_uscalar_t		cap_bits1;
   6337 	struct T_capability_ack	*tcap;
   6338 
   6339 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
   6340 		freemsg(mp);
   6341 		return;
   6342 	}
   6343 
   6344 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
   6345 
   6346 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
   6347 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
   6348 	if (mp == NULL)
   6349 		return;
   6350 
   6351 	tcap = (struct T_capability_ack *)mp->b_rptr;
   6352 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
   6353 
   6354 	putnext(tcp->tcp_connp->conn_rq, mp);
   6355 }
   6356 
   6357 /*
   6358  * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
   6359  * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
   6360  * The current state of the stream is copied from tcp_state.
   6361  */
   6362 static void
   6363 tcp_info_req(tcp_t *tcp, mblk_t *mp)
   6364 {
   6365 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
   6366 	    T_INFO_ACK);
   6367 	if (!mp) {
   6368 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   6369 		return;
   6370 	}
   6371 	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
   6372 	putnext(tcp->tcp_connp->conn_rq, mp);
   6373 }
   6374 
   6375 /* Respond to the TPI addr request */
   6376 static void
   6377 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
   6378 {
   6379 	struct sockaddr *sa;
   6380 	mblk_t	*ackmp;
   6381 	struct T_addr_ack *taa;
   6382 	conn_t	*connp = tcp->tcp_connp;
   6383 	uint_t	addrlen;
   6384 
   6385 	/* Make it large enough for worst case */
   6386 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
   6387 	    2 * sizeof (sin6_t), 1);
   6388 	if (ackmp == NULL) {
   6389 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   6390 		return;
   6391 	}
   6392 
   6393 	taa = (struct T_addr_ack *)ackmp->b_rptr;
   6394 
   6395 	bzero(taa, sizeof (struct T_addr_ack));
   6396 	ackmp->b_wptr = (uchar_t *)&taa[1];
   6397 
   6398 	taa->PRIM_type = T_ADDR_ACK;
   6399 	ackmp->b_datap->db_type = M_PCPROTO;
   6400 
   6401 	if (connp->conn_family == AF_INET)
   6402 		addrlen = sizeof (sin_t);
   6403 	else
   6404 		addrlen = sizeof (sin6_t);
   6405 
   6406 	/*
   6407 	 * Note: Following code assumes 32 bit alignment of basic
   6408 	 * data structures like sin_t and struct T_addr_ack.
   6409 	 */
   6410 	if (tcp->tcp_state >= TCPS_BOUND) {
   6411 		/*
   6412 		 * Fill in local address first
   6413 		 */
   6414 		taa->LOCADDR_offset = sizeof (*taa);
   6415 		taa->LOCADDR_length = addrlen;
   6416 		sa = (struct sockaddr *)&taa[1];
   6417 		(void) conn_getsockname(connp, sa, &addrlen);
   6418 		ackmp->b_wptr += addrlen;
   6419 	}
   6420 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
   6421 		/*
   6422 		 * Fill in Remote address
   6423 		 */
   6424 		taa->REMADDR_length = addrlen;
   6425 		/* assumed 32-bit alignment */
   6426 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
   6427 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
   6428 		(void) conn_getpeername(connp, sa, &addrlen);
   6429 		ackmp->b_wptr += addrlen;
   6430 	}
   6431 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
   6432 	putnext(tcp->tcp_connp->conn_rq, ackmp);
   6433 }
   6434 
   6435 /*
   6436  * Handle reinitialization of a tcp structure.
   6437  * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
   6438  */
   6439 static void
   6440 tcp_reinit(tcp_t *tcp)
   6441 {
   6442 	mblk_t		*mp;
   6443 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6444 	conn_t		*connp  = tcp->tcp_connp;
   6445 
   6446 	TCP_STAT(tcps, tcp_reinit_calls);
   6447 
   6448 	/* tcp_reinit should never be called for detached tcp_t's */
   6449 	ASSERT(tcp->tcp_listener == NULL);
   6450 	ASSERT((connp->conn_family == AF_INET &&
   6451 	    connp->conn_ipversion == IPV4_VERSION) ||
   6452 	    (connp->conn_family == AF_INET6 &&
   6453 	    (connp->conn_ipversion == IPV4_VERSION ||
   6454 	    connp->conn_ipversion == IPV6_VERSION)));
   6455 
   6456 	/* Cancel outstanding timers */
   6457 	tcp_timers_stop(tcp);
   6458 
   6459 	/*
   6460 	 * Reset everything in the state vector, after updating global
   6461 	 * MIB data from instance counters.
   6462 	 */
   6463 	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
   6464 	tcp->tcp_ibsegs = 0;
   6465 	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
   6466 	tcp->tcp_obsegs = 0;
   6467 
   6468 	tcp_close_mpp(&tcp->tcp_xmit_head);
   6469 	if (tcp->tcp_snd_zcopy_aware)
   6470 		tcp_zcopy_notify(tcp);
   6471 	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
   6472 	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
   6473 	mutex_enter(&tcp->tcp_non_sq_lock);
   6474 	if (tcp->tcp_flow_stopped &&
   6475 	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
   6476 		tcp_clrqfull(tcp);
   6477 	}
   6478 	mutex_exit(&tcp->tcp_non_sq_lock);
   6479 	tcp_close_mpp(&tcp->tcp_reass_head);
   6480 	tcp->tcp_reass_tail = NULL;
   6481 	if (tcp->tcp_rcv_list != NULL) {
   6482 		/* Free b_next chain */
   6483 		tcp_close_mpp(&tcp->tcp_rcv_list);
   6484 		tcp->tcp_rcv_last_head = NULL;
   6485 		tcp->tcp_rcv_last_tail = NULL;
   6486 		tcp->tcp_rcv_cnt = 0;
   6487 	}
   6488 	tcp->tcp_rcv_last_tail = NULL;
   6489 
   6490 	if ((mp = tcp->tcp_urp_mp) != NULL) {
   6491 		freemsg(mp);
   6492 		tcp->tcp_urp_mp = NULL;
   6493 	}
   6494 	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
   6495 		freemsg(mp);
   6496 		tcp->tcp_urp_mark_mp = NULL;
   6497 	}
   6498 	if (tcp->tcp_fused_sigurg_mp != NULL) {
   6499 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   6500 		freeb(tcp->tcp_fused_sigurg_mp);
   6501 		tcp->tcp_fused_sigurg_mp = NULL;
   6502 	}
   6503 	if (tcp->tcp_ordrel_mp != NULL) {
   6504 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   6505 		freeb(tcp->tcp_ordrel_mp);
   6506 		tcp->tcp_ordrel_mp = NULL;
   6507 	}
   6508 
   6509 	/*
   6510 	 * Following is a union with two members which are
   6511 	 * identical types and size so the following cleanup
   6512 	 * is enough.
   6513 	 */
   6514 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
   6515 
   6516 	CL_INET_DISCONNECT(connp);
   6517 
   6518 	/*
   6519 	 * The connection can't be on the tcp_time_wait_head list
   6520 	 * since it is not detached.
   6521 	 */
   6522 	ASSERT(tcp->tcp_time_wait_next == NULL);
   6523 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   6524 	ASSERT(tcp->tcp_time_wait_expire == 0);
   6525 
   6526 	if (tcp->tcp_kssl_pending) {
   6527 		tcp->tcp_kssl_pending = B_FALSE;
   6528 
   6529 		/* Don't reset if the initialized by bind. */
   6530 		if (tcp->tcp_kssl_ent != NULL) {
   6531 			kssl_release_ent(tcp->tcp_kssl_ent, NULL,
   6532 			    KSSL_NO_PROXY);
   6533 		}
   6534 	}
   6535 	if (tcp->tcp_kssl_ctx != NULL) {
   6536 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   6537 		tcp->tcp_kssl_ctx = NULL;
   6538 	}
   6539 
   6540 	/*
   6541 	 * Reset/preserve other values
   6542 	 */
   6543 	tcp_reinit_values(tcp);
   6544 	ipcl_hash_remove(connp);
   6545 	ixa_cleanup(connp->conn_ixa);
   6546 	tcp_ipsec_cleanup(tcp);
   6547 
   6548 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
   6549 	connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
   6550 
   6551 	if (tcp->tcp_conn_req_max != 0) {
   6552 		/*
   6553 		 * This is the case when a TLI program uses the same
   6554 		 * transport end point to accept a connection.  This
   6555 		 * makes the TCP both a listener and acceptor.  When
   6556 		 * this connection is closed, we need to set the state
   6557 		 * back to TCPS_LISTEN.  Make sure that the eager list
   6558 		 * is reinitialized.
   6559 		 *
   6560 		 * Note that this stream is still bound to the four
   6561 		 * tuples of the previous connection in IP.  If a new
   6562 		 * SYN with different foreign address comes in, IP will
   6563 		 * not find it and will send it to the global queue.  In
   6564 		 * the global queue, TCP will do a tcp_lookup_listener()
   6565 		 * to find this stream.  This works because this stream
   6566 		 * is only removed from connected hash.
   6567 		 *
   6568 		 */
   6569 		tcp->tcp_state = TCPS_LISTEN;
   6570 		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
   6571 		tcp->tcp_eager_next_drop_q0 = tcp;
   6572 		tcp->tcp_eager_prev_drop_q0 = tcp;
   6573 		/*
   6574 		 * Initially set conn_recv to tcp_input_listener_unbound to try
   6575 		 * to pick a good squeue for the listener when the first SYN
   6576 		 * arrives. tcp_input_listener_unbound sets it to
   6577 		 * tcp_input_listener on that first SYN.
   6578 		 */
   6579 		connp->conn_recv = tcp_input_listener_unbound;
   6580 
   6581 		connp->conn_proto = IPPROTO_TCP;
   6582 		connp->conn_faddr_v6 = ipv6_all_zeros;
   6583 		connp->conn_fport = 0;
   6584 
   6585 		(void) ipcl_bind_insert(connp);
   6586 	} else {
   6587 		tcp->tcp_state = TCPS_BOUND;
   6588 	}
   6589 
   6590 	/*
   6591 	 * Initialize to default values
   6592 	 */
   6593 	tcp_init_values(tcp);
   6594 
   6595 	ASSERT(tcp->tcp_ptpbhn != NULL);
   6596 	tcp->tcp_rwnd = connp->conn_rcvbuf;
   6597 	tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ?
   6598 	    tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
   6599 }
   6600 
   6601 /*
   6602  * Force values to zero that need be zero.
   6603  * Do not touch values asociated with the BOUND or LISTEN state
   6604  * since the connection will end up in that state after the reinit.
   6605  * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
   6606  * structure!
   6607  */
   6608 static void
   6609 tcp_reinit_values(tcp)
   6610 	tcp_t *tcp;
   6611 {
   6612 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6613 	conn_t		*connp = tcp->tcp_connp;
   6614 
   6615 #ifndef	lint
   6616 #define	DONTCARE(x)
   6617 #define	PRESERVE(x)
   6618 #else
   6619 #define	DONTCARE(x)	((x) = (x))
   6620 #define	PRESERVE(x)	((x) = (x))
   6621 #endif	/* lint */
   6622 
   6623 	PRESERVE(tcp->tcp_bind_hash_port);
   6624 	PRESERVE(tcp->tcp_bind_hash);
   6625 	PRESERVE(tcp->tcp_ptpbhn);
   6626 	PRESERVE(tcp->tcp_acceptor_hash);
   6627 	PRESERVE(tcp->tcp_ptpahn);
   6628 
   6629 	/* Should be ASSERT NULL on these with new code! */
   6630 	ASSERT(tcp->tcp_time_wait_next == NULL);
   6631 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   6632 	ASSERT(tcp->tcp_time_wait_expire == 0);
   6633 	PRESERVE(tcp->tcp_state);
   6634 	PRESERVE(connp->conn_rq);
   6635 	PRESERVE(connp->conn_wq);
   6636 
   6637 	ASSERT(tcp->tcp_xmit_head == NULL);
   6638 	ASSERT(tcp->tcp_xmit_last == NULL);
   6639 	ASSERT(tcp->tcp_unsent == 0);
   6640 	ASSERT(tcp->tcp_xmit_tail == NULL);
   6641 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
   6642 
   6643 	tcp->tcp_snxt = 0;			/* Displayed in mib */
   6644 	tcp->tcp_suna = 0;			/* Displayed in mib */
   6645 	tcp->tcp_swnd = 0;
   6646 	DONTCARE(tcp->tcp_cwnd);	/* Init in tcp_process_options */
   6647 
   6648 	ASSERT(tcp->tcp_ibsegs == 0);
   6649 	ASSERT(tcp->tcp_obsegs == 0);
   6650 
   6651 	if (connp->conn_ht_iphc != NULL) {
   6652 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   6653 		connp->conn_ht_iphc = NULL;
   6654 		connp->conn_ht_iphc_allocated = 0;
   6655 		connp->conn_ht_iphc_len = 0;
   6656 		connp->conn_ht_ulp = NULL;
   6657 		connp->conn_ht_ulp_len = 0;
   6658 		tcp->tcp_ipha = NULL;
   6659 		tcp->tcp_ip6h = NULL;
   6660 		tcp->tcp_tcpha = NULL;
   6661 	}
   6662 
   6663 	/* We clear any IP_OPTIONS and extension headers */
   6664 	ip_pkt_free(&connp->conn_xmit_ipp);
   6665 
   6666 	DONTCARE(tcp->tcp_naglim);		/* Init in tcp_init_values */
   6667 	DONTCARE(tcp->tcp_ipha);
   6668 	DONTCARE(tcp->tcp_ip6h);
   6669 	DONTCARE(tcp->tcp_tcpha);
   6670 	tcp->tcp_valid_bits = 0;
   6671 
   6672 	DONTCARE(tcp->tcp_timer_backoff);	/* Init in tcp_init_values */
   6673 	DONTCARE(tcp->tcp_last_recv_time);	/* Init in tcp_init_values */
   6674 	tcp->tcp_last_rcv_lbolt = 0;
   6675 
   6676 	tcp->tcp_init_cwnd = 0;
   6677 
   6678 	tcp->tcp_urp_last_valid = 0;
   6679 	tcp->tcp_hard_binding = 0;
   6680 
   6681 	tcp->tcp_fin_acked = 0;
   6682 	tcp->tcp_fin_rcvd = 0;
   6683 	tcp->tcp_fin_sent = 0;
   6684 	tcp->tcp_ordrel_done = 0;
   6685 
   6686 	tcp->tcp_detached = 0;
   6687 
   6688 	tcp->tcp_snd_ws_ok = B_FALSE;
   6689 	tcp->tcp_snd_ts_ok = B_FALSE;
   6690 	tcp->tcp_zero_win_probe = 0;
   6691 
   6692 	tcp->tcp_loopback = 0;
   6693 	tcp->tcp_localnet = 0;
   6694 	tcp->tcp_syn_defense = 0;
   6695 	tcp->tcp_set_timer = 0;
   6696 
   6697 	tcp->tcp_active_open = 0;
   6698 	tcp->tcp_rexmit = B_FALSE;
   6699 	tcp->tcp_xmit_zc_clean = B_FALSE;
   6700 
   6701 	tcp->tcp_snd_sack_ok = B_FALSE;
   6702 	tcp->tcp_hwcksum = B_FALSE;
   6703 
   6704 	DONTCARE(tcp->tcp_maxpsz_multiplier);	/* Init in tcp_init_values */
   6705 
   6706 	tcp->tcp_conn_def_q0 = 0;
   6707 	tcp->tcp_ip_forward_progress = B_FALSE;
   6708 	tcp->tcp_ecn_ok = B_FALSE;
   6709 
   6710 	tcp->tcp_cwr = B_FALSE;
   6711 	tcp->tcp_ecn_echo_on = B_FALSE;
   6712 	tcp->tcp_is_wnd_shrnk = B_FALSE;
   6713 
   6714 	if (tcp->tcp_sack_info != NULL) {
   6715 		if (tcp->tcp_notsack_list != NULL) {
   6716 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   6717 			    tcp);
   6718 		}
   6719 		kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
   6720 		tcp->tcp_sack_info = NULL;
   6721 	}
   6722 
   6723 	tcp->tcp_rcv_ws = 0;
   6724 	tcp->tcp_snd_ws = 0;
   6725 	tcp->tcp_ts_recent = 0;
   6726 	tcp->tcp_rnxt = 0;			/* Displayed in mib */
   6727 	DONTCARE(tcp->tcp_rwnd);		/* Set in tcp_reinit() */
   6728 	tcp->tcp_initial_pmtu = 0;
   6729 
   6730 	ASSERT(tcp->tcp_reass_head == NULL);
   6731 	ASSERT(tcp->tcp_reass_tail == NULL);
   6732 
   6733 	tcp->tcp_cwnd_cnt = 0;
   6734 
   6735 	ASSERT(tcp->tcp_rcv_list == NULL);
   6736 	ASSERT(tcp->tcp_rcv_last_head == NULL);
   6737 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
   6738 	ASSERT(tcp->tcp_rcv_cnt == 0);
   6739 
   6740 	DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
   6741 	DONTCARE(tcp->tcp_cwnd_max);		/* Init in tcp_init_values */
   6742 	tcp->tcp_csuna = 0;
   6743 
   6744 	tcp->tcp_rto = 0;			/* Displayed in MIB */
   6745 	DONTCARE(tcp->tcp_rtt_sa);		/* Init in tcp_init_values */
   6746 	DONTCARE(tcp->tcp_rtt_sd);		/* Init in tcp_init_values */
   6747 	tcp->tcp_rtt_update = 0;
   6748 
   6749 	DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6750 	DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6751 
   6752 	tcp->tcp_rack = 0;			/* Displayed in mib */
   6753 	tcp->tcp_rack_cnt = 0;
   6754 	tcp->tcp_rack_cur_max = 0;
   6755 	tcp->tcp_rack_abs_max = 0;
   6756 
   6757 	tcp->tcp_max_swnd = 0;
   6758 
   6759 	ASSERT(tcp->tcp_listener == NULL);
   6760 
   6761 	DONTCARE(tcp->tcp_irs);			/* tcp_valid_bits cleared */
   6762 	DONTCARE(tcp->tcp_iss);			/* tcp_valid_bits cleared */
   6763 	DONTCARE(tcp->tcp_fss);			/* tcp_valid_bits cleared */
   6764 	DONTCARE(tcp->tcp_urg);			/* tcp_valid_bits cleared */
   6765 
   6766 	ASSERT(tcp->tcp_conn_req_cnt_q == 0);
   6767 	ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
   6768 	PRESERVE(tcp->tcp_conn_req_max);
   6769 	PRESERVE(tcp->tcp_conn_req_seqnum);
   6770 
   6771 	DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
   6772 	DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
   6773 	DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
   6774 	DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
   6775 
   6776 	DONTCARE(tcp->tcp_urp_last);	/* tcp_urp_last_valid is cleared */
   6777 	ASSERT(tcp->tcp_urp_mp == NULL);
   6778 	ASSERT(tcp->tcp_urp_mark_mp == NULL);
   6779 	ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
   6780 
   6781 	ASSERT(tcp->tcp_eager_next_q == NULL);
   6782 	ASSERT(tcp->tcp_eager_last_q == NULL);
   6783 	ASSERT((tcp->tcp_eager_next_q0 == NULL &&
   6784 	    tcp->tcp_eager_prev_q0 == NULL) ||
   6785 	    tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
   6786 	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
   6787 
   6788 	ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
   6789 	    tcp->tcp_eager_prev_drop_q0 == NULL) ||
   6790 	    tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
   6791 
   6792 	tcp->tcp_client_errno = 0;
   6793 
   6794 	DONTCARE(connp->conn_sum);		/* Init in tcp_init_values */
   6795 
   6796 	connp->conn_faddr_v6 = ipv6_all_zeros;	/* Displayed in MIB */
   6797 
   6798 	PRESERVE(connp->conn_bound_addr_v6);
   6799 	tcp->tcp_last_sent_len = 0;
   6800 	tcp->tcp_dupack_cnt = 0;
   6801 
   6802 	connp->conn_fport = 0;			/* Displayed in MIB */
   6803 	PRESERVE(connp->conn_lport);
   6804 
   6805 	PRESERVE(tcp->tcp_acceptor_lockp);
   6806 
   6807 	ASSERT(tcp->tcp_ordrel_mp == NULL);
   6808 	PRESERVE(tcp->tcp_acceptor_id);
   6809 	DONTCARE(tcp->tcp_ipsec_overhead);
   6810 
   6811 	PRESERVE(connp->conn_family);
   6812 	/* Remove any remnants of mapped address binding */
   6813 	if (connp->conn_family == AF_INET6) {
   6814 		connp->conn_ipversion = IPV6_VERSION;
   6815 		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   6816 	} else {
   6817 		connp->conn_ipversion = IPV4_VERSION;
   6818 		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   6819 	}
   6820 
   6821 	connp->conn_bound_if = 0;
   6822 	connp->conn_recv_ancillary.crb_all = 0;
   6823 	tcp->tcp_recvifindex = 0;
   6824 	tcp->tcp_recvhops = 0;
   6825 	tcp->tcp_closed = 0;
   6826 	tcp->tcp_cleandeathtag = 0;
   6827 	if (tcp->tcp_hopopts != NULL) {
   6828 		mi_free(tcp->tcp_hopopts);
   6829 		tcp->tcp_hopopts = NULL;
   6830 		tcp->tcp_hopoptslen = 0;
   6831 	}
   6832 	ASSERT(tcp->tcp_hopoptslen == 0);
   6833 	if (tcp->tcp_dstopts != NULL) {
   6834 		mi_free(tcp->tcp_dstopts);
   6835 		tcp->tcp_dstopts = NULL;
   6836 		tcp->tcp_dstoptslen = 0;
   6837 	}
   6838 	ASSERT(tcp->tcp_dstoptslen == 0);
   6839 	if (tcp->tcp_rthdrdstopts != NULL) {
   6840 		mi_free(tcp->tcp_rthdrdstopts);
   6841 		tcp->tcp_rthdrdstopts = NULL;
   6842 		tcp->tcp_rthdrdstoptslen = 0;
   6843 	}
   6844 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   6845 	if (tcp->tcp_rthdr != NULL) {
   6846 		mi_free(tcp->tcp_rthdr);
   6847 		tcp->tcp_rthdr = NULL;
   6848 		tcp->tcp_rthdrlen = 0;
   6849 	}
   6850 	ASSERT(tcp->tcp_rthdrlen == 0);
   6851 
   6852 	/* Reset fusion-related fields */
   6853 	tcp->tcp_fused = B_FALSE;
   6854 	tcp->tcp_unfusable = B_FALSE;
   6855 	tcp->tcp_fused_sigurg = B_FALSE;
   6856 	tcp->tcp_loopback_peer = NULL;
   6857 
   6858 	tcp->tcp_lso = B_FALSE;
   6859 
   6860 	tcp->tcp_in_ack_unsent = 0;
   6861 	tcp->tcp_cork = B_FALSE;
   6862 	tcp->tcp_tconnind_started = B_FALSE;
   6863 
   6864 	PRESERVE(tcp->tcp_squeue_bytes);
   6865 
   6866 	ASSERT(tcp->tcp_kssl_ctx == NULL);
   6867 	ASSERT(!tcp->tcp_kssl_pending);
   6868 	PRESERVE(tcp->tcp_kssl_ent);
   6869 
   6870 	tcp->tcp_closemp_used = B_FALSE;
   6871 
   6872 	PRESERVE(tcp->tcp_rsrv_mp);
   6873 	PRESERVE(tcp->tcp_rsrv_mp_lock);
   6874 
   6875 #ifdef DEBUG
   6876 	DONTCARE(tcp->tcmp_stk[0]);
   6877 #endif
   6878 
   6879 	PRESERVE(tcp->tcp_connid);
   6880 
   6881 	ASSERT(tcp->tcp_listen_cnt == NULL);
   6882 	ASSERT(tcp->tcp_reass_tid == 0);
   6883 
   6884 #undef	DONTCARE
   6885 #undef	PRESERVE
   6886 }
   6887 
   6888 static void
   6889 tcp_init_values(tcp_t *tcp)
   6890 {
   6891 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6892 	conn_t		*connp = tcp->tcp_connp;
   6893 
   6894 	ASSERT((connp->conn_family == AF_INET &&
   6895 	    connp->conn_ipversion == IPV4_VERSION) ||
   6896 	    (connp->conn_family == AF_INET6 &&
   6897 	    (connp->conn_ipversion == IPV4_VERSION ||
   6898 	    connp->conn_ipversion == IPV6_VERSION)));
   6899 
   6900 	/*
   6901 	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
   6902 	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
   6903 	 * allow the algorithm to adjust slowly to large fluctuations of RTT
   6904 	 * during first few transmissions of a connection as seen in slow
   6905 	 * links.
   6906 	 */
   6907 	tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
   6908 	tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
   6909 	tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   6910 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
   6911 	    tcps->tcps_conn_grace_period;
   6912 	if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
   6913 		tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   6914 	tcp->tcp_timer_backoff = 0;
   6915 	tcp->tcp_ms_we_have_waited = 0;
   6916 	tcp->tcp_last_recv_time = ddi_get_lbolt();
   6917 	tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
   6918 	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   6919 	tcp->tcp_snd_burst = TCP_CWND_INFINITE;
   6920 
   6921 	tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
   6922 
   6923 	tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
   6924 	tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
   6925 	tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
   6926 	/*
   6927 	 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
   6928 	 * passive open.
   6929 	 */
   6930 	tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
   6931 
   6932 	tcp->tcp_naglim = tcps->tcps_naglim_def;
   6933 
   6934 	/* NOTE:  ISS is now set in tcp_set_destination(). */
   6935 
   6936 	/* Reset fusion-related fields */
   6937 	tcp->tcp_fused = B_FALSE;
   6938 	tcp->tcp_unfusable = B_FALSE;
   6939 	tcp->tcp_fused_sigurg = B_FALSE;
   6940 	tcp->tcp_loopback_peer = NULL;
   6941 
   6942 	/* We rebuild the header template on the next connect/conn_request */
   6943 
   6944 	connp->conn_mlp_type = mlptSingle;
   6945 
   6946 	/*
   6947 	 * Init the window scale to the max so tcp_rwnd_set() won't pare
   6948 	 * down tcp_rwnd. tcp_set_destination() will set the right value later.
   6949 	 */
   6950 	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
   6951 	tcp->tcp_rwnd = connp->conn_rcvbuf;
   6952 
   6953 	tcp->tcp_cork = B_FALSE;
   6954 	/*
   6955 	 * Init the tcp_debug option if it wasn't already set.  This value
   6956 	 * determines whether TCP
   6957 	 * calls strlog() to print out debug messages.  Doing this
   6958 	 * initialization here means that this value is not inherited thru
   6959 	 * tcp_reinit().
   6960 	 */
   6961 	if (!connp->conn_debug)
   6962 		connp->conn_debug = tcps->tcps_dbg;
   6963 
   6964 	tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
   6965 	tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
   6966 }
   6967 
/*
 * At minimum we need 8 bytes in the TCP header for the lookup.
 * The ICMP error handlers in this file drop any message whose embedded
 * TCP header carries fewer than this many bytes, and only then read
 * tha_seq from it.
 */
#define	ICMP_MIN_TCP_HDR	8
   6970 
   6971 /*
   6972  * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
   6973  * passed up by IP. The message is always received on the correct tcp_t.
   6974  * Assumes that IP has pulled up everything up to and including the ICMP header.
   6975  */
   6976 /* ARGSUSED2 */
   6977 static void
   6978 tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   6979 {
   6980 	conn_t		*connp = (conn_t *)arg1;
   6981 	icmph_t		*icmph;
   6982 	ipha_t		*ipha;
   6983 	int		iph_hdr_length;
   6984 	tcpha_t		*tcpha;
   6985 	uint32_t	seg_seq;
   6986 	tcp_t		*tcp = connp->conn_tcp;
   6987 
   6988 	/* Assume IP provides aligned packets */
   6989 	ASSERT(OK_32PTR(mp->b_rptr));
   6990 	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));
   6991 
   6992 	/*
   6993 	 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
   6994 	 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
   6995 	 */
   6996 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
   6997 		tcp_icmp_error_ipv6(tcp, mp, ira);
   6998 		return;
   6999 	}
   7000 
   7001 	/* Skip past the outer IP and ICMP headers */
   7002 	iph_hdr_length = ira->ira_ip_hdr_length;
   7003 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   7004 	/*
   7005 	 * If we don't have the correct outer IP header length
   7006 	 * or if we don't have a complete inner IP header
   7007 	 * drop it.
   7008 	 */
   7009 	if (iph_hdr_length < sizeof (ipha_t) ||
   7010 	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
   7011 noticmpv4:
   7012 		freemsg(mp);
   7013 		return;
   7014 	}
   7015 	ipha = (ipha_t *)&icmph[1];
   7016 
   7017 	/* Skip past the inner IP and find the ULP header */
   7018 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   7019 	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
   7020 	/*
   7021 	 * If we don't have the correct inner IP header length or if the ULP
   7022 	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
   7023 	 * bytes of TCP header, drop it.
   7024 	 */
   7025 	if (iph_hdr_length < sizeof (ipha_t) ||
   7026 	    ipha->ipha_protocol != IPPROTO_TCP ||
   7027 	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
   7028 		goto noticmpv4;
   7029 	}
   7030 
   7031 	seg_seq = ntohl(tcpha->tha_seq);
   7032 	switch (icmph->icmph_type) {
   7033 	case ICMP_DEST_UNREACHABLE:
   7034 		switch (icmph->icmph_code) {
   7035 		case ICMP_FRAGMENTATION_NEEDED:
   7036 			/*
   7037 			 * Update Path MTU, then try to send something out.
   7038 			 */
   7039 			tcp_update_pmtu(tcp, B_TRUE);
   7040 			tcp_rexmit_after_error(tcp);
   7041 			break;
   7042 		case ICMP_PORT_UNREACHABLE:
   7043 		case ICMP_PROTOCOL_UNREACHABLE:
   7044 			switch (tcp->tcp_state) {
   7045 			case TCPS_SYN_SENT:
   7046 			case TCPS_SYN_RCVD:
   7047 				/*
   7048 				 * ICMP can snipe away incipient
   7049 				 * TCP connections as long as
   7050 				 * seq number is same as initial
   7051 				 * send seq number.
   7052 				 */
   7053 				if (seg_seq == tcp->tcp_iss) {
   7054 					(void) tcp_clean_death(tcp,
   7055 					    ECONNREFUSED, 6);
   7056 				}
   7057 				break;
   7058 			}
   7059 			break;
   7060 		case ICMP_HOST_UNREACHABLE:
   7061 		case ICMP_NET_UNREACHABLE:
   7062 			/* Record the error in case we finally time out. */
   7063 			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
   7064 				tcp->tcp_client_errno = EHOSTUNREACH;
   7065 			else
   7066 				tcp->tcp_client_errno = ENETUNREACH;
   7067 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
   7068 				if (tcp->tcp_listener != NULL &&
   7069 				    tcp->tcp_listener->tcp_syn_defense) {
   7070 					/*
   7071 					 * Ditch the half-open connection if we
   7072 					 * suspect a SYN attack is under way.
   7073 					 */
   7074 					(void) tcp_clean_death(tcp,
   7075 					    tcp->tcp_client_errno, 7);
   7076 				}
   7077 			}
   7078 			break;
   7079 		default:
   7080 			break;
   7081 		}
   7082 		break;
   7083 	case ICMP_SOURCE_QUENCH: {
   7084 		/*
   7085 		 * use a global boolean to control
   7086 		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
   7087 		 * The default is false.
   7088 		 */
   7089 		if (tcp_icmp_source_quench) {
   7090 			/*
   7091 			 * Reduce the sending rate as if we got a
   7092 			 * retransmit timeout
   7093 			 */
   7094 			uint32_t npkt;
   7095 
   7096 			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
   7097 			    tcp->tcp_mss;
   7098 			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
   7099 			tcp->tcp_cwnd = tcp->tcp_mss;
   7100 			tcp->tcp_cwnd_cnt = 0;
   7101 		}
   7102 		break;
   7103 	}
   7104 	}
   7105 	freemsg(mp);
   7106 }
   7107 
   7108 /*
   7109  * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
   7110  * change. But it can refer to fields like tcp_suna and tcp_snxt.
   7111  *
   7112  * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
   7113  * error messages received by IP. The message is always received on the correct
   7114  * tcp_t.
   7115  */
   7116 /* ARGSUSED */
   7117 static boolean_t
   7118 tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
   7119     ip_recv_attr_t *ira)
   7120 {
   7121 	tcpha_t		*tcpha = (tcpha_t *)arg2;
   7122 	uint32_t	seq = ntohl(tcpha->tha_seq);
   7123 	tcp_t		*tcp = connp->conn_tcp;
   7124 
   7125 	/*
   7126 	 * TCP sequence number contained in payload of the ICMP error message
   7127 	 * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
   7128 	 * the message is either a stale ICMP error, or an attack from the
   7129 	 * network. Fail the verification.
   7130 	 */
   7131 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
   7132 		return (B_FALSE);
   7133 
   7134 	/* For "too big" we also check the ignore flag */
   7135 	if (ira->ira_flags & IRAF_IS_IPV4) {
   7136 		ASSERT(icmph != NULL);
   7137 		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   7138 		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
   7139 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   7140 			return (B_FALSE);
   7141 	} else {
   7142 		ASSERT(icmp6 != NULL);
   7143 		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
   7144 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   7145 			return (B_FALSE);
   7146 	}
   7147 	return (B_TRUE);
   7148 }
   7149 
   7150 /*
   7151  * Update the TCP connection according to change of PMTU.
   7152  *
   7153  * Path MTU might have changed by either increase or decrease, so need to
   7154  * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
   7155  * or negative MSS, since tcp_mss_set() will do it.
   7156  */
   7157 static void
   7158 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
   7159 {
   7160 	uint32_t	pmtu;
   7161 	int32_t		mss;
   7162 	conn_t		*connp = tcp->tcp_connp;
   7163 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
   7164 	iaflags_t	ixaflags;
   7165 
   7166 	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
   7167 		return;
   7168 
   7169 	if (tcp->tcp_state < TCPS_ESTABLISHED)
   7170 		return;
   7171 
   7172 	/*
   7173 	 * Always call ip_get_pmtu() to make sure that IP has updated
   7174 	 * ixa_flags properly.
   7175 	 */
   7176 	pmtu = ip_get_pmtu(ixa);
   7177 	ixaflags = ixa->ixa_flags;
   7178 
   7179 	/*
   7180 	 * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and
   7181 	 * IPsec overhead if applied. Make sure to use the most recent
   7182 	 * IPsec information.
   7183 	 */
   7184 	mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp);
   7185 
   7186 	/*
   7187 	 * Nothing to change, so just return.
   7188 	 */
   7189 	if (mss == tcp->tcp_mss)
   7190 		return;
   7191 
   7192 	/*
   7193 	 * Currently, for ICMP errors, only PMTU decrease is handled.
   7194 	 */
   7195 	if (mss > tcp->tcp_mss && decrease_only)
   7196 		return;
   7197 
   7198 	DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
   7199 
   7200 	/*
   7201 	 * Update ixa_fragsize and ixa_pmtu.
   7202 	 */
   7203 	ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
   7204 
   7205 	/*
   7206 	 * Adjust MSS and all relevant variables.
   7207 	 */
   7208 	tcp_mss_set(tcp, mss);
   7209 
   7210 	/*
   7211 	 * If the PMTU is below the min size maintained by IP, then ip_get_pmtu
   7212 	 * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP
   7213 	 * has a (potentially different) min size we do the same. Make sure to
   7214 	 * clear IXAF_DONTFRAG, which is used by IP to decide whether to
   7215 	 * fragment the packet.
   7216 	 *
   7217 	 * LSO over IPv6 can not be fragmented. So need to disable LSO
   7218 	 * when IPv6 fragmentation is needed.
   7219 	 */
   7220 	if (mss < tcp->tcp_tcps->tcps_mss_min)
   7221 		ixaflags |= IXAF_PMTU_TOO_SMALL;
   7222 
   7223 	if (ixaflags & IXAF_PMTU_TOO_SMALL)
   7224 		ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
   7225 
   7226 	if ((connp->conn_ipversion == IPV4_VERSION) &&
   7227 	    !(ixaflags & IXAF_PMTU_IPV4_DF)) {
   7228 		tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
   7229 	}
   7230 	ixa->ixa_flags = ixaflags;
   7231 }
   7232 
   7233 /*
   7234  * Do slow start retransmission after ICMP errors of PMTU changes.
   7235  */
   7236 static void
   7237 tcp_rexmit_after_error(tcp_t *tcp)
   7238 {
   7239 	/*
   7240 	 * All sent data has been acknowledged or no data left to send, just
   7241 	 * to return.
   7242 	 */
   7243 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
   7244 	    (tcp->tcp_xmit_head == NULL))
   7245 		return;
   7246 
   7247 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
   7248 		tcp->tcp_rexmit_max = tcp->tcp_fss;
   7249 	else
   7250 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
   7251 
   7252 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
   7253 	tcp->tcp_rexmit = B_TRUE;
   7254 	tcp->tcp_dupack_cnt = 0;
   7255 	tcp->tcp_snd_burst = TCP_CWND_SS;
   7256 	tcp_ss_rexmit(tcp);
   7257 }
   7258 
   7259 /*
   7260  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
   7261  * error messages passed up by IP.
   7262  * Assumes that IP has pulled up all the extension headers as well
   7263  * as the ICMPv6 header.
   7264  */
   7265 static void
   7266 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
   7267 {
   7268 	icmp6_t		*icmp6;
   7269 	ip6_t		*ip6h;
   7270 	uint16_t	iph_hdr_length = ira->ira_ip_hdr_length;
   7271 	tcpha_t		*tcpha;
   7272 	uint8_t		*nexthdrp;
   7273 	uint32_t	seg_seq;
   7274 
   7275 	/*
   7276 	 * Verify that we have a complete IP header.
   7277 	 */
   7278 	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));
   7279 
   7280 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
   7281 	ip6h = (ip6_t *)&icmp6[1];
   7282 	/*
   7283 	 * Verify if we have a complete ICMP and inner IP header.
   7284 	 */
   7285 	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
   7286 noticmpv6:
   7287 		freemsg(mp);
   7288 		return;
   7289 	}
   7290 
   7291 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
   7292 		goto noticmpv6;
   7293 	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
   7294 	/*
   7295 	 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
   7296 	 * have at least ICMP_MIN_TCP_HDR bytes of  TCP header drop the
   7297 	 * packet.
   7298 	 */
   7299 	if ((*nexthdrp != IPPROTO_TCP) ||
   7300 	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
   7301 		goto noticmpv6;
   7302 	}
   7303 
   7304 	seg_seq = ntohl(tcpha->tha_seq);
   7305 	switch (icmp6->icmp6_type) {
   7306 	case ICMP6_PACKET_TOO_BIG:
   7307 		/*
   7308 		 * Update Path MTU, then try to send something out.
   7309 		 */
   7310 		tcp_update_pmtu(tcp, B_TRUE);
   7311 		tcp_rexmit_after_error(tcp);
   7312 		break;
   7313 	case ICMP6_DST_UNREACH:
   7314 		switch (icmp6->icmp6_code) {
   7315 		case ICMP6_DST_UNREACH_NOPORT:
   7316 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7317 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7318 			    (seg_seq == tcp->tcp_iss)) {
   7319 				(void) tcp_clean_death(tcp,
   7320 				    ECONNREFUSED, 8);
   7321 			}
   7322 			break;
   7323 		case ICMP6_DST_UNREACH_ADMIN:
   7324 		case ICMP6_DST_UNREACH_NOROUTE:
   7325 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
   7326 		case ICMP6_DST_UNREACH_ADDR:
   7327 			/* Record the error in case we finally time out. */
   7328 			tcp->tcp_client_errno = EHOSTUNREACH;
   7329 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7330 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7331 			    (seg_seq == tcp->tcp_iss)) {
   7332 				if (tcp->tcp_listener != NULL &&
   7333 				    tcp->tcp_listener->tcp_syn_defense) {
   7334 					/*
   7335 					 * Ditch the half-open connection if we
   7336 					 * suspect a SYN attack is under way.
   7337 					 */
   7338 					(void) tcp_clean_death(tcp,
   7339 					    tcp->tcp_client_errno, 9);
   7340 				}
   7341 			}
   7342 
   7343 
   7344 			break;
   7345 		default:
   7346 			break;
   7347 		}
   7348 		break;
   7349 	case ICMP6_PARAM_PROB:
   7350 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
   7351 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
   7352 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
   7353 		    (uchar_t *)nexthdrp) {
   7354 			if (tcp->tcp_state == TCPS_SYN_SENT ||
   7355 			    tcp->tcp_state == TCPS_SYN_RCVD) {
   7356 				(void) tcp_clean_death(tcp,
   7357 				    ECONNREFUSED, 10);
   7358 			}
   7359 			break;
   7360 		}
   7361 		break;
   7362 
   7363 	case ICMP6_TIME_EXCEEDED:
   7364 	default:
   7365 		break;
   7366 	}
   7367 	freemsg(mp);
   7368 }
   7369 
   7370 /*
   7371  * Notify IP that we are having trouble with this connection.  IP should
   7372  * make note so it can potentially use a different IRE.
   7373  */
   7374 static void
   7375 tcp_ip_notify(tcp_t *tcp)
   7376 {
   7377 	conn_t		*connp = tcp->tcp_connp;
   7378 	ire_t		*ire;
   7379 
   7380 	/*
   7381 	 * Note: in the case of source routing we want to blow away the
   7382 	 * route to the first source route hop.
   7383 	 */
   7384 	ire = connp->conn_ixa->ixa_ire;
   7385 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   7386 		if (ire->ire_ipversion == IPV4_VERSION) {
   7387 			/*
   7388 			 * As per RFC 1122, we send an RTM_LOSING to inform
   7389 			 * routing protocols.
   7390 			 */
   7391 			ip_rts_change(RTM_LOSING, ire->ire_addr,
   7392 			    ire->ire_gateway_addr, ire->ire_mask,
   7393 			    connp->conn_laddr_v4,  0, 0, 0,
   7394 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
   7395 			    ire->ire_ipst);
   7396 		}
   7397 		(void) ire_no_good(ire);
   7398 	}
   7399 }
   7400 
   7401 #pragma inline(tcp_send_data)
   7402 
   7403 /*
   7404  * Timer callback routine for keepalive probe.  We do a fake resend of
   7405  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
   7406  * check to see if we have heard anything from the other end for the last
   7407  * RTO period.  If we have, set the timer to expire for another
   7408  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
   7409  * RTO << 1 and check again when it expires.  Keep exponentially increasing
   7410  * the timeout if we have not heard from the other side.  If for more than
   7411  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
   7412  * kill the connection unless the keepalive abort threshold is 0.  In
   7413  * that case, we will probe "forever."
   7414  */
   7415 static void
   7416 tcp_keepalive_killer(void *arg)
   7417 {
   7418 	mblk_t	*mp;
   7419 	conn_t	*connp = (conn_t *)arg;
   7420 	tcp_t  	*tcp = connp->conn_tcp;
   7421 	int32_t	firetime;
   7422 	int32_t	idletime;
   7423 	int32_t	ka_intrvl;
   7424 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7425 
   7426 	tcp->tcp_ka_tid = 0;
   7427 
   7428 	if (tcp->tcp_fused)
   7429 		return;
   7430 
   7431 	BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
   7432 	ka_intrvl = tcp->tcp_ka_interval;
   7433 
   7434 	/*
   7435 	 * Keepalive probe should only be sent if the application has not
   7436 	 * done a close on the connection.
   7437 	 */
   7438 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
   7439 		return;
   7440 	}
   7441 	/* Timer fired too early, restart it. */
   7442 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
   7443 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7444 		    MSEC_TO_TICK(ka_intrvl));
   7445 		return;
   7446 	}
   7447 
   7448 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
   7449 	/*
   7450 	 * If we have not heard from the other side for a long
   7451 	 * time, kill the connection unless the keepalive abort
   7452 	 * threshold is 0.  In that case, we will probe "forever."
   7453 	 */
   7454 	if (tcp->tcp_ka_abort_thres != 0 &&
   7455 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
   7456 		BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
   7457 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
   7458 		    tcp->tcp_client_errno : ETIMEDOUT, 11);
   7459 		return;
   7460 	}
   7461 
   7462 	if (tcp->tcp_snxt == tcp->tcp_suna &&
   7463 	    idletime >= ka_intrvl) {
   7464 		/* Fake resend of last ACKed byte. */
   7465 		mblk_t	*mp1 = allocb(1, BPRI_LO);
   7466 
   7467 		if (mp1 != NULL) {
   7468 			*mp1->b_wptr++ = '\0';
   7469 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
   7470 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
   7471 			freeb(mp1);
   7472 			/*
   7473 			 * if allocation failed, fall through to start the
   7474 			 * timer back.
   7475 			 */
   7476 			if (mp != NULL) {
   7477 				tcp_send_data(tcp, mp);
   7478 				BUMP_MIB(&tcps->tcps_mib,
   7479 				    tcpTimKeepaliveProbe);
   7480 				if (tcp->tcp_ka_last_intrvl != 0) {
   7481 					int max;
   7482 					/*
   7483 					 * We should probe again at least
   7484 					 * in ka_intrvl, but not more than
   7485 					 * tcp_rexmit_interval_max.
   7486 					 */
   7487 					max = tcps->tcps_rexmit_interval_max;
   7488 					firetime = MIN(ka_intrvl - 1,
   7489 					    tcp->tcp_ka_last_intrvl << 1);
   7490 					if (firetime > max)
   7491 						firetime = max;
   7492 				} else {
   7493 					firetime = tcp->tcp_rto;
   7494 				}
   7495 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
   7496 				    tcp_keepalive_killer,
   7497 				    MSEC_TO_TICK(firetime));
   7498 				tcp->tcp_ka_last_intrvl = firetime;
   7499 				return;
   7500 			}
   7501 		}
   7502 	} else {
   7503 		tcp->tcp_ka_last_intrvl = 0;
   7504 	}
   7505 
   7506 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
   7507 	if ((firetime = ka_intrvl - idletime) < 0) {
   7508 		firetime = ka_intrvl;
   7509 	}
   7510 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7511 	    MSEC_TO_TICK(firetime));
   7512 }
   7513 
   7514 int
   7515 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
   7516 {
   7517 	conn_t	*connp = tcp->tcp_connp;
   7518 	queue_t	*q = connp->conn_rq;
   7519 	int32_t	mss = tcp->tcp_mss;
   7520 	int	maxpsz;
   7521 
   7522 	if (TCP_IS_DETACHED(tcp))
   7523 		return (mss);
   7524 	if (tcp->tcp_fused) {
   7525 		maxpsz = tcp_fuse_maxpsz(tcp);
   7526 		mss = INFPSZ;
   7527 	} else if (tcp->tcp_maxpsz_multiplier == 0) {
   7528 		/*
   7529 		 * Set the sd_qn_maxpsz according to the socket send buffer
   7530 		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
   7531 		 * instruct the stream head to copyin user data into contiguous
   7532 		 * kernel-allocated buffers without breaking it up into smaller
   7533 		 * chunks.  We round up the buffer size to the nearest SMSS.
   7534 		 */
   7535 		maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss);
   7536 		if (tcp->tcp_kssl_ctx == NULL)
   7537 			mss = INFPSZ;
   7538 		else
   7539 			mss = SSL3_MAX_RECORD_LEN;
   7540 	} else {
   7541 		/*
   7542 		 * Set sd_qn_maxpsz to approx half the (receivers) buffer
   7543 		 * (and a multiple of the mss).  This instructs the stream
   7544 		 * head to break down larger than SMSS writes into SMSS-
   7545 		 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
   7546 		 */
   7547 		maxpsz = tcp->tcp_maxpsz_multiplier * mss;
   7548 		if (maxpsz > connp->conn_sndbuf / 2) {
   7549 			maxpsz = connp->conn_sndbuf / 2;
   7550 			/* Round up to nearest mss */
   7551 			maxpsz = MSS_ROUNDUP(maxpsz, mss);
   7552 		}
   7553 	}
   7554 
   7555 	(void) proto_set_maxpsz(q, connp, maxpsz);
   7556 	if (!(IPCL_IS_NONSTR(connp)))
   7557 		connp->conn_wq->q_maxpsz = maxpsz;
   7558 	if (set_maxblk)
   7559 		(void) proto_set_tx_maxblk(q, connp, mss);
   7560 	return (mss);
   7561 }
   7562 
   7563 /*
   7564  * Extract option values from a tcp header.  We put any found values into the
   7565  * tcpopt struct and return a bitmask saying which options were found.
   7566  */
   7567 static int
   7568 tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
   7569 {
   7570 	uchar_t		*endp;
   7571 	int		len;
   7572 	uint32_t	mss;
   7573 	uchar_t		*up = (uchar_t *)tcpha;
   7574 	int		found = 0;
   7575 	int32_t		sack_len;
   7576 	tcp_seq		sack_begin, sack_end;
   7577 	tcp_t		*tcp;
   7578 
   7579 	endp = up + TCP_HDR_LENGTH(tcpha);
   7580 	up += TCP_MIN_HEADER_LENGTH;
   7581 	while (up < endp) {
   7582 		len = endp - up;
   7583 		switch (*up) {
   7584 		case TCPOPT_EOL:
   7585 			break;
   7586 
   7587 		case TCPOPT_NOP:
   7588 			up++;
   7589 			continue;
   7590 
   7591 		case TCPOPT_MAXSEG:
   7592 			if (len < TCPOPT_MAXSEG_LEN ||
   7593 			    up[1] != TCPOPT_MAXSEG_LEN)
   7594 				break;
   7595 
   7596 			mss = BE16_TO_U16(up+2);
   7597 			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
   7598 			tcpopt->tcp_opt_mss = mss;
   7599 			found |= TCP_OPT_MSS_PRESENT;
   7600 
   7601 			up += TCPOPT_MAXSEG_LEN;
   7602 			continue;
   7603 
   7604 		case TCPOPT_WSCALE:
   7605 			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
   7606 				break;
   7607 
   7608 			if (up[2] > TCP_MAX_WINSHIFT)
   7609 				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
   7610 			else
   7611 				tcpopt->tcp_opt_wscale = up[2];
   7612 			found |= TCP_OPT_WSCALE_PRESENT;
   7613 
   7614 			up += TCPOPT_WS_LEN;
   7615 			continue;
   7616 
   7617 		case TCPOPT_SACK_PERMITTED:
   7618 			if (len < TCPOPT_SACK_OK_LEN ||
   7619 			    up[1] != TCPOPT_SACK_OK_LEN)
   7620 				break;
   7621 			found |= TCP_OPT_SACK_OK_PRESENT;
   7622 			up += TCPOPT_SACK_OK_LEN;
   7623 			continue;
   7624 
   7625 		case TCPOPT_SACK:
   7626 			if (len <= 2 || up[1] <= 2 || len < up[1])
   7627 				break;
   7628 
   7629 			/* If TCP is not interested in SACK blks... */
   7630 			if ((tcp = tcpopt->tcp) == NULL) {
   7631 				up += up[1];
   7632 				continue;
   7633 			}
   7634 			sack_len = up[1] - TCPOPT_HEADER_LEN;
   7635 			up += TCPOPT_HEADER_LEN;
   7636 
   7637 			/*
   7638 			 * If the list is empty, allocate one and assume
   7639 			 * nothing is sack'ed.
   7640 			 */
   7641 			ASSERT(tcp->tcp_sack_info != NULL);
   7642 			if (tcp->tcp_notsack_list == NULL) {
   7643 				tcp_notsack_update(&(tcp->tcp_notsack_list),
   7644 				    tcp->tcp_suna, tcp->tcp_snxt,
   7645 				    &(tcp->tcp_num_notsack_blk),
   7646 				    &(tcp->tcp_cnt_notsack_list));
   7647 
   7648 				/*
   7649 				 * Make sure tcp_notsack_list is not NULL.
   7650 				 * This happens when kmem_alloc(KM_NOSLEEP)
   7651 				 * returns NULL.
   7652 				 */
   7653 				if (tcp->tcp_notsack_list == NULL) {
   7654 					up += sack_len;
   7655 					continue;
   7656 				}
   7657 				tcp->tcp_fack = tcp->tcp_suna;
   7658 			}
   7659 
   7660 			while (sack_len > 0) {
   7661 				if (up + 8 > endp) {
   7662 					up = endp;
   7663 					break;
   7664 				}
   7665 				sack_begin = BE32_TO_U32(up);
   7666 				up += 4;
   7667 				sack_end = BE32_TO_U32(up);
   7668 				up += 4;
   7669 				sack_len -= 8;
   7670 				/*
   7671 				 * Bounds checking.  Make sure the SACK
   7672 				 * info is within tcp_suna and tcp_snxt.
   7673 				 * If this SACK blk is out of bound, ignore
   7674 				 * it but continue to parse the following
   7675 				 * blks.
   7676 				 */
   7677 				if (SEQ_LEQ(sack_end, sack_begin) ||
   7678 				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
   7679 				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
   7680 					continue;
   7681 				}
   7682 				tcp_notsack_insert(&(tcp->tcp_notsack_list),
   7683 				    sack_begin, sack_end,
   7684 				    &(tcp->tcp_num_notsack_blk),
   7685 				    &(tcp->tcp_cnt_notsack_list));
   7686 				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
   7687 					tcp->tcp_fack = sack_end;
   7688 				}
   7689 			}
   7690 			found |= TCP_OPT_SACK_PRESENT;
   7691 			continue;
   7692 
   7693 		case TCPOPT_TSTAMP:
   7694 			if (len < TCPOPT_TSTAMP_LEN ||
   7695 			    up[1] != TCPOPT_TSTAMP_LEN)
   7696 				break;
   7697 
   7698 			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
   7699 			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
   7700 
   7701 			found |= TCP_OPT_TSTAMP_PRESENT;
   7702 
   7703 			up += TCPOPT_TSTAMP_LEN;
   7704 			continue;
   7705 
   7706 		default:
   7707 			if (len <= 1 || len < (int)up[1] || up[1] == 0)
   7708 				break;
   7709 			up += up[1];
   7710 			continue;
   7711 		}
   7712 		break;
   7713 	}
   7714 	return (found);
   7715 }
   7716 
   7717 /*
   7718  * Set the MSS associated with a particular tcp based on its current value,
   7719  * and a new one passed in. Observe minimums and maximums, and reset other
   7720  * state variables that we want to view as multiples of MSS.
   7721  *
   7722  * The value of MSS could be either increased or descreased.
   7723  */
   7724 static void
   7725 tcp_mss_set(tcp_t *tcp, uint32_t mss)
   7726 {
   7727 	uint32_t	mss_max;
   7728 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7729 	conn_t		*connp = tcp->tcp_connp;
   7730 
   7731 	if (connp->conn_ipversion == IPV4_VERSION)
   7732 		mss_max = tcps->tcps_mss_max_ipv4;
   7733 	else
   7734 		mss_max = tcps->tcps_mss_max_ipv6;
   7735 
   7736 	if (mss < tcps->tcps_mss_min)
   7737 		mss = tcps->tcps_mss_min;
   7738 	if (mss > mss_max)
   7739 		mss = mss_max;
   7740 	/*
   7741 	 * Unless naglim has been set by our client to
   7742 	 * a non-mss value, force naglim to track mss.
   7743 	 * This can help to aggregate small writes.
   7744 	 */
   7745 	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
   7746 		tcp->tcp_naglim = mss;
   7747 	/*
   7748 	 * TCP should be able to buffer at least 4 MSS data for obvious
   7749 	 * performance reason.
   7750 	 */
   7751 	if ((mss << 2) > connp->conn_sndbuf)
   7752 		connp->conn_sndbuf = mss << 2;
   7753 
   7754 	/*
   7755 	 * Set the send lowater to at least twice of MSS.
   7756 	 */
   7757 	if ((mss << 1) > connp->conn_sndlowat)
   7758 		connp->conn_sndlowat = mss << 1;
   7759 
   7760 	/*
   7761 	 * Update tcp_cwnd according to the new value of MSS. Keep the
   7762 	 * previous ratio to preserve the transmit rate.
   7763 	 */
   7764 	tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
   7765 	tcp->tcp_cwnd_cnt = 0;
   7766 
   7767 	tcp->tcp_mss = mss;
   7768 	(void) tcp_maxpsz_set(tcp, B_TRUE);
   7769 }
   7770 
   7771 /* For /dev/tcp aka AF_INET open */
   7772 static int
   7773 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7774 {
   7775 	return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
   7776 }
   7777 
   7778 /* For /dev/tcp6 aka AF_INET6 open */
   7779 static int
   7780 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7781 {
   7782 	return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
   7783 }
   7784 
   7785 static conn_t *
   7786 tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,