Home | History | Annotate | Download | only in tcp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/strsun.h>
     31 #include <sys/strsubr.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #define	_SUN_TPI_VERSION 2
     35 #include <sys/tihdr.h>
     36 #include <sys/timod.h>
     37 #include <sys/ddi.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/suntpi.h>
     40 #include <sys/xti_inet.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/sdt.h>
     44 #include <sys/vtrace.h>
     45 #include <sys/kmem.h>
     46 #include <sys/ethernet.h>
     47 #include <sys/cpuvar.h>
     48 #include <sys/dlpi.h>
     49 #include <sys/pattr.h>
     50 #include <sys/policy.h>
     51 #include <sys/priv.h>
     52 #include <sys/zone.h>
     53 #include <sys/sunldi.h>
     54 
     55 #include <sys/errno.h>
     56 #include <sys/signal.h>
     57 #include <sys/socket.h>
     58 #include <sys/socketvar.h>
     59 #include <sys/sockio.h>
     60 #include <sys/isa_defs.h>
     61 #include <sys/md5.h>
     62 #include <sys/random.h>
     63 #include <sys/uio.h>
     64 #include <sys/systm.h>
     65 #include <netinet/in.h>
     66 #include <netinet/tcp.h>
     67 #include <netinet/ip6.h>
     68 #include <netinet/icmp6.h>
     69 #include <net/if.h>
     70 #include <net/route.h>
     71 #include <inet/ipsec_impl.h>
     72 
     73 #include <inet/common.h>
     74 #include <inet/ip.h>
     75 #include <inet/ip_impl.h>
     76 #include <inet/ip6.h>
     77 #include <inet/ip_ndp.h>
     78 #include <inet/proto_set.h>
     79 #include <inet/mib2.h>
     80 #include <inet/nd.h>
     81 #include <inet/optcom.h>
     82 #include <inet/snmpcom.h>
     83 #include <inet/kstatcom.h>
     84 #include <inet/tcp.h>
     85 #include <inet/tcp_impl.h>
     86 #include <inet/udp_impl.h>
     87 #include <net/pfkeyv2.h>
     88 #include <inet/ipdrop.h>
     89 
     90 #include <inet/ipclassifier.h>
     91 #include <inet/ip_ire.h>
     92 #include <inet/ip_ftable.h>
     93 #include <inet/ip_if.h>
     94 #include <inet/ipp_common.h>
     95 #include <inet/ip_rts.h>
     96 #include <inet/ip_netinfo.h>
     97 #include <sys/squeue_impl.h>
     98 #include <sys/squeue.h>
     99 #include <inet/kssl/ksslapi.h>
    100 #include <sys/tsol/label.h>
    101 #include <sys/tsol/tnet.h>
    102 #include <rpc/pmap_prot.h>
    103 #include <sys/callo.h>
    104 
    105 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    106 
    107 /*
    108  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    109  *
    110  * (Read the detailed design doc in PSARC case directory)
    111  *
    112  * The entire tcp state is contained in tcp_t and conn_t structure
    113  * which are allocated in tandem using ipcl_conn_create() and passing
    114  * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
    115  * the references on the tcp_t. The tcp_t structure is never compressed
    116  * and packets always land on the correct TCP perimeter from the time
    117  * eager is created till the time tcp_t dies (as such the old mentat
    118  * TCP global queue is not used for detached state and no IPSEC checking
    119  * is required). The global queue is still allocated to send out resets
    120  * for connection which have no listeners and IP directly calls
    121  * tcp_xmit_listeners_reset() which does any policy check.
    122  *
    123  * Protection and Synchronisation mechanism:
    124  *
    125  * The tcp data structure does not use any kind of lock for protecting
    126  * its state but instead uses 'squeues' for mutual exclusion from various
    127  * read and write side threads. To access a tcp member, the thread should
    128  * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
    129  * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
    130  * can pass any tcp function having prototype of edesc_t as argument
    131  * (different from traditional STREAMs model where packets come in only
    132  * designated entry points). The list of functions that can be directly
    133  * called via squeue are listed before the usual function prototype.
    134  *
    135  * Referencing:
    136  *
    137  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    138  * tcp structure doesn't disappear when it's needed. When the application
    139  * creates an outgoing connection or accepts an incoming connection, we
    140  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    141  * The IP reference is just a symbolic reference since ip_tcpclose()
    142  * looks at tcp structure after tcp_close_output() returns which could
    143  * have dropped the last TCP reference. So as long as the connection is
    144  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    145  * conn_t. The classifier puts its own reference when the connection is
    146  * inserted in listen or connected hash. Anytime a thread needs to enter
    147  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    148  * on write side or by doing a classify on read side and then puts a
    149  * reference on the conn before doing squeue_enter/tryenter/fill. For
    150  * read side, the classifier itself puts the reference under fanout lock
    151  * to make sure that tcp can't disappear before it gets processed. The
    152  * squeue will drop this reference automatically so the called function
    153  * doesn't have to do a DEC_REF.
    154  *
    155  * Opening a new connection:
    156  *
    157  * The outgoing connection open is pretty simple. tcp_open() does the
    158  * work in creating the conn/tcp structure and initializing it. The
    159  * squeue assignment is done based on the CPU the application
    160  * is running on. So for outbound connections, processing is always done
    161  * on application CPU which might be different from the incoming CPU
    162  * being interrupted by the NIC. An optimal way would be to figure out
    163  * the NIC <-> CPU binding at listen time, and assign the outgoing
    164  * connection to the squeue attached to the CPU that will be interrupted
    165  * for incoming packets (we know the NIC based on the bind IP address).
    166  * This might seem like a problem if more data is going out but the
    167  * fact is that in most cases the transmit is ACK driven transmit where
    168  * the outgoing data normally sits on TCP's xmit queue waiting to be
    169  * transmitted.
    170  *
    171  * Accepting a connection:
    172  *
    173  * This is a more interesting case because of various races involved in
    174  * establishing an eager in its own perimeter. Read the meta comment on
    175  * top of tcp_input_listener(). But briefly, the squeue is picked by
    176  * ip_fanout based on the ring or the sender (if loopback).
    177  *
    178  * Closing a connection:
    179  *
    180  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    181  * via squeue to do the close and mark the tcp as detached if the connection
    182  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    183  * reference but tcp_close() always drops IP's reference. So if tcp was
    184  * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
    185  * and 1 because it is in classifier's connected hash. This is the condition
    186  * we use to determine that it's OK to clean up the tcp outside of squeue
    187  * when time wait expires (check the ref under fanout and conn_lock and
    188  * if it is 2, remove it from fanout hash and kill it).
    189  *
    190  * Although close just drops the necessary references and marks the
    191  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    192  * set (under squeue) before letting the STREAM go away (because an
    193  * inbound packet might attempt to go up the STREAM while the close
    194  * has happened and tcp_detached is not set). So a special lock and
    195  * flag is used along with a condition variable (tcp_closelock, tcp_closed,
    196  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    197  * tcp_detached.
    198  *
    199  * Special provisions and fast paths:
    200  *
    201  * We make special provisions for sockfs by marking tcp_issocket
    202  * whenever we have only sockfs on top of TCP. This allows us to skip
    203  * putting the tcp in acceptor hash since a sockfs listener can never
    204  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    205  * since eager has already been allocated and the accept now happens
    206  * on acceptor STREAM. There is a big blob of comment on top of
    207  * tcp_input_listener explaining the new accept. When socket is POP'd,
    208  * sockfs sends us an ioctl to mark the fact and we go back to old
    209  * behaviour. Once tcp_issocket is unset, it's never set for the
    210  * life of that connection.
    211  *
    212  * IPsec notes :
    213  *
    214  * Since a packet is always executed on the correct TCP perimeter
    215  * all IPsec processing is deferred to IP including checking new
    216  * connections and setting IPSEC policies for new connection. The
    217  * only exception is tcp_xmit_listeners_reset() which is called
    218  * directly from IP and needs to policy check to see if TH_RST
    219  * can be sent out.
    220  */
    221 
    222 /*
    223  * Values for squeue switch:
    224  * 1: SQ_NODRAIN
    225  * 2: SQ_PROCESS
    226  * 3: SQ_FILL
    227  */
    228 int tcp_squeue_wput = 2;	/* /etc/systems */
    229 int tcp_squeue_flag;
    230 
    231 /*
    232  * This controls how tiny a write must be before we try to copy it
    233  * into the mblk on the tail of the transmit queue.  Not much
    234  * speedup is observed for values larger than sixteen.  Zero will
    235  * disable the optimisation.
    236  */
    237 int tcp_tx_pull_len = 16;
    238 
    239 /*
    240  * TCP Statistics.
    241  *
    242  * How TCP statistics work.
    243  *
    244  * There are two types of statistics invoked by two macros.
    245  *
    246  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    247  * supposed to be used in non MT-hot paths of the code.
    248  *
    249  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    250  * supposed to be used for DEBUG purposes and may be used on a hot path.
    251  *
    252  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    253  * (use "kstat tcp" to get them).
    254  *
    255  * There is also additional debugging facility that marks tcp_clean_death()
    256  * instances and saves them in tcp_t structure. It is triggered by
    257  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    258  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    259  * is triggered by TCP_CLD_COUNTERS define.
    260  *
    261  * How to add new counters.
    262  *
    263  * 1) Add a field in the tcp_stat structure describing your counter.
    264  * 2) Add a line in the template in tcp_kstat2_init() with the name
    265  *    of the counter.
    266  *
    267  *    IMPORTANT!! - make sure that both are in sync !!
    268  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    269  *
    270  * Please avoid using private counters which are not kstat-exported.
    271  *
    272  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    273  * in tcp_t structure.
    274  *
    275  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    276  */
    277 
    278 #ifndef TCP_DEBUG_COUNTER
    279 #ifdef DEBUG
    280 #define	TCP_DEBUG_COUNTER 1
    281 #else
    282 #define	TCP_DEBUG_COUNTER 0
    283 #endif
    284 #endif
    285 
    286 #define	TCP_CLD_COUNTERS 0
    287 
    288 #define	TCP_TAG_CLEAN_DEATH 1
    289 #define	TCP_MAX_CLEAN_DEATH_TAG 32
    290 
    291 #ifdef lint
    292 static int _lint_dummy_;
    293 #endif
    294 
    295 #if TCP_CLD_COUNTERS
    296 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
    297 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
    298 #elif defined(lint)
    299 #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
    300 #else
    301 #define	TCP_CLD_STAT(x)
    302 #endif
    303 
    304 #if TCP_DEBUG_COUNTER
    305 #define	TCP_DBGSTAT(tcps, x)	\
    306 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
    307 #define	TCP_G_DBGSTAT(x)	\
    308 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
    309 #elif defined(lint)
    310 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
    311 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
    312 #else
    313 #define	TCP_DBGSTAT(tcps, x)
    314 #define	TCP_G_DBGSTAT(x)
    315 #endif
    316 
    317 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)
    318 
    319 tcp_g_stat_t	tcp_g_statistics;
    320 kstat_t		*tcp_g_kstat;
    321 
    322 /* Macros for timestamp comparisons */
    323 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
    324 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
    325 
    326 /*
    327  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    328  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    329  * by adding three components: a time component which grows by 1 every 4096
    330  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    331  * a per-connection component which grows by 125000 for every new connection;
    332  * and an "extra" component that grows by a random amount centered
    333  * approximately on 64000.  This causes the ISS generator to cycle every
    334  * 4.89 hours if no TCP connections are made, and faster if connections are
    335  * made.
    336  *
    337  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    338  * components: a time component which grows by 250000 every second; and
    339  * a per-connection component which grows by 125000 for every new connection.
    340  *
    341  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    342  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    343  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    344  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    345  * password.
    346  */
    347 #define	ISS_INCR	250000
    348 #define	ISS_NSEC_SHT	12
    349 
    350 static sin_t	sin_null;	/* Zero address for quick clears */
    351 static sin6_t	sin6_null;	/* Zero address for quick clears */
    352 
    353 /*
    354  * This implementation follows the 4.3BSD interpretation of the urgent
    355  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    356  * incompatible changes in protocols like telnet and rlogin.
    357  */
    358 #define	TCP_OLD_URP_INTERPRETATION	1
    359 
    360 /*
    361  * Since tcp_listener is not cleared atomically with tcp_detached
    362  * being cleared we need this extra bit to tell a detached connection
    363  * apart from one that is in the process of being accepted.
    364  */
    365 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    366 	(TCP_IS_DETACHED(tcp) &&	\
    367 	    (!(tcp)->tcp_hard_binding))
    368 
    369 /*
    370  * TCP reassembly macros.  We hide starting and ending sequence numbers in
    371  * b_next and b_prev of messages on the reassembly queue.  The messages are
    372  * chained using b_cont.  These macros are used in tcp_reass() so we don't
    373  * have to see the ugly casts and assignments.
    374  */
    375 #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
    376 #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    377 					(mblk_t *)(uintptr_t)(u))
    378 #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
    379 #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    380 					(mblk_t *)(uintptr_t)(u))
    381 
    382 /*
    383  * Implementation of TCP Timers.
    384  * =============================
    385  *
    386  * INTERFACE:
    387  *
    388  * There are two basic functions and one macro dealing with tcp timers:
    389  *
    390  *	timeout_id_t	tcp_timeout(connp, func, time)
    391  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    392  *	TCP_TIMER_RESTART(tcp, intvl)
    393  *
    394  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    395  * after 'time' ticks passed. The function called by timeout() must adhere to
    396  * the same restrictions as a driver soft interrupt handler - it must not sleep
    397  * or call other functions that might sleep. The value returned is the opaque
    398  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    399  * cancel the request. The call to tcp_timeout() may fail in which case it
    400  * returns zero. This is different from the timeout(9F) function which never
    401  * fails.
    402  *
    403  * The call-back function 'func' always receives 'connp' as its single
    404  * argument. It is always executed in the squeue corresponding to the tcp
    405  * structure. The tcp structure is guaranteed to be present at the time the
    406  * call-back is called.
    407  *
    408  * NOTE: The call-back function 'func' is never called if tcp is in
    409  * 	the TCPS_CLOSED state.
    410  *
    411  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    412  * request. locks acquired by the call-back routine should not be held across
    413  * the call to tcp_timeout_cancel() or a deadlock may result.
    414  *
    415  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    416  * Otherwise, it returns an integer value greater than or equal to 0. In
    417  * particular, if the call-back function is already placed on the squeue, it can
    418  * not be canceled.
    419  *
    420  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    421  * 	within squeue context corresponding to the tcp instance. Since the
    422  *	call-back is also called via the same squeue, there are no race
    423  *	conditions described in untimeout(9F) manual page since all calls are
    424  *	strictly serialized.
    425  *
    426  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    427  *	stored in tcp_timer_tid and starts a new one using
    428  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    429  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    430  *	field.
    431  *
    432  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    433  *	call-back may still be called, so it is possible tcp_timer() will be
    434  *	called several times. This should not be a problem since tcp_timer()
    435  *	should always check the tcp instance state.
    436  *
    437  *
    438  * IMPLEMENTATION:
    439  *
    440  * TCP timers are implemented using a three-stage process. The call to
    441  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    442  * when the timer expires. The tcp_timer_callback() arranges the call of the
    443  * tcp_timer_handler() function via squeue corresponding to the tcp
    444  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    445  * and passes tcp instance as an argument to it. Information is passed between
    446  * stages using the tcp_timer_t structure which contains the connp pointer, the
    447  * tcp call-back to call and the timeout id returned by the timeout(9F).
    448  *
    449  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    450  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    451  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    452  * returns the pointer to this mblk.
    453  *
    454  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
    455  * looks like a normal mblk without actual dblk attached to it.
    456  *
    457  * To optimize performance each tcp instance holds a small cache of timer
    458  * mblocks. In the current implementation it caches up to two timer mblocks per
    459  * tcp instance. The cache is preserved over tcp frees and is only freed when
    460  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    461  * timer processing happens on a corresponding squeue, the cache manipulation
    462  * does not require any locks. Experiments show that majority of timer mblocks
    463  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    464  *
    465  * The tcp_timeout() places a refhold on the connp instance which guarantees
    466  * that it will be present at the time the call-back function fires. The
    467  * tcp_timer_handler() drops the reference after calling the call-back, so the
    468  * call-back function does not need to manipulate the references explicitly.
    469  */
    470 
    471 typedef struct tcp_timer_s {
    472 	conn_t	*connp;
    473 	void 	(*tcpt_proc)(void *);
    474 	callout_id_t   tcpt_tid;
    475 } tcp_timer_t;
    476 
    477 static kmem_cache_t *tcp_timercache;
    478 kmem_cache_t	*tcp_sack_info_cache;
    479 
    480 /*
    481  * For scalability, we must not run a timer for every TCP connection
    482  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    483  * 4 minutes):
    484  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    485  *
    486  * This list is ordered by time, so you need only delete from the head
    487  * until you get to entries which aren't old enough to delete yet.
    488  * The list consists of only the detached TIME_WAIT connections.
    489  *
    490  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    491  * becomes detached TIME_WAIT (either by changing the state and already
    492  * being detached or the other way around). This means that the TIME_WAIT
    493  * state can be extended (up to doubled) if the connection doesn't become
    494  * detached for a long time.
    495  *
    496  * The list manipulations (including tcp_time_wait_next/prev)
    497  * are protected by the tcp_time_wait_lock. The content of the
    498  * detached TIME_WAIT connections is protected by the normal perimeters.
    499  *
    500  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    501  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    502  * and conn_netstack.
    503  * The tcp_t's that are added to tcp_free_list are disassociated and
    504  * have NULL tcp_tcps and conn_netstack pointers.
    505  */
    506 typedef struct tcp_squeue_priv_s {
    507 	kmutex_t	tcp_time_wait_lock;
    508 	callout_id_t	tcp_time_wait_tid;
    509 	tcp_t		*tcp_time_wait_head;
    510 	tcp_t		*tcp_time_wait_tail;
    511 	tcp_t		*tcp_free_list;
    512 	uint_t		tcp_free_list_cnt;
    513 } tcp_squeue_priv_t;
    514 
    515 /*
    516  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    517  * Running it every 5 seconds seems to give the best results.
    518  */
    519 #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
    520 
    521 /*
    522  * To prevent memory hog, limit the number of entries in tcp_free_list
    523  * to 1% of available memory / number of cpus
    524  */
    525 uint_t tcp_free_list_max_cnt = 0;
    526 
    527 #define	TCP_XMIT_LOWATER	4096
    528 #define	TCP_XMIT_HIWATER	49152
    529 #define	TCP_RECV_LOWATER	2048
    530 #define	TCP_RECV_HIWATER	128000
    531 
    532 /*
    533  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    534  */
    535 #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    536 
    537 #define	TIDUSZ	4096	/* transport interface data unit size */
    538 
    539 /*
    540  * Bind hash list size and has function.  It has to be a power of 2 for
    541  * hashing.
    542  */
    543 #define	TCP_BIND_FANOUT_SIZE	512
    544 #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    545 /*
    546  * Size of listen and acceptor hash list.  It has to be a power of 2 for
    547  * hashing.
    548  */
    549 #define	TCP_FANOUT_SIZE		256
    550 
    551 #ifdef	_ILP32
    552 #define	TCP_ACCEPTOR_HASH(accid)					\
    553 		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
    554 #else
    555 #define	TCP_ACCEPTOR_HASH(accid)					\
    556 		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
    557 #endif	/* _ILP32 */
    558 
    559 #define	IP_ADDR_CACHE_SIZE	2048
    560 #define	IP_ADDR_CACHE_HASH(faddr)					\
    561 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
    562 
    563 /*
    564  * TCP options struct returned from tcp_parse_options.
    565  */
    566 typedef struct tcp_opt_s {
    567 	uint32_t	tcp_opt_mss;
    568 	uint32_t	tcp_opt_wscale;
    569 	uint32_t	tcp_opt_ts_val;
    570 	uint32_t	tcp_opt_ts_ecr;
    571 	tcp_t		*tcp;
    572 } tcp_opt_t;
    573 
    574 /*
    575  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    576  */
    577 
    578 #ifdef _BIG_ENDIAN
    579 #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    580 	(TCPOPT_TSTAMP << 8) | 10)
    581 #else
    582 #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    583 	(TCPOPT_NOP << 8) | TCPOPT_NOP)
    584 #endif
    585 
    586 /*
    587  * Flags returned from tcp_parse_options.
    588  */
    589 #define	TCP_OPT_MSS_PRESENT	1
    590 #define	TCP_OPT_WSCALE_PRESENT	2
    591 #define	TCP_OPT_TSTAMP_PRESENT	4
    592 #define	TCP_OPT_SACK_OK_PRESENT	8
    593 #define	TCP_OPT_SACK_PRESENT	16
    594 
    595 /* TCP option length */
    596 #define	TCPOPT_NOP_LEN		1
    597 #define	TCPOPT_MAXSEG_LEN	4
    598 #define	TCPOPT_WS_LEN		3
    599 #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
    600 #define	TCPOPT_TSTAMP_LEN	10
    601 #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
    602 #define	TCPOPT_SACK_OK_LEN	2
    603 #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
    604 #define	TCPOPT_REAL_SACK_LEN	4
    605 #define	TCPOPT_MAX_SACK_LEN	36
    606 #define	TCPOPT_HEADER_LEN	2
    607 
    608 /* TCP cwnd burst factor. */
    609 #define	TCP_CWND_INFINITE	65535
    610 #define	TCP_CWND_SS		3
    611 #define	TCP_CWND_NORMAL		5
    612 
    613 /* Maximum TCP initial cwin (start/restart). */
    614 #define	TCP_MAX_INIT_CWND	8
    615 
    616 /*
    617  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    618  * either tcp_slow_start_initial or tcp_slow_start_after idle
    619  * depending on the caller.  If the upper layer has not used the
    620  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    621  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    622  * If the upper layer has changed set the tcp_init_cwnd, just use
    623  * it to calculate the tcp_cwnd.
    624  */
    625 #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    626 {									\
    627 	if ((tcp)->tcp_init_cwnd == 0) {				\
    628 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    629 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    630 	} else {							\
    631 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    632 	}								\
    633 	tcp->tcp_cwnd_cnt = 0;						\
    634 }
    635 
    636 /* TCP Timer control structure */
    637 typedef struct tcpt_s {
    638 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    639 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    640 } tcpt_t;
    641 
    642 /*
    643  * Functions called directly via squeue having a prototype of edesc_t.
    644  */
    645 void		tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
    646     ip_recv_attr_t *ira);
    647 static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
    648     ip_recv_attr_t *dummy);
    649 void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
    650     ip_recv_attr_t *dummy);
    651 static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
    652     ip_recv_attr_t *dummy);
    653 static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
    654     ip_recv_attr_t *dummy);
    655 void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
    656     ip_recv_attr_t *ira);
    657 static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2,
    658     ip_recv_attr_t *dummy);
    659 void		tcp_output(void *arg, mblk_t *mp, void *arg2,
    660     ip_recv_attr_t *dummy);
    661 void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
    662     ip_recv_attr_t *dummy);
    663 static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
    664     ip_recv_attr_t *dummy);
    665 static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
    666     ip_recv_attr_t *dummy);
    667 static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
    668     ip_recv_attr_t *dummy);
    669 
    670 
    671 /* Prototype for TCP functions */
    672 static void	tcp_random_init(void);
    673 int		tcp_random(void);
    674 static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
    675 static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    676 		    tcp_t *eager);
    677 static int	tcp_set_destination(tcp_t *tcp);
    678 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    679     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    680     boolean_t user_specified);
    681 static void	tcp_closei_local(tcp_t *tcp);
    682 static void	tcp_close_detached(tcp_t *tcp);
    683 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
    684 		    mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
    685 static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
    686 static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
    687 		    in_port_t dstport, uint_t srcid);
    688 static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
    689 		    in_port_t dstport, uint32_t flowinfo,
    690 		    uint_t srcid, uint32_t scope_id);
    691 static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    692 static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    693 static char	*tcp_display(tcp_t *tcp, char *, char);
    694 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    695 static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    696 static void	tcp_eager_unlink(tcp_t *tcp);
    697 static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    698 		    int unixerr);
    699 static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    700 		    int tlierr, int unixerr);
    701 static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    702 		    cred_t *cr);
    703 static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    704 		    char *value, caddr_t cp, cred_t *cr);
    705 static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    706 		    char *value, caddr_t cp, cred_t *cr);
    707 static int	tcp_tpistate(tcp_t *tcp);
    708 static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    709     int caller_holds_lock);
    710 static void	tcp_bind_hash_remove(tcp_t *tcp);
    711 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    712 void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    713 static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    714 static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    715 static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    716 static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    717 static void	tcp_init_values(tcp_t *tcp);
    718 static void	tcp_ip_notify(tcp_t *tcp);
    719 static void	tcp_iss_init(tcp_t *tcp);
    720 static void	tcp_keepalive_killer(void *arg);
    721 static int	tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
    722 static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
    723 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    724 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    725 static boolean_t tcp_allow_connopt_set(int level, int name);
    726 int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    727 static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    728 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    729     tcp_stack_t *);
    730 static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    731 		    caddr_t cp, cred_t *cr);
    732 static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    733 		    caddr_t cp, cred_t *cr);
    734 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    735 static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    736 		    caddr_t cp, cred_t *cr);
    737 static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    738 static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
    739 static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    740 static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    741 static void	tcp_reinit(tcp_t *tcp);
    742 static void	tcp_reinit_values(tcp_t *tcp);
    743 
    744 static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
    745 static uint_t	tcp_rcv_drain(tcp_t *tcp);
    746 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    747 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    748 static void	tcp_ss_rexmit(tcp_t *tcp);
    749 static mblk_t	*tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
    750     ip_recv_attr_t *);
    751 static void	tcp_process_options(tcp_t *, tcpha_t *);
    752 static void	tcp_rsrv(queue_t *q);
    753 static int	tcp_snmp_state(tcp_t *tcp);
    754 static void	tcp_timer(void *arg);
    755 static void	tcp_timer_callback(void *);
    756 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    757     boolean_t random);
    758 static in_port_t tcp_get_next_priv_port(const tcp_t *);
    759 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    760 static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
    761 void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
    762 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    763 static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    764 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    765 static int	tcp_send(tcp_t *tcp, const int mss,
    766 		    const int total_hdr_len, const int tcp_hdr_len,
    767 		    const int num_sack_blk, int *usable, uint_t *snxt,
    768 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
    769 static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    770 		    int num_sack_blk);
    771 static void	tcp_wsrv(queue_t *q);
    772 static int	tcp_xmit_end(tcp_t *tcp);
    773 static void	tcp_ack_timer(void *arg);
    774 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    775 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    776 		    uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
    777 		    ip_stack_t *, conn_t *);
    778 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    779 		    uint32_t ack, int ctl);
    780 static void	tcp_set_rto(tcp_t *, time_t);
    781 static void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
    782 static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
    783 static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    784     ip_recv_attr_t *);
    785 static int	tcp_build_hdrs(tcp_t *);
    786 static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    787     uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
    788     ip_recv_attr_t *ira);
    789 boolean_t	tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
    790 static boolean_t tcp_zcopy_check(tcp_t *);
    791 static void	tcp_zcopy_notify(tcp_t *);
    792 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
    793 static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
    794 static void	tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
    795 static void	tcp_update_zcopy(tcp_t *tcp);
    796 static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    797     ixa_notify_arg_t);
    798 static void	tcp_rexmit_after_error(tcp_t *tcp);
    799 static void	tcp_send_data(tcp_t *, mblk_t *);
    800 extern mblk_t	*tcp_timermp_alloc(int);
    801 extern void	tcp_timermp_free(tcp_t *);
    802 static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
    803 static void	tcp_stop_lingering(tcp_t *tcp);
    804 static void	tcp_close_linger_timeout(void *arg);
    805 static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
    806 static void	tcp_stack_fini(netstackid_t stackid, void *arg);
    807 static void	*tcp_g_kstat_init(tcp_g_stat_t *);
    808 static void	tcp_g_kstat_fini(kstat_t *);
    809 static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
    810 static void	tcp_kstat_fini(netstackid_t, kstat_t *);
    811 static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
    812 static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
    813 static int	tcp_kstat_update(kstat_t *kp, int rw);
    814 static mblk_t	*tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    815     ip_recv_attr_t *ira);
    816 static mblk_t	*tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    817     ip_recv_attr_t *ira);
    818 static int	tcp_squeue_switch(int);
    819 
    820 static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
    821 static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
    822 static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
    823 static int	tcp_tpi_close(queue_t *, int);
    824 static int	tcp_tpi_close_accept(queue_t *);
    825 
    826 static void	tcp_squeue_add(squeue_t *);
    827 static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
    828 
    829 extern void	tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
    830 
    831 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
    832 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
    833     ip_recv_attr_t *dummy);
    834 
    835 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
    836 	    sock_upper_handle_t, cred_t *);
    837 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
    838 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    839     boolean_t);
    840 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    841     cred_t *, pid_t);
    842 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    843     boolean_t);
    844 static int tcp_do_unbind(conn_t *);
    845 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    846     boolean_t);
    847 
    848 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
    849 
    850 /*
    851  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
    852  *
    853  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
    854  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
    855  * (defined in tcp.h) needs to be filled in and passed into the kernel
    856  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
    857  * structure contains the four-tuple of a TCP connection and a range of TCP
    858  * states (specified by ac_start and ac_end). The use of wildcard addresses
    859  * and ports is allowed. Connections with a matching four tuple and a state
    860  * within the specified range will be aborted. The valid states for the
    861  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
    862  * inclusive.
    863  *
    864  * An application which has its connection aborted by this ioctl will receive
    865  * an error that is dependent on the connection state at the time of the abort.
    866  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
    867  * though a RST packet has been received.  If the connection state is equal to
    868  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
    869  * and all resources associated with the connection will be freed.
    870  */
    871 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
    872 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
    873 static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
    874     ip_recv_attr_t *dummy);
    875 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
    876 static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
    877 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    878     boolean_t, tcp_stack_t *);
    879 
    880 static struct module_info tcp_rinfo =  {
    881 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
    882 };
    883 
    884 static struct module_info tcp_winfo =  {
    885 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
    886 };
    887 
    888 /*
    889  * Entry points for TCP as a device. The normal case which supports
    890  * the TCP functionality.
    891  * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
    892  */
    893 struct qinit tcp_rinitv4 = {
    894 	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
    895 };
    896 
    897 struct qinit tcp_rinitv6 = {
    898 	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
    899 };
    900 
    901 struct qinit tcp_winit = {
    902 	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    903 };
    904 
    905 /* Initial entry point for TCP in socket mode. */
    906 struct qinit tcp_sock_winit = {
    907 	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    908 };
    909 
    910 /* TCP entry point during fallback */
    911 struct qinit tcp_fallback_sock_winit = {
    912 	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
    913 };
    914 
    915 /*
    916  * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
    917  * an accept. Avoid allocating data structures since eager has already
    918  * been created.
    919  */
    920 struct qinit tcp_acceptor_rinit = {
    921 	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
    922 };
    923 
    924 struct qinit tcp_acceptor_winit = {
    925 	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
    926 };
    927 
    928 /* For AF_INET aka /dev/tcp */
    929 struct streamtab tcpinfov4 = {
    930 	&tcp_rinitv4, &tcp_winit
    931 };
    932 
    933 /* For AF_INET6 aka /dev/tcp6 */
    934 struct streamtab tcpinfov6 = {
    935 	&tcp_rinitv6, &tcp_winit
    936 };
    937 
    938 sock_downcalls_t sock_tcp_downcalls;
    939 
    940 /* Settable only in /etc/system. Move to ndd? */
    941 boolean_t tcp_icmp_source_quench = B_FALSE;
    942 
    943 /*
    944  * Following assumes TPI alignment requirements stay along 32 bit
    945  * boundaries
    946  */
    947 #define	ROUNDUP32(x) \
    948 	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
    949 
    950 /* Template for response to info request. */
    951 static struct T_info_ack tcp_g_t_info_ack = {
    952 	T_INFO_ACK,		/* PRIM_type */
    953 	0,			/* TSDU_size */
    954 	T_INFINITE,		/* ETSDU_size */
    955 	T_INVALID,		/* CDATA_size */
    956 	T_INVALID,		/* DDATA_size */
    957 	sizeof (sin_t),		/* ADDR_size */
    958 	0,			/* OPT_size - not initialized here */
    959 	TIDUSZ,			/* TIDU_size */
    960 	T_COTS_ORD,		/* SERV_type */
    961 	TCPS_IDLE,		/* CURRENT_state */
    962 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
    963 };
    964 
    965 static struct T_info_ack tcp_g_t_info_ack_v6 = {
    966 	T_INFO_ACK,		/* PRIM_type */
    967 	0,			/* TSDU_size */
    968 	T_INFINITE,		/* ETSDU_size */
    969 	T_INVALID,		/* CDATA_size */
    970 	T_INVALID,		/* DDATA_size */
    971 	sizeof (sin6_t),	/* ADDR_size */
    972 	0,			/* OPT_size - not initialized here */
    973 	TIDUSZ,		/* TIDU_size */
    974 	T_COTS_ORD,		/* SERV_type */
    975 	TCPS_IDLE,		/* CURRENT_state */
    976 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
    977 };
    978 
    979 #define	MS	1L
    980 #define	SECONDS	(1000 * MS)
    981 #define	MINUTES	(60 * SECONDS)
    982 #define	HOURS	(60 * MINUTES)
    983 #define	DAYS	(24 * HOURS)
    984 
    985 #define	PARAM_MAX (~(uint32_t)0)
    986 
    987 /* Max size IP datagram is 64k - 1 */
    988 #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
    989 #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
    990 /* Max of the above */
    991 #define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
    992 
    993 /* Largest TCP port number */
    994 #define	TCP_MAX_PORT	(64 * 1024 - 1)
    995 
    996 /*
    997  * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
    998  * layer header.  It has to be a multiple of 4.
    999  */
   1000 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
   1001 #define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
   1002 
   1003 /*
   1004  * All of these are alterable, within the min/max values given, at run time.
   1005  * Note that the default value of "tcp_time_wait_interval" is one minute,
   1006  * which is shorter than the four minutes (2 * MSL) given in the TCP spec.
   1007  */
   1008 /* BEGIN CSTYLED */
   1009 static tcpparam_t	lcl_tcp_param_arr[] = {
   1010  /*min		max		value		name */
   1011  { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
   1012  { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
   1013  { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
   1014  { 1,		1024,		1,		"tcp_conn_req_min" },
   1015  { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
   1016  { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
   1017  { 0,		10,		0,		"tcp_debug" },
   1018  { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
   1019  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
   1020  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
   1021  { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
   1022  { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
   1023  { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
   1024  { 1,		255,		64,		"tcp_ipv4_ttl"},
   1025  { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
   1026  { 0,		100,		10,		"tcp_maxpsz_multiplier" },
   1027  { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
   1028  { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
   1029  { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
   1030  { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
   1031  { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
   1032  { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
   1033  { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
   1034  { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
   1035  { 0,		16,		0,		"tcp_snd_lowat_fraction" },
   1036  { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
   1037  { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
   1038  { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
   1039  { 0,		1,		0,		"tcp_ignore_path_mtu" },
   1040  { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
   1041  { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
   1042  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
   1043  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
   1044  { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
   1045  { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
   1046  { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
   1047  { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
   1048 /*
   1049  * Question:  What default value should I set for tcp_strong_iss?
   1050  */
   1051  { 0,		2,		1,		"tcp_strong_iss"},
   1052  { 0,		65536,		20,		"tcp_rtt_updates"},
   1053  { 0,		1,		1,		"tcp_wscale_always"},
   1054  { 0,		1,		0,		"tcp_tstamp_always"},
   1055  { 0,		1,		1,		"tcp_tstamp_if_wscale"},
   1056  { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
   1057  { 0,		16,		2,		"tcp_deferred_acks_max"},
   1058  { 1,		16384,		4,		"tcp_slow_start_after_idle"},
   1059  { 1,		4,		4,		"tcp_slow_start_initial"},
   1060  { 0,		2,		2,		"tcp_sack_permitted"},
   1061  { 0,		1,		1,		"tcp_compression_enabled"},
   1062  { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
   1063  { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
   1064  { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
   1065  { 0,		1,		0,		"tcp_rev_src_routes"},
   1066  { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
   1067  { 0,		16,		8,		"tcp_local_dacks_max"},
   1068  { 0,		2,		1,		"tcp_ecn_permitted"},
   1069  { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
   1070  { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
   1071  { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
   1072  { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
   1073  { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
   1074  { 0,		1,		0,		"tcp_dev_flow_ctl"},
   1075 };
   1076 /* END CSTYLED */
   1077 
   1078 /* Round up the value to the nearest mss. */
   1079 #define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
   1080 
   1081 /*
   1082  * Set ECN capable transport (ECT) code point in IP header.
   1083  *
   1084  * Note that there are 2 ECT code points '01' and '10', which are called
   1085  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
   1086  * point ECT(0) for TCP as described in RFC 2481.
   1087  */
   1088 #define	SET_ECT(tcp, iph) \
   1089 	if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
   1090 		/* We need to clear the code point first. */ \
   1091 		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
   1092 		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
   1093 	} else { \
   1094 		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
   1095 		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
   1096 	}
   1097 
   1098 /*
   1099  * The format argument to pass to tcp_display().
   1100  * DISP_PORT_ONLY means that the returned string has only port info.
   1101  * DISP_ADDR_AND_PORT means that the returned string also contains the
   1102  * remote and local IP address.
   1103  */
   1104 #define	DISP_PORT_ONLY		1
   1105 #define	DISP_ADDR_AND_PORT	2
   1106 
   1107 #define	IS_VMLOANED_MBLK(mp) \
   1108 	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
   1109 
   1110 uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */
   1111 
   1112 /*
   1113  * Forces all connections to obey the value of the tcps_maxpsz_multiplier
   1114  * tunable settable via NDD.  Otherwise, the per-connection behavior is
   1115  * determined dynamically during tcp_set_destination(), which is the default.
   1116  */
   1117 boolean_t tcp_static_maxpsz = B_FALSE;
   1118 
   1119 /* Settable in /etc/system */
   1120 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
   1121 uint32_t tcp_random_anon_port = 1;
   1122 
   1123 /*
   1124  * To reach to an eager in Q0 which can be dropped due to an incoming
   1125  * new SYN request when Q0 is full, a new doubly linked list is
   1126  * introduced. This list allows to select an eager from Q0 in O(1) time.
   1127  * This is needed to avoid spending too much time walking through the
   1128  * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
   1129  * this new list has to be a member of Q0.
   1130  * This list is headed by listener's tcp_t. When the list is empty,
   1131  * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
   1132  * of listener's tcp_t point to listener's tcp_t itself.
   1133  *
   1134  * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
   1135  * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
   1136  * These macros do not affect the eager's membership to Q0.
   1137  */
   1138 
   1139 
   1140 #define	MAKE_DROPPABLE(listener, eager)					\
   1141 	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
   1142 		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
   1143 		    = (eager);						\
   1144 		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
   1145 		(eager)->tcp_eager_next_drop_q0 =			\
   1146 		    (listener)->tcp_eager_next_drop_q0;			\
   1147 		(listener)->tcp_eager_next_drop_q0 = (eager);		\
   1148 	}
   1149 
   1150 #define	MAKE_UNDROPPABLE(eager)						\
   1151 	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
   1152 		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
   1153 		    = (eager)->tcp_eager_prev_drop_q0;			\
   1154 		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
   1155 		    = (eager)->tcp_eager_next_drop_q0;			\
   1156 		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
   1157 		(eager)->tcp_eager_next_drop_q0 = NULL;			\
   1158 	}
   1159 
   1160 /*
   1161  * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
   1162  * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
   1163  * data, TCP will not respond with an ACK.  RFC 793 requires that
   1164  * TCP responds with an ACK for such a bogus ACK.  By not following
   1165  * the RFC, we prevent TCP from getting into an ACK storm if somehow
   1166  * an attacker successfully spoofs an acceptable segment to our
   1167  * peer; or when our peer is "confused."
   1168  */
   1169 uint32_t tcp_drop_ack_unsent_cnt = 10;
   1170 
   1171 /*
   1172  * Hook functions to enable cluster networking
   1173  * On non-clustered systems these vectors must always be NULL.
   1174  */
   1175 
   1176 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
   1177 			    sa_family_t addr_family, uint8_t *laddrp,
   1178 			    in_port_t lport, void *args) = NULL;
   1179 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
   1180 			    sa_family_t addr_family, uint8_t *laddrp,
   1181 			    in_port_t lport, void *args) = NULL;
   1182 
   1183 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
   1184 			    boolean_t is_outgoing,
   1185 			    sa_family_t addr_family,
   1186 			    uint8_t *laddrp, in_port_t lport,
   1187 			    uint8_t *faddrp, in_port_t fport,
   1188 			    void *args) = NULL;
   1189 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
   1190 			    sa_family_t addr_family, uint8_t *laddrp,
   1191 			    in_port_t lport, uint8_t *faddrp,
   1192 			    in_port_t fport, void *args) = NULL;
   1193 
   1194 
   1195 /*
   1196  * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err)
   1197  */
   1198 #define	CL_INET_CONNECT(connp, is_outgoing, err) {		\
   1199 	(err) = 0;						\
   1200 	if (cl_inet_connect2 != NULL) {				\
   1201 		/*						\
   1202 		 * Running in cluster mode - register active connection	\
   1203 		 * information						\
   1204 		 */							\
   1205 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1206 			if ((connp)->conn_laddr_v4 != 0) {		\
   1207 				(err) = (*cl_inet_connect2)(		\
   1208 				    (connp)->conn_netstack->netstack_stackid,\
   1209 				    IPPROTO_TCP, is_outgoing, AF_INET,	\
   1210 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1211 				    (in_port_t)(connp)->conn_lport,	\
   1212 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1213 				    (in_port_t)(connp)->conn_fport, NULL); \
   1214 			}						\
   1215 		} else {						\
   1216 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1217 			    &(connp)->conn_laddr_v6)) {			\
   1218 				(err) = (*cl_inet_connect2)(		\
   1219 				    (connp)->conn_netstack->netstack_stackid,\
   1220 				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
   1221 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1222 				    (in_port_t)(connp)->conn_lport,	\
   1223 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1224 				    (in_port_t)(connp)->conn_fport, NULL); \
   1225 			}						\
   1226 		}							\
   1227 	}								\
   1228 }
   1229 
   1230 #define	CL_INET_DISCONNECT(connp)	{				\
   1231 	if (cl_inet_disconnect != NULL) {				\
   1232 		/*							\
   1233 		 * Running in cluster mode - deregister active		\
   1234 		 * connection information				\
   1235 		 */							\
   1236 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1237 			if ((connp)->conn_laddr_v4 != 0) {		\
   1238 				(*cl_inet_disconnect)(			\
   1239 				    (connp)->conn_netstack->netstack_stackid,\
   1240 				    IPPROTO_TCP, AF_INET,		\
   1241 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1242 				    (in_port_t)(connp)->conn_lport,	\
   1243 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1244 				    (in_port_t)(connp)->conn_fport, NULL); \
   1245 			}						\
   1246 		} else {						\
   1247 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1248 			    &(connp)->conn_laddr_v6)) {			\
   1249 				(*cl_inet_disconnect)(			\
   1250 				    (connp)->conn_netstack->netstack_stackid,\
   1251 				    IPPROTO_TCP, AF_INET6,		\
   1252 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1253 				    (in_port_t)(connp)->conn_lport,	\
   1254 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1255 				    (in_port_t)(connp)->conn_fport, NULL); \
   1256 			}						\
   1257 		}							\
   1258 	}								\
   1259 }
   1260 
   1261 /*
   1262  * Cluster networking hook for traversing current connection list.
   1263  * This routine is used to extract the current list of live connections
   1264  * which must continue to be dispatched to this node.
   1265  */
   1266 int cl_tcp_walk_list(netstackid_t stack_id,
   1267     int (*callback)(cl_tcp_info_t *, void *), void *arg);
   1268 
   1269 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
   1270     void *arg, tcp_stack_t *tcps);
   1271 
   1272 static void
   1273 tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
   1274 {
   1275 	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
   1276 
   1277 	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
   1278 		conn_t *connp = tcp->tcp_connp;
   1279 		struct sock_proto_props sopp;
   1280 
   1281 		/*
   1282 		 * only increase rcvthresh upto default_threshold
   1283 		 */
   1284 		if (new_rcvthresh > default_threshold)
   1285 			new_rcvthresh = default_threshold;
   1286 
   1287 		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
   1288 		sopp.sopp_rcvthresh = new_rcvthresh;
   1289 
   1290 		(*connp->conn_upcalls->su_set_proto_props)
   1291 		    (connp->conn_upper_handle, &sopp);
   1292 	}
   1293 }
   1294 /*
   1295  * Figure out the value of window scale option.  Note that the rwnd is
   1296  * ASSUMED to be rounded up to the nearest MSS before the calculation.
   1297  * We cannot find the scale value and then do a round up of tcp_rwnd
   1298  * because the scale value may not be correct after that.
   1299  *
   1300  * Set the compiler flag to make this function inline.
   1301  */
   1302 static void
   1303 tcp_set_ws_value(tcp_t *tcp)
   1304 {
   1305 	int i;
   1306 	uint32_t rwnd = tcp->tcp_rwnd;
   1307 
   1308 	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
   1309 	    i++, rwnd >>= 1)
   1310 		;
   1311 	tcp->tcp_rcv_ws = i;
   1312 }
   1313 
   1314 /*
   1315  * Remove a connection from the list of detached TIME_WAIT connections.
   1316  * It returns B_FALSE if it can't remove the connection from the list
   1317  * as the connection has already been removed from the list due to an
   1318  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
   1319  */
   1320 static boolean_t
   1321 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
   1322 {
   1323 	boolean_t	locked = B_FALSE;
   1324 
   1325 	if (tcp_time_wait == NULL) {
   1326 		tcp_time_wait = *((tcp_squeue_priv_t **)
   1327 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
   1328 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1329 		locked = B_TRUE;
   1330 	} else {
   1331 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
   1332 	}
   1333 
   1334 	if (tcp->tcp_time_wait_expire == 0) {
   1335 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1336 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1337 		if (locked)
   1338 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1339 		return (B_FALSE);
   1340 	}
   1341 	ASSERT(TCP_IS_DETACHED(tcp));
   1342 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1343 
   1344 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
   1345 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1346 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
   1347 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
   1348 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
   1349 			    NULL;
   1350 		} else {
   1351 			tcp_time_wait->tcp_time_wait_tail = NULL;
   1352 		}
   1353 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
   1354 		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
   1355 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1356 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
   1357 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1358 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
   1359 	} else {
   1360 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
   1361 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
   1362 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
   1363 		    tcp->tcp_time_wait_next;
   1364 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
   1365 		    tcp->tcp_time_wait_prev;
   1366 	}
   1367 	tcp->tcp_time_wait_next = NULL;
   1368 	tcp->tcp_time_wait_prev = NULL;
   1369 	tcp->tcp_time_wait_expire = 0;
   1370 
   1371 	if (locked)
   1372 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1373 	return (B_TRUE);
   1374 }
   1375 
   1376 /*
   1377  * Add a connection to the list of detached TIME_WAIT connections
   1378  * and set its time to expire.
   1379  */
   1380 static void
   1381 tcp_time_wait_append(tcp_t *tcp)
   1382 {
   1383 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1384 	tcp_squeue_priv_t *tcp_time_wait =
   1385 	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
   1386 	    SQPRIVATE_TCP));
   1387 
   1388 	tcp_timers_stop(tcp);
   1389 
   1390 	/* Freed above */
   1391 	ASSERT(tcp->tcp_timer_tid == 0);
   1392 	ASSERT(tcp->tcp_ack_tid == 0);
   1393 
   1394 	/* must have happened at the time of detaching the tcp */
   1395 	ASSERT(tcp->tcp_ptpahn == NULL);
   1396 	ASSERT(tcp->tcp_flow_stopped == 0);
   1397 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1398 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1399 	ASSERT(tcp->tcp_time_wait_expire == NULL);
   1400 	ASSERT(tcp->tcp_listener == NULL);
   1401 
   1402 	tcp->tcp_time_wait_expire = ddi_get_lbolt();
   1403 	/*
   1404 	 * The value computed below in tcp->tcp_time_wait_expire may
   1405 	 * appear negative or wrap around. That is ok since our
   1406 	 * interest is only in the difference between the current lbolt
   1407 	 * value and tcp->tcp_time_wait_expire. But the value should not
   1408 	 * be zero, since it means the tcp is not in the TIME_WAIT list.
   1409 	 * The corresponding comparison in tcp_time_wait_collector() uses
   1410 	 * modular arithmetic.
   1411 	 */
   1412 	tcp->tcp_time_wait_expire +=
   1413 	    drv_usectohz(tcps->tcps_time_wait_interval * 1000);
   1414 	if (tcp->tcp_time_wait_expire == 0)
   1415 		tcp->tcp_time_wait_expire = 1;
   1416 
   1417 	ASSERT(TCP_IS_DETACHED(tcp));
   1418 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1419 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1420 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1421 	TCP_DBGSTAT(tcps, tcp_time_wait);
   1422 
   1423 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1424 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
   1425 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
   1426 		tcp_time_wait->tcp_time_wait_head = tcp;
   1427 	} else {
   1428 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1429 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
   1430 		    TCPS_TIME_WAIT);
   1431 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
   1432 		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
   1433 	}
   1434 	tcp_time_wait->tcp_time_wait_tail = tcp;
   1435 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1436 }
   1437 
   1438 /* ARGSUSED */
   1439 void
   1440 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   1441 {
   1442 	conn_t	*connp = (conn_t *)arg;
   1443 	tcp_t	*tcp = connp->conn_tcp;
   1444 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1445 
   1446 	ASSERT(tcp != NULL);
   1447 	if (tcp->tcp_state == TCPS_CLOSED) {
   1448 		return;
   1449 	}
   1450 
   1451 	ASSERT((connp->conn_family == AF_INET &&
   1452 	    connp->conn_ipversion == IPV4_VERSION) ||
   1453 	    (connp->conn_family == AF_INET6 &&
   1454 	    (connp->conn_ipversion == IPV4_VERSION ||
   1455 	    connp->conn_ipversion == IPV6_VERSION)));
   1456 	ASSERT(!tcp->tcp_listener);
   1457 
   1458 	TCP_STAT(tcps, tcp_time_wait_reap);
   1459 	ASSERT(TCP_IS_DETACHED(tcp));
   1460 
   1461 	/*
   1462 	 * Because they have no upstream client to rebind or tcp_close()
   1463 	 * them later, we axe the connection here and now.
   1464 	 */
   1465 	tcp_close_detached(tcp);
   1466 }
   1467 
   1468 /*
   1469  * Remove cached/latched IPsec references.
   1470  */
   1471 void
   1472 tcp_ipsec_cleanup(tcp_t *tcp)
   1473 {
   1474 	conn_t		*connp = tcp->tcp_connp;
   1475 
   1476 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1477 
   1478 	if (connp->conn_latch != NULL) {
   1479 		IPLATCH_REFRELE(connp->conn_latch);
   1480 		connp->conn_latch = NULL;
   1481 	}
   1482 	if (connp->conn_latch_in_policy != NULL) {
   1483 		IPPOL_REFRELE(connp->conn_latch_in_policy);
   1484 		connp->conn_latch_in_policy = NULL;
   1485 	}
   1486 	if (connp->conn_latch_in_action != NULL) {
   1487 		IPACT_REFRELE(connp->conn_latch_in_action);
   1488 		connp->conn_latch_in_action = NULL;
   1489 	}
   1490 	if (connp->conn_policy != NULL) {
   1491 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
   1492 		connp->conn_policy = NULL;
   1493 	}
   1494 }
   1495 
   1496 /*
   1497  * Cleaup before placing on free list.
   1498  * Disassociate from the netstack/tcp_stack_t since the freelist
   1499  * is per squeue and not per netstack.
   1500  */
   1501 void
   1502 tcp_cleanup(tcp_t *tcp)
   1503 {
   1504 	mblk_t		*mp;
   1505 	tcp_sack_info_t	*tcp_sack_info;
   1506 	conn_t		*connp = tcp->tcp_connp;
   1507 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1508 	netstack_t	*ns = tcps->tcps_netstack;
   1509 	mblk_t		*tcp_rsrv_mp;
   1510 
   1511 	tcp_bind_hash_remove(tcp);
   1512 
   1513 	/* Cleanup that which needs the netstack first */
   1514 	tcp_ipsec_cleanup(tcp);
   1515 	ixa_cleanup(connp->conn_ixa);
   1516 
   1517 	if (connp->conn_ht_iphc != NULL) {
   1518 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   1519 		connp->conn_ht_iphc = NULL;
   1520 		connp->conn_ht_iphc_allocated = 0;
   1521 		connp->conn_ht_iphc_len = 0;
   1522 		connp->conn_ht_ulp = NULL;
   1523 		connp->conn_ht_ulp_len = 0;
   1524 		tcp->tcp_ipha = NULL;
   1525 		tcp->tcp_ip6h = NULL;
   1526 		tcp->tcp_tcpha = NULL;
   1527 	}
   1528 
   1529 	/* We clear any IP_OPTIONS and extension headers */
   1530 	ip_pkt_free(&connp->conn_xmit_ipp);
   1531 
   1532 	tcp_free(tcp);
   1533 
   1534 	/* Release any SSL context */
   1535 	if (tcp->tcp_kssl_ent != NULL) {
   1536 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   1537 		tcp->tcp_kssl_ent = NULL;
   1538 	}
   1539 
   1540 	if (tcp->tcp_kssl_ctx != NULL) {
   1541 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   1542 		tcp->tcp_kssl_ctx = NULL;
   1543 	}
   1544 	tcp->tcp_kssl_pending = B_FALSE;
   1545 
   1546 	/*
   1547 	 * Since we will bzero the entire structure, we need to
   1548 	 * remove it and reinsert it in global hash list. We
   1549 	 * know the walkers can't get to this conn because we
   1550 	 * had set CONDEMNED flag earlier and checked reference
   1551 	 * under conn_lock so walker won't pick it and when we
   1552 	 * go the ipcl_globalhash_remove() below, no walker
   1553 	 * can get to it.
   1554 	 */
   1555 	ipcl_globalhash_remove(connp);
   1556 
   1557 	/* Save some state */
   1558 	mp = tcp->tcp_timercache;
   1559 
   1560 	tcp_sack_info = tcp->tcp_sack_info;
   1561 	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
   1562 
   1563 	if (connp->conn_cred != NULL) {
   1564 		crfree(connp->conn_cred);
   1565 		connp->conn_cred = NULL;
   1566 	}
   1567 	ipcl_conn_cleanup(connp);
   1568 	connp->conn_flags = IPCL_TCPCONN;
   1569 
   1570 	/*
   1571 	 * Now it is safe to decrement the reference counts.
   1572 	 * This might be the last reference on the netstack
   1573 	 * in which case it will cause the freeing of the IP Instance.
   1574 	 */
   1575 	connp->conn_netstack = NULL;
   1576 	connp->conn_ixa->ixa_ipst = NULL;
   1577 	netstack_rele(ns);
   1578 	ASSERT(tcps != NULL);
   1579 	tcp->tcp_tcps = NULL;
   1580 
   1581 	bzero(tcp, sizeof (tcp_t));
   1582 
   1583 	/* restore the state */
   1584 	tcp->tcp_timercache = mp;
   1585 
   1586 	tcp->tcp_sack_info = tcp_sack_info;
   1587 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   1588 
   1589 	tcp->tcp_connp = connp;
   1590 
   1591 	ASSERT(connp->conn_tcp == tcp);
   1592 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1593 	connp->conn_state_flags = CONN_INCIPIENT;
   1594 	ASSERT(connp->conn_proto == IPPROTO_TCP);
   1595 	ASSERT(connp->conn_ref == 1);
   1596 }
   1597 
   1598 /*
   1599  * Blows away all tcps whose TIME_WAIT has expired. List traversal
   1600  * is done forwards from the head.
   1601  * This walks all stack instances since
   1602  * tcp_time_wait remains global across all stacks.
   1603  */
   1604 /* ARGSUSED */
   1605 void
   1606 tcp_time_wait_collector(void *arg)
   1607 {
   1608 	tcp_t *tcp;
   1609 	clock_t now;
   1610 	mblk_t *mp;
   1611 	conn_t *connp;
   1612 	kmutex_t *lock;
   1613 	boolean_t removed;
   1614 
   1615 	squeue_t *sqp = (squeue_t *)arg;
   1616 	tcp_squeue_priv_t *tcp_time_wait =
   1617 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   1618 
   1619 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1620 	tcp_time_wait->tcp_time_wait_tid = 0;
   1621 
   1622 	if (tcp_time_wait->tcp_free_list != NULL &&
   1623 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
   1624 		TCP_G_STAT(tcp_freelist_cleanup);
   1625 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
   1626 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   1627 			tcp->tcp_time_wait_next = NULL;
   1628 			tcp_time_wait->tcp_free_list_cnt--;
   1629 			ASSERT(tcp->tcp_tcps == NULL);
   1630 			CONN_DEC_REF(tcp->tcp_connp);
   1631 		}
   1632 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
   1633 	}
   1634 
   1635 	/*
   1636 	 * In order to reap time waits reliably, we should use a
   1637 	 * source of time that is not adjustable by the user -- hence
   1638 	 * the call to ddi_get_lbolt().
   1639 	 */
   1640 	now = ddi_get_lbolt();
   1641 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
   1642 		/*
   1643 		 * Compare times using modular arithmetic, since
   1644 		 * lbolt can wrapover.
   1645 		 */
   1646 		if ((now - tcp->tcp_time_wait_expire) < 0) {
   1647 			break;
   1648 		}
   1649 
   1650 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
   1651 		ASSERT(removed);
   1652 
   1653 		connp = tcp->tcp_connp;
   1654 		ASSERT(connp->conn_fanout != NULL);
   1655 		lock = &connp->conn_fanout->connf_lock;
   1656 		/*
   1657 		 * This is essentially a TW reclaim fast path optimization for
   1658 		 * performance where the timewait collector checks under the
   1659 		 * fanout lock (so that no one else can get access to the
   1660 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
   1661 		 * the classifier hash list. If ref count is indeed 2, we can
   1662 		 * just remove the conn under the fanout lock and avoid
   1663 		 * cleaning up the conn under the squeue, provided that
   1664 		 * clustering callbacks are not enabled. If clustering is
   1665 		 * enabled, we need to make the clustering callback before
   1666 		 * setting the CONDEMNED flag and after dropping all locks and
   1667 		 * so we forego this optimization and fall back to the slow
   1668 		 * path. Also please see the comments in tcp_closei_local
   1669 		 * regarding the refcnt logic.
   1670 		 *
   1671 		 * Since we are holding the tcp_time_wait_lock, its better
   1672 		 * not to block on the fanout_lock because other connections
   1673 		 * can't add themselves to time_wait list. So we do a
   1674 		 * tryenter instead of mutex_enter.
   1675 		 */
   1676 		if (mutex_tryenter(lock)) {
   1677 			mutex_enter(&connp->conn_lock);
   1678 			if ((connp->conn_ref == 2) &&
   1679 			    (cl_inet_disconnect == NULL)) {
   1680 				ipcl_hash_remove_locked(connp,
   1681 				    connp->conn_fanout);
   1682 				/*
   1683 				 * Set the CONDEMNED flag now itself so that
   1684 				 * the refcnt cannot increase due to any
   1685 				 * walker.
   1686 				 */
   1687 				connp->conn_state_flags |= CONN_CONDEMNED;
   1688 				mutex_exit(lock);
   1689 				mutex_exit(&connp->conn_lock);
   1690 				if (tcp_time_wait->tcp_free_list_cnt <
   1691 				    tcp_free_list_max_cnt) {
   1692 					/* Add to head of tcp_free_list */
   1693 					mutex_exit(
   1694 					    &tcp_time_wait->tcp_time_wait_lock);
   1695 					tcp_cleanup(tcp);
   1696 					ASSERT(connp->conn_latch == NULL);
   1697 					ASSERT(connp->conn_policy == NULL);
   1698 					ASSERT(tcp->tcp_tcps == NULL);
   1699 					ASSERT(connp->conn_netstack == NULL);
   1700 
   1701 					mutex_enter(
   1702 					    &tcp_time_wait->tcp_time_wait_lock);
   1703 					tcp->tcp_time_wait_next =
   1704 					    tcp_time_wait->tcp_free_list;
   1705 					tcp_time_wait->tcp_free_list = tcp;
   1706 					tcp_time_wait->tcp_free_list_cnt++;
   1707 					continue;
   1708 				} else {
   1709 					/* Do not add to tcp_free_list */
   1710 					mutex_exit(
   1711 					    &tcp_time_wait->tcp_time_wait_lock);
   1712 					tcp_bind_hash_remove(tcp);
   1713 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
   1714 					tcp_ipsec_cleanup(tcp);
   1715 					CONN_DEC_REF(tcp->tcp_connp);
   1716 				}
   1717 			} else {
   1718 				CONN_INC_REF_LOCKED(connp);
   1719 				mutex_exit(lock);
   1720 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1721 				mutex_exit(&connp->conn_lock);
   1722 				/*
   1723 				 * We can reuse the closemp here since conn has
   1724 				 * detached (otherwise we wouldn't even be in
   1725 				 * time_wait list). tcp_closemp_used can safely
   1726 				 * be changed without taking a lock as no other
   1727 				 * thread can concurrently access it at this
   1728 				 * point in the connection lifecycle.
   1729 				 */
   1730 
   1731 				if (tcp->tcp_closemp.b_prev == NULL)
   1732 					tcp->tcp_closemp_used = B_TRUE;
   1733 				else
   1734 					cmn_err(CE_PANIC,
   1735 					    "tcp_timewait_collector: "
   1736 					    "concurrent use of tcp_closemp: "
   1737 					    "connp %p tcp %p\n", (void *)connp,
   1738 					    (void *)tcp);
   1739 
   1740 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1741 				mp = &tcp->tcp_closemp;
   1742 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1743 				    tcp_timewait_output, connp, NULL,
   1744 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1745 			}
   1746 		} else {
   1747 			mutex_enter(&connp->conn_lock);
   1748 			CONN_INC_REF_LOCKED(connp);
   1749 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1750 			mutex_exit(&connp->conn_lock);
   1751 			/*
   1752 			 * We can reuse the closemp here since conn has
   1753 			 * detached (otherwise we wouldn't even be in
   1754 			 * time_wait list). tcp_closemp_used can safely
   1755 			 * be changed without taking a lock as no other
   1756 			 * thread can concurrently access it at this
   1757 			 * point in the connection lifecycle.
   1758 			 */
   1759 
   1760 			if (tcp->tcp_closemp.b_prev == NULL)
   1761 				tcp->tcp_closemp_used = B_TRUE;
   1762 			else
   1763 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
   1764 				    "concurrent use of tcp_closemp: "
   1765 				    "connp %p tcp %p\n", (void *)connp,
   1766 				    (void *)tcp);
   1767 
   1768 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1769 			mp = &tcp->tcp_closemp;
   1770 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1771 			    tcp_timewait_output, connp, NULL,
   1772 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1773 		}
   1774 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1775 	}
   1776 
   1777 	if (tcp_time_wait->tcp_free_list != NULL)
   1778 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
   1779 
   1780 	tcp_time_wait->tcp_time_wait_tid =
   1781 	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
   1782 	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
   1783 	    CALLOUT_FLAG_ROUNDUP);
   1784 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1785 }
   1786 
/*
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
 * on the acceptor STREAM and processed in tcp_accept_common().
 * Read the block comment on top of tcp_input_listener().
 *
 * listener is the endpoint the T_CONN_RES arrived on and mp is the
 * message itself; mp is always consumed, either as part of an error
 * ack or by being turned into the T_OK_ACK.
 *
 * Outline:
 *   - validate the message size and locate the acceptor named by
 *     ACCEPTOR_id (the listener itself, or a hash lookup);
 *   - find the eager on the listener's q whose sequence number matches
 *     SEQ_number;
 *   - pre-allocate the T_DISCON_IND mblk and build the T_OK_ACK with
 *     the eager's local address appended;
 *   - swap eager and acceptor with tcp_accept_swap(), process any
 *     options, and send the T_OK_ACK up the listener's read queue;
 *   - if a deferred T_CONN_IND is waiting on q0, move that eager to q
 *     and send its indication up;
 *   - dispose of the acceptor and queue tcp_accept_finish() on the
 *     eager's squeue.
 *
 * Failures before the swap are reported with tcp_err_ack() and return
 * immediately. An option failure after the swap sends the error ack
 * and then continues at 'finish'.
 */
static void
tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
	tcp_t		*acceptor;
	tcp_t		*eager;
	tcp_t   	*tcp;
	struct T_conn_res	*tcr;
	t_uscalar_t	acceptor_id;
	t_scalar_t	seqnum;
	mblk_t		*discon_mp = NULL;
	mblk_t		*ok_mp;
	mblk_t		*mp1;
	tcp_stack_t	*tcps = listener->tcp_tcps;
	conn_t		*econnp;

	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
		tcp_err_ack(listener, mp, TPROTO, 0);
		return;
	}
	tcr = (struct T_conn_res *)mp->b_rptr;

	/*
	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
	 * read side queue of the streams device underneath us i.e. the
	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
	 * look it up in the queue_hash.  Under LP64 it sends down the
	 * minor_t of the accepting endpoint.
	 *
	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
	 * fanout hash lock is held.
	 * This prevents any thread from entering the acceptor queue from
	 * below (since it has not been hard bound yet i.e. any inbound
	 * packets will arrive on the listener conn_t and
	 * go through the classifier).
	 * The CONN_INC_REF will prevent the acceptor from closing.
	 *
	 * XXX It is still possible for a tli application to send down data
	 * on the accepting stream while another thread calls t_accept.
	 * This should not be a problem for well-behaved applications since
	 * the T_OK_ACK is sent after the queue swapping is completed.
	 *
	 * If the accepting fd is the same as the listening fd, avoid
	 * queue hash lookup since that will return an eager listener in a
	 * already established state.
	 */
	acceptor_id = tcr->ACCEPTOR_id;
	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_acceptor_id == acceptor_id) {
		eager = listener->tcp_eager_next_q;
		/* only count how many T_CONN_INDs so don't count q0 */
		if ((listener->tcp_conn_req_cnt_q != 1) ||
		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADF, 0);
			return;
		}
		if (listener->tcp_conn_req_cnt_q0 != 0) {
			/* Throw away all the eagers on q0. */
			tcp_eager_cleanup(listener, 1);
		}
		if (listener->tcp_syn_defense) {
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache != NULL) {
				kmem_free(listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
		/*
		 * Transfer tcp_conn_req_max to the eager so that when
		 * a disconnect occurs we can revert the endpoint to the
		 * listen state.
		 */
		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
		/*
		 * Get a reference on the acceptor just like the
		 * tcp_acceptor_hash_lookup below.
		 */
		acceptor = listener;
		CONN_INC_REF(acceptor->tcp_connp);
	} else {
		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
		if (acceptor == NULL) {
			if (listener->tcp_connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_accept: did not find acceptor 0x%x\n",
				    acceptor_id);
			}
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
			return;
		}
		/*
		 * Verify acceptor state. The acceptable states for an acceptor
		 * include TCPS_IDLE and TCPS_BOUND.
		 */
		switch (acceptor->tcp_state) {
		case TCPS_IDLE:
			/* FALLTHRU */
		case TCPS_BOUND:
			break;
		default:
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TOUTSTATE, 0);
			return;
		}
	}

	/* The listener must be in TCPS_LISTEN */
	if (listener->tcp_state != TCPS_LISTEN) {
		CONN_DEC_REF(acceptor->tcp_connp);
		mutex_exit(&listener->tcp_eager_lock);
		tcp_err_ack(listener, mp, TOUTSTATE, 0);
		return;
	}

	/*
	 * Rendezvous with an eager connection request packet hanging off
	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
	 * tcp structure when the connection packet arrived in
	 * tcp_input_listener().
	 */
	seqnum = tcr->SEQ_number;
	eager = listener;
	do {
		eager = eager->tcp_eager_next_q;
		if (eager == NULL) {
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADSEQ, 0);
			return;
		}
	} while (eager->tcp_conn_req_seqnum != seqnum);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * At this point, both acceptor and listener have 2 ref
	 * that they begin with. Acceptor has one additional ref
	 * we placed in lookup while listener has 3 additional
	 * ref for being behind the squeue (tcp_accept() is
	 * done on listener's squeue); being in classifier hash;
	 * and eager's ref on listener.
	 */
	ASSERT(listener->tcp_connp->conn_ref >= 5);
	ASSERT(acceptor->tcp_connp->conn_ref >= 3);

	/*
	 * The eager at this point is set in its own squeue and
	 * could easily have been killed (tcp_accept_finish will
	 * deal with that) because of a TH_RST so we can only
	 * ASSERT for a single ref.
	 */
	ASSERT(eager->tcp_connp->conn_ref >= 1);

	/*
	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
	 * use it if something failed.
	 */
	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
	    sizeof (struct stroptions)), BPRI_HI);
	if (discon_mp == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}

	econnp = eager->tcp_connp;

	/* Hold a copy of mp, in case reallocb fails */
	if ((mp1 = copymsg(mp)) == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		freemsg(discon_mp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}

	/* From here on the T_CONN_RES is read from the copy, not from mp. */
	tcr = (struct T_conn_res *)mp1->b_rptr;

	/*
	 * This is an expanded version of mi_tpi_ok_ack_alloc()
	 * which allocates a larger mblk and appends the new
	 * local address to the ok_ack.  The address is copied by
	 * soaccept() for getsockname().
	 */
	{
		int extra;

		extra = (econnp->conn_family == AF_INET) ?
		    sizeof (sin_t) : sizeof (sin6_t);

		/*
		 * Try to re-use mp, if possible.  Otherwise, allocate
		 * an mblk and return it as ok_mp.  In any case, mp
		 * is no longer usable upon return.
		 */
		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
			CONN_DEC_REF(acceptor->tcp_connp);
			CONN_DEC_REF(eager->tcp_connp);
			freemsg(discon_mp);
			/* Original mp has been freed by now, so use mp1 */
			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
			return;
		}

		mp = NULL;	/* We should never use mp after this point */

		/* Append the eager's local address after the T_OK_ACK. */
		switch (extra) {
		case sizeof (sin_t): {
			sin_t *sin = (sin_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin->sin_family = AF_INET;
			sin->sin_port = econnp->conn_lport;
			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
			break;
		}
		case sizeof (sin6_t): {
			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = econnp->conn_lport;
			sin6->sin6_addr = econnp->conn_laddr_v6;
			sin6->sin6_flowinfo = econnp->conn_flowinfo;
			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
				sin6->sin6_scope_id =
				    econnp->conn_ixa->ixa_scopeid;
			} else {
				sin6->sin6_scope_id = 0;
			}
			sin6->__sin6_src_id = 0;
			break;
		}
		default:
			break;
		}
		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
	}

	/*
	 * If there are no options we know that the T_CONN_RES will
	 * succeed. However, we can't send the T_OK_ACK upstream until
	 * the tcp_accept_swap is done since it would be dangerous to
	 * let the application start using the new fd prior to the swap.
	 */
	tcp_accept_swap(listener, acceptor, eager);

	/*
	 * tcp_accept_swap unlinks eager from listener but does not drop
	 * the eager's reference on the listener.
	 */
	ASSERT(eager->tcp_listener == NULL);
	ASSERT(listener->tcp_connp->conn_ref >= 5);

	/*
	 * The eager is now associated with its own queue. Insert in
	 * the hash so that the connection can be reused for a future
	 * T_CONN_RES.
	 */
	tcp_acceptor_hash_insert(acceptor_id, eager);

	/*
	 * We now do the processing of options with T_CONN_RES.
	 * We delay till now since we wanted to have queue to pass to
	 * option processing routines that points back to the right
	 * instance structure which does not happen until after
	 * tcp_accept_swap().
	 *
	 * Note:
	 * The sanity of the logic here assumes that whatever options
	 * are appropriate to inherit from listener=>eager are done
	 * before this point, and whatever were to be overridden (or not)
	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
	 * This may not be true at this point in time but can be fixed
	 * independently. This option processing code starts with
	 * the instantiated acceptor instance and the final queue at
	 * this point.
	 */

	if (tcr->OPT_length != 0) {
		/* Options to process */
		int t_error = 0;
		int sys_error = 0;
		int do_disconnect = 0;

		if (tcp_conprim_opt_process(eager, mp1,
		    &do_disconnect, &t_error, &sys_error) < 0) {
			eager->tcp_accept_error = 1;
			if (do_disconnect) {
				/*
				 * An option failed which does not allow
				 * connection to be accepted.
				 *
				 * We allow T_CONN_RES to succeed and
				 * put a T_DISCON_IND on the eager queue.
				 */
				ASSERT(t_error == 0 && sys_error == 0);
				eager->tcp_send_discon_ind = 1;
			} else {
				ASSERT(t_error != 0);
				freemsg(ok_mp);
				/*
				 * Original mp was either freed or set
				 * to ok_mp above, so use mp1 instead.
				 */
				tcp_err_ack(listener, mp1, t_error, sys_error);
				goto finish;
			}
		}
		/*
		 * Most likely success in setting options (except if
		 * eager->tcp_send_discon_ind set).
		 * mp1 option buffer represented by OPT_length/offset
		 * potentially modified and contains results of setting
		 * options at this point
		 */
	}

	/* We no longer need mp1, since all options processing has passed */
	freemsg(mp1);

	putnext(listener->tcp_connp->conn_rq, ok_mp);

	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
		tcp_t	*tail;
		mblk_t	*conn_ind;

		/*
		 * This path should not be executed if listener and
		 * acceptor streams are the same.
		 */
		ASSERT(listener != acceptor);

		tcp = listener->tcp_eager_prev_q0;
		/*
		 * listener->tcp_eager_prev_q0 points to the TAIL of the
		 * deferred T_conn_ind queue. We need to get to the head of
		 * the queue in order to send up T_conn_ind the same order as
		 * how the 3WHS is completed.
		 */
		while (tcp != listener) {
			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
				break;
			else
				tcp = tcp->tcp_eager_prev_q0;
		}
		ASSERT(tcp != listener);
		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
		ASSERT(conn_ind != NULL);
		tcp->tcp_conn.tcp_eager_conn_ind = NULL;

		/* Move from q0 to q */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;
		tcp->tcp_conn_def_q0 = B_FALSE;

		/* Make sure the tcp isn't in the list of droppables */
		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
		    tcp->tcp_eager_prev_drop_q0 == NULL);

		/*
		 * Insert at end of the queue because sockfs sends
		 * down T_CONN_RES in chronological order. Leaving
		 * the older conn indications at front of the queue
		 * helps reducing search time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		/* Drop the eager lock before sending the indication up. */
		mutex_exit(&listener->tcp_eager_lock);
		putnext(tcp->tcp_connp->conn_rq, conn_ind);
	} else {
		mutex_exit(&listener->tcp_eager_lock);
	}

	/*
	 * Done with the acceptor - free it
	 *
	 * Note: from this point on, no access to listener should be made
	 * as listener can be equal to acceptor.
	 */
finish:
	ASSERT(acceptor->tcp_detached);
	acceptor->tcp_connp->conn_rq = NULL;
	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
	acceptor->tcp_connp->conn_wq = NULL;
	(void) tcp_clean_death(acceptor, 0, 2);
	/* Drop the acceptor reference taken at lookup time. */
	CONN_DEC_REF(acceptor->tcp_connp);

	/*
	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
	 *
	 * It will update the setting for sockfs/stream head and also take
	 * care of any data that arrived before accept() was called.
	 * In case we already received a FIN then tcp_accept_finish will send up
	 * the ordrel. It will also send up a window update if the window
	 * has opened up.
	 */

	/*
	 * XXX: we currently have a problem if XTI application closes the
	 * acceptor stream in between. This problem exists in on10-gate also
	 * and is well known but nothing can be done short of major rewrite
	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
	 * eager same squeue as listener (we can distinguish non socket
	 * listeners at the time of handling a SYN in tcp_input_listener)
	 * and do most of the work that tcp_accept_finish does here itself
	 * and then get behind the acceptor squeue to access the acceptor
	 * queue.
	 */
	/*
	 * We already have a ref on tcp so no need to do one before squeue_enter
	 */
	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
	    SQTAG_TCP_ACCEPT_FINISH);
}
   2230 
   2231 /*
   2232  * Swap information between the eager and acceptor for a TLI/XTI client.
   2233  * The sockfs accept is done on the acceptor stream and control goes
   2234  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
   2235  * called. In either case, both the eager and listener are in their own
   2236  * perimeter (squeue) and the code has to deal with potential race.
   2237  *
   2238  * See the block comment on top of tcp_accept() and tcp_tli_accept().
   2239  */
   2240 static void
   2241 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
   2242 {
   2243 	conn_t	*econnp, *aconnp;
   2244 
   2245 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
   2246 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
   2247 	ASSERT(!TCP_IS_SOCKET(acceptor));
   2248 	ASSERT(!TCP_IS_SOCKET(eager));
   2249 	ASSERT(!TCP_IS_SOCKET(listener));
   2250 
   2251 	/*
   2252 	 * Trusted Extensions may need to use a security label that is
   2253 	 * different from the acceptor's label on MLP and MAC-Exempt
   2254 	 * sockets. If this is the case, the required security label
   2255 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
   2256 	 * acceptor stream refer to econnp we atomatically get that label.
   2257 	 */
   2258 
   2259 	acceptor->tcp_detached = B_TRUE;
   2260 	/*
   2261 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
   2262 	 * the acceptor id.
   2263 	 */
   2264 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
   2265 
   2266 	/* remove eager from listen list... */
   2267 	mutex_enter(&listener->tcp_eager_lock);
   2268 	tcp_eager_unlink(eager);
   2269 	ASSERT(eager->tcp_eager_next_q == NULL &&
   2270 	    eager->tcp_eager_last_q == NULL);
   2271 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
   2272 	    eager->tcp_eager_prev_q0 == NULL);
   2273 	mutex_exit(&listener->tcp_eager_lock);
   2274 
   2275 	econnp = eager->tcp_connp;
   2276 	aconnp = acceptor->tcp_connp;
   2277 	econnp->conn_rq = aconnp->conn_rq;
   2278 	econnp->conn_wq = aconnp->conn_wq;
   2279 	econnp->conn_rq->q_ptr = econnp;
   2280 	econnp->conn_wq->q_ptr = econnp;
   2281 
   2282 	/*
   2283 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
   2284 	 * which might be a different squeue from our peer TCP instance.
   2285 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
   2286 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
   2287 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
   2288 	 * above reach global visibility prior to the clearing of tcp_detached.
   2289 	 */
   2290 	membar_producer();
   2291 	eager->tcp_detached = B_FALSE;
   2292 
   2293 	ASSERT(eager->tcp_ack_tid == 0);
   2294 
   2295 	econnp->conn_dev = aconnp->conn_dev;
   2296 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
   2297 
   2298 	ASSERT(econnp->conn_minor_arena != NULL);
   2299 	if (econnp->conn_cred != NULL)
   2300 		crfree(econnp->conn_cred);
   2301 	econnp->conn_cred = aconnp->conn_cred;
   2302 	aconnp->conn_cred = NULL;
   2303 	econnp->conn_cpid = aconnp->conn_cpid;
   2304 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
   2305 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
   2306 
   2307 	econnp->conn_zoneid = aconnp->conn_zoneid;
   2308 	econnp->conn_allzones = aconnp->conn_allzones;
   2309 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
   2310 
   2311 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
   2312 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
   2313 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
   2314 
   2315 	/* Do the IPC initialization */
   2316 	CONN_INC_REF(econnp);
   2317 
   2318 	/* Done with old IPC. Drop its ref on its connp */
   2319 	CONN_DEC_REF(aconnp);
   2320 }
   2321 
   2322 
   2323 /*
   2324  * Adapt to the information, such as rtt and rtt_sd, provided from the
   2325  * DCE and IRE maintained by IP.
   2326  *
   2327  * Checks for multicast and broadcast destination address.
   2328  * Returns zero if ok; an errno on failure.
   2329  *
   2330  * Note that the MSS calculation here is based on the info given in
   2331  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
   2332  * will be handled in tcp_input_data() when TCP knows which options to use.
   2333  *
   2334  * Note on how TCP gets its parameters for a connection.
   2335  *
   2336  * When a tcp_t structure is allocated, it gets all the default parameters.
   2337  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
   2338  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
   2339  * default.
   2340  *
   2341  * An incoming SYN with a multicast or broadcast destination address is dropped
   2342  * in ip_fanout_v4/v6.
   2343  *
   2344  * An incoming SYN with a multicast or broadcast source address is always
   2345  * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
   2346  * conn_connect.
   2347  * The same logic in tcp_set_destination also serves to
   2348  * reject an attempt to connect to a broadcast or multicast (destination)
   2349  * address.
   2350  */
   2351 static int
   2352 tcp_set_destination(tcp_t *tcp)
   2353 {
   2354 	uint32_t	mss_max;
   2355 	uint32_t	mss;
   2356 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
   2357 	conn_t		*connp = tcp->tcp_connp;
   2358 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2359 	iulp_t		uinfo;
   2360 	int		error;
   2361 	uint32_t	flags;
   2362 
   2363 	flags = IPDF_LSO | IPDF_ZCOPY;
   2364 	/*
   2365 	 * Make sure we have a dce for the destination to avoid dce_ident
   2366 	 * contention for connected sockets.
   2367 	 */
   2368 	flags |= IPDF_UNIQUE_DCE;
   2369 
   2370 	if (!tcps->tcps_ignore_path_mtu)
   2371 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
   2372 
   2373 	/* Use conn_lock to satify ASSERT; tcp is already serialized */
   2374 	mutex_enter(&connp->conn_lock);
   2375 	error = conn_connect(connp, &uinfo, flags);
   2376 	mutex_exit(&connp->conn_lock);
   2377 	if (error != 0)
   2378 		return (error);
   2379 
   2380 	error = tcp_build_hdrs(tcp);
   2381 	if (error != 0)
   2382 		return (error);
   2383 
   2384 	tcp->tcp_localnet = uinfo.iulp_localnet;
   2385 
   2386 	if (uinfo.iulp_rtt != 0) {
   2387 		clock_t	rto;
   2388 
   2389 		tcp->tcp_rtt_sa = uinfo.iulp_rtt;
   2390 		tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
   2391 		rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   2392 		    tcps->tcps_rexmit_interval_extra +
   2393 		    (tcp->tcp_rtt_sa >> 5);
   2394 
   2395 		if (rto > tcps->tcps_rexmit_interval_max) {
   2396 			tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
   2397 		} else if (rto < tcps->tcps_rexmit_interval_min) {
   2398 			tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   2399 		} else {
   2400 			tcp->tcp_rto = rto;
   2401 		}
   2402 	}
   2403 	if (uinfo.iulp_ssthresh != 0)
   2404 		tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
   2405 	else
   2406 		tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   2407 	if (uinfo.iulp_spipe > 0) {
   2408 		connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
   2409 		    tcps->tcps_max_buf);
   2410 		if (tcps->tcps_snd_lowat_fraction != 0) {
   2411 			connp->conn_sndlowat = connp->conn_sndbuf /
   2412 			    tcps->tcps_snd_lowat_fraction;
   2413 		}
   2414 		(void) tcp_maxpsz_set(tcp, B_TRUE);
   2415 	}
   2416 	/*
   2417 	 * Note that up till now, acceptor always inherits receive
   2418 	 * window from the listener.  But if there is a metrics
   2419 	 * associated with a host, we should use that instead of
   2420 	 * inheriting it from listener. Thus we need to pass this
   2421 	 * info back to the caller.
   2422 	 */
   2423 	if (uinfo.iulp_rpipe > 0) {
   2424 		tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
   2425 		    tcps->tcps_max_buf);
   2426 	}
   2427 
   2428 	if (uinfo.iulp_rtomax > 0) {
   2429 		tcp->tcp_second_timer_threshold =
   2430 		    uinfo.iulp_rtomax;
   2431 	}
   2432 
   2433 	/*
   2434 	 * Use the metric option settings, iulp_tstamp_ok and
   2435 	 * iulp_wscale_ok, only for active open. What this means
   2436 	 * is that if the other side uses timestamp or window
   2437 	 * scale option, TCP will also use those options. That
   2438 	 * is for passive open.  If the application sets a
   2439 	 * large window, window scale is enabled regardless of
   2440 	 * the value in iulp_wscale_ok.  This is the behavior
   2441 	 * since 2.6.  So we keep it.
   2442 	 * The only case left in passive open processing is the
   2443 	 * check for SACK.
   2444 	 * For ECN, it should probably be like SACK.  But the
   2445 	 * current value is binary, so we treat it like the other
   2446 	 * cases.  The metric only controls active open.For passive
   2447 	 * open, the ndd param, tcp_ecn_permitted, controls the
   2448 	 * behavior.
   2449 	 */
   2450 	if (!tcp_detached) {
   2451 		/*
   2452 		 * The if check means that the following can only
   2453 		 * be turned on by the metrics only IRE, but not off.
   2454 		 */
   2455 		if (uinfo.iulp_tstamp_ok)
   2456 			tcp->tcp_snd_ts_ok = B_TRUE;
   2457 		if (uinfo.iulp_wscale_ok)
   2458 			tcp->tcp_snd_ws_ok = B_TRUE;
   2459 		if (uinfo.iulp_sack == 2)
   2460 			tcp->tcp_snd_sack_ok = B_TRUE;
   2461 		if (uinfo.iulp_ecn_ok)
   2462 			tcp->tcp_ecn_ok = B_TRUE;
   2463 	} else {
   2464 		/*
   2465 		 * Passive open.
   2466 		 *
   2467 		 * As above, the if check means that SACK can only be
   2468 		 * turned on by the metric only IRE.
   2469 		 */
   2470 		if (uinfo.iulp_sack > 0) {
   2471 			tcp->tcp_snd_sack_ok = B_TRUE;
   2472 		}
   2473 	}
   2474 
   2475 	/*
   2476 	 * XXX Note that currently, iulp_mtu can be as small as 68
   2477 	 * because of PMTUd.  So tcp_mss may go to negative if combined
   2478 	 * length of all those options exceeds 28 bytes.  But because
   2479 	 * of the tcp_mss_min check below, we may not have a problem if
   2480 	 * tcp_mss_min is of a reasonable value.  The default is 1 so
   2481 	 * the negative problem still exists.  And the check defeats PMTUd.
   2482 	 * In fact, if PMTUd finds that the MSS should be smaller than
   2483 	 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
   2484 	 * value.
   2485 	 *
   2486 	 * We do not deal with that now.  All those problems related to
   2487 	 * PMTUd will be fixed later.
   2488 	 */
   2489 	ASSERT(uinfo.iulp_mtu != 0);
   2490 	mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
   2491 
   2492 	/* Sanity check for MSS value. */
   2493 	if (connp->conn_ipversion == IPV4_VERSION)
   2494 		mss_max = tcps->tcps_mss_max_ipv4;
   2495 	else
   2496 		mss_max = tcps->tcps_mss_max_ipv6;
   2497 
   2498 	if (tcp->tcp_ipsec_overhead == 0)
   2499 		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
   2500 
   2501 	mss -= tcp->tcp_ipsec_overhead;
   2502 
   2503 	if (mss < tcps->tcps_mss_min)
   2504 		mss = tcps->tcps_mss_min;
   2505 	if (mss > mss_max)
   2506 		mss = mss_max;
   2507 
   2508 	/* Note that this is the maximum MSS, excluding all options. */
   2509 	tcp->tcp_mss = mss;
   2510 
   2511 	/*
   2512 	 * Update the tcp connection with LSO capability.
   2513 	 */
   2514 	tcp_update_lso(tcp, connp->conn_ixa);
   2515 
   2516 	/*
   2517 	 * Initialize the ISS here now that we have the full connection ID.
   2518 	 * The RFC 1948 method of initial sequence number generation requires
   2519 	 * knowledge of the full connection ID before setting the ISS.
   2520 	 */
   2521 	tcp_iss_init(tcp);
   2522 
   2523 	tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
   2524 
   2525 	/*
   2526 	 * Make sure that conn is not marked incipient
   2527 	 * for incoming connections. A blind
   2528 	 * removal of incipient flag is cheaper than
   2529 	 * check and removal.
   2530 	 */
   2531 	mutex_enter(&connp->conn_lock);
   2532 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   2533 	mutex_exit(&connp->conn_lock);
   2534 	return (0);
   2535 }
   2536 
   2537 static void
   2538 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
   2539 {
   2540 	int	error;
   2541 	conn_t	*connp = tcp->tcp_connp;
   2542 	struct sockaddr	*sa;
   2543 	mblk_t  *mp1;
   2544 	struct T_bind_req *tbr;
   2545 	int	backlog;
   2546 	socklen_t	len;
   2547 	sin_t	*sin;
   2548 	sin6_t	*sin6;
   2549 	cred_t		*cr;
   2550 
   2551 	/*
   2552 	 * All Solaris components should pass a db_credp
   2553 	 * for this TPI message, hence we ASSERT.
   2554 	 * But in case there is some other M_PROTO that looks
   2555 	 * like a TPI message sent by some other kernel
   2556 	 * component, we check and return an error.
   2557 	 */
   2558 	cr = msg_getcred(mp, NULL);
   2559 	ASSERT(cr != NULL);
   2560 	if (cr == NULL) {
   2561 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   2562 		return;
   2563 	}
   2564 
   2565 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   2566 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
   2567 		if (connp->conn_debug) {
   2568 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2569 			    "tcp_tpi_bind: bad req, len %u",
   2570 			    (uint_t)(mp->b_wptr - mp->b_rptr));
   2571 		}
   2572 		tcp_err_ack(tcp, mp, TPROTO, 0);
   2573 		return;
   2574 	}
   2575 	/* Make sure the largest address fits */
   2576 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
   2577 	if (mp1 == NULL) {
   2578 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   2579 		return;
   2580 	}
   2581 	mp = mp1;
   2582 	tbr = (struct T_bind_req *)mp->b_rptr;
   2583 
   2584 	backlog = tbr->CONIND_number;
   2585 	len = tbr->ADDR_length;
   2586 
   2587 	switch (len) {
   2588 	case 0:		/* request for a generic port */
   2589 		tbr->ADDR_offset = sizeof (struct T_bind_req);
   2590 		if (connp->conn_family == AF_INET) {
   2591 			tbr->ADDR_length = sizeof (sin_t);
   2592 			sin = (sin_t *)&tbr[1];
   2593 			*sin = sin_null;
   2594 			sin->sin_family = AF_INET;
   2595 			sa = (struct sockaddr *)sin;
   2596 			len = sizeof (sin_t);
   2597 			mp->b_wptr = (uchar_t *)&sin[1];
   2598 		} else {
   2599 			ASSERT(connp->conn_family == AF_INET6);
   2600 			tbr->ADDR_length = sizeof (sin6_t);
   2601 			sin6 = (sin6_t *)&tbr[1];
   2602 			*sin6 = sin6_null;
   2603 			sin6->sin6_family = AF_INET6;
   2604 			sa = (struct sockaddr *)sin6;
   2605 			len = sizeof (sin6_t);
   2606 			mp->b_wptr = (uchar_t *)&sin6[1];
   2607 		}
   2608 		break;
   2609 
   2610 	case sizeof (sin_t):    /* Complete IPv4 address */
   2611 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
   2612 		    sizeof (sin_t));
   2613 		break;
   2614 
   2615 	case sizeof (sin6_t): /* Complete IPv6 address */
   2616 		sa = (struct sockaddr *)mi_offset_param(mp,
   2617 		    tbr->ADDR_offset, sizeof (sin6_t));
   2618 		break;
   2619 
   2620 	default:
   2621 		if (connp->conn_debug) {
   2622 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2623 			    "tcp_tpi_bind: bad address length, %d",
   2624 			    tbr->ADDR_length);
   2625 		}
   2626 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   2627 		return;
   2628 	}
   2629 
   2630 	if (backlog > 0) {
   2631 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
   2632 		    tbr->PRIM_type != O_T_BIND_REQ);
   2633 	} else {
   2634 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
   2635 		    tbr->PRIM_type != O_T_BIND_REQ);
   2636 	}
   2637 done:
   2638 	if (error > 0) {
   2639 		tcp_err_ack(tcp, mp, TSYSERR, error);
   2640 	} else if (error < 0) {
   2641 		tcp_err_ack(tcp, mp, -error, 0);
   2642 	} else {
   2643 		/*
   2644 		 * Update port information as sockfs/tpi needs it for checking
   2645 		 */
   2646 		if (connp->conn_family == AF_INET) {
   2647 			sin = (sin_t *)sa;
   2648 			sin->sin_port = connp->conn_lport;
   2649 		} else {
   2650 			sin6 = (sin6_t *)sa;
   2651 			sin6->sin6_port = connp->conn_lport;
   2652 		}
   2653 		mp->b_datap->db_type = M_PCPROTO;
   2654 		tbr->PRIM_type = T_BIND_ACK;
   2655 		putnext(connp->conn_rq, mp);
   2656 	}
   2657 }
   2658 
/*
 * Pick a local port for tcp, mark tcp bound to it and insert tcp in the
 * bind hash table.
 *
 * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it.  If not, return 0.
 *
 * If "bind_to_req_port_only" parameter is not set and
 * if the requested port number is available, return it.  If not, return
 * the first anonymous port we happen across.  If no anonymous ports are
 * available, return 0.
 *
 * Parameters:
 *	tcp			endpoint being bound
 *	port			requested port, host byte order (it is passed
 *				through htons() before being compared/stored)
 *	laddr			the requested local address, if any
 *	reuseaddr		non-zero selects the SO_REUSEADDR conflict
 *				rules for the binding endpoint
 *	quick_connect		when set, an existing endpoint past
 *				TCPS_LISTEN whose remote port or address
 *				differs from ours is not treated as a conflict
 *	bind_to_req_port_only	try only "port"; never fall back
 *	user_specified		"port" was supplied by the user; such a port
 *				is not propagated to tcps_next_port_to_try
 *
 * Returns the bound port in host byte order, or 0 if none could be bound.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 */
static in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * The search for a free port is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		/* Candidate port in network byte order, as kept in conn_lport */
		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the first entry in the bucket that uses this port. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/*
		 * Walk the entries chained for this port.  A "break" leaves
		 * ltcp non-NULL, meaning a conflicting endpoint was found;
		 * running off the end (ltcp == NULL) means the port is free
		 * for us.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				/*
				 * Exclusive semantics: conflict unless both
				 * addresses are specified and different.
				 */
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP address
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* The candidate was busy: choose the next port to try. */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				/*
				 * From now on the candidates are ours, not
				 * the user's, so a successful bind may update
				 * tcps_next_port_to_try.
				 */
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		/* A port of 0 means there is no further candidate; give up. */
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
   2963 
   2964 /*
   2965  * tcp_clean_death / tcp_close_detached must not be called more than once
   2966  * on a tcp. Thus every function that potentially calls tcp_clean_death
   2967  * must check for the tcp state before calling tcp_clean_death.
   2968  * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper,
   2969  * tcp_timer_handler, all check for the tcp state.
   2970  */
   2971 /* ARGSUSED */
   2972 void
   2973 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
   2974     ip_recv_attr_t *dummy)
   2975 {
   2976 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   2977 
   2978 	freemsg(mp);
   2979 	if (tcp->tcp_state > TCPS_BOUND)
   2980 		(void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
   2981 		    ETIMEDOUT, 5);
   2982 }
   2983 
   2984 /*
   2985  * We are dying for some reason.  Try to do it gracefully.  (May be called
   2986  * as writer.)
   2987  *
   2988  * Return -1 if the structure was not cleaned up (if the cleanup had to be
   2989  * done by a service procedure).
   2990  * TBD - Should the return value distinguish between the tcp_t being
   2991  * freed and it being reinitialized?
   2992  */
   2993 static int
   2994 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
   2995 {
   2996 	mblk_t	*mp;
   2997 	queue_t	*q;
   2998 	conn_t	*connp = tcp->tcp_connp;
   2999 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3000 
   3001 	TCP_CLD_STAT(tag);
   3002 
   3003 #if TCP_TAG_CLEAN_DEATH
   3004 	tcp->tcp_cleandeathtag = tag;
   3005 #endif
   3006 
   3007 	if (tcp->tcp_fused)
   3008 		tcp_unfuse(tcp);
   3009 
   3010 	if (tcp->tcp_linger_tid != 0 &&
   3011 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3012 		tcp_stop_lingering(tcp);
   3013 	}
   3014 
   3015 	ASSERT(tcp != NULL);
   3016 	ASSERT((connp->conn_family == AF_INET &&
   3017 	    connp->conn_ipversion == IPV4_VERSION) ||
   3018 	    (connp->conn_family == AF_INET6 &&
   3019 	    (connp->conn_ipversion == IPV4_VERSION ||
   3020 	    connp->conn_ipversion == IPV6_VERSION)));
   3021 
   3022 	if (TCP_IS_DETACHED(tcp)) {
   3023 		if (tcp->tcp_hard_binding) {
   3024 			/*
   3025 			 * Its an eager that we are dealing with. We close the
   3026 			 * eager but in case a conn_ind has already gone to the
   3027 			 * listener, let tcp_accept_finish() send a discon_ind
   3028 			 * to the listener and drop the last reference. If the
   3029 			 * listener doesn't even know about the eager i.e. the
   3030 			 * conn_ind hasn't gone up, blow away the eager and drop
   3031 			 * the last reference as well. If the conn_ind has gone
   3032 			 * up, state should be BOUND. tcp_accept_finish
   3033 			 * will figure out that the connection has received a
   3034 			 * RST and will send a DISCON_IND to the application.
   3035 			 */
   3036 			tcp_closei_local(tcp);
   3037 			if (!tcp->tcp_tconnind_started) {
   3038 				CONN_DEC_REF(connp);
   3039 			} else {
   3040 				tcp->tcp_state = TCPS_BOUND;
   3041 			}
   3042 		} else {
   3043 			tcp_close_detached(tcp);
   3044 		}
   3045 		return (0);
   3046 	}
   3047 
   3048 	TCP_STAT(tcps, tcp_clean_death_nondetached);
   3049 
   3050 	q = connp->conn_rq;
   3051 
   3052 	/* Trash all inbound data */
   3053 	if (!IPCL_IS_NONSTR(connp)) {
   3054 		ASSERT(q != NULL);
   3055 		flushq(q, FLUSHALL);
   3056 	}
   3057 
   3058 	/*
   3059 	 * If we are at least part way open and there is error
   3060 	 * (err==0 implies no error)
   3061 	 * notify our client by a T_DISCON_IND.
   3062 	 */
   3063 	if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
   3064 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
   3065 		    !TCP_IS_SOCKET(tcp)) {
   3066 			/*
   3067 			 * Send M_FLUSH according to TPI. Because sockets will
   3068 			 * (and must) ignore FLUSHR we do that only for TPI
   3069 			 * endpoints and sockets in STREAMS mode.
   3070 			 */
   3071 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
   3072 		}
   3073 		if (connp->conn_debug) {
   3074 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   3075 			    "tcp_clean_death: discon err %d", err);
   3076 		}
   3077 		if (IPCL_IS_NONSTR(connp)) {
   3078 			/* Direct socket, use upcall */
   3079 			(*connp->conn_upcalls->su_disconnected)(
   3080 			    connp->conn_upper_handle, tcp->tcp_connid, err);
   3081 		} else {
   3082 			mp = mi_tpi_discon_ind(NULL, err, 0);
   3083 			if (mp != NULL) {
   3084 				putnext(q, mp);
   3085 			} else {
   3086 				if (connp->conn_debug) {
   3087 					(void) strlog(TCP_MOD_ID, 0, 1,
   3088 					    SL_ERROR|SL_TRACE,
   3089 					    "tcp_clean_death, sending M_ERROR");
   3090 				}
   3091 				(void) putnextctl1(q, M_ERROR, EPROTO);
   3092 			}
   3093 		}
   3094 		if (tcp->tcp_state <= TCPS_SYN_RCVD) {
   3095 			/* SYN_SENT or SYN_RCVD */
   3096 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3097 		} else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
   3098 			/* ESTABLISHED or CLOSE_WAIT */
   3099 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3100 		}
   3101 	}
   3102 
   3103 	tcp_reinit(tcp);
   3104 	if (IPCL_IS_NONSTR(connp))
   3105 		(void) tcp_do_unbind(connp);
   3106 
   3107 	return (-1);
   3108 }
   3109 
   3110 /*
   3111  * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
   3112  * to expire, stop the wait and finish the close.
   3113  */
   3114 static void
   3115 tcp_stop_lingering(tcp_t *tcp)
   3116 {
   3117 	clock_t	delta = 0;
   3118 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3119 	conn_t		*connp = tcp->tcp_connp;
   3120 
   3121 	tcp->tcp_linger_tid = 0;
   3122 	if (tcp->tcp_state > TCPS_LISTEN) {
   3123 		tcp_acceptor_hash_remove(tcp);
   3124 		mutex_enter(&tcp->tcp_non_sq_lock);
   3125 		if (tcp->tcp_flow_stopped) {
   3126 			tcp_clrqfull(tcp);
   3127 		}
   3128 		mutex_exit(&tcp->tcp_non_sq_lock);
   3129 
   3130 		if (tcp->tcp_timer_tid != 0) {
   3131 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3132 			tcp->tcp_timer_tid = 0;
   3133 		}
   3134 		/*
   3135 		 * Need to cancel those timers which will not be used when
   3136 		 * TCP is detached.  This has to be done before the conn_wq
   3137 		 * is cleared.
   3138 		 */
   3139 		tcp_timers_stop(tcp);
   3140 
   3141 		tcp->tcp_detached = B_TRUE;
   3142 		connp->conn_rq = NULL;
   3143 		connp->conn_wq = NULL;
   3144 
   3145 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3146 			tcp_time_wait_append(tcp);
   3147 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3148 			goto finish;
   3149 		}
   3150 
   3151 		/*
   3152 		 * If delta is zero the timer event wasn't executed and was
   3153 		 * successfully canceled. In this case we need to restart it
   3154 		 * with the minimal delta possible.
   3155 		 */
   3156 		if (delta >= 0) {
   3157 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3158 			    delta ? delta : 1);
   3159 		}
   3160 	} else {
   3161 		tcp_closei_local(tcp);
   3162 		CONN_DEC_REF(connp);
   3163 	}
   3164 finish:
   3165 	/* Signal closing thread that it can complete close */
   3166 	mutex_enter(&tcp->tcp_closelock);
   3167 	tcp->tcp_detached = B_TRUE;
   3168 	connp->conn_rq = NULL;
   3169 	connp->conn_wq = NULL;
   3170 
   3171 	tcp->tcp_closed = 1;
   3172 	cv_signal(&tcp->tcp_closecv);
   3173 	mutex_exit(&tcp->tcp_closelock);
   3174 }
   3175 
   3176 /*
   3177  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
   3178  * expires.
   3179  */
   3180 static void
   3181 tcp_close_linger_timeout(void *arg)
   3182 {
   3183 	conn_t	*connp = (conn_t *)arg;
   3184 	tcp_t 	*tcp = connp->conn_tcp;
   3185 
   3186 	tcp->tcp_client_errno = ETIMEDOUT;
   3187 	tcp_stop_lingering(tcp);
   3188 }
   3189 
/*
 * Common close path for a TCP endpoint.
 *
 * Marks the conn as closing, queues the embedded tcp_closemp on the conn's
 * squeue with tcp_close_output() as the handler, and then blocks until
 * tcp_closed has been set.  The first wait is interruptible; if a signal
 * arrives while SO_LINGER (with a non-zero linger time) is in effect, a
 * tcp_linger_interrupted() request is queued to cut the linger short.  After
 * an interruption the wait continues non-interruptibly.  Finally, if
 * tcp_wait_for_eagers is set, waits for the conn reference count to fall to
 * one, runs any required ioctl cleanup and resets conn_cpid.
 *
 * connp - conn being closed; asserted to carry at least two references.
 * flags - close flags; stored (truncated to uint8_t) in tcp_closeflags.
 */
static void
tcp_close_common(conn_t *connp, int flags)
{
	tcp_t		*tcp = connp->conn_tcp;
	mblk_t 		*mp = &tcp->tcp_closemp;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	mblk_t		*bp;

	ASSERT(connp->conn_ref >= 2);

	/*
	 * Mark the conn as closing. ipsq_pending_mp_add will not
	 * add any mp to the pending mp list, after this conn has
	 * started closing.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CLOSING;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	/* Extra reference, presumably for the squeue entry made below. */
	CONN_INC_REF_LOCKED(connp);
	mutex_exit(&connp->conn_lock);
	tcp->tcp_closeflags = (uint8_t)flags;
	ASSERT(connp->conn_ref >= 3);

	/*
	 * tcp_closemp_used is used below without any protection of a lock
	 * as we don't expect any one else to use it concurrently at this
	 * point otherwise it would be a major defect.
	 */

	/* A non-NULL b_prev means tcp_closemp is already in use: panic. */
	if (mp->b_prev == NULL)
		tcp->tcp_closemp_used = B_TRUE;
	else
		cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);

	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);

	/* Run tcp_close_output() for this conn on its squeue. */
	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
	    NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);

	mutex_enter(&tcp->tcp_closelock);
	while (!tcp->tcp_closed) {
		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
			/*
			 * The cv_wait_sig() was interrupted. We now do the
			 * following:
			 *
			 * 1) If the endpoint was lingering, we allow this
			 * to be interrupted by cancelling the linger timeout
			 * and closing normally.
			 *
			 * 2) Revert to calling cv_wait()
			 *
			 * We revert to using cv_wait() to avoid an
			 * infinite loop which can occur if the calling
			 * thread is higher priority than the squeue worker
			 * thread and is bound to the same cpu.
			 */
			if (connp->conn_linger && connp->conn_lingertime > 0) {
				/*
				 * tcp_closelock is dropped while the request
				 * is allocated and queued, and re-taken
				 * before leaving the loop.
				 */
				mutex_exit(&tcp->tcp_closelock);
				/* Entering squeue, bump ref count. */
				CONN_INC_REF(connp);
				bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
				SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
				    tcp_linger_interrupted, connp, NULL,
				    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
				mutex_enter(&tcp->tcp_closelock);
			}
			break;
		}
	}
	/* Non-interruptible wait; tcp_closelock is held on entry here. */
	while (!tcp->tcp_closed)
		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
	mutex_exit(&tcp->tcp_closelock);

	/*
	 * In the case of listener streams that have eagers in the q or q0
	 * we wait for the eagers to drop their reference to us. conn_rq and
	 * conn_wq of the eagers point to our queues. By waiting for the
	 * refcnt to drop to 1, we are sure that the eagers have cleaned
	 * up their queue pointers and also dropped their references to us.
	 */
	if (tcp->tcp_wait_for_eagers) {
		mutex_enter(&connp->conn_lock);
		while (connp->conn_ref != 1) {
			cv_wait(&connp->conn_cv, &connp->conn_lock);
		}
		mutex_exit(&connp->conn_lock);
	}
	/*
	 * ioctl cleanup. The mp is queued in the ipx_pending_mp.
	 */
	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	connp->conn_cpid = NOPID;
}
   3288 
   3289 static int
   3290 tcp_tpi_close(queue_t *q, int flags)
   3291 {
   3292 	conn_t		*connp;
   3293 
   3294 	ASSERT(WR(q)->q_next == NULL);
   3295 
   3296 	if (flags & SO_FALLBACK) {
   3297 		/*
   3298 		 * stream is being closed while in fallback
   3299 		 * simply free the resources that were allocated
   3300 		 */
   3301 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
   3302 		qprocsoff(q);
   3303 		goto done;
   3304 	}
   3305 
   3306 	connp = Q_TO_CONN(q);
   3307 	/*
   3308 	 * We are being closed as /dev/tcp or /dev/tcp6.
   3309 	 */
   3310 	tcp_close_common(connp, flags);
   3311 
   3312 	qprocsoff(q);
   3313 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   3314 
   3315 	/*
   3316 	 * Drop IP's reference on the conn. This is the last reference
   3317 	 * on the connp if the state was less than established. If the
   3318 	 * connection has gone into timewait state, then we will have
   3319 	 * one ref for the TCP and one more ref (total of two) for the
   3320 	 * classifier connected hash list (a timewait connections stays
   3321 	 * in connected hash till closed).
   3322 	 *
   3323 	 * We can't assert the references because there might be other
   3324 	 * transient reference places because of some walkers or queued
   3325 	 * packets in squeue for the timewait state.
   3326 	 */
   3327 	CONN_DEC_REF(connp);
   3328 done:
   3329 	q->q_ptr = WR(q)->q_ptr = NULL;
   3330 	return (0);
   3331 }
   3332 
   3333 static int
   3334 tcp_tpi_close_accept(queue_t *q)
   3335 {
   3336 	vmem_t	*minor_arena;
   3337 	dev_t	conn_dev;
   3338 
   3339 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
   3340 
   3341 	/*
   3342 	 * We had opened an acceptor STREAM for sockfs which is
   3343 	 * now being closed due to some error.
   3344 	 */
   3345 	qprocsoff(q);
   3346 
   3347 	minor_arena = (vmem_t *)WR(q)->q_ptr;
   3348 	conn_dev = (dev_t)RD(q)->q_ptr;
   3349 	ASSERT(minor_arena != NULL);
   3350 	ASSERT(conn_dev != 0);
   3351 	inet_minor_free(minor_arena, conn_dev);
   3352 	q->q_ptr = WR(q)->q_ptr = NULL;
   3353 	return (0);
   3354 }
   3355 
   3356 /*
   3357  * Called by tcp_close() routine via squeue when lingering is
   3358  * interrupted by a signal.
   3359  */
   3360 
   3361 /* ARGSUSED */
   3362 static void
   3363 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   3364 {
   3365 	conn_t	*connp = (conn_t *)arg;
   3366 	tcp_t	*tcp = connp->conn_tcp;
   3367 
   3368 	freeb(mp);
   3369 	if (tcp->tcp_linger_tid != 0 &&
   3370 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3371 		tcp_stop_lingering(tcp);
   3372 		tcp->tcp_client_errno = EINTR;
   3373 	}
   3374 }
   3375 
/*
 * Squeue handler for close, queued by tcp_close_common() when our client
 * blows off her descriptor. We take this to mean: "close the stream state
 * NOW, close the tcp connection politely".
 *
 * When SO_LINGER is set with a non-zero linger time and the close is not
 * non-blocking (neither FNDELAY nor FNONBLOCK in tcp_closeflags), a linger
 * timer is armed and this routine returns without setting tcp_closed; the
 * close is then completed by tcp_stop_lingering(). In every other case
 * tcp_closed is set and tcp_closecv is signalled before returning.
 *
 * Outcomes, by state and condition:
 *  - CLOSED/IDLE/BOUND/LISTEN: local close, no segment sent.
 *  - SYN_SENT, zero linger time, or unread data queued: a RST is sent and
 *    the endpoint is closed locally.
 *  - otherwise: a FIN is sent via tcp_xmit_end() and the tcp_t is detached
 *    from the queues.
 *
 * NOTE: tcp_close potentially returns error when lingering.
 * However, the stream head currently does not pass these errors
 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
 * errors to the application (from tsleep()) and not errors
 * like ECONNRESET caused by receiving a reset packet.
 *
 * arg is the conn_t being closed; mp, arg2 and dummy are not used here.
 */

/* ARGSUSED */
static void
tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	char	*msg;
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	clock_t	delta = 0;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));

	mutex_enter(&tcp->tcp_eager_lock);
	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
		/* Cleanup for listener */
		tcp_eager_cleanup(tcp, 0);
		/* tcp_close_common() will wait for the eagers to let go. */
		tcp->tcp_wait_for_eagers = 1;
	}
	mutex_exit(&tcp->tcp_eager_lock);

	tcp->tcp_lso = B_FALSE;

	/*
	 * msg stays NULL for a quiet local close; when set it is the text
	 * passed to tcp_xmit_ctl() along with the RST below.
	 */
	msg = NULL;
	switch (tcp->tcp_state) {
	case TCPS_CLOSED:
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		break;
	case TCPS_SYN_SENT:
		msg = "tcp_close, during connect";
		break;
	case TCPS_SYN_RCVD:
		/*
		 * Close during the connect 3-way handshake
		 * but here there may or may not be pending data
		 * already on queue. Process almost same as in
		 * the ESTABLISHED state.
		 */
		/* FALLTHRU */
	default:
		if (tcp->tcp_fused)
			tcp_unfuse(tcp);

		/*
		 * If SO_LINGER has set a zero linger time, abort the
		 * connection with a reset.
		 */
		if (connp->conn_linger && connp->conn_lingertime == 0) {
			msg = "tcp_close, zero lingertime";
			break;
		}

		/*
		 * Abort connection if there is unread data queued.
		 */
		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
			msg = "tcp_close, unread data";
			break;
		}
		/*
		 * We have done a qwait() above which could have possibly
		 * drained more messages in turn causing transition to a
		 * different state. Check whether we have to do the rest
		 * of the processing or not.
		 *
		 * NOTE(review): no qwait() call is visible in this function;
		 * the sentence above looks stale. The state re-check itself
		 * is kept as is.
		 */
		if (tcp->tcp_state <= TCPS_LISTEN)
			break;

		/*
		 * Transmit the FIN before detaching the tcp_t.
		 * After tcp_detach returns this queue/perimeter
		 * no longer owns the tcp_t thus others can modify it.
		 */
		(void) tcp_xmit_end(tcp);

		/*
		 * If lingering on close then wait until the fin is acked,
		 * the SO_LINGER time passes, or a reset is sent/received.
		 */
		if (connp->conn_linger && connp->conn_lingertime > 0 &&
		    !(tcp->tcp_fin_acked) &&
		    tcp->tcp_state >= TCPS_ESTABLISHED) {
			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
				/* Non-blocking close: do not linger. */
				tcp->tcp_client_errno = EWOULDBLOCK;
			} else if (tcp->tcp_client_errno == 0) {

				ASSERT(tcp->tcp_linger_tid == 0);

				/* conn_lingertime is scaled by hz to ticks. */
				tcp->tcp_linger_tid = TCP_TIMER(tcp,
				    tcp_close_linger_timeout,
				    connp->conn_lingertime * hz);

				/* tcp_close_linger_timeout will finish close */
				if (tcp->tcp_linger_tid == 0)
					tcp->tcp_client_errno = ENOSR;
				else
					return;
			}

			/*
			 * Check if we need to detach or just close
			 * the instance.
			 */
			if (tcp->tcp_state <= TCPS_LISTEN)
				break;
		}

		/*
		 * Make sure that no other thread will access the conn_rq of
		 * this instance (through lookups etc.) as conn_rq will go
		 * away shortly.
		 */
		tcp_acceptor_hash_remove(tcp);

		mutex_enter(&tcp->tcp_non_sq_lock);
		if (tcp->tcp_flow_stopped) {
			tcp_clrqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);

		/* Keep the cancel result so tcp_timer can be re-armed below. */
		if (tcp->tcp_timer_tid != 0) {
			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
			tcp->tcp_timer_tid = 0;
		}
		/*
		 * Need to cancel those timers which will not be used when
		 * TCP is detached.  This has to be done before the conn_wq
		 * is set to NULL.
		 */
		tcp_timers_stop(tcp);

		tcp->tcp_detached = B_TRUE;
		if (tcp->tcp_state == TCPS_TIME_WAIT) {
			tcp_time_wait_append(tcp);
			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
			ASSERT(connp->conn_ref >= 3);
			goto finish;
		}

		/*
		 * If delta is zero the timer event wasn't executed and was
		 * successfully canceled. In this case we need to restart it
		 * with the minimal delta possible.
		 */
		if (delta >= 0)
			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
			    delta ? delta : 1);

		ASSERT(connp->conn_ref >= 3);
		goto finish;
	}

	/* Detach did not complete. Still need to remove q from stream. */
	if (msg) {
		/* Aborting: account for the reset in the MIB, then send it. */
		if (tcp->tcp_state == TCPS_ESTABLISHED ||
		    tcp->tcp_state == TCPS_CLOSE_WAIT)
			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
		if (tcp->tcp_state == TCPS_SYN_SENT ||
		    tcp->tcp_state == TCPS_SYN_RCVD)
			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
	}

	tcp_closei_local(tcp);
	CONN_DEC_REF(connp);
	ASSERT(connp->conn_ref >= 2);

finish:
	mutex_enter(&tcp->tcp_closelock);
	/*
	 * Don't change the queues in the case of a listener that has
	 * eagers in its q or q0. It could surprise the eagers.
	 * Instead wait for the eagers outside the squeue.
	 */
	if (!tcp->tcp_wait_for_eagers) {
		tcp->tcp_detached = B_TRUE;
		connp->conn_rq = NULL;
		connp->conn_wq = NULL;
	}

	/* Signal tcp_close() to finish closing. */
	tcp->tcp_closed = 1;
	cv_signal(&tcp->tcp_closecv);
	mutex_exit(&tcp->tcp_closelock);
}
   3577 
   3578 /*
   3579  * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
   3580  * Some stream heads get upset if they see these later on as anything but NULL.
   3581  */
   3582 static void
   3583 tcp_close_mpp(mblk_t **mpp)
   3584 {
   3585 	mblk_t	*mp;
   3586 
   3587 	if ((mp = *mpp) != NULL) {
   3588 		do {
   3589 			mp->b_next = NULL;
   3590 			mp->b_prev = NULL;
   3591 		} while ((mp = mp->b_cont) != NULL);
   3592 
   3593 		mp = *mpp;
   3594 		*mpp = NULL;
   3595 		freemsg(mp);
   3596 	}
   3597 }
   3598 
   3599 /* Do detached close. */
   3600 static void
   3601 tcp_close_detached(tcp_t *tcp)
   3602 {
   3603 	if (tcp->tcp_fused)
   3604 		tcp_unfuse(tcp);
   3605 
   3606 	/*
   3607 	 * Clustering code serializes TCP disconnect callbacks and
   3608 	 * cluster tcp list walks by blocking a TCP disconnect callback
   3609 	 * if a cluster tcp list walk is in progress. This ensures
   3610 	 * accurate accounting of TCPs in the cluster code even though
   3611 	 * the TCP list walk itself is not atomic.
   3612 	 */
   3613 	tcp_closei_local(tcp);
   3614 	CONN_DEC_REF(tcp->tcp_connp);
   3615 }
   3616 
   3617 /*
   3618  * Stop all TCP timers, and free the timer mblks if requested.
   3619  */
   3620 void
   3621 tcp_timers_stop(tcp_t *tcp)
   3622 {
   3623 	if (tcp->tcp_timer_tid != 0) {
   3624 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3625 		tcp->tcp_timer_tid = 0;
   3626 	}
   3627 	if (tcp->tcp_ka_tid != 0) {
   3628 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
   3629 		tcp->tcp_ka_tid = 0;
   3630 	}
   3631 	if (tcp->tcp_ack_tid != 0) {
   3632 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
   3633 		tcp->tcp_ack_tid = 0;
   3634 	}
   3635 	if (tcp->tcp_push_tid != 0) {
   3636 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
   3637 		tcp->tcp_push_tid = 0;
   3638 	}
   3639 }
   3640 
/*
 * The tcp_t is going away. Remove it from all lists and set it
 * to TCPS_CLOSED. The freeing up of memory is deferred until
 * tcp_inactive. This is needed since a thread in tcp_rput might have
 * done a CONN_INC_REF on this structure before it was removed from the
 * hashes.
 *
 * In order, this routine: removes a non-socket endpoint from the acceptor
 * hash, folds the per-connection segment counters into the MIB, unlinks an
 * eager from its listener (when the conn_ind has not gone up yet), stops all
 * timers, frees a listener's IP address cache, releases flow control,
 * removes the tcp from the bind hash, the time-wait list and the classifier
 * hash, marks the conn CONN_CONDEMNED, and releases any kssl and IPsec state.
 *
 * This routine does not drop the caller's conn reference; the callers
 * visible in this file follow it with CONN_DEC_REF() where appropriate.
 */
static void
tcp_closei_local(tcp_t *tcp)
{
	conn_t		*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	if (!TCP_IS_SOCKET(tcp))
		tcp_acceptor_hash_remove(tcp);

	/* Flush the per-connection segment counts into the stack-wide MIB. */
	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
	tcp->tcp_ibsegs = 0;
	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
	tcp->tcp_obsegs = 0;

	/*
	 * If we are an eager connection hanging off a listener that
	 * hasn't formally accepted the connection yet, get off his
	 * list and blow off any data that we have accumulated.
	 */
	if (tcp->tcp_listener != NULL) {
		tcp_t	*listener = tcp->tcp_listener;
		mutex_enter(&listener->tcp_eager_lock);
		/*
		 * tcp_tconnind_started == B_TRUE means that the
		 * conn_ind has already gone to listener. At
		 * this point, eager will be closed but we
		 * leave it in listeners eager list so that
		 * if listener decides to close without doing
		 * accept, we can clean this up. In tcp_tli_accept
		 * we take care of the case of accept on closed
		 * eager.
		 */
		if (!tcp->tcp_tconnind_started) {
			tcp_eager_unlink(tcp);
			mutex_exit(&listener->tcp_eager_lock);
			/*
			 * We don't want to have any pointers to the
			 * listener queue, after we have released our
			 * reference on the listener
			 */
			ASSERT(tcp->tcp_detached);
			connp->conn_rq = NULL;
			connp->conn_wq = NULL;
			CONN_DEC_REF(listener->tcp_connp);
		} else {
			mutex_exit(&listener->tcp_eager_lock);
		}
	}

	/* Stop all the timers */
	tcp_timers_stop(tcp);

	/* Only a listener's IP address cache is released here. */
	if (tcp->tcp_state == TCPS_LISTEN) {
		if (tcp->tcp_ip_addr_cache) {
			kmem_free((void *)tcp->tcp_ip_addr_cache,
			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
			tcp->tcp_ip_addr_cache = NULL;
		}
	}
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped)
		tcp_clrqfull(tcp);
	mutex_exit(&tcp->tcp_non_sq_lock);

	tcp_bind_hash_remove(tcp);
	/*
	 * If the tcp_time_wait_collector (which runs outside the squeue)
	 * is trying to remove this tcp from the time wait list, we will
	 * block in tcp_time_wait_remove while trying to acquire the
	 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
	 * requires the ipcl_hash_remove to be ordered after the
	 * tcp_time_wait_remove for the refcnt checks to work correctly.
	 */
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		(void) tcp_time_wait_remove(tcp, NULL);
	CL_INET_DISCONNECT(connp);
	ipcl_hash_remove(connp);
	ixa_cleanup(connp->conn_ixa);

	/*
	 * Mark the conn as CONDEMNED
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	mutex_exit(&connp->conn_lock);

	/*
	 * Need to cleanup any pending ioctls
	 *
	 * NOTE(review): no ioctl cleanup is performed at this point in the
	 * function. The assertions below only check that the tcp is off the
	 * time-wait list before the state is set to TCPS_CLOSED.
	 */
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);
	tcp->tcp_state = TCPS_CLOSED;

	/* Release any SSL context */
	if (tcp->tcp_kssl_ent != NULL) {
		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
		tcp->tcp_kssl_ent = NULL;
	}
	if (tcp->tcp_kssl_ctx != NULL) {
		kssl_release_ctx(tcp->tcp_kssl_ctx);
		tcp->tcp_kssl_ctx = NULL;
	}
	tcp->tcp_kssl_pending = B_FALSE;

	tcp_ipsec_cleanup(tcp);
}
   3753 
   3754 /*
   3755  * tcp is dying (called from ipcl_conn_destroy and error cases).
   3756  * Free the tcp_t in either case.
   3757  */
   3758 void
   3759 tcp_free(tcp_t *tcp)
   3760 {
   3761 	mblk_t		*mp;
   3762 	conn_t		*connp = tcp->tcp_connp;
   3763 
   3764 	ASSERT(tcp != NULL);
   3765 	ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
   3766 
   3767 	connp->conn_rq = NULL;
   3768 	connp->conn_wq = NULL;
   3769 
   3770 	tcp_close_mpp(&tcp->tcp_xmit_head);
   3771 	tcp_close_mpp(&tcp->tcp_reass_head);
   3772 	if (tcp->tcp_rcv_list != NULL) {
   3773 		/* Free b_next chain */
   3774 		tcp_close_mpp(&tcp->tcp_rcv_list);
   3775 	}
   3776 	if ((mp = tcp->tcp_urp_mp) != NULL) {
   3777 		freemsg(mp);
   3778 	}
   3779 	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
   3780 		freemsg(mp);
   3781 	}
   3782 
   3783 	if (tcp->tcp_fused_sigurg_mp != NULL) {
   3784 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3785 		freeb(tcp->tcp_fused_sigurg_mp);
   3786 		tcp->tcp_fused_sigurg_mp = NULL;
   3787 	}
   3788 
   3789 	if (tcp->tcp_ordrel_mp != NULL) {
   3790 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3791 		freeb(tcp->tcp_ordrel_mp);
   3792 		tcp->tcp_ordrel_mp = NULL;
   3793 	}
   3794 
   3795 	if (tcp->tcp_sack_info != NULL) {
   3796 		if (tcp->tcp_notsack_list != NULL) {
   3797 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   3798 			    tcp);
   3799 		}
   3800 		bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
   3801 	}
   3802 
   3803 	if (tcp->tcp_hopopts != NULL) {
   3804 		mi_free(tcp->tcp_hopopts);
   3805 		tcp->tcp_hopopts = NULL;
   3806 		tcp->tcp_hopoptslen = 0;
   3807 	}
   3808 	ASSERT(tcp->tcp_hopoptslen == 0);
   3809 	if (tcp->tcp_dstopts != NULL) {
   3810 		mi_free(tcp->tcp_dstopts);
   3811 		tcp->tcp_dstopts = NULL;
   3812 		tcp->tcp_dstoptslen = 0;
   3813 	}
   3814 	ASSERT(tcp->tcp_dstoptslen == 0);
   3815 	if (tcp->tcp_rthdrdstopts != NULL) {
   3816 		mi_free(tcp->tcp_rthdrdstopts);
   3817 		tcp->tcp_rthdrdstopts = NULL;
   3818 		tcp->tcp_rthdrdstoptslen = 0;
   3819 	}
   3820 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   3821 	if (tcp->tcp_rthdr != NULL) {
   3822 		mi_free(tcp->tcp_rthdr);
   3823 		tcp->tcp_rthdr = NULL;
   3824 		tcp->tcp_rthdrlen = 0;
   3825 	}
   3826 	ASSERT(tcp->tcp_rthdrlen == 0);
   3827 
   3828 	/*
   3829 	 * Following is really a blowing away a union.
   3830 	 * It happens to have exactly two members of identical size
   3831 	 * the following code is enough.
   3832 	 */
   3833 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
   3834 }
   3835 
   3836 
   3837 /*
   3838  * Put a connection confirmation message upstream built from the
   3839  * address/flowid information with the conn and iph. Report our success or
   3840  * failure.
   3841  */
   3842 static boolean_t
   3843 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
   3844     mblk_t **defermp, ip_recv_attr_t *ira)
   3845 {
   3846 	sin_t	sin;
   3847 	sin6_t	sin6;
   3848 	mblk_t	*mp;
   3849 	char	*optp = NULL;
   3850 	int	optlen = 0;
   3851 	conn_t	*connp = tcp->tcp_connp;
   3852 
   3853 	if (defermp != NULL)
   3854 		*defermp = NULL;
   3855 
   3856 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
   3857 		/*
   3858 		 * Return in T_CONN_CON results of option negotiation through
   3859 		 * the T_CONN_REQ. Note: If there is an real end-to-end option
   3860 		 * negotiation, then what is received from remote end needs
   3861 		 * to be taken into account but there is no such thing (yet?)
   3862 		 * in our TCP/IP.
   3863 		 * Note: We do not use mi_offset_param() here as
   3864 		 * tcp_opts_conn_req contents do not directly come from
   3865 		 * an application and are either generated in kernel or
   3866 		 * from user input that was already verified.
   3867 		 */
   3868 		mp = tcp->tcp_conn.tcp_opts_conn_req;
   3869 		optp = (char *)(mp->b_rptr +
   3870 		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
   3871 		optlen = (int)
   3872 		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
   3873 	}
   3874 
   3875 	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
   3876 
   3877 		/* packet is IPv4 */
   3878 		if (connp->conn_family == AF_INET) {
   3879 			sin = sin_null;
   3880 			sin.sin_addr.s_addr = connp->conn_faddr_v4;
   3881 			sin.sin_port = connp->conn_fport;
   3882 			sin.sin_family = AF_INET;
   3883 			mp = mi_tpi_conn_con(NULL, (char *)&sin,
   3884 			    (int)sizeof (sin_t), optp, optlen);
   3885 		} else {
   3886 			sin6 = sin6_null;
   3887 			sin6.sin6_addr = connp->conn_faddr_v6;
   3888 			sin6.sin6_port = connp->conn_fport;
   3889 			sin6.sin6_family = AF_INET6;
   3890 			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
   3891 			    (int)sizeof (sin6_t), optp, optlen);
   3892 
   3893 		}
   3894 	} else {
   3895 		ip6_t	*ip6h = (ip6_t *)iphdr;
   3896 
   3897 		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
   3898 		ASSERT(connp->conn_family == AF_INET6);
   3899 		sin6 = sin6_null;
   3900 		sin6.sin6_addr = connp->conn_faddr_v6;
   3901 		sin6.sin6_port = connp->conn_fport;
   3902 		sin6.sin6_family = AF_INET6;
   3903 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
   3904 		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
   3905 		    (int)sizeof (sin6_t), optp, optlen);
   3906 	}
   3907 
   3908 	if (!mp)
   3909 		return (B_FALSE);
   3910 
   3911 	mblk_copycred(mp, idmp);
   3912 
   3913 	if (defermp == NULL) {
   3914 		conn_t *connp = tcp->tcp_connp;
   3915 		if (IPCL_IS_NONSTR(connp)) {
   3916 			(*connp->conn_upcalls->su_connected)
   3917 			    (connp->conn_upper_handle, tcp->tcp_connid,
   3918 			    ira->ira_cred, ira->ira_cpid);
   3919 			freemsg(mp);
   3920 		} else {
   3921 			if (ira->ira_cred != NULL) {
   3922 				/* So that getpeerucred works for TPI sockfs */
   3923 				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
   3924 			}
   3925 			putnext(connp->conn_rq, mp);
   3926 		}
   3927 	} else {
   3928 		*defermp = mp;
   3929 	}
   3930 
   3931 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
   3932 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
   3933 	return (B_TRUE);
   3934 }
   3935 
   3936 /*
   3937  * Defense for the SYN attack -
   3938  * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
   3939  *    one from the list of droppable eagers. This list is a subset of q0.
   3940  *    see comments before the definition of MAKE_DROPPABLE().
   3941  * 2. Don't drop a SYN request before its first timeout. This gives every
   3942  *    request at least til the first timeout to complete its 3-way handshake.
   3943  * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
   3944  *    requests currently on the queue that has timed out. This will be used
   3945  *    as an indicator of whether an attack is under way, so that appropriate
   3946  *    actions can be taken. (It's incremented in tcp_timer() and decremented
   3947  *    either when eager goes into ESTABLISHED, or gets freed up.)
   3948  * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
   3949  *    # of timeout drops back to <= q0len/32 => SYN alert off
   3950  */
   3951 static boolean_t
   3952 tcp_drop_q0(tcp_t *tcp)
   3953 {
   3954 	tcp_t	*eager;
   3955 	mblk_t	*mp;
   3956 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3957 
   3958 	ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
   3959 	ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
   3960 
   3961 	/* Pick oldest eager from the list of droppable eagers */
   3962 	eager = tcp->tcp_eager_prev_drop_q0;
   3963 
   3964 	/* If list is empty. return B_FALSE */
   3965 	if (eager == tcp) {
   3966 		return (B_FALSE);
   3967 	}
   3968 
   3969 	/* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
   3970 	if ((mp = allocb(0, BPRI_HI)) == NULL)
   3971 		return (B_FALSE);
   3972 
   3973 	/*
   3974 	 * Take this eager out from the list of droppable eagers since we are
   3975 	 * going to drop it.
   3976 	 */
   3977 	MAKE_UNDROPPABLE(eager);
   3978 
   3979 	if (tcp->tcp_connp->conn_debug) {
   3980 		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   3981 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
   3982 		    " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
   3983 		    tcp->tcp_conn_req_cnt_q0,
   3984 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
   3985 	}
   3986 
   3987 	BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
   3988 
   3989 	/* Put a reference on the conn as we are enqueueing it in the sqeue */
   3990 	CONN_INC_REF(eager->tcp_connp);
   3991 
   3992 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   3993 	    tcp_clean_death_wrapper, eager->tcp_connp, NULL,
   3994 	    SQ_FILL, SQTAG_TCP_DROP_Q0);
   3995 
   3996 	return (B_TRUE);
   3997 }
   3998 
   3999 /*
   4000  * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
   4001  */
   4002 static mblk_t *
   4003 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4004     ip_recv_attr_t *ira)
   4005 {
   4006 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4007 	tcp_t		*tcp = connp->conn_tcp;
   4008 	mblk_t		*tpi_mp;
   4009 	ipha_t		*ipha;
   4010 	ip6_t		*ip6h;
   4011 	sin6_t 		sin6;
   4012 	uint_t		ifindex = ira->ira_ruifindex;
   4013 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4014 
   4015 	if (ira->ira_flags & IRAF_IS_IPV4) {
   4016 		ipha = (ipha_t *)mp->b_rptr;
   4017 
   4018 		connp->conn_ipversion = IPV4_VERSION;
   4019 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4020 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4021 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4022 
   4023 		sin6 = sin6_null;
   4024 		sin6.sin6_addr = connp->conn_faddr_v6;
   4025 		sin6.sin6_port = connp->conn_fport;
   4026 		sin6.sin6_family = AF_INET6;
   4027 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4028 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4029 
   4030 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4031 			sin6_t	sin6d;
   4032 
   4033 			sin6d = sin6_null;
   4034 			sin6d.sin6_addr = connp->conn_laddr_v6;
   4035 			sin6d.sin6_port = connp->conn_lport;
   4036 			sin6d.sin6_family = AF_INET;
   4037 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4038 			    (char *)&sin6d, sizeof (sin6_t),
   4039 			    (char *)&tcp,
   4040 			    (t_scalar_t)sizeof (intptr_t),
   4041 			    (char *)&sin6d, sizeof (sin6_t),
   4042 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4043 		} else {
   4044 			tpi_mp = mi_tpi_conn_ind(NULL,
   4045 			    (char *)&sin6, sizeof (sin6_t),
   4046 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4047 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4048 		}
   4049 	} else {
   4050 		ip6h = (ip6_t *)mp->b_rptr;
   4051 
   4052 		connp->conn_ipversion = IPV6_VERSION;
   4053 		connp->conn_laddr_v6 = ip6h->ip6_dst;
   4054 		connp->conn_faddr_v6 = ip6h->ip6_src;
   4055 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4056 
   4057 		sin6 = sin6_null;
   4058 		sin6.sin6_addr = connp->conn_faddr_v6;
   4059 		sin6.sin6_port = connp->conn_fport;
   4060 		sin6.sin6_family = AF_INET6;
   4061 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
   4062 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4063 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4064 
   4065 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
   4066 			/* Pass up the scope_id of remote addr */
   4067 			sin6.sin6_scope_id = ifindex;
   4068 		} else {
   4069 			sin6.sin6_scope_id = 0;
   4070 		}
   4071 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4072 			sin6_t	sin6d;
   4073 
   4074 			sin6d = sin6_null;
   4075 			sin6.sin6_addr = connp->conn_laddr_v6;
   4076 			sin6d.sin6_port = connp->conn_lport;
   4077 			sin6d.sin6_family = AF_INET6;
   4078 			if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
   4079 				sin6d.sin6_scope_id = ifindex;
   4080 
   4081 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4082 			    (char *)&sin6d, sizeof (sin6_t),
   4083 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4084 			    (char *)&sin6d, sizeof (sin6_t),
   4085 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4086 		} else {
   4087 			tpi_mp = mi_tpi_conn_ind(NULL,
   4088 			    (char *)&sin6, sizeof (sin6_t),
   4089 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4090 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4091 		}
   4092 	}
   4093 
   4094 	tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   4095 	return (tpi_mp);
   4096 }
   4097 
   4098 /* Handle a SYN on an AF_INET socket */
   4099 mblk_t *
   4100 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4101     ip_recv_attr_t *ira)
   4102 {
   4103 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4104 	tcp_t		*tcp = connp->conn_tcp;
   4105 	sin_t		sin;
   4106 	mblk_t		*tpi_mp = NULL;
   4107 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4108 	ipha_t		*ipha;
   4109 
   4110 	ASSERT(ira->ira_flags & IRAF_IS_IPV4);
   4111 	ipha = (ipha_t *)mp->b_rptr;
   4112 
   4113 	connp->conn_ipversion = IPV4_VERSION;
   4114 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4115 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4116 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4117 
   4118 	sin = sin_null;
   4119 	sin.sin_addr.s_addr = connp->conn_faddr_v4;
   4120 	sin.sin_port = connp->conn_fport;
   4121 	sin.sin_family = AF_INET;
   4122 	if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
   4123 		sin_t	sind;
   4124 
   4125 		sind = sin_null;
   4126 		sind.sin_addr.s_addr = connp->conn_laddr_v4;
   4127 		sind.sin_port = connp->conn_lport;
   4128 		sind.sin_family = AF_INET;
   4129 		tpi_mp = mi_tpi_extconn_ind(NULL,
   4130 		    (char *)&sind, sizeof (sin_t), (char *)&tcp,
   4131 		    (t_scalar_t)sizeof (intptr_t), (char *)&sind,
   4132 		    sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4133 	} else {
   4134 		tpi_mp = mi_tpi_conn_ind(NULL,
   4135 		    (char *)&sin, sizeof (sin_t),
   4136 		    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4137 		    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4138 	}
   4139 
   4140 	tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   4141 	return (tpi_mp);
   4142 }
   4143 
   4144 /*
   4145  * tcp_get_conn/tcp_free_conn
   4146  *
   4147  * tcp_get_conn is used to get a clean tcp connection structure.
   4148  * It tries to reuse the connections put on the freelist by the
   4149  * time_wait_collector failing which it goes to kmem_cache. This
   4150  * way has two benefits compared to just allocating from and
   4151  * freeing to kmem_cache.
   4152  * 1) The time_wait_collector can free (which includes the cleanup)
   4153  * outside the squeue. So when the interrupt comes, we have a clean
   4154  * connection sitting in the freelist. Obviously, this buys us
   4155  * performance.
   4156  *
   4157  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
   4158  * has multiple disadvantages - tying up the squeue during alloc.
   4159  * But allocating the conn/tcp in IP land is also not the best since
   4160  * we can't check the 'q' and 'q0' which are protected by squeue and
   4161  * blindly allocate memory which might have to be freed here if we are
   4162  * not allowed to accept the connection. By using the freelist and
   4163  * putting the conn/tcp back in freelist, we don't pay a penalty for
   4164  * allocating memory without checking 'q/q0' and freeing it if we can't
   4165  * accept the connection.
   4166  *
   4167  * Care should be taken to put the conn back in the same squeue's freelist
   4168  * from which it was allocated. Best results are obtained if conn is
 * allocated from listener's squeue and freed to the same. Time wait
 * collector will free up the freelist if the connection ends up sitting
 * there for too long.
   4172  */
   4173 void *
   4174 tcp_get_conn(void *arg, tcp_stack_t *tcps)
   4175 {
   4176 	tcp_t			*tcp = NULL;
   4177 	conn_t			*connp = NULL;
   4178 	squeue_t		*sqp = (squeue_t *)arg;
   4179 	tcp_squeue_priv_t 	*tcp_time_wait;
   4180 	netstack_t		*ns;
   4181 	mblk_t			*tcp_rsrv_mp = NULL;
   4182 
   4183 	tcp_time_wait =
   4184 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   4185 
   4186 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   4187 	tcp = tcp_time_wait->tcp_free_list;
   4188 	ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
   4189 	if (tcp != NULL) {
   4190 		tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   4191 		tcp_time_wait->tcp_free_list_cnt--;
   4192 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4193 		tcp->tcp_time_wait_next = NULL;
   4194 		connp = tcp->tcp_connp;
   4195 		connp->conn_flags |= IPCL_REUSED;
   4196 
   4197 		ASSERT(tcp->tcp_tcps == NULL);
   4198 		ASSERT(connp->conn_netstack == NULL);
   4199 		ASSERT(tcp->tcp_rsrv_mp != NULL);
   4200 		ns = tcps->tcps_netstack;
   4201 		netstack_hold(ns);
   4202 		connp->conn_netstack = ns;
   4203 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
   4204 		tcp->tcp_tcps = tcps;
   4205 		ipcl_globalhash_insert(connp);
   4206 
   4207 		connp->conn_ixa->ixa_notify_cookie = tcp;
   4208 		ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
   4209 		connp->conn_recv = tcp_input_data;
   4210 		ASSERT(connp->conn_recvicmp == tcp_icmp_input);
   4211 		ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
   4212 		return ((void *)connp);
   4213 	}
   4214 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4215 	/*
   4216 	 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
   4217 	 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
   4218 	 */
   4219 	tcp_rsrv_mp = allocb(0, BPRI_HI);
   4220 	if (tcp_rsrv_mp == NULL)
   4221 		return (NULL);
   4222 
   4223 	if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
   4224 	    tcps->tcps_netstack)) == NULL) {
   4225 		freeb(tcp_rsrv_mp);
   4226 		return (NULL);
   4227 	}
   4228 
   4229 	tcp = connp->conn_tcp;
   4230 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   4231 	mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
   4232 
   4233 	tcp->tcp_tcps = tcps;
   4234 
   4235 	connp->conn_recv = tcp_input_data;
   4236 	connp->conn_recvicmp = tcp_icmp_input;
   4237 	connp->conn_verifyicmp = tcp_verifyicmp;
   4238 
   4239 	/*
   4240 	 * Register tcp_notify to listen to capability changes detected by IP.
   4241 	 * This upcall is made in the context of the call to conn_ip_output
   4242 	 * thus it is inside the squeue.
   4243 	 */
   4244 	connp->conn_ixa->ixa_notify = tcp_notify;
   4245 	connp->conn_ixa->ixa_notify_cookie = tcp;
   4246 
   4247 	return ((void *)connp);
   4248 }
   4249 
   4250 /* BEGIN CSTYLED */
   4251 /*
   4252  *
   4253  * The sockfs ACCEPT path:
   4254  * =======================
   4255  *
   4256  * The eager is now established in its own perimeter as soon as SYN is
   4257  * received in tcp_input_listener(). When sockfs receives conn_ind, it
   4258  * completes the accept processing on the acceptor STREAM. The sending
   4259  * of conn_ind part is common for both sockfs listener and a TLI/XTI
   4260  * listener but a TLI/XTI listener completes the accept processing
   4261  * on the listener perimeter.
   4262  *
   4263  * Common control flow for 3 way handshake:
   4264  * ----------------------------------------
   4265  *
   4266  * incoming SYN (listener perimeter)	-> tcp_input_listener()
   4267  *
   4268  * incoming SYN-ACK-ACK (eager perim) 	-> tcp_input_data()
   4269  * send T_CONN_IND (listener perim)	-> tcp_send_conn_ind()
   4270  *
   4271  * Sockfs ACCEPT Path:
   4272  * -------------------
   4273  *
   4274  * open acceptor stream (tcp_open allocates tcp_tli_accept()
   4275  * as STREAM entry point)
   4276  *
   4277  * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
   4278  *
   4279  * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
   4280  * association (we are not behind eager's squeue but sockfs is protecting us
   4281  * and no one knows about this stream yet. The STREAMS entry point q->q_info
   4282  * is changed to point at tcp_wput().
   4283  *
   4284  * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
   4285  * listener (done on listener's perimeter).
   4286  *
   4287  * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish
   4288  * accept.
   4289  *
   4290  * TLI/XTI client ACCEPT path:
   4291  * ---------------------------
   4292  *
   4293  * soaccept() sends T_CONN_RES on the listener STREAM.
   4294  *
   4295  * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
   4296  * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()).
   4297  *
   4298  * Locks:
   4299  * ======
   4300  *
 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and
 * listeners->tcp_eager_next_q.
   4303  *
   4304  * Referencing:
   4305  * ============
   4306  *
   4307  * 1) We start out in tcp_input_listener by eager placing a ref on
   4308  * listener and listener adding eager to listeners->tcp_eager_next_q0.
   4309  *
   4310  * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
   4311  * doing so we place a ref on the eager. This ref is finally dropped at the
   4312  * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
   4313  * reference is dropped by the squeue framework.
   4314  *
   4315  * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
   4316  *
   4317  * The reference must be released by the same entity that added the reference
   4318  * In the above scheme, the eager is the entity that adds and releases the
   4319  * references. Note that tcp_accept_finish executes in the squeue of the eager
   4320  * (albeit after it is attached to the acceptor stream). Though 1. executes
   4321  * in the listener's squeue, the eager is nascent at this point and the
   4322  * reference can be considered to have been added on behalf of the eager.
   4323  *
   4324  * Eager getting a Reset or listener closing:
   4325  * ==========================================
   4326  *
   4327  * Once the listener and eager are linked, the listener never does the unlink.
   4328  * If the listener needs to close, tcp_eager_cleanup() is called which queues
   4329  * a message on all eager perimeter. The eager then does the unlink, clears
   4330  * any pointers to the listener's queue and drops the reference to the
   4331  * listener. The listener waits in tcp_close outside the squeue until its
   4332  * refcount has dropped to 1. This ensures that the listener has waited for
   4333  * all eagers to clear their association with the listener.
   4334  *
   4335  * Similarly, if eager decides to go away, it can unlink itself and close.
   4336  * When the T_CONN_RES comes down, we check if eager has closed. Note that
   4337  * the reference to eager is still valid because of the extra ref we put
   4338  * in tcp_send_conn_ind.
   4339  *
   4340  * Listener can always locate the eager under the protection
   4341  * of the listener->tcp_eager_lock, and then do a refhold
   4342  * on the eager during the accept processing.
   4343  *
   4344  * The acceptor stream accesses the eager in the accept processing
   4345  * based on the ref placed on eager before sending T_conn_ind.
   4346  * The only entity that can negate this refhold is a listener close
   4347  * which is mutually exclusive with an active acceptor stream.
   4348  *
   4349  * Eager's reference on the listener
   4350  * ===================================
   4351  *
   4352  * If the accept happens (even on a closed eager) the eager drops its
   4353  * reference on the listener at the start of tcp_accept_finish. If the
   4354  * eager is killed due to an incoming RST before the T_conn_ind is sent up,
   4355  * the reference is dropped in tcp_closei_local. If the listener closes,
   4356  * the reference is dropped in tcp_eager_kill. In all cases the reference
   4357  * is dropped while executing in the eager's context (squeue).
   4358  */
   4359 /* END CSTYLED */
   4360 
   4361 /* Process the SYN packet, mp, directed at the listener 'tcp' */
   4362 
   4363 /*
   4364  * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
   4365  * tcp_input_data will not see any packets for listeners since the listener
   4366  * has conn_recv set to tcp_input_listener.
   4367  */
   4368 /* ARGSUSED */
   4369 void
   4370 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4371 {
   4372 	tcpha_t		*tcpha;
   4373 	uint32_t	seg_seq;
   4374 	tcp_t		*eager;
   4375 	int		err;
   4376 	conn_t		*econnp = NULL;
   4377 	squeue_t	*new_sqp;
   4378 	mblk_t		*mp1;
   4379 	uint_t 		ip_hdr_len;
   4380 	conn_t		*lconnp = (conn_t *)arg;
   4381 	tcp_t		*listener = lconnp->conn_tcp;
   4382 	tcp_stack_t	*tcps = listener->tcp_tcps;
   4383 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
   4384 	uint_t		flags;
   4385 	mblk_t		*tpi_mp;
   4386 	uint_t		ifindex = ira->ira_ruifindex;
   4387 
   4388 	ip_hdr_len = ira->ira_ip_hdr_length;
   4389 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
   4390 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
   4391 
   4392 	if (!(flags & TH_SYN)) {
   4393 		if ((flags & TH_RST) || (flags & TH_URG)) {
   4394 			freemsg(mp);
   4395 			return;
   4396 		}
   4397 		if (flags & TH_ACK) {
   4398 			/* Note this executes in listener's squeue */
   4399 			tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
   4400 			return;
   4401 		}
   4402 
   4403 		freemsg(mp);
   4404 		return;
   4405 	}
   4406 
   4407 	if (listener->tcp_state != TCPS_LISTEN)
   4408 		goto error2;
   4409 
   4410 	ASSERT(IPCL_IS_BOUND(lconnp));
   4411 
   4412 	mutex_enter(&listener->tcp_eager_lock);
   4413 	if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
   4414 		mutex_exit(&listener->tcp_eager_lock);
   4415 		TCP_STAT(tcps, tcp_listendrop);
   4416 		BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
   4417 		if (lconnp->conn_debug) {
   4418 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   4419 			    "tcp_input_listener: listen backlog (max=%d) "
   4420 			    "overflow (%d pending) on %s",
   4421 			    listener->tcp_conn_req_max,
   4422 			    listener->tcp_conn_req_cnt_q,
   4423 			    tcp_display(listener, NULL, DISP_PORT_ONLY));
   4424 		}
   4425 		goto error2;
   4426 	}
   4427 
   4428 	if (listener->tcp_conn_req_cnt_q0 >=
   4429 	    listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
   4430 		/*
   4431 		 * Q0 is full. Drop a pending half-open req from the queue
   4432 		 * to make room for the new SYN req. Also mark the time we
   4433 		 * drop a SYN.
   4434 		 *
   4435 		 * A more aggressive defense against SYN attack will
   4436 		 * be to set the "tcp_syn_defense" flag now.
   4437 		 */
   4438 		TCP_STAT(tcps, tcp_listendropq0);
   4439 		listener->tcp_last_rcv_lbolt = ddi_get_lbolt64();
   4440 		if (!tcp_drop_q0(listener)) {
   4441 			mutex_exit(&listener->tcp_eager_lock);
   4442 			BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
   4443 			if (lconnp->conn_debug) {
   4444 				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   4445 				    "tcp_input_listener: listen half-open "
   4446 				    "queue (max=%d) full (%d pending) on %s",
   4447 				    tcps->tcps_conn_req_max_q0,
   4448 				    listener->tcp_conn_req_cnt_q0,
   4449 				    tcp_display(listener, NULL,
   4450 				    DISP_PORT_ONLY));
   4451 			}
   4452 			goto error2;
   4453 		}
   4454 	}
   4455 	mutex_exit(&listener->tcp_eager_lock);
   4456 
   4457 	/*
   4458 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   4459 	 * or based on the ring (for packets from GLD). Otherwise it is
   4460 	 * set based on lbolt i.e., a somewhat random number.
   4461 	 */
   4462 	ASSERT(ira->ira_sqp != NULL);
   4463 	new_sqp = ira->ira_sqp;
   4464 
   4465 	econnp = (conn_t *)tcp_get_conn(arg2, tcps);
   4466 	if (econnp == NULL)
   4467 		goto error2;
   4468 
   4469 	ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
   4470 	econnp->conn_sqp = new_sqp;
   4471 	econnp->conn_initial_sqp = new_sqp;
   4472 	econnp->conn_ixa->ixa_sqp = new_sqp;
   4473 
   4474 	econnp->conn_fport = tcpha->tha_lport;
   4475 	econnp->conn_lport = tcpha->tha_fport;
   4476 
   4477 	err = conn_inherit_parent(lconnp, econnp);
   4478 	if (err != 0)
   4479 		goto error3;
   4480 
   4481 	ASSERT(OK_32PTR(mp->b_rptr));
   4482 	ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
   4483 	    IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
   4484 
   4485 	if (lconnp->conn_family == AF_INET) {
   4486 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
   4487 		tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
   4488 	} else {
   4489 		tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
   4490 	}
   4491 
   4492 	if (tpi_mp == NULL)
   4493 		goto error3;
   4494 
   4495 	eager = econnp->conn_tcp;
   4496 	eager->tcp_detached = B_TRUE;
   4497 	SOCK_CONNID_INIT(eager->tcp_connid);
   4498 
   4499 	tcp_init_values(eager);
   4500 
   4501 	ASSERT((econnp->conn_ixa->ixa_flags &
   4502 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4503 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
   4504 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4505 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
   4506 
   4507 	if (!tcps->tcps_dev_flow_ctl)
   4508 		econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
   4509 
   4510 	/* Prepare for diffing against previous packets */
   4511 	eager->tcp_recvifindex = 0;
   4512 	eager->tcp_recvhops = 0xffffffffU;
   4513 
   4514 	if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
   4515 		if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
   4516 		    IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
   4517 			econnp->conn_incoming_ifindex = ifindex;
   4518 			econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   4519 			econnp->conn_ixa->ixa_scopeid = ifindex;
   4520 		}
   4521 	}
   4522 
   4523 	if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
   4524 	    (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
   4525 	    tcps->tcps_rev_src_routes) {
   4526 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
   4527 		ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
   4528 
   4529 		/* Source routing option copyover (reverse it) */
   4530 		err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
   4531 		if (err != 0) {
   4532 			freemsg(tpi_mp);
   4533 			goto error3;
   4534 		}
   4535 		ip_pkt_source_route_reverse_v4(ipp);
   4536 	}
   4537 
   4538 	ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
   4539 	ASSERT(!eager->tcp_tconnind_started);
   4540 	/*
   4541 	 * If the SYN came with a credential, it's a loopback packet or a
   4542 	 * labeled packet; attach the credential to the TPI message.
   4543 	 */
   4544 	if (ira->ira_cred != NULL)
   4545 		mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
   4546 
   4547 	eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
   4548 
   4549 	/* Inherit the listener's SSL protection state */
   4550 	if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) {
   4551 		kssl_hold_ent(eager->tcp_kssl_ent);
   4552 		eager->tcp_kssl_pending = B_TRUE;
   4553 	}
   4554 
   4555 	/* Inherit the listener's non-STREAMS flag */
   4556 	if (IPCL_IS_NONSTR(lconnp)) {
   4557 		econnp->conn_flags |= IPCL_NONSTR;
   4558 	}
   4559 
   4560 	ASSERT(eager->tcp_ordrel_mp == NULL);
   4561 
   4562 	if (!IPCL_IS_NONSTR(econnp)) {
   4563 		/*
   4564 		 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
   4565 		 * at close time, we will always have that to send up.
   4566 		 * Otherwise, we need to do special handling in case the
   4567 		 * allocation fails at that time.
   4568 		 */
   4569 		if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
   4570 			goto error3;
   4571 	}
   4572 	/*
   4573 	 * Now that the IP addresses and ports are setup in econnp we
   4574 	 * can do the IPsec policy work.
   4575 	 */
   4576 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   4577 		if (lconnp->conn_policy != NULL) {
   4578 			/*
   4579 			 * Inherit the policy from the listener; use
   4580 			 * actions from ira
   4581 			 */
   4582 			if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
   4583 				CONN_DEC_REF(econnp);
   4584 				freemsg(mp);
   4585 				goto error3;
   4586 			}
   4587 		}
   4588 	}
   4589 
   4590 	/* Inherit various TCP parameters from the listener */
   4591 	eager->tcp_naglim = listener->tcp_naglim;
   4592 	eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold;
   4593 	eager->tcp_second_timer_threshold =
   4594 	    listener->tcp_second_timer_threshold;
   4595 	eager->tcp_first_ctimer_threshold =
   4596 	    listener->tcp_first_ctimer_threshold;
   4597 	eager->tcp_second_ctimer_threshold =
   4598 	    listener->tcp_second_ctimer_threshold;
   4599 
   4600 	/*
   4601 	 * tcp_set_destination() may set tcp_rwnd according to the route
   4602 	 * metrics. If it does not, the eager's receive window will be set
   4603 	 * to the listener's receive window later in this function.
   4604 	 */
   4605 	eager->tcp_rwnd = 0;
   4606 
   4607 	/*
   4608 	 * Inherit listener's tcp_init_cwnd.  Need to do this before
   4609 	 * calling tcp_process_options() which set the initial cwnd.
   4610 	 */
   4611 	eager->tcp_init_cwnd = listener->tcp_init_cwnd;
   4612 
   4613 	if (is_system_labeled()) {
   4614 		ip_xmit_attr_t *ixa = econnp->conn_ixa;
   4615 
   4616 		ASSERT(ira->ira_tsl != NULL);
   4617 		/* Discard any old label */
   4618 		if (ixa->ixa_free_flags & IXA_FREE_TSL) {
   4619 			ASSERT(ixa->ixa_tsl != NULL);
   4620 			label_rele(ixa->ixa_tsl);
   4621 			ixa->ixa_free_flags &= ~IXA_FREE_TSL;
   4622 			ixa->ixa_tsl = NULL;
   4623 		}
   4624 		if ((lconnp->conn_mlp_type != mlptSingle ||
   4625 		    lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
   4626 		    ira->ira_tsl != NULL) {
   4627 			/*
   4628 			 * If this is an MLP connection or a MAC-Exempt
   4629 			 * connection with an unlabeled node, packets are to be
   4630 			 * exchanged using the security label of the received
   4631 			 * SYN packet instead of the server application's label.
   4632 			 * tsol_check_dest called from ip_set_destination
   4633 			 * might later update TSF_UNLABELED by replacing
   4634 			 * ixa_tsl with a new label.
   4635 			 */
   4636 			label_hold(ira->ira_tsl);
   4637 			ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
   4638 			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
   4639 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4640 		} else {
   4641 			ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
   4642 			DTRACE_PROBE2(syn_accept, conn_t *,
   4643 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4644 		}
   4645 		/*
   4646 		 * conn_connect() called from tcp_set_destination will verify
   4647 		 * the destination is allowed to receive packets at the
   4648 		 * security label of the SYN-ACK we are generating. As part of
   4649 		 * that, tsol_check_dest() may create a new effective label for
   4650 		 * this connection.
   4651 		 * Finally conn_connect() will call conn_update_label.
   4652 		 * All that remains for TCP to do is to call
   4653 		 * conn_build_hdr_template which is done as part of
   4654 		 * tcp_set_destination.
   4655 		 */
   4656 	}
   4657 
   4658 	/*
   4659 	 * Since we will clear tcp_listener before we clear tcp_detached
   4660 	 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
   4661 	 * so we can tell a TCP_DETACHED_NONEAGER apart.
   4662 	 */
   4663 	eager->tcp_hard_binding = B_TRUE;
   4664 
   4665 	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
   4666 	    TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
   4667 
   4668 	CL_INET_CONNECT(econnp, B_FALSE, err);
   4669 	if (err != 0) {
   4670 		tcp_bind_hash_remove(eager);
   4671 		goto error3;
   4672 	}
   4673 
   4674 	/*
   4675 	 * No need to check for multicast destination since ip will only pass
   4676 	 * up multicasts to those that have expressed interest
   4677 	 * TODO: what about rejecting broadcasts?
   4678 	 * Also check that source is not a multicast or broadcast address.
   4679 	 */
   4680 	eager->tcp_state = TCPS_SYN_RCVD;
   4681 	SOCK_CONNID_BUMP(eager->tcp_connid);
   4682 
   4683 	/*
   4684 	 * Adapt our mss, ttl, ... based on the remote address.
   4685 	 */
   4686 
   4687 	if (tcp_set_destination(eager) != 0) {
   4688 		BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   4689 		/* Undo the bind_hash_insert */
   4690 		tcp_bind_hash_remove(eager);
   4691 		goto error3;
   4692 	}
   4693 
   4694 	/* Process all TCP options. */
   4695 	tcp_process_options(eager, tcpha);
   4696 
   4697 	/* Is the other end ECN capable? */
   4698 	if (tcps->tcps_ecn_permitted >= 1 &&
   4699 	    (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
   4700 		eager->tcp_ecn_ok = B_TRUE;
   4701 	}
   4702 
   4703 	/*
   4704 	 * The listener's conn_rcvbuf should be the default window size or a
   4705 	 * window size changed via SO_RCVBUF option. First round up the
   4706 	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
   4707 	 * scale option value if needed. Call tcp_rwnd_set() to finish the
   4708 	 * setting.
   4709 	 *
   4710 	 * Note if there is a rpipe metric associated with the remote host,
   4711 	 * we should not inherit receive window size from listener.
   4712 	 */
   4713 	eager->tcp_rwnd = MSS_ROUNDUP(
   4714 	    (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
   4715 	    eager->tcp_rwnd), eager->tcp_mss);
   4716 	if (eager->tcp_snd_ws_ok)
   4717 		tcp_set_ws_value(eager);
   4718 	/*
   4719 	 * Note that this is the only place tcp_rwnd_set() is called for
   4720 	 * accepting a connection.  We need to call it here instead of
   4721 	 * after the 3-way handshake because we need to tell the other
   4722 	 * side our rwnd in the SYN-ACK segment.
   4723 	 */
   4724 	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
   4725 
   4726 	ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
   4727 	    eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
   4728 
   4729 	ASSERT(econnp->conn_rcvbuf != 0 &&
   4730 	    econnp->conn_rcvbuf == eager->tcp_rwnd);
   4731 
   4732 	/* Put a ref on the listener for the eager. */
   4733 	CONN_INC_REF(lconnp);
   4734 	mutex_enter(&listener->tcp_eager_lock);
   4735 	listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
   4736 	eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
   4737 	listener->tcp_eager_next_q0 = eager;
   4738 	eager->tcp_eager_prev_q0 = listener;
   4739 
   4740 	/* Set tcp_listener before adding it to tcp_conn_fanout */
   4741 	eager->tcp_listener = listener;
   4742 	eager->tcp_saved_listener = listener;
   4743 
   4744 	/*
   4745 	 * Tag this detached tcp vector for later retrieval
   4746 	 * by our listener client in tcp_accept().
   4747 	 */
   4748 	eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
   4749 	listener->tcp_conn_req_cnt_q0++;
   4750 	if (++listener->tcp_conn_req_seqnum == -1) {
   4751 		/*
   4752 		 * -1 is "special" and defined in TPI as something
   4753 		 * that should never be used in T_CONN_IND
   4754 		 */
   4755 		++listener->tcp_conn_req_seqnum;
   4756 	}
   4757 	mutex_exit(&listener->tcp_eager_lock);
   4758 
   4759 	if (listener->tcp_syn_defense) {
   4760 		/* Don't drop the SYN that comes from a good IP source */
   4761 		ipaddr_t *addr_cache;
   4762 
   4763 		addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
   4764 		if (addr_cache != NULL && econnp->conn_faddr_v4 ==
   4765 		    addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
   4766 			eager->tcp_dontdrop = B_TRUE;
   4767 		}
   4768 	}
   4769 
   4770 	/*
   4771 	 * We need to insert the eager in its own perimeter but as soon
   4772 	 * as we do that, we expose the eager to the classifier and
   4773 	 * should not touch any field outside the eager's perimeter.
   4774 	 * So do all the work necessary before inserting the eager
   4775 	 * in its own perimeter. Be optimistic that conn_connect()
   4776 	 * will succeed but undo everything if it fails.
   4777 	 */
   4778 	seg_seq = ntohl(tcpha->tha_seq);
   4779 	eager->tcp_irs = seg_seq;
   4780 	eager->tcp_rack = seg_seq;
   4781 	eager->tcp_rnxt = seg_seq + 1;
   4782 	eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
   4783 	BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
   4784 	eager->tcp_state = TCPS_SYN_RCVD;
   4785 	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
   4786 	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
   4787 	if (mp1 == NULL) {
   4788 		/*
   4789 		 * Increment the ref count as we are going to
   4790 		 * enqueueing an mp in squeue
   4791 		 */
   4792 		CONN_INC_REF(econnp);
   4793 		goto error;
   4794 	}
   4795 
   4796 	/*
   4797 	 * We need to start the rto timer. In normal case, we start
   4798 	 * the timer after sending the packet on the wire (or at
   4799 	 * least believing that packet was sent by waiting for
   4800 	 * conn_ip_output() to return). Since this is the first packet
   4801 	 * being sent on the wire for the eager, our initial tcp_rto
   4802 	 * is at least tcp_rexmit_interval_min which is a fairly
   4803 	 * large value to allow the algorithm to adjust slowly to large
   4804 	 * fluctuations of RTT during first few transmissions.
   4805 	 *
   4806 	 * Starting the timer first and then sending the packet in this
   4807 	 * case shouldn't make much difference since tcp_rexmit_interval_min
   4808 	 * is of the order of several 100ms and starting the timer
   4809 	 * first and then sending the packet will result in difference
   4810 	 * of few micro seconds.
   4811 	 *
   4812 	 * Without this optimization, we are forced to hold the fanout
   4813 	 * lock across the ipcl_bind_insert() and sending the packet
   4814 	 * so that we don't race against an incoming packet (maybe RST)
   4815 	 * for this eager.
   4816 	 *
   4817 	 * It is necessary to acquire an extra reference on the eager
   4818 	 * at this point and hold it until after tcp_send_data() to
   4819 	 * ensure against an eager close race.
   4820 	 */
   4821 
   4822 	CONN_INC_REF(econnp);
   4823 
   4824 	TCP_TIMER_RESTART(eager, eager->tcp_rto);
   4825 
   4826 	/*
   4827 	 * Insert the eager in its own perimeter now. We are ready to deal
   4828 	 * with any packets on eager.
   4829 	 */
   4830 	if (ipcl_conn_insert(econnp) != 0)
   4831 		goto error;
   4832 
   4833 	/*
   4834 	 * Send the SYN-ACK. Can't use tcp_send_data since we can't update
   4835 	 * pmtu etc; we are not on the eager's squeue
   4836 	 */
   4837 	ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
   4838 	(void) conn_ip_output(mp1, econnp->conn_ixa);
   4839 	CONN_DEC_REF(econnp);
   4840 	freemsg(mp);
   4841 
   4842 	return;
   4843 error:
   4844 	freemsg(mp1);
   4845 	eager->tcp_closemp_used = B_TRUE;
   4846 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   4847 	mp1 = &eager->tcp_closemp;
   4848 	SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
   4849 	    econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
   4850 
   4851 	/*
   4852 	 * If a connection already exists, send the mp to that connections so
   4853 	 * that it can be appropriately dealt with.
   4854 	 */
   4855 	ipst = tcps->tcps_netstack->netstack_ip;
   4856 
   4857 	if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
   4858 		if (!IPCL_IS_CONNECTED(econnp)) {
   4859 			/*
   4860 			 * Something bad happened. ipcl_conn_insert()
   4861 			 * failed because a connection already existed
   4862 			 * in connected hash but we can't find it
   4863 			 * anymore (someone blew it away). Just
   4864 			 * free this message and hopefully remote
   4865 			 * will retransmit at which time the SYN can be
   4866 			 * treated as a new connection or dealth with
   4867 			 * a TH_RST if a connection already exists.
   4868 			 */
   4869 			CONN_DEC_REF(econnp);
   4870 			freemsg(mp);
   4871 		} else {
   4872 			SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
   4873 			    econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
   4874 		}
   4875 	} else {
   4876 		/* Nobody wants this packet */
   4877 		freemsg(mp);
   4878 	}
   4879 	return;
   4880 error3:
   4881 	CONN_DEC_REF(econnp);
   4882 error2:
   4883 	freemsg(mp);
   4884 }
   4885 
   4886 /*
   4887  * In an ideal case of vertical partition in NUMA architecture, its
   4888  * beneficial to have the listener and all the incoming connections
   4889  * tied to the same squeue. The other constraint is that incoming
   4890  * connections should be tied to the squeue attached to interrupted
   4891  * CPU for obvious locality reason so this leaves the listener to
   4892  * be tied to the same squeue. Our only problem is that when listener
   4893  * is binding, the CPU that will get interrupted by the NIC whose
   4894  * IP address the listener is binding to is not even known. So
   4895  * the code below allows us to change that binding at the time the
   4896  * CPU is interrupted by virtue of incoming connection's squeue.
   4897  *
   4898  * This is usefull only in case of a listener bound to a specific IP
   4899  * address. For other kind of listeners, they get bound the
   4900  * very first time and there is no attempt to rebind them.
   4901  */
   4902 void
   4903 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
   4904     ip_recv_attr_t *ira)
   4905 {
   4906 	conn_t		*connp = (conn_t *)arg;
   4907 	squeue_t	*sqp = (squeue_t *)arg2;
   4908 	squeue_t	*new_sqp;
   4909 	uint32_t	conn_flags;
   4910 
   4911 	/*
   4912 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   4913 	 * or based on the ring (for packets from GLD). Otherwise it is
   4914 	 * set based on lbolt i.e., a somewhat random number.
   4915 	 */
   4916 	ASSERT(ira->ira_sqp != NULL);
   4917 	new_sqp = ira->ira_sqp;
   4918 
   4919 	if (connp->conn_fanout == NULL)
   4920 		goto done;
   4921 
   4922 	if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
   4923 		mutex_enter(&connp->conn_fanout->connf_lock);
   4924 		mutex_enter(&connp->conn_lock);
   4925 		/*
   4926 		 * No one from read or write side can access us now
   4927 		 * except for already queued packets on this squeue.
   4928 		 * But since we haven't changed the squeue yet, they
   4929 		 * can't execute. If they are processed after we have
   4930 		 * changed the squeue, they are sent back to the
   4931 		 * correct squeue down below.
   4932 		 * But a listner close can race with processing of
   4933 		 * incoming SYN. If incoming SYN processing changes
   4934 		 * the squeue then the listener close which is waiting
   4935 		 * to enter the squeue would operate on the wrong
   4936 		 * squeue. Hence we don't change the squeue here unless
   4937 		 * the refcount is exactly the minimum refcount. The
   4938 		 * minimum refcount of 4 is counted as - 1 each for
   4939 		 * TCP and IP, 1 for being in the classifier hash, and
   4940 		 * 1 for the mblk being processed.
   4941 		 */
   4942 
   4943 		if (connp->conn_ref != 4 ||
   4944 		    connp->conn_tcp->tcp_state != TCPS_LISTEN) {
   4945 			mutex_exit(&connp->conn_lock);
   4946 			mutex_exit(&connp->conn_fanout->connf_lock);
   4947 			goto done;
   4948 		}
   4949 		if (connp->conn_sqp != new_sqp) {
   4950 			while (connp->conn_sqp != new_sqp)
   4951 				(void) casptr(&connp->conn_sqp, sqp, new_sqp);
   4952 			/* No special MT issues for outbound ixa_sqp hint */
   4953 			connp->conn_ixa->ixa_sqp = new_sqp;
   4954 		}
   4955 
   4956 		do {
   4957 			conn_flags = connp->conn_flags;
   4958 			conn_flags |= IPCL_FULLY_BOUND;
   4959 			(void) cas32(&connp->conn_flags, connp->conn_flags,
   4960 			    conn_flags);
   4961 		} while (!(connp->conn_flags & IPCL_FULLY_BOUND));
   4962 
   4963 		mutex_exit(&connp->conn_fanout->connf_lock);
   4964 		mutex_exit(&connp->conn_lock);
   4965 
   4966 		/*
   4967 		 * Assume we have picked a good squeue for the listener. Make
   4968 		 * subsequent SYNs not try to change the squeue.
   4969 		 */
   4970 		connp->conn_recv = tcp_input_listener;
   4971 	}
   4972 
   4973 done:
   4974 	if (connp->conn_sqp != sqp) {
   4975 		CONN_INC_REF(connp);
   4976 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
   4977 		    ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
   4978 	} else {
   4979 		tcp_input_listener(connp, mp, sqp, ira);
   4980 	}
   4981 }
   4982 
   4983 /*
   4984  * Successful connect request processing begins when our client passes
   4985  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
   4986  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
   4987  *
   4988  * After various error checks are completed, tcp_tpi_connect() lays
   4989  * the target address and port into the composite header template.
   4990  * Then we ask IP for information, including a source address if we didn't
   4991  * already have one. Finally we prepare to send the SYN packet, and then
   4992  * send up the T_OK_ACK reply message.
   4993  */
   4994 static void
   4995 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
   4996 {
   4997 	sin_t		*sin;
   4998 	struct T_conn_req	*tcr;
   4999 	struct sockaddr	*sa;
   5000 	socklen_t	len;
   5001 	int		error;
   5002 	cred_t		*cr;
   5003 	pid_t		cpid;
   5004 	conn_t		*connp = tcp->tcp_connp;
   5005 	queue_t		*q = connp->conn_wq;
   5006 
   5007 	/*
   5008 	 * All Solaris components should pass a db_credp
   5009 	 * for this TPI message, hence we ASSERT.
   5010 	 * But in case there is some other M_PROTO that looks
   5011 	 * like a TPI message sent by some other kernel
   5012 	 * component, we check and return an error.
   5013 	 */
   5014 	cr = msg_getcred(mp, &cpid);
   5015 	ASSERT(cr != NULL);
   5016 	if (cr == NULL) {
   5017 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5018 		return;
   5019 	}
   5020 
   5021 	tcr = (struct T_conn_req *)mp->b_rptr;
   5022 
   5023 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5024 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   5025 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5026 		return;
   5027 	}
   5028 
   5029 	/*
   5030 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
   5031 	 * will always have that to send up.  Otherwise, we need to do
   5032 	 * special handling in case the allocation fails at that time.
   5033 	 * If the end point is TPI, the tcp_t can be reused and the
   5034 	 * tcp_ordrel_mp may be allocated already.
   5035 	 */
   5036 	if (tcp->tcp_ordrel_mp == NULL) {
   5037 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
   5038 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5039 			return;
   5040 		}
   5041 	}
   5042 
   5043 	/*
   5044 	 * Determine packet type based on type of address passed in
   5045 	 * the request should contain an IPv4 or IPv6 address.
   5046 	 * Make sure that address family matches the type of
   5047 	 * family of the address passed down.
   5048 	 */
   5049 	switch (tcr->DEST_length) {
   5050 	default:
   5051 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   5052 		return;
   5053 
   5054 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
   5055 		/*
   5056 		 * XXX: The check for valid DEST_length was not there
   5057 		 * in earlier releases and some buggy
   5058 		 * TLI apps (e.g Sybase) got away with not feeding
   5059 		 * in sin_zero part of address.
   5060 		 * We allow that bug to keep those buggy apps humming.
   5061 		 * Test suites require the check on DEST_length.
   5062 		 * We construct a new mblk with valid DEST_length
   5063 		 * free the original so the rest of the code does
   5064 		 * not have to keep track of this special shorter
   5065 		 * length address case.
   5066 		 */
   5067 		mblk_t *nmp;
   5068 		struct T_conn_req *ntcr;
   5069 		sin_t *nsin;
   5070 
   5071 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
   5072 		    tcr->OPT_length, BPRI_HI);
   5073 		if (nmp == NULL) {
   5074 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5075 			return;
   5076 		}
   5077 		ntcr = (struct T_conn_req *)nmp->b_rptr;
   5078 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
   5079 		ntcr->PRIM_type = T_CONN_REQ;
   5080 		ntcr->DEST_length = sizeof (sin_t);
   5081 		ntcr->DEST_offset = sizeof (struct T_conn_req);
   5082 
   5083 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
   5084 		*nsin = sin_null;
   5085 		/* Get pointer to shorter address to copy from original mp */
   5086 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
   5087 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
   5088 		if (sin == NULL || !OK_32PTR((char *)sin)) {
   5089 			freemsg(nmp);
   5090 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5091 			return;
   5092 		}
   5093 		nsin->sin_family = sin->sin_family;
   5094 		nsin->sin_port = sin->sin_port;
   5095 		nsin->sin_addr = sin->sin_addr;
   5096 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
   5097 		nmp->b_wptr = (uchar_t *)&nsin[1];
   5098 		if (tcr->OPT_length != 0) {
   5099 			ntcr->OPT_length = tcr->OPT_length;
   5100 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
   5101 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
   5102 			    (uchar_t *)ntcr + ntcr->OPT_offset,
   5103 			    tcr->OPT_length);
   5104 			nmp->b_wptr += tcr->OPT_length;
   5105 		}
   5106 		freemsg(mp);	/* original mp freed */
   5107 		mp = nmp;	/* re-initialize original variables */
   5108 		tcr = ntcr;
   5109 	}
   5110 	/* FALLTHRU */
   5111 
   5112 	case sizeof (sin_t):
   5113 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5114 		    sizeof (sin_t));
   5115 		len = sizeof (sin_t);
   5116 		break;
   5117 
   5118 	case sizeof (sin6_t):
   5119 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5120 		    sizeof (sin6_t));
   5121 		len = sizeof (sin6_t);
   5122 		break;
   5123 	}
   5124 
   5125 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
   5126 	if (error != 0) {
   5127 		tcp_err_ack(tcp, mp, TSYSERR, error);
   5128 		return;
   5129 	}
   5130 
   5131 	/*
   5132 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
   5133 	 * should key on their sequence number and cut them loose.
   5134 	 */
   5135 
   5136 	/*
   5137 	 * If options passed in, feed it for verification and handling
   5138 	 */
   5139 	if (tcr->OPT_length != 0) {
   5140 		mblk_t	*ok_mp;
   5141 		mblk_t	*discon_mp;
   5142 		mblk_t  *conn_opts_mp;
   5143 		int t_error, sys_error, do_disconnect;
   5144 
   5145 		conn_opts_mp = NULL;
   5146 
   5147 		if (tcp_conprim_opt_process(tcp, mp,
   5148 		    &do_disconnect, &t_error, &sys_error) < 0) {
   5149 			if (do_disconnect) {
   5150 				ASSERT(t_error == 0 && sys_error == 0);
   5151 				discon_mp = mi_tpi_discon_ind(NULL,
   5152 				    ECONNREFUSED, 0);
   5153 				if (!discon_mp) {
   5154 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5155 					    TSYSERR, ENOMEM);
   5156 					return;
   5157 				}
   5158 				ok_mp = mi_tpi_ok_ack_alloc(mp);
   5159 				if (!ok_mp) {
   5160 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5161 					    TSYSERR, ENOMEM);
   5162 					return;
   5163 				}
   5164 				qreply(q, ok_mp);
   5165 				qreply(q, discon_mp); /* no flush! */
   5166 			} else {
   5167 				ASSERT(t_error != 0);
   5168 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
   5169 				    sys_error);
   5170 			}
   5171 			return;
   5172 		}
   5173 		/*
   5174 		 * Success in setting options, the mp option buffer represented
   5175 		 * by OPT_length/offset has been potentially modified and
   5176 		 * contains results of option processing. We copy it in
   5177 		 * another mp to save it for potentially influencing returning
   5178 		 * it in T_CONN_CONN.
   5179 		 */
   5180 		if (tcr->OPT_length != 0) { /* there are resulting options */
   5181 			conn_opts_mp = copyb(mp);
   5182 			if (!conn_opts_mp) {
   5183 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5184 				    TSYSERR, ENOMEM);
   5185 				return;
   5186 			}
   5187 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
   5188 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
   5189 			/*
   5190 			 * Note:
   5191 			 * These resulting option negotiation can include any
   5192 			 * end-to-end negotiation options but there no such
   5193 			 * thing (yet?) in our TCP/IP.
   5194 			 */
   5195 		}
   5196 	}
   5197 
   5198 	/* call the non-TPI version */
   5199 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
   5200 	if (error < 0) {
   5201 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
   5202 	} else if (error > 0) {
   5203 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
   5204 	} else {
   5205 		mp = mi_tpi_ok_ack_alloc(mp);
   5206 	}
   5207 
   5208 	/*
   5209 	 * Note: Code below is the "failure" case
   5210 	 */
   5211 	/* return error ack and blow away saved option results if any */
   5212 connect_failed:
   5213 	if (mp != NULL)
   5214 		putnext(connp->conn_rq, mp);
   5215 	else {
   5216 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5217 		    TSYSERR, ENOMEM);
   5218 	}
   5219 }
   5220 
   5221 /*
   5222  * Handle connect to IPv4 destinations, including connections for AF_INET6
   5223  * sockets connecting to IPv4 mapped IPv6 destinations.
   5224  * Returns zero if OK, a positive errno, or a negative TLI error.
   5225  */
   5226 static int
   5227 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
   5228     uint_t srcid)
   5229 {
   5230 	ipaddr_t 	dstaddr = *dstaddrp;
   5231 	uint16_t 	lport;
   5232 	conn_t		*connp = tcp->tcp_connp;
   5233 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5234 	int		error;
   5235 
   5236 	ASSERT(connp->conn_ipversion == IPV4_VERSION);
   5237 
   5238 	/* Check for attempt to connect to INADDR_ANY */
   5239 	if (dstaddr == INADDR_ANY)  {
   5240 		/*
   5241 		 * SunOS 4.x and 4.3 BSD allow an application
   5242 		 * to connect a TCP socket to INADDR_ANY.
   5243 		 * When they do this, the kernel picks the
   5244 		 * address of one interface and uses it
   5245 		 * instead.  The kernel usually ends up
   5246 		 * picking the address of the loopback
   5247 		 * interface.  This is an undocumented feature.
   5248 		 * However, we provide the same thing here
   5249 		 * in order to have source and binary
   5250 		 * compatibility with SunOS 4.x.
   5251 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5252 		 * generate the T_CONN_CON.
   5253 		 */
   5254 		dstaddr = htonl(INADDR_LOOPBACK);
   5255 		*dstaddrp = dstaddr;
   5256 	}
   5257 
   5258 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5259 	if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) {
   5260 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5261 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5262 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5263 	}
   5264 
   5265 	IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6);
   5266 	connp->conn_fport = dstport;
   5267 
   5268 	/*
   5269 	 * At this point the remote destination address and remote port fields
   5270 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5271 	 * have to see which state tcp was in so we can take appropriate action.
   5272 	 */
   5273 	if (tcp->tcp_state == TCPS_IDLE) {
   5274 		/*
   5275 		 * We support a quick connect capability here, allowing
   5276 		 * clients to transition directly from IDLE to SYN_SENT
   5277 		 * tcp_bindi will pick an unused port, insert the connection
   5278 		 * in the bind hash and transition to BOUND state.
   5279 		 */
   5280 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5281 		    tcp, B_TRUE);
   5282 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5283 		    B_FALSE, B_FALSE);
   5284 		if (lport == 0)
   5285 			return (-TNOADDR);
   5286 	}
   5287 
   5288 	/*
   5289 	 * Lookup the route to determine a source address and the uinfo.
   5290 	 * Setup TCP parameters based on the metrics/DCE.
   5291 	 */
   5292 	error = tcp_set_destination(tcp);
   5293 	if (error != 0)
   5294 		return (error);
   5295 
   5296 	/*
   5297 	 * Don't let an endpoint connect to itself.
   5298 	 */
   5299 	if (connp->conn_faddr_v4 == connp->conn_laddr_v4 &&
   5300 	    connp->conn_fport == connp->conn_lport)
   5301 		return (-TBADADDR);
   5302 
   5303 	tcp->tcp_state = TCPS_SYN_SENT;
   5304 
   5305 	return (ipcl_conn_insert_v4(connp));
   5306 }
   5307 
   5308 /*
   5309  * Handle connect to IPv6 destinations.
   5310  * Returns zero if OK, a positive errno, or a negative TLI error.
   5311  */
   5312 static int
   5313 tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
   5314     uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
   5315 {
   5316 	uint16_t 	lport;
   5317 	conn_t		*connp = tcp->tcp_connp;
   5318 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5319 	int		error;
   5320 
   5321 	ASSERT(connp->conn_family == AF_INET6);
   5322 
   5323 	/*
   5324 	 * If we're here, it means that the destination address is a native
   5325 	 * IPv6 address.  Return an error if conn_ipversion is not IPv6.  A
   5326 	 * reason why it might not be IPv6 is if the socket was bound to an
   5327 	 * IPv4-mapped IPv6 address.
   5328 	 */
   5329 	if (connp->conn_ipversion != IPV6_VERSION)
   5330 		return (-TBADADDR);
   5331 
   5332 	/*
   5333 	 * Interpret a zero destination to mean loopback.
   5334 	 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5335 	 * generate the T_CONN_CON.
   5336 	 */
   5337 	if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp))
   5338 		*dstaddrp = ipv6_loopback;
   5339 
   5340 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5341 	if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
   5342 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5343 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5344 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5345 	}
   5346 
   5347 	/*
   5348 	 * Take care of the scope_id now.
   5349 	 */
   5350 	if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
   5351 		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   5352 		connp->conn_ixa->ixa_scopeid = scope_id;
   5353 	} else {
   5354 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   5355 	}
   5356 
   5357 	connp->conn_flowinfo = flowinfo;
   5358 	connp->conn_faddr_v6 = *dstaddrp;
   5359 	connp->conn_fport = dstport;
   5360 
   5361 	/*
   5362 	 * At this point the remote destination address and remote port fields
   5363 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5364 	 * have to see which state tcp was in so we can take appropriate action.
   5365 	 */
   5366 	if (tcp->tcp_state == TCPS_IDLE) {
   5367 		/*
   5368 		 * We support a quick connect capability here, allowing
   5369 		 * clients to transition directly from IDLE to SYN_SENT
   5370 		 * tcp_bindi will pick an unused port, insert the connection
   5371 		 * in the bind hash and transition to BOUND state.
   5372 		 */
   5373 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5374 		    tcp, B_TRUE);
   5375 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5376 		    B_FALSE, B_FALSE);
   5377 		if (lport == 0)
   5378 			return (-TNOADDR);
   5379 	}
   5380 
   5381 	/*
   5382 	 * Lookup the route to determine a source address and the uinfo.
   5383 	 * Setup TCP parameters based on the metrics/DCE.
   5384 	 */
   5385 	error = tcp_set_destination(tcp);
   5386 	if (error != 0)
   5387 		return (error);
   5388 
   5389 	/*
   5390 	 * Don't let an endpoint connect to itself.
   5391 	 */
   5392 	if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) &&
   5393 	    connp->conn_fport == connp->conn_lport)
   5394 		return (-TBADADDR);
   5395 
   5396 	tcp->tcp_state = TCPS_SYN_SENT;
   5397 
   5398 	return (ipcl_conn_insert_v6(connp));
   5399 }
   5400 
   5401 /*
   5402  * Disconnect
   5403  * Note that unlike other functions this returns a positive tli error
   5404  * when it fails; it never returns an errno.
   5405  */
   5406 static int
   5407 tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
   5408 {
   5409 	conn_t		*lconnp;
   5410 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5411 	conn_t		*connp = tcp->tcp_connp;
   5412 
   5413 	/*
   5414 	 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
   5415 	 * when the stream is in BOUND state. Do not send a reset,
   5416 	 * since the destination IP address is not valid, and it can
   5417 	 * be the initialized value of all zeros (broadcast address).
   5418 	 */
   5419 	if (tcp->tcp_state <= TCPS_BOUND) {
   5420 		if (connp->conn_debug) {
   5421 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   5422 			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
   5423 		}
   5424 		return (TOUTSTATE);
   5425 	}
   5426 
   5427 
   5428 	if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
   5429 
   5430 		/*
   5431 		 * According to TPI, for non-listeners, ignore seqnum
   5432 		 * and disconnect.
   5433 		 * Following interpretation of -1 seqnum is historical
   5434 		 * and implied TPI ? (TPI only states that for T_CONN_IND,
   5435 		 * a valid seqnum should not be -1).
   5436 		 *
   5437 		 *	-1 means disconnect everything
   5438 		 *	regardless even on a listener.
   5439 		 */
   5440 
   5441 		int old_state = tcp->tcp_state;
   5442 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
   5443 
   5444 		/*
   5445 		 * The connection can't be on the tcp_time_wait_head list
   5446 		 * since it is not detached.
   5447 		 */
   5448 		ASSERT(tcp->tcp_time_wait_next == NULL);
   5449 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   5450 		ASSERT(tcp->tcp_time_wait_expire == 0);
   5451 		/*
   5452 		 * If it used to be a listener, check to make sure no one else
   5453 		 * has taken the port before switching back to LISTEN state.
   5454 		 */
   5455 		if (connp->conn_ipversion == IPV4_VERSION) {
   5456 			lconnp = ipcl_lookup_listener_v4(connp->conn_lport,
   5457 			    connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst);
   5458 		} else {
   5459 			uint_t ifindex = 0;
   5460 
   5461 			if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)
   5462 				ifindex = connp->conn_ixa->ixa_scopeid;
   5463 
   5464 			/* Allow conn_bound_if listeners? */
   5465 			lconnp = ipcl_lookup_listener_v6(connp->conn_lport,
   5466 			    &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp),
   5467 			    ipst);
   5468 		}
   5469 		if (tcp->tcp_conn_req_max && lconnp == NULL) {
   5470 			tcp->tcp_state = TCPS_LISTEN;
   5471 		} else if (old_state > TCPS_BOUND) {
   5472 			tcp->tcp_conn_req_max = 0;
   5473 			tcp->tcp_state = TCPS_BOUND;
   5474 		}
   5475 		if (lconnp != NULL)
   5476 			CONN_DEC_REF(lconnp);
   5477 		if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
   5478 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   5479 		} else if (old_state == TCPS_ESTABLISHED ||
   5480 		    old_state == TCPS_CLOSE_WAIT) {
   5481 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   5482 		}
   5483 
   5484 		if (tcp->tcp_fused)
   5485 			tcp_unfuse(tcp);
   5486 
   5487 		mutex_enter(&tcp->tcp_eager_lock);
   5488 		if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
   5489 		    (tcp->tcp_conn_req_cnt_q != 0)) {
   5490 			tcp_eager_cleanup(tcp, 0);
   5491 		}
   5492 		mutex_exit(&tcp->tcp_eager_lock);
   5493 
   5494 		tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
   5495 		    tcp->tcp_rnxt, TH_RST | TH_ACK);
   5496 
   5497 		tcp_reinit(tcp);
   5498 
   5499 		return (0);
   5500 	} else if (!tcp_eager_blowoff(tcp, seqnum)) {
   5501 		return (TBADSEQ);
   5502 	}
   5503 	return (0);
   5504 }
   5505 
   5506 /*
   5507  * Our client hereby directs us to reject the connection request
   5508  * that tcp_input_listener() marked with 'seqnum'.  Rejection consists
   5509  * of sending the appropriate RST, not an ICMP error.
   5510  */
   5511 static void
   5512 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
   5513 {
   5514 	t_scalar_t seqnum;
   5515 	int	error;
   5516 	conn_t	*connp = tcp->tcp_connp;
   5517 
   5518 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5519 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
   5520 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5521 		return;
   5522 	}
   5523 	seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
   5524 	error = tcp_disconnect_common(tcp, seqnum);
   5525 	if (error != 0)
   5526 		tcp_err_ack(tcp, mp, error, 0);
   5527 	else {
   5528 		if (tcp->tcp_state >= TCPS_ESTABLISHED) {
   5529 			/* Send M_FLUSH according to TPI */
   5530 			(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
   5531 		}
   5532 		mp = mi_tpi_ok_ack_alloc(mp);
   5533 		if (mp != NULL)
   5534 			putnext(connp->conn_rq, mp);
   5535 	}
   5536 }
   5537 
   5538 /*
   5539  * Diagnostic routine used to return a string associated with the tcp state.
   5540  * Note that if the caller does not supply a buffer, it will use an internal
   5541  * static string.  This means that if multiple threads call this function at
   5542  * the same time, output can be corrupted...  Note also that this function
   5543  * does not check the size of the supplied buffer.  The caller has to make
   5544  * sure that it is big enough.
   5545  */
   5546 static char *
   5547 tcp_display(tcp_t *tcp, char *sup_buf, char format)
   5548 {
   5549 	char		buf1[30];
   5550 	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
   5551 	char		*buf;
   5552 	char		*cp;
   5553 	in6_addr_t	local, remote;
   5554 	char		local_addrbuf[INET6_ADDRSTRLEN];
   5555 	char		remote_addrbuf[INET6_ADDRSTRLEN];
   5556 	conn_t		*connp;
   5557 
   5558 	if (sup_buf != NULL)
   5559 		buf = sup_buf;
   5560 	else
   5561 		buf = priv_buf;
   5562 
   5563 	if (tcp == NULL)
   5564 		return ("NULL_TCP");
   5565 
   5566 	connp = tcp->tcp_connp;
   5567 	switch (tcp->tcp_state) {
   5568 	case TCPS_CLOSED:
   5569 		cp = "TCP_CLOSED";
   5570 		break;
   5571 	case TCPS_IDLE:
   5572 		cp = "TCP_IDLE";
   5573 		break;
   5574 	case TCPS_BOUND:
   5575 		cp = "TCP_BOUND";
   5576 		break;
   5577 	case TCPS_LISTEN:
   5578 		cp = "TCP_LISTEN";
   5579 		break;
   5580 	case TCPS_SYN_SENT:
   5581 		cp = "TCP_SYN_SENT";
   5582 		break;
   5583 	case TCPS_SYN_RCVD:
   5584 		cp = "TCP_SYN_RCVD";
   5585 		break;
   5586 	case TCPS_ESTABLISHED:
   5587 		cp = "TCP_ESTABLISHED";
   5588 		break;
   5589 	case TCPS_CLOSE_WAIT:
   5590 		cp = "TCP_CLOSE_WAIT";
   5591 		break;
   5592 	case TCPS_FIN_WAIT_1:
   5593 		cp = "TCP_FIN_WAIT_1";
   5594 		break;
   5595 	case TCPS_CLOSING:
   5596 		cp = "TCP_CLOSING";
   5597 		break;
   5598 	case TCPS_LAST_ACK:
   5599 		cp = "TCP_LAST_ACK";
   5600 		break;
   5601 	case TCPS_FIN_WAIT_2:
   5602 		cp = "TCP_FIN_WAIT_2";
   5603 		break;
   5604 	case TCPS_TIME_WAIT:
   5605 		cp = "TCP_TIME_WAIT";
   5606 		break;
   5607 	default:
   5608 		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
   5609 		cp = buf1;
   5610 		break;
   5611 	}
   5612 	switch (format) {
   5613 	case DISP_ADDR_AND_PORT:
   5614 		if (connp->conn_ipversion == IPV4_VERSION) {
   5615 			/*
   5616 			 * Note that we use the remote address in the tcp_b
   5617 			 * structure.  This means that it will print out
   5618 			 * the real destination address, not the next hop's
   5619 			 * address if source routing is used.
   5620 			 */
   5621 			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
   5622 			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
   5623 
   5624 		} else {
   5625 			local = connp->conn_laddr_v6;
   5626 			remote = connp->conn_faddr_v6;
   5627 		}
   5628 		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
   5629 		    sizeof (local_addrbuf));
   5630 		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
   5631 		    sizeof (remote_addrbuf));
   5632 		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
   5633 		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
   5634 		    ntohs(connp->conn_fport), cp);
   5635 		break;
   5636 	case DISP_PORT_ONLY:
   5637 	default:
   5638 		(void) mi_sprintf(buf, "[%u, %u] %s",
   5639 		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
   5640 		break;
   5641 	}
   5642 
   5643 	return (buf);
   5644 }
   5645 
/*
 * Called via squeue to get on to eager's perimeter. It sends a
 * TH_RST if eager is in the fanout table. The listener wants the
 * eager to disappear either by means of tcp_eager_blowoff() or
 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
 * called (via squeue) if the eager cannot be inserted in the
 * fanout table in tcp_input_listener().
 *
 * 'arg' is the eager's conn_t.  'mp', 'arg2' and 'dummy' are not used
 * by this function.
 */
/* ARGSUSED */
void
tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*econnp = (conn_t *)arg;
	tcp_t	*eager = econnp->conn_tcp;
	tcp_t	*listener = eager->tcp_listener;

	/*
	 * We could be called because listener is closing. Since
	 * the eager was using the listener's queues, we avoid
	 * using the listener's queues from now on.
	 */
	ASSERT(eager->tcp_detached);
	econnp->conn_rq = NULL;
	econnp->conn_wq = NULL;

	/*
	 * An eager's conn_fanout will be NULL if it's a duplicate
	 * for an existing 4-tuple in the conn fanout table.
	 * We don't want to send an RST out in such a case.
	 */
	if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
		tcp_xmit_ctl("tcp_eager_kill, can't wait",
		    eager, eager->tcp_snxt, 0, TH_RST);
	}

	/* We are here because listener wants this eager gone */
	if (listener != NULL) {
		mutex_enter(&listener->tcp_eager_lock);
		tcp_eager_unlink(eager);
		if (eager->tcp_tconnind_started) {
			/*
			 * The eager has sent a conn_ind up to the
			 * listener but listener decides to close
			 * instead. We need to drop the extra ref
			 * placed on eager in tcp_input_data() before
			 * sending the conn_ind to listener.
			 */
			CONN_DEC_REF(econnp);
		}
		mutex_exit(&listener->tcp_eager_lock);
		/*
		 * NOTE(review): presumably this releases the reference the
		 * eager was holding on its listener -- confirm against the
		 * code that links an eager to a listener.
		 */
		CONN_DEC_REF(listener->tcp_connp);
	}

	/* Nothing further to tear down if the eager is already CLOSED. */
	if (eager->tcp_state != TCPS_CLOSED)
		tcp_close_detached(eager);
}
   5702 
   5703 /*
   5704  * Reset any eager connection hanging off this listener marked
   5705  * with 'seqnum' and then reclaim it's resources.
   5706  */
   5707 static boolean_t
   5708 tcp_eager_blowoff(tcp_t	*listener, t_scalar_t seqnum)
   5709 {
   5710 	tcp_t	*eager;
   5711 	mblk_t 	*mp;
   5712 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5713 
   5714 	TCP_STAT(tcps, tcp_eager_blowoff_calls);
   5715 	eager = listener;
   5716 	mutex_enter(&listener->tcp_eager_lock);
   5717 	do {
   5718 		eager = eager->tcp_eager_next_q;
   5719 		if (eager == NULL) {
   5720 			mutex_exit(&listener->tcp_eager_lock);
   5721 			return (B_FALSE);
   5722 		}
   5723 	} while (eager->tcp_conn_req_seqnum != seqnum);
   5724 
   5725 	if (eager->tcp_closemp_used) {
   5726 		mutex_exit(&listener->tcp_eager_lock);
   5727 		return (B_TRUE);
   5728 	}
   5729 	eager->tcp_closemp_used = B_TRUE;
   5730 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5731 	CONN_INC_REF(eager->tcp_connp);
   5732 	mutex_exit(&listener->tcp_eager_lock);
   5733 	mp = &eager->tcp_closemp;
   5734 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
   5735 	    eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
   5736 	return (B_TRUE);
   5737 }
   5738 
   5739 /*
   5740  * Reset any eager connection hanging off this listener
   5741  * and then reclaim it's resources.
   5742  */
   5743 static void
   5744 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
   5745 {
   5746 	tcp_t	*eager;
   5747 	mblk_t	*mp;
   5748 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5749 
   5750 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   5751 
   5752 	if (!q0_only) {
   5753 		/* First cleanup q */
   5754 		TCP_STAT(tcps, tcp_eager_blowoff_q);
   5755 		eager = listener->tcp_eager_next_q;
   5756 		while (eager != NULL) {
   5757 			if (!eager->tcp_closemp_used) {
   5758 				eager->tcp_closemp_used = B_TRUE;
   5759 				TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5760 				CONN_INC_REF(eager->tcp_connp);
   5761 				mp = &eager->tcp_closemp;
   5762 				SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   5763 				    tcp_eager_kill, eager->tcp_connp, NULL,
   5764 				    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
   5765 			}
   5766 			eager = eager->tcp_eager_next_q;
   5767 		}
   5768 	}
   5769 	/* Then cleanup q0 */
   5770 	TCP_STAT(tcps, tcp_eager_blowoff_q0);
   5771 	eager = listener->tcp_eager_next_q0;
   5772 	while (eager != listener) {
   5773 		if (!eager->tcp_closemp_used) {
   5774 			eager->tcp_closemp_used = B_TRUE;
   5775 			TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5776 			CONN_INC_REF(eager->tcp_connp);
   5777 			mp = &eager->tcp_closemp;
   5778 			SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   5779 			    tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
   5780 			    SQTAG_TCP_EAGER_CLEANUP_Q0);
   5781 		}
   5782 		eager = eager->tcp_eager_next_q0;
   5783 	}
   5784 }
   5785 
   5786 /*
   5787  * If we are an eager connection hanging off a listener that hasn't
   5788  * formally accepted the connection yet, get off his list and blow off
   5789  * any data that we have accumulated.
   5790  */
   5791 static void
   5792 tcp_eager_unlink(tcp_t *tcp)
   5793 {
   5794 	tcp_t	*listener = tcp->tcp_listener;
   5795 
   5796 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   5797 	ASSERT(listener != NULL);
   5798 	if (tcp->tcp_eager_next_q0 != NULL) {
   5799 		ASSERT(tcp->tcp_eager_prev_q0 != NULL);
   5800 
   5801 		/* Remove the eager tcp from q0 */
   5802 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   5803 		    tcp->tcp_eager_prev_q0;
   5804 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   5805 		    tcp->tcp_eager_next_q0;
   5806 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   5807 		listener->tcp_conn_req_cnt_q0--;
   5808 
   5809 		tcp->tcp_eager_next_q0 = NULL;
   5810 		tcp->tcp_eager_prev_q0 = NULL;
   5811 
   5812 		/*
   5813 		 * Take the eager out, if it is in the list of droppable
   5814 		 * eagers.
   5815 		 */
   5816 		MAKE_UNDROPPABLE(tcp);
   5817 
   5818 		if (tcp->tcp_syn_rcvd_timeout != 0) {
   5819 			/* we have timed out before */
   5820 			ASSERT(listener->tcp_syn_rcvd_timeout > 0);
   5821 			listener->tcp_syn_rcvd_timeout--;
   5822 		}
   5823 	} else {
   5824 		tcp_t   **tcpp = &listener->tcp_eager_next_q;
   5825 		tcp_t	*prev = NULL;
   5826 
   5827 		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
   5828 			if (tcpp[0] == tcp) {
   5829 				if (listener->tcp_eager_last_q == tcp) {
   5830 					/*
   5831 					 * If we are unlinking the last
   5832 					 * element on the list, adjust
   5833 					 * tail pointer. Set tail pointer
   5834 					 * to nil when list is empty.
   5835 					 */
   5836 					ASSERT(tcp->tcp_eager_next_q == NULL);
   5837 					if (listener->tcp_eager_last_q ==
   5838 					    listener->tcp_eager_next_q) {
   5839 						listener->tcp_eager_last_q =
   5840 						    NULL;
   5841 					} else {
   5842 						/*
   5843 						 * We won't get here if there
   5844 						 * is only one eager in the
   5845 						 * list.
   5846 						 */
   5847 						ASSERT(prev != NULL);
   5848 						listener->tcp_eager_last_q =
   5849 						    prev;
   5850 					}
   5851 				}
   5852 				tcpp[0] = tcp->tcp_eager_next_q;
   5853 				tcp->tcp_eager_next_q = NULL;
   5854 				tcp->tcp_eager_last_q = NULL;
   5855 				ASSERT(listener->tcp_conn_req_cnt_q > 0);
   5856 				listener->tcp_conn_req_cnt_q--;
   5857 				break;
   5858 			}
   5859 			prev = tcpp[0];
   5860 		}
   5861 	}
   5862 	tcp->tcp_listener = NULL;
   5863 }
   5864 
   5865 /* Shorthand to generate and send TPI error acks to our client */
   5866 static void
   5867 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
   5868 {
   5869 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
   5870 		putnext(tcp->tcp_connp->conn_rq, mp);
   5871 }
   5872 
   5873 /* Shorthand to generate and send TPI error acks to our client */
   5874 static void
   5875 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
   5876     int t_error, int sys_error)
   5877 {
   5878 	struct T_error_ack	*teackp;
   5879 
   5880 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
   5881 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
   5882 		teackp = (struct T_error_ack *)mp->b_rptr;
   5883 		teackp->ERROR_prim = primitive;
   5884 		teackp->TLI_error = t_error;
   5885 		teackp->UNIX_error = sys_error;
   5886 		putnext(tcp->tcp_connp->conn_rq, mp);
   5887 	}
   5888 }
   5889 
   5890 /*
   5891  * Note: No locks are held when inspecting tcp_g_*epriv_ports
   5892  * but instead the code relies on:
   5893  * - the fact that the address of the array and its size never changes
   5894  * - the atomic assignment of the elements of the array
   5895  */
   5896 /* ARGSUSED */
   5897 static int
   5898 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
   5899 {
   5900 	int i;
   5901 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5902 
   5903 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5904 		if (tcps->tcps_g_epriv_ports[i] != 0)
   5905 			(void) mi_mpprintf(mp, "%d ",
   5906 			    tcps->tcps_g_epriv_ports[i]);
   5907 	}
   5908 	return (0);
   5909 }
   5910 
   5911 /*
   5912  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   5913  * threads from changing it at the same time.
   5914  */
   5915 /* ARGSUSED */
   5916 static int
   5917 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   5918     cred_t *cr)
   5919 {
   5920 	long	new_value;
   5921 	int	i;
   5922 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5923 
   5924 	/*
   5925 	 * Fail the request if the new value does not lie within the
   5926 	 * port number limits.
   5927 	 */
   5928 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   5929 	    new_value <= 0 || new_value >= 65536) {
   5930 		return (EINVAL);
   5931 	}
   5932 
   5933 	mutex_enter(&tcps->tcps_epriv_port_lock);
   5934 	/* Check if the value is already in the list */
   5935 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5936 		if (new_value == tcps->tcps_g_epriv_ports[i]) {
   5937 			mutex_exit(&tcps->tcps_epriv_port_lock);
   5938 			return (EEXIST);
   5939 		}
   5940 	}
   5941 	/* Find an empty slot */
   5942 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5943 		if (tcps->tcps_g_epriv_ports[i] == 0)
   5944 			break;
   5945 	}
   5946 	if (i == tcps->tcps_g_num_epriv_ports) {
   5947 		mutex_exit(&tcps->tcps_epriv_port_lock);
   5948 		return (EOVERFLOW);
   5949 	}
   5950 	/* Set the new value */
   5951 	tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
   5952 	mutex_exit(&tcps->tcps_epriv_port_lock);
   5953 	return (0);
   5954 }
   5955 
   5956 /*
   5957  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   5958  * threads from changing it at the same time.
   5959  */
   5960 /* ARGSUSED */
   5961 static int
   5962 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   5963     cred_t *cr)
   5964 {
   5965 	long	new_value;
   5966 	int	i;
   5967 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5968 
   5969 	/*
   5970 	 * Fail the request if the new value does not lie within the
   5971 	 * port number limits.
   5972 	 */
   5973 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
   5974 	    new_value >= 65536) {
   5975 		return (EINVAL);
   5976 	}
   5977 
   5978 	mutex_enter(&tcps->tcps_epriv_port_lock);
   5979 	/* Check that the value is already in the list */
   5980 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5981 		if (tcps->tcps_g_epriv_ports[i] == new_value)
   5982 			break;
   5983 	}
   5984 	if (i == tcps->tcps_g_num_epriv_ports) {
   5985 		mutex_exit(&tcps->tcps_epriv_port_lock);
   5986 		return (ESRCH);
   5987 	}
   5988 	/* Clear the value */
   5989 	tcps->tcps_g_epriv_ports[i] = 0;
   5990 	mutex_exit(&tcps->tcps_epriv_port_lock);
   5991 	return (0);
   5992 }
   5993 
   5994 /* Return the TPI/TLI equivalent of our current tcp_state */
   5995 static int
   5996 tcp_tpistate(tcp_t *tcp)
   5997 {
   5998 	switch (tcp->tcp_state) {
   5999 	case TCPS_IDLE:
   6000 		return (TS_UNBND);
   6001 	case TCPS_LISTEN:
   6002 		/*
   6003 		 * Return whether there are outstanding T_CONN_IND waiting
   6004 		 * for the matching T_CONN_RES. Therefore don't count q0.
   6005 		 */
   6006 		if (tcp->tcp_conn_req_cnt_q > 0)
   6007 			return (TS_WRES_CIND);
   6008 		else
   6009 			return (TS_IDLE);
   6010 	case TCPS_BOUND:
   6011 		return (TS_IDLE);
   6012 	case TCPS_SYN_SENT:
   6013 		return (TS_WCON_CREQ);
   6014 	case TCPS_SYN_RCVD:
   6015 		/*
   6016 		 * Note: assumption: this has to the active open SYN_RCVD.
   6017 		 * The passive instance is detached in SYN_RCVD stage of
   6018 		 * incoming connection processing so we cannot get request
   6019 		 * for T_info_ack on it.
   6020 		 */
   6021 		return (TS_WACK_CRES);
   6022 	case TCPS_ESTABLISHED:
   6023 		return (TS_DATA_XFER);
   6024 	case TCPS_CLOSE_WAIT:
   6025 		return (TS_WREQ_ORDREL);
   6026 	case TCPS_FIN_WAIT_1:
   6027 		return (TS_WIND_ORDREL);
   6028 	case TCPS_FIN_WAIT_2:
   6029 		return (TS_WIND_ORDREL);
   6030 
   6031 	case TCPS_CLOSING:
   6032 	case TCPS_LAST_ACK:
   6033 	case TCPS_TIME_WAIT:
   6034 	case TCPS_CLOSED:
   6035 		/*
   6036 		 * Following TS_WACK_DREQ7 is a rendition of "not
   6037 		 * yet TS_IDLE" TPI state. There is no best match to any
   6038 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
   6039 		 * choose a value chosen that will map to TLI/XTI level
   6040 		 * state of TSTATECHNG (state is process of changing) which
   6041 		 * captures what this dummy state represents.
   6042 		 */
   6043 		return (TS_WACK_DREQ7);
   6044 	default:
   6045 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
   6046 		    tcp->tcp_state, tcp_display(tcp, NULL,
   6047 		    DISP_PORT_ONLY));
   6048 		return (TS_UNBND);
   6049 	}
   6050 }
   6051 
   6052 static void
   6053 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
   6054 {
   6055 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6056 	conn_t		*connp = tcp->tcp_connp;
   6057 
   6058 	if (connp->conn_family == AF_INET6)
   6059 		*tia = tcp_g_t_info_ack_v6;
   6060 	else
   6061 		*tia = tcp_g_t_info_ack;
   6062 	tia->CURRENT_state = tcp_tpistate(tcp);
   6063 	tia->OPT_size = tcp_max_optsize;
   6064 	if (tcp->tcp_mss == 0) {
   6065 		/* Not yet set - tcp_open does not set mss */
   6066 		if (connp->conn_ipversion == IPV4_VERSION)
   6067 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
   6068 		else
   6069 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
   6070 	} else {
   6071 		tia->TIDU_size = tcp->tcp_mss;
   6072 	}
   6073 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
   6074 }
   6075 
   6076 static void
   6077 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
   6078     t_uscalar_t cap_bits1)
   6079 {
   6080 	tcap->CAP_bits1 = 0;
   6081 
   6082 	if (cap_bits1 & TC1_INFO) {
   6083 		tcp_copy_info(&tcap->INFO_ack, tcp);
   6084 		tcap->CAP_bits1 |= TC1_INFO;
   6085 	}
   6086 
   6087 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
   6088 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
   6089 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
   6090 	}
   6091 
   6092 }
   6093 
   6094 /*
   6095  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
   6096  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
   6097  * tcp_g_t_info_ack.  The current state of the stream is copied from
   6098  * tcp_state.
   6099  */
   6100 static void
   6101 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
   6102 {
   6103 	t_uscalar_t		cap_bits1;
   6104 	struct T_capability_ack	*tcap;
   6105 
   6106 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
   6107 		freemsg(mp);
   6108 		return;
   6109 	}
   6110 
   6111 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
   6112 
   6113 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
   6114 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
   6115 	if (mp == NULL)
   6116 		return;
   6117 
   6118 	tcap = (struct T_capability_ack *)mp->b_rptr;
   6119 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
   6120 
   6121 	putnext(tcp->tcp_connp->conn_rq, mp);
   6122 }
   6123 
/*
 * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
 * The current state of the stream is copied from tcp_state.
 *
 * The request block 'mp' is handed to tpi_ack_alloc() to become the
 * T_INFO_ACK; on success the ack is filled in by tcp_copy_info() and
 * sent up the read side.
 */
static void
tcp_info_req(tcp_t *tcp, mblk_t *mp)
{
	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
	    T_INFO_ACK);
	if (!mp) {
		/*
		 * NOTE(review): mp is NULL on this path, so tcp_err_ack()
		 * is handed a NULL message and most likely cannot produce
		 * an error ack.  Whether the original request block is
		 * still valid here depends on whether tpi_ack_alloc()
		 * frees its input on failure -- confirm before changing.
		 * tcp_capability_req() simply returns in the same
		 * situation.
		 */
		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
		return;
	}
	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
	putnext(tcp->tcp_connp->conn_rq, mp);
}
   6141 
   6142 /* Respond to the TPI addr request */
   6143 static void
   6144 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
   6145 {
   6146 	struct sockaddr *sa;
   6147 	mblk_t	*ackmp;
   6148 	struct T_addr_ack *taa;
   6149 	conn_t	*connp = tcp->tcp_connp;
   6150 	uint_t	addrlen;
   6151 
   6152 	/* Make it large enough for worst case */
   6153 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
   6154 	    2 * sizeof (sin6_t), 1);
   6155 	if (ackmp == NULL) {
   6156 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   6157 		return;
   6158 	}
   6159 
   6160 	taa = (struct T_addr_ack *)ackmp->b_rptr;
   6161 
   6162 	bzero(taa, sizeof (struct T_addr_ack));
   6163 	ackmp->b_wptr = (uchar_t *)&taa[1];
   6164 
   6165 	taa->PRIM_type = T_ADDR_ACK;
   6166 	ackmp->b_datap->db_type = M_PCPROTO;
   6167 
   6168 	if (connp->conn_family == AF_INET)
   6169 		addrlen = sizeof (sin_t);
   6170 	else
   6171 		addrlen = sizeof (sin6_t);
   6172 
   6173 	/*
   6174 	 * Note: Following code assumes 32 bit alignment of basic
   6175 	 * data structures like sin_t and struct T_addr_ack.
   6176 	 */
   6177 	if (tcp->tcp_state >= TCPS_BOUND) {
   6178 		/*
   6179 		 * Fill in local address first
   6180 		 */
   6181 		taa->LOCADDR_offset = sizeof (*taa);
   6182 		taa->LOCADDR_length = addrlen;
   6183 		sa = (struct sockaddr *)&taa[1];
   6184 		(void) conn_getsockname(connp, sa, &addrlen);
   6185 		ackmp->b_wptr += addrlen;
   6186 	}
   6187 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
   6188 		/*
   6189 		 * Fill in Remote address
   6190 		 */
   6191 		taa->REMADDR_length = addrlen;
   6192 		/* assumed 32-bit alignment */
   6193 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
   6194 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
   6195 		(void) conn_getpeername(connp, sa, &addrlen);
   6196 		ackmp->b_wptr += addrlen;
   6197 	}
   6198 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
   6199 	putnext(tcp->tcp_connp->conn_rq, ackmp);
   6200 }
   6201 
/*
 * Handle reinitialization of a tcp structure.
 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
 *
 * In order, this routine: stops the timers, folds the per-instance
 * segment counters into the MIB, frees queued transmit, reassembly and
 * receive data plus the urgent/ordrel helper messages, releases kssl
 * state, resets the remaining fields through tcp_reinit_values(),
 * removes the conn from the connection hash, and finally sets the state
 * to TCPS_LISTEN (when tcp_conn_req_max is non-zero) or TCPS_BOUND
 * before reloading defaults with tcp_init_values().
 */
static void
tcp_reinit(tcp_t *tcp)
{
	mblk_t		*mp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp  = tcp->tcp_connp;

	TCP_STAT(tcps, tcp_reinit_calls);

	/* tcp_reinit should never be called for detached tcp_t's */
	ASSERT(tcp->tcp_listener == NULL);
	/* AF_INET must be IPv4; AF_INET6 may be either IP version. */
	ASSERT((connp->conn_family == AF_INET &&
	    connp->conn_ipversion == IPV4_VERSION) ||
	    (connp->conn_family == AF_INET6 &&
	    (connp->conn_ipversion == IPV4_VERSION ||
	    connp->conn_ipversion == IPV6_VERSION)));

	/* Cancel outstanding timers */
	tcp_timers_stop(tcp);

	/*
	 * Reset everything in the state vector, after updating global
	 * MIB data from instance counters.
	 */
	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
	tcp->tcp_ibsegs = 0;
	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
	tcp->tcp_obsegs = 0;

	/* Discard everything still queued for transmission. */
	tcp_close_mpp(&tcp->tcp_xmit_head);
	if (tcp->tcp_snd_zcopy_aware)
		tcp_zcopy_notify(tcp);
	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
	/* With the transmit list gone, lift flow control if it was on. */
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
		tcp_clrqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);
	/* Discard out-of-order and not-yet-delivered received data. */
	tcp_close_mpp(&tcp->tcp_reass_head);
	tcp->tcp_reass_tail = NULL;
	if (tcp->tcp_rcv_list != NULL) {
		/* Free b_next chain */
		tcp_close_mpp(&tcp->tcp_rcv_list);
		tcp->tcp_rcv_last_head = NULL;
		tcp->tcp_rcv_last_tail = NULL;
		tcp->tcp_rcv_cnt = 0;
	}
	tcp->tcp_rcv_last_tail = NULL;

	/* Free the preallocated urgent-data and orderly-release messages. */
	if ((mp = tcp->tcp_urp_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mp = NULL;
	}
	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mark_mp = NULL;
	}
	if (tcp->tcp_fused_sigurg_mp != NULL) {
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
		freeb(tcp->tcp_fused_sigurg_mp);
		tcp->tcp_fused_sigurg_mp = NULL;
	}
	if (tcp->tcp_ordrel_mp != NULL) {
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
		freeb(tcp->tcp_ordrel_mp);
		tcp->tcp_ordrel_mp = NULL;
	}

	/*
	 * Following is a union with two members which are
	 * identical types and size so the following cleanup
	 * is enough.
	 */
	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);

	CL_INET_DISCONNECT(connp);

	/*
	 * The connection can't be on the tcp_time_wait_head list
	 * since it is not detached.
	 */
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);

	if (tcp->tcp_kssl_pending) {
		tcp->tcp_kssl_pending = B_FALSE;

		/* Don't reset if it was initialized by bind. */
		if (tcp->tcp_kssl_ent != NULL) {
			kssl_release_ent(tcp->tcp_kssl_ent, NULL,
			    KSSL_NO_PROXY);
		}
	}
	if (tcp->tcp_kssl_ctx != NULL) {
		kssl_release_ctx(tcp->tcp_kssl_ctx);
		tcp->tcp_kssl_ctx = NULL;
	}

	/*
	 * Reset/preserve other values
	 */
	tcp_reinit_values(tcp);
	ipcl_hash_remove(connp);
	ixa_cleanup(connp->conn_ixa);
	tcp_ipsec_cleanup(tcp);

	/* Fall back to the address this endpoint was bound to. */
	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
	connp->conn_saddr_v6 = connp->conn_bound_addr_v6;

	if (tcp->tcp_conn_req_max != 0) {
		/*
		 * This is the case when a TLI program uses the same
		 * transport end point to accept a connection.  This
		 * makes the TCP both a listener and acceptor.  When
		 * this connection is closed, we need to set the state
		 * back to TCPS_LISTEN.  Make sure that the eager list
		 * is reinitialized.
		 *
		 * Note that this stream is still bound to the four
		 * tuples of the previous connection in IP.  If a new
		 * SYN with different foreign address comes in, IP will
		 * not find it and will send it to the global queue.  In
		 * the global queue, TCP will do a tcp_lookup_listener()
		 * to find this stream.  This works because this stream
		 * is only removed from connected hash.
		 */
		tcp->tcp_state = TCPS_LISTEN;
		/* Empty circular lists point back at the listener itself. */
		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
		tcp->tcp_eager_next_drop_q0 = tcp;
		tcp->tcp_eager_prev_drop_q0 = tcp;
		/*
		 * Initially set conn_recv to tcp_input_listener_unbound to try
		 * to pick a good squeue for the listener when the first SYN
		 * arrives. tcp_input_listener_unbound sets it to
		 * tcp_input_listener on that first SYN.
		 */
		connp->conn_recv = tcp_input_listener_unbound;

		connp->conn_proto = IPPROTO_TCP;
		connp->conn_faddr_v6 = ipv6_all_zeros;
		connp->conn_fport = 0;

		/* NOTE(review): the insert's return value is ignored here. */
		(void) ipcl_bind_insert(connp);
	} else {
		tcp->tcp_state = TCPS_BOUND;
	}

	/*
	 * Initialize to default values
	 */
	tcp_init_values(tcp);

	ASSERT(tcp->tcp_ptpbhn != NULL);
	tcp->tcp_rwnd = connp->conn_rcvbuf;
	/* Start again from the per-stack default MSS for this IP version. */
	tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ?
	    tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
}
   6367 
/*
 * Force values to zero that need to be zero.
 * Do not touch values associated with the BOUND or LISTEN state
 * since the connection will end up in that state after the reinit.
 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
 * structure!
 */
   6375 static void
   6376 tcp_reinit_values(tcp)
   6377 	tcp_t *tcp;
   6378 {
   6379 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6380 	conn_t		*connp = tcp->tcp_connp;
   6381 
   6382 #ifndef	lint
   6383 #define	DONTCARE(x)
   6384 #define	PRESERVE(x)
   6385 #else
   6386 #define	DONTCARE(x)	((x) = (x))
   6387 #define	PRESERVE(x)	((x) = (x))
   6388 #endif	/* lint */
   6389 
   6390 	PRESERVE(tcp->tcp_bind_hash_port);
   6391 	PRESERVE(tcp->tcp_bind_hash);
   6392 	PRESERVE(tcp->tcp_ptpbhn);
   6393 	PRESERVE(tcp->tcp_acceptor_hash);
   6394 	PRESERVE(tcp->tcp_ptpahn);
   6395 
   6396 	/* Should be ASSERT NULL on these with new code! */
   6397 	ASSERT(tcp->tcp_time_wait_next == NULL);
   6398 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   6399 	ASSERT(tcp->tcp_time_wait_expire == 0);
   6400 	PRESERVE(tcp->tcp_state);
   6401 	PRESERVE(connp->conn_rq);
   6402 	PRESERVE(connp->conn_wq);
   6403 
   6404 	ASSERT(tcp->tcp_xmit_head == NULL);
   6405 	ASSERT(tcp->tcp_xmit_last == NULL);
   6406 	ASSERT(tcp->tcp_unsent == 0);
   6407 	ASSERT(tcp->tcp_xmit_tail == NULL);
   6408 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
   6409 
   6410 	tcp->tcp_snxt = 0;			/* Displayed in mib */
   6411 	tcp->tcp_suna = 0;			/* Displayed in mib */
   6412 	tcp->tcp_swnd = 0;
   6413 	DONTCARE(tcp->tcp_cwnd);	/* Init in tcp_process_options */
   6414 
   6415 	ASSERT(tcp->tcp_ibsegs == 0);
   6416 	ASSERT(tcp->tcp_obsegs == 0);
   6417 
   6418 	if (connp->conn_ht_iphc != NULL) {
   6419 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   6420 		connp->conn_ht_iphc = NULL;
   6421 		connp->conn_ht_iphc_allocated = 0;
   6422 		connp->conn_ht_iphc_len = 0;
   6423 		connp->conn_ht_ulp = NULL;
   6424 		connp->conn_ht_ulp_len = 0;
   6425 		tcp->tcp_ipha = NULL;
   6426 		tcp->tcp_ip6h = NULL;
   6427 		tcp->tcp_tcpha = NULL;
   6428 	}
   6429 
   6430 	/* We clear any IP_OPTIONS and extension headers */
   6431 	ip_pkt_free(&connp->conn_xmit_ipp);
   6432 
   6433 	DONTCARE(tcp->tcp_naglim);		/* Init in tcp_init_values */
   6434 	DONTCARE(tcp->tcp_ipha);
   6435 	DONTCARE(tcp->tcp_ip6h);
   6436 	DONTCARE(tcp->tcp_tcpha);
   6437 	tcp->tcp_valid_bits = 0;
   6438 
   6439 	DONTCARE(tcp->tcp_timer_backoff);	/* Init in tcp_init_values */
   6440 	DONTCARE(tcp->tcp_last_recv_time);	/* Init in tcp_init_values */
   6441 	tcp->tcp_last_rcv_lbolt = 0;
   6442 
   6443 	tcp->tcp_init_cwnd = 0;
   6444 
   6445 	tcp->tcp_urp_last_valid = 0;
   6446 	tcp->tcp_hard_binding = 0;
   6447 
   6448 	tcp->tcp_fin_acked = 0;
   6449 	tcp->tcp_fin_rcvd = 0;
   6450 	tcp->tcp_fin_sent = 0;
   6451 	tcp->tcp_ordrel_done = 0;
   6452 
   6453 	tcp->tcp_detached = 0;
   6454 
   6455 	tcp->tcp_snd_ws_ok = B_FALSE;
   6456 	tcp->tcp_snd_ts_ok = B_FALSE;
   6457 	tcp->tcp_zero_win_probe = 0;
   6458 
   6459 	tcp->tcp_loopback = 0;
   6460 	tcp->tcp_localnet = 0;
   6461 	tcp->tcp_syn_defense = 0;
   6462 	tcp->tcp_set_timer = 0;
   6463 
   6464 	tcp->tcp_active_open = 0;
   6465 	tcp->tcp_rexmit = B_FALSE;
   6466 	tcp->tcp_xmit_zc_clean = B_FALSE;
   6467 
   6468 	tcp->tcp_snd_sack_ok = B_FALSE;
   6469 	tcp->tcp_hwcksum = B_FALSE;
   6470 
   6471 	DONTCARE(tcp->tcp_maxpsz_multiplier);	/* Init in tcp_init_values */
   6472 
   6473 	tcp->tcp_conn_def_q0 = 0;
   6474 	tcp->tcp_ip_forward_progress = B_FALSE;
   6475 	tcp->tcp_ecn_ok = B_FALSE;
   6476 
   6477 	tcp->tcp_cwr = B_FALSE;
   6478 	tcp->tcp_ecn_echo_on = B_FALSE;
   6479 	tcp->tcp_is_wnd_shrnk = B_FALSE;
   6480 
   6481 	if (tcp->tcp_sack_info != NULL) {
   6482 		if (tcp->tcp_notsack_list != NULL) {
   6483 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   6484 			    tcp);
   6485 		}
   6486 		kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
   6487 		tcp->tcp_sack_info = NULL;
   6488 	}
   6489 
   6490 	tcp->tcp_rcv_ws = 0;
   6491 	tcp->tcp_snd_ws = 0;
   6492 	tcp->tcp_ts_recent = 0;
   6493 	tcp->tcp_rnxt = 0;			/* Displayed in mib */
   6494 	DONTCARE(tcp->tcp_rwnd);		/* Set in tcp_reinit() */
   6495 	tcp->tcp_initial_pmtu = 0;
   6496 
   6497 	ASSERT(tcp->tcp_reass_head == NULL);
   6498 	ASSERT(tcp->tcp_reass_tail == NULL);
   6499 
   6500 	tcp->tcp_cwnd_cnt = 0;
   6501 
   6502 	ASSERT(tcp->tcp_rcv_list == NULL);
   6503 	ASSERT(tcp->tcp_rcv_last_head == NULL);
   6504 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
   6505 	ASSERT(tcp->tcp_rcv_cnt == 0);
   6506 
   6507 	DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
   6508 	DONTCARE(tcp->tcp_cwnd_max);		/* Init in tcp_init_values */
   6509 	tcp->tcp_csuna = 0;
   6510 
   6511 	tcp->tcp_rto = 0;			/* Displayed in MIB */
   6512 	DONTCARE(tcp->tcp_rtt_sa);		/* Init in tcp_init_values */
   6513 	DONTCARE(tcp->tcp_rtt_sd);		/* Init in tcp_init_values */
   6514 	tcp->tcp_rtt_update = 0;
   6515 
   6516 	DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6517 	DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6518 
   6519 	tcp->tcp_rack = 0;			/* Displayed in mib */
   6520 	tcp->tcp_rack_cnt = 0;
   6521 	tcp->tcp_rack_cur_max = 0;
   6522 	tcp->tcp_rack_abs_max = 0;
   6523 
   6524 	tcp->tcp_max_swnd = 0;
   6525 
   6526 	ASSERT(tcp->tcp_listener == NULL);
   6527 
   6528 	DONTCARE(tcp->tcp_irs);			/* tcp_valid_bits cleared */
   6529 	DONTCARE(tcp->tcp_iss);			/* tcp_valid_bits cleared */
   6530 	DONTCARE(tcp->tcp_fss);			/* tcp_valid_bits cleared */
   6531 	DONTCARE(tcp->tcp_urg);			/* tcp_valid_bits cleared */
   6532 
   6533 	ASSERT(tcp->tcp_conn_req_cnt_q == 0);
   6534 	ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
   6535 	PRESERVE(tcp->tcp_conn_req_max);
   6536 	PRESERVE(tcp->tcp_conn_req_seqnum);
   6537 
   6538 	DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
   6539 	DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
   6540 	DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
   6541 	DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
   6542 
   6543 	DONTCARE(tcp->tcp_urp_last);	/* tcp_urp_last_valid is cleared */
   6544 	ASSERT(tcp->tcp_urp_mp == NULL);
   6545 	ASSERT(tcp->tcp_urp_mark_mp == NULL);
   6546 	ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
   6547 
   6548 	ASSERT(tcp->tcp_eager_next_q == NULL);
   6549 	ASSERT(tcp->tcp_eager_last_q == NULL);
   6550 	ASSERT((tcp->tcp_eager_next_q0 == NULL &&
   6551 	    tcp->tcp_eager_prev_q0 == NULL) ||
   6552 	    tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
   6553 	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
   6554 
   6555 	ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
   6556 	    tcp->tcp_eager_prev_drop_q0 == NULL) ||
   6557 	    tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
   6558 
   6559 	tcp->tcp_client_errno = 0;
   6560 
   6561 	DONTCARE(connp->conn_sum);		/* Init in tcp_init_values */
   6562 
   6563 	connp->conn_faddr_v6 = ipv6_all_zeros;	/* Displayed in MIB */
   6564 
   6565 	PRESERVE(connp->conn_bound_addr_v6);
   6566 	tcp->tcp_last_sent_len = 0;
   6567 	tcp->tcp_dupack_cnt = 0;
   6568 
   6569 	connp->conn_fport = 0;			/* Displayed in MIB */
   6570 	PRESERVE(connp->conn_lport);
   6571 
   6572 	PRESERVE(tcp->tcp_acceptor_lockp);
   6573 
   6574 	ASSERT(tcp->tcp_ordrel_mp == NULL);
   6575 	PRESERVE(tcp->tcp_acceptor_id);
   6576 	DONTCARE(tcp->tcp_ipsec_overhead);
   6577 
   6578 	PRESERVE(connp->conn_family);
   6579 	/* Remove any remnants of mapped address binding */
   6580 	if (connp->conn_family == AF_INET6) {
   6581 		connp->conn_ipversion = IPV6_VERSION;
   6582 		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   6583 	} else {
   6584 		connp->conn_ipversion = IPV4_VERSION;
   6585 		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   6586 	}
   6587 
   6588 	connp->conn_bound_if = 0;
   6589 	connp->conn_recv_ancillary.crb_all = 0;
   6590 	tcp->tcp_recvifindex = 0;
   6591 	tcp->tcp_recvhops = 0;
   6592 	tcp->tcp_closed = 0;
   6593 	tcp->tcp_cleandeathtag = 0;
   6594 	if (tcp->tcp_hopopts != NULL) {
   6595 		mi_free(tcp->tcp_hopopts);
   6596 		tcp->tcp_hopopts = NULL;
   6597 		tcp->tcp_hopoptslen = 0;
   6598 	}
   6599 	ASSERT(tcp->tcp_hopoptslen == 0);
   6600 	if (tcp->tcp_dstopts != NULL) {
   6601 		mi_free(tcp->tcp_dstopts);
   6602 		tcp->tcp_dstopts = NULL;
   6603 		tcp->tcp_dstoptslen = 0;
   6604 	}
   6605 	ASSERT(tcp->tcp_dstoptslen == 0);
   6606 	if (tcp->tcp_rthdrdstopts != NULL) {
   6607 		mi_free(tcp->tcp_rthdrdstopts);
   6608 		tcp->tcp_rthdrdstopts = NULL;
   6609 		tcp->tcp_rthdrdstoptslen = 0;
   6610 	}
   6611 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   6612 	if (tcp->tcp_rthdr != NULL) {
   6613 		mi_free(tcp->tcp_rthdr);
   6614 		tcp->tcp_rthdr = NULL;
   6615 		tcp->tcp_rthdrlen = 0;
   6616 	}
   6617 	ASSERT(tcp->tcp_rthdrlen == 0);
   6618 
   6619 	/* Reset fusion-related fields */
   6620 	tcp->tcp_fused = B_FALSE;
   6621 	tcp->tcp_unfusable = B_FALSE;
   6622 	tcp->tcp_fused_sigurg = B_FALSE;
   6623 	tcp->tcp_loopback_peer = NULL;
   6624 
   6625 	tcp->tcp_lso = B_FALSE;
   6626 
   6627 	tcp->tcp_in_ack_unsent = 0;
   6628 	tcp->tcp_cork = B_FALSE;
   6629 	tcp->tcp_tconnind_started = B_FALSE;
   6630 
   6631 	PRESERVE(tcp->tcp_squeue_bytes);
   6632 
   6633 	ASSERT(tcp->tcp_kssl_ctx == NULL);
   6634 	ASSERT(!tcp->tcp_kssl_pending);
   6635 	PRESERVE(tcp->tcp_kssl_ent);
   6636 
   6637 	tcp->tcp_closemp_used = B_FALSE;
   6638 
   6639 	PRESERVE(tcp->tcp_rsrv_mp);
   6640 	PRESERVE(tcp->tcp_rsrv_mp_lock);
   6641 
   6642 #ifdef DEBUG
   6643 	DONTCARE(tcp->tcmp_stk[0]);
   6644 #endif
   6645 
   6646 	PRESERVE(tcp->tcp_connid);
   6647 
   6648 
   6649 #undef	DONTCARE
   6650 #undef	PRESERVE
   6651 }
   6652 
   6653 static void
   6654 tcp_init_values(tcp_t *tcp)
   6655 {
   6656 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6657 	conn_t		*connp = tcp->tcp_connp;
   6658 
   6659 	ASSERT((connp->conn_family == AF_INET &&
   6660 	    connp->conn_ipversion == IPV4_VERSION) ||
   6661 	    (connp->conn_family == AF_INET6 &&
   6662 	    (connp->conn_ipversion == IPV4_VERSION ||
   6663 	    connp->conn_ipversion == IPV6_VERSION)));
   6664 
   6665 	/*
   6666 	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
   6667 	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
   6668 	 * allow the algorithm to adjust slowly to large fluctuations of RTT
   6669 	 * during first few transmissions of a connection as seen in slow
   6670 	 * links.
   6671 	 */
   6672 	tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
   6673 	tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
   6674 	tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   6675 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
   6676 	    tcps->tcps_conn_grace_period;
   6677 	if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
   6678 		tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   6679 	tcp->tcp_timer_backoff = 0;
   6680 	tcp->tcp_ms_we_have_waited = 0;
   6681 	tcp->tcp_last_recv_time = ddi_get_lbolt();
   6682 	tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
   6683 	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   6684 	tcp->tcp_snd_burst = TCP_CWND_INFINITE;
   6685 
   6686 	tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
   6687 
   6688 	tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
   6689 	tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
   6690 	tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
   6691 	/*
   6692 	 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
   6693 	 * passive open.
   6694 	 */
   6695 	tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
   6696 
   6697 	tcp->tcp_naglim = tcps->tcps_naglim_def;
   6698 
   6699 	/* NOTE:  ISS is now set in tcp_set_destination(). */
   6700 
   6701 	/* Reset fusion-related fields */
   6702 	tcp->tcp_fused = B_FALSE;
   6703 	tcp->tcp_unfusable = B_FALSE;
   6704 	tcp->tcp_fused_sigurg = B_FALSE;
   6705 	tcp->tcp_loopback_peer = NULL;
   6706 
   6707 	/* We rebuild the header template on the next connect/conn_request */
   6708 
   6709 	connp->conn_mlp_type = mlptSingle;
   6710 
   6711 	/*
   6712 	 * Init the window scale to the max so tcp_rwnd_set() won't pare
   6713 	 * down tcp_rwnd. tcp_set_destination() will set the right value later.
   6714 	 */
   6715 	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
   6716 	tcp->tcp_rwnd = connp->conn_rcvbuf;
   6717 
   6718 	tcp->tcp_cork = B_FALSE;
   6719 	/*
   6720 	 * Init the tcp_debug option if it wasn't already set.  This value
   6721 	 * determines whether TCP
   6722 	 * calls strlog() to print out debug messages.  Doing this
   6723 	 * initialization here means that this value is not inherited thru
   6724 	 * tcp_reinit().
   6725 	 */
   6726 	if (!connp->conn_debug)
   6727 		connp->conn_debug = tcps->tcps_dbg;
   6728 
   6729 	tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
   6730 	tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
   6731 }
   6732 
   6733 /* At minimum we need 8 bytes in the TCP header for the lookup */
   6734 #define	ICMP_MIN_TCP_HDR	8
   6735 
   6736 /*
   6737  * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
   6738  * passed up by IP. The message is always received on the correct tcp_t.
   6739  * Assumes that IP has pulled up everything up to and including the ICMP header.
   6740  */
   6741 /* ARGSUSED2 */
   6742 static void
   6743 tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   6744 {
   6745 	conn_t		*connp = (conn_t *)arg1;
   6746 	icmph_t		*icmph;
   6747 	ipha_t		*ipha;
   6748 	int		iph_hdr_length;
   6749 	tcpha_t		*tcpha;
   6750 	uint32_t	seg_seq;
   6751 	tcp_t		*tcp = connp->conn_tcp;
   6752 
   6753 	/* Assume IP provides aligned packets */
   6754 	ASSERT(OK_32PTR(mp->b_rptr));
   6755 	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));
   6756 
   6757 	/*
   6758 	 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
   6759 	 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
   6760 	 */
   6761 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
   6762 		tcp_icmp_error_ipv6(tcp, mp, ira);
   6763 		return;
   6764 	}
   6765 
   6766 	/* Skip past the outer IP and ICMP headers */
   6767 	iph_hdr_length = ira->ira_ip_hdr_length;
   6768 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   6769 	/*
   6770 	 * If we don't have the correct outer IP header length
   6771 	 * or if we don't have a complete inner IP header
   6772 	 * drop it.
   6773 	 */
   6774 	if (iph_hdr_length < sizeof (ipha_t) ||
   6775 	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
   6776 noticmpv4:
   6777 		freemsg(mp);
   6778 		return;
   6779 	}
   6780 	ipha = (ipha_t *)&icmph[1];
   6781 
   6782 	/* Skip past the inner IP and find the ULP header */
   6783 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   6784 	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
   6785 	/*
   6786 	 * If we don't have the correct inner IP header length or if the ULP
   6787 	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
   6788 	 * bytes of TCP header, drop it.
   6789 	 */
   6790 	if (iph_hdr_length < sizeof (ipha_t) ||
   6791 	    ipha->ipha_protocol != IPPROTO_TCP ||
   6792 	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
   6793 		goto noticmpv4;
   6794 	}
   6795 
   6796 	seg_seq = ntohl(tcpha->tha_seq);
   6797 	switch (icmph->icmph_type) {
   6798 	case ICMP_DEST_UNREACHABLE:
   6799 		switch (icmph->icmph_code) {
   6800 		case ICMP_FRAGMENTATION_NEEDED:
   6801 			/*
   6802 			 * Update Path MTU, then try to send something out.
   6803 			 */
   6804 			tcp_update_pmtu(tcp, B_TRUE);
   6805 			tcp_rexmit_after_error(tcp);
   6806 			break;
   6807 		case ICMP_PORT_UNREACHABLE:
   6808 		case ICMP_PROTOCOL_UNREACHABLE:
   6809 			switch (tcp->tcp_state) {
   6810 			case TCPS_SYN_SENT:
   6811 			case TCPS_SYN_RCVD:
   6812 				/*
   6813 				 * ICMP can snipe away incipient
   6814 				 * TCP connections as long as
   6815 				 * seq number is same as initial
   6816 				 * send seq number.
   6817 				 */
   6818 				if (seg_seq == tcp->tcp_iss) {
   6819 					(void) tcp_clean_death(tcp,
   6820 					    ECONNREFUSED, 6);
   6821 				}
   6822 				break;
   6823 			}
   6824 			break;
   6825 		case ICMP_HOST_UNREACHABLE:
   6826 		case ICMP_NET_UNREACHABLE:
   6827 			/* Record the error in case we finally time out. */
   6828 			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
   6829 				tcp->tcp_client_errno = EHOSTUNREACH;
   6830 			else
   6831 				tcp->tcp_client_errno = ENETUNREACH;
   6832 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
   6833 				if (tcp->tcp_listener != NULL &&
   6834 				    tcp->tcp_listener->tcp_syn_defense) {
   6835 					/*
   6836 					 * Ditch the half-open connection if we
   6837 					 * suspect a SYN attack is under way.
   6838 					 */
   6839 					(void) tcp_clean_death(tcp,
   6840 					    tcp->tcp_client_errno, 7);
   6841 				}
   6842 			}
   6843 			break;
   6844 		default:
   6845 			break;
   6846 		}
   6847 		break;
   6848 	case ICMP_SOURCE_QUENCH: {
   6849 		/*
   6850 		 * use a global boolean to control
   6851 		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
   6852 		 * The default is false.
   6853 		 */
   6854 		if (tcp_icmp_source_quench) {
   6855 			/*
   6856 			 * Reduce the sending rate as if we got a
   6857 			 * retransmit timeout
   6858 			 */
   6859 			uint32_t npkt;
   6860 
   6861 			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
   6862 			    tcp->tcp_mss;
   6863 			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
   6864 			tcp->tcp_cwnd = tcp->tcp_mss;
   6865 			tcp->tcp_cwnd_cnt = 0;
   6866 		}
   6867 		break;
   6868 	}
   6869 	}
   6870 	freemsg(mp);
   6871 }
   6872 
   6873 /*
   6874  * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
   6875  * change. But it can refer to fields like tcp_suna and tcp_snxt.
   6876  *
   6877  * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
   6878  * error messages received by IP. The message is always received on the correct
   6879  * tcp_t.
   6880  */
   6881 /* ARGSUSED */
   6882 static boolean_t
   6883 tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
   6884     ip_recv_attr_t *ira)
   6885 {
   6886 	tcpha_t		*tcpha = (tcpha_t *)arg2;
   6887 	uint32_t	seq = ntohl(tcpha->tha_seq);
   6888 	tcp_t		*tcp = connp->conn_tcp;
   6889 
   6890 	/*
   6891 	 * TCP sequence number contained in payload of the ICMP error message
   6892 	 * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
   6893 	 * the message is either a stale ICMP error, or an attack from the
   6894 	 * network. Fail the verification.
   6895 	 */
   6896 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
   6897 		return (B_FALSE);
   6898 
   6899 	/* For "too big" we also check the ignore flag */
   6900 	if (ira->ira_flags & IRAF_IS_IPV4) {
   6901 		ASSERT(icmph != NULL);
   6902 		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   6903 		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
   6904 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   6905 			return (B_FALSE);
   6906 	} else {
   6907 		ASSERT(icmp6 != NULL);
   6908 		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
   6909 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   6910 			return (B_FALSE);
   6911 	}
   6912 	return (B_TRUE);
   6913 }
   6914 
   6915 /*
   6916  * Update the TCP connection according to change of PMTU.
   6917  *
   6918  * Path MTU might have changed by either increase or decrease, so need to
   6919  * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
   6920  * or negative MSS, since tcp_mss_set() will do it.
   6921  */
   6922 static void
   6923 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
   6924 {
   6925 	uint32_t	pmtu;
   6926 	int32_t		mss;
   6927 	conn_t		*connp = tcp->tcp_connp;
   6928 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
   6929 	iaflags_t	ixaflags;
   6930 
   6931 	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
   6932 		return;
   6933 
   6934 	if (tcp->tcp_state < TCPS_ESTABLISHED)
   6935 		return;
   6936 
   6937 	/*
   6938 	 * Always call ip_get_pmtu() to make sure that IP has updated
   6939 	 * ixa_flags properly.
   6940 	 */
   6941 	pmtu = ip_get_pmtu(ixa);
   6942 	ixaflags = ixa->ixa_flags;
   6943 
   6944 	/*
   6945 	 * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and
   6946 	 * IPsec overhead if applied. Make sure to use the most recent
   6947 	 * IPsec information.
   6948 	 */
   6949 	mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp);
   6950 
   6951 	/*
   6952 	 * Nothing to change, so just return.
   6953 	 */
   6954 	if (mss == tcp->tcp_mss)
   6955 		return;
   6956 
   6957 	/*
   6958 	 * Currently, for ICMP errors, only PMTU decrease is handled.
   6959 	 */
   6960 	if (mss > tcp->tcp_mss && decrease_only)
   6961 		return;
   6962 
   6963 	DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
   6964 
   6965 	/*
   6966 	 * Update ixa_fragsize and ixa_pmtu.
   6967 	 */
   6968 	ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
   6969 
   6970 	/*
   6971 	 * Adjust MSS and all relevant variables.
   6972 	 */
   6973 	tcp_mss_set(tcp, mss);
   6974 
   6975 	/*
   6976 	 * If the PMTU is below the min size maintained by IP, then ip_get_pmtu
   6977 	 * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP
   6978 	 * has a (potentially different) min size we do the same. Make sure to
   6979 	 * clear IXAF_DONTFRAG, which is used by IP to decide whether to
   6980 	 * fragment the packet.
   6981 	 *
   6982 	 * LSO over IPv6 can not be fragmented. So need to disable LSO
   6983 	 * when IPv6 fragmentation is needed.
   6984 	 */
   6985 	if (mss < tcp->tcp_tcps->tcps_mss_min)
   6986 		ixaflags |= IXAF_PMTU_TOO_SMALL;
   6987 
   6988 	if (ixaflags & IXAF_PMTU_TOO_SMALL)
   6989 		ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
   6990 
   6991 	if ((connp->conn_ipversion == IPV4_VERSION) &&
   6992 	    !(ixaflags & IXAF_PMTU_IPV4_DF)) {
   6993 		tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
   6994 	}
   6995 	ixa->ixa_flags = ixaflags;
   6996 }
   6997 
   6998 /*
   6999  * Do slow start retransmission after ICMP errors of PMTU changes.
   7000  */
   7001 static void
   7002 tcp_rexmit_after_error(tcp_t *tcp)
   7003 {
   7004 	/*
   7005 	 * All sent data has been acknowledged or no data left to send, just
   7006 	 * to return.
   7007 	 */
   7008 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
   7009 	    (tcp->tcp_xmit_head == NULL))
   7010 		return;
   7011 
   7012 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
   7013 		tcp->tcp_rexmit_max = tcp->tcp_fss;
   7014 	else
   7015 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
   7016 
   7017 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
   7018 	tcp->tcp_rexmit = B_TRUE;
   7019 	tcp->tcp_dupack_cnt = 0;
   7020 	tcp->tcp_snd_burst = TCP_CWND_SS;
   7021 	tcp_ss_rexmit(tcp);
   7022 }
   7023 
   7024 /*
   7025  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
   7026  * error messages passed up by IP.
   7027  * Assumes that IP has pulled up all the extension headers as well
   7028  * as the ICMPv6 header.
   7029  */
   7030 static void
   7031 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
   7032 {
   7033 	icmp6_t		*icmp6;
   7034 	ip6_t		*ip6h;
   7035 	uint16_t	iph_hdr_length = ira->ira_ip_hdr_length;
   7036 	tcpha_t		*tcpha;
   7037 	uint8_t		*nexthdrp;
   7038 	uint32_t	seg_seq;
   7039 
   7040 	/*
   7041 	 * Verify that we have a complete IP header.
   7042 	 */
   7043 	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));
   7044 
   7045 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
   7046 	ip6h = (ip6_t *)&icmp6[1];
   7047 	/*
   7048 	 * Verify if we have a complete ICMP and inner IP header.
   7049 	 */
   7050 	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
   7051 noticmpv6:
   7052 		freemsg(mp);
   7053 		return;
   7054 	}
   7055 
   7056 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
   7057 		goto noticmpv6;
   7058 	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
   7059 	/*
   7060 	 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
   7061 	 * have at least ICMP_MIN_TCP_HDR bytes of  TCP header drop the
   7062 	 * packet.
   7063 	 */
   7064 	if ((*nexthdrp != IPPROTO_TCP) ||
   7065 	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
   7066 		goto noticmpv6;
   7067 	}
   7068 
   7069 	seg_seq = ntohl(tcpha->tha_seq);
   7070 	switch (icmp6->icmp6_type) {
   7071 	case ICMP6_PACKET_TOO_BIG:
   7072 		/*
   7073 		 * Update Path MTU, then try to send something out.
   7074 		 */
   7075 		tcp_update_pmtu(tcp, B_TRUE);
   7076 		tcp_rexmit_after_error(tcp);
   7077 		break;
   7078 	case ICMP6_DST_UNREACH:
   7079 		switch (icmp6->icmp6_code) {
   7080 		case ICMP6_DST_UNREACH_NOPORT:
   7081 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7082 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7083 			    (seg_seq == tcp->tcp_iss)) {
   7084 				(void) tcp_clean_death(tcp,
   7085 				    ECONNREFUSED, 8);
   7086 			}
   7087 			break;
   7088 		case ICMP6_DST_UNREACH_ADMIN:
   7089 		case ICMP6_DST_UNREACH_NOROUTE:
   7090 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
   7091 		case ICMP6_DST_UNREACH_ADDR:
   7092 			/* Record the error in case we finally time out. */
   7093 			tcp->tcp_client_errno = EHOSTUNREACH;
   7094 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7095 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7096 			    (seg_seq == tcp->tcp_iss)) {
   7097 				if (tcp->tcp_listener != NULL &&
   7098 				    tcp->tcp_listener->tcp_syn_defense) {
   7099 					/*
   7100 					 * Ditch the half-open connection if we
   7101 					 * suspect a SYN attack is under way.
   7102 					 */
   7103 					(void) tcp_clean_death(tcp,
   7104 					    tcp->tcp_client_errno, 9);
   7105 				}
   7106 			}
   7107 
   7108 
   7109 			break;
   7110 		default:
   7111 			break;
   7112 		}
   7113 		break;
   7114 	case ICMP6_PARAM_PROB:
   7115 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
   7116 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
   7117 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
   7118 		    (uchar_t *)nexthdrp) {
   7119 			if (tcp->tcp_state == TCPS_SYN_SENT ||
   7120 			    tcp->tcp_state == TCPS_SYN_RCVD) {
   7121 				(void) tcp_clean_death(tcp,
   7122 				    ECONNREFUSED, 10);
   7123 			}
   7124 			break;
   7125 		}
   7126 		break;
   7127 
   7128 	case ICMP6_TIME_EXCEEDED:
   7129 	default:
   7130 		break;
   7131 	}
   7132 	freemsg(mp);
   7133 }
   7134 
   7135 /*
   7136  * Notify IP that we are having trouble with this connection.  IP should
   7137  * make note so it can potentially use a different IRE.
   7138  */
   7139 static void
   7140 tcp_ip_notify(tcp_t *tcp)
   7141 {
   7142 	conn_t		*connp = tcp->tcp_connp;
   7143 	ire_t		*ire;
   7144 
   7145 	/*
   7146 	 * Note: in the case of source routing we want to blow away the
   7147 	 * route to the first source route hop.
   7148 	 */
   7149 	ire = connp->conn_ixa->ixa_ire;
   7150 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   7151 		if (ire->ire_ipversion == IPV4_VERSION) {
   7152 			/*
   7153 			 * As per RFC 1122, we send an RTM_LOSING to inform
   7154 			 * routing protocols.
   7155 			 */
   7156 			ip_rts_change(RTM_LOSING, ire->ire_addr,
   7157 			    ire->ire_gateway_addr, ire->ire_mask,
   7158 			    connp->conn_laddr_v4,  0, 0, 0,
   7159 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
   7160 			    ire->ire_ipst);
   7161 		}
   7162 		(void) ire_no_good(ire);
   7163 	}
   7164 }
   7165 
   7166 #pragma inline(tcp_send_data)
   7167 
   7168 /*
   7169  * Timer callback routine for keepalive probe.  We do a fake resend of
   7170  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
   7171  * check to see if we have heard anything from the other end for the last
   7172  * RTO period.  If we have, set the timer to expire for another
   7173  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
   7174  * RTO << 1 and check again when it expires.  Keep exponentially increasing
   7175  * the timeout if we have not heard from the other side.  If for more than
   7176  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
   7177  * kill the connection unless the keepalive abort threshold is 0.  In
   7178  * that case, we will probe "forever."
   7179  */
   7180 static void
   7181 tcp_keepalive_killer(void *arg)
   7182 {
   7183 	mblk_t	*mp;
   7184 	conn_t	*connp = (conn_t *)arg;
   7185 	tcp_t  	*tcp = connp->conn_tcp;
   7186 	int32_t	firetime;
   7187 	int32_t	idletime;
   7188 	int32_t	ka_intrvl;
   7189 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7190 
   7191 	tcp->tcp_ka_tid = 0;
   7192 
   7193 	if (tcp->tcp_fused)
   7194 		return;
   7195 
   7196 	BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
   7197 	ka_intrvl = tcp->tcp_ka_interval;
   7198 
   7199 	/*
   7200 	 * Keepalive probe should only be sent if the application has not
   7201 	 * done a close on the connection.
   7202 	 */
   7203 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
   7204 		return;
   7205 	}
   7206 	/* Timer fired too early, restart it. */
   7207 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
   7208 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7209 		    MSEC_TO_TICK(ka_intrvl));
   7210 		return;
   7211 	}
   7212 
   7213 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
   7214 	/*
   7215 	 * If we have not heard from the other side for a long
   7216 	 * time, kill the connection unless the keepalive abort
   7217 	 * threshold is 0.  In that case, we will probe "forever."
   7218 	 */
   7219 	if (tcp->tcp_ka_abort_thres != 0 &&
   7220 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
   7221 		BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
   7222 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
   7223 		    tcp->tcp_client_errno : ETIMEDOUT, 11);
   7224 		return;
   7225 	}
   7226 
   7227 	if (tcp->tcp_snxt == tcp->tcp_suna &&
   7228 	    idletime >= ka_intrvl) {
   7229 		/* Fake resend of last ACKed byte. */
   7230 		mblk_t	*mp1 = allocb(1, BPRI_LO);
   7231 
   7232 		if (mp1 != NULL) {
   7233 			*mp1->b_wptr++ = '\0';
   7234 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
   7235 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
   7236 			freeb(mp1);
   7237 			/*
   7238 			 * if allocation failed, fall through to start the
   7239 			 * timer back.
   7240 			 */
   7241 			if (mp != NULL) {
   7242 				tcp_send_data(tcp, mp);
   7243 				BUMP_MIB(&tcps->tcps_mib,
   7244 				    tcpTimKeepaliveProbe);
   7245 				if (tcp->tcp_ka_last_intrvl != 0) {
   7246 					int max;
   7247 					/*
   7248 					 * We should probe again at least
   7249 					 * in ka_intrvl, but not more than
   7250 					 * tcp_rexmit_interval_max.
   7251 					 */
   7252 					max = tcps->tcps_rexmit_interval_max;
   7253 					firetime = MIN(ka_intrvl - 1,
   7254 					    tcp->tcp_ka_last_intrvl << 1);
   7255 					if (firetime > max)
   7256 						firetime = max;
   7257 				} else {
   7258 					firetime = tcp->tcp_rto;
   7259 				}
   7260 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
   7261 				    tcp_keepalive_killer,
   7262 				    MSEC_TO_TICK(firetime));
   7263 				tcp->tcp_ka_last_intrvl = firetime;
   7264 				return;
   7265 			}
   7266 		}
   7267 	} else {
   7268 		tcp->tcp_ka_last_intrvl = 0;
   7269 	}
   7270 
   7271 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
   7272 	if ((firetime = ka_intrvl - idletime) < 0) {
   7273 		firetime = ka_intrvl;
   7274 	}
   7275 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7276 	    MSEC_TO_TICK(firetime));
   7277 }
   7278 
   7279 int
   7280 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
   7281 {
   7282 	conn_t	*connp = tcp->tcp_connp;
   7283 	queue_t	*q = connp->conn_rq;
   7284 	int32_t	mss = tcp->tcp_mss;
   7285 	int	maxpsz;
   7286 
   7287 	if (TCP_IS_DETACHED(tcp))
   7288 		return (mss);
   7289 	if (tcp->tcp_fused) {
   7290 		maxpsz = tcp_fuse_maxpsz(tcp);
   7291 		mss = INFPSZ;
   7292 	} else if (tcp->tcp_maxpsz_multiplier == 0) {
   7293 		/*
   7294 		 * Set the sd_qn_maxpsz according to the socket send buffer
   7295 		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
   7296 		 * instruct the stream head to copyin user data into contiguous
   7297 		 * kernel-allocated buffers without breaking it up into smaller
   7298 		 * chunks.  We round up the buffer size to the nearest SMSS.
   7299 		 */
   7300 		maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss);
   7301 		if (tcp->tcp_kssl_ctx == NULL)
   7302 			mss = INFPSZ;
   7303 		else
   7304 			mss = SSL3_MAX_RECORD_LEN;
   7305 	} else {
   7306 		/*
   7307 		 * Set sd_qn_maxpsz to approx half the (receivers) buffer
   7308 		 * (and a multiple of the mss).  This instructs the stream
   7309 		 * head to break down larger than SMSS writes into SMSS-
   7310 		 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
   7311 		 */
   7312 		maxpsz = tcp->tcp_maxpsz_multiplier * mss;
   7313 		if (maxpsz > connp->conn_sndbuf / 2) {
   7314 			maxpsz = connp->conn_sndbuf / 2;
   7315 			/* Round up to nearest mss */
   7316 			maxpsz = MSS_ROUNDUP(maxpsz, mss);
   7317 		}
   7318 	}
   7319 
   7320 	(void) proto_set_maxpsz(q, connp, maxpsz);
   7321 	if (!(IPCL_IS_NONSTR(connp)))
   7322 		connp->conn_wq->q_maxpsz = maxpsz;
   7323 	if (set_maxblk)
   7324 		(void) proto_set_tx_maxblk(q, connp, mss);
   7325 	return (mss);
   7326 }
   7327 
   7328 /*
   7329  * Extract option values from a tcp header.  We put any found values into the
   7330  * tcpopt struct and return a bitmask saying which options were found.
   7331  */
   7332 static int
   7333 tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
   7334 {
   7335 	uchar_t		*endp;
   7336 	int		len;
   7337 	uint32_t	mss;
   7338 	uchar_t		*up = (uchar_t *)tcpha;
   7339 	int		found = 0;
   7340 	int32_t		sack_len;
   7341 	tcp_seq		sack_begin, sack_end;
   7342 	tcp_t		*tcp;
   7343 
   7344 	endp = up + TCP_HDR_LENGTH(tcpha);
   7345 	up += TCP_MIN_HEADER_LENGTH;
   7346 	while (up < endp) {
   7347 		len = endp - up;
   7348 		switch (*up) {
   7349 		case TCPOPT_EOL:
   7350 			break;
   7351 
   7352 		case TCPOPT_NOP:
   7353 			up++;
   7354 			continue;
   7355 
   7356 		case TCPOPT_MAXSEG:
   7357 			if (len < TCPOPT_MAXSEG_LEN ||
   7358 			    up[1] != TCPOPT_MAXSEG_LEN)
   7359 				break;
   7360 
   7361 			mss = BE16_TO_U16(up+2);
   7362 			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
   7363 			tcpopt->tcp_opt_mss = mss;
   7364 			found |= TCP_OPT_MSS_PRESENT;
   7365 
   7366 			up += TCPOPT_MAXSEG_LEN;
   7367 			continue;
   7368 
   7369 		case TCPOPT_WSCALE:
   7370 			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
   7371 				break;
   7372 
   7373 			if (up[2] > TCP_MAX_WINSHIFT)
   7374 				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
   7375 			else
   7376 				tcpopt->tcp_opt_wscale = up[2];
   7377 			found |= TCP_OPT_WSCALE_PRESENT;
   7378 
   7379 			up += TCPOPT_WS_LEN;
   7380 			continue;
   7381 
   7382 		case TCPOPT_SACK_PERMITTED:
   7383 			if (len < TCPOPT_SACK_OK_LEN ||
   7384 			    up[1] != TCPOPT_SACK_OK_LEN)
   7385 				break;
   7386 			found |= TCP_OPT_SACK_OK_PRESENT;
   7387 			up += TCPOPT_SACK_OK_LEN;
   7388 			continue;
   7389 
   7390 		case TCPOPT_SACK:
   7391 			if (len <= 2 || up[1] <= 2 || len < up[1])
   7392 				break;
   7393 
   7394 			/* If TCP is not interested in SACK blks... */
   7395 			if ((tcp = tcpopt->tcp) == NULL) {
   7396 				up += up[1];
   7397 				continue;
   7398 			}
   7399 			sack_len = up[1] - TCPOPT_HEADER_LEN;
   7400 			up += TCPOPT_HEADER_LEN;
   7401 
   7402 			/*
   7403 			 * If the list is empty, allocate one and assume
   7404 			 * nothing is sack'ed.
   7405 			 */
   7406 			ASSERT(tcp->tcp_sack_info != NULL);
   7407 			if (tcp->tcp_notsack_list == NULL) {
   7408 				tcp_notsack_update(&(tcp->tcp_notsack_list),
   7409 				    tcp->tcp_suna, tcp->tcp_snxt,
   7410 				    &(tcp->tcp_num_notsack_blk),
   7411 				    &(tcp->tcp_cnt_notsack_list));
   7412 
   7413 				/*
   7414 				 * Make sure tcp_notsack_list is not NULL.
   7415 				 * This happens when kmem_alloc(KM_NOSLEEP)
   7416 				 * returns NULL.
   7417 				 */
   7418 				if (tcp->tcp_notsack_list == NULL) {
   7419 					up += sack_len;
   7420 					continue;
   7421 				}
   7422 				tcp->tcp_fack = tcp->tcp_suna;
   7423 			}
   7424 
   7425 			while (sack_len > 0) {
   7426 				if (up + 8 > endp) {
   7427 					up = endp;
   7428 					break;
   7429 				}
   7430 				sack_begin = BE32_TO_U32(up);
   7431 				up += 4;
   7432 				sack_end = BE32_TO_U32(up);
   7433 				up += 4;
   7434 				sack_len -= 8;
   7435 				/*
   7436 				 * Bounds checking.  Make sure the SACK
   7437 				 * info is within tcp_suna and tcp_snxt.
   7438 				 * If this SACK blk is out of bound, ignore
   7439 				 * it but continue to parse the following
   7440 				 * blks.
   7441 				 */
   7442 				if (SEQ_LEQ(sack_end, sack_begin) ||
   7443 				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
   7444 				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
   7445 					continue;
   7446 				}
   7447 				tcp_notsack_insert(&(tcp->tcp_notsack_list),
   7448 				    sack_begin, sack_end,
   7449 				    &(tcp->tcp_num_notsack_blk),
   7450 				    &(tcp->tcp_cnt_notsack_list));
   7451 				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
   7452 					tcp->tcp_fack = sack_end;
   7453 				}
   7454 			}
   7455 			found |= TCP_OPT_SACK_PRESENT;
   7456 			continue;
   7457 
   7458 		case TCPOPT_TSTAMP:
   7459 			if (len < TCPOPT_TSTAMP_LEN ||
   7460 			    up[1] != TCPOPT_TSTAMP_LEN)
   7461 				break;
   7462 
   7463 			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
   7464 			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
   7465 
   7466 			found |= TCP_OPT_TSTAMP_PRESENT;
   7467 
   7468 			up += TCPOPT_TSTAMP_LEN;
   7469 			continue;
   7470 
   7471 		default:
   7472 			if (len <= 1 || len < (int)up[1] || up[1] == 0)
   7473 				break;
   7474 			up += up[1];
   7475 			continue;
   7476 		}
   7477 		break;
   7478 	}
   7479 	return (found);
   7480 }
   7481 
   7482 /*
   7483  * Set the MSS associated with a particular tcp based on its current value,
   7484  * and a new one passed in. Observe minimums and maximums, and reset other
   7485  * state variables that we want to view as multiples of MSS.
   7486  *
   7487  * The value of MSS could be either increased or descreased.
   7488  */
   7489 static void
   7490 tcp_mss_set(tcp_t *tcp, uint32_t mss)
   7491 {
   7492 	uint32_t	mss_max;
   7493 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7494 	conn_t		*connp = tcp->tcp_connp;
   7495 
   7496 	if (connp->conn_ipversion == IPV4_VERSION)
   7497 		mss_max = tcps->tcps_mss_max_ipv4;
   7498 	else
   7499 		mss_max = tcps->tcps_mss_max_ipv6;
   7500 
   7501 	if (mss < tcps->tcps_mss_min)
   7502 		mss = tcps->tcps_mss_min;
   7503 	if (mss > mss_max)
   7504 		mss = mss_max;
   7505 	/*
   7506 	 * Unless naglim has been set by our client to
   7507 	 * a non-mss value, force naglim to track mss.
   7508 	 * This can help to aggregate small writes.
   7509 	 */
   7510 	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
   7511 		tcp->tcp_naglim = mss;
   7512 	/*
   7513 	 * TCP should be able to buffer at least 4 MSS data for obvious
   7514 	 * performance reason.
   7515 	 */
   7516 	if ((mss << 2) > connp->conn_sndbuf)
   7517 		connp->conn_sndbuf = mss << 2;
   7518 
   7519 	/*
   7520 	 * Set the send lowater to at least twice of MSS.
   7521 	 */
   7522 	if ((mss << 1) > connp->conn_sndlowat)
   7523 		connp->conn_sndlowat = mss << 1;
   7524 
   7525 	/*
   7526 	 * Update tcp_cwnd according to the new value of MSS. Keep the
   7527 	 * previous ratio to preserve the transmit rate.
   7528 	 */
   7529 	tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
   7530 	tcp->tcp_cwnd_cnt = 0;
   7531 
   7532 	tcp->tcp_mss = mss;
   7533 	(void) tcp_maxpsz_set(tcp, B_TRUE);
   7534 }
   7535 
   7536 /* For /dev/tcp aka AF_INET open */
   7537 static int
   7538 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7539 {
   7540 	return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
   7541 }
   7542 
   7543 /* For /dev/tcp6 aka AF_INET6 open */
   7544 static int
   7545 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7546 {
   7547 	return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
   7548 }
   7549 
   7550 static conn_t *
   7551 tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
   7552     int *errorp)
   7553 {
   7554 	tcp_t		*tcp = NULL;
   7555 	conn_t		*connp;
   7556 	zoneid_t	zoneid;
   7557 	tcp_stack_t	*tcps;
   7558 	squeue_t	*sqp;
   7559 
   7560 	ASSERT(errorp != NULL);
   7561 	/*
   7562 	 * Find the proper zoneid and netstack.
   7563 	 */
   7564 	/*
   7565 	 * Special case for install: miniroot needs to be able to
   7566 	 * access files via NFS as though it were always in the
   7567 	 * global zone.
   7568 	 */
   7569 	if (credp == kcred && nfs_global_client_only != 0) {
   7570 		zoneid = GLOBAL_ZONEID;
   7571 		tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
   7572 		    netstack_tcp;
   7573 		ASSERT(tcps != NULL);
   7574 	} else {
   7575 		netstack_t *ns;
   7576 
   7577 		ns = netstack_find_by_cred(credp);
   7578 		ASSERT(ns != NULL);
   7579 		tcps = ns->netstack_tcp;
   7580 		ASSERT(tcps != NULL);
   7581 
   7582 		/*
   7583 		 * For exclusive stacks we set the zoneid to zero
   7584 		 * to make TCP operate as if in the global zone.
   7585 		 */
   7586 		if (tcps->tcps_netstack->netstack_stackid !=
   7587 		    GLOBAL_NETSTACKID)
   7588 			zoneid = GLOBAL_ZONEID;
   7589 		else
   7590 			zoneid = crgetzoneid(credp);
   7591 	}
   7592 
   7593 	sqp = IP_SQUEUE_GET((uint_t)gethrtime());
   7594 	connp = (conn_t *)tcp_get_conn(sqp, tcps);
   7595 	/*
   7596 	 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
   7597 	 * so we drop it by one.
   7598 	 */
   7599 	netstack_rele(tcps->tcps_netstack);
   7600 	if (connp == NULL) {
   7601 		*errorp = ENOSR;
   7602 		return (NULL);
   7603 	}
   7604 	ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
   7605 
   7606 	connp->conn_sqp = sqp;
   7607 	connp->conn_initial_sqp = connp->conn_sqp;
   7608 	connp->conn_ixa->ixa_sqp = connp->conn_sqp;
   7609 	tcp = connp->conn_tcp;
   7610 
   7611 	/*
   7612 	 * Besides asking IP to set the checksum for us, have conn_ip_output
   7613 	 * to do the following checks when necessary:
   7614 	 *
   7615 	 * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid
   7616 	 * IXAF_VERIFY_PMTU: verify PMTU changes
   7617 	 * IXAF_VERIFY_LSO: verify LSO capability changes
   7618 	 */
   7619 	connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   7620 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO;
   7621 
   7622 	if (!tcps->tcps_dev_flow_ctl)
   7623 		connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
   7624 
   7625 	if (isv6) {
   7626 		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
   7627 		connp->conn_ipversion = IPV6_VERSION;
   7628 		connp->conn_family = AF_INET6;
   7629 		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   7630 		connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit;
   7631 	} else {
   7632 		connp->conn_ipversion = IPV4_VERSION;
   7633 		connp->conn_family = AF_INET;
   7634 		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   7635 		connp->conn_default_ttl = tcps->tcps_ipv4_ttl;
   7636 	}
   7637 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
   7638 
   7639 	crhold(credp);
   7640 	connp->conn_cred = credp;
   7641 	connp->conn_cpid = curproc->p_pid;
   7642 	connp->conn_open_time = ddi_get_lbolt64();
   7643 
   7644 	connp->conn_zoneid = zoneid;
   7645 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   7646 	connp->conn_ixa->ixa_zoneid = zoneid;
   7647 	connp->conn_mlp_type = mlptSingle;
   7648 	ASSERT(connp->conn_netstack == tcps->tcps_netstack);
   7649 	ASSERT(tcp->tcp_tcps == tcps);
   7650 
   7651 	/*
   7652 	 * If the caller has the process-wide flag set, then default to MAC
   7653 	 * exempt mode.  This allows read-down to unlabeled hosts.
   7654 	 */
   7655 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   7656 		connp->conn_mac_mode = CONN_MAC_AWARE;
   7657 
   7658 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   7659 
   7660 	if (issocket) {
   7661 		tcp->tcp_issocket = 1;
   7662 	}
   7663 
   7664 	connp->conn_rcvbuf = tcps->tcps_recv_hiwat;
   7665 	connp->conn_sndbuf = tcps->tcps_xmit_hiwat;
   7666 	connp->conn_sndlowat = tcps->tcps_xmit_lowat;
   7667 	connp->conn_so_type = SOCK_STREAM;
   7668 	connp->conn_wroff = connp->conn_ht_iphc_allocated +
   7669 	    tcps->tcps_wroff_xtra;
   7670 
   7671 	SOCK_CONNID_INIT(tcp->tcp_connid);
   7672 	tcp->tcp_state = TCPS_IDLE;
   7673 	tcp_init_values(tcp);
   7674 	return (connp);
   7675 }
   7676 
   7677 static int
   7678 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   7679     boolean_t isv6)
   7680 {
   7681 	tcp_t		*tcp = NULL;
   7682 	conn_t		*connp = NULL;
   7683 	int		err;
   7684 	vmem_t		*minor_arena = NULL;
   7685 	dev_t		conn_dev;
   7686 	boolean_t	issocket;
   7687 
   7688 	if (q->q_ptr != NULL)
   7689 		return (0);
   7690 
   7691 	if (sflag == MODOPEN)
   7692 		return (EINVAL);
   7693 
   7694 	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
   7695 	    ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
   7696 		minor_arena = ip_minor_arena_la;
   7697 	} else {
   7698 		/*
   7699 		 * Either minor numbers in the large arena were exhausted
   7700 		 * or a non socket application is doing the open.
   7701 		 * Try to allocate from the small arena.
   7702 		 */
   7703 		if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   7704 			return (EBUSY);
   7705 		}
   7706 		minor_arena = ip_minor_arena_sa;
   7707 	}
   7708 
   7709 	ASSERT(minor_arena != NULL);
   7710 
   7711 	*devp = makedevice(getmajor(*devp), (minor_t)conn_dev);
   7712 
   7713 	if (flag & SO_FALLBACK) {
   7714 		/*
   7715 		 * Non streams socket needs a stream to fallback to
   7716 		 */
   7717 		RD(q)->q_ptr = (void *)conn_dev;
   7718 		WR(q)->q_qinfo = &tcp_fallback_sock_winit;
   7719 		WR(q)->q_ptr = (void *)minor_arena;
   7720 		qprocson(q);
   7721 		return (0);
   7722 	} else if (flag & SO_ACCEPTOR) {
   7723 		q->q_qinfo = &tcp_acceptor_rinit;
   7724 		/*
   7725 		 * the conn_dev and minor_arena will be subsequently used by
   7726 		 * tcp_tli_accept() and tcp_tpi_close_accept() to figure out
   7727 		 * the minor device number for this connection from the q_ptr.
   7728 		 */
   7729 		RD(q)->q_ptr = (void *)conn_dev;
   7730 		WR(q)->q_qinfo = &tcp_acceptor_winit;
   7731 		WR(q)->q_ptr = (void *)minor_arena;
   7732 		qprocson(q);
   7733 		return (0);
   7734 	}
   7735 
   7736 	issocket = flag & SO_SOCKSTR;
   7737 	connp = tcp_create_common(credp, isv6, issocket, &err);
   7738 
   7739 	if (connp == NULL) {
   7740 		inet_minor_free(minor_arena, conn_dev);
   7741 		q->q_ptr = WR(q)->q_ptr = NULL;
   7742 		return (err);
   7743 	}
   7744 
   7745 	connp->conn_rq = q;
   7746 	connp->conn_wq = WR(q);
   7747 	q->q_ptr = WR(q)->q_ptr = connp;
   7748 
   7749 	connp->conn_dev = conn_dev;
   7750 	connp->conn_minor_arena = minor_arena;
   7751 
   7752 	ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
   7753 	ASSERT(WR(q)->q_qinfo == &tcp_winit);
   7754 
   7755 	tcp = connp->conn_tcp;
   7756 
   7757 	if (issocket) {
   7758 		WR(q)->q_qinfo = &tcp_sock_winit;
   7759 	} else {
   7760 #ifdef  _ILP32
   7761 		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
   7762 #else
   7763 		tcp->tcp_acceptor_id = conn_dev;
   7764 #endif  /* _ILP32 */
   7765 		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
   7766 	}
   7767 
   7768 	/*
   7769 	 * Put the ref for TCP. Ref for IP was already put
   7770 	 * by ipcl_conn_create. Also Make the conn_t globally
   7771 	 * visible to walkers
   7772 	 */
   7773 	mutex_enter(&