Home | History | Annotate | Download | only in tcp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/strsun.h>
     31 #include <sys/strsubr.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #define	_SUN_TPI_VERSION 2
     35 #include <sys/tihdr.h>
     36 #include <sys/timod.h>
     37 #include <sys/ddi.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/suntpi.h>
     40 #include <sys/xti_inet.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/sdt.h>
     44 #include <sys/vtrace.h>
     45 #include <sys/kmem.h>
     46 #include <sys/ethernet.h>
     47 #include <sys/cpuvar.h>
     48 #include <sys/dlpi.h>
     49 #include <sys/pattr.h>
     50 #include <sys/policy.h>
     51 #include <sys/priv.h>
     52 #include <sys/zone.h>
     53 #include <sys/sunldi.h>
     54 
     55 #include <sys/errno.h>
     56 #include <sys/signal.h>
     57 #include <sys/socket.h>
     58 #include <sys/socketvar.h>
     59 #include <sys/sockio.h>
     60 #include <sys/isa_defs.h>
     61 #include <sys/md5.h>
     62 #include <sys/random.h>
     63 #include <sys/uio.h>
     64 #include <sys/systm.h>
     65 #include <netinet/in.h>
     66 #include <netinet/tcp.h>
     67 #include <netinet/ip6.h>
     68 #include <netinet/icmp6.h>
     69 #include <net/if.h>
     70 #include <net/route.h>
     71 #include <inet/ipsec_impl.h>
     72 
     73 #include <inet/common.h>
     74 #include <inet/ip.h>
     75 #include <inet/ip_impl.h>
     76 #include <inet/ip6.h>
     77 #include <inet/ip_ndp.h>
     78 #include <inet/proto_set.h>
     79 #include <inet/mib2.h>
     80 #include <inet/nd.h>
     81 #include <inet/optcom.h>
     82 #include <inet/snmpcom.h>
     83 #include <inet/kstatcom.h>
     84 #include <inet/tcp.h>
     85 #include <inet/tcp_impl.h>
     86 #include <inet/udp_impl.h>
     87 #include <net/pfkeyv2.h>
     88 #include <inet/ipdrop.h>
     89 
     90 #include <inet/ipclassifier.h>
     91 #include <inet/ip_ire.h>
     92 #include <inet/ip_ftable.h>
     93 #include <inet/ip_if.h>
     94 #include <inet/ipp_common.h>
     95 #include <inet/ip_rts.h>
     96 #include <inet/ip_netinfo.h>
     97 #include <sys/squeue_impl.h>
     98 #include <sys/squeue.h>
     99 #include <inet/kssl/ksslapi.h>
    100 #include <sys/tsol/label.h>
    101 #include <sys/tsol/tnet.h>
    102 #include <rpc/pmap_prot.h>
    103 #include <sys/callo.h>
    104 
    105 #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    106 
    107 /*
    108  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    109  *
    110  * (Read the detailed design doc in PSARC case directory)
    111  *
    112  * The entire tcp state is contained in tcp_t and conn_t structure
    113  * which are allocated in tandem using ipcl_conn_create() and passing
    114  * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
    115  * the references on the tcp_t. The tcp_t structure is never compressed
    116  * and packets always land on the correct TCP perimeter from the time
    117  * eager is created till the time tcp_t dies (as such the old mentat
    118  * TCP global queue is not used for detached state and no IPSEC checking
    119  * is required). The global queue is still allocated to send out resets
    120  * for connection which have no listeners and IP directly calls
    121  * tcp_xmit_listeners_reset() which does any policy check.
    122  *
    123  * Protection and Synchronisation mechanism:
    124  *
    125  * The tcp data structure does not use any kind of lock for protecting
    126  * its state but instead uses 'squeues' for mutual exclusion from various
    127  * read and write side threads. To access a tcp member, the thread should
    128  * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
    129  * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
    130  * can pass any tcp function having prototype of edesc_t as argument
    131  * (different from traditional STREAMs model where packets come in only
    132  * designated entry points). The list of functions that can be directly
    133  * called via squeue are listed before the usual function prototype.
    134  *
    135  * Referencing:
    136  *
    137  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    138  * tcp structure doesn't disappear when it's needed. When the application
    139  * creates an outgoing connection or accepts an incoming connection, we
    140  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    141  * The IP reference is just a symbolic reference since ip_tcpclose()
    142  * looks at tcp structure after tcp_close_output() returns which could
    143  * have dropped the last TCP reference. So as long as the connection is
    144  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    145  * conn_t. The classifier puts its own reference when the connection is
    146  * inserted in listen or connected hash. Anytime a thread needs to enter
    147  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    148  * on write side or by doing a classify on read side and then puts a
    149  * reference on the conn before doing squeue_enter/tryenter/fill. For
    150  * read side, the classifier itself puts the reference under fanout lock
    151  * to make sure that tcp can't disappear before it gets processed. The
    152  * squeue will drop this reference automatically so the called function
    153  * doesn't have to do a DEC_REF.
    154  *
    155  * Opening a new connection:
    156  *
    157  * The outgoing connection open is pretty simple. tcp_open() does the
    158  * work in creating the conn/tcp structure and initializing it. The
    159  * squeue assignment is done based on the CPU the application
    160  * is running on. So for outbound connections, processing is always done
    161  * on application CPU which might be different from the incoming CPU
    162  * being interrupted by the NIC. An optimal way would be to figure out
    163  * the NIC <-> CPU binding at listen time, and assign the outgoing
    164  * connection to the squeue attached to the CPU that will be interrupted
    165  * for incoming packets (we know the NIC based on the bind IP address).
    166  * This might seem like a problem if more data is going out but the
    167  * fact is that in most cases the transmit is ACK driven transmit where
    168  * the outgoing data normally sits on TCP's xmit queue waiting to be
    169  * transmitted.
    170  *
    171  * Accepting a connection:
    172  *
    173  * This is a more interesting case because of various races involved in
    174  * establishing an eager in its own perimeter. Read the meta comment on
    175  * top of tcp_input_listener(). But briefly, the squeue is picked by
    176  * ip_fanout based on the ring or the sender (if loopback).
    177  *
    178  * Closing a connection:
    179  *
    180  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    181  * via squeue to do the close and mark the tcp as detached if the connection
    182  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    183  * reference but tcp_close() always drops IP's reference. So if tcp was
    184  * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
    185  * and 1 because it is in classifier's connected hash. This is the condition
    186  * we use to determine that it's OK to clean up the tcp outside of squeue
    187  * when time wait expires (check the ref under fanout and conn_lock and
    188  * if it is 2, remove it from fanout hash and kill it).
    189  *
    190  * Although close just drops the necessary references and marks the
    191  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    192  * set (under squeue) before letting the STREAM go away (because an
    193  * inbound packet might attempt to go up the STREAM while the close
    194  * has happened and tcp_detached is not set). So a special lock and
    195  * flag is used along with a condition variable (tcp_closelock, tcp_closed,
    196  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    197  * tcp_detached.
    198  *
    199  * Special provisions and fast paths:
    200  *
    201  * We make special provisions for sockfs by marking tcp_issocket
    202  * whenever we have only sockfs on top of TCP. This allows us to skip
    203  * putting the tcp in acceptor hash since a sockfs listener can never
    204  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    205  * since eager has already been allocated and the accept now happens
    206  * on acceptor STREAM. There is a big blob of comment on top of
    207  * tcp_input_listener explaining the new accept. When socket is POP'd,
    208  * sockfs sends us an ioctl to mark the fact and we go back to old
    209  * behaviour. Once tcp_issocket is unset, it's never set for the
    210  * life of that connection.
    211  *
    212  * IPsec notes :
    213  *
    214  * Since a packet is always executed on the correct TCP perimeter
    215  * all IPsec processing is deferred to IP including checking new
    216  * connections and setting IPSEC policies for new connection. The
    217  * only exception is tcp_xmit_listeners_reset() which is called
    218  * directly from IP and needs to policy check to see if TH_RST
    219  * can be sent out.
    220  */
    221 
    222 /*
    223  * Values for squeue switch:
    224  * 1: SQ_NODRAIN
    225  * 2: SQ_PROCESS
    226  * 3: SQ_FILL
    227  */
    228 int tcp_squeue_wput = 2;	/* /etc/systems */
    229 int tcp_squeue_flag;
    230 
    231 /*
    232  * This controls how tiny a write must be before we try to copy it
    233  * into the mblk on the tail of the transmit queue.  Not much
    234  * speedup is observed for values larger than sixteen.  Zero will
    235  * disable the optimisation.
    236  */
    237 int tcp_tx_pull_len = 16;
    238 
    239 /*
    240  * TCP Statistics.
    241  *
    242  * How TCP statistics work.
    243  *
    244  * There are two types of statistics invoked by two macros.
    245  *
    246  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    247  * supposed to be used in non MT-hot paths of the code.
    248  *
    249  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    250  * supposed to be used for DEBUG purposes and may be used on a hot path.
    251  *
    252  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    253  * (use "kstat tcp" to get them).
    254  *
    255  * There is also additional debugging facility that marks tcp_clean_death()
    256  * instances and saves them in tcp_t structure. It is triggered by
    257  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    258  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    259  * is triggered by TCP_CLD_COUNTERS define.
    260  *
    261  * How to add new counters.
    262  *
    263  * 1) Add a field in the tcp_stat structure describing your counter.
    264  * 2) Add a line in the template in tcp_kstat2_init() with the name
    265  *    of the counter.
    266  *
    267  *    IMPORTANT!! - make sure that both are in sync !!
    268  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    269  *
    270  * Please avoid using private counters which are not kstat-exported.
    271  *
    272  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    273  * in tcp_t structure.
    274  *
    275  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    276  */
    277 
    278 #ifndef TCP_DEBUG_COUNTER
    279 #ifdef DEBUG
    280 #define	TCP_DEBUG_COUNTER 1
    281 #else
    282 #define	TCP_DEBUG_COUNTER 0
    283 #endif
    284 #endif
    285 
    286 #define	TCP_CLD_COUNTERS 0
    287 
    288 #define	TCP_TAG_CLEAN_DEATH 1
    289 #define	TCP_MAX_CLEAN_DEATH_TAG 32
    290 
    291 #ifdef lint
    292 static int _lint_dummy_;
    293 #endif
    294 
    295 #if TCP_CLD_COUNTERS
    296 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
    297 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
    298 #elif defined(lint)
    299 #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
    300 #else
    301 #define	TCP_CLD_STAT(x)
    302 #endif
    303 
    304 #if TCP_DEBUG_COUNTER
    305 #define	TCP_DBGSTAT(tcps, x)	\
    306 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
    307 #define	TCP_G_DBGSTAT(x)	\
    308 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
    309 #elif defined(lint)
    310 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
    311 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
    312 #else
    313 #define	TCP_DBGSTAT(tcps, x)
    314 #define	TCP_G_DBGSTAT(x)
    315 #endif
    316 
    317 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)
    318 
    319 tcp_g_stat_t	tcp_g_statistics;
    320 kstat_t		*tcp_g_kstat;
    321 
    322 /* Macros for timestamp comparisons */
    323 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
    324 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
    325 
    326 /*
    327  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    328  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    329  * by adding three components: a time component which grows by 1 every 4096
    330  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    331  * a per-connection component which grows by 125000 for every new connection;
    332  * and an "extra" component that grows by a random amount centered
    333  * approximately on 64000.  This causes the ISS generator to cycle every
    334  * 4.89 hours if no TCP connections are made, and faster if connections are
    335  * made.
    336  *
    337  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    338  * components: a time component which grows by 250000 every second; and
    339  * a per-connection component which grows by 125000 for every new connection.
    340  *
    341  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    342  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    343  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    344  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    345  * password.
    346  */
    347 #define	ISS_INCR	250000
    348 #define	ISS_NSEC_SHT	12
    349 
    350 static sin_t	sin_null;	/* Zero address for quick clears */
    351 static sin6_t	sin6_null;	/* Zero address for quick clears */
    352 
    353 /*
    354  * This implementation follows the 4.3BSD interpretation of the urgent
    355  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    356  * incompatible changes in protocols like telnet and rlogin.
    357  */
    358 #define	TCP_OLD_URP_INTERPRETATION	1
    359 
    360 /*
    361  * Since tcp_listener is not cleared atomically with tcp_detached
    362  * being cleared we need this extra bit to tell a detached connection
    363  * apart from one that is in the process of being accepted.
    364  */
    365 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    366 	(TCP_IS_DETACHED(tcp) &&	\
    367 	    (!(tcp)->tcp_hard_binding))
    368 
    369 /*
    370  * TCP reassembly macros.  We hide starting and ending sequence numbers in
    371  * b_next and b_prev of messages on the reassembly queue.  The messages are
    372  * chained using b_cont.  These macros are used in tcp_reass() so we don't
    373  * have to see the ugly casts and assignments.
    374  */
    375 #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
    376 #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    377 					(mblk_t *)(uintptr_t)(u))
    378 #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
    379 #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    380 					(mblk_t *)(uintptr_t)(u))
    381 
    382 /*
    383  * Implementation of TCP Timers.
    384  * =============================
    385  *
    386  * INTERFACE:
    387  *
    388  * There are two basic functions and one macro dealing with tcp timers:
    389  *
    390  *	timeout_id_t	tcp_timeout(connp, func, time)
    391  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    392  *	TCP_TIMER_RESTART(tcp, intvl)
    393  *
    394  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    395  * after 'time' ticks passed. The function called by timeout() must adhere to
    396  * the same restrictions as a driver soft interrupt handler - it must not sleep
    397  * or call other functions that might sleep. The value returned is the opaque
    398  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    399  * cancel the request. The call to tcp_timeout() may fail in which case it
    400  * returns zero. This is different from the timeout(9F) function which never
    401  * fails.
    402  *
    403  * The call-back function 'func' always receives 'connp' as its single
    404  * argument. It is always executed in the squeue corresponding to the tcp
    405  * structure. The tcp structure is guaranteed to be present at the time the
    406  * call-back is called.
    407  *
    408  * NOTE: The call-back function 'func' is never called if tcp is in
    409  * 	the TCPS_CLOSED state.
    410  *
    411  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    412  * request. locks acquired by the call-back routine should not be held across
    413  * the call to tcp_timeout_cancel() or a deadlock may result.
    414  *
    415  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    416  * Otherwise, it returns an integer value greater than or equal to 0. In
    417  * particular, if the call-back function is already placed on the squeue, it can
    418  * not be canceled.
    419  *
    420  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    421  * 	within squeue context corresponding to the tcp instance. Since the
    422  *	call-back is also called via the same squeue, there are no race
    423  *	conditions described in untimeout(9F) manual page since all calls are
    424  *	strictly serialized.
    425  *
    426  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    427  *	stored in tcp_timer_tid and starts a new one using
    428  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    429  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    430  *	field.
    431  *
    432  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    433  *	call-back may still be called, so it is possible tcp_timer() will be
    434  *	called several times. This should not be a problem since tcp_timer()
    435  *	should always check the tcp instance state.
    436  *
    437  *
    438  * IMPLEMENTATION:
    439  *
    440  * TCP timers are implemented using a three-stage process. The call to
    441  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    442  * when the timer expires. The tcp_timer_callback() arranges the call of the
    443  * tcp_timer_handler() function via squeue corresponding to the tcp
    444  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    445  * and passes tcp instance as an argument to it. Information is passed between
    446  * stages using the tcp_timer_t structure which contains the connp pointer, the
    447  * tcp call-back to call and the timeout id returned by the timeout(9F).
    448  *
    449  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    450  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    451  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    452  * returns the pointer to this mblk.
    453  *
    454  * The pseudo mblk is allocated from the special tcp_timercache kmem cache. It
    455  * looks like a normal mblk without actual dblk attached to it.
    456  *
    457  * To optimize performance each tcp instance holds a small cache of timer
    458  * mblocks. In the current implementation it caches up to two timer mblocks per
    459  * tcp instance. The cache is preserved over tcp frees and is only freed when
    460  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    461  * timer processing happens on a corresponding squeue, the cache manipulation
    462  * does not require any locks. Experiments show that majority of timer mblocks
    463  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    464  *
    465  * The tcp_timeout() places a refhold on the connp instance which guarantees
    466  * that it will be present at the time the call-back function fires. The
    467  * tcp_timer_handler() drops the reference after calling the call-back, so the
    468  * call-back function does not need to manipulate the references explicitly.
    469  */
    470 
    471 typedef struct tcp_timer_s {
    472 	conn_t	*connp;
    473 	void 	(*tcpt_proc)(void *);
    474 	callout_id_t   tcpt_tid;
    475 } tcp_timer_t;
    476 
    477 static kmem_cache_t *tcp_timercache;
    478 kmem_cache_t	*tcp_sack_info_cache;
    479 
    480 /*
    481  * For scalability, we must not run a timer for every TCP connection
    482  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    483  * 4 minutes):
    484  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    485  *
    486  * This list is ordered by time, so you need only delete from the head
    487  * until you get to entries which aren't old enough to delete yet.
    488  * The list consists of only the detached TIME_WAIT connections.
    489  *
    490  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    491  * becomes detached TIME_WAIT (either by changing the state and already
    492  * being detached or the other way around). This means that the TIME_WAIT
    493  * state can be extended (up to doubled) if the connection doesn't become
    494  * detached for a long time.
    495  *
    496  * The list manipulations (including tcp_time_wait_next/prev)
    497  * are protected by the tcp_time_wait_lock. The content of the
    498  * detached TIME_WAIT connections is protected by the normal perimeters.
    499  *
    500  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    501  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    502  * and conn_netstack.
    503  * The tcp_t's that are added to tcp_free_list are disassociated and
    504  * have NULL tcp_tcps and conn_netstack pointers.
    505  */
    506 typedef struct tcp_squeue_priv_s {
    507 	kmutex_t	tcp_time_wait_lock;
    508 	callout_id_t	tcp_time_wait_tid;
    509 	tcp_t		*tcp_time_wait_head;
    510 	tcp_t		*tcp_time_wait_tail;
    511 	tcp_t		*tcp_free_list;
    512 	uint_t		tcp_free_list_cnt;
    513 } tcp_squeue_priv_t;
    514 
    515 /*
    516  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    517  * Running it every 5 seconds seems to give the best results.
    518  */
    519 #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
    520 
    521 /*
    522  * To prevent memory hog, limit the number of entries in tcp_free_list
    523  * to 1% of available memory / number of cpus
    524  */
    525 uint_t tcp_free_list_max_cnt = 0;
    526 
    527 #define	TCP_XMIT_LOWATER	4096
    528 #define	TCP_XMIT_HIWATER	49152
    529 #define	TCP_RECV_LOWATER	2048
    530 #define	TCP_RECV_HIWATER	128000
    531 
    532 /*
    533  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    534  */
    535 #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    536 
    537 #define	TIDUSZ	4096	/* transport interface data unit size */
    538 
    539 /*
    540  * Bind hash list size and hash function.  It has to be a power of 2 for
    541  * hashing.
    542  */
    543 #define	TCP_BIND_FANOUT_SIZE	512
    544 #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    545 /*
    546  * Size of listen and acceptor hash list.  It has to be a power of 2 for
    547  * hashing.
    548  */
    549 #define	TCP_FANOUT_SIZE		256
    550 
    551 #ifdef	_ILP32
    552 #define	TCP_ACCEPTOR_HASH(accid)					\
    553 		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
    554 #else
    555 #define	TCP_ACCEPTOR_HASH(accid)					\
    556 		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
    557 #endif	/* _ILP32 */
    558 
    559 #define	IP_ADDR_CACHE_SIZE	2048
    560 #define	IP_ADDR_CACHE_HASH(faddr)					\
    561 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
    562 
    563 /*
    564  * TCP options struct returned from tcp_parse_options.
    565  */
    566 typedef struct tcp_opt_s {
    567 	uint32_t	tcp_opt_mss;
    568 	uint32_t	tcp_opt_wscale;
    569 	uint32_t	tcp_opt_ts_val;
    570 	uint32_t	tcp_opt_ts_ecr;
    571 	tcp_t		*tcp;
    572 } tcp_opt_t;
    573 
    574 /*
    575  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    576  */
    577 
    578 #ifdef _BIG_ENDIAN
    579 #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    580 	(TCPOPT_TSTAMP << 8) | 10)
    581 #else
    582 #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    583 	(TCPOPT_NOP << 8) | TCPOPT_NOP)
    584 #endif
    585 
    586 /*
    587  * Flags returned from tcp_parse_options.
    588  */
    589 #define	TCP_OPT_MSS_PRESENT	1
    590 #define	TCP_OPT_WSCALE_PRESENT	2
    591 #define	TCP_OPT_TSTAMP_PRESENT	4
    592 #define	TCP_OPT_SACK_OK_PRESENT	8
    593 #define	TCP_OPT_SACK_PRESENT	16
    594 
    595 /* TCP option length */
    596 #define	TCPOPT_NOP_LEN		1
    597 #define	TCPOPT_MAXSEG_LEN	4
    598 #define	TCPOPT_WS_LEN		3
    599 #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
    600 #define	TCPOPT_TSTAMP_LEN	10
    601 #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
    602 #define	TCPOPT_SACK_OK_LEN	2
    603 #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
    604 #define	TCPOPT_REAL_SACK_LEN	4
    605 #define	TCPOPT_MAX_SACK_LEN	36
    606 #define	TCPOPT_HEADER_LEN	2
    607 
    608 /* TCP cwnd burst factor. */
    609 #define	TCP_CWND_INFINITE	65535
    610 #define	TCP_CWND_SS		3
    611 #define	TCP_CWND_NORMAL		5
    612 
    613 /* Maximum TCP initial cwin (start/restart). */
    614 #define	TCP_MAX_INIT_CWND	8
    615 
    616 /*
    617  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    618  * either tcp_slow_start_initial or tcp_slow_start_after idle
    619  * depending on the caller.  If the upper layer has not used the
    620  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    621  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    622  * If the upper layer has changed set the tcp_init_cwnd, just use
    623  * it to calculate the tcp_cwnd.
    624  */
    625 #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    626 {									\
    627 	if ((tcp)->tcp_init_cwnd == 0) {				\
    628 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    629 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    630 	} else {							\
    631 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    632 	}								\
    633 	tcp->tcp_cwnd_cnt = 0;						\
    634 }
    635 
    636 /* TCP Timer control structure */
    637 typedef struct tcpt_s {
    638 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    639 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    640 } tcpt_t;
    641 
    642 /*
    643  * Functions called directly via squeue having a prototype of edesc_t.
    644  */
    645 void		tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
    646     ip_recv_attr_t *ira);
    647 static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
    648     ip_recv_attr_t *dummy);
    649 void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
    650     ip_recv_attr_t *dummy);
    651 static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
    652     ip_recv_attr_t *dummy);
    653 static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
    654     ip_recv_attr_t *dummy);
    655 void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
    656     ip_recv_attr_t *ira);
    657 static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2,
    658     ip_recv_attr_t *dummy);
    659 void		tcp_output(void *arg, mblk_t *mp, void *arg2,
    660     ip_recv_attr_t *dummy);
    661 void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
    662     ip_recv_attr_t *dummy);
    663 static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
    664     ip_recv_attr_t *dummy);
    665 static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
    666     ip_recv_attr_t *dummy);
    667 static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
    668     ip_recv_attr_t *dummy);
    669 
    670 
    671 /* Prototype for TCP functions */
    672 static void	tcp_random_init(void);
    673 int		tcp_random(void);
    674 static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
    675 static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    676 		    tcp_t *eager);
    677 static int	tcp_set_destination(tcp_t *tcp);
    678 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    679     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    680     boolean_t user_specified);
    681 static void	tcp_closei_local(tcp_t *tcp);
    682 static void	tcp_close_detached(tcp_t *tcp);
    683 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
    684 		    mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
    685 static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
    686 static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
    687 		    in_port_t dstport, uint_t srcid);
    688 static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
    689 		    in_port_t dstport, uint32_t flowinfo,
    690 		    uint_t srcid, uint32_t scope_id);
    691 static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    692 static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    693 static char	*tcp_display(tcp_t *tcp, char *, char);
    694 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    695 static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    696 static void	tcp_eager_unlink(tcp_t *tcp);
    697 static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    698 		    int unixerr);
    699 static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    700 		    int tlierr, int unixerr);
    701 static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    702 		    cred_t *cr);
    703 static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    704 		    char *value, caddr_t cp, cred_t *cr);
    705 static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    706 		    char *value, caddr_t cp, cred_t *cr);
    707 static int	tcp_tpistate(tcp_t *tcp);
    708 static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    709     int caller_holds_lock);
    710 static void	tcp_bind_hash_remove(tcp_t *tcp);
    711 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    712 void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    713 static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    714 static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    715 static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    716 static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    717 static void	tcp_init_values(tcp_t *tcp);
    718 static void	tcp_ip_notify(tcp_t *tcp);
    719 static void	tcp_iss_init(tcp_t *tcp);
    720 static void	tcp_keepalive_killer(void *arg);
    721 static int	tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
    722 static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
    723 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    724 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    725 static boolean_t tcp_allow_connopt_set(int level, int name);
    726 int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    727 static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    728 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    729     tcp_stack_t *);
    730 static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    731 		    caddr_t cp, cred_t *cr);
    732 static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    733 		    caddr_t cp, cred_t *cr);
    734 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    735 static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    736 		    caddr_t cp, cred_t *cr);
    737 static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    738 static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
    739 static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    740 static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    741 static void	tcp_reinit(tcp_t *tcp);
    742 static void	tcp_reinit_values(tcp_t *tcp);
    743 
    744 static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
    745 static uint_t	tcp_rcv_drain(tcp_t *tcp);
    746 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    747 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    748 static void	tcp_ss_rexmit(tcp_t *tcp);
    749 static mblk_t	*tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
    750     ip_recv_attr_t *);
    751 static void	tcp_process_options(tcp_t *, tcpha_t *);
    752 static void	tcp_rsrv(queue_t *q);
    753 static int	tcp_snmp_state(tcp_t *tcp);
    754 static void	tcp_timer(void *arg);
    755 static void	tcp_timer_callback(void *);
    756 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    757     boolean_t random);
    758 static in_port_t tcp_get_next_priv_port(const tcp_t *);
    759 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    760 static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
    761 void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
    762 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    763 static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    764 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    765 static int	tcp_send(tcp_t *tcp, const int mss,
    766 		    const int total_hdr_len, const int tcp_hdr_len,
    767 		    const int num_sack_blk, int *usable, uint_t *snxt,
    768 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
    769 static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    770 		    int num_sack_blk);
    771 static void	tcp_wsrv(queue_t *q);
    772 static int	tcp_xmit_end(tcp_t *tcp);
    773 static void	tcp_ack_timer(void *arg);
    774 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    775 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    776 		    uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
    777 		    ip_stack_t *, conn_t *);
    778 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    779 		    uint32_t ack, int ctl);
    780 static void	tcp_set_rto(tcp_t *, time_t);
    781 static void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
    782 static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
    783 static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    784     ip_recv_attr_t *);
    785 static int	tcp_build_hdrs(tcp_t *);
    786 static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    787     uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
    788     ip_recv_attr_t *ira);
    789 boolean_t	tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
    790 static boolean_t tcp_zcopy_check(tcp_t *);
    791 static void	tcp_zcopy_notify(tcp_t *);
    792 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
    793 static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
    794 static void	tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
    795 static void	tcp_update_zcopy(tcp_t *tcp);
    796 static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    797     ixa_notify_arg_t);
    798 static void	tcp_rexmit_after_error(tcp_t *tcp);
    799 static void	tcp_send_data(tcp_t *, mblk_t *);
    800 extern mblk_t	*tcp_timermp_alloc(int);
    801 extern void	tcp_timermp_free(tcp_t *);
    802 static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
    803 static void	tcp_stop_lingering(tcp_t *tcp);
    804 static void	tcp_close_linger_timeout(void *arg);
    805 static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
    806 static void	tcp_stack_fini(netstackid_t stackid, void *arg);
    807 static void	*tcp_g_kstat_init(tcp_g_stat_t *);
    808 static void	tcp_g_kstat_fini(kstat_t *);
    809 static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
    810 static void	tcp_kstat_fini(netstackid_t, kstat_t *);
    811 static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
    812 static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
    813 static int	tcp_kstat_update(kstat_t *kp, int rw);
    814 static mblk_t	*tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    815     ip_recv_attr_t *ira);
    816 static mblk_t	*tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    817     ip_recv_attr_t *ira);
    818 static int	tcp_squeue_switch(int);
    819 
    820 static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
    821 static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
    822 static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
    823 static int	tcp_tpi_close(queue_t *, int);
    824 static int	tcp_tpi_close_accept(queue_t *);
    825 
    826 static void	tcp_squeue_add(squeue_t *);
    827 static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
    828 
    829 extern void	tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
    830 
    831 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
    832 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
    833     ip_recv_attr_t *dummy);
    834 
    835 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
    836 	    sock_upper_handle_t, cred_t *);
    837 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
    838 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    839     boolean_t);
    840 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    841     cred_t *, pid_t);
    842 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    843     boolean_t);
    844 static int tcp_do_unbind(conn_t *);
    845 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    846     boolean_t);
    847 
    848 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
    849 
    850 /*
    851  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
    852  *
    853  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
    854  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
    855  * (defined in tcp.h) needs to be filled in and passed into the kernel
    856  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
    857  * structure contains the four-tuple of a TCP connection and a range of TCP
    858  * states (specified by ac_start and ac_end). The use of wildcard addresses
    859  * and ports is allowed. Connections with a matching four tuple and a state
    860  * within the specified range will be aborted. The valid states for the
    861  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
    862  * inclusive.
    863  *
    864  * An application which has its connection aborted by this ioctl will receive
    865  * an error that is dependent on the connection state at the time of the abort.
    866  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
    867  * though a RST packet has been received.  If the connection state is equal to
    868  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
    869  * and all resources associated with the connection will be freed.
    870  */
    871 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
    872 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
    873 static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
    874     ip_recv_attr_t *dummy);
    875 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
    876 static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
    877 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    878     boolean_t, tcp_stack_t *);
    879 
    880 static struct module_info tcp_rinfo =  {
    881 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
    882 };
    883 
    884 static struct module_info tcp_winfo =  {
    885 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
    886 };
    887 
    888 /*
    889  * Entry points for TCP as a device. The normal case which supports
    890  * the TCP functionality.
    891  * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
    892  */
    893 struct qinit tcp_rinitv4 = {
    894 	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
    895 };
    896 
    897 struct qinit tcp_rinitv6 = {
    898 	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
    899 };
    900 
    901 struct qinit tcp_winit = {
    902 	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    903 };
    904 
    905 /* Initial entry point for TCP in socket mode. */
    906 struct qinit tcp_sock_winit = {
    907 	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    908 };
    909 
    910 /* TCP entry point during fallback */
    911 struct qinit tcp_fallback_sock_winit = {
    912 	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
    913 };
    914 
    915 /*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
    917  * an accept. Avoid allocating data structures since eager has already
    918  * been created.
    919  */
    920 struct qinit tcp_acceptor_rinit = {
    921 	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
    922 };
    923 
    924 struct qinit tcp_acceptor_winit = {
    925 	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
    926 };
    927 
    928 /* For AF_INET aka /dev/tcp */
    929 struct streamtab tcpinfov4 = {
    930 	&tcp_rinitv4, &tcp_winit
    931 };
    932 
    933 /* For AF_INET6 aka /dev/tcp6 */
    934 struct streamtab tcpinfov6 = {
    935 	&tcp_rinitv6, &tcp_winit
    936 };
    937 
    938 sock_downcalls_t sock_tcp_downcalls;
    939 
/* Settable only in /etc/system. Move to ndd? */
    941 boolean_t tcp_icmp_source_quench = B_FALSE;
    942 
    943 /*
    944  * Following assumes TPI alignment requirements stay along 32 bit
    945  * boundaries
    946  */
    947 #define	ROUNDUP32(x) \
    948 	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
    949 
    950 /* Template for response to info request. */
    951 static struct T_info_ack tcp_g_t_info_ack = {
    952 	T_INFO_ACK,		/* PRIM_type */
    953 	0,			/* TSDU_size */
    954 	T_INFINITE,		/* ETSDU_size */
    955 	T_INVALID,		/* CDATA_size */
    956 	T_INVALID,		/* DDATA_size */
    957 	sizeof (sin_t),		/* ADDR_size */
    958 	0,			/* OPT_size - not initialized here */
    959 	TIDUSZ,			/* TIDU_size */
    960 	T_COTS_ORD,		/* SERV_type */
    961 	TCPS_IDLE,		/* CURRENT_state */
    962 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
    963 };
    964 
    965 static struct T_info_ack tcp_g_t_info_ack_v6 = {
    966 	T_INFO_ACK,		/* PRIM_type */
    967 	0,			/* TSDU_size */
    968 	T_INFINITE,		/* ETSDU_size */
    969 	T_INVALID,		/* CDATA_size */
    970 	T_INVALID,		/* DDATA_size */
    971 	sizeof (sin6_t),	/* ADDR_size */
    972 	0,			/* OPT_size - not initialized here */
    973 	TIDUSZ,		/* TIDU_size */
    974 	T_COTS_ORD,		/* SERV_type */
    975 	TCPS_IDLE,		/* CURRENT_state */
    976 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
    977 };
    978 
    979 #define	MS	1L
    980 #define	SECONDS	(1000 * MS)
    981 #define	MINUTES	(60 * SECONDS)
    982 #define	HOURS	(60 * MINUTES)
    983 #define	DAYS	(24 * HOURS)
    984 
    985 #define	PARAM_MAX (~(uint32_t)0)
    986 
    987 /* Max size IP datagram is 64k - 1 */
    988 #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
    989 #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
    990 /* Max of the above */
    991 #define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
    992 
    993 /* Largest TCP port number */
    994 #define	TCP_MAX_PORT	(64 * 1024 - 1)
    995 
    996 /*
    997  * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
    998  * layer header.  It has to be a multiple of 4.
    999  */
   1000 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
   1001 #define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
   1002 
   1003 /*
   1004  * All of these are alterable, within the min/max values given, at run time.
 * Note that the default value of "tcp_time_wait_interval" below is one
 * minute, not the four minutes (2 * MSL) suggested by the TCP spec.
   1007  */
   1008 /* BEGIN CSTYLED */
   1009 static tcpparam_t	lcl_tcp_param_arr[] = {
   1010  /*min		max		value		name */
   1011  { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
   1012  { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
   1013  { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
   1014  { 1,		1024,		1,		"tcp_conn_req_min" },
   1015  { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
   1016  { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
   1017  { 0,		10,		0,		"tcp_debug" },
   1018  { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
   1019  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
   1020  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
   1021  { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
   1022  { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
   1023  { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
   1024  { 1,		255,		64,		"tcp_ipv4_ttl"},
   1025  { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
   1026  { 0,		100,		10,		"tcp_maxpsz_multiplier" },
   1027  { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
   1028  { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
   1029  { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
   1030  { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
   1031  { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
   1032  { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
   1033  { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
   1034  { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
   1035  { 0,		16,		0,		"tcp_snd_lowat_fraction" },
   1036  { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
   1037  { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
   1038  { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
   1039  { 0,		1,		0,		"tcp_ignore_path_mtu" },
   1040  { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
   1041  { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
   1042  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
   1043  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
   1044  { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
   1045  { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
   1046  { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
   1047  { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
   1048 /*
   1049  * Question:  What default value should I set for tcp_strong_iss?
   1050  */
   1051  { 0,		2,		1,		"tcp_strong_iss"},
   1052  { 0,		65536,		20,		"tcp_rtt_updates"},
   1053  { 0,		1,		1,		"tcp_wscale_always"},
   1054  { 0,		1,		0,		"tcp_tstamp_always"},
   1055  { 0,		1,		1,		"tcp_tstamp_if_wscale"},
   1056  { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
   1057  { 0,		16,		2,		"tcp_deferred_acks_max"},
   1058  { 1,		16384,		4,		"tcp_slow_start_after_idle"},
   1059  { 1,		4,		4,		"tcp_slow_start_initial"},
   1060  { 0,		2,		2,		"tcp_sack_permitted"},
   1061  { 0,		1,		1,		"tcp_compression_enabled"},
   1062  { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
   1063  { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
   1064  { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
   1065  { 0,		1,		0,		"tcp_rev_src_routes"},
   1066  { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
   1067  { 0,		16,		8,		"tcp_local_dacks_max"},
   1068  { 0,		2,		1,		"tcp_ecn_permitted"},
   1069  { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
   1070  { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
   1071  { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
   1072  { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
   1073  { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
   1074  { 0,		1,		0,		"tcp_dev_flow_ctl"},
   1075 };
   1076 /* END CSTYLED */
   1077 
/* Round the value up to the next multiple of mss. */
   1079 #define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
   1080 
   1081 /*
   1082  * Set ECN capable transport (ECT) code point in IP header.
   1083  *
   1084  * Note that there are 2 ECT code points '01' and '10', which are called
   1085  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
   1086  * point ECT(0) for TCP as described in RFC 2481.
   1087  */
   1088 #define	SET_ECT(tcp, iph) \
   1089 	if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
   1090 		/* We need to clear the code point first. */ \
   1091 		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
   1092 		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
   1093 	} else { \
   1094 		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
   1095 		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
   1096 	}
   1097 
   1098 /*
   1099  * The format argument to pass to tcp_display().
   1100  * DISP_PORT_ONLY means that the returned string has only port info.
   1101  * DISP_ADDR_AND_PORT means that the returned string also contains the
   1102  * remote and local IP address.
   1103  */
   1104 #define	DISP_PORT_ONLY		1
   1105 #define	DISP_ADDR_AND_PORT	2
   1106 
   1107 #define	IS_VMLOANED_MBLK(mp) \
   1108 	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
   1109 
   1110 uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */
   1111 
   1112 /*
   1113  * Forces all connections to obey the value of the tcps_maxpsz_multiplier
   1114  * tunable settable via NDD.  Otherwise, the per-connection behavior is
   1115  * determined dynamically during tcp_set_destination(), which is the default.
   1116  */
   1117 boolean_t tcp_static_maxpsz = B_FALSE;
   1118 
/* Settable in /etc/system */
   1120 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
   1121 uint32_t tcp_random_anon_port = 1;
   1122 
   1123 /*
 * To quickly find an eager in Q0 which can be dropped when a new SYN
 * request arrives while Q0 is full, a separate doubly linked list is
 * kept. This list allows an eager to be selected from Q0 in O(1) time.
 * This is needed to avoid spending too much time walking through the
 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
 * this list has to be a member of Q0.
   1130  * This list is headed by listener's tcp_t. When the list is empty,
   1131  * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
   1132  * of listener's tcp_t point to listener's tcp_t itself.
   1133  *
   1134  * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
   1135  * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
   1136  * These macros do not affect the eager's membership to Q0.
   1137  */
   1138 
   1139 
   1140 #define	MAKE_DROPPABLE(listener, eager)					\
   1141 	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
   1142 		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
   1143 		    = (eager);						\
   1144 		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
   1145 		(eager)->tcp_eager_next_drop_q0 =			\
   1146 		    (listener)->tcp_eager_next_drop_q0;			\
   1147 		(listener)->tcp_eager_next_drop_q0 = (eager);		\
   1148 	}
   1149 
   1150 #define	MAKE_UNDROPPABLE(eager)						\
   1151 	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
   1152 		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
   1153 		    = (eager)->tcp_eager_prev_drop_q0;			\
   1154 		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
   1155 		    = (eager)->tcp_eager_next_drop_q0;			\
   1156 		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
   1157 		(eager)->tcp_eager_next_drop_q0 = NULL;			\
   1158 	}
   1159 
   1160 /*
   1161  * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
   1162  * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
   1163  * data, TCP will not respond with an ACK.  RFC 793 requires that
   1164  * TCP responds with an ACK for such a bogus ACK.  By not following
   1165  * the RFC, we prevent TCP from getting into an ACK storm if somehow
   1166  * an attacker successfully spoofs an acceptable segment to our
   1167  * peer; or when our peer is "confused."
   1168  */
   1169 uint32_t tcp_drop_ack_unsent_cnt = 10;
   1170 
   1171 /*
   1172  * Hook functions to enable cluster networking
   1173  * On non-clustered systems these vectors must always be NULL.
   1174  */
   1175 
   1176 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
   1177 			    sa_family_t addr_family, uint8_t *laddrp,
   1178 			    in_port_t lport, void *args) = NULL;
   1179 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
   1180 			    sa_family_t addr_family, uint8_t *laddrp,
   1181 			    in_port_t lport, void *args) = NULL;
   1182 
   1183 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
   1184 			    boolean_t is_outgoing,
   1185 			    sa_family_t addr_family,
   1186 			    uint8_t *laddrp, in_port_t lport,
   1187 			    uint8_t *faddrp, in_port_t fport,
   1188 			    void *args) = NULL;
   1189 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
   1190 			    sa_family_t addr_family, uint8_t *laddrp,
   1191 			    in_port_t lport, uint8_t *faddrp,
   1192 			    in_port_t fport, void *args) = NULL;
   1193 
   1194 
   1195 /*
   1196  * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err)
   1197  */
   1198 #define	CL_INET_CONNECT(connp, is_outgoing, err) {		\
   1199 	(err) = 0;						\
   1200 	if (cl_inet_connect2 != NULL) {				\
   1201 		/*						\
   1202 		 * Running in cluster mode - register active connection	\
   1203 		 * information						\
   1204 		 */							\
   1205 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1206 			if ((connp)->conn_laddr_v4 != 0) {		\
   1207 				(err) = (*cl_inet_connect2)(		\
   1208 				    (connp)->conn_netstack->netstack_stackid,\
   1209 				    IPPROTO_TCP, is_outgoing, AF_INET,	\
   1210 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1211 				    (in_port_t)(connp)->conn_lport,	\
   1212 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1213 				    (in_port_t)(connp)->conn_fport, NULL); \
   1214 			}						\
   1215 		} else {						\
   1216 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1217 			    &(connp)->conn_laddr_v6)) {			\
   1218 				(err) = (*cl_inet_connect2)(		\
   1219 				    (connp)->conn_netstack->netstack_stackid,\
   1220 				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
   1221 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1222 				    (in_port_t)(connp)->conn_lport,	\
   1223 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1224 				    (in_port_t)(connp)->conn_fport, NULL); \
   1225 			}						\
   1226 		}							\
   1227 	}								\
   1228 }
   1229 
   1230 #define	CL_INET_DISCONNECT(connp)	{				\
   1231 	if (cl_inet_disconnect != NULL) {				\
   1232 		/*							\
   1233 		 * Running in cluster mode - deregister active		\
   1234 		 * connection information				\
   1235 		 */							\
   1236 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
   1237 			if ((connp)->conn_laddr_v4 != 0) {		\
   1238 				(*cl_inet_disconnect)(			\
   1239 				    (connp)->conn_netstack->netstack_stackid,\
   1240 				    IPPROTO_TCP, AF_INET,		\
   1241 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
   1242 				    (in_port_t)(connp)->conn_lport,	\
   1243 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
   1244 				    (in_port_t)(connp)->conn_fport, NULL); \
   1245 			}						\
   1246 		} else {						\
   1247 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1248 			    &(connp)->conn_laddr_v6)) {			\
   1249 				(*cl_inet_disconnect)(			\
   1250 				    (connp)->conn_netstack->netstack_stackid,\
   1251 				    IPPROTO_TCP, AF_INET6,		\
   1252 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
   1253 				    (in_port_t)(connp)->conn_lport,	\
   1254 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
   1255 				    (in_port_t)(connp)->conn_fport, NULL); \
   1256 			}						\
   1257 		}							\
   1258 	}								\
   1259 }
   1260 
   1261 /*
   1262  * Cluster networking hook for traversing current connection list.
   1263  * This routine is used to extract the current list of live connections
 * which must continue to be dispatched to this node.
   1265  */
   1266 int cl_tcp_walk_list(netstackid_t stack_id,
   1267     int (*callback)(cl_tcp_info_t *, void *), void *arg);
   1268 
   1269 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
   1270     void *arg, tcp_stack_t *tcps);
   1271 
   1272 static void
   1273 tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
   1274 {
   1275 	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
   1276 
   1277 	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
   1278 		conn_t *connp = tcp->tcp_connp;
   1279 		struct sock_proto_props sopp;
   1280 
   1281 		/*
   1282 		 * only increase rcvthresh upto default_threshold
   1283 		 */
   1284 		if (new_rcvthresh > default_threshold)
   1285 			new_rcvthresh = default_threshold;
   1286 
   1287 		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
   1288 		sopp.sopp_rcvthresh = new_rcvthresh;
   1289 
   1290 		(*connp->conn_upcalls->su_set_proto_props)
   1291 		    (connp->conn_upper_handle, &sopp);
   1292 	}
   1293 }
   1294 /*
   1295  * Figure out the value of window scale opton.  Note that the rwnd is
   1296  * ASSUMED to be rounded up to the nearest MSS before the calculation.
   1297  * We cannot find the scale value and then do a round up of tcp_rwnd
   1298  * because the scale value may not be correct after that.
   1299  *
   1300  * Set the compiler flag to make this function inline.
   1301  */
   1302 static void
   1303 tcp_set_ws_value(tcp_t *tcp)
   1304 {
   1305 	int i;
   1306 	uint32_t rwnd = tcp->tcp_rwnd;
   1307 
   1308 	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
   1309 	    i++, rwnd >>= 1)
   1310 		;
   1311 	tcp->tcp_rcv_ws = i;
   1312 }
   1313 
   1314 /*
   1315  * Remove a connection from the list of detached TIME_WAIT connections.
   1316  * It returns B_FALSE if it can't remove the connection from the list
   1317  * as the connection has already been removed from the list due to an
   1318  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
   1319  */
   1320 static boolean_t
   1321 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
   1322 {
   1323 	boolean_t	locked = B_FALSE;
   1324 
   1325 	if (tcp_time_wait == NULL) {
   1326 		tcp_time_wait = *((tcp_squeue_priv_t **)
   1327 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
   1328 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1329 		locked = B_TRUE;
   1330 	} else {
   1331 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
   1332 	}
   1333 
   1334 	if (tcp->tcp_time_wait_expire == 0) {
   1335 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1336 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1337 		if (locked)
   1338 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1339 		return (B_FALSE);
   1340 	}
   1341 	ASSERT(TCP_IS_DETACHED(tcp));
   1342 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1343 
   1344 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
   1345 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1346 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
   1347 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
   1348 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
   1349 			    NULL;
   1350 		} else {
   1351 			tcp_time_wait->tcp_time_wait_tail = NULL;
   1352 		}
   1353 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
   1354 		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
   1355 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1356 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
   1357 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1358 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
   1359 	} else {
   1360 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
   1361 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
   1362 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
   1363 		    tcp->tcp_time_wait_next;
   1364 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
   1365 		    tcp->tcp_time_wait_prev;
   1366 	}
   1367 	tcp->tcp_time_wait_next = NULL;
   1368 	tcp->tcp_time_wait_prev = NULL;
   1369 	tcp->tcp_time_wait_expire = 0;
   1370 
   1371 	if (locked)
   1372 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1373 	return (B_TRUE);
   1374 }
   1375 
   1376 /*
   1377  * Add a connection to the list of detached TIME_WAIT connections
   1378  * and set its time to expire.
   1379  */
   1380 static void
   1381 tcp_time_wait_append(tcp_t *tcp)
   1382 {
   1383 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1384 	tcp_squeue_priv_t *tcp_time_wait =
   1385 	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
   1386 	    SQPRIVATE_TCP));
   1387 
   1388 	tcp_timers_stop(tcp);
   1389 
   1390 	/* Freed above */
   1391 	ASSERT(tcp->tcp_timer_tid == 0);
   1392 	ASSERT(tcp->tcp_ack_tid == 0);
   1393 
   1394 	/* must have happened at the time of detaching the tcp */
   1395 	ASSERT(tcp->tcp_ptpahn == NULL);
   1396 	ASSERT(tcp->tcp_flow_stopped == 0);
   1397 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1398 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1399 	ASSERT(tcp->tcp_time_wait_expire == NULL);
   1400 	ASSERT(tcp->tcp_listener == NULL);
   1401 
   1402 	tcp->tcp_time_wait_expire = ddi_get_lbolt();
   1403 	/*
   1404 	 * The value computed below in tcp->tcp_time_wait_expire may
   1405 	 * appear negative or wrap around. That is ok since our
   1406 	 * interest is only in the difference between the current lbolt
   1407 	 * value and tcp->tcp_time_wait_expire. But the value should not
   1408 	 * be zero, since it means the tcp is not in the TIME_WAIT list.
   1409 	 * The corresponding comparison in tcp_time_wait_collector() uses
   1410 	 * modular arithmetic.
   1411 	 */
   1412 	tcp->tcp_time_wait_expire +=
   1413 	    drv_usectohz(tcps->tcps_time_wait_interval * 1000);
   1414 	if (tcp->tcp_time_wait_expire == 0)
   1415 		tcp->tcp_time_wait_expire = 1;
   1416 
   1417 	ASSERT(TCP_IS_DETACHED(tcp));
   1418 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1419 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1420 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1421 	TCP_DBGSTAT(tcps, tcp_time_wait);
   1422 
   1423 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1424 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
   1425 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
   1426 		tcp_time_wait->tcp_time_wait_head = tcp;
   1427 	} else {
   1428 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1429 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
   1430 		    TCPS_TIME_WAIT);
   1431 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
   1432 		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
   1433 	}
   1434 	tcp_time_wait->tcp_time_wait_tail = tcp;
   1435 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1436 }
   1437 
   1438 /* ARGSUSED */
   1439 void
   1440 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   1441 {
   1442 	conn_t	*connp = (conn_t *)arg;
   1443 	tcp_t	*tcp = connp->conn_tcp;
   1444 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1445 
   1446 	ASSERT(tcp != NULL);
   1447 	if (tcp->tcp_state == TCPS_CLOSED) {
   1448 		return;
   1449 	}
   1450 
   1451 	ASSERT((connp->conn_family == AF_INET &&
   1452 	    connp->conn_ipversion == IPV4_VERSION) ||
   1453 	    (connp->conn_family == AF_INET6 &&
   1454 	    (connp->conn_ipversion == IPV4_VERSION ||
   1455 	    connp->conn_ipversion == IPV6_VERSION)));
   1456 	ASSERT(!tcp->tcp_listener);
   1457 
   1458 	TCP_STAT(tcps, tcp_time_wait_reap);
   1459 	ASSERT(TCP_IS_DETACHED(tcp));
   1460 
   1461 	/*
   1462 	 * Because they have no upstream client to rebind or tcp_close()
   1463 	 * them later, we axe the connection here and now.
   1464 	 */
   1465 	tcp_close_detached(tcp);
   1466 }
   1467 
   1468 /*
   1469  * Remove cached/latched IPsec references.
   1470  */
   1471 void
   1472 tcp_ipsec_cleanup(tcp_t *tcp)
   1473 {
   1474 	conn_t		*connp = tcp->tcp_connp;
   1475 
   1476 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1477 
   1478 	if (connp->conn_latch != NULL) {
   1479 		IPLATCH_REFRELE(connp->conn_latch);
   1480 		connp->conn_latch = NULL;
   1481 	}
   1482 	if (connp->conn_latch_in_policy != NULL) {
   1483 		IPPOL_REFRELE(connp->conn_latch_in_policy);
   1484 		connp->conn_latch_in_policy = NULL;
   1485 	}
   1486 	if (connp->conn_latch_in_action != NULL) {
   1487 		IPACT_REFRELE(connp->conn_latch_in_action);
   1488 		connp->conn_latch_in_action = NULL;
   1489 	}
   1490 	if (connp->conn_policy != NULL) {
   1491 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
   1492 		connp->conn_policy = NULL;
   1493 	}
   1494 }
   1495 
   1496 /*
   1497  * Cleaup before placing on free list.
   1498  * Disassociate from the netstack/tcp_stack_t since the freelist
   1499  * is per squeue and not per netstack.
   1500  */
   1501 void
   1502 tcp_cleanup(tcp_t *tcp)
   1503 {
   1504 	mblk_t		*mp;
   1505 	tcp_sack_info_t	*tcp_sack_info;
   1506 	conn_t		*connp = tcp->tcp_connp;
   1507 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1508 	netstack_t	*ns = tcps->tcps_netstack;
   1509 	mblk_t		*tcp_rsrv_mp;
   1510 
   1511 	tcp_bind_hash_remove(tcp);
   1512 
   1513 	/* Cleanup that which needs the netstack first */
   1514 	tcp_ipsec_cleanup(tcp);
   1515 	ixa_cleanup(connp->conn_ixa);
   1516 
   1517 	if (connp->conn_ht_iphc != NULL) {
   1518 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   1519 		connp->conn_ht_iphc = NULL;
   1520 		connp->conn_ht_iphc_allocated = 0;
   1521 		connp->conn_ht_iphc_len = 0;
   1522 		connp->conn_ht_ulp = NULL;
   1523 		connp->conn_ht_ulp_len = 0;
   1524 		tcp->tcp_ipha = NULL;
   1525 		tcp->tcp_ip6h = NULL;
   1526 		tcp->tcp_tcpha = NULL;
   1527 	}
   1528 
   1529 	/* We clear any IP_OPTIONS and extension headers */
   1530 	ip_pkt_free(&connp->conn_xmit_ipp);
   1531 
   1532 	tcp_free(tcp);
   1533 
   1534 	/* Release any SSL context */
   1535 	if (tcp->tcp_kssl_ent != NULL) {
   1536 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   1537 		tcp->tcp_kssl_ent = NULL;
   1538 	}
   1539 
   1540 	if (tcp->tcp_kssl_ctx != NULL) {
   1541 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   1542 		tcp->tcp_kssl_ctx = NULL;
   1543 	}
   1544 	tcp->tcp_kssl_pending = B_FALSE;
   1545 
   1546 	/*
   1547 	 * Since we will bzero the entire structure, we need to
   1548 	 * remove it and reinsert it in global hash list. We
   1549 	 * know the walkers can't get to this conn because we
   1550 	 * had set CONDEMNED flag earlier and checked reference
   1551 	 * under conn_lock so walker won't pick it and when we
   1552 	 * go the ipcl_globalhash_remove() below, no walker
   1553 	 * can get to it.
   1554 	 */
   1555 	ipcl_globalhash_remove(connp);
   1556 
   1557 	/* Save some state */
   1558 	mp = tcp->tcp_timercache;
   1559 
   1560 	tcp_sack_info = tcp->tcp_sack_info;
   1561 	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
   1562 
   1563 	if (connp->conn_cred != NULL) {
   1564 		crfree(connp->conn_cred);
   1565 		connp->conn_cred = NULL;
   1566 	}
   1567 	ipcl_conn_cleanup(connp);
   1568 	connp->conn_flags = IPCL_TCPCONN;
   1569 
   1570 	/*
   1571 	 * Now it is safe to decrement the reference counts.
   1572 	 * This might be the last reference on the netstack
   1573 	 * in which case it will cause the freeing of the IP Instance.
   1574 	 */
   1575 	connp->conn_netstack = NULL;
   1576 	connp->conn_ixa->ixa_ipst = NULL;
   1577 	netstack_rele(ns);
   1578 	ASSERT(tcps != NULL);
   1579 	tcp->tcp_tcps = NULL;
   1580 
   1581 	bzero(tcp, sizeof (tcp_t));
   1582 
   1583 	/* restore the state */
   1584 	tcp->tcp_timercache = mp;
   1585 
   1586 	tcp->tcp_sack_info = tcp_sack_info;
   1587 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   1588 
   1589 	tcp->tcp_connp = connp;
   1590 
   1591 	ASSERT(connp->conn_tcp == tcp);
   1592 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1593 	connp->conn_state_flags = CONN_INCIPIENT;
   1594 	ASSERT(connp->conn_proto == IPPROTO_TCP);
   1595 	ASSERT(connp->conn_ref == 1);
   1596 }
   1597 
   1598 /*
   1599  * Blows away all tcps whose TIME_WAIT has expired. List traversal
   1600  * is done forwards from the head.
   1601  * This walks all stack instances since
   1602  * tcp_time_wait remains global across all stacks.
   1603  */
   1604 /* ARGSUSED */
   1605 void
   1606 tcp_time_wait_collector(void *arg)
   1607 {
   1608 	tcp_t *tcp;
   1609 	clock_t now;
   1610 	mblk_t *mp;
   1611 	conn_t *connp;
   1612 	kmutex_t *lock;
   1613 	boolean_t removed;
   1614 
   1615 	squeue_t *sqp = (squeue_t *)arg;
   1616 	tcp_squeue_priv_t *tcp_time_wait =
   1617 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   1618 
   1619 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1620 	tcp_time_wait->tcp_time_wait_tid = 0;
   1621 
   1622 	if (tcp_time_wait->tcp_free_list != NULL &&
   1623 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
   1624 		TCP_G_STAT(tcp_freelist_cleanup);
   1625 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
   1626 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   1627 			tcp->tcp_time_wait_next = NULL;
   1628 			tcp_time_wait->tcp_free_list_cnt--;
   1629 			ASSERT(tcp->tcp_tcps == NULL);
   1630 			CONN_DEC_REF(tcp->tcp_connp);
   1631 		}
   1632 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
   1633 	}
   1634 
   1635 	/*
   1636 	 * In order to reap time waits reliably, we should use a
   1637 	 * source of time that is not adjustable by the user -- hence
   1638 	 * the call to ddi_get_lbolt().
   1639 	 */
   1640 	now = ddi_get_lbolt();
   1641 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
   1642 		/*
   1643 		 * Compare times using modular arithmetic, since
   1644 		 * lbolt can wrapover.
   1645 		 */
   1646 		if ((now - tcp->tcp_time_wait_expire) < 0) {
   1647 			break;
   1648 		}
   1649 
   1650 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
   1651 		ASSERT(removed);
   1652 
   1653 		connp = tcp->tcp_connp;
   1654 		ASSERT(connp->conn_fanout != NULL);
   1655 		lock = &connp->conn_fanout->connf_lock;
   1656 		/*
   1657 		 * This is essentially a TW reclaim fast path optimization for
   1658 		 * performance where the timewait collector checks under the
   1659 		 * fanout lock (so that no one else can get access to the
   1660 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
   1661 		 * the classifier hash list. If ref count is indeed 2, we can
   1662 		 * just remove the conn under the fanout lock and avoid
   1663 		 * cleaning up the conn under the squeue, provided that
   1664 		 * clustering callbacks are not enabled. If clustering is
   1665 		 * enabled, we need to make the clustering callback before
   1666 		 * setting the CONDEMNED flag and after dropping all locks and
   1667 		 * so we forego this optimization and fall back to the slow
   1668 		 * path. Also please see the comments in tcp_closei_local
   1669 		 * regarding the refcnt logic.
   1670 		 *
   1671 		 * Since we are holding the tcp_time_wait_lock, its better
   1672 		 * not to block on the fanout_lock because other connections
   1673 		 * can't add themselves to time_wait list. So we do a
   1674 		 * tryenter instead of mutex_enter.
   1675 		 */
   1676 		if (mutex_tryenter(lock)) {
   1677 			mutex_enter(&connp->conn_lock);
   1678 			if ((connp->conn_ref == 2) &&
   1679 			    (cl_inet_disconnect == NULL)) {
   1680 				ipcl_hash_remove_locked(connp,
   1681 				    connp->conn_fanout);
   1682 				/*
   1683 				 * Set the CONDEMNED flag now itself so that
   1684 				 * the refcnt cannot increase due to any
   1685 				 * walker.
   1686 				 */
   1687 				connp->conn_state_flags |= CONN_CONDEMNED;
   1688 				mutex_exit(lock);
   1689 				mutex_exit(&connp->conn_lock);
   1690 				if (tcp_time_wait->tcp_free_list_cnt <
   1691 				    tcp_free_list_max_cnt) {
   1692 					/* Add to head of tcp_free_list */
   1693 					mutex_exit(
   1694 					    &tcp_time_wait->tcp_time_wait_lock);
   1695 					tcp_cleanup(tcp);
   1696 					ASSERT(connp->conn_latch == NULL);
   1697 					ASSERT(connp->conn_policy == NULL);
   1698 					ASSERT(tcp->tcp_tcps == NULL);
   1699 					ASSERT(connp->conn_netstack == NULL);
   1700 
   1701 					mutex_enter(
   1702 					    &tcp_time_wait->tcp_time_wait_lock);
   1703 					tcp->tcp_time_wait_next =
   1704 					    tcp_time_wait->tcp_free_list;
   1705 					tcp_time_wait->tcp_free_list = tcp;
   1706 					tcp_time_wait->tcp_free_list_cnt++;
   1707 					continue;
   1708 				} else {
   1709 					/* Do not add to tcp_free_list */
   1710 					mutex_exit(
   1711 					    &tcp_time_wait->tcp_time_wait_lock);
   1712 					tcp_bind_hash_remove(tcp);
   1713 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
   1714 					tcp_ipsec_cleanup(tcp);
   1715 					CONN_DEC_REF(tcp->tcp_connp);
   1716 				}
   1717 			} else {
   1718 				CONN_INC_REF_LOCKED(connp);
   1719 				mutex_exit(lock);
   1720 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1721 				mutex_exit(&connp->conn_lock);
   1722 				/*
   1723 				 * We can reuse the closemp here since conn has
   1724 				 * detached (otherwise we wouldn't even be in
   1725 				 * time_wait list). tcp_closemp_used can safely
   1726 				 * be changed without taking a lock as no other
   1727 				 * thread can concurrently access it at this
   1728 				 * point in the connection lifecycle.
   1729 				 */
   1730 
   1731 				if (tcp->tcp_closemp.b_prev == NULL)
   1732 					tcp->tcp_closemp_used = B_TRUE;
   1733 				else
   1734 					cmn_err(CE_PANIC,
   1735 					    "tcp_timewait_collector: "
   1736 					    "concurrent use of tcp_closemp: "
   1737 					    "connp %p tcp %p\n", (void *)connp,
   1738 					    (void *)tcp);
   1739 
   1740 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1741 				mp = &tcp->tcp_closemp;
   1742 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1743 				    tcp_timewait_output, connp, NULL,
   1744 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1745 			}
   1746 		} else {
   1747 			mutex_enter(&connp->conn_lock);
   1748 			CONN_INC_REF_LOCKED(connp);
   1749 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1750 			mutex_exit(&connp->conn_lock);
   1751 			/*
   1752 			 * We can reuse the closemp here since conn has
   1753 			 * detached (otherwise we wouldn't even be in
   1754 			 * time_wait list). tcp_closemp_used can safely
   1755 			 * be changed without taking a lock as no other
   1756 			 * thread can concurrently access it at this
   1757 			 * point in the connection lifecycle.
   1758 			 */
   1759 
   1760 			if (tcp->tcp_closemp.b_prev == NULL)
   1761 				tcp->tcp_closemp_used = B_TRUE;
   1762 			else
   1763 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
   1764 				    "concurrent use of tcp_closemp: "
   1765 				    "connp %p tcp %p\n", (void *)connp,
   1766 				    (void *)tcp);
   1767 
   1768 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1769 			mp = &tcp->tcp_closemp;
   1770 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1771 			    tcp_timewait_output, connp, NULL,
   1772 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1773 		}
   1774 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1775 	}
   1776 
   1777 	if (tcp_time_wait->tcp_free_list != NULL)
   1778 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
   1779 
   1780 	tcp_time_wait->tcp_time_wait_tid =
   1781 	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
   1782 	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
   1783 	    CALLOUT_FLAG_ROUNDUP);
   1784 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1785 }
   1786 
   1787 /*
   1788  * Reply to a clients T_CONN_RES TPI message. This function
   1789  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
   1790  * on the acceptor STREAM and processed in tcp_accept_common().
   1791  * Read the block comment on top of tcp_input_listener().
   1792  */
   1793 static void
   1794 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
   1795 {
   1796 	tcp_t		*acceptor;
   1797 	tcp_t		*eager;
   1798 	tcp_t   	*tcp;
   1799 	struct T_conn_res	*tcr;
   1800 	t_uscalar_t	acceptor_id;
   1801 	t_scalar_t	seqnum;
   1802 	mblk_t		*discon_mp = NULL;
   1803 	mblk_t		*ok_mp;
   1804 	mblk_t		*mp1;
   1805 	tcp_stack_t	*tcps = listener->tcp_tcps;
   1806 	conn_t		*econnp;
   1807 
   1808 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   1809 		tcp_err_ack(listener, mp, TPROTO, 0);
   1810 		return;
   1811 	}
   1812 	tcr = (struct T_conn_res *)mp->b_rptr;
   1813 
   1814 	/*
   1815 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
   1816 	 * read side queue of the streams device underneath us i.e. the
   1817 	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
   1818 	 * look it up in the queue_hash.  Under LP64 it sends down the
   1819 	 * minor_t of the accepting endpoint.
   1820 	 *
   1821 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
   1822 	 * fanout hash lock is held.
   1823 	 * This prevents any thread from entering the acceptor queue from
   1824 	 * below (since it has not been hard bound yet i.e. any inbound
   1825 	 * packets will arrive on the listener conn_t and
   1826 	 * go through the classifier).
   1827 	 * The CONN_INC_REF will prevent the acceptor from closing.
   1828 	 *
   1829 	 * XXX It is still possible for a tli application to send down data
   1830 	 * on the accepting stream while another thread calls t_accept.
   1831 	 * This should not be a problem for well-behaved applications since
   1832 	 * the T_OK_ACK is sent after the queue swapping is completed.
   1833 	 *
   1834 	 * If the accepting fd is the same as the listening fd, avoid
   1835 	 * queue hash lookup since that will return an eager listener in a
   1836 	 * already established state.
   1837 	 */
   1838 	acceptor_id = tcr->ACCEPTOR_id;
   1839 	mutex_enter(&listener->tcp_eager_lock);
   1840 	if (listener->tcp_acceptor_id == acceptor_id) {
   1841 		eager = listener->tcp_eager_next_q;
   1842 		/* only count how many T_CONN_INDs so don't count q0 */
   1843 		if ((listener->tcp_conn_req_cnt_q != 1) ||
   1844 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
   1845 			mutex_exit(&listener->tcp_eager_lock);
   1846 			tcp_err_ack(listener, mp, TBADF, 0);
   1847 			return;
   1848 		}
   1849 		if (listener->tcp_conn_req_cnt_q0 != 0) {
   1850 			/* Throw away all the eagers on q0. */
   1851 			tcp_eager_cleanup(listener, 1);
   1852 		}
   1853 		if (listener->tcp_syn_defense) {
   1854 			listener->tcp_syn_defense = B_FALSE;
   1855 			if (listener->tcp_ip_addr_cache != NULL) {
   1856 				kmem_free(listener->tcp_ip_addr_cache,
   1857 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   1858 				listener->tcp_ip_addr_cache = NULL;
   1859 			}
   1860 		}
   1861 		/*
   1862 		 * Transfer tcp_conn_req_max to the eager so that when
   1863 		 * a disconnect occurs we can revert the endpoint to the
   1864 		 * listen state.
   1865 		 */
   1866 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
   1867 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
   1868 		/*
   1869 		 * Get a reference on the acceptor just like the
   1870 		 * tcp_acceptor_hash_lookup below.
   1871 		 */
   1872 		acceptor = listener;
   1873 		CONN_INC_REF(acceptor->tcp_connp);
   1874 	} else {
   1875 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
   1876 		if (acceptor == NULL) {
   1877 			if (listener->tcp_connp->conn_debug) {
   1878 				(void) strlog(TCP_MOD_ID, 0, 1,
   1879 				    SL_ERROR|SL_TRACE,
   1880 				    "tcp_accept: did not find acceptor 0x%x\n",
   1881 				    acceptor_id);
   1882 			}
   1883 			mutex_exit(&listener->tcp_eager_lock);
   1884 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
   1885 			return;
   1886 		}
   1887 		/*
   1888 		 * Verify acceptor state. The acceptable states for an acceptor
   1889 		 * include TCPS_IDLE and TCPS_BOUND.
   1890 		 */
   1891 		switch (acceptor->tcp_state) {
   1892 		case TCPS_IDLE:
   1893 			/* FALLTHRU */
   1894 		case TCPS_BOUND:
   1895 			break;
   1896 		default:
   1897 			CONN_DEC_REF(acceptor->tcp_connp);
   1898 			mutex_exit(&listener->tcp_eager_lock);
   1899 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1900 			return;
   1901 		}
   1902 	}
   1903 
   1904 	/* The listener must be in TCPS_LISTEN */
   1905 	if (listener->tcp_state != TCPS_LISTEN) {
   1906 		CONN_DEC_REF(acceptor->tcp_connp);
   1907 		mutex_exit(&listener->tcp_eager_lock);
   1908 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1909 		return;
   1910 	}
   1911 
   1912 	/*
   1913 	 * Rendezvous with an eager connection request packet hanging off
   1914 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
   1915 	 * tcp structure when the connection packet arrived in
   1916 	 * tcp_input_listener().
   1917 	 */
   1918 	seqnum = tcr->SEQ_number;
   1919 	eager = listener;
   1920 	do {
   1921 		eager = eager->tcp_eager_next_q;
   1922 		if (eager == NULL) {
   1923 			CONN_DEC_REF(acceptor->tcp_connp);
   1924 			mutex_exit(&listener->tcp_eager_lock);
   1925 			tcp_err_ack(listener, mp, TBADSEQ, 0);
   1926 			return;
   1927 		}
   1928 	} while (eager->tcp_conn_req_seqnum != seqnum);
   1929 	mutex_exit(&listener->tcp_eager_lock);
   1930 
   1931 	/*
   1932 	 * At this point, both acceptor and listener have 2 ref
   1933 	 * that they begin with. Acceptor has one additional ref
   1934 	 * we placed in lookup while listener has 3 additional
   1935 	 * ref for being behind the squeue (tcp_accept() is
   1936 	 * done on listener's squeue); being in classifier hash;
   1937 	 * and eager's ref on listener.
   1938 	 */
   1939 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   1940 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
   1941 
   1942 	/*
   1943 	 * The eager at this point is set in its own squeue and
   1944 	 * could easily have been killed (tcp_accept_finish will
   1945 	 * deal with that) because of a TH_RST so we can only
   1946 	 * ASSERT for a single ref.
   1947 	 */
   1948 	ASSERT(eager->tcp_connp->conn_ref >= 1);
   1949 
   1950 	/*
   1951 	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
   1952 	 * use it if something failed.
   1953 	 */
   1954 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
   1955 	    sizeof (struct stroptions)), BPRI_HI);
   1956 	if (discon_mp == NULL) {
   1957 		CONN_DEC_REF(acceptor->tcp_connp);
   1958 		CONN_DEC_REF(eager->tcp_connp);
   1959 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   1960 		return;
   1961 	}
   1962 
   1963 	econnp = eager->tcp_connp;
   1964 
   1965 	/* Hold a copy of mp, in case reallocb fails */
   1966 	if ((mp1 = copymsg(mp)) == NULL) {
   1967 		CONN_DEC_REF(acceptor->tcp_connp);
   1968 		CONN_DEC_REF(eager->tcp_connp);
   1969 		freemsg(discon_mp);
   1970 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   1971 		return;
   1972 	}
   1973 
   1974 	tcr = (struct T_conn_res *)mp1->b_rptr;
   1975 
   1976 	/*
   1977 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
   1978 	 * which allocates a larger mblk and appends the new
   1979 	 * local address to the ok_ack.  The address is copied by
   1980 	 * soaccept() for getsockname().
   1981 	 */
   1982 	{
   1983 		int extra;
   1984 
   1985 		extra = (econnp->conn_family == AF_INET) ?
   1986 		    sizeof (sin_t) : sizeof (sin6_t);
   1987 
   1988 		/*
   1989 		 * Try to re-use mp, if possible.  Otherwise, allocate
   1990 		 * an mblk and return it as ok_mp.  In any case, mp
   1991 		 * is no longer usable upon return.
   1992 		 */
   1993 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
   1994 			CONN_DEC_REF(acceptor->tcp_connp);
   1995 			CONN_DEC_REF(eager->tcp_connp);
   1996 			freemsg(discon_mp);
   1997 			/* Original mp has been freed by now, so use mp1 */
   1998 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
   1999 			return;
   2000 		}
   2001 
   2002 		mp = NULL;	/* We should never use mp after this point */
   2003 
   2004 		switch (extra) {
   2005 		case sizeof (sin_t): {
   2006 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
   2007 
   2008 			ok_mp->b_wptr += extra;
   2009 			sin->sin_family = AF_INET;
   2010 			sin->sin_port = econnp->conn_lport;
   2011 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
   2012 			break;
   2013 		}
   2014 		case sizeof (sin6_t): {
   2015 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
   2016 
   2017 			ok_mp->b_wptr += extra;
   2018 			sin6->sin6_family = AF_INET6;
   2019 			sin6->sin6_port = econnp->conn_lport;
   2020 			sin6->sin6_addr = econnp->conn_laddr_v6;
   2021 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
   2022 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
   2023 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
   2024 				sin6->sin6_scope_id =
   2025 				    econnp->conn_ixa->ixa_scopeid;
   2026 			} else {
   2027 				sin6->sin6_scope_id = 0;
   2028 			}
   2029 			sin6->__sin6_src_id = 0;
   2030 			break;
   2031 		}
   2032 		default:
   2033 			break;
   2034 		}
   2035 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
   2036 	}
   2037 
   2038 	/*
   2039 	 * If there are no options we know that the T_CONN_RES will
   2040 	 * succeed. However, we can't send the T_OK_ACK upstream until
   2041 	 * the tcp_accept_swap is done since it would be dangerous to
   2042 	 * let the application start using the new fd prior to the swap.
   2043 	 */
   2044 	tcp_accept_swap(listener, acceptor, eager);
   2045 
   2046 	/*
   2047 	 * tcp_accept_swap unlinks eager from listener but does not drop
   2048 	 * the eager's reference on the listener.
   2049 	 */
   2050 	ASSERT(eager->tcp_listener == NULL);
   2051 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2052 
   2053 	/*
   2054 	 * The eager is now associated with its own queue. Insert in
   2055 	 * the hash so that the connection can be reused for a future
   2056 	 * T_CONN_RES.
   2057 	 */
   2058 	tcp_acceptor_hash_insert(acceptor_id, eager);
   2059 
   2060 	/*
   2061 	 * We now do the processing of options with T_CONN_RES.
   2062 	 * We delay till now since we wanted to have queue to pass to
   2063 	 * option processing routines that points back to the right
   2064 	 * instance structure which does not happen until after
   2065 	 * tcp_accept_swap().
   2066 	 *
   2067 	 * Note:
   2068 	 * The sanity of the logic here assumes that whatever options
   2069 	 * are appropriate to inherit from listner=>eager are done
   2070 	 * before this point, and whatever were to be overridden (or not)
   2071 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
   2072 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
   2073 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
   2074 	 * This may not be true at this point in time but can be fixed
   2075 	 * independently. This option processing code starts with
   2076 	 * the instantiated acceptor instance and the final queue at
   2077 	 * this point.
   2078 	 */
   2079 
   2080 	if (tcr->OPT_length != 0) {
   2081 		/* Options to process */
   2082 		int t_error = 0;
   2083 		int sys_error = 0;
   2084 		int do_disconnect = 0;
   2085 
   2086 		if (tcp_conprim_opt_process(eager, mp1,
   2087 		    &do_disconnect, &t_error, &sys_error) < 0) {
   2088 			eager->tcp_accept_error = 1;
   2089 			if (do_disconnect) {
   2090 				/*
   2091 				 * An option failed which does not allow
   2092 				 * connection to be accepted.
   2093 				 *
   2094 				 * We allow T_CONN_RES to succeed and
   2095 				 * put a T_DISCON_IND on the eager queue.
   2096 				 */
   2097 				ASSERT(t_error == 0 && sys_error == 0);
   2098 				eager->tcp_send_discon_ind = 1;
   2099 			} else {
   2100 				ASSERT(t_error != 0);
   2101 				freemsg(ok_mp);
   2102 				/*
   2103 				 * Original mp was either freed or set
   2104 				 * to ok_mp above, so use mp1 instead.
   2105 				 */
   2106 				tcp_err_ack(listener, mp1, t_error, sys_error);
   2107 				goto finish;
   2108 			}
   2109 		}
   2110 		/*
   2111 		 * Most likely success in setting options (except if
   2112 		 * eager->tcp_send_discon_ind set).
   2113 		 * mp1 option buffer represented by OPT_length/offset
   2114 		 * potentially modified and contains results of setting
   2115 		 * options at this point
   2116 		 */
   2117 	}
   2118 
   2119 	/* We no longer need mp1, since all options processing has passed */
   2120 	freemsg(mp1);
   2121 
   2122 	putnext(listener->tcp_connp->conn_rq, ok_mp);
   2123 
   2124 	mutex_enter(&listener->tcp_eager_lock);
   2125 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
   2126 		tcp_t	*tail;
   2127 		mblk_t	*conn_ind;
   2128 
   2129 		/*
   2130 		 * This path should not be executed if listener and
   2131 		 * acceptor streams are the same.
   2132 		 */
   2133 		ASSERT(listener != acceptor);
   2134 
   2135 		tcp = listener->tcp_eager_prev_q0;
   2136 		/*
   2137 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
   2138 		 * deferred T_conn_ind queue. We need to get to the head of
   2139 		 * the queue in order to send up T_conn_ind the same order as
   2140 		 * how the 3WHS is completed.
   2141 		 */
   2142 		while (tcp != listener) {
   2143 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
   2144 				break;
   2145 			else
   2146 				tcp = tcp->tcp_eager_prev_q0;
   2147 		}
   2148 		ASSERT(tcp != listener);
   2149 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
   2150 		ASSERT(conn_ind != NULL);
   2151 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
   2152 
   2153 		/* Move from q0 to q */
   2154 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   2155 		listener->tcp_conn_req_cnt_q0--;
   2156 		listener->tcp_conn_req_cnt_q++;
   2157 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   2158 		    tcp->tcp_eager_prev_q0;
   2159 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   2160 		    tcp->tcp_eager_next_q0;
   2161 		tcp->tcp_eager_prev_q0 = NULL;
   2162 		tcp->tcp_eager_next_q0 = NULL;
   2163 		tcp->tcp_conn_def_q0 = B_FALSE;
   2164 
   2165 		/* Make sure the tcp isn't in the list of droppables */
   2166 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
   2167 		    tcp->tcp_eager_prev_drop_q0 == NULL);
   2168 
   2169 		/*
   2170 		 * Insert at end of the queue because sockfs sends
   2171 		 * down T_CONN_RES in chronological order. Leaving
   2172 		 * the older conn indications at front of the queue
   2173 		 * helps reducing search time.
   2174 		 */
   2175 		tail = listener->tcp_eager_last_q;
   2176 		if (tail != NULL)
   2177 			tail->tcp_eager_next_q = tcp;
   2178 		else
   2179 			listener->tcp_eager_next_q = tcp;
   2180 		listener->tcp_eager_last_q = tcp;
   2181 		tcp->tcp_eager_next_q = NULL;
   2182 		mutex_exit(&listener->tcp_eager_lock);
   2183 		putnext(tcp->tcp_connp->conn_rq, conn_ind);
   2184 	} else {
   2185 		mutex_exit(&listener->tcp_eager_lock);
   2186 	}
   2187 
   2188 	/*
   2189 	 * Done with the acceptor - free it
   2190 	 *
   2191 	 * Note: from this point on, no access to listener should be made
   2192 	 * as listener can be equal to acceptor.
   2193 	 */
   2194 finish:
   2195 	ASSERT(acceptor->tcp_detached);
   2196 	acceptor->tcp_connp->conn_rq = NULL;
   2197 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
   2198 	acceptor->tcp_connp->conn_wq = NULL;
   2199 	(void) tcp_clean_death(acceptor, 0, 2);
   2200 	CONN_DEC_REF(acceptor->tcp_connp);
   2201 
   2202 	/*
   2203 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
   2204 	 *
   2205 	 * It will update the setting for sockfs/stream head and also take
   2206 	 * care of any data that arrived before accept() wad called.
   2207 	 * In case we already received a FIN then tcp_accept_finish will send up
   2208 	 * the ordrel. It will also send up a window update if the window
   2209 	 * has opened up.
   2210 	 */
   2211 
   2212 	/*
   2213 	 * XXX: we currently have a problem if XTI application closes the
   2214 	 * acceptor stream in between. This problem exists in on10-gate also
   2215 	 * and is well know but nothing can be done short of major rewrite
   2216 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
   2217 	 * eager same squeue as listener (we can distinguish non socket
   2218 	 * listeners at the time of handling a SYN in tcp_input_listener)
   2219 	 * and do most of the work that tcp_accept_finish does here itself
   2220 	 * and then get behind the acceptor squeue to access the acceptor
   2221 	 * queue.
   2222 	 */
   2223 	/*
   2224 	 * We already have a ref on tcp so no need to do one before squeue_enter
   2225 	 */
   2226 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
   2227 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
   2228 	    SQTAG_TCP_ACCEPT_FINISH);
   2229 }
   2230 
/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_tli_accept().
 *
 * Summary of what is handed over (all three endpoints must be non-socket,
 * i.e. TLI/XTI, endpoints; this is ASSERTed):
 *
 *  - the acceptor is marked detached and the eager is unlinked from the
 *    listener's eager lists (under the listener's tcp_eager_lock);
 *  - the eager's conn takes over the acceptor's read/write queues, and the
 *    queues' q_ptr are pointed at the eager's conn;
 *  - the eager's conn takes over the acceptor's device, minor arena,
 *    credential (ownership is moved, the acceptor's pointer is cleared),
 *    pid, zone information and MAC mode, address family and IP version;
 *  - one reference is taken on the eager's conn and one is dropped on the
 *    acceptor's conn.
 *
 * listener: endpoint the connection indication arrived on.
 * acceptor: endpoint whose stream will carry the accepted connection.
 * eager:    the detached tcp_t representing the incoming connection.
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t	*econnp, *aconnp;

	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
	 * acceptor stream refer to econnp we automatically get that label.
	 */

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * Point the eager's conn at the acceptor's queue pair and make the
	 * queues point back at the eager's conn.
	 */
	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;
	econnp->conn_rq = aconnp->conn_rq;
	econnp->conn_wq = aconnp->conn_wq;
	econnp->conn_rq->q_ptr = econnp;
	econnp->conn_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/*
	 * Move (not copy) the acceptor's credential to the eager: release
	 * whatever the eager held, take over the acceptor's pointer and
	 * clear it in the acceptor so it is not released twice.
	 */
	if (econnp->conn_cred != NULL)
		crfree(econnp->conn_cred);
	econnp->conn_cred = aconnp->conn_cred;
	aconnp->conn_cred = NULL;
	econnp->conn_cpid = aconnp->conn_cpid;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;

	/* The MAC mode moves with the stream; the acceptor reverts to default */
	econnp->conn_mac_mode = aconnp->conn_mac_mode;
	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	econnp->conn_family = aconnp->conn_family;
	econnp->conn_ipversion = aconnp->conn_ipversion;

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}
   2324 
   2325 
   2326 /*
   2327  * Adapt to the information, such as rtt and rtt_sd, provided from the
   2328  * DCE and IRE maintained by IP.
   2329  *
   2330  * Checks for multicast and broadcast destination address.
   2331  * Returns zero if ok; an errno on failure.
   2332  *
   2333  * Note that the MSS calculation here is based on the info given in
   2334  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
   2335  * will be handled in tcp_input_data() when TCP knows which options to use.
   2336  *
   2337  * Note on how TCP gets its parameters for a connection.
   2338  *
   2339  * When a tcp_t structure is allocated, it gets all the default parameters.
   2340  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
   2341  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
   2342  * default.
   2343  *
   2344  * An incoming SYN with a multicast or broadcast destination address is dropped
   2345  * in ip_fanout_v4/v6.
   2346  *
   2347  * An incoming SYN with a multicast or broadcast source address is always
   2348  * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
   2349  * conn_connect.
   2350  * The same logic in tcp_set_destination also serves to
   2351  * reject an attempt to connect to a broadcast or multicast (destination)
   2352  * address.
   2353  */
   2354 static int
   2355 tcp_set_destination(tcp_t *tcp)
   2356 {
   2357 	uint32_t	mss_max;
   2358 	uint32_t	mss;
   2359 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
   2360 	conn_t		*connp = tcp->tcp_connp;
   2361 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2362 	iulp_t		uinfo;
   2363 	int		error;
   2364 	uint32_t	flags;
   2365 
   2366 	flags = IPDF_LSO | IPDF_ZCOPY;
   2367 	/*
   2368 	 * Make sure we have a dce for the destination to avoid dce_ident
   2369 	 * contention for connected sockets.
   2370 	 */
   2371 	flags |= IPDF_UNIQUE_DCE;
   2372 
   2373 	if (!tcps->tcps_ignore_path_mtu)
   2374 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
   2375 
   2376 	/* Use conn_lock to satify ASSERT; tcp is already serialized */
   2377 	mutex_enter(&connp->conn_lock);
   2378 	error = conn_connect(connp, &uinfo, flags);
   2379 	mutex_exit(&connp->conn_lock);
   2380 	if (error != 0)
   2381 		return (error);
   2382 
   2383 	error = tcp_build_hdrs(tcp);
   2384 	if (error != 0)
   2385 		return (error);
   2386 
   2387 	tcp->tcp_localnet = uinfo.iulp_localnet;
   2388 
   2389 	if (uinfo.iulp_rtt != 0) {
   2390 		clock_t	rto;
   2391 
   2392 		tcp->tcp_rtt_sa = uinfo.iulp_rtt;
   2393 		tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
   2394 		rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   2395 		    tcps->tcps_rexmit_interval_extra +
   2396 		    (tcp->tcp_rtt_sa >> 5);
   2397 
   2398 		if (rto > tcps->tcps_rexmit_interval_max) {
   2399 			tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
   2400 		} else if (rto < tcps->tcps_rexmit_interval_min) {
   2401 			tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   2402 		} else {
   2403 			tcp->tcp_rto = rto;
   2404 		}
   2405 	}
   2406 	if (uinfo.iulp_ssthresh != 0)
   2407 		tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
   2408 	else
   2409 		tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   2410 	if (uinfo.iulp_spipe > 0) {
   2411 		connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
   2412 		    tcps->tcps_max_buf);
   2413 		if (tcps->tcps_snd_lowat_fraction != 0) {
   2414 			connp->conn_sndlowat = connp->conn_sndbuf /
   2415 			    tcps->tcps_snd_lowat_fraction;
   2416 		}
   2417 		(void) tcp_maxpsz_set(tcp, B_TRUE);
   2418 	}
   2419 	/*
   2420 	 * Note that up till now, acceptor always inherits receive
   2421 	 * window from the listener.  But if there is a metrics
   2422 	 * associated with a host, we should use that instead of
   2423 	 * inheriting it from listener. Thus we need to pass this
   2424 	 * info back to the caller.
   2425 	 */
   2426 	if (uinfo.iulp_rpipe > 0) {
   2427 		tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
   2428 		    tcps->tcps_max_buf);
   2429 	}
   2430 
   2431 	if (uinfo.iulp_rtomax > 0) {
   2432 		tcp->tcp_second_timer_threshold =
   2433 		    uinfo.iulp_rtomax;
   2434 	}
   2435 
   2436 	/*
   2437 	 * Use the metric option settings, iulp_tstamp_ok and
   2438 	 * iulp_wscale_ok, only for active open. What this means
   2439 	 * is that if the other side uses timestamp or window
   2440 	 * scale option, TCP will also use those options. That
   2441 	 * is for passive open.  If the application sets a
   2442 	 * large window, window scale is enabled regardless of
   2443 	 * the value in iulp_wscale_ok.  This is the behavior
   2444 	 * since 2.6.  So we keep it.
   2445 	 * The only case left in passive open processing is the
   2446 	 * check for SACK.
   2447 	 * For ECN, it should probably be like SACK.  But the
   2448 	 * current value is binary, so we treat it like the other
   2449 	 * cases.  The metric only controls active open.For passive
   2450 	 * open, the ndd param, tcp_ecn_permitted, controls the
   2451 	 * behavior.
   2452 	 */
   2453 	if (!tcp_detached) {
   2454 		/*
   2455 		 * The if check means that the following can only
   2456 		 * be turned on by the metrics only IRE, but not off.
   2457 		 */
   2458 		if (uinfo.iulp_tstamp_ok)
   2459 			tcp->tcp_snd_ts_ok = B_TRUE;
   2460 		if (uinfo.iulp_wscale_ok)
   2461 			tcp->tcp_snd_ws_ok = B_TRUE;
   2462 		if (uinfo.iulp_sack == 2)
   2463 			tcp->tcp_snd_sack_ok = B_TRUE;
   2464 		if (uinfo.iulp_ecn_ok)
   2465 			tcp->tcp_ecn_ok = B_TRUE;
   2466 	} else {
   2467 		/*
   2468 		 * Passive open.
   2469 		 *
   2470 		 * As above, the if check means that SACK can only be
   2471 		 * turned on by the metric only IRE.
   2472 		 */
   2473 		if (uinfo.iulp_sack > 0) {
   2474 			tcp->tcp_snd_sack_ok = B_TRUE;
   2475 		}
   2476 	}
   2477 
   2478 	/*
   2479 	 * XXX Note that currently, iulp_mtu can be as small as 68
   2480 	 * because of PMTUd.  So tcp_mss may go to negative if combined
   2481 	 * length of all those options exceeds 28 bytes.  But because
   2482 	 * of the tcp_mss_min check below, we may not have a problem if
   2483 	 * tcp_mss_min is of a reasonable value.  The default is 1 so
   2484 	 * the negative problem still exists.  And the check defeats PMTUd.
   2485 	 * In fact, if PMTUd finds that the MSS should be smaller than
   2486 	 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
   2487 	 * value.
   2488 	 *
   2489 	 * We do not deal with that now.  All those problems related to
   2490 	 * PMTUd will be fixed later.
   2491 	 */
   2492 	ASSERT(uinfo.iulp_mtu != 0);
   2493 	mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
   2494 
   2495 	/* Sanity check for MSS value. */
   2496 	if (connp->conn_ipversion == IPV4_VERSION)
   2497 		mss_max = tcps->tcps_mss_max_ipv4;
   2498 	else
   2499 		mss_max = tcps->tcps_mss_max_ipv6;
   2500 
   2501 	if (tcp->tcp_ipsec_overhead == 0)
   2502 		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
   2503 
   2504 	mss -= tcp->tcp_ipsec_overhead;
   2505 
   2506 	if (mss < tcps->tcps_mss_min)
   2507 		mss = tcps->tcps_mss_min;
   2508 	if (mss > mss_max)
   2509 		mss = mss_max;
   2510 
   2511 	/* Note that this is the maximum MSS, excluding all options. */
   2512 	tcp->tcp_mss = mss;
   2513 
   2514 	/*
   2515 	 * Update the tcp connection with LSO capability.
   2516 	 */
   2517 	tcp_update_lso(tcp, connp->conn_ixa);
   2518 
   2519 	/*
   2520 	 * Initialize the ISS here now that we have the full connection ID.
   2521 	 * The RFC 1948 method of initial sequence number generation requires
   2522 	 * knowledge of the full connection ID before setting the ISS.
   2523 	 */
   2524 	tcp_iss_init(tcp);
   2525 
   2526 	tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
   2527 
   2528 	/*
   2529 	 * Make sure that conn is not marked incipient
   2530 	 * for incoming connections. A blind
   2531 	 * removal of incipient flag is cheaper than
   2532 	 * check and removal.
   2533 	 */
   2534 	mutex_enter(&connp->conn_lock);
   2535 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   2536 	mutex_exit(&connp->conn_lock);
   2537 	return (0);
   2538 }
   2539 
   2540 static void
   2541 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
   2542 {
   2543 	int	error;
   2544 	conn_t	*connp = tcp->tcp_connp;
   2545 	struct sockaddr	*sa;
   2546 	mblk_t  *mp1;
   2547 	struct T_bind_req *tbr;
   2548 	int	backlog;
   2549 	socklen_t	len;
   2550 	sin_t	*sin;
   2551 	sin6_t	*sin6;
   2552 	cred_t		*cr;
   2553 
   2554 	/*
   2555 	 * All Solaris components should pass a db_credp
   2556 	 * for this TPI message, hence we ASSERT.
   2557 	 * But in case there is some other M_PROTO that looks
   2558 	 * like a TPI message sent by some other kernel
   2559 	 * component, we check and return an error.
   2560 	 */
   2561 	cr = msg_getcred(mp, NULL);
   2562 	ASSERT(cr != NULL);
   2563 	if (cr == NULL) {
   2564 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   2565 		return;
   2566 	}
   2567 
   2568 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   2569 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
   2570 		if (connp->conn_debug) {
   2571 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2572 			    "tcp_tpi_bind: bad req, len %u",
   2573 			    (uint_t)(mp->b_wptr - mp->b_rptr));
   2574 		}
   2575 		tcp_err_ack(tcp, mp, TPROTO, 0);
   2576 		return;
   2577 	}
   2578 	/* Make sure the largest address fits */
   2579 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
   2580 	if (mp1 == NULL) {
   2581 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   2582 		return;
   2583 	}
   2584 	mp = mp1;
   2585 	tbr = (struct T_bind_req *)mp->b_rptr;
   2586 
   2587 	backlog = tbr->CONIND_number;
   2588 	len = tbr->ADDR_length;
   2589 
   2590 	switch (len) {
   2591 	case 0:		/* request for a generic port */
   2592 		tbr->ADDR_offset = sizeof (struct T_bind_req);
   2593 		if (connp->conn_family == AF_INET) {
   2594 			tbr->ADDR_length = sizeof (sin_t);
   2595 			sin = (sin_t *)&tbr[1];
   2596 			*sin = sin_null;
   2597 			sin->sin_family = AF_INET;
   2598 			sa = (struct sockaddr *)sin;
   2599 			len = sizeof (sin_t);
   2600 			mp->b_wptr = (uchar_t *)&sin[1];
   2601 		} else {
   2602 			ASSERT(connp->conn_family == AF_INET6);
   2603 			tbr->ADDR_length = sizeof (sin6_t);
   2604 			sin6 = (sin6_t *)&tbr[1];
   2605 			*sin6 = sin6_null;
   2606 			sin6->sin6_family = AF_INET6;
   2607 			sa = (struct sockaddr *)sin6;
   2608 			len = sizeof (sin6_t);
   2609 			mp->b_wptr = (uchar_t *)&sin6[1];
   2610 		}
   2611 		break;
   2612 
   2613 	case sizeof (sin_t):    /* Complete IPv4 address */
   2614 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
   2615 		    sizeof (sin_t));
   2616 		break;
   2617 
   2618 	case sizeof (sin6_t): /* Complete IPv6 address */
   2619 		sa = (struct sockaddr *)mi_offset_param(mp,
   2620 		    tbr->ADDR_offset, sizeof (sin6_t));
   2621 		break;
   2622 
   2623 	default:
   2624 		if (connp->conn_debug) {
   2625 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2626 			    "tcp_tpi_bind: bad address length, %d",
   2627 			    tbr->ADDR_length);
   2628 		}
   2629 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   2630 		return;
   2631 	}
   2632 
   2633 	if (backlog > 0) {
   2634 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
   2635 		    tbr->PRIM_type != O_T_BIND_REQ);
   2636 	} else {
   2637 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
   2638 		    tbr->PRIM_type != O_T_BIND_REQ);
   2639 	}
   2640 done:
   2641 	if (error > 0) {
   2642 		tcp_err_ack(tcp, mp, TSYSERR, error);
   2643 	} else if (error < 0) {
   2644 		tcp_err_ack(tcp, mp, -error, 0);
   2645 	} else {
   2646 		/*
   2647 		 * Update port information as sockfs/tpi needs it for checking
   2648 		 */
   2649 		if (connp->conn_family == AF_INET) {
   2650 			sin = (sin_t *)sa;
   2651 			sin->sin_port = connp->conn_lport;
   2652 		} else {
   2653 			sin6 = (sin6_t *)sa;
   2654 			sin6->sin6_port = connp->conn_lport;
   2655 		}
   2656 		mp->b_datap->db_type = M_PCPROTO;
   2657 		tbr->PRIM_type = T_BIND_ACK;
   2658 		putnext(connp->conn_rq, mp);
   2659 	}
   2660 }
   2661 
/*
 * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it, If not return 0
 *
 * If "bind_to_req_port_only" parameter is not set and
 * If the requested port number is available, return it.  If not, return
 * the first anonymous port we happen across.  If no anonymous ports are
 * available, return 0. addr is the requested local address, if any.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 *
 * Parameters:
 *   tcp			endpoint being bound.
 *   port			first port to try, in host byte order (it is
 *				converted with htons() before being compared
 *				or stored).
 *   laddr			requested local address; compared against the
 *				bound address of endpoints already using the
 *				port.
 *   reuseaddr			non-zero if SO_REUSEADDR is set on the binding
 *				endpoint.
 *   quick_connect		when set, an existing endpoint past
 *				TCPS_LISTEN only conflicts if its remote
 *				address and port equal those of connp.
 *   bind_to_req_port_only	try only the requested port (one iteration).
 *   user_specified		the port was supplied by the user; such a
 *				port is never recorded in
 *				tcps_next_port_to_try.
 *
 * Returns the port that was bound (host byte order), or 0 on failure.
 *
 * Side effects: tcp_bind_hash_remove(tcp) is called at the top of every
 * iteration, and the endpoint is only re-inserted on success.  On success
 * tcp_state is set to TCPS_BOUND and conn_lport is set.
 */
static in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the first entry in this bucket that uses lport */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/*
		 * Walk the tcp_bind_hash_port chain starting at that entry
		 * (presumably all endpoints bound to the same port).  A
		 * "break" leaves ltcp non-NULL, which means the port is
		 * busy; running off the end of the chain means it is free.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCPS_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* The port was busy: pick the next candidate to try */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				/*
				 * From here on the candidate is no longer the
				 * user's port, so a later success may update
				 * tcps_next_port_to_try.
				 */
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		/* A zero port from the helpers ends the search */
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
   2966 
   2967 /*
   2968  * tcp_clean_death / tcp_close_detached must not be called more than once
   2969  * on a tcp. Thus every function that potentially calls tcp_clean_death
   2970  * must check for the tcp state before calling tcp_clean_death.
   2971  * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper,
   2972  * tcp_timer_handler, all check for the tcp state.
   2973  */
   2974 /* ARGSUSED */
   2975 void
   2976 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
   2977     ip_recv_attr_t *dummy)
   2978 {
   2979 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   2980 
   2981 	freemsg(mp);
   2982 	if (tcp->tcp_state > TCPS_BOUND)
   2983 		(void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
   2984 		    ETIMEDOUT, 5);
   2985 }
   2986 
   2987 /*
   2988  * We are dying for some reason.  Try to do it gracefully.  (May be called
   2989  * as writer.)
   2990  *
   2991  * Return -1 if the structure was not cleaned up (if the cleanup had to be
   2992  * done by a service procedure).
   2993  * TBD - Should the return value distinguish between the tcp_t being
   2994  * freed and it being reinitialized?
   2995  */
   2996 static int
   2997 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
   2998 {
   2999 	mblk_t	*mp;
   3000 	queue_t	*q;
   3001 	conn_t	*connp = tcp->tcp_connp;
   3002 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3003 
   3004 	TCP_CLD_STAT(tag);
   3005 
   3006 #if TCP_TAG_CLEAN_DEATH
   3007 	tcp->tcp_cleandeathtag = tag;
   3008 #endif
   3009 
   3010 	if (tcp->tcp_fused)
   3011 		tcp_unfuse(tcp);
   3012 
   3013 	if (tcp->tcp_linger_tid != 0 &&
   3014 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3015 		tcp_stop_lingering(tcp);
   3016 	}
   3017 
   3018 	ASSERT(tcp != NULL);
   3019 	ASSERT((connp->conn_family == AF_INET &&
   3020 	    connp->conn_ipversion == IPV4_VERSION) ||
   3021 	    (connp->conn_family == AF_INET6 &&
   3022 	    (connp->conn_ipversion == IPV4_VERSION ||
   3023 	    connp->conn_ipversion == IPV6_VERSION)));
   3024 
   3025 	if (TCP_IS_DETACHED(tcp)) {
   3026 		if (tcp->tcp_hard_binding) {
   3027 			/*
   3028 			 * Its an eager that we are dealing with. We close the
   3029 			 * eager but in case a conn_ind has already gone to the
   3030 			 * listener, let tcp_accept_finish() send a discon_ind
   3031 			 * to the listener and drop the last reference. If the
   3032 			 * listener doesn't even know about the eager i.e. the
   3033 			 * conn_ind hasn't gone up, blow away the eager and drop
   3034 			 * the last reference as well. If the conn_ind has gone
   3035 			 * up, state should be BOUND. tcp_accept_finish
   3036 			 * will figure out that the connection has received a
   3037 			 * RST and will send a DISCON_IND to the application.
   3038 			 */
   3039 			tcp_closei_local(tcp);
   3040 			if (!tcp->tcp_tconnind_started) {
   3041 				CONN_DEC_REF(connp);
   3042 			} else {
   3043 				tcp->tcp_state = TCPS_BOUND;
   3044 			}
   3045 		} else {
   3046 			tcp_close_detached(tcp);
   3047 		}
   3048 		return (0);
   3049 	}
   3050 
   3051 	TCP_STAT(tcps, tcp_clean_death_nondetached);
   3052 
   3053 	q = connp->conn_rq;
   3054 
   3055 	/* Trash all inbound data */
   3056 	if (!IPCL_IS_NONSTR(connp)) {
   3057 		ASSERT(q != NULL);
   3058 		flushq(q, FLUSHALL);
   3059 	}
   3060 
   3061 	/*
   3062 	 * If we are at least part way open and there is error
   3063 	 * (err==0 implies no error)
   3064 	 * notify our client by a T_DISCON_IND.
   3065 	 */
   3066 	if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
   3067 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
   3068 		    !TCP_IS_SOCKET(tcp)) {
   3069 			/*
   3070 			 * Send M_FLUSH according to TPI. Because sockets will
   3071 			 * (and must) ignore FLUSHR we do that only for TPI
   3072 			 * endpoints and sockets in STREAMS mode.
   3073 			 */
   3074 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
   3075 		}
   3076 		if (connp->conn_debug) {
   3077 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   3078 			    "tcp_clean_death: discon err %d", err);
   3079 		}
   3080 		if (IPCL_IS_NONSTR(connp)) {
   3081 			/* Direct socket, use upcall */
   3082 			(*connp->conn_upcalls->su_disconnected)(
   3083 			    connp->conn_upper_handle, tcp->tcp_connid, err);
   3084 		} else {
   3085 			mp = mi_tpi_discon_ind(NULL, err, 0);
   3086 			if (mp != NULL) {
   3087 				putnext(q, mp);
   3088 			} else {
   3089 				if (connp->conn_debug) {
   3090 					(void) strlog(TCP_MOD_ID, 0, 1,
   3091 					    SL_ERROR|SL_TRACE,
   3092 					    "tcp_clean_death, sending M_ERROR");
   3093 				}
   3094 				(void) putnextctl1(q, M_ERROR, EPROTO);
   3095 			}
   3096 		}
   3097 		if (tcp->tcp_state <= TCPS_SYN_RCVD) {
   3098 			/* SYN_SENT or SYN_RCVD */
   3099 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3100 		} else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
   3101 			/* ESTABLISHED or CLOSE_WAIT */
   3102 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3103 		}
   3104 	}
   3105 
   3106 	tcp_reinit(tcp);
   3107 	if (IPCL_IS_NONSTR(connp))
   3108 		(void) tcp_do_unbind(connp);
   3109 
   3110 	return (-1);
   3111 }
   3112 
   3113 /*
   3114  * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
   3115  * to expire, stop the wait and finish the close.
         *
         * The linger timer id is cleared first. If the connection is past
         * TCPS_LISTEN it is detached: the acceptor hash entry is removed, flow
         * control is released, the timers are stopped and the conn's queue
         * pointers are cleared. A connection in TCPS_TIME_WAIT is then appended
         * to the time wait list; any other connection has tcp_timer re-armed
         * using the value returned when the previous timer was cancelled.
         * If the connection is at or below TCPS_LISTEN it is closed locally and
         * one conn reference is dropped.
         *
         * In every case tcp_closed is set and tcp_closecv is signalled under
         * tcp_closelock, which releases the thread waiting in tcp_close_common().
   3116  */
   3117 static void
   3118 tcp_stop_lingering(tcp_t *tcp)
   3119 {
   3120 	clock_t	delta = 0;
   3121 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3122 	conn_t		*connp = tcp->tcp_connp;
   3123 
        	/* The linger timer is no longer pending; forget its id. */
   3124 	tcp->tcp_linger_tid = 0;
   3125 	if (tcp->tcp_state > TCPS_LISTEN) {
   3126 		tcp_acceptor_hash_remove(tcp);
   3127 		mutex_enter(&tcp->tcp_non_sq_lock);
   3128 		if (tcp->tcp_flow_stopped) {
   3129 			tcp_clrqfull(tcp);
   3130 		}
   3131 		mutex_exit(&tcp->tcp_non_sq_lock);
   3132 
        		/*
        		 * Remember what the cancel returned so that tcp_timer can
        		 * be restarted below once the tcp is detached.
        		 */
   3133 		if (tcp->tcp_timer_tid != 0) {
   3134 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3135 			tcp->tcp_timer_tid = 0;
   3136 		}
   3137 		/*
   3138 		 * Need to cancel those timers which will not be used when
   3139 		 * TCP is detached.  This has to be done before the conn_wq
   3140 		 * is cleared.
   3141 		 */
   3142 		tcp_timers_stop(tcp);
   3143 
   3144 		tcp->tcp_detached = B_TRUE;
   3145 		connp->conn_rq = NULL;
   3146 		connp->conn_wq = NULL;
   3147 
   3148 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3149 			tcp_time_wait_append(tcp);
   3150 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3151 			goto finish;
   3152 		}
   3153 
   3154 		/*
   3155 		 * If delta is zero the timer event wasn't executed and was
   3156 		 * successfully canceled. In this case we need to restart it
   3157 		 * with the minimal delta possible.
        		 * A negative delta skips the restart altogether.
   3158 		 */
   3159 		if (delta >= 0) {
   3160 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3161 			    delta ? delta : 1);
   3162 		}
   3163 	} else {
        		/* Never got past LISTEN: nothing to detach, just tear down. */
   3164 		tcp_closei_local(tcp);
   3165 		CONN_DEC_REF(connp);
   3166 	}
   3167 finish:
   3168 	/* Signal closing thread that it can complete close */
   3169 	mutex_enter(&tcp->tcp_closelock);
        	/*
        	 * The detach path above has already set these three fields; they
        	 * are set again here so that the tcp_closei_local() path gets the
        	 * same treatment.
        	 */
   3170 	tcp->tcp_detached = B_TRUE;
   3171 	connp->conn_rq = NULL;
   3172 	connp->conn_wq = NULL;
   3173 
   3174 	tcp->tcp_closed = 1;
   3175 	cv_signal(&tcp->tcp_closecv);
   3176 	mutex_exit(&tcp->tcp_closelock);
   3177 }
   3178 
   3179 /*
   3180  * SO_LINGER timer callback: the linger interval ran out before the close
   3181  * finished. Record ETIMEDOUT as the client error and let
   3182  * tcp_stop_lingering() complete the close. arg is the endpoint's conn_t.
   3183  */
   3184 static void
   3185 tcp_close_linger_timeout(void *arg)
   3186 {
   3187 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   3188 
   3189 	tcp->tcp_client_errno = ETIMEDOUT;
   3190 	tcp_stop_lingering(tcp);
   3191 }
   3192 
   3193 /*
         * Common close processing for a TCP endpoint; called from
         * tcp_tpi_close().
         *
         * The conn is marked CONN_CLOSING and given an extra reference, then
         * tcp_close_output() is queued on the conn's squeue using the mblk
         * embedded in the tcp_t (tcp_closemp). The caller then blocks until
         * tcp_closed is set. The first wait can be interrupted by a signal; if
         * that happens while SO_LINGER with a positive linger time is set,
         * tcp_linger_interrupted() is queued to cut the linger short. After an
         * interruption the wait continues uninterruptibly.
         *
         * Once closed, a listener flagged tcp_wait_for_eagers waits for conn_ref
         * to drop to 1, pending ioctl state is cleaned up if there was any when
         * the close started, and conn_cpid is reset to NOPID.
         *
         * connp: conn being closed; must hold at least two references on entry.
         * flags: close flags, saved (cast to uint8_t) in tcp_closeflags.
         */
        static void
   3194 tcp_close_common(conn_t *connp, int flags)
   3195 {
   3196 	tcp_t		*tcp = connp->conn_tcp;
   3197 	mblk_t 		*mp = &tcp->tcp_closemp;
   3198 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
   3199 	mblk_t		*bp;
   3200 
   3201 	ASSERT(connp->conn_ref >= 2);
   3202 
   3203 	/*
   3204 	 * Mark the conn as closing. ipsq_pending_mp_add will not
   3205 	 * add any mp to the pending mp list, after this conn has
   3206 	 * started closing.
   3207 	 */
   3208 	mutex_enter(&connp->conn_lock);
   3209 	connp->conn_state_flags |= CONN_CLOSING;
   3210 	if (connp->conn_oper_pending_ill != NULL)
   3211 		conn_ioctl_cleanup_reqd = B_TRUE;
        	/* This reference accompanies the close request into the squeue. */
   3212 	CONN_INC_REF_LOCKED(connp);
   3213 	mutex_exit(&connp->conn_lock);
   3214 	tcp->tcp_closeflags = (uint8_t)flags;
   3215 	ASSERT(connp->conn_ref >= 3);
   3216 
   3217 	/*
   3218 	 * tcp_closemp_used is used below without any protection of a lock
   3219 	 * as we don't expect any one else to use it concurrently at this
   3220 	 * point otherwise it would be a major defect.
   3221 	 */
   3222 
        	/* A non-NULL b_prev means tcp_closemp is already in use: panic. */
   3223 	if (mp->b_prev == NULL)
   3224 		tcp->tcp_closemp_used = B_TRUE;
   3225 	else
   3226 		cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
   3227 		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
   3228 
   3229 	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   3230 
   3231 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
   3232 	    NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3233 
   3234 	mutex_enter(&tcp->tcp_closelock);
   3235 	while (!tcp->tcp_closed) {
   3236 		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
   3237 			/*
   3238 			 * The cv_wait_sig() was interrupted. We now do the
   3239 			 * following:
   3240 			 *
   3241 			 * 1) If the endpoint was lingering, we allow this
   3242 			 * to be interrupted by cancelling the linger timeout
   3243 			 * and closing normally.
   3244 			 *
   3245 			 * 2) Revert to calling cv_wait()
   3246 			 *
   3247 			 * We revert to using cv_wait() to avoid an
   3248 			 * infinite loop which can occur if the calling
   3249 			 * thread is higher priority than the squeue worker
   3250 			 * thread and is bound to the same cpu.
   3251 			 */
   3252 			if (connp->conn_linger && connp->conn_lingertime > 0) {
        				/*
        				 * tcp_closelock is dropped while the mblk is
        				 * allocated and the squeue is entered.
        				 */
   3253 				mutex_exit(&tcp->tcp_closelock);
   3254 				/* Entering squeue, bump ref count. */
   3255 				CONN_INC_REF(connp);
   3256 				bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
   3257 				SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
   3258 				    tcp_linger_interrupted, connp, NULL,
   3259 				    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3260 				mutex_enter(&tcp->tcp_closelock);
   3261 			}
   3262 			break;
   3263 		}
   3264 	}
        	/* Uninterruptible wait; a no-op if tcp_closed is already set. */
   3265 	while (!tcp->tcp_closed)
   3266 		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
   3267 	mutex_exit(&tcp->tcp_closelock);
   3268 
   3269 	/*
   3270 	 * In the case of listener streams that have eagers in the q or q0
   3271 	 * we wait for the eagers to drop their reference to us. conn_rq and
   3272 	 * conn_wq of the eagers point to our queues. By waiting for the
   3273 	 * refcnt to drop to 1, we are sure that the eagers have cleaned
   3274 	 * up their queue pointers and also dropped their references to us.
   3275 	 */
   3276 	if (tcp->tcp_wait_for_eagers) {
   3277 		mutex_enter(&connp->conn_lock);
   3278 		while (connp->conn_ref != 1) {
   3279 			cv_wait(&connp->conn_cv, &connp->conn_lock);
   3280 		}
   3281 		mutex_exit(&connp->conn_lock);
   3282 	}
   3283 	/*
   3284 	 * ioctl cleanup. The mp is queued in the ipx_pending_mp.
   3285 	 */
   3286 	if (conn_ioctl_cleanup_reqd)
   3287 		conn_ioctl_cleanup(connp);
   3288 
   3289 	connp->conn_cpid = NOPID;
   3290 }
   3291 
   3292 /*
   3293  * STREAMS close routine for /dev/tcp and /dev/tcp6.
   3294  *
   3295  * When SO_FALLBACK is set in flags the stream is being closed while in
   3296  * fallback: only the minor number kept in the queue private pointers is
   3297  * released and queue processing is turned off. Otherwise the endpoint
   3298  * goes through tcp_close_common(), queue processing is turned off, the
   3299  * conn's minor number is released and one conn reference is dropped.
   3300  * Both queue private pointers are cleared on the way out.
   3301  *
   3302  * Always returns 0.
   3303  */
   3304 static int
   3305 tcp_tpi_close(queue_t *q, int flags)
   3306 {
   3307 	ASSERT(WR(q)->q_next == NULL);
   3308 
   3309 	if (flags & SO_FALLBACK) {
   3310 		/* Closed during fallback: just free what was allocated. */
   3311 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
   3312 		qprocsoff(q);
   3313 	} else {
   3314 		conn_t	*connp = Q_TO_CONN(q);
   3315 
   3316 		tcp_close_common(connp, flags);
   3317 
   3318 		qprocsoff(q);
   3319 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   3320 
   3321 		/*
   3322 		 * Drop IP's reference on the conn. If the state was less
   3323 		 * than established this is the last reference. A connection
   3324 		 * that went into timewait keeps one reference for TCP and
   3325 		 * one for the classifier connected hash, where it stays
   3326 		 * until closed. Walkers and packets queued in the squeue
   3327 		 * may hold transient references as well, so the count
   3328 		 * cannot be asserted here.
   3329 		 */
   3330 		CONN_DEC_REF(connp);
   3331 	}
   3332 
   3333 	q->q_ptr = WR(q)->q_ptr = NULL;
   3334 	return (0);
   3335 }
   3335 
   3336 /*
   3337  * Close routine for an acceptor STREAM that was opened for sockfs and is
   3338  * now being closed due to some error. Queue processing is turned off,
   3339  * the minor number recorded in the queue private pointers is released
   3340  * and both private pointers are cleared. Always returns 0.
   3341  */
   3342 static int
   3343 tcp_tpi_close_accept(queue_t *q)
   3344 {
   3345 	vmem_t	*arena;
   3346 	dev_t	minor;
   3347 
   3348 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
   3349 
   3350 	qprocsoff(q);
   3351 
   3352 	/* Write side holds the minor arena, read side the minor number. */
   3353 	arena = (vmem_t *)WR(q)->q_ptr;
   3354 	minor = (dev_t)RD(q)->q_ptr;
   3355 	ASSERT(arena != NULL);
   3356 	ASSERT(minor != 0);
   3357 	inet_minor_free(arena, minor);
   3358 
   3359 	WR(q)->q_ptr = NULL;
   3360 	q->q_ptr = NULL;
   3361 	return (0);
   3362 }
   3358 
   3359 /*
   3360  * Called by tcp_close_common() via squeue when lingering is interrupted
   3361  * by a signal. Frees the carrier mblk and, if the linger timer is still
   3362  * pending and can be cancelled, records EINTR as the client error and
   3363  * finishes the close through tcp_stop_lingering().
   3364  *
   3365  * arg is the conn_t; mp is the mblk used to enter the squeue and is
   3366  * always freed here. arg2 and dummy are unused.
   3367  */
   3368 
   3369 /* ARGSUSED */
   3370 static void
   3371 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   3372 {
   3373 	conn_t	*connp = (conn_t *)arg;
   3374 	tcp_t	*tcp = connp->conn_tcp;
   3375 
   3376 	freeb(mp);
   3377 	if (tcp->tcp_linger_tid != 0 &&
   3378 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3379 		/*
   3380 		 * Store the error before tcp_stop_lingering() runs: that
   3381 		 * call signals the closing thread and may drop a conn
   3382 		 * reference, so the tcp must not be written afterwards.
   3383 		 * tcp_close_linger_timeout() uses the same order.
   3384 		 */
   3385 		tcp->tcp_client_errno = EINTR;
   3386 		tcp_stop_lingering(tcp);
   3387 	}
   3388 }
   3378 
   3379 /*
   3380  * Called by streams close routine via squeues when our client blows off her
   3381  * descriptor, we take this to mean: "close the stream state NOW, close the tcp
   3382  * connection politely" When SO_LINGER is set (with a non-zero linger time and
   3383  * it is not a nonblocking socket) then this routine sleeps until the FIN is
   3384  * acked.
   3385  *
   3386  * NOTE: tcp_close potentially returns error when lingering.
   3387  * However, the stream head currently does not pass these errors
   3388  * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
   3389  * errors to the application (from tsleep()) and not errors
   3390  * like ECONNRESET caused by receiving a reset packet.
         *
         * Outline of the visible logic:
         *  - A listener with eagers on q or q0 has them cleaned up and is
         *    flagged tcp_wait_for_eagers.
         *  - States up to TCPS_LISTEN are simply closed locally.
         *  - TCPS_SYN_SENT, a zero linger time, or unread receive/reassembly
         *    data cause a RST to be sent (msg != NULL) before the local close.
         *  - Otherwise a FIN is sent. If lingering applies and the linger
         *    timer could be started, the function returns without signalling;
         *    tcp_close_linger_timeout() or tcp_stop_lingering() finishes the
         *    close later. If not, the tcp is detached and either appended to
         *    the time wait list or left running with tcp_timer re-armed.
         *  - At "finish" tcp_closed is set and tcp_closecv is signalled so the
         *    thread waiting in tcp_close_common() can proceed.
         *
         * arg is the conn_t being closed. mp, arg2 and dummy are not used here.
   3391  */
   3392 
   3393 /* ARGSUSED */
   3394 static void
   3395 tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   3396 {
   3397 	char	*msg;
   3398 	conn_t	*connp = (conn_t *)arg;
   3399 	tcp_t	*tcp = connp->conn_tcp;
   3400 	clock_t	delta = 0;
   3401 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3402 
   3403 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
   3404 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
   3405 
   3406 	mutex_enter(&tcp->tcp_eager_lock);
   3407 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
   3408 		/* Cleanup for listener */
   3409 		tcp_eager_cleanup(tcp, 0);
   3410 		tcp->tcp_wait_for_eagers = 1;
   3411 	}
   3412 	mutex_exit(&tcp->tcp_eager_lock);
   3413 
   3414 	tcp->tcp_lso = B_FALSE;
   3415 
        	/* A non-NULL msg at the end of the switch means "send a RST". */
   3416 	msg = NULL;
   3417 	switch (tcp->tcp_state) {
   3418 	case TCPS_CLOSED:
   3419 	case TCPS_IDLE:
   3420 	case TCPS_BOUND:
   3421 	case TCPS_LISTEN:
   3422 		break;
   3423 	case TCPS_SYN_SENT:
   3424 		msg = "tcp_close, during connect";
   3425 		break;
   3426 	case TCPS_SYN_RCVD:
   3427 		/*
   3428 		 * Close during the connect 3-way handshake
   3429 		 * but here there may or may not be pending data
   3430 		 * already on queue. Process almost same as in
   3431 		 * the ESTABLISHED state.
   3432 		 */
   3433 		/* FALLTHRU */
   3434 	default:
   3435 		if (tcp->tcp_fused)
   3436 			tcp_unfuse(tcp);
   3437 
   3438 		/*
   3439 		 * If SO_LINGER has set a zero linger time, abort the
   3440 		 * connection with a reset.
   3441 		 */
   3442 		if (connp->conn_linger && connp->conn_lingertime == 0) {
   3443 			msg = "tcp_close, zero lingertime";
   3444 			break;
   3445 		}
   3446 
   3447 		/*
   3448 		 * Abort connection if there is unread data queued.
   3449 		 */
   3450 		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
   3451 			msg = "tcp_close, unread data";
   3452 			break;
   3453 		}
   3454 		/*
   3455 		 * We have done a qwait() above which could have possibly
   3456 		 * drained more messages in turn causing transition to a
   3457 		 * different state. Check whether we have to do the rest
   3458 		 * of the processing or not.
        		 *
        		 * NOTE(review): no qwait() call is visible in this function;
        		 * this comment may predate the current code - confirm.
   3459 		 */
   3460 		if (tcp->tcp_state <= TCPS_LISTEN)
   3461 			break;
   3462 
   3463 		/*
   3464 		 * Transmit the FIN before detaching the tcp_t.
   3465 		 * After tcp_detach returns this queue/perimeter
   3466 		 * no longer owns the tcp_t thus others can modify it.
   3467 		 */
   3468 		(void) tcp_xmit_end(tcp);
   3469 
   3470 		/*
   3471 		 * If lingering on close then wait until the fin is acked,
   3472 		 * the SO_LINGER time passes, or a reset is sent/received.
   3473 		 */
   3474 		if (connp->conn_linger && connp->conn_lingertime > 0 &&
   3475 		    !(tcp->tcp_fin_acked) &&
   3476 		    tcp->tcp_state >= TCPS_ESTABLISHED) {
   3477 			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
        				/* Non-blocking close: do not linger. */
   3478 				tcp->tcp_client_errno = EWOULDBLOCK;
   3479 			} else if (tcp->tcp_client_errno == 0) {
   3480 
   3481 				ASSERT(tcp->tcp_linger_tid == 0);
   3482 
        				/* conn_lingertime is scaled by hz here. */
   3483 				tcp->tcp_linger_tid = TCP_TIMER(tcp,
   3484 				    tcp_close_linger_timeout,
   3485 				    connp->conn_lingertime * hz);
   3486 
   3487 				/* tcp_close_linger_timeout will finish close */
   3488 				if (tcp->tcp_linger_tid == 0)
   3489 					tcp->tcp_client_errno = ENOSR;
   3490 				else
   3491 					return;
   3492 			}
   3493 
   3494 			/*
   3495 			 * Check if we need to detach or just close
   3496 			 * the instance.
   3497 			 */
   3498 			if (tcp->tcp_state <= TCPS_LISTEN)
   3499 				break;
   3500 		}
   3501 
   3502 		/*
   3503 		 * Make sure that no other thread will access the conn_rq of
   3504 		 * this instance (through lookups etc.) as conn_rq will go
   3505 		 * away shortly.
   3506 		 */
   3507 		tcp_acceptor_hash_remove(tcp);
   3508 
   3509 		mutex_enter(&tcp->tcp_non_sq_lock);
   3510 		if (tcp->tcp_flow_stopped) {
   3511 			tcp_clrqfull(tcp);
   3512 		}
   3513 		mutex_exit(&tcp->tcp_non_sq_lock);
   3514 
        		/* Keep the cancel result; it decides the restart below. */
   3515 		if (tcp->tcp_timer_tid != 0) {
   3516 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3517 			tcp->tcp_timer_tid = 0;
   3518 		}
   3519 		/*
   3520 		 * Need to cancel those timers which will not be used when
   3521 		 * TCP is detached.  This has to be done before the conn_wq
   3522 		 * is set to NULL.
   3523 		 */
   3524 		tcp_timers_stop(tcp);
   3525 
   3526 		tcp->tcp_detached = B_TRUE;
   3527 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3528 			tcp_time_wait_append(tcp);
   3529 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3530 			ASSERT(connp->conn_ref >= 3);
   3531 			goto finish;
   3532 		}
   3533 
   3534 		/*
   3535 		 * If delta is zero the timer event wasn't executed and was
   3536 		 * successfully canceled. In this case we need to restart it
   3537 		 * with the minimal delta possible.
   3538 		 */
   3539 		if (delta >= 0)
   3540 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3541 			    delta ? delta : 1);
   3542 
   3543 		ASSERT(connp->conn_ref >= 3);
   3544 		goto finish;
   3545 	}
   3546 
   3547 	/* Detach did not complete. Still need to remove q from stream. */
   3548 	if (msg) {
   3549 		if (tcp->tcp_state == TCPS_ESTABLISHED ||
   3550 		    tcp->tcp_state == TCPS_CLOSE_WAIT)
   3551 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3552 		if (tcp->tcp_state == TCPS_SYN_SENT ||
   3553 		    tcp->tcp_state == TCPS_SYN_RCVD)
   3554 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3555 		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
   3556 	}
   3557 
   3558 	tcp_closei_local(tcp);
   3559 	CONN_DEC_REF(connp);
   3560 	ASSERT(connp->conn_ref >= 2);
   3561 
   3562 finish:
   3563 	mutex_enter(&tcp->tcp_closelock);
   3564 	/*
   3565 	 * Don't change the queues in the case of a listener that has
   3566 	 * eagers in its q or q0. It could surprise the eagers.
   3567 	 * Instead wait for the eagers outside the squeue.
   3568 	 */
   3569 	if (!tcp->tcp_wait_for_eagers) {
   3570 		tcp->tcp_detached = B_TRUE;
   3571 		connp->conn_rq = NULL;
   3572 		connp->conn_wq = NULL;
   3573 	}
   3574 
   3575 	/* Signal tcp_close() to finish closing. */
   3576 	tcp->tcp_closed = 1;
   3577 	cv_signal(&tcp->tcp_closecv);
   3578 	mutex_exit(&tcp->tcp_closelock);
   3579 }
   3580 
   3581 /*
   3582  * Free the message hanging off *mpp and reset *mpp to NULL. Before the
   3583  * message is freed, b_next and b_prev of every mblk on its b_cont chain
   3584  * are cleared, because some stream heads get upset if they later see
   3585  * these as anything but NULL. Nothing happens when *mpp is NULL.
   3586  */
   3587 static void
   3588 tcp_close_mpp(mblk_t **mpp)
   3589 {
   3590 	mblk_t	*head = *mpp;
   3591 	mblk_t	*bp;
   3592 
   3593 	if (head == NULL)
   3594 		return;
   3595 
   3596 	for (bp = head; bp != NULL; bp = bp->b_cont) {
   3597 		bp->b_next = NULL;
   3598 		bp->b_prev = NULL;
   3599 	}
   3600 
   3601 	*mpp = NULL;
   3602 	freemsg(head);
   3603 }
   3601 
   3602 /*
   3603  * Close a detached tcp: undo fusion if the connection is fused, tear
   3604  * down the local state and drop one reference on its conn.
   3605  */
   3606 static void
   3607 tcp_close_detached(tcp_t *tcp)
   3608 {
   3609 	conn_t	*connp = tcp->tcp_connp;
   3610 
   3611 	if (tcp->tcp_fused)
   3612 		tcp_unfuse(tcp);
   3613 
   3614 	/*
   3615 	 * The clustering code blocks a TCP disconnect callback while a
   3616 	 * cluster tcp list walk is in progress. That serialization keeps
   3617 	 * the cluster's accounting of TCPs accurate even though the list
   3618 	 * walk itself is not atomic.
   3619 	 */
   3620 	tcp_closei_local(tcp);
   3621 	CONN_DEC_REF(connp);
   3622 }
   3619 
   3620 /*
   3621  * Stop all TCP timers, and free the timer mblks if requested.
   3622  */
   3623 void
   3624 tcp_timers_stop(tcp_t *tcp)
   3625 {
   3626 	if (tcp->tcp_timer_tid != 0) {
   3627 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3628 		tcp->tcp_timer_tid = 0;
   3629 	}
   3630 	if (tcp->tcp_ka_tid != 0) {
   3631 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
   3632 		tcp->tcp_ka_tid = 0;
   3633 	}
   3634 	if (tcp->tcp_ack_tid != 0) {
   3635 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
   3636 		tcp->tcp_ack_tid = 0;
   3637 	}
   3638 	if (tcp->tcp_push_tid != 0) {
   3639 		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
   3640 		tcp->tcp_push_tid = 0;
   3641 	}
   3642 }
   3643 
   3644 /*
   3645  * The tcp_t is going away. Remove it from all lists and set it
   3646  * to TCPS_CLOSED. The freeing up of memory is deferred until
   3647  * tcp_inactive. This is needed since a thread in tcp_rput might have
   3648  * done a CONN_INC_REF on this structure before it was removed from the
   3649  * hashes.
         *
         * In order, this routine: removes a non-socket tcp from the acceptor
         * hash, folds the per-connection segment counters into the MIB, unlinks
         * an eager from its listener unless the conn_ind has already been sent,
         * stops the timers, frees a listener's IP address cache, releases flow
         * control, removes the tcp from the bind hash, the time wait list (if in
         * TCPS_TIME_WAIT) and the classifier hash, marks the conn
         * CONN_CONDEMNED, and releases the kssl and IPsec state.
         *
         * No conn reference on the tcp's own conn is dropped here; the callers
         * visible in this file do that themselves after this returns.
   3650  */
   3651 static void
   3652 tcp_closei_local(tcp_t *tcp)
   3653 {
   3654 	conn_t		*connp = tcp->tcp_connp;
   3655 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3656 
   3657 	if (!TCP_IS_SOCKET(tcp))
   3658 		tcp_acceptor_hash_remove(tcp);
   3659 
        	/* Add this connection's segment counts to the MIB and reset them. */
   3660 	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
   3661 	tcp->tcp_ibsegs = 0;
   3662 	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
   3663 	tcp->tcp_obsegs = 0;
   3664 
   3665 	/*
   3666 	 * If we are an eager connection hanging off a listener that
   3667 	 * hasn't formally accepted the connection yet, get off his
   3668 	 * list and blow off any data that we have accumulated.
   3669 	 */
   3670 	if (tcp->tcp_listener != NULL) {
   3671 		tcp_t	*listener = tcp->tcp_listener;
   3672 		mutex_enter(&listener->tcp_eager_lock);
   3673 		/*
   3674 		 * tcp_tconnind_started == B_TRUE means that the
   3675 		 * conn_ind has already gone to listener. At
   3676 		 * this point, eager will be closed but we
   3677 		 * leave it in listeners eager list so that
   3678 		 * if listener decides to close without doing
   3679 		 * accept, we can clean this up. In tcp_tli_accept
   3680 		 * we take care of the case of accept on closed
   3681 		 * eager.
   3682 		 */
   3683 		if (!tcp->tcp_tconnind_started) {
   3684 			tcp_eager_unlink(tcp);
   3685 			mutex_exit(&listener->tcp_eager_lock);
   3686 			/*
   3687 			 * We don't want to have any pointers to the
   3688 			 * listener queue, after we have released our
   3689 			 * reference on the listener
   3690 			 */
   3691 			ASSERT(tcp->tcp_detached);
   3692 			connp->conn_rq = NULL;
   3693 			connp->conn_wq = NULL;
   3694 			CONN_DEC_REF(listener->tcp_connp);
   3695 		} else {
   3696 			mutex_exit(&listener->tcp_eager_lock);
   3697 		}
   3698 	}
   3699 
   3700 	/* Stop all the timers */
   3701 	tcp_timers_stop(tcp);
   3702 
        	/* Only a tcp in TCPS_LISTEN has its address cache freed here. */
   3703 	if (tcp->tcp_state == TCPS_LISTEN) {
   3704 		if (tcp->tcp_ip_addr_cache) {
   3705 			kmem_free((void *)tcp->tcp_ip_addr_cache,
   3706 			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   3707 			tcp->tcp_ip_addr_cache = NULL;
   3708 		}
   3709 	}
   3710 	mutex_enter(&tcp->tcp_non_sq_lock);
   3711 	if (tcp->tcp_flow_stopped)
   3712 		tcp_clrqfull(tcp);
   3713 	mutex_exit(&tcp->tcp_non_sq_lock);
   3714 
   3715 	tcp_bind_hash_remove(tcp);
   3716 	/*
   3717 	 * If the tcp_time_wait_collector (which runs outside the squeue)
   3718 	 * is trying to remove this tcp from the time wait list, we will
   3719 	 * block in tcp_time_wait_remove while trying to acquire the
   3720 	 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
   3721 	 * requires the ipcl_hash_remove to be ordered after the
   3722 	 * tcp_time_wait_remove for the refcnt checks to work correctly.
   3723 	 */
   3724 	if (tcp->tcp_state == TCPS_TIME_WAIT)
   3725 		(void) tcp_time_wait_remove(tcp, NULL);
   3726 	CL_INET_DISCONNECT(connp);
   3727 	ipcl_hash_remove(connp);
   3728 	ixa_cleanup(connp->conn_ixa);
   3729 
   3730 	/*
   3731 	 * Mark the conn as CONDEMNED
   3732 	 */
   3733 	mutex_enter(&connp->conn_lock);
   3734 	connp->conn_state_flags |= CONN_CONDEMNED;
   3735 	mutex_exit(&connp->conn_lock);
   3736 
   3737 	/* Need to cleanup any pending ioctls */
        	/*
        	 * NOTE(review): no ioctl cleanup is done here; the assertions below
        	 * only check that the tcp is off the time wait list before the
        	 * state is changed. The comment above looks stale - confirm.
        	 */
   3738 	ASSERT(tcp->tcp_time_wait_next == NULL);
   3739 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   3740 	ASSERT(tcp->tcp_time_wait_expire == 0);
   3741 	tcp->tcp_state = TCPS_CLOSED;
   3742 
   3743 	/* Release any SSL context */
   3744 	if (tcp->tcp_kssl_ent != NULL) {
   3745 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   3746 		tcp->tcp_kssl_ent = NULL;
   3747 	}
   3748 	if (tcp->tcp_kssl_ctx != NULL) {
   3749 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   3750 		tcp->tcp_kssl_ctx = NULL;
   3751 	}
   3752 	tcp->tcp_kssl_pending = B_FALSE;
   3753 
   3754 	tcp_ipsec_cleanup(tcp);
   3755 }
   3756 
   3757 /*
   3758  * tcp is dying (called from ipcl_conn_destroy and error cases).
   3759  * Release what hangs off the tcp_t in either case: the conn's queue
   3760  * pointers are cleared, the transmit, reassembly and receive lists and
   3761  * the urgent-data mblks are freed, the SACK info is zeroed, the saved
   3762  * option buffers are freed and the tcp_conn union is emptied. Every
   3763  * pointer whose target is freed is reset to NULL. The tcp_t structure
   3764  * itself is not released by this function.
   3765  */
   3766 void
   3767 tcp_free(tcp_t *tcp)
   3768 {
   3769 	conn_t		*connp;
   3770 
   3771 	/* Assert on tcp before it is dereferenced, not after. */
   3772 	ASSERT(tcp != NULL);
   3773 	ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
   3774 
   3775 	connp = tcp->tcp_connp;
   3776 	connp->conn_rq = NULL;
   3777 	connp->conn_wq = NULL;
   3778 
   3779 	/*
   3780 	 * tcp_close_mpp() ignores an empty list and clears the list head.
   3781 	 * NOTE(review): it follows b_cont only; if tcp_rcv_list links
   3782 	 * messages through b_next, confirm nothing is left behind.
   3783 	 */
   3784 	tcp_close_mpp(&tcp->tcp_xmit_head);
   3785 	tcp_close_mpp(&tcp->tcp_reass_head);
   3786 	tcp_close_mpp(&tcp->tcp_rcv_list);
   3787 
   3788 	/* Clear the urgent-data pointers so they cannot be freed twice. */
   3789 	if (tcp->tcp_urp_mp != NULL) {
   3790 		freemsg(tcp->tcp_urp_mp);
   3791 		tcp->tcp_urp_mp = NULL;
   3792 	}
   3793 	if (tcp->tcp_urp_mark_mp != NULL) {
   3794 		freemsg(tcp->tcp_urp_mark_mp);
   3795 		tcp->tcp_urp_mark_mp = NULL;
   3796 	}
   3797 
   3798 	if (tcp->tcp_fused_sigurg_mp != NULL) {
   3799 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3800 		freeb(tcp->tcp_fused_sigurg_mp);
   3801 		tcp->tcp_fused_sigurg_mp = NULL;
   3802 	}
   3803 
   3804 	if (tcp->tcp_ordrel_mp != NULL) {
   3805 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   3806 		freeb(tcp->tcp_ordrel_mp);
   3807 		tcp->tcp_ordrel_mp = NULL;
   3808 	}
   3809 
   3810 	/* The SACK info is zeroed in place rather than freed. */
   3811 	if (tcp->tcp_sack_info != NULL) {
   3812 		if (tcp->tcp_notsack_list != NULL) {
   3813 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   3814 			    tcp);
   3815 		}
   3816 		bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
   3817 	}
   3818 
   3819 	if (tcp->tcp_hopopts != NULL) {
   3820 		mi_free(tcp->tcp_hopopts);
   3821 		tcp->tcp_hopopts = NULL;
   3822 		tcp->tcp_hopoptslen = 0;
   3823 	}
   3824 	ASSERT(tcp->tcp_hopoptslen == 0);
   3825 	if (tcp->tcp_dstopts != NULL) {
   3826 		mi_free(tcp->tcp_dstopts);
   3827 		tcp->tcp_dstopts = NULL;
   3828 		tcp->tcp_dstoptslen = 0;
   3829 	}
   3830 	ASSERT(tcp->tcp_dstoptslen == 0);
   3831 	if (tcp->tcp_rthdrdstopts != NULL) {
   3832 		mi_free(tcp->tcp_rthdrdstopts);
   3833 		tcp->tcp_rthdrdstopts = NULL;
   3834 		tcp->tcp_rthdrdstoptslen = 0;
   3835 	}
   3836 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   3837 	if (tcp->tcp_rthdr != NULL) {
   3838 		mi_free(tcp->tcp_rthdr);
   3839 		tcp->tcp_rthdr = NULL;
   3840 		tcp->tcp_rthdrlen = 0;
   3841 	}
   3842 	ASSERT(tcp->tcp_rthdrlen == 0);
   3843 
   3844 	/*
   3845 	 * Following is really a blowing away a union.
   3846 	 * It happens to have exactly two members of identical size
   3847 	 * the following code is enough.
   3848 	 */
   3849 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
   3850 }
   3838 
   3839 
   3840 /*
   3841  * Build a connection confirmation (T_CONN_CON) from the peer address
   3842  * held in the conn and, for an IPv6 packet, the flow information in the
   3843  * IP header at iphdr, then deliver it.
   3844  *
   3845  * With defermp == NULL the confirmation goes out immediately: through
   3846  * the su_connected upcall for a non-STREAMS socket (the message is then
   3847  * freed), otherwise by putnext() on conn_rq after tagging the message
   3848  * with the credentials in ira, if any. With defermp != NULL the message
   3849  * is handed back in *defermp instead. Credentials are first copied from
   3850  * idmp. Options saved from the T_CONN_REQ are echoed in the message and
   3851  * released once it has been built.
   3852  *
   3853  * Returns B_TRUE on success and B_FALSE when mi_tpi_conn_con() returned
   3854  * no message; in that case *defermp (if supplied) is left NULL.
   3855  */
   3856 static boolean_t
   3857 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
   3858     mblk_t **defermp, ip_recv_attr_t *ira)
   3859 {
   3860 	conn_t		*connp = tcp->tcp_connp;
   3861 	mblk_t		*conmp;
   3862 	char		*optp = NULL;
   3863 	int		optlen = 0;
   3864 	boolean_t	pkt_is_v4;
   3865 	sin_t		sin;
   3866 	sin6_t		sin6;
   3867 
   3868 	if (defermp != NULL)
   3869 		*defermp = NULL;
   3870 
   3871 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
   3872 		/*
   3873 		 * Return in T_CONN_CON the results of the option
   3874 		 * negotiation done through the T_CONN_REQ. If there were
   3875 		 * real end-to-end option negotiation, what the remote end
   3876 		 * sent would have to be taken into account, but there is
   3877 		 * no such thing (yet?) in our TCP/IP.
   3878 		 * mi_offset_param() is not used because the contents of
   3879 		 * tcp_opts_conn_req do not come directly from an
   3880 		 * application: they are generated in the kernel or come
   3881 		 * from user input that was already verified.
   3882 		 */
   3883 		struct T_conn_req	*tcr;
   3884 
   3885 		tcr = (struct T_conn_req *)
   3886 		    tcp->tcp_conn.tcp_opts_conn_req->b_rptr;
   3887 		optp = (char *)tcr + tcr->OPT_offset;
   3888 		optlen = (int)tcr->OPT_length;
   3889 	}
   3890 
   3891 	pkt_is_v4 = (IPH_HDR_VERSION(iphdr) == IPV4_VERSION);
   3892 
   3893 	if (pkt_is_v4 && connp->conn_family == AF_INET) {
   3894 		/* IPv4 packet on an AF_INET endpoint: report a sin_t. */
   3895 		sin = sin_null;
   3896 		sin.sin_family = AF_INET;
   3897 		sin.sin_port = connp->conn_fport;
   3898 		sin.sin_addr.s_addr = connp->conn_faddr_v4;
   3899 		conmp = mi_tpi_conn_con(NULL, (char *)&sin,
   3900 		    (int)sizeof (sin_t), optp, optlen);
   3901 	} else {
   3902 		/* Any other combination is reported as a sin6_t. */
   3903 		sin6 = sin6_null;
   3904 		sin6.sin6_family = AF_INET6;
   3905 		sin6.sin6_port = connp->conn_fport;
   3906 		sin6.sin6_addr = connp->conn_faddr_v6;
   3907 		if (!pkt_is_v4) {
   3908 			/* Only an IPv6 packet carries flow information. */
   3909 			ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
   3910 			ASSERT(connp->conn_family == AF_INET6);
   3911 			sin6.sin6_flowinfo = ((ip6_t *)iphdr)->ip6_vcf &
   3912 			    ~IPV6_VERS_AND_FLOW_MASK;
   3913 		}
   3914 		conmp = mi_tpi_conn_con(NULL, (char *)&sin6,
   3915 		    (int)sizeof (sin6_t), optp, optlen);
   3916 	}
   3917 
   3918 	if (conmp == NULL)
   3919 		return (B_FALSE);
   3920 
   3921 	mblk_copycred(conmp, idmp);
   3922 
   3923 	if (defermp != NULL) {
   3924 		*defermp = conmp;
   3925 	} else if (IPCL_IS_NONSTR(connp)) {
   3926 		(*connp->conn_upcalls->su_connected)
   3927 		    (connp->conn_upper_handle, tcp->tcp_connid,
   3928 		    ira->ira_cred, ira->ira_cpid);
   3929 		freemsg(conmp);
   3930 	} else {
   3931 		if (ira->ira_cred != NULL) {
   3932 			/* So that getpeerucred works for TPI sockfs */
   3933 			mblk_setcred(conmp, ira->ira_cred, ira->ira_cpid);
   3934 		}
   3935 		putnext(connp->conn_rq, conmp);
   3936 	}
   3937 
   3938 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
   3939 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
   3940 	return (B_TRUE);
   3941 }
   3938 
   3939 /*
   3940  * Defense for the SYN attack -
   3941  * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
   3942  *    one from the list of droppable eagers. This list is a subset of q0.
   3943  *    see comments before the definition of MAKE_DROPPABLE().
   3944  * 2. Don't drop a SYN request before its first timeout. This gives every
   3945  *    request at least until the first timeout to complete its 3-way handshake.
   3946  * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
   3947  *    requests currently on the queue that has timed out. This will be used
   3948  *    as an indicator of whether an attack is under way, so that appropriate
   3949  *    actions can be taken. (It's incremented in tcp_timer() and decremented
   3950  *    either when eager goes into ESTABLISHED, or gets freed up.)
   3951  * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
   3952  *    # of timeout drops back to <= q0len/32 => SYN alert off
         *
         * tcp is the listener; its tcp_eager_lock must be held by the caller.
         * The chosen eager is not closed here: it is taken off the droppable
         * list and tcp_clean_death_wrapper() is queued on the eager's squeue
         * with an extra conn reference.
         *
         * Returns B_TRUE if an eager was scheduled to be dropped, B_FALSE if the
         * droppable list is empty or the mblk needed to enter the squeue could
         * not be allocated.
   3953  */
   3954 static boolean_t
   3955 tcp_drop_q0(tcp_t *tcp)
   3956 {
   3957 	tcp_t	*eager;
   3958 	mblk_t	*mp;
   3959 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3960 
   3961 	ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
   3962 	ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
   3963 
   3964 	/* Pick oldest eager from the list of droppable eagers */
   3965 	eager = tcp->tcp_eager_prev_drop_q0;
   3966 
   3967 	/* If list is empty. return B_FALSE */
        	/* (an empty list has the listener itself as its tail) */
   3968 	if (eager == tcp) {
   3969 		return (B_FALSE);
   3970 	}
   3971 
   3972 	/* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
   3973 	if ((mp = allocb(0, BPRI_HI)) == NULL)
   3974 		return (B_FALSE);
   3975 
   3976 	/*
   3977 	 * Take this eager out from the list of droppable eagers since we are
   3978 	 * going to drop it.
   3979 	 */
   3980 	MAKE_UNDROPPABLE(eager);
   3981 
   3982 	if (tcp->tcp_connp->conn_debug) {
   3983 		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   3984 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
   3985 		    " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
   3986 		    tcp->tcp_conn_req_cnt_q0,
   3987 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
   3988 	}
   3989 
   3990 	BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
   3991 
   3992 	/* Put a reference on the conn as we are enqueueing it in the squeue */
   3993 	CONN_INC_REF(eager->tcp_connp);
   3994 
   3995 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   3996 	    tcp_clean_death_wrapper, eager->tcp_connp, NULL,
   3997 	    SQ_FILL, SQTAG_TCP_DROP_Q0);
   3998 
   3999 	return (B_TRUE);
   4000 }
   4001 
   4002 /*
   4003  * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
   4004  */
   4005 static mblk_t *
   4006 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4007     ip_recv_attr_t *ira)
   4008 {
   4009 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4010 	tcp_t		*tcp = connp->conn_tcp;
   4011 	mblk_t		*tpi_mp;
   4012 	ipha_t		*ipha;
   4013 	ip6_t		*ip6h;
   4014 	sin6_t 		sin6;
   4015 	uint_t		ifindex = ira->ira_ruifindex;
   4016 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4017 
   4018 	if (ira->ira_flags & IRAF_IS_IPV4) {
   4019 		ipha = (ipha_t *)mp->b_rptr;
   4020 
   4021 		connp->conn_ipversion = IPV4_VERSION;
   4022 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4023 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4024 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4025 
   4026 		sin6 = sin6_null;
   4027 		sin6.sin6_addr = connp->conn_faddr_v6;
   4028 		sin6.sin6_port = connp->conn_fport;
   4029 		sin6.sin6_family = AF_INET6;
   4030 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4031 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4032 
   4033 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4034 			sin6_t	sin6d;
   4035 
   4036 			sin6d = sin6_null;
   4037 			sin6d.sin6_addr = connp->conn_laddr_v6;
   4038 			sin6d.sin6_port = connp->conn_lport;
   4039 			sin6d.sin6_family = AF_INET;
   4040 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4041 			    (char *)&sin6d, sizeof (sin6_t),
   4042 			    (char *)&tcp,
   4043 			    (t_scalar_t)sizeof (intptr_t),
   4044 			    (char *)&sin6d, sizeof (sin6_t),
   4045 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4046 		} else {
   4047 			tpi_mp = mi_tpi_conn_ind(NULL,
   4048 			    (char *)&sin6, sizeof (sin6_t),
   4049 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4050 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4051 		}
   4052 	} else {
   4053 		ip6h = (ip6_t *)mp->b_rptr;
   4054 
   4055 		connp->conn_ipversion = IPV6_VERSION;
   4056 		connp->conn_laddr_v6 = ip6h->ip6_dst;
   4057 		connp->conn_faddr_v6 = ip6h->ip6_src;
   4058 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4059 
   4060 		sin6 = sin6_null;
   4061 		sin6.sin6_addr = connp->conn_faddr_v6;
   4062 		sin6.sin6_port = connp->conn_fport;
   4063 		sin6.sin6_family = AF_INET6;
   4064 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
   4065 		sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
   4066 		    IPCL_ZONEID(lconnp), tcps->tcps_netstack);
   4067 
   4068 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
   4069 			/* Pass up the scope_id of remote addr */
   4070 			sin6.sin6_scope_id = ifindex;
   4071 		} else {
   4072 			sin6.sin6_scope_id = 0;
   4073 		}
   4074 		if (connp->conn_recv_ancillary.crb_recvdstaddr) {
   4075 			sin6_t	sin6d;
   4076 
   4077 			sin6d = sin6_null;
   4078 			sin6.sin6_addr = connp->conn_laddr_v6;
   4079 			sin6d.sin6_port = connp->conn_lport;
   4080 			sin6d.sin6_family = AF_INET6;
   4081 			if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
   4082 				sin6d.sin6_scope_id = ifindex;
   4083 
   4084 			tpi_mp = mi_tpi_extconn_ind(NULL,
   4085 			    (char *)&sin6d, sizeof (sin6_t),
   4086 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4087 			    (char *)&sin6d, sizeof (sin6_t),
   4088 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4089 		} else {
   4090 			tpi_mp = mi_tpi_conn_ind(NULL,
   4091 			    (char *)&sin6, sizeof (sin6_t),
   4092 			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4093 			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4094 		}
   4095 	}
   4096 
   4097 	tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   4098 	return (tpi_mp);
   4099 }
   4100 
   4101 /* Handle a SYN on an AF_INET socket */
   4102 mblk_t *
   4103 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
   4104     ip_recv_attr_t *ira)
   4105 {
   4106 	tcp_t 		*ltcp = lconnp->conn_tcp;
   4107 	tcp_t		*tcp = connp->conn_tcp;
   4108 	sin_t		sin;
   4109 	mblk_t		*tpi_mp = NULL;
   4110 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   4111 	ipha_t		*ipha;
   4112 
   4113 	ASSERT(ira->ira_flags & IRAF_IS_IPV4);
   4114 	ipha = (ipha_t *)mp->b_rptr;
   4115 
   4116 	connp->conn_ipversion = IPV4_VERSION;
   4117 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
   4118 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
   4119 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
   4120 
   4121 	sin = sin_null;
   4122 	sin.sin_addr.s_addr = connp->conn_faddr_v4;
   4123 	sin.sin_port = connp->conn_fport;
   4124 	sin.sin_family = AF_INET;
   4125 	if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
   4126 		sin_t	sind;
   4127 
   4128 		sind = sin_null;
   4129 		sind.sin_addr.s_addr = connp->conn_laddr_v4;
   4130 		sind.sin_port = connp->conn_lport;
   4131 		sind.sin_family = AF_INET;
   4132 		tpi_mp = mi_tpi_extconn_ind(NULL,
   4133 		    (char *)&sind, sizeof (sin_t), (char *)&tcp,
   4134 		    (t_scalar_t)sizeof (intptr_t), (char *)&sind,
   4135 		    sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4136 	} else {
   4137 		tpi_mp = mi_tpi_conn_ind(NULL,
   4138 		    (char *)&sin, sizeof (sin_t),
   4139 		    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
   4140 		    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
   4141 	}
   4142 
   4143 	tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   4144 	return (tpi_mp);
   4145 }
   4146 
   4147 /*
   4148  * tcp_get_conn/tcp_free_conn
   4149  *
   4150  * tcp_get_conn is used to get a clean tcp connection structure.
   4151  * It tries to reuse the connections put on the freelist by the
   4152  * time_wait_collector failing which it goes to kmem_cache. This
   4153  * way has two benefits compared to just allocating from and
   4154  * freeing to kmem_cache.
   4155  * 1) The time_wait_collector can free (which includes the cleanup)
   4156  * outside the squeue. So when the interrupt comes, we have a clean
   4157  * connection sitting in the freelist. Obviously, this buys us
   4158  * performance.
   4159  *
   4160  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
   4161  * has multiple disadvantages - tying up the squeue during alloc.
   4162  * But allocating the conn/tcp in IP land is also not the best since
   4163  * we can't check the 'q' and 'q0' which are protected by squeue and
   4164  * blindly allocate memory which might have to be freed here if we are
   4165  * not allowed to accept the connection. By using the freelist and
   4166  * putting the conn/tcp back in freelist, we don't pay a penalty for
   4167  * allocating memory without checking 'q/q0' and freeing it if we can't
   4168  * accept the connection.
   4169  *
   4170  * Care should be taken to put the conn back in the same squeue's freelist
   4171  * from which it was allocated. Best results are obtained if conn is
   4172  * allocated from listener's squeue and freed to the same. Time wait
   4173  * collector will free up the freelist is the connection ends up sitting
   4174  * there for too long.
   4175  */
   4176 void *
   4177 tcp_get_conn(void *arg, tcp_stack_t *tcps)
   4178 {
   4179 	tcp_t			*tcp = NULL;
   4180 	conn_t			*connp = NULL;
   4181 	squeue_t		*sqp = (squeue_t *)arg;
   4182 	tcp_squeue_priv_t 	*tcp_time_wait;
   4183 	netstack_t		*ns;
   4184 	mblk_t			*tcp_rsrv_mp = NULL;
   4185 
   4186 	tcp_time_wait =
   4187 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   4188 
   4189 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   4190 	tcp = tcp_time_wait->tcp_free_list;
   4191 	ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
   4192 	if (tcp != NULL) {
   4193 		tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   4194 		tcp_time_wait->tcp_free_list_cnt--;
   4195 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4196 		tcp->tcp_time_wait_next = NULL;
   4197 		connp = tcp->tcp_connp;
   4198 		connp->conn_flags |= IPCL_REUSED;
   4199 
   4200 		ASSERT(tcp->tcp_tcps == NULL);
   4201 		ASSERT(connp->conn_netstack == NULL);
   4202 		ASSERT(tcp->tcp_rsrv_mp != NULL);
   4203 		ns = tcps->tcps_netstack;
   4204 		netstack_hold(ns);
   4205 		connp->conn_netstack = ns;
   4206 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
   4207 		tcp->tcp_tcps = tcps;
   4208 		ipcl_globalhash_insert(connp);
   4209 
   4210 		connp->conn_ixa->ixa_notify_cookie = tcp;
   4211 		ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
   4212 		connp->conn_recv = tcp_input_data;
   4213 		ASSERT(connp->conn_recvicmp == tcp_icmp_input);
   4214 		ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
   4215 		return ((void *)connp);
   4216 	}
   4217 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   4218 	/*
   4219 	 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
   4220 	 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
   4221 	 */
   4222 	tcp_rsrv_mp = allocb(0, BPRI_HI);
   4223 	if (tcp_rsrv_mp == NULL)
   4224 		return (NULL);
   4225 
   4226 	if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
   4227 	    tcps->tcps_netstack)) == NULL) {
   4228 		freeb(tcp_rsrv_mp);
   4229 		return (NULL);
   4230 	}
   4231 
   4232 	tcp = connp->conn_tcp;
   4233 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   4234 	mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
   4235 
   4236 	tcp->tcp_tcps = tcps;
   4237 
   4238 	connp->conn_recv = tcp_input_data;
   4239 	connp->conn_recvicmp = tcp_icmp_input;
   4240 	connp->conn_verifyicmp = tcp_verifyicmp;
   4241 
   4242 	/*
   4243 	 * Register tcp_notify to listen to capability changes detected by IP.
   4244 	 * This upcall is made in the context of the call to conn_ip_output
   4245 	 * thus it is inside the squeue.
   4246 	 */
   4247 	connp->conn_ixa->ixa_notify = tcp_notify;
   4248 	connp->conn_ixa->ixa_notify_cookie = tcp;
   4249 
   4250 	return ((void *)connp);
   4251 }
   4252 
   4253 /* BEGIN CSTYLED */
   4254 /*
   4255  *
   4256  * The sockfs ACCEPT path:
   4257  * =======================
   4258  *
   4259  * The eager is now established in its own perimeter as soon as SYN is
   4260  * received in tcp_input_listener(). When sockfs receives conn_ind, it
   4261  * completes the accept processing on the acceptor STREAM. The sending
   4262  * of conn_ind part is common for both sockfs listener and a TLI/XTI
   4263  * listener but a TLI/XTI listener completes the accept processing
   4264  * on the listener perimeter.
   4265  *
   4266  * Common control flow for 3 way handshake:
   4267  * ----------------------------------------
   4268  *
   4269  * incoming SYN (listener perimeter)	-> tcp_input_listener()
   4270  *
   4271  * incoming SYN-ACK-ACK (eager perim) 	-> tcp_input_data()
   4272  * send T_CONN_IND (listener perim)	-> tcp_send_conn_ind()
   4273  *
   4274  * Sockfs ACCEPT Path:
   4275  * -------------------
   4276  *
   4277  * open acceptor stream (tcp_open allocates tcp_tli_accept()
   4278  * as STREAM entry point)
   4279  *
   4280  * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
   4281  *
   4282  * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
   4283  * association (we are not behind eager's squeue but sockfs is protecting us
   4284  * and no one knows about this stream yet. The STREAMS entry point q->q_info
   4285  * is changed to point at tcp_wput().
   4286  *
   4287  * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
   4288  * listener (done on listener's perimeter).
   4289  *
   4290  * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish
   4291  * accept.
   4292  *
   4293  * TLI/XTI client ACCEPT path:
   4294  * ---------------------------
   4295  *
   4296  * soaccept() sends T_CONN_RES on the listener STREAM.
   4297  *
 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
 * an M_SETOPTS mblk to eager perimeter to finish accept (tcp_accept_finish()).
   4300  *
   4301  * Locks:
   4302  * ======
   4303  *
 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0
 * and listeners->tcp_eager_next_q.
   4306  *
   4307  * Referencing:
   4308  * ============
   4309  *
   4310  * 1) We start out in tcp_input_listener by eager placing a ref on
   4311  * listener and listener adding eager to listeners->tcp_eager_next_q0.
   4312  *
   4313  * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
   4314  * doing so we place a ref on the eager. This ref is finally dropped at the
   4315  * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
   4316  * reference is dropped by the squeue framework.
   4317  *
   4318  * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
   4319  *
   4320  * The reference must be released by the same entity that added the reference
   4321  * In the above scheme, the eager is the entity that adds and releases the
   4322  * references. Note that tcp_accept_finish executes in the squeue of the eager
   4323  * (albeit after it is attached to the acceptor stream). Though 1. executes
   4324  * in the listener's squeue, the eager is nascent at this point and the
   4325  * reference can be considered to have been added on behalf of the eager.
   4326  *
   4327  * Eager getting a Reset or listener closing:
   4328  * ==========================================
   4329  *
   4330  * Once the listener and eager are linked, the listener never does the unlink.
   4331  * If the listener needs to close, tcp_eager_cleanup() is called which queues
   4332  * a message on all eager perimeter. The eager then does the unlink, clears
   4333  * any pointers to the listener's queue and drops the reference to the
   4334  * listener. The listener waits in tcp_close outside the squeue until its
   4335  * refcount has dropped to 1. This ensures that the listener has waited for
   4336  * all eagers to clear their association with the listener.
   4337  *
   4338  * Similarly, if eager decides to go away, it can unlink itself and close.
   4339  * When the T_CONN_RES comes down, we check if eager has closed. Note that
   4340  * the reference to eager is still valid because of the extra ref we put
   4341  * in tcp_send_conn_ind.
   4342  *
   4343  * Listener can always locate the eager under the protection
   4344  * of the listener->tcp_eager_lock, and then do a refhold
   4345  * on the eager during the accept processing.
   4346  *
   4347  * The acceptor stream accesses the eager in the accept processing
   4348  * based on the ref placed on eager before sending T_conn_ind.
   4349  * The only entity that can negate this refhold is a listener close
   4350  * which is mutually exclusive with an active acceptor stream.
   4351  *
   4352  * Eager's reference on the listener
   4353  * ===================================
   4354  *
   4355  * If the accept happens (even on a closed eager) the eager drops its
   4356  * reference on the listener at the start of tcp_accept_finish. If the
   4357  * eager is killed due to an incoming RST before the T_conn_ind is sent up,
   4358  * the reference is dropped in tcp_closei_local. If the listener closes,
   4359  * the reference is dropped in tcp_eager_kill. In all cases the reference
   4360  * is dropped while executing in the eager's context (squeue).
   4361  */
   4362 /* END CSTYLED */
   4363 
   4364 /* Process the SYN packet, mp, directed at the listener 'tcp' */
   4365 
   4366 /*
   4367  * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
   4368  * tcp_input_data will not see any packets for listeners since the listener
   4369  * has conn_recv set to tcp_input_listener.
   4370  */
   4371 /* ARGSUSED */
   4372 void
   4373 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   4374 {
   4375 	tcpha_t		*tcpha;
   4376 	uint32_t	seg_seq;
   4377 	tcp_t		*eager;
   4378 	int		err;
   4379 	conn_t		*econnp = NULL;
   4380 	squeue_t	*new_sqp;
   4381 	mblk_t		*mp1;
   4382 	uint_t 		ip_hdr_len;
   4383 	conn_t		*lconnp = (conn_t *)arg;
   4384 	tcp_t		*listener = lconnp->conn_tcp;
   4385 	tcp_stack_t	*tcps = listener->tcp_tcps;
   4386 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
   4387 	uint_t		flags;
   4388 	mblk_t		*tpi_mp;
   4389 	uint_t		ifindex = ira->ira_ruifindex;
   4390 
   4391 	ip_hdr_len = ira->ira_ip_hdr_length;
   4392 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
   4393 	flags = (unsigned int)tcpha->tha_flags & 0xFF;
   4394 
   4395 	if (!(flags & TH_SYN)) {
   4396 		if ((flags & TH_RST) || (flags & TH_URG)) {
   4397 			freemsg(mp);
   4398 			return;
   4399 		}
   4400 		if (flags & TH_ACK) {
   4401 			/* Note this executes in listener's squeue */
   4402 			tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
   4403 			return;
   4404 		}
   4405 
   4406 		freemsg(mp);
   4407 		return;
   4408 	}
   4409 
   4410 	if (listener->tcp_state != TCPS_LISTEN)
   4411 		goto error2;
   4412 
   4413 	ASSERT(IPCL_IS_BOUND(lconnp));
   4414 
   4415 	mutex_enter(&listener->tcp_eager_lock);
   4416 	if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
   4417 		mutex_exit(&listener->tcp_eager_lock);
   4418 		TCP_STAT(tcps, tcp_listendrop);
   4419 		BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
   4420 		if (lconnp->conn_debug) {
   4421 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   4422 			    "tcp_input_listener: listen backlog (max=%d) "
   4423 			    "overflow (%d pending) on %s",
   4424 			    listener->tcp_conn_req_max,
   4425 			    listener->tcp_conn_req_cnt_q,
   4426 			    tcp_display(listener, NULL, DISP_PORT_ONLY));
   4427 		}
   4428 		goto error2;
   4429 	}
   4430 
   4431 	if (listener->tcp_conn_req_cnt_q0 >=
   4432 	    listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
   4433 		/*
   4434 		 * Q0 is full. Drop a pending half-open req from the queue
   4435 		 * to make room for the new SYN req. Also mark the time we
   4436 		 * drop a SYN.
   4437 		 *
   4438 		 * A more aggressive defense against SYN attack will
   4439 		 * be to set the "tcp_syn_defense" flag now.
   4440 		 */
   4441 		TCP_STAT(tcps, tcp_listendropq0);
   4442 		listener->tcp_last_rcv_lbolt = ddi_get_lbolt64();
   4443 		if (!tcp_drop_q0(listener)) {
   4444 			mutex_exit(&listener->tcp_eager_lock);
   4445 			BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
   4446 			if (lconnp->conn_debug) {
   4447 				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
   4448 				    "tcp_input_listener: listen half-open "
   4449 				    "queue (max=%d) full (%d pending) on %s",
   4450 				    tcps->tcps_conn_req_max_q0,
   4451 				    listener->tcp_conn_req_cnt_q0,
   4452 				    tcp_display(listener, NULL,
   4453 				    DISP_PORT_ONLY));
   4454 			}
   4455 			goto error2;
   4456 		}
   4457 	}
   4458 	mutex_exit(&listener->tcp_eager_lock);
   4459 
   4460 	/*
   4461 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   4462 	 * or based on the ring (for packets from GLD). Otherwise it is
   4463 	 * set based on lbolt i.e., a somewhat random number.
   4464 	 */
   4465 	ASSERT(ira->ira_sqp != NULL);
   4466 	new_sqp = ira->ira_sqp;
   4467 
   4468 	econnp = (conn_t *)tcp_get_conn(arg2, tcps);
   4469 	if (econnp == NULL)
   4470 		goto error2;
   4471 
   4472 	ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
   4473 	econnp->conn_sqp = new_sqp;
   4474 	econnp->conn_initial_sqp = new_sqp;
   4475 	econnp->conn_ixa->ixa_sqp = new_sqp;
   4476 
   4477 	econnp->conn_fport = tcpha->tha_lport;
   4478 	econnp->conn_lport = tcpha->tha_fport;
   4479 
   4480 	err = conn_inherit_parent(lconnp, econnp);
   4481 	if (err != 0)
   4482 		goto error3;
   4483 
   4484 	ASSERT(OK_32PTR(mp->b_rptr));
   4485 	ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
   4486 	    IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
   4487 
   4488 	if (lconnp->conn_family == AF_INET) {
   4489 		ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
   4490 		tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
   4491 	} else {
   4492 		tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
   4493 	}
   4494 
   4495 	if (tpi_mp == NULL)
   4496 		goto error3;
   4497 
   4498 	eager = econnp->conn_tcp;
   4499 	eager->tcp_detached = B_TRUE;
   4500 	SOCK_CONNID_INIT(eager->tcp_connid);
   4501 
   4502 	tcp_init_values(eager);
   4503 
   4504 	ASSERT((econnp->conn_ixa->ixa_flags &
   4505 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4506 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
   4507 	    (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   4508 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
   4509 
   4510 	if (!tcps->tcps_dev_flow_ctl)
   4511 		econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
   4512 
   4513 	/* Prepare for diffing against previous packets */
   4514 	eager->tcp_recvifindex = 0;
   4515 	eager->tcp_recvhops = 0xffffffffU;
   4516 
   4517 	if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
   4518 		if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
   4519 		    IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
   4520 			econnp->conn_incoming_ifindex = ifindex;
   4521 			econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   4522 			econnp->conn_ixa->ixa_scopeid = ifindex;
   4523 		}
   4524 	}
   4525 
   4526 	if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
   4527 	    (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
   4528 	    tcps->tcps_rev_src_routes) {
   4529 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
   4530 		ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
   4531 
   4532 		/* Source routing option copyover (reverse it) */
   4533 		err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
   4534 		if (err != 0) {
   4535 			freemsg(tpi_mp);
   4536 			goto error3;
   4537 		}
   4538 		ip_pkt_source_route_reverse_v4(ipp);
   4539 	}
   4540 
   4541 	ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
   4542 	ASSERT(!eager->tcp_tconnind_started);
   4543 	/*
   4544 	 * If the SYN came with a credential, it's a loopback packet or a
   4545 	 * labeled packet; attach the credential to the TPI message.
   4546 	 */
   4547 	if (ira->ira_cred != NULL)
   4548 		mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
   4549 
   4550 	eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
   4551 
   4552 	/* Inherit the listener's SSL protection state */
   4553 	if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) {
   4554 		kssl_hold_ent(eager->tcp_kssl_ent);
   4555 		eager->tcp_kssl_pending = B_TRUE;
   4556 	}
   4557 
   4558 	/* Inherit the listener's non-STREAMS flag */
   4559 	if (IPCL_IS_NONSTR(lconnp)) {
   4560 		econnp->conn_flags |= IPCL_NONSTR;
   4561 	}
   4562 
   4563 	ASSERT(eager->tcp_ordrel_mp == NULL);
   4564 
   4565 	if (!IPCL_IS_NONSTR(econnp)) {
   4566 		/*
   4567 		 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
   4568 		 * at close time, we will always have that to send up.
   4569 		 * Otherwise, we need to do special handling in case the
   4570 		 * allocation fails at that time.
   4571 		 */
   4572 		if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
   4573 			goto error3;
   4574 	}
   4575 	/*
   4576 	 * Now that the IP addresses and ports are setup in econnp we
   4577 	 * can do the IPsec policy work.
   4578 	 */
   4579 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   4580 		if (lconnp->conn_policy != NULL) {
   4581 			/*
   4582 			 * Inherit the policy from the listener; use
   4583 			 * actions from ira
   4584 			 */
   4585 			if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
   4586 				CONN_DEC_REF(econnp);
   4587 				freemsg(mp);
   4588 				goto error3;
   4589 			}
   4590 		}
   4591 	}
   4592 
   4593 	/* Inherit various TCP parameters from the listener */
   4594 	eager->tcp_naglim = listener->tcp_naglim;
   4595 	eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold;
   4596 	eager->tcp_second_timer_threshold =
   4597 	    listener->tcp_second_timer_threshold;
   4598 	eager->tcp_first_ctimer_threshold =
   4599 	    listener->tcp_first_ctimer_threshold;
   4600 	eager->tcp_second_ctimer_threshold =
   4601 	    listener->tcp_second_ctimer_threshold;
   4602 
   4603 	/*
   4604 	 * tcp_set_destination() may set tcp_rwnd according to the route
   4605 	 * metrics. If it does not, the eager's receive window will be set
   4606 	 * to the listener's receive window later in this function.
   4607 	 */
   4608 	eager->tcp_rwnd = 0;
   4609 
   4610 	/*
   4611 	 * Inherit listener's tcp_init_cwnd.  Need to do this before
   4612 	 * calling tcp_process_options() which set the initial cwnd.
   4613 	 */
   4614 	eager->tcp_init_cwnd = listener->tcp_init_cwnd;
   4615 
   4616 	if (is_system_labeled()) {
   4617 		ip_xmit_attr_t *ixa = econnp->conn_ixa;
   4618 
   4619 		ASSERT(ira->ira_tsl != NULL);
   4620 		/* Discard any old label */
   4621 		if (ixa->ixa_free_flags & IXA_FREE_TSL) {
   4622 			ASSERT(ixa->ixa_tsl != NULL);
   4623 			label_rele(ixa->ixa_tsl);
   4624 			ixa->ixa_free_flags &= ~IXA_FREE_TSL;
   4625 			ixa->ixa_tsl = NULL;
   4626 		}
   4627 		if ((lconnp->conn_mlp_type != mlptSingle ||
   4628 		    lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
   4629 		    ira->ira_tsl != NULL) {
   4630 			/*
   4631 			 * If this is an MLP connection or a MAC-Exempt
   4632 			 * connection with an unlabeled node, packets are to be
   4633 			 * exchanged using the security label of the received
   4634 			 * SYN packet instead of the server application's label.
   4635 			 * tsol_check_dest called from ip_set_destination
   4636 			 * might later update TSF_UNLABELED by replacing
   4637 			 * ixa_tsl with a new label.
   4638 			 */
   4639 			label_hold(ira->ira_tsl);
   4640 			ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
   4641 			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
   4642 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4643 		} else {
   4644 			ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
   4645 			DTRACE_PROBE2(syn_accept, conn_t *,
   4646 			    econnp, ts_label_t *, ixa->ixa_tsl)
   4647 		}
   4648 		/*
   4649 		 * conn_connect() called from tcp_set_destination will verify
   4650 		 * the destination is allowed to receive packets at the
   4651 		 * security label of the SYN-ACK we are generating. As part of
   4652 		 * that, tsol_check_dest() may create a new effective label for
   4653 		 * this connection.
   4654 		 * Finally conn_connect() will call conn_update_label.
   4655 		 * All that remains for TCP to do is to call
   4656 		 * conn_build_hdr_template which is done as part of
   4657 		 * tcp_set_destination.
   4658 		 */
   4659 	}
   4660 
   4661 	/*
   4662 	 * Since we will clear tcp_listener before we clear tcp_detached
   4663 	 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
   4664 	 * so we can tell a TCP_DETACHED_NONEAGER apart.
   4665 	 */
   4666 	eager->tcp_hard_binding = B_TRUE;
   4667 
   4668 	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
   4669 	    TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
   4670 
   4671 	CL_INET_CONNECT(econnp, B_FALSE, err);
   4672 	if (err != 0) {
   4673 		tcp_bind_hash_remove(eager);
   4674 		goto error3;
   4675 	}
   4676 
   4677 	/*
   4678 	 * No need to check for multicast destination since ip will only pass
   4679 	 * up multicasts to those that have expressed interest
   4680 	 * TODO: what about rejecting broadcasts?
   4681 	 * Also check that source is not a multicast or broadcast address.
   4682 	 */
   4683 	eager->tcp_state = TCPS_SYN_RCVD;
   4684 	SOCK_CONNID_BUMP(eager->tcp_connid);
   4685 
   4686 	/*
   4687 	 * Adapt our mss, ttl, ... based on the remote address.
   4688 	 */
   4689 
   4690 	if (tcp_set_destination(eager) != 0) {
   4691 		BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   4692 		/* Undo the bind_hash_insert */
   4693 		tcp_bind_hash_remove(eager);
   4694 		goto error3;
   4695 	}
   4696 
   4697 	/* Process all TCP options. */
   4698 	tcp_process_options(eager, tcpha);
   4699 
   4700 	/* Is the other end ECN capable? */
   4701 	if (tcps->tcps_ecn_permitted >= 1 &&
   4702 	    (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
   4703 		eager->tcp_ecn_ok = B_TRUE;
   4704 	}
   4705 
   4706 	/*
   4707 	 * The listener's conn_rcvbuf should be the default window size or a
   4708 	 * window size changed via SO_RCVBUF option. First round up the
   4709 	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
   4710 	 * scale option value if needed. Call tcp_rwnd_set() to finish the
   4711 	 * setting.
   4712 	 *
   4713 	 * Note if there is a rpipe metric associated with the remote host,
   4714 	 * we should not inherit receive window size from listener.
   4715 	 */
   4716 	eager->tcp_rwnd = MSS_ROUNDUP(
   4717 	    (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
   4718 	    eager->tcp_rwnd), eager->tcp_mss);
   4719 	if (eager->tcp_snd_ws_ok)
   4720 		tcp_set_ws_value(eager);
   4721 	/*
   4722 	 * Note that this is the only place tcp_rwnd_set() is called for
   4723 	 * accepting a connection.  We need to call it here instead of
   4724 	 * after the 3-way handshake because we need to tell the other
   4725 	 * side our rwnd in the SYN-ACK segment.
   4726 	 */
   4727 	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
   4728 
   4729 	ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
   4730 	    eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
   4731 
   4732 	ASSERT(econnp->conn_rcvbuf != 0 &&
   4733 	    econnp->conn_rcvbuf == eager->tcp_rwnd);
   4734 
   4735 	/* Put a ref on the listener for the eager. */
   4736 	CONN_INC_REF(lconnp);
   4737 	mutex_enter(&listener->tcp_eager_lock);
   4738 	listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
   4739 	eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
   4740 	listener->tcp_eager_next_q0 = eager;
   4741 	eager->tcp_eager_prev_q0 = listener;
   4742 
   4743 	/* Set tcp_listener before adding it to tcp_conn_fanout */
   4744 	eager->tcp_listener = listener;
   4745 	eager->tcp_saved_listener = listener;
   4746 
   4747 	/*
   4748 	 * Tag this detached tcp vector for later retrieval
   4749 	 * by our listener client in tcp_accept().
   4750 	 */
   4751 	eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
   4752 	listener->tcp_conn_req_cnt_q0++;
   4753 	if (++listener->tcp_conn_req_seqnum == -1) {
   4754 		/*
   4755 		 * -1 is "special" and defined in TPI as something
   4756 		 * that should never be used in T_CONN_IND
   4757 		 */
   4758 		++listener->tcp_conn_req_seqnum;
   4759 	}
   4760 	mutex_exit(&listener->tcp_eager_lock);
   4761 
   4762 	if (listener->tcp_syn_defense) {
   4763 		/* Don't drop the SYN that comes from a good IP source */
   4764 		ipaddr_t *addr_cache;
   4765 
   4766 		addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
   4767 		if (addr_cache != NULL && econnp->conn_faddr_v4 ==
   4768 		    addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
   4769 			eager->tcp_dontdrop = B_TRUE;
   4770 		}
   4771 	}
   4772 
   4773 	/*
   4774 	 * We need to insert the eager in its own perimeter but as soon
   4775 	 * as we do that, we expose the eager to the classifier and
   4776 	 * should not touch any field outside the eager's perimeter.
   4777 	 * So do all the work necessary before inserting the eager
   4778 	 * in its own perimeter. Be optimistic that conn_connect()
   4779 	 * will succeed but undo everything if it fails.
   4780 	 */
   4781 	seg_seq = ntohl(tcpha->tha_seq);
   4782 	eager->tcp_irs = seg_seq;
   4783 	eager->tcp_rack = seg_seq;
   4784 	eager->tcp_rnxt = seg_seq + 1;
   4785 	eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
   4786 	BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
   4787 	eager->tcp_state = TCPS_SYN_RCVD;
   4788 	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
   4789 	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
   4790 	if (mp1 == NULL) {
   4791 		/*
   4792 		 * Increment the ref count as we are going to
   4793 		 * enqueueing an mp in squeue
   4794 		 */
   4795 		CONN_INC_REF(econnp);
   4796 		goto error;
   4797 	}
   4798 
   4799 	/*
   4800 	 * We need to start the rto timer. In normal case, we start
   4801 	 * the timer after sending the packet on the wire (or at
   4802 	 * least believing that packet was sent by waiting for
   4803 	 * conn_ip_output() to return). Since this is the first packet
   4804 	 * being sent on the wire for the eager, our initial tcp_rto
   4805 	 * is at least tcp_rexmit_interval_min which is a fairly
   4806 	 * large value to allow the algorithm to adjust slowly to large
   4807 	 * fluctuations of RTT during first few transmissions.
   4808 	 *
   4809 	 * Starting the timer first and then sending the packet in this
   4810 	 * case shouldn't make much difference since tcp_rexmit_interval_min
   4811 	 * is of the order of several 100ms and starting the timer
   4812 	 * first and then sending the packet will result in difference
   4813 	 * of few micro seconds.
   4814 	 *
   4815 	 * Without this optimization, we are forced to hold the fanout
   4816 	 * lock across the ipcl_bind_insert() and sending the packet
   4817 	 * so that we don't race against an incoming packet (maybe RST)
   4818 	 * for this eager.
   4819 	 *
   4820 	 * It is necessary to acquire an extra reference on the eager
   4821 	 * at this point and hold it until after tcp_send_data() to
   4822 	 * ensure against an eager close race.
   4823 	 */
   4824 
   4825 	CONN_INC_REF(econnp);
   4826 
   4827 	TCP_TIMER_RESTART(eager, eager->tcp_rto);
   4828 
   4829 	/*
   4830 	 * Insert the eager in its own perimeter now. We are ready to deal
   4831 	 * with any packets on eager.
   4832 	 */
   4833 	if (ipcl_conn_insert(econnp) != 0)
   4834 		goto error;
   4835 
   4836 	/*
   4837 	 * Send the SYN-ACK. Can't use tcp_send_data since we can't update
   4838 	 * pmtu etc; we are not on the eager's squeue
   4839 	 */
   4840 	ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
   4841 	(void) conn_ip_output(mp1, econnp->conn_ixa);
   4842 	CONN_DEC_REF(econnp);
   4843 	freemsg(mp);
   4844 
   4845 	return;
   4846 error:
   4847 	freemsg(mp1);
   4848 	eager->tcp_closemp_used = B_TRUE;
   4849 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   4850 	mp1 = &eager->tcp_closemp;
   4851 	SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
   4852 	    econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
   4853 
   4854 	/*
   4855 	 * If a connection already exists, send the mp to that connections so
   4856 	 * that it can be appropriately dealt with.
   4857 	 */
   4858 	ipst = tcps->tcps_netstack->netstack_ip;
   4859 
   4860 	if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
   4861 		if (!IPCL_IS_CONNECTED(econnp)) {
   4862 			/*
   4863 			 * Something bad happened. ipcl_conn_insert()
   4864 			 * failed because a connection already existed
   4865 			 * in connected hash but we can't find it
   4866 			 * anymore (someone blew it away). Just
   4867 			 * free this message and hopefully remote
   4868 			 * will retransmit at which time the SYN can be
   4869 			 * treated as a new connection or dealth with
   4870 			 * a TH_RST if a connection already exists.
   4871 			 */
   4872 			CONN_DEC_REF(econnp);
   4873 			freemsg(mp);
   4874 		} else {
   4875 			SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
   4876 			    econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
   4877 		}
   4878 	} else {
   4879 		/* Nobody wants this packet */
   4880 		freemsg(mp);
   4881 	}
   4882 	return;
   4883 error3:
   4884 	CONN_DEC_REF(econnp);
   4885 error2:
   4886 	freemsg(mp);
   4887 }
   4888 
   4889 /*
   4890  * In an ideal case of vertical partition in NUMA architecture, its
   4891  * beneficial to have the listener and all the incoming connections
   4892  * tied to the same squeue. The other constraint is that incoming
   4893  * connections should be tied to the squeue attached to interrupted
   4894  * CPU for obvious locality reason so this leaves the listener to
   4895  * be tied to the same squeue. Our only problem is that when listener
   4896  * is binding, the CPU that will get interrupted by the NIC whose
   4897  * IP address the listener is binding to is not even known. So
   4898  * the code below allows us to change that binding at the time the
   4899  * CPU is interrupted by virtue of incoming connection's squeue.
   4900  *
   4901  * This is usefull only in case of a listener bound to a specific IP
   4902  * address. For other kind of listeners, they get bound the
   4903  * very first time and there is no attempt to rebind them.
   4904  */
   4905 void
   4906 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
   4907     ip_recv_attr_t *ira)
   4908 {
   4909 	conn_t		*connp = (conn_t *)arg;
   4910 	squeue_t	*sqp = (squeue_t *)arg2;
   4911 	squeue_t	*new_sqp;
   4912 	uint32_t	conn_flags;
   4913 
   4914 	/*
   4915 	 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
   4916 	 * or based on the ring (for packets from GLD). Otherwise it is
   4917 	 * set based on lbolt i.e., a somewhat random number.
   4918 	 */
   4919 	ASSERT(ira->ira_sqp != NULL);
   4920 	new_sqp = ira->ira_sqp;
   4921 
   4922 	if (connp->conn_fanout == NULL)
   4923 		goto done;
   4924 
   4925 	if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
   4926 		mutex_enter(&connp->conn_fanout->connf_lock);
   4927 		mutex_enter(&connp->conn_lock);
   4928 		/*
   4929 		 * No one from read or write side can access us now
   4930 		 * except for already queued packets on this squeue.
   4931 		 * But since we haven't changed the squeue yet, they
   4932 		 * can't execute. If they are processed after we have
   4933 		 * changed the squeue, they are sent back to the
   4934 		 * correct squeue down below.
   4935 		 * But a listner close can race with processing of
   4936 		 * incoming SYN. If incoming SYN processing changes
   4937 		 * the squeue then the listener close which is waiting
   4938 		 * to enter the squeue would operate on the wrong
   4939 		 * squeue. Hence we don't change the squeue here unless
   4940 		 * the refcount is exactly the minimum refcount. The
   4941 		 * minimum refcount of 4 is counted as - 1 each for
   4942 		 * TCP and IP, 1 for being in the classifier hash, and
   4943 		 * 1 for the mblk being processed.
   4944 		 */
   4945 
   4946 		if (connp->conn_ref != 4 ||
   4947 		    connp->conn_tcp->tcp_state != TCPS_LISTEN) {
   4948 			mutex_exit(&connp->conn_lock);
   4949 			mutex_exit(&connp->conn_fanout->connf_lock);
   4950 			goto done;
   4951 		}
   4952 		if (connp->conn_sqp != new_sqp) {
   4953 			while (connp->conn_sqp != new_sqp)
   4954 				(void) casptr(&connp->conn_sqp, sqp, new_sqp);
   4955 			/* No special MT issues for outbound ixa_sqp hint */
   4956 			connp->conn_ixa->ixa_sqp = new_sqp;
   4957 		}
   4958 
   4959 		do {
   4960 			conn_flags = connp->conn_flags;
   4961 			conn_flags |= IPCL_FULLY_BOUND;
   4962 			(void) cas32(&connp->conn_flags, connp->conn_flags,
   4963 			    conn_flags);
   4964 		} while (!(connp->conn_flags & IPCL_FULLY_BOUND));
   4965 
   4966 		mutex_exit(&connp->conn_fanout->connf_lock);
   4967 		mutex_exit(&connp->conn_lock);
   4968 
   4969 		/*
   4970 		 * Assume we have picked a good squeue for the listener. Make
   4971 		 * subsequent SYNs not try to change the squeue.
   4972 		 */
   4973 		connp->conn_recv = tcp_input_listener;
   4974 	}
   4975 
   4976 done:
   4977 	if (connp->conn_sqp != sqp) {
   4978 		CONN_INC_REF(connp);
   4979 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
   4980 		    ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
   4981 	} else {
   4982 		tcp_input_listener(connp, mp, sqp, ira);
   4983 	}
   4984 }
   4985 
   4986 /*
   4987  * Successful connect request processing begins when our client passes
   4988  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
   4989  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
   4990  *
   4991  * After various error checks are completed, tcp_tpi_connect() lays
   4992  * the target address and port into the composite header template.
   4993  * Then we ask IP for information, including a source address if we didn't
   4994  * already have one. Finally we prepare to send the SYN packet, and then
   4995  * send up the T_OK_ACK reply message.
   4996  */
   4997 static void
   4998 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
   4999 {
   5000 	sin_t		*sin;
   5001 	struct T_conn_req	*tcr;
   5002 	struct sockaddr	*sa;
   5003 	socklen_t	len;
   5004 	int		error;
   5005 	cred_t		*cr;
   5006 	pid_t		cpid;
   5007 	conn_t		*connp = tcp->tcp_connp;
   5008 	queue_t		*q = connp->conn_wq;
   5009 
   5010 	/*
   5011 	 * All Solaris components should pass a db_credp
   5012 	 * for this TPI message, hence we ASSERT.
   5013 	 * But in case there is some other M_PROTO that looks
   5014 	 * like a TPI message sent by some other kernel
   5015 	 * component, we check and return an error.
   5016 	 */
   5017 	cr = msg_getcred(mp, &cpid);
   5018 	ASSERT(cr != NULL);
   5019 	if (cr == NULL) {
   5020 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5021 		return;
   5022 	}
   5023 
   5024 	tcr = (struct T_conn_req *)mp->b_rptr;
   5025 
   5026 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5027 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   5028 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5029 		return;
   5030 	}
   5031 
   5032 	/*
   5033 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
   5034 	 * will always have that to send up.  Otherwise, we need to do
   5035 	 * special handling in case the allocation fails at that time.
   5036 	 * If the end point is TPI, the tcp_t can be reused and the
   5037 	 * tcp_ordrel_mp may be allocated already.
   5038 	 */
   5039 	if (tcp->tcp_ordrel_mp == NULL) {
   5040 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
   5041 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5042 			return;
   5043 		}
   5044 	}
   5045 
   5046 	/*
   5047 	 * Determine packet type based on type of address passed in
   5048 	 * the request should contain an IPv4 or IPv6 address.
   5049 	 * Make sure that address family matches the type of
   5050 	 * family of the address passed down.
   5051 	 */
   5052 	switch (tcr->DEST_length) {
   5053 	default:
   5054 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   5055 		return;
   5056 
   5057 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
   5058 		/*
   5059 		 * XXX: The check for valid DEST_length was not there
   5060 		 * in earlier releases and some buggy
   5061 		 * TLI apps (e.g Sybase) got away with not feeding
   5062 		 * in sin_zero part of address.
   5063 		 * We allow that bug to keep those buggy apps humming.
   5064 		 * Test suites require the check on DEST_length.
   5065 		 * We construct a new mblk with valid DEST_length
   5066 		 * free the original so the rest of the code does
   5067 		 * not have to keep track of this special shorter
   5068 		 * length address case.
   5069 		 */
   5070 		mblk_t *nmp;
   5071 		struct T_conn_req *ntcr;
   5072 		sin_t *nsin;
   5073 
   5074 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
   5075 		    tcr->OPT_length, BPRI_HI);
   5076 		if (nmp == NULL) {
   5077 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   5078 			return;
   5079 		}
   5080 		ntcr = (struct T_conn_req *)nmp->b_rptr;
   5081 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
   5082 		ntcr->PRIM_type = T_CONN_REQ;
   5083 		ntcr->DEST_length = sizeof (sin_t);
   5084 		ntcr->DEST_offset = sizeof (struct T_conn_req);
   5085 
   5086 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
   5087 		*nsin = sin_null;
   5088 		/* Get pointer to shorter address to copy from original mp */
   5089 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
   5090 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
   5091 		if (sin == NULL || !OK_32PTR((char *)sin)) {
   5092 			freemsg(nmp);
   5093 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   5094 			return;
   5095 		}
   5096 		nsin->sin_family = sin->sin_family;
   5097 		nsin->sin_port = sin->sin_port;
   5098 		nsin->sin_addr = sin->sin_addr;
   5099 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
   5100 		nmp->b_wptr = (uchar_t *)&nsin[1];
   5101 		if (tcr->OPT_length != 0) {
   5102 			ntcr->OPT_length = tcr->OPT_length;
   5103 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
   5104 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
   5105 			    (uchar_t *)ntcr + ntcr->OPT_offset,
   5106 			    tcr->OPT_length);
   5107 			nmp->b_wptr += tcr->OPT_length;
   5108 		}
   5109 		freemsg(mp);	/* original mp freed */
   5110 		mp = nmp;	/* re-initialize original variables */
   5111 		tcr = ntcr;
   5112 	}
   5113 	/* FALLTHRU */
   5114 
   5115 	case sizeof (sin_t):
   5116 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5117 		    sizeof (sin_t));
   5118 		len = sizeof (sin_t);
   5119 		break;
   5120 
   5121 	case sizeof (sin6_t):
   5122 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
   5123 		    sizeof (sin6_t));
   5124 		len = sizeof (sin6_t);
   5125 		break;
   5126 	}
   5127 
   5128 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
   5129 	if (error != 0) {
   5130 		tcp_err_ack(tcp, mp, TSYSERR, error);
   5131 		return;
   5132 	}
   5133 
   5134 	/*
   5135 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
   5136 	 * should key on their sequence number and cut them loose.
   5137 	 */
   5138 
   5139 	/*
   5140 	 * If options passed in, feed it for verification and handling
   5141 	 */
   5142 	if (tcr->OPT_length != 0) {
   5143 		mblk_t	*ok_mp;
   5144 		mblk_t	*discon_mp;
   5145 		mblk_t  *conn_opts_mp;
   5146 		int t_error, sys_error, do_disconnect;
   5147 
   5148 		conn_opts_mp = NULL;
   5149 
   5150 		if (tcp_conprim_opt_process(tcp, mp,
   5151 		    &do_disconnect, &t_error, &sys_error) < 0) {
   5152 			if (do_disconnect) {
   5153 				ASSERT(t_error == 0 && sys_error == 0);
   5154 				discon_mp = mi_tpi_discon_ind(NULL,
   5155 				    ECONNREFUSED, 0);
   5156 				if (!discon_mp) {
   5157 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5158 					    TSYSERR, ENOMEM);
   5159 					return;
   5160 				}
   5161 				ok_mp = mi_tpi_ok_ack_alloc(mp);
   5162 				if (!ok_mp) {
   5163 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5164 					    TSYSERR, ENOMEM);
   5165 					return;
   5166 				}
   5167 				qreply(q, ok_mp);
   5168 				qreply(q, discon_mp); /* no flush! */
   5169 			} else {
   5170 				ASSERT(t_error != 0);
   5171 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
   5172 				    sys_error);
   5173 			}
   5174 			return;
   5175 		}
   5176 		/*
   5177 		 * Success in setting options, the mp option buffer represented
   5178 		 * by OPT_length/offset has been potentially modified and
   5179 		 * contains results of option processing. We copy it in
   5180 		 * another mp to save it for potentially influencing returning
   5181 		 * it in T_CONN_CONN.
   5182 		 */
   5183 		if (tcr->OPT_length != 0) { /* there are resulting options */
   5184 			conn_opts_mp = copyb(mp);
   5185 			if (!conn_opts_mp) {
   5186 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
   5187 				    TSYSERR, ENOMEM);
   5188 				return;
   5189 			}
   5190 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
   5191 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
   5192 			/*
   5193 			 * Note:
   5194 			 * These resulting option negotiation can include any
   5195 			 * end-to-end negotiation options but there no such
   5196 			 * thing (yet?) in our TCP/IP.
   5197 			 */
   5198 		}
   5199 	}
   5200 
   5201 	/* call the non-TPI version */
   5202 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
   5203 	if (error < 0) {
   5204 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
   5205 	} else if (error > 0) {
   5206 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
   5207 	} else {
   5208 		mp = mi_tpi_ok_ack_alloc(mp);
   5209 	}
   5210 
   5211 	/*
   5212 	 * Note: Code below is the "failure" case
   5213 	 */
   5214 	/* return error ack and blow away saved option results if any */
   5215 connect_failed:
   5216 	if (mp != NULL)
   5217 		putnext(connp->conn_rq, mp);
   5218 	else {
   5219 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
   5220 		    TSYSERR, ENOMEM);
   5221 	}
   5222 }
   5223 
   5224 /*
   5225  * Handle connect to IPv4 destinations, including connections for AF_INET6
   5226  * sockets connecting to IPv4 mapped IPv6 destinations.
   5227  * Returns zero if OK, a positive errno, or a negative TLI error.
   5228  */
   5229 static int
   5230 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
   5231     uint_t srcid)
   5232 {
   5233 	ipaddr_t 	dstaddr = *dstaddrp;
   5234 	uint16_t 	lport;
   5235 	conn_t		*connp = tcp->tcp_connp;
   5236 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5237 	int		error;
   5238 
   5239 	ASSERT(connp->conn_ipversion == IPV4_VERSION);
   5240 
   5241 	/* Check for attempt to connect to INADDR_ANY */
   5242 	if (dstaddr == INADDR_ANY)  {
   5243 		/*
   5244 		 * SunOS 4.x and 4.3 BSD allow an application
   5245 		 * to connect a TCP socket to INADDR_ANY.
   5246 		 * When they do this, the kernel picks the
   5247 		 * address of one interface and uses it
   5248 		 * instead.  The kernel usually ends up
   5249 		 * picking the address of the loopback
   5250 		 * interface.  This is an undocumented feature.
   5251 		 * However, we provide the same thing here
   5252 		 * in order to have source and binary
   5253 		 * compatibility with SunOS 4.x.
   5254 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5255 		 * generate the T_CONN_CON.
   5256 		 */
   5257 		dstaddr = htonl(INADDR_LOOPBACK);
   5258 		*dstaddrp = dstaddr;
   5259 	}
   5260 
   5261 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5262 	if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) {
   5263 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5264 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5265 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5266 	}
   5267 
   5268 	IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6);
   5269 	connp->conn_fport = dstport;
   5270 
   5271 	/*
   5272 	 * At this point the remote destination address and remote port fields
   5273 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5274 	 * have to see which state tcp was in so we can take appropriate action.
   5275 	 */
   5276 	if (tcp->tcp_state == TCPS_IDLE) {
   5277 		/*
   5278 		 * We support a quick connect capability here, allowing
   5279 		 * clients to transition directly from IDLE to SYN_SENT
   5280 		 * tcp_bindi will pick an unused port, insert the connection
   5281 		 * in the bind hash and transition to BOUND state.
   5282 		 */
   5283 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5284 		    tcp, B_TRUE);
   5285 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5286 		    B_FALSE, B_FALSE);
   5287 		if (lport == 0)
   5288 			return (-TNOADDR);
   5289 	}
   5290 
   5291 	/*
   5292 	 * Lookup the route to determine a source address and the uinfo.
   5293 	 * Setup TCP parameters based on the metrics/DCE.
   5294 	 */
   5295 	error = tcp_set_destination(tcp);
   5296 	if (error != 0)
   5297 		return (error);
   5298 
   5299 	/*
   5300 	 * Don't let an endpoint connect to itself.
   5301 	 */
   5302 	if (connp->conn_faddr_v4 == connp->conn_laddr_v4 &&
   5303 	    connp->conn_fport == connp->conn_lport)
   5304 		return (-TBADADDR);
   5305 
   5306 	tcp->tcp_state = TCPS_SYN_SENT;
   5307 
   5308 	return (ipcl_conn_insert_v4(connp));
   5309 }
   5310 
   5311 /*
   5312  * Handle connect to IPv6 destinations.
   5313  * Returns zero if OK, a positive errno, or a negative TLI error.
   5314  */
   5315 static int
   5316 tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
   5317     uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
   5318 {
   5319 	uint16_t 	lport;
   5320 	conn_t		*connp = tcp->tcp_connp;
   5321 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5322 	int		error;
   5323 
   5324 	ASSERT(connp->conn_family == AF_INET6);
   5325 
   5326 	/*
   5327 	 * If we're here, it means that the destination address is a native
   5328 	 * IPv6 address.  Return an error if conn_ipversion is not IPv6.  A
   5329 	 * reason why it might not be IPv6 is if the socket was bound to an
   5330 	 * IPv4-mapped IPv6 address.
   5331 	 */
   5332 	if (connp->conn_ipversion != IPV6_VERSION)
   5333 		return (-TBADADDR);
   5334 
   5335 	/*
   5336 	 * Interpret a zero destination to mean loopback.
   5337 	 * Update the T_CONN_REQ (sin/sin6) since it is used to
   5338 	 * generate the T_CONN_CON.
   5339 	 */
   5340 	if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp))
   5341 		*dstaddrp = ipv6_loopback;
   5342 
   5343 	/* Handle __sin6_src_id if socket not bound to an IP address */
   5344 	if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
   5345 		ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
   5346 		    IPCL_ZONEID(connp), tcps->tcps_netstack);
   5347 		connp->conn_saddr_v6 = connp->conn_laddr_v6;
   5348 	}
   5349 
   5350 	/*
   5351 	 * Take care of the scope_id now.
   5352 	 */
   5353 	if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
   5354 		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
   5355 		connp->conn_ixa->ixa_scopeid = scope_id;
   5356 	} else {
   5357 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
   5358 	}
   5359 
   5360 	connp->conn_flowinfo = flowinfo;
   5361 	connp->conn_faddr_v6 = *dstaddrp;
   5362 	connp->conn_fport = dstport;
   5363 
   5364 	/*
   5365 	 * At this point the remote destination address and remote port fields
   5366 	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
   5367 	 * have to see which state tcp was in so we can take appropriate action.
   5368 	 */
   5369 	if (tcp->tcp_state == TCPS_IDLE) {
   5370 		/*
   5371 		 * We support a quick connect capability here, allowing
   5372 		 * clients to transition directly from IDLE to SYN_SENT
   5373 		 * tcp_bindi will pick an unused port, insert the connection
   5374 		 * in the bind hash and transition to BOUND state.
   5375 		 */
   5376 		lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
   5377 		    tcp, B_TRUE);
   5378 		lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
   5379 		    B_FALSE, B_FALSE);
   5380 		if (lport == 0)
   5381 			return (-TNOADDR);
   5382 	}
   5383 
   5384 	/*
   5385 	 * Lookup the route to determine a source address and the uinfo.
   5386 	 * Setup TCP parameters based on the metrics/DCE.
   5387 	 */
   5388 	error = tcp_set_destination(tcp);
   5389 	if (error != 0)
   5390 		return (error);
   5391 
   5392 	/*
   5393 	 * Don't let an endpoint connect to itself.
   5394 	 */
   5395 	if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) &&
   5396 	    connp->conn_fport == connp->conn_lport)
   5397 		return (-TBADADDR);
   5398 
   5399 	tcp->tcp_state = TCPS_SYN_SENT;
   5400 
   5401 	return (ipcl_conn_insert_v6(connp));
   5402 }
   5403 
   5404 /*
   5405  * Disconnect
   5406  * Note that unlike other functions this returns a positive tli error
   5407  * when it fails; it never returns an errno.
   5408  */
   5409 static int
   5410 tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
   5411 {
   5412 	conn_t		*lconnp;
   5413 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   5414 	conn_t		*connp = tcp->tcp_connp;
   5415 
   5416 	/*
   5417 	 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
   5418 	 * when the stream is in BOUND state. Do not send a reset,
   5419 	 * since the destination IP address is not valid, and it can
   5420 	 * be the initialized value of all zeros (broadcast address).
   5421 	 */
   5422 	if (tcp->tcp_state <= TCPS_BOUND) {
   5423 		if (connp->conn_debug) {
   5424 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   5425 			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
   5426 		}
   5427 		return (TOUTSTATE);
   5428 	}
   5429 
   5430 
   5431 	if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
   5432 
   5433 		/*
   5434 		 * According to TPI, for non-listeners, ignore seqnum
   5435 		 * and disconnect.
   5436 		 * Following interpretation of -1 seqnum is historical
   5437 		 * and implied TPI ? (TPI only states that for T_CONN_IND,
   5438 		 * a valid seqnum should not be -1).
   5439 		 *
   5440 		 *	-1 means disconnect everything
   5441 		 *	regardless even on a listener.
   5442 		 */
   5443 
   5444 		int old_state = tcp->tcp_state;
   5445 		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
   5446 
   5447 		/*
   5448 		 * The connection can't be on the tcp_time_wait_head list
   5449 		 * since it is not detached.
   5450 		 */
   5451 		ASSERT(tcp->tcp_time_wait_next == NULL);
   5452 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   5453 		ASSERT(tcp->tcp_time_wait_expire == 0);
   5454 		/*
   5455 		 * If it used to be a listener, check to make sure no one else
   5456 		 * has taken the port before switching back to LISTEN state.
   5457 		 */
   5458 		if (connp->conn_ipversion == IPV4_VERSION) {
   5459 			lconnp = ipcl_lookup_listener_v4(connp->conn_lport,
   5460 			    connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst);
   5461 		} else {
   5462 			uint_t ifindex = 0;
   5463 
   5464 			if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)
   5465 				ifindex = connp->conn_ixa->ixa_scopeid;
   5466 
   5467 			/* Allow conn_bound_if listeners? */
   5468 			lconnp = ipcl_lookup_listener_v6(connp->conn_lport,
   5469 			    &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp),
   5470 			    ipst);
   5471 		}
   5472 		if (tcp->tcp_conn_req_max && lconnp == NULL) {
   5473 			tcp->tcp_state = TCPS_LISTEN;
   5474 		} else if (old_state > TCPS_BOUND) {
   5475 			tcp->tcp_conn_req_max = 0;
   5476 			tcp->tcp_state = TCPS_BOUND;
   5477 		}
   5478 		if (lconnp != NULL)
   5479 			CONN_DEC_REF(lconnp);
   5480 		if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
   5481 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   5482 		} else if (old_state == TCPS_ESTABLISHED ||
   5483 		    old_state == TCPS_CLOSE_WAIT) {
   5484 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   5485 		}
   5486 
   5487 		if (tcp->tcp_fused)
   5488 			tcp_unfuse(tcp);
   5489 
   5490 		mutex_enter(&tcp->tcp_eager_lock);
   5491 		if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
   5492 		    (tcp->tcp_conn_req_cnt_q != 0)) {
   5493 			tcp_eager_cleanup(tcp, 0);
   5494 		}
   5495 		mutex_exit(&tcp->tcp_eager_lock);
   5496 
   5497 		tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
   5498 		    tcp->tcp_rnxt, TH_RST | TH_ACK);
   5499 
   5500 		tcp_reinit(tcp);
   5501 
   5502 		return (0);
   5503 	} else if (!tcp_eager_blowoff(tcp, seqnum)) {
   5504 		return (TBADSEQ);
   5505 	}
   5506 	return (0);
   5507 }
   5508 
   5509 /*
   5510  * Our client hereby directs us to reject the connection request
   5511  * that tcp_input_listener() marked with 'seqnum'.  Rejection consists
   5512  * of sending the appropriate RST, not an ICMP error.
   5513  */
   5514 static void
   5515 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
   5516 {
   5517 	t_scalar_t seqnum;
   5518 	int	error;
   5519 	conn_t	*connp = tcp->tcp_connp;
   5520 
   5521 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   5522 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
   5523 		tcp_err_ack(tcp, mp, TPROTO, 0);
   5524 		return;
   5525 	}
   5526 	seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
   5527 	error = tcp_disconnect_common(tcp, seqnum);
   5528 	if (error != 0)
   5529 		tcp_err_ack(tcp, mp, error, 0);
   5530 	else {
   5531 		if (tcp->tcp_state >= TCPS_ESTABLISHED) {
   5532 			/* Send M_FLUSH according to TPI */
   5533 			(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
   5534 		}
   5535 		mp = mi_tpi_ok_ack_alloc(mp);
   5536 		if (mp != NULL)
   5537 			putnext(connp->conn_rq, mp);
   5538 	}
   5539 }
   5540 
   5541 /*
   5542  * Diagnostic routine used to return a string associated with the tcp state.
   5543  * Note that if the caller does not supply a buffer, it will use an internal
   5544  * static string.  This means that if multiple threads call this function at
   5545  * the same time, output can be corrupted...  Note also that this function
   5546  * does not check the size of the supplied buffer.  The caller has to make
   5547  * sure that it is big enough.
   5548  */
   5549 static char *
   5550 tcp_display(tcp_t *tcp, char *sup_buf, char format)
   5551 {
   5552 	char		buf1[30];
   5553 	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
   5554 	char		*buf;
   5555 	char		*cp;
   5556 	in6_addr_t	local, remote;
   5557 	char		local_addrbuf[INET6_ADDRSTRLEN];
   5558 	char		remote_addrbuf[INET6_ADDRSTRLEN];
   5559 	conn_t		*connp;
   5560 
   5561 	if (sup_buf != NULL)
   5562 		buf = sup_buf;
   5563 	else
   5564 		buf = priv_buf;
   5565 
   5566 	if (tcp == NULL)
   5567 		return ("NULL_TCP");
   5568 
   5569 	connp = tcp->tcp_connp;
   5570 	switch (tcp->tcp_state) {
   5571 	case TCPS_CLOSED:
   5572 		cp = "TCP_CLOSED";
   5573 		break;
   5574 	case TCPS_IDLE:
   5575 		cp = "TCP_IDLE";
   5576 		break;
   5577 	case TCPS_BOUND:
   5578 		cp = "TCP_BOUND";
   5579 		break;
   5580 	case TCPS_LISTEN:
   5581 		cp = "TCP_LISTEN";
   5582 		break;
   5583 	case TCPS_SYN_SENT:
   5584 		cp = "TCP_SYN_SENT";
   5585 		break;
   5586 	case TCPS_SYN_RCVD:
   5587 		cp = "TCP_SYN_RCVD";
   5588 		break;
   5589 	case TCPS_ESTABLISHED:
   5590 		cp = "TCP_ESTABLISHED";
   5591 		break;
   5592 	case TCPS_CLOSE_WAIT:
   5593 		cp = "TCP_CLOSE_WAIT";
   5594 		break;
   5595 	case TCPS_FIN_WAIT_1:
   5596 		cp = "TCP_FIN_WAIT_1";
   5597 		break;
   5598 	case TCPS_CLOSING:
   5599 		cp = "TCP_CLOSING";
   5600 		break;
   5601 	case TCPS_LAST_ACK:
   5602 		cp = "TCP_LAST_ACK";
   5603 		break;
   5604 	case TCPS_FIN_WAIT_2:
   5605 		cp = "TCP_FIN_WAIT_2";
   5606 		break;
   5607 	case TCPS_TIME_WAIT:
   5608 		cp = "TCP_TIME_WAIT";
   5609 		break;
   5610 	default:
   5611 		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
   5612 		cp = buf1;
   5613 		break;
   5614 	}
   5615 	switch (format) {
   5616 	case DISP_ADDR_AND_PORT:
   5617 		if (connp->conn_ipversion == IPV4_VERSION) {
   5618 			/*
   5619 			 * Note that we use the remote address in the tcp_b
   5620 			 * structure.  This means that it will print out
   5621 			 * the real destination address, not the next hop's
   5622 			 * address if source routing is used.
   5623 			 */
   5624 			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
   5625 			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
   5626 
   5627 		} else {
   5628 			local = connp->conn_laddr_v6;
   5629 			remote = connp->conn_faddr_v6;
   5630 		}
   5631 		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
   5632 		    sizeof (local_addrbuf));
   5633 		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
   5634 		    sizeof (remote_addrbuf));
   5635 		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
   5636 		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
   5637 		    ntohs(connp->conn_fport), cp);
   5638 		break;
   5639 	case DISP_PORT_ONLY:
   5640 	default:
   5641 		(void) mi_sprintf(buf, "[%u, %u] %s",
   5642 		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
   5643 		break;
   5644 	}
   5645 
   5646 	return (buf);
   5647 }
   5648 
   5649 /*
   5650  * Called via squeue to get on to eager's perimeter. It sends a
   5651  * TH_RST if eager is in the fanout table. The listener wants the
   5652  * eager to disappear either by means of tcp_eager_blowoff() or
   5653  * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
   5654  * called (via squeue) if the eager cannot be inserted in the
   5655  * fanout table in tcp_input_listener().
   5656  */
   5657 /* ARGSUSED */
   5658 void
   5659 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   5660 {
   5661 	conn_t	*econnp = (conn_t *)arg;
   5662 	tcp_t	*eager = econnp->conn_tcp;
   5663 	tcp_t	*listener = eager->tcp_listener;
   5664 
   5665 	/*
   5666 	 * We could be called because listener is closing. Since
   5667 	 * the eager was using listener's queue's, we avoid
   5668 	 * using the listeners queues from now on.
   5669 	 */
   5670 	ASSERT(eager->tcp_detached);
   5671 	econnp->conn_rq = NULL;
   5672 	econnp->conn_wq = NULL;
   5673 
   5674 	/*
   5675 	 * An eager's conn_fanout will be NULL if it's a duplicate
   5676 	 * for an existing 4-tuples in the conn fanout table.
   5677 	 * We don't want to send an RST out in such case.
   5678 	 */
   5679 	if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
   5680 		tcp_xmit_ctl("tcp_eager_kill, can't wait",
   5681 		    eager, eager->tcp_snxt, 0, TH_RST);
   5682 	}
   5683 
   5684 	/* We are here because listener wants this eager gone */
   5685 	if (listener != NULL) {
   5686 		mutex_enter(&listener->tcp_eager_lock);
   5687 		tcp_eager_unlink(eager);
   5688 		if (eager->tcp_tconnind_started) {
   5689 			/*
   5690 			 * The eager has sent a conn_ind up to the
   5691 			 * listener but listener decides to close
   5692 			 * instead. We need to drop the extra ref
   5693 			 * placed on eager in tcp_input_data() before
   5694 			 * sending the conn_ind to listener.
   5695 			 */
   5696 			CONN_DEC_REF(econnp);
   5697 		}
   5698 		mutex_exit(&listener->tcp_eager_lock);
   5699 		CONN_DEC_REF(listener->tcp_connp);
   5700 	}
   5701 
   5702 	if (eager->tcp_state != TCPS_CLOSED)
   5703 		tcp_close_detached(eager);
   5704 }
   5705 
   5706 /*
   5707  * Reset any eager connection hanging off this listener marked
   5708  * with 'seqnum' and then reclaim it's resources.
   5709  */
   5710 static boolean_t
   5711 tcp_eager_blowoff(tcp_t	*listener, t_scalar_t seqnum)
   5712 {
   5713 	tcp_t	*eager;
   5714 	mblk_t 	*mp;
   5715 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5716 
   5717 	TCP_STAT(tcps, tcp_eager_blowoff_calls);
   5718 	eager = listener;
   5719 	mutex_enter(&listener->tcp_eager_lock);
   5720 	do {
   5721 		eager = eager->tcp_eager_next_q;
   5722 		if (eager == NULL) {
   5723 			mutex_exit(&listener->tcp_eager_lock);
   5724 			return (B_FALSE);
   5725 		}
   5726 	} while (eager->tcp_conn_req_seqnum != seqnum);
   5727 
   5728 	if (eager->tcp_closemp_used) {
   5729 		mutex_exit(&listener->tcp_eager_lock);
   5730 		return (B_TRUE);
   5731 	}
   5732 	eager->tcp_closemp_used = B_TRUE;
   5733 	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5734 	CONN_INC_REF(eager->tcp_connp);
   5735 	mutex_exit(&listener->tcp_eager_lock);
   5736 	mp = &eager->tcp_closemp;
   5737 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
   5738 	    eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
   5739 	return (B_TRUE);
   5740 }
   5741 
   5742 /*
   5743  * Reset any eager connection hanging off this listener
   5744  * and then reclaim it's resources.
   5745  */
   5746 static void
   5747 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
   5748 {
   5749 	tcp_t	*eager;
   5750 	mblk_t	*mp;
   5751 	tcp_stack_t	*tcps = listener->tcp_tcps;
   5752 
   5753 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   5754 
   5755 	if (!q0_only) {
   5756 		/* First cleanup q */
   5757 		TCP_STAT(tcps, tcp_eager_blowoff_q);
   5758 		eager = listener->tcp_eager_next_q;
   5759 		while (eager != NULL) {
   5760 			if (!eager->tcp_closemp_used) {
   5761 				eager->tcp_closemp_used = B_TRUE;
   5762 				TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5763 				CONN_INC_REF(eager->tcp_connp);
   5764 				mp = &eager->tcp_closemp;
   5765 				SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   5766 				    tcp_eager_kill, eager->tcp_connp, NULL,
   5767 				    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
   5768 			}
   5769 			eager = eager->tcp_eager_next_q;
   5770 		}
   5771 	}
   5772 	/* Then cleanup q0 */
   5773 	TCP_STAT(tcps, tcp_eager_blowoff_q0);
   5774 	eager = listener->tcp_eager_next_q0;
   5775 	while (eager != listener) {
   5776 		if (!eager->tcp_closemp_used) {
   5777 			eager->tcp_closemp_used = B_TRUE;
   5778 			TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
   5779 			CONN_INC_REF(eager->tcp_connp);
   5780 			mp = &eager->tcp_closemp;
   5781 			SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
   5782 			    tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
   5783 			    SQTAG_TCP_EAGER_CLEANUP_Q0);
   5784 		}
   5785 		eager = eager->tcp_eager_next_q0;
   5786 	}
   5787 }
   5788 
   5789 /*
   5790  * If we are an eager connection hanging off a listener that hasn't
   5791  * formally accepted the connection yet, get off his list and blow off
   5792  * any data that we have accumulated.
   5793  */
   5794 static void
   5795 tcp_eager_unlink(tcp_t *tcp)
   5796 {
   5797 	tcp_t	*listener = tcp->tcp_listener;
   5798 
   5799 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
   5800 	ASSERT(listener != NULL);
   5801 	if (tcp->tcp_eager_next_q0 != NULL) {
   5802 		ASSERT(tcp->tcp_eager_prev_q0 != NULL);
   5803 
   5804 		/* Remove the eager tcp from q0 */
   5805 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   5806 		    tcp->tcp_eager_prev_q0;
   5807 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   5808 		    tcp->tcp_eager_next_q0;
   5809 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   5810 		listener->tcp_conn_req_cnt_q0--;
   5811 
   5812 		tcp->tcp_eager_next_q0 = NULL;
   5813 		tcp->tcp_eager_prev_q0 = NULL;
   5814 
   5815 		/*
   5816 		 * Take the eager out, if it is in the list of droppable
   5817 		 * eagers.
   5818 		 */
   5819 		MAKE_UNDROPPABLE(tcp);
   5820 
   5821 		if (tcp->tcp_syn_rcvd_timeout != 0) {
   5822 			/* we have timed out before */
   5823 			ASSERT(listener->tcp_syn_rcvd_timeout > 0);
   5824 			listener->tcp_syn_rcvd_timeout--;
   5825 		}
   5826 	} else {
   5827 		tcp_t   **tcpp = &listener->tcp_eager_next_q;
   5828 		tcp_t	*prev = NULL;
   5829 
   5830 		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
   5831 			if (tcpp[0] == tcp) {
   5832 				if (listener->tcp_eager_last_q == tcp) {
   5833 					/*
   5834 					 * If we are unlinking the last
   5835 					 * element on the list, adjust
   5836 					 * tail pointer. Set tail pointer
   5837 					 * to nil when list is empty.
   5838 					 */
   5839 					ASSERT(tcp->tcp_eager_next_q == NULL);
   5840 					if (listener->tcp_eager_last_q ==
   5841 					    listener->tcp_eager_next_q) {
   5842 						listener->tcp_eager_last_q =
   5843 						    NULL;
   5844 					} else {
   5845 						/*
   5846 						 * We won't get here if there
   5847 						 * is only one eager in the
   5848 						 * list.
   5849 						 */
   5850 						ASSERT(prev != NULL);
   5851 						listener->tcp_eager_last_q =
   5852 						    prev;
   5853 					}
   5854 				}
   5855 				tcpp[0] = tcp->tcp_eager_next_q;
   5856 				tcp->tcp_eager_next_q = NULL;
   5857 				tcp->tcp_eager_last_q = NULL;
   5858 				ASSERT(listener->tcp_conn_req_cnt_q > 0);
   5859 				listener->tcp_conn_req_cnt_q--;
   5860 				break;
   5861 			}
   5862 			prev = tcpp[0];
   5863 		}
   5864 	}
   5865 	tcp->tcp_listener = NULL;
   5866 }
   5867 
   5868 /* Shorthand to generate and send TPI error acks to our client */
   5869 static void
   5870 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
   5871 {
   5872 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
   5873 		putnext(tcp->tcp_connp->conn_rq, mp);
   5874 }
   5875 
   5876 /* Shorthand to generate and send TPI error acks to our client */
   5877 static void
   5878 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
   5879     int t_error, int sys_error)
   5880 {
   5881 	struct T_error_ack	*teackp;
   5882 
   5883 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
   5884 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
   5885 		teackp = (struct T_error_ack *)mp->b_rptr;
   5886 		teackp->ERROR_prim = primitive;
   5887 		teackp->TLI_error = t_error;
   5888 		teackp->UNIX_error = sys_error;
   5889 		putnext(tcp->tcp_connp->conn_rq, mp);
   5890 	}
   5891 }
   5892 
   5893 /*
   5894  * Note: No locks are held when inspecting tcp_g_*epriv_ports
   5895  * but instead the code relies on:
   5896  * - the fact that the address of the array and its size never changes
   5897  * - the atomic assignment of the elements of the array
   5898  */
   5899 /* ARGSUSED */
   5900 static int
   5901 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
   5902 {
   5903 	int i;
   5904 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5905 
   5906 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5907 		if (tcps->tcps_g_epriv_ports[i] != 0)
   5908 			(void) mi_mpprintf(mp, "%d ",
   5909 			    tcps->tcps_g_epriv_ports[i]);
   5910 	}
   5911 	return (0);
   5912 }
   5913 
   5914 /*
   5915  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   5916  * threads from changing it at the same time.
   5917  */
   5918 /* ARGSUSED */
   5919 static int
   5920 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   5921     cred_t *cr)
   5922 {
   5923 	long	new_value;
   5924 	int	i;
   5925 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5926 
   5927 	/*
   5928 	 * Fail the request if the new value does not lie within the
   5929 	 * port number limits.
   5930 	 */
   5931 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
   5932 	    new_value <= 0 || new_value >= 65536) {
   5933 		return (EINVAL);
   5934 	}
   5935 
   5936 	mutex_enter(&tcps->tcps_epriv_port_lock);
   5937 	/* Check if the value is already in the list */
   5938 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5939 		if (new_value == tcps->tcps_g_epriv_ports[i]) {
   5940 			mutex_exit(&tcps->tcps_epriv_port_lock);
   5941 			return (EEXIST);
   5942 		}
   5943 	}
   5944 	/* Find an empty slot */
   5945 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5946 		if (tcps->tcps_g_epriv_ports[i] == 0)
   5947 			break;
   5948 	}
   5949 	if (i == tcps->tcps_g_num_epriv_ports) {
   5950 		mutex_exit(&tcps->tcps_epriv_port_lock);
   5951 		return (EOVERFLOW);
   5952 	}
   5953 	/* Set the new value */
   5954 	tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
   5955 	mutex_exit(&tcps->tcps_epriv_port_lock);
   5956 	return (0);
   5957 }
   5958 
   5959 /*
   5960  * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
   5961  * threads from changing it at the same time.
   5962  */
   5963 /* ARGSUSED */
   5964 static int
   5965 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   5966     cred_t *cr)
   5967 {
   5968 	long	new_value;
   5969 	int	i;
   5970 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
   5971 
   5972 	/*
   5973 	 * Fail the request if the new value does not lie within the
   5974 	 * port number limits.
   5975 	 */
   5976 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
   5977 	    new_value >= 65536) {
   5978 		return (EINVAL);
   5979 	}
   5980 
   5981 	mutex_enter(&tcps->tcps_epriv_port_lock);
   5982 	/* Check that the value is already in the list */
   5983 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
   5984 		if (tcps->tcps_g_epriv_ports[i] == new_value)
   5985 			break;
   5986 	}
   5987 	if (i == tcps->tcps_g_num_epriv_ports) {
   5988 		mutex_exit(&tcps->tcps_epriv_port_lock);
   5989 		return (ESRCH);
   5990 	}
   5991 	/* Clear the value */
   5992 	tcps->tcps_g_epriv_ports[i] = 0;
   5993 	mutex_exit(&tcps->tcps_epriv_port_lock);
   5994 	return (0);
   5995 }
   5996 
   5997 /* Return the TPI/TLI equivalent of our current tcp_state */
   5998 static int
   5999 tcp_tpistate(tcp_t *tcp)
   6000 {
   6001 	switch (tcp->tcp_state) {
   6002 	case TCPS_IDLE:
   6003 		return (TS_UNBND);
   6004 	case TCPS_LISTEN:
   6005 		/*
   6006 		 * Return whether there are outstanding T_CONN_IND waiting
   6007 		 * for the matching T_CONN_RES. Therefore don't count q0.
   6008 		 */
   6009 		if (tcp->tcp_conn_req_cnt_q > 0)
   6010 			return (TS_WRES_CIND);
   6011 		else
   6012 			return (TS_IDLE);
   6013 	case TCPS_BOUND:
   6014 		return (TS_IDLE);
   6015 	case TCPS_SYN_SENT:
   6016 		return (TS_WCON_CREQ);
   6017 	case TCPS_SYN_RCVD:
   6018 		/*
   6019 		 * Note: assumption: this has to the active open SYN_RCVD.
   6020 		 * The passive instance is detached in SYN_RCVD stage of
   6021 		 * incoming connection processing so we cannot get request
   6022 		 * for T_info_ack on it.
   6023 		 */
   6024 		return (TS_WACK_CRES);
   6025 	case TCPS_ESTABLISHED:
   6026 		return (TS_DATA_XFER);
   6027 	case TCPS_CLOSE_WAIT:
   6028 		return (TS_WREQ_ORDREL);
   6029 	case TCPS_FIN_WAIT_1:
   6030 		return (TS_WIND_ORDREL);
   6031 	case TCPS_FIN_WAIT_2:
   6032 		return (TS_WIND_ORDREL);
   6033 
   6034 	case TCPS_CLOSING:
   6035 	case TCPS_LAST_ACK:
   6036 	case TCPS_TIME_WAIT:
   6037 	case TCPS_CLOSED:
   6038 		/*
   6039 		 * Following TS_WACK_DREQ7 is a rendition of "not
   6040 		 * yet TS_IDLE" TPI state. There is no best match to any
   6041 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
   6042 		 * choose a value chosen that will map to TLI/XTI level
   6043 		 * state of TSTATECHNG (state is process of changing) which
   6044 		 * captures what this dummy state represents.
   6045 		 */
   6046 		return (TS_WACK_DREQ7);
   6047 	default:
   6048 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
   6049 		    tcp->tcp_state, tcp_display(tcp, NULL,
   6050 		    DISP_PORT_ONLY));
   6051 		return (TS_UNBND);
   6052 	}
   6053 }
   6054 
   6055 static void
   6056 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
   6057 {
   6058 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6059 	conn_t		*connp = tcp->tcp_connp;
   6060 
   6061 	if (connp->conn_family == AF_INET6)
   6062 		*tia = tcp_g_t_info_ack_v6;
   6063 	else
   6064 		*tia = tcp_g_t_info_ack;
   6065 	tia->CURRENT_state = tcp_tpistate(tcp);
   6066 	tia->OPT_size = tcp_max_optsize;
   6067 	if (tcp->tcp_mss == 0) {
   6068 		/* Not yet set - tcp_open does not set mss */
   6069 		if (connp->conn_ipversion == IPV4_VERSION)
   6070 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
   6071 		else
   6072 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
   6073 	} else {
   6074 		tia->TIDU_size = tcp->tcp_mss;
   6075 	}
   6076 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
   6077 }
   6078 
   6079 static void
   6080 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
   6081     t_uscalar_t cap_bits1)
   6082 {
   6083 	tcap->CAP_bits1 = 0;
   6084 
   6085 	if (cap_bits1 & TC1_INFO) {
   6086 		tcp_copy_info(&tcap->INFO_ack, tcp);
   6087 		tcap->CAP_bits1 |= TC1_INFO;
   6088 	}
   6089 
   6090 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
   6091 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
   6092 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
   6093 	}
   6094 
   6095 }
   6096 
   6097 /*
   6098  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
   6099  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
   6100  * tcp_g_t_info_ack.  The current state of the stream is copied from
   6101  * tcp_state.
   6102  */
   6103 static void
   6104 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
   6105 {
   6106 	t_uscalar_t		cap_bits1;
   6107 	struct T_capability_ack	*tcap;
   6108 
   6109 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
   6110 		freemsg(mp);
   6111 		return;
   6112 	}
   6113 
   6114 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
   6115 
   6116 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
   6117 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
   6118 	if (mp == NULL)
   6119 		return;
   6120 
   6121 	tcap = (struct T_capability_ack *)mp->b_rptr;
   6122 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
   6123 
   6124 	putnext(tcp->tcp_connp->conn_rq, mp);
   6125 }
   6126 
   6127 /*
   6128  * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
   6129  * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
   6130  * The current state of the stream is copied from tcp_state.
   6131  */
   6132 static void
   6133 tcp_info_req(tcp_t *tcp, mblk_t *mp)
   6134 {
   6135 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
   6136 	    T_INFO_ACK);
   6137 	if (!mp) {
   6138 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   6139 		return;
   6140 	}
   6141 	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
   6142 	putnext(tcp->tcp_connp->conn_rq, mp);
   6143 }
   6144 
   6145 /* Respond to the TPI addr request */
   6146 static void
   6147 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
   6148 {
   6149 	struct sockaddr *sa;
   6150 	mblk_t	*ackmp;
   6151 	struct T_addr_ack *taa;
   6152 	conn_t	*connp = tcp->tcp_connp;
   6153 	uint_t	addrlen;
   6154 
   6155 	/* Make it large enough for worst case */
   6156 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
   6157 	    2 * sizeof (sin6_t), 1);
   6158 	if (ackmp == NULL) {
   6159 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   6160 		return;
   6161 	}
   6162 
   6163 	taa = (struct T_addr_ack *)ackmp->b_rptr;
   6164 
   6165 	bzero(taa, sizeof (struct T_addr_ack));
   6166 	ackmp->b_wptr = (uchar_t *)&taa[1];
   6167 
   6168 	taa->PRIM_type = T_ADDR_ACK;
   6169 	ackmp->b_datap->db_type = M_PCPROTO;
   6170 
   6171 	if (connp->conn_family == AF_INET)
   6172 		addrlen = sizeof (sin_t);
   6173 	else
   6174 		addrlen = sizeof (sin6_t);
   6175 
   6176 	/*
   6177 	 * Note: Following code assumes 32 bit alignment of basic
   6178 	 * data structures like sin_t and struct T_addr_ack.
   6179 	 */
   6180 	if (tcp->tcp_state >= TCPS_BOUND) {
   6181 		/*
   6182 		 * Fill in local address first
   6183 		 */
   6184 		taa->LOCADDR_offset = sizeof (*taa);
   6185 		taa->LOCADDR_length = addrlen;
   6186 		sa = (struct sockaddr *)&taa[1];
   6187 		(void) conn_getsockname(connp, sa, &addrlen);
   6188 		ackmp->b_wptr += addrlen;
   6189 	}
   6190 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
   6191 		/*
   6192 		 * Fill in Remote address
   6193 		 */
   6194 		taa->REMADDR_length = addrlen;
   6195 		/* assumed 32-bit alignment */
   6196 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
   6197 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
   6198 		(void) conn_getpeername(connp, sa, &addrlen);
   6199 		ackmp->b_wptr += addrlen;
   6200 	}
   6201 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
   6202 	putnext(tcp->tcp_connp->conn_rq, ackmp);
   6203 }
   6204 
   6205 /*
   6206  * Handle reinitialization of a tcp structure.
   6207  * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
   6208  */
   6209 static void
   6210 tcp_reinit(tcp_t *tcp)
   6211 {
   6212 	mblk_t		*mp;
   6213 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6214 	conn_t		*connp  = tcp->tcp_connp;
   6215 
   6216 	TCP_STAT(tcps, tcp_reinit_calls);
   6217 
   6218 	/* tcp_reinit should never be called for detached tcp_t's */
   6219 	ASSERT(tcp->tcp_listener == NULL);
   6220 	ASSERT((connp->conn_family == AF_INET &&
   6221 	    connp->conn_ipversion == IPV4_VERSION) ||
   6222 	    (connp->conn_family == AF_INET6 &&
   6223 	    (connp->conn_ipversion == IPV4_VERSION ||
   6224 	    connp->conn_ipversion == IPV6_VERSION)));
   6225 
   6226 	/* Cancel outstanding timers */
   6227 	tcp_timers_stop(tcp);
   6228 
   6229 	/*
   6230 	 * Reset everything in the state vector, after updating global
   6231 	 * MIB data from instance counters.
   6232 	 */
   6233 	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
   6234 	tcp->tcp_ibsegs = 0;
   6235 	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
   6236 	tcp->tcp_obsegs = 0;
   6237 
   6238 	tcp_close_mpp(&tcp->tcp_xmit_head);
   6239 	if (tcp->tcp_snd_zcopy_aware)
   6240 		tcp_zcopy_notify(tcp);
   6241 	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
   6242 	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
   6243 	mutex_enter(&tcp->tcp_non_sq_lock);
   6244 	if (tcp->tcp_flow_stopped &&
   6245 	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
   6246 		tcp_clrqfull(tcp);
   6247 	}
   6248 	mutex_exit(&tcp->tcp_non_sq_lock);
   6249 	tcp_close_mpp(&tcp->tcp_reass_head);
   6250 	tcp->tcp_reass_tail = NULL;
   6251 	if (tcp->tcp_rcv_list != NULL) {
   6252 		/* Free b_next chain */
   6253 		tcp_close_mpp(&tcp->tcp_rcv_list);
   6254 		tcp->tcp_rcv_last_head = NULL;
   6255 		tcp->tcp_rcv_last_tail = NULL;
   6256 		tcp->tcp_rcv_cnt = 0;
   6257 	}
   6258 	tcp->tcp_rcv_last_tail = NULL;
   6259 
   6260 	if ((mp = tcp->tcp_urp_mp) != NULL) {
   6261 		freemsg(mp);
   6262 		tcp->tcp_urp_mp = NULL;
   6263 	}
   6264 	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
   6265 		freemsg(mp);
   6266 		tcp->tcp_urp_mark_mp = NULL;
   6267 	}
   6268 	if (tcp->tcp_fused_sigurg_mp != NULL) {
   6269 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   6270 		freeb(tcp->tcp_fused_sigurg_mp);
   6271 		tcp->tcp_fused_sigurg_mp = NULL;
   6272 	}
   6273 	if (tcp->tcp_ordrel_mp != NULL) {
   6274 		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
   6275 		freeb(tcp->tcp_ordrel_mp);
   6276 		tcp->tcp_ordrel_mp = NULL;
   6277 	}
   6278 
   6279 	/*
   6280 	 * Following is a union with two members which are
   6281 	 * identical types and size so the following cleanup
   6282 	 * is enough.
   6283 	 */
   6284 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
   6285 
   6286 	CL_INET_DISCONNECT(connp);
   6287 
   6288 	/*
   6289 	 * The connection can't be on the tcp_time_wait_head list
   6290 	 * since it is not detached.
   6291 	 */
   6292 	ASSERT(tcp->tcp_time_wait_next == NULL);
   6293 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   6294 	ASSERT(tcp->tcp_time_wait_expire == 0);
   6295 
   6296 	if (tcp->tcp_kssl_pending) {
   6297 		tcp->tcp_kssl_pending = B_FALSE;
   6298 
   6299 		/* Don't reset if the initialized by bind. */
   6300 		if (tcp->tcp_kssl_ent != NULL) {
   6301 			kssl_release_ent(tcp->tcp_kssl_ent, NULL,
   6302 			    KSSL_NO_PROXY);
   6303 		}
   6304 	}
   6305 	if (tcp->tcp_kssl_ctx != NULL) {
   6306 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   6307 		tcp->tcp_kssl_ctx = NULL;
   6308 	}
   6309 
   6310 	/*
   6311 	 * Reset/preserve other values
   6312 	 */
   6313 	tcp_reinit_values(tcp);
   6314 	ipcl_hash_remove(connp);
   6315 	ixa_cleanup(connp->conn_ixa);
   6316 	tcp_ipsec_cleanup(tcp);
   6317 
   6318 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
   6319 	connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
   6320 
   6321 	if (tcp->tcp_conn_req_max != 0) {
   6322 		/*
   6323 		 * This is the case when a TLI program uses the same
   6324 		 * transport end point to accept a connection.  This
   6325 		 * makes the TCP both a listener and acceptor.  When
   6326 		 * this connection is closed, we need to set the state
   6327 		 * back to TCPS_LISTEN.  Make sure that the eager list
   6328 		 * is reinitialized.
   6329 		 *
   6330 		 * Note that this stream is still bound to the four
   6331 		 * tuples of the previous connection in IP.  If a new
   6332 		 * SYN with different foreign address comes in, IP will
   6333 		 * not find it and will send it to the global queue.  In
   6334 		 * the global queue, TCP will do a tcp_lookup_listener()
   6335 		 * to find this stream.  This works because this stream
   6336 		 * is only removed from connected hash.
   6337 		 *
   6338 		 */
   6339 		tcp->tcp_state = TCPS_LISTEN;
   6340 		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
   6341 		tcp->tcp_eager_next_drop_q0 = tcp;
   6342 		tcp->tcp_eager_prev_drop_q0 = tcp;
   6343 		/*
   6344 		 * Initially set conn_recv to tcp_input_listener_unbound to try
   6345 		 * to pick a good squeue for the listener when the first SYN
   6346 		 * arrives. tcp_input_listener_unbound sets it to
   6347 		 * tcp_input_listener on that first SYN.
   6348 		 */
   6349 		connp->conn_recv = tcp_input_listener_unbound;
   6350 
   6351 		connp->conn_proto = IPPROTO_TCP;
   6352 		connp->conn_faddr_v6 = ipv6_all_zeros;
   6353 		connp->conn_fport = 0;
   6354 
   6355 		(void) ipcl_bind_insert(connp);
   6356 	} else {
   6357 		tcp->tcp_state = TCPS_BOUND;
   6358 	}
   6359 
   6360 	/*
   6361 	 * Initialize to default values
   6362 	 */
   6363 	tcp_init_values(tcp);
   6364 
   6365 	ASSERT(tcp->tcp_ptpbhn != NULL);
   6366 	tcp->tcp_rwnd = connp->conn_rcvbuf;
   6367 	tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ?
   6368 	    tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
   6369 }
   6370 
/*
 * Force values to zero that need to be zero.
 * Do not touch values associated with the BOUND or LISTEN state
 * since the connection will end up in that state after the reinit.
 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
 * structure!
 */
   6378 static void
   6379 tcp_reinit_values(tcp)
   6380 	tcp_t *tcp;
   6381 {
   6382 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6383 	conn_t		*connp = tcp->tcp_connp;
   6384 
   6385 #ifndef	lint
   6386 #define	DONTCARE(x)
   6387 #define	PRESERVE(x)
   6388 #else
   6389 #define	DONTCARE(x)	((x) = (x))
   6390 #define	PRESERVE(x)	((x) = (x))
   6391 #endif	/* lint */
   6392 
   6393 	PRESERVE(tcp->tcp_bind_hash_port);
   6394 	PRESERVE(tcp->tcp_bind_hash);
   6395 	PRESERVE(tcp->tcp_ptpbhn);
   6396 	PRESERVE(tcp->tcp_acceptor_hash);
   6397 	PRESERVE(tcp->tcp_ptpahn);
   6398 
   6399 	/* Should be ASSERT NULL on these with new code! */
   6400 	ASSERT(tcp->tcp_time_wait_next == NULL);
   6401 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   6402 	ASSERT(tcp->tcp_time_wait_expire == 0);
   6403 	PRESERVE(tcp->tcp_state);
   6404 	PRESERVE(connp->conn_rq);
   6405 	PRESERVE(connp->conn_wq);
   6406 
   6407 	ASSERT(tcp->tcp_xmit_head == NULL);
   6408 	ASSERT(tcp->tcp_xmit_last == NULL);
   6409 	ASSERT(tcp->tcp_unsent == 0);
   6410 	ASSERT(tcp->tcp_xmit_tail == NULL);
   6411 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
   6412 
   6413 	tcp->tcp_snxt = 0;			/* Displayed in mib */
   6414 	tcp->tcp_suna = 0;			/* Displayed in mib */
   6415 	tcp->tcp_swnd = 0;
   6416 	DONTCARE(tcp->tcp_cwnd);	/* Init in tcp_process_options */
   6417 
   6418 	ASSERT(tcp->tcp_ibsegs == 0);
   6419 	ASSERT(tcp->tcp_obsegs == 0);
   6420 
   6421 	if (connp->conn_ht_iphc != NULL) {
   6422 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   6423 		connp->conn_ht_iphc = NULL;
   6424 		connp->conn_ht_iphc_allocated = 0;
   6425 		connp->conn_ht_iphc_len = 0;
   6426 		connp->conn_ht_ulp = NULL;
   6427 		connp->conn_ht_ulp_len = 0;
   6428 		tcp->tcp_ipha = NULL;
   6429 		tcp->tcp_ip6h = NULL;
   6430 		tcp->tcp_tcpha = NULL;
   6431 	}
   6432 
   6433 	/* We clear any IP_OPTIONS and extension headers */
   6434 	ip_pkt_free(&connp->conn_xmit_ipp);
   6435 
   6436 	DONTCARE(tcp->tcp_naglim);		/* Init in tcp_init_values */
   6437 	DONTCARE(tcp->tcp_ipha);
   6438 	DONTCARE(tcp->tcp_ip6h);
   6439 	DONTCARE(tcp->tcp_tcpha);
   6440 	tcp->tcp_valid_bits = 0;
   6441 
   6442 	DONTCARE(tcp->tcp_timer_backoff);	/* Init in tcp_init_values */
   6443 	DONTCARE(tcp->tcp_last_recv_time);	/* Init in tcp_init_values */
   6444 	tcp->tcp_last_rcv_lbolt = 0;
   6445 
   6446 	tcp->tcp_init_cwnd = 0;
   6447 
   6448 	tcp->tcp_urp_last_valid = 0;
   6449 	tcp->tcp_hard_binding = 0;
   6450 
   6451 	tcp->tcp_fin_acked = 0;
   6452 	tcp->tcp_fin_rcvd = 0;
   6453 	tcp->tcp_fin_sent = 0;
   6454 	tcp->tcp_ordrel_done = 0;
   6455 
   6456 	tcp->tcp_detached = 0;
   6457 
   6458 	tcp->tcp_snd_ws_ok = B_FALSE;
   6459 	tcp->tcp_snd_ts_ok = B_FALSE;
   6460 	tcp->tcp_zero_win_probe = 0;
   6461 
   6462 	tcp->tcp_loopback = 0;
   6463 	tcp->tcp_localnet = 0;
   6464 	tcp->tcp_syn_defense = 0;
   6465 	tcp->tcp_set_timer = 0;
   6466 
   6467 	tcp->tcp_active_open = 0;
   6468 	tcp->tcp_rexmit = B_FALSE;
   6469 	tcp->tcp_xmit_zc_clean = B_FALSE;
   6470 
   6471 	tcp->tcp_snd_sack_ok = B_FALSE;
   6472 	tcp->tcp_hwcksum = B_FALSE;
   6473 
   6474 	DONTCARE(tcp->tcp_maxpsz_multiplier);	/* Init in tcp_init_values */
   6475 
   6476 	tcp->tcp_conn_def_q0 = 0;
   6477 	tcp->tcp_ip_forward_progress = B_FALSE;
   6478 	tcp->tcp_ecn_ok = B_FALSE;
   6479 
   6480 	tcp->tcp_cwr = B_FALSE;
   6481 	tcp->tcp_ecn_echo_on = B_FALSE;
   6482 	tcp->tcp_is_wnd_shrnk = B_FALSE;
   6483 
   6484 	if (tcp->tcp_sack_info != NULL) {
   6485 		if (tcp->tcp_notsack_list != NULL) {
   6486 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
   6487 			    tcp);
   6488 		}
   6489 		kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
   6490 		tcp->tcp_sack_info = NULL;
   6491 	}
   6492 
   6493 	tcp->tcp_rcv_ws = 0;
   6494 	tcp->tcp_snd_ws = 0;
   6495 	tcp->tcp_ts_recent = 0;
   6496 	tcp->tcp_rnxt = 0;			/* Displayed in mib */
   6497 	DONTCARE(tcp->tcp_rwnd);		/* Set in tcp_reinit() */
   6498 	tcp->tcp_initial_pmtu = 0;
   6499 
   6500 	ASSERT(tcp->tcp_reass_head == NULL);
   6501 	ASSERT(tcp->tcp_reass_tail == NULL);
   6502 
   6503 	tcp->tcp_cwnd_cnt = 0;
   6504 
   6505 	ASSERT(tcp->tcp_rcv_list == NULL);
   6506 	ASSERT(tcp->tcp_rcv_last_head == NULL);
   6507 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
   6508 	ASSERT(tcp->tcp_rcv_cnt == 0);
   6509 
   6510 	DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
   6511 	DONTCARE(tcp->tcp_cwnd_max);		/* Init in tcp_init_values */
   6512 	tcp->tcp_csuna = 0;
   6513 
   6514 	tcp->tcp_rto = 0;			/* Displayed in MIB */
   6515 	DONTCARE(tcp->tcp_rtt_sa);		/* Init in tcp_init_values */
   6516 	DONTCARE(tcp->tcp_rtt_sd);		/* Init in tcp_init_values */
   6517 	tcp->tcp_rtt_update = 0;
   6518 
   6519 	DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6520 	DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
   6521 
   6522 	tcp->tcp_rack = 0;			/* Displayed in mib */
   6523 	tcp->tcp_rack_cnt = 0;
   6524 	tcp->tcp_rack_cur_max = 0;
   6525 	tcp->tcp_rack_abs_max = 0;
   6526 
   6527 	tcp->tcp_max_swnd = 0;
   6528 
   6529 	ASSERT(tcp->tcp_listener == NULL);
   6530 
   6531 	DONTCARE(tcp->tcp_irs);			/* tcp_valid_bits cleared */
   6532 	DONTCARE(tcp->tcp_iss);			/* tcp_valid_bits cleared */
   6533 	DONTCARE(tcp->tcp_fss);			/* tcp_valid_bits cleared */
   6534 	DONTCARE(tcp->tcp_urg);			/* tcp_valid_bits cleared */
   6535 
   6536 	ASSERT(tcp->tcp_conn_req_cnt_q == 0);
   6537 	ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
   6538 	PRESERVE(tcp->tcp_conn_req_max);
   6539 	PRESERVE(tcp->tcp_conn_req_seqnum);
   6540 
   6541 	DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
   6542 	DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
   6543 	DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
   6544 	DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
   6545 
   6546 	DONTCARE(tcp->tcp_urp_last);	/* tcp_urp_last_valid is cleared */
   6547 	ASSERT(tcp->tcp_urp_mp == NULL);
   6548 	ASSERT(tcp->tcp_urp_mark_mp == NULL);
   6549 	ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
   6550 
   6551 	ASSERT(tcp->tcp_eager_next_q == NULL);
   6552 	ASSERT(tcp->tcp_eager_last_q == NULL);
   6553 	ASSERT((tcp->tcp_eager_next_q0 == NULL &&
   6554 	    tcp->tcp_eager_prev_q0 == NULL) ||
   6555 	    tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
   6556 	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
   6557 
   6558 	ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
   6559 	    tcp->tcp_eager_prev_drop_q0 == NULL) ||
   6560 	    tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
   6561 
   6562 	tcp->tcp_client_errno = 0;
   6563 
   6564 	DONTCARE(connp->conn_sum);		/* Init in tcp_init_values */
   6565 
   6566 	connp->conn_faddr_v6 = ipv6_all_zeros;	/* Displayed in MIB */
   6567 
   6568 	PRESERVE(connp->conn_bound_addr_v6);
   6569 	tcp->tcp_last_sent_len = 0;
   6570 	tcp->tcp_dupack_cnt = 0;
   6571 
   6572 	connp->conn_fport = 0;			/* Displayed in MIB */
   6573 	PRESERVE(connp->conn_lport);
   6574 
   6575 	PRESERVE(tcp->tcp_acceptor_lockp);
   6576 
   6577 	ASSERT(tcp->tcp_ordrel_mp == NULL);
   6578 	PRESERVE(tcp->tcp_acceptor_id);
   6579 	DONTCARE(tcp->tcp_ipsec_overhead);
   6580 
   6581 	PRESERVE(connp->conn_family);
   6582 	/* Remove any remnants of mapped address binding */
   6583 	if (connp->conn_family == AF_INET6) {
   6584 		connp->conn_ipversion = IPV6_VERSION;
   6585 		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   6586 	} else {
   6587 		connp->conn_ipversion = IPV4_VERSION;
   6588 		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   6589 	}
   6590 
   6591 	connp->conn_bound_if = 0;
   6592 	connp->conn_recv_ancillary.crb_all = 0;
   6593 	tcp->tcp_recvifindex = 0;
   6594 	tcp->tcp_recvhops = 0;
   6595 	tcp->tcp_closed = 0;
   6596 	tcp->tcp_cleandeathtag = 0;
   6597 	if (tcp->tcp_hopopts != NULL) {
   6598 		mi_free(tcp->tcp_hopopts);
   6599 		tcp->tcp_hopopts = NULL;
   6600 		tcp->tcp_hopoptslen = 0;
   6601 	}
   6602 	ASSERT(tcp->tcp_hopoptslen == 0);
   6603 	if (tcp->tcp_dstopts != NULL) {
   6604 		mi_free(tcp->tcp_dstopts);
   6605 		tcp->tcp_dstopts = NULL;
   6606 		tcp->tcp_dstoptslen = 0;
   6607 	}
   6608 	ASSERT(tcp->tcp_dstoptslen == 0);
   6609 	if (tcp->tcp_rthdrdstopts != NULL) {
   6610 		mi_free(tcp->tcp_rthdrdstopts);
   6611 		tcp->tcp_rthdrdstopts = NULL;
   6612 		tcp->tcp_rthdrdstoptslen = 0;
   6613 	}
   6614 	ASSERT(tcp->tcp_rthdrdstoptslen == 0);
   6615 	if (tcp->tcp_rthdr != NULL) {
   6616 		mi_free(tcp->tcp_rthdr);
   6617 		tcp->tcp_rthdr = NULL;
   6618 		tcp->tcp_rthdrlen = 0;
   6619 	}
   6620 	ASSERT(tcp->tcp_rthdrlen == 0);
   6621 
   6622 	/* Reset fusion-related fields */
   6623 	tcp->tcp_fused = B_FALSE;
   6624 	tcp->tcp_unfusable = B_FALSE;
   6625 	tcp->tcp_fused_sigurg = B_FALSE;
   6626 	tcp->tcp_loopback_peer = NULL;
   6627 
   6628 	tcp->tcp_lso = B_FALSE;
   6629 
   6630 	tcp->tcp_in_ack_unsent = 0;
   6631 	tcp->tcp_cork = B_FALSE;
   6632 	tcp->tcp_tconnind_started = B_FALSE;
   6633 
   6634 	PRESERVE(tcp->tcp_squeue_bytes);
   6635 
   6636 	ASSERT(tcp->tcp_kssl_ctx == NULL);
   6637 	ASSERT(!tcp->tcp_kssl_pending);
   6638 	PRESERVE(tcp->tcp_kssl_ent);
   6639 
   6640 	tcp->tcp_closemp_used = B_FALSE;
   6641 
   6642 	PRESERVE(tcp->tcp_rsrv_mp);
   6643 	PRESERVE(tcp->tcp_rsrv_mp_lock);
   6644 
   6645 #ifdef DEBUG
   6646 	DONTCARE(tcp->tcmp_stk[0]);
   6647 #endif
   6648 
   6649 	PRESERVE(tcp->tcp_connid);
   6650 
   6651 
   6652 #undef	DONTCARE
   6653 #undef	PRESERVE
   6654 }
   6655 
   6656 static void
   6657 tcp_init_values(tcp_t *tcp)
   6658 {
   6659 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   6660 	conn_t		*connp = tcp->tcp_connp;
   6661 
   6662 	ASSERT((connp->conn_family == AF_INET &&
   6663 	    connp->conn_ipversion == IPV4_VERSION) ||
   6664 	    (connp->conn_family == AF_INET6 &&
   6665 	    (connp->conn_ipversion == IPV4_VERSION ||
   6666 	    connp->conn_ipversion == IPV6_VERSION)));
   6667 
   6668 	/*
   6669 	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
   6670 	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
   6671 	 * allow the algorithm to adjust slowly to large fluctuations of RTT
   6672 	 * during first few transmissions of a connection as seen in slow
   6673 	 * links.
   6674 	 */
   6675 	tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
   6676 	tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
   6677 	tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   6678 	    tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
   6679 	    tcps->tcps_conn_grace_period;
   6680 	if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
   6681 		tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   6682 	tcp->tcp_timer_backoff = 0;
   6683 	tcp->tcp_ms_we_have_waited = 0;
   6684 	tcp->tcp_last_recv_time = ddi_get_lbolt();
   6685 	tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
   6686 	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   6687 	tcp->tcp_snd_burst = TCP_CWND_INFINITE;
   6688 
   6689 	tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
   6690 
   6691 	tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
   6692 	tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
   6693 	tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
   6694 	/*
   6695 	 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
   6696 	 * passive open.
   6697 	 */
   6698 	tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
   6699 
   6700 	tcp->tcp_naglim = tcps->tcps_naglim_def;
   6701 
   6702 	/* NOTE:  ISS is now set in tcp_set_destination(). */
   6703 
   6704 	/* Reset fusion-related fields */
   6705 	tcp->tcp_fused = B_FALSE;
   6706 	tcp->tcp_unfusable = B_FALSE;
   6707 	tcp->tcp_fused_sigurg = B_FALSE;
   6708 	tcp->tcp_loopback_peer = NULL;
   6709 
   6710 	/* We rebuild the header template on the next connect/conn_request */
   6711 
   6712 	connp->conn_mlp_type = mlptSingle;
   6713 
   6714 	/*
   6715 	 * Init the window scale to the max so tcp_rwnd_set() won't pare
   6716 	 * down tcp_rwnd. tcp_set_destination() will set the right value later.
   6717 	 */
   6718 	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
   6719 	tcp->tcp_rwnd = connp->conn_rcvbuf;
   6720 
   6721 	tcp->tcp_cork = B_FALSE;
   6722 	/*
   6723 	 * Init the tcp_debug option if it wasn't already set.  This value
   6724 	 * determines whether TCP
   6725 	 * calls strlog() to print out debug messages.  Doing this
   6726 	 * initialization here means that this value is not inherited thru
   6727 	 * tcp_reinit().
   6728 	 */
   6729 	if (!connp->conn_debug)
   6730 		connp->conn_debug = tcps->tcps_dbg;
   6731 
   6732 	tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
   6733 	tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
   6734 }
   6735 
   6736 /* At minimum we need 8 bytes in the TCP header for the lookup */
   6737 #define	ICMP_MIN_TCP_HDR	8
   6738 
   6739 /*
   6740  * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
   6741  * passed up by IP. The message is always received on the correct tcp_t.
   6742  * Assumes that IP has pulled up everything up to and including the ICMP header.
   6743  */
   6744 /* ARGSUSED2 */
   6745 static void
   6746 tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
   6747 {
   6748 	conn_t		*connp = (conn_t *)arg1;
   6749 	icmph_t		*icmph;
   6750 	ipha_t		*ipha;
   6751 	int		iph_hdr_length;
   6752 	tcpha_t		*tcpha;
   6753 	uint32_t	seg_seq;
   6754 	tcp_t		*tcp = connp->conn_tcp;
   6755 
   6756 	/* Assume IP provides aligned packets */
   6757 	ASSERT(OK_32PTR(mp->b_rptr));
   6758 	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));
   6759 
   6760 	/*
   6761 	 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
   6762 	 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
   6763 	 */
   6764 	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
   6765 		tcp_icmp_error_ipv6(tcp, mp, ira);
   6766 		return;
   6767 	}
   6768 
   6769 	/* Skip past the outer IP and ICMP headers */
   6770 	iph_hdr_length = ira->ira_ip_hdr_length;
   6771 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
   6772 	/*
   6773 	 * If we don't have the correct outer IP header length
   6774 	 * or if we don't have a complete inner IP header
   6775 	 * drop it.
   6776 	 */
   6777 	if (iph_hdr_length < sizeof (ipha_t) ||
   6778 	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
   6779 noticmpv4:
   6780 		freemsg(mp);
   6781 		return;
   6782 	}
   6783 	ipha = (ipha_t *)&icmph[1];
   6784 
   6785 	/* Skip past the inner IP and find the ULP header */
   6786 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
   6787 	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
   6788 	/*
   6789 	 * If we don't have the correct inner IP header length or if the ULP
   6790 	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
   6791 	 * bytes of TCP header, drop it.
   6792 	 */
   6793 	if (iph_hdr_length < sizeof (ipha_t) ||
   6794 	    ipha->ipha_protocol != IPPROTO_TCP ||
   6795 	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
   6796 		goto noticmpv4;
   6797 	}
   6798 
   6799 	seg_seq = ntohl(tcpha->tha_seq);
   6800 	switch (icmph->icmph_type) {
   6801 	case ICMP_DEST_UNREACHABLE:
   6802 		switch (icmph->icmph_code) {
   6803 		case ICMP_FRAGMENTATION_NEEDED:
   6804 			/*
   6805 			 * Update Path MTU, then try to send something out.
   6806 			 */
   6807 			tcp_update_pmtu(tcp, B_TRUE);
   6808 			tcp_rexmit_after_error(tcp);
   6809 			break;
   6810 		case ICMP_PORT_UNREACHABLE:
   6811 		case ICMP_PROTOCOL_UNREACHABLE:
   6812 			switch (tcp->tcp_state) {
   6813 			case TCPS_SYN_SENT:
   6814 			case TCPS_SYN_RCVD:
   6815 				/*
   6816 				 * ICMP can snipe away incipient
   6817 				 * TCP connections as long as
   6818 				 * seq number is same as initial
   6819 				 * send seq number.
   6820 				 */
   6821 				if (seg_seq == tcp->tcp_iss) {
   6822 					(void) tcp_clean_death(tcp,
   6823 					    ECONNREFUSED, 6);
   6824 				}
   6825 				break;
   6826 			}
   6827 			break;
   6828 		case ICMP_HOST_UNREACHABLE:
   6829 		case ICMP_NET_UNREACHABLE:
   6830 			/* Record the error in case we finally time out. */
   6831 			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
   6832 				tcp->tcp_client_errno = EHOSTUNREACH;
   6833 			else
   6834 				tcp->tcp_client_errno = ENETUNREACH;
   6835 			if (tcp->tcp_state == TCPS_SYN_RCVD) {
   6836 				if (tcp->tcp_listener != NULL &&
   6837 				    tcp->tcp_listener->tcp_syn_defense) {
   6838 					/*
   6839 					 * Ditch the half-open connection if we
   6840 					 * suspect a SYN attack is under way.
   6841 					 */
   6842 					(void) tcp_clean_death(tcp,
   6843 					    tcp->tcp_client_errno, 7);
   6844 				}
   6845 			}
   6846 			break;
   6847 		default:
   6848 			break;
   6849 		}
   6850 		break;
   6851 	case ICMP_SOURCE_QUENCH: {
   6852 		/*
   6853 		 * use a global boolean to control
   6854 		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
   6855 		 * The default is false.
   6856 		 */
   6857 		if (tcp_icmp_source_quench) {
   6858 			/*
   6859 			 * Reduce the sending rate as if we got a
   6860 			 * retransmit timeout
   6861 			 */
   6862 			uint32_t npkt;
   6863 
   6864 			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
   6865 			    tcp->tcp_mss;
   6866 			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
   6867 			tcp->tcp_cwnd = tcp->tcp_mss;
   6868 			tcp->tcp_cwnd_cnt = 0;
   6869 		}
   6870 		break;
   6871 	}
   6872 	}
   6873 	freemsg(mp);
   6874 }
   6875 
   6876 /*
   6877  * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
   6878  * change. But it can refer to fields like tcp_suna and tcp_snxt.
   6879  *
   6880  * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
   6881  * error messages received by IP. The message is always received on the correct
   6882  * tcp_t.
   6883  */
   6884 /* ARGSUSED */
   6885 static boolean_t
   6886 tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
   6887     ip_recv_attr_t *ira)
   6888 {
   6889 	tcpha_t		*tcpha = (tcpha_t *)arg2;
   6890 	uint32_t	seq = ntohl(tcpha->tha_seq);
   6891 	tcp_t		*tcp = connp->conn_tcp;
   6892 
   6893 	/*
   6894 	 * TCP sequence number contained in payload of the ICMP error message
   6895 	 * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
   6896 	 * the message is either a stale ICMP error, or an attack from the
   6897 	 * network. Fail the verification.
   6898 	 */
   6899 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
   6900 		return (B_FALSE);
   6901 
   6902 	/* For "too big" we also check the ignore flag */
   6903 	if (ira->ira_flags & IRAF_IS_IPV4) {
   6904 		ASSERT(icmph != NULL);
   6905 		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   6906 		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
   6907 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   6908 			return (B_FALSE);
   6909 	} else {
   6910 		ASSERT(icmp6 != NULL);
   6911 		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
   6912 		    tcp->tcp_tcps->tcps_ignore_path_mtu)
   6913 			return (B_FALSE);
   6914 	}
   6915 	return (B_TRUE);
   6916 }
   6917 
   6918 /*
   6919  * Update the TCP connection according to change of PMTU.
   6920  *
   6921  * Path MTU might have changed by either increase or decrease, so need to
   6922  * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
   6923  * or negative MSS, since tcp_mss_set() will do it.
   6924  */
   6925 static void
   6926 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
   6927 {
   6928 	uint32_t	pmtu;
   6929 	int32_t		mss;
   6930 	conn_t		*connp = tcp->tcp_connp;
   6931 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
   6932 	iaflags_t	ixaflags;
   6933 
   6934 	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
   6935 		return;
   6936 
   6937 	if (tcp->tcp_state < TCPS_ESTABLISHED)
   6938 		return;
   6939 
   6940 	/*
   6941 	 * Always call ip_get_pmtu() to make sure that IP has updated
   6942 	 * ixa_flags properly.
   6943 	 */
   6944 	pmtu = ip_get_pmtu(ixa);
   6945 	ixaflags = ixa->ixa_flags;
   6946 
   6947 	/*
   6948 	 * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and
   6949 	 * IPsec overhead if applied. Make sure to use the most recent
   6950 	 * IPsec information.
   6951 	 */
   6952 	mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp);
   6953 
   6954 	/*
   6955 	 * Nothing to change, so just return.
   6956 	 */
   6957 	if (mss == tcp->tcp_mss)
   6958 		return;
   6959 
   6960 	/*
   6961 	 * Currently, for ICMP errors, only PMTU decrease is handled.
   6962 	 */
   6963 	if (mss > tcp->tcp_mss && decrease_only)
   6964 		return;
   6965 
   6966 	DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
   6967 
   6968 	/*
   6969 	 * Update ixa_fragsize and ixa_pmtu.
   6970 	 */
   6971 	ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
   6972 
   6973 	/*
   6974 	 * Adjust MSS and all relevant variables.
   6975 	 */
   6976 	tcp_mss_set(tcp, mss);
   6977 
   6978 	/*
   6979 	 * If the PMTU is below the min size maintained by IP, then ip_get_pmtu
   6980 	 * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP
   6981 	 * has a (potentially different) min size we do the same. Make sure to
   6982 	 * clear IXAF_DONTFRAG, which is used by IP to decide whether to
   6983 	 * fragment the packet.
   6984 	 *
   6985 	 * LSO over IPv6 can not be fragmented. So need to disable LSO
   6986 	 * when IPv6 fragmentation is needed.
   6987 	 */
   6988 	if (mss < tcp->tcp_tcps->tcps_mss_min)
   6989 		ixaflags |= IXAF_PMTU_TOO_SMALL;
   6990 
   6991 	if (ixaflags & IXAF_PMTU_TOO_SMALL)
   6992 		ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
   6993 
   6994 	if ((connp->conn_ipversion == IPV4_VERSION) &&
   6995 	    !(ixaflags & IXAF_PMTU_IPV4_DF)) {
   6996 		tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
   6997 	}
   6998 	ixa->ixa_flags = ixaflags;
   6999 }
   7000 
   7001 /*
   7002  * Do slow start retransmission after ICMP errors of PMTU changes.
   7003  */
   7004 static void
   7005 tcp_rexmit_after_error(tcp_t *tcp)
   7006 {
   7007 	/*
   7008 	 * All sent data has been acknowledged or no data left to send, just
   7009 	 * to return.
   7010 	 */
   7011 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
   7012 	    (tcp->tcp_xmit_head == NULL))
   7013 		return;
   7014 
   7015 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
   7016 		tcp->tcp_rexmit_max = tcp->tcp_fss;
   7017 	else
   7018 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
   7019 
   7020 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
   7021 	tcp->tcp_rexmit = B_TRUE;
   7022 	tcp->tcp_dupack_cnt = 0;
   7023 	tcp->tcp_snd_burst = TCP_CWND_SS;
   7024 	tcp_ss_rexmit(tcp);
   7025 }
   7026 
   7027 /*
   7028  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
   7029  * error messages passed up by IP.
   7030  * Assumes that IP has pulled up all the extension headers as well
   7031  * as the ICMPv6 header.
   7032  */
   7033 static void
   7034 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
   7035 {
   7036 	icmp6_t		*icmp6;
   7037 	ip6_t		*ip6h;
   7038 	uint16_t	iph_hdr_length = ira->ira_ip_hdr_length;
   7039 	tcpha_t		*tcpha;
   7040 	uint8_t		*nexthdrp;
   7041 	uint32_t	seg_seq;
   7042 
   7043 	/*
   7044 	 * Verify that we have a complete IP header.
   7045 	 */
   7046 	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));
   7047 
   7048 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
   7049 	ip6h = (ip6_t *)&icmp6[1];
   7050 	/*
   7051 	 * Verify if we have a complete ICMP and inner IP header.
   7052 	 */
   7053 	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
   7054 noticmpv6:
   7055 		freemsg(mp);
   7056 		return;
   7057 	}
   7058 
   7059 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
   7060 		goto noticmpv6;
   7061 	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
   7062 	/*
   7063 	 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
   7064 	 * have at least ICMP_MIN_TCP_HDR bytes of  TCP header drop the
   7065 	 * packet.
   7066 	 */
   7067 	if ((*nexthdrp != IPPROTO_TCP) ||
   7068 	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
   7069 		goto noticmpv6;
   7070 	}
   7071 
   7072 	seg_seq = ntohl(tcpha->tha_seq);
   7073 	switch (icmp6->icmp6_type) {
   7074 	case ICMP6_PACKET_TOO_BIG:
   7075 		/*
   7076 		 * Update Path MTU, then try to send something out.
   7077 		 */
   7078 		tcp_update_pmtu(tcp, B_TRUE);
   7079 		tcp_rexmit_after_error(tcp);
   7080 		break;
   7081 	case ICMP6_DST_UNREACH:
   7082 		switch (icmp6->icmp6_code) {
   7083 		case ICMP6_DST_UNREACH_NOPORT:
   7084 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7085 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7086 			    (seg_seq == tcp->tcp_iss)) {
   7087 				(void) tcp_clean_death(tcp,
   7088 				    ECONNREFUSED, 8);
   7089 			}
   7090 			break;
   7091 		case ICMP6_DST_UNREACH_ADMIN:
   7092 		case ICMP6_DST_UNREACH_NOROUTE:
   7093 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
   7094 		case ICMP6_DST_UNREACH_ADDR:
   7095 			/* Record the error in case we finally time out. */
   7096 			tcp->tcp_client_errno = EHOSTUNREACH;
   7097 			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
   7098 			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
   7099 			    (seg_seq == tcp->tcp_iss)) {
   7100 				if (tcp->tcp_listener != NULL &&
   7101 				    tcp->tcp_listener->tcp_syn_defense) {
   7102 					/*
   7103 					 * Ditch the half-open connection if we
   7104 					 * suspect a SYN attack is under way.
   7105 					 */
   7106 					(void) tcp_clean_death(tcp,
   7107 					    tcp->tcp_client_errno, 9);
   7108 				}
   7109 			}
   7110 
   7111 
   7112 			break;
   7113 		default:
   7114 			break;
   7115 		}
   7116 		break;
   7117 	case ICMP6_PARAM_PROB:
   7118 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
   7119 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
   7120 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
   7121 		    (uchar_t *)nexthdrp) {
   7122 			if (tcp->tcp_state == TCPS_SYN_SENT ||
   7123 			    tcp->tcp_state == TCPS_SYN_RCVD) {
   7124 				(void) tcp_clean_death(tcp,
   7125 				    ECONNREFUSED, 10);
   7126 			}
   7127 			break;
   7128 		}
   7129 		break;
   7130 
   7131 	case ICMP6_TIME_EXCEEDED:
   7132 	default:
   7133 		break;
   7134 	}
   7135 	freemsg(mp);
   7136 }
   7137 
   7138 /*
   7139  * Notify IP that we are having trouble with this connection.  IP should
   7140  * make note so it can potentially use a different IRE.
   7141  */
   7142 static void
   7143 tcp_ip_notify(tcp_t *tcp)
   7144 {
   7145 	conn_t		*connp = tcp->tcp_connp;
   7146 	ire_t		*ire;
   7147 
   7148 	/*
   7149 	 * Note: in the case of source routing we want to blow away the
   7150 	 * route to the first source route hop.
   7151 	 */
   7152 	ire = connp->conn_ixa->ixa_ire;
   7153 	if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
   7154 		if (ire->ire_ipversion == IPV4_VERSION) {
   7155 			/*
   7156 			 * As per RFC 1122, we send an RTM_LOSING to inform
   7157 			 * routing protocols.
   7158 			 */
   7159 			ip_rts_change(RTM_LOSING, ire->ire_addr,
   7160 			    ire->ire_gateway_addr, ire->ire_mask,
   7161 			    connp->conn_laddr_v4,  0, 0, 0,
   7162 			    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
   7163 			    ire->ire_ipst);
   7164 		}
   7165 		(void) ire_no_good(ire);
   7166 	}
   7167 }
   7168 
   7169 #pragma inline(tcp_send_data)
   7170 
   7171 /*
   7172  * Timer callback routine for keepalive probe.  We do a fake resend of
   7173  * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
   7174  * check to see if we have heard anything from the other end for the last
   7175  * RTO period.  If we have, set the timer to expire for another
   7176  * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
   7177  * RTO << 1 and check again when it expires.  Keep exponentially increasing
   7178  * the timeout if we have not heard from the other side.  If for more than
   7179  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
   7180  * kill the connection unless the keepalive abort threshold is 0.  In
   7181  * that case, we will probe "forever."
   7182  */
   7183 static void
   7184 tcp_keepalive_killer(void *arg)
   7185 {
   7186 	mblk_t	*mp;
   7187 	conn_t	*connp = (conn_t *)arg;
   7188 	tcp_t  	*tcp = connp->conn_tcp;
   7189 	int32_t	firetime;
   7190 	int32_t	idletime;
   7191 	int32_t	ka_intrvl;
   7192 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7193 
   7194 	tcp->tcp_ka_tid = 0;
   7195 
   7196 	if (tcp->tcp_fused)
   7197 		return;
   7198 
   7199 	BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
   7200 	ka_intrvl = tcp->tcp_ka_interval;
   7201 
   7202 	/*
   7203 	 * Keepalive probe should only be sent if the application has not
   7204 	 * done a close on the connection.
   7205 	 */
   7206 	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
   7207 		return;
   7208 	}
   7209 	/* Timer fired too early, restart it. */
   7210 	if (tcp->tcp_state < TCPS_ESTABLISHED) {
   7211 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7212 		    MSEC_TO_TICK(ka_intrvl));
   7213 		return;
   7214 	}
   7215 
   7216 	idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
   7217 	/*
   7218 	 * If we have not heard from the other side for a long
   7219 	 * time, kill the connection unless the keepalive abort
   7220 	 * threshold is 0.  In that case, we will probe "forever."
   7221 	 */
   7222 	if (tcp->tcp_ka_abort_thres != 0 &&
   7223 	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
   7224 		BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
   7225 		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
   7226 		    tcp->tcp_client_errno : ETIMEDOUT, 11);
   7227 		return;
   7228 	}
   7229 
   7230 	if (tcp->tcp_snxt == tcp->tcp_suna &&
   7231 	    idletime >= ka_intrvl) {
   7232 		/* Fake resend of last ACKed byte. */
   7233 		mblk_t	*mp1 = allocb(1, BPRI_LO);
   7234 
   7235 		if (mp1 != NULL) {
   7236 			*mp1->b_wptr++ = '\0';
   7237 			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
   7238 			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
   7239 			freeb(mp1);
   7240 			/*
   7241 			 * if allocation failed, fall through to start the
   7242 			 * timer back.
   7243 			 */
   7244 			if (mp != NULL) {
   7245 				tcp_send_data(tcp, mp);
   7246 				BUMP_MIB(&tcps->tcps_mib,
   7247 				    tcpTimKeepaliveProbe);
   7248 				if (tcp->tcp_ka_last_intrvl != 0) {
   7249 					int max;
   7250 					/*
   7251 					 * We should probe again at least
   7252 					 * in ka_intrvl, but not more than
   7253 					 * tcp_rexmit_interval_max.
   7254 					 */
   7255 					max = tcps->tcps_rexmit_interval_max;
   7256 					firetime = MIN(ka_intrvl - 1,
   7257 					    tcp->tcp_ka_last_intrvl << 1);
   7258 					if (firetime > max)
   7259 						firetime = max;
   7260 				} else {
   7261 					firetime = tcp->tcp_rto;
   7262 				}
   7263 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
   7264 				    tcp_keepalive_killer,
   7265 				    MSEC_TO_TICK(firetime));
   7266 				tcp->tcp_ka_last_intrvl = firetime;
   7267 				return;
   7268 			}
   7269 		}
   7270 	} else {
   7271 		tcp->tcp_ka_last_intrvl = 0;
   7272 	}
   7273 
   7274 	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
   7275 	if ((firetime = ka_intrvl - idletime) < 0) {
   7276 		firetime = ka_intrvl;
   7277 	}
   7278 	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
   7279 	    MSEC_TO_TICK(firetime));
   7280 }
   7281 
   7282 int
   7283 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
   7284 {
   7285 	conn_t	*connp = tcp->tcp_connp;
   7286 	queue_t	*q = connp->conn_rq;
   7287 	int32_t	mss = tcp->tcp_mss;
   7288 	int	maxpsz;
   7289 
   7290 	if (TCP_IS_DETACHED(tcp))
   7291 		return (mss);
   7292 	if (tcp->tcp_fused) {
   7293 		maxpsz = tcp_fuse_maxpsz(tcp);
   7294 		mss = INFPSZ;
   7295 	} else if (tcp->tcp_maxpsz_multiplier == 0) {
   7296 		/*
   7297 		 * Set the sd_qn_maxpsz according to the socket send buffer
   7298 		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
   7299 		 * instruct the stream head to copyin user data into contiguous
   7300 		 * kernel-allocated buffers without breaking it up into smaller
   7301 		 * chunks.  We round up the buffer size to the nearest SMSS.
   7302 		 */
   7303 		maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss);
   7304 		if (tcp->tcp_kssl_ctx == NULL)
   7305 			mss = INFPSZ;
   7306 		else
   7307 			mss = SSL3_MAX_RECORD_LEN;
   7308 	} else {
   7309 		/*
   7310 		 * Set sd_qn_maxpsz to approx half the (receivers) buffer
   7311 		 * (and a multiple of the mss).  This instructs the stream
   7312 		 * head to break down larger than SMSS writes into SMSS-
   7313 		 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
   7314 		 */
   7315 		maxpsz = tcp->tcp_maxpsz_multiplier * mss;
   7316 		if (maxpsz > connp->conn_sndbuf / 2) {
   7317 			maxpsz = connp->conn_sndbuf / 2;
   7318 			/* Round up to nearest mss */
   7319 			maxpsz = MSS_ROUNDUP(maxpsz, mss);
   7320 		}
   7321 	}
   7322 
   7323 	(void) proto_set_maxpsz(q, connp, maxpsz);
   7324 	if (!(IPCL_IS_NONSTR(connp)))
   7325 		connp->conn_wq->q_maxpsz = maxpsz;
   7326 	if (set_maxblk)
   7327 		(void) proto_set_tx_maxblk(q, connp, mss);
   7328 	return (mss);
   7329 }
   7330 
/*
 * Extract option values from a tcp header.  We put any found values into the
 * tcpopt struct and return a bitmask saying which options were found.
 *
 * tcpha points at the TCP header; options are scanned from
 * TCP_MIN_HEADER_LENGTH up to TCP_HDR_LENGTH(tcpha).  Recognized options
 * are MSS, window scale, SACK-permitted, SACK and timestamp.  If
 * tcpopt->tcp is non-NULL, SACK blocks are applied to that connection's
 * notsack list and tcp_fack; otherwise SACK options are skipped.
 *
 * Scanning stops at TCPOPT_EOL or at the first option whose length is
 * inconsistent; options found before that point are still reported.
 *
 * Returns the OR of the TCP_OPT_*_PRESENT flags for the options found.
 */
static int
tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
{
	uchar_t		*endp;
	int		len;
	uint32_t	mss;
	uchar_t		*up = (uchar_t *)tcpha;
	int		found = 0;
	int32_t		sack_len;
	tcp_seq		sack_begin, sack_end;
	tcp_t		*tcp;

	endp = up + TCP_HDR_LENGTH(tcpha);
	up += TCP_MIN_HEADER_LENGTH;
	/*
	 * Within the switch, "continue" moves on to the next option, while
	 * "break" leaves the switch and reaches the "break" after it, which
	 * ends the scan.
	 */
	while (up < endp) {
		/* Bytes remaining in the option area, including this one. */
		len = endp - up;
		switch (*up) {
		case TCPOPT_EOL:
			break;

		case TCPOPT_NOP:
			/* Single-byte padding option. */
			up++;
			continue;

		case TCPOPT_MAXSEG:
			if (len < TCPOPT_MAXSEG_LEN ||
			    up[1] != TCPOPT_MAXSEG_LEN)
				break;

			mss = BE16_TO_U16(up+2);
			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
			tcpopt->tcp_opt_mss = mss;
			found |= TCP_OPT_MSS_PRESENT;

			up += TCPOPT_MAXSEG_LEN;
			continue;

		case TCPOPT_WSCALE:
			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
				break;

			/* Clamp the advertised shift to TCP_MAX_WINSHIFT. */
			if (up[2] > TCP_MAX_WINSHIFT)
				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
			else
				tcpopt->tcp_opt_wscale = up[2];
			found |= TCP_OPT_WSCALE_PRESENT;

			up += TCPOPT_WS_LEN;
			continue;

		case TCPOPT_SACK_PERMITTED:
			if (len < TCPOPT_SACK_OK_LEN ||
			    up[1] != TCPOPT_SACK_OK_LEN)
				break;
			found |= TCP_OPT_SACK_OK_PRESENT;
			up += TCPOPT_SACK_OK_LEN;
			continue;

		case TCPOPT_SACK:
			/*
			 * The option must carry something beyond its two
			 * header bytes and must fit in what is left.
			 */
			if (len <= 2 || up[1] <= 2 || len < up[1])
				break;

			/* If TCP is not interested in SACK blks... */
			if ((tcp = tcpopt->tcp) == NULL) {
				up += up[1];
				continue;
			}
			/* Payload length; up now points at the first block. */
			sack_len = up[1] - TCPOPT_HEADER_LEN;
			up += TCPOPT_HEADER_LEN;

			/*
			 * If the list is empty, allocate one and assume
			 * nothing is sack'ed.
			 */
			ASSERT(tcp->tcp_sack_info != NULL);
			if (tcp->tcp_notsack_list == NULL) {
				tcp_notsack_update(&(tcp->tcp_notsack_list),
				    tcp->tcp_suna, tcp->tcp_snxt,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));

				/*
				 * Make sure tcp_notsack_list is not NULL.
				 * This happens when kmem_alloc(KM_NOSLEEP)
				 * returns NULL.  Skip the SACK payload and
				 * go on with the next option; SACK is not
				 * flagged as found in that case.
				 */
				if (tcp->tcp_notsack_list == NULL) {
					up += sack_len;
					continue;
				}
				tcp->tcp_fack = tcp->tcp_suna;
			}

			/*
			 * Consume the payload in 8-byte (begin, end) pairs.
			 * A payload length that is not a multiple of 8 is
			 * not rejected: whole blocks keep being read while
			 * sack_len is positive and 8 bytes remain before
			 * endp.
			 */
			while (sack_len > 0) {
				if (up + 8 > endp) {
					/* Truncated block: end the scan. */
					up = endp;
					break;
				}
				sack_begin = BE32_TO_U32(up);
				up += 4;
				sack_end = BE32_TO_U32(up);
				up += 4;
				sack_len -= 8;
				/*
				 * Bounds checking.  Make sure the SACK
				 * info is within tcp_suna and tcp_snxt.
				 * If this SACK blk is out of bound, ignore
				 * it but continue to parse the following
				 * blks.
				 */
				if (SEQ_LEQ(sack_end, sack_begin) ||
				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
					continue;
				}
				tcp_notsack_insert(&(tcp->tcp_notsack_list),
				    sack_begin, sack_end,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));
				/* Track the highest sequence SACKed so far. */
				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
					tcp->tcp_fack = sack_end;
				}
			}
			found |= TCP_OPT_SACK_PRESENT;
			continue;

		case TCPOPT_TSTAMP:
			if (len < TCPOPT_TSTAMP_LEN ||
			    up[1] != TCPOPT_TSTAMP_LEN)
				break;

			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);

			found |= TCP_OPT_TSTAMP_PRESENT;

			up += TCPOPT_TSTAMP_LEN;
			continue;

		default:
			/*
			 * Unknown option: step over it using its length
			 * byte.  A zero length, or one that runs past the
			 * end of the header, ends the scan.
			 */
			if (len <= 1 || len < (int)up[1] || up[1] == 0)
				break;
			up += up[1];
			continue;
		}
		/* Reached only via "break" in the switch: stop parsing. */
		break;
	}
	return (found);
}
   7484 
   7485 /*
   7486  * Set the MSS associated with a particular tcp based on its current value,
   7487  * and a new one passed in. Observe minimums and maximums, and reset other
   7488  * state variables that we want to view as multiples of MSS.
   7489  *
   7490  * The value of MSS could be either increased or descreased.
   7491  */
   7492 static void
   7493 tcp_mss_set(tcp_t *tcp, uint32_t mss)
   7494 {
   7495 	uint32_t	mss_max;
   7496 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   7497 	conn_t		*connp = tcp->tcp_connp;
   7498 
   7499 	if (connp->conn_ipversion == IPV4_VERSION)
   7500 		mss_max = tcps->tcps_mss_max_ipv4;
   7501 	else
   7502 		mss_max = tcps->tcps_mss_max_ipv6;
   7503 
   7504 	if (mss < tcps->tcps_mss_min)
   7505 		mss = tcps->tcps_mss_min;
   7506 	if (mss > mss_max)
   7507 		mss = mss_max;
   7508 	/*
   7509 	 * Unless naglim has been set by our client to
   7510 	 * a non-mss value, force naglim to track mss.
   7511 	 * This can help to aggregate small writes.
   7512 	 */
   7513 	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
   7514 		tcp->tcp_naglim = mss;
   7515 	/*
   7516 	 * TCP should be able to buffer at least 4 MSS data for obvious
   7517 	 * performance reason.
   7518 	 */
   7519 	if ((mss << 2) > connp->conn_sndbuf)
   7520 		connp->conn_sndbuf = mss << 2;
   7521 
   7522 	/*
   7523 	 * Set the send lowater to at least twice of MSS.
   7524 	 */
   7525 	if ((mss << 1) > connp->conn_sndlowat)
   7526 		connp->conn_sndlowat = mss << 1;
   7527 
   7528 	/*
   7529 	 * Update tcp_cwnd according to the new value of MSS. Keep the
   7530 	 * previous ratio to preserve the transmit rate.
   7531 	 */
   7532 	tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
   7533 	tcp->tcp_cwnd_cnt = 0;
   7534 
   7535 	tcp->tcp_mss = mss;
   7536 	(void) tcp_maxpsz_set(tcp, B_TRUE);
   7537 }
   7538 
   7539 /* For /dev/tcp aka AF_INET open */
   7540 static int
   7541 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7542 {
   7543 	return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
   7544 }
   7545 
   7546 /* For /dev/tcp6 aka AF_INET6 open */
   7547 static int
   7548 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
   7549 {
   7550 	return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
   7551 }
   7552 
   7553 static conn_t *
   7554 tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
   7555     int *errorp)
   7556 {
   7557 	tcp_t		*tcp = NULL;
   7558 	conn_t		*connp;
   7559 	zoneid_t	zoneid;
   7560 	tcp_stack_t	*tcps;
   7561 	squeue_t	*sqp;
   7562 
   7563 	ASSERT(errorp != NULL);
   7564 	/*
   7565 	 * Find the proper zoneid and netstack.
   7566 	 */
   7567 	/*
   7568 	 * Special case for install: miniroot needs to be able to
   7569 	 * access files via NFS as though it were always in the
   7570 	 * global zone.
   7571 	 */
   7572 	if (credp == kcred && nfs_global_client_only != 0) {
   7573 		zoneid = GLOBAL_ZONEID;
   7574 		tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
   7575 		    netstack_tcp;
   7576 		ASSERT(tcps != NULL);
   7577 	} else {
   7578 		netstack_t *ns;
   7579 
   7580 		ns = netstack_find_by_cred(credp);
   7581 		ASSERT(ns != NULL);
   7582 		tcps = ns->netstack_tcp;
   7583 		ASSERT(tcps != NULL);
   7584 
   7585 		/*
   7586 		 * For exclusive stacks we set the zoneid to zero
   7587 		 * to make TCP operate as if in the global zone.
   7588 		 */
   7589 		if (tcps->tcps_netstack->netstack_stackid !=
   7590 		    GLOBAL_NETSTACKID)
   7591 			zoneid = GLOBAL_ZONEID;
   7592 		else
   7593 			zoneid = crgetzoneid(credp);
   7594 	}
   7595 
   7596 	sqp = IP_SQUEUE_GET((uint_t)gethrtime());
   7597 	connp = (conn_t *)tcp_get_conn(sqp, tcps);
   7598 	/*
   7599 	 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
   7600 	 * so we drop it by one.
   7601 	 */
   7602 	netstack_rele(tcps->tcps_netstack);
   7603 	if (connp == NULL) {
   7604 		*errorp = ENOSR;
   7605 		return (NULL);
   7606 	}
   7607 	ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
   7608 
   7609 	connp->conn_sqp = sqp;
   7610 	connp->conn_initial_sqp = connp->conn_sqp;
   7611 	connp->conn_ixa->ixa_sqp = connp->conn_sqp;
   7612 	tcp = connp->conn_tcp;
   7613 
   7614 	/*
   7615 	 * Besides asking IP to set the checksum for us, have conn_ip_output
   7616 	 * to do the following checks when necessary:
   7617 	 *
   7618 	 * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid
   7619 	 * IXAF_VERIFY_PMTU: verify PMTU changes
   7620 	 * IXAF_VERIFY_LSO: verify LSO capability changes
   7621 	 */
   7622 	connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
   7623 	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO;
   7624 
   7625 	if (!tcps->tcps_dev_flow_ctl)
   7626 		connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
   7627 
   7628 	if (isv6) {
   7629 		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
   7630 		connp->conn_ipversion = IPV6_VERSION;
   7631 		connp->conn_family = AF_INET6;
   7632 		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
   7633 		connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit;
   7634 	} else {
   7635 		connp->conn_ipversion = IPV4_VERSION;
   7636 		connp->conn_family = AF_INET;
   7637 		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
   7638 		connp->conn_default_ttl = tcps->tcps_ipv4_ttl;
   7639 	}
   7640 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
   7641 
   7642 	crhold(credp);
   7643 	connp->conn_cred = credp;
   7644 	connp->conn_cpid = curproc->p_pid;
   7645 	connp->conn_open_time = ddi_get_lbolt64();
   7646 
   7647 	connp->conn_zoneid = zoneid;
   7648 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
   7649 	connp->conn_ixa->ixa_zoneid = zoneid;
   7650 	connp->conn_mlp_type = mlptSingle;
   7651 	ASSERT(connp->conn_netstack == tcps->tcps_netstack);
   7652 	ASSERT(tcp->tcp_tcps == tcps);
   7653 
   7654 	/*
   7655 	 * If the caller has the process-wide flag set, then default to MAC
   7656 	 * exempt mode.  This allows read-down to unlabeled hosts.
   7657 	 */
   7658 	if (getpflags(NET_MAC_AWARE, credp) != 0)
   7659 		connp->conn_mac_mode = CONN_MAC_AWARE;
   7660 
   7661 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
   7662 
   7663 	if (issocket) {
   7664 		tcp->tcp_issocket = 1;
   7665 	}
   7666 
   7667 	connp->conn_rcvbuf = tcps->tcps_recv_hiwat;
   7668 	connp->conn_sndbuf = tcps->tcps_xmit_hiwat;
   7669 	connp->conn_sndlowat = tcps->tcps_xmit_lowat;
   7670 	connp->conn_so_type = SOCK_STREAM;
   7671 	connp->conn_wroff = connp->conn_ht_iphc_allocated +
   7672 	    tcps->tcps_wroff_xtra;
   7673 
   7674 	SOCK_CONNID_INIT(tcp->tcp_connid);
   7675 	tcp->tcp_state = TCPS_IDLE;
   7676 	tcp_init_values(tcp);
   7677 	return (connp);
   7678 }
   7679 
   7680 static int
   7681 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
   7682     boolean_t isv6)
   7683 {
   7684 	tcp_t		*tcp = NULL;
   7685 	conn_t		*connp = NULL;
   7686 	int		err;
   7687 	vmem_t		*minor_arena = NULL;
   7688 	dev_t		conn_dev;
   7689 	boolean_t	issocket;
   7690 
   7691 	if (q->q_ptr != NULL)
   7692 		return (0);
   7693 
   7694 	if (sflag == MODOPEN)
   7695 		return (EINVAL);
   7696 
   7697 	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
   7698 	    ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
   7699 		minor_arena = ip_minor_arena_la;
   7700 	} else {
   7701 		/*
   7702 		 * Either minor numbers in the large arena were exhausted
   7703 		 * or a non socket application is doing the open.
   7704 		 * Try to allocate from the small arena.
   7705 		 */
   7706 		if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
   7707 			return (EBUSY);
   7708 		}
   7709 		minor_arena = ip_minor_arena_sa;
   7710 	}
   7711 
   7712 	ASSERT(minor_arena != NULL);
   7713 
   7714 	*devp = makedevice(getmajor(*devp), (minor_t)conn_dev);
   7715 
   7716 	if (flag & SO_FALLBACK) {
   7717 		/*
   7718 		 * Non streams socket needs a stream to fallback to
   7719 		 */
   7720 		RD(q)->q_ptr = (void *)conn_dev;
   7721 		WR(q)->q_qinfo = &tcp_fallback_sock_winit;
   7722 		WR(q)->q_ptr = (void *)minor_arena;
   7723 		qprocson(q);
   7724 		return (0);
   7725 	} else if (flag & SO_ACCEPTOR) {
   7726 		q->q_qinfo = &tcp_acceptor_rinit;
   7727 		/*
   7728 		 * the conn_dev and minor_arena will be subsequently used by
   7729 		 * tcp_tli_accept() and tcp_tpi_close_accept() to figure out
   7730 		 * the minor device number for this connection from the q_ptr.
   7731 		 */
   7732 		RD(q)->q_ptr = (void *)conn_dev;
   7733 		WR(q)->q_qinfo = &tcp_acceptor_winit;
   7734 		WR(q)->q_ptr = (void *)minor_arena;
   7735 		qprocson(q);
   7736 		return (0);
   7737 	}
   7738 
   7739 	issocket = flag & SO_SOCKSTR;
   7740 	connp = tcp_create_common(credp, isv6, issocket, &err);
   7741 
   7742 	if (connp == NULL) {
   7743 		inet_minor_free(minor_arena, conn_dev);
   7744 		q->q_ptr = WR(q)->q_ptr = NULL;
   7745 		return (err);
   7746 	}
   7747 
   7748 	connp->conn_rq = q;
   7749 	connp->conn_wq = WR(q);
   7750 	q->q_ptr = WR(q)->q_ptr = connp;
   7751 
   7752 	connp->conn_dev = conn_dev;
   7753 	connp->conn_minor_arena = minor_arena;
   7754 
   7755 	ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
   7756 	ASSERT(WR(q)->q_qinfo == &tcp_winit);
   7757 
   7758 	tcp = connp->conn_tcp;
   7759 
   7760 	if (issocket) {
   7761 		WR(q)->q_qinfo = &tcp_sock_winit;
   7762 	} else {
   7763 #ifdef  _ILP32
   7764 		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
   7765 #else
   7766 		tcp->tcp_acceptor_id = conn_dev;
   7767 #endif  /* _ILP32 */
   7768 		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id,