Home | History | Annotate | Download | only in tcp
      1      0      stevel /*
      2      0      stevel  * CDDL HEADER START
      3      0      stevel  *
      4      0      stevel  * The contents of this file are subject to the terms of the
      5   1205      kcpoon  * Common Development and Distribution License (the "License").
      6   1205      kcpoon  * You may not use this file except in compliance with the License.
      7      0      stevel  *
      8      0      stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0      stevel  * or http://www.opensolaris.org/os/licensing.
     10      0      stevel  * See the License for the specific language governing permissions
     11      0      stevel  * and limitations under the License.
     12      0      stevel  *
     13      0      stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0      stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0      stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0      stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0      stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0      stevel  *
     19      0      stevel  * CDDL HEADER END
     20      0      stevel  */
     21   1205      kcpoon 
     22   1205      kcpoon /*
     23   8477         Rao  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24      0      stevel  * Use is subject to license terms.
     25      0      stevel  */
     26      0      stevel /* Copyright (c) 1990 Mentat Inc. */
     27      0      stevel 
     28      0      stevel #include <sys/types.h>
     29      0      stevel #include <sys/stream.h>
     30      0      stevel #include <sys/strsun.h>
     31      0      stevel #include <sys/strsubr.h>
     32      0      stevel #include <sys/stropts.h>
     33      0      stevel #include <sys/strlog.h>
     34      0      stevel #define	_SUN_TPI_VERSION 2
     35      0      stevel #include <sys/tihdr.h>
     36      0      stevel #include <sys/timod.h>
     37      0      stevel #include <sys/ddi.h>
     38      0      stevel #include <sys/sunddi.h>
     39      0      stevel #include <sys/suntpi.h>
     40      0      stevel #include <sys/xti_inet.h>
     41      0      stevel #include <sys/cmn_err.h>
     42      0      stevel #include <sys/debug.h>
     43   2958    dr146992 #include <sys/sdt.h>
     44      0      stevel #include <sys/vtrace.h>
     45      0      stevel #include <sys/kmem.h>
     46      0      stevel #include <sys/ethernet.h>
     47      0      stevel #include <sys/cpuvar.h>
     48      0      stevel #include <sys/dlpi.h>
     49      0      stevel #include <sys/pattr.h>
     50      0      stevel #include <sys/policy.h>
     51   1676         jpk #include <sys/priv.h>
     52      0      stevel #include <sys/zone.h>
     53   3448    dh155122 #include <sys/sunldi.h>
     54      0      stevel 
     55      0      stevel #include <sys/errno.h>
     56      0      stevel #include <sys/signal.h>
     57      0      stevel #include <sys/socket.h>
     58   8348        Eric #include <sys/socketvar.h>
     59      0      stevel #include <sys/sockio.h>
     60      0      stevel #include <sys/isa_defs.h>
     61      0      stevel #include <sys/md5.h>
     62      0      stevel #include <sys/random.h>
     63   6707      brutus #include <sys/uio.h>
     64   8048    Madhavan #include <sys/systm.h>
     65      0      stevel #include <netinet/in.h>
     66      0      stevel #include <netinet/tcp.h>
     67      0      stevel #include <netinet/ip6.h>
     68      0      stevel #include <netinet/icmp6.h>
     69      0      stevel #include <net/if.h>
     70      0      stevel #include <net/route.h>
     71      0      stevel #include <inet/ipsec_impl.h>
     72      0      stevel 
     73      0      stevel #include <inet/common.h>
     74      0      stevel #include <inet/ip.h>
     75    741    masputra #include <inet/ip_impl.h>
     76      0      stevel #include <inet/ip6.h>
     77      0      stevel #include <inet/ip_ndp.h>
     78   8348        Eric #include <inet/proto_set.h>
     79      0      stevel #include <inet/mib2.h>
     80      0      stevel #include <inet/nd.h>
     81      0      stevel #include <inet/optcom.h>
     82      0      stevel #include <inet/snmpcom.h>
     83      0      stevel #include <inet/kstatcom.h>
     84      0      stevel #include <inet/tcp.h>
     85    741    masputra #include <inet/tcp_impl.h>
     86   8833        Venu #include <inet/udp_impl.h>
     87      0      stevel #include <net/pfkeyv2.h>
     88      0      stevel #include <inet/ipdrop.h>
     89      0      stevel 
     90      0      stevel #include <inet/ipclassifier.h>
     91      0      stevel #include <inet/ip_ire.h>
     92   2535    sangeeta #include <inet/ip_ftable.h>
     93      0      stevel #include <inet/ip_if.h>
     94      0      stevel #include <inet/ipp_common.h>
     95  11042        Erik #include <inet/ip_rts.h>
     96   2958    dr146992 #include <inet/ip_netinfo.h>
     97   8275        Eric #include <sys/squeue_impl.h>
     98      0      stevel #include <sys/squeue.h>
     99    898        kais #include <inet/kssl/ksslapi.h>
    100   1676         jpk #include <sys/tsol/label.h>
    101   1676         jpk #include <sys/tsol/tnet.h>
    102   1676         jpk #include <rpc/pmap_prot.h>
    103   8048    Madhavan #include <sys/callo.h>
    104  11066      rafael 
    105  11110        Erik #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    106      0      stevel 
    107      0      stevel /*
    108      0      stevel  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    109      0      stevel  *
    110      0      stevel  * (Read the detailed design doc in PSARC case directory)
    111      0      stevel  *
    112      0      stevel  * The entire tcp state is contained in tcp_t and conn_t structure
    113      0      stevel  * which are allocated in tandem using ipcl_conn_create() and passing
    114  11042        Erik  * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
    115      0      stevel  * the references on the tcp_t. The tcp_t structure is never compressed
    116      0      stevel  * and packets always land on the correct TCP perimeter from the time
    117      0      stevel  * eager is created till the time tcp_t dies (as such the old mentat
    118      0      stevel  * TCP global queue is not used for detached state and no IPSEC checking
    119      0      stevel  * is required). The global queue is still allocated to send out resets
    120      0      stevel  * for connection which have no listeners and IP directly calls
    121      0      stevel  * tcp_xmit_listeners_reset() which does any policy check.
    122      0      stevel  *
    123      0      stevel  * Protection and Synchronisation mechanism:
    124      0      stevel  *
    125      0      stevel  * The tcp data structure does not use any kind of lock for protecting
    126      0      stevel  * its state but instead uses 'squeues' for mutual exclusion from various
    127      0      stevel  * read and write side threads. To access a tcp member, the thread should
    128   8275        Eric  * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
    129   8275        Eric  * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
    130      0      stevel  * can pass any tcp function having prototype of edesc_t as argument
    131      0      stevel  * (different from traditional STREAMs model where packets come in only
    132      0      stevel  * designated entry points). The list of functions that can be directly
    133      0      stevel  * called via squeue are listed before the usual function prototype.
    134      0      stevel  *
    135      0      stevel  * Referencing:
    136      0      stevel  *
    137      0      stevel  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    138      0      stevel  * tcp structure doesn't disappear when it's needed. When the application
    139      0      stevel  * creates an outgoing connection or accepts an incoming connection, we
    140      0      stevel  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    141      0      stevel  * The IP reference is just a symbolic reference since ip_tcpclose()
    142      0      stevel  * looks at tcp structure after tcp_close_output() returns which could
    143      0      stevel  * have dropped the last TCP reference. So as long as the connection is
    144      0      stevel  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    145      0      stevel  * conn_t. The classifier puts its own reference when the connection is
    146      0      stevel  * inserted in listen or connected hash. Anytime a thread needs to enter
    147      0      stevel  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    148      0      stevel  * on write side or by doing a classify on read side and then puts a
    149      0      stevel  * reference on the conn before doing squeue_enter/tryenter/fill. For
    150      0      stevel  * read side, the classifier itself puts the reference under fanout lock
    151      0      stevel  * to make sure that tcp can't disappear before it gets processed. The
    152      0      stevel  * squeue will drop this reference automatically so the called function
    153      0      stevel  * doesn't have to do a DEC_REF.
    154      0      stevel  *
    155      0      stevel  * Opening a new connection:
    156      0      stevel  *
    157   3448    dh155122  * The outgoing connection open is pretty simple. tcp_open() does the
    158      0      stevel  * work in creating the conn/tcp structure and initializing it. The
    159      0      stevel  * squeue assignment is done based on the CPU the application
    160      0      stevel  * is running on. So for outbound connections, processing is always done
    161      0      stevel  * on application CPU which might be different from the incoming CPU
    162      0      stevel  * being interrupted by the NIC. An optimal way would be to figure out
    163      0      stevel  * the NIC <-> CPU binding at listen time, and assign the outgoing
    164      0      stevel  * connection to the squeue attached to the CPU that will be interrupted
    165      0      stevel  * for incoming packets (we know the NIC based on the bind IP address).
    166      0      stevel  * This might seem like a problem if more data is going out but the
    167      0      stevel  * fact is that in most cases the transmit is ACK driven transmit where
    168      0      stevel  * the outgoing data normally sits on TCP's xmit queue waiting to be
    169      0      stevel  * transmitted.
    170      0      stevel  *
    171      0      stevel  * Accepting a connection:
    172      0      stevel  *
    173      0      stevel  * This is a more interesting case because of various races involved in
    174      0      stevel  * establishing an eager in its own perimeter. Read the meta comment on
    175  11042        Erik  * top of tcp_input_listener(). But briefly, the squeue is picked by
    176  11042        Erik  * ip_fanout based on the ring or the sender (if loopback).
    177      0      stevel  *
    178      0      stevel  * Closing a connection:
    179      0      stevel  *
    180      0      stevel  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    181      0      stevel  * via squeue to do the close and mark the tcp as detached if the connection
    182      0      stevel  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    183      0      stevel  * reference but tcp_close() always drops IP's reference. So if tcp was
    184      0      stevel  * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
    185      0      stevel  * and 1 because it is in classifier's connected hash. This is the condition
    186      0      stevel  * we use to determine that its OK to clean up the tcp outside of squeue
    187      0      stevel  * when time wait expires (check the ref under fanout and conn_lock and
    188      0      stevel  * if it is 2, remove it from fanout hash and kill it).
    189      0      stevel  *
    190      0      stevel  * Although close just drops the necessary references and marks the
    191      0      stevel  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    192      0      stevel  * set (under squeue) before letting the STREAM go away (because an
    193      0      stevel  * inbound packet might attempt to go up the STREAM while the close
    194      0      stevel  * has happened and tcp_detached is not set). So a special lock and
    195      0      stevel  * flag is used along with a condition variable (tcp_closelock, tcp_closed,
    196      0      stevel  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    197      0      stevel  * tcp_detached.
    198      0      stevel  *
    199      0      stevel  * Special provisions and fast paths:
    200      0      stevel  *
    201  11042        Erik  * We make special provisions for sockfs by marking tcp_issocket
    202      0      stevel  * whenever we have only sockfs on top of TCP. This allows us to skip
    203      0      stevel  * putting the tcp in acceptor hash since a sockfs listener can never
    204      0      stevel  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    205      0      stevel  * since eager has already been allocated and the accept now happens
    206      0      stevel  * on acceptor STREAM. There is a big blob of comment on top of
    207  11042        Erik  * tcp_input_listener explaining the new accept. When socket is POP'd,
    208      0      stevel  * sockfs sends us an ioctl to mark the fact and we go back to old
    209      0      stevel  * behaviour. Once tcp_issocket is unset, it is never set again for the
    210      0      stevel  * life of that connection.
    211      0      stevel  *
    212      0      stevel  * IPsec notes :
    213      0      stevel  *
    214      0      stevel  * Since a packet is always executed on the correct TCP perimeter
    215      0      stevel  * all IPsec processing is deferred to IP including checking new
    216      0      stevel  * connections and setting IPSEC policies for new connection. The
    217      0      stevel  * only exception is tcp_xmit_listeners_reset() which is called
    218      0      stevel  * directly from IP and needs to policy check to see if TH_RST
    219      0      stevel  * can be sent out.
    220      0      stevel  */
    221      0      stevel 
    222      0      stevel /*
    223      0      stevel  * Values for squeue switch:
    224   8275        Eric  * 1: SQ_NODRAIN
    225   8275        Eric  * 2: SQ_PROCESS
    226   8275        Eric  * 3: SQ_FILL
    227   8275        Eric  */
    228   8275        Eric int tcp_squeue_wput = 2;	/* default 2: SQ_PROCESS; set in /etc/system */
    229   8275        Eric int tcp_squeue_flag;	/* presumably the SQ_* flag chosen by tcp_squeue_wput -- TODO confirm */
    230      0      stevel 
    231      0      stevel /*
    232      0      stevel  * This controls how tiny a write must be before we try to copy it
    233  11042        Erik  * into the mblk on the tail of the transmit queue.  Not much
    234      0      stevel  * speedup is observed for values larger than sixteen.  Zero will
    235      0      stevel  * disable the optimisation.
    236      0      stevel  */
    237      0      stevel int tcp_tx_pull_len = 16;	/* 0 disables the tail-copy optimisation */
    238      0      stevel 
    239      0      stevel /*
    240      0      stevel  * TCP Statistics.
    241      0      stevel  *
    242      0      stevel  * How TCP statistics work.
    243      0      stevel  *
    244      0      stevel  * There are two types of statistics invoked by two macros.
    245      0      stevel  *
    246      0      stevel  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    247      0      stevel  * supposed to be used in non MT-hot paths of the code.
    248      0      stevel  *
    249      0      stevel  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    250      0      stevel  * supposed to be used for DEBUG purposes and may be used on a hot path.
    251      0      stevel  *
    252      0      stevel  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    253      0      stevel  * (use "kstat tcp" to get them).
    254      0      stevel  *
    255      0      stevel  * There is also additional debugging facility that marks tcp_clean_death()
    256      0      stevel  * instances and saves them in tcp_t structure. It is triggered by
    257      0      stevel  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    258      0      stevel  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    259      0      stevel  * is triggered by TCP_CLD_COUNTERS define.
    260      0      stevel  *
    261      0      stevel  * How to add new counters.
    262      0      stevel  *
    263      0      stevel  * 1) Add a field in the tcp_stat structure describing your counter.
    264   3448    dh155122  * 2) Add a line in the template in tcp_kstat2_init() with the name
    265   3448    dh155122  *    of the counter.
    266      0      stevel  *
    267      0      stevel  *    IMPORTANT!! - make sure that both are in sync !!
    268      0      stevel  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    269      0      stevel  *
    270      0      stevel  * Please avoid using private counters which are not kstat-exported.
    271      0      stevel  *
    272      0      stevel  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    273      0      stevel  * in tcp_t structure.
    274      0      stevel  *
    275      0      stevel  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    276      0      stevel  */
    277      0      stevel 
    278      0      stevel #ifndef TCP_DEBUG_COUNTER	/* keep a value supplied by the build */
    279      0      stevel #ifdef DEBUG
    280      0      stevel #define	TCP_DEBUG_COUNTER 1	/* DEBUG: TCP_DBGSTAT counters compiled in */
    281      0      stevel #else	/* !DEBUG */
    282      0      stevel #define	TCP_DEBUG_COUNTER 0	/* non-DEBUG: TCP_DBGSTAT compiled out */
    283      0      stevel #endif	/* DEBUG */
    284      0      stevel #endif	/* TCP_DEBUG_COUNTER */
    285      0      stevel 
    286    741    masputra #define	TCP_CLD_COUNTERS 0	/* non-zero enables per-tag tcp_clean_death() counters */
    287      0      stevel 
    288      0      stevel #define	TCP_TAG_CLEAN_DEATH 1	/* tag tcp_clean_death() instances in tcp_t */
    289      0      stevel #define	TCP_MAX_CLEAN_DEATH_TAG 32	/* max number of clean death tags */
    290      0      stevel 
    291      0      stevel #ifdef lint
    292      0      stevel static int _lint_dummy_;	/* referenced by the lint variants of the stat macros */
    293      0      stevel #endif	/* lint */
    294      0      stevel 
    295      0      stevel #if TCP_CLD_COUNTERS
    296      0      stevel static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];	/* hits per tag */
    297      0      stevel #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++	/* non-atomic; NOTE(review): unparenthesized */
    298      0      stevel #elif defined(lint)
    299      0      stevel #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);	/* lint stub; NOTE(review): carries its own ';' */
    300      0      stevel #else	/* counters compiled out */
    301      0      stevel #define	TCP_CLD_STAT(x)
    302      0      stevel #endif	/* TCP_CLD_COUNTERS */
    303      0      stevel 
    304      0      stevel #if TCP_DEBUG_COUNTER	/* atomic debug counters compiled in */
    305   3448    dh155122 #define	TCP_DBGSTAT(tcps, x)	\
    306   3448    dh155122 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)	/* counter x of the given tcps */
    307   3448    dh155122 #define	TCP_G_DBGSTAT(x)	\
    308   3448    dh155122 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)	/* counter x of tcp_g_statistics */
    309      0      stevel #elif defined(lint)	/* lint stubs */
    310   3448    dh155122 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
    311   3448    dh155122 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
    312      0      stevel #else	/* debug counters compiled out */
    313   3448    dh155122 #define	TCP_DBGSTAT(tcps, x)
    314   3448    dh155122 #define	TCP_G_DBGSTAT(x)
    315      0      stevel #endif	/* TCP_DEBUG_COUNTER */
    316      0      stevel 
    317   3448    dh155122 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)	/* non-atomic */
    318   3448    dh155122 
    319   3448    dh155122 tcp_g_stat_t	tcp_g_statistics;	/* counters bumped by TCP_G_STAT and TCP_G_DBGSTAT */
    320   3448    dh155122 kstat_t		*tcp_g_kstat;	/* presumably the kstat exporting tcp_g_statistics -- TODO confirm */
    321      0      stevel 
    322      0      stevel /* Timestamp comparisons on the signed 32-bit difference of a and b */
    323      0      stevel #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)	/* a at or after b */
    324      0      stevel #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)	/* a before b */
    325      0      stevel 
    326      0      stevel /*
    327      0      stevel  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    328      0      stevel  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    329      0      stevel  * by adding three components: a time component which grows by 1 every 4096
    330      0      stevel  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    331      0      stevel  * a per-connection component which grows by 125000 for every new connection;
    332      0      stevel  * and an "extra" component that grows by a random amount centered
    333  11042        Erik  * approximately on 64000.  This causes the ISS generator to cycle every
    334      0      stevel  * 4.89 hours if no TCP connections are made, and faster if connections are
    335      0      stevel  * made.
    336      0      stevel  *
    337      0      stevel  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    338      0      stevel  * components: a time component which grows by 250000 every second; and
    339      0      stevel  * a per-connection component which grows by 125000 for every new connection.
    340      0      stevel  *
    341      0      stevel  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    342      0      stevel  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    343      0      stevel  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    344      0      stevel  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    345      0      stevel  * password.
    346      0      stevel  */
    347      0      stevel #define	ISS_INCR	250000	/* matches the per-second time growth when tcp_strong_iss is 0 */
    348      0      stevel #define	ISS_NSEC_SHT	12	/* presumably the shift giving the 4096 ns (1 << 12) time step -- TODO confirm */
    349      0      stevel 
    350      0      stevel static sin_t	sin_null;	/* All-zero sin_t (static storage) for quick clears */
    351      0      stevel static sin6_t	sin6_null;	/* All-zero sin6_t (static storage) for quick clears */
    352      0      stevel 
    353      0      stevel /*
    354      0      stevel  * This implementation follows the 4.3BSD interpretation of the urgent
    355      0      stevel  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    356      0      stevel  * incompatible changes in protocols like telnet and rlogin.
    357      0      stevel  */
    358      0      stevel #define	TCP_OLD_URP_INTERPRETATION	1	/* 1: 4.3BSD urgent pointer interpretation */
    359      0      stevel 
    360  11042        Erik /*
    361  11042        Erik  * Detached and not hard-bound.  tcp_listener is not cleared atomically with
    362  11042        Erik  * tcp_detached being cleared, so the tcp_hard_binding test is needed to tell
    363  11042        Erik  * a detached connection apart from one that is being accepted.
    364  11042        Erik  */
    365      0      stevel #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    366  11042        Erik 	(TCP_IS_DETACHED(tcp) &&	\
    367      0      stevel 	    (!(tcp)->tcp_hard_binding))
    368      0      stevel 
    369      0      stevel /*
    370      0      stevel  * TCP reassembly macros.  We hide the starting sequence number in b_next
    371      0      stevel  * and the ending sequence number in b_prev of messages on the reassembly
    372      0      stevel  * queue.  The messages are chained using b_cont.  These macros are used in
    373      0      stevel  * tcp_reass() so we don't have to see the ugly casts and assignments.
    374      0      stevel  */
    375      0      stevel #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
    376      0      stevel #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    377      0      stevel 					(mblk_t *)(uintptr_t)(u))
    378      0      stevel #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
    379      0      stevel #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    380      0      stevel 					(mblk_t *)(uintptr_t)(u))
    381      0      stevel 
    382      0      stevel /*
    383      0      stevel  * Implementation of TCP Timers.
    384      0      stevel  * =============================
    385      0      stevel  *
    386      0      stevel  * INTERFACE:
    387      0      stevel  *
    388      0      stevel  * There are two basic functions and one macro dealing with tcp timers:
    389      0      stevel  *
    390      0      stevel  *	timeout_id_t	tcp_timeout(connp, func, time)
    391      0      stevel  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    392      0      stevel  *	TCP_TIMER_RESTART(tcp, intvl)
    393      0      stevel  *
    394      0      stevel  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    395      0      stevel  * after 'time' ticks passed. The function called by timeout() must adhere to
    396      0      stevel  * the same restrictions as a driver soft interrupt handler - it must not sleep
    397      0      stevel  * or call other functions that might sleep. The value returned is the opaque
    398      0      stevel  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    399      0      stevel  * cancel the request. The call to tcp_timeout() may fail in which case it
    400      0      stevel  * returns zero. This is different from the timeout(9F) function which never
    401      0      stevel  * fails.
    402      0      stevel  *
    403      0      stevel  * The call-back function 'func' always receives 'connp' as its single
    404      0      stevel  * argument. It is always executed in the squeue corresponding to the tcp
    405      0      stevel  * structure. The tcp structure is guaranteed to be present at the time the
    406      0      stevel  * call-back is called.
    407      0      stevel  *
    408      0      stevel  * NOTE: The call-back function 'func' is never called if tcp is in
    409      0      stevel  * 	the TCPS_CLOSED state.
    410      0      stevel  *
    411      0      stevel  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    412      0      stevel  * request. Locks acquired by the call-back routine should not be held across
    413      0      stevel  * the call to tcp_timeout_cancel() or a deadlock may result.
    414      0      stevel  *
    415      0      stevel  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    416      0      stevel  * Otherwise, it returns an integer value greater than or equal to 0. In
    417      0      stevel  * particular, if the call-back function is already placed on the squeue, it can
    418      0      stevel  * not be canceled.
    419      0      stevel  *
    420      0      stevel  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    421      0      stevel  * 	within squeue context corresponding to the tcp instance. Since the
    422      0      stevel  *	call-back is also called via the same squeue, there are no race
    423      0      stevel  *	conditions described in untimeout(9F) manual page since all calls are
    424      0      stevel  *	strictly serialized.
    425      0      stevel  *
    426      0      stevel  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    427      0      stevel  *	stored in tcp_timer_tid and starts a new one using
    428      0      stevel  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    429      0      stevel  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    430      0      stevel  *	field.
    431      0      stevel  *
    432      0      stevel  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    433      0      stevel  *	call-back may still be called, so it is possible tcp_timer() will be
    434      0      stevel  *	called several times. This should not be a problem since tcp_timer()
    435      0      stevel  *	should always check the tcp instance state.
    436      0      stevel  *
    437      0      stevel  *
    438      0      stevel  * IMPLEMENTATION:
    439      0      stevel  *
    440      0      stevel  * TCP timers are implemented using a three-stage process. The call to
    441      0      stevel  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    442      0      stevel  * when the timer expires. The tcp_timer_callback() arranges the call of the
    443      0      stevel  * tcp_timer_handler() function via squeue corresponding to the tcp
    444      0      stevel  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    445      0      stevel  * and passes tcp instance as an argument to it. Information is passed between
    446      0      stevel  * stages using the tcp_timer_t structure which contains the connp pointer, the
    447      0      stevel  * tcp call-back to call and the timeout id returned by the timeout(9F).
    448      0      stevel  *
    449      0      stevel  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    450      0      stevel  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    451      0      stevel  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    452      0      stevel  * returns the pointer to this mblk.
    453      0      stevel  *
    454      0      stevel  * The pseudo mblk is allocated from a special tcp_timercache kmem cache. It
    455      0      stevel  * looks like a normal mblk without actual dblk attached to it.
    456      0      stevel  *
    457      0      stevel  * To optimize performance each tcp instance holds a small cache of timer
    458      0      stevel  * mblocks. In the current implementation it caches up to two timer mblocks per
    459      0      stevel  * tcp instance. The cache is preserved over tcp frees and is only freed when
    460      0      stevel  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    461      0      stevel  * timer processing happens on a corresponding squeue, the cache manipulation
    462      0      stevel  * does not require any locks. Experiments show that majority of timer mblocks
    463      0      stevel  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    464      0      stevel  *
    465      0      stevel  * The tcp_timeout() places a refhold on the connp instance which guarantees
    466      0      stevel  * that it will be present at the time the call-back function fires. The
    467      0      stevel  * tcp_timer_handler() drops the reference after calling the call-back, so the
    468      0      stevel  * call-back function does not need to manipulate the references explicitly.
    469      0      stevel  */
    470      0      stevel 
    471      0      stevel typedef struct tcp_timer_s {	/* timer state carried in the pseudo mblk */
    472      0      stevel 	conn_t	*connp;	/* conn the timer belongs to; passed to the call-back */
    473      0      stevel 	void 	(*tcpt_proc)(void *);	/* call-back requested via tcp_timeout() */
    474   8048    Madhavan 	callout_id_t   tcpt_tid;	/* timeout id from the underlying timeout call */
    475      0      stevel } tcp_timer_t;
    476      0      stevel 
    477      0      stevel static kmem_cache_t *tcp_timercache;	/* presumably the kmem cache for timer pseudo mblks -- TODO confirm */
    478      0      stevel kmem_cache_t	*tcp_sack_info_cache;	/* presumably the kmem cache for SACK info -- TODO confirm */
    479      0      stevel 
    480      0      stevel /*
    481      0      stevel  * For scalability, we must not run a timer for every TCP connection
    482      0      stevel  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    483      0      stevel  * 4 minutes):
    484      0      stevel  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    485      0      stevel  *
    486      0      stevel  * This list is ordered by time, so you need only delete from the head
    487      0      stevel  * until you get to entries which aren't old enough to delete yet.
    488      0      stevel  * The list consists of only the detached TIME_WAIT connections.
    489      0      stevel  *
    490      0      stevel  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    491      0      stevel  * becomes detached TIME_WAIT (either by changing the state and already
    492      0      stevel  * being detached or the other way around). This means that the TIME_WAIT
    493      0      stevel  * state can be extended (up to doubled) if the connection doesn't become
    494      0      stevel  * detached for a long time.
    495      0      stevel  *
    496      0      stevel  * The list manipulations (including tcp_time_wait_next/prev)
    497      0      stevel  * are protected by the tcp_time_wait_lock. The content of the
    498      0      stevel  * detached TIME_WAIT connections is protected by the normal perimeters.
    499   3448    dh155122  *
    500   3448    dh155122  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    501   3448    dh155122  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    502   3448    dh155122  * and conn_netstack.
    503   3448    dh155122  * The tcp_t's that are added to tcp_free_list are disassociated and
    504   3448    dh155122  * have NULL tcp_tcps and conn_netstack pointers.
    505   3448    dh155122  */
    506      0      stevel typedef struct tcp_squeue_priv_s {
    507      0      stevel 	kmutex_t	tcp_time_wait_lock;
    508   8048    Madhavan 	callout_id_t	tcp_time_wait_tid;
    509      0      stevel 	tcp_t		*tcp_time_wait_head;
    510      0      stevel 	tcp_t		*tcp_time_wait_tail;
    511      0      stevel 	tcp_t		*tcp_free_list;
    512   1023    ethindra 	uint_t		tcp_free_list_cnt;
    513      0      stevel } tcp_squeue_priv_t;
    514      0      stevel 
    515      0      stevel /*
    516      0      stevel  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    517      0      stevel  * Running it every 5 seconds seems to give the best results.
    518      0      stevel  */
    519      0      stevel #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
    520      0      stevel 
    521   1023    ethindra /*
    522   1023    ethindra  * To prevent memory hog, limit the number of entries in tcp_free_list
    523   1023    ethindra  * to 1% of available memory / number of cpus
    524   1023    ethindra  */
    525   1023    ethindra uint_t tcp_free_list_max_cnt = 0;
    526      0      stevel 
    527      0      stevel #define	TCP_XMIT_LOWATER	4096
    528      0      stevel #define	TCP_XMIT_HIWATER	49152
    529      0      stevel #define	TCP_RECV_LOWATER	2048
    530  11055    Kacheong #define	TCP_RECV_HIWATER	128000
    531      0      stevel 
    532      0      stevel /*
    533      0      stevel  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    534      0      stevel  */
    535      0      stevel #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    536      0      stevel 
    537      0      stevel #define	TIDUSZ	4096	/* transport interface data unit size */
    538      0      stevel 
    539      0      stevel /*
    540      0      stevel  * Bind hash list size and has function.  It has to be a power of 2 for
    541      0      stevel  * hashing.
    542      0      stevel  */
    543      0      stevel #define	TCP_BIND_FANOUT_SIZE	512
    544      0      stevel #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    545      0      stevel /*
    546      0      stevel  * Size of listen and acceptor hash list.  It has to be a power of 2 for
    547      0      stevel  * hashing.
    548      0      stevel  */
    549      0      stevel #define	TCP_FANOUT_SIZE		256
    550      0      stevel 
    551      0      stevel #ifdef	_ILP32
    552      0      stevel #define	TCP_ACCEPTOR_HASH(accid)					\
    553      0      stevel 		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
    554      0      stevel #else
    555      0      stevel #define	TCP_ACCEPTOR_HASH(accid)					\
    556      0      stevel 		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
    557      0      stevel #endif	/* _ILP32 */
    558      0      stevel 
    559      0      stevel #define	IP_ADDR_CACHE_SIZE	2048
    560      0      stevel #define	IP_ADDR_CACHE_HASH(faddr)					\
    561      0      stevel 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
    562      0      stevel 
    563      0      stevel /*
    564      0      stevel  * TCP options struct returned from tcp_parse_options.
    565      0      stevel  */
    566      0      stevel typedef struct tcp_opt_s {
    567      0      stevel 	uint32_t	tcp_opt_mss;
    568      0      stevel 	uint32_t	tcp_opt_wscale;
    569      0      stevel 	uint32_t	tcp_opt_ts_val;
    570      0      stevel 	uint32_t	tcp_opt_ts_ecr;
    571      0      stevel 	tcp_t		*tcp;
    572      0      stevel } tcp_opt_t;
    573      0      stevel 
    574      0      stevel /*
    575      0      stevel  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    576      0      stevel  */
    577      0      stevel 
    578      0      stevel #ifdef _BIG_ENDIAN
    579      0      stevel #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    580      0      stevel 	(TCPOPT_TSTAMP << 8) | 10)
    581      0      stevel #else
    582      0      stevel #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    583      0      stevel 	(TCPOPT_NOP << 8) | TCPOPT_NOP)
    584      0      stevel #endif
    585      0      stevel 
    586      0      stevel /*
    587      0      stevel  * Flags returned from tcp_parse_options.
    588      0      stevel  */
    589      0      stevel #define	TCP_OPT_MSS_PRESENT	1
    590      0      stevel #define	TCP_OPT_WSCALE_PRESENT	2
    591      0      stevel #define	TCP_OPT_TSTAMP_PRESENT	4
    592      0      stevel #define	TCP_OPT_SACK_OK_PRESENT	8
    593      0      stevel #define	TCP_OPT_SACK_PRESENT	16
    594      0      stevel 
    595      0      stevel /* TCP option length */
    596      0      stevel #define	TCPOPT_NOP_LEN		1
    597      0      stevel #define	TCPOPT_MAXSEG_LEN	4
    598      0      stevel #define	TCPOPT_WS_LEN		3
    599      0      stevel #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
    600      0      stevel #define	TCPOPT_TSTAMP_LEN	10
    601      0      stevel #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
    602      0      stevel #define	TCPOPT_SACK_OK_LEN	2
    603      0      stevel #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
    604      0      stevel #define	TCPOPT_REAL_SACK_LEN	4
    605      0      stevel #define	TCPOPT_MAX_SACK_LEN	36
    606      0      stevel #define	TCPOPT_HEADER_LEN	2
    607      0      stevel 
    608      0      stevel /* TCP cwnd burst factor. */
    609      0      stevel #define	TCP_CWND_INFINITE	65535
    610      0      stevel #define	TCP_CWND_SS		3
    611      0      stevel #define	TCP_CWND_NORMAL		5
    612      0      stevel 
    613      0      stevel /* Maximum TCP initial cwin (start/restart). */
    614      0      stevel #define	TCP_MAX_INIT_CWND	8
    615      0      stevel 
    616      0      stevel /*
    617      0      stevel  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    618      0      stevel  * either tcp_slow_start_initial or tcp_slow_start_after idle
    619      0      stevel  * depending on the caller.  If the upper layer has not used the
    620      0      stevel  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    621      0      stevel  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    622      0      stevel  * If the upper layer has changed set the tcp_init_cwnd, just use
    623      0      stevel  * it to calculate the tcp_cwnd.
    624      0      stevel  */
    625      0      stevel #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    626      0      stevel {									\
    627      0      stevel 	if ((tcp)->tcp_init_cwnd == 0) {				\
    628      0      stevel 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    629      0      stevel 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    630      0      stevel 	} else {							\
    631      0      stevel 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    632      0      stevel 	}								\
    633      0      stevel 	tcp->tcp_cwnd_cnt = 0;						\
    634      0      stevel }
    635      0      stevel 
    636      0      stevel /* TCP Timer control structure */
    637      0      stevel typedef struct tcpt_s {
    638      0      stevel 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    639      0      stevel 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    640      0      stevel } tcpt_t;
    641      0      stevel 
    642      0      stevel /*
    643      0      stevel  * Functions called directly via squeue having a prototype of edesc_t.
    644      0      stevel  */
    645  11042        Erik void		tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
    646  11042        Erik     ip_recv_attr_t *ira);
    647  11042        Erik static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
    648  11042        Erik     ip_recv_attr_t *dummy);
    649  11042        Erik void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
    650  11042        Erik     ip_recv_attr_t *dummy);
    651  11042        Erik static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
    652  11042        Erik     ip_recv_attr_t *dummy);
    653  11042        Erik static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
    654  11042        Erik     ip_recv_attr_t *dummy);
    655  11042        Erik void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
    656  11042        Erik     ip_recv_attr_t *ira);
    657  11042        Erik static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2,
    658  11042        Erik     ip_recv_attr_t *dummy);
    659  11042        Erik void		tcp_output(void *arg, mblk_t *mp, void *arg2,
    660  11042        Erik     ip_recv_attr_t *dummy);
    661  11042        Erik void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
    662  11042        Erik     ip_recv_attr_t *dummy);
    663  11042        Erik static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
    664  11042        Erik     ip_recv_attr_t *dummy);
    665  11042        Erik static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
    666  11042        Erik     ip_recv_attr_t *dummy);
    667  11042        Erik static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
    668  11042        Erik     ip_recv_attr_t *dummy);
    669      0      stevel 
    670      0      stevel 
    671      0      stevel /* Prototype for TCP functions */
    672      0      stevel static void	tcp_random_init(void);
    673      0      stevel int		tcp_random(void);
    674   8348        Eric static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
    675  11042        Erik static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    676      0      stevel 		    tcp_t *eager);
    677  11042        Erik static int	tcp_set_destination(tcp_t *tcp);
    678      0      stevel static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    679    646    gt145670     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    680    646    gt145670     boolean_t user_specified);
    681      0      stevel static void	tcp_closei_local(tcp_t *tcp);
    682      0      stevel static void	tcp_close_detached(tcp_t *tcp);
    683  11042        Erik static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
    684  11042        Erik 		    mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
    685   8348        Eric static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
    686   8348        Eric static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
    687  11042        Erik 		    in_port_t dstport, uint_t srcid);
    688  11042        Erik static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
    689  11042        Erik 		    in_port_t dstport, uint32_t flowinfo,
    690  11042        Erik 		    uint_t srcid, uint32_t scope_id);
    691      0      stevel static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    692      0      stevel static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    693      0      stevel static char	*tcp_display(tcp_t *tcp, char *, char);
    694      0      stevel static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    695      0      stevel static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    696      0      stevel static void	tcp_eager_unlink(tcp_t *tcp);
    697      0      stevel static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    698      0      stevel 		    int unixerr);
    699      0      stevel static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    700      0      stevel 		    int tlierr, int unixerr);
    701      0      stevel static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    702      0      stevel 		    cred_t *cr);
    703      0      stevel static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    704      0      stevel 		    char *value, caddr_t cp, cred_t *cr);
    705      0      stevel static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    706      0      stevel 		    char *value, caddr_t cp, cred_t *cr);
    707      0      stevel static int	tcp_tpistate(tcp_t *tcp);
    708      0      stevel static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    709      0      stevel     int caller_holds_lock);
    710      0      stevel static void	tcp_bind_hash_remove(tcp_t *tcp);
    711   3448    dh155122 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    712      0      stevel void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    713      0      stevel static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    714      0      stevel static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    715      0      stevel static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    716      0      stevel static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    717  11042        Erik static void	tcp_init_values(tcp_t *tcp);
    718      0      stevel static void	tcp_ip_notify(tcp_t *tcp);
    719      0      stevel static void	tcp_iss_init(tcp_t *tcp);
    720      0      stevel static void	tcp_keepalive_killer(void *arg);
    721  11042        Erik static int	tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
    722  11042        Erik static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
    723      0      stevel static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    724      0      stevel 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    725      0      stevel static boolean_t tcp_allow_connopt_set(int level, int name);
    726      0      stevel int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    727      0      stevel static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    728   3448    dh155122 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    729   3448    dh155122     tcp_stack_t *);
    730      0      stevel static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    731      0      stevel 		    caddr_t cp, cred_t *cr);
    732      0      stevel static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    733      0      stevel 		    caddr_t cp, cred_t *cr);
    734   3448    dh155122 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    735      0      stevel static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    736      0      stevel 		    caddr_t cp, cred_t *cr);
    737      0      stevel static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    738   9864        Phil static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
    739      0      stevel static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    740      0      stevel static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    741      0      stevel static void	tcp_reinit(tcp_t *tcp);
    742      0      stevel static void	tcp_reinit_values(tcp_t *tcp);
    743      0      stevel 
    744   8348        Eric static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
    745   8348        Eric static uint_t	tcp_rcv_drain(tcp_t *tcp);
    746      0      stevel static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    747   3448    dh155122 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    748      0      stevel static void	tcp_ss_rexmit(tcp_t *tcp);
    749  11042        Erik static mblk_t	*tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
    750  11042        Erik     ip_recv_attr_t *);
    751  11042        Erik static void	tcp_process_options(tcp_t *, tcpha_t *);
    752      0      stevel static void	tcp_rsrv(queue_t *q);
    753      0      stevel static int	tcp_snmp_state(tcp_t *tcp);
    754      0      stevel static void	tcp_timer(void *arg);
    755      0      stevel static void	tcp_timer_callback(void *);
    756   1676         jpk static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    757   1676         jpk     boolean_t random);
    758   1676         jpk static in_port_t tcp_get_next_priv_port(const tcp_t *);
    759      0      stevel static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    760   8348        Eric static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
    761   8348        Eric void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
    762      0      stevel static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    763      0      stevel static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    764      0      stevel static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    765  11042        Erik static int	tcp_send(tcp_t *tcp, const int mss,
    766  11042        Erik 		    const int total_hdr_len, const int tcp_hdr_len,
    767      0      stevel 		    const int num_sack_blk, int *usable, uint_t *snxt,
    768  11042        Erik 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
    769      0      stevel static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    770      0      stevel 		    int num_sack_blk);
    771      0      stevel static void	tcp_wsrv(queue_t *q);
    772      0      stevel static int	tcp_xmit_end(tcp_t *tcp);
    773      0      stevel static void	tcp_ack_timer(void *arg);
    774      0      stevel static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    775      0      stevel static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    776  11042        Erik 		    uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
    777  11042        Erik 		    ip_stack_t *, conn_t *);
    778      0      stevel static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    779      0      stevel 		    uint32_t ack, int ctl);
    780      0      stevel static void	tcp_set_rto(tcp_t *, time_t);
    781  11042        Erik static void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
    782  11042        Erik static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
    783  11042        Erik static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    784  11042        Erik     ip_recv_attr_t *);
    785   8348        Eric static int	tcp_build_hdrs(tcp_t *);
    786      0      stevel static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    787  11042        Erik     uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
    788  11042        Erik     ip_recv_attr_t *ira);
    789  11042        Erik boolean_t	tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
    790  11042        Erik static boolean_t tcp_zcopy_check(tcp_t *);
    791  11042        Erik static void	tcp_zcopy_notify(tcp_t *);
    792  11042        Erik static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
    793  11042        Erik static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
    794  11042        Erik static void	tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
    795  11042        Erik static void	tcp_update_zcopy(tcp_t *tcp);
    796  11042        Erik static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    797  11042        Erik     ixa_notify_arg_t);
    798  11042        Erik static void	tcp_rexmit_after_error(tcp_t *tcp);
    799  11042        Erik static void	tcp_send_data(tcp_t *, mblk_t *);
    800      0      stevel extern mblk_t	*tcp_timermp_alloc(int);
    801      0      stevel extern void	tcp_timermp_free(tcp_t *);
    802      0      stevel static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
    803      0      stevel static void	tcp_stop_lingering(tcp_t *tcp);
    804      0      stevel static void	tcp_close_linger_timeout(void *arg);
    805   3448    dh155122 static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
    806   3448    dh155122 static void	tcp_stack_fini(netstackid_t stackid, void *arg);
    807   3448    dh155122 static void	*tcp_g_kstat_init(tcp_g_stat_t *);
    808   3448    dh155122 static void	tcp_g_kstat_fini(kstat_t *);
    809   3448    dh155122 static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
    810   3448    dh155122 static void	tcp_kstat_fini(netstackid_t, kstat_t *);
    811   3448    dh155122 static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
    812   3448    dh155122 static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
    813      0      stevel static int	tcp_kstat_update(kstat_t *kp, int rw);
    814  11042        Erik static mblk_t	*tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    815  11042        Erik     ip_recv_attr_t *ira);
    816  11042        Erik static mblk_t	*tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    817  11042        Erik     ip_recv_attr_t *ira);
    818   8275        Eric static int	tcp_squeue_switch(int);
    819      0      stevel 
    820   5240    nordmark static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
    821   5240    nordmark static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
    822   5240    nordmark static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
    823   8348        Eric static int	tcp_tpi_close(queue_t *, int);
    824   9395         Rao static int	tcp_tpi_close_accept(queue_t *);
    825      0      stevel 
    826      0      stevel static void	tcp_squeue_add(squeue_t *);
    827  11042        Erik static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
    828  11042        Erik 
    829  11042        Erik extern void	tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
    830  11042        Erik 
    831  11042        Erik void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
    832  11042        Erik void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
    833  11042        Erik     ip_recv_attr_t *dummy);
    834   8348        Eric 
    835   8348        Eric static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
    836   8348        Eric 	    sock_upper_handle_t, cred_t *);
    837   8348        Eric static int tcp_listen(sock_lower_handle_t, int, cred_t *);
    838   9395         Rao static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    839   9395         Rao     boolean_t);
    840   8348        Eric static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    841   8348        Eric     cred_t *, pid_t);
    842   8348        Eric static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    843   8348        Eric     boolean_t);
    844   8348        Eric static int tcp_do_unbind(conn_t *);
    845   8348        Eric static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    846   8348        Eric     boolean_t);
    847   3104    jprakash 
    848   8682      Anders static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
    849   8682      Anders 
    850      0      stevel /*
    851      0      stevel  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
    852      0      stevel  *
    853      0      stevel  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
    854      0      stevel  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
    855      0      stevel  * (defined in tcp.h) needs to be filled in and passed into the kernel
    856      0      stevel  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
    857      0      stevel  * structure contains the four-tuple of a TCP connection and a range of TCP
    858      0      stevel  * states (specified by ac_start and ac_end). The use of wildcard addresses
    859      0      stevel  * and ports is allowed. Connections with a matching four tuple and a state
    860      0      stevel  * within the specified range will be aborted. The valid states for the
    861      0      stevel  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
    862      0      stevel  * inclusive.
    863      0      stevel  *
    864      0      stevel  * An application which has its connection aborted by this ioctl will receive
    865      0      stevel  * an error that is dependent on the connection state at the time of the abort.
    866      0      stevel  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
    867      0      stevel  * though a RST packet has been received.  If the connection state is equal to
    868      0      stevel  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
    869      0      stevel  * and all resources associated with the connection will be freed.
    870      0      stevel  */
    871      0      stevel static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
    872      0      stevel static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
    873  11042        Erik static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
    874  11042        Erik     ip_recv_attr_t *dummy);
    875   3448    dh155122 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
    876      0      stevel static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
    877      0      stevel static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    878   3448    dh155122     boolean_t, tcp_stack_t *);
    879      0      stevel 
    880      0      stevel static struct module_info tcp_rinfo =  {
    881    741    masputra 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
    882      0      stevel };
    883      0      stevel 
    884      0      stevel static struct module_info tcp_winfo =  {
    885    741    masputra 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
    886      0      stevel };
    887      0      stevel 
    888      0      stevel /*
    889      0      stevel  * Entry points for TCP as a device. The normal case which supports
    890      0      stevel  * the TCP functionality.
    891   5240    nordmark  * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
    892   5240    nordmark  */
    893   5240    nordmark struct qinit tcp_rinitv4 = {
    894   8348        Eric 	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
    895   5240    nordmark };
    896   5240    nordmark 
    897   5240    nordmark struct qinit tcp_rinitv6 = {
    898   8348        Eric 	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
    899      0      stevel };
    900      0      stevel 
    901      0      stevel struct qinit tcp_winit = {
    902      0      stevel 	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    903      0      stevel };
    904      0      stevel 
    905      0      stevel /* Initial entry point for TCP in socket mode. */
    906      0      stevel struct qinit tcp_sock_winit = {
    907      0      stevel 	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    908      0      stevel };
    909      0      stevel 
    910   8348        Eric /* TCP entry point during fallback */
    911   8348        Eric struct qinit tcp_fallback_sock_winit = {
    912   8348        Eric 	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
    913   8348        Eric };
    914   8348        Eric 
    915      0      stevel /*
    916      0      stevel  * Entry points for TCP as a acceptor STREAM opened by sockfs when doing
    917      0      stevel  * an accept. Avoid allocating data structures since eager has already
    918      0      stevel  * been created.
    919      0      stevel  */
    920      0      stevel struct qinit tcp_acceptor_rinit = {
    921   9395         Rao 	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
    922      0      stevel };
    923      0      stevel 
    924      0      stevel struct qinit tcp_acceptor_winit = {
    925   8348        Eric 	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
    926      0      stevel };
    927      0      stevel 
    928   5240    nordmark /* For AF_INET aka /dev/tcp */
    929   5240    nordmark struct streamtab tcpinfov4 = {
    930   5240    nordmark 	&tcp_rinitv4, &tcp_winit
    931   5240    nordmark };
    932   5240    nordmark 
    933   5240    nordmark /* For AF_INET6 aka /dev/tcp6 */
    934   5240    nordmark struct streamtab tcpinfov6 = {
    935   5240    nordmark 	&tcp_rinitv6, &tcp_winit
    936      0      stevel };
    937   8348        Eric 
    938   8348        Eric sock_downcalls_t sock_tcp_downcalls;
    939      0      stevel 
    940   3448    dh155122 /* Setable only in /etc/system. Move to ndd? */
    941      0      stevel boolean_t tcp_icmp_source_quench = B_FALSE;
    942   3448    dh155122 
    943      0      stevel /*
    944      0      stevel  * Following assumes TPI alignment requirements stay along 32 bit
    945      0      stevel  * boundaries
    946      0      stevel  */
    947      0      stevel #define	ROUNDUP32(x) \
    948      0      stevel 	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
    949      0      stevel 
    950      0      stevel /* Template for response to info request. */
    951      0      stevel static struct T_info_ack tcp_g_t_info_ack = {
    952      0      stevel 	T_INFO_ACK,		/* PRIM_type */
    953      0      stevel 	0,			/* TSDU_size */
    954      0      stevel 	T_INFINITE,		/* ETSDU_size */
    955      0      stevel 	T_INVALID,		/* CDATA_size */
    956      0      stevel 	T_INVALID,		/* DDATA_size */
    957      0      stevel 	sizeof (sin_t),		/* ADDR_size */
    958      0      stevel 	0,			/* OPT_size - not initialized here */
    959      0      stevel 	TIDUSZ,			/* TIDU_size */
    960      0      stevel 	T_COTS_ORD,		/* SERV_type */
    961      0      stevel 	TCPS_IDLE,		/* CURRENT_state */
    962      0      stevel 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
    963      0      stevel };
    964      0      stevel 
     965      0      stevel static struct T_info_ack tcp_g_t_info_ack_v6 = {
                            	/* Same as tcp_g_t_info_ack except ADDR_size is sizeof (sin6_t). */
     966      0      stevel 	T_INFO_ACK,		/* PRIM_type */
     967      0      stevel 	0,			/* TSDU_size */
     968      0      stevel 	T_INFINITE,		/* ETSDU_size */
     969      0      stevel 	T_INVALID,		/* CDATA_size */
     970      0      stevel 	T_INVALID,		/* DDATA_size */
     971      0      stevel 	sizeof (sin6_t),	/* ADDR_size */
     972      0      stevel 	0,			/* OPT_size - not initialized here */
     973      0      stevel 	TIDUSZ,		/* TIDU_size */
     974      0      stevel 	T_COTS_ORD,		/* SERV_type */
     975      0      stevel 	TCPS_IDLE,		/* CURRENT_state */
     976      0      stevel 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
     977      0      stevel };
    978      0      stevel 
     979      0      stevel #define	MS	1L	/* time values in the parameter table are in milliseconds */
     980      0      stevel #define	SECONDS	(1000 * MS)
     981      0      stevel #define	MINUTES	(60 * SECONDS)
     982      0      stevel #define	HOURS	(60 * MINUTES)
     983      0      stevel #define	DAYS	(24 * HOURS)
     984      0      stevel 
     985      0      stevel #define	PARAM_MAX (~(uint32_t)0)	/* all-ones uint32_t */
    986      0      stevel 
     987      0      stevel /* Max size IP datagram is 64k - 1; the MSS limit is that less the IP and TCP headers */
     988  11042        Erik #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
     989  11042        Erik #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
     990      0      stevel /* Max of the above (assumes sizeof (ipha_t) <= sizeof (ip6_t)) */
     991      0      stevel #define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
    992      0      stevel 
     993      0      stevel /* Largest TCP port number (65535) */
     994      0      stevel #define	TCP_MAX_PORT	(64 * 1024 - 1)
    995      0      stevel 
     996      0      stevel /*
     997      0      stevel  * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
     998      0      stevel  * layer header.  It has to be a multiple of 4.
                             * The initializer is { min, max, value, name }: range 0 to 256, default 32.
                             * tcps_wroff_xtra is shorthand for the parameter's tcp_param_val.
     999      0      stevel  */
    1000   3448    dh155122 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
    1001   3448    dh155122 #define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
   1002      0      stevel 
    1003      0      stevel /*
    1004      0      stevel  * All of these are alterable, within the min/max values given, at run time.
    1005      0      stevel  * Note that the default value of "tcp_time_wait_interval" below is one
    1006      0      stevel  * minute, not the four minutes (2 * MSL) given in the TCP spec.
                             * Time-valued entries are built from the MS/SECONDS/MINUTES/HOURS/DAYS
                             * macros, i.e. they are in milliseconds.
    1007      0      stevel  */
    1008      0      stevel /* BEGIN CSTYLED */
    1009   3448    dh155122 static tcpparam_t	lcl_tcp_param_arr[] = {
    1010      0      stevel  /*min		max		value		name */
    1011      0      stevel  { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
    1012      0      stevel  { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
    1013      0      stevel  { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
    1014      0      stevel  { 1,		1024,		1,		"tcp_conn_req_min" },
    1015      0      stevel  { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
    1016      0      stevel  { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
    1017      0      stevel  { 0,		10,		0,		"tcp_debug" },
    1018      0      stevel  { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
    1019      0      stevel  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
    1020      0      stevel  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
    1021      0      stevel  { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
    1022      0      stevel  { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
    1023      0      stevel  { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
    1024      0      stevel  { 1,		255,		64,		"tcp_ipv4_ttl"},
    1025      0      stevel  { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
    1026      0      stevel  { 0,		100,		10,		"tcp_maxpsz_multiplier" },
    1027      0      stevel  { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
    1028      0      stevel  { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
    1029      0      stevel  { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
    1030      0      stevel  { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
    1031      0      stevel  { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
    1032      0      stevel  { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
    1033      0      stevel  { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
    1034      0      stevel  { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
    1035      0      stevel  { 0,		16,		0,		"tcp_snd_lowat_fraction" },
    1036      0      stevel  { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
    1037      0      stevel  { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
    1038      0      stevel  { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
    1039      0      stevel  { 0,		1,		0,		"tcp_ignore_path_mtu" },
    1040      0      stevel  { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
    1041      0      stevel  { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
    1042      0      stevel  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
    1043      0      stevel  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
    1044      0      stevel  { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
    1045      0      stevel  { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
    1046      0      stevel  { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
    1047      0      stevel  { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
    1048      0      stevel /*
    1049      0      stevel  * Question:  What default value should I set for tcp_strong_iss?
    1050      0      stevel  */
    1051      0      stevel  { 0,		2,		1,		"tcp_strong_iss"},
    1052      0      stevel  { 0,		65536,		20,		"tcp_rtt_updates"},
    1053      0      stevel  { 0,		1,		1,		"tcp_wscale_always"},
    1054      0      stevel  { 0,		1,		0,		"tcp_tstamp_always"},
    1055      0      stevel  { 0,		1,		1,		"tcp_tstamp_if_wscale"},
    1056      0      stevel  { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
    1057      0      stevel  { 0,		16,		2,		"tcp_deferred_acks_max"},
    1058      0      stevel  { 1,		16384,		4,		"tcp_slow_start_after_idle"},
    1059      0      stevel  { 1,		4,		4,		"tcp_slow_start_initial"},
    1060      0      stevel  { 0,		2,		2,		"tcp_sack_permitted"},
    1061      0      stevel  { 0,		1,		1,		"tcp_compression_enabled"},
    1062      0      stevel  { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
    1063      0      stevel  { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
    1064      0      stevel  { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
    1065      0      stevel  { 0,		1,		0,		"tcp_rev_src_routes"},
    1066      0      stevel  { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
    1067      0      stevel  { 0,		16,		8,		"tcp_local_dacks_max"},
    1068      0      stevel  { 0,		2,		1,		"tcp_ecn_permitted"},
    1069      0      stevel  { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
    1070      0      stevel  { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
    1071      0      stevel  { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
    1072      0      stevel  { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
    1073      0      stevel  { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
    1074  11042        Erik  { 0,		1,		0,		"tcp_dev_flow_ctl"},
    1075      0      stevel };
    1076      0      stevel /* END CSTYLED */
   1077      0      stevel 
    1078      0      stevel /* Round up the value to the nearest multiple of mss; mss is evaluated twice. */
    1079      0      stevel #define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
   1080      0      stevel 
    1081      0      stevel /*
    1082      0      stevel  * Set ECN capable transport (ECT) code point in IP header.
    1083      0      stevel  *
    1084      0      stevel  * Note that there are 2 ECT code points '01' and '10', which are called
    1085      0      stevel  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
    1086      0      stevel  * point ECT(0) for TCP as described in RFC 2481.
                             *
                             * iph is cast to ipha_t * or ip6_t * depending on the connection's
                             * conn_ipversion.  For IPv4 the low two bits of ipha_type_of_service are
                             * cleared and replaced; for IPv6 bits 20-21 of ip6_vcf (in host order
                             * before htonl()) are cleared and replaced.
                             *
                             * NOTE(review): the macro expands to a bare if/else statement and
                             * evaluates iph more than once, so it should not be used as the unbraced
                             * body of another if, nor with arguments that have side effects.
    1087      0      stevel  */
    1088      0      stevel #define	SET_ECT(tcp, iph) \
    1089  11042        Erik 	if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
    1090      0      stevel 		/* We need to clear the code point first. */ \
    1091      0      stevel 		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
    1092      0      stevel 		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
    1093      0      stevel 	} else { \
    1094      0      stevel 		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
    1095      0      stevel 		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
    1096      0      stevel 	}
   1097      0      stevel 
    1098      0      stevel /*
    1099      0      stevel  * The format argument to pass to tcp_display().
    1100      0      stevel  * DISP_PORT_ONLY means that the returned string has only port info.
    1101      0      stevel  * DISP_ADDR_AND_PORT means that the returned string also contains the
    1102      0      stevel  * remote and local IP addresses.
    1103      0      stevel  */
    1104      0      stevel #define	DISP_PORT_ONLY		1
    1105      0      stevel #define	DISP_ADDR_AND_PORT	2
    1106      0      stevel 
                            /* True when STRUIO_ZC is set in the db_struioflag of mp's data block. */
    1107      0      stevel #define	IS_VMLOANED_MBLK(mp) \
    1108      0      stevel 	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
   1109      0      stevel 
    1110      0      stevel uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable (default), 2: force */
   1111      0      stevel 
    1112      0      stevel /*
    1113   3448    dh155122  * When set, forces all connections to obey the value of the
    1114      0      stevel  * tcps_maxpsz_multiplier tunable settable via NDD.  Otherwise (B_FALSE, the
    1115  11042        Erik  * default), the per-connection behavior is determined dynamically during
                             * tcp_set_destination().
    1116      0      stevel  */
    1117      0      stevel boolean_t tcp_static_maxpsz = B_FALSE;
   1118      0      stevel 
    1119   3448    dh155122 /* Settable in /etc/system */
    1120      0      stevel /* If set to 0, pick ephemeral port sequentially; otherwise (default) randomly. */
    1121      0      stevel uint32_t tcp_random_anon_port = 1;
   1122      0      stevel 
    1123      0      stevel /*
    1124   3104    jprakash  * To reach an eager in Q0 which can be dropped due to an incoming
    1125   3104    jprakash  * new SYN request when Q0 is full, a separate doubly linked list is
    1126   3104    jprakash  * used. This list allows an eager to be selected from Q0 in O(1) time.
    1127   3104    jprakash  * This is needed to avoid spending too much time walking through the
    1128   3104    jprakash  * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
    1129   3104    jprakash  * this list has to be a member of Q0.
    1130   3104    jprakash  * This list is headed by listener's tcp_t. When the list is empty,
    1131   3104    jprakash  * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
    1132   3104    jprakash  * of listener's tcp_t point to listener's tcp_t itself.
    1133   3104    jprakash  *
    1134   3104    jprakash  * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
    1135   3104    jprakash  * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
    1136   3104    jprakash  * These macros do not affect the eager's membership to Q0.
                             *
                             * An eager is treated as being on the list when its
                             * tcp_eager_next_drop_q0 is non-NULL: MAKE_DROPPABLE() does nothing for
                             * such an eager and otherwise links it in directly after the listener;
                             * MAKE_UNDROPPABLE() does nothing for an eager that is off the list and
                             * otherwise unlinks it and NULLs both of its link pointers.
                             *
                             * NOTE(review): both macros expand to a bare if statement and evaluate
                             * their arguments several times, so they should not be used as the
                             * unbraced body of an if that has an else, nor with arguments that have
                             * side effects.
    1137   3104    jprakash  */
    1138   3104    jprakash 
    1139   3104    jprakash 
    1140   3104    jprakash #define	MAKE_DROPPABLE(listener, eager)					\
    1141   3104    jprakash 	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
    1142   3104    jprakash 		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
    1143   3104    jprakash 		    = (eager);						\
    1144   3104    jprakash 		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
    1145   3104    jprakash 		(eager)->tcp_eager_next_drop_q0 =			\
    1146   3104    jprakash 		    (listener)->tcp_eager_next_drop_q0;			\
    1147   3104    jprakash 		(listener)->tcp_eager_next_drop_q0 = (eager);		\
    1148   3104    jprakash 	}
    1149   3104    jprakash 
    1150   3104    jprakash #define	MAKE_UNDROPPABLE(eager)						\
    1151   3104    jprakash 	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
    1152   3104    jprakash 		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
    1153   3104    jprakash 		    = (eager)->tcp_eager_prev_drop_q0;			\
    1154   3104    jprakash 		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
    1155   3104    jprakash 		    = (eager)->tcp_eager_next_drop_q0;			\
    1156   3104    jprakash 		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
    1157   3104    jprakash 		(eager)->tcp_eager_next_drop_q0 = NULL;			\
    1158   3104    jprakash 	}
   1159   3104    jprakash 
    1160   3104    jprakash /*
    1161      0      stevel  * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
    1162      0      stevel  * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
    1163      0      stevel  * data, TCP will not respond with an ACK.  RFC 793 requires that
    1164      0      stevel  * TCP responds with an ACK for such a bogus ACK.  By not following
    1165      0      stevel  * the RFC, we prevent TCP from getting into an ACK storm if somehow
    1166      0      stevel  * an attacker successfully spoofs an acceptable segment to our
    1167      0      stevel  * peer; or when our peer is "confused."
                             * The default threshold is 10.
    1168      0      stevel  */
    1169      0      stevel uint32_t tcp_drop_ack_unsent_cnt = 10;
   1170      0      stevel 
    1171      0      stevel /*
    1172      0      stevel  * Hook functions to enable cluster networking
    1173      0      stevel  * On non-clustered systems these vectors must always be NULL.
                             * All four are initialized to NULL here.  The CL_INET_CONNECT() and
                             * CL_INET_DISCONNECT() macros call cl_inet_connect2 and
                             * cl_inet_disconnect respectively, and only when the vector is non-NULL.
    1174      0      stevel  */
    1175      0      stevel 
    1176   8392     Huafeng void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
    1177   8392     Huafeng 			    sa_family_t addr_family, uint8_t *laddrp,
    1178   8392     Huafeng 			    in_port_t lport, void *args) = NULL;
    1179   8392     Huafeng void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
    1180   8392     Huafeng 			    sa_family_t addr_family, uint8_t *laddrp,
    1181   8392     Huafeng 			    in_port_t lport, void *args) = NULL;
    1182   8392     Huafeng 
    1183   8392     Huafeng int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
    1184   8392     Huafeng 			    boolean_t is_outgoing,
    1185   8392     Huafeng 			    sa_family_t addr_family,
    1186      0      stevel 			    uint8_t *laddrp, in_port_t lport,
    1187   8392     Huafeng 			    uint8_t *faddrp, in_port_t fport,
    1188   8392     Huafeng 			    void *args) = NULL;
    1189   8392     Huafeng void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
    1190   8392     Huafeng 			    sa_family_t addr_family, uint8_t *laddrp,
    1191   8392     Huafeng 			    in_port_t lport, uint8_t *faddrp,
    1192   8392     Huafeng 			    in_port_t fport, void *args) = NULL;
   1193      0      stevel 
   1194   8392     Huafeng 
    1195   8392     Huafeng /*
    1196   8392     Huafeng  * CL_INET_CONNECT(conn_t *connp, boolean_t is_outgoing, int err)
                             *
                             * Statement macro.  err is first set to 0.  If cl_inet_connect2 is
                             * non-NULL and the connection's local address is specified (non-zero
                             * conn_laddr_v4, or conn_laddr_v6 not the unspecified address), the hook
                             * is called with the connection's stack id, IPPROTO_TCP, is_outgoing,
                             * the address family, the local and foreign address/port and a NULL
                             * args pointer, and err is set to its return value.
    1197   8392     Huafeng  */
    1198  11042        Erik #define	CL_INET_CONNECT(connp, is_outgoing, err) {		\
    1199   8392     Huafeng 	(err) = 0;						\
    1200   8392     Huafeng 	if (cl_inet_connect2 != NULL) {				\
    1201      0      stevel 		/*						\
    1202      0      stevel 		 * Running in cluster mode - register active connection	\
    1203      0      stevel 		 * information						\
    1204      0      stevel 		 */							\
    1205  11042        Erik 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
    1206  11042        Erik 			if ((connp)->conn_laddr_v4 != 0) {		\
    1207   8392     Huafeng 				(err) = (*cl_inet_connect2)(		\
    1208   8392     Huafeng 				    (connp)->conn_netstack->netstack_stackid,\
    1209   8392     Huafeng 				    IPPROTO_TCP, is_outgoing, AF_INET,	\
    1210  11042        Erik 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
    1211  11042        Erik 				    (in_port_t)(connp)->conn_lport,	\
    1212  11042        Erik 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
    1213  11042        Erik 				    (in_port_t)(connp)->conn_fport, NULL); \
    1214      0      stevel 			}						\
    1215      0      stevel 		} else {						\
    1216      0      stevel 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
    1217  11042        Erik 			    &(connp)->conn_laddr_v6)) {			\
    1218   8392     Huafeng 				(err) = (*cl_inet_connect2)(		\
    1219   8392     Huafeng 				    (connp)->conn_netstack->netstack_stackid,\
    1220   8392     Huafeng 				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
    1221  11042        Erik 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
    1222  11042        Erik 				    (in_port_t)(connp)->conn_lport,	\
    1223  11042        Erik 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
    1224  11042        Erik 				    (in_port_t)(connp)->conn_fport, NULL); \
    1225      0      stevel 			}						\
    1226      0      stevel 		}							\
    1227      0      stevel 	}								\
    1228      0      stevel }
   1229      0      stevel 
    1230  11042        Erik #define	CL_INET_DISCONNECT(connp)	{				\
    1231      0      stevel 	if (cl_inet_disconnect != NULL) {				\
    1232      0      stevel 		/*							\
    1233      0      stevel 		 * Running in cluster mode - deregister active		\
    1234      0      stevel 		 * connection information.  The hook is not called	\
                            		 * when the local address is unspecified.		\
    1235      0      stevel 		 */							\
    1236  11042        Erik 		if ((connp)->conn_ipversion == IPV4_VERSION) {		\
    1237  11042        Erik 			if ((connp)->conn_laddr_v4 != 0) {		\
    1238   8392     Huafeng 				(*cl_inet_disconnect)(			\
    1239   8392     Huafeng 				    (connp)->conn_netstack->netstack_stackid,\
    1240   8392     Huafeng 				    IPPROTO_TCP, AF_INET,		\
    1241  11042        Erik 				    (uint8_t *)(&((connp)->conn_laddr_v4)),\
    1242  11042        Erik 				    (in_port_t)(connp)->conn_lport,	\
    1243  11042        Erik 				    (uint8_t *)(&((connp)->conn_faddr_v4)),\
    1244  11042        Erik 				    (in_port_t)(connp)->conn_fport, NULL); \
    1245      0      stevel 			}						\
    1246      0      stevel 		} else {						\
    1247      0      stevel 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
    1248  11042        Erik 			    &(connp)->conn_laddr_v6)) {			\
    1249   8392     Huafeng 				(*cl_inet_disconnect)(			\
    1250   8392     Huafeng 				    (connp)->conn_netstack->netstack_stackid,\
    1251   8392     Huafeng 				    IPPROTO_TCP, AF_INET6,		\
    1252  11042        Erik 				    (uint8_t *)(&((connp)->conn_laddr_v6)),\
    1253  11042        Erik 				    (in_port_t)(connp)->conn_lport,	\
    1254  11042        Erik 				    (uint8_t *)(&((connp)->conn_faddr_v6)), \
    1255  11042        Erik 				    (in_port_t)(connp)->conn_fport, NULL); \
    1256      0      stevel 			}						\
    1257      0      stevel 		}							\
    1258      0      stevel 	}								\
    1259      0      stevel }
   1260      0      stevel 
    1261      0      stevel /*
    1262      0      stevel  * Cluster networking hook for traversing current connection list.
    1263      0      stevel  * This routine is used to extract the current list of live connections
    1264      0      stevel  * which must continue to be dispatched to this node.
    1265      0      stevel  */
    1266   8392     Huafeng int cl_tcp_walk_list(netstackid_t stack_id,
    1267   8392     Huafeng     int (*callback)(cl_tcp_info_t *, void *), void *arg);
    1268      0      stevel 
                            /* NOTE(review): presumably the per-tcp_stack_t worker for the above - confirm. */
    1269   3448    dh155122 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
    1270   3448    dh155122     void *arg, tcp_stack_t *tcps);
    1271   6878     brendan 
                            /*
                             * Pass a new receive threshold up to the socket layer.  This only acts on
                             * connections for which IPCL_IS_NONSTR() is true; for any other
                             * connection it does nothing.  The value handed to the
                             * su_set_proto_props upcall (with SOCKOPT_RCVTHRESH set) is
                             * new_rcvthresh capped at SOCKET_RECVHIWATER >> 3.
                             */
    1272  10312         Rao static void
    1273  10312         Rao tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
    1274  10312         Rao {
    1275  10312         Rao 	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
    1276  10312         Rao 
    1277  10312         Rao 	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
    1278  10312         Rao 		conn_t *connp = tcp->tcp_connp;
    1279  10312         Rao 		struct sock_proto_props sopp;
    1280  10312         Rao 
    1281  10312         Rao 		/*
    1282  10312         Rao 		 * only increase rcvthresh up to default_threshold
    1283  10312         Rao 		 */
    1284  10312         Rao 		if (new_rcvthresh > default_threshold)
    1285  10312         Rao 			new_rcvthresh = default_threshold;
    1286  10312         Rao 
    1287  10312         Rao 		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
    1288  10312         Rao 		sopp.sopp_rcvthresh = new_rcvthresh;
    1289  10312         Rao 
    1290  10312         Rao 		(*connp->conn_upcalls->su_set_proto_props)
    1291  10312         Rao 		    (connp->conn_upper_handle, &sopp);
    1292  10312         Rao 	}
    1293  10312         Rao }
    1294      0      stevel /*
    1295      0      stevel  * Figure out the value of window scale option.  Note that the rwnd is
    1296      0      stevel  * ASSUMED to be rounded up to the nearest MSS before the calculation.
    1297      0      stevel  * We cannot find the scale value and then do a round up of tcp_rwnd
    1298      0      stevel  * because the scale value may not be correct after that.
    1299      0      stevel  *
                             * The result, stored in tcp_rcv_ws, is the smallest shift count for
                             * which tcp_rwnd shifted right by that count is no larger than
                             * TCP_MAXWIN, capped at TCP_MAX_WINSHIFT.  tcp_rwnd itself is not
                             * modified.
                             *
    1300      0      stevel  * Set the compiler flag to make this function inline.
    1301      0      stevel  */
    1302      0      stevel static void
    1303      0      stevel tcp_set_ws_value(tcp_t *tcp)
    1304      0      stevel {
    1305      0      stevel 	int i;
    1306      0      stevel 	uint32_t rwnd = tcp->tcp_rwnd;
    1307      0      stevel 
                            	/* Empty loop body: all the work is in the loop header. */
    1308      0      stevel 	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
    1309      0      stevel 	    i++, rwnd >>= 1)
    1310      0      stevel 		;
    1311      0      stevel 	tcp->tcp_rcv_ws = i;
    1312      0      stevel }
   1313      0      stevel 
    1314      0      stevel /*
    1315      0      stevel  * Remove a connection from the list of detached TIME_WAIT connections.
    1316   3104    jprakash  * It returns B_FALSE if it can't remove the connection from the list
    1317   3104    jprakash  * as the connection has already been removed from the list due to an
    1318   3104    jprakash  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
                             *
                             * If tcp_time_wait is NULL, the per-squeue TIME_WAIT state is looked up
                             * from the connection's squeue and tcp_time_wait_lock is taken and
                             * released here.  Otherwise the caller must already hold
                             * tcp_time_wait_lock (this is only ASSERTed) and it is left held.
                             *
                             * A tcp_time_wait_expire of 0 is what marks a connection as not being
                             * on the list; it is reset to 0 on successful removal.
    1319   3104    jprakash  */
    1320   3104    jprakash static boolean_t
    1321      0      stevel tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
    1322      0      stevel {
    1323      0      stevel 	boolean_t	locked = B_FALSE;
    1324      0      stevel 
    1325      0      stevel 	if (tcp_time_wait == NULL) {
    1326      0      stevel 		tcp_time_wait = *((tcp_squeue_priv_t **)
    1327      0      stevel 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
    1328      0      stevel 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
    1329      0      stevel 		locked = B_TRUE;
    1330   3448    dh155122 	} else {
    1331   3448    dh155122 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
    1332      0      stevel 	}
    1333      0      stevel 
                            	/* Not on the list (any more): nothing to unlink. */
    1334      0      stevel 	if (tcp->tcp_time_wait_expire == 0) {
    1335      0      stevel 		ASSERT(tcp->tcp_time_wait_next == NULL);
    1336      0      stevel 		ASSERT(tcp->tcp_time_wait_prev == NULL);
    1337      0      stevel 		if (locked)
    1338      0      stevel 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
    1339   3104    jprakash 		return (B_FALSE);
    1340      0      stevel 	}
    1341      0      stevel 	ASSERT(TCP_IS_DETACHED(tcp));
    1342      0      stevel 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
    1343      0      stevel 
                            	/* Three cases: tcp is the list head, the tail, or in the middle. */
    1344      0      stevel 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
    1345      0      stevel 		ASSERT(tcp->tcp_time_wait_prev == NULL);
    1346      0      stevel 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
    1347      0      stevel 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
    1348      0      stevel 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
    1349      0      stevel 			    NULL;
    1350      0      stevel 		} else {
                            			/* tcp was the only entry; the list is now empty. */
    1351      0      stevel 			tcp_time_wait->tcp_time_wait_tail = NULL;
    1352      0      stevel 		}
    1353      0      stevel 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
    1354      0      stevel 		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
    1355      0      stevel 		ASSERT(tcp->tcp_time_wait_next == NULL);
    1356      0      stevel 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
    1357      0      stevel 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
    1358      0      stevel 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
    1359      0      stevel 	} else {
    1360      0      stevel 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
    1361      0      stevel 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
    1362      0      stevel 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
    1363      0      stevel 		    tcp->tcp_time_wait_next;
    1364      0      stevel 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
    1365      0      stevel 		    tcp->tcp_time_wait_prev;
    1366      0      stevel 	}
    1367      0      stevel 	tcp->tcp_time_wait_next = NULL;
    1368      0      stevel 	tcp->tcp_time_wait_prev = NULL;
    1369      0      stevel 	tcp->tcp_time_wait_expire = 0;
    1370      0      stevel 
    1371      0      stevel 	if (locked)
    1372      0      stevel 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
    1373   3104    jprakash 	return (B_TRUE);
    1374      0      stevel }
   1375      0      stevel 
   1376      0      stevel /*
   1377      0      stevel  * Add a connection to the list of detached TIME_WAIT connections
   1378      0      stevel  * and set its time to expire.
   1379      0      stevel  */
   1380      0      stevel static void
   1381      0      stevel tcp_time_wait_append(tcp_t *tcp)
   1382      0      stevel {
   1383   3448    dh155122 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1384      0      stevel 	tcp_squeue_priv_t *tcp_time_wait =
   1385      0      stevel 	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
   1386   5031    rs200217 	    SQPRIVATE_TCP));
   1387      0      stevel 
   1388      0      stevel 	tcp_timers_stop(tcp);
   1389      0      stevel 
   1390      0      stevel 	/* Freed above */
   1391      0      stevel 	ASSERT(tcp->tcp_timer_tid == 0);
   1392      0      stevel 	ASSERT(tcp->tcp_ack_tid == 0);
   1393      0      stevel 
   1394      0      stevel 	/* must have happened at the time of detaching the tcp */
   1395      0      stevel 	ASSERT(tcp->tcp_ptpahn == NULL);
   1396      0      stevel 	ASSERT(tcp->tcp_flow_stopped == 0);
   1397      0      stevel 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1398      0      stevel 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1399      0      stevel 	ASSERT(tcp->tcp_time_wait_expire == NULL);
   1400      0      stevel 	ASSERT(tcp->tcp_listener == NULL);
   1401      0      stevel 
   1402      0      stevel 	tcp->tcp_time_wait_expire = ddi_get_lbolt();
   1403      0      stevel 	/*
   1404      0      stevel 	 * The value computed below in tcp->tcp_time_wait_expire may
   1405      0      stevel 	 * appear negative or wrap around. That is ok since our
   1406      0      stevel 	 * interest is only in the difference between the current lbolt
   1407      0      stevel 	 * value and tcp->tcp_time_wait_expire. But the value should not
   1408      0      stevel 	 * be zero, since it means the tcp is not in the TIME_WAIT list.
   1409      0      stevel 	 * The corresponding comparison in tcp_time_wait_collector() uses
   1410      0      stevel 	 * modular arithmetic.
   1411      0      stevel 	 */
   1412      0      stevel 	tcp->tcp_time_wait_expire +=
   1413   3448    dh155122 	    drv_usectohz(tcps->tcps_time_wait_interval * 1000);
   1414      0      stevel 	if (tcp->tcp_time_wait_expire == 0)
   1415      0      stevel 		tcp->tcp_time_wait_expire = 1;
   1416      0      stevel 
   1417      0      stevel 	ASSERT(TCP_IS_DETACHED(tcp));
   1418      0      stevel 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1419      0      stevel 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1420      0      stevel 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1421   3448    dh155122 	TCP_DBGSTAT(tcps, tcp_time_wait);
   1422   3448    dh155122 
   1423      0      stevel 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1424      0      stevel 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
   1425      0      stevel 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
   1426      0      stevel 		tcp_time_wait->tcp_time_wait_head = tcp;
   1427      0      stevel 	} else {
   1428      0      stevel 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1429      0      stevel 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
   1430      0      stevel 		    TCPS_TIME_WAIT);
   1431      0      stevel 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
   1432      0      stevel 		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
   1433      0      stevel 	}
   1434      0      stevel 	tcp_time_wait->tcp_time_wait_tail = tcp;
   1435      0      stevel 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1436      0      stevel }
   1437      0      stevel 
   1438      0      stevel /* ARGSUSED */
   1439      0      stevel void
   1440  11042        Erik tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
   1441      0      stevel {
   1442      0      stevel 	conn_t	*connp = (conn_t *)arg;
   1443      0      stevel 	tcp_t	*tcp = connp->conn_tcp;
   1444   3448    dh155122 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1445      0      stevel 
   1446      0      stevel 	ASSERT(tcp != NULL);
   1447      0      stevel 	if (tcp->tcp_state == TCPS_CLOSED) {
   1448      0      stevel 		return;
   1449      0      stevel 	}
   1450      0      stevel 
   1451  11042        Erik 	ASSERT((connp->conn_family == AF_INET &&
   1452  11042        Erik 	    connp->conn_ipversion == IPV4_VERSION) ||
   1453  11042        Erik 	    (connp->conn_family == AF_INET6 &&
   1454  11042        Erik 	    (connp->conn_ipversion == IPV4_VERSION ||
   1455  11042        Erik 	    connp->conn_ipversion == IPV6_VERSION)));
   1456      0      stevel 	ASSERT(!tcp->tcp_listener);
   1457      0      stevel 
   1458   3448    dh155122 	TCP_STAT(tcps, tcp_time_wait_reap);
   1459      0      stevel 	ASSERT(TCP_IS_DETACHED(tcp));
   1460      0      stevel 
   1461      0      stevel 	/*
   1462      0      stevel 	 * Because they have no upstream client to rebind or tcp_close()
   1463      0      stevel 	 * them later, we axe the connection here and now.
   1464      0      stevel 	 */
   1465      0      stevel 	tcp_close_detached(tcp);
   1466      0      stevel }
   1467      0      stevel 
    1468   3448    dh155122 /*
    1469   3448    dh155122  * Remove cached/latched IPsec references.
                             *
                             * For each of conn_latch, conn_latch_in_policy, conn_latch_in_action and
                             * conn_policy that is non-NULL, the reference is released with the
                             * matching *_REFRELE macro and the pointer is set to NULL, so calling
                             * this again on the same connection releases nothing further.
    1470   3448    dh155122  */
    1471   3448    dh155122 void
    1472   3448    dh155122 tcp_ipsec_cleanup(tcp_t *tcp)
    1473   3448    dh155122 {
    1474   3448    dh155122 	conn_t		*connp = tcp->tcp_connp;
    1475   3448    dh155122 
    1476   5240    nordmark 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
    1477   5240    nordmark 
    1478   5240    nordmark 	if (connp->conn_latch != NULL) {
    1479  11042        Erik 		IPLATCH_REFRELE(connp->conn_latch);
    1480   5240    nordmark 		connp->conn_latch = NULL;
    1481  11042        Erik 	}
    1482  11042        Erik 	if (connp->conn_latch_in_policy != NULL) {
    1483  11042        Erik 		IPPOL_REFRELE(connp->conn_latch_in_policy);
    1484  11042        Erik 		connp->conn_latch_in_policy = NULL;
    1485  11042        Erik 	}
    1486  11042        Erik 	if (connp->conn_latch_in_action != NULL) {
    1487  11042        Erik 		IPACT_REFRELE(connp->conn_latch_in_action);
    1488  11042        Erik 		connp->conn_latch_in_action = NULL;
    1489   5240    nordmark 	}
    1490   5240    nordmark 	if (connp->conn_policy != NULL) {
                            		/* Unlike the releases above, this one also takes the netstack. */
    1491   5240    nordmark 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
    1492   5240    nordmark 		connp->conn_policy = NULL;
    1493   3448    dh155122 	}
    1494   3448    dh155122 }
   1495   3448    dh155122 
   1496   3448    dh155122 /*
   1497   3448    dh155122  * Cleanup before placing on free list.
   1498   3448    dh155122  * Disassociate from the netstack/tcp_stack_t since the freelist
   1499   3448    dh155122  * is per squeue and not per netstack.
                            *
                            * The sequence below is order-sensitive:
                            *  1. Remove the bind hash entry, then release the IPsec references,
                            *     clean up the ixa, free the header template (conn_ht_iphc) and
                            *     free the transmit ip_pkt_t.
                            *  2. tcp_free() and release any kssl entry/context.
                            *  3. Remove the conn from the global hash, free its cred and run
                            *     ipcl_conn_cleanup().
                            *  4. Clear conn_netstack, ixa_ipst and tcp_tcps and drop the
                            *     netstack reference.
                            *  5. bzero the tcp_t, preserving only tcp_timercache, tcp_sack_info,
                            *     tcp_rsrv_mp and the tcp_connp back pointer.
                            *
                            * On return conn_flags is IPCL_TCPCONN, conn_state_flags is
                            * CONN_INCIPIENT and the conn is ASSERTed to hold exactly one
                            * reference.
   1500   3448    dh155122  */
   1501      0      stevel void
   1502      0      stevel tcp_cleanup(tcp_t *tcp)
   1503      0      stevel {
   1504      0      stevel 	mblk_t		*mp;
   1505      0      stevel 	tcp_sack_info_t	*tcp_sack_info;
   1506      0      stevel 	conn_t		*connp = tcp->tcp_connp;
   1507   3448    dh155122 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1508   3448    dh155122 	netstack_t	*ns = tcps->tcps_netstack;
   1509   8014    Kacheong 	mblk_t		*tcp_rsrv_mp;
   1510      0      stevel 
   1511      0      stevel 	tcp_bind_hash_remove(tcp);
   1512   3448    dh155122 
   1513   3448    dh155122 	/* Cleanup that which needs the netstack first */
   1514   3448    dh155122 	tcp_ipsec_cleanup(tcp);
   1515  11042        Erik 	ixa_cleanup(connp->conn_ixa);
   1516  11042        Erik 
                           	/* Free the header template and reset the fields describing it */
   1517  11042        Erik 	if (connp->conn_ht_iphc != NULL) {
   1518  11042        Erik 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
   1519  11042        Erik 		connp->conn_ht_iphc = NULL;
   1520  11042        Erik 		connp->conn_ht_iphc_allocated = 0;
   1521  11042        Erik 		connp->conn_ht_iphc_len = 0;
   1522  11042        Erik 		connp->conn_ht_ulp = NULL;
   1523  11042        Erik 		connp->conn_ht_ulp_len = 0;
                           		/*
                           		 * NOTE(review): presumably these pointed into the buffer
                           		 * freed above -- confirm.
                           		 */
   1524  11042        Erik 		tcp->tcp_ipha = NULL;
   1525  11042        Erik 		tcp->tcp_ip6h = NULL;
   1526  11042        Erik 		tcp->tcp_tcpha = NULL;
   1527  11042        Erik 	}
   1528  11042        Erik 
   1529  11042        Erik 	/* We clear any IP_OPTIONS and extension headers */
   1530  11042        Erik 	ip_pkt_free(&connp->conn_xmit_ipp);
   1531   3448    dh155122 
   1532      0      stevel 	tcp_free(tcp);
   1533      0      stevel 
   1534    898        kais 	/* Release any SSL context */
   1535    898        kais 	if (tcp->tcp_kssl_ent != NULL) {
   1536    898        kais 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   1537    898        kais 		tcp->tcp_kssl_ent = NULL;
   1538    898        kais 	}
   1539    898        kais 
   1540    898        kais 	if (tcp->tcp_kssl_ctx != NULL) {
   1541    898        kais 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   1542    898        kais 		tcp->tcp_kssl_ctx = NULL;
   1543    898        kais 	}
   1544    898        kais 	tcp->tcp_kssl_pending = B_FALSE;
   1545      0      stevel 
   1546      0      stevel 	/*
   1547      0      stevel 	 * Since we will bzero the entire structure, we need to
   1548      0      stevel 	 * remove it and reinsert it in global hash list. We
   1549      0      stevel 	 * know the walkers can't get to this conn because we
   1550      0      stevel 	 * had set CONDEMNED flag earlier and checked reference
   1551      0      stevel 	 * under conn_lock so walker won't pick it and when we
   1552      0      stevel 	 * get to the ipcl_globalhash_remove() below, no walker
   1553      0      stevel 	 * can get to it.
                           	 *
                           	 * NOTE(review): only the removal happens in this function; the
                           	 * re-insertion is not visible here -- confirm where it is done.
   1554      0      stevel 	 */
   1555      0      stevel 	ipcl_globalhash_remove(connp);
   1556      0      stevel 
   1557  11042        Erik 	/* Save some state */
                           	/* (the bzero() below would otherwise wipe these three pointers) */
   1558  11042        Erik 	mp = tcp->tcp_timercache;
   1559  11042        Erik 
   1560  11042        Erik 	tcp_sack_info = tcp->tcp_sack_info;
   1561  11042        Erik 	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
   1562  11042        Erik 
   1563  11042        Erik 	if (connp->conn_cred != NULL) {
   1564  11042        Erik 		crfree(connp->conn_cred);
   1565  11042        Erik 		connp->conn_cred = NULL;
   1566  11042        Erik 	}
   1567  11042        Erik 	ipcl_conn_cleanup(connp);
                           	/* Reset to the bare TCP-conn flag; re-ASSERTed at the end */
   1568  11042        Erik 	connp->conn_flags = IPCL_TCPCONN;
   1569  11042        Erik 
   1570   3448    dh155122 	/*
   1571   3448    dh155122 	 * Now it is safe to decrement the reference counts.
   1572  11042        Erik 	 * This might be the last reference on the netstack
   1573  11042        Erik 	 * in which case it will cause the freeing of the IP Instance.
   1574   3448    dh155122 	 */
   1575   3448    dh155122 	connp->conn_netstack = NULL;
   1576  11042        Erik 	connp->conn_ixa->ixa_ipst = NULL;
   1577   3448    dh155122 	netstack_rele(ns);
                           	/*
                           	 * tcps was already dereferenced at the top of the function to
                           	 * obtain ns, so this ASSERT is only a late sanity check.
                           	 */
   1578   3448    dh155122 	ASSERT(tcps != NULL);
   1579   3448    dh155122 	tcp->tcp_tcps = NULL;
   1580  11042        Erik 
   1581      0      stevel 	bzero(tcp, sizeof (tcp_t));
   1582      0      stevel 
   1583      0      stevel 	/* restore the state */
   1584      0      stevel 	tcp->tcp_timercache = mp;
   1585      0      stevel 
   1586      0      stevel 	tcp->tcp_sack_info = tcp_sack_info;
   1587   8014    Kacheong 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   1588      0      stevel 
                           	/* Re-link the back pointer to the conn that bzero() wiped */
   1589      0      stevel 	tcp->tcp_connp = connp;
   1590      0      stevel 
   1591   5240    nordmark 	ASSERT(connp->conn_tcp == tcp);
   1592   5240    nordmark 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1593      0      stevel 	connp->conn_state_flags = CONN_INCIPIENT;
   1594  11042        Erik 	ASSERT(connp->conn_proto == IPPROTO_TCP);
   1595   5240    nordmark 	ASSERT(connp->conn_ref == 1);
   1596      0      stevel }
   1597      0      stevel 
   1598      0      stevel /*
   1599      0      stevel  * Blows away all tcps whose TIME_WAIT has expired. List traversal
   1600      0      stevel  * is done forwards from the head.
   1601   3448    dh155122  * This walks all stack instances since
   1602   3448    dh155122  * tcp_time_wait remains global across all stacks.
                            *
                            * arg is the squeue_t whose private TCP data (tcp_squeue_priv_t) holds
                            * the time-wait list and the free list.  Before returning, the function
                            * re-arms itself as a timeout callback via timeout_generic().
                            *
                            * Locking: tcp_time_wait_lock is held at the top of every iteration of
                            * the expiry loop.  Inside the loop it is dropped before tcp_cleanup(),
                            * before the teardown of a conn that is not cached on the free list, and
                            * before SQUEUE_ENTER_ONE(); it is re-taken before the next iteration.
   1603      0      stevel  */
   1604      0      stevel /* ARGSUSED */
   1605      0      stevel void
   1606      0      stevel tcp_time_wait_collector(void *arg)
   1607      0      stevel {
   1608      0      stevel 	tcp_t *tcp;
   1609      0      stevel 	clock_t now;
   1610      0      stevel 	mblk_t *mp;
   1611      0      stevel 	conn_t *connp;
   1612      0      stevel 	kmutex_t *lock;
   1613   3104    jprakash 	boolean_t removed;
   1614      0      stevel 
   1615      0      stevel 	squeue_t *sqp = (squeue_t *)arg;
   1616      0      stevel 	tcp_squeue_priv_t *tcp_time_wait =
   1617      0      stevel 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   1618      0      stevel 
   1619      0      stevel 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
                           	/* This invocation is running, so no timeout is pending any more */
   1620      0      stevel 	tcp_time_wait->tcp_time_wait_tid = 0;
   1621      0      stevel 
                           	/*
                           	 * A head still flagged tcp_in_free_list was marked at the end of
                           	 * an earlier run (see the bottom of this function).  In that case
                           	 * drop the reference on every cached tcp and empty the free list.
                           	 */
   1622      0      stevel 	if (tcp_time_wait->tcp_free_list != NULL &&
   1623      0      stevel 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
   1624   3448    dh155122 		TCP_G_STAT(tcp_freelist_cleanup);
   1625      0      stevel 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
   1626      0      stevel 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   1627   3448    dh155122 			tcp->tcp_time_wait_next = NULL;
   1628   3448    dh155122 			tcp_time_wait->tcp_free_list_cnt--;
   1629   3448    dh155122 			ASSERT(tcp->tcp_tcps == NULL);
   1630      0      stevel 			CONN_DEC_REF(tcp->tcp_connp);
   1631      0      stevel 		}
   1632   3448    dh155122 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
   1633      0      stevel 	}
   1634      0      stevel 
   1635      0      stevel 	/*
   1636      0      stevel 	 * In order to reap time waits reliably, we should use a
   1637      0      stevel 	 * source of time that is not adjustable by the user -- hence
   1638      0      stevel 	 * the call to ddi_get_lbolt().
   1639      0      stevel 	 */
   1640      0      stevel 	now = ddi_get_lbolt();
   1641      0      stevel 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
   1642      0      stevel 		/*
   1643      0      stevel 		 * Compare times using modular arithmetic, since
   1644      0      stevel 		 * lbolt can wrapover.
   1645      0      stevel 		 */
   1646      0      stevel 		if ((now - tcp->tcp_time_wait_expire) < 0) {
                           			/* Head has not expired yet; stop at the first such entry */
   1647      0      stevel 			break;
   1648      0      stevel 		}
   1649      0      stevel 
   1650   3104    jprakash 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
   1651   3104    jprakash 		ASSERT(removed);
   1652      0      stevel 
   1653      0      stevel 		connp = tcp->tcp_connp;
   1654      0      stevel 		ASSERT(connp->conn_fanout != NULL);
   1655      0      stevel 		lock = &connp->conn_fanout->connf_lock;
   1656      0      stevel 		/*
   1657    487     rshoaib 		 * This is essentially a TW reclaim fast path optimization for
   1658    487     rshoaib 		 * performance where the timewait collector checks under the
   1659    487     rshoaib 		 * fanout lock (so that no one else can get access to the
   1660    487     rshoaib 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
   1661    487     rshoaib 		 * the classifier hash list. If ref count is indeed 2, we can
   1662    487     rshoaib 		 * just remove the conn under the fanout lock and avoid
   1663    487     rshoaib 		 * cleaning up the conn under the squeue, provided that
   1664    487     rshoaib 		 * clustering callbacks are not enabled. If clustering is
   1665    487     rshoaib 		 * enabled, we need to make the clustering callback before
   1666    487     rshoaib 		 * setting the CONDEMNED flag and after dropping all locks and
   1667    487     rshoaib 		 * so we forego this optimization and fall back to the slow
   1668    487     rshoaib 		 * path. Also please see the comments in tcp_closei_local
   1669    487     rshoaib 		 * regarding the refcnt logic.
   1670      0      stevel 		 *
   1671      0      stevel 		 * Since we are holding the tcp_time_wait_lock, it's better
   1672      0      stevel 		 * not to block on the fanout_lock because other connections
   1673      0      stevel 		 * can't add themselves to time_wait list. So we do a
   1674      0      stevel 		 * tryenter instead of mutex_enter.
   1675      0      stevel 		 */
   1676      0      stevel 		if (mutex_tryenter(lock)) {
   1677      0      stevel 			mutex_enter(&connp->conn_lock);
   1678    487     rshoaib 			if ((connp->conn_ref == 2) &&
   1679    487     rshoaib 			    (cl_inet_disconnect == NULL)) {
                           				/* Fast path: reap the conn right here */
   1680      0      stevel 				ipcl_hash_remove_locked(connp,
   1681      0      stevel 				    connp->conn_fanout);
   1682      0      stevel 				/*
   1683      0      stevel 				 * Set the CONDEMNED flag now itself so that
   1684      0      stevel 				 * the refcnt cannot increase due to any
   1685  11042        Erik 				 * walker.
   1686      0      stevel 				 */
   1687      0      stevel 				connp->conn_state_flags |= CONN_CONDEMNED;
   1688      0      stevel 				mutex_exit(lock);
   1689      0      stevel 				mutex_exit(&connp->conn_lock);
   1690   1023    ethindra 				if (tcp_time_wait->tcp_free_list_cnt <
   1691   1023    ethindra 				    tcp_free_list_max_cnt) {
   1692   1023    ethindra 					/* Add to head of tcp_free_list */
   1693   1023    ethindra 					mutex_exit(
   1694   1023    ethindra 					    &tcp_time_wait->tcp_time_wait_lock);
   1695   1023    ethindra 					tcp_cleanup(tcp);
   1696   3448    dh155122 					ASSERT(connp->conn_latch == NULL);
   1697   3448    dh155122 					ASSERT(connp->conn_policy == NULL);
   1698   3448    dh155122 					ASSERT(tcp->tcp_tcps == NULL);
   1699   3448    dh155122 					ASSERT(connp->conn_netstack == NULL);
   1700   3448    dh155122 
   1701   1023    ethindra 					mutex_enter(
   1702   1023    ethindra 					    &tcp_time_wait->tcp_time_wait_lock);
   1703   1023    ethindra 					tcp->tcp_time_wait_next =
   1704   1023    ethindra 					    tcp_time_wait->tcp_free_list;
   1705   1023    ethindra 					tcp_time_wait->tcp_free_list = tcp;
   1706   1023    ethindra 					tcp_time_wait->tcp_free_list_cnt++;
                           					/* Lock is held again, as the loop expects */
   1707   1023    ethindra 					continue;
   1708   1023    ethindra 				} else {
   1709   1023    ethindra 					/* Do not add to tcp_free_list */
   1710   1023    ethindra 					mutex_exit(
   1711   1023    ethindra 					    &tcp_time_wait->tcp_time_wait_lock);
   1712   1023    ethindra 					tcp_bind_hash_remove(tcp);
   1713  11042        Erik 					ixa_cleanup(tcp->tcp_connp->conn_ixa);
   1714   3448    dh155122 					tcp_ipsec_cleanup(tcp);
   1715   1023    ethindra 					CONN_DEC_REF(tcp->tcp_connp);
                           					/* Lock is re-taken at the loop bottom */
   1716   1023    ethindra 				}
   1717      0      stevel 			} else {
                           				/*
                           				 * Slow path: take an extra reference and
                           				 * hand the conn to its squeue.
                           				 * NOTE(review): the reference is presumably
                           				 * consumed by the squeue processing of
                           				 * tcp_timewait_output -- confirm.
                           				 */
   1718      0      stevel 				CONN_INC_REF_LOCKED(connp);
   1719      0      stevel 				mutex_exit(lock);
   1720      0      stevel 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1721      0      stevel 				mutex_exit(&connp->conn_lock);
   1722      0      stevel 				/*
   1723      0      stevel 				 * We can reuse the closemp here since conn has
   1724      0      stevel 				 * detached (otherwise we wouldn't even be in
   1725   3104    jprakash 				 * time_wait list). tcp_closemp_used can safely
   1726   3104    jprakash 				 * be changed without taking a lock as no other
   1727   3104    jprakash 				 * thread can concurrently access it at this
   1728   4200    jprakash 				 * point in the connection lifecycle.
   1729   3104    jprakash 				 */
   1730   3104    jprakash 
   1731   3104    jprakash 				if (tcp->tcp_closemp.b_prev == NULL)
   1732   4200    jprakash 					tcp->tcp_closemp_used = B_TRUE;
   1733   3104    jprakash 				else
   1734   4200    jprakash 					cmn_err(CE_PANIC,
   1735   4200    jprakash 					    "tcp_timewait_collector: "
   1736   4200    jprakash 					    "concurrent use of tcp_closemp: "
   1737   4200    jprakash 					    "connp %p tcp %p\n", (void *)connp,
   1738   4200    jprakash 					    (void *)tcp);
   1739   3104    jprakash 
   1740   3104    jprakash 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1741      0      stevel 				mp = &tcp->tcp_closemp;
   1742   8275        Eric 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1743  11042        Erik 				    tcp_timewait_output, connp, NULL,
   1744   8275        Eric 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1745      0      stevel 			}
   1746      0      stevel 		} else {
                           			/*
                           			 * The fanout lock could not be taken without blocking,
                           			 * so use the same slow path as above (no fanout lock
                           			 * to release in this branch).
                           			 */
   1747      0      stevel 			mutex_enter(&connp->conn_lock);
   1748      0      stevel 			CONN_INC_REF_LOCKED(connp);
   1749      0      stevel 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1750      0      stevel 			mutex_exit(&connp->conn_lock);
   1751      0      stevel 			/*
   1752      0      stevel 			 * We can reuse the closemp here since conn has
   1753      0      stevel 			 * detached (otherwise we wouldn't even be in
   1754   3104    jprakash 			 * time_wait list). tcp_closemp_used can safely
   1755   3104    jprakash 			 * be changed without taking a lock as no other
   1756   3104    jprakash 			 * thread can concurrently access it at this
   1757   4200    jprakash 			 * point in the connection lifecycle.
   1758   3104    jprakash 			 */
   1759   3104    jprakash 
   1760   3104    jprakash 			if (tcp->tcp_closemp.b_prev == NULL)
   1761   4200    jprakash 				tcp->tcp_closemp_used = B_TRUE;
   1762   3104    jprakash 			else
   1763   4200    jprakash 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
   1764   4200    jprakash 				    "concurrent use of tcp_closemp: "
   1765   4200    jprakash 				    "connp %p tcp %p\n", (void *)connp,
   1766   4200    jprakash 				    (void *)tcp);
   1767   3104    jprakash 
   1768   3104    jprakash 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1769      0      stevel 			mp = &tcp->tcp_closemp;
   1770   8275        Eric 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1771  11042        Erik 			    tcp_timewait_output, connp, NULL,
   1772   8275        Eric 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1773      0      stevel 		}
                           		/* Every path reaching here dropped the lock; take it back */
   1774      0      stevel 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1775      0      stevel 	}
   1776      0      stevel 
                           	/*
                           	 * Flag the current free-list head so that the check at the top of
                           	 * the next run can recognise it.
                           	 */
   1777      0      stevel 	if (tcp_time_wait->tcp_free_list != NULL)
   1778      0      stevel 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
   1779      0      stevel 
                           	/* Re-arm: schedule the next collection run for this squeue */
   1780      0      stevel 	tcp_time_wait->tcp_time_wait_tid =
   1781   8048    Madhavan 	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
   1782   8048    Madhavan 	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
   1783   8048    Madhavan 	    CALLOUT_FLAG_ROUNDUP);
   1784      0      stevel 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1785      0      stevel }
   1786   8348        Eric 
   1787      0      stevel /*
   1788      0      stevel  * Reply to a client's T_CONN_RES TPI message. This function
   1789      0      stevel  * is used only for a TLI/XTI listener. Sockfs sends T_CONN_RES
   1790  11042        Erik  * on the acceptor STREAM and it is processed in tcp_accept_common().
   1791  11042        Erik  * Read the block comment on top of tcp_input_listener().
   1792      0      stevel  */
   1793      0      stevel static void
   1794   8348        Eric tcp_tli_accept(tcp_t *listener, mblk_t *mp)
   1795      0      stevel {
   1796  11042        Erik 	tcp_t		*acceptor;
   1797  11042        Erik 	tcp_t		*eager;
   1798  11042        Erik 	tcp_t   	*tcp;
   1799      0      stevel 	struct T_conn_res	*tcr;
   1800      0      stevel 	t_uscalar_t	acceptor_id;
   1801      0      stevel 	t_scalar_t	seqnum;
   1802  11042        Erik 	mblk_t		*discon_mp = NULL;
   1803  11042        Erik 	mblk_t		*ok_mp;
   1804  11042        Erik 	mblk_t		*mp1;
   1805   3448    dh155122 	tcp_stack_t	*tcps = listener->tcp_tcps;
   1806  11042        Erik 	conn_t		*econnp;
   1807      0      stevel 
   1808      0      stevel 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   1809      0      stevel 		tcp_err_ack(listener, mp, TPROTO, 0);
   1810      0      stevel 		return;
   1811      0      stevel 	}
   1812      0      stevel 	tcr = (struct T_conn_res *)mp->b_rptr;
   1813      0      stevel 
   1814      0      stevel 	/*
   1815      0      stevel 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
   1816      0      stevel 	 * read side queue of the streams device underneath us i.e. the
   1817      0      stevel 	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
   1818      0      stevel 	 * look it up in the queue_hash.  Under LP64 it sends down the
   1819      0      stevel 	 * minor_t of the accepting endpoint.
   1820      0      stevel 	 *
   1821      0      stevel 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
   1822      0      stevel 	 * fanout hash lock is held.
   1823      0      stevel 	 * This prevents any thread from entering the acceptor queue from
   1824      0      stevel 	 * below (since it has not been hard bound yet i.e. any inbound
   1825  11042        Erik 	 * packets will arrive on the listener conn_t and
   1826  11042        Erik 	 * go through the classifier).
   1827      0      stevel 	 * The CONN_INC_REF will prevent the acceptor from closing.
   1828      0      stevel 	 *
   1829      0      stevel 	 * XXX It is still possible for a tli application to send down data
   1830      0      stevel 	 * on the accepting stream while another thread calls t_accept.
   1831      0      stevel 	 * This should not be a problem for well-behaved applications since
   1832      0      stevel 	 * the T_OK_ACK is sent after the queue swapping is completed.
   1833      0      stevel 	 *
   1834      0      stevel 	 * If the accepting fd is the same as the listening fd, avoid
   1835      0      stevel 	 * queue hash lookup since that will return an eager listener in a
   1836      0      stevel 	 * already established state.
   1837      0      stevel 	 */
   1838      0      stevel 	acceptor_id = tcr->ACCEPTOR_id;
   1839      0      stevel 	mutex_enter(&listener->tcp_eager_lock);
   1840      0      stevel 	if (listener->tcp_acceptor_id == acceptor_id) {
   1841      0      stevel 		eager = listener->tcp_eager_next_q;
   1842      0      stevel 		/* only count how many T_CONN_INDs so don't count q0 */
   1843      0      stevel 		if ((listener->tcp_conn_req_cnt_q != 1) ||
   1844      0      stevel 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
   1845      0      stevel 			mutex_exit(&listener->tcp_eager_lock);
   1846      0      stevel 			tcp_err_ack(listener, mp, TBADF, 0);
   1847      0      stevel 			return;
   1848      0      stevel 		}
   1849      0      stevel 		if (listener->tcp_conn_req_cnt_q0 != 0) {
   1850      0      stevel 			/* Throw away all the eagers on q0. */
   1851      0      stevel 			tcp_eager_cleanup(listener, 1);
   1852      0      stevel 		}
   1853      0      stevel 		if (listener->tcp_syn_defense) {
   1854      0      stevel 			listener->tcp_syn_defense = B_FALSE;
   1855      0      stevel 			if (listener->tcp_ip_addr_cache != NULL) {
   1856      0      stevel 				kmem_free(listener->tcp_ip_addr_cache,
   1857      0      stevel 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   1858      0      stevel 				listener->tcp_ip_addr_cache = NULL;
   1859      0      stevel 			}
   1860      0      stevel 		}
   1861      0      stevel 		/*
   1862      0      stevel 		 * Transfer tcp_conn_req_max to the eager so that when
   1863      0      stevel 		 * a disconnect occurs we can revert the endpoint to the
   1864      0      stevel 		 * listen state.
   1865      0      stevel 		 */
   1866      0      stevel 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
   1867      0      stevel 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
   1868      0      stevel 		/*
   1869      0      stevel 		 * Get a reference on the acceptor just like the
   1870      0      stevel 		 * tcp_acceptor_hash_lookup below.
   1871      0      stevel 		 */
   1872      0      stevel 		acceptor = listener;
   1873      0      stevel 		CONN_INC_REF(acceptor->tcp_connp);
   1874      0      stevel 	} else {
   1875   3448    dh155122 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
   1876      0      stevel 		if (acceptor == NULL) {
   1877  11042        Erik 			if (listener->tcp_connp->conn_debug) {
   1878    741    masputra 				(void) strlog(TCP_MOD_ID, 0, 1,
   1879      0      stevel 				    SL_ERROR|SL_TRACE,
   1880      0      stevel 				    "tcp_accept: did not find acceptor 0x%x\n",
   1881      0      stevel 				    acceptor_id);
   1882      0      stevel 			}
   1883      0      stevel 			mutex_exit(&listener->tcp_eager_lock);
   1884      0      stevel 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
   1885      0      stevel 			return;
   1886      0      stevel 		}
   1887      0      stevel 		/*
   1888      0      stevel 		 * Verify acceptor state. The acceptable states for an acceptor
   1889      0      stevel 		 * include TCPS_IDLE and TCPS_BOUND.
   1890      0      stevel 		 */
   1891      0      stevel 		switch (acceptor->tcp_state) {
   1892      0      stevel 		case TCPS_IDLE:
   1893      0      stevel 			/* FALLTHRU */
   1894      0      stevel 		case TCPS_BOUND:
   1895      0      stevel 			break;
   1896      0      stevel 		default:
   1897      0      stevel 			CONN_DEC_REF(acceptor->tcp_connp);
   1898      0      stevel 			mutex_exit(&listener->tcp_eager_lock);
   1899      0      stevel 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1900      0      stevel 			return;
   1901      0      stevel 		}
   1902      0      stevel 	}
   1903      0      stevel 
   1904      0      stevel 	/* The listener must be in TCPS_LISTEN */
   1905      0      stevel 	if (listener->tcp_state != TCPS_LISTEN) {
   1906      0      stevel 		CONN_DEC_REF(acceptor->tcp_connp);
   1907      0      stevel 		mutex_exit(&listener->tcp_eager_lock);
   1908      0      stevel 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1909      0      stevel 		return;
   1910      0      stevel 	}
   1911      0      stevel 
   1912      0      stevel 	/*
   1913      0      stevel 	 * Rendezvous with an eager connection request packet hanging off
   1914      0      stevel 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
   1915      0      stevel 	 * tcp structure when the connection packet arrived in
   1916  11042        Erik 	 * tcp_input_listener().
   1917      0      stevel 	 */
   1918      0      stevel 	seqnum = tcr->SEQ_number;
   1919      0      stevel 	eager = listener;
   1920      0      stevel 	do {
   1921      0      stevel 		eager = eager->tcp_eager_next_q;
   1922      0      stevel 		if (eager == NULL) {
   1923      0      stevel 			CONN_DEC_REF(acceptor->tcp_connp);
   1924      0      stevel 			mutex_exit(&listener->tcp_eager_lock);
   1925      0      stevel 			tcp_err_ack(listener, mp, TBADSEQ, 0);
   1926      0      stevel 			return;
   1927      0      stevel 		}
   1928      0      stevel 	} while (eager->tcp_conn_req_seqnum != seqnum);
   1929      0      stevel 	mutex_exit(&listener->tcp_eager_lock);
   1930      0      stevel 
   1931      0      stevel 	/*
   1932      0      stevel 	 * At this point, both acceptor and listener have 2 ref
   1933      0      stevel 	 * that they begin with. Acceptor has one additional ref
   1934      0      stevel 	 * we placed in lookup while listener has 3 additional
   1935      0      stevel 	 * ref for being behind the squeue (tcp_accept() is
   1936      0      stevel 	 * done on listener's squeue); being in classifier hash;
   1937      0      stevel 	 * and eager's ref on listener.
   1938      0      stevel 	 */
   1939      0      stevel 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   1940      0      stevel 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
   1941      0      stevel 
   1942      0      stevel 	/*
   1943      0      stevel 	 * The eager at this point is set in its own squeue and
   1944      0      stevel 	 * could easily have been killed (tcp_accept_finish will
   1945      0      stevel 	 * deal with that) because of a TH_RST so we can only
   1946      0      stevel 	 * ASSERT for a single ref.
   1947      0      stevel 	 */
   1948      0      stevel 	ASSERT(eager->tcp_connp->conn_ref >= 1);
   1949      0      stevel 
   1950  11042        Erik 	/*
   1951  11042        Erik 	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
   1952  11042        Erik 	 * use it if something failed.
   1953  11042        Erik 	 */
   1954  11042        Erik 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
   1955  11042        Erik 	    sizeof (struct stroptions)), BPRI_HI);
   1956  11042        Erik 	if (discon_mp == NULL) {
   1957      0      stevel 		CONN_DEC_REF(acceptor->tcp_connp);
   1958      0      stevel 		CONN_DEC_REF(eager->tcp_connp);
   1959      0      stevel 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   1960      0      stevel 		return;
   1961      0      stevel 	}
   1962  11042        Erik 
   1963  11042        Erik 	econnp = eager->tcp_connp;
   1964  11042        Erik 
   1965  11042        Erik 	/* Hold a copy of mp, in case reallocb fails */
   1966      0      stevel 	if ((mp1 = copymsg(mp)) == NULL) {
   1967      0      stevel 		CONN_DEC_REF(acceptor->tcp_connp);
   1968      0      stevel 		CONN_DEC_REF(eager->tcp_connp);
   1969  11042        Erik 		freemsg(discon_mp);
   1970      0      stevel 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   1971      0      stevel 		return;
   1972      0      stevel 	}
   1973      0      stevel 
   1974      0      stevel 	tcr = (struct T_conn_res *)mp1->b_rptr;
   1975      0      stevel 
   1976      0      stevel 	/*
   1977      0      stevel 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
   1978      0      stevel 	 * which allocates a larger mblk and appends the new
   1979      0      stevel 	 * local address to the ok_ack.  The address is copied by
   1980      0      stevel 	 * soaccept() for getsockname().
   1981      0      stevel 	 */
   1982      0      stevel 	{
   1983      0      stevel 		int extra;
   1984      0      stevel 
   1985  11042        Erik 		extra = (econnp->conn_family == AF_INET) ?
   1986      0      stevel 		    sizeof (sin_t) : sizeof (sin6_t);
   1987      0      stevel 
   1988      0      stevel 		/*
   1989      0      stevel 		 * Try to re-use mp, if possible.  Otherwise, allocate
   1990      0      stevel 		 * an mblk and return it as ok_mp.  In any case, mp
   1991      0      stevel 		 * is no longer usable upon return.
   1992      0      stevel 		 */
   1993      0      stevel 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
   1994      0      stevel 			CONN_DEC_REF(acceptor->tcp_connp);
   1995      0      stevel 			CONN_DEC_REF(eager->tcp_connp);
   1996  11042        Erik 			freemsg(discon_mp);
   1997      0      stevel 			/* Original mp has been freed by now, so use mp1 */
   1998      0      stevel 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
   1999      0      stevel 			return;
   2000      0      stevel 		}
   2001      0      stevel 
   2002      0      stevel 		mp = NULL;	/* We should never use mp after this point */
   2003      0      stevel 
   2004      0      stevel 		switch (extra) {
   2005      0      stevel 		case sizeof (sin_t): {
   2006  11042        Erik 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
   2007  11042        Erik 
   2008  11042        Erik 			ok_mp->b_wptr += extra;
   2009  11042        Erik 			sin->sin_family = AF_INET;
   2010  11042        Erik 			sin->sin_port = econnp->conn_lport;
   2011  11042        Erik 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
   2012  11042        Erik 			break;
   2013  11042        Erik 		}
   2014      0      stevel 		case sizeof (sin6_t): {
   2015  11042        Erik 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
   2016  11042        Erik 
   2017  11042        Erik 			ok_mp->b_wptr += extra;
   2018  11042        Erik 			sin6->sin6_family = AF_INET6;
   2019  11042        Erik 			sin6->sin6_port = econnp->conn_lport;
   2020  11042        Erik 			sin6->sin6_addr = econnp->conn_laddr_v6;
   2021  11042        Erik 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
   2022  11042        Erik 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
   2023  11042        Erik 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
   2024  11042        Erik 				sin6->sin6_scope_id =
   2025  11042        Erik 				    econnp->conn_ixa->ixa_scopeid;
   2026  11042        Erik 			} else {
   2027   4379     ja97890 				sin6->sin6_scope_id = 0;
   2028  11042        Erik 			}
   2029  11042        Erik 			sin6->__sin6_src_id = 0;
   2030  11042        Erik 			break;
   2031  11042        Erik 		}
   2032      0      stevel 		default:
   2033      0      stevel 			break;
   2034      0      stevel 		}
   2035      0      stevel 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
   2036      0      stevel 	}
   2037      0      stevel 
   2038      0      stevel 	/*
   2039      0      stevel 	 * If there are no options we know that the T_CONN_RES will
   2040      0      stevel 	 * succeed. However, we can't send the T_OK_ACK upstream until
   2041      0      stevel 	 * the tcp_accept_swap is done since it would be dangerous to
   2042      0      stevel 	 * let the application start using the new fd prior to the swap.
   2043      0      stevel 	 */
   2044  11042        Erik 	tcp_accept_swap(listener, acceptor, eager);
   2045      0      stevel 
   2046      0      stevel 	/*
   2047      0      stevel 	 * tcp_accept_swap unlinks eager from listener but does not drop
   2048      0      stevel 	 * the eager's reference on the listener.
   2049      0      stevel 	 */
   2050      0      stevel 	ASSERT(eager->tcp_listener == NULL);
   2051      0      stevel 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2052      0      stevel 
   2053      0      stevel 	/*
   2054      0      stevel 	 * The eager is now associated with its own queue. Insert in
   2055      0      stevel 	 * the hash so that the connection can be reused for a future
   2056      0      stevel 	 * T_CONN_RES.
   2057      0      stevel 	 */
   2058      0      stevel 	tcp_acceptor_hash_insert(acceptor_id, eager);
   2059      0      stevel 
   2060      0      stevel 	/*
   2061      0      stevel 	 * We now do the processing of options with T_CONN_RES.
   2062      0      stevel 	 * We delay till now since we wanted to have queue to pass to
   2063      0      stevel 	 * option processing routines that points back to the right
   2064      0      stevel 	 * instance structure which does not happen until after
   2065      0      stevel 	 * tcp_accept_swap().
   2066      0      stevel 	 *
   2067      0      stevel 	 * Note:
   2068      0      stevel 	 * The sanity of the logic here assumes that whatever options
   2069      0      stevel 	 * are appropriate to inherit from listener=>eager are done
   2070      0      stevel 	 * before this point, and whatever were to be overridden (or not)
   2071      0      stevel 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
   2072      0      stevel 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
   2073      0      stevel 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
   2074      0      stevel 	 * This may not be true at this point in time but can be fixed
   2075      0      stevel 	 * independently. This option processing code starts with
   2076      0      stevel 	 * the instantiated acceptor instance and the final queue at
   2077      0      stevel 	 * this point.
   2078      0      stevel 	 */
   2079      0      stevel 
   2080      0      stevel 	if (tcr->OPT_length != 0) {
   2081      0      stevel 		/* Options to process */
   2082      0      stevel 		int t_error = 0;
   2083      0      stevel 		int sys_error = 0;
   2084      0      stevel 		int do_disconnect = 0;
   2085      0      stevel 
   2086      0      stevel 		if (tcp_conprim_opt_process(eager, mp1,
   2087      0      stevel 		    &do_disconnect, &t_error, &sys_error) < 0) {
   2088      0      stevel 			eager->tcp_accept_error = 1;
   2089      0      stevel 			if (do_disconnect) {
   2090      0      stevel 				/*
   2091      0      stevel 				 * An option failed which does not allow
   2092      0      stevel 				 * connection to be accepted.
   2093      0      stevel 				 *
   2094      0      stevel 				 * We allow T_CONN_RES to succeed and
   2095      0      stevel 				 * put a T_DISCON_IND on the eager queue.
   2096      0      stevel 				 */
   2097      0      stevel 				ASSERT(t_error == 0 && sys_error == 0);
   2098      0      stevel 				eager->tcp_send_discon_ind = 1;
   2099      0      stevel 			} else {
   2100      0      stevel 				ASSERT(t_error != 0);
   2101      0      stevel 				freemsg(ok_mp);
   2102      0      stevel 				/*
   2103      0      stevel 				 * Original mp was either freed or set
   2104      0      stevel 				 * to ok_mp above, so use mp1 instead.
   2105      0      stevel 				 */
   2106      0      stevel 				tcp_err_ack(listener, mp1, t_error, sys_error);
   2107      0      stevel 				goto finish;
   2108      0      stevel 			}
   2109      0      stevel 		}
   2110      0      stevel 		/*
   2111      0      stevel 		 * Most likely success in setting options (except if
   2112      0      stevel 		 * eager->tcp_send_discon_ind set).
   2113      0      stevel 		 * mp1 option buffer represented by OPT_length/offset
   2114      0      stevel 		 * potentially modified and contains results of setting
   2115      0      stevel 		 * options at this point
   2116      0      stevel 		 */
   2117      0      stevel 	}
   2118      0      stevel 
   2119      0      stevel 	/* We no longer need mp1, since all options processing has passed */
   2120      0      stevel 	freemsg(mp1);
   2121      0      stevel 
   2122  11042        Erik 	putnext(listener->tcp_connp->conn_rq, ok_mp);
   2123      0      stevel 
   2124      0      stevel 	mutex_enter(&listener->tcp_eager_lock);
   2125      0      stevel 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
   2126      0      stevel 		tcp_t	*tail;
   2127      0      stevel 		mblk_t	*conn_ind;
   2128      0      stevel 
   2129      0      stevel 		/*
   2130      0      stevel 		 * This path should not be executed if listener and
   2131      0      stevel 		 * acceptor streams are the same.
   2132      0      stevel 		 */
   2133      0      stevel 		ASSERT(listener != acceptor);
   2134      0      stevel 
   2135      0      stevel 		tcp = listener->tcp_eager_prev_q0;
   2136      0      stevel 		/*
   2137      0      stevel 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
   2138      0      stevel 		 * deferred T_conn_ind queue. We need to get to the head of
   2139      0      stevel 		 * the queue in order to send up T_conn_ind the same order as
   2140      0      stevel 		 * how the 3WHS is completed.
   2141      0      stevel 		 */
   2142      0      stevel 		while (tcp != listener) {
   2143      0      stevel 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
   2144      0      stevel 				break;
   2145      0      stevel 			else
   2146      0      stevel 				tcp = tcp->tcp_eager_prev_q0;
   2147      0      stevel 		}
   2148      0      stevel 		ASSERT(tcp != listener);
   2149      0      stevel 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
   2150      0      stevel 		ASSERT(conn_ind != NULL);
   2151      0      stevel 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
   2152      0      stevel 
   2153      0      stevel 		/* Move from q0 to q */
   2154      0      stevel 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   2155      0      stevel 		listener->tcp_conn_req_cnt_q0--;
   2156      0      stevel 		listener->tcp_conn_req_cnt_q++;
   2157      0      stevel 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   2158      0      stevel 		    tcp->tcp_eager_prev_q0;
   2159      0      stevel 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   2160      0      stevel 		    tcp->tcp_eager_next_q0;
   2161      0      stevel 		tcp->tcp_eager_prev_q0 = NULL;
   2162      0      stevel 		tcp->tcp_eager_next_q0 = NULL;
   2163      0      stevel 		tcp->tcp_conn_def_q0 = B_FALSE;
   2164   3104    jprakash 
   2165   3104    jprakash 		/* Make sure the tcp isn't in the list of droppables */
   2166   3104    jprakash 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
   2167   3104    jprakash 		    tcp->tcp_eager_prev_drop_q0 == NULL);
   2168      0      stevel 
   2169      0      stevel 		/*
   2170      0      stevel 		 * Insert at end of the queue because sockfs sends
   2171      0      stevel 		 * down T_CONN_RES in chronological order. Leaving
   2172      0      stevel 		 * the older conn indications at front of the queue
   2173      0      stevel 		 * helps reducing search time.
   2174      0      stevel 		 */
   2175      0      stevel 		tail = listener->tcp_eager_last_q;
   2176      0      stevel 		if (tail != NULL)
   2177      0      stevel 			tail->tcp_eager_next_q = tcp;
   2178      0      stevel 		else
   2179      0      stevel 			listener->tcp_eager_next_q = tcp;
   2180      0      stevel 		listener->tcp_eager_last_q = tcp;
   2181      0      stevel 		tcp->tcp_eager_next_q = NULL;
   2182      0      stevel 		mutex_exit(&listener->tcp_eager_lock);
   2183  11042        Erik 		putnext(tcp->tcp_connp->conn_rq, conn_ind);
   2184      0      stevel 	} else {
   2185      0      stevel 		mutex_exit(&listener->tcp_eager_lock);
   2186      0      stevel 	}
   2187      0      stevel 
   2188      0      stevel 	/*
   2189      0      stevel 	 * Done with the acceptor - free it
   2190      0      stevel 	 *
   2191      0      stevel 	 * Note: from this point on, no access to listener should be made
   2192      0      stevel 	 * as listener can be equal to acceptor.
   2193      0      stevel 	 */
   2194      0      stevel finish:
   2195      0      stevel 	ASSERT(acceptor->tcp_detached);
   2196  11042        Erik 	acceptor->tcp_connp->conn_rq = NULL;
   2197   8348        Eric 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
   2198  11042        Erik 	acceptor->tcp_connp->conn_wq = NULL;
   2199      0      stevel 	(void) tcp_clean_death(acceptor, 0, 2);
   2200      0      stevel 	CONN_DEC_REF(acceptor->tcp_connp);
   2201      0      stevel 
   2202      0      stevel 	/*
   2203  11042        Erik 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
   2204  11042        Erik 	 *
   2205  11042        Erik 	 * It will update the setting for sockfs/stream head and also take
   2206  11042        Erik 	 * care of any data that arrived before accept() was called.
   2207  11042        Erik 	 * In case we already received a FIN then tcp_accept_finish will send up
   2208  11042        Erik 	 * the ordrel. It will also send up a window update if the window
   2209      0      stevel 	 * has opened up.
   2210      0      stevel 	 */
   2211      0      stevel 
   2212      0      stevel 	/*
   2213      0      stevel 	 * XXX: we currently have a problem if XTI application closes the
   2214      0      stevel 	 * acceptor stream in between. This problem exists in on10-gate also
   2215      0      stevel 	 * and is well known but nothing can be done short of major rewrite
   2216      0      stevel 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
   2217      0      stevel 	 * eager same squeue as listener (we can distinguish non socket
   2218  11042        Erik 	 * listeners at the time of handling a SYN in tcp_input_listener)
   2219      0      stevel 	 * and do most of the work that tcp_accept_finish does here itself
   2220      0      stevel 	 * and then get behind the acceptor squeue to access the acceptor
   2221      0      stevel 	 * queue.
   2222      0      stevel 	 */
   2223      0      stevel 	/*
   2224   8275        Eric 	 * We already have a ref on tcp so no need to do one before squeue_enter
   2225   8275        Eric 	 */
   2226  11042        Erik 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
   2227  11042        Erik 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
   2228  11042        Erik 	    SQTAG_TCP_ACCEPT_FINISH);
   2229      0      stevel }
   2230      0      stevel 
   2231      0      stevel /*
   2232      0      stevel  * Swap information between the eager and acceptor for a TLI/XTI client.
   2233      0      stevel  * The sockfs accept is done on the acceptor stream and control goes
   2234  11042        Erik  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
   2235      0      stevel  * called. In either case, both the eager and listener are in their own
   2236      0      stevel  * perimeter (squeue) and the code has to deal with potential race.
   2237      0      stevel  *
                            * On return the eager's conn_t uses the acceptor's conn_rq/conn_wq (and
                            * those queues' q_ptr point at the eager's conn_t), the eager has taken
                            * over the acceptor's conn_dev, conn_minor_arena, conn_cred, conn_cpid,
                            * zone fields and conn_mac_mode, the acceptor is marked detached, and
                            * the eager has been unlinked from the listener's eager lists.  One
                            * reference is added to the eager's conn_t and one is dropped from the
                            * acceptor's conn_t.
                            *
                            * listener: endpoint the eager is currently queued on; its
                            *	tcp_eager_lock is held only around the unlink.
                            * acceptor: attached, non-socket endpoint whose stream is handed over.
                            * eager: detached endpoint that becomes attached to that stream.
                            *
   2238  11042        Erik  * See the block comment on top of tcp_accept() and tcp_tli_accept().
   2239  11042        Erik  */
   2240  11042        Erik static void
   2241      0      stevel tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
   2242      0      stevel {
   2243      0      stevel 	conn_t	*econnp, *aconnp;
   2244  11042        Erik 
                           	/*
                           	 * Preconditions: the eager still shares the listener's read queue
                           	 * and is detached, the acceptor is still attached, and none of the
                           	 * three endpoints is a socket.
                           	 */
   2245  11042        Erik 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
   2246      0      stevel 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
   2247      0      stevel 	ASSERT(!TCP_IS_SOCKET(acceptor));
   2248      0      stevel 	ASSERT(!TCP_IS_SOCKET(eager));
   2249      0      stevel 	ASSERT(!TCP_IS_SOCKET(listener));
   2250   9710         Ken 
   2251   9710         Ken 	/*
   2252   9710         Ken 	 * Trusted Extensions may need to use a security label that is
   2253   9710         Ken 	 * different from the acceptor's label on MLP and MAC-Exempt
   2254   9710         Ken 	 * sockets. If this is the case, the required security label
   2255  11042        Erik 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
   2256  11042        Erik 	 * acceptor stream refer to econnp we automatically get that label.
   2257  11042        Erik 	 */
   2258      0      stevel 
   2259      0      stevel 	acceptor->tcp_detached = B_TRUE;
   2260      0      stevel 	/*
   2261      0      stevel 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
   2262      0      stevel 	 * the acceptor id.
   2263      0      stevel 	 */
   2264      0      stevel 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
   2265      0      stevel 
   2266      0      stevel 	/* remove eager from listen list... */
   2267      0      stevel 	mutex_enter(&listener->tcp_eager_lock);
   2268      0      stevel 	tcp_eager_unlink(eager);
                           	/* After the unlink the eager must carry no q or q0 list linkage. */
   2269      0      stevel 	ASSERT(eager->tcp_eager_next_q == NULL &&
   2270      0      stevel 	    eager->tcp_eager_last_q == NULL);
   2271      0      stevel 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
   2272      0      stevel 	    eager->tcp_eager_prev_q0 == NULL);
   2273      0      stevel 	mutex_exit(&listener->tcp_eager_lock);
   2274  11042        Erik 
                           	/*
                           	 * Attach the eager's conn_t to the acceptor's stream: copy the
                           	 * queue pointers and make both queues refer back to econnp.
                           	 */
   2275  11042        Erik 	econnp = eager->tcp_connp;
   2276  11042        Erik 	aconnp = acceptor->tcp_connp;
   2277  11042        Erik 	econnp->conn_rq = aconnp->conn_rq;
   2278  11042        Erik 	econnp->conn_wq = aconnp->conn_wq;
   2279  11042        Erik 	econnp->conn_rq->q_ptr = econnp;
   2280  11042        Erik 	econnp->conn_wq->q_ptr = econnp;
   2281   2504        meem 
   2282   2504        meem 	/*
   2283   2504        meem 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
   2284   2504        meem 	 * which might be a different squeue from our peer TCP instance.
   2285   2504        meem 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
   2286   2504        meem 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
   2287  11042        Erik 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
   2288   2504        meem 	 * above reach global visibility prior to the clearing of tcp_detached.
   2289   2504        meem 	 */
   2290   2504        meem 	membar_producer();
   2291      0      stevel 	eager->tcp_detached = B_FALSE;
   2292      0      stevel 
   2293      0      stevel 	ASSERT(eager->tcp_ack_tid == 0);
   2294      0      stevel 
                           	/* The eager takes over the acceptor's device and minor arena. */
   2295      0      stevel 	econnp->conn_dev = aconnp->conn_dev;
   2296   5815    gt145670 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
   2297   9710         Ken 
   2298   5815    gt145670 	ASSERT(econnp->conn_minor_arena != NULL);
                           	/*
                           	 * Pass any credential the eager already holds to crfree(), then
                           	 * move the acceptor's credential pointer over.  The acceptor's
                           	 * pointer is cleared so only econnp refers to it afterwards.
                           	 */
   2299  11042        Erik 	if (econnp->conn_cred != NULL)
   2300  11042        Erik 		crfree(econnp->conn_cred);
   2301  11042        Erik 	econnp->conn_cred = aconnp->conn_cred;
   2302   9710         Ken 	aconnp->conn_cred = NULL;
   2303  11042        Erik 	econnp->conn_cpid = aconnp->conn_cpid;
   2304   3448    dh155122 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
   2305   3448    dh155122 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
   2306   3448    dh155122 
                           	/* Zone identity follows the acceptor, including in the eager's ixa. */
   2307      0      stevel 	econnp->conn_zoneid = aconnp->conn_zoneid;
   2308   2834    sommerfe 	econnp->conn_allzones = aconnp->conn_allzones;
   2309  11042        Erik 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
   2310  11042        Erik 
                           	/* Copy the MAC mode, then reset the acceptor's to the default. */
   2311  11042        Erik 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
   2312  11042        Erik 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
   2313  10934  sommerfeld 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
   2314      0      stevel 
   2315      0      stevel 	/* Do the IPC initialization */
   2316      0      stevel 	CONN_INC_REF(econnp);
   2317      0      stevel 
   2318      0      stevel 	/* Done with old IPC. Drop its ref on its connp */
   2319      0      stevel 	CONN_DEC_REF(aconnp);
   2320      0      stevel }
   2321      0      stevel 
   2322      0      stevel 
   2323      0      stevel /*
   2324      0      stevel  * Adapt to the information, such as rtt and rtt_sd, provided from the
   2325  11042        Erik  * DCE and IRE maintained by IP.
   2326      0      stevel  *
   2327      0      stevel  * Checks for multicast and broadcast destination address.
   2328  11042        Erik  * Returns zero if ok; an errno on failure.
                            * The error, when non-zero, is whatever conn_connect() or
                            * tcp_build_hdrs() returned; no other failure path exists here.
   2329      0      stevel  *
                            * tcp: the connection being set up.  Its conn_t (tcp->tcp_connp) is
                            *	passed to conn_connect(), and the metrics returned in the iulp_t
                            *	are folded into the tcp_t/conn_t fields below.
                            *
   2330      0      stevel  * Note that the MSS calculation here is based on the info given in
   2331  11042        Erik  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
   2332  11042        Erik  * will be handled in tcp_input_data() when TCP knows which options to use.
   2333      0      stevel  *
   2334      0      stevel  * Note on how TCP gets its parameters for a connection.
   2335      0      stevel  *
   2336      0      stevel  * When a tcp_t structure is allocated, it gets all the default parameters.
   2337  11042        Erik  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
   2338      0      stevel  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
   2339   7502       aruna  * default.
   2340      0      stevel  *
   2341  11042        Erik  * An incoming SYN with a multicast or broadcast destination address is dropped
   2342  11042        Erik  * in ip_fanout_v4/v6.
   2343      0      stevel  *
   2344      0      stevel  * An incoming SYN with a multicast or broadcast source address is always
   2345  11042        Erik  * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
   2346  11042        Erik  * conn_connect.
   2347  11042        Erik  * The same logic in tcp_set_destination also serves to
   2348      0      stevel  * reject an attempt to connect to a broadcast or multicast (destination)
   2349      0      stevel  * address.
   2350      0      stevel  */
   2351      0      stevel static int
   2352  11042        Erik tcp_set_destination(tcp_t *tcp)
   2353  11042        Erik {
   2354      0      stevel 	uint32_t	mss_max;
   2355      0      stevel 	uint32_t	mss;
                           	/* Sampled once on entry; selects the active vs. passive branch below. */
   2356      0      stevel 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
   2357      0      stevel 	conn_t		*connp = tcp->tcp_connp;
   2358  11042        Erik 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2359  11042        Erik 	iulp_t		uinfo;
   2360  11042        Erik 	int		error;
   2361  11042        Erik 	uint32_t	flags;
   2362  11042        Erik 
   2363  11042        Erik 	flags = IPDF_LSO | IPDF_ZCOPY;
   2364  11042        Erik 	/*
   2365  11042        Erik 	 * Make sure we have a dce for the destination to avoid dce_ident
   2366  11042        Erik 	 * contention for connected sockets.
   2367  11042        Erik 	 */
   2368  11042        Erik 	flags |= IPDF_UNIQUE_DCE;
   2369  11042        Erik 
                           	/* Path MTU discovery stays enabled unless the stack says to ignore it. */
   2370  11042        Erik 	if (!tcps->tcps_ignore_path_mtu)
   2371  11042        Erik 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
   2372  11042        Erik 
   2373  11042        Erik 	/* Use conn_lock to satisfy ASSERT; tcp is already serialized */
   2374  11042        Erik 	mutex_enter(&connp->conn_lock);
   2375  11042        Erik 	error = conn_connect(connp, &uinfo, flags);
   2376  11042        Erik 	mutex_exit(&connp->conn_lock);
   2377  11042        Erik 	if (error != 0)
   2378  11042        Erik 		return (error);
   2379  11042        Erik 
   2380  11042        Erik 	error = tcp_build_hdrs(tcp);
   2381  11042        Erik 	if (error != 0)
   2382  11042        Erik 		return (error);
   2383  11042        Erik 
   2384  11042        Erik 	tcp->tcp_localnet = uinfo.iulp_localnet;
   2385  11042        Erik 
                           	/*
                           	 * When a cached RTT is available, seed the smoothed RTT state from
                           	 * it and derive an initial RTO, clamped to the stack's
                           	 * [tcps_rexmit_interval_min, tcps_rexmit_interval_max] range.
                           	 */
   2386  11042        Erik 	if (uinfo.iulp_rtt != 0) {
   2387  11042        Erik 		clock_t	rto;
   2388  11042        Erik 
   2389  11042        Erik 		tcp->tcp_rtt_sa = uinfo.iulp_rtt;
   2390  11042        Erik 		tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
   2391  11042        Erik 		rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   2392  11042        Erik 		    tcps->tcps_rexmit_interval_extra +
   2393  11042        Erik 		    (tcp->tcp_rtt_sa >> 5);
   2394  11042        Erik 
   2395  11042        Erik 		if (rto > tcps->tcps_rexmit_interval_max) {
   2396  11042        Erik 			tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
   2397  11042        Erik 		} else if (rto < tcps->tcps_rexmit_interval_min) {
   2398  11042        Erik 			tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   2399  11042        Erik 		} else {
   2400  11042        Erik 			tcp->tcp_rto = rto;
   2401  11042        Erik 		}
   2402  11042        Erik 	}
                           	/* A cached ssthresh wins; otherwise start from TCP_MAX_LARGEWIN. */
   2403  11042        Erik 	if (uinfo.iulp_ssthresh != 0)
   2404  11042        Erik 		tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
   2405  11042        Erik 	else
   2406  11042        Erik 		tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
                           	/*
                           	 * A cached send pipe size, capped at tcps_max_buf, replaces the
                           	 * send buffer size; the low-water mark is re-derived from it only
                           	 * when tcps_snd_lowat_fraction is non-zero (it is the divisor).
                           	 */
   2407  11042        Erik 	if (uinfo.iulp_spipe > 0) {
   2408  11042        Erik 		connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
   2409  11042        Erik 		    tcps->tcps_max_buf);
   2410  11042        Erik 		if (tcps->tcps_snd_lowat_fraction != 0) {
   2411  11042        Erik 			connp->conn_sndlowat = connp->conn_sndbuf /
   2412  11042        Erik 			    tcps->tcps_snd_lowat_fraction;
   2413  11042        Erik 		}
   2414  11042        Erik 		(void) tcp_maxpsz_set(tcp, B_TRUE);
   2415  11042        Erik 	}
   2416  11042        Erik 	/*
   2417  11042        Erik 	 * Note that up till now, acceptor always inherits receive
   2418  11042        Erik 	 * window from the listener.  But if there is a metrics
   2419  11042        Erik 	 * associated with a host, we should use that instead of
   2420  11042        Erik 	 * inheriting it from listener. Thus we need to pass this
   2421  11042        Erik 	 * info back to the caller.
   2422  11042        Erik 	 */
   2423  11042        Erik 	if (uinfo.iulp_rpipe > 0) {
   2424  11042        Erik 		tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
   2425  11042        Erik 		    tcps->tcps_max_buf);
   2426  11042        Erik 	}
   2427  11042        Erik 
   2428  11042        Erik 	if (uinfo.iulp_rtomax > 0) {
   2429  11042        Erik 		tcp->tcp_second_timer_threshold =
   2430  11042        Erik 		    uinfo.iulp_rtomax;
   2431  11042        Erik 	}
   2432  11042        Erik 
   2433  11042        Erik 	/*
   2434  11042        Erik 	 * Use the metric option settings, iulp_tstamp_ok and
   2435  11042        Erik 	 * iulp_wscale_ok, only for active open. What this means
   2436  11042        Erik 	 * is that if the other side uses timestamp or window
   2437  11042        Erik 	 * scale option, TCP will also use those options. That
   2438  11042        Erik 	 * is for passive open.  If the application sets a
   2439  11042        Erik 	 * large window, window scale is enabled regardless of
   2440  11042        Erik 	 * the value in iulp_wscale_ok.  This is the behavior
   2441  11042        Erik 	 * since 2.6.  So we keep it.
   2442  11042        Erik 	 * The only case left in passive open processing is the
   2443  11042        Erik 	 * check for SACK.
   2444  11042        Erik 	 * For ECN, it should probably be like SACK.  But the
   2445  11042        Erik 	 * current value is binary, so we treat it like the other
   2446  11042        Erik 	 * cases.  The metric only controls active open.  For passive
   2447  11042        Erik 	 * open, the ndd param, tcp_ecn_permitted, controls the
   2448  11042        Erik 	 * behavior.
   2449  11042        Erik 	 */
   2450  11042        Erik 	if (!tcp_detached) {
   2451  11042        Erik 		/*
   2452  11042        Erik 		 * The if check means that the following can only
   2453  11042        Erik 		 * be turned on by the metrics only IRE, but not off.
   2454  11042        Erik 		 */
   2455  11042        Erik 		if (uinfo.iulp_tstamp_ok)
   2456  11042        Erik 			tcp->tcp_snd_ts_ok = B_TRUE;
   2457  11042        Erik 		if (uinfo.iulp_wscale_ok)
   2458  11042        Erik 			tcp->tcp_snd_ws_ok = B_TRUE;
   2459  11042        Erik 		if (uinfo.iulp_sack == 2)
   2460  11042        Erik 			tcp->tcp_snd_sack_ok = B_TRUE;
   2461  11042        Erik 		if (uinfo.iulp_ecn_ok)
   2462  11042        Erik 			tcp->tcp_ecn_ok = B_TRUE;
   2463  11042        Erik 	} else {
   2464  11042        Erik 		/*
   2465  11042        Erik 		 * Passive open.
   2466  11042        Erik 		 *
   2467  11042        Erik 		 * As above, the if check means that SACK can only be
   2468  11042        Erik 		 * turned on by the metric only IRE.
   2469  11042        Erik 		 */
   2470  11042        Erik 		if (uinfo.iulp_sack > 0) {
   2471  11042        Erik 			tcp->tcp_snd_sack_ok = B_TRUE;
   2472  11042        Erik 		}
   2473  11042        Erik 	}
   2474  11042        Erik 
   2475  11042        Erik 	/*
   2476  11042        Erik 	 * XXX Note that currently, iulp_mtu can be as small as 68
   2477      0      stevel 	 * because of PMTUd.  So tcp_mss may go to negative if combined
   2478      0      stevel 	 * length of all those options exceeds 28 bytes.  But because
   2479      0      stevel 	 * of the tcp_mss_min check below, we may not have a problem if
   2480      0      stevel 	 * tcp_mss_min is of a reasonable value.  The default is 1 so
   2481      0      stevel 	 * the negative problem still exists.  And the check defeats PMTUd.
   2482      0      stevel 	 * In fact, if PMTUd finds that the MSS should be smaller than
   2483      0      stevel 	 * tcp_mss_min, TCP should turn off PMTUd and use the tcp_mss_min
   2484      0      stevel 	 * value.
   2485      0      stevel 	 *
   2486      0      stevel 	 * We do not deal with that now.  All those problems related to
   2487      0      stevel 	 * PMTUd will be fixed later.
   2488      0      stevel 	 */
   2489  11042        Erik 	ASSERT(uinfo.iulp_mtu != 0);
   2490  11042        Erik 	mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
   2491      0      stevel 
   2492      0      stevel 	/* Sanity check for MSS value. */
   2493  11042        Erik 	if (connp->conn_ipversion == IPV4_VERSION)
   2494   3448    dh155122 		mss_max = tcps->tcps_mss_max_ipv4;
   2495      0      stevel 	else
   2496   3448    dh155122 		mss_max = tcps->tcps_mss_max_ipv6;
   2497      0      stevel 
                           	/* Compute the IPsec overhead only if it has not been set yet. */
   2498      0      stevel 	if (tcp->tcp_ipsec_overhead == 0)
   2499      0      stevel 		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
   2500      0      stevel 
                           	/*
                           	 * NOTE(review): mss is a uint32_t, so if tcp_ipsec_overhead ever
                           	 * exceeds the MTU this subtraction wraps to a large value instead
                           	 * of going negative, and the clamps below would then select mss_max
                           	 * rather than tcps_mss_min -- confirm the overhead is always smaller.
                           	 */
   2501      0      stevel 	mss -= tcp->tcp_ipsec_overhead;
   2502      0      stevel 
   2503   3448    dh155122 	if (mss < tcps->tcps_mss_min)
   2504   3448    dh155122 		mss = tcps->tcps_mss_min;
   2505      0      stevel 	if (mss > mss_max)
   2506      0      stevel 		mss = mss_max;
   2507      0      stevel 
   2508      0      stevel 	/* Note that this is the maximum MSS, excluding all options. */
   2509      0      stevel 	tcp->tcp_mss = mss;
   2510      0      stevel 
   2511      0      stevel 	/*
   2512  11042        Erik 	 * Update the tcp connection with LSO capability.
   2513  11042        Erik 	 */
   2514  11042        Erik 	tcp_update_lso(tcp, connp->conn_ixa);
   2515  11042        Erik 
   2516  11042        Erik 	/*
   2517      0      stevel 	 * Initialize the ISS here now that we have the full connection ID.
   2518      0      stevel 	 * The RFC 1948 method of initial sequence number generation requires
   2519      0      stevel 	 * knowledge of the full connection ID before setting the ISS.
   2520      0      stevel 	 */
   2521      0      stevel 	tcp_iss_init(tcp);
   2522      0      stevel 
                           	/* Set when either iulp_loopback or iulp_local is reported. */
   2523  11042        Erik 	tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
   2524  11042        Erik 
   2525      0      stevel 	/*
   2526      0      stevel 	 * Make sure that conn is not marked incipient
   2527      0      stevel 	 * for incoming connections. A blind
   2528      0      stevel 	 * removal of incipient flag is cheaper than
   2529      0      stevel 	 * check and removal.
   2530      0      stevel 	 */
   2531  11042        Erik 	mutex_enter(&connp->conn_lock);
   2532      0      stevel 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   2533      0      stevel 	mutex_exit(&connp->conn_lock);
   2534      0      stevel 	return (0);
   2535      0      stevel }
   2536      0      stevel 
   2537   8348        Eric static void
   2538   8348        Eric tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
   2539   8348        Eric {
   2540   8348        Eric 	int	error;
   2541   8348        Eric 	conn_t	*connp = tcp->tcp_connp;
   2542   8348        Eric 	struct sockaddr	*sa;
   2543   8348        Eric 	mblk_t  *mp1;
   2544   8348        Eric 	struct T_bind_req *tbr;
   2545   8348        Eric 	int	backlog;
   2546   8348        Eric 	socklen_t	len;
   2547      0      stevel 	sin_t	*sin;
   2548      0      stevel 	sin6_t	*sin6;
   2549   8778        Erik 	cred_t		*cr;
   2550   8778        Erik 
   2551   8778        Erik 	/*
   2552   8778        Erik 	 * All Solaris components should pass a db_credp
   2553   8778        Erik 	 * for this TPI message, hence we ASSERT.
   2554   8778        Erik 	 * But in case there is some other M_PROTO that looks
   2555   8778        Erik 	 * like a TPI message sent by some other kernel
   2556   8778        Erik 	 * component, we check and return an error.
   2557   8778        Erik 	 */
   2558   8778        Erik 	cr = msg_getcred(mp, NULL);
   2559   8778        Erik 	ASSERT(cr != NULL);
   2560   8778        Erik 	if (cr == NULL) {
   2561   8778        Erik 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   2562   8778        Erik 		return;
   2563   8778        Erik 	}
   2564      0      stevel 
   2565      0      stevel 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   2566      0      stevel 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
   2567  11042        Erik 		if (connp->conn_debug) {
   2568    741    masputra 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2569   8348        Eric 			    "tcp_tpi_bind: bad req, len %u",
   2570      0      stevel 			    (uint_t)(mp->b_wptr - mp->b_rptr));
   2571      0      stevel 		}
   2572      0      stevel 		tcp_err_ack(tcp, mp, TPROTO, 0);
   2573      0      stevel 		return;
   2574      0      stevel 	}
   2575      0      stevel 	/* Make sure the largest address fits */
   2576  11042        Erik 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
   2577      0      stevel 	if (mp1 == NULL) {
   2578      0      stevel 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   2579      0      stevel 		return;
   2580      0      stevel 	}
   2581      0      stevel 	mp = mp1;
   2582      0      stevel 	tbr = (struct T_bind_req *)mp->b_rptr;
   2583   8348        Eric 
   2584   8348        Eric 	backlog = tbr->CONIND_number;
   2585   8348        Eric 	len = tbr->ADDR_length;
   2586   8348        Eric 
   2587   8348        Eric 	switch (len) {
   2588   8348        Eric 	case 0:		/* request for a generic port */
   2589      0      stevel 		tbr->ADDR_offset = sizeof (struct T_bind_req);
   2590  11042        Erik 		if (connp->conn_family == AF_INET) {
   2591      0      stevel 			tbr->ADDR_length = sizeof (sin_t);
   2592      0      stevel 			sin = (sin_t *)&tbr[1];
   2593      0      stevel 			*sin = sin_null;
   2594      0      stevel 			sin->sin_family = AF_INET;
   2595   8348        Eric 			sa = (struct sockaddr *)sin;
   2596   8348        Eric 			len = sizeof (sin_t);
   2597      0      stevel 			mp->b_wptr = (uchar_t *)&sin[1];
   2598      0      stevel 		} else {
   2599  11042        Erik 			ASSERT(connp->conn_family == AF_INET6);
   2600      0      stevel 			tbr->ADDR_length = sizeof (sin6_t);
   2601      0      stevel 			sin6 = (sin6_t *)&tbr[1];
   2602      0      stevel 			*sin6 = sin6_null;
   2603      0      stevel 			sin6->sin6_family = AF_INET6;
   2604   8348        Eric 			sa = (struct sockaddr *)sin6;
   2605   8348        Eric 			len = sizeof (sin6_t);
   2606      0      stevel 			mp->b_wptr = (uchar_t *)&sin6[1];
   2607   8348        Eric 		}
   2608   8348        Eric 		break;
   2609   8348        Eric 
   2610   8348        Eric 	case sizeof (sin_t):    /* Complete IPv4 address */
   2611   8348        Eric 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
   2612      0      stevel 		    sizeof (sin_t));
   2613      0      stevel 		break;
   2614      0      stevel 
   2615      0      stevel 	case sizeof (sin6_t): /* Complete IPv6 address */
   2616   8348        Eric 		sa = (struct sockaddr *)mi_offset_param(mp,
   2617      0      stevel 		    tbr->ADDR_offset, sizeof (sin6_t));
   2618      0      stevel 		break;
   2619      0      stevel 
   2620      0      stevel 	default:
   2621  11042        Erik 		if (connp->conn_debug) {
   2622    741    masputra 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2623   8348        Eric 			    "tcp_tpi_bind: bad address length, %d",
   2624      0      stevel 			    tbr->ADDR_length);
   2625      0      stevel 		}
   2626      0      stevel 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   2627      0      stevel 		return;
   2628      0      stevel 	}
   2629   8348        Eric 
   2630   9395         Rao 	if (backlog > 0) {
   2631   9395         Rao 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
   2632   9395         Rao 		    tbr->PRIM_type != O_T_BIND_REQ);
   2633   9395         Rao 	} else {
   2634   9395         Rao 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
   2635   9395         Rao 		    tbr->PRIM_type != O_T_BIND_REQ);
   2636   8348        Eric 	}
   2637   8348        Eric done:
   2638   8348        Eric 	if (error > 0) {
   2639   8348        Eric 		tcp_err_ack(tcp, mp, TSYSERR, error);
   2640   8348        Eric 	} else if (error < 0) {
   2641   8348        Eric 		tcp_err_ack(tcp, mp, -error, 0);
   2642   8348        Eric 	} else {
   2643   9395         Rao 		/*
   2644   9395         Rao 		 * Update port information as sockfs/tpi needs it for checking
   2645   9395         Rao 		 */
   2646  11042        Erik 		if (connp->conn_family == AF_INET) {
   2647   9395         Rao 			sin = (sin_t *)sa;
   2648  11042        Erik 			sin->sin_port = connp->conn_lport;
   2649   9395         Rao 		} else {
   2650   9395         Rao 			sin6 = (sin6_t *)sa;
   2651  11042        Erik 			sin6->sin6_port = connp->conn_lport;
   2652   9395         Rao 		}
   2653   8348        Eric 		mp->b_datap->db_type = M_PCPROTO;
   2654   8348        Eric 		tbr->PRIM_type = T_BIND_ACK;
   2655  11042        Erik 		putnext(connp->conn_rq, mp);
   2656   8348        Eric 	}
   2657   8348        Eric }
   2658      0      stevel 
   2659      0      stevel /*
                            * Try to bind "tcp" to a local port and, on success, insert it in the
                            * bind hash table.
                            *
   2660      0      stevel  * If the "bind_to_req_port_only" parameter is set, if the requested port
   2661      0      stevel  * number is available, return it. If not, return 0.
   2662      0      stevel  *
   2663      0      stevel  * If "bind_to_req_port_only" parameter is not set and
   2664      0      stevel  * If the requested port number is available, return it.  If not, return
   2665      0      stevel  * the first anonymous port we happen across.  If no anonymous ports are
   2666      0      stevel  * available, return 0. addr is the requested local address, if any.
   2667      0      stevel  *
   2668      0      stevel  * In either case, when succeeding update the tcp_t to record the port number
   2669      0      stevel  * and insert it in the bind hash table.
   2670      0      stevel  *
   2671      0      stevel  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
   2672      0      stevel  * without setting SO_REUSEADDR. This is needed so that they
   2673      0      stevel  * can be viewed as two independent transport protocols.
                            *
                            * Parameters:
                            *	tcp			endpoint being bound; on success its state
                            *				becomes TCPS_BOUND and conn_lport is set.
                            *	port			first port to try, in host byte order.
                            *	laddr			requested local address; compared against
                            *				the conn_bound_addr_v6 of existing entries.
                            *	reuseaddr		non-zero if SO_REUSEADDR is set on the
                            *				binding endpoint.
                            *	quick_connect		if set, an existing endpoint past
                            *				TCPS_LISTEN whose foreign port or foreign
                            *				address differs from ours is not treated
                            *				as a conflict.
                            *	bind_to_req_port_only	if set, only "port" is tried (loopmax = 1).
                            *	user_specified		"port" was supplied by the caller; a
                            *				successful bind to it does not update
                            *				tcps_next_port_to_try.
                            *
                            * Returns the bound port in host byte order, or 0 if no port was bound.
   2674      0      stevel  */
   2675      0      stevel static in_port_t
   2676    646    gt145670 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
   2677    646    gt145670     int reuseaddr, boolean_t quick_connect,
   2678      0      stevel     boolean_t bind_to_req_port_only, boolean_t user_specified)
   2679      0      stevel {
   2680      0      stevel 	/* number of times we have run around the loop */
   2681      0      stevel 	int count = 0;
   2682      0      stevel 	/* maximum number of times to run around the loop */
   2683      0      stevel 	int loopmax;
   2684   1676         jpk 	conn_t *connp = tcp->tcp_connp;
   2685   3448    dh155122 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2686      0      stevel 
   2687      0      stevel 	/*
   2688      0      stevel 	 * Lookup for free addresses is done in a loop and "loopmax"
   2689      0      stevel 	 * influences how long we spin in the loop
   2690      0      stevel 	 */
   2691      0      stevel 	if (bind_to_req_port_only) {
   2692      0      stevel 		/*
   2693      0      stevel 		 * If the requested port is busy, don't bother to look
   2694      0      stevel 		 * for a new one. Setting loop maximum count to 1 has
   2695      0      stevel 		 * that effect.
   2696      0      stevel 		 */
   2697      0      stevel 		loopmax = 1;
   2698      0      stevel 	} else {
   2699      0      stevel 		/*
   2700      0      stevel 		 * If the requested port is busy, look for a free one
   2701      0      stevel 		 * in the anonymous port range.
   2702      0      stevel 		 * Set loopmax appropriately so that one does not look
   2703      0      stevel 		 * forever in the case all of the anonymous ports are in use.
   2704      0      stevel 		 */
   2705  11042        Erik 		if (connp->conn_anon_priv_bind) {
   2706      0      stevel 			/*
   2707      0      stevel 			 * loopmax =
   2708      0      stevel 			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
   2709      0      stevel 			 */
   2710   3448    dh155122 			loopmax = IPPORT_RESERVED -
   2711   3448    dh155122 			    tcps->tcps_min_anonpriv_port;
   2712   3448    dh155122 		} else {
   2713   3448    dh155122 			loopmax = (tcps->tcps_largest_anon_port -
   2714   3448    dh155122 			    tcps->tcps_smallest_anon_port + 1);
   2715      0      stevel 		}
   2716      0      stevel 	}
   2717      0      stevel 	do {
   2718      0      stevel 		uint16_t	lport;
   2719      0      stevel 		tf_t		*tbf;
   2720      0      stevel 		tcp_t		*ltcp;
   2721   1676         jpk 		conn_t		*lconnp;
   2722      0      stevel 
   2723      0      stevel 		lport = htons(port);
   2724      0      stevel 
   2725      0      stevel 		/*
   2726      0      stevel 		 * Ensure that the tcp_t is not currently in the bind hash.
   2727      0      stevel 		 * Hold the lock on the hash bucket to ensure that
   2728      0      stevel 		 * the duplicate check plus the insertion is an atomic
   2729      0      stevel 		 * operation.
   2730      0      stevel 		 *
   2731      0      stevel 		 * This function does an inline lookup on the bind hash list.
   2732      0      stevel 		 * Make sure that we access only members of tcp_t
   2733      0      stevel 		 * and that we don't look at tcp_tcp, since we are not
   2734      0      stevel 		 * doing a CONN_INC_REF.
   2735      0      stevel 		 */
   2736      0      stevel 		tcp_bind_hash_remove(tcp);
   2737   3448    dh155122 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
   2738      0      stevel 		mutex_enter(&tbf->tf_lock);
                           		/*
                           		 * Find the first entry in this bucket with the same local
                           		 * port; the second loop then follows that entry's
                           		 * tcp_bind_hash_port chain looking for a conflict.
                           		 */
   2739      0      stevel 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
   2740      0      stevel 		    ltcp = ltcp->tcp_bind_hash) {
   2741  11042        Erik 			if (lport == ltcp->tcp_connp->conn_lport)
   2742   8348        Eric 				break;
   2743   8348        Eric 		}
   2744   8348        Eric 
   2745   8348        Eric 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
   2746   2429      kcpoon 			boolean_t not_socket;
   2747   2429      kcpoon 			boolean_t exclbind;
   2748   1676         jpk 
   2749   1676         jpk 			lconnp = ltcp->tcp_connp;
   2750   1676         jpk 
   2751   1676         jpk 			/*
   2752   1676         jpk 			 * On a labeled system, we must treat bindings to ports
   2753   1676         jpk 			 * on shared IP addresses by sockets with MAC exemption
   2754   1676         jpk 			 * privilege as being in all zones, as there's
   2755   1676         jpk 			 * otherwise no way to identify the right receiver.
   2756   1676         jpk 			 */
   2757  11042        Erik 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
   2758   1676         jpk 				continue;
   2759      0      stevel 
   2760      0      stevel 			/*
   2761      0      stevel 			 * If TCP_EXCLBIND is set for either the bound or
   2762      0      stevel 			 * binding endpoint, the semantics of bind
   2763      0      stevel 			 * is changed according to the following.
   2764      0      stevel 			 *
   2765      0      stevel 			 * spec = specified address (v4 or v6)
   2766      0      stevel 			 * unspec = unspecified address (v4 or v6)
   2767      0      stevel 			 * A = specified addresses are different for endpoints
   2768      0      stevel 			 *
   2769      0      stevel 			 * bound	bind to		allowed
   2770      0      stevel 			 * -------------------------------------
   2771      0      stevel 			 * unspec	unspec		no
   2772      0      stevel 			 * unspec	spec		no
   2773      0      stevel 			 * spec		unspec		no
   2774      0      stevel 			 * spec		spec		yes if A
   2775      0      stevel 			 *
   2776   1676         jpk 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
   2777   2429      kcpoon 			 * as TCP_EXCLBIND, except that zoneid is ignored.
   2778   1676         jpk 			 *
   2779      0      stevel 			 * Note:
   2780      0      stevel 			 *
   2781      0      stevel 			 * 1. Because of TLI semantics, an endpoint can go
   2782      0      stevel 			 * back from, say TCPS_ESTABLISHED to TCPS_LISTEN or
   2783      0      stevel 			 * TCPS_BOUND, depending on whether it is originally
   2784      0      stevel 			 * a listener or not.  That is why we need to check
   2785      0      stevel 			 * for states greater than or equal to TCPS_BOUND
   2786      0      stevel 			 * here.
   2787      0      stevel 			 *
   2788      0      stevel 			 * 2. Ideally, we should only check for state equals
   2789      0      stevel 			 * to TCPS_LISTEN. And the following check should be
   2790      0      stevel 			 * added.
   2791      0      stevel 			 *
   2792      0      stevel 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
   2793  11042        Erik 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
   2794      0      stevel 			 *		...
   2795      0      stevel 			 * }
   2796      0      stevel 			 *
   2797      0      stevel 			 * The semantics will be changed to this.  If the
   2798      0      stevel 			 * endpoint on the list is in state not equal to
   2799      0      stevel 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
   2800      0      stevel 			 * set, let the bind succeed.
   2801      0      stevel 			 *
   2802   2429      kcpoon 			 * Because of (1), we cannot do that for TLI
   2803   2429      kcpoon 			 * endpoints.  But we can do that for socket endpoints.
   2804   2429      kcpoon 			 * If in future, we can change this going back
   2805   2429      kcpoon 			 * semantics, we can use the above check for TLI also.
                           			 *
                           			 * NOTE(review): the test below applies the exclusive
                           			 * check to socket pairs only when the bound endpoint's
                           			 * state is <= TCPS_ESTABLISHED, which does not read
                           			 * the same as note (1) above -- confirm the intent.
   2806   2429      kcpoon 			 */
   2807   2429      kcpoon 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
   2808   2429      kcpoon 			    TCP_IS_SOCKET(tcp));
   2809  11042        Erik 			exclbind = lconnp->conn_exclbind ||
   2810  11042        Erik 			    connp->conn_exclbind;
   2811   2429      kcpoon 
   2812  10934  sommerfeld 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
   2813  10934  sommerfeld 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
   2814   2429      kcpoon 			    (exclbind && (not_socket ||
   2815   2429      kcpoon 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
                           				/*
                           				 * Exclusive semantics: busy if either address
                           				 * is unspecified or the two are equal (the
                           				 * "no" rows of the table above); otherwise
                           				 * move on to the next entry.
                           				 */
   2816      0      stevel 				if (V6_OR_V4_INADDR_ANY(
   2817  11042        Erik 				    lconnp->conn_bound_addr_v6) ||
   2818      0      stevel 				    V6_OR_V4_INADDR_ANY(*laddr) ||
   2819      0      stevel 				    IN6_ARE_ADDR_EQUAL(laddr,
   2820  11042        Erik 				    &lconnp->conn_bound_addr_v6)) {
   2821      0      stevel 					break;
   2822      0      stevel 				}
   2823      0      stevel 				continue;
   2824      0      stevel 			}
   2825      0      stevel 
   2826      0      stevel 			/*
   2827      0      stevel 			 * Check ipversion to allow IPv4 and IPv6 sockets to
   2828      0      stevel 			 * have disjoint port number spaces, if *_EXCLBIND
   2829      0      stevel 			 * is not set and only if the application binds to a
   2830      0      stevel 			 * specific port. We use the same autoassigned port
   2831      0      stevel 			 * number space for IPv4 and IPv6 sockets.
   2832      0      stevel 			 */
   2833  11042        Erik 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
   2834      0      stevel 			    bind_to_req_port_only)
   2835      0      stevel 				continue;
   2836      0      stevel 
   2837    646    gt145670 			/*
   2838    646    gt145670 			 * Ideally, we should make sure that the source
   2839    646    gt145670 			 * address, remote address, and remote port in the
   2840    646    gt145670 			 * four tuple for this tcp-connection is unique.
   2841    646    gt145670 			 * However, trying to find out the local source
   2842    646    gt145670 			 * address would require too much code duplication
   2843    646    gt145670 			 * with IP, since IP needs to have that code
   2844    646    gt145670 			 * to support userland TCP implementations.
   2845    646    gt145670 			 */
   2846    646    gt145670 			if (quick_connect &&
   2847    646    gt145670 			    (ltcp->tcp_state > TCPS_LISTEN) &&
   2848  11042        Erik 			    ((connp->conn_fport != lconnp->conn_fport) ||
   2849  11042        Erik 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
   2850  11042        Erik 			    &lconnp->conn_faddr_v6)))
   2851    646    gt145670 				continue;
   2852    646    gt145670 
   2853      0      stevel 			if (!reuseaddr) {
   2854      0      stevel 				/*
   2855      0      stevel 				 * No socket option SO_REUSEADDR.
   2856      0      stevel 				 * If existing port is bound to
   2857      0      stevel 				 * a non-wildcard IP address
   2858      0      stevel 				 * and the requesting stream is
   2859      0      stevel 				 * bound to a different
   2860      0      stevel 				 * IP address
   2861      0      stevel 				 * (non-wildcard, also), keep
   2862      0      stevel 				 * going.
   2863      0      stevel 				 */
   2864      0      stevel 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
   2865      0      stevel 				    !V6_OR_V4_INADDR_ANY(
   2866  11042        Erik 				    lconnp->conn_bound_addr_v6) &&
   2867      0      stevel 				    !IN6_ARE_ADDR_EQUAL(laddr,
   2868  11042        Erik 				    &lconnp->conn_bound_addr_v6))
   2869      0      stevel 					continue;
   2870      0      stevel 				if (ltcp->tcp_state >= TCPS_BOUND) {
   2871      0      stevel 					/*
   2872      0      stevel 					 * This port is being used and
   2873      0      stevel 					 * its state is >= TCPS_BOUND,
   2874      0      stevel 					 * so we can't bind to it.
   2875      0      stevel 					 */
   2876      0      stevel 					break;
   2877      0      stevel 				}
   2878      0      stevel 			} else {
   2879      0      stevel 				/*
   2880      0      stevel 				 * socket option SO_REUSEADDR is set on the
   2881      0      stevel 				 * binding tcp_t.
   2882      0      stevel 				 *
   2883      0      stevel 				 * If two streams are bound to
   2884      0      stevel 				 * same IP address or both addr
   2885      0      stevel 				 * and bound source are wildcards
   2886      0      stevel 				 * (INADDR_ANY), we want to stop
   2887      0      stevel 				 * searching.
   2888      0      stevel 				 * We have found a match of IP source
   2889      0      stevel 				 * address and source port, which is
   2890      0      stevel 				 * refused regardless of the
   2891      0      stevel 				 * SO_REUSEADDR setting, so we break.
   2892      0      stevel 				 */
   2893      0      stevel 				if (IN6_ARE_ADDR_EQUAL(laddr,
   2894  11042        Erik 				    &lconnp->conn_bound_addr_v6) &&
   2895      0      stevel 				    (ltcp->tcp_state == TCPS_LISTEN ||
   2896   5031    rs200217 				    ltcp->tcp_state == TCPS_BOUND))
   2897      0      stevel 					break;
   2898      0      stevel 			}
   2899      0      stevel 		}
                           		/*
                           		 * ltcp is non-NULL only if the scan above stopped (via break)
                           		 * on a conflicting endpoint.
                           		 */
   2900      0      stevel 		if (ltcp != NULL) {
   2901      0      stevel 			/* The port number is busy */
   2902      0      stevel 			mutex_exit(&tbf->tf_lock);
   2903      0      stevel 		} else {
   2904      0      stevel 			/*
   2905      0      stevel 			 * This port is ours. Insert in fanout and mark as
   2906      0      stevel 			 * bound to prevent others from getting the port
   2907      0      stevel 			 * number.
   2908      0      stevel 			 */
   2909      0      stevel 			tcp->tcp_state = TCPS_BOUND;
   2910  11042        Erik 			connp->conn_lport = htons(port);
   2911      0      stevel 
   2912   3448    dh155122 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
   2913  11042        Erik 			    connp->conn_lport)] == tbf);
                           			/*
                           			 * tf_lock is still held here.
                           			 * NOTE(review): the last argument (1) presumably tells
                           			 * tcp_bind_hash_insert() that the caller already holds
                           			 * the bucket lock -- confirm against its definition.
                           			 */
   2914      0      stevel 			tcp_bind_hash_insert(tbf, tcp, 1);
   2915      0      stevel 
   2916      0      stevel 			mutex_exit(&tbf->tf_lock);
   2917      0      stevel 
   2918      0      stevel 			/*
   2919      0      stevel 			 * We don't want tcp_next_port_to_try to "inherit"
   2920      0      stevel 			 * a port number supplied by the user in a bind.
   2921      0      stevel 			 */
   2922      0      stevel 			if (user_specified)
   2923      0      stevel 				return (port);
   2924      0      stevel 
   2925      0      stevel 			/*
   2926      0      stevel 			 * This is the only place where tcp_next_port_to_try
   2927      0      stevel 			 * is updated. After the update, it may or may not
   2928      0      stevel 			 * be in the valid range.
   2929      0      stevel 			 */
   2930  11042        Erik 			if (!connp->conn_anon_priv_bind)
   2931   3448    dh155122 				tcps->tcps_next_port_to_try = port + 1;
   2932      0      stevel 			return (port);
   2933      0      stevel 		}
   2934      0      stevel 
                           		/* The port tried was busy; pick the next candidate. */
   2935  11042        Erik 		if (connp->conn_anon_priv_bind) {
   2936   1676         jpk 			port = tcp_get_next_priv_port(tcp);
   2937      0      stevel 		} else {
   2938      0      stevel 			if (count == 0 && user_specified) {
   2939      0      stevel 				/*
   2940      0      stevel 				 * We may have to return an anonymous port. So
   2941      0      stevel 				 * get one to start with.
   2942      0      stevel 				 */
   2943      0      stevel 				port =
   2944   3448    dh155122 				    tcp_update_next_port(
   2945   5031    rs200217 				    tcps->tcps_next_port_to_try,
   2946   5031    rs200217 				    tcp, B_TRUE);
   2947      0      stevel 				user_specified = B_FALSE;
   2948      0      stevel 			} else {
   2949   1676         jpk 				port = tcp_update_next_port(port + 1, tcp,
   2950   1676         jpk 				    B_FALSE);
   2951   1676         jpk 			}
   2952   1676         jpk 		}
                           		/* A candidate of 0 ends the search; we return 0 below. */
   2953   1676         jpk 		if (port == 0)
   2954   1676         jpk 			break;
   2955      0      stevel 
   2956      0      stevel 		/*
   2957      0      stevel 		 * Don't let this loop run forever in the case where
   2958      0      stevel 		 * all of the anonymous ports are in use.
   2959      0      stevel 		 */
   2960      0      stevel 	} while (++count < loopmax);
   2961      0      stevel 	return (0);
   2962      0      stevel }
   2963      0      stevel 
   2964      0      stevel /*
   2965   3104    jprakash  * tcp_clean_death / tcp_close_detached must not be called more than once
   2966   3104    jprakash  * on a tcp. Thus every function that potentially calls tcp_clean_death
   2967   3104    jprakash  * must check for the tcp state before calling tcp_clean_death.
   2968  11042        Erik  * E.g. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper,
   2969   3104    jprakash  * tcp_timer_handler, all check for the tcp state.
                            *
                            * This wrapper frees "mp" and then calls tcp_clean_death() with
                            * ETIMEDOUT and tag 5, but only if the tcp state is beyond TCPS_BOUND.
                            * "arg" is the conn_t of the connection; "arg2" and "dummy" are not
                            * referenced.
   2970   3104    jprakash  */
   2971   3104    jprakash /* ARGSUSED */
   2972   3104    jprakash void
   2973  11042        Erik tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
   2974  11042        Erik     ip_recv_attr_t *dummy)
   2975   3104    jprakash {
   2976   3104    jprakash 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   2977   3104    jprakash 
   2978   3104    jprakash 	freemsg(mp);
                           	/* State guard required by the rule in the block comment above. */
   2979   3104    jprakash 	if (tcp->tcp_state > TCPS_BOUND)
   2980   5031    rs200217 		(void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
   2981   5031    rs200217 		    ETIMEDOUT, 5);
   2982   3104    jprakash }
   2983   3104    jprakash 
   2984   3104    jprakash /*
   2985      0      stevel  * We are dying for some reason.  Try to do it gracefully.  (May be called
   2986      0      stevel  * as writer.)
   2987      0      stevel  *
   2988      0      stevel  * Return -1 if the structure was not cleaned up (if the cleanup had to be
   2989      0      stevel  * done by a service procedure).
   2990      0      stevel  * TBD - Should the return value distinguish between the tcp_t being
   2991      0      stevel  * freed and it being reinitialized?
   2992      0      stevel  */
   2993      0      stevel static int
   2994      0      stevel tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
   2995      0      stevel {
   2996      0      stevel 	mblk_t	*mp;
   2997      0      stevel 	queue_t	*q;
   2998   8348        Eric 	conn_t	*connp = tcp->tcp_connp;
   2999   3448    dh155122 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3000      0      stevel 
   3001      0      stevel 	TCP_CLD_STAT(tag);
   3002      0      stevel 
   3003      0      stevel #if TCP_TAG_CLEAN_DEATH
   3004      0      stevel 	tcp->tcp_cleandeathtag = tag;
   3005      0      stevel #endif
   3006      0      stevel 
   3007   2323    ethindra 	if (tcp->tcp_fused)
   3008   2323    ethindra 		tcp_unfuse(tcp);
   3009   2323    ethindra 
   3010      0      stevel 	if (tcp->tcp_linger_tid != 0 &&
   3011      0      stevel 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3012      0      stevel 		tcp_stop_lingering(tcp);
   3013      0      stevel 	}
   3014      0      stevel 
   3015      0      stevel 	ASSERT(tcp != NULL);
   3016  11042        Erik 	ASSERT((connp->conn_family == AF_INET &&
   3017  11042        Erik 	    connp->conn_ipversion == IPV4_VERSION) ||
   3018  11042        Erik 	    (connp->conn_family == AF_INET6 &&
   3019  11042        Erik 	    (connp->conn_ipversion == IPV4_VERSION ||
   3020  11042        Erik 	    connp->conn_ipversion == IPV6_VERSION)));
   3021      0      stevel 
   3022      0      stevel 	if (TCP_IS_DETACHED(tcp)) {
   3023      0      stevel 		if (tcp->tcp_hard_binding) {
   3024      0      stevel 			/*
   3025      0      stevel 			 * Its an eager that we are dealing with. We close the
   3026      0      stevel 			 * eager but in case a conn_ind has already gone to the
   3027      0      stevel 			 * listener, let tcp_accept_finish() send a discon_ind
   3028      0      stevel 			 * to the listener and drop the last reference. If the
   3029      0      stevel 			 * listener doesn't even know about the eager i.e. the
   3030      0      stevel 			 * conn_ind hasn't gone up, blow away the eager and drop
   3031      0      stevel 			 * the last reference as well. If the conn_ind has gone
   3032      0      stevel 			 * up, state should be BOUND. tcp_accept_finish
   3033      0      stevel 			 * will figure out that the connection has received a
   3034      0      stevel 			 * RST and will send a DISCON_IND to the application.
   3035      0      stevel 			 */
   3036      0      stevel 			tcp_closei_local(tcp);
   3037   3104    jprakash 			if (!tcp->tcp_tconnind_started) {
   3038   8348        Eric 				CONN_DEC_REF(connp);
   3039      0      stevel 			} else {
   3040      0      stevel 				tcp->tcp_state = TCPS_BOUND;
   3041      0      stevel 			}
   3042      0      stevel 		} else {
   3043      0      stevel 			tcp_close_detached(tcp);
   3044      0      stevel 		}
   3045      0      stevel 		return (0);
   3046      0      stevel 	}
   3047      0      stevel 
   3048   3448    dh155122 	TCP_STAT(tcps, tcp_clean_death_nondetached);
   3049      0      stevel 
   3050  11042