Home | History | Annotate | Download | only in rsmrdt
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the License).
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/CDDL.txt
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/CDDL.txt.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets [] replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  *
     26  * RSMRDT driver for RAC
     27  *
     28  */
     29 
     30 #pragma ident	"@(#)rsmrdt.c	1.70	08/05/20 SMI"
     31 
     32 const char opsrsm_version[] = "@(#)rsmrdt.c 1.70     08/05/20 SMI";
     33 
     34 #include <sys/types.h>
     35 #include <sys/errno.h>
     36 #include <sys/debug.h>
     37 #include <sys/stropts.h>
     38 #include <sys/stream.h>
     39 #include <sys/strlog.h>
     40 #include <sys/cmn_err.h>
     41 #include <sys/kmem.h>
     42 #include <sys/conf.h>
     43 #include <sys/stat.h>
     44 #include <sys/dlpi.h>
     45 #include <sys/modctl.h>
     46 #include <sys/kstat.h>
     47 #include <sys/ddi.h>
     48 #include <sys/sunddi.h>
     49 #include <sys/strsun.h>
     50 #include <sys/taskq.h>
     51 #include <sys/open.h>
     52 #include <sys/uio.h>
     53 #include <sys/cpuvar.h>
     54 #include <sys/atomic.h>
     55 
     56 #include <sys/rsm/rsm_common.h>
     57 #include <sys/rsm/rsmpi.h>
     58 
     59 #include "rsmrdt.h"		/* This driver's data structures */
     60 
     61 /* inter-module dependencies */
     62 char	_depends_on[] =	 "misc/rsmops";
     63 
     64 /*
     65  * Lock hierarchy:
     66  *
     67  * opsrsmp->opsrsm_lock
     68  * opsrsmdevlock
     69  *
     70  *	rd->rd_lock
     71  *	rd->rd_xmit_lock
     72  *	rd->rd_net_lock
     73  *
     74  *	opsrsm->opsrsm_dest_lock
     75  *	opsrsm->opsrsm_runq_lock
     76  *
     77  * rd->rd_nlb_lock -- currently never taken while another lock is held
     78  * opsrsmattlock
     79  * opsrsmdbglock
     80  */
     81 
     82 
     83 /*
     84  * Defining DEBUG on the compile line (-DDEBUG) will compile
     85  * debugging code into the driver.  Whether any debug output actually gets
     86  * printed depends on the value of opsrsmdbg, which determines the class of
     87  * messages that the user is interested in, and opsrsmdbgmode, which
     88  * determines how the user wants the messages to be produced.
     89  *
     90  * See the #defines for D1(), D2(), etc.  below for which bits in opsrsmdbg
     91  * cause which messages to get printed.
     92  *
     93  * The various types of output are controlled by bits in opsrsmdbgmode, as
     94  * follows.  Multiple types of output may be used at once, if desired.
     95  *
     96  * (opsrsmdbgmode & 1)	Use debugging log.
     97  * (opsrsmdbgmode & 2)	Use kernel printfs.
     98  */
     99 
    100 #ifndef lint
    101 
    102 #ifdef DEBUG
    103 
    104 int opsrsmdbg = 0x0100;
    105 int opsrsmdbgmode = 0x1;
    106 static void opsrsmconsole(const char *, ...);
    107 
    108 /* opsrsm function enter/exit, parameters, return values. */
    109 #define	D1								\
    110 	if (opsrsmdbg & 0x01)						\
    111 	    opsrsmdebug
    112 
    113 /* Additional function debugging. */
    114 #define	D2								\
    115 	if (opsrsmdbg & 0x02) 						\
    116 	    opsrsmdebug
    117 
    118 /* rsmpi interface routine enter/exit, parameters, return */
    119 #define	D4								\
    120 	if (opsrsmdbg & 0x08) 						\
    121 	    opsrsmdebug
    122 
    123 /* Latency timing output. */
    124 #define	D5								\
    125 	if (opsrsmdbg & 0x10) 						\
    126 	    opsrsmdebug
    127 
    128 /* Excessive debugging output */
    129 #define	D6								\
    130 	if (opsrsmdbg & 0x20) 						\
    131 	    opsrsmdebug
    132 
    133 /* debug message on the console */
    134 #define	DINFO								\
    135 	opsrsmconsole
    136 
    137 /* error message logged to the debug buffer */
    138 #define	DERR								\
    139 	if (opsrsmdbg & 0x100) 						\
    140 	    opsrsmdebug
    141 
    142 #else /* DEBUG */
    143 
    144 #define	D1	if (0) printf
    145 #define	D2	if (0) printf
    146 #define	D4	if (0) printf
    147 #define	D5	if (0) printf
    148 #define	D6	if (0) printf
    149 #define	DINFO	if (0) printf
    150 #define	DERR	if (0) printf
    151 
    152 #endif /* DEBUG */
    153 
    154 #else /* lint */
    155 
    156 #ifdef DEBUG
    157 int opsrsmdbg;
    158 int opsrsmdbgmode;
    159 #endif
    160 
    161 #define	D1	printf
    162 #define	D2	printf
    163 #define	D4	printf
    164 #define	D5	printf
    165 #define	D6	printf
    166 #define	DINFO	printf
    167 #define	DERR	printf
    168 #endif /* lint */
    169 
    170 /*
    171  * Function prototypes.
    172  */
    173 static int	opsrsm_open(dev_t *, int, int, struct cred *);
    174 static int	opsrsm_close(dev_t, int, int, struct cred *);
    175 static int	opsrsm_attach(dev_info_t *, ddi_attach_cmd_t);
    176 static int	opsrsm_detach(dev_info_t *, ddi_detach_cmd_t);
    177 static int	opsrsm_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
    178 static int	opsrsm_chpoll(dev_t, short, int, short *, struct pollhead **);
    179 static int	opsrsm_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
    180 
    181 static void	opsrsmwsrv(void *);
    182 static int	opsrsmcrexfer(opsrsm_t *, opsrsm_dest_t *);
    183 static int	opsrsmsconn(opsrsm_t *, opsrsm_dest_t *, int);
    184 static int	opsrsmconnxfer(opsrsm_t *, opsrsm_dest_t *);
    185 static int	opsrsmsack(opsrsm_dest_t *);
    186 static int	opsrsmsaccept(opsrsm_t *opsrsmp, opsrsm_dest_t *rd);
    187 
    188 static opsrsm_dest_t *opsrsm_connect(int, opsrsmresource_t *);
    189 static opsrsm_dest_t *opsrsmmkdest(adapter_t *, rsm_addr_t);
    190 static void	opsrsmsconntmo(void *);
    191 static void	opsrsmacktmo(void *);
    192 static void	opsrsmaccepttmo(void * arg);
    193 static void	opsrsmfreedesttmo(void *);
    194 
    195 static void	opsrsmmsghdlr_req_connect(opsrsm_dest_t *, opsrsm_msg_t *);
    196 static void	opsrsmmsghdlr_con_accept(opsrsm_dest_t *, opsrsm_msg_t *);
    197 static void	opsrsmmsghdlr_syncdqe(opsrsm_dest_t *, opsrsm_msg_t *);
    198 static void	opsrsmmsghdlr_default(opsrsm_dest_t *, opsrsm_msg_t *);
    199 
    200 static int	opsrsmgetstate(opsrsm_dest_t *);
    201 static void	opsrsmsetstate(opsrsm_dest_t *, int);
    202 static int	opsrsmmovestate(opsrsm_dest_t *, int, int newstate);
    203 static int	opsrsmread(opsrsm_dest_t *, int, int, int, ushort_t sap);
    204 static int	opsrsmuninit(adapter_t *adapterp);
    205 static boolean_t	opsrsmdest_refcnt_0(opsrsm_dest_t *);
    206 static int	opsrsmfreedest(adapter_t *adapter, rsm_addr_t);
    207 
    208 /* LINTED: E_STATIC_FUNC_CALLD_NOT_DEFINED */
    209 static void	opsrsmdebug(const char *, ...);
    210 static void	opsrsmerror(dev_info_t *,  const char *, ...);
    211 static void	opsrsmkstatinit(opsrsm_t *);
    212 static void	opsrsmkstatremove(opsrsm_t *opsrsmp);
    213 static void	opsrsmgetparam(dev_info_t *, opsrsm_t *);
    214 static void	opsrsmtakedown(opsrsm_t *, int);
    215 
    216 static void	opsrsmfreebuf(opsrsmbuf_t *);
    217 static void	opsrsmputfqe(opsrsm_dest_t *, int);
    218 static void	opsrsmputfqe_nolock(opsrsm_dest_t *, int);
    219 static opsrsm_queued_fqe_t *opsrsm_queued_fqe_alloc(opsrsm_dest_t *);
    220 static void opsrsm_queued_fqe_free(opsrsm_dest_t *, opsrsm_queued_fqe_t *);
    221 
    222 static void	opsrsmputdqes(opsrsm_dest_t *);
    223 static int	opsrsmavailfqe(opsrsm_dest_t *);
    224 static int	opsrsmavailfqe2(opsrsm_dest_t *);
    225 static int	opsrsmgetfqe(opsrsm_dest_t *, int *);
    226 static int	opsrsmgetdqe(opsrsm_dest_t *, int *, int *, int *, ushort_t *);
    227 static int	opsrsmsendmsg(opsrsm_dest_t *, uint8_t, opsrsm_msg_t *);
    228 
    229 rsm_intr_hand_ret_t opsrsm_rsm_intr_handler(rsm_controller_object_t *,
    230     rsm_intr_q_op_t, rsm_addr_t, void *, size_t, rsm_intr_hand_arg_t);
    231 
    232 static opsrsm_failover_info_t *opsrsm_finfo_add(opsrsm_dest_t *);
    233 static opsrsm_failover_info_t *opsrsm_finfo_lookup_by_local_skey(uint32_t);
    234 static opsrsm_failover_info_t *opsrsm_finfo_lookup_by_remote_skey(uint32_t);
    235 static int opsrsm_finfo_wait(uint32_t);
    236 static void opsrsm_finfo_wakeup(opsrsm_failover_info_t *, int);
    237 static void opsrsm_finfo_init(void);
    238 static void opsrsm_finfo_fini(void);
    239 static void opsrsm_finfo_destroy(void *);
    240 static void opsrsm_failover_thread(void *);
    241 static void opsrsm_lostconn(opsrsm_dest_t *);
    242 static void opsrsm_reset_all_rps(opsrsm_dest_t *rd);
    243 static void opsrsmmsghdlr_finfo(opsrsm_dest_t *, opsrsm_msg_t *);
    244 static void opsrsm_option_rexmit_end(mblk_t *, opsrsm_dest_t *);
    245 static int opsrsm_finfo_sendmsg(opsrsm_dest_t *, uint8_t, uint32_t);
    246 static mblk_t *opsrsm_alloc_ack_msg(uint32_t);
    247 static void opsrsm_queued_msg_send(opsrsm_dest_t *);
    248 static void opsrsm_queued_msg_flush(opsrsm_dest_t *);
    249 static void opsrsm_queued_msg_append(opsrsm_dest_t *, opsrsm_queued_msg_t *);
    250 
    251 extern void apply_on_all_adapters(void (*)(adapter_t *, void *), void *);
    252 static uint32_t opsrsm_pending_bytes = 0;
    253 
    254 static kmutex_t opsrsm_flow_tmo_lock;
    255 static timeout_id_t opsrsm_flow_tmo_id;
    256 static int opsrsm_flow_tmo_retries = 0;
    257 
    258 static void opsrsm_flow_tmo(void *);
    259 static void opsrsm_flow_tmo_cancel(void);
    260 static void opsrsm_flow_enable(adapter_t *, void *);
    261 static void opsrsm_sync_flow_ctl(void *);
    262 static void opsrsm_sync_flow_tmo(void *);
    263 static void opsrsm_set_sync_flow_tmo(opsrsm_dest_t *);
    264 static void opsrsm_cancel_sync_flow_tmo(opsrsm_dest_t *);
    265 static void opsrsm_check_flow_ctl(opsrsm_dest_t *);
    266 static int opsrsmdemux_loopback(mblk_t *);
    267 static void opsrsm_status_check_tmo(void *);
    268 
    269 taskq_t *opsrsm_failover_taskq;
    270 taskq_t *opsrsm_events_taskq;
    271 
    272 #define	OPSRSM_Q_LEN(q) ((q)->q_len)
    273 #define	OPSRSM_Q_HEAD(q) ((q)->q_head)
    274 #define	OPSRSM_Q_NEXT(q, mp) ((mp)->b_next)
    275 
    276 #define	OPSRSM_Q_INIT(q) {	\
    277 	(q)->q_head = NULL;	\
    278 	(q)->q_tail = NULL;	\
    279 	(q)->q_len = 0;		\
    280 }
    281 
    282 #define	OPSRSM_Q_APPEND(q, mp) {		\
    283 	ASSERT((mp)->b_next == NULL);		\
    284 	if ((q)->q_head == NULL) {		\
    285 		(q)->q_head = (mp);		\
    286 		(q)->q_tail = (mp);		\
    287 	} else {				\
    288 		(q)->q_tail->b_next = (mp);	\
    289 		(q)->q_tail = (mp);		\
    290 	}					\
    291 	(q)->q_len++;				\
    292 }
    293 
    294 #define	OPSRSM_Q_REMOVE(q, mp) {		\
    295 	ASSERT((q)->q_len > 0);			\
    296 	(mp) = (q)->q_head;			\
    297 	if ((q)->q_head == (q)->q_tail) {	\
    298 		(q)->q_tail = NULL;		\
    299 	}					\
    300 	(q)->q_head = (mp)->b_next;		\
    301 	(mp)->b_next = NULL;			\
    302 	(q)->q_len--;				\
    303 }
    304 
    305 #define	OPSRSM_Q_FLUSH(q) {			\
    306 	while ((q)->q_head != NULL) {		\
    307 		mblk_t *mp;			\
    308 						\
    309 		mp = (q)->q_head;		\
    310 		(q)->q_head = mp->b_next;	\
    311 		mp->b_prev = mp->b_next = NULL;	\
    312 		mp->b_cont = NULL;		\
    313 		freemsg(mp);			\
    314 	}					\
    315 	(q)->q_tail = NULL;			\
    316 	(q)->q_len = 0;				\
    317 }
    318 
    319 #define	OPSRSM_Q_CONCAT(q1, q2) {		\
    320 	if ((q1)->q_head == NULL) {		\
    321 		(q1)->q_head = (q2)->q_head;	\
    322 		(q1)->q_tail = (q2)->q_tail;	\
    323 		(q1)->q_len = (q2)->q_len;	\
    324 	} else {				\
    325 		if ((q2)->q_len > 0) {		\
    326 			(q1)->q_tail->b_next = (q2)->q_head;	\
    327 			(q1)->q_tail = (q2)->q_tail;		\
    328 			(q1)->q_len += (q2)->q_len;		\
    329 		}						\
    330 	}							\
    331 	(q2)->q_head = NULL;					\
    332 	(q2)->q_tail = NULL;					\
    333 	(q2)->q_len = 0;					\
    334 }
    335 
    336 #define	OPSRSM_REACHED_STATIC_DATA_THRESHOLD(rd) \
    337 	((rd)->rd_data_collected >= opsrsmdev-> \
    338 	opsrsm_param.opsrsm_data_threshold)
    339 
    340 #define	OPSRSM_REACHED_DATA_THRESHOLD(rd)				\
    341 	((opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 1) ?		\
    342 	((rd)->rd_data_collected >= (rd)->rd_adaptive_threshold) :	\
    343 	(OPSRSM_REACHED_STATIC_DATA_THRESHOLD(rd)))
    344 
    345 #define	OPSRSM_ADAPT_THRESHOLD(rd, pktlen) { 				\
    346 	if (opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 1) {	\
    347 		uint32_t diff =						\
    348 			(uint32_t)ddi_get_lbolt() - (rd)->rd_last_sent;	\
    349 		(rd)->rd_last_sent = (uint32_t)ddi_get_lbolt();		\
    350 		if (diff == 0) {					\
    351 			(rd)->rd_pkt_freq++;				\
    352 			if ((rd)->rd_pkt_freq > opsrsmdev->		\
    353 			    opsrsm_param.opsrsm_adaptive_rate) {	\
    354 				(rd)->rd_adaptive_threshold += pktlen;	\
    355 				(rd)->rd_pkt_freq = 0;			\
    356 			}						\
    357 			if ((rd)->rd_adaptive_threshold >		\
    358 			    opsrsmdev->opsrsm_param.			\
    359 			    opsrsm_data_threshold)			\
    360 				(rd)->rd_adaptive_threshold =		\
    361 				opsrsmdev->opsrsm_param.		\
    362 				opsrsm_data_threshold;			\
    363 		} else {						\
    364 			uint32_t reduce = 2 * pktlen * diff;		\
    365 			if (reduce > (rd)->rd_adaptive_threshold) {	\
    366 				(rd)->rd_adaptive_threshold = 0;	\
    367 			} else {					\
    368 				(rd)->rd_adaptive_threshold -= reduce;	\
    369 			}						\
    370 		}							\
    371 	}								\
    372 }
    373 
    374 #define	OPSRSM_NO_PENDING_WRITES(rd) \
    375 	((rd)->rd_writes_completed == OPSRSM_Q_LEN(&(rd)->rd_pendq))
    376 
    377 #define	OPSRSM_RSREF(rp) { 		\
    378 	mutex_enter(&(rp)->rs_lock);	\
    379 	(rp)->rs_refcnt++;		\
    380 	mutex_exit(&(rp)->rs_lock);	\
    381 }
    382 
    383 #define	OPSRSM_RSUNREF(rp) { 				\
    384 	mutex_enter(&(rp)->rs_lock);			\
    385 	(rp)->rs_refcnt--;				\
    386 	if ((rp)->rs_refcnt == 0) {			\
    387 		cv_broadcast(&(rp)->rs_close_cv);	\
    388 	}						\
    389 	mutex_exit(&(rp)->rs_lock);			\
    390 }
    391 
    392 #define	OPSRSM_LOOPBACK		0x100b
    393 #define	OPSRSM_IS_LOOPBACK(rd)	((uint32_t)(rd) == OPSRSM_LOOPBACK)
    394 
    395 static int opsrsm_start_batch(opsrsm_dest_t *, uint32_t);
    396 static int opsrsm_end_batch(opsrsm_dest_t *);
    397 static void opsrsm_xmit_tmo(void *);
    398 static void opsrsm_fqe_tmo(void *);
    399 static void opsrsm_dispatch_tmo(void *);
    400 static void opsrsm_set_xmit_tmo(opsrsm_dest_t *, int);
    401 static void opsrsm_set_fqe_tmo(opsrsm_dest_t *, int);
    402 static void opsrsm_cancel_xmit_tmo(opsrsm_dest_t *);
    403 static void opsrsm_cancel_fqe_tmo(opsrsm_dest_t *);
    404 static void opsrsmxmit_thread(void *);
    405 static int opsrsmxmit(opsrsm_dest_t *, mblk_t *);
    406 static int opsrsmrexmit(opsrsm_dest_t *);
    407 static int opsrsm_write_data(opsrsm_dest_t *, mblk_t *);
    408 static int opsrsm_sync_dqe(opsrsm_dest_t *);
    409 static int opsrsm_sync_fqe(opsrsm_dest_t *);
    410 static void opsrsm_wake_senders(opsrsm_dest_t *, short);
    411 static void opsrsm_sync_dqe_tmo(void *);
    412 static void opsrsm_sync_fqe_tmo(void *);
    413 static void opsrsmdemux(mblk_t *, opsrsm_dest_t *);
    414 static void opsrsm_event_thread(void *);
    415 static void opsrsm_event_add(opsrsm_dest_t *, uint32_t);
    416 
    417 static opsrsmresource_t *opsrsmresource_alloc(minor_t *);
    418 static opsrsmresource_t *opsrsm_resstruct_alloc();
    419 static opsrsmresource_t *opsrsmresource_free(minor_t rnum);
    420 static opsrsmresource_t *opsrsmresource_lookup(minor_t, int);
    421 static void opsrsmresource_destroy(void);
    422 static int opsrsm_resstruct_free(minor_t);
    423 static void opsrsmresource_init(void);
    424 static void opsrsmresource_fini(void);
    425 static struct opsrsmresource_table opsrsm_resource;
    426 
    427 static opsrsm_failover_info_t *opsrsm_finfo_list;
    428 static kmutex_t opsrsm_finfo_lock;
    429 static kcondvar_t opsrsm_finfo_cv;
    430 static int opsrsm_failover_threads;
    431 static int opsrsm_failover_max_retries = 6000;
    432 static int opsrsm_failover_destruct_time = 3000;
    433 static int opsrsm_queued_msg_max_retries = 2000;
    434 
    435 int rsmrdt_adapterinit(adapter_t *);
    436 int rsmrdt_adapterfini(adapter_t *);
    437 void rsmrdt_failover(adapter_t *, rsm_addr_t);
    438 int rsmrdt_check_openhandles(void);
    439 
    440 /* LINTED: E_STATIC_FUNC_CALLD_NOT_DEFINED */
    441 extern mblk_t	*desballoc(unsigned char *, size_t, uint_t, frtn_t *);
    442 extern void rsmrdt_pathmanager_init(void);
    443 extern void rsmrdt_pathmanager_cleanup(void);
    444 extern rsm_addr_t rsmrdt_get_remote_hwaddr(adapter_t *, rsm_node_id_t);
    445 extern adapter_t *rsmrdt_select_adapter(rsm_node_id_t, int);
    446 extern void rsmrdt_get_remote_ids(adapter_t *, rsm_addr_t, int *, int *);
    447 
    448 /*
    449  * The opsrsm driver implements a reference count scheme for destination
    450  * structures.  The idea behind the scheme is to prevent the driver from
    451  * deleting a destination structure while it is being used elsewhere, for
    452  * example in a message handling routine.  (Failures to protect against
    453  * this occurrence have led to a fair array of baffling bugs over the
    454  * lifetime of the driver.)
    455  *
    456  * The following set of macros implement the reference count scheme,
    457  * translation from RSM address to destination structure, and removal of
    458  * destinations from the run queue.  All must be intertwined, since
    459  * otherwise it would be possible to get a destination pointer from an RSM
    460  * address , or from the run queue, but have some other part of the driver
    461  * delete the destination before you could bump its reference count.  The
    462  * incorporation of reference count code in FINDDEST/MAKEDEST/GETRUNQ
    463  * solves this race condition.
    464  */
    465 
    466 /*
    467  * FINDDEST attempts to find the destination with RSM address rsm_addr.  If the
    468  * destination exists, rd is set to point to it.  If the destination exists,
    469  * isdel is set to indicate whether the destination is currently being deleted
    470  * (nonzero implies a delete is in progress).  If the destination exists and
    471  * is not being deleted, its reference count is increased by one.
    472  */
    473 #define	FINDDEST(rd, isdel, rsm_addr, adapter) {		\
    474 	mutex_enter(&adapter->opsrsm_dest_lock);		\
    475 	(rd) = (((rsm_addr) >= RSM_MAX_DESTADDR) ? NULL :	\
    476 	    (adapter)->opsrsm_desttbl[(rsm_addr)]);		\
    477 	if (rd)							\
    478 		if (((isdel) = (rd)->rd_dstate) == 0) {		\
    479 			(rd)->rd_refcnt++;			\
    480 			D6("FINDDEST ctlr %d addr %ld refcnt++ is %d\n", \
    481 			    adapter->instance, rsm_addr,	\
    482 			    (rd)->rd_refcnt);			\
    483 		}						\
    484 	mutex_exit(&(adapter)->opsrsm_dest_lock);		\
    485 }
    486 
    487 
    488 /*
    489  * MAKEDEST attempts to find the destination with RSM address rsm_addr.  If the
    490  * destination exists, rd and isdel are set as in the description of FINDDEST,
    491  * above.  If the destination does not exist, a new destination structure is
    492  * allocated and installed, rd is set to point to it, and isnew is set to 1.
    493  */
    494 #define	MAKEDEST(rd, isdel, isnew, rsm_addr, adapter) {		\
    495 	mutex_enter(&(adapter)->opsrsm_dest_lock); 		\
    496 	(rd) = (((rsm_addr) >= RSM_MAX_DESTADDR) ? NULL : 	\
    497 	    (adapter)->opsrsm_desttbl[(rsm_addr)]); 		\
    498 	if (!(rd)) { 						\
    499 		(rd) = opsrsmmkdest((adapter), (rsm_addr)); 	\
    500 		(isnew) = 1;					\
    501 	}							\
    502 	if (rd)							\
    503 		if (((isdel) = (rd)->rd_dstate) == 0) {		\
    504 			(rd)->rd_refcnt++;			\
    505 			D6("MAKEDEST ctlr %d addr %ld refcnt++ is %d\n", \
    506 			    adapter->instance, (uint64_t)rsm_addr,	\
    507 			    (rd)->rd_refcnt);			\
    508 		}						\
    509 	mutex_exit(&(adapter)->opsrsm_dest_lock);		\
    510 }
    511 
    512 
    513 /*
    514  * GETRUNQ attempts to return the destination which is at the head of opsrsm's
    515  * run queue.  If the run queue is non-empty, the head of the queue is removed,
    516  * and rd is set to point to it; otherwise, rd is set to NULL.  If rd is
    517  * nonzero, isdel is set to 1 if the destination pointed to by rd is being
    518  * deleted, or to 0 otherwise.  Finally, if rd is nonzero, and isdel is zero,
    519  * then rd's reference count is increased by one.
    520  */
    521 #define	GETRUNQ(rd, isdel, adapterp) {				\
    522 	mutex_enter(&(adapterp)->opsrsm_dest_lock);		\
    523 	mutex_enter(&(adapterp)->opsrsm_runq_lock);		\
    524 	rd = (adapterp)->opsrsm_runq;				\
    525 	if (rd) {						\
    526 		(adapterp)->opsrsm_runq = rd->rd_next;		\
    527 		if (((isdel) = (rd)->rd_dstate) == 0) {		\
    528 			(rd)->rd_refcnt++;			\
    529 			D6("GETRUNQ ctlr %d addr %ld refcnt++ is %d\n", \
    530 			    adapterp->instance,			\
    531 			    (rd)->rd_rsm_addr,			\
    532 			    (rd)->rd_refcnt);			\
    533 		}						\
    534 	}							\
    535 	mutex_exit(&(adapterp)->opsrsm_runq_lock);		\
    536 	mutex_exit(&(adapterp)->opsrsm_dest_lock);		\
    537 }
    538 
    539 
    540 /*
    541  * REFDEST checks to see if the destination pointed to by rd is currently being
    542  * deleted.  If so, isdel is set to a nonzero value; otherwise, it is set to
    543  * zero, and the destination's reference count is incremented.
    544  */
    545 #define	REFDEST(rd, isdel) {					\
    546 	mutex_enter(&(rd)->rd_adapter->opsrsm_dest_lock);	\
    547 	if (((isdel) = (rd)->rd_dstate) == 0) {			\
    548 		(rd)->rd_refcnt++;				\
    549 	}							\
    550 	mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock);	\
    551 }
    552 
    553 
    554 /*
    555  * UNREFDEST decrements the reference count of the destination pointed to by
    556  * rd.  If the reference count becomes zero, we start the deletion process for
    557  * the destination.
    558  */
    559 #define	UNREFDEST(rd) {						\
    560 	mutex_enter(&(rd)->rd_adapter->opsrsm_dest_lock);		\
    561 	D6("UNREFDEST ctlr %d addr %ld refcnt-- is %d\n",	\
    562 	    (rd)->rd_adapter->instance, (rd)->rd_rsm_addr,	\
    563 	    (rd)->rd_refcnt - 1);				\
    564 	if (--(rd)->rd_refcnt <= 0) {				\
    565 		mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock);	\
    566 		if (opsrsmdest_refcnt_0(rd)) { rd = NULL; }	\
    567 	} else							\
    568 		mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock);	\
    569 }
    570 
    571 
    572 
    573 /* Local Static def's */
    574 
    575 /*
    576  * Lock and variable to allow attach routines to initialize global mutexes
    577  */
    578 
    579 static kmutex_t opsrsmattlock;	/* Protects opsrsmdbginit  */
    580 
    581 /*
    582  * Pointer to 'opsrsm' global structure.
    583  */
    584 opsrsm_t *opsrsmdev = NULL;	/* Head of list */
    585 
    586 static kmutex_t opsrsmdevlock;	/* Protects list contents */
    587 static void *opsrsm_state;	/* opaque handle for soft state structs */
    588 
    589 extern rsm_node_id_t rsmrdt_my_nodeid;
    590 
    591 /*
    592  * ****************************************************************
    593  *                                                               *
    594  * B E G I N   BASIC MODULE BOILERPLATE                          *
    595  *                                                               *
    596  * ****************************************************************
    597  */
    598 
    599 
    600 /* Module Loading/Unloading and Autoconfiguration declarations */
    601 
    602 /*
    603  * cb_ops contains the driver entry points and is roughly equivalent
    604  * to the cdevsw and bdevsw  structures in previous releases.
    605  *
    606  * dev_ops contains, in addition to the pointer to cb_ops, the routines
    607  * that support loading and unloading our driver.
    608  *
    609  */
    610 
    611 static struct cb_ops opsrsm_cb_ops = {
    612 	opsrsm_open,		/* cb_open */
    613 	opsrsm_close,		/* cb_close */
    614 	nodev,			/* cb_strategy */
    615 	nodev,			/* cb_print */
    616 	nodev,			/* cb_dump */
    617 	nodev,			/* cb_read */
    618 	nodev,			/* cb_write */
    619 	opsrsm_ioctl,		/* cb_ioctl */
    620 	nodev,			/* cb_devmap */
    621 	nodev,			/* cb_mmap */
    622 	nodev,			/* cb_segmap */
    623 	opsrsm_chpoll,		/* cb_chpoll */
    624 	ddi_prop_op,		/* cb_prop_op */
    625 	NULL,			/* cb_stream */
    626 	D_NEW | D_MP,		/* cb_flag */
    627 	CB_REV,			/* rev */
    628 	nodev,			/* int (*cb_aread)() */
    629 	nodev			/* int (*cb_awrite)() */
    630 };
    631 
    632 static struct dev_ops opsrsm_ops = {
    633 	DEVO_REV,		/* devo_rev */
    634 	0,			/* devo_refcnt */
    635 	opsrsm_info,		/* devo_getinfo */
    636 	nulldev,		/* devo_identify */
    637 	nulldev,		/* devo_probe */
    638 	opsrsm_attach,		/* devo_attach */
    639 	opsrsm_detach,		/* devo_detach */
    640 	nodev,			/* devo_reset */
    641 	&opsrsm_cb_ops,		/* devo_cb_ops */
    642 	(struct bus_ops *)NULL,	/* devo_bus_ops */
    643 	nulldev			/* power */
    644 };
    645 
    646 
    647 /*
    648  * Module linkage information for the kernel.
    649  */
    650 static struct modldrv modldrv = {
    651 	&mod_driverops,
    652 	"Reliable Datagram Transport driver - v1.0",
    653 	&opsrsm_ops,
    654 };
    655 
    656 static struct modlinkage modlinkage = {
    657 #ifdef _LP64
    658 	MODREV_1, { (void *) &modldrv, NULL, NULL, NULL, NULL, NULL, NULL }
    659 #else
    660 	MODREV_1, { (void *) &modldrv, NULL, NULL, NULL }
    661 #endif
    662 };
    663 
    664 /*
    665  * Module Loading and Installation Routines.
    666  */
    667 
    668 /*
    669  * Module Installation
    670  * Install the driver, initialize soft state system, initialize opsrsmattlock
    671  */
    672 
    673 int
    674 _init(void)
    675 {
    676 	int status;
    677 
    678 	status = ddi_soft_state_init(&opsrsm_state, sizeof (opsrsm_t), 1);
    679 	if (status != 0) {
    680 #ifdef DEBUG
    681 		cmn_err(CE_CONT,
    682 		    "opsrsm:_init - soft_state_init failed: 0x%x\n", status);
    683 #endif /* DEBUG */
    684 		return (status);
    685 	}
    686 
    687 	/* initialize global locks here */
    688 	mutex_init(&opsrsmattlock, NULL, MUTEX_DRIVER, NULL);
    689 	mutex_init(&opsrsmdevlock, NULL, MUTEX_DRIVER, NULL);
    690 	mutex_init(&opsrsm_flow_tmo_lock, NULL, MUTEX_DRIVER, NULL);
    691 	opsrsm_flow_tmo_id = 0;
    692 
    693 	opsrsm_events_taskq = taskq_create("events", 8, maxclsyspri, 1, 8,
    694 	    TASKQ_PREPOPULATE);
    695 
    696 	opsrsm_finfo_init();
    697 	opsrsmresource_init();
    698 
    699 	status = mod_install(&modlinkage);
    700 	if (status != DDI_SUCCESS) {
    701 		mutex_destroy(&opsrsmattlock);
    702 		mutex_destroy(&opsrsmdevlock);
    703 	}
    704 
    705 	/* Init rsmrdt pm client */
    706 	rsmrdt_pathmanager_init();
    707 
    708 	return (status);
    709 }
    710 
    711 /*
    712  * Module Removal
    713  */
    714 
    715 int
    716 _fini(void)
    717 {
    718 	int status;
    719 
    720 	if ((status = mod_remove(&modlinkage)) != 0) {
    721 		DERR("opsrsm_fini - mod_remove failed: 0x%x\n", status);
    722 		return (status);
    723 	}
    724 
    725 	/* Un-init the rsmrdt pm client */
    726 	rsmrdt_pathmanager_cleanup();
    727 
    728 	ddi_soft_state_fini(&opsrsm_state);
    729 	opsrsmresource_fini();
    730 	opsrsm_finfo_fini();
    731 	opsrsm_flow_tmo_cancel();
    732 	taskq_destroy(opsrsm_events_taskq);
    733 
    734 	mutex_destroy(&opsrsm_flow_tmo_lock);
    735 	mutex_destroy(&opsrsmattlock);
    736 	mutex_destroy(&opsrsmdevlock);
    737 
    738 	return (status);
    739 }
    740 
    741 /*
    742  * Return Module Info.
    743  */
    744 
    745 int
    746 _info(struct modinfo *modinfop)
    747 {
    748 	return (mod_info(&modlinkage, modinfop));
    749 }
    750 
    751 
    752 
    753 /*
    754  * Autoconfiguration Routines
    755  */
    756 
    757 
    758 /*
    759  * Attach the device, create and fill in the device-specific structure.
    760  */
    761 
    762 static int
    763 opsrsm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
    764 {
    765 	opsrsm_t *opsrsmp;
    766 	int instance;
    767 	int progress = 0;
    768 	minor_t rnum;
    769 
    770 	D1("opsrsmattach: dip 0x%p, cmd %d", (void *)dip, cmd);
    771 
    772 	if (cmd != DDI_ATTACH) {
    773 		return (DDI_FAILURE);
    774 	}
    775 
    776 	/*
    777 	 * Allocate soft data structure
    778 	 */
    779 
    780 	instance = ddi_get_instance(dip);
    781 
    782 	if (ddi_soft_state_zalloc(opsrsm_state, instance) != DDI_SUCCESS) {
    783 		DERR("opsrsmattach: bad state zalloc, returning DDI_FAILURE");
    784 		return (DDI_FAILURE);
    785 	}
    786 
    787 	opsrsmp = ddi_get_soft_state(opsrsm_state, instance);
    788 	if (opsrsmp == NULL) {
    789 		return (DDI_FAILURE);
    790 	}
    791 
    792 	/*
    793 	 * Stuff private info into dip.
    794 	 */
    795 	opsrsmp->opsrsm_dip = dip;
    796 	ddi_set_driver_private(dip, (caddr_t)opsrsmp);
    797 
    798 	/*
    799 	 * Get device parameters from the device tree and save them in our
    800 	 * per-device structure for later use.
    801 	 */
    802 	opsrsmgetparam(dip, opsrsmp);
    803 
    804 	/*
    805 	 * Initialize kernel statistics.
    806 	 */
    807 	opsrsmkstatinit(opsrsmp);
    808 	progress |= OPSRSM_ATT_KSTAT;
    809 	/*
    810 	 * Link this per-device structure in with the rest.
    811 	 */
    812 	mutex_enter(&opsrsmdevlock);
    813 
    814 	opsrsmdev = opsrsmp;
    815 	mutex_exit(&opsrsmdevlock);
    816 
    817 	/*
    818 	 * Create minor number
    819 	 */
    820 	if (opsrsmresource_alloc(&rnum) == NULL) {
    821 		DERR("opsrsmattach: Unable to get minor number\n");
    822 		opsrsmtakedown(opsrsmp, progress);
    823 		return (DDI_FAILURE);
    824 	}
    825 
    826 	D1("opsrsmattach: rnum %d : ddi %d", rnum, ddi_get_instance(dip));
    827 
    828 	/*
    829 	 * Create the filesystem device node.
    830 	 */
    831 	if (ddi_create_minor_node(dip, OPSRSMNAME, S_IFCHR,
    832 	    rnum, DDI_PSEUDO, NULL) != DDI_SUCCESS) {
    833 		DERR("opsrsmattach: bad create_minor_node, returning "
    834 		    "DDI_FAILURE");
    835 		opsrsmtakedown(opsrsmp, progress);
    836 		return (DDI_FAILURE);
    837 	}
    838 
    839 	progress |= OPSRSM_ATT_MINOR;
    840 
    841 	opsrsmp->opsrsm_max_batch_size = 0;
    842 	opsrsmp->opsrsm_min_batch_size = 0;
    843 
    844 	ddi_report_dev(dip);
    845 
    846 	D1("opsrsmattach: returning DDI_SUCCESS");
    847 	return (DDI_SUCCESS);
    848 }
    849 
    850 /*
    851  * Detach - Free resources allocated in attach
    852  */
    853 
    854 /*ARGSUSED*/
    855 static int
    856 opsrsm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
    857 {
    858 	int instance;
    859 	opsrsm_t *opsrsmp;
    860 
    861 	D1("opsrsmdetach: dip 0x%p, cmd %d", (void *)dip, cmd);
    862 
    863 	if (cmd != DDI_DETACH) {
    864 		return (DDI_FAILURE);
    865 	}
    866 
    867 	if (rsmrdt_check_openhandles() != 0) {
    868 		DERR("opsrsmdetach: Failed to detach due to open handles");
    869 		return (DDI_FAILURE);
    870 	}
    871 
    872 	instance = ddi_get_instance(dip);
    873 	opsrsmp = ddi_get_soft_state(opsrsm_state, instance);
    874 	if (opsrsmp == NULL) {
    875 		return (DDI_FAILURE);
    876 	}
    877 
    878 
    879 	/*
    880 	 * Release all our resources. At this point, all attachment
    881 	 * setup must have completed, so must all be torn down.
    882 	 */
    883 	opsrsmtakedown(opsrsmp, OPSRSM_ATT_ALL);
    884 
    885 	/*
    886 	 * Free resource table
    887 	 */
    888 	opsrsmresource_destroy();
    889 	return (DDI_SUCCESS);
    890 }
    891 
    892 /*ARGSUSED*/
    893 static int
    894 opsrsm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
    895 {
    896 	switch (infocmd) {
    897 	case DDI_INFO_DEVT2DEVINFO:
    898 		if (opsrsmdev !=  NULL) {
    899 			*result = opsrsmdev->opsrsm_dip;
    900 			return (DDI_SUCCESS);
    901 		} else {
    902 			return (DDI_FAILURE);
    903 		}
    904 
    905 	case DDI_INFO_DEVT2INSTANCE:
    906 		*result = 0;
    907 		return (DDI_SUCCESS);
    908 
    909 	default:
    910 		return (DDI_FAILURE);
    911 	}
    912 }
    913 
    914 /*
    915  * Return local node id.
    916  */
    917 /* ARGSUSED */
    918 static int
    919 opsrsm_ioctl_getnodeid(opsrsmresource_t *rp, intptr_t arg, int mode)
    920 {
    921 	if (rsmrdt_my_nodeid == (rsm_node_id_t)-1)
    922 		return (ENXIO);
    923 
    924 	(void) ddi_copyout((caddr_t)&rsmrdt_my_nodeid, (caddr_t)arg,
    925 		sizeof (rsmrdt_my_nodeid), mode);
    926 	return (0);
    927 }
    928 
    929 /*
    930  * Return a unique(wrt the local node) number associated with
    931  * the communication endpoint.
    932  */
    933 static int
    934 opsrsm_ioctl_bind(opsrsmresource_t *rp, intptr_t arg, int mode)
    935 {
    936 	if (ddi_copyout((caddr_t)&rp->rs_lportnum, (caddr_t)arg,
    937 	    sizeof (rp->rs_lportnum), mode) != DDI_SUCCESS) {
    938 		DERR("ioctl_bind: unable to copyout portnum");
    939 		return (EFAULT);
    940 	}
    941 	rp->rs_state |= OPSRSM_RS_BOUND;
    942 	return (0);
    943 }
    944 
    945 /*
    946  * The local communication endpoint will simply remember and will use
    947  * the address specified here as the target address for all the future
    948  * outgoing messages.
    949  *
    950  * Note that it will not verify the validity of the remote "portnum".
    951  * It's up to the applications to make sure the remote endpoint
    952  * exists before send.
    953 */
    954 static int
    955 opsrsm_ioctl_connect(opsrsmresource_t *rp, intptr_t arg, int mode)
    956 {
    957 	rsmrdt_connect_arg_t io_args;
    958 
    959 	if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) {
    960 		DERR("ioctl_connect: port not bound yet");
    961 		return (EADDRNOTAVAIL);
    962 	}
    963 
    964 	rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER);
    965 	if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) {
    966 		DERR("ioctl_connect: Unloading in progress");
    967 		rw_exit(&opsrsm_resource.opsrsmrct_lock);
    968 		return (ENETDOWN);
    969 	}
    970 	rw_exit(&opsrsm_resource.opsrsmrct_lock);
    971 
    972 	if (rp->rs_dest != NULL) {
    973 		DERR("ioctl_connect: reconnect not supported");
    974 		return (EISCONN);
    975 	}
    976 
    977 	/*
    978 	 * Copy in the connect ioctl arg structure
    979 	 */
    980 	(void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args),
    981 		mode);
    982 
    983 	D1("ioctl_connect: nodeid = %d", io_args.nodeid);
    984 	D1("ioctl_connect: portnum = %d", io_args.portnum);
    985 
    986 	if (io_args.nodeid < 0)
    987 		return (EINVAL);
    988 
    989 	if (io_args.nodeid == (int)rsmrdt_my_nodeid) {
    990 		/*
    991 		 * Loopback mode
    992 		 */
    993 		mutex_enter(&rp->rs_lock);
    994 		rp->rs_dest = (opsrsm_dest_t *)OPSRSM_LOOPBACK;
    995 		rp->rs_local_skey = 0;
    996 		rp->rs_rportnum = io_args.portnum;
    997 		mutex_exit(&rp->rs_lock);
    998 	} else {
    999 		opsrsm_dest_t *rd = NULL;
   1000 
   1001 		mutex_enter(&rp->rs_lock);
   1002 		rp->rs_nodeid = io_args.nodeid;
   1003 		rp->rs_rportnum = io_args.portnum;
   1004 		rp->rs_state |= OPSRSM_RS_CONNECTING;
   1005 		mutex_exit(&rp->rs_lock);
   1006 
   1007 		rd = opsrsm_connect(rp->rs_nodeid, rp);
   1008 		mutex_enter(&rp->rs_lock);
   1009 		rp->rs_dest = rd;
   1010 		if (rd != NULL) {
   1011 			rp->rs_local_skey = rd->rd_local_skey;
   1012 			rp->rs_state |= OPSRSM_RS_REFDEST;
   1013 		} else {
   1014 			rp->rs_local_skey = 0;
   1015 		}
   1016 		rp->rs_state &= ~OPSRSM_RS_CONNECTING;
   1017 		cv_broadcast(&rp->rs_conn_cv);
   1018 		mutex_exit(&rp->rs_lock);
   1019 
   1020 		if (rd == NULL) {
   1021 			return (ENETDOWN);
   1022 		}
   1023 	}
   1024 	return (0);
   1025 }
   1026 
   1027 
   1028 #define	RETRY_DELAY 1
   1029 static int opsrsm_connect_max_retries = 20;
   1030 
   1031 static opsrsm_dest_t *
   1032 opsrsm_connect(int nodeid, opsrsmresource_t *rp)
   1033 {
   1034 	adapter_t *adp = NULL;
   1035 	rsm_addr_t rem_hwaddr;
   1036 	opsrsm_dest_t *rd;
   1037 	int isdel = 0, isnew = 0;
   1038 	int newdest = 0;
   1039 	uint32_t old_skey = 0;
   1040 
   1041 again:;
   1042 	/* Find local adapter */
   1043 	if (newdest > opsrsm_connect_max_retries) {
   1044 		DINFO("connect: failed to connect to node %d\n", nodeid);
   1045 		return (NULL);
   1046 	}
   1047 	adp = rsmrdt_select_adapter((rsm_node_id_t)nodeid, newdest);
   1048 	if (adp == NULL) {
   1049 		DINFO("connect: node %d is unreachable\n", nodeid);
   1050 		return (NULL);
   1051 	}
   1052 
   1053 	/*
   1054 	 * if path down happens after we've chosen an adapter, that's
   1055 	 * still ok because the connection handshake will fail.
   1056 	 */
   1057 
   1058 	/* Find remote hw addr */
   1059 	rem_hwaddr = rsmrdt_get_remote_hwaddr(adp, (rsm_node_id_t)nodeid);
   1060 	if (rem_hwaddr == (rsm_addr_t)-1 || rem_hwaddr > RSM_MAX_DESTADDR) {
   1061 		if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) {
   1062 			if (adp->sel_cnt > 0) adp->sel_cnt--;
   1063 		}
   1064 		return (NULL);
   1065 	}
   1066 
   1067 	MAKEDEST(rd, isdel, isnew, rem_hwaddr, adp);
   1068 	if (isdel) {
   1069 		if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) {
   1070 			if (adp->sel_cnt > 0) adp->sel_cnt--;
   1071 		}
   1072 		/*
   1073 		 * need sufficient delay to ensure the old rd gets freed up
   1074 		 * completely before MAKEDEST gets called again
   1075 		 */
   1076 		delay(RETRY_DELAY);
   1077 		goto again;
   1078 	}
   1079 	if (isnew) {
   1080 		isnew = 0;
   1081 		if (rd == NULL) goto again;
   1082 		(void) opsrsmmovestate(rd, OPSRSM_STATE_NEW,
   1083 		    OPSRSM_STATE_S_REQ_CONNECT);
   1084 	}
   1085 	if (rd->rd_local_skey != old_skey) {
   1086 		old_skey = rd->rd_local_skey;
   1087 		newdest++;
   1088 	}
   1089 
   1090 	mutex_enter(&rd->rd_xmit_lock);
   1091 	if (rd->rd_nodeid != nodeid) {
   1092 		cmn_err(CE_PANIC, "invalid nodeid %d, expected %d\n",
   1093 		    rd->rd_nodeid, nodeid);
   1094 	}
   1095 	if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED) {
   1096 		if (rd->rd_xmit_state != OPSRSM_XMIT_DISCONNECTED) {
   1097 			int retval;
   1098 
   1099 			retval = cv_wait_sig(&rd->rd_conn_cv,
   1100 			    &rd->rd_xmit_lock);
   1101 			if (retval == 0) {
   1102 				mutex_exit(&rd->rd_xmit_lock);
   1103 				if (opsrsmdev->opsrsm_param.
   1104 				    rsmrdt_enable_loadbalance) {
   1105 					if (adp->sel_cnt > 0)
   1106 						adp->sel_cnt--;
   1107 				}
   1108 				UNREFDEST(rd);
   1109 				return (NULL);
   1110 			} else {
   1111 				if (rd->rd_xmit_state >=
   1112 				    OPSRSM_XMIT_BARRIER_CLOSED) {
   1113 					mutex_exit(&rd->rd_xmit_lock);
   1114 					return (rd);
   1115 				} else {
   1116 					mutex_exit(&rd->rd_xmit_lock);
   1117 					if (opsrsmdev->opsrsm_param.
   1118 					    rsmrdt_enable_loadbalance) {
   1119 						if (adp->sel_cnt > 0)
   1120 							adp->sel_cnt--;
   1121 					}
   1122 					UNREFDEST(rd);
   1123 					goto again;
   1124 				}
   1125 			}
   1126 		} else {
   1127 			mutex_exit(&rd->rd_xmit_lock);
   1128 			if (opsrsmdev->opsrsm_param.
   1129 			    rsmrdt_enable_loadbalance) {
   1130 				if (adp->sel_cnt > 0) adp->sel_cnt--;
   1131 			}
   1132 			UNREFDEST(rd);
   1133 			/*
   1134 			 * if rd is still in DISCONNECTED state, we need to
   1135 			 * wait until it is completely freed up.
   1136 			 */
   1137 			if (rp != NULL) {
   1138 				mutex_enter(&rp->rs_lock);
   1139 				cv_broadcast(&rp->rs_conn_cv);
   1140 				mutex_exit(&rp->rs_lock);
   1141 			}
   1142 			delay(RETRY_DELAY);
   1143 			goto again;
   1144 		}
   1145 	}
   1146 	mutex_exit(&rd->rd_xmit_lock);
   1147 	return (rd);
   1148 }
   1149 
   1150 /*
   1151  * RSMRDT_IOCTL_SENDMSG deals with one message at a time. It maintains
   1152  * message boundary, and guarantees either the whole message, or none
   1153  * of it is delivered to the destination successfully.
   1154  *
   1155  * EMSGSIZE is returned when the message is too big to be handled as a
   1156  * single message by the opsrsm driver.
   1157  *
   1158  * Note that SKGXP library doesn't require the send socket to have a
   1159  * portnum associated with it. In other words, the send side can send
   1160  * msgs through a socket without calling ioctl.bind() first.
   1161  *
   1162  * RSMRDT_IOCTL_SENDMSG is always non-blocking.
   1163  *
   1164  * Normally messages are guaranteed to be delivered eventually to the
   1165  * destination endpoint, in the same order RSMRDT_IOCTL_SENDMSG calls
   1166  * are made. There is one notable exception. That is, if the destination
   1167  * endpoint doesn't exist, the msg will be dropped by the receiving node.
   1168  */
   1169 static int
   1170 opsrsm_ioctl_sendmsg(opsrsmresource_t *rp, intptr_t arg, int mode)
   1171 {
   1172 	rsmrdt_send_arg_t io_args;
   1173 	struct iovec vptr[OPSRSM_MAXVECS];
   1174 #ifdef _MULTI_DATAMODEL
   1175 	rsmrdt_send_arg32_t io_args32;
   1176 	struct iovec32 vptr32[OPSRSM_MAXVECS];
   1177 	model_t model;
   1178 #endif /* _MULTI_DATAMODEL */
   1179 	uio_t phys_uio;
   1180 	mblk_t *mp;
   1181 	int bytecount = 0;
   1182 	uint_t nvecs;
   1183 	int i;
   1184 	int err = 0;
   1185 
   1186 	if ((rp->rs_state & OPSRSM_RS_NORECVR) != 0) {
   1187 		DERR("ioctl_sendmsg: Receiver doesn't exist");
   1188 		return (ESRCH);
   1189 	}
   1190 
   1191 	if ((rp->rs_state & OPSRSM_RS_PKEYMISMATCH) != 0) {
   1192 		DERR("ioctl_sendmsg: pkey mismatch");
   1193 		return (EACCES);
   1194 	}
   1195 
   1196 	if (rp->rs_dest == NULL) {
   1197 		return (ENOTCONN);
   1198 	}
   1199 #ifdef _MULTI_DATAMODEL
   1200 	model = ddi_model_convert_from(mode & FMODELS);
   1201 	if (model == DDI_MODEL_ILP32) {
   1202 		/*
   1203 		 * Copy in sendmsg arg structure to driver buffer
   1204 		 */
   1205 		ddi_copyin((caddr_t)arg, &io_args32, sizeof (io_args32),
   1206 		    mode);
   1207 
   1208 		/*
   1209 		 * Find number of iovecs for this message
   1210 		 */
   1211 		nvecs = io_args32.iovcnt;
   1212 
   1213 		if (nvecs > OPSRSM_MAXVECS) {
   1214 			DERR("ioctl_sendmsg: invalid vec size 0x%x", nvecs);
   1215 			return (EINVAL);
   1216 		}
   1217 
   1218 		D1("ioctl_sendmsg: nvecs = %d, sz = 0x%x, vptr32 = %lx",
   1219 		    nvecs, nvecs * sizeof (struct iovec32), vptr32);
   1220 
   1221 		/*
   1222 		 * Copy in iovec structures to driver buffer
   1223 		 */
   1224 		if (ddi_copyin((struct iovec32 *)io_args32.iov, (caddr_t)vptr32,
   1225 			nvecs * sizeof (struct iovec32), mode)) {
   1226 			DERR("ioctl_sendmsg: invalid iovec pointer");
   1227 			err = EFAULT;
   1228 			goto done;
   1229 		}
   1230 
   1231 		/*
   1232 		 * Find out the size of this message
   1233 		 */
   1234 		bytecount = 0;
   1235 		for (i = 0; i < nvecs; i++) {
   1236 			ssize32_t iovlen32 = vptr32[i].iov_len;
   1237 			bytecount += iovlen32;
   1238 			if (iovlen32 < 0 || bytecount < 0) {
   1239 				err = EINVAL;
   1240 				goto done;
   1241 			}
   1242 			vptr[i].iov_len = iovlen32;
   1243 			vptr[i].iov_base = (caddr_t)vptr32[i].iov_base;
   1244 		}
   1245 	} else
   1246 #endif /* _MULTI_DATAMODEL */
   1247 	{
   1248 		/*
   1249 		 * Copy in sendmsg arg structure to driver buffer
   1250 		 */
   1251 		(void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args,
   1252 			sizeof (io_args), mode);
   1253 
   1254 		/*
   1255 		 * Find number of iovecs for this message
   1256 		 */
   1257 		nvecs = io_args.iovcnt;
   1258 
   1259 		if (nvecs > OPSRSM_MAXVECS) {
   1260 			DERR("ioctl_sendmsg: invalid vec size 0x%x", nvecs);
   1261 			return (EINVAL);
   1262 		}
   1263 
   1264 		D1("ioctl_sendmsg: nvecs = %d, sz = 0x%x, vptr = %p",
   1265 		    nvecs, nvecs * (int)sizeof (iovec_t), vptr);
   1266 
   1267 		/*
   1268 		 * Copy in iovec structures to driver buffer
   1269 		 */
   1270 		if (ddi_copyin(io_args.iov, (caddr_t)vptr,
   1271 		    (size_t)nvecs * sizeof (iovec_t), mode)) {
   1272 			DERR("ioctl_sendmsg: invalid iovec pointer");
   1273 			err = EFAULT;
   1274 			goto done;
   1275 		}
   1276 
   1277 		/*
   1278 		 * Find out the size of this message
   1279 		 */
   1280 		bytecount = 0;
   1281 		for (i = 0; i < (int)nvecs; i++) {
   1282 			ssize_t iovlen = vptr[i].iov_len;
   1283 			bytecount += iovlen;
   1284 			if (iovlen < 0 || bytecount < 0) {
   1285 				err = EINVAL;
   1286 				goto done;
   1287 			}
   1288 		}
   1289 	}
   1290 
   1291 	D1("ioctl_sendmsg: Message Size 0x%x", bytecount);
   1292 
   1293 	/*
   1294 	 * Check if message + header size is bigger than MTU size
   1295 	 */
   1296 	if ((bytecount + OPSRSM_CACHELINE_SIZE) >
   1297 		OPSRSM_MAX_BUFFER_SIZE_DFLT) {
   1298 		DERR("ioctl_sendmsg: message too big");
   1299 		err = EMSGSIZE;
   1300 		goto done;
   1301 	}
   1302 
   1303 	/*
   1304 	 * Allocate mblk
   1305 	 */
   1306 	mp = allocb(OPSRSM_CACHELINE_SIZE + OPSRSM_MESSAGE_HDRSZ +
   1307 		(size_t)bytecount + OPSRSM_CACHELINE_SIZE, BPRI_LO);
   1308 	if (mp == NULL) {
   1309 		DERR("ioctl_sendmsg: allocb failed");
   1310 		err = ENOMEM;
   1311 		goto done;
   1312 	}
   1313 	mp->b_rptr = (uchar_t *)OPSRSM_CACHELINE_ROUNDUP(mp->b_rptr);
   1314 
   1315 	/*
   1316 	 * Stuff in the message header
   1317 	 */
   1318 	OPSRSM_MESSAGE_HDRPTR(mp)->lportnum = rp->rs_lportnum;
   1319 	OPSRSM_MESSAGE_HDRPTR(mp)->rportnum = rp->rs_rportnum;
   1320 	OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz = (uint32_t)bytecount;
   1321 	OPSRSM_MESSAGE_HDRPTR(mp)->nodeid = (int)rsmrdt_my_nodeid;
   1322 	OPSRSM_MESSAGE_HDRPTR(mp)->pkey = rp->rs_pkey;
   1323 	OPSRSM_MESSAGE_HDRPTR(mp)->seqno = 0;
   1324 	OPSRSM_MESSAGE_HDRPTR(mp)->option = 0;
   1325 
   1326 	D1("ioctl_sendmsg: lportnum = %d, rportnum = %d, msz_sz = 0x%x",
   1327 		rp->rs_lportnum, rp->rs_rportnum, bytecount);
   1328 	D1("ioctl_sendmsg: my node id = %d", rsmrdt_my_nodeid);
   1329 
   1330 	/*
   1331 	 * Initialize uio structure
   1332 	 */
   1333 	phys_uio.uio_iov = vptr;
   1334 	phys_uio.uio_iovcnt = (int)nvecs;
   1335 	phys_uio.uio_resid = bytecount;
   1336 	phys_uio.uio_segflg = UIO_USERSPACE;
   1337 
   1338 	if (uiomove((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ,
   1339 		(size_t)bytecount, UIO_WRITE, &phys_uio)) {
   1340 		DERR("ioctl_sendmsg: uiomove failed");
   1341 		err = EFAULT;
   1342 		freemsg(mp);
   1343 		goto done;
   1344 	}
   1345 
   1346 	mp->b_wptr = mp->b_rptr + bytecount + OPSRSM_MESSAGE_HDRSZ;
   1347 	mp->b_prev = mp->b_cont = NULL;
   1348 
   1349 	if (OPSRSM_IS_LOOPBACK(rp->rs_dest)) {
   1350 		err = opsrsmdemux_loopback(mp);
   1351 	} else {
   1352 		int isdel = 0;
   1353 
   1354 		mutex_enter(&rp->rs_lock);
   1355 		if ((rp->rs_state & OPSRSM_RS_FAILOVER) != 0) {
   1356 			opsrsm_dest_t *new_rd = NULL;
   1357 
   1358 			ASSERT((rp->rs_state & OPSRSM_RS_REFDEST) == 0);
   1359 			mutex_exit(&rp->rs_lock);
   1360 			err = opsrsm_finfo_wait(rp->rs_local_skey);
   1361 			if (err == 0) {
   1362 				mutex_enter(&rp->rs_lock);
   1363 				rp->rs_state |= OPSRSM_RS_CONNECTING;
   1364 				mutex_exit(&rp->rs_lock);
   1365 				new_rd = opsrsm_connect(rp->rs_nodeid, rp);
   1366 			}
   1367 			mutex_enter(&rp->rs_lock);
   1368 			rp->rs_dest = new_rd;
   1369 			rp->rs_state &= ~OPSRSM_RS_CONNECTING;
   1370 			if (new_rd != NULL) {
   1371 				rp->rs_local_skey = new_rd->rd_local_skey;
   1372 				rp->rs_state &= ~OPSRSM_RS_FAILOVER;
   1373 				rp->rs_state |= OPSRSM_RS_REFDEST;
   1374 				cv_broadcast(&rp->rs_conn_cv);
   1375 			} else {
   1376 				rp->rs_local_skey = 0;
   1377 				freemsg(mp);
   1378 				cv_broadcast(&rp->rs_conn_cv);
   1379 				mutex_exit(&rp->rs_lock);
   1380 				goto done;
   1381 			}
   1382 		}
   1383 		REFDEST(rp->rs_dest, isdel);
   1384 		if (isdel != 0) {
   1385 			err = ENETDOWN;
   1386 			freemsg(mp);
   1387 			mutex_exit(&rp->rs_lock);
   1388 			goto done;
   1389 		}
   1390 
   1391 		if ((bytecount + OPSRSM_CACHELINE_SIZE) >
   1392 		    (int)rp->rs_dest->rd_buffer_size) {
   1393 			err = EMSGSIZE;
   1394 			freemsg(mp);
   1395 			mutex_exit(&rp->rs_lock);
   1396 			goto done;
   1397 		}
   1398 		mutex_exit(&rp->rs_lock);
   1399 		err =  opsrsmxmit(rp->rs_dest, mp);
   1400 		if (err != EWOULDBLOCK) err = 0;
   1401 	}
   1402 done:;
   1403 	return (err);
   1404 }
   1405 
   1406 
   1407 /*
   1408  * RSMRDT_IOCTL_RECVMSG can return more than one msg at a time.
   1409  *
   1410  * If a message is too long to fit in the supplied buffer, excessive
   1411  * bytes will be discarded.
   1412  *
   1413  * It will return EWOULDBLOCK if there is no message in the receive
   1414  * queue. Caller can then use poll() to poll for the POLLIN event.
   1415  *
   1416  * In case of memory allocation errors, it will not drop the packets.
   1417  */
   1418 
   1419 static int
   1420 opsrsm_ioctl_recvmsgs(opsrsmresource_t *rp, intptr_t arg, int mode)
   1421 {
   1422 	rsmrdt_recvmsgs_arg_t io_args;
   1423 	rdt_recvmsg_t *rm_ptr;
   1424 	struct iovec vptr[OPSRSM_MAXVECS];
   1425 #ifdef _MULTI_DATAMODEL
   1426 	rsmrdt_recvmsgs_arg32_t io_args32;
   1427 	rsmrdt_recvmsg32_t *rm_ptr32;
   1428 	struct iovec32 vptr32[OPSRSM_MAXVECS];
   1429 	model_t model;
   1430 #endif /* _MULTI_DATAMODEL */
   1431 	int nmsgs, count;
   1432 	int err = 0;
   1433 	int32_t total_bytes = 0;
   1434 
   1435 	if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) {
   1436 		DERR("ioctl_recvmsgs: port not bound yet");
   1437 		return (EADDRNOTAVAIL);
   1438 	}
   1439 
   1440 	/*
   1441 	 * Copy in recvmsgs arg structure to driver buffer
   1442 	 */
   1443 #ifdef _MULTI_DATAMODEL
   1444 	model = ddi_model_convert_from(mode & FMODELS);
   1445 	if (model == DDI_MODEL_ILP32) {
   1446 		ddi_copyin((caddr_t)arg, (caddr_t)&io_args32,
   1447 		    sizeof (io_args32), mode);
   1448 
   1449 		io_args.msgcnt = io_args32.msgcnt;
   1450 		io_args.timeout = io_args32.timeout;
   1451 	} else
   1452 #endif /* _MULTI_DATAMODEL */
   1453 	(void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args),
   1454 		mode);
   1455 
   1456 	/*
   1457 	 * Check if messages are waiting in the queue
   1458 	 */
   1459 	mutex_enter(&rp->rs_lock);
   1460 	if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) {
   1461 		if (io_args.timeout == 0) {
   1462 			mutex_exit(&rp->rs_lock);
   1463 			return (EWOULDBLOCK);
   1464 		} else if (io_args.timeout > 0) {
   1465 			clock_t	timeout_time;
   1466 			int retval;
   1467 			/*
   1468 			 * Wait only for 'timeout' time.
   1469 			 */
   1470 			timeout_time = ddi_get_lbolt();
   1471 			timeout_time += drv_usectohz
   1472 			    ((clock_t)io_args.timeout * (clock_t)1000);
   1473 			rp->rs_state |= OPSRSM_RS_SIG;
   1474 			retval = cv_timedwait_sig(&rp->rs_cv, &rp->rs_lock,
   1475 			    timeout_time);
   1476 			rp->rs_state &= ~OPSRSM_RS_SIG;
   1477 			if (retval == 0) {
   1478 				mutex_exit(&rp->rs_lock);
   1479 				return (EINTR);
   1480 			} else if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) {
   1481 				mutex_exit(&rp->rs_lock);
   1482 				err = 0;
   1483 				count = 0;
   1484 				goto done;
   1485 			}
   1486 		} else {
   1487 			int retval;
   1488 
   1489 			/*
   1490 			 * Wait until you get the wakeup signal
   1491 			 */
   1492 			rp->rs_state |= OPSRSM_RS_SIG;
   1493 			retval = cv_wait_sig(&rp->rs_cv, &rp->rs_lock);
   1494 			rp->rs_state &= ~OPSRSM_RS_SIG;
   1495 			if (retval == 0) {
   1496 				mutex_exit(&rp->rs_lock);
   1497 				return (EINTR);
   1498 			} else if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) {
   1499 				mutex_exit(&rp->rs_lock);
   1500 				return (EWOULDBLOCK);
   1501 			}
   1502 		}
   1503 	}
   1504 	mutex_exit(&rp->rs_lock);
   1505 
   1506 	/*
   1507 	 * Find number of messages
   1508 	 */
   1509 	nmsgs = (int)io_args.msgcnt;
   1510 	if (nmsgs > (int)opsrsmdev->opsrsm_param.opsrsm_max_recv_msgs ||
   1511 	    nmsgs < 0) {
   1512 		DERR("ioctl_recvmsgs: invalid nmsgs");
   1513 		return (EINVAL);
   1514 	}
   1515 
   1516 	D1("ioctl_recvmsgs: nmsgs = %d\n", nmsgs);
   1517 
   1518 	/*
   1519 	 * Copy in receive messages structures to driver buffer
   1520 	 */
   1521 #ifdef _MULTI_DATAMODEL
   1522 	if (model == DDI_MODEL_ILP32) {
   1523 		rm_ptr32 = (rsmrdt_recvmsg32_t *)rp->rs_rmptr;
   1524 		if (ddi_copyin((rsmrdt_recvmsg32_t *)io_args32.msg_iov,
   1525 		    (caddr_t)rm_ptr32, nmsgs * sizeof (rsmrdt_recvmsg32_t),
   1526 		    mode)) {
   1527 			DERR("ioctl_recvmsgs: cannot copy in msg structs");
   1528 			return (EFAULT);
   1529 		}
   1530 
   1531 	} else
   1532 #endif /* _MULTI_DATAMODEL */
   1533 	{
   1534 		rm_ptr = (rdt_recvmsg_t *)rp->rs_rmptr;
   1535 		if (ddi_copyin(io_args.msg_iov, (caddr_t)rm_ptr,
   1536 			(size_t)nmsgs * sizeof (rdt_recvmsg_t), mode)) {
   1537 			DERR("ioctl_recvmsgs: cannot copy in msg structs");
   1538 			return (EFAULT);
   1539 		}
   1540 	}
   1541 
   1542 	count = 0;
   1543 	do {
   1544 		uint_t nvecs;
   1545 		int i;
   1546 		uint32_t bytecount;
   1547 		uint32_t org_msglen;
   1548 		int nodeid;
   1549 		uint32_t portnum;
   1550 		mblk_t *mp;
   1551 
   1552 #ifdef _MULTI_DATAMODEL
   1553 		if (model == DDI_MODEL_ILP32) {
   1554 			/*
   1555 			 * Find number of iovecs for this message
   1556 			 */
   1557 			nvecs = rm_ptr32[count].iovcnt;
   1558 
   1559 			if (nvecs > OPSRSM_MAXVECS) {
   1560 				DERR("ioctl_recvmsgs: invalid vec size");
   1561 				err = EINVAL;
   1562 				break;
   1563 			}
   1564 
   1565 			D1("ioctl_recvmsgs: nvecs = %d", nvecs);
   1566 
   1567 			/*
   1568 			 * Copy in iovec structures to driver buffer
   1569 			 */
   1570 			if (ddi_copyin((struct iovec32 *)rm_ptr32[count].iov,
   1571 			    (caddr_t)vptr32, nvecs * sizeof (struct iovec32),
   1572 			    mode)) {
   1573 				DERR("ioctl_recvmsgs: invalid iovec pointer");
   1574 				err = EFAULT;
   1575 				break;
   1576 			}
   1577 
   1578 			/*
   1579 			 * Calculate buffer size
   1580 			 */
   1581 			bytecount = 0;
   1582 			for (i = 0; i < nvecs; i++) {
   1583 				if (vptr32[i].iov_len < 0) {
   1584 					DERR("ioctl_recvmsgs: invalid iovlen");
   1585 					err = EINVAL;
   1586 					break;
   1587 				}
   1588 				bytecount += vptr32[i].iov_len;
   1589 
   1590 				vptr[i].iov_len = vptr32[i].iov_len;
   1591 				vptr[i].iov_base = (caddr_t)vptr32[i].iov_base;
   1592 			}
   1593 		} else
   1594 #endif /* _MULTI_DATAMODEL */
   1595 		{
   1596 			/*
   1597 			 * Find number of iovecs for this message
   1598 			 */
   1599 			nvecs = rm_ptr[count].iovcnt;
   1600 
   1601 			if (nvecs > OPSRSM_MAXVECS) {
   1602 				DERR("ioctl_recvmsgs: invalid vec size");
   1603 				err = EINVAL;
   1604 				break;
   1605 			}
   1606 
   1607 			D1("ioctl_recvmsgs: nvecs = %d", nvecs);
   1608 
   1609 			/*
   1610 			 * Copy in iovec structures to driver buffer
   1611 			 */
   1612 			if (ddi_copyin(rm_ptr[count].iov, (caddr_t)vptr,
   1613 			    (size_t)nvecs * sizeof (iovec_t), mode)) {
   1614 				DERR("ioctl_recvmsgs: invalid iovec pointer");
   1615 				err = EFAULT;
   1616 				break;
   1617 			}
   1618 
   1619 			/*
   1620 			 * Calculate buffer size
   1621 			 */
   1622 			bytecount = 0;
   1623 			for (i = 0; i < (int)nvecs; i++) {
   1624 				if (vptr[i].iov_len < 0) {
   1625 					DERR("ioctl_recvmsgs: invalid iovlen");
   1626 					err = EINVAL;
   1627 					break;
   1628 				}
   1629 				bytecount += (uint32_t)vptr[i].iov_len;
   1630 			}
   1631 		}
   1632 
   1633 		if (err != 0) break;
   1634 
   1635 		/*
   1636 		 * Grab a buffer
   1637 		 */
   1638 		mutex_enter(&rp->rs_lock);
   1639 		OPSRSM_Q_REMOVE(&rp->rs_recvq, mp);
   1640 		if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) {
   1641 			rp->rs_events = 0;
   1642 		}
   1643 		mutex_exit(&rp->rs_lock);
   1644 		ASSERT(mp != NULL && MBLKL(mp) >= (int)OPSRSM_MESSAGE_HDRSZ);
   1645 		total_bytes += MBLKL(mp);
   1646 
   1647 		/*
   1648 		 * Extract the length from the header
   1649 		 *
   1650 		 * Truncate the message if the incoming message size
   1651 		 * is greater than buffer size.
   1652 		 */
   1653 
   1654 		org_msglen = OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz;
   1655 		nodeid = OPSRSM_MESSAGE_HDRPTR(mp)->nodeid;
   1656 		portnum = OPSRSM_MESSAGE_HDRPTR(mp)->lportnum;
   1657 		if (OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz < bytecount) {
   1658 			bytecount = OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz;
   1659 		}
   1660 
   1661 		/*
   1662 		 * Initialize uio structure
   1663 		 */
   1664 		if (bytecount > 0) {
   1665 			uio_t phys_uio;
   1666 
   1667 			phys_uio.uio_iov = vptr;
   1668 			phys_uio.uio_iovcnt = (int)nvecs;
   1669 			phys_uio.uio_segflg = UIO_USERSPACE;
   1670 			phys_uio.uio_resid = (ssize_t)bytecount;
   1671 
   1672 			if (uiomove((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ,
   1673 				bytecount, UIO_READ, &phys_uio)) {
   1674 				DERR("ioctl_recvmsg: uiomove failed");
   1675 				err = EFAULT;
   1676 				freemsg(mp);
   1677 				mp = NULL;
   1678 				break;
   1679 			}
   1680 		}
   1681 
   1682 		opsrsmdev->opsrsm_packets_consumed++;
   1683 		freemsg(mp);
   1684 
   1685 		/*
   1686 		 * Return the number of bytes received and original message
   1687 		 * length.
   1688 		 */
   1689 #ifdef _MULTI_DATAMODEL
   1690 		if (model == DDI_MODEL_ILP32) {
   1691 			rm_ptr32[count].bytes_recvd = bytecount;
   1692 			rm_ptr32[count].msglen = org_msglen;
   1693 			rm_ptr32[count].portnum = portnum;
   1694 			rm_ptr32[count].nodeid = nodeid;
   1695 
   1696 			if (ddi_copyout((caddr_t *)&(rm_ptr32[count]),
   1697 			    (caddr_t)&((((rsmrdt_recvmsg32_t *)
   1698 			    (io_args32.msg_iov))[count])),
   1699 			    sizeof (rsmrdt_recvmsg32_t), mode)
   1700 			    != DDI_SUCCESS) {
   1701 				DERR("ioctl_recvmsgs:unable to copyout rdata");
   1702 				err = EFAULT;
   1703 				break;
   1704 			}
   1705 		} else
   1706 #endif /* _MULTI_DATAMODEL */
   1707 		{
   1708 			rm_ptr[count].bytes_recvd = bytecount;
   1709 			rm_ptr[count].msglen = org_msglen;
   1710 			rm_ptr[count].portnum = portnum;
   1711 			rm_ptr[count].nodeid = nodeid;
   1712 
   1713 			if (ddi_copyout((caddr_t *)&(rm_ptr[count]),
   1714 			    (caddr_t)&((((rdt_recvmsg_t *)
   1715 			    (io_args.msg_iov))[count])),
   1716 			    sizeof (rdt_recvmsg_t), mode) != DDI_SUCCESS) {
   1717 				DERR("ioctl_recvmsgs:unable to copyout rdata");
   1718 				err = EFAULT;
   1719 				break;
   1720 			}
   1721 		}
   1722 
   1723 		/*
   1724 		 * Increment the received messages count
   1725 		 */
   1726 		count++;
   1727 
   1728 	} while (OPSRSM_Q_LEN(&rp->rs_recvq) > 0 && count < nmsgs);
   1729 
   1730 	atomic_add_32(&opsrsm_pending_bytes, -total_bytes);
   1731 	/*
   1732 	 * Copy out the number of messages received
   1733 	 */
   1734 done:;
   1735 #ifdef _MULTI_DATAMODEL
   1736 	if (model == DDI_MODEL_ILP32) {
   1737 		if (ddi_copyout((caddr_t)&count,
   1738 		    (caddr_t)&(((rsmrdt_recvmsgs_arg32_t *)arg)->msgcnt),
   1739 		    sizeof (uint32_t), mode) != DDI_SUCCESS) {
   1740 			DERR("ioctl_recvmsgs: unable to copyout buffer count");
   1741 			err = EFAULT;
   1742 		}
   1743 	} else
   1744 #endif /* _MULTI_DATAMODEL */
   1745 	if (ddi_copyout((caddr_t)&count,
   1746 	    (caddr_t)&(((rsmrdt_recvmsgs_arg_t *)arg)->msgcnt),
   1747 	    sizeof (uint32_t), mode) != DDI_SUCCESS) {
   1748 		DERR("ioctl_recvmsgs: unable to copyout buffer count");
   1749 		err = EFAULT;
   1750 	}
   1751 
   1752 	return (err);
   1753 }
   1754 
   1755 /*
   1756  * This call is used to set some per-endpoint parameters.
   1757  * o per fd protection key
   1758  *
   1759  */
   1760 static int
   1761 opsrsm_ioctl_setparam(opsrsmresource_t *rp, intptr_t arg, int mode)
   1762 {
   1763 	void 		*value;
   1764 	rsmrdt_getsetparam_arg_t io_args;
   1765 	int 		error = RSM_SUCCESS;
   1766 #ifdef _MULTI_DATAMODEL
   1767 	rsmrdt_getsetparam_arg32_t io_args32;
   1768 
   1769 	model_t model = ddi_model_convert_from(mode & FMODELS);
   1770 
   1771 	/*
   1772 	 * Copy in the setparam ioctl arg structure
   1773 	 */
   1774 	if (model == DDI_MODEL_ILP32) {
   1775 		ddi_copyin((caddr_t)arg, (caddr_t)&io_args32,
   1776 		    sizeof (io_args32), mode);
   1777 		io_args.cmd = io_args32.cmd;
   1778 		io_args.size = io_args32.size;
   1779 	} else
   1780 #endif /* _MULTI_DATAMODEL */
   1781 	/*
   1782 	 * Copy in the setparam ioctl arg structure
   1783 	 */
   1784 	(void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args),
   1785 	    mode);
   1786 
   1787 	value = (void *)kmem_zalloc(io_args.size, KM_NOSLEEP);
   1788 	if (value == NULL) {
   1789 		DERR("ioctl_setparam: kmem_zalloc failed");
   1790 		return (ENOMEM);
   1791 	}
   1792 
   1793 #ifdef _MULTI_DATAMODEL
   1794 	if (model == DDI_MODEL_ILP32) {
   1795 		if (ddi_copyin((caddr_t)io_args32.value, (caddr_t)value,
   1796 		    io_args32.size, mode)) {
   1797 			kmem_free((void *)value, io_args32.size);
   1798 			DERR("ioctl_setparam: cannot copy args");
   1799 			return (EFAULT);
   1800 		}
   1801 	} else
   1802 #endif /* _MULTI_DATAMODEL */
   1803 	if (ddi_copyin((caddr_t)io_args.value, (caddr_t)value,
   1804 	    io_args.size, mode)) {
   1805 		kmem_free((void *)value, io_args.size);
   1806 		DERR("ioctl_setparam: cannot copy args");
   1807 		return (EFAULT);
   1808 	}
   1809 
   1810 	switch (io_args.cmd) {
   1811 	case RDT_MAXMSGSIZE:
   1812 		/*
   1813 		 * Buffer length must be multiple of 64 (0x40) and
   1814 		 * must be between 64 and 64k bytes.
   1815 		 * Add the cache line size.
   1816 		 */
   1817 		if (((*(uint_t *)value & ~OPSRSM_CACHELINE_MASK) == 0) &&
   1818 		    (*(int *)value > 0) &&
   1819 		    (*(uint_t *)value <= OPSRSM_MAX_BUFFER_SIZE_DFLT)) {
   1820 			opsrsmdev->opsrsm_param.opsrsm_buffer_size =
   1821 				*(uint_t *)value + OPSRSM_CACHELINE_SIZE;
   1822 			D1("ioctl_setparam: MTU sz 0x%x", *(uint_t *)value);
   1823 		} else {
   1824 			DERR("ioctl_setparam: invalid MTU sz\n");
   1825 			error = EINVAL;
   1826 		}
   1827 		break;
   1828 	case RDT_PROTECTION_KEY:
   1829 		/*
   1830 		 * Set the pkey
   1831 		 */
   1832 		rp->rs_pkey = *(uint32_t *)value;
   1833 		D1("ioctl_setparam: pkey 0x%x", rp->rs_pkey);
   1834 		break;
   1835 	default:
   1836 		DERR("ioctl_setparam: invalid cmd\n");
   1837 		error = EINVAL;
   1838 	}
   1839 
   1840 	kmem_free((void *)value, io_args.size);
   1841 	return (error);
   1842 }
   1843 
   1844 static int
   1845 opsrsm_ioctl_getparam(opsrsmresource_t *rp, intptr_t arg, int mode)
   1846 {
   1847 	void 		*value;
   1848 	rsmrdt_getsetparam_arg_t io_args;
   1849 #ifdef _MULTI_DATAMODEL
   1850 	rsmrdt_getsetparam_arg32_t io_args32;
   1851 
   1852 	model_t model = ddi_model_convert_from(mode & FMODELS);
   1853 
   1854 	/*
   1855 	 * Copy in the getparam ioctl arg structure
   1856 	 */
   1857 	if (model == DDI_MODEL_ILP32) {
   1858 		ddi_copyin((caddr_t)arg, (caddr_t)&io_args32,
   1859 		    sizeof (io_args32), mode);
   1860 		io_args.cmd = io_args32.cmd;
   1861 		io_args.size = io_args32.size;
   1862 	} else
   1863 #endif /* _MULTI_DATAMODEL */
   1864 	/*
   1865 	 * Copy in the getparam ioctl arg structure
   1866 	 */
   1867 	(void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args),
   1868 	    mode);
   1869 
   1870 	value = (void *)kmem_zalloc(io_args.size, KM_NOSLEEP);
   1871 	if (value == NULL) {
   1872 		DERR("ioctl_getparam: kmem_zalloc failed");
   1873 		return (ENOMEM);
   1874 	}
   1875 
   1876 #ifdef _MULTI_DATAMODEL
   1877 	if (model == DDI_MODEL_ILP32) {
   1878 		if (ddi_copyin((caddr_t)io_args32.value, (caddr_t)value,
   1879 		    io_args32.size, mode)) {
   1880 			kmem_free((void *)value, io_args32.size);
   1881 			DERR("ioctl_getparam: cannot copy args");
   1882 			return (EFAULT);
   1883 		}
   1884 	} else
   1885 #endif /* _MULTI_DATAMODEL */
   1886 	if (ddi_copyin((caddr_t)io_args.value, (caddr_t)value,
   1887 	    io_args.size, mode)) {
   1888 		kmem_free((void *)value, io_args.size);
   1889 		DERR("ioctl_getparam: cannot copy args");
   1890 		return (EFAULT);
   1891 	}
   1892 
   1893 	switch (io_args.cmd) {
   1894 	case RDT_MAXMSGSIZE:
   1895 		/*
   1896 		 * Get the message size
   1897 		 * Subtract the cache line size.
   1898 		 */
   1899 #ifdef _MULTI_DATAMODEL
   1900 		if (model == DDI_MODEL_ILP32) {
   1901 			*(uint_t *)value =
   1902 			    opsrsmdev->opsrsm_param.opsrsm_buffer_size -
   1903 			    OPSRSM_CACHELINE_SIZE;
   1904 			D1("ioctl_getparam: MTU sz 0x%x", *(uint_t *)value);
   1905 		} else
   1906 #endif /* _MULTI_DATAMODEL */
   1907 		{
   1908 			*(size_t *)value =
   1909 			    opsrsmdev->opsrsm_param.opsrsm_buffer_size -
   1910 			    OPSRSM_CACHELINE_SIZE;
   1911 
   1912 			D1("ioctl_getparam: MTU sz 0x%x", *(size_t *)value);
   1913 		}
   1914 		break;
   1915 	case RDT_PROTECTION_KEY:
   1916 		/*
   1917 		 * Get the protection key
   1918 		 */
   1919 		*(uint32_t *)value = rp->rs_pkey;
   1920 
   1921 		D1("ioctl_getparam: pkey 0x%x", *(uint32_t *)value);
   1922 		break;
   1923 	default:
   1924 		DERR("ioctl_getparam: invalid cmd\n");
   1925 		kmem_free((void *)value, io_args.size);
   1926 		return (EINVAL);
   1927 	}
   1928 
   1929 #ifdef _MULTI_DATAMODEL
   1930 	if (model == DDI_MODEL_ILP32) {
   1931 		if (ddi_copyout((caddr_t)value, (caddr_t)io_args32.value,
   1932 		    io_args32.size, mode) != DDI_SUCCESS) {
   1933 			kmem_free((void *)value, io_args32.size);
   1934 			DERR("ioctl_getparam: unable to copyout value");
   1935 			return (EFAULT);
   1936 		}
   1937 	} else
   1938 #endif /* _MULTI_DATAMODEL */
   1939 	if (ddi_copyout((caddr_t)value, (caddr_t)io_args.value,
   1940 	    io_args.size, mode) != DDI_SUCCESS) {
   1941 		kmem_free((void *)value, io_args.size);
   1942 		DERR("ioctl_getparam: unable to copyout value");
   1943 		return (EFAULT);
   1944 	}
   1945 	kmem_free((void *)value, io_args.size);
   1946 	return (0);
   1947 }
   1948 
   1949 /*
   1950  * Free minor resource
   1951  */
   1952 static int
   1953 opsrsm_resstruct_free(minor_t rnum)
   1954 {
   1955 	opsrsmresource_t	*rp;
   1956 	opsrsm_queue_t		*q;
   1957 	int32_t			total_bytes = 0;
   1958 
   1959 	/*
   1960 	 * remove resource from global table
   1961 	 */
   1962 	rp = opsrsmresource_free(rnum);
   1963 	if (rp == NULL) {
   1964 		return (DDI_FAILURE);
   1965 	}
   1966 	/*
   1967 	 * check if refcnt is 0 before we destroy rp. if it
   1968 	 * is non-zero, we need to wait until it becomes zero.
   1969 	 */
   1970 	mutex_enter(&rp->rs_lock);
   1971 	while (rp->rs_refcnt > 0) {
   1972 		cv_wait(&rp->rs_close_cv, &rp->rs_lock);
   1973 	}
   1974 	/*
   1975 	 * we can be sure that no other thread can increment
   1976 	 * rp->rs_refcnt since we already removed rp from the
   1977 	 * global table.
   1978 	 */
   1979 	ASSERT(rp->rs_refcnt == 0);
   1980 	mutex_exit(&rp->rs_lock);
   1981 	/*
   1982 	 * at this point, no other thread has a reference to
   1983 	 * rp. we can safely cleanup rp without holding
   1984 	 * rs_lock.
   1985 	 */
   1986 
   1987 	/*
   1988 	 * flush all remaining messages in the recvq
   1989 	 */
   1990 	q = &rp->rs_recvq;
   1991 	while ((q)->q_head != NULL) {
   1992 		mblk_t *mp;
   1993 
   1994 		mp = (q)->q_head;
   1995 		total_bytes += MBLKL(mp);
   1996 		(q)->q_head = mp->b_next;
   1997 		mp->b_prev = mp->b_next = NULL;
   1998 		mp->b_cont = NULL;
   1999 		freemsg(mp);
   2000 	}
   2001 	(q)->q_tail = NULL;
   2002 	(q)->q_len = 0;
   2003 	atomic_add_32(&opsrsm_pending_bytes, -total_bytes);
   2004 
   2005 	/*
   2006 	 * if rs_dest is still valid, we need to release our
   2007 	 * reference to it.
   2008 	 */
   2009 	if (rp->rs_dest != NULL && !OPSRSM_IS_LOOPBACK(rp->rs_dest) &&
   2010 	    (rp->rs_state & OPSRSM_RS_FAILOVER) == 0 &&
   2011 	    (rp->rs_state & OPSRSM_RS_REFDEST) != 0) {
   2012 		rp->rs_state &= ~OPSRSM_RS_REFDEST;
   2013 		if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) {
   2014 			if (rp->rs_dest->rd_adapter->sel_cnt > 0)
   2015 				rp->rs_dest->rd_adapter->sel_cnt--;
   2016 		}
   2017 		UNREFDEST(rp->rs_dest);
   2018 	}
   2019 
   2020 	/*
   2021 	 * cleanup and free the resource structure
   2022 	 */
   2023 	if (rp->rs_pollhd.ph_list != NULL)
   2024 		pollhead_clean(&rp->rs_pollhd);
   2025 	mutex_destroy(&rp->rs_lock);
   2026 	cv_destroy(&rp->rs_cv);
   2027 	cv_destroy(&rp->rs_conn_cv);
   2028 	cv_destroy(&rp->rs_close_cv);
   2029 	kmem_free((void *)rp->rs_rmptr, opsrsmdev->
   2030 	    opsrsm_param.opsrsm_max_recv_msgs * sizeof (rdt_recvmsg_t));
   2031 	kmem_free((void *)rp, sizeof (*rp));
   2032 
   2033 	return (DDI_SUCCESS);
   2034 }
   2035 
   2036 
   2037 /*
   2038 * Allocate a resource struct
   2039 */
   2040 static opsrsmresource_t *
   2041 opsrsm_resstruct_alloc()
   2042 {
   2043 	opsrsmresource_t *rp;
   2044 
   2045 	rp = (opsrsmresource_t *)kmem_zalloc(sizeof (*rp), KM_SLEEP);
   2046 	if (rp == NULL) {
   2047 		DERR("opsrsm_resstruct_alloc: kmem_zalloc failed");
   2048 		return (NULL);
   2049 	}
   2050 
   2051 	rp->rs_rmptr = (void *)kmem_zalloc(
   2052 			opsrsmdev->opsrsm_param.opsrsm_max_recv_msgs *
   2053 			sizeof (rdt_recvmsg_t), KM_SLEEP);
   2054 	if (rp->rs_rmptr == NULL) {
   2055 		DERR("opsrsm_resstruct_alloc: kmem_zalloc failed");
   2056 		kmem_free((void *)rp, sizeof (*rp));
   2057 		return (NULL);
   2058 	}
   2059 
   2060 	rp->rs_events = 0;
   2061 	rp->rs_nodeid = 0;
   2062 	rp->rs_lportnum = 0;
   2063 	rp->rs_rportnum = 0;
   2064 	rp->rs_dest = NULL;
   2065 	rp->rs_pollhd.ph_list = NULL;
   2066 	rp->rs_refcnt = 0;
   2067 	rp->rs_state = 0;
   2068 	rp->rs_poll_index = -1;
   2069 	rp->rs_pkey = 0;
   2070 	rp->rs_local_skey = 0;
   2071 	OPSRSM_Q_INIT(&rp->rs_recvq);
   2072 
   2073 	mutex_init(&rp->rs_lock, NULL, MUTEX_DRIVER, NULL);
   2074 	cv_init(&rp->rs_cv, NULL, CV_DRIVER, NULL);
   2075 	cv_init(&rp->rs_conn_cv, NULL, CV_DRIVER, NULL);
   2076 	cv_init(&rp->rs_close_cv, NULL, CV_DRIVER, NULL);
   2077 
   2078 	return (rp);
   2079 }
   2080 
   2081 /*ARGSUSED*/
   2082 static int
   2083 opsrsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred,
   2084 	int *rvalp)
   2085 {
   2086 	int error = RSM_SUCCESS;
   2087 	opsrsmresource_t *rp;
   2088 	minor_t rnum;
   2089 
   2090 	rnum = getminor(dev);
   2091 	rp = opsrsmresource_lookup(rnum, OPSRSM_RO_DEFAULT);
   2092 	if (rp == NULL) {
   2093 		return (ENXIO);
   2094 	}
   2095 
   2096 	D1("opsrsm_ioctl: rnum = %d ; rp = %p", rnum, rp);
   2097 
   2098 	switch (cmd) {
   2099 	case RSMRDT_IOCTL_BIND:
   2100 		error = opsrsm_ioctl_bind(rp, arg, mode);
   2101 		break;
   2102 	case RSMRDT_IOCTL_CONNECT:
   2103 		error = opsrsm_ioctl_connect(rp, arg, mode);
   2104 		break;
   2105 	case RSMRDT_IOCTL_SENDMSG:
   2106 		error = opsrsm_ioctl_sendmsg(rp, arg, mode);
   2107 		break;
   2108 	case RSMRDT_IOCTL_RECVMSGS:
   2109 		error = opsrsm_ioctl_recvmsgs(rp, arg, mode);
   2110 		break;
   2111 	case RSMRDT_IOCTL_GETPARAM:
   2112 		error = opsrsm_ioctl_getparam(rp, arg, mode);
   2113 		break;
   2114 	case RSMRDT_IOCTL_SETPARAM:
   2115 		error = opsrsm_ioctl_setparam(rp, arg, mode);
   2116 		break;
   2117 	case RSMRDT_IOCTL_GETNODEID:
   2118 		error = opsrsm_ioctl_getnodeid(rp, arg, mode);
   2119 		break;
   2120 	default:
   2121 		DERR("opsrsm_ioctl: cmd not supported\n");
   2122 		error = DDI_FAILURE;
   2123 	}
   2124 	return (error);
   2125 }
   2126 
   2127 /* ********************* Driver Open/Close/Poll *************** */
   2128 
   2129 /* ARGSUSED */
   2130 static int
   2131 opsrsm_open(dev_t *devp, int flag, int otyp, struct cred *cred)
   2132 {
   2133 	minor_t rnum;
   2134 	opsrsmresource_t *rp;
   2135 
   2136 	/*
   2137 	 * Char only
   2138 	 */
   2139 	if (otyp != OTYP_CHR) {
   2140 		return (EINVAL);
   2141 	}
   2142 
   2143 	rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER);
   2144 	if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) {
   2145 		DERR("opsrsm_open: Unloading in progress");
   2146 		rw_exit(&opsrsm_resource.opsrsmrct_lock);
   2147 		return (ENODEV);
   2148 	}
   2149 	rw_exit(&opsrsm_resource.opsrsmrct_lock);
   2150 
   2151 	/*
   2152 	 * Only zero can be opened, clones are used for resources.
   2153 	 */
   2154 	if (getminor(*devp) != OPSRSM_DRIVER_MINOR) {
   2155 		DERR("opsrsm_open: bad minor %d\n", getminor(*devp));
   2156 		return (ENODEV);
   2157 	}
   2158 
   2159 	/*
   2160 	 * - allocate new minor number
   2161 	 * - update devp argument to new device
   2162 	 */
   2163 	if ((rp = opsrsmresource_alloc(&rnum)) != NULL) {
   2164 		*devp = makedevice(getmajor(*devp), rnum);
   2165 		rp->rs_lportnum = rnum;
   2166 	} else {
   2167 		return (ENOMEM);
   2168 	}
   2169 
   2170 	return (DDI_SUCCESS);
   2171 }
   2172 
   2173 /* ARGSUSED */
   2174 static int
   2175 opsrsm_close(dev_t dev, int flag, int otyp, struct cred *cred)
   2176 {
   2177 	minor_t rnum = getminor(dev);
   2178 
   2179 	/*
   2180 	 * Char only
   2181 	 */
   2182 	if (otyp != OTYP_CHR) {
   2183 		return (EINVAL);
   2184 	}
   2185 	D1("opsrsm_close: rnum = %d", rnum);
   2186 
   2187 	/*
   2188 	 * remove resource from resource table and destroy resource
   2189 	 */
   2190 	if (opsrsm_resstruct_free(rnum) != DDI_SUCCESS) {
   2191 		DERR("opsrsm_close: cannot free resource structure\n");
   2192 		return (DDI_FAILURE);
   2193 	}
   2194 	return (DDI_SUCCESS);
   2195 }
   2196 
   2197 /*ARGSUSED*/
   2198 static int
   2199 opsrsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
   2200 	struct pollhead **phpp)
   2201 {
   2202 	opsrsmresource_t *rp;
   2203 	minor_t rnum;
   2204 	int error = 0;
   2205 
   2206 	rnum = getminor(dev);
   2207 	rp = opsrsmresource_lookup(rnum, OPSRSM_RO_DEFAULT);
   2208 	if (rp == NULL) {
   2209 		return (ENXIO);
   2210 	}
   2211 
   2212 	D1("opsrsm_chpoll: rnum = %d : rp = %p\n", rnum, rp);
   2213 
   2214 	*reventsp = 0;
   2215 
   2216 	/*
   2217 	 * Valid device events are:
   2218 	 * POLLIN | POLLRDNORM | POLLOUT | POLLERR
   2219 	 */
   2220 	if ((events & POLLIN) != 0) {
   2221 		mutex_enter(&rp->rs_lock);
   2222 		if ((events & POLLOUT) != 0) {
   2223 			error = ENOTSUP;
   2224 			mutex_exit(&rp->rs_lock);
   2225 			goto done;
   2226 		}
   2227 		if ((rp->rs_events & POLLIN) != 0 ||
   2228 		    OPSRSM_Q_LEN(&rp->rs_recvq) > 0) {
   2229 			*reventsp = POLLIN;
   2230 		} else {
   2231 			if (!anyyet) {
   2232 				*phpp = &rp->rs_pollhd;
   2233 			}
   2234 		}
   2235 		mutex_exit(&rp->rs_lock);
   2236 	} else if ((events & POLLOUT) != 0) {
   2237 		boolean_t not_full;
   2238 		opsrsm_dest_t *rd;
   2239 		int isdel;
   2240 
   2241 		if (OPSRSM_IS_LOOPBACK(rp->rs_dest)) {
   2242 			*reventsp = POLLOUT;
   2243 			goto done;
   2244 		}
   2245 		if (rp->rs_dest == NULL) {
   2246 			error = ENOTCONN;
   2247 			goto done;
   2248 		}
   2249 
   2250 		mutex_enter(&rp->rs_lock);
   2251 		if ((rp->rs_state & OPSRSM_RS_FAILOVER) != 0) {
   2252 			*reventsp = POLLOUT;
   2253 			mutex_exit(&rp->rs_lock);
   2254 			goto done;
   2255 		}
   2256 		rd = rp->rs_dest;
   2257 		REFDEST(rd, isdel);
   2258 		if (isdel != 0) {
   2259 			error = ENETDOWN;
   2260 			mutex_exit(&rp->rs_lock);
   2261 			goto done;
   2262 		}
   2263 		mutex_exit(&rp->rs_lock);
   2264 
   2265 		mutex_enter(&rd->rd_sendq_lock);
   2266 		not_full = (OPSRSM_Q_LEN(&rd->rd_sendq) < opsrsmdev->
   2267 		    opsrsm_param.opsrsm_max_queued_pkts);
   2268 		mutex_exit(&rd->rd_sendq_lock);
   2269 
   2270 		if (not_full) {
   2271 			*reventsp = POLLOUT;
   2272 		} else {
   2273 			if (!anyyet) {
   2274 				if (rd->rd_dstate == 0)
   2275 					*phpp = &rd->rd_pollhd;
   2276 			}
   2277 		}
   2278 		UNREFDEST(rd);
   2279 	} else if ((events & POLLERR) != 0) {
   2280 		error = ENOTSUP;
   2281 	}
   2282 
   2283 done:;
   2284 	return (error);
   2285 }
   2286 
   2287 
   2288 
   2289 /*
   2290  * Undo tasks done by opsrsmattach(), either because we're detaching or because
   2291  * attach() got partly done then failed.  progress is a bitmap that tells
   2292  * us what has been done so far.
   2293  */
   2294 static void
   2295 opsrsmtakedown(
   2296 	opsrsm_t *opsrsmp,	/* OPSRSM device (RSM controller) pointer */
   2297 	int progress)	/* Mask of RSMPI_ATT_xxx values */
   2298 {
   2299 	int instance;
   2300 	dev_info_t *dip;
   2301 
   2302 	D1("opsrsmtakedown: opsrsmp 0x%p, progress 0x%x",
   2303 	    (void *)opsrsmp, progress);
   2304 
   2305 	ASSERT(opsrsmp);
   2306 
   2307 	dip = opsrsmp->opsrsm_dip;
   2308 	instance = ddi_get_instance(dip);
   2309 
   2310 	if (progress & OPSRSM_ATT_KSTAT) {
   2311 		opsrsmkstatremove(opsrsmp);
   2312 		progress &= ~OPSRSM_ATT_KSTAT;
   2313 	}
   2314 
   2315 	if (progress & OPSRSM_ATT_MINOR) {
   2316 		ddi_remove_minor_node(dip, NULL);
   2317 		progress &= ~OPSRSM_ATT_MINOR;
   2318 	}
   2319 
   2320 	ASSERT(progress == 0);
   2321 
   2322 	ddi_soft_state_free(opsrsm_state, instance);
   2323 
   2324 	D1("opsrsmtakedown: returning DDI_SUCCESS");
   2325 }
   2326 
   2327 
   2328 /*
   2329  * ****************************************************************
   2330  *                                                               *
   2331  * E N D   BASIC MODULE BOILERPLATE                              *
   2332  *                                                               *
   2333  * ****************************************************************
   2334  */
   2335 
   2336 
   2337 /*
   2338  * ****************************************************************
   2339  *                                                               *
   2340  * B E G I N   STATUS REPORTING STUFF                            *
   2341  *                                                               *
   2342  * ****************************************************************
   2343  */
   2344 
   2345 /*
   2346  * This routine makes the data in our kernel statistics structure reflect
   2347  * the current state of the device; it's called whenever a user requests
   2348  * the kstat data.  Basically, all we do is copy the stats from the RSMPI
   2349  * controller structure, where they're maintained, to the kstat's data
   2350  * portion.
   2351  */
   2352 static int
   2353 opsrsmstat_kstat_update(
   2354 	kstat_t *ksp,	/* Pointer to kstat that will be updated */
   2355 	int rw)		/* Indicates read or write (we don't support write) */
   2356 {
   2357 	opsrsm_t *opsrsmp;
   2358 	opsrsm_stat_t *opsrsmsp;
   2359 
   2360 	if (rw == KSTAT_WRITE)
   2361 		return (EACCES);
   2362 
   2363 	opsrsmp = (opsrsm_t *)ksp->ks_private;
   2364 	opsrsmsp = (opsrsm_stat_t *)ksp->ks_data;
   2365 
   2366 	opsrsmsp->rsm_ipackets.value.ul = (uint_t)opsrsmp->opsrsm_ipackets;
   2367 	opsrsmsp->rsm_ipackets64.value.ui64 = opsrsmp->opsrsm_ipackets;
   2368 	opsrsmsp->rsm_ierrors.value.ul = opsrsmp->opsrsm_ierrors;
   2369 	opsrsmsp->rsm_opackets.value.ul = (uint_t)opsrsmp->opsrsm_opackets;
   2370 	opsrsmsp->rsm_opackets64.value.ui64 = opsrsmp->opsrsm_opackets;
   2371 	opsrsmsp->rsm_oerrors.value.ul = opsrsmp->opsrsm_oerrors;
   2372 	opsrsmsp->rsm_collisions.value.ul = opsrsmp->opsrsm_collisions;
   2373 
   2374 	opsrsmsp->rsm_xfers.value.ul = opsrsmp->opsrsm_xfers;
   2375 	opsrsmsp->rsm_xfer_pkts.value.ul = opsrsmp->opsrsm_xfer_pkts;
   2376 	opsrsmsp->rsm_syncdqes.value.ul = opsrsmp->opsrsm_syncdqes;
   2377 	opsrsmsp->rsm_lbufs.value.ul = opsrsmp->opsrsm_lbufs;
   2378 	opsrsmsp->rsm_nlbufs.value.ul = opsrsmp->opsrsm_nlbufs;
   2379 	opsrsmsp->rsm_pullup.value.ul = opsrsmp->opsrsm_pullup;
   2380 	opsrsmsp->rsm_pullup_fail.value.ul = opsrsmp->opsrsm_pullup_fail;
   2381 	opsrsmsp->rsm_starts.value.ul = opsrsmp->opsrsm_starts;
   2382 	opsrsmsp->rsm_start_xfers.value.ul = opsrsmp->opsrsm_start_xfers;
   2383 	opsrsmsp->rsm_fqetmo_hint.value.ul = opsrsmp->opsrsm_fqetmo_hint;
   2384 	opsrsmsp->rsm_fqetmo_drops.value.ul = opsrsmp->opsrsm_fqetmo_drops;
   2385 	opsrsmsp->rsm_no_fqes.value.ul = opsrsmp->opsrsm_no_fqes;
   2386 	opsrsmsp->rsm_pending_writes.value.ul = opsrsmp->opsrsm_pending_writes;
   2387 	opsrsmsp->rsm_pkts_queued.value.ul = opsrsmp->opsrsm_pkts_queued;
   2388 	opsrsmsp->rsm_pkts_discarded.value.ul = opsrsmp->opsrsm_pkts_discarded;
   2389 	opsrsmsp->rsm_pkts_pending.value.ul = opsrsmp->opsrsm_pkts_pending;
   2390 	opsrsmsp->rsm_last_sendq_len.value.ul = opsrsmp->opsrsm_last_sendq_len;
   2391 	opsrsmsp->rsm_last_pendq_len.value.ul = opsrsmp->opsrsm_last_pendq_len;
   2392 	opsrsmsp->rsm_last_wr_comp.value.ul = opsrsmp->opsrsm_last_wr_comp;
   2393 	opsrsmsp->rsm_errs.value.ul = opsrsmp->opsrsm_errs;
   2394 	opsrsmsp->rsm_in_bytes.value.ul = (uint_t)opsrsmp->opsrsm_in_bytes;
   2395 	opsrsmsp->rsm_in_bytes64.value.ui64 = opsrsmp->opsrsm_in_bytes;
   2396 	opsrsmsp->rsm_out_bytes.value.ul = (uint_t)opsrsmp->opsrsm_out_bytes;
   2397 	opsrsmsp->rsm_out_bytes64.value.ui64 = opsrsmp->opsrsm_out_bytes;
   2398 	opsrsmsp->rsm_intr_send_errs.value.ul = opsrsmp->opsrsm_intr_send_errs;
   2399 	opsrsmsp->rsm_max_batch_size.value.ul = opsrsmp->opsrsm_max_batch_size;
   2400 	opsrsmsp->rsm_min_batch_size.value.ul = opsrsmp->opsrsm_min_batch_size;
   2401 	opsrsmsp->rsm_put_fqes.value.ul = opsrsmp->opsrsm_put_fqes;
   2402 	opsrsmsp->rsm_queued_fqes.value.ul = opsrsmp->opsrsm_queued_fqes;
   2403 	opsrsmsp->rsm_packets_consumed.value.ul =
   2404 		opsrsmp->opsrsm_packets_consumed;
   2405 	return (0);
   2406 }
   2407 
   2408 /*
   2409  * This routine initializes the kernel statistics structures for an
   2410  * OPSRSM device.
   2411  */
   2412 static void
   2413 opsrsmkstatinit(
   2414 	opsrsm_t *opsrsmp)	/* OPSRSM device (RSM controller) pointer */
   2415 {
   2416 	struct kstat *ksp;
   2417 	opsrsm_stat_t *opsrsmsp;
   2418 
   2419 	/*
   2420 	 * We create a kstat for the device, then create a whole bunch of
   2421 	 * named stats inside that first kstat.
   2422 	 */
   2423 	if ((ksp = kstat_create("rsmrdt", ddi_get_instance(opsrsmp->opsrsm_dip),
   2424 	    NULL, "net", KSTAT_TYPE_NAMED, sizeof (opsrsm_stat_t) /
   2425 	    sizeof (kstat_named_t), 0)) == NULL) {
   2426 		opsrsmerror(opsrsmp->opsrsm_dip, "kstat_create failed");
   2427 		return;
   2428 	}
   2429 	opsrsmsp = (opsrsm_stat_t *)(ksp->ks_data);
   2430 
   2431 	/*
   2432 	 * The first five named stats we create have well-known names, and are
   2433 	 * used by standard SunOS utilities (e.g., netstat).  (There is actually
   2434 	 * a sixth well-known stat, called "queue", which we don't support.)
   2435 	 */
   2436 	kstat_named_init(&opsrsmsp->rsm_ipackets, "ipackets", KSTAT_DATA_ULONG);
   2437 	kstat_named_init(&opsrsmsp->rsm_ierrors, "ierrors", KSTAT_DATA_ULONG);
   2438 	kstat_named_init(&opsrsmsp->rsm_opackets, "opackets", KSTAT_DATA_ULONG);
   2439 	kstat_named_init(&opsrsmsp->rsm_oerrors, "oerrors", KSTAT_DATA_ULONG);
   2440 	kstat_named_init(&opsrsmsp->rsm_collisions, "collisions",
   2441 	    KSTAT_DATA_ULONG);
   2442 
   2443 	/*
   2444 	 * MIB II kstat variables
   2445 	 */
   2446 	kstat_named_init(&opsrsmsp->rsm_in_bytes, "rbytes", KSTAT_DATA_ULONG);
   2447 	kstat_named_init(&opsrsmsp->rsm_out_bytes, "obytes", KSTAT_DATA_ULONG);
   2448 
   2449 	/*
   2450 	 * PSARC 1997/198
   2451 	 */
   2452 	kstat_named_init(&opsrsmsp->rsm_ipackets64, "ipackets64",
   2453 		KSTAT_DATA_ULONGLONG);
   2454 	kstat_named_init(&opsrsmsp->rsm_opackets64, "opackets64",
   2455 		KSTAT_DATA_ULONGLONG);
   2456 	kstat_named_init(&opsrsmsp->rsm_in_bytes64, "rbytes64",
   2457 		KSTAT_DATA_ULONGLONG);
   2458 	kstat_named_init(&opsrsmsp->rsm_out_bytes64, "obytes64",
   2459 		KSTAT_DATA_ULONGLONG);
   2460 
   2461 
   2462 	/*
   2463 	 * The remainder of the named stats are specific to our driver, and
   2464 	 * are extracted using the kstat utility.
   2465 	 */
   2466 	kstat_named_init(&opsrsmsp->rsm_xfers, "xfers", KSTAT_DATA_ULONG);
   2467 	kstat_named_init(&opsrsmsp->rsm_xfer_pkts, "xfer_pkts",
   2468 	    KSTAT_DATA_ULONG);
   2469 	kstat_named_init(&opsrsmsp->rsm_syncdqes, "syncdqes", KSTAT_DATA_ULONG);
   2470 	kstat_named_init(&opsrsmsp->rsm_lbufs, "lbufs", KSTAT_DATA_ULONG);
   2471 	kstat_named_init(&opsrsmsp->rsm_nlbufs, "nlbufs", KSTAT_DATA_ULONG);
   2472 	kstat_named_init(&opsrsmsp->rsm_pullup, "pullup", KSTAT_DATA_ULONG);
   2473 	kstat_named_init(&opsrsmsp->rsm_pullup_fail, "pullup_fail",
   2474 	    KSTAT_DATA_ULONG);
   2475 	kstat_named_init(&opsrsmsp->rsm_starts, "starts", KSTAT_DATA_ULONG);
   2476 	kstat_named_init(&opsrsmsp->rsm_start_xfers, "start_xfers",
   2477 	    KSTAT_DATA_ULONG);
   2478 	kstat_named_init(&opsrsmsp->rsm_fqetmo_hint, "fqetmo_hint",
   2479 	    KSTAT_DATA_ULONG);
   2480 	kstat_named_init(&opsrsmsp->rsm_fqetmo_drops, "fqetmo_drops",
   2481 	    KSTAT_DATA_ULONG);
   2482 	kstat_named_init(&opsrsmsp->rsm_no_fqes, "no_fqes",
   2483 	    KSTAT_DATA_ULONG);
   2484 	kstat_named_init(&opsrsmsp->rsm_pending_writes, "pending_writes",
   2485 	    KSTAT_DATA_ULONG);
   2486 	kstat_named_init(&opsrsmsp->rsm_pkts_queued, "pkts_queued",
   2487 	    KSTAT_DATA_ULONG);
   2488 	kstat_named_init(&opsrsmsp->rsm_pkts_discarded, "pkts_discarded",
   2489 	    KSTAT_DATA_ULONG);
   2490 	kstat_named_init(&opsrsmsp->rsm_pkts_pending, "pkts_pending",
   2491 	    KSTAT_DATA_ULONG);
   2492 	kstat_named_init(&opsrsmsp->rsm_last_sendq_len, "last_sendq_len",
   2493 	    KSTAT_DATA_ULONG);
   2494 	kstat_named_init(&opsrsmsp->rsm_last_pendq_len, "last_pendq_len",
   2495 	    KSTAT_DATA_ULONG);
   2496 	kstat_named_init(&opsrsmsp->rsm_last_wr_comp, "last_wr_comp",
   2497 	    KSTAT_DATA_ULONG);
   2498 	kstat_named_init(&opsrsmsp->rsm_errs, "errs", KSTAT_DATA_ULONG);
   2499 	kstat_named_init(&opsrsmsp->rsm_intr_send_errs, "intr_send_errs",
   2500 	    KSTAT_DATA_ULONG);
   2501 
   2502 	kstat_named_init(&opsrsmsp->rsm_min_batch_size, "min_batch_size",
   2503 	    KSTAT_DATA_ULONG);
   2504 	kstat_named_init(&opsrsmsp->rsm_max_batch_size, "max_batch_size",
   2505 	    KSTAT_DATA_ULONG);
   2506 	kstat_named_init(&opsrsmsp->rsm_put_fqes, "put_fqes",
   2507 	    KSTAT_DATA_ULONG);
   2508 	kstat_named_init(&opsrsmsp->rsm_queued_fqes, "queued_fqes",
   2509 	    KSTAT_DATA_ULONG);
   2510 	kstat_named_init(&opsrsmsp->rsm_packets_consumed, "packets_consumed",
   2511 	    KSTAT_DATA_ULONG);
   2512 
   2513 	ksp->ks_update = opsrsmstat_kstat_update;
   2514 	ksp->ks_private = (void *) opsrsmp;
   2515 	opsrsmp->opsrsm_ksp = ksp;
   2516 	kstat_install(ksp);
   2517 
   2518 }
   2519 
   2520 /*
   2521  * This routine removes any kstats we might have created.
   2522  */
   2523 static void
   2524 opsrsmkstatremove(
   2525 	opsrsm_t *opsrsmp)	/* OPSRSM device (RSM controller) pointer */
   2526 {
   2527 
   2528 	if (opsrsmp->opsrsm_ksp)
   2529 		kstat_delete(opsrsmp->opsrsm_ksp);
   2530 }
   2531 
   2532 /*
   2533  * Print an error message to the console.
   2534  */
   2535 static void
   2536 opsrsmerror(
   2537 	dev_info_t *dip,	/* Dev info for the device in question */
   2538 	const char *fmt,	/* Format of output */
   2539 	...)			/* Parameters for output */
   2540 {
   2541 	char name[16];
   2542 	char buff[1024];
   2543 	va_list ap;
   2544 
   2545 	if (dip) {
   2546 		(void) sprintf(name, "%s%d", ddi_get_name(dip),
   2547 			ddi_get_instance(dip));
   2548 	} else {
   2549 		(void) sprintf(name, "opsrsm");
   2550 	}
   2551 
   2552 	/* lint -e40 Undeclared identifier (__builtin_va_alist) */
   2553 	va_start(ap, fmt);
   2554 	/* lint +e40 */
   2555 	(void) vsprintf(buff, fmt, ap);
   2556 	va_end(ap);
   2557 
   2558 	D1("%s:\t%s", name, buff);
   2559 #ifdef DEBUG
   2560 	cmn_err(CE_CONT, "%s:\t%s", name, buff);
   2561 #endif /* DEBUG */
   2562 }
   2563 
   2564 
   2565 #ifdef DEBUG
   2566 
   2567 /*
   2568  * The following variables support the debug log buffer scheme.
   2569  */
   2570 
   2571 char opsrsmdbgbuf[0x80000];	/* The log buffer */
   2572 int opsrsmdbgsize = sizeof (opsrsmdbgbuf);	/* Size of the log buffer */
   2573 size_t opsrsmdbgnext;		/* Next byte to write in buffer (note */
   2574 				/*  this is an index, not a pointer */
   2575 int opsrsmdbginit = 0;		/* Nonzero if opsrsmdbglock's inited */
   2576 kmutex_t opsrsmdbglock;
   2577 
   2578 /*
   2579  * Add the string str to the end of the debug log, followed by a newline.
   2580  */
   2581 static void
   2582 opsrsmdbglog(char *str)
   2583 {
   2584 	size_t length, remlen;
   2585 
   2586 	/*
   2587 	 * If this is the first time we've written to the log, initialize it.
   2588 	 */
   2589 	if (!opsrsmdbginit) {
   2590 		mutex_enter(&opsrsmattlock);
   2591 		if (!opsrsmdbginit) {
   2592 			mutex_init(&opsrsmdbglock, NULL, MUTEX_DRIVER,
   2593 			    NULL);
   2594 			bzero(opsrsmdbgbuf, sizeof (opsrsmdbgbuf));
   2595 			opsrsmdbgnext = 0;
   2596 			opsrsmdbginit = 1;
   2597 		}
   2598 		mutex_exit(&opsrsmattlock);
   2599 	}
   2600 
   2601 	mutex_enter(&opsrsmdbglock);
   2602 
   2603 	/*
   2604 	 * Note the log is circular; if this string would run over the end,
   2605 	 * we copy the first piece to the end and then the last piece to
   2606 	 * the beginning of the log.
   2607 	 */
   2608 	length = strlen(str);
   2609 
   2610 	remlen = (size_t)sizeof (opsrsmdbgbuf) - opsrsmdbgnext;
   2611 
   2612 	if (length > remlen) {
   2613 		if (remlen)
   2614 			bcopy(str, opsrsmdbgbuf + opsrsmdbgnext, remlen);
   2615 		str += remlen;
   2616 		length -= remlen;
   2617 		opsrsmdbgnext = 0;
   2618 	}
   2619 
   2620 	bcopy(str, opsrsmdbgbuf + opsrsmdbgnext, length);
   2621 	opsrsmdbgnext += length;
   2622 
   2623 	if (opsrsmdbgnext >= sizeof (opsrsmdbgbuf))
   2624 		opsrsmdbgnext = 0;
   2625 	opsrsmdbgbuf[opsrsmdbgnext++] = '\n';
   2626 
   2627 	mutex_exit(&opsrsmdbglock);
   2628 }
   2629 
   2630 
   2631 /*
   2632  * Add a printf-style message to whichever debug logs we're currently using.
   2633  */
   2634 static void
   2635 opsrsmdebug(const char *fmt, ...)
   2636 {
   2637 	char buff[512];
   2638 	va_list ap;
   2639 
   2640 	/*lint -e40 Undeclared identifier (__builtin_va_alist) */
   2641 	va_start(ap, fmt);
   2642 	/*lint +e40 */
   2643 	(void) vsprintf(buff, fmt, ap);
   2644 	va_end(ap);
   2645 
   2646 	if (opsrsmdbgmode & 0x1)
   2647 		opsrsmdbglog(buff);
   2648 	if (opsrsmdbgmode & 0x2)
   2649 		cmn_err(CE_CONT, "%s\n", buff);
   2650 }
   2651 
   2652 static void
   2653 opsrsmconsole(const char *fmt, ...)
   2654 {
   2655 	char buff[512];
   2656 	va_list ap;
   2657 
   2658 	/*lint -e40 Undeclared identifier (__builtin_va_alist) */
   2659 	va_start(ap, fmt);
   2660 	/*lint +e40 */
   2661 	(void) vsprintf(buff, fmt, ap);
   2662 	va_end(ap);
   2663 
   2664 	cmn_err(CE_CONT, "%s", buff);
   2665 }
   2666 
   2667 #endif
   2668 
   2669 
   2670 /*
   2671  * ****************************************************************
   2672  *                                                               *
   2673  * E N D   STATUS REPORTING STUFF                                *
   2674  *                                                               *
   2675  * ****************************************************************
   2676  */
   2677 
   2678 
   2679 /*
   2680  * ****************************************************************
   2681  *                                                               *
   2682  * B E G I N   BASIC STREAMS OPERATIONS                          *
   2683  *                                                               *
   2684  * ****************************************************************
   2685  */
   2686 
   2687 
   2688 /*
   2689  * Write service routine.  This routine processes any messages put on the queue
   2690  * via a putq() in the write put routine.  It also handles any destinations put
   2691  * on the destination run queue.
   2692  */
   2693 static void
   2694 opsrsmwsrv(void *arg)
   2695 {
   2696 	adapter_t *adapterp;
   2697 	opsrsm_t *opsrsmp = opsrsmdev;
   2698 	opsrsm_dest_t *rd;
   2699 	int isdel = 0;
   2700 
   2701 	D5("opsrsmwsrv: time 0x%llx", gethrtime());
   2702 
   2703 	adapterp = (adapter_t *)arg;
   2704 
   2705 	/*
   2706 	 * rd's refcnt is incremented by GETRUNQ
   2707 	 */
   2708 	GETRUNQ(rd, isdel, adapterp);
   2709 	while (rd) {
   2710 		int oldstate, delete;
   2711 
   2712 		if (isdel) {
   2713 			D2("opsrsmwsrv: dest 0x%p being deleted, ignored",
   2714 			    (void *)rd);
   2715 			GETRUNQ(rd, isdel, adapterp);
   2716 			continue;
   2717 		}
   2718 
   2719 		mutex_enter(&rd->rd_lock);
   2720 		delete = 0;
   2721 
   2722 		oldstate = opsrsmgetstate(rd);
   2723 		D5("opsrsmwsrv: running state %s time 0x%llx",
   2724 		    OPSRSM_STATE_STR(oldstate), gethrtime());
   2725 		switch (oldstate) {
   2726 
   2727 		case OPSRSM_STATE_S_XFER: {
   2728 			cmn_err(CE_PANIC, "impossible state\n");
   2729 			break;
   2730 		}
   2731 
   2732 		case OPSRSM_STATE_S_REQ_CONNECT: {
   2733 			if (opsrsmcrexfer(opsrsmp, rd) != 0 ||
   2734 			    opsrsmsconn(opsrsmp, rd, 0) != 0) {
   2735 				opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2736 				delete = 1;
   2737 			}
   2738 			break;
   2739 		}
   2740 
   2741 		case OPSRSM_STATE_S_NEWCONN: {
   2742 			if (opsrsmcrexfer(opsrsmp, rd) != 0 ||
   2743 			    opsrsmconnxfer(opsrsmp, rd) != 0 ||
   2744 			    opsrsmsaccept(opsrsmp, rd) != 0) {
   2745 				opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2746 				delete = 1;
   2747 			}
   2748 			break;
   2749 		}
   2750 
   2751 		case OPSRSM_STATE_S_CONNXFER_ACCEPT: {
   2752 			if (opsrsmconnxfer(opsrsmp, rd) != 0 ||
   2753 			    opsrsmsaccept(opsrsmp, rd) != 0) {
   2754 				opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2755 				delete = 1;
   2756 			}
   2757 			break;
   2758 		}
   2759 
   2760 		case OPSRSM_STATE_S_CONNXFER_ACK: {
   2761 			if (opsrsmconnxfer(opsrsmp, rd) != 0 ||
   2762 			    opsrsmsack(rd) != 0) {
   2763 				opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2764 				delete = 1;
   2765 			}
   2766 			break;
   2767 		}
   2768 
   2769 		/*
   2770 		 * Delete this connection.  This causes a message
   2771 		 * to be sent to the remote side when RSM_SENDQ_DESTROY
   2772 		 * is called, so there is no need to send an additional
   2773 		 * message.
   2774 		 */
   2775 		case OPSRSM_STATE_S_DELETE: {
   2776 			opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2777 			delete = 1;
   2778 			break;
   2779 		}
   2780 
   2781 		/*
   2782 		 * Retry the SCONN.
   2783 		 */
   2784 		case OPSRSM_STATE_S_SCONN: {
   2785 			if (opsrsmsconn(opsrsmp, rd, 1) != 0) {
   2786 				opsrsmsetstate(rd, OPSRSM_STATE_DELETING);
   2787 				delete = 1;
   2788 			}
   2789 			break;
   2790 		}
   2791 
   2792 		default:
   2793 			D1("opsrsm: bad state %s in wsrv "
   2794 			    " for dest 0x%lx", OPSRSM_STATE_STR(oldstate),
   2795 			    (uintptr_t)rd);
   2796 			cmn_err(CE_PANIC, "opsrsm: bad state %s in wsrv "
   2797 			    " for dest 0x%lx", OPSRSM_STATE_STR(oldstate),
   2798 			    (uintptr_t)rd);
   2799 			break;
   2800 		}
   2801 
   2802 		mutex_exit(&rd->rd_lock);
   2803 
   2804 		if (delete)
   2805 			(void) opsrsmfreedest(adapterp, rd->rd_rsm_addr);
   2806 
   2807 		UNREFDEST(rd);
   2808 
   2809 		GETRUNQ(rd, isdel, adapterp);
   2810 	}
   2811 
   2812 	D1("opsrsmwsrv: returning");
   2813 }
   2814 
   2815 
   2816 
   2817 
   2818 /*
   2819  * ****************************************************************
   2820  *                                                               *
   2821  * E N D       BASIC STREAMS OPERATIONS                          *
   2822  *                                                               *
   2823  * ****************************************************************
   2824  */
   2825 
   2826 
   2827 /*
   2828  * ****************************************************************
   2829  *                                                               *
   2830  * B E G I N   NEW DATA TRANSFER LOGIC                           *
   2831  *                                                               *
   2832  * ****************************************************************
   2833  */
   2834 
   2835 static int
   2836 opsrsm_start_batch(opsrsm_dest_t *rd, uint32_t start_time)
   2837 {
   2838 	int err = 0;
   2839 
   2840 	switch (rd->rd_xmit_state) {
   2841 	case OPSRSM_XMIT_BARRIER_CLOSED:
   2842 	case OPSRSM_XMIT_RETRY_DATA:
   2843 		err = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj,
   2844 		    rd->rd_rxferhand, &rd->rd_barrier);
   2845 		ASSERT(err == RSM_SUCCESS);
   2846 		if (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_CLOSED) {
   2847 			rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_OPENED;
   2848 			rd->rd_start_time = start_time;
   2849 			rd->rd_data_collected = 0;
   2850 			rd->rd_writes_completed = 0;
   2851 			err = 0;
   2852 		}
   2853 		break;
   2854 	default:
   2855 		cmn_err(CE_PANIC, "invalid state = %d\n", rd->rd_xmit_state);
   2856 		break;
   2857 	}
   2858 	return (err);
   2859 }
   2860 
   2861 static int
   2862 opsrsm_end_batch(opsrsm_dest_t *rd)
   2863 {
   2864 	int err = 0;
   2865 	uint32_t qlen = 0;
   2866 
   2867 	switch (rd->rd_xmit_state) {
   2868 	case OPSRSM_XMIT_BARRIER_OPENED:
   2869 	case OPSRSM_XMIT_RETRY_DATA:
   2870 		err = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj,
   2871 		    &rd->rd_barrier);
   2872 		if (err != RSM_SUCCESS) {
   2873 			rd->rd_xmit_state = OPSRSM_XMIT_RETRY_DATA;
   2874 			break;
   2875 		}
   2876 		qlen = (uint32_t)OPSRSM_Q_LEN(&rd->rd_pendq);
   2877 		opsrsmputdqes(rd);
   2878 
   2879 		mutex_enter(&rd->rd_freeq_lock);
   2880 		OPSRSM_Q_CONCAT(&rd->rd_freeq, &rd->rd_pendq);
   2881 		mutex_exit(&rd->rd_freeq_lock);
   2882 
   2883 		opsrsmdev->opsrsm_xfers++;
   2884 		opsrsmdev->opsrsm_xfer_pkts += qlen;
   2885 		opsrsmdev->opsrsm_max_batch_size =
   2886 		    max(opsrsmdev->opsrsm_max_batch_size, qlen);
   2887 
   2888 		if (opsrsmdev->opsrsm_min_batch_size == 0) {
   2889 			opsrsmdev->opsrsm_min_batch_size = qlen;
   2890 		} else {
   2891 			opsrsmdev->opsrsm_min_batch_size =
   2892 			    min(opsrsmdev->opsrsm_min_batch_size, qlen);
   2893 		}
   2894 
   2895 		rd->rd_start_time = 0;
   2896 		rd->rd_data_collected = 0;
   2897 		rd->rd_writes_completed = 0;
   2898 		rd->rd_nretries = 0;
   2899 		rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_CLOSED;
   2900 		break;
   2901 	case OPSRSM_XMIT_DISCONNECTED:
   2902 		err = RSM_SUCCESS;
   2903 		break;
   2904 	default:
   2905 		cmn_err(CE_PANIC, "invalid state = %d\n", rd->rd_xmit_state);
   2906 		break;
   2907 	}
   2908 
   2909 	if (err != RSM_SUCCESS) {
   2910 		opsrsmdev->opsrsm_collisions++;
   2911 		if (++rd->rd_nretries > opsrsmdev->
   2912 		    opsrsm_param.opsrsm_retry_limit) {
   2913 			rd->rd_nretries = 0;
   2914 			err = ENETDOWN;
   2915 		} else {
   2916 			opsrsm_set_xmit_tmo(rd, opsrsmdev->opsrsm_param.
   2917 			    opsrsm_retry_delay);
   2918 		}
   2919 	} else {
   2920 		err = 0;
   2921 	}
   2922 	return (err);
   2923 }
   2924 
   2925 static void
   2926 opsrsm_dispatch_tmo(void *arg)
   2927 {
   2928 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   2929 
   2930 	/* keep rescheduling itself until opsrsmxmit_thread is dispatched */
   2931 	if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, opsrsmxmit_thread,
   2932 	    rd, KM_NOSLEEP) == 0) {
   2933 		(void) timeout(opsrsm_dispatch_tmo, rd, 1);
   2934 	}
   2935 }
   2936 
   2937 static void
   2938 opsrsm_xmit_tmo(void *arg)
   2939 {
   2940 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   2941 
   2942 	mutex_enter(&rd->rd_tmo_lock);
   2943 	if (rd->rd_xmit_tmo_id == 0) {
   2944 		mutex_exit(&rd->rd_tmo_lock);
   2945 		return;
   2946 	}
   2947 	if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, opsrsmxmit_thread,
   2948 	    rd, KM_NOSLEEP) == 0) {
   2949 		rd->rd_xmit_tmo_id = timeout(opsrsm_xmit_tmo, rd,
   2950 		    rd->rd_xmit_tmo_int);
   2951 	} else {
   2952 		rd->rd_xmit_tmo_id = 0;
   2953 		rd->rd_xmit_tmo_int = 0;
   2954 	}
   2955 	mutex_exit(&rd->rd_tmo_lock);
   2956 }
   2957 
   2958 static void
   2959 opsrsm_set_xmit_tmo(opsrsm_dest_t *rd, int interval)
   2960 {
   2961 	int isdel = 0;
   2962 
   2963 	mutex_enter(&rd->rd_tmo_lock);
   2964 	if (rd->rd_xmit_tmo_id != 0) {
   2965 		goto out;
   2966 	}
   2967 	REFDEST(rd, isdel);
   2968 	if (isdel != 0) goto out;
   2969 	rd->rd_xmit_tmo_int = interval;
   2970 	rd->rd_xmit_tmo_id = timeout(opsrsm_xmit_tmo, rd, rd->rd_xmit_tmo_int);
   2971 out:;
   2972 	mutex_exit(&rd->rd_tmo_lock);
   2973 }
   2974 
   2975 static void
   2976 opsrsm_cancel_xmit_tmo(opsrsm_dest_t *rd)
   2977 {
   2978 	timeout_id_t tmoid;
   2979 
   2980 	mutex_enter(&rd->rd_tmo_lock);
   2981 	if (rd->rd_xmit_tmo_id == 0) {
   2982 		mutex_exit(&rd->rd_tmo_lock);
   2983 		return;
   2984 	}
   2985 	UNREFDEST(rd);
   2986 	tmoid = rd->rd_xmit_tmo_id;
   2987 	rd->rd_xmit_tmo_id = 0;
   2988 	mutex_exit(&rd->rd_tmo_lock);
   2989 	(void) untimeout(tmoid);
   2990 }
   2991 
   2992 static void
   2993 opsrsm_wake_senders(opsrsm_dest_t *rd, short events)
   2994 {
   2995 	pollwakeup(&rd->rd_pollhd, events);
   2996 }
   2997 
   2998 static void
   2999 opsrsm_fqe_tmo(void *arg)
   3000 {
   3001 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   3002 
   3003 	mutex_enter(&rd->rd_tmo_lock);
   3004 	if (rd->rd_fqe_tmo_id == 0) {
   3005 		mutex_exit(&rd->rd_tmo_lock);
   3006 		return;
   3007 	}
   3008 	if (opsrsmavailfqe(rd)) {
   3009 		if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq,
   3010 		    opsrsmxmit_thread, rd, KM_NOSLEEP) != 0) {
   3011 			rd->rd_fqe_tmo_id = 0;
   3012 			rd->rd_fqe_tmo_int = 0;
   3013 			mutex_exit(&rd->rd_tmo_lock);
   3014 			return;
   3015 		}
   3016 	}
   3017 	rd->rd_fqe_tmo_id = timeout(opsrsm_fqe_tmo, rd, rd->rd_fqe_tmo_int);
   3018 	mutex_exit(&rd->rd_tmo_lock);
   3019 }
   3020 
   3021 static void
   3022 opsrsm_set_fqe_tmo(opsrsm_dest_t *rd, int interval)
   3023 {
   3024 	int isdel = 0;
   3025 
   3026 	mutex_enter(&rd->rd_tmo_lock);
   3027 	if (rd->rd_fqe_tmo_id != 0) {
   3028 		goto out;
   3029 	}
   3030 	REFDEST(rd, isdel);
   3031 	if (isdel != 0) goto out;
   3032 	rd->rd_fqe_tmo_int = interval;
   3033 	rd->rd_fqe_tmo_id = timeout(opsrsm_fqe_tmo, rd, rd->rd_fqe_tmo_int);
   3034 out:;
   3035 	mutex_exit(&rd->rd_tmo_lock);
   3036 }
   3037 
   3038 static void
   3039 opsrsm_cancel_fqe_tmo(opsrsm_dest_t *rd)
   3040 {
   3041 	timeout_id_t tmoid;
   3042 
   3043 	mutex_enter(&rd->rd_tmo_lock);
   3044 	if (rd->rd_fqe_tmo_id == 0) {
   3045 		mutex_exit(&rd->rd_tmo_lock);
   3046 		return;
   3047 	}
   3048 	UNREFDEST(rd);
   3049 	tmoid = rd->rd_fqe_tmo_id;
   3050 	rd->rd_fqe_tmo_id = 0;
   3051 	mutex_exit(&rd->rd_tmo_lock);
   3052 	(void) untimeout(tmoid);
   3053 }
   3054 
   3055 static int
   3056 opsrsm_write_data(opsrsm_dest_t *rd, mblk_t *mp)
   3057 {
   3058 	uint_t bufnum;
   3059 	int write_err;
   3060 	uint_t pktlen;
   3061 	uint_t start_offset, end_offset;
   3062 	uchar_t *srcaddr, *endaddr;
   3063 
   3064 	pktlen = (uint_t)MBLKL(mp);
   3065 	if (pktlen > rd->rd_rbuflen)
   3066 		pktlen = rd->rd_rbuflen;
   3067 
   3068 	bufnum = (uint_t)mp->b_prev;
   3069 	srcaddr = mp->b_rptr;
   3070 	start_offset = (uint_t)((uint64_t)srcaddr & OPSRSM_CACHELINE_OFFSET);
   3071 	ASSERT(start_offset == 0);
   3072 	endaddr = srcaddr + pktlen;
   3073 	end_offset = (uint_t)(OPSRSM_CACHELINE_ROUNDUP(endaddr) -
   3074 	    (uint64_t)endaddr);
   3075 
   3076 	ASSERT((pktlen + start_offset + end_offset) <= rd->rd_rbuflen);
   3077 	ASSERT(((rd->rd_rbufoff + (off_t)(bufnum * rd->rd_rbuflen)) &
   3078 	    OPSRSM_CACHELINE_OFFSET) == 0);
   3079 
   3080 	D6("write_data: put 0x%x bytes at segoffset 0x%lx from addr 0x%p",
   3081 	    pktlen + start_offset + end_offset, rd->rd_rbufoff +
   3082 	    (off_t)(bufnum * rd->rd_rbuflen), (void *)(srcaddr - start_offset));
   3083 
   3084 	write_err = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rd_rxferhand,
   3085 	    rd->rd_rbufoff + (off_t)(bufnum * rd->rd_rbuflen),
   3086 	    srcaddr - start_offset,
   3087 	    (size_t)(pktlen + start_offset + end_offset));
   3088 
   3089 	if (write_err != RSM_SUCCESS) DERR("write_err = %d\n", write_err);
   3090 	return (0);
   3091 }
   3092 
   3093 static int
   3094 opsrsm_sync_dqe(opsrsm_dest_t *rd)
   3095 {
   3096 	opsrsm_t *opsrsmp = opsrsmdev;
   3097 	uint64_t start_offset, end_offset;
   3098 	opsrsm_dqe_t *new_shdwdqw_o = NULL;
   3099 	rsm_send_t send_obj;
   3100 	ushort_t new_dqw_seq = 0;
   3101 	rsm_barrier_t dq_barrier;
   3102 	opsrsm_msg_t msg;
   3103 	uint32_t msg_cnt;
   3104 	int stat = RSM_SUCCESS;
   3105 
   3106 	mutex_enter(&rd->rd_net_lock);
   3107 	/* If network down, nothing to do either */
   3108 	if (rd->rd_stopq) {
   3109 		D1("opsrsm_sync_dqe: stopq on, done");
   3110 		mutex_exit(&rd->rd_net_lock);
   3111 		return (0);
   3112 	}
   3113 
   3114 	/* If nothing's queued, nothing to do */
   3115 	if (rd->rd_shdwdqw_i == rd->rd_shdwdqw_o) {
   3116 		D1("opsrsm_sync_dqe: no work, done");
   3117 		if (rd->rd_retry_int) goto retry_int;
   3118 		mutex_exit(&rd->rd_net_lock);
   3119 		return (0);
   3120 	}
   3121 
   3122 	stat = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj,
   3123 	    rd->rd_rxferhand, &dq_barrier);
   3124 
   3125 	if (stat != RSM_SUCCESS) {
   3126 		goto done;
   3127 	}
   3128 	/*
   3129 	 * remember any updates to the DQ; commit changes when
   3130 	 * opsrsm_end_batch succeeds
   3131 	 */
   3132 	new_shdwdqw_o = rd->rd_shdwdqw_o;
   3133 	new_dqw_seq = rd->rd_dqw_seq;
   3134 
   3135 	/*
   3136 	 * If we've wrapped around, so that the next element to go comes from
   3137 	 * a lower address than where we started, do it in two segments.
   3138 	 */
   3139 	if (new_shdwdqw_o > rd->rd_shdwdqw_i) {
   3140 		/*
   3141 		 * handle elements from current (shdwdqw_o) to end of list
   3142 		 * (shdwdqw_l)
   3143 		 */
   3144 		opsrsm_dqe_t *tmpdqe = new_shdwdqw_o;
   3145 		/*
   3146 		 * update entries being sent with current sequence number
   3147 		 */
   3148 		while (tmpdqe <= rd->rd_shdwdqw_l) {
   3149 			tmpdqe->s.dq_seqnum =
   3150 			    new_dqw_seq & OPSRSM_DQE_SEQ_MASK;
   3151 			tmpdqe++;
   3152 		}
   3153 
   3154 		/*
   3155 		 * get DQE offset for these DQ entries
   3156 		 */
   3157 		start_offset = (uint64_t)((char *)new_shdwdqw_o -
   3158 		    (char *)rd->rd_shdwdqw_f);
   3159 		end_offset = (uint64_t)((char *)(rd->rd_shdwdqw_l + 1) -
   3160 		    (char *)rd->rd_shdwdqw_f);
   3161 		D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset,
   3162 		    end_offset);
   3163 
   3164 		/*
   3165 		 * Round down and up to 64-byte boundaries
   3166 		 */
   3167 		start_offset = start_offset & OPSRSM_CACHELINE_MASK;
   3168 		end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset);
   3169 		D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset,
   3170 		    end_offset);
   3171 
   3172 		/*
   3173 		 * Push to remote side
   3174 		 */
   3175 		stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj,
   3176 		    rd->rd_rxferhand, rd->rd_dqw_f_off + (off_t)start_offset,
   3177 		    ((char *)rd->rd_shdwdqw_f) + start_offset,
   3178 		    (size_t)(end_offset - start_offset));
   3179 
   3180 		if (stat != RSM_SUCCESS) {
   3181 			goto done;
   3182 		}
   3183 		/*
   3184 		 * Successfully processed these entries.
   3185 		 * Wrap around to the beginning of DQE list, and
   3186 		 * update the sequence number for the next round.
   3187 		 */
   3188 		new_shdwdqw_o = rd->rd_shdwdqw_f;
   3189 		new_dqw_seq++;
   3190 		if (new_dqw_seq == 0)
   3191 			new_dqw_seq++;
   3192 	}
   3193 
   3194 	/*
   3195 	 * Handle remaining sequential DQEs
   3196 	 */
   3197 	if (new_shdwdqw_o != rd->rd_shdwdqw_i) {
   3198 		opsrsm_dqe_t *tmpdqe = new_shdwdqw_o;
   3199 		while (tmpdqe < rd->rd_shdwdqw_i) {
   3200 			tmpdqe->s.dq_seqnum =
   3201 			    new_dqw_seq & OPSRSM_DQE_SEQ_MASK;
   3202 			tmpdqe++;
   3203 		}
   3204 
   3205 		/*
   3206 		 * get DQE offset for these DQ entries
   3207 		 */
   3208 		start_offset = (uint64_t)((char *)new_shdwdqw_o -
   3209 		    (char *)rd->rd_shdwdqw_f);
   3210 		end_offset = (uint64_t)((char *)rd->rd_shdwdqw_i -
   3211 		    (char *)rd->rd_shdwdqw_f);
   3212 		D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset,
   3213 		    end_offset);
   3214 
   3215 		/*
   3216 		 * Round down and up to 64-byte cacheline boundaries
   3217 		 */
   3218 		start_offset = start_offset & OPSRSM_CACHELINE_MASK;
   3219 		end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset);
   3220 		D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset,
   3221 		    end_offset);
   3222 
   3223 		/*
   3224 		 * Push to remote side
   3225 		 */
   3226 		stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj,
   3227 		    rd->rd_rxferhand, rd->rd_dqw_f_off + (off_t)start_offset,
   3228 		    ((char *)rd->rd_shdwdqw_f) + start_offset,
   3229 		    (size_t)(end_offset - start_offset));
   3230 
   3231 		if (stat != RSM_SUCCESS) {
   3232 			goto done;
   3233 		}
   3234 		new_shdwdqw_o = rd->rd_shdwdqw_i;
   3235 	}
   3236 
   3237 	stat = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &dq_barrier);
   3238 
   3239 done:;
   3240 	if (stat != RSM_SUCCESS) {
   3241 		/* set timer to retry */
   3242 		opsrsmdev->opsrsm_oerrors++;
   3243 		if (rd->rd_sync_dqe_tmo_id == 0 &&
   3244 		    rd->rd_state == OPSRSM_STATE_W_READY) {
   3245 			rd->rd_sync_dqe_tmo_id = timeout(opsrsm_sync_dqe_tmo,
   3246 			    rd, (clock_t)opsrsmp->
   3247 			    opsrsm_param.opsrsm_sync_tmo);
   3248 		}
   3249 		mutex_exit(&rd->rd_net_lock);
   3250 		return (stat);
   3251 	} else {
   3252 		rd->rd_shdwdqw_o = new_shdwdqw_o;
   3253 		rd->rd_dqw_seq = new_dqw_seq;
   3254 	}
   3255 
   3256 retry_int:
   3257 	rd->rd_retry_int = B_FALSE;
   3258 	msg_cnt = rd->rd_pkts_delivered;
   3259 	rd->rd_pkts_delivered = 0;
   3260 
   3261 	msg.p.hdr.reqtype = OPSRSM_MSG_SYNC_DQE;
   3262 	msg.p.hdr.seqno = 0;
   3263 	msg.p.hdr.opsrsm_version = OPSRSM_VERSION;
   3264 	msg.p.m.syncdqe.rcv_segid = rd->rd_rxfersegid;
   3265 	msg.p.m.syncdqe.msg_cnt = msg_cnt;
   3266 
   3267 	send_obj.is_data = &msg;
   3268 	send_obj.is_size = sizeof (opsrsm_msg_t);
   3269 	send_obj.is_flags = RSM_DLPI_SQFLAGS;
   3270 	send_obj.is_wait = 0;
   3271 	mutex_exit(&rd->rd_net_lock);
   3272 	/*
   3273 	 * send interrupt to remote node. need to release rd_net_lock
   3274 	 * first because RSM_SEND can block.
   3275 	 */
   3276 	stat = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq,
   3277 	    &send_obj, NULL);
   3278 
   3279 	mutex_enter(&rd->rd_net_lock);
   3280 	if (stat == RSMERR_CONN_ABORTED) {
   3281 		DERR("RSM_SEND: connection aborted");
   3282 		opsrsmdev->opsrsm_intr_send_errs++;
   3283 		mutex_exit(&rd->rd_net_lock);
   3284 		opsrsm_lostconn(rd);
   3285 		return (stat);
   3286 	} else if (stat != RSM_SUCCESS) {
   3287 		opsrsmdev->opsrsm_collisions++;
   3288 		rd->rd_nretries++;
   3289 		rd->rd_retry_int = B_TRUE;
   3290 		rd->rd_pkts_delivered += msg_cnt;
   3291 		if (rd->rd_sync_dqe_tmo_id == 0) {
   3292 			rd->rd_sync_dqe_tmo_id = timeout(opsrsm_sync_dqe_tmo,
   3293 			    rd, (clock_t)opsrsmp->
   3294 			    opsrsm_param.opsrsm_sync_tmo);
   3295 		}
   3296 		mutex_exit(&rd->rd_net_lock);
   3297 		return (stat);
   3298 	}
   3299 	mutex_exit(&rd->rd_net_lock);
   3300 
   3301 	/* free up only the messages that were delivered */
   3302 	mutex_enter(&rd->rd_freeq_lock);
   3303 	if (!rd->rd_freeq_freeze) {
   3304 		opsrsm_queue_t *q;
   3305 		uint_t cnt = 0;
   3306 
   3307 		q = &rd->rd_freeq;
   3308 		while ((q)->q_head != NULL) {
   3309 			mblk_t *mp;
   3310 
   3311 			cnt++;
   3312 			mp = (q)->q_head;
   3313 			if ((q)->q_head == (q)->q_tail) {
   3314 				ASSERT(mp->b_next == NULL);
   3315 				ASSERT((q)->q_len == 1);
   3316 				(q)->q_tail = NULL;
   3317 			}
   3318 			(q)->q_head = mp->b_next;
   3319 			mp->b_prev = mp->b_next = NULL;
   3320 			mp->b_cont = NULL;
   3321 			freemsg(mp);
   3322 			(q)->q_len--;
   3323 			if (cnt == msg_cnt) {
   3324 				break;
   3325 			}
   3326 		}
   3327 	}
   3328 	mutex_exit(&rd->rd_freeq_lock);
   3329 	return (0);
   3330 }
   3331 
   3332 
   3333 static int
   3334 opsrsm_sync_fqe(opsrsm_dest_t *rd)
   3335 {
   3336 	uint64_t start_offset, end_offset;
   3337 	opsrsm_fqe_t *new_shdwfqw_o = NULL;
   3338 	ushort_t new_fqw_seq = 0;
   3339 	rsm_barrier_t fq_barrier;
   3340 	int stat = RSM_SUCCESS;
   3341 
   3342 	mutex_enter(&rd->rd_fqr_lock);
   3343 
   3344 	ASSERT((rd->rd_fqr_flags & OPSRSM_FQR_LOCKED) == 0);
   3345 	/*
   3346 	 * setting this flag guarantees that no other
   3347 	 * thread can access the shadow queue pointers
   3348 	 * (rd_shdwfqw_*) used by the RSM calls below.
   3349 	 */
   3350 	rd->rd_fqr_flags |= OPSRSM_FQR_LOCKED;
   3351 	new_shdwfqw_o = rd->rd_shdwfqw_o;
   3352 	new_fqw_seq = rd->rd_fqw_seq;
   3353 
   3354 	/* If nothing's queued, nothing to do */
   3355 	if (rd->rd_shdwfqw_i == rd->rd_shdwfqw_o) {
   3356 		boolean_t putfqes;
   3357 
   3358 		D1("opsrsmsyncfqe: no work, done");
   3359 		putfqes = (rd->rd_queued_fqe_list != NULL);
   3360 		if (!putfqes) {
   3361 			rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED;
   3362 		}
   3363 		mutex_exit(&rd->rd_fqr_lock);
   3364 		if (putfqes) goto done;
   3365 		return (0);
   3366 	}
   3367 
   3368 	/* If network down, nothing to do either */
   3369 	if (rd->rd_stopq) {
   3370 		D1("opsrsmsyncfqe: stopq on, done");
   3371 		rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED;
   3372 		mutex_exit(&rd->rd_fqr_lock);
   3373 		return (0);
   3374 	}
   3375 
   3376 	mutex_exit(&rd->rd_fqr_lock);
   3377 
   3378 	stat = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj,
   3379 	    rd->rd_rxferhand, &fq_barrier);
   3380 
   3381 	if (stat != RSM_SUCCESS) {
   3382 		goto done;
   3383 	}
   3384 
   3385 	/*
   3386 	 * If we've wrapped around, so that the next element to go comes from
   3387 	 * a lower address than where we started, do it in two segments.
   3388 	 */
   3389 	if (new_shdwfqw_o > rd->rd_shdwfqw_i) {
   3390 		/*
   3391 		 * Process the elements from the current to the end of
   3392 		 * the list, then adjust pointers to point to start of
   3393 		 * list.
   3394 		 */
   3395 
   3396 		/*
   3397 		 * Set the sequence numbers in these FQEs.
   3398 		 */
   3399 		opsrsm_fqe_t *tmpfqe = new_shdwfqw_o;
   3400 		while (tmpfqe <= rd->rd_shdwfqw_l) {
   3401 			tmpfqe->s.fq_seqnum =
   3402 			    new_fqw_seq & OPSRSM_FQE_SEQ_MASK;
   3403 			tmpfqe++;
   3404 		}
   3405 
   3406 		/*
   3407 		 * Get FQE offsets for FQ range being updated
   3408 		 */
   3409 		start_offset = (uint64_t)((char *)new_shdwfqw_o -
   3410 		    (char *)rd->rd_shdwfqw_f);
   3411 		end_offset = (uint64_t)((char *)(rd->rd_shdwfqw_l + 1) -
   3412 		    (char *)rd->rd_shdwfqw_f);
   3413 
   3414 		D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset,
   3415 		    end_offset);
   3416 
   3417 		/*
   3418 		 * Round down and up to 64-byte boundaries
   3419 		 */
   3420 		start_offset = start_offset & OPSRSM_CACHELINE_MASK;
   3421 		end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset);
   3422 
   3423 		D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset,
   3424 		    end_offset);
   3425 
   3426 		/*
   3427 		 * Push to remote side
   3428 		 */
   3429 
   3430 		stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj,
   3431 		    rd->rd_rxferhand, rd->rd_fqw_f_off + (off_t)start_offset,
   3432 		    ((char *)rd->rd_shdwfqw_f) + start_offset,
   3433 		    (size_t)(end_offset - start_offset));
   3434 
   3435 		if (stat != RSM_SUCCESS) {
   3436 			goto done;
   3437 		}
   3438 		/*
   3439 		 * Successfully processed these entries.
   3440 		 * Wrap around to the beginning of FQE list, and
   3441 		 * update sequence number for the next round
   3442 		 */
   3443 		new_shdwfqw_o = rd->rd_shdwfqw_f;
   3444 		new_fqw_seq++;
   3445 		if (new_fqw_seq == 0)
   3446 			new_fqw_seq++;
   3447 	}
   3448 
   3449 	/*
   3450 	 * Handle remaining sequential FQEs
   3451 	 */
   3452 	if ((stat == RSM_SUCCESS) && (new_shdwfqw_o != rd->rd_shdwfqw_i)) {
   3453 		opsrsm_fqe_t *tmpfqe = new_shdwfqw_o;
   3454 		/*
   3455 		 * Set the sequence numbers in these FQEs.
   3456 		 */
   3457 		while (tmpfqe < rd->rd_shdwfqw_i) {
   3458 			tmpfqe->s.fq_seqnum =
   3459 			    new_fqw_seq & OPSRSM_FQE_SEQ_MASK;
   3460 			tmpfqe++;
   3461 		}
   3462 
   3463 		/*
   3464 		 * Get FQE offsets for FQ range being updated
   3465 		 */
   3466 		start_offset = (uint64_t)((char *)new_shdwfqw_o -
   3467 		    (char *)rd->rd_shdwfqw_f);
   3468 		end_offset = (uint64_t)((char *)rd->rd_shdwfqw_i -
   3469 		    (char *)rd->rd_shdwfqw_f);
   3470 		D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset,
   3471 		    end_offset);
   3472 
   3473 		/*
   3474 		 * Round down and up to 64-byte boundaries
   3475 		 */
   3476 		start_offset = start_offset & OPSRSM_CACHELINE_MASK;
   3477 		end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset);
   3478 		D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset,
   3479 		    end_offset);
   3480 
   3481 		/*
   3482 		 * Push to remote side
   3483 		 */
   3484 		stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj,
   3485 		    rd->rd_rxferhand, rd->rd_fqw_f_off + (off_t)start_offset,
   3486 		    ((char *)rd->rd_shdwfqw_f) + start_offset,
   3487 		    (size_t)(end_offset - start_offset));
   3488 
   3489 		if (stat != RSM_SUCCESS) {
   3490 			goto done;
   3491 		}
   3492 		new_shdwfqw_o = rd->rd_shdwfqw_i;
   3493 	}
   3494 	stat = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &fq_barrier);
   3495 
   3496 done:;
   3497 	mutex_enter(&rd->rd_fqr_lock);
   3498 	ASSERT((rd->rd_fqr_flags & OPSRSM_FQR_LOCKED) != 0);
   3499 	if (stat != RSM_SUCCESS) {
   3500 		opsrsmdev->opsrsm_errs++;
   3501 		if (rd->rd_sync_fqe_tmo_id == 0 &&
   3502 		    rd->rd_state == OPSRSM_STATE_W_READY) {
   3503 			rd->rd_sync_fqe_tmo_id = timeout(opsrsm_sync_fqe_tmo,
   3504 			    rd, (clock_t)opsrsmdev->
   3505 			    opsrsm_param.opsrsm_sync_tmo);
   3506 		}
   3507 	} else {
   3508 		rd->rd_shdwfqw_o = new_shdwfqw_o;
   3509 		rd->rd_fqw_seq = new_fqw_seq;
   3510 	}
   3511 
   3512 	/*
   3513 	 * an interrupt thread might have enqueued fqe entries
   3514 	 * while we were in the RSM calls. we now need to take
   3515 	 * these entries and update the actual shadow fq. we also
   3516 	 * need to schedule a sync_fqe event after updating our
   3517 	 * shadow fq.
   3518 	 */
   3519 	if (rd->rd_queued_fqe_list != NULL &&
   3520 	    rd->rd_state == OPSRSM_STATE_W_READY) {
   3521 		opsrsm_queued_fqe_t *q, *qfqe = rd->rd_queued_fqe_list;
   3522 
   3523 		for (;;) {
   3524 			q = qfqe;
   3525 			qfqe = qfqe->qf_next;
   3526 
   3527 			q->qf_next = NULL;
   3528 			ASSERT(q->qf_bufnum != -1);
   3529 			opsrsmputfqe_nolock(rd, q->qf_bufnum);
   3530 			opsrsm_queued_fqe_free(rd, q);
   3531 			if (qfqe == NULL) break;
   3532 		}
   3533 		rd->rd_queued_fqe_list = NULL;
   3534 		rd->rd_queued_fqe_tail = NULL;
   3535 		opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
   3536 	}
   3537 	rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED;
   3538 	mutex_exit(&rd->rd_fqr_lock);
   3539 	return (0);
   3540 }
   3541 
   3542 static void
   3543 opsrsm_queued_msg_append(opsrsm_dest_t *rd, opsrsm_queued_msg_t *qmsg)
   3544 {
   3545 	mutex_enter(&rd->rd_msgs_lock);
   3546 	if (rd->rd_msgs == NULL) {
   3547 		rd->rd_msgs = qmsg;
   3548 		rd->rd_msgs_tail = qmsg;
   3549 	} else {
   3550 		rd->rd_msgs_tail->qm_next = qmsg;
   3551 		rd->rd_msgs_tail = qmsg;
   3552 	}
   3553 	mutex_exit(&rd->rd_msgs_lock);
   3554 }
   3555 
   3556 static void
   3557 opsrsm_queued_msg_flush(opsrsm_dest_t *rd)
   3558 {
   3559 	opsrsm_queued_msg_t *qmsg;
   3560 	int cnt = 0;
   3561 
   3562 	mutex_enter(&rd->rd_msgs_lock);
   3563 	while (rd->rd_msgs != NULL) {
   3564 		cnt++;
   3565 		qmsg = rd->rd_msgs;
   3566 		rd->rd_msgs = qmsg->qm_next;
   3567 		kmem_free(qmsg, sizeof (opsrsm_queued_msg_t));
   3568 	}
   3569 	mutex_exit(&rd->rd_msgs_lock);
   3570 	if (cnt > 0) {
   3571 		DINFO("0x%x flushed %d queued msgs\n",
   3572 		    rd->rd_local_skey, cnt);
   3573 	}
   3574 }
   3575 
   3576 /*
   3577  * This function is called by the event thread to send
   3578  * queued interrupt messages to a peer node. some interrupt
   3579  * messages need to be delivered in this manner because RSMPI
   3580  * prohibits the sending of interrupt messages inside an
   3581  * RSM interrupt handler or callout.
   3582  */
   3583 static void
   3584 opsrsm_queued_msg_send(opsrsm_dest_t *rd)
   3585 {
   3586 	opsrsm_queued_msg_t *qmsg;
   3587 	rsm_send_t send_obj;
   3588 	boolean_t more_msgs;
   3589 	int status;
   3590 
   3591 	mutex_enter(&rd->rd_msgs_lock);
   3592 	qmsg = rd->rd_msgs;
   3593 	if (qmsg != NULL) {
   3594 		rd->rd_msgs = qmsg->qm_next;
   3595 		if (rd->rd_msgs == NULL) {
   3596 			rd->rd_msgs_tail = NULL;
   3597 		}
   3598 	} else {
   3599 		mutex_exit(&rd->rd_msgs_lock);
   3600 		return;
   3601 	}
   3602 	mutex_exit(&rd->rd_msgs_lock);
   3603 
   3604 	send_obj.is_data = &qmsg->qm_msg;
   3605 	send_obj.is_size = sizeof (opsrsm_msg_t);
   3606 	send_obj.is_flags = RSM_DLPI_SQFLAGS;
   3607 	send_obj.is_wait = 0;
   3608 	status = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq,
   3609 	    &send_obj, NULL);
   3610 
   3611 	mutex_enter(&rd->rd_msgs_lock);
   3612 	if (status != RSM_SUCCESS) {
   3613 		if (++qmsg->qm_retries > opsrsm_queued_msg_max_retries) {
   3614 			DINFO("0x%x cannot send msg %d, err = %d, "
   3615 			    "retrying...", rd->rd_local_skey,
   3616 			    qmsg->qm_msg.p.hdr.reqtype, status);
   3617 			qmsg->qm_retries = 0;
   3618 		}
   3619 		qmsg->qm_next = rd->rd_msgs;
   3620 		rd->rd_msgs = qmsg;
   3621 		if (rd->rd_msgs_tail == NULL) {
   3622 			rd->rd_msgs_tail = qmsg;
   3623 		}
   3624 	} else {
   3625 		kmem_free(qmsg, sizeof (opsrsm_queued_msg_t));
   3626 	}
   3627 	more_msgs = (rd->rd_msgs != NULL);
   3628 	mutex_exit(&rd->rd_msgs_lock);
   3629 
   3630 	if (more_msgs) {
   3631 		if (status != RSM_SUCCESS) {
   3632 			delay(1);
   3633 		}
   3634 		opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG);
   3635 	}
   3636 }
   3637 
   3638 static void
   3639 opsrsm_event_add(opsrsm_dest_t *rd, uint32_t evt_type)
   3640 {
   3641 	mutex_enter(&rd->rd_evt_lock);
   3642 	rd->rd_evt_flags |= evt_type;
   3643 	cv_signal(&rd->rd_evt_cv);
   3644 	mutex_exit(&rd->rd_evt_lock);
   3645 }
   3646 
   3647 /*
   3648  * This thread is used for processing events that cannot
   3649  * be done in interrupt context.
   3650  */
   3651 static void
   3652 opsrsm_event_thread(void *arg)
   3653 {
   3654 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   3655 	boolean_t sync_dqe, sync_fqe, send_msg;
   3656 	int events;
   3657 
   3658 	mutex_enter(&rd->rd_evt_lock);
   3659 again:;
   3660 	events = 0;
   3661 	sync_dqe = ((rd->rd_evt_flags & OPSRSM_EVT_SYNC_DQE) != 0);
   3662 	if (sync_dqe) {
   3663 		events++;
   3664 		rd->rd_evt_flags &= ~OPSRSM_EVT_SYNC_DQE;
   3665 	}
   3666 	sync_fqe = ((rd->rd_evt_flags & OPSRSM_EVT_SYNC_FQE) != 0);
   3667 	if (sync_fqe) {
   3668 		events++;
   3669 		rd->rd_evt_flags &= ~OPSRSM_EVT_SYNC_FQE;
   3670 	}
   3671 	send_msg = ((rd->rd_evt_flags & OPSRSM_EVT_SEND_MSG) != 0);
   3672 	if (send_msg) {
   3673 		events++;
   3674 		rd->rd_evt_flags &= ~OPSRSM_EVT_SEND_MSG;
   3675 	}
   3676 
   3677 	if ((rd->rd_evt_flags & OPSRSM_EVT_STOP) != 0) {
   3678 		rd->rd_evt_flags |= OPSRSM_EVT_DONE;
   3679 		cv_signal(&rd->rd_evt_wait_cv);
   3680 		mutex_exit(&rd->rd_evt_lock);
   3681 		DINFO("0x%x event thread exiting\n", rd->rd_local_skey);
   3682 		return;
   3683 	}
   3684 	if (events == 0 || rd->rd_evt_flags == 0) {
   3685 		cv_wait(&rd->rd_evt_cv, &rd->rd_evt_lock);
   3686 		goto again;
   3687 	}
   3688 	mutex_exit(&rd->rd_evt_lock);
   3689 
   3690 	if (sync_dqe) {
   3691 		(void) opsrsm_sync_dqe(rd);
   3692 	}
   3693 	if (sync_fqe) {
   3694 		(void) opsrsm_sync_fqe(rd);
   3695 	}
   3696 	if (send_msg) {
   3697 		opsrsm_queued_msg_send(rd);
   3698 	}
   3699 
   3700 	mutex_enter(&rd->rd_evt_lock);
   3701 	goto again;
   3702 }
   3703 
   3704 static void
   3705 opsrsm_sync_dqe_tmo(void *arg)
   3706 {
   3707 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   3708 	if (rd->rd_sync_dqe_tmo_id == 0) {
   3709 		return;
   3710 	}
   3711 	rd->rd_sync_dqe_tmo_id = 0;
   3712 	opsrsm_event_add(rd, OPSRSM_EVT_SYNC_DQE);
   3713 }
   3714 
   3715 static void
   3716 opsrsm_sync_fqe_tmo(void *arg)
   3717 {
   3718 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   3719 	if (rd->rd_sync_fqe_tmo_id == 0) {
   3720 		return;
   3721 	}
   3722 	rd->rd_sync_fqe_tmo_id = 0;
   3723 	opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
   3724 }
   3725 
   3726 static void
   3727 opsrsm_status_check_tmo(void *arg)
   3728 {
   3729 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   3730 
   3731 	if (rd->rd_status_tmo_id == 0 ||
   3732 	    rd->rd_state != OPSRSM_STATE_W_READY) {
   3733 		return;
   3734 	}
   3735 	if (OPSRSM_Q_LEN(&rd->rd_sendq) > 0 &&
   3736 	    ((uint32_t)ddi_get_lbolt() - rd->rd_last_sent) > 1000) {
   3737 		DINFO("RDT packets queued but not sent for %d "
   3738 		    "seconds: active_threads = %d, availfqe = %d, "
   3739 		    "qlen = %d, rd->rd_dstate = %d\n",
   3740 		    ((uint32_t)ddi_get_lbolt() - rd->rd_last_sent)/100,
   3741 		    rd->rd_active_threads, opsrsmavailfqe2(rd),
   3742 		    OPSRSM_Q_LEN(&rd->rd_sendq), rd->rd_dstate);
   3743 
   3744 		if (((uint32_t)ddi_get_lbolt() - rd->rd_last_sent) > 9000) {
   3745 			opsrsm_lostconn(rd);
   3746 		}
   3747 		opsrsm_set_fqe_tmo(rd, (clock_t)opsrsmdev->
   3748 		    opsrsm_param.opsrsm_xmit_delay);
   3749 	}
   3750 	rd->rd_status_tmo_id = timeout(opsrsm_status_check_tmo, rd, 1000);
   3751 }
   3752 
   3753 static int
   3754 opsrsmrexmit(opsrsm_dest_t *rd)
   3755 {
   3756 	int error = 0;
   3757 	mblk_t *mp;
   3758 
   3759 	switch (rd->rd_xmit_state) {
   3760 	case OPSRSM_XMIT_RETRY_DATA:
   3761 		error = opsrsm_start_batch(rd, 0);
   3762 		mp = OPSRSM_Q_HEAD(&rd->rd_pendq);
   3763 		while (mp) {
   3764 			error = opsrsm_write_data(rd, mp);
   3765 			mp = OPSRSM_Q_NEXT(&rd->rd_pendq, mp);
   3766 		}
   3767 		ASSERT(mp == NULL);
   3768 		error = opsrsm_end_batch(rd);
   3769 		break;
   3770 	default:
   3771 		cmn_err(CE_PANIC, "opsrsmrexmit: invalid state = %d\n",
   3772 		    rd->rd_xmit_state);
   3773 		break;
   3774 	}
   3775 	return (error);
   3776 }
   3777 
   3778 static void
   3779 opsrsmxmit_thread(void *arg)
   3780 {
   3781 	(void) opsrsmxmit((opsrsm_dest_t *)arg, NULL);
   3782 }
   3783 
/*
 * Queue mp (if non-NULL) on rd's send queue, then try to push queued
 * packets to the remote side.  opsrsmxmit_thread() calls this with
 * mp == NULL to drain the queue only.
 *
 * mp is always consumed: it is either appended to rd_sendq or freed when
 * the queue already holds opsrsm_max_queued_pkts packets.  Every return
 * path performs an UNREFDEST(rd).
 *
 * Lock usage: rd_sendq_lock covers rd_sendq and rd_active_threads;
 * rd_xmit_lock is held around the transmit-state checks and pending-queue
 * updates, and is dropped around opsrsm_write_data() and
 * opsrsm_sync_dqe().
 *
 * Returns 0, EWOULDBLOCK when mp was discarded because the send queue was
 * full, or ENETDOWN (transmit state is OPSRSM_XMIT_DISCONNECTED, or the
 * last value returned by the batch/write helpers was ENETDOWN).
 */
static int
opsrsmxmit(opsrsm_dest_t *rd, mblk_t *mp)
{
	opsrsm_t *opsrsmp = opsrsmdev;
	boolean_t discarded = B_FALSE;
	boolean_t do_sync = B_FALSE;
	boolean_t nofqe, nopendw;
	int err = 0, pkts_sent, sendq_len;

	/*
	 * Phase 1: account for this thread and enqueue the new packet.
	 */
	mutex_enter(&rd->rd_sendq_lock);
	rd->rd_active_threads++;
	if (mp != NULL) {
		if (OPSRSM_Q_LEN(&rd->rd_sendq) >=
		    opsrsmp->opsrsm_param.opsrsm_max_queued_pkts) {
			/* send queue is full: drop the packet */
			opsrsmp->opsrsm_pkts_discarded++;
			freemsg(mp);
			mp = NULL;
			discarded = B_TRUE;
			err = EWOULDBLOCK;
		} else {
			uint32_t seqno = OPSRSM_MESSAGE_HDRPTR(mp)->seqno;
			/*
			 * A zero seqno means the packet has not been stamped
			 * yet; assign the next local sequence number and our
			 * session key.  Zero is skipped on wrap-around.
			 */
			if (seqno == 0) {
				OPSRSM_MESSAGE_HDRPTR(mp)->seqno =
				    rd->rd_next_lseqno;
				OPSRSM_MESSAGE_HDRPTR(mp)->skey =
				    rd->rd_local_skey;

				rd->rd_next_lseqno++;
				if (rd->rd_next_lseqno == 0)
					rd->rd_next_lseqno++;
			}
			opsrsmp->opsrsm_pkts_queued++;
			OPSRSM_Q_APPEND(&rd->rd_sendq, mp);
			mp->b_prev = (mblk_t *)0;
		}
		/*
		 * The packet is queued and more threads than online CPUs are
		 * already active on this destination: leave the transmit
		 * work to them.
		 */
		if (!discarded && rd->rd_active_threads > ncpus_online) {
			rd->rd_active_threads--;
			opsrsmp->opsrsm_last_sendq_len =
			    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
			mutex_exit(&rd->rd_sendq_lock);
			UNREFDEST(rd);
			return (err);
		}
	}
	mutex_exit(&rd->rd_sendq_lock);

	/*
	 * Phase 2: check the transmit state.  Only states between
	 * OPSRSM_XMIT_BARRIER_CLOSED and OPSRSM_XMIT_BARRIER_OPENED
	 * (inclusive) proceed directly to the send loop.
	 */
	mutex_enter(&rd->rd_xmit_lock);
	if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED) {
		if (rd->rd_xmit_state == OPSRSM_XMIT_DISCONNECTED &&
		    !discarded) {
			err = ENETDOWN;
		}
		mutex_enter(&rd->rd_sendq_lock);
		rd->rd_active_threads--;
		opsrsmp->opsrsm_last_sendq_len =
			(uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
		mutex_exit(&rd->rd_sendq_lock);
		mutex_exit(&rd->rd_xmit_lock);
		UNREFDEST(rd);
		return (err);
	}
	if (rd->rd_xmit_state > OPSRSM_XMIT_BARRIER_OPENED) {
		boolean_t dont_rexmit = B_FALSE;

		/*
		 * Retransmission is only attempted by callers that did not
		 * bring (and successfully queue) a packet of their own.
		 */
		dont_rexmit = (mp != NULL);
		if (dont_rexmit || opsrsmrexmit(rd) != 0) {
			if (!discarded) err = 0;
			mutex_enter(&rd->rd_sendq_lock);
			rd->rd_active_threads--;
			opsrsmp->opsrsm_last_sendq_len =
			    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
			mutex_exit(&rd->rd_sendq_lock);
			mutex_exit(&rd->rd_xmit_lock);
			UNREFDEST(rd);
			return (err);
		} else {
			/* opsrsm_sync_dqe() is called without rd_xmit_lock */
			mutex_exit(&rd->rd_xmit_lock);
			(void) opsrsm_sync_dqe(rd);
			mutex_enter(&rd->rd_xmit_lock);
		}
	} else if (discarded) {
		/* nothing was queued by this caller; report EWOULDBLOCK */
		mutex_enter(&rd->rd_sendq_lock);
		rd->rd_active_threads--;
		opsrsmp->opsrsm_last_sendq_len =
			(uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
		mutex_exit(&rd->rd_sendq_lock);
		mutex_exit(&rd->rd_xmit_lock);
		UNREFDEST(rd);
		return (err);
	}

	/*
	 * Phase 3: send loop.  rd_xmit_lock is held at the top of each
	 * iteration.  Every exit from the loop decrements
	 * rd_active_threads exactly once.
	 */
	nofqe = B_FALSE;
	nopendw = B_TRUE;
	pkts_sent = 0;
	opsrsm_cancel_xmit_tmo(rd);
	for (;;) {
		mblk_t *nmp;
		int bufnum, qlen, pktlen;

		/* stop as soon as the state leaves the sendable range */
		if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED ||
		    rd->rd_xmit_state > OPSRSM_XMIT_BARRIER_OPENED) {
			mutex_enter(&rd->rd_sendq_lock);
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		mutex_enter(&rd->rd_sendq_lock);
		qlen = OPSRSM_Q_LEN(&rd->rd_sendq);
		if (qlen == 0) {
			/* send queue drained */
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		if (opsrsmgetfqe(rd, &bufnum) == 0) {
			/*
			 * No free queue entry available: arm the FQE
			 * timeout and stop for now.
			 */
			opsrsm_set_fqe_tmo(rd, (clock_t)opsrsmp->
			    opsrsm_param.opsrsm_xmit_delay);
			nofqe = B_TRUE;
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		OPSRSM_Q_REMOVE(&rd->rd_sendq, nmp);
		mutex_exit(&rd->rd_sendq_lock);

		/* first packet after the barrier closed starts a new batch */
		if (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_CLOSED) {
			ASSERT(OPSRSM_Q_LEN(&rd->rd_pendq) == 0 &&
			    rd->rd_start_time == 0 &&
			    rd->rd_data_collected == 0 &&
			    rd->rd_writes_completed == 0);
			err = opsrsm_start_batch(rd, (uint32_t)nmp->b_prev);
		}
		/*
		 * Move the packet to the pending queue; b_prev is reused to
		 * carry the buffer number obtained from opsrsmgetfqe().
		 */
		opsrsmp->opsrsm_pkts_pending++;
		OPSRSM_Q_APPEND(&rd->rd_pendq, nmp);
		nmp->b_prev = (mblk_t *)bufnum;
		pktlen = MBLKL(nmp);
		mutex_exit(&rd->rd_xmit_lock);

		/* the data write itself runs without rd_xmit_lock */
		err = opsrsm_write_data(rd, nmp);
		pkts_sent++;

		mutex_enter(&rd->rd_xmit_lock);
		ASSERT(rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_OPENED ||
		    rd->rd_xmit_state == OPSRSM_XMIT_DISCONNECTED);
		rd->rd_writes_completed++;
		rd->rd_data_collected += (uint32_t)pktlen;
		OPSRSM_ADAPT_THRESHOLD(rd, (uint32_t)pktlen);
		if (opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 0) {
			rd->rd_last_sent = (uint32_t)ddi_get_lbolt();
		}
		/*
		 * Close the batch once no writes are pending and the data
		 * threshold has been reached.
		 */
		nopendw = OPSRSM_NO_PENDING_WRITES(rd);
		if (nopendw && OPSRSM_REACHED_DATA_THRESHOLD(rd)) {
			err = opsrsm_end_batch(rd);
			if (err != 0) {
				mutex_enter(&rd->rd_sendq_lock);
				rd->rd_active_threads--;
				mutex_exit(&rd->rd_sendq_lock);
				break;
			} else {
				mutex_exit(&rd->rd_xmit_lock);
				(void) opsrsm_sync_dqe(rd);
				mutex_enter(&rd->rd_xmit_lock);
			}
		} else if (!nopendw) {
			opsrsmp->opsrsm_pending_writes++;
			nopendw = B_TRUE;
		}
	}
	/*
	 * Phase 4: post-loop bookkeeping, still under rd_xmit_lock.
	 */
	if (nofqe) opsrsmp->opsrsm_no_fqes++;
	if (pkts_sent == 0 && mp == NULL) {
		/*
		 * Nothing was sent and this caller has no packet queued
		 * (drain-only call or discarded packet): reset the adaptive
		 * state.
		 */
		rd->rd_adaptive_threshold = 0;
		rd->rd_pkt_freq = 0;
		opsrsmp->opsrsm_starts++;
	}

	/*
	 * With the barrier open and no writes pending, either end the batch
	 * now (threshold reached, nothing sent by a packet-less caller, or
	 * out of FQEs with every buffer pending) or arm the transmit
	 * timeout so that it gets ended later.
	 */
	nopendw = OPSRSM_NO_PENDING_WRITES(rd);
	if (nopendw && (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_OPENED)) {
		if (OPSRSM_REACHED_DATA_THRESHOLD(rd) ||
		    (pkts_sent == 0 && mp == NULL) ||
		    (nofqe && OPSRSM_Q_LEN(&rd->rd_pendq) == opsrsmp->
		    opsrsm_param.opsrsm_buffers)) {
			err = opsrsm_end_batch(rd);
			if (err == 0) {
				do_sync = B_TRUE;
			}
		} else {
			opsrsm_set_xmit_tmo(rd,
				(int)opsrsmp->opsrsm_param.opsrsm_xmit_delay);
		}
	} else if (!nopendw) {
		opsrsmp->opsrsm_pending_writes++;
		nopendw = B_TRUE;
	}
	opsrsmp->opsrsm_last_pendq_len =
		(uint32_t)OPSRSM_Q_LEN(&rd->rd_pendq);
	mutex_exit(&rd->rd_xmit_lock);

	/* the DQE sync is done after rd_xmit_lock has been dropped */
	if (do_sync) {
		(void) opsrsm_sync_dqe(rd);
	}

	/* NOTE(review): rd_sendq length is sampled here without rd_sendq_lock */
	sendq_len = OPSRSM_Q_LEN(&rd->rd_sendq);
	opsrsmp->opsrsm_last_sendq_len = (uint32_t)sendq_len;
	if (sendq_len <= 1024) {
		opsrsm_wake_senders(rd, POLLOUT);
	}
	UNREFDEST(rd);

	/* only ENETDOWN and EWOULDBLOCK are reported to the caller */
	if (err != ENETDOWN) err = 0;
	if (discarded) err = EWOULDBLOCK;
	return (err);
}
   3995 
   3996 /*
   3997  * Callback routine, called when an desballoc'ed buffer is eventually freed.
   3998  */
   3999 static void
   4000 opsrsmfreebuf(
   4001 	opsrsmbuf_t *rbp)	/* Structure describing freed buffer */
   4002 {
   4003 	opsrsm_dest_t *rd = rbp->rb_rd;
   4004 	int delflg, zerflg;
   4005 
   4006 	D1("opsrsmfreebuf: rbp 0x%p", (void *)rbp);
   4007 
   4008 	/*
   4009 	 * Find out if this is the last outstanding buffer, and whether we're
   4010 	 * being deleted.
   4011 	 */
   4012 	mutex_enter(&rd->rd_nlb_lock);
   4013 
   4014 	rd->rd_nlb--;
   4015 	delflg = rd->rd_nlb_del;
   4016 	zerflg = (rd->rd_nlb == 0);
   4017 
   4018 	mutex_exit(&rd->rd_nlb_lock);
   4019 
   4020 	/*
   4021 	 * If we're being deleted, we don't put this buffer on the free queue.
   4022 	 * Also, if we're being deleted, and this was the last outstanding
   4023 	 * buffer, we do an UNREF.  Otherwise we send this buffer to the other
   4024 	 * system for reuse.
   4025 	 */
   4026 	if (delflg) {
   4027 		if (zerflg)
   4028 			UNREFDEST(rd);
   4029 	} else {
   4030 		opsrsmputfqe(rd, rbp->rb_bufnum);
   4031 		opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
   4032 	}
   4033 
   4034 	D1("opsrsmfreebuf: done");
   4035 }
   4036 
   4037 static void
   4038 opsrsmdemux(mblk_t *mp, opsrsm_dest_t *rd)
   4039 {
   4040 	opsrsmresource_t *rp;
   4041 	uint32_t rportnum = OPSRSM_MESSAGE_HDRPTR(mp)->rportnum;
   4042 
   4043 	/* control messages do not belong here */
   4044 	if (OPSRSM_MESSAGE_HDRPTR(mp)->option != 0) {
   4045 		freemsg(mp);
   4046 		return;
   4047 	}
   4048 	if (rd != NULL) {
   4049 		uint32_t seq_key, seq_no;
   4050 
   4051 		seq_key = OPSRSM_MESSAGE_HDRPTR(mp)->skey;
   4052 		seq_no = OPSRSM_MESSAGE_HDRPTR(mp)->seqno;
   4053 		ASSERT(seq_no != 0);
   4054 		if (seq_key == rd->rd_remote_skey) {
   4055 			if (seq_no == rd->rd_next_rseqno) {
   4056 				rd->rd_next_rseqno++;
   4057 				if (rd->rd_next_rseqno == 0)
   4058 					rd->rd_next_rseqno++;
   4059 			} else {
   4060 				cmn_err(CE_PANIC, "opsrsmdemux: seqno = %d "
   4061 				    "expected seqno = %d\n", seq_no,
   4062 				    rd->rd_next_rseqno);
   4063 				freemsg(mp);
   4064 				return;
   4065 			}
   4066 		} else {
   4067 			opsrsm_failover_info_t *finfo =
   4068 			    opsrsm_finfo_lookup_by_remote_skey(seq_key);
   4069 
   4070 			if (finfo == NULL) {
   4071 				cmn_err(CE_CONT, "opsrsmdemux: cannot find "
   4072 				    "skey = 0x%x\n", seq_key);
   4073 				freemsg(mp);
   4074 				return;
   4075 			} else {
   4076 				if (seq_no != finfo->fi_next_rseqno) {
   4077 					/*
   4078 					 * We get here if a rexmitted packet is
   4079 					 * a duplicate or if it is out of
   4080 					 * sequence.
   4081 					 */
   4082 					DERR("received duplicate packet, "
   4083 					    "seqno = %d, expected = %d\n",
   4084 					    seq_no, finfo->fi_next_rseqno);
   4085 					freemsg(mp);
   4086 					return;
   4087 				} else {
   4088 					finfo->fi_next_rseqno++;
   4089 					if (finfo->fi_next_rseqno == 0)
   4090 						finfo->fi_next_rseqno++;
   4091 				}
   4092 			}
   4093 		}
   4094 	}
   4095 
   4096 	rp = opsrsmresource_lookup(rportnum, OPSRSM_RO_INCREFCNT);
   4097 	if (rp == NULL) {
   4098 		opsrsm_queued_msg_t *qmsg;
   4099 
   4100 		DERR("opsrsmdemux: port %d has no receiver, dropping message",
   4101 		    rportnum);
   4102 
   4103 		/*
   4104 		 * Do not send error notifications if a segment is about to
   4105 		 * be torn down.
   4106 		 */
   4107 		if (rd != NULL && rd->rd_state != OPSRSM_STATE_W_READY) {
   4108 			freemsg(mp);
   4109 			return;
   4110 		}
   4111 		qmsg = (opsrsm_queued_msg_t *)kmem_zalloc(sizeof (*qmsg),
   4112 		    KM_NOSLEEP);
   4113 		if (qmsg == NULL) {
   4114 			DERR("opsrsmdemux: cannot allocate qmsg");
   4115 			freemsg(mp);
   4116 			return;
   4117 		}
   4118 		/*
   4119 		 * Tell the sender that receiver doesn't exist.
   4120 		 */
   4121 		qmsg->qm_msg.p.hdr.reqtype = RSMRDT_MSG_SEND_ERR;
   4122 		qmsg->qm_msg.p.hdr.seqno = 0;
   4123 		qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION;
   4124 		qmsg->qm_msg.p.m.senderr.sender_portnum =
   4125 		    OPSRSM_MESSAGE_HDRPTR(mp)->lportnum;
   4126 		qmsg->qm_msg.p.m.senderr.sender_pkey =
   4127 		    OPSRSM_MESSAGE_HDRPTR(mp)->pkey;
   4128 		qmsg->qm_msg.p.m.senderr.errstate = OPSRSM_RS_NORECVR;
   4129 		qmsg->qm_retries = 0;
   4130 
   4131 		opsrsm_queued_msg_append(rd, qmsg);
   4132 		opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG);
   4133 		freemsg(mp);
   4134 		return;
   4135 	}
   4136 
   4137 	if (rp->rs_pkey != OPSRSM_MESSAGE_HDRPTR(mp)->pkey) {
   4138 		opsrsm_queued_msg_t *qmsg;
   4139 
   4140 		DERR("opsrsmdemux: Invalid pkey: sender %d local %d, "
   4141 		    "dropping message", OPSRSM_MESSAGE_HDRPTR(mp)->pkey,
   4142 		    rp->rs_pkey);
   4143 
   4144 		/*
   4145 		 * Do not send error notifications if a segment is about to
   4146 		 * be torn down.
   4147 		 */
   4148 		if (rd != NULL && rd->rd_state != OPSRSM_STATE_W_READY) {
   4149 			freemsg(mp);
   4150 			OPSRSM_RSUNREF(rp);
   4151 			return;
   4152 		}
   4153 
   4154 		qmsg = (opsrsm_queued_msg_t *)kmem_zalloc(sizeof (*qmsg),
   4155 		    KM_NOSLEEP);
   4156 		if (qmsg == NULL) {
   4157 			DERR("opsrsmdemux: cannot allocate qmsg");
   4158 			freemsg(mp);
   4159 			OPSRSM_RSUNREF(rp);
   4160 			return;
   4161 		}
   4162 		/*
   4163 		 * Tell the sender about pkey mismatch.
   4164 		 */
   4165 		qmsg->qm_msg.p.hdr.reqtype = RSMRDT_MSG_SEND_ERR;
   4166 		qmsg->qm_msg.p.hdr.seqno = 0;
   4167 		qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION;
   4168 		qmsg->qm_msg.p.m.senderr.sender_portnum =
   4169 		    OPSRSM_MESSAGE_HDRPTR(mp)->lportnum;
   4170 		qmsg->qm_msg.p.m.senderr.sender_pkey =
   4171 		    OPSRSM_MESSAGE_HDRPTR(mp)->pkey;
   4172 		qmsg->qm_msg.p.m.senderr.errstate = OPSRSM_RS_PKEYMISMATCH;
   4173 		qmsg->qm_retries = 0;
   4174 
   4175 		opsrsm_queued_msg_append(rd, qmsg);
   4176 		opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG);
   4177 		freemsg(mp);
   4178 		OPSRSM_RSUNREF(rp);
   4179 		return;
   4180 	}
   4181 
   4182 	if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) {
   4183 		DERR("opsrsmdemux: port %d not bound (rp->rs_state = %d), "
   4184 		    "dropping message", rportnum, rp->rs_state);
   4185 		freemsg(mp);
   4186 		OPSRSM_RSUNREF(rp);
   4187 		return;
   4188 	}
   4189 
   4190 	mutex_enter(&rp->rs_lock);
   4191 	OPSRSM_Q_APPEND(&rp->rs_recvq, mp);
   4192 	atomic_add_32(&opsrsm_pending_bytes, MBLKL(mp));
   4193 
   4194 	if ((rp->rs_state & OPSRSM_RS_SIG) != 0) {
   4195 		cv_signal(&rp->rs_cv);
   4196 	}
   4197 	if ((rp->rs_events & POLLIN) == 0) {
   4198 		rp->rs_events |= POLLIN;
   4199 		mutex_exit(&rp->rs_lock);
   4200 		pollwakeup(&rp->rs_pollhd, POLLIN);
   4201 	} else {
   4202 		mutex_exit(&rp->rs_lock);
   4203 	}
   4204 
   4205 	OPSRSM_RSUNREF(rp);
   4206 }
   4207 
   4208 static int
   4209 opsrsmdemux_loopback(mblk_t *mp)
   4210 {
   4211 	opsrsmresource_t *rp;
   4212 	uint32_t rportnum = OPSRSM_MESSAGE_HDRPTR(mp)->rportnum;
   4213 
   4214 	rp = opsrsmresource_lookup(rportnum, OPSRSM_RO_INCREFCNT);
   4215 	if (rp == NULL) {
   4216 		freemsg(mp);
   4217 		return (ESRCH);
   4218 	}
   4219 
   4220 	if (rp->rs_pkey != OPSRSM_MESSAGE_HDRPTR(mp)->pkey) {
   4221 		freemsg(mp);
   4222 		OPSRSM_RSUNREF(rp);
   4223 		return (EACCES);
   4224 	}
   4225 
   4226 	if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) {
   4227 		DERR("opsrsmdemux: port %d not bound (rp->rs_state = %d), "
   4228 		    "dropping message", rportnum, rp->rs_state);
   4229 		freemsg(mp);
   4230 		OPSRSM_RSUNREF(rp);
   4231 		return (EADDRNOTAVAIL);
   4232 	}
   4233 
   4234 	mutex_enter(&rp->rs_lock);
   4235 	if ((uint_t)OPSRSM_Q_LEN(&rp->rs_recvq) >=
   4236 	    opsrsmdev->opsrsm_param.opsrsm_max_loopback_pkts) {
   4237 		mutex_exit(&rp->rs_lock);
   4238 		freemsg(mp);
   4239 		OPSRSM_RSUNREF(rp);
   4240 		return (EWOULDBLOCK);
   4241 	}
   4242 	OPSRSM_Q_APPEND(&rp->rs_recvq, mp);
   4243 	atomic_add_32(&opsrsm_pending_bytes, MBLKL(mp));
   4244 
   4245 	if ((rp->rs_state & OPSRSM_RS_SIG) != 0) {
   4246 		cv_signal(&rp->rs_cv);
   4247 	}
   4248 	if ((rp->rs_events & POLLIN) == 0) {
   4249 		rp->rs_events |= POLLIN;
   4250 		mutex_exit(&rp->rs_lock);
   4251 		pollwakeup(&rp->rs_pollhd, POLLIN);
   4252 	} else {
   4253 		mutex_exit(&rp->rs_lock);
   4254 	}
   4255 	OPSRSM_RSUNREF(rp);
   4256 	return (0);
   4257 }
   4258 
   4259 
   4260 /*
   4261  * opsrsmread() takes the packet described by the arguments and sends it
   4262  * upstream.
   4263  */
   4264 static int
   4265 opsrsmread(
   4266 	opsrsm_dest_t *rd,	/* Destination pointer */
   4267 	int bufnum,	/* Index of buffer containing packet */
   4268 	int offset,	/* Offset of packet within buffer */
   4269 	int length,	/* Length of packet */
   4270 	ushort_t sap)	/* SAP for packet */
   4271 {
   4272 	opsrsm_t *opsrsmp = opsrsmdev;
   4273 	mblk_t *mp;
   4274 	boolean_t canloan = B_FALSE;
   4275 	boolean_t nobufs = B_FALSE;
   4276 	caddr_t bufptr;
   4277 	int buffree = 0;
   4278 	int calc_sz;
   4279 
   4280 	D1("opsrsmread: rd 0x%p, bufnum %d, offset %d, length %d, sap 0x%x",
   4281 	    (void *)rd, bufnum, offset, length, sap);
   4282 
   4283 	bufptr = (caddr_t)rd->rd_lbuf + ((uint_t)bufnum * rd->rd_lbuflen);
   4284 
   4285 	/* Figure out if we can loan this buffer up or not */
   4286 	mutex_enter(&rd->rd_nlb_lock);
   4287 	nobufs = (rd->rd_rawmem_base_size > freemem * PAGESIZE) &&
   4288 	    ((rd->rd_sstate & OPSRSM_RSMS_LXFER_C) != 0);
   4289 	if (nobufs || rd->rd_nlb < (opsrsmp->opsrsm_param.opsrsm_buffers -
   4290 	    opsrsmp->opsrsm_param.opsrsm_buffers_retained)) {
   4291 		rd->rd_nlb++;
   4292 		canloan = B_TRUE;
   4293 	}
   4294 	mutex_exit(&rd->rd_nlb_lock);
   4295 
   4296 
   4297 	if (canloan) {
   4298 		/*
   4299 		 * We make the mblk cover the whole buffer in case anybody
   4300 		 * wants the leading/trailing space; below we adjust the
   4301 		 * rptr/wptr to describe the actual packet.
   4302 		 */
   4303 		mp = desballoc((uchar_t *)bufptr, rd->rd_lbuflen,
   4304 		    BPRI_LO, &(rd->rd_bufbase+bufnum)->rb_frtn);
   4305 
   4306 		if (mp == NULL) {
   4307 			mutex_enter(&rd->rd_nlb_lock);
   4308 			rd->rd_nlb--;
   4309 			mutex_exit(&rd->rd_nlb_lock);
   4310 
   4311 			opsrsmputfqe(rd, bufnum);
   4312 			buffree = 1;
   4313 
   4314 			opsrsmp->opsrsm_ierrors++;
   4315 			cmn_err(CE_PANIC, "desballoc failed, dropping "
   4316 			    "packet\n");
   4317 			return (1);
   4318 		}
   4319 		mp->b_rptr += offset;
   4320 		mp->b_wptr = mp->b_rptr + length;
   4321 
   4322 		opsrsmp->opsrsm_lbufs++;
   4323 	} else {
   4324 		/*
   4325 		 * We make the destination (within the new mblk) have the
   4326 		 * same address mod 64 as our source, so that the kernel
   4327 		 * bcopy is as efficient as possible.  (This is a sun4u
   4328 		 * bcopy optimization, not a RSM optimization.)
   4329 		 */
   4330 		mp = allocb((size_t)(length + 0x40), BPRI_LO);
   4331 		if (mp) {
   4332 			intptr_t dstoffset = (intptr_t)mp->b_rptr;
   4333 
   4334 			dstoffset = offset - (dstoffset & 0x3f);
   4335 			if (dstoffset < 0)
   4336 				dstoffset += 0x40;
   4337 
   4338 			mp->b_rptr += dstoffset;
   4339 			mp->b_wptr = mp->b_rptr + length;
   4340 			bcopy((void *)(bufptr + offset), (void *)mp->b_rptr,
   4341 			    (size_t)length);
   4342 
   4343 			opsrsmp->opsrsm_nlbufs++;
   4344 			opsrsmputfqe(rd, bufnum);
   4345 			buffree = 1;
   4346 		} else {
   4347 			opsrsmputfqe(rd, bufnum);
   4348 			buffree = 1;
   4349 			opsrsmp->opsrsm_ierrors++;
   4350 			cmn_err(CE_PANIC, "allocb failed, dropping packet\n");
   4351 			return (1);
   4352 		}
   4353 	}
   4354 
   4355 	calc_sz = (int)(OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz +
   4356 	    OPSRSM_MESSAGE_HDRSZ);
   4357 	if (length != calc_sz) {
   4358 		cmn_err(CE_PANIC, "corrupted packet, length = %d, "
   4359 		    "calculated size = %d\n", length, calc_sz);
   4360 	}
   4361 
   4362 	if (OPSRSM_MESSAGE_HDRPTR(mp)->option == OPSRSM_OPT_REXMIT_END) {
   4363 		opsrsm_option_rexmit_end(mp, rd);
   4364 	} else {
   4365 		opsrsmp->opsrsm_in_bytes += (uint32_t)length;
   4366 		opsrsmp->opsrsm_ipackets++;
   4367 		/*
   4368 		 * Do demux right here.
   4369 		 */
   4370 		opsrsmdemux(mp, rd);
   4371 	}
   4372 	D1("opsrsmread: canloan was %d, done", canloan);
   4373 	return (buffree);
   4374 }
   4375 
   4376 /*
   4377  * ****************************************************************
   4378  *                                                               *
   4379  * E N D       NEW DATA TRANSFER LOGIC                           *
   4380  *                                                               *
   4381  * ****************************************************************
   4382  */
   4383 
   4384 /*
   4385  * ****************************************************************
   4386  *                                                               *
   4387  * B E G I N   RSM FAILOVER                                      *
   4388  *                                                               *
   4389  * ****************************************************************
   4390  */
   4391 
   4392 static opsrsm_failover_info_t *
   4393 opsrsm_finfo_add(opsrsm_dest_t *rd)
   4394 {
   4395 	opsrsm_failover_info_t *finfo = opsrsm_finfo_list;
   4396 
   4397 	D1("adding finfo for dest = %p, remote_skey = %d\n",
   4398 	    rd, rd->rd_remote_skey);
   4399 	mutex_enter(&opsrsm_finfo_lock);
   4400 	while (finfo != NULL) {
   4401 		if (finfo->fi_dest == rd &&
   4402 		    finfo->fi_remote_skey == rd->rd_remote_skey &&
   4403 		    finfo->fi_local_skey == rd->rd_local_skey) {
   4404 			mutex_exit(&opsrsm_finfo_lock);
   4405 			return (finfo);
   4406 		}
   4407 		finfo = finfo->fi_next;
   4408 	}
   4409 	ASSERT(finfo == NULL);
   4410 	finfo = (opsrsm_failover_info_t *)kmem_zalloc(sizeof (*finfo),
   4411 	    KM_SLEEP);
   4412 	ASSERT(finfo != NULL);
   4413 	finfo->fi_nodeid = rd->rd_nodeid;
   4414 	finfo->fi_dest = rd;
   4415 	finfo->fi_remote_skey = rd->rd_remote_skey;
   4416 	finfo->fi_local_skey = rd->rd_local_skey;
   4417 	finfo->fi_next_rseqno = 0;
   4418 	finfo->fi_retval = -1;
   4419 	finfo->fi_waiters = 0;
   4420 	finfo->fi_status = 0;
   4421 	finfo->fi_last_accessed = ddi_get_lbolt();
   4422 	OPSRSM_Q_INIT(&finfo->fi_rexmitq);
   4423 	cv_init(&finfo->fi_cv, NULL, CV_DRIVER, NULL);
   4424 	cv_init(&finfo->fi_wait_cv, NULL, CV_DRIVER, NULL);
   4425 	mutex_init(&finfo->fi_lock, NULL, MUTEX_DRIVER, NULL);
   4426 
   4427 	finfo->fi_next = opsrsm_finfo_list;
   4428 	opsrsm_finfo_list = finfo;
   4429 	mutex_exit(&opsrsm_finfo_lock);
   4430 	return (finfo);
   4431 }
   4432 
   4433 static opsrsm_failover_info_t *
   4434 opsrsm_finfo_lookup_by_remote_skey(uint32_t skey)
   4435 {
   4436 	opsrsm_failover_info_t *finfo = opsrsm_finfo_list;
   4437 
   4438 	mutex_enter(&opsrsm_finfo_lock);
   4439 	while (finfo != NULL) {
   4440 		if (finfo->fi_remote_skey == skey) {
   4441 			break;
   4442 		}
   4443 		finfo = finfo->fi_next;
   4444 	}
   4445 	if (finfo != NULL) {
   4446 		finfo->fi_last_accessed = ddi_get_lbolt();
   4447 	}
   4448 	mutex_exit(&opsrsm_finfo_lock);
   4449 	return (finfo);
   4450 }
   4451 
   4452 static opsrsm_failover_info_t *
   4453 opsrsm_finfo_lookup_by_local_skey(uint32_t skey)
   4454 {
   4455 	opsrsm_failover_info_t *finfo = opsrsm_finfo_list;
   4456 
   4457 	mutex_enter(&opsrsm_finfo_lock);
   4458 	while (finfo != NULL) {
   4459 		if (finfo->fi_local_skey == skey) {
   4460 			break;
   4461 		}
   4462 		finfo = finfo->fi_next;
   4463 	}
   4464 	if (finfo != NULL) {
   4465 		finfo->fi_last_accessed = ddi_get_lbolt();
   4466 	}
   4467 	mutex_exit(&opsrsm_finfo_lock);
   4468 	return (finfo);
   4469 }
   4470 
   4471 static int
   4472 opsrsm_finfo_wait(uint32_t skey)
   4473 {
   4474 	opsrsm_failover_info_t *finfo = opsrsm_finfo_list;
   4475 	int error;
   4476 
   4477 	mutex_enter(&opsrsm_finfo_lock);
   4478 	while (finfo != NULL) {
   4479 		if (finfo->fi_local_skey == skey) {
   4480 			break;
   4481 		}
   4482 		finfo = finfo->fi_next;
   4483 	}
   4484 	if (finfo == NULL) {
   4485 		mutex_exit(&opsrsm_finfo_lock);
   4486 		return (0);
   4487 	}
   4488 
   4489 	finfo->fi_last_accessed = ddi_get_lbolt();
   4490 	error = finfo->fi_retval;
   4491 	if (error == -1) {
   4492 		int retval;
   4493 
   4494 		finfo->fi_waiters++;
   4495 		retval = cv_wait_sig(&finfo->fi_cv, &opsrsm_finfo_lock);
   4496 		finfo->fi_waiters--;
   4497 		if (retval == 0) {
   4498 			error = EINTR;
   4499 		} else {
   4500 			error = finfo->fi_retval;
   4501 		}
   4502 	}
   4503 	mutex_exit(&opsrsm_finfo_lock);
   4504 	return (error);
   4505 }
   4506 
   4507 static void
   4508 opsrsm_finfo_wakeup(opsrsm_failover_info_t *finfo, int retval)
   4509 {
   4510 	mutex_enter(&opsrsm_finfo_lock);
   4511 	finfo->fi_retval = retval;
   4512 	cv_broadcast(&finfo->fi_cv);
   4513 	mutex_exit(&opsrsm_finfo_lock);
   4514 }
   4515 
   4516 static void
   4517 opsrsm_finfo_init(void)
   4518 {
   4519 	opsrsm_finfo_list = NULL;
   4520 	opsrsm_failover_taskq = taskq_create("failover", 8, maxclsyspri, 1, 8,
   4521 	    TASKQ_PREPOPULATE);
   4522 	opsrsm_failover_threads = 0;
   4523 	mutex_init(&opsrsm_finfo_lock, NULL, MUTEX_DRIVER, NULL);
   4524 	cv_init(&opsrsm_finfo_cv, NULL, CV_DRIVER, NULL);
   4525 }
   4526 
   4527 static void
   4528 opsrsm_finfo_fini(void)
   4529 {
   4530 	opsrsm_failover_info_t *finfo = opsrsm_finfo_list, *f;
   4531 
   4532 	mutex_enter(&opsrsm_finfo_lock);
   4533 	if (opsrsm_failover_threads > 0) {
   4534 		DINFO("cannot detach yet, failover_threads = %d, "
   4535 		    "need to wait for approx. %d secs\n",
   4536 		    opsrsm_failover_threads, 180 * opsrsm_failover_threads);
   4537 		cv_wait(&opsrsm_finfo_cv, &opsrsm_finfo_lock);
   4538 	}
   4539 	while (finfo != NULL) {
   4540 		ASSERT(finfo->fi_waiters == 0);
   4541 		OPSRSM_Q_FLUSH(&finfo->fi_rexmitq);
   4542 		cv_destroy(&finfo->fi_cv);
   4543 		cv_destroy(&finfo->fi_wait_cv);
   4544 		mutex_destroy(&finfo->fi_lock);
   4545 		f = finfo;
   4546 		finfo = finfo->fi_next;
   4547 		kmem_free(f, sizeof (*f));
   4548 	}
   4549 	mutex_exit(&opsrsm_finfo_lock);
   4550 	taskq_destroy(opsrsm_failover_taskq);
   4551 	cv_destroy(&opsrsm_finfo_cv);
   4552 	mutex_destroy(&opsrsm_finfo_lock);
   4553 }
   4554 
   4555 
   4556 static void
   4557 opsrsm_finfo_destroy(void *arg)
   4558 {
   4559 	opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg;
   4560 	opsrsm_failover_info_t *fptr = opsrsm_finfo_list;
   4561 
   4562 	mutex_enter(&opsrsm_finfo_lock);
   4563 	/*
   4564 	 * delay the destruction if it was touched recently
   4565 	 */
   4566 	if ((ddi_get_lbolt() - finfo->fi_last_accessed) <= 3000) {
   4567 		mutex_exit(&opsrsm_finfo_lock);
   4568 		(void) timeout(opsrsm_finfo_destroy, finfo, 3000);
   4569 		return;
   4570 	}
   4571 	/*
   4572 	 * delay the destruction if there are waiters yet to
   4573 	 * be woken up.
   4574 	 */
   4575 	if (finfo->fi_waiters > 0) {
   4576 		mutex_exit(&opsrsm_finfo_lock);
   4577 		(void) timeout(opsrsm_finfo_destroy, finfo, 10);
   4578 		return;
   4579 	}
   4580 	DINFO("failover: destroying finfo, local_skey 0x%x, "
   4581 	    "remote_skey 0x%x\n", finfo->fi_local_skey,
   4582 	    finfo->fi_remote_skey);
   4583 	if (fptr == finfo) {
   4584 		opsrsm_finfo_list = finfo->fi_next;
   4585 	} else {
   4586 		while (fptr != NULL) {
   4587 			if (fptr->fi_next == finfo) break;
   4588 			fptr = fptr->fi_next;
   4589 		}
   4590 		ASSERT(fptr != NULL);
   4591 		fptr->fi_next = finfo->fi_next;
   4592 	}
   4593 
   4594 	ASSERT(finfo->fi_waiters == 0);
   4595 	cv_destroy(&finfo->fi_cv);
   4596 	cv_destroy(&finfo->fi_wait_cv);
   4597 	mutex_destroy(&finfo->fi_lock);
   4598 	kmem_free(finfo, sizeof (*finfo));
   4599 	opsrsm_failover_threads--;
   4600 	ASSERT(opsrsm_failover_threads >= 0);
   4601 	if (opsrsm_failover_threads == 0) {
   4602 		cv_broadcast(&opsrsm_finfo_cv);
   4603 	}
   4604 	mutex_exit(&opsrsm_finfo_lock);
   4605 }
   4606 
   4607 static void
   4608 opsrsm_dispatch_failover(void *arg)
   4609 {
   4610 	opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg;
   4611 
   4612 	ASSERT(opsrsm_failover_taskq != NULL);
   4613 	if (taskq_dispatch(opsrsm_failover_taskq,
   4614 	    opsrsm_failover_thread, finfo, KM_NOSLEEP) == 0) {
   4615 		(void) timeout(opsrsm_dispatch_failover, finfo, 1);
   4616 	}
   4617 }
   4618 
   4619 /* handler for failover related messages */
   4620 /*ARGSUSED*/
   4621 static void
   4622 opsrsmmsghdlr_finfo(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   4623 {
   4624 	opsrsm_failover_info_t *finfo;
   4625 	uint32_t skey = msg->p.m.finfoquery.skey;
   4626 	uint32_t flag = 0;
   4627 
   4628 	finfo = opsrsm_finfo_lookup_by_local_skey(skey);
   4629 	if (finfo == NULL) {
   4630 		return;
   4631 	}
   4632 	switch (msg->p.hdr.reqtype) {
   4633 	case OPSRSM_MSG_FINFO_DEMUX_DONE:
   4634 		flag = OPSRSM_FINFO_DEMUX_DONE;
   4635 		if ((finfo->fi_status & flag) != 0) break;
   4636 		DINFO("failover: 0x%x peer node demux done\n", skey);
   4637 		break;
   4638 	case OPSRSM_MSG_FINFO_REPLY:
   4639 		flag = OPSRSM_FINFO_DEMUX_DONE | OPSRSM_FINFO_OLD_PROTO;
   4640 		DINFO("failover: 0x%x peer node uses old protocol\n", skey);
   4641 		break;
   4642 	case OPSRSM_MSG_FINFO_REXMIT_ACK:
   4643 		flag = OPSRSM_FINFO_REXMIT_ACKED;
   4644 		DINFO("failover: 0x%x rexmit marker acked\n", skey);
   4645 		break;
   4646 	default:
   4647 		cmn_err(CE_PANIC, "unknown message type\n");
   4648 	}
   4649 
   4650 	mutex_enter(&finfo->fi_lock);
   4651 	finfo->fi_status |= flag;
   4652 	cv_broadcast(&finfo->fi_wait_cv);
   4653 	mutex_exit(&finfo->fi_lock);
   4654 }
   4655 
   4656 static void
   4657 opsrsm_option_rexmit_end(mblk_t *mp, opsrsm_dest_t *rd)
   4658 {
   4659 	opsrsm_queued_msg_t *qmsg;
   4660 	opsrsm_ack_msg_t *ack;
   4661 	opsrsm_failover_info_t *finfo;
   4662 
   4663 	ack = (opsrsm_ack_msg_t *)((caddr_t)mp->b_rptr +
   4664 	    OPSRSM_MESSAGE_HDRSZ);
   4665 	finfo = opsrsm_finfo_lookup_by_local_skey(ack->am_skey);
   4666 	if (finfo == NULL) {
   4667 		freemsg(mp);
   4668 		return;
   4669 	}
   4670 	freemsg(mp);
   4671 
   4672 	mutex_enter(&finfo->fi_lock);
   4673 	finfo->fi_status |= OPSRSM_FINFO_REXMIT_DONE;
   4674 	cv_broadcast(&finfo->fi_wait_cv);
   4675 	mutex_exit(&finfo->fi_lock);
   4676 
   4677 	if (rd->rd_state != OPSRSM_STATE_W_READY) {
   4678 		return;
   4679 	}
   4680 	qmsg = kmem_zalloc(sizeof (opsrsm_queued_msg_t), KM_NOSLEEP);
   4681 	if (qmsg == NULL) {
   4682 		return;
   4683 	}
   4684 
   4685 	qmsg->qm_msg.p.hdr.reqtype = OPSRSM_MSG_FINFO_REXMIT_ACK;
   4686 	qmsg->qm_msg.p.hdr.seqno = 0;
   4687 	qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION;
   4688 	qmsg->qm_msg.p.m.finfoquery.skey = finfo->fi_remote_skey;
   4689 	qmsg->qm_retries = 0;
   4690 
   4691 	opsrsm_queued_msg_append(rd, qmsg);
   4692 	opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG);
   4693 }
   4694 
   4695 static int
   4696 opsrsm_finfo_sendmsg(opsrsm_dest_t *rd, uint8_t type, uint32_t skey)
   4697 {
   4698 	rsm_send_t send_obj;
   4699 	opsrsm_msg_t msg;
   4700 	int retval;
   4701 
   4702 	msg.p.hdr.reqtype = type;
   4703 	msg.p.hdr.seqno = 0;
   4704 	msg.p.hdr.opsrsm_version = OPSRSM_VERSION;
   4705 	msg.p.m.finfoquery.skey = skey;
   4706 	send_obj.is_data = &msg;
   4707 	send_obj.is_size = sizeof (opsrsm_msg_t);
   4708 	send_obj.is_flags = RSM_DLPI_SQFLAGS;
   4709 	send_obj.is_wait = 0;
   4710 	retval = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq,
   4711 	    &send_obj, NULL);
   4712 
   4713 	return (((retval == RSM_SUCCESS) ? 0 : retval));
   4714 }
   4715 
   4716 /*
   4717  * Create a special packet that is used as a marker to the end of
   4718  * a retransmitted stream of packets.
   4719  */
   4720 static mblk_t *
   4721 opsrsm_alloc_ack_msg(uint32_t skey)
   4722 {
   4723 	mblk_t *mp;
   4724 	opsrsm_ack_msg_t *msg;
   4725 
   4726 	mp = allocb(OPSRSM_CACHELINE_SIZE + OPSRSM_MESSAGE_HDRSZ +
   4727 	    sizeof (opsrsm_ack_msg_t) + OPSRSM_CACHELINE_SIZE, BPRI_LO);
   4728 	if (mp == NULL) {
   4729 		return (NULL);
   4730 	}
   4731 	mp->b_rptr = (uchar_t *)OPSRSM_CACHELINE_ROUNDUP(mp->b_rptr);
   4732 
   4733 	OPSRSM_MESSAGE_HDRPTR(mp)->lportnum = 0;
   4734 	OPSRSM_MESSAGE_HDRPTR(mp)->rportnum = 0;
   4735 	OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz = sizeof (opsrsm_ack_msg_t);
   4736 	OPSRSM_MESSAGE_HDRPTR(mp)->nodeid = (int)rsmrdt_my_nodeid;
   4737 	OPSRSM_MESSAGE_HDRPTR(mp)->pkey = 0;
   4738 	OPSRSM_MESSAGE_HDRPTR(mp)->seqno = 0;
   4739 	OPSRSM_MESSAGE_HDRPTR(mp)->option = OPSRSM_OPT_REXMIT_END;
   4740 
   4741 	msg = (opsrsm_ack_msg_t *)((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ);
   4742 	msg->am_skey = skey;
   4743 
   4744 	mp->b_prev = mp->b_cont = NULL;
   4745 	mp->b_wptr = mp->b_rptr + sizeof (opsrsm_ack_msg_t) +
   4746 	    OPSRSM_MESSAGE_HDRSZ;
   4747 
   4748 	return (mp);
   4749 }
   4750 
   4751 /*
   4752  * This thread handles packet retransmission
   4753  */
   4754 static void
   4755 opsrsm_failover_thread(void *arg)
   4756 {
   4757 	opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg;
   4758 	int qlen, isdel, retries, error = 0;
   4759 	opsrsm_dest_t *rd;
   4760 	clock_t stime, wait_time, curr_time;
   4761 
   4762 	/* wait time interval size is 100ms */
   4763 	wait_time = drv_usectohz(100000);
   4764 
   4765 	/* reconnect to the remote node. */
   4766 	rd = opsrsm_connect(finfo->fi_nodeid, NULL);
   4767 	if (rd == NULL) {
   4768 		cmn_err(CE_CONT, "failover: 0x%x failed to reconnect "
   4769 		    "to node %d\n", finfo->fi_local_skey, finfo->fi_nodeid);
   4770 		error = ENETDOWN;
   4771 		goto done;
   4772 	}
   4773 
   4774 	retries = 0;
   4775 	stime = ddi_get_lbolt();
   4776 	for (;;) {
   4777 		/* tell remote node that demux has completed */
   4778 		error = opsrsm_finfo_sendmsg(rd, OPSRSM_MSG_FINFO_DEMUX_DONE,
   4779 		    finfo->fi_remote_skey);
   4780 
   4781 		mutex_enter(&finfo->fi_lock);
   4782 		/*
   4783 		 * wait for DEMUX_DONE message from remote node.
   4784 		 * this is necessary because we must not start
   4785 		 * retransmission before the remote node has demuxed
   4786 		 * all remaining packets in its delivery queue.
   4787 		 */
   4788 		if ((finfo->fi_status & OPSRSM_FINFO_DEMUX_DONE) != 0) {
   4789 			break;
   4790 		}
   4791 		curr_time = ddi_get_lbolt();
   4792 		(void) cv_timedwait(&finfo->fi_wait_cv, &finfo->fi_lock,
   4793 		    curr_time + wait_time);
   4794 
   4795 		/* exit from loop if connection got reset */
   4796 		if (rd->rd_state != OPSRSM_STATE_W_READY) {
   4797 			error = ENETDOWN;
   4798 			break;
   4799 		}
   4800 		if (++retries > opsrsm_failover_max_retries) {
   4801 			error = ETIMEDOUT;
   4802 			opsrsm_lostconn(rd);
   4803 			break;
   4804 		}
   4805 		mutex_exit(&finfo->fi_lock);
   4806 	}
   4807 	mutex_exit(&finfo->fi_lock);
   4808 	curr_time = ddi_get_lbolt();
   4809 	if (error != 0) {
   4810 		DINFO("failover: 0x%x step 1 error %d, wait time %d ms\n",
   4811 		    finfo->fi_local_skey, error,
   4812 		    drv_hztousec(curr_time - stime)/1000);
   4813 		goto done;
   4814 	}
   4815 	DINFO("failover: 0x%x demux done in %d ms, retries = %d\n",
   4816 	    finfo->fi_local_skey, drv_hztousec(curr_time - stime)/1000,
   4817 	    retries);
   4818 	ASSERT((finfo->fi_status & OPSRSM_FINFO_DEMUX_DONE) != 0);
   4819 
   4820 	/*
   4821 	 * retransmission is done by queueing the packets in the rexmitq
   4822 	 * to the sendq of the rd and dispatching a thread to drain the
   4823 	 * sendq.
   4824 	 */
   4825 	qlen = OPSRSM_Q_LEN(&finfo->fi_rexmitq);
   4826 	DINFO("failover: 0x%x rexmiting remaining %d packets\n",
   4827 	    finfo->fi_local_skey, qlen);
   4828 	mutex_enter(&rd->rd_sendq_lock);
   4829 	/*
   4830 	 * need to increment the refcnt before dispatching the xmit
   4831 	 * thread to make sure that the rd does not disappear before
   4832 	 * the xmit thread runs.
   4833 	 */
   4834 	REFDEST(rd, isdel);
   4835 	if (isdel == 0) {
   4836 		mblk_t *mp;
   4837 		uint32_t skey;
   4838 
   4839 		if (qlen > 0) {
   4840 			OPSRSM_Q_CONCAT(&rd->rd_sendq, &finfo->fi_rexmitq);
   4841 		}
   4842 		/*
   4843 		 * a marker is appended at the end of the retransmitted
   4844 		 * stream. an acknowledgement will be sent back when the
   4845 		 * receiver gets this marker. send local skey if old proto
   4846 		 * is used.
   4847 		 */
   4848 		if ((finfo->fi_status & OPSRSM_FINFO_OLD_PROTO) != 0) {
   4849 			skey = finfo->fi_local_skey;
   4850 		} else {
   4851 			skey = finfo->fi_remote_skey;
   4852 		}
   4853 		mp = opsrsm_alloc_ack_msg(skey);
   4854 		if (mp != NULL) {
   4855 			OPSRSM_Q_APPEND(&rd->rd_sendq, mp);
   4856 			opsrsm_dispatch_tmo((void *)rd);
   4857 		} else {
   4858 			error = ENOMEM;
   4859 			UNREFDEST(rd);
   4860 			opsrsm_lostconn(rd);
   4861 		}
   4862 	} else {
   4863 		/*
   4864 		 * failure to increment the refcnt indicates that the
   4865 		 * rd is about to be torn down.
   4866 		 */
   4867 		error = ENETDOWN;
   4868 	}
   4869 	mutex_exit(&rd->rd_sendq_lock);
   4870 	if (error != 0) {
   4871 		DINFO("failover: 0x%x step 2 error %d\n",
   4872 		    finfo->fi_local_skey, error);
   4873 		goto done;
   4874 	}
   4875 
   4876 	retries = 0;
   4877 	stime = ddi_get_lbolt();
   4878 	mutex_enter(&finfo->fi_lock);
   4879 	/*
   4880 	 * we need to wait for two conditions:
   4881 	 * REXMIT_DONE - this indicates that the remote node
   4882 	 *		 has finished retransmitting packets to us.
   4883 	 * REXMIT_ACKED - this indicates that we have finished
   4884 	 *		  retransmitting packets to the remote node.
   4885 	 *
   4886 	 * Note that for backward compatibility with an older version
   4887 	 * of RDT we also need to check the OLD_PROTO flag. if this
   4888 	 * flag is set, we do not wait for the REXMIT_DONE condition.
   4889 	 * this has to be done because the older protocol only transmits
   4890 	 * a marker if there are packets to rexmit and the REXMIT_DONE
   4891 	 * condition cannot be met if the marker never arrives.
   4892 	 */
   4893 	for (;;) {
   4894 		uint_t flag;
   4895 
   4896 		flag = OPSRSM_FINFO_REXMIT_DONE | OPSRSM_FINFO_OLD_PROTO;
   4897 		if ((finfo->fi_status & OPSRSM_FINFO_REXMIT_ACKED) != 0 &&
   4898 		    (finfo->fi_status & flag) != 0) {
   4899 			break;
   4900 		}
   4901 		curr_time = ddi_get_lbolt();
   4902 		(void) cv_timedwait(&finfo->fi_wait_cv, &finfo->fi_lock,
   4903 		    curr_time + wait_time);
   4904 		if (rd->rd_state != OPSRSM_STATE_W_READY) {
   4905 			error = ENETDOWN;
   4906 			break;
   4907 		}
   4908 		if (++retries > opsrsm_failover_max_retries) {
   4909 			error = ETIMEDOUT;
   4910 			opsrsm_lostconn(rd);
   4911 			break;
   4912 		}
   4913 	}
   4914 	mutex_exit(&finfo->fi_lock);
   4915 	curr_time = ddi_get_lbolt();
   4916 	if (error != 0) {
   4917 		DINFO("failover: 0x%x step 3 error %d, wait time %d ms\n",
   4918 		    finfo->fi_local_skey, error, (curr_time - stime) * 10);
   4919 		goto done;
   4920 	}
   4921 	ASSERT((finfo->fi_status & OPSRSM_FINFO_REXMIT_ACKED) != 0);
   4922 	DINFO("failover: 0x%x retransmission done in %d ms\n",
   4923 	    finfo->fi_local_skey, drv_hztousec(curr_time - stime)/1000);
   4924 
   4925 done:;
   4926 	if (rd != NULL) {
   4927 		if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) {
   4928 			if (rd->rd_adapter->sel_cnt > 0)
   4929 				rd->rd_adapter->sel_cnt--;
   4930 		}
   4931 		UNREFDEST(rd);
   4932 	}
   4933 	qlen = OPSRSM_Q_LEN(&finfo->fi_rexmitq);
   4934 	if (qlen > 0) {
   4935 		DINFO("failover: 0x%x reconnect failed, discarding "
   4936 		    "%d packets\n", finfo->fi_local_skey, qlen);
   4937 	}
   4938 	OPSRSM_Q_FLUSH(&finfo->fi_rexmitq);
   4939 	/* senders can now be woken up */
   4940 	opsrsm_finfo_wakeup(finfo, error);
   4941 
   4942 	/* schedule a timeout to destroy the finfo structure */
   4943 	(void) timeout(opsrsm_finfo_destroy, finfo,
   4944 	    opsrsm_failover_destruct_time * wait_time);
   4945 }
   4946 
   4947 
   4948 static void
   4949 opsrsm_reset_rp(opsrsmresource_t *rp, opsrsm_dest_t *rd)
   4950 {
   4951 	mutex_enter(&rp->rs_lock);
   4952 	if ((rp->rs_state & OPSRSM_RS_CONNECTING) != 0) {
   4953 		cv_wait(&rp->rs_conn_cv, &rp->rs_lock);
   4954 	}
   4955 	if (rp->rs_dest == rd && rp->rs_local_skey == rd->rd_local_skey) {
   4956 		rp->rs_state |= OPSRSM_RS_FAILOVER;
   4957 		if ((rp->rs_state & OPSRSM_RS_REFDEST) != 0) {
   4958 			if (opsrsmdev->opsrsm_param.
   4959 			    rsmrdt_enable_loadbalance) {
   4960 				if (rd->rd_adapter->sel_cnt > 0)
   4961 					rp->rs_dest->rd_adapter->sel_cnt--;
   4962 			}
   4963 			rp->rs_state &= ~OPSRSM_RS_REFDEST;
   4964 			UNREFDEST(rp->rs_dest);
   4965 		}
   4966 	}
   4967 	mutex_exit(&rp->rs_lock);
   4968 }
   4969 
   4970 static void
   4971 opsrsm_reset_all_rps(opsrsm_dest_t *rd)
   4972 {
   4973 	int i, j;
   4974 	opsrsmresource_blk_t *blk;
   4975 	opsrsmresource_t *rp;
   4976 
   4977 	rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER);
   4978 	for (i = 0; i < opsrsm_resource.opsrsmrc_len; i++) {
   4979 		blk = opsrsm_resource.opsrsmrc_root[i];
   4980 		if (blk != NULL && blk->opsrsmrcblk_avail < OPSRSMRC_BLKSZ) {
   4981 			for (j = 0; j < OPSRSMRC_BLKSZ; j++) {
   4982 				rp = blk->opsrsmrcblk_blks[j];
   4983 				if (rp != NULL) {
   4984 					opsrsm_reset_rp(rp, rd);
   4985 				}
   4986 			}
   4987 		}
   4988 	}
   4989 	rw_exit(&opsrsm_resource.opsrsmrct_lock);
   4990 }
   4991 
   4992 void
   4993 rsmrdt_failover(adapter_t *adapterp, rsm_addr_t hwaddr)
   4994 {
   4995 	opsrsm_dest_t *rd;
   4996 	int isdel = 0;
   4997 
   4998 	/*
   4999 	 * Scan through all the rds associated with this adapter and
   5000 	 * mark them down.
   5001 	 */
   5002 
   5003 	DINFO("failover: entering, adapter 0x%p\n", adapterp);
   5004 
   5005 	FINDDEST(rd, isdel, hwaddr, adapterp);
   5006 	if (isdel || !rd) {
   5007 		goto out;
   5008 	}
   5009 
   5010 	DINFO("failover: local_skey 0x%x, rd 0x%p\n", rd->rd_local_skey, rd);
   5011 	opsrsm_lostconn(rd);
   5012 	UNREFDEST(rd);
   5013 
   5014 out:;
   5015 	/* call lower failover function */
   5016 	DINFO("failover: exiting, adapter 0x%p\n", adapterp);
   5017 }
   5018 
   5019 /*
   5020  * ****************************************************************
   5021  *                                                               *
   5022  * B E G I N   RSM SETUP/TAKEDOWN                                *
   5023  *                                                               *
   5024  * ****************************************************************
   5025  */
   5026 
   5027 int
   5028 rsmrdt_check_openhandles() {
   5029 	int rval = -1;
   5030 
   5031 	while (rw_tryenter(&opsrsm_resource.opsrsmrct_lock, RW_WRITER) == 0) {
   5032 		delay(1);
   5033 	}
   5034 	if (opsrsm_resource.opsrsmrc_cnt <= 1) {
   5035 		/*
   5036 		 * Can unload module.
   5037 		 */
   5038 		opsrsm_resource.opsrsmrc_flag = OPSRSMRC_UNLOAD_INPROGRESS;
   5039 		rval = 0;
   5040 	}
   5041 	rw_exit(&opsrsm_resource.opsrsmrct_lock);
   5042 	return (rval);
   5043 }
   5044 
   5045 /*
   5046  * Initialize per adapter RSMRDT resources.
   5047  * Return 0 on success, nonzero on error.
   5048  */
   5049 int
   5050 rsmrdt_adapterinit(adapter_t *adapterp)
   5051 {
   5052 	char tqname[32];
   5053 
   5054 	/*
   5055 	 * Initialize mutexes for this device.
   5056 	 */
   5057 	mutex_init(&adapterp->opsrsm_dest_lock, NULL, MUTEX_DRIVER, NULL);
   5058 	mutex_init(&adapterp->opsrsm_runq_lock, NULL, MUTEX_DRIVER, NULL);
   5059 	cv_init(&adapterp->opsrsm_uninit_cv, NULL, CV_DRIVER, NULL);
   5060 
   5061 	(void) sprintf(tqname, "opsrsm%d", adapterp->adapterid);
   5062 	adapterp->opsrsm_taskq = taskq_create(tqname, 4, maxclsyspri, 1, 4,
   5063 		TASKQ_PREPOPULATE);
   5064 	return (0);
   5065 }
   5066 
   5067 
   5068 /*
   5069  * Un-initialize per adapter RSMRDT resources.
   5070  * Returns 0 if completely successful.
   5071  * Returns -1 if not in a state where uninitialize makes sense.
   5072  */
   5073 int
   5074 rsmrdt_adapterfini(adapter_t *adapterp)
   5075 {
   5076 	/*
   5077 	 * If we can't release all destination and RSMPI resources, we can't
   5078 	 * detach.  The user will have to try later to unload the driver.
   5079 	 */
   5080 	D1("rsmrdt_adapterfini: adapterp->adapterid = %d\n",
   5081 	    adapterp->adapterid);
   5082 	if (opsrsmuninit(adapterp) != 0) {
   5083 		return (-1);
   5084 	}
   5085 	taskq_destroy(adapterp->opsrsm_taskq);
   5086 	adapterp->opsrsm_taskq = NULL;
   5087 
   5088 	cv_destroy(&adapterp->opsrsm_uninit_cv);
   5089 	mutex_destroy(&adapterp->opsrsm_runq_lock);
   5090 	mutex_destroy(&adapterp->opsrsm_dest_lock);
   5091 
   5092 	return (0);
   5093 }
   5094 
   5095 
   5096 /*
   5097  * Un-initialize OPSRSM resources.  Returns 0 if completely successful.
   5098  * Returns -1 if not in a state where uninitialize makes sense.  Returns >0
   5099  * if uninitialize was started, but hasn't completed because not all
   5100  * connections have been torn down yet.
   5101  */
   5102 static int
   5103 opsrsmuninit(adapter_t *adapterp)
   5104 {
   5105 	int dests_not_cleaned_up;
   5106 	int total_refcnt = 0;
   5107 	rsm_addr_t i;
   5108 
   5109 	D1("opsrsmuninit: adapterp 0x%p", (void *)adapterp);
   5110 
   5111 	for (i = 0; i < RSM_MAX_DESTADDR; i++)
   5112 		total_refcnt += opsrsmfreedest(adapterp, i);
   5113 
   5114 	mutex_enter(&adapterp->opsrsm_dest_lock);
   5115 	dests_not_cleaned_up = adapterp->opsrsm_numdest;
   5116 	if (total_refcnt > adapterp->opsrsm_numdest) {
   5117 		mutex_exit(&adapterp->opsrsm_dest_lock);
   5118 		DERR("opsrsmuninit: total_refcnt = %d", total_refcnt);
   5119 		return (dests_not_cleaned_up);
   5120 	}
   5121 
   5122 	if (dests_not_cleaned_up > 0) {
   5123 		cv_wait(&adapterp->opsrsm_uninit_cv,
   5124 			&adapterp->opsrsm_dest_lock);
   5125 		dests_not_cleaned_up = adapterp->opsrsm_numdest;
   5126 	}
   5127 	mutex_exit(&adapterp->opsrsm_dest_lock);
   5128 
   5129 	D1("opsrsmuninit: returning %d", dests_not_cleaned_up);
   5130 	return (dests_not_cleaned_up);
   5131 }
   5132 
   5133 /*
   5134  * Get all the opsrsm parameters out of the device tree and store them in a
   5135  * OPSRSM device (RSM controller) structure.
   5136  */
   5137 static void
   5138 opsrsmgetparam(
   5139 	dev_info_t *dip,	/* Device's info pointer */
   5140 	opsrsm_t *opsrsmp)	/* OPSRSM device (RSM controller) pointer */
   5141 {
   5142 	struct opsrsm_param *sp = &opsrsmp->opsrsm_param;
   5143 
   5144 	/* Get parameters */
   5145 
   5146 	sp->opsrsm_buffers = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5147 	    "rsmrdt-buffers", OPSRSM_BUFFERS_DFLT);
   5148 	sp->opsrsm_buffer_size = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5149 	    "rsmrdt-buffer-size", OPSRSM_BUFFER_SIZE_DFLT) +
   5150 	    OPSRSM_CACHELINE_SIZE;
   5151 	sp->opsrsm_queue_size = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5152 	    "rsmrdt-queue-size", OPSRSM_QUEUE_SIZE_DFLT);
   5153 	sp->opsrsm_buffers_retained = (ushort_t)ddi_getprop(DDI_DEV_T_ANY,
   5154 	    dip, 0, "rsmrdt-buffers-retained", OPSRSM_BUFFERS_RETAINED_DFLT);
   5155 	sp->opsrsm_max_queued_pkts = (ushort_t)ddi_getprop(DDI_DEV_T_ANY,
   5156 	    dip, 0, "rsmrdt-max-queued-pkts", OPSRSM_MAX_QUEUED_PKTS_DFLT);
   5157 	sp->opsrsm_msg_init_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5158 	    "rsmrdt-msg-init-tmo", OPSRSM_MSG_INIT_TMO_DFLT);
   5159 	sp->opsrsm_msg_max_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5160 	    "rsmrdt-msg-max-tmo", OPSRSM_MSG_MAX_TMO_DFLT);
   5161 	sp->opsrsm_msg_drop_tmo = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5162 	    "rsmrdt-msg-drop-tmo", OPSRSM_MSG_DROP_TMO_DFLT);
   5163 	sp->opsrsm_ack_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5164 	    "rsmrdt-ack-tmo", OPSRSM_ACK_TMO_DFLT);
   5165 	sp->opsrsm_sync_tmo = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5166 	    "rsmrdt-sync-tmo", OPSRSM_SYNC_TMO_DFLT);
   5167 	sp->opsrsm_fqe_sync_size = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5168 	    "rsmrdt-fqe-sync-size", OPSRSM_FQE_SYNC_SIZE_DFLT);
   5169 	sp->opsrsm_retry_limit = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5170 	    "rsmrdt-retry-limit", OPSRSM_RETRY_LIMIT_DFLT);
   5171 	sp->opsrsm_retry_delay = ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5172 	    "rsmrdt-retry-delay", OPSRSM_RETRY_DELAY_DFLT);
   5173 	sp->opsrsm_xmit_delay = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5174 	    "rsmrdt-xmit-delay", OPSRSM_XMIT_DELAY_DFLT);
   5175 	sp->opsrsm_data_threshold = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5176 	    "rsmrdt-data-threshold", OPSRSM_DATA_THRESHOLD_DFLT);
   5177 	sp->opsrsm_max_recv_msgs = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5178 	    "rsmrdt-max-recv-msgs", OPSRSM_MAX_RECV_MSGS_DFLT);
   5179 	sp->opsrsm_adaptive_intr = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5180 	    "rsmrdt-adaptive-intr", OPSRSM_ADAPTIVE_INTR_DFLT);
   5181 	sp->opsrsm_adaptive_rate = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5182 	    "rsmrdt-adaptive-rate", OPSRSM_ADAPTIVE_RATE_DFLT);
   5183 	sp->opsrsm_mem_hi_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5184 	    "rsmrdt-mem-hi-wat", OPSRSM_MEM_HI_WAT_DFLT);
   5185 	sp->opsrsm_mem_lo_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5186 	    "rsmrdt-mem-lo-wat", OPSRSM_MEM_LO_WAT_DFLT);
   5187 	sp->opsrsm_recv_hi_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5188 	    "rsmrdt-recv-hi-wat", OPSRSM_RECV_HI_WAT_DFLT);
   5189 	sp->opsrsm_recv_lo_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5190 	    "rsmrdt-recv-lo-wat", OPSRSM_RECV_LO_WAT_DFLT);
   5191 	sp->opsrsm_flow_tmo_int = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0,
   5192 	    "rsmrdt-flow-tmo-int", OPSRSM_FLOW_TMO_INT_DFLT);
   5193 	sp->opsrsm_max_loopback_pkts = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip,
   5194 	    0, "rsmrdt-max-loopback-pkts", OPSRSM_MAX_LOOPBACK_PKTS_DFLT);
   5195 	sp->rsmrdt_enable_loadbalance = (ushort_t)ddi_getprop(DDI_DEV_T_ANY,
   5196 	    dip, 0, "rsmrdt-enable-loadbalance",
   5197 	    RSMRDT_ENABLE_LOADBALANCE_DFLT);
   5198 
   5199 	/*
   5200 	 * Sanity check parameters, modify if needed.  Note that we mainly
   5201 	 * check to make sure parameters won't make the driver malfunction;
   5202 	 * we don't necessarily prevent them from being stupid.
   5203 	 */
   5204 
   5205 	/* Need to have at least one buffer. */
   5206 
   5207 	if (sp->opsrsm_buffers == 0)
   5208 		sp->opsrsm_buffers = 1;
   5209 
   5210 	/* Buffer length must be multiple of 64 (0x40). */
   5211 
   5212 	if (sp->opsrsm_buffer_size & ~OPSRSM_CACHELINE_MASK) {
   5213 		sp->opsrsm_buffer_size &= OPSRSM_CACHELINE_MASK;
   5214 	}
   5215 
   5216 	/*
   5217 	 * Must have at least one more queue element then the number of
   5218 	 * buffers.  This is so that we can track when all queue elements
   5219 	 * need to be flushed to remote.
   5220 	 */
   5221 
   5222 	if (sp->opsrsm_queue_size <= sp->opsrsm_buffers) {
   5223 		sp->opsrsm_queue_size = sp->opsrsm_buffers + 1;
   5224 	}
   5225 
   5226 	/* Can't retain more buffers than we have. */
   5227 
   5228 	if (sp->opsrsm_buffers_retained > sp->opsrsm_buffers) {
   5229 		sp->opsrsm_buffers_retained = sp->opsrsm_buffers;
   5230 	}
   5231 
   5232 	/* Have to be able to queue at least 1 packet. */
   5233 
   5234 	if (sp->opsrsm_max_queued_pkts < 1) {
   5235 		sp->opsrsm_max_queued_pkts = 1;
   5236 	}
   5237 
   5238 	if (sp->opsrsm_msg_init_tmo < 1) {
   5239 		sp->opsrsm_msg_init_tmo = 1;
   5240 	}
   5241 	if (sp->opsrsm_msg_max_tmo < 1) {
   5242 		sp->opsrsm_msg_max_tmo = 1;
   5243 	}
   5244 	if (sp->opsrsm_ack_tmo < 1) {
   5245 		sp->opsrsm_ack_tmo = 1;
   5246 	}
   5247 	if (sp->opsrsm_sync_tmo < 1) {
   5248 		sp->opsrsm_sync_tmo = 1;
   5249 	}
   5250 	if (sp->opsrsm_retry_limit < 1) {
   5251 		sp->opsrsm_retry_limit = 1;
   5252 	}
   5253 	if (sp->opsrsm_retry_delay < 1) {
   5254 		sp->opsrsm_retry_delay = 1;
   5255 	}
   5256 	if (sp->opsrsm_xmit_delay < 1) {
   5257 		sp->opsrsm_xmit_delay = 1;
   5258 	}
   5259 
   5260 	if (sp->opsrsm_max_recv_msgs < 1) {
   5261 		sp->opsrsm_max_recv_msgs = OPSRSM_MAX_RECV_MSGS_DFLT;
   5262 	}
   5263 
   5264 }
   5265 
   5266 /*
   5267  * ****************************************************************
   5268  *                                                               *
   5269  * E N D       RSM SETUP/TAKEDOWN                                *
   5270  *                                                               *
   5271  * ****************************************************************
   5272  */
   5273 
   5274 
   5275 /*
   5276  * ****************************************************************
   5277  *                                                               *
   5278  * B E G I N   CONNECTION DATA STRUCTURE MANAGEMENT              *
   5279  *                                                               *
   5280  * ****************************************************************
   5281  */
   5282 
   5283 
   5284 /*
   5285  * Create the indicated destination structure, and return a pointer to it.
   5286  * NOTE:  this should never be called directly; use the MAKEDEST macro
   5287  * instead.  The macro checks that the destination structure does not yet
   5288  * exist before calling this function.
   5289  */
   5290 
   5291 
   5292 static opsrsm_dest_t *
   5293 opsrsmmkdest(adapter_t *adapterp,
   5294 	rsm_addr_t rsm_addr)	/* Address of destination to find/create */
   5295 {
   5296 	opsrsm_dest_t *rd;
   5297 	clock_t lbolttime;
   5298 	int adapterid, nodeid;
   5299 
   5300 	D1("opsrsmmkdest:cltr %d, rsmaddr %ld", adapterp->instance, rsm_addr);
   5301 
   5302 	/* Is the destination reasonable? */
   5303 
   5304 	if (rsm_addr >= RSM_MAX_DESTADDR) {
   5305 		DERR("opsrsmmkdest: too big, returning NULL");
   5306 		return (NULL);
   5307 	}
   5308 
   5309 	if ((rd = adapterp->opsrsm_desttbl[rsm_addr]) != NULL) {
   5310 		return (rd);
   5311 	}
   5312 
   5313 	ASSERT(MUTEX_HELD(&adapterp->opsrsm_dest_lock));
   5314 
   5315 	/* retrieve the remote adapter id and remote node id */
   5316 	rsmrdt_get_remote_ids(adapterp, rsm_addr, &adapterid, &nodeid);
   5317 	if (adapterid == -1 || nodeid == -1) {
   5318 		DERR("opsrsmmkdest: Unable to find remote ids\n");
   5319 		return (NULL);
   5320 	}
   5321 	D1("opsrsmmkdest: Remote adapter id %d\n", adapterid);
   5322 
   5323 	if ((rd = (opsrsm_dest_t *)kmem_zalloc(sizeof (*rd), KM_NOSLEEP)) ==
   5324 	    NULL) {
   5325 		DERR("opsrsmmkdest: can't alloc, returning NULL");
   5326 		return (NULL);
   5327 	}
   5328 
   5329 	rd->rd_evt_taskq = taskq_create("rd_events", 1, maxclsyspri, 1, 1,
   5330 	    TASKQ_PREPOPULATE);
   5331 
   5332 	if (rd->rd_evt_taskq == NULL) {
   5333 		kmem_free(rd, sizeof (*rd));
   5334 		return (NULL);
   5335 	}
   5336 	mutex_init(&rd->rd_msgs_lock, NULL, MUTEX_DRIVER, NULL);
   5337 	mutex_init(&rd->rd_evt_lock, NULL, MUTEX_DRIVER, NULL);
   5338 	cv_init(&rd->rd_evt_cv, NULL, CV_DRIVER, NULL);
   5339 	cv_init(&rd->rd_evt_wait_cv, NULL, CV_DRIVER, NULL);
   5340 	rd->rd_evt_flags = 0;
   5341 	rd->rd_msgs = NULL;
   5342 	rd->rd_msgs_tail = NULL;
   5343 
   5344 	if (taskq_dispatch(rd->rd_evt_taskq, opsrsm_event_thread,
   5345 	    rd, KM_NOSLEEP) == 0) {
   5346 		mutex_destroy(&rd->rd_msgs_lock);
   5347 		mutex_destroy(&rd->rd_evt_lock);
   5348 		cv_destroy(&rd->rd_evt_cv);
   5349 		cv_destroy(&rd->rd_evt_wait_cv);
   5350 		kmem_free(rd, sizeof (*rd));
   5351 		return (NULL);
   5352 	}
   5353 
   5354 	rd->rd_buffer_size = opsrsmdev->opsrsm_param.opsrsm_buffer_size;
   5355 	rd->rd_rsm_addr = rsm_addr;
   5356 	rd->rd_rem_adapterid = adapterid;
   5357 	rd->rd_adapter = adapterp;
   5358 
   5359 	mutex_init(&rd->rd_net_lock, NULL, MUTEX_DRIVER, NULL);
   5360 	mutex_init(&rd->rd_fqr_lock, NULL, MUTEX_DRIVER, NULL);
   5361 	mutex_init(&rd->rd_xmit_lock, NULL, MUTEX_DRIVER, NULL);
   5362 	mutex_init(&rd->rd_lock, NULL, MUTEX_DRIVER, NULL);
   5363 	mutex_init(&rd->rd_sendq_lock, NULL, MUTEX_DRIVER, NULL);
   5364 	mutex_init(&rd->rd_tmo_lock, NULL, MUTEX_DRIVER, NULL);
   5365 	mutex_init(&rd->rd_freeq_lock, NULL, MUTEX_DRIVER, NULL);
   5366 	cv_init(&rd->rd_conn_cv, NULL, CV_DRIVER, NULL);
   5367 
   5368 	/*
   5369 	 * Use the time to generate a pseudo-random initial sequence
   5370 	 * number.
   5371 	 */
   5372 	(void) drv_getparm(LBOLT, &lbolttime);
   5373 	rd->rd_nseq = (ushort_t)lbolttime;
   5374 
   5375 	rd->rd_state = OPSRSM_STATE_NEW;
   5376 	rd->rd_xmit_state = OPSRSM_XMIT_UNINITIALIZED;
   5377 	rd->rd_nodeid = nodeid;
   5378 	rd->rd_start_time = 0;
   5379 	rd->rd_data_collected = 0;
   5380 	rd->rd_writes_completed = 0;
   5381 	rd->rd_active_threads = 0;
   5382 	rd->rd_xmit_tmo_id = 0;
   5383 	rd->rd_fqe_tmo_id = 0;
   5384 	rd->rd_sync_dqe_tmo_id = 0;
   5385 	rd->rd_sync_fqe_tmo_id = 0;
   5386 	rd->rd_xmit_tmo_int = 0;
   5387 	rd->rd_fqe_tmo_int = 0;
   5388 	rd->rd_refcnt = 1;
   5389 	rd->rd_pollhd.ph_list = NULL;
   5390 	rd->rd_events = 0;
   5391 	rd->rd_adaptive_threshold = 0;
   5392 	rd->rd_last_sent = (uint32_t)lbolttime;
   5393 	rd->rd_pkt_freq = 0;
   5394 	rd->rd_freeq_freeze = B_FALSE;
   5395 	rd->rd_next_rseqno = 1;
   5396 	rd->rd_next_lseqno = 1;
   5397 	rd->rd_local_skey = (uint32_t)gethrtime();
   5398 	rd->rd_local_skey ^= (uint32_t)rd + (uint32_t)rsm_addr;
   5399 	rd->rd_remote_skey = 0;
   5400 	rd->rd_retry_int = B_FALSE;
   5401 	rd->rd_freeq_freeze = B_FALSE;
   5402 	rd->rd_freed = B_FALSE;
   5403 	rd->rd_unpublish_errs = 0;
   5404 	OPSRSM_Q_INIT(&rd->rd_sendq);
   5405 	OPSRSM_Q_INIT(&rd->rd_pendq);
   5406 	OPSRSM_Q_INIT(&rd->rd_freeq);
   5407 	rd->rd_remote_flow_stop = 0;
   5408 	rd->rd_remote_flow_ctl = 0;
   5409 	rd->rd_flow_ctl = NULL;
   5410 	rd->rd_flow_tmo_id = 0;
   5411 	rd->rd_pkts_delivered = 0;
   5412 	rd->rd_status_tmo_id = 0;
   5413 	rd->rd_queued_fqe_freelist = NULL;
   5414 	rd->rd_queued_fqe_list = NULL;
   5415 	rd->rd_queued_fqe_tail = NULL;
   5416 	rd->rd_queued_fqe_array = NULL;
   5417 	rd->rd_queued_fqe_cnt = 0;
   5418 	rd->rd_fqr_flags = 0;
   5419 
   5420 	adapterp->opsrsm_desttbl[rsm_addr] = rd;
   5421 	adapterp->opsrsm_numdest++;
   5422 
   5423 	D1("opsrsmmkdest: created new dest, returning 0x%p", (void *)rd);
   5424 	return (rd);
   5425 }
   5426 
   5427 /*
   5428  * Destination deletion
   5429  *
   5430  * As mentioned above (way above), we maintain a reference count on all
   5431  * destinations, which is incremented and decremented around uses of the
   5432  * destination structure.  When this reference count goes to zero, we delete
   5433  * the destination.
   5434  *
   5435  * Because of the possibility of other threads trying to use the destination
   5436  * while we're deleting it, deletion is actually a multiple-step process,
   5437  * which works as follows.
   5438  *
   5439  * 1. When a destination is created, its dstate (deletion state) is set to
   5440  *    zero, and its reference count is set to one.
   5441  *
   5442  * 2. When the service routine or some other routine decides that a destination
   5443  *    should be deleted, it calls opsrsmfreedest().  That routine sets dstate
   5444  *    to 1 and cancels any pending sync timeouts.  It then decrements the
   5445  *    destination's reference count.  This deletes the reference set in
   5446  *    opsrsmmkdest. (Note that since dstate is now 1, the FINDDEST and REFDEST
   5447  *    macros will now note that the destination is being deleted; thus, any
   5448  *    interrupt referring to the destination will no longer modify the
   5449  *    reference count.)
   5450  *
   5451  * 3. Soon after this, opsrsmdest_refcnt_0 is called.  (This may either be
   5452  *    directly from opsrsmfreedest(), or perhaps from another routine if it
   5453  *    was running concurrently with freedest() and its UNREF happened last).
   5454  *    This routine sees that dstate is 1, and immediately queues a timeout
   5455  *    which will execute opsrsmfreedesttmo().  (This is necessary because we
   5456  *    may not be able to do everything in the phase 1 deletion from the routine
   5457  *    that we're currently in.)
   5458  *
 * 4. opsrsmfreedesttmo() runs and checks if there are any outstanding
 *    loaned-up buffers.  If so, it sets a flag to cause the loan returning
 *    code to decrement the refcnt, and returns without performing cleanup.
 *    When all loaned buffers are returned and the refcnt is decremented, we
 *    go back to step 3, above.  When opsrsmfreedesttmo() finally runs with
 *    no loaned buffers, it gets rid of most of the OPSRSM resources attached
 *    to the destination.  It also throws away any queued packets and gets
 *    rid of any allocated DVMA resources.  It changes dstate to 2 and takes
 *    this destination structure out of the base-ID => destination table.
 *    It then decrements the reference count that had been added by
 *    opsrsmdest_refcnt_0().
   5470  *
   5471  * 5. When the reference count becomes 0, opsrsmdest_refcnt_0 is again called.
   5472  *    It notices that dstate is 2, and frees the destination structure.
   5473  */
   5474 
   5475 /*
   5476  * A destination's reference count went to 0, deal with it.
   5477  */
   5478 static boolean_t
   5479 opsrsmdest_refcnt_0(
   5480 	opsrsm_dest_t *rd)	/* Destination pointer */
   5481 {
   5482 	opsrsm_t *opsrsmp = opsrsmdev;
   5483 	adapter_t *adapterp = rd->rd_adapter;
   5484 	boolean_t freed = B_FALSE;
   5485 
   5486 	mutex_enter(&adapterp->opsrsm_dest_lock);
   5487 
   5488 	D1("opsrsmdest_refcnt_0: rd 0x%p (addr %ld ctlr %d), refcnt %d, "
   5489 	    "dstate %d",
   5490 	    (void *)rd, rd->rd_rsm_addr, adapterp->instance,
   5491 	    rd->rd_refcnt, rd->rd_dstate);
   5492 
   5493 	if (rd->rd_dstate == 1) {
   5494 		rd->rd_refcnt++;	/* Inline REFDEST */
   5495 
   5496 		DINFO("failover: 0x%x start destruction\n", rd->rd_local_skey);
   5497 		/*
   5498 		 * We may be called from a routine that can't actually do the
   5499 		 * work that needs to be done, so we schedule a thread to do
   5500 		 * the next phase of the deletion.
   5501 		 */
   5502 		(void) taskq_dispatch(opsrsm_events_taskq, opsrsmfreedesttmo,
   5503 		    rd, KM_SLEEP);
   5504 
   5505 	} else if (rd->rd_dstate == 2) {
   5506 
   5507 		/* Destroy all the mutexes */
   5508 		DINFO("failover: 0x%x end destruction\n", rd->rd_local_skey);
   5509 		opsrsm_queued_msg_flush(rd);
   5510 
   5511 		mutex_destroy(&rd->rd_lock);
   5512 		mutex_destroy(&rd->rd_net_lock);
   5513 		mutex_destroy(&rd->rd_fqr_lock);
   5514 		mutex_destroy(&rd->rd_xmit_lock);
   5515 		mutex_destroy(&rd->rd_nlb_lock);
   5516 		mutex_destroy(&rd->rd_sendq_lock);
   5517 		mutex_destroy(&rd->rd_tmo_lock);
   5518 		mutex_destroy(&rd->rd_freeq_lock);
   5519 		mutex_destroy(&rd->rd_msgs_lock);
   5520 		mutex_destroy(&rd->rd_evt_lock);
   5521 
   5522 		cv_destroy(&rd->rd_conn_cv);
   5523 		cv_destroy(&rd->rd_evt_cv);
   5524 		cv_destroy(&rd->rd_evt_wait_cv);
   5525 
   5526 		/*
   5527 		 * Free any allocated memory hanging off the dest structure.
   5528 		 */
   5529 		if (rd->rd_queued_fqe_array) {
   5530 			rd->rd_queued_fqe_freelist = NULL;
   5531 			rd->rd_queued_fqe_list = NULL;
   5532 			rd->rd_queued_fqe_tail = NULL;
   5533 			kmem_free(rd->rd_queued_fqe_array,
   5534 			    opsrsmp->opsrsm_param.opsrsm_queue_size *
   5535 			    sizeof (opsrsm_queued_fqe_t));
   5536 			rd->rd_queued_fqe_array = NULL;
   5537 			rd->rd_queued_fqe_cnt = 0;
   5538 		}
   5539 
   5540 		if (rd->rd_cached_fqr) {
   5541 			kmem_free(rd->rd_cached_fqr,
   5542 			    sizeof (*rd->rd_cached_fqr) * rd->rd_num_fqrs);
   5543 		}
   5544 		if (rd->rd_shdwfqw_f) {
   5545 			kmem_free(rd->rd_shdwfqw_f,
   5546 			    sizeof (*rd->rd_shdwfqw_f) * rd->rd_num_fqws);
   5547 		}
   5548 		if (rd->rd_shdwdqw_f) {
   5549 			kmem_free(rd->rd_shdwdqw_f,
   5550 			    sizeof (*rd->rd_shdwdqw_f) * rd->rd_num_dqws);
   5551 		}
   5552 		if (rd->rd_bufbase) {
   5553 			kmem_free(rd->rd_bufbase,
   5554 			    opsrsmp->opsrsm_param.opsrsm_buffers *
   5555 			    sizeof (*rd->rd_bufbase));
   5556 		}
   5557 		if (rd->rd_rawmem_base_addr) {
   5558 			kmem_free(rd->rd_rawmem_base_addr,
   5559 			    rd->rd_rawmem_base_size);
   5560 		}
   5561 
   5562 		/* Finally free the dest structure */
   5563 
   5564 		kmem_free(rd, sizeof (*rd));
   5565 		freed = B_TRUE;
   5566 
   5567 		adapterp->opsrsm_numdest--;
   5568 		D1("opsrsmdest_refcnt_0: freed rd data structures");
   5569 	}
   5570 
   5571 	if (freed && adapterp->opsrsm_numdest <= 0) {
   5572 		cv_signal(&adapterp->opsrsm_uninit_cv);
   5573 	}
   5574 	mutex_exit(&adapterp->opsrsm_dest_lock);
   5575 
   5576 	D1("opsrsmdest_refcnt_0: done");
   5577 	return (freed);
   5578 }
   5579 
   5580 /*
   5581  * Do deletion work.
   5582  */
   5583 static void
   5584 opsrsmfreedesttmo(void * arg)
   5585 {
   5586 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   5587 	int bufnum, offset, length;
   5588 	opsrsm_failover_info_t *finfo;
   5589 	ushort_t sap;
   5590 	int err, cnt;
   5591 	boolean_t read_pkts = B_FALSE;
   5592 
   5593 	/*
   5594 	 * See if there are any more outstanding loaned buffers.  If so,
   5595 	 * set flag so that freebuf will eventually do an UNREF when it
   5596 	 * frees the last buffer.  This removes the reference added in
   5597 	 * opsrsmdest_refcnt_0(), causing the count to again go to 0.
   5598 	 * opsrsmdest_refcnt_0() will again be called, increment the refcnt
   5599 	 * and cause this routine to be called to complete cleanup.
   5600 	 */
   5601 
   5602 	mutex_enter(&rd->rd_nlb_lock);
   5603 
   5604 	rd->rd_nlb_del = 1;
   5605 	if (rd->rd_nlb != 0) {
   5606 		DERR("opsrsmfreedesttmo: loaned buffers outstanding %d, dest "
   5607 		    "%ld", rd->rd_nlb, rd->rd_rsm_addr);
   5608 		mutex_exit(&rd->rd_nlb_lock);
   5609 		return;
   5610 	}
   5611 
   5612 	mutex_exit(&rd->rd_nlb_lock);
   5613 
   5614 	/*
   5615 	 * Perform the sendq destroy first -- this notifies the
   5616 	 * remote side that the connection is going away, so
   5617 	 * it can immediately start cleaning up.  This helps
   5618 	 * to avoid a situation where a segment is unpublished
   5619 	 * while there is still a connection to it (which is legal,
   5620 	 * but can cause overhead in some specific RSM drivers).
   5621 	 */
   5622 	if (rd->rd_sstate & OPSRSM_RSMS_RXFER_S) {
   5623 		ASSERT(rd->rsm_sendq);
   5624 		D4("opsrsmfreedesttmo: destroying sendq\n");
   5625 		err = RSM_SENDQ_DESTROY(rd->rd_adapter->rsmrdt_ctlr_obj,
   5626 			rd->rsm_sendq);
   5627 		if (err) {
   5628 			DERR("RSM_SENDQ_DESTROY failed! err %d\n", err);
   5629 		}
   5630 		rd->rd_sstate &= ~OPSRSM_RSMS_RXFER_S;
   5631 	}
   5632 
   5633 	if (rd->rd_sstate & OPSRSM_RSMS_RXFER_C) {
   5634 		ASSERT(rd->rd_rxferhand);
   5635 		D1("opsrsmfreedesttmo: disconnecting from remote segment\n");
   5636 		err = RSM_DISCONNECT(rd->rd_adapter->rsmrdt_ctlr_obj,
   5637 			rd->rd_rxferhand);
   5638 		if (err) {
   5639 			DERR("RSM_DISCONNECT failed! err %d\n", err);
   5640 		}
   5641 		rd->rd_sstate &= ~OPSRSM_RSMS_RXFER_C;
   5642 	}
   5643 
   5644 	if (rd->rd_sstate & OPSRSM_RSMS_LXFER_P) {
   5645 		ASSERT(rd->rd_lxferhand);
   5646 		D1("opsrsmfreedesttmo: unpublishing local segment\n");
   5647 retry:;
   5648 		err = RSM_UNPUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj,
   5649 		    rd->rd_lxferhand);
   5650 
   5651 		if (err && rd->rd_unpublish_errs < RSMRDT_UNPUBLISH_TRY) {
   5652 			D1("RSM_UNPUBLISH failed! err %d\n", err);
   5653 			opsrsmdev->opsrsm_ierrors++;
   5654 			rd->rd_unpublish_errs++;
   5655 			/*
   5656 			 * if RSM_UNPUBLISH fails, we need a slight delay
   5657 			 * before retrying.
   5658 			 */
   5659 			delay(1);
   5660 			goto retry;
   5661 		}
   5662 		rd->rd_unpublish_errs = 0;
   5663 		rd->rd_sstate &= ~OPSRSM_RSMS_LXFER_P;
   5664 	}
   5665 
   5666 	if (rd->rd_sstate & OPSRSM_RSMS_LXFER_C) {
   5667 		ASSERT(rd->rd_lxferhand);
   5668 		D1("opsrsmfreedesttmo: destroying local segment\n");
   5669 		err = RSM_SEG_DESTROY(rd->rd_adapter->rsmrdt_ctlr_obj,
   5670 			rd->rd_lxferhand);
   5671 		if (err) {
   5672 			DERR("RSM_SEG_DESTROY failed! err %d\n", err);
   5673 		}
   5674 		rd->rd_sstate &= ~OPSRSM_RSMS_LXFER_C;
   5675 		read_pkts = B_TRUE;
   5676 	}
   5677 
   5678 	/* empty out packets remaining in the buffer pool */
   5679 
   5680 	if (read_pkts) {
   5681 		cnt = 0;
   5682 		/* Loop through all valid DQE's and process their packets. */
   5683 		while (opsrsmgetdqe(rd, &bufnum, &offset, &length, &sap)) {
   5684 			/* Don't try to send up DQE with zero length */
   5685 			cnt++;
   5686 			if (length)
   5687 				(void) opsrsmread(rd, bufnum, offset, length,
   5688 				    sap);
   5689 			else {
   5690 				cmn_err(CE_PANIC, "received corrupted "
   5691 				    "packet\n");
   5692 			}
   5693 		}
   5694 		if (cnt > 0) {
   5695 			DINFO("failover: 0x%x read %d remaining packets\n",
   5696 			    rd->rd_local_skey, cnt);
   5697 		}
   5698 	}
   5699 	opsrsm_wake_senders(rd, POLLOUT);
   5700 
   5701 	/* Take out of desttbl */
   5702 	mutex_enter(&rd->rd_adapter->opsrsm_dest_lock);
   5703 	rd->rd_adapter->opsrsm_desttbl[rd->rd_rsm_addr] = NULL;
   5704 	ASSERT(rd->rd_dstate == 1);
   5705 	rd->rd_dstate = 2;
   5706 	mutex_exit(&rd->rd_adapter->opsrsm_dest_lock);
   5707 
   5708 	finfo = opsrsm_finfo_lookup_by_local_skey(rd->rd_local_skey);
   5709 	ASSERT(finfo != NULL);
   5710 
   5711 	OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_freeq);
   5712 	OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_pendq);
   5713 	OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_sendq);
   5714 	finfo->fi_next_rseqno = rd->rd_next_rseqno;
   5715 
   5716 	/*
   5717 	 * no need to rexmit if there are no open fds or if driver
   5718 	 * is being unloaded
   5719 	 */
   5720 	if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) {
   5721 		DINFO("failover: rexmit thread not dispatched\n");
   5722 		OPSRSM_Q_FLUSH(&finfo->fi_rexmitq);
   5723 		opsrsm_finfo_wakeup(finfo, ENETDOWN);
   5724 		if (opsrsm_resource.opsrsmrc_flag !=
   5725 		    OPSRSMRC_UNLOAD_INPROGRESS) {
   5726 			mutex_enter(&opsrsm_finfo_lock);
   5727 			opsrsm_failover_threads++;
   5728 			mutex_exit(&opsrsm_finfo_lock);
   5729 			(void) timeout(opsrsm_finfo_destroy, finfo, 10);
   5730 		}
   5731 	} else {
   5732 		DINFO("failover: rd 0x%p adapter 0x%p (cltr %d) addr 0x%llx\n",
   5733 		    rd, rd->rd_adapter, rd->rd_adapter->instance,
   5734 		    rd->rd_rsm_addr);
   5735 		DINFO("failover: local_skey 0x%x, remote_skey 0x%x, "
   5736 		    "finfo 0x%p\n", rd->rd_local_skey, rd->rd_remote_skey,
   5737 		    finfo);
   5738 		mutex_enter(&opsrsm_finfo_lock);
   5739 		opsrsm_failover_threads++;
   5740 		mutex_exit(&opsrsm_finfo_lock);
   5741 		if (OPSRSM_Q_LEN(&finfo->fi_rexmitq) == 0 &&
   5742 		    finfo->fi_remote_skey == 0) {
   5743 			/*
   5744 			 * this occurs when segment establishment
   5745 			 * failed. there is no need to keep the finfo
   5746 			 * since no data transfer ever occurred.
   5747 			 */
   5748 			opsrsm_finfo_wakeup(finfo, ENETDOWN);
   5749 			(void) timeout(opsrsm_finfo_destroy, finfo, 10);
   5750 		} else {
   5751 			(void) timeout(opsrsm_dispatch_failover, finfo, 100);
   5752 		}
   5753 	}
   5754 	/* Make sure dest isn't on service queue */
   5755 	mutex_enter(&rd->rd_adapter->opsrsm_runq_lock);
   5756 
   5757 	if (rd->rd_adapter->opsrsm_runq == rd)
   5758 		rd->rd_adapter->opsrsm_runq = rd->rd_next;
   5759 	else {
   5760 		opsrsm_dest_t *lastrd = rd->rd_adapter->opsrsm_runq;
   5761 
   5762 		while (lastrd) {
   5763 			if (lastrd->rd_next == rd) {
   5764 				lastrd->rd_next = rd->rd_next;
   5765 				break;
   5766 			}
   5767 			lastrd = lastrd->rd_next;
   5768 		}
   5769 	}
   5770 
   5771 	mutex_exit(&rd->rd_adapter->opsrsm_runq_lock);
   5772 
   5773 	ASSERT(rd->rd_sstate == 0);
   5774 
   5775 	/*
   5776 	 * Removes the reference added in opsrsmdest_refcnt_0().
   5777 	 */
   5778 	UNREFDEST(rd);
   5779 
   5780 	D1("opsrsmfreedesttmo: done");
   5781 }
   5782 
   5783 
   5784 /*
   5785  * Start the deletion process for a destination.
   5786  */
   5787 static int
   5788 opsrsmfreedest(adapter_t *adapter, rsm_addr_t rsm_addr)
   5789 {
   5790 	opsrsm_dest_t *rd;
   5791 	timeout_id_t tmoid;
   5792 	int refcnt = 0;
   5793 	opsrsm_t *opsrsmp = opsrsmdev;
   5794 
   5795 	D2("opsrsmfreedest: remote rsmaddr %ld", rsm_addr);
   5796 	mutex_enter(&adapter->opsrsm_dest_lock);
   5797 	rd = adapter->opsrsm_desttbl[rsm_addr];
   5798 	if (rd == NULL || rd->rd_dstate != 0) {
   5799 #ifdef DEBUG
   5800 		if (rd != NULL) {
   5801 			cmn_err(CE_CONT, "opsrsmfreedest: dstate = %d, "
   5802 			    "exiting\n", rd->rd_dstate);
   5803 		}
   5804 #endif /* DEBUG */
   5805 		mutex_exit(&adapter->opsrsm_dest_lock);
   5806 		return (refcnt);
   5807 	}
   5808 	if (rd->rd_freed) {
   5809 #ifdef DEBUG
   5810 		cmn_err(CE_CONT, "opsrsmfreedest: already freed\n");
   5811 #endif /* DEBUG */
   5812 		mutex_exit(&adapter->opsrsm_dest_lock);
   5813 		return (refcnt);
   5814 	}
   5815 	rd->rd_freed = B_TRUE;
   5816 	(void) opsrsm_finfo_add(rd);
   5817 	mutex_exit(&adapter->opsrsm_dest_lock);
   5818 	mutex_enter(&rd->rd_xmit_lock);
   5819 
   5820 	mutex_enter(&rd->rd_freeq_lock);
   5821 	rd->rd_freeq_freeze = B_TRUE;
   5822 	mutex_exit(&rd->rd_freeq_lock);
   5823 
   5824 	rd->rd_xmit_state = OPSRSM_XMIT_DISCONNECTED;
   5825 	cv_broadcast(&rd->rd_conn_cv);
   5826 	mutex_exit(&rd->rd_xmit_lock);
   5827 
   5828 	opsrsm_reset_all_rps(rd);
   5829 
   5830 	mutex_enter(&adapter->opsrsm_dest_lock);
   5831 	D1("opsrsmfreedest: opsrsmp 0x%p (cltr %d) rsmaddr %ld",
   5832 	    (void *)opsrsmp, rd->rd_adapter->instance, rsm_addr);
   5833 	rd->rd_dstate = 1;
   5834 	refcnt = rd->rd_refcnt;
   5835 	mutex_exit(&adapter->opsrsm_dest_lock);
   5836 
   5837 	mutex_enter(&rd->rd_evt_lock);
   5838 	rd->rd_evt_flags |= OPSRSM_EVT_STOP;
   5839 	cv_signal(&rd->rd_evt_cv);
   5840 	cv_wait(&rd->rd_evt_wait_cv, &rd->rd_evt_lock);
   5841 	ASSERT((rd->rd_evt_flags & OPSRSM_EVT_DONE) != 0);
   5842 	mutex_exit(&rd->rd_evt_lock);
   5843 
   5844 	opsrsm_queued_msg_flush(rd);
   5845 	taskq_destroy(rd->rd_evt_taskq);
   5846 	rd->rd_evt_taskq = NULL;
   5847 
   5848 	/*
   5849 	 * Turn off any timeouts.  The sync timeout reschedules itself, so we
   5850 	 * have to go to great lengths to kill it.
   5851 	 */
   5852 	mutex_enter(&rd->rd_xmit_lock);
   5853 	tmoid = rd->rd_tmo_id;
   5854 	rd->rd_tmo_id = 0;
   5855 	rd->rd_stopq = B_TRUE;
   5856 	mutex_exit(&rd->rd_xmit_lock);
   5857 
   5858 	if (tmoid)
   5859 		(void) untimeout(tmoid);
   5860 
   5861 	tmoid = rd->rd_sync_dqe_tmo_id;
   5862 	while (tmoid) {
   5863 		(void) untimeout(tmoid);
   5864 		/*
   5865 		 * untimeout guarantees the either the function was
   5866 		 * cancelled, or it has completed.  If timeout was
   5867 		 * cancelled before the function ran, the timout id will
   5868 		 * not have changed.
   5869 		 */
   5870 		if (tmoid == rd->rd_sync_dqe_tmo_id)
   5871 			rd->rd_sync_dqe_tmo_id = 0;
   5872 		tmoid = rd->rd_sync_dqe_tmo_id;
   5873 	}
   5874 
   5875 	tmoid = rd->rd_sync_fqe_tmo_id;
   5876 	while (tmoid) {
   5877 		(void) untimeout(tmoid);
   5878 		if (tmoid == rd->rd_sync_fqe_tmo_id)
   5879 			rd->rd_sync_fqe_tmo_id = 0;
   5880 		tmoid = rd->rd_sync_fqe_tmo_id;
   5881 	}
   5882 
   5883 	opsrsm_cancel_xmit_tmo(rd);
   5884 	opsrsm_cancel_fqe_tmo(rd);
   5885 	opsrsm_cancel_sync_flow_tmo(rd);
   5886 	opsrsm_wake_senders(rd, POLLOUT);
   5887 	if (rd->rd_pollhd.ph_list != NULL)
   5888 		pollhead_clean(&rd->rd_pollhd);
   5889 
   5890 	D1("opsrsmfreedest: done");
   5891 
   5892 	/* remove reference added in opsrsmmkdest() */
   5893 	UNREFDEST(rd);
   5894 	return (refcnt);
   5895 }
   5896 
   5897 /*
   5898  * ****************************************************************
   5899  *                                                               *
   5900  * E N D       CONNECTION DATA STRUCTURE MANAGEMENT              *
   5901  *                                                               *
   5902  * ****************************************************************
   5903  */
   5904 
   5905 
   5906 
   5907 
   5908 /*
   5909  * ****************************************************************
   5910  *                                                               *
   5911  * B E G I N   MAIN STATE MACHINE                                *
   5912  *                                                               *
   5913  * ****************************************************************
   5914  */
   5915 
   5916 
   5917 /*
   5918  * We change a destination's state in a number of routines; we define these
   5919  * macros to make sure it gets done the same way every time.
   5920  */
   5921 #define	OPSRSM_SETSTATE(rd, adapter, routine, newstate)			\
   5922 	rd->rd_state = (ushort_t)newstate;				\
   5923 		if (OPSRSM_SCHED_STATE(newstate)) {			\
   5924 			rd->rd_next = adapter->opsrsm_runq;		\
   5925 			adapter->opsrsm_runq = rd;			\
   5926 			D1(routine ": added to runq");	        	\
   5927 			if (adapter->opsrsm_taskq) {			\
   5928 				(void) taskq_dispatch(adapter->opsrsm_taskq,\
   5929 				opsrsmwsrv, adapter, KM_NOSLEEP);	\
   5930 				D1(routine ": enabled 0x%p",		\
   5931 				    (void *)adapter->opsrsm_taskq);	\
   5932 			}				        	\
   5933 		}							\
   5934 
   5935 
   5936 /*
   5937  * This routine processes a notification that a destination has become
   5938  * unreachable.  Delete our record of it, so that when it comes back up we
   5939  * will re-establish our association.  We do this by changing its state to
   5940  * S_DELETE; the service routine will then start the deletion
   5941  * process.
   5942  *
   5943  * Since other parts of the driver may have operations in progress that
   5944  * involve this destination, most of the time we cannot just whack the
   5945  * state to the new value.  Instead, we record (in rd_estate) that the
   5946  * connection was lost.  The next time someone else attempts to change the
   5947  * state, the state change routines recognize that there is a pending event
   5948  * and change the state to the one we wanted instead.  (There are
   5949  * exceptions in cases where the new state indicates that we've enabled
   5950  * some sort of timeout; in this case, we may wait until the following
   5951  * state change to take note of the event.)
   5952  */
   5953 static void
   5954 opsrsm_lostconn(opsrsm_dest_t *rd)
   5955 {
   5956 	adapter_t *adapter = rd->rd_adapter;
   5957 
   5958 	D1("opsrsm_lostconn: rd 0x%p (addr %ld ctlr %d)", (void *)rd,
   5959 	    rd->rd_rsm_addr, adapter->instance);
   5960 
   5961 	mutex_enter(&adapter->opsrsm_runq_lock);
   5962 	if ((rd->rd_state == OPSRSM_STATE_W_READY) ||
   5963 	    (rd->rd_state == OPSRSM_STATE_NEW) ||
   5964 	    (rd->rd_state == OPSRSM_STATE_W_ACCEPT) ||
   5965 	    (rd->rd_state == OPSRSM_STATE_W_ACK) ||
   5966 	    (rd->rd_state == OPSRSM_STATE_W_FQE)) {
   5967 		/* LINTED: E_CONSTANT_CONDITION */
   5968 		OPSRSM_SETSTATE(rd, adapter, "opsrsm_lostconn",
   5969 		    OPSRSM_STATE_S_DELETE);
   5970 	} else {
   5971 		rd->rd_estate = OPSRSM_STATE_S_DELETE;
   5972 	}
   5973 	DERR("opsrsm_lostconn: state now %s, estate now %s",
   5974 	    OPSRSM_STATE_STR(rd->rd_state), OPSRSM_STATE_STR(rd->rd_estate));
   5975 
   5976 	mutex_exit(&adapter->opsrsm_runq_lock);
   5977 
   5978 	/*
   5979 	 * Stop trying to flush queue entries to the other side.
   5980 	 */
   5981 	rd->rd_stopq = B_TRUE;
   5982 	D1("opsrsm_lostconn: done");
   5983 }
   5984 
   5985 
   5986 /*
   5987  * Figure out what state transition should actually occur after an event
   5988  * has happened.
   5989  */
   5990 static int
   5991 opsrsmestate_newstate(opsrsm_dest_t *rd, int newstate)
   5992 {
   5993 	int retval = newstate;
   5994 
   5995 	/*
   5996 	 * If we're going to a state where we've just set a timeout, don't
   5997 	 * mess with the state.  When the timeout happens, it will change
   5998 	 * state again, and we'll nab 'em there.  If we're about to delete
   5999 	 * rd, don't bother worrying about the event.
   6000 	 */
   6001 	switch (newstate) {
   6002 	case OPSRSM_STATE_W_SCONNTMO:
   6003 	case OPSRSM_STATE_W_ACCEPT:
   6004 	case OPSRSM_STATE_W_ACK:
   6005 	case OPSRSM_STATE_W_FQE:
   6006 	case OPSRSM_STATE_DELETING:
   6007 	case OPSRSM_STATE_S_DELETE:
   6008 		return (retval);
   6009 	default:
   6010 		break;
   6011 	}
   6012 
   6013 	if (rd->rd_estate) {
   6014 		retval = rd->rd_estate;
   6015 		rd->rd_estate = OPSRSM_STATE_NEW; /* clear event state */
   6016 	}
   6017 
   6018 	D1("opsrsmestate_newstate: %d %d -> %d", rd->rd_estate,
   6019 	    newstate, retval);
   6020 
   6021 	return (retval);
   6022 }
   6023 
   6024 
   6025 /*
   6026  * Return destination's state, then set its state to INPROGRESS.
   6027  */
   6028 static int
   6029 opsrsmgetstate(
   6030 	opsrsm_dest_t *rd)	/* Destination pointer */
   6031 {
   6032 	int state;
   6033 
   6034 	D1("opsrsmgetstate: rd 0x%p", (void *)rd);
   6035 
   6036 	mutex_enter(&rd->rd_adapter->opsrsm_runq_lock);
   6037 
   6038 	state = rd->rd_state;
   6039 	rd->rd_state = OPSRSM_STATE_INPROGRESS;
   6040 
   6041 	mutex_exit(&rd->rd_adapter->opsrsm_runq_lock);
   6042 
   6043 	D1("opsrsmgetstate: returning %s", OPSRSM_STATE_STR(state));
   6044 
   6045 	return (state);
   6046 }
   6047 
   6048 /*
   6049  * Set destination's state; must be preceded by a getstate call.  (i.e.,
   6050  * destination's current state must be INPROGRESS.)
   6051  */
   6052 static void
   6053 opsrsmsetstate(
   6054 	opsrsm_dest_t *rd,	/* Destination pointer */
   6055 	int newstate)	/* State to set */
   6056 {
   6057 	adapter_t *adapter = rd->rd_adapter;
   6058 
   6059 	D1("opsrsmsetstate: rd 0x%p, newstate %s", (void *)rd,
   6060 	    OPSRSM_STATE_STR(newstate));
   6061 
   6062 	mutex_enter(&adapter->opsrsm_runq_lock);
   6063 
   6064 	if (rd->rd_state == OPSRSM_STATE_INPROGRESS) {
   6065 		if (rd->rd_estate)
   6066 			newstate = opsrsmestate_newstate(rd, newstate);
   6067 		OPSRSM_SETSTATE(rd, adapter, "opsrsmsetstate", newstate);
   6068 	} else {
   6069 		D1("opsrsm: setstate without getstate");
   6070 		cmn_err(CE_PANIC, "opsrsm: setstate without getstate");
   6071 	}
   6072 
   6073 	mutex_exit(&adapter->opsrsm_runq_lock);
   6074 
   6075 	D1("opsrsmsetstate: done");
   6076 }
   6077 
   6078 
   6079 /*
   6080  * Set state to newstate iff state is oldstate.  Return 1 if move happened,
   6081  * else 0.
   6082  */
   6083 static int
   6084 opsrsmmovestate(
   6085 	opsrsm_dest_t *rd,	/* Destination pointer */
   6086 	int oldstate,	/* State to check against */
   6087 	int newstate)	/* State to set if check succeeds */
   6088 {
   6089 	adapter_t *adapter = rd->rd_adapter;
   6090 	int retval;
   6091 
   6092 	D1("opsrsmmovestate: rd 0x%p, oldstate %s, newstate %s",
   6093 	    (void *)rd, OPSRSM_STATE_STR(oldstate), OPSRSM_STATE_STR(newstate));
   6094 
   6095 	mutex_enter(&adapter->opsrsm_runq_lock);
   6096 
   6097 	if (rd->rd_state == oldstate) {
   6098 		if (rd->rd_estate)
   6099 			newstate = opsrsmestate_newstate(rd, newstate);
   6100 		OPSRSM_SETSTATE(rd, adapter, "opsrsmmovestate", newstate);
   6101 		retval = 1;
   6102 		D1("opsrsmmovestate: state changed, returning 1");
   6103 	} else {
   6104 		retval = 0;
   6105 		D1("opsrsmmovestate: oldstate really %s, returning 0",
   6106 		    OPSRSM_STATE_STR(rd->rd_state));
   6107 	}
   6108 
   6109 	mutex_exit(&adapter->opsrsm_runq_lock);
   6110 
   6111 	return (retval);
   6112 }
   6113 
   6114 
   6115 
   6116 /*
   6117  * ****************************************************************
   6118  *                                                               *
   6119  * E N D       MAIN STATE MACHINE                                *
   6120  *                                                               *
   6121  * ****************************************************************
   6122  */
   6123 
   6124 
   6125 
   6126 /*
   6127  * ****************************************************************
   6128  *                                                               *
   6129  * B E G I N      HANDLERS FOR INCOMING RSM MESSAGES             *
   6130  *                                                               *
   6131  * ****************************************************************
   6132  */
   6133 
   6134 
   6135 /*
   6136  * Handlers for the various messages that may arrive.  All of these happen
   6137  * during interrupt handling, and will not actually use RSMPI calls.
   6138  * Rather, they will schedule actions to happen.
   6139  */
   6140 
   6141 
   6142 /*
   6143  * Received CONNECT REQUEST message.  Cause this side to set up
   6144  * connection to xfer segment and send back an ACCEPT message.
   6145  *
   6146  * We must have everything set up before sending the ACCEPT.
   6147  * However, we must not transmit any data until we receive the ACK
   6148  * of the ACCEPT.
   6149  */
   6150 static void
   6151 opsrsmmsghdlr_req_connect(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   6152 {
   6153 	adapter_t *adapter = rd->rd_adapter;
   6154 	boolean_t utmo = B_FALSE;
   6155 	timeout_id_t tmoid = NULL;
   6156 
   6157 	D1("opsrsmmsghdlr_req_connect: rd 0x%p (addr %ld ctlr %d)",
   6158 	    (void *)rd, rd->rd_rsm_addr, adapter->instance);
   6159 
   6160 	/*
   6161 	 * xmit lock guarantees that timeout has really been set
   6162 	 * for any wait conditions.
   6163 	 */
   6164 	mutex_enter(&rd->rd_xmit_lock);
   6165 	mutex_enter(&adapter->opsrsm_runq_lock);
   6166 
   6167 	if (rd->rd_segid_valid) {
   6168 		/*
   6169 		 * Another connect message - is it a duplicate?
   6170 		 * If so, just ignore.  Otherwise, there is a
   6171 		 * problem, so force a connection teardown.
   6172 		 */
   6173 
   6174 		mutex_exit(&adapter->opsrsm_runq_lock);
   6175 		mutex_exit(&rd->rd_xmit_lock);
   6176 
   6177 		if ((rd->rd_rxfersegid != msg->p.m.con_request.send_segid) ||
   6178 		    (rd->rd_lastconnmsg_seq != msg->p.hdr.seqno)) {
   6179 			/* Not the same connect request, drop connection */
   6180 			opsrsm_lostconn(rd);
   6181 		}
   6182 
   6183 		return;
   6184 	}
   6185 
   6186 	/* remember the message sequence number of this connection request */
   6187 	rd->rd_lastconnmsg_seq = msg->p.hdr.seqno;
   6188 
   6189 	if (rd->rd_state == OPSRSM_STATE_W_ACCEPT) {
   6190 		/*
   6191 		 * Crossed connection requests.  If we're the higher
   6192 		 * numbered address, cancel the ACCEPT timeout and accept
   6193 		 * the remote request.  If we're the lower numbered
   6194 		 * address, ignore this request because the remote side
   6195 		 * will accept ours.  If the W_ACCEPT timeout expires prior
   6196 		 * to cancelling the timeout, the timeout function will
   6197 		 * notice the state is no longer W_ACCEPT, and will not
   6198 		 * cause the connection to be torn down.  If the timeout
   6199 		 * has already occured (and the rd state is S_DELETE),
   6200 		 * we're out of luck, and will have to wait for a new
   6201 		 * connection request from the remote side.
   6202 		 */
   6203 		if (rd->rd_rsm_addr >
   6204 		    adapter->rsmrdt_attr.attr_controller_addr) {
   6205 			rd->rd_segid_valid = B_TRUE;
   6206 			rd->rd_rxfersegid = msg->p.m.con_request.send_segid;
   6207 			/* LINTED: E_CONSTANT_CONDITION */
   6208 			OPSRSM_SETSTATE(rd, adapter,
   6209 			    "opsrsmmsghdlr_req_connect",
   6210 			    OPSRSM_STATE_S_CONNXFER_ACCEPT);
   6211 			utmo = B_TRUE;
   6212 			tmoid = rd->rd_tmo_id;
   6213 			rd->rd_tmo_id = 0;
   6214 			rd->rd_tmo_int = 0;
   6215 		}
   6216 	} else {
   6217 
   6218 		/*
   6219 		 * Save away the connection information.  If possible,
   6220 		 * change the state to cause the request to be immediately
   6221 		 * acted upon.  If the state is currently INPROGRESS
   6222 		 * in the early stages of connection (during crexfer
   6223 		 * or the start of sconn), then this request will
   6224 		 * eventually be noticed when sconn() is called.  The
   6225 		 * sconn() function will notice that the segid is valid,
   6226 		 * and perform the CONNXER_ACCEPT tasks instead.
   6227 		 *
   6228 		 * If this rd's state was in a later stage of the
   6229 		 * connection dance (or after a connection exists), a
   6230 		 * previous connection request should have been received,
   6231 		 * the new connection request will not be expected, and
   6232 		 * this will have been caught by noticing the segid was
   6233 		 * already valid, and cause a failure, above.
   6234 		 */
   6235 
   6236 		rd->rd_segid_valid = B_TRUE;
   6237 		rd->rd_rxfersegid = msg->p.m.con_request.send_segid;
   6238 
   6239 		if (rd->rd_state == OPSRSM_STATE_NEW) {
   6240 			/*
   6241 			 * No connection was in progress.  Start a new
   6242 			 * connection setup process.
   6243 			 */
   6244 			/* LINTED: E_CONSTANT_CONDITION */
   6245 			OPSRSM_SETSTATE(rd, adapter,
   6246 			    "opsrsmmsghdlr_req_connect",
   6247 			    OPSRSM_STATE_S_NEWCONN);
   6248 
   6249 		} else if (rd->rd_state == OPSRSM_STATE_W_SCONNTMO) {
   6250 			/*
   6251 			 * Accept this request instead of resending our
   6252 			 * connect request.  Cancel the timeout.  If the
   6253 			 * SCONNTMO timeout function is called prior to
   6254 			 * cancelling the timeout, it will notice the state
   6255 			 * is no longer W_SCONNTMO, and will not cause a
   6256 			 * new connection request to be sent.  If the
   6257 			 * timeout already occured (and rd is in the
   6258 			 * S_SCONN state), the sconn() function will notice
   6259 			 * that the segid is valid, and perform the
   6260 			 * CONNXER_ACCEPT tasks instead.
   6261 			 */
   6262 			/* LINTED: E_CONSTANT_CONDITION */
   6263 			OPSRSM_SETSTATE(rd, adapter,
   6264 			    "opsrsmmsghdlr_req_connect",
   6265 			    OPSRSM_STATE_S_CONNXFER_ACCEPT);
   6266 			utmo = B_TRUE;
   6267 			tmoid = rd->rd_tmo_id;
   6268 			rd->rd_tmo_id = 0;
   6269 			rd->rd_tmo_int = 0;
   6270 		}
   6271 	}
   6272 
   6273 	mutex_exit(&adapter->opsrsm_runq_lock);
   6274 	mutex_exit(&rd->rd_xmit_lock);
   6275 
   6276 	if (utmo)
   6277 		(void) untimeout(tmoid);
   6278 }
   6279 
   6280 
   6281 
   6282 /*
   6283  * Received ACCEPT message.  Cause this side to set up a connection
   6284  * to the remote transfer segment and send back an ACK message.
   6285  */
   6286 static void
   6287 opsrsmmsghdlr_con_accept(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   6288 {
   6289 	adapter_t *adapter = rd->rd_adapter;
   6290 	boolean_t utmo = B_FALSE;
   6291 	timeout_id_t tmoid;
   6292 
   6293 	D1("opsrsmmsghdlr_con_accept: rd 0x%p (addr %ld ctlr %d)",
   6294 	    (void *)rd, rd->rd_rsm_addr, adapter->instance);
   6295 
   6296 	/*
   6297 	 * xmit lock protects segid field
   6298 	 */
   6299 	mutex_enter(&rd->rd_xmit_lock);
   6300 	mutex_enter(&adapter->opsrsm_runq_lock);
   6301 
   6302 	if (rd->rd_state == OPSRSM_STATE_W_ACCEPT &&
   6303 	    rd->rd_lxfersegid == msg->p.m.con_accept.rcv_segid) {
   6304 		rd->rd_segid_valid = B_TRUE;
   6305 		rd->rd_rxfersegid = msg->p.m.con_accept.send_segid;
   6306 		utmo = B_TRUE;
   6307 		tmoid = rd->rd_tmo_id;
   6308 		rd->rd_tmo_id = 0;
   6309 		/* LINTED: E_CONSTANT_CONDITION */
   6310 		OPSRSM_SETSTATE(rd, adapter, "opsrsmmsghdlr_con_accept",
   6311 		    OPSRSM_STATE_S_CONNXFER_ACK);
   6312 		mutex_exit(&adapter->opsrsm_runq_lock);
   6313 		mutex_exit(&rd->rd_xmit_lock);
   6314 
   6315 		if (utmo)
   6316 			(void) untimeout(tmoid);
   6317 	} else {
   6318 		mutex_exit(&adapter->opsrsm_runq_lock);
   6319 		mutex_exit(&rd->rd_xmit_lock);
   6320 		opsrsm_lostconn(rd);
   6321 		return;
   6322 	}
   6323 
   6324 }
   6325 
   6326 
   6327 /*
   6328  * Received ACK message.  Now ok to proceed with DLPI data transfer.
   6329  */
   6330 static void
   6331 opsrsmmsghdlr_con_ack(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   6332 {
   6333 	adapter_t *adapter = rd->rd_adapter;
   6334 	boolean_t utmo = B_FALSE;
   6335 	timeout_id_t tmoid;
   6336 
   6337 	D1("opsrsmmsghdlr_con_ack: rd 0x%p (addr %ld ctlr %d)",
   6338 	    (void *)rd, rd->rd_rsm_addr, adapter->instance);
   6339 
   6340 	mutex_enter(&adapter->opsrsm_runq_lock);
   6341 
   6342 	if (rd->rd_state == OPSRSM_STATE_W_ACK &&
   6343 	    msg->p.m.con_ack.rcv_segid == rd->rd_lxfersegid &&
   6344 	    msg->p.m.con_ack.send_segid == rd->rd_rxfersegid) {
   6345 		int isdel = 0;
   6346 
   6347 		utmo = B_TRUE;
   6348 		tmoid = rd->rd_tmo_id;
   6349 		rd->rd_tmo_id = 0;
   6350 		/* LINTED: E_CONSTANT_CONDITION */
   6351 		/*lint -e778 */
   6352 		OPSRSM_SETSTATE(rd, adapter, "opsrsmmsghdlr_con_ack",
   6353 		    OPSRSM_STATE_W_READY);
   6354 		/*lint +e778 */
   6355 		mutex_exit(&adapter->opsrsm_runq_lock);
   6356 		if (utmo) {
   6357 			(void) untimeout(tmoid);
   6358 		}
   6359 		if (rd->rd_status_tmo_id == 0) {
   6360 			rd->rd_status_tmo_id =
   6361 			    timeout(opsrsm_status_check_tmo, rd, 1);
   6362 		}
   6363 		mutex_enter(&rd->rd_evt_lock);
   6364 		rd->rd_evt_flags = OPSRSM_EVT_READY;
   6365 		cv_signal(&rd->rd_evt_cv);
   6366 		mutex_exit(&rd->rd_evt_lock);
   6367 
   6368 		mutex_enter(&rd->rd_xmit_lock);
   6369 		rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_CLOSED;
   6370 		cv_broadcast(&rd->rd_conn_cv);
   6371 		mutex_exit(&rd->rd_xmit_lock);
   6372 
   6373 		mutex_enter(&rd->rd_sendq_lock);
   6374 		if (OPSRSM_Q_LEN(&rd->rd_sendq) > 0) {
   6375 			REFDEST(rd, isdel);
   6376 			if (isdel == 0) {
   6377 				opsrsm_dispatch_tmo((void *)rd);
   6378 			}
   6379 		}
   6380 		mutex_exit(&rd->rd_sendq_lock);
   6381 	} else {
   6382 		mutex_exit(&adapter->opsrsm_runq_lock);
   6383 		opsrsm_lostconn(rd);
   6384 		return;
   6385 	}
   6386 }
   6387 
   6388 static void
   6389 opsrsm_sync_flow_tmo(void *arg)
   6390 {
   6391 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   6392 
   6393 	mutex_enter(&rd->rd_tmo_lock);
   6394 	if (rd->rd_flow_tmo_id == 0) {
   6395 		mutex_exit(&rd->rd_tmo_lock);
   6396 		return;
   6397 	}
   6398 	if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq,
   6399 	    opsrsm_sync_flow_ctl, rd, KM_NOSLEEP) == 0) {
   6400 		rd->rd_flow_tmo_id = timeout(opsrsm_sync_flow_tmo, rd, 0);
   6401 	} else {
   6402 		rd->rd_flow_tmo_id = 0;
   6403 	}
   6404 	mutex_exit(&rd->rd_tmo_lock);
   6405 }
   6406 
   6407 static void
   6408 opsrsm_set_sync_flow_tmo(opsrsm_dest_t *rd)
   6409 {
   6410 	int isdel = 0;
   6411 
   6412 	mutex_enter(&rd->rd_tmo_lock);
   6413 	if (rd->rd_flow_tmo_id != 0) {
   6414 		goto out;
   6415 	}
   6416 	REFDEST(rd, isdel);
   6417 	if (isdel != 0) goto out;
   6418 	rd->rd_flow_tmo_id = timeout(opsrsm_sync_flow_tmo, rd, 0);
   6419 out:;
   6420 	mutex_exit(&rd->rd_tmo_lock);
   6421 }
   6422 
   6423 static void
   6424 opsrsm_cancel_sync_flow_tmo(opsrsm_dest_t *rd)
   6425 {
   6426 	timeout_id_t tmoid;
   6427 
   6428 	mutex_enter(&rd->rd_tmo_lock);
   6429 	if (rd->rd_flow_tmo_id == 0) {
   6430 		mutex_exit(&rd->rd_tmo_lock);
   6431 		return;
   6432 	}
   6433 	UNREFDEST(rd);
   6434 	tmoid = rd->rd_flow_tmo_id;
   6435 	rd->rd_flow_tmo_id = 0;
   6436 	mutex_exit(&rd->rd_tmo_lock);
   6437 	(void) untimeout(tmoid);
   6438 }
   6439 
   6440 static void
   6441 opsrsm_sync_flow_ctl(void *arg)
   6442 {
   6443 	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
   6444 	uchar_t srcaddr[64];
   6445 	rsm_barrier_t fc_barrier;
   6446 	opsrsm_flow_ctl_t *fctl;
   6447 	int err, errcnt = 0;
   6448 
   6449 again:;
   6450 	fctl = (opsrsm_flow_ctl_t *)&srcaddr[0];
   6451 	fctl->fc_stop = rd->rd_remote_flow_stop;
   6452 
   6453 	err = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj,
   6454 	    rd->rd_rxferhand, &fc_barrier);
   6455 	ASSERT(err == RSM_SUCCESS);
   6456 	ASSERT((rd->rd_remote_flow_ctl & OPSRSM_CACHELINE_OFFSET) == 0);
   6457 
   6458 	err = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rd_rxferhand,
   6459 	    rd->rd_remote_flow_ctl, srcaddr, 64);
   6460 	ASSERT(err == RSM_SUCCESS);
   6461 
   6462 	err = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &fc_barrier);
   6463 	if (err != RSM_SUCCESS) {
   6464 		opsrsmdev->opsrsm_ierrors++;
   6465 		if ((uint_t)++errcnt <=
   6466 		    opsrsmdev->opsrsm_param.opsrsm_retry_limit) {
   6467 			delay(2);
   6468 			goto again;
   6469 		} else {
   6470 			cmn_err(CE_CONT, "unable to sync flow control info\n");
   6471 		}
   6472 	}
   6473 	UNREFDEST(rd);
   6474 }
   6475 
   6476 static void
   6477 opsrsm_check_flow_ctl(opsrsm_dest_t *rd)
   6478 {
   6479 	uint_t mem_lo_wat = max(opsrsmdev->opsrsm_param.opsrsm_mem_lo_wat,
   6480 	    opsrsmdev->opsrsm_param.opsrsm_buffer_size *
   6481 	    opsrsmdev->opsrsm_param.opsrsm_buffers);
   6482 
   6483 	/* stop sender if pending_bytes gets too large */
   6484 	if (opsrsm_pending_bytes >
   6485 	    opsrsmdev->opsrsm_param.opsrsm_recv_hi_wat ||
   6486 	    freemem * PAGESIZE < mem_lo_wat) {
   6487 		if (rd->rd_remote_flow_stop == 0) {
   6488 			DERR("opsrsm_pending_bytes = %d "
   6489 			    "opsrsm_recv_hi_wat = %d", opsrsm_pending_bytes,
   6490 			    opsrsmdev->opsrsm_param.opsrsm_recv_hi_wat);
   6491 			rd->rd_remote_flow_stop = 1;
   6492 			if (rd->rd_state == OPSRSM_STATE_W_READY) {
   6493 				opsrsm_set_sync_flow_tmo(rd);
   6494 			}
   6495 		}
   6496 		mutex_enter(&opsrsm_flow_tmo_lock);
   6497 		if (opsrsm_flow_tmo_id == 0) {
   6498 			opsrsm_flow_tmo_retries = 0;
   6499 			opsrsm_flow_tmo_id = timeout(opsrsm_flow_tmo, NULL,
   6500 			    (long)opsrsmdev->opsrsm_param.opsrsm_flow_tmo_int);
   6501 		}
   6502 		mutex_exit(&opsrsm_flow_tmo_lock);
   6503 	}
   6504 }
   6505 
   6506 /*ARGSUSED*/
   6507 static void
   6508 opsrsm_flow_enable(adapter_t *adp, void *arg)
   6509 {
   6510 	int i;
   6511 
   6512 	for (i = 0; i < RSM_MAX_DESTADDR; i++) {
   6513 		opsrsm_dest_t *rd = NULL;
   6514 		int isdel = 0;
   6515 
   6516 		FINDDEST(rd, isdel, i, adp);
   6517 		if (isdel != 0 || rd == NULL) {
   6518 			continue;
   6519 		}
   6520 		if (rd->rd_state != OPSRSM_STATE_W_READY) {
   6521 			UNREFDEST(rd);
   6522 			continue;
   6523 		}
   6524 
   6525 		if (rd->rd_remote_flow_stop == 1) {
   6526 			rd->rd_remote_flow_stop = 0;
   6527 			opsrsm_set_sync_flow_tmo(rd);
   6528 		}
   6529 		UNREFDEST(rd);
   6530 	}
   6531 }
   6532 
   6533 static void
   6534 opsrsm_flow_tmo_cancel(void)
   6535 {
   6536 	timeout_id_t tmoid;
   6537 
   6538 	mutex_enter(&opsrsm_flow_tmo_lock);
   6539 	if (opsrsm_flow_tmo_id == 0) {
   6540 		mutex_exit(&opsrsm_flow_tmo_lock);
   6541 		return;
   6542 	}
   6543 	tmoid = opsrsm_flow_tmo_id;
   6544 	opsrsm_flow_tmo_id = 0;
   6545 	mutex_exit(&opsrsm_flow_tmo_lock);
   6546 	(void) untimeout(tmoid);
   6547 }
   6548 
   6549 /*ARGSUSED*/
   6550 static void
   6551 opsrsm_flow_tmo(void *arg)
   6552 {
   6553 	uint_t mem_hi_wat = max(opsrsmdev->opsrsm_param.opsrsm_mem_hi_wat,
   6554 	    opsrsmdev->opsrsm_param.opsrsm_buffer_size *
   6555 	    opsrsmdev->opsrsm_param.opsrsm_buffers +
   6556 	    opsrsmdev->opsrsm_param.opsrsm_mem_hi_wat -
   6557 	    opsrsmdev->opsrsm_param.opsrsm_mem_lo_wat);
   6558 
   6559 	mutex_enter(&opsrsm_flow_tmo_lock);
   6560 	if (opsrsm_flow_tmo_id == 0) {
   6561 		mutex_exit(&opsrsm_flow_tmo_lock);
   6562 		return;
   6563 	}
   6564 	opsrsm_flow_tmo_id = 0;
   6565 	if (opsrsm_pending_bytes >=
   6566 	    opsrsmdev->opsrsm_param.opsrsm_recv_lo_wat ||
   6567 	    freemem * PAGESIZE <= mem_hi_wat) {
   6568 		/* cannot unblock senders yet, rescheduling timeout */
   6569 		opsrsm_flow_tmo_id = timeout(opsrsm_flow_tmo, NULL,
   6570 		    (long)opsrsmdev->opsrsm_param.opsrsm_flow_tmo_int);
   6571 		opsrsm_flow_tmo_retries++;
   6572 		if ((opsrsm_flow_tmo_retries % 36000) == 0) {
   6573 			cmn_err(CE_CONT, "remained in flow control "
   6574 			    "condition for %d intervals\n",
   6575 			    opsrsm_flow_tmo_retries);
   6576 		}
   6577 		mutex_exit(&opsrsm_flow_tmo_lock);
   6578 		return;
   6579 	}
   6580 	opsrsm_flow_tmo_retries = 0;
   6581 	mutex_exit(&opsrsm_flow_tmo_lock);
   6582 
   6583 	/* unblock senders */
   6584 	apply_on_all_adapters(opsrsm_flow_enable, NULL);
   6585 }
   6586 
   6587 /*
   6588  * Remote side has just sync'ed up the local DQE with its copy, so there
   6589  * may be buffers to deliver.
   6590  */
   6591 static void
   6592 opsrsmmsghdlr_syncdqe(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   6593 {
   6594 	int bufnum, offset, length;
   6595 	ushort_t sap;
   6596 	int freebufs = 0;
   6597 	uint32_t msg_cnt = 0;
   6598 
   6599 	D1("opsrsmmsghdlr_syncdqe: rd 0x%p (addr %ld ctlr %d)",
   6600 	    (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance);
   6601 
   6602 	ASSERT(rd->rd_sstate == OPSRSM_RSMS_ALL);
   6603 
   6604 	opsrsm_check_flow_ctl(rd);
   6605 	/*
   6606 	 * message sanity check
   6607 	 */
   6608 	if (msg->p.m.syncdqe.rcv_segid != rd->rd_lxfersegid ||
   6609 	    msg->p.m.syncdqe.msg_cnt == 0) {
   6610 		cmn_err(CE_CONT, "opsrsmmsghdlr_syncdqe: bad rcv_segid");
   6611 		opsrsm_lostconn(rd);
   6612 		return;
   6613 	}
   6614 
   6615 	/* Loop through all valid DQE's and process their packets. */
   6616 	while (msg_cnt < msg->p.m.syncdqe.msg_cnt &&
   6617 	    opsrsmgetdqe(rd, &bufnum, &offset, &length, &sap)) {
   6618 		/* Don't try to send up DQE with zero length */
   6619 		if (length)
   6620 			freebufs += opsrsmread(rd, bufnum, offset, length, sap);
   6621 		else {
   6622 			cmn_err(CE_PANIC, "received corrupted packet\n");
   6623 			opsrsmputfqe(rd, bufnum);
   6624 			freebufs++;
   6625 		}
   6626 
   6627 		if (freebufs ==
   6628 		    opsrsmdev->opsrsm_param.opsrsm_fqe_sync_size) {
   6629 			freebufs = 0;
   6630 			opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
   6631 		}
   6632 		msg_cnt++;
   6633 	}
   6634 	if (msg_cnt != msg->p.m.syncdqe.msg_cnt) {
   6635 		cmn_err(CE_CONT, "DQ corruption detected, msg_cnt = %d, "
   6636 		    "dqes read = %d\n", msg->p.m.syncdqe.msg_cnt, msg_cnt);
   6637 	}
   6638 	if (freebufs) {
   6639 		opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
   6640 	}
   6641 	D1("opsrsmmsghdlr_syncdqe: success");
   6642 }
   6643 
   6644 static void
   6645 rsmrdtmsghdlr_senderr(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
   6646 {
   6647 	opsrsmresource_t *rp;
   6648 
   6649 	D1("opsrsmmsghdlr_senderr: rd 0x%p (addr %ld ctlr %d)",
   6650 	    (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance);
   6651 
   6652 	rp = opsrsmresource_lookup(msg->p.m.senderr.sender_portnum,
   6653 		OPSRSM_RO_DEFAULT);
   6654 	if (rp == NULL) {
   6655 		return;
   6656 	}
   6657 
   6658 	if (msg->p.m.senderr.sender_pkey == rp->rs_pkey) {
   6659 		rp->rs_state |= msg->p.m.senderr.errstate;
   6660 	}
   6661 }
   6662 
   6663 
/*
 * Fallback handler for messages whose request type has no dedicated
 * handler: report the unknown reqtype through opsrsmerror().  rd is
 * unused.
 */
/* ARGSUSED */
static void
opsrsmmsghdlr_default(opsrsm_dest_t *rd, opsrsm_msg_t *msg)
{
	opsrsmerror(opsrsmdev->opsrsm_dip, "Unknown message type %d",
	    msg->p.hdr.reqtype);
}
   6671 
   6672 
   6673 /*
   6674  * Handler for connection-related RSMPI messages from remote OPSRSM drivers
   6675  */
   6676 /* ARGSUSED */
   6677 rsm_intr_hand_ret_t
   6678 opsrsm_rsm_intr_handler(rsm_controller_object_t *controller,
   6679     rsm_intr_q_op_t operation,
   6680     rsm_addr_t sender,
   6681     void *data,
   6682     size_t size,
   6683     rsm_intr_hand_arg_t handler_arg)
   6684 {
   6685 	adapter_t *adapter = (adapter_t *)handler_arg;
   6686 	opsrsm_t *opsrsmp = opsrsmdev;
   6687 	opsrsm_dest_t *rd;
   6688 	opsrsm_msg_t *msg;
   6689 	int isdel = 0;
   6690 	int isnew = 0;
   6691 
   6692 	if (adapter == NULL || opsrsmp == NULL)
   6693 		return (RSM_INTR_HAND_UNCLAIMED);
   6694 	/*
   6695 	 * We only handle RSM addresses that fit in 48 bits.
   6696 	 */
   6697 	ASSERT(sender <= (rsm_addr_t)0xffffffffffffLL);
   6698 
   6699 	D1("opsrsm_intr_handle: opsrsmp 0x%p (cltr %d) sender-addr %ld",
   6700 	    (void *)opsrsmp,
   6701 	    adapter ? adapter->instance : -1, sender);
   6702 
   6703 	/* Is this our interrupt? */
   6704 	mutex_enter(&adapter->mutex);
   6705 	if (controller->handle != adapter->rsmrdt_ctlr_obj.handle) {
   6706 		mutex_exit(&adapter->mutex);
   6707 		D1("opsrsm_intr_handle: bad controller handle");
   6708 		return (RSM_INTR_HAND_UNCLAIMED);
   6709 	}
   6710 	mutex_exit(&adapter->mutex);
   6711 	/*
   6712 	 * We don't really care about anything but a received packet
   6713 	 * or a queue destroy
   6714 	 */
   6715 	switch (operation) {
   6716 
   6717 	case RSM_INTR_Q_OP_CREATE: {
   6718 		/*
   6719 		 * Create a dest structure, on the assumption that
   6720 		 * somebody's about to communicate with us.
   6721 		 */
   6722 		MAKEDEST(rd, isdel, isnew, sender, adapter);
   6723 		if (isdel || !rd) {
   6724 			return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6725 		}
   6726 		UNREFDEST(rd);
   6727 
   6728 		D1("opsrsm_intr_handle: op-create/config mkdset for addr %ld",
   6729 		    sender);
   6730 
   6731 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6732 	}
   6733 
   6734 	case RSM_INTR_Q_OP_CONFIGURE:
   6735 		/* ignore configure messages */
   6736 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6737 
   6738 	case RSM_INTR_Q_OP_DESTROY: {
   6739 		/*
   6740 		 * The remote side has shut down the connection.  We need
   6741 		 * to shut local side of the connection down as well.
   6742 		 */
   6743 		FINDDEST(rd, isdel, sender, adapter);
   6744 		if (isdel || !rd) {
   6745 			return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6746 		}
   6747 		D1("opsrsm_intr_handle: op-destroy for addr %ld", sender);
   6748 		opsrsm_lostconn(rd);
   6749 		UNREFDEST(rd);
   6750 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6751 	}
   6752 
   6753 	case RSM_INTR_Q_OP_RECEIVE:
   6754 		/*
   6755 		 * A DLPI message from the remote node.  Handle in the main
   6756 		 * body.
   6757 		 */
   6758 		break;
   6759 
   6760 	default:
   6761 		/* ignore */
   6762 		return (RSM_INTR_HAND_UNCLAIMED);
   6763 	}
   6764 
   6765 	/*
   6766 	 * Dest should already exist, having been created by the
   6767 	 * RSM_INTR_Q_OP_CREATE, above.
   6768 	 */
   6769 	FINDDEST(rd, isdel, sender, adapter);
   6770 	if (isdel) {
   6771 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6772 	} else if (rd == NULL) {
   6773 		D1("opsrsm_rsm_intr_handler: can't finddest");
   6774 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6775 	}
   6776 
   6777 
   6778 	msg = (opsrsm_msg_t *)data;
   6779 
   6780 	if (msg->p.hdr.opsrsm_version != OPSRSM_VERSION) {
   6781 		/*
   6782 		 * Non-matching driver version!
   6783 		 * Toss message.
   6784 		 */
   6785 		DINFO("version mismatch: version = %d, expected = %d\n",
   6786 		    msg->p.hdr.opsrsm_version, OPSRSM_VERSION);
   6787 		UNREFDEST(rd);
   6788 		return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6789 	}
   6790 
   6791 	switch (msg->p.hdr.reqtype) {
   6792 
   6793 	case OPSRSM_MSG_REQ_CONNECT:
   6794 		opsrsmmsghdlr_req_connect(rd, msg);
   6795 		break;
   6796 
   6797 	case OPSRSM_MSG_CON_ACCEPT:
   6798 		opsrsmmsghdlr_con_accept(rd, msg);
   6799 		break;
   6800 
   6801 	case OPSRSM_MSG_CON_ACK:
   6802 		opsrsmmsghdlr_con_ack(rd, msg);
   6803 		break;
   6804 
   6805 		/*
   6806 		 * Maybe scan the incoming queue at this time?
   6807 		 */
   6808 	case OPSRSM_MSG_SYNC_DQE:
   6809 		opsrsmmsghdlr_syncdqe(rd, msg);
   6810 		break;
   6811 
   6812 	case RSMRDT_MSG_SEND_ERR:
   6813 		rsmrdtmsghdlr_senderr(rd, msg);
   6814 		break;
   6815 
   6816 	case OPSRSM_MSG_RESET:
   6817 		if (rd->rd_state == OPSRSM_STATE_W_READY) {
   6818 			(void) opsrsmgetstate(rd);
   6819 			opsrsmsetstate(rd, OPSRSM_STATE_S_DELETE);
   6820 		}
   6821 		break;
   6822 	case OPSRSM_MSG_FINFO_DEMUX_DONE:
   6823 	case OPSRSM_MSG_FINFO_REPLY:
   6824 	case OPSRSM_MSG_FINFO_REXMIT_ACK:
   6825 		opsrsmmsghdlr_finfo(rd, msg);
   6826 		break;
   6827 	default:
   6828 		opsrsmmsghdlr_default(rd, msg);
   6829 		break;
   6830 	}
   6831 
   6832 	UNREFDEST(rd);
   6833 
   6834 	return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE);
   6835 /*
   6836  * Supress lint Warning(550) [c:0]: isnew not accessed
   6837  */
   6838 } /*lint !e550 */
   6839 
   6840 /*
   6841  * ****************************************************************
   6842  *                                                               *
   6843  * E N D       HANDLERS FOR INCOMING RSM MESSAGES                *
   6844  *                                                               *
   6845  * ****************************************************************
   6846  */
   6847 
   6848 
   6849 
   6850 /*
   6851  * ****************************************************************
   6852  *                                                               *
   6853  * B E G I N   CONNECTION MANAGEMENT                             *
   6854  *                                                               *
   6855  * ****************************************************************
   6856  */
   6857 
   6858 /*
   6859  * Create and initialize a transfer segment for the remote destination.  If
   6860  * successful, return 0, else 1.  The destination's state must be
   6861  * INPROGRESS.  In remains INPROGRESS during this function.
   6862  */
   6863 
   6864 static int
   6865 opsrsmcrexfer(opsrsm_t *opsrsmp, opsrsm_dest_t *rd)
   6866 {
   6867 	volatile opsrsm_xfer_hdr_t *xfer;
   6868 	opsrsm_fqe_t fqe;
   6869 	volatile opsrsm_fqe_t *fqep;
   6870 	opsrsm_dqe_t dqe;
   6871 	volatile opsrsm_dqe_t *dqep;
   6872 	opsrsmbuf_t *rbp;
   6873 	uint_t bufsize;
   6874 	int i, stat, bufalign = 0;
   6875 	uint32_t buf_offset, fq_offset, dq_offset;
   6876 	size_t xfer_size;
   6877 	caddr_t xfer_start;
   6878 	size_t roundup;
   6879 	size_t transport_pgsize = 0;
   6880 	rsm_access_entry_t perms;
   6881 
   6882 	D1("opsrsmcrexfer: rd 0x%p (addr %ld ctlr %d)",
   6883 	    (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance);
   6884 
   6885 	ASSERT(rd->rd_rawmem_base_addr == NULL);
   6886 	ASSERT(rd->rd_rawmem_base_size == 0);
   6887 
   6888 	bufsize = rd->rd_buffer_size;
   6889 
   6890 	transport_pgsize = PAGESIZE;
   6891 
   6892 	D1("opsrsmcrexfer: remote adapter id = %d", rd->rd_rem_adapterid);
   6893 
   6894 	/*
   6895 	 * Make sure the remote side is responding before setting
   6896 	 * up the local xfer segment.
   6897 	 */
   6898 	stat = RSM_SENDQ_CREATE(rd->rd_adapter->rsmrdt_ctlr_obj,
   6899 	    rd->rd_rsm_addr,
   6900 	    (rsm_intr_t)(RSMRDT_INTR_T_BASE + rd->rd_rem_adapterid),
   6901 	    RSM_DLPI_QPRI, RSM_DLPI_QDEPTH, RSM_DLPI_QFLAGS,
   6902 	    RSM_RESOURCE_DONTWAIT, 0, &(rd->rsm_sendq));
   6903 
   6904 	if (stat != RSM_SUCCESS) {
   6905 		cmn_err(CE_CONT, "sendq create failed, stat 0x%x\n", stat);
   6906 		return (1);
   6907 	}
   6908 
   6909 	rd->rd_sstate |= OPSRSM_RSMS_RXFER_S;
   6910 
   6911 
   6912 	/*
   6913 	 * Allocate memory for segment.  Allow for alignment of DQE list
   6914 	 * and FQE list.  Also allow buffers to be aligned on
   6915 	 * RSM-page-sized boundaries.
   6916 	 */
   6917 
   6918 	/*
   6919 	if (65536 % bufsize == 0)
   6920 		bufalign = 1;
   6921 	*/
   6922 
   6923 	/*
   6924 	 * Even after round up to transport page alignments,
   6925 	 * to make sure that xfer_size can still accomodate
   6926 	 * all the queue elements, 2 * transport_pgsize has
   6927 	 * been used in the following calculation.
   6928 	 */
   6929 
   6930 	xfer_size = (size_t)(sizeof (*xfer) + 64 +
   6931 	    (sizeof (opsrsm_dqe_t) * opsrsmp->opsrsm_param.opsrsm_queue_size)
   6932 	    + 64 +
   6933 	    (sizeof (opsrsm_fqe_t) * opsrsmp->opsrsm_param.opsrsm_queue_size)
   6934 	    + 64 + OPSRSM_FLOW_CTL_SZ +
   6935 	    (bufsize * (uint_t)(opsrsmp->opsrsm_param.opsrsm_buffers +
   6936 	    bufalign)) + (2 * (transport_pgsize - 1)));
   6937 
   6938 	xfer_start = kmem_alloc(xfer_size, KM_NOSLEEP);
   6939 	if (!xfer_start) {
   6940 		D1("opsrsmcrexfer: can't allocate memory, returning 1");
   6941 #ifdef DEBUG
   6942 		cmn_err(CE_CONT, "?opsrsm: crexfer, failed to alloc");
   6943 #endif /* DEBUG */
   6944 		return (1);
   6945 	}
   6946 	rd->rd_rawmem_base_addr = xfer_start;
   6947 	rd->rd_rawmem_base_size = xfer_size;
   6948 
   6949 	/*
   6950 	 * Round up memory pointer and round down size to allow alignment
   6951 	 * within the transport's supported page size.
   6952 	 */
   6953 	roundup = transport_pgsize - ((uint64_t)xfer_start &
   6954 	    (transport_pgsize - 1));
   6955 	if (roundup != transport_pgsize) {
   6956 		xfer_size -= roundup;
   6957 
   6958 		/* Align the xfer_start with transport_pgsize */
   6959 		xfer_start += roundup;
   6960 	}
   6961 
   6962 	/* Align the xfer_size with the transport_pgsize */
   6963 	xfer_size = xfer_size & ~(transport_pgsize - 1);
   6964 
   6965 	rd->rd_memory.ms_type = RSM_MEM_VADDR;
   6966 	rd->rd_memory.ms_memory.vr.length = xfer_size;
   6967 	rd->rd_memory.ms_memory.vr.as = NULL;	/* kas */
   6968 	rd->rd_memory.ms_memory.vr.vaddr = xfer_start;
   6969 
   6970 	D1("opsrsmcrexfer: rawsize 0x%lx rawmem 0x%p xfersize 0x%lx "
   6971 	    "xfermem 0x%p pgsize 0x%lx\n",
   6972 	    rd->rd_rawmem_base_size,
   6973 	    (void *)rd->rd_rawmem_base_addr,
   6974 	    xfer_size,
   6975 	    (void *)xfer_start,
   6976 	    transport_pgsize);
   6977 
   6978 	xfer = (volatile struct opsrsm_xfer_hdr *)xfer_start;
   6979 
   6980 	/* Force FQ to start on a 64-byte boundary. */
   6981 	fq_offset = sizeof (struct opsrsm_xfer_hdr);
   6982 	fq_offset = OPSRSM_CACHELINE_ROUNDUP(fq_offset);
   6983 
   6984 	/* Force DQ to start on a 64-byte boundary. */
   6985 	dq_offset = fq_offset + (sizeof (opsrsm_fqe_t) *
   6986 	    opsrsmp->opsrsm_param.opsrsm_queue_size + OPSRSM_FLOW_CTL_SZ);
   6987 	dq_offset = OPSRSM_CACHELINE_ROUNDUP(dq_offset);
   6988 
   6989 	/* Force buffers to start on a 64-byte boundary. */
   6990 	buf_offset = dq_offset + (sizeof (opsrsm_dqe_t) *
   6991 	    opsrsmp->opsrsm_param.opsrsm_queue_size);
   6992 	buf_offset = OPSRSM_CACHELINE_ROUNDUP(buf_offset);
   6993 
   6994 	if (bufalign == 1 && (buf_offset & (bufsize - 1)) != 0) {
   6995 		buf_offset += bufsize - (buf_offset & (bufsize - 1));
   6996 	}
   6997 	/*
   6998 	 * Note that while we set the _f and _n queue pointers and the
   6999 	 * queue lengths here, the _l pointers will be set (and the lengths
   7000 	 * may be adjusted) when we connect to the remote xfer segment (see
   7001 	 * connxfer).
   7002 	 */
   7003 	mutex_enter(&rd->rd_net_lock);
   7004 
   7005 	rd->rd_fqr_f = rd->rd_fqr_n = (volatile opsrsm_fqe_t *) (xfer_start +
   7006 	    fq_offset);
   7007 	rd->rd_fqr_seq = 1;
   7008 	rd->rd_num_fqrs = opsrsmp->opsrsm_param.opsrsm_queue_size;
   7009 
   7010 	/*
   7011 	 * flow control structure is located at 128 bytes before the
   7012 	 * start of the DQ
   7013 	 */
   7014 	rd->rd_flow_ctl = (volatile opsrsm_flow_ctl_t *)(xfer_start +
   7015 	    dq_offset - OPSRSM_FLOW_CTL_SZ);
   7016 	rd->rd_flow_ctl->fc_stop = 0;
   7017 
   7018 	rd->rd_dqr_f = rd->rd_dqr_n = (volatile opsrsm_dqe_t *) (xfer_start +
   7019 	    dq_offset);
   7020 	rd->rd_dqr_seq = 1;
   7021 	rd->rd_num_dqrs = opsrsmp->opsrsm_param.opsrsm_queue_size;
   7022 
   7023 	rd->rd_lbuf = xfer_start + buf_offset;
   7024 	rd->rd_lbuflen = bufsize;
   7025 	rd->rd_numlbufs = opsrsmp->opsrsm_param.opsrsm_buffers;
   7026 
   7027 	/*
   7028 	 * Initialize the delivery and free queues:  elements in the free
   7029 	 * queue are valid, and elements in the delivery queue are invalid
   7030 	 * (seqno == 0).
   7031 	 */
   7032 	fqep = rd->rd_fqr_f;
   7033 	dqep = rd->rd_dqr_f;
   7034 
   7035 	dqe.s.dq_seqnum = 0;
   7036 	dqe.s.dq_bufnum = (ushort_t)~0;
   7037 
   7038 	fqe.s.fq_seqnum = 1;
   7039 
   7040 	for (i = 0; i < opsrsmp->opsrsm_param.opsrsm_queue_size; i++) {
   7041 		fqe.s.fq_bufnum = (ushort_t)i;
   7042 
   7043 		*fqep++ = fqe;
   7044 		*dqep++ = dqe;
   7045 	}
   7046 
   7047 	mutex_exit(&rd->rd_net_lock);
   7048 
   7049 	/* allocate space for queued fqes */
   7050 	mutex_enter(&rd->rd_fqr_lock);
   7051 	rd->rd_queued_fqe_array = (opsrsm_queued_fqe_t *)kmem_zalloc(opsrsmp->
   7052 	    opsrsm_param.opsrsm_queue_size * sizeof (opsrsm_queued_fqe_t),
   7053 	    KM_NOSLEEP);
   7054 
   7055 	if (rd->rd_queued_fqe_array == NULL) {
   7056 		DINFO("opsrsmcrexfer: cannot alloc queued_fqe_array\n");
   7057 		mutex_exit(&rd->rd_fqr_lock);
   7058 		return (1);
   7059 	}
   7060 	/* construct freelist */
   7061 	for (i = 0; i < opsrsmp->opsrsm_param.opsrsm_queue_size; i++) {
   7062 		opsrsm_queued_fqe_t *qp;
   7063 
   7064 		qp = &rd->rd_queued_fqe_array[i];
   7065 		/* enqueue element onto freelist */
   7066 		opsrsm_queued_fqe_free(rd, qp);
   7067 	}
   7068 	mutex_exit(&rd->rd_fqr_lock);
   7069 
   7070 	/*
   7071 	 * Allocate and init our structures to describe loaned-up buffers.
   7072 	 */
   7073 	rbp = rd->rd_bufbase = (opsrsmbuf_t *)kmem_zalloc(opsrsmp->
   7074 	    opsrsm_param.opsrsm_buffers * sizeof (*rd->rd_bufbase), KM_NOSLEEP);
   7075 
   7076 	if (rbp == NULL) {
   7077 		D1("opsrsmcrexfer: can't alloc rbp structs, returning 1");
   7078 #ifdef DEBUG
   7079 		cmn_err(CE_CONT, "?opsrsm: abuf");
   7080 #endif /* DEBUG */
   7081 		return (1);
   7082 	}
   7083 
   7084 	for (i = 0; i < rd->rd_numlbufs; i++) {
   7085 		rbp->rb_rd = rd;
   7086 		rbp->rb_frtn.free_func = opsrsmfreebuf;
   7087 		rbp->rb_frtn.free_arg = (char *)rbp;
   7088 		rbp->rb_bufnum = i;
   7089 		rbp++;
   7090 	}
   7091 
   7092 	mutex_init(&rd->rd_nlb_lock, NULL, MUTEX_DRIVER, NULL);
   7093 	mutex_enter(&rd->rd_nlb_lock);
   7094 	rd->rd_nlb = 0;
   7095 	mutex_exit(&rd->rd_nlb_lock);
   7096 
   7097 	/*
   7098 	 * Set everything in the header of the segment.
   7099 	 */
   7100 
   7101 	xfer->rx_segsize = xfer_size;
   7102 	xfer->rx_buf_offset = buf_offset;
   7103 	xfer->rx_fq_offset = fq_offset;
   7104 	xfer->rx_dq_offset = dq_offset;
   7105 	xfer->rx_numbufs = rd->rd_numlbufs;
   7106 	xfer->rx_bufsize = rd->rd_lbuflen;
   7107 	xfer->rx_numfqes = rd->rd_num_fqrs;
   7108 	xfer->rx_numdqes = rd->rd_num_dqrs;
   7109 	xfer->rx_skey = rd->rd_local_skey;
   7110 	D1("opsrsmcrexfer: rx_buf_offset 0x%x fq_offset 0x%x dq_offset 0x%x "
   7111 	    "rd_numlbufs 0x%x rd_lbuflen 0x%x rd_num_fqrs 0x%x "
   7112 	    "rd_num_dqrs 0x%x\n",
   7113 	    buf_offset,
   7114 	    fq_offset,
   7115 	    dq_offset,
   7116 	    rd->rd_numlbufs,
   7117 	    rd->rd_lbuflen,
   7118 	    rd->rd_num_fqrs,
   7119 	    rd->rd_num_dqrs);
   7120 
   7121 	xfer->rx_cookie = OPSRSM_XFER_COOKIE;
   7122 
   7123 	/*
   7124 	 * Local xfer segment is now initialized; make it available to the
   7125 	 * remote node.
   7126 	 */
   7127 
   7128 	stat = RSM_SEG_CREATE(rd->rd_adapter->rsmrdt_ctlr_obj,
   7129 	    &(rd->rd_lxferhand),
   7130 	    xfer_size, 0, &(rd->rd_memory), RSM_RESOURCE_DONTWAIT, 0);
   7131 
   7132 	if (stat != RSM_SUCCESS) {
   7133 		D1("opsrsmcrexfer: can't create RSM segment, stat 0x%x, "
   7134 		    "return 1", stat);
   7135 #ifdef DEBUG
   7136 		cmn_err(CE_CONT, "?opsrsm: crexfer, stat 0x%x", stat);
   7137 #endif /* DEBUG */
   7138 		return (1);
   7139 	}
   7140 	rd->rd_sstate |= OPSRSM_RSMS_LXFER_C;
   7141 
   7142 
   7143 	/*
   7144 	 * Publish this segment.  First try using an id that is likely
   7145 	 * to be unique.
   7146 	 */
   7147 	perms.ae_addr = rd->rd_rsm_addr;
   7148 	perms.ae_permission = RSM_PERM_RDWR;
   7149 	stat = RSMERR_SEGID_IN_USE;
   7150 	if (rd->rd_rsm_addr <=
   7151 	    (RSMRDT_SEGID_END - RSMRDT_SEGID_BASE)) {
   7152 		rd->rd_lxfersegid = RSMRDT_SEGID_BASE +
   7153 		    (uint32_t)rd->rd_rsm_addr;
   7154 		stat = (RSM_PUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj,
   7155 			rd->rd_lxferhand,
   7156 		    &perms, 1, rd->rd_lxfersegid, NULL, 0));
   7157 	}
   7158 	if (stat == RSMERR_SEGID_IN_USE) {
   7159 		/* Couldn't use default id; try other ids in allowed range  */
   7160 		rd->rd_lxfersegid = RSMRDT_SEGID_BASE;
   7161 		while ((stat = (RSM_PUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj,
   7162 		    rd->rd_lxferhand,
   7163 		    &perms, 1, rd->rd_lxfersegid, NULL, 0))) ==
   7164 		    RSMERR_SEGID_IN_USE && rd->rd_lxfersegid <
   7165 		    RSMRDT_SEGID_END)
   7166 			rd->rd_lxfersegid++;
   7167 	}
   7168 
   7169 	if (stat != RSM_SUCCESS) {
   7170 		D1("opsrsmcrexfer: can't publish, stat 0x%x, returning 1",
   7171 		    stat);
   7172 #ifdef DEBUG
   7173 		cmn_err(CE_CONT, "?opsrsm: expxfer, stat 0x%x", stat);
   7174 #endif /* DEBUG */
   7175 		return (1);
   7176 	}
   7177 	rd->rd_sstate |= OPSRSM_RSMS_LXFER_P;
   7178 
   7179 	D1("opsrsmcrexfer: returning 0");
   7180 	return (0);
   7181 }
   7182 
   7183 /*
   7184  * Send a connect request to the remote.
   7185  *
   7186  * If we've received a Connect message from the destination, connect to the
   7187  * remote transfer segment.  Otherwise, send them a Connect Request
   7188  * message.  On success, return 0.  If the connect fails return 1.  A
   7189  * failure in sending a Connect Request message will result in a retry
   7190  * timeout being scheduled, but will not return 1 unless the total timeout
   7191  * period has expired.  Destination's state must be INPROGRESS when called.
   7192  * Destination's state is set to a new state prior to returning.
   7193  */
   7194 static int
   7195 opsrsmsconn(
   7196 	opsrsm_t *opsrsmp,	/* OPSRSM device (RSM controller) pointer */
   7197 	opsrsm_dest_t *rd,	/* Destination pointer */
   7198 	int fromtmo)	/* 0 if this is our first attempt; nonzero if this */
   7199 			/*  is a retry, requested by a timeout routine. */
   7200 {
   7201 	int stat, seq;
   7202 
   7203 	D1("opsrsmsconn: rd 0x%p (addr %ld ctlr %d)",
   7204 	    (void *)rd, rd->rd_rs