Home | History | Annotate | Download | only in rpc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
     28  *
     29  * Portions of this source code is developed by the team members of
     30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
     31  * headed by Professor Dhabaleswar K. (DK) Panda.
     32  *
     33  * Acknowledgements to contributions from developors:
     34  *   Ranjit Noronha: noronha (at) cse.ohio-state.edu
     35  *   Lei Chai      : chail (at) cse.ohio-state.edu
     36  *   Weikuan Yu    : yuw (at) cse.ohio-state.edu
     37  *
     38  */
     39 
     40 /*
     41  * The rpcib plugin. Implements the interface for RDMATF's
     42  * interaction with IBTF.
     43  */
     44 
     45 #include <sys/param.h>
     46 #include <sys/types.h>
     47 #include <sys/user.h>
     48 #include <sys/systm.h>
     49 #include <sys/sysmacros.h>
     50 #include <sys/proc.h>
     51 #include <sys/socket.h>
     52 #include <sys/file.h>
     53 #include <sys/stream.h>
     54 #include <sys/strsubr.h>
     55 #include <sys/stropts.h>
     56 #include <sys/errno.h>
     57 #include <sys/kmem.h>
     58 #include <sys/debug.h>
     59 #include <sys/pathname.h>
     60 #include <sys/kstat.h>
     61 #include <sys/t_lock.h>
     62 #include <sys/ddi.h>
     63 #include <sys/cmn_err.h>
     64 #include <sys/time.h>
     65 #include <sys/isa_defs.h>
     66 #include <sys/callb.h>
     67 #include <sys/sunddi.h>
     68 #include <sys/sunndi.h>
     69 #include <sys/sdt.h>
     70 #include <sys/ib/ibtl/ibti.h>
     71 #include <rpc/rpc.h>
     72 #include <rpc/ib.h>
     73 #include <sys/modctl.h>
     74 #include <sys/kstr.h>
     75 #include <sys/sockio.h>
     76 #include <sys/vnode.h>
     77 #include <sys/tiuser.h>
     78 #include <net/if.h>
     79 #include <net/if_types.h>
     80 #include <sys/cred.h>
     81 #include <rpc/rpc_rdma.h>
     82 #include <nfs/nfs.h>
     83 #include <sys/atomic.h>
     84 
     85 #define	NFS_RDMA_PORT	20049
     86 
     87 
     88 /*
     89  * Convenience structures for connection management
     90  */
     91 typedef struct rpcib_ipaddrs {
     92 	void	*ri_list;	/* pointer to list of addresses */
     93 	uint_t	ri_count;	/* number of addresses in list */
     94 	uint_t	ri_size;	/* size of ri_list in bytes */
     95 } rpcib_ipaddrs_t;
     96 
     97 
     98 typedef struct rpcib_ping {
     99 	rib_hca_t  *hca;
    100 	ibt_path_info_t path;
    101 	ibt_ip_addr_t srcip;
    102 	ibt_ip_addr_t dstip;
    103 } rpcib_ping_t;
    104 
    105 /*
    106  * Prototype declarations for driver ops
    107  */
    108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
    109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
    110 				void *, void **);
    111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
    112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
    113 static int	rpcib_do_ip_ioctl(int, int, void *);
    114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
    115 static int rpcib_cache_kstat_update(kstat_t *, int);
    116 static void rib_force_cleanup(void *);
    117 static void rib_stop_hca_services(rib_hca_t *);
    118 static void rib_attach_hca(void);
    119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    120 		struct netbuf *d_svcaddr, CONN **conn);
    121 
    122 struct {
    123 	kstat_named_t cache_limit;
    124 	kstat_named_t cache_allocation;
    125 	kstat_named_t cache_hits;
    126 	kstat_named_t cache_misses;
    127 	kstat_named_t cache_misses_above_the_limit;
    128 } rpcib_kstat = {
    129 	{"cache_limit",			KSTAT_DATA_UINT64 },
    130 	{"cache_allocation",		KSTAT_DATA_UINT64 },
    131 	{"cache_hits",			KSTAT_DATA_UINT64 },
    132 	{"cache_misses",		KSTAT_DATA_UINT64 },
    133 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
    134 };
    135 
    136 /* rpcib cb_ops */
    137 static struct cb_ops rpcib_cbops = {
    138 	nulldev,		/* open */
    139 	nulldev,		/* close */
    140 	nodev,			/* strategy */
    141 	nodev,			/* print */
    142 	nodev,			/* dump */
    143 	nodev,			/* read */
    144 	nodev,			/* write */
    145 	nodev,			/* ioctl */
    146 	nodev,			/* devmap */
    147 	nodev,			/* mmap */
    148 	nodev,			/* segmap */
    149 	nochpoll,		/* poll */
    150 	ddi_prop_op,		/* prop_op */
    151 	NULL,			/* stream */
    152 	D_MP,			/* cb_flag */
    153 	CB_REV,			/* rev */
    154 	nodev,			/* int (*cb_aread)() */
    155 	nodev			/* int (*cb_awrite)() */
    156 };
    157 
    158 /*
    159  * Device options
    160  */
    161 static struct dev_ops rpcib_ops = {
    162 	DEVO_REV,		/* devo_rev, */
    163 	0,			/* refcnt  */
    164 	rpcib_getinfo,		/* info */
    165 	nulldev,		/* identify */
    166 	nulldev,		/* probe */
    167 	rpcib_attach,		/* attach */
    168 	rpcib_detach,		/* detach */
    169 	nodev,			/* reset */
    170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
    171 	NULL,			/* bus operations */
    172 	NULL,			/* power */
    173 	ddi_quiesce_not_needed,		/* quiesce */
    174 };
    175 
    176 /*
    177  * Module linkage information.
    178  */
    179 
    180 static struct modldrv rib_modldrv = {
    181 	&mod_driverops,		/* Driver module */
    182 	"RPCIB plugin driver",	/* Driver name and version */
    183 	&rpcib_ops,		/* Driver ops */
    184 };
    185 
    186 static struct modlinkage rib_modlinkage = {
    187 	MODREV_1,
    188 	(void *)&rib_modldrv,
    189 	NULL
    190 };
    191 
    192 typedef struct rib_lrc_entry {
    193 	struct rib_lrc_entry *forw;
    194 	struct rib_lrc_entry *back;
    195 	char *lrc_buf;
    196 
    197 	uint32_t lrc_len;
    198 	void  *avl_node;
    199 	bool_t registered;
    200 
    201 	struct mrc lrc_mhandle;
    202 	bool_t lrc_on_freed_list;
    203 } rib_lrc_entry_t;
    204 
    205 typedef	struct cache_struct	{	/* allocated from hca->server_side_cache; kept in hca->avl_tree */
    206 	rib_lrc_entry_t		r;
    207 	uint32_t		len;		/* NOTE(review): presumably the buffer length this node covers - confirm */
    208 	uint32_t		elements;	/* NOTE(review): presumably the count of entries under this node - confirm */
    209 	kmutex_t		node_lock;
    210 	avl_node_t		avl_link;	/* AVL linkage; its offset is passed to avl_create() */
    211 } cache_avl_struct_t;
    212 
    213 uint64_t	cache_limit = 100 * 1024 * 1024;
    214 static uint64_t	cache_watermark = 80 * 1024 * 1024;
    215 static bool_t	stats_enabled = FALSE;
    216 
    217 static uint64_t max_unsignaled_rws = 5;
    218 int nfs_rdma_port = NFS_RDMA_PORT;
    219 
    220 #define	RIBNETID_TCP	"tcp"
    221 #define	RIBNETID_TCP6	"tcp6"
    222 
    223 /*
    224  * rib_stat: private data pointer used when registering
    225  *	with the IBTF.  It is returned to the consumer
    226  *	in all callbacks.
    227  */
    228 static rpcib_state_t *rib_stat = NULL;
    229 
    230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
    231 #define	MAX_PORTS	2
    232 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
    233 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
    234 
    235 int preposted_rbufs = RDMA_BUFS_GRANT;
    236 int send_threshold = 1;
    237 
    238 /*
    239  * Old cards with Tavor driver have limited memory footprint
    240  * when booted in 32bit. The rib_max_rbufs tunable can be
    241  * tuned for more buffers if needed.
    242  */
    243 
    244 #if !defined(_ELF64) && !defined(__sparc)
    245 int rib_max_rbufs = MAX_BUFS;
    246 #else
    247 int rib_max_rbufs = 10 * MAX_BUFS;
    248 #endif	/* !(_ELF64) && !(__sparc) */
    249 
    250 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
    251 
    252 /*
    253  * State of the plugin.
    254  * ACCEPT = accepting new connections and requests.
    255  * NO_ACCEPT = not accepting new connection and requests.
    256  * This should eventually move to rpcib_state_t structure, since this
    257  * will tell in which state the plugin is for a particular type of service
    258  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
    259  * state for one and in no_accept state for the other.
    260  */
    261 int		plugin_state;
    262 kmutex_t	plugin_state_lock;
    263 
    264 ldi_ident_t rpcib_li;
    265 
    266 /*
    267  * RPCIB RDMATF operations
    268  */
    269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
    270 static rdma_stat rib_disconnect(CONN *conn);
    271 static void rib_listen(struct rdma_svc_data *rd);
    272 static void rib_listen_stop(struct rdma_svc_data *rd);
    273 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
    274 	uint_t buflen, struct mrc *buf_handle);
    275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
    276 	struct mrc buf_handle);
    277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
    278 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
    279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
    280 		struct mrc buf_handle);
    281 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
    282 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
    283 	void *lrc);
    284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
    285 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
    286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    287 	caddr_t buf, int len, int cpu);
    288 
    289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
    290 
    291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
    292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
    293 
    294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
    295 
    296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
    297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
    298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
    299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
    300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
    301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
    302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
    303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
    304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
    305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
    306 	int addr_type, void *, CONN **);
    307 static rdma_stat rib_conn_release(CONN *conn);
    308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
    309 	rpcib_ping_t *, CONN **);
    310 static rdma_stat rib_getinfo(rdma_info_t *info);
    311 
    312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
    313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
    314 static void rib_destroy_cache(rib_hca_t *hca);
    315 static	void	rib_server_side_cache_reclaim(void *argp);
    316 static int avl_compare(const void *t1, const void *t2);
    317 
    318 static void rib_stop_services(rib_hca_t *);
    319 static void rib_close_channels(rib_conn_list_t *);
    320 static void rib_conn_close(void *);
    321 static void rib_recv_rele(rib_qp_t *);
    322 static rdma_stat rib_conn_release_locked(CONN *conn);
    323 
    324 /*
    325  * RPCIB addressing operations
    326  */
    327 
    328 /*
    329  * RDMA operations the RPCIB module exports
    330  */
    331 static rdmaops_t rib_ops = {
    332 	rib_reachable,
    333 	rib_conn_get,
    334 	rib_conn_release,
    335 	rib_listen,
    336 	rib_listen_stop,
    337 	rib_registermem,
    338 	rib_deregistermem,
    339 	rib_registermemsync,
    340 	rib_deregistermemsync,
    341 	rib_syncmem,
    342 	rib_reg_buf_alloc,
    343 	rib_reg_buf_free,
    344 	rib_send,
    345 	rib_send_resp,
    346 	rib_post_resp,
    347 	rib_post_resp_remove,
    348 	rib_post_recv,
    349 	rib_recv,
    350 	rib_read,
    351 	rib_write,
    352 	rib_getinfo,
    353 };
    354 
    355 /*
    356  * RDMATF RPCIB plugin details
    357  */
    358 static rdma_mod_t rib_mod = {
    359 	"ibtf",		/* api name */
    360 	RDMATF_VERS_1,
    361 	0,
    362 	&rib_ops,	/* rdma op vector for ibtf */
    363 };
    364 
    365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
    366 static rdma_stat rib_qp_init(rib_qp_t *, int);
    367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
    368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
    369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
    370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
    371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
    372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
    373 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
    374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
    375 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
    376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
    377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
    378 	rib_qp_t **);
    379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
    380 	rib_qp_t **);
    381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
    382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
    383 static int rib_free_sendwait(struct send_wid *);
    384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
    385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
    386 static void rdma_done_rem_list(rib_qp_t *);
    387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
    388 
    389 static void rib_async_handler(void *,
    390 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
    391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
    392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
    393 static int rib_free_svc_recv(struct svc_recv *);
    394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
    395 static void rib_free_wid(struct recv_wid *);
    396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
    397 static void rib_detach_hca(ibt_hca_hdl_t);
    398 static void rib_close_a_channel(CONN *);
    399 static void rib_send_hold(rib_qp_t *);
    400 static void rib_send_rele(rib_qp_t *);
    401 
    402 /*
    403  * Registration with IBTF as a consumer
    404  */
    405 static struct ibt_clnt_modinfo_s rib_modinfo = {
    406 	IBTI_V_CURR,
    407 	IBT_GENERIC,
    408 	rib_async_handler,	/* async event handler */
    409 	NULL,			/* Memory Region Handler */
    410 	"nfs/ib"
    411 };
    412 
    413 /*
    414  * Global structure
    415  */
    416 
    417 typedef struct rpcib_s {	/* per-driver state for the single rpcib instance */
    418 	dev_info_t	*rpcib_dip;	/* set by rpcib_attach(), cleared by rpcib_detach() */
    419 	kmutex_t	rpcib_mutex;	/* held while rpcib_dip is set or cleared */
    420 } rpcib_t;
    421 
    422 rpcib_t rpcib;
    423 
    424 /*
    425  * /etc/system controlled variable to control
    426  * debugging in rpcib kernel module.
    427  * Set it to values greater than 1 to control
    428  * the amount of debugging messages required.
    429  */
    430 int rib_debug = 0;
    431 
    432 int
    433 _init(void)
    434 {
    435 	int error;
    436 
    437 	error = mod_install((struct modlinkage *)&rib_modlinkage);
    438 	if (error != 0) {
    439 		/*
    440 		 * Could not load module
    441 		 */
    442 		return (error);
    443 	}
    444 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
    445 	return (0);
    446 }
    447 
    448 int
    449 _fini()
    450 {
    451 	int status;
    452 
    453 	/*
    454 	 * Remove module
    455 	 */
    456 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
    457 		return (status);
    458 	}
    459 	mutex_destroy(&plugin_state_lock);
    460 	return (0);
    461 }
    462 
    463 int
    464 _info(struct modinfo *modinfop)
    465 {
    466 	return (mod_info(&rib_modlinkage, modinfop));
    467 }
    468 
    469 /*
    470  * rpcib_getinfo()
    471  * Given the device number, return the devinfo pointer or the
    472  * instance number.
    473  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
    474  */
    475 
    476 /*ARGSUSED*/
    477 static int
    478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
    479 {
    480 	int ret = DDI_SUCCESS;
    481 
    482 	switch (cmd) {
    483 	case DDI_INFO_DEVT2DEVINFO:
    484 		if (rpcib.rpcib_dip != NULL)
    485 			*result = rpcib.rpcib_dip;
    486 		else {
    487 			*result = NULL;
    488 			ret = DDI_FAILURE;
    489 		}
    490 		break;
    491 
    492 	case DDI_INFO_DEVT2INSTANCE:
    493 		*result = NULL;
    494 		break;
    495 
    496 	default:
    497 		ret = DDI_FAILURE;
    498 	}
    499 	return (ret);
    500 }
    501 
    502 static void
    503 rpcib_free_hca_list()
    504 {
    505 	rib_hca_t *hca, *hcap;
    506 
    507 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
    508 	hca = rib_stat->hcas_list;
    509 	rib_stat->hcas_list = NULL;
    510 	rw_exit(&rib_stat->hcas_list_lock);
    511 	while (hca != NULL) {
    512 		rw_enter(&hca->state_lock, RW_WRITER);
    513 		hcap = hca;
    514 		hca = hca->next;
    515 		rib_stat->nhca_inited--;
    516 		rib_mod.rdma_count--;
    517 		hcap->state = HCA_DETACHED;
    518 		rw_exit(&hcap->state_lock);
    519 		rib_stop_hca_services(hcap);
    520 
    521 		kmem_free(hcap, sizeof (*hcap));
    522 	}
    523 }
    524 
    525 static rdma_stat
    526 rpcib_free_service_list()
    527 {
    528 	rib_service_t *service;
    529 	ibt_status_t ret;
    530 
    531 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
    532 	while (rib_stat->service_list != NULL) {
    533 		service = rib_stat->service_list;
    534 		ret = ibt_unbind_all_services(service->srv_hdl);
    535 		if (ret != IBT_SUCCESS) {
    536 			rw_exit(&rib_stat->service_list_lock);
    537 #ifdef DEBUG
    538 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
    539 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
    540 #endif
    541 			return (RDMA_FAILED);
    542 		}
    543 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
    544 		    service->srv_hdl);
    545 		if (ret != IBT_SUCCESS) {
    546 			rw_exit(&rib_stat->service_list_lock);
    547 #ifdef DEBUG
    548 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
    549 			    "ibt_deregister_service failed (%d)\n", (int)ret);
    550 #endif
    551 			return (RDMA_FAILED);
    552 		}
    553 		rib_stat->service_list = service->next;
    554 		kmem_free(service, sizeof (rib_service_t));
    555 	}
    556 	rw_exit(&rib_stat->service_list_lock);
    557 
    558 	return (RDMA_SUCCESS);
    559 }
    560 
    561 static int
    562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
    563 {
    564 	ibt_status_t	ibt_status;
    565 	rdma_stat	r_status;
    566 
    567 	switch (cmd) {
    568 	case DDI_ATTACH:
    569 		break;
    570 	case DDI_RESUME:
    571 		return (DDI_SUCCESS);
    572 	default:
    573 		return (DDI_FAILURE);
    574 	}
    575 
    576 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
    577 
    578 	mutex_enter(&rpcib.rpcib_mutex);
    579 	if (rpcib.rpcib_dip != NULL) {
    580 		mutex_exit(&rpcib.rpcib_mutex);
    581 		return (DDI_FAILURE);
    582 	}
    583 	rpcib.rpcib_dip = dip;
    584 	mutex_exit(&rpcib.rpcib_mutex);
    585 	/*
    586 	 * Create the "rpcib" minor-node.
    587 	 */
    588 	if (ddi_create_minor_node(dip,
    589 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
    590 		/* Error message, no cmn_err as they print on console */
    591 		return (DDI_FAILURE);
    592 	}
    593 
    594 	if (rib_stat == NULL) {
    595 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
    596 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
    597 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
    598 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
    599 	}
    600 
    601 	rib_stat->hca_count = ibt_get_hca_list(NULL);
    602 	if (rib_stat->hca_count < 1) {
    603 		mutex_destroy(&rib_stat->listen_lock);
    604 		rw_destroy(&rib_stat->hcas_list_lock);
    605 		mutex_destroy(&rib_stat->open_hca_lock);
    606 		kmem_free(rib_stat, sizeof (*rib_stat));
    607 		rib_stat = NULL;
    608 		return (DDI_FAILURE);
    609 	}
    610 
    611 	ibt_status = ibt_attach(&rib_modinfo, dip,
    612 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
    613 
    614 	if (ibt_status != IBT_SUCCESS) {
    615 		mutex_destroy(&rib_stat->listen_lock);
    616 		rw_destroy(&rib_stat->hcas_list_lock);
    617 		mutex_destroy(&rib_stat->open_hca_lock);
    618 		kmem_free(rib_stat, sizeof (*rib_stat));
    619 		rib_stat = NULL;
    620 		return (DDI_FAILURE);
    621 	}
    622 
    623 	rib_stat->service_list = NULL;
    624 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
    625 	mutex_enter(&rib_stat->open_hca_lock);
    626 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
    627 		mutex_exit(&rib_stat->open_hca_lock);
    628 		goto open_fail;
    629 	}
    630 	mutex_exit(&rib_stat->open_hca_lock);
    631 
    632 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
    633 	    DDI_PROP_SUCCESS) {
    634 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
    635 		    "failed.");
    636 		goto register_fail;
    637 	}
    638 
    639 	/*
    640 	 * Register with rdmatf
    641 	 */
    642 	r_status = rdma_register_mod(&rib_mod);
    643 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
    644 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
    645 		    "status = %d", r_status);
    646 		goto register_fail;
    647 	}
    648 
    649 	return (DDI_SUCCESS);
    650 
    651 register_fail:
    652 
    653 open_fail:
    654 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
    655 	rpcib_free_hca_list();
    656 	(void) rpcib_free_service_list();
    657 	mutex_destroy(&rib_stat->listen_lock);
    658 	rw_destroy(&rib_stat->hcas_list_lock);
    659 	mutex_destroy(&rib_stat->open_hca_lock);
    660 	rw_destroy(&rib_stat->service_list_lock);
    661 	kmem_free(rib_stat, sizeof (*rib_stat));
    662 	rib_stat = NULL;
    663 	return (DDI_FAILURE);
    664 }
    665 
    666 /*ARGSUSED*/
    667 static int
    668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
    669 {
    670 	switch (cmd) {
    671 
    672 	case DDI_DETACH:
    673 		break;
    674 
    675 	case DDI_SUSPEND:
    676 	default:
    677 		return (DDI_FAILURE);
    678 	}
    679 
    680 	/*
    681 	 * Detach the hca and free resources
    682 	 */
    683 	mutex_enter(&plugin_state_lock);
    684 	plugin_state = NO_ACCEPT;
    685 	mutex_exit(&plugin_state_lock);
    686 
    687 	if (rpcib_free_service_list() != RDMA_SUCCESS)
    688 		return (DDI_FAILURE);
    689 	rpcib_free_hca_list();
    690 
    691 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
    692 	mutex_destroy(&rib_stat->listen_lock);
    693 	rw_destroy(&rib_stat->hcas_list_lock);
    694 	mutex_destroy(&rib_stat->open_hca_lock);
    695 	rw_destroy(&rib_stat->service_list_lock);
    696 
    697 	kmem_free(rib_stat, sizeof (*rib_stat));
    698 	rib_stat = NULL;
    699 
    700 	mutex_enter(&rpcib.rpcib_mutex);
    701 	rpcib.rpcib_dip = NULL;
    702 	mutex_exit(&rpcib.rpcib_mutex);
    703 	mutex_destroy(&rpcib.rpcib_mutex);
    704 	return (DDI_SUCCESS);
    705 }
    706 
    707 
    708 static void rib_rbufpool_free(rib_hca_t *, int);
    709 static void rib_rbufpool_deregister(rib_hca_t *, int);
    710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
    711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
    712 static rdma_stat rib_rem_replylist(rib_qp_t *);
    713 static int rib_remreply(rib_qp_t *, struct reply *);
    714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
    715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
    716 
    717 
    718 /*
    719  * One CQ pair per HCA
    720  */
    721 static rdma_stat
    722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
    723 	rib_cq_t **cqp)
    724 {
    725 	rib_cq_t	*cq;
    726 	ibt_cq_attr_t	cq_attr;
    727 	uint32_t	real_size;
    728 	ibt_status_t	status;
    729 	rdma_stat	error = RDMA_SUCCESS;
    730 
    731 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
    732 	cq->rib_hca = hca;
    733 	cq_attr.cq_size = cq_size;
    734 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
    735 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
    736 	    &real_size);
    737 	if (status != IBT_SUCCESS) {
    738 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
    739 		    " status=%d", status);
    740 		error = RDMA_FAILED;
    741 		goto fail;
    742 	}
    743 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
    744 
    745 	/*
    746 	 * Enable CQ callbacks. CQ Callbacks are single shot
    747 	 * (e.g. you have to call ibt_enable_cq_notify()
    748 	 * after each callback to get another one).
    749 	 */
    750 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
    751 	if (status != IBT_SUCCESS) {
    752 		cmn_err(CE_WARN, "rib_create_cq: "
    753 		    "enable_cq_notify failed, status %d", status);
    754 		error = RDMA_FAILED;
    755 		goto fail;
    756 	}
    757 	*cqp = cq;
    758 
    759 	return (error);
    760 fail:
    761 	if (cq->rib_cq_hdl)
    762 		(void) ibt_free_cq(cq->rib_cq_hdl);
    763 	if (cq)
    764 		kmem_free(cq, sizeof (rib_cq_t));
    765 	return (error);
    766 }
    767 
    768 /*
    769  * rpcib_find_hca
    770  *
    771  * Caller should have already locked the hcas_lock before calling
    772  * this function.
    773  */
    774 static rib_hca_t *
    775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
    776 {
    777 	rib_hca_t *hca = ribstat->hcas_list;
    778 
    779 	while (hca && hca->hca_guid != guid)
    780 		hca = hca->next;
    781 
    782 	return (hca);
    783 }
    784 
    785 static rdma_stat
    786 rpcib_open_hcas(rpcib_state_t *ribstat)
    787 {
    788 	rib_hca_t		*hca;
    789 	ibt_status_t		ibt_status;
    790 	rdma_stat		status;
    791 	ibt_hca_portinfo_t	*pinfop;
    792 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
    793 	uint_t			size, cq_size;
    794 	int			i;
    795 	kstat_t *ksp;
    796 	cache_avl_struct_t example_avl_node;
    797 	char rssc_name[32];
    798 	int old_nhca_inited = ribstat->nhca_inited;
    799 	ib_guid_t		*hca_guids;
    800 
    801 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
    802 
    803 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
    804 	if (ribstat->hca_count == 0)
    805 		return (RDMA_FAILED);
    806 
    807 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
    808 	/*
    809 	 * Open a hca and setup for RDMA
    810 	 */
    811 	for (i = 0; i < ribstat->hca_count; i++) {
    812 		if (rpcib_find_hca(ribstat, hca_guids[i]))
    813 			continue;
    814 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
    815 
    816 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
    817 		    hca_guids[i], &hca->hca_hdl);
    818 		if (ibt_status != IBT_SUCCESS) {
    819 			kmem_free(hca, sizeof (rib_hca_t));
    820 			continue;
    821 		}
    822 		hca->hca_guid = hca_guids[i];
    823 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
    824 		hca->state = HCA_INITED;
    825 
    826 		/*
    827 		 * query HCA info
    828 		 */
    829 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
    830 		if (ibt_status != IBT_SUCCESS) {
    831 			goto fail1;
    832 		}
    833 
    834 		/*
    835 		 * One PD (Protection Domain) per HCA.
    836 		 * A qp is allowed to access a memory region
    837 		 * only when it's in the same PD as that of
    838 		 * the memory region.
    839 		 */
    840 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
    841 		if (ibt_status != IBT_SUCCESS) {
    842 			goto fail1;
    843 		}
    844 
    845 		/*
    846 		 * query HCA ports
    847 		 */
    848 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
    849 		    0, &pinfop, &hca->hca_nports, &size);
    850 		if (ibt_status != IBT_SUCCESS) {
    851 			goto fail2;
    852 		}
    853 		hca->hca_ports = pinfop;
    854 		hca->hca_pinfosz = size;
    855 		pinfop = NULL;
    856 
    857 		cq_size = DEF_CQ_SIZE; /* default cq size */
    858 		/*
    859 		 * Create 2 pairs of cq's (1 pair for client
    860 		 * and the other pair for server) on this hca.
    861 		 * If number of qp's gets too large, then several
    862 		 * cq's will be needed.
    863 		 */
    864 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
    865 		    &hca->svc_rcq);
    866 		if (status != RDMA_SUCCESS) {
    867 			goto fail3;
    868 		}
    869 
    870 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
    871 		    &hca->svc_scq);
    872 		if (status != RDMA_SUCCESS) {
    873 			goto fail3;
    874 		}
    875 
    876 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
    877 		    &hca->clnt_rcq);
    878 		if (status != RDMA_SUCCESS) {
    879 			goto fail3;
    880 		}
    881 
    882 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
    883 		    &hca->clnt_scq);
    884 		if (status != RDMA_SUCCESS) {
    885 			goto fail3;
    886 		}
    887 
    888 		/*
    889 		 * Create buffer pools.
    890 		 * Note rib_rbuf_create also allocates memory windows.
    891 		 */
    892 		hca->recv_pool = rib_rbufpool_create(hca,
    893 		    RECV_BUFFER, rib_max_rbufs);
    894 		if (hca->recv_pool == NULL) {
    895 			goto fail3;
    896 		}
    897 
    898 		hca->send_pool = rib_rbufpool_create(hca,
    899 		    SEND_BUFFER, rib_max_rbufs);
    900 		if (hca->send_pool == NULL) {
    901 			rib_rbufpool_destroy(hca, RECV_BUFFER);
    902 			goto fail3;
    903 		}
    904 
    905 		if (hca->server_side_cache == NULL) {
    906 			(void) sprintf(rssc_name,
    907 			    "rib_srvr_cache_%llx",
    908 			    (long long unsigned int) hca->hca_guid);
    909 			hca->server_side_cache = kmem_cache_create(
    910 			    rssc_name,
    911 			    sizeof (cache_avl_struct_t), 0,
    912 			    NULL,
    913 			    NULL,
    914 			    rib_server_side_cache_reclaim,
    915 			    hca, NULL, 0);
    916 		}
    917 
    918 		avl_create(&hca->avl_tree,
    919 		    avl_compare,
    920 		    sizeof (cache_avl_struct_t),
    921 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
    922 		    (uint_t)(uintptr_t)&example_avl_node);
    923 
    924 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
    925 		    hca->iblock);
    926 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
    927 		rw_init(&hca->avl_rw_lock,
    928 		    NULL, RW_DRIVER, hca->iblock);
    929 		mutex_init(&hca->cache_allocation_lock,
    930 		    NULL, MUTEX_DRIVER, NULL);
    931 		hca->avl_init = TRUE;
    932 
    933 		/* Create kstats for the cache */
    934 		ASSERT(INGLOBALZONE(curproc));
    935 
    936 		if (!stats_enabled) {
    937 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
    938 			    KSTAT_TYPE_NAMED,
    939 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
    940 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
    941 			    GLOBAL_ZONEID);
    942 			if (ksp) {
    943 				ksp->ks_data = (void *) &rpcib_kstat;
    944 				ksp->ks_update = rpcib_cache_kstat_update;
    945 				kstat_install(ksp);
    946 				stats_enabled = TRUE;
    947 			}
    948 		}
    949 		if (hca->cleanup_helper == NULL) {
    950 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
    951 
    952 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
    953 			    (unsigned long long int) hca->hca_guid);
    954 			hca->cleanup_helper = ddi_taskq_create(NULL,
    955 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
    956 		}
    957 
    958 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
    959 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
    960 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
    961 		    hca->iblock);
    962 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
    963 		    hca->iblock);
    964 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
    965 		hca->inuse = TRUE;
    966 
    967 		hca->next = ribstat->hcas_list;
    968 		ribstat->hcas_list = hca;
    969 		ribstat->nhca_inited++;
    970 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
    971 		continue;
    972 
    973 fail3:
    974 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
    975 fail2:
    976 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
    977 fail1:
    978 		(void) ibt_close_hca(hca->hca_hdl);
    979 		kmem_free(hca, sizeof (rib_hca_t));
    980 	}
    981 	rw_exit(&ribstat->hcas_list_lock);
    982 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
    983 	rib_mod.rdma_count = rib_stat->nhca_inited;
    984 
    985 	/*
    986 	 * return success if at least one new hca has been configured.
    987 	 */
    988 	if (ribstat->nhca_inited != old_nhca_inited)
    989 		return (RDMA_SUCCESS);
    990 	else
    991 		return (RDMA_FAILED);
    992 }
    993 
    994 /*
    995  * Callback routines
    996  */
    997 
    998 /*
    999  * SCQ handlers
   1000  */
   1001 /* ARGSUSED */
   1002 static void
   1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   1004 {
   1005 	ibt_status_t	ibt_status;
   1006 	ibt_wc_t	wc;
   1007 	struct send_wid	*wd;
   1008 	CONN		*conn;
   1009 	rib_qp_t	*qp;
   1010 	int		i;
   1011 
   1012 	/*
   1013 	 * Re-enable cq notify here to avoid missing any
   1014 	 * completion queue notification.
   1015 	 */
   1016 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
   1017 
   1018 	ibt_status = IBT_SUCCESS;
   1019 	while (ibt_status != IBT_CQ_EMPTY) {
   1020 		bzero(&wc, sizeof (wc));
   1021 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
   1022 		if (ibt_status != IBT_SUCCESS)
   1023 			return;
   1024 
   1025 		/*
   1026 		 * Got a send completion
   1027 		 */
   1028 		if (wc.wc_id != RDMA_DUMMY_WRID) {
   1029 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
   1030 			qp = wd->qp;
   1031 			conn = qptoc(qp);
   1032 
   1033 			mutex_enter(&wd->sendwait_lock);
   1034 			switch (wc.wc_status) {
   1035 			case IBT_WC_SUCCESS:
   1036 				wd->status = RDMA_SUCCESS;
   1037 				break;
   1038 			default:
   1039 /*
   1040  *    RC Send Q Error Code		Local state     Remote State
   1041  *    ==================== 		===========     ============
   1042  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
   1043  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
   1044  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
   1045  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
   1046  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
   1047  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
   1048  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
   1049  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
   1050  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
   1051  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
   1052  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
   1053  */
   1054 				/*
   1055 				 * Channel in error state. Set connection to
   1056 				 * ERROR and cleanup will happen either from
   1057 				 * conn_release  or from rib_conn_get
   1058 				 */
   1059 				wd->status = RDMA_FAILED;
   1060 				mutex_enter(&conn->c_lock);
   1061 				if (conn->c_state != C_DISCONN_PEND)
   1062 					conn->c_state = C_ERROR_CONN;
   1063 				mutex_exit(&conn->c_lock);
   1064 				break;
   1065 			}
   1066 
   1067 			if (wd->cv_sig == 1) {
   1068 				/*
   1069 				 * Notify poster
   1070 				 */
   1071 				cv_signal(&wd->wait_cv);
   1072 				mutex_exit(&wd->sendwait_lock);
   1073 			} else {
   1074 				/*
   1075 				 * Poster not waiting for notification.
   1076 				 * Free the send buffers and send_wid
   1077 				 */
   1078 				for (i = 0; i < wd->nsbufs; i++) {
   1079 					rib_rbuf_free(qptoc(wd->qp),
   1080 					    SEND_BUFFER,
   1081 					    (void *)(uintptr_t)wd->sbufaddr[i]);
   1082 				}
   1083 
   1084 				/* decrement the send ref count */
   1085 				rib_send_rele(qp);
   1086 
   1087 				mutex_exit(&wd->sendwait_lock);
   1088 				(void) rib_free_sendwait(wd);
   1089 			}
   1090 		}
   1091 	}
   1092 }
   1093 
   1094 /* ARGSUSED */
   1095 static void
   1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   1097 {
   1098 	ibt_status_t	ibt_status;
   1099 	ibt_wc_t	wc;
   1100 	struct send_wid	*wd;
   1101 	rib_qp_t	*qp;
   1102 	CONN		*conn;
   1103 	int		i;
   1104 
   1105 	/*
   1106 	 * Re-enable cq notify here to avoid missing any
   1107 	 * completion queue notification.
   1108 	 */
   1109 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
   1110 
   1111 	ibt_status = IBT_SUCCESS;
   1112 	while (ibt_status != IBT_CQ_EMPTY) {
   1113 		bzero(&wc, sizeof (wc));
   1114 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
   1115 		if (ibt_status != IBT_SUCCESS)
   1116 			return;
   1117 
   1118 		/*
   1119 		 * Got a send completion
   1120 		 */
   1121 		if (wc.wc_id != RDMA_DUMMY_WRID) {
   1122 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
   1123 			qp = wd->qp;
   1124 			conn = qptoc(qp);
   1125 			mutex_enter(&wd->sendwait_lock);
   1126 
   1127 			switch (wc.wc_status) {
   1128 			case IBT_WC_SUCCESS:
   1129 				wd->status = RDMA_SUCCESS;
   1130 				break;
   1131 			default:
   1132 				/*
   1133 				 * Channel in error state. Set connection to
   1134 				 * ERROR and cleanup will happen either from
   1135 				 * conn_release  or conn timeout.
   1136 				 */
   1137 				wd->status = RDMA_FAILED;
   1138 				mutex_enter(&conn->c_lock);
   1139 				if (conn->c_state != C_DISCONN_PEND)
   1140 					conn->c_state = C_ERROR_CONN;
   1141 				mutex_exit(&conn->c_lock);
   1142 				break;
   1143 			}
   1144 
   1145 			if (wd->cv_sig == 1) {
   1146 				/*
   1147 				 * Update completion status and notify poster
   1148 				 */
   1149 				cv_signal(&wd->wait_cv);
   1150 				mutex_exit(&wd->sendwait_lock);
   1151 			} else {
   1152 				/*
   1153 				 * Poster not waiting for notification.
   1154 				 * Free the send buffers and send_wid
   1155 				 */
   1156 				for (i = 0; i < wd->nsbufs; i++) {
   1157 					rib_rbuf_free(qptoc(wd->qp),
   1158 					    SEND_BUFFER,
   1159 					    (void *)(uintptr_t)wd->sbufaddr[i]);
   1160 				}
   1161 
   1162 				/* decrement the send ref count */
   1163 				rib_send_rele(qp);
   1164 
   1165 				mutex_exit(&wd->sendwait_lock);
   1166 				(void) rib_free_sendwait(wd);
   1167 			}
   1168 		}
   1169 	}
   1170 }
   1171 
   1172 /*
   1173  * RCQ handler
   1174  */
   1175 /* ARGSUSED */
   1176 static void
   1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   1178 {
   1179 	rib_qp_t	*qp;
   1180 	ibt_status_t	ibt_status;
   1181 	ibt_wc_t	wc;
   1182 	struct recv_wid	*rwid;
   1183 
   1184 	/*
   1185 	 * Re-enable cq notify here to avoid missing any
   1186 	 * completion queue notification.
   1187 	 */
   1188 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
   1189 
   1190 	ibt_status = IBT_SUCCESS;
   1191 	while (ibt_status != IBT_CQ_EMPTY) {
   1192 		bzero(&wc, sizeof (wc));
   1193 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
   1194 		if (ibt_status != IBT_SUCCESS)
   1195 			return;
   1196 
   1197 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
   1198 		qp = rwid->qp;
   1199 
   1200 		if (wc.wc_status == IBT_WC_SUCCESS) {
   1201 			XDR	inxdrs, *xdrs;
   1202 			uint_t	xid, vers, op, find_xid = 0;
   1203 			struct reply	*r;
   1204 			CONN *conn = qptoc(qp);
   1205 			uint32_t rdma_credit = 0;
   1206 
   1207 			xdrs = &inxdrs;
   1208 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
   1209 			    wc.wc_bytes_xfer, XDR_DECODE);
   1210 			/*
   1211 			 * Treat xid as opaque (xid is the first entity
   1212 			 * in the rpc rdma message).
   1213 			 */
   1214 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
   1215 
   1216 			/* Skip xid and set the xdr position accordingly. */
   1217 			XDR_SETPOS(xdrs, sizeof (uint32_t));
   1218 			(void) xdr_u_int(xdrs, &vers);
   1219 			(void) xdr_u_int(xdrs, &rdma_credit);
   1220 			(void) xdr_u_int(xdrs, &op);
   1221 			XDR_DESTROY(xdrs);
   1222 
   1223 			if (vers != RPCRDMA_VERS) {
   1224 				/*
   1225 				 * Invalid RPC/RDMA version. Cannot
   1226 				 * interoperate.  Set connection to
   1227 				 * ERROR state and bail out.
   1228 				 */
   1229 				mutex_enter(&conn->c_lock);
   1230 				if (conn->c_state != C_DISCONN_PEND)
   1231 					conn->c_state = C_ERROR_CONN;
   1232 				mutex_exit(&conn->c_lock);
   1233 				rib_rbuf_free(conn, RECV_BUFFER,
   1234 				    (void *)(uintptr_t)rwid->addr);
   1235 				rib_free_wid(rwid);
   1236 				rib_recv_rele(qp);
   1237 				continue;
   1238 			}
   1239 
   1240 			mutex_enter(&qp->replylist_lock);
   1241 			for (r = qp->replylist; r != NULL; r = r->next) {
   1242 				if (r->xid == xid) {
   1243 					find_xid = 1;
   1244 					switch (op) {
   1245 					case RDMA_MSG:
   1246 					case RDMA_NOMSG:
   1247 					case RDMA_MSGP:
   1248 						r->status = RDMA_SUCCESS;
   1249 						r->vaddr_cq = rwid->addr;
   1250 						r->bytes_xfer =
   1251 						    wc.wc_bytes_xfer;
   1252 						cv_signal(&r->wait_cv);
   1253 						break;
   1254 					default:
   1255 						rib_rbuf_free(qptoc(qp),
   1256 						    RECV_BUFFER,
   1257 						    (void *)(uintptr_t)
   1258 						    rwid->addr);
   1259 						break;
   1260 					}
   1261 					break;
   1262 				}
   1263 			}
   1264 			mutex_exit(&qp->replylist_lock);
   1265 			if (find_xid == 0) {
   1266 				/* RPC caller not waiting for reply */
   1267 
   1268 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
   1269 				    int, xid);
   1270 
   1271 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
   1272 				    (void *)(uintptr_t)rwid->addr);
   1273 			}
   1274 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
   1275 			CONN *conn = qptoc(qp);
   1276 
   1277 			/*
   1278 			 * Connection being flushed. Just free
   1279 			 * the posted buffer
   1280 			 */
   1281 			rib_rbuf_free(conn, RECV_BUFFER,
   1282 			    (void *)(uintptr_t)rwid->addr);
   1283 		} else {
   1284 			CONN *conn = qptoc(qp);
   1285 /*
   1286  *  RC Recv Q Error Code		Local state     Remote State
   1287  *  ====================		===========     ============
   1288  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
   1289  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
   1290  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
   1291  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
   1292  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
   1293  *  IBT_WC_WR_FLUSHED_ERR               None            None
   1294  */
   1295 			/*
   1296 			 * Channel in error state. Set connection
   1297 			 * in ERROR state.
   1298 			 */
   1299 			mutex_enter(&conn->c_lock);
   1300 			if (conn->c_state != C_DISCONN_PEND)
   1301 				conn->c_state = C_ERROR_CONN;
   1302 			mutex_exit(&conn->c_lock);
   1303 			rib_rbuf_free(conn, RECV_BUFFER,
   1304 			    (void *)(uintptr_t)rwid->addr);
   1305 		}
   1306 		rib_free_wid(rwid);
   1307 		rib_recv_rele(qp);
   1308 	}
   1309 }
   1310 
   1311 /* Server side */
   1312 /* ARGSUSED */
   1313 static void
   1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   1315 {
   1316 	rdma_recv_data_t *rdp;
   1317 	rib_qp_t	*qp;
   1318 	ibt_status_t	ibt_status;
   1319 	ibt_wc_t	wc;
   1320 	struct svc_recv	*s_recvp;
   1321 	CONN		*conn;
   1322 	mblk_t		*mp;
   1323 
   1324 	/*
   1325 	 * Re-enable cq notify here to avoid missing any
   1326 	 * completion queue notification.
   1327 	 */
   1328 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
   1329 
   1330 	ibt_status = IBT_SUCCESS;
   1331 	while (ibt_status != IBT_CQ_EMPTY) {
   1332 		bzero(&wc, sizeof (wc));
   1333 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
   1334 		if (ibt_status != IBT_SUCCESS)
   1335 			return;
   1336 
   1337 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
   1338 		qp = s_recvp->qp;
   1339 		conn = qptoc(qp);
   1340 
   1341 		if (wc.wc_status == IBT_WC_SUCCESS) {
   1342 			XDR	inxdrs, *xdrs;
   1343 			uint_t	xid, vers, op;
   1344 			uint32_t rdma_credit;
   1345 
   1346 			xdrs = &inxdrs;
   1347 			/* s_recvp->vaddr stores data */
   1348 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
   1349 			    wc.wc_bytes_xfer, XDR_DECODE);
   1350 
   1351 			/*
   1352 			 * Treat xid as opaque (xid is the first entity
   1353 			 * in the rpc rdma message).
   1354 			 */
   1355 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
   1356 			/* Skip xid and set the xdr position accordingly. */
   1357 			XDR_SETPOS(xdrs, sizeof (uint32_t));
   1358 			if (!xdr_u_int(xdrs, &vers) ||
   1359 			    !xdr_u_int(xdrs, &rdma_credit) ||
   1360 			    !xdr_u_int(xdrs, &op)) {
   1361 				rib_rbuf_free(conn, RECV_BUFFER,
   1362 				    (void *)(uintptr_t)s_recvp->vaddr);
   1363 				XDR_DESTROY(xdrs);
   1364 				rib_recv_rele(qp);
   1365 				(void) rib_free_svc_recv(s_recvp);
   1366 				continue;
   1367 			}
   1368 			XDR_DESTROY(xdrs);
   1369 
   1370 			if (vers != RPCRDMA_VERS) {
   1371 				/*
   1372 				 * Invalid RPC/RDMA version.
   1373 				 * Drop rpc rdma message.
   1374 				 */
   1375 				rib_rbuf_free(conn, RECV_BUFFER,
   1376 				    (void *)(uintptr_t)s_recvp->vaddr);
   1377 				rib_recv_rele(qp);
   1378 				(void) rib_free_svc_recv(s_recvp);
   1379 				continue;
   1380 			}
   1381 			/*
   1382 			 * Is this for RDMA_DONE?
   1383 			 */
   1384 			if (op == RDMA_DONE) {
   1385 				rib_rbuf_free(conn, RECV_BUFFER,
   1386 				    (void *)(uintptr_t)s_recvp->vaddr);
   1387 				/*
   1388 				 * Wake up the thread waiting on
   1389 				 * a RDMA_DONE for xid
   1390 				 */
   1391 				mutex_enter(&qp->rdlist_lock);
   1392 				rdma_done_notify(qp, xid);
   1393 				mutex_exit(&qp->rdlist_lock);
   1394 				rib_recv_rele(qp);
   1395 				(void) rib_free_svc_recv(s_recvp);
   1396 				continue;
   1397 			}
   1398 
   1399 			mutex_enter(&plugin_state_lock);
   1400 			mutex_enter(&conn->c_lock);
   1401 			if ((plugin_state == ACCEPT) &&
   1402 			    (conn->c_state == C_CONNECTED)) {
   1403 				conn->c_ref++;
   1404 				mutex_exit(&conn->c_lock);
   1405 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
   1406 				    == NULL)
   1407 					(void) strwaitbuf(
   1408 					    sizeof (*rdp), BPRI_LO);
   1409 				/*
   1410 				 * Plugin is in accept state, hence the master
   1411 				 * transport queue for this is still accepting
   1412 				 * requests. Hence we can call svc_queuereq to
   1413 				 * queue this recieved msg.
   1414 				 */
   1415 				rdp = (rdma_recv_data_t *)mp->b_rptr;
   1416 				rdp->conn = conn;
   1417 				rdp->rpcmsg.addr =
   1418 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
   1419 				rdp->rpcmsg.type = RECV_BUFFER;
   1420 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
   1421 				rdp->status = wc.wc_status;
   1422 				mp->b_wptr += sizeof (*rdp);
   1423 				svc_queuereq((queue_t *)rib_stat->q, mp);
   1424 				mutex_exit(&plugin_state_lock);
   1425 			} else {
   1426 				/*
   1427 				 * The master transport for this is going
   1428 				 * away and the queue is not accepting anymore
   1429 				 * requests for krpc, so don't do anything, just
   1430 				 * free the msg.
   1431 				 */
   1432 				mutex_exit(&conn->c_lock);
   1433 				mutex_exit(&plugin_state_lock);
   1434 				rib_rbuf_free(conn, RECV_BUFFER,
   1435 				    (void *)(uintptr_t)s_recvp->vaddr);
   1436 			}
   1437 		} else {
   1438 			rib_rbuf_free(conn, RECV_BUFFER,
   1439 			    (void *)(uintptr_t)s_recvp->vaddr);
   1440 		}
   1441 		rib_recv_rele(qp);
   1442 		(void) rib_free_svc_recv(s_recvp);
   1443 	}
   1444 }
   1445 
   1446 static void
   1447 rib_attach_hca()
   1448 {
   1449 	mutex_enter(&rib_stat->open_hca_lock);
   1450 	(void) rpcib_open_hcas(rib_stat);
   1451 	rib_listen(NULL);
   1452 	mutex_exit(&rib_stat->open_hca_lock);
   1453 }
   1454 
   1455 /*
   1456  * Handles DR event of IBT_HCA_DETACH_EVENT.
   1457  */
   1458 /* ARGSUSED */
   1459 static void
   1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
   1461 	ibt_async_code_t code, ibt_async_event_t *event)
   1462 {
   1463 	switch (code) {
   1464 	case IBT_HCA_ATTACH_EVENT:
   1465 		rib_attach_hca();
   1466 		break;
   1467 	case IBT_HCA_DETACH_EVENT:
   1468 		rib_detach_hca(hca_hdl);
   1469 #ifdef DEBUG
   1470 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
   1471 #endif
   1472 		break;
   1473 	case IBT_EVENT_PORT_UP:
   1474 		/*
   1475 		 * A port is up. We should call rib_listen() since there is
   1476 		 * a chance that rib_listen() may have failed during
   1477 		 * rib_attach_hca() because the port had not been up yet.
   1478 		 */
   1479 		rib_listen(NULL);
   1480 #ifdef DEBUG
   1481 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
   1482 #endif
   1483 		break;
   1484 #ifdef DEBUG
   1485 	case IBT_EVENT_PATH_MIGRATED:
   1486 		cmn_err(CE_NOTE, "rib_async_handler(): "
   1487 		    "IBT_EVENT_PATH_MIGRATED\n");
   1488 		break;
   1489 	case IBT_EVENT_SQD:
   1490 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
   1491 		break;
   1492 	case IBT_EVENT_COM_EST:
   1493 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
   1494 		break;
   1495 	case IBT_ERROR_CATASTROPHIC_CHAN:
   1496 		cmn_err(CE_NOTE, "rib_async_handler(): "
   1497 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
   1498 		break;
   1499 	case IBT_ERROR_INVALID_REQUEST_CHAN:
   1500 		cmn_err(CE_NOTE, "rib_async_handler(): "
   1501 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
   1502 		break;
   1503 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
   1504 		cmn_err(CE_NOTE, "rib_async_handler(): "
   1505 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
   1506 		break;
   1507 	case IBT_ERROR_PATH_MIGRATE_REQ:
   1508 		cmn_err(CE_NOTE, "rib_async_handler(): "
   1509 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
   1510 		break;
   1511 	case IBT_ERROR_CQ:
   1512 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
   1513 		break;
   1514 	case IBT_ERROR_PORT_DOWN:
   1515 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
   1516 		break;
   1517 	case IBT_ASYNC_OPAQUE1:
   1518 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
   1519 		break;
   1520 	case IBT_ASYNC_OPAQUE2:
   1521 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
   1522 		break;
   1523 	case IBT_ASYNC_OPAQUE3:
   1524 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
   1525 		break;
   1526 	case IBT_ASYNC_OPAQUE4:
   1527 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
   1528 		break;
   1529 #endif
   1530 	default:
   1531 		break;
   1532 	}
   1533 }
   1534 
   1535 /*
   1536  * Client's reachable function.
   1537  */
   1538 static rdma_stat
   1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
   1540 {
   1541 	rdma_stat	status;
   1542 	rpcib_ping_t	rpt;
   1543 	struct netbuf	saddr;
   1544 	CONN		*conn;
   1545 
   1546 	bzero(&saddr, sizeof (struct netbuf));
   1547 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
   1548 
   1549 	if (status == RDMA_SUCCESS) {
   1550 		*handle = (void *)rpt.hca;
   1551 		/* release the reference */
   1552 		(void) rib_conn_release(conn);
   1553 		return (RDMA_SUCCESS);
   1554 	} else {
   1555 		*handle = NULL;
   1556 		DTRACE_PROBE(rpcib__i__pingfailed);
   1557 		return (RDMA_FAILED);
   1558 	}
   1559 }
   1560 
   1561 /* Client side qp creation */
   1562 static rdma_stat
   1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
   1564 {
   1565 	rib_qp_t	*kqp = NULL;
   1566 	CONN		*conn;
   1567 	rdma_clnt_cred_ctrl_t *cc_info;
   1568 
   1569 	ASSERT(qp != NULL);
   1570 	*qp = NULL;
   1571 
   1572 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
   1573 	conn = qptoc(kqp);
   1574 	kqp->hca = hca;
   1575 	kqp->rdmaconn.c_rdmamod = &rib_mod;
   1576 	kqp->rdmaconn.c_private = (caddr_t)kqp;
   1577 
   1578 	kqp->mode = RIB_CLIENT;
   1579 	kqp->chan_flags = IBT_BLOCKING;
   1580 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
   1581 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
   1582 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
   1583 	/*
   1584 	 * Initialize
   1585 	 */
   1586 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
   1587 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
   1588 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1589 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
   1590 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1591 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1592 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
   1593 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1594 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
   1595 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1596 	/*
   1597 	 * Initialize the client credit control
   1598 	 * portion of the rdmaconn struct.
   1599 	 */
   1600 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
   1601 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
   1602 	cc_info->clnt_cc_granted_ops = 0;
   1603 	cc_info->clnt_cc_in_flight_ops = 0;
   1604 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
   1605 
   1606 	*qp = kqp;
   1607 	return (RDMA_SUCCESS);
   1608 }
   1609 
   1610 /* Server side qp creation */
   1611 static rdma_stat
   1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
   1613 {
   1614 	rib_qp_t	*kqp = NULL;
   1615 	ibt_chan_sizes_t	chan_sizes;
   1616 	ibt_rc_chan_alloc_args_t	qp_attr;
   1617 	ibt_status_t		ibt_status;
   1618 	rdma_srv_cred_ctrl_t *cc_info;
   1619 
   1620 	*qp = NULL;
   1621 
   1622 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
   1623 	kqp->hca = hca;
   1624 	kqp->port_num = port;
   1625 	kqp->rdmaconn.c_rdmamod = &rib_mod;
   1626 	kqp->rdmaconn.c_private = (caddr_t)kqp;
   1627 
   1628 	/*
   1629 	 * Create the qp handle
   1630 	 */
   1631 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
   1632 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
   1633 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
   1634 	qp_attr.rc_pd = hca->pd_hdl;
   1635 	qp_attr.rc_hca_port_num = port;
   1636 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
   1637 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
   1638 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
   1639 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
   1640 	qp_attr.rc_clone_chan = NULL;
   1641 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
   1642 	qp_attr.rc_flags = IBT_WR_SIGNALED;
   1643 
   1644 	rw_enter(&hca->state_lock, RW_READER);
   1645 	if (hca->state != HCA_DETACHED) {
   1646 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
   1647 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
   1648 		    &chan_sizes);
   1649 	} else {
   1650 		rw_exit(&hca->state_lock);
   1651 		goto fail;
   1652 	}
   1653 	rw_exit(&hca->state_lock);
   1654 
   1655 	if (ibt_status != IBT_SUCCESS) {
   1656 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
   1657 		    int, ibt_status);
   1658 		goto fail;
   1659 	}
   1660 
   1661 	kqp->mode = RIB_SERVER;
   1662 	kqp->chan_flags = IBT_BLOCKING;
   1663 	kqp->q = q;	/* server ONLY */
   1664 
   1665 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
   1666 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
   1667 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
   1668 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1669 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
   1670 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1671 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
   1672 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1673 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
   1674 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
   1675 	/*
   1676 	 * Set the private data area to qp to be used in callbacks
   1677 	 */
   1678 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
   1679 	kqp->rdmaconn.c_state = C_CONNECTED;
   1680 
   1681 	/*
   1682 	 * Initialize the server credit control
   1683 	 * portion of the rdmaconn struct.
   1684 	 */
   1685 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
   1686 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
   1687 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
   1688 	cc_info->srv_cc_cur_buffers_used = 0;
   1689 	cc_info->srv_cc_posted = preposted_rbufs;
   1690 
   1691 	*qp = kqp;
   1692 
   1693 	return (RDMA_SUCCESS);
   1694 fail:
   1695 	if (kqp)
   1696 		kmem_free(kqp, sizeof (rib_qp_t));
   1697 
   1698 	return (RDMA_FAILED);
   1699 }
   1700 
   1701 /* ARGSUSED */
   1702 ibt_cm_status_t
   1703 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
   1704     ibt_cm_return_args_t *ret_args, void *priv_data,
   1705     ibt_priv_data_len_t len)
   1706 {
   1707 	rib_hca_t	*hca;
   1708 
   1709 	hca = (rib_hca_t *)clnt_hdl;
   1710 
   1711 	switch (event->cm_type) {
   1712 
   1713 	/* got a connection close event */
   1714 	case IBT_CM_EVENT_CONN_CLOSED:
   1715 	{
   1716 		CONN	*conn;
   1717 		rib_qp_t *qp;
   1718 
   1719 		/* check reason why connection was closed */
   1720 		switch (event->cm_event.closed) {
   1721 		case IBT_CM_CLOSED_DREP_RCVD:
   1722 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
   1723 		case IBT_CM_CLOSED_DUP:
   1724 		case IBT_CM_CLOSED_ABORT:
   1725 		case IBT_CM_CLOSED_ALREADY:
   1726 			/*
   1727 			 * These cases indicate the local end initiated
   1728 			 * the closing of the channel. Nothing to do here.
   1729 			 */
   1730 			break;
   1731 		default:
   1732 			/*
   1733 			 * Reason for CONN_CLOSED event must be one of
   1734 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
   1735 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
   1736 			 * the remote end is closing the channel. In these
   1737 			 * cases free the channel and transition to error
   1738 			 * state
   1739 			 */
   1740 			qp = ibt_get_chan_private(event->cm_channel);
   1741 			conn = qptoc(qp);
   1742 			mutex_enter(&conn->c_lock);
   1743 			if (conn->c_state == C_DISCONN_PEND) {
   1744 				mutex_exit(&conn->c_lock);
   1745 				break;
   1746 			}
   1747 
   1748 			conn->c_state = C_ERROR_CONN;
   1749 
   1750 			/*
   1751 			 * Free the conn if c_ref is down to 0 already
   1752 			 */
   1753 			if (conn->c_ref == 0) {
   1754 				/*
   1755 				 * Remove from list and free conn
   1756 				 */
   1757 				conn->c_state = C_DISCONN_PEND;
   1758 				mutex_exit(&conn->c_lock);
   1759 				rw_enter(&hca->state_lock, RW_READER);
   1760 				if (hca->state != HCA_DETACHED)
   1761 					(void) rib_disconnect_channel(conn,
   1762 					    &hca->cl_conn_list);
   1763 				rw_exit(&hca->state_lock);
   1764 			} else {
   1765 				/*
   1766 				 * conn will be freed when c_ref goes to 0.
   1767 				 * Indicate to cleaning thread not to close
   1768 				 * the connection, but just free the channel.
   1769 				 */
   1770 				conn->c_flags |= C_CLOSE_NOTNEEDED;
   1771 				mutex_exit(&conn->c_lock);
   1772 			}
   1773 #ifdef DEBUG
   1774 			if (rib_debug)
   1775 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
   1776 				    "(CONN_CLOSED) channel disconnected");
   1777 #endif
   1778 			break;
   1779 		}
   1780 		break;
   1781 	}
   1782 	default:
   1783 		break;
   1784 	}
   1785 	return (IBT_CM_ACCEPT);
   1786 }
   1787 
   1788 /*
   1789  * Connect to the server.
   1790  */
   1791 rdma_stat
   1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
   1793 {
   1794 	ibt_chan_open_args_t	chan_args;	/* channel args */
   1795 	ibt_chan_sizes_t	chan_sizes;
   1796 	ibt_rc_chan_alloc_args_t	qp_attr;
   1797 	ibt_status_t		ibt_status;
   1798 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
   1799 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
   1800 	ibt_ip_cm_info_t	ipcm_info;
   1801 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
   1802 
   1803 
   1804 	(void) bzero(&chan_args, sizeof (chan_args));
   1805 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
   1806 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
   1807 
   1808 	ipcm_info.src_addr.family = rptp->srcip.family;
   1809 	switch (ipcm_info.src_addr.family) {
   1810 	case AF_INET:
   1811 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
   1812 		break;
   1813 	case AF_INET6:
   1814 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
   1815 		break;
   1816 	}
   1817 
   1818 	ipcm_info.dst_addr.family = rptp->srcip.family;
   1819 	switch (ipcm_info.dst_addr.family) {
   1820 	case AF_INET:
   1821 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
   1822 		break;
   1823 	case AF_INET6:
   1824 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
   1825 		break;
   1826 	}
   1827 
   1828 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
   1829 
   1830 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
   1831 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
   1832 
   1833 	if (ibt_status != IBT_SUCCESS) {
   1834 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
   1835 		return (-1);
   1836 	}
   1837 
   1838 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
   1839 	/* Alloc a RC channel */
   1840 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
   1841 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
   1842 	qp_attr.rc_pd = hca->pd_hdl;
   1843 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
   1844 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
   1845 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
   1846 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
   1847 	qp_attr.rc_clone_chan = NULL;
   1848 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
   1849 	qp_attr.rc_flags = IBT_WR_SIGNALED;
   1850 
   1851 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
   1852 	chan_args.oc_path = &rptp->path;
   1853 
   1854 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
   1855 	chan_args.oc_cm_clnt_private = (void *)hca;
   1856 	chan_args.oc_rdma_ra_out = 4;
   1857 	chan_args.oc_rdma_ra_in = 4;
   1858 	chan_args.oc_path_retry_cnt = 2;
   1859 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
   1860 	chan_args.oc_priv_data = cmp_ip_pvt;
   1861 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
   1862 
   1863 refresh:
   1864 	rw_enter(&hca->state_lock, RW_READER);
   1865 	if (hca->state != HCA_DETACHED) {
   1866 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
   1867 		    IBT_ACHAN_NO_FLAGS,
   1868 		    &qp_attr, &qp->qp_hdl,
   1869 		    &chan_sizes);
   1870 	} else {
   1871 		rw_exit(&hca->state_lock);
   1872 		return (RDMA_FAILED);
   1873 	}
   1874 	rw_exit(&hca->state_lock);
   1875 
   1876 	if (ibt_status != IBT_SUCCESS) {
   1877 		DTRACE_PROBE1(rpcib__i_conntosrv,
   1878 		    int, ibt_status);
   1879 		return (RDMA_FAILED);
   1880 	}
   1881 
   1882 	/* Connect to the Server */
   1883 	(void) bzero(&ret_args, sizeof (ret_args));
   1884 	mutex_enter(&qp->cb_lock);
   1885 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
   1886 	    IBT_BLOCKING, &chan_args, &ret_args);
   1887 	if (ibt_status != IBT_SUCCESS) {
   1888 		DTRACE_PROBE2(rpcib__i_openrctosrv,
   1889 		    int, ibt_status, int, ret_args.rc_status);
   1890 
   1891 		(void) ibt_free_channel(qp->qp_hdl);
   1892 		qp->qp_hdl = NULL;
   1893 		mutex_exit(&qp->cb_lock);
   1894 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
   1895 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
   1896 			/*
   1897 			 * Got IBT_CM_CONN_STALE probably because of stale
   1898 			 * data on the passive end of a channel that existed
   1899 			 * prior to reboot. Retry establishing a channel
   1900 			 * REFRESH_ATTEMPTS times, during which time the
   1901 			 * stale conditions on the server might clear up.
   1902 			 */
   1903 			goto refresh;
   1904 		}
   1905 		return (RDMA_FAILED);
   1906 	}
   1907 	mutex_exit(&qp->cb_lock);
   1908 	/*
   1909 	 * Set the private data area to qp to be used in callbacks
   1910 	 */
   1911 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
   1912 	return (RDMA_SUCCESS);
   1913 }
   1914 
   1915 rdma_stat
   1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
   1917 {
   1918 	uint_t			i, addr_count;
   1919 	ibt_status_t		ibt_status;
   1920 	uint8_t			num_paths_p;
   1921 	ibt_ip_path_attr_t	ipattr;
   1922 	ibt_path_ip_src_t	srcip;
   1923 	rpcib_ipaddrs_t		addrs4;
   1924 	rpcib_ipaddrs_t		addrs6;
   1925 	struct sockaddr_in	*sinp;
   1926 	struct sockaddr_in6	*sin6p;
   1927 	rdma_stat		retval = RDMA_FAILED;
   1928 	rib_hca_t *hca;
   1929 
   1930 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
   1931 		return (RDMA_INVAL);
   1932 	ASSERT(raddr->buf != NULL);
   1933 
   1934 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
   1935 
   1936 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
   1937 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
   1938 		retval = RDMA_FAILED;
   1939 		goto done2;
   1940 	}
   1941 
   1942 	if (addr_type == AF_INET) {
   1943 		addr_count = addrs4.ri_count;
   1944 		sinp = (struct sockaddr_in *)raddr->buf;
   1945 		rptp->dstip.family = AF_INET;
   1946 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
   1947 		sinp = addrs4.ri_list;
   1948 	} else {
   1949 		addr_count = addrs6.ri_count;
   1950 		sin6p = (struct sockaddr_in6 *)raddr->buf;
   1951 		rptp->dstip.family = AF_INET6;
   1952 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
   1953 		sin6p = addrs6.ri_list;
   1954 	}
   1955 
   1956 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
   1957 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
   1958 		rw_enter(&hca->state_lock, RW_READER);
   1959 		if (hca->state == HCA_DETACHED) {
   1960 			rw_exit(&hca->state_lock);
   1961 			continue;
   1962 		}
   1963 
   1964 		ipattr.ipa_dst_ip 	= &rptp->dstip;
   1965 		ipattr.ipa_hca_guid	= hca->hca_guid;
   1966 		ipattr.ipa_ndst		= 1;
   1967 		ipattr.ipa_max_paths	= 1;
   1968 		ipattr.ipa_src_ip.family = rptp->dstip.family;
   1969 		for (i = 0; i < addr_count; i++) {
   1970 			num_paths_p = 0;
   1971 			if (addr_type == AF_INET) {
   1972 				ipattr.ipa_src_ip.un.ip4addr =
   1973 				    sinp[i].sin_addr.s_addr;
   1974 			} else {
   1975 				ipattr.ipa_src_ip.un.ip6addr =
   1976 				    sin6p[i].sin6_addr;
   1977 			}
   1978 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
   1979 
   1980 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
   1981 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
   1982 			    &num_paths_p, &srcip);
   1983 			if (ibt_status == IBT_SUCCESS &&
   1984 			    num_paths_p != 0 &&
   1985 			    rptp->path.pi_hca_guid == hca->hca_guid) {
   1986 				rptp->hca = hca;
   1987 				rw_exit(&hca->state_lock);
   1988 				if (addr_type == AF_INET) {
   1989 					rptp->srcip.family = AF_INET;
   1990 					rptp->srcip.un.ip4addr =
   1991 					    srcip.ip_primary.un.ip4addr;
   1992 				} else {
   1993 					rptp->srcip.family = AF_INET6;
   1994 					rptp->srcip.un.ip6addr =
   1995 					    srcip.ip_primary.un.ip6addr;
   1996 
   1997 				}
   1998 				retval = RDMA_SUCCESS;
   1999 				goto done1;
   2000 			}
   2001 		}
   2002 		rw_exit(&hca->state_lock);
   2003 	}
   2004 done1:
   2005 	rw_exit(&rib_stat->hcas_list_lock);
   2006 done2:
   2007 	if (addrs4.ri_size > 0)
   2008 		kmem_free(addrs4.ri_list, addrs4.ri_size);
   2009 	if (addrs6.ri_size > 0)
   2010 		kmem_free(addrs6.ri_list, addrs6.ri_size);
   2011 	return (retval);
   2012 }
   2013 
   2014 /*
   2015  * Close channel, remove from connection list and
   2016  * free up resources allocated for that channel.
   2017  */
   2018 rdma_stat
   2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
   2020 {
   2021 	rib_qp_t	*qp = ctoqp(conn);
   2022 	rib_hca_t	*hca;
   2023 
   2024 	mutex_enter(&conn->c_lock);
   2025 	if (conn->c_timeout != NULL) {
   2026 		mutex_exit(&conn->c_lock);
   2027 		(void) untimeout(conn->c_timeout);
   2028 		mutex_enter(&conn->c_lock);
   2029 	}
   2030 
   2031 	while (conn->c_flags & C_CLOSE_PENDING) {
   2032 		cv_wait(&conn->c_cv, &conn->c_lock);
   2033 	}
   2034 	mutex_exit(&conn->c_lock);
   2035 
   2036 	/*
   2037 	 * c_ref == 0 and connection is in C_DISCONN_PEND
   2038 	 */
   2039 	hca = qp->hca;
   2040 	if (conn_list != NULL)
   2041 		(void) rib_rm_conn(conn, conn_list);
   2042 
   2043 	/*
   2044 	 * There is only one case where we get here with
   2045 	 * qp_hdl = NULL, which is during connection setup on
   2046 	 * the client. In such a case there are no posted
   2047 	 * send/recv buffers.
   2048 	 */
   2049 	if (qp->qp_hdl != NULL) {
   2050 		mutex_enter(&qp->posted_rbufs_lock);
   2051 		while (qp->n_posted_rbufs)
   2052 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
   2053 		mutex_exit(&qp->posted_rbufs_lock);
   2054 
   2055 		mutex_enter(&qp->send_rbufs_lock);
   2056 		while (qp->n_send_rbufs)
   2057 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
   2058 			mutex_exit(&qp->send_rbufs_lock);
   2059 
   2060 		(void) ibt_free_channel(qp->qp_hdl);
   2061 			qp->qp_hdl = NULL;
   2062 	}
   2063 
   2064 	ASSERT(qp->rdlist == NULL);
   2065 
   2066 	if (qp->replylist != NULL) {
   2067 		(void) rib_rem_replylist(qp);
   2068 	}
   2069 
   2070 	cv_destroy(&qp->cb_conn_cv);
   2071 	cv_destroy(&qp->posted_rbufs_cv);
   2072 	cv_destroy(&qp->send_rbufs_cv);
   2073 	mutex_destroy(&qp->cb_lock);
   2074 	mutex_destroy(&qp->replylist_lock);
   2075 	mutex_destroy(&qp->posted_rbufs_lock);
   2076 	mutex_destroy(&qp->send_rbufs_lock);
   2077 	mutex_destroy(&qp->rdlist_lock);
   2078 
   2079 	cv_destroy(&conn->c_cv);
   2080 	mutex_destroy(&conn->c_lock);
   2081 
   2082 	if (conn->c_raddr.buf != NULL) {
   2083 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
   2084 	}
   2085 	if (conn->c_laddr.buf != NULL) {
   2086 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
   2087 	}
   2088 	if (conn->c_netid != NULL) {
   2089 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
   2090 	}
   2091 
   2092 	/*
   2093 	 * Credit control cleanup.
   2094 	 */
   2095 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
   2096 		rdma_clnt_cred_ctrl_t *cc_info;
   2097 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
   2098 		cv_destroy(&cc_info->clnt_cc_cv);
   2099 	}
   2100 
   2101 	kmem_free(qp, sizeof (rib_qp_t));
   2102 
   2103 	/*
   2104 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
   2105 	 * then the hca is no longer being used.
   2106 	 */
   2107 	if (conn_list != NULL) {
   2108 		rw_enter(&hca->state_lock, RW_READER);
   2109 		if (hca->state == HCA_DETACHED) {
   2110 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
   2111 			if (hca->srv_conn_list.conn_hd == NULL) {
   2112 				rw_enter(&hca->cl_conn_list.conn_lock,
   2113 				    RW_READER);
   2114 
   2115 				if (hca->cl_conn_list.conn_hd == NULL) {
   2116 					mutex_enter(&hca->inuse_lock);
   2117 					hca->inuse = FALSE;
   2118 					cv_signal(&hca->cb_cv);
   2119 					mutex_exit(&hca->inuse_lock);
   2120 				}
   2121 				rw_exit(&hca->cl_conn_list.conn_lock);
   2122 			}
   2123 			rw_exit(&hca->srv_conn_list.conn_lock);
   2124 		}
   2125 		rw_exit(&hca->state_lock);
   2126 	}
   2127 
   2128 	return (RDMA_SUCCESS);
   2129 }
   2130 
   2131 /*
   2132  * All sends are done under the protection of
   2133  * the wdesc->sendwait_lock. n_send_rbufs count
   2134  * is protected using the send_rbufs_lock.
   2135  * lock ordering is:
   2136  * sendwait_lock -> send_rbufs_lock
   2137  */
   2138 
   2139 void
   2140 rib_send_hold(rib_qp_t *qp)
   2141 {
   2142 	mutex_enter(&qp->send_rbufs_lock);
   2143 	qp->n_send_rbufs++;
   2144 	mutex_exit(&qp->send_rbufs_lock);
   2145 }
   2146 
   2147 void
   2148 rib_send_rele(rib_qp_t *qp)
   2149 {
   2150 	mutex_enter(&qp->send_rbufs_lock);
   2151 	qp->n_send_rbufs--;
   2152 	if (qp->n_send_rbufs == 0)
   2153 		cv_signal(&qp->send_rbufs_cv);
   2154 	mutex_exit(&qp->send_rbufs_lock);
   2155 }
   2156 
   2157 void
   2158 rib_recv_rele(rib_qp_t *qp)
   2159 {
   2160 	mutex_enter(&qp->posted_rbufs_lock);
   2161 	qp->n_posted_rbufs--;
   2162 	if (qp->n_posted_rbufs == 0)
   2163 		cv_signal(&qp->posted_rbufs_cv);
   2164 	mutex_exit(&qp->posted_rbufs_lock);
   2165 }
   2166 
   2167 /*
   2168  * Wait for send completion notification. Only on receiving a
   2169  * notification be it a successful or error completion, free the
   2170  * send_wid.
   2171  */
   2172 static rdma_stat
   2173 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
   2174 {
   2175 	clock_t timout, cv_wait_ret;
   2176 	rdma_stat error = RDMA_SUCCESS;
   2177 	int	i;
   2178 
   2179 	/*
   2180 	 * Wait for send to complete
   2181 	 */
   2182 	ASSERT(wd != NULL);
   2183 	mutex_enter(&wd->sendwait_lock);
   2184 	if (wd->status == (uint_t)SEND_WAIT) {
   2185 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
   2186 		    ddi_get_lbolt();
   2187 
   2188 		if (qp->mode == RIB_SERVER) {
   2189 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
   2190 			    &wd->sendwait_lock, timout)) > 0 &&
   2191 			    wd->status == (uint_t)SEND_WAIT)
   2192 				;
   2193 			switch (cv_wait_ret) {
   2194 			case -1:	/* timeout */
   2195 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
   2196 
   2197 				wd->cv_sig = 0;		/* no signal needed */
   2198 				error = RDMA_TIMEDOUT;
   2199 				break;
   2200 			default:	/* got send completion */
   2201 				break;
   2202 			}
   2203 		} else {
   2204 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
   2205 			    &wd->sendwait_lock, timout)) > 0 &&
   2206 			    wd->status == (uint_t)SEND_WAIT)
   2207 				;
   2208 			switch (cv_wait_ret) {
   2209 			case -1:	/* timeout */
   2210 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
   2211 
   2212 				wd->cv_sig = 0;		/* no signal needed */
   2213 				error = RDMA_TIMEDOUT;
   2214 				break;
   2215 			case 0:		/* interrupted */
   2216 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
   2217 
   2218 				wd->cv_sig = 0;		/* no signal needed */
   2219 				error = RDMA_INTR;
   2220 				break;
   2221 			default:	/* got send completion */
   2222 				break;
   2223 			}
   2224 		}
   2225 	}
   2226 
   2227 	if (wd->status != (uint_t)SEND_WAIT) {
   2228 		/* got send completion */
   2229 		if (wd->status != RDMA_SUCCESS) {
   2230 			switch (wd->status) {
   2231 			case RDMA_CONNLOST:
   2232 				error = RDMA_CONNLOST;
   2233 				break;
   2234 			default:
   2235 				error = RDMA_FAILED;
   2236 				break;
   2237 			}
   2238 		}
   2239 		for (i = 0; i < wd->nsbufs; i++) {
   2240 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
   2241 			    (void *)(uintptr_t)wd->sbufaddr[i]);
   2242 		}
   2243 
   2244 		rib_send_rele(qp);
   2245 
   2246 		mutex_exit(&wd->sendwait_lock);
   2247 		(void) rib_free_sendwait(wd);
   2248 
   2249 	} else {
   2250 		mutex_exit(&wd->sendwait_lock);
   2251 	}
   2252 	return (error);
   2253 }
   2254 
   2255 static struct send_wid *
   2256 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
   2257 {
   2258 	struct send_wid	*wd;
   2259 
   2260 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
   2261 	wd->xid = xid;
   2262 	wd->cv_sig = cv_sig;
   2263 	wd->qp = qp;
   2264 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
   2265 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
   2266 	wd->status = (uint_t)SEND_WAIT;
   2267 
   2268 	return (wd);
   2269 }
   2270 
   2271 static int
   2272 rib_free_sendwait(struct send_wid *wdesc)
   2273 {
   2274 	cv_destroy(&wdesc->wait_cv);
   2275 	mutex_destroy(&wdesc->sendwait_lock);
   2276 	kmem_free(wdesc, sizeof (*wdesc));
   2277 
   2278 	return (0);
   2279 }
   2280 
   2281 static rdma_stat
   2282 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
   2283 {
   2284 	mutex_enter(&qp->replylist_lock);
   2285 	if (rep != NULL) {
   2286 		(void) rib_remreply(qp, rep);
   2287 		mutex_exit(&qp->replylist_lock);
   2288 		return (RDMA_SUCCESS);
   2289 	}
   2290 	mutex_exit(&qp->replylist_lock);
   2291 	return (RDMA_FAILED);
   2292 }
   2293 
   2294 /*
   2295  * Send buffers are freed here only in case of error in posting
   2296  * on QP. If the post succeeded, the send buffers are freed upon
   2297  * send completion in rib_sendwait() or in the scq_handler.
   2298  */
   2299 rdma_stat
   2300 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
   2301 	int send_sig, int cv_sig, caddr_t *swid)
   2302 {
   2303 	struct send_wid	*wdesc;
   2304 	struct clist	*clp;
   2305 	ibt_status_t	ibt_status = IBT_SUCCESS;
   2306 	rdma_stat	ret = RDMA_SUCCESS;
   2307 	ibt_send_wr_t	tx_wr;
   2308 	int		i, nds;
   2309 	ibt_wr_ds_t	sgl[DSEG_MAX];
   2310 	uint_t		total_msg_size;
   2311 	rib_qp_t	*qp;
   2312 
   2313 	qp = ctoqp(conn);
   2314 
   2315 	ASSERT(cl != NULL);
   2316 
   2317 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
   2318 
   2319 	nds = 0;
   2320 	total_msg_size = 0;
   2321 	clp = cl;
   2322 	while (clp != NULL) {
   2323 		if (nds >= DSEG_MAX) {
   2324 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
   2325 			return (RDMA_FAILED);
   2326 		}
   2327 		sgl[nds].ds_va = clp->w.c_saddr;
   2328 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
   2329 		sgl[nds].ds_len = clp->c_len;
   2330 		total_msg_size += clp->c_len;
   2331 		clp = clp->c_next;
   2332 		nds++;
   2333 	}
   2334 
   2335 	if (send_sig) {
   2336 		/* Set SEND_SIGNAL flag. */
   2337 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
   2338 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
   2339 		*swid = (caddr_t)wdesc;
   2340 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
   2341 		mutex_enter(&wdesc->sendwait_lock);
   2342 		wdesc->nsbufs = nds;
   2343 		for (i = 0; i < nds; i++) {
   2344 			wdesc->sbufaddr[i] = sgl[i].ds_va;
   2345 		}
   2346 	} else {
   2347 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
   2348 		*swid = NULL;
   2349 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
   2350 	}
   2351 
   2352 	tx_wr.wr_opcode = IBT_WRC_SEND;
   2353 	tx_wr.wr_trans = IBT_RC_SRV;
   2354 	tx_wr.wr_nds = nds;
   2355 	tx_wr.wr_sgl = sgl;
   2356 
   2357 	mutex_enter(&conn->c_lock);
   2358 	if (conn->c_state == C_CONNECTED) {
   2359 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
   2360 	}
   2361 	if (conn->c_state != C_CONNECTED ||
   2362 	    ibt_status != IBT_SUCCESS) {
   2363 		if (conn->c_state != C_DISCONN_PEND)
   2364 			conn->c_state = C_ERROR_CONN;
   2365 		mutex_exit(&conn->c_lock);
   2366 		if (send_sig) {
   2367 			for (i = 0; i < nds; i++) {
   2368 				rib_rbuf_free(conn, SEND_BUFFER,
   2369 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
   2370 			}
   2371 			mutex_exit(&wdesc->sendwait_lock);
   2372 			(void) rib_free_sendwait(wdesc);
   2373 		}
   2374 		return (RDMA_CONNLOST);
   2375 	}
   2376 
   2377 	mutex_exit(&conn->c_lock);
   2378 
   2379 	if (send_sig) {
   2380 		rib_send_hold(qp);
   2381 		mutex_exit(&wdesc->sendwait_lock);
   2382 		if (cv_sig) {
   2383 			/*
   2384 			 * cv_wait for send to complete.
   2385 			 * We can fail due to a timeout or signal or
   2386 			 * unsuccessful send.
   2387 			 */
   2388 			ret = rib_sendwait(qp, wdesc);
   2389 
   2390 			return (ret);
   2391 		}
   2392 	}
   2393 
   2394 	return (RDMA_SUCCESS);
   2395 }
   2396 
   2397 
   2398 rdma_stat
   2399 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
   2400 {
   2401 	rdma_stat	ret;
   2402 	caddr_t		wd;
   2403 
   2404 	/* send-wait & cv_signal */
   2405 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
   2406 	return (ret);
   2407 }
   2408 
   2409 /*
   2410  * Deprecated/obsolete interface not used currently
   2411  * but earlier used for READ-READ protocol.
   2412  * Send RPC reply and wait for RDMA_DONE.
   2413  */
   2414 rdma_stat
   2415 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
   2416 {
   2417 	rdma_stat ret = RDMA_SUCCESS;
   2418 	struct rdma_done_list *rd;
   2419 	clock_t cv_wait_ret;
   2420 	caddr_t *wid = NULL;
   2421 	rib_qp_t *qp = ctoqp(conn);
   2422 
   2423 	mutex_enter(&qp->rdlist_lock);
   2424 	rd = rdma_done_add(qp, msgid);
   2425 
   2426 	/* No cv_signal (whether send-wait or no-send-wait) */
   2427 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
   2428 
   2429 	if (ret != RDMA_SUCCESS) {
   2430 		rdma_done_rm(qp, rd);
   2431 	} else {
   2432 		/*
   2433 		 * Wait for RDMA_DONE from remote end
   2434 		 */
   2435 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
   2436 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
   2437 		    TR_CLOCK_TICK);
   2438 
   2439 		rdma_done_rm(qp, rd);
   2440 
   2441 		if (cv_wait_ret < 0) {
   2442 			ret = RDMA_TIMEDOUT;
   2443 		}
   2444 	}
   2445 
   2446 	mutex_exit(&qp->rdlist_lock);
   2447 	return (ret);
   2448 }
   2449 
   2450 static struct recv_wid *
   2451 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
   2452 {
   2453 	struct recv_wid	*rwid;
   2454 
   2455 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
   2456 	rwid->xid = msgid;
   2457 	rwid->addr = sgl->ds_va;
   2458 	rwid->qp = qp;
   2459 
   2460 	return (rwid);
   2461 }
   2462 
   2463 static void
   2464 rib_free_wid(struct recv_wid *rwid)
   2465 {
   2466 	kmem_free(rwid, sizeof (struct recv_wid));
   2467 }
   2468 
   2469 rdma_stat
   2470 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
   2471 {
   2472 	rib_qp_t	*qp = ctoqp(conn);
   2473 	struct clist	*clp = cl;
   2474 	struct reply	*rep;
   2475 	struct recv_wid	*rwid;
   2476 	int		nds;
   2477 	ibt_wr_ds_t	sgl[DSEG_MAX];
   2478 	ibt_recv_wr_t	recv_wr;
   2479 	rdma_stat	ret;
   2480 	ibt_status_t	ibt_status;
   2481 
   2482 	/*
   2483 	 * rdma_clnt_postrecv uses RECV_BUFFER.
   2484 	 */
   2485 
   2486 	nds = 0;
   2487 	while (cl != NULL) {
   2488 		if (nds >= DSEG_MAX) {
   2489 			ret = RDMA_FAILED;
   2490 			goto done;
   2491 		}
   2492 		sgl[nds].ds_va = cl->w.c_saddr;
   2493 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
   2494 		sgl[nds].ds_len = cl->c_len;
   2495 		cl = cl->c_next;
   2496 		nds++;
   2497 	}
   2498 
   2499 	if (nds != 1) {
   2500 		ret = RDMA_FAILED;
   2501 		goto done;
   2502 	}
   2503 
   2504 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
   2505 	recv_wr.wr_nds = nds;
   2506 	recv_wr.wr_sgl = sgl;
   2507 
   2508 	rwid = rib_create_wid(qp, &sgl[0], msgid);
   2509 	if (rwid) {
   2510 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
   2511 	} else {
   2512 		ret = RDMA_NORESOURCE;
   2513 		goto done;
   2514 	}
   2515 	rep = rib_addreplylist(qp, msgid);
   2516 	if (!rep) {
   2517 		rib_free_wid(rwid);
   2518 		ret = RDMA_NORESOURCE;
   2519 		goto done;
   2520 	}
   2521 
   2522 	mutex_enter(&conn->c_lock);
   2523 
   2524 	if (conn->c_state == C_CONNECTED) {
   2525 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
   2526 	}
   2527 
   2528 	if (conn->c_state != C_CONNECTED ||
   2529 	    ibt_status != IBT_SUCCESS) {
   2530 		if (conn->c_state != C_DISCONN_PEND)
   2531 			conn->c_state = C_ERROR_CONN;
   2532 		mutex_exit(&conn->c_lock);
   2533 		rib_free_wid(rwid);
   2534 		(void) rib_rem_rep(qp, rep);
   2535 		ret = RDMA_CONNLOST;
   2536 		goto done;
   2537 	}
   2538 
   2539 	mutex_enter(&qp->posted_rbufs_lock);
   2540 	qp->n_posted_rbufs++;
   2541 	mutex_exit(&qp->posted_rbufs_lock);
   2542 
   2543 	mutex_exit(&conn->c_lock);
   2544 	return (RDMA_SUCCESS);
   2545 
   2546 done:
   2547 	while (clp != NULL) {
   2548 		rib_rbuf_free(conn, RECV_BUFFER,
   2549 		    (void *)(uintptr_t)clp->w.c_saddr3);
   2550 		clp = clp->c_next;
   2551 	}
   2552 	return (ret);
   2553 }
   2554 
   2555 rdma_stat
   2556 rib_svc_post(CONN* conn, struct clist *cl)
   2557 {
   2558 	rib_qp_t	*qp = ctoqp(conn);
   2559 	struct svc_recv	*s_recvp;
   2560 	int		nds;
   2561 	ibt_wr_ds_t	sgl[DSEG_MAX];
   2562 	ibt_recv_wr_t	recv_wr;
   2563 	ibt_status_t	ibt_status;
   2564 
   2565 	nds = 0;
   2566 	while (cl != NULL) {
   2567 		if (nds >= DSEG_MAX) {
   2568 			return (RDMA_FAILED);
   2569 		}
   2570 		sgl[nds].ds_va = cl->w.c_saddr;
   2571 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
   2572 		sgl[nds].ds_len = cl->c_len;
   2573 		cl = cl->c_next;
   2574 		nds++;
   2575 	}
   2576 
   2577 	if (nds != 1) {
   2578 		rib_rbuf_free(conn, RECV_BUFFER,
   2579 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
   2580 
   2581 		return (RDMA_FAILED);
   2582 	}
   2583 
   2584 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
   2585 	recv_wr.wr_nds = nds;
   2586 	recv_wr.wr_sgl = sgl;
   2587 
   2588 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
   2589 	/* Use s_recvp's addr as wr id */
   2590 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
   2591 	mutex_enter(&conn->c_lock);
   2592 	if (conn->c_state == C_CONNECTED) {
   2593 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
   2594 	}
   2595 	if (conn->c_state != C_CONNECTED ||
   2596 	    ibt_status != IBT_SUCCESS) {
   2597 		if (conn->c_state != C_DISCONN_PEND)
   2598 			conn->c_state = C_ERROR_CONN;
   2599 		mutex_exit(&conn->c_lock);
   2600 		rib_rbuf_free(conn, RECV_BUFFER,
   2601 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
   2602 		(void) rib_free_svc_recv(s_recvp);
   2603 
   2604 		return (RDMA_CONNLOST);
   2605 	}
   2606 	mutex_exit(&conn->c_lock);
   2607 
   2608 	return (RDMA_SUCCESS);
   2609 }
   2610 
   2611 /* Client */
   2612 rdma_stat
   2613 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
   2614 {
   2615 	return (rib_clnt_post(conn, cl, msgid));
   2616 }
   2617 
   2618 /* Client */
   2619 rdma_stat
   2620 rib_post_resp_remove(CONN* conn, uint32_t msgid)
   2621 {
   2622 	rib_qp_t	*qp = ctoqp(conn);
   2623 	struct reply	*rep;
   2624 
   2625 	mutex_enter(&qp->replylist_lock);
   2626 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
   2627 		if (rep->xid == msgid) {
   2628 			if (rep->vaddr_cq) {
   2629 				rib_rbuf_free(conn, RECV_BUFFER,
   2630 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
   2631 			}
   2632 			(void) rib_remreply(qp, rep);
   2633 			break;
   2634 		}
   2635 	}
   2636 	mutex_exit(&qp->replylist_lock);
   2637 
   2638 	return (RDMA_SUCCESS);
   2639 }
   2640 
   2641 /* Server */
   2642 rdma_stat
   2643 rib_post_recv(CONN *conn, struct clist *cl)
   2644 {
   2645 	rib_qp_t	*qp = ctoqp(conn);
   2646 
   2647 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
   2648 		mutex_enter(&qp->posted_rbufs_lock);
   2649 		qp->n_posted_rbufs++;
   2650 		mutex_exit(&qp->posted_rbufs_lock);
   2651 		return (RDMA_SUCCESS);
   2652 	}
   2653 	return (RDMA_FAILED);
   2654 }
   2655 
   2656 /*
   2657  * Client side only interface to "recv" the rpc reply buf
   2658  * posted earlier by rib_post_resp(conn, cl, msgid).
   2659  */
   2660 rdma_stat
   2661 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
   2662 {
   2663 	struct reply *rep = NULL;
   2664 	clock_t timout, cv_wait_ret;
   2665 	rdma_stat ret = RDMA_SUCCESS;
   2666 	rib_qp_t *qp = ctoqp(conn);
   2667 
   2668 	/*
   2669 	 * Find the reply structure for this msgid
   2670 	 */
   2671 	mutex_enter(&qp->replylist_lock);
   2672 
   2673 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
   2674 		if (rep->xid == msgid)
   2675 			break;
   2676 	}
   2677 
   2678 	if (rep != NULL) {
   2679 		/*
   2680 		 * If message not yet received, wait.
   2681 		 */
   2682 		if (rep->status == (uint_t)REPLY_WAIT) {
   2683 			timout = ddi_get_lbolt() +
   2684 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
   2685 
   2686 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
   2687 			    &qp->replylist_lock, timout)) > 0 &&
   2688 			    rep->status == (uint_t)REPLY_WAIT)
   2689 				;
   2690 
   2691 			switch (cv_wait_ret) {
   2692 			case -1:	/* timeout */
   2693 				ret = RDMA_TIMEDOUT;
   2694 				break;
   2695 			case 0:
   2696 				ret = RDMA_INTR;
   2697 				break;
   2698 			default:
   2699 				break;
   2700 			}
   2701 		}
   2702 
   2703 		if (rep->status == RDMA_SUCCESS) {
   2704 			struct clist *cl = NULL;
   2705 
   2706 			/*
   2707 			 * Got message successfully
   2708 			 */
   2709 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
   2710 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
   2711 			*clp = cl;
   2712 		} else {
   2713 			if (rep->status != (uint_t)REPLY_WAIT) {
   2714 				/*
   2715 				 * Got error in reply message. Free
   2716 				 * recv buffer here.
   2717 				 */
   2718 				ret = rep->status;
   2719 				rib_rbuf_free(conn, RECV_BUFFER,
   2720 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
   2721 			}
   2722 		}
   2723 		(void) rib_remreply(qp, rep);
   2724 	} else {
   2725 		/*
   2726 		 * No matching reply structure found for given msgid on the
   2727 		 * reply wait list.
   2728 		 */
   2729 		ret = RDMA_INVAL;
   2730 		DTRACE_PROBE(rpcib__i__nomatchxid2);
   2731 	}
   2732 
   2733 	/*
   2734 	 * Done.
   2735 	 */
   2736 	mutex_exit(&qp->replylist_lock);
   2737 	return (ret);
   2738 }
   2739 
   2740 /*
   2741  * RDMA write a buffer to the remote address.
   2742  */
   2743 rdma_stat
   2744 rib_write(CONN *conn, struct clist *cl, int wait)
   2745 {
   2746 	ibt_send_wr_t	tx_wr;
   2747 	int		cv_sig;
   2748 	ibt_wr_ds_t	sgl[DSEG_MAX];
   2749 	struct send_wid	*wdesc;
   2750 	ibt_status_t	ibt_status;
   2751 	rdma_stat	ret = RDMA_SUCCESS;
   2752 	rib_qp_t	*qp = ctoqp(conn);
   2753 	uint64_t	n_writes = 0;
   2754 
   2755 	if (cl == NULL) {
   2756 		return (RDMA_FAILED);
   2757 	}
   2758 
   2759 	while ((cl != NULL)) {
   2760 		if (cl->c_len > 0) {
   2761 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
   2762 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
   2763 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
   2764 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
   2765 			sgl[0].ds_va = cl->w.c_saddr;
   2766 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
   2767 			sgl[0].ds_len = cl->c_len;
   2768 
   2769 			if (wait) {
   2770 				cv_sig = 1;
   2771 			} else {
   2772 				if (n_writes > max_unsignaled_rws) {
   2773 					n_writes = 0;
   2774 					cv_sig = 1;
   2775 				} else {
   2776 					cv_sig = 0;
   2777 				}
   2778 			}
   2779 
   2780 			if (cv_sig) {
   2781 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
   2782 				wdesc = rib_init_sendwait(0, cv_sig, qp);
   2783 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
   2784 				mutex_enter(&wdesc->sendwait_lock);
   2785 			} else {
   2786 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
   2787 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
   2788 			}
   2789 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
   2790 			tx_wr.wr_trans = IBT_RC_SRV;
   2791 			tx_wr.wr_nds = 1;
   2792 			tx_wr.wr_sgl = sgl;
   2793 
   2794 			mutex_enter(&conn->c_lock);
   2795 			if (conn->c_state == C_CONNECTED) {
   2796 				ibt_status =
   2797 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
   2798 			}
   2799 			if (conn->c_state != C_CONNECTED ||
   2800 			    ibt_status != IBT_SUCCESS) {
   2801 				if (conn->c_state != C_DISCONN_PEND)
   2802 					conn->c_state = C_ERROR_CONN;
   2803 				mutex_exit(&conn->c_lock);
   2804 				if (cv_sig) {
   2805 					mutex_exit(&wdesc->sendwait_lock);
   2806 					(void) rib_free_sendwait(wdesc);
   2807 				}
   2808 				return (RDMA_CONNLOST);
   2809 			}
   2810 
   2811 			mutex_exit(&conn->c_lock);
   2812 
   2813 			/*
   2814 			 * Wait for send to complete
   2815 			 */
   2816 			if (cv_sig) {
   2817 
   2818 				rib_send_hold(qp);
   2819 				mutex_exit(&wdesc->sendwait_lock);
   2820 
   2821 				ret = rib_sendwait(qp, wdesc);
   2822 				if (ret != 0)
   2823 					return (ret);
   2824 			}
   2825 			n_writes ++;
   2826 		}
   2827 		cl = cl->c_next;
   2828 	}
   2829 	return (RDMA_SUCCESS);
   2830 }
   2831 
   2832 /*
   2833  * RDMA Read a buffer from the remote address.
   2834  */
   2835 rdma_stat
   2836 rib_read(CONN *conn, struct clist *cl, int wait)
   2837 {
   2838 	ibt_send_wr_t	rx_wr;
   2839 	int		cv_sig = 0;
   2840 	ibt_wr_ds_t	sgl;
   2841 	struct send_wid	*wdesc;
   2842 	ibt_status_t	ibt_status = IBT_SUCCESS;
   2843 	rdma_stat	ret = RDMA_SUCCESS;
   2844 	rib_qp_t	*qp = ctoqp(conn);
   2845 
   2846 	if (cl == NULL) {
   2847 		return (RDMA_FAILED);
   2848 	}
   2849 
   2850 	while (cl != NULL) {
   2851 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
   2852 		/*
   2853 		 * Remote address is at the head chunk item in list.
   2854 		 */
   2855 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
   2856 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
   2857 
   2858 		sgl.ds_va = cl->u.c_daddr;
   2859 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
   2860 		sgl.ds_len = cl->c_len;
   2861 
   2862 		/*
   2863 		 * If there are multiple chunks to be read, and
   2864 		 * wait is set, ask for signal only for the last chunk
   2865 		 * and wait only on the last chunk. The completion of
   2866 		 * RDMA_READ on last chunk ensures that reads on all
   2867 		 * previous chunks are also completed.
   2868 		 */
   2869 		if (wait && (cl->c_next == NULL)) {
   2870 			cv_sig = 1;
   2871 			wdesc = rib_init_sendwait(0, cv_sig, qp);
   2872 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
   2873 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
   2874 			mutex_enter(&wdesc->sendwait_lock);
   2875 		} else {
   2876 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
   2877 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
   2878 		}
   2879 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
   2880 		rx_wr.wr_trans = IBT_RC_SRV;
   2881 		rx_wr.wr_nds = 1;
   2882 		rx_wr.wr_sgl = &sgl;
   2883 
   2884 		mutex_enter(&conn->c_lock);
   2885 		if (conn->c_state == C_CONNECTED) {
   2886 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
   2887 		}
   2888 		if (conn->c_state != C_CONNECTED ||
   2889 		    ibt_status != IBT_SUCCESS) {
   2890 			if (conn->c_state != C_DISCONN_PEND)
   2891 				conn->c_state = C_ERROR_CONN;
   2892 			mutex_exit(&conn->c_lock);
   2893 			if (wait && (cl->c_next == NULL)) {
   2894 				mutex_exit(&wdesc->sendwait_lock);
   2895 				(void) rib_free_sendwait(wdesc);
   2896 			}
   2897 			return (RDMA_CONNLOST);
   2898 		}
   2899 
   2900 		mutex_exit(&conn->c_lock);
   2901 
   2902 		/*
   2903 		 * Wait for send to complete if this is the
   2904 		 * last item in the list.
   2905 		 */
   2906 		if (wait && cl->c_next == NULL) {
   2907 			rib_send_hold(qp);
   2908 			mutex_exit(&wdesc->sendwait_lock);
   2909 
   2910 			ret = rib_sendwait(qp, wdesc);
   2911 
   2912 			if (ret != 0)
   2913 				return (ret);
   2914 		}
   2915 		cl = cl->c_next;
   2916 	}
   2917 	return (RDMA_SUCCESS);
   2918 }
   2919 
   2920 /*
   2921  * rib_srv_cm_handler()
   2922  *    Connection Manager callback to handle RC connection requests.
   2923  */
   2924 /* ARGSUSED */
   2925 static ibt_cm_status_t
   2926 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
   2927 	ibt_cm_return_args_t *ret_args, void *priv_data,
   2928 	ibt_priv_data_len_t len)
   2929 {
   2930 	queue_t		*q;
   2931 	rib_qp_t	*qp;
   2932 	rib_hca_t	*hca;
   2933 	rdma_stat	status = RDMA_SUCCESS;
   2934 	int		i;
   2935 	struct clist	cl;
   2936 	rdma_buf_t	rdbuf = {0};
   2937 	void		*buf = NULL;
   2938 	CONN		*conn;
   2939 	ibt_ip_cm_info_t	ipinfo;
   2940 	struct sockaddr_in *s;
   2941 	struct sockaddr_in6 *s6;
   2942 	int sin_size = sizeof (struct sockaddr_in);
   2943 	int in_size = sizeof (struct in_addr);
   2944 	int sin6_size = sizeof (struct sockaddr_in6);
   2945 
   2946 	ASSERT(any != NULL);
   2947 	ASSERT(event != NULL);
   2948 
   2949 	hca = (rib_hca_t *)any;
   2950 
   2951 	/* got a connection request */
   2952 	switch (event->cm_type) {
   2953 	case IBT_CM_EVENT_REQ_RCV:
   2954 		/*
   2955 		 * If the plugin is in the NO_ACCEPT state, bail out.
   2956 		 */
   2957 		mutex_enter(&plugin_state_lock);
   2958 		if (plugin_state == NO_ACCEPT) {
   2959 			mutex_exit(&plugin_state_lock);
   2960 			return (IBT_CM_REJECT);
   2961 		}
   2962 		mutex_exit(&plugin_state_lock);
   2963 
   2964 		/*
   2965 		 * Need to send a MRA MAD to CM so that it does not
   2966 		 * timeout on us.
   2967 		 */
   2968 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
   2969 		    event->cm_event.req.req_timeout * 8, NULL, 0);
   2970 
   2971 		mutex_enter(&rib_stat->open_hca_lock);
   2972 		q = rib_stat->q;
   2973 		mutex_exit(&rib_stat->open_hca_lock);
   2974 
   2975 		status = rib_svc_create_chan(hca, (caddr_t)q,
   2976 		    event->cm_event.req.req_prim_hca_port, &qp);
   2977 
   2978 		if (status) {
   2979 			return (IBT_CM_REJECT);
   2980 		}
   2981 
   2982 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
   2983 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
   2984 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
   2985 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
   2986 
   2987 		/*
   2988 		 * Pre-posts RECV buffers
   2989 		 */
   2990 		conn = qptoc(qp);
   2991 		for (i = 0; i < preposted_rbufs; i++) {
   2992 			bzero(&rdbuf, sizeof (rdbuf));
   2993 			rdbuf.type = RECV_BUFFER;
   2994 			buf = rib_rbuf_alloc(conn, &rdbuf);
   2995 			if (buf == NULL) {
   2996 				/*
   2997 				 * A connection is not established yet.
   2998 				 * Just flush the channel. Buffers
   2999 				 * posted till now will error out with
   3000 				 * IBT_WC_WR_FLUSHED_ERR.
   3001 				 */
   3002 				(void) ibt_flush_channel(qp->qp_hdl);
   3003 				(void) rib_disconnect_channel(conn, NULL);
   3004 				return (IBT_CM_REJECT);
   3005 			}
   3006 
   3007 			bzero(&cl, sizeof (cl));
   3008 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
   3009 			cl.c_len = rdbuf.len;
   3010 			cl.c_smemhandle.mrc_lmr =
   3011 			    rdbuf.handle.mrc_lmr; /* lkey */
   3012 			cl.c_next = NULL;
   3013 			status = rib_post_recv(conn, &cl);
   3014 			if (status != RDMA_SUCCESS) {
   3015 				/*
   3016 				 * A connection is not established yet.
   3017 				 * Just flush the channel. Buffers
   3018 				 * posted till now will error out with
   3019 				 * IBT_WC_WR_FLUSHED_ERR.
   3020 				 */
   3021 				(void) ibt_flush_channel(qp->qp_hdl);
   3022 				(void) rib_disconnect_channel(conn, NULL);
   3023 				return (IBT_CM_REJECT);
   3024 			}
   3025 		}
   3026 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
   3027 
   3028 		/*
   3029 		 * Get the address translation
   3030 		 */
   3031 		rw_enter(&hca->state_lock, RW_READER);
   3032 		if (hca->state == HCA_DETACHED) {
   3033 			rw_exit(&hca->state_lock);
   3034 			return (IBT_CM_REJECT);
   3035 		}
   3036 		rw_exit(&hca->state_lock);
   3037 
   3038 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
   3039 
   3040 		if (ibt_get_ip_data(event->cm_priv_data_len,
   3041 		    event->cm_priv_data,
   3042 		    &ipinfo) != IBT_SUCCESS) {
   3043 
   3044 			return (IBT_CM_REJECT);
   3045 		}
   3046 
   3047 		switch (ipinfo.src_addr.family) {
   3048 		case AF_INET:
   3049 
   3050 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
   3051 			    KM_SLEEP);
   3052 			(void) strcpy(conn->c_netid, RIBNETID_TCP);
   3053 
   3054 			conn->c_raddr.maxlen =
   3055 			    conn->c_raddr.len = sin_size;
   3056 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
   3057 
   3058 			s = (struct sockaddr_in *)conn->c_raddr.buf;
   3059 			s->sin_family = AF_INET;
   3060 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
   3061 			    &s->sin_addr, in_size);
   3062 
   3063 			conn->c_laddr.maxlen =
   3064 			    conn->c_laddr.len = sin_size;
   3065 			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
   3066 
   3067 			s = (struct sockaddr_in *)conn->c_laddr.buf;
   3068 			s->sin_family = AF_INET;
   3069 			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
   3070 			    &s->sin_addr, in_size);
   3071 
   3072 			break;
   3073 
   3074 		case AF_INET6:
   3075 
   3076 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
   3077 			    KM_SLEEP);
   3078 			(void) strcpy(conn->c_netid, RIBNETID_TCP6);
   3079 
   3080 			conn->c_raddr.maxlen =
   3081 			    conn->c_raddr.len = sin6_size;
   3082 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
   3083 
   3084 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
   3085 			s6->sin6_family = AF_INET6;
   3086 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
   3087 			    &s6->sin6_addr,
   3088 			    sizeof (struct in6_addr));
   3089 
   3090 			conn->c_laddr.maxlen =
   3091 			    conn->c_laddr.len = sin6_size;
   3092 			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
   3093 
   3094 			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
   3095 			s6->sin6_family = AF_INET6;
   3096 			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
   3097 			    &s6->sin6_addr,
   3098 			    sizeof (struct in6_addr));
   3099 
   3100 			break;
   3101 
   3102 		default:
   3103 			return (IBT_CM_REJECT);
   3104 		}
   3105 
   3106 		break;
   3107 
   3108 	case IBT_CM_EVENT_CONN_CLOSED:
   3109 	{
   3110 		CONN		*conn;
   3111 		rib_qp_t	*qp;
   3112 
   3113 		switch (event->cm_event.closed) {
   3114 		case IBT_CM_CLOSED_DREP_RCVD:
   3115 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
   3116 		case IBT_CM_CLOSED_DUP:
   3117 		case IBT_CM_CLOSED_ABORT:
   3118 		case IBT_CM_CLOSED_ALREADY:
   3119 			/*
   3120 			 * These cases indicate the local end initiated
   3121 			 * the closing of the channel. Nothing to do here.
   3122 			 */
   3123 			break;
   3124 		default:
   3125 			/*
   3126 			 * Reason for CONN_CLOSED event must be one of
   3127 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
   3128 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
   3129 			 * the remote end is closing the channel. In these
   3130 			 * cases free the channel and transition to error
   3131 			 * state
   3132 			 */
   3133 			qp = ibt_get_chan_private(event->cm_channel);
   3134 			conn = qptoc(qp);
   3135 			mutex_enter(&conn->c_lock);
   3136 			if (conn->c_state == C_DISCONN_PEND) {
   3137 				mutex_exit(&conn->c_lock);
   3138 				break;
   3139 			}
   3140 			conn->c_state = C_ERROR_CONN;
   3141 
   3142 			/*
   3143 			 * Free the conn if c_ref goes down to 0
   3144 			 */
   3145 			if (conn->c_ref == 0) {
   3146 				/*
   3147 				 * Remove from list and free conn
   3148 				 */
   3149 				conn->c_state = C_DISCONN_PEND;
   3150 				mutex_exit(&conn->c_lock);
   3151 				(void) rib_disconnect_channel(conn,
   3152 				    &hca->srv_conn_list);
   3153 			} else {
   3154 				/*
   3155 				 * conn will be freed when c_ref goes to 0.
   3156 				 * Indicate to cleaning thread not to close
   3157 				 * the connection, but just free the channel.
   3158 				 */
   3159 				conn->c_flags |= C_CLOSE_NOTNEEDED;
   3160 				mutex_exit(&conn->c_lock);
   3161 			}
   3162 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
   3163 			break;
   3164 		}
   3165 		break;
   3166 	}
   3167 	case IBT_CM_EVENT_CONN_EST:
   3168 		/*
   3169 		 * RTU received, hence connection established.
   3170 		 */
   3171 		if (rib_debug > 1)
   3172 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3173 			    "(CONN_EST) channel established");
   3174 		break;
   3175 
   3176 	default:
   3177 		if (rib_debug > 2) {
   3178 			/* Let CM handle the following events. */
   3179 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
   3180 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3181 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
   3182 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
   3183 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3184 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
   3185 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
   3186 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3187 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
   3188 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
   3189 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3190 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
   3191 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
   3192 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
   3193 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
   3194 			}
   3195 		}
   3196 		return (IBT_CM_DEFAULT);
   3197 	}
   3198 
   3199 	/* accept all other CM messages (i.e. let the CM handle them) */
   3200 	return (IBT_CM_ACCEPT);
   3201 }
   3202 
   3203 static rdma_stat
   3204 rib_register_service(rib_hca_t *hca, int service_type,
   3205 	uint8_t protocol_num, in_port_t dst_port)
   3206 {
   3207 	ibt_srv_desc_t		sdesc;
   3208 	ibt_hca_portinfo_t	*port_infop;
   3209 	ib_svc_id_t		srv_id;
   3210 	ibt_srv_hdl_t		srv_hdl;
   3211 	uint_t			port_size;
   3212 	uint_t			pki, i, num_ports, nbinds;
   3213 	ibt_status_t		ibt_status;
   3214 	rib_service_t		*service;
   3215 	ib_pkey_t		pkey;
   3216 
   3217 	/*
   3218 	 * Query all ports for the given HCA
   3219 	 */
   3220 	rw_enter(&hca->state_lock, RW_READER);
   3221 	if (hca->state != HCA_DETACHED) {
   3222 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
   3223 		    &num_ports, &port_size);
   3224 		rw_exit(&hca->state_lock);
   3225 	} else {
   3226 		rw_exit(&hca->state_lock);
   3227 		return (RDMA_FAILED);
   3228 	}
   3229 	if (ibt_status != IBT_SUCCESS) {
   3230 		return (RDMA_FAILED);
   3231 	}
   3232 
   3233 	DTRACE_PROBE1(rpcib__i__regservice_numports,
   3234 	    int, num_ports);
   3235 
   3236 	for (i = 0; i < num_ports; i++) {
   3237 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
   3238 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
   3239 			    int, i+1);
   3240 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
   3241 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
   3242 			    int, i+1);
   3243 		}
   3244 	}
   3245 
   3246 	/*
   3247 	 * Get all the IP addresses on this system to register the
   3248 	 * given "service type" on all DNS recognized IP addrs.
   3249 	 * Each service type such as NFS will have all the systems
   3250 	 * IP addresses as its different names. For now the only
   3251 	 * type of service we support in RPCIB is NFS.
   3252 	 */
   3253 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
   3254 	/*
   3255 	 * Start registering and binding service to active
   3256 	 * on active ports on this HCA.
   3257 	 */
   3258 	nbinds = 0;
   3259 	for (service = rib_stat->service_list;
   3260 	    service && (service->srv_type != service_type);
   3261 	    service = service->next)
   3262 		;
   3263 
   3264 	if (service == NULL) {
   3265 		/*
   3266 		 * We use IP addresses as the service names for
   3267 		 * service registration.  Register each of them
   3268 		 * with CM to obtain a svc_id and svc_hdl.  We do not
   3269 		 * register the service with machine's loopback address.
   3270 		 */
   3271 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
   3272 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
   3273 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
   3274 		sdesc.sd_handler = rib_srv_cm_handler;
   3275 		sdesc.sd_flags = 0;
   3276 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
   3277 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
   3278 		    1, &srv_hdl, &srv_id);
   3279 		if ((ibt_status != IBT_SUCCESS) &&
   3280 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
   3281 			rw_exit(&rib_stat->service_list_lock);
   3282 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
   3283 			    int, ibt_status);
   3284 			ibt_free_portinfo(port_infop, port_size);
   3285 			return (RDMA_FAILED);
   3286 		}
   3287 
   3288 		/*
   3289 		 * Allocate and prepare a service entry
   3290 		 */
   3291 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
   3292 
   3293 		service->srv_type = service_type;
   3294 		service->srv_hdl = srv_hdl;
   3295 		service->srv_id = srv_id;
   3296 
   3297 		service->next = rib_stat->service_list;
   3298 		rib_stat->service_list = service;
   3299 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
   3300 		    int, service->srv_type);
   3301 	} else {
   3302 		srv_hdl = service->srv_hdl;
   3303 		srv_id = service->srv_id;
   3304 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
   3305 		    int, service->srv_type);
   3306 	}
   3307 
   3308 	for (i = 0; i < num_ports; i++) {
   3309 		ibt_sbind_hdl_t		sbp;
   3310 		rib_hca_service_t	*hca_srv;
   3311 		ib_gid_t		gid;
   3312 
   3313 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
   3314 			continue;
   3315 
   3316 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
   3317 			pkey = port_infop[i].p_pkey_tbl[pki];
   3318 
   3319 			rw_enter(&hca->bound_services_lock, RW_READER);
   3320 			gid = port_infop[i].p_sgid_tbl[0];
   3321 			for (hca_srv = hca->bound_services; hca_srv;
   3322 			    hca_srv = hca_srv->next) {
   3323 				if ((hca_srv->srv_id == service->srv_id) &&
   3324 				    (hca_srv->gid.gid_prefix ==
   3325 				    gid.gid_prefix) &&
   3326 				    (hca_srv->gid.gid_guid == gid.gid_guid))
   3327 					break;
   3328 			}
   3329 			rw_exit(&hca->bound_services_lock);
   3330 			if (hca_srv != NULL) {
   3331 				/*
   3332 				 * port is alreay bound the the service
   3333 				 */
   3334 				DTRACE_PROBE1(
   3335 				    rpcib__i__regservice__already__bound,
   3336 				    int, i+1);
   3337 				nbinds++;
   3338 				continue;
   3339 			}
   3340 
   3341 			if ((pkey & IBSRM_HB) &&
   3342 			    (pkey != IB_PKEY_INVALID_FULL)) {
   3343 
   3344 				sbp = NULL;
   3345 				ibt_status = ibt_bind_service(srv_hdl,
   3346 				    gid, NULL, hca, &sbp);
   3347 
   3348 				if (ibt_status == IBT_SUCCESS) {
   3349 					hca_srv = kmem_zalloc(
   3350 					    sizeof (rib_hca_service_t),
   3351 					    KM_SLEEP);
   3352 					hca_srv->srv_id = srv_id;
   3353 					hca_srv->gid = gid;
   3354 					hca_srv->sbind_hdl = sbp;
   3355 
   3356 					rw_enter(&hca->bound_services_lock,
   3357 					    RW_WRITER);
   3358 					hca_srv->next = hca->bound_services;
   3359 					hca->bound_services = hca_srv;
   3360 					rw_exit(&hca->bound_services_lock);
   3361 					nbinds++;
   3362 				}
   3363 
   3364 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
   3365 				    int, ibt_status);
   3366 			}
   3367 		}
   3368 	}
   3369 	rw_exit(&rib_stat->service_list_lock);
   3370 
   3371 	ibt_free_portinfo(port_infop, port_size);
   3372 
   3373 	if (nbinds == 0) {
   3374 		return (RDMA_FAILED);
   3375 	} else {
   3376 		/*
   3377 		 * Put this plugin into accept state, since atleast
   3378 		 * one registration was successful.
   3379 		 */
   3380 		mutex_enter(&plugin_state_lock);
   3381 		plugin_state = ACCEPT;
   3382 		mutex_exit(&plugin_state_lock);
   3383 		return (RDMA_SUCCESS);
   3384 	}
   3385 }
   3386 
   3387 void
   3388 rib_listen(struct rdma_svc_data *rd)
   3389 {
   3390 	rdma_stat status;
   3391 	int n_listening = 0;
   3392 	rib_hca_t *hca;
   3393 
   3394 	mutex_enter(&rib_stat->listen_lock);
   3395 	/*
   3396 	 * if rd parameter is NULL then it means that rib_stat->q is
   3397 	 * already initialized by a call from RDMA and we just want to
   3398 	 * add a newly attached HCA to the same listening state as other
   3399 	 * HCAs.
   3400 	 */
   3401 	if (rd == NULL) {
   3402 		if (rib_stat->q == NULL) {
   3403 			mutex_exit(&rib_stat->listen_lock);
   3404 			return;
   3405 		}
   3406 	} else {
   3407 		rib_stat->q = &rd->q;
   3408 	}
   3409 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
   3410 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
   3411 		/*
   3412 		 * First check if a hca is still attached
   3413 		 */
   3414 		rw_enter(&hca->state_lock, RW_READER);
   3415 		if (hca->state != HCA_INITED) {
   3416 			rw_exit(&hca->state_lock);
   3417 			continue;
   3418 		}
   3419 		rw_exit(&hca->state_lock);
   3420 
   3421 		/*
   3422 		 * Right now the only service type is NFS. Hence
   3423 		 * force feed this value. Ideally to communicate
   3424 		 * the service type it should be passed down in
   3425 		 * rdma_svc_data.
   3426 		 */
   3427 		status = rib_register_service(hca, NFS,
   3428 		    IPPROTO_TCP, nfs_rdma_port);
   3429 		if (status == RDMA_SUCCESS)
   3430 			n_listening++;
   3431 	}
   3432 	rw_exit(&rib_stat->hcas_list_lock);
   3433 
   3434 	/*
   3435 	 * Service active on an HCA, check rd->err_code for more
   3436 	 * explainable errors.
   3437 	 */
   3438 	if (rd) {
   3439 		if (n_listening > 0) {
   3440 			rd->active = 1;
   3441 			rd->err_code = RDMA_SUCCESS;
   3442 		} else {
   3443 			rd->active = 0;
   3444 			rd->err_code = RDMA_FAILED;
   3445 		}
   3446 	}
   3447 	mutex_exit(&rib_stat->listen_lock);
   3448 }
   3449 
   3450 /* XXXX */
   3451 /* ARGSUSED */
   3452 static void
   3453 rib_listen_stop(struct rdma_svc_data *svcdata)
   3454 {
   3455 	rib_hca_t		*hca;
   3456 
   3457 	mutex_enter(&rib_stat->listen_lock);
   3458 	/*
   3459 	 * KRPC called the RDMATF to stop the listeners, this means
   3460 	 * stop sending incomming or recieved requests to KRPC master
   3461 	 * transport handle for RDMA-IB. This is also means that the
   3462 	 * master transport handle, responsible for us, is going away.
   3463 	 */
   3464 	mutex_enter(&plugin_state_lock);
   3465 	plugin_state = NO_ACCEPT;
   3466 	if (svcdata != NULL)
   3467 		svcdata->active = 0;
   3468 	mutex_exit(&plugin_state_lock);
   3469 
   3470 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
   3471 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
   3472 		/*
   3473 		 * First check if a hca is still attached
   3474 		 */
   3475 		rw_enter(&hca->state_lock, RW_READER);
   3476 		if (hca->state == HCA_DETACHED) {
   3477 			rw_exit(&hca->state_lock);
   3478 			continue;
   3479 		}
   3480 		rib_close_channels(&hca->srv_conn_list);
   3481 		rib_stop_services(hca);
   3482 		rw_exit(&hca->state_lock);
   3483 	}
   3484 	rw_exit(&rib_stat->hcas_list_lock);
   3485 
   3486 	/*
   3487 	 * Avoid rib_listen() using the stale q field.
   3488 	 * This could happen if a port goes up after all services
   3489 	 * are already unregistered.
   3490 	 */
   3491 	rib_stat->q = NULL;
   3492 	mutex_exit(&rib_stat->listen_lock);
   3493 }
   3494 
   3495 /*
   3496  * Traverse the HCA's service list to unbind and deregister services.
   3497  * For each bound service of HCA to be removed, first find the corresponding
   3498  * service handle (srv_hdl) and then unbind the service by calling
   3499  * ibt_unbind_service().
   3500  */
   3501 static void
   3502 rib_stop_services(rib_hca_t *hca)
   3503 {
   3504 	rib_hca_service_t *srv_list, *to_remove;
   3505 
   3506 	/*
   3507 	 * unbind and deregister the services for this service type.
   3508 	 * Right now there is only one service type. In future it will
   3509 	 * be passed down to this function.
   3510 	 */
   3511 	rw_enter(&hca->bound_services_lock, RW_READER);
   3512 	srv_list = hca->bound_services;
   3513 	hca->bound_services = NULL;
   3514 	rw_exit(&hca->bound_services_lock);
   3515 
   3516 	while (srv_list != NULL) {
   3517 		rib_service_t *sc;
   3518 
   3519 		to_remove = srv_list;
   3520 		srv_list = to_remove->next;
   3521 		rw_enter(&rib_stat->service_list_lock, RW_READER);
   3522 		for (sc = rib_stat->service_list;
   3523 		    sc && (sc->srv_id != to_remove->srv_id);
   3524 		    sc = sc->next)
   3525 			;
   3526 		/*
   3527 		 * if sc is NULL then the service doesn't exist anymore,
   3528 		 * probably just removed completely through rib_stat.
   3529 		 */
   3530 		if (sc != NULL)
   3531 			(void) ibt_unbind_service(sc->srv_hdl,
   3532 			    to_remove->sbind_hdl);
   3533 		rw_exit(&rib_stat->service_list_lock);
   3534 		kmem_free(to_remove, sizeof (rib_hca_service_t));
   3535 	}
   3536 }
   3537 
   3538 static struct svc_recv *
   3539 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
   3540 {
   3541 	struct svc_recv	*recvp;
   3542 
   3543 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
   3544 	recvp->vaddr = sgl->ds_va;
   3545 	recvp->qp = qp;
   3546 	recvp->bytes_xfer = 0;
   3547 	return (recvp);
   3548 }
   3549 
   3550 static int
   3551 rib_free_svc_recv(struct svc_recv *recvp)
   3552 {
   3553 	kmem_free(recvp, sizeof (*recvp));
   3554 
   3555 	return (0);
   3556 }
   3557 
   3558 static struct reply *
   3559 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
   3560 {
   3561 	struct reply	*rep;
   3562 
   3563 
   3564 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
   3565 	if (rep == NULL) {
   3566 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
   3567 		return (NULL);
   3568 	}
   3569 	rep->xid = msgid;
   3570 	rep->vaddr_cq = NULL;
   3571 	rep->bytes_xfer = 0;
   3572 	rep->status = (uint_t)REPLY_WAIT;
   3573 	rep->prev = NULL;
   3574 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
   3575 
   3576 	mutex_enter(&qp->replylist_lock);
   3577 	if (qp->replylist) {
   3578 		rep->next = qp->replylist;
   3579 		qp->replylist->prev = rep;
   3580 	}
   3581 	qp->rep_list_size++;
   3582 
   3583 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
   3584 	    int, qp->rep_list_size);
   3585 
   3586 	qp->replylist = rep;
   3587 	mutex_exit(&qp->replylist_lock);
   3588 
   3589 	return (rep);
   3590 }
   3591 
   3592 static rdma_stat
   3593 rib_rem_replylist(rib_qp_t *qp)
   3594 {
   3595 	struct reply	*r, *n;
   3596 
   3597 	mutex_enter(&qp->replylist_lock);
   3598 	for (r = qp->replylist; r != NULL; r = n) {
   3599 		n = r->next;
   3600 		(void) rib_remreply(qp, r);
   3601 	}
   3602 	mutex_exit(&qp->replylist_lock);
   3603 
   3604 	return (RDMA_SUCCESS);
   3605 }
   3606 
   3607 static int
   3608 rib_remreply(rib_qp_t *qp, struct reply *rep)
   3609 {
   3610 
   3611 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
   3612 	if (rep->prev) {
   3613 		rep->prev->next = rep->next;
   3614 	}
   3615 	if (rep->next) {
   3616 		rep->next->prev = rep->prev;
   3617 	}
   3618 	if (qp->replylist == rep)
   3619 		qp->replylist = rep->next;
   3620 
   3621 	cv_destroy(&rep->wait_cv);
   3622 	qp->rep_list_size--;
   3623 
   3624 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
   3625 	    int, qp->rep_list_size);
   3626 
   3627 	kmem_free(rep, sizeof (*rep));
   3628 
   3629 	return (0);
   3630 }
   3631 
   3632 rdma_stat
   3633 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
   3634 	struct mrc *buf_handle)
   3635 {
   3636 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
   3637 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
   3638 	rdma_stat	status;
   3639 	rib_hca_t	*hca = (ctoqp(conn))->hca;
   3640 
   3641 	/*
   3642 	 * Note: ALL buffer pools use the same memory type RDMARW.
   3643 	 */
   3644 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
   3645 	if (status == RDMA_SUCCESS) {
   3646 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
   3647 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
   3648 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
   3649 	} else {
   3650 		buf_handle->mrc_linfo = NULL;
   3651 		buf_handle->mrc_lmr = 0;
   3652 		buf_handle->mrc_rmr = 0;
   3653 	}
   3654 	return (status);
   3655 }
   3656 
   3657 static rdma_stat
   3658 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
   3659 	ibt_mr_flags_t spec,
   3660 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
   3661 {
   3662 	ibt_mr_attr_t	mem_attr;
   3663 	ibt_status_t	ibt_status;
   3664 	mem_attr.mr_vaddr = (uintptr_t)buf;
   3665 	mem_attr.mr_len = (ib_msglen_t)size;
   3666 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
   3667 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
   3668 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
   3669 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
   3670 
   3671 	rw_enter(&hca->state_lock, RW_READER);
   3672 	if (hca->state != HCA_DETACHED) {
   3673 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
   3674 		    &mem_attr, mr_hdlp, mr_descp);
   3675 		rw_exit(&hca->state_lock);
   3676 	} else {
   3677 		rw_exit(&hca->state_lock);
   3678 		return (RDMA_FAILED);
   3679 	}
   3680 
   3681 	if (ibt_status != IBT_SUCCESS) {
   3682 		return (RDMA_FAILED);
   3683 	}
   3684 	return (RDMA_SUCCESS);
   3685 }
   3686 
   3687 rdma_stat
   3688 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
   3689 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
   3690 {
   3691 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
   3692 	rib_lrc_entry_t *l;
   3693 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
   3694 	rdma_stat	status;
   3695 	rib_hca_t	*hca = (ctoqp(conn))->hca;
   3696 
   3697 	/*
   3698 	 * Non-coherent memory registration.
   3699 	 */
   3700 	l = (rib_lrc_entry_t *)lrc;
   3701 	if (l) {
   3702 		if (l->registered) {
   3703 			buf_handle->mrc_linfo =
   3704 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
   3705 			buf_handle->mrc_lmr =
   3706 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
   3707 			buf_handle->mrc_rmr =
   3708 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
   3709 			*sync_handle = (RIB_SYNCMEM_HANDLE)
   3710 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
   3711 			return (RDMA_SUCCESS);
   3712 		} else {
   3713 			/* Always register the whole buffer */
   3714 			buf = (caddr_t)l->lrc_buf;
   3715 			buflen = l->lrc_len;
   3716 		}
   3717 	}
   3718 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
   3719 
   3720 	if (status == RDMA_SUCCESS) {
   3721 		if (l) {
   3722 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
   3723 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
   3724 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
   3725 			l->registered		 = TRUE;
   3726 		}
   3727 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
   3728 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
   3729 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
   3730 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
   3731 	} else {
   3732 		buf_handle->mrc_linfo = NULL;
   3733 		buf_handle->mrc_lmr = 0;
   3734 		buf_handle->mrc_rmr = 0;
   3735 	}
   3736 	return (status);
   3737 }
   3738 
   3739 /* ARGSUSED */
   3740 rdma_stat
   3741 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
   3742 {
   3743 	rib_hca_t *hca = (ctoqp(conn))->hca;
   3744 	/*
   3745 	 * Allow memory deregistration even if HCA is
   3746 	 * getting detached. Need all outstanding
   3747 	 * memory registrations to be deregistered
   3748 	 * before HCA_DETACH_EVENT can be accepted.
   3749 	 */
   3750 	(void) ibt_deregister_mr(hca->hca_hdl,
   3751 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
   3752 	return (RDMA_SUCCESS);
   3753 }
   3754 
   3755 /* ARGSUSED */
   3756 rdma_stat
   3757 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
   3758 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
   3759 {
   3760 	rib_lrc_entry_t *l;
   3761 	l = (rib_lrc_entry_t *)lrc;
   3762 	if (l)
   3763 		if (l->registered)
   3764 			return (RDMA_SUCCESS);
   3765 
   3766 	(void) rib_deregistermem(conn, buf, buf_handle);
   3767 
   3768 	return (RDMA_SUCCESS);
   3769 }
   3770 
   3771 /* ARGSUSED */
   3772 rdma_stat
   3773 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
   3774 		int len, int cpu)
   3775 {
   3776 	ibt_status_t	status;
   3777 	rib_hca_t *hca = (ctoqp(conn))->hca;
   3778 	ibt_mr_sync_t	mr_segment;
   3779 
   3780 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
   3781 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
   3782 	mr_segment.ms_len = (ib_memlen_t)len;
   3783 	if (cpu) {
   3784 		/* make incoming data visible to memory */
   3785 		mr_segment.ms_flags = IBT_SYNC_WRITE;
   3786 	} else {
   3787 		/* make memory changes visible to IO */
   3788 		mr_segment.ms_flags = IBT_SYNC_READ;
   3789 	}
   3790 	rw_enter(&hca->state_lock, RW_READER);
   3791 	if (hca->state != HCA_DETACHED) {
   3792 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
   3793 		rw_exit(&hca->state_lock);
   3794 	} else {
   3795 		rw_exit(&hca->state_lock);
   3796 		return (RDMA_FAILED);
   3797 	}
   3798 
   3799 	if (status == IBT_SUCCESS)
   3800 		return (RDMA_SUCCESS);
   3801 	else {
   3802 		return (RDMA_FAILED);
   3803 	}
   3804 }
   3805 
   3806 /*
   3807  * XXXX	????
   3808  */
   3809 static rdma_stat
   3810 rib_getinfo(rdma_info_t *info)
   3811 {
   3812 	/*
   3813 	 * XXXX	Hack!
   3814 	 */
   3815 	info->addrlen = 16;
   3816 	info->mts = 1000000;
   3817 	info->mtu = 1000000;
   3818 
   3819 	return (RDMA_SUCCESS);
   3820 }
   3821 
   3822 rib_bufpool_t *
   3823 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
   3824 {
   3825 	rib_bufpool_t	*rbp = NULL;
   3826 	bufpool_t	*bp = NULL;
   3827 	caddr_t		buf;
   3828 	ibt_mr_attr_t	mem_attr;
   3829 	ibt_status_t	ibt_status;
   3830 	int		i, j;
   3831 
   3832 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
   3833 
   3834 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
   3835 	    num * sizeof (void *), KM_SLEEP);
   3836 
   3837 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
   3838 	bp->numelems = num;
   3839 
   3840 
   3841 	switch (ptype) {
   3842 	case SEND_BUFFER:
   3843 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
   3844 		bp->rsize = RPC_MSG_SZ;
   3845 		break;
   3846 	case RECV_BUFFER:
   3847 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
   3848 		bp->rsize = RPC_BUF_SIZE;
   3849 		break;
   3850 	default:
   3851 		goto fail;
   3852 	}
   3853 
   3854 	/*
   3855 	 * Register the pool.
   3856 	 */
   3857 	bp->bufsize = num * bp->rsize;
   3858 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
   3859 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
   3860 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
   3861 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
   3862 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
   3863 	rw_enter(&hca->state_lock, RW_READER);
   3864 
   3865 	if (hca->state == HCA_DETACHED) {
   3866 		rw_exit(&hca->state_lock);
   3867 		goto fail;
   3868 	}
   3869 
   3870 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
   3871 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
   3872 		mem_attr.mr_vaddr = (uintptr_t)buf;
   3873 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
   3874 		mem_attr.mr_as = NULL;
   3875 		ibt_status = ibt_register_mr(hca->hca_hdl,
   3876 		    hca->pd_hdl, &mem_attr,
   3877 		    &rbp->mr_hdl[i],
   3878 		    &rbp->mr_desc[i]);
   3879 		if (ibt_status != IBT_SUCCESS) {
   3880 			for (j = 0; j < i; j++) {
   3881 				(void) ibt_deregister_mr(hca->hca_hdl,
   3882 				    rbp->mr_hdl[j]);
   3883 			}
   3884 			rw_exit(&hca->state_lock);
   3885 			goto fail;
   3886 		}
   3887 	}
   3888 	rw_exit(&hca->state_lock);
   3889 	buf = (caddr_t)bp->buf;
   3890 	for (i = 0; i < num; i++, buf += bp->rsize) {
   3891 		bp->buflist[i] = (void *)buf;
   3892 	}
   3893 	bp->buffree = num - 1;	/* no. of free buffers */
   3894 	rbp->bpool = bp;
   3895 
   3896 	return (rbp);
   3897 fail:
   3898 	if (bp) {
   3899 		if (bp->buf)
   3900 			kmem_free(bp->buf, bp->bufsize);
   3901 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
   3902 	}
   3903 	if (rbp) {
   3904 		if (rbp->mr_hdl)
   3905 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
   3906 		if (rbp->mr_desc)
   3907 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
   3908 		kmem_free(rbp, sizeof (rib_bufpool_t));
   3909 	}
   3910 	return (NULL);
   3911 }
   3912 
   3913 static void
   3914 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
   3915 {
   3916 	int i;
   3917 	rib_bufpool_t *rbp = NULL;
   3918 	bufpool_t *bp;
   3919 
   3920 	/*
   3921 	 * Obtain pool address based on type of pool
   3922 	 */
   3923 	switch (ptype) {
   3924 		case SEND_BUFFER:
   3925 			rbp = hca->send_pool;
   3926 			break;
   3927 		case RECV_BUFFER:
   3928 			rbp = hca->recv_pool;
   3929 			break;
   3930 		default:
   3931 			return;
   3932 	}
   3933 	if (rbp == NULL)
   3934 		return;
   3935 
   3936 	bp = rbp->bpool;
   3937 
   3938 	/*
   3939 	 * Deregister the pool memory and free it.
   3940 	 */
   3941 	for (i = 0; i < bp->numelems; i++) {
   3942 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
   3943 	}
   3944 }
   3945 
   3946 static void
   3947 rib_rbufpool_free(rib_hca_t *hca, int ptype)
   3948 {
   3949 
   3950 	rib_bufpool_t *rbp = NULL;
   3951 	bufpool_t *bp;
   3952 
   3953 	/*
   3954 	 * Obtain pool address based on type of pool
   3955 	 */
   3956 	switch (ptype) {
   3957 		case SEND_BUFFER:
   3958 			rbp = hca->send_pool;
   3959 			break;
   3960 		case RECV_BUFFER:
   3961 			rbp = hca->recv_pool;
   3962 			break;
   3963 		default:
   3964 			return;
   3965 	}
   3966 	if (rbp == NULL)
   3967 		return;
   3968 
   3969 	bp = rbp->bpool;
   3970 
   3971 	/*
   3972 	 * Free the pool memory.
   3973 	 */
   3974 	if (rbp->mr_hdl)
   3975 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
   3976 
   3977 	if (rbp->mr_desc)
   3978 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
   3979 	if (bp->buf)
   3980 		kmem_free(bp->buf, bp->bufsize);
   3981 	mutex_destroy(&bp->buflock);
   3982 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
   3983 	kmem_free(rbp, sizeof (rib_bufpool_t));
   3984 }
   3985 
   3986 void
   3987 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
   3988 {
   3989 	/*
   3990 	 * Deregister the pool memory and free it.
   3991 	 */
   3992 	rib_rbufpool_deregister(hca, ptype);
   3993 	rib_rbufpool_free(hca, ptype);
   3994 }
   3995 
   3996 /*
   3997  * Fetch a buffer from the pool of type specified in rdbuf->type.
   3998  */
   3999 static rdma_stat
   4000 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
   4001 {
   4002 	rib_lrc_entry_t *rlep;
   4003 
   4004 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
   4005 		rlep = rib_get_cache_buf(conn, rdbuf->len);
   4006 		rdbuf->rb_private =  (caddr_t)rlep;
   4007 		rdbuf->addr = rlep->lrc_buf;
   4008 		rdbuf->handle = rlep->lrc_mhandle;
   4009 		return (RDMA_SUCCESS);
   4010 	}
   4011 
   4012 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
   4013 	if (rdbuf->addr) {
   4014 		switch (rdbuf->type) {
   4015 		case SEND_BUFFER:
   4016 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
   4017 			break;
   4018 		case RECV_BUFFER:
   4019 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
   4020 			break;
   4021 		default:
   4022 			rdbuf->len = 0;
   4023 		}
   4024 		return (RDMA_SUCCESS);
   4025 	} else
   4026 		return (RDMA_FAILED);
   4027 }
   4028 
   4029 /*
   4030  * Fetch a buffer of specified type.
   4031  * Note that rdbuf->handle is mw's rkey.
   4032  */
   4033 static void *
   4034 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
   4035 {
   4036 	rib_qp_t	*qp = ctoqp(conn);
   4037 	rib_hca_t	*hca = qp->hca;
   4038 	rdma_btype	ptype = rdbuf->type;
   4039 	void		*buf;
   4040 	rib_bufpool_t	*rbp = NULL;
   4041 	bufpool_t	*bp;
   4042 	int		i;
   4043 
   4044 	/*
   4045 	 * Obtain pool address based on type of pool
   4046 	 */
   4047 	switch (ptype) {
   4048 	case SEND_BUFFER:
   4049 		rbp = hca->send_pool;
   4050 		break;
   4051 	case RECV_BUFFER:
   4052 		rbp = hca->recv_pool;
   4053 		break;
   4054 	default:
   4055 		return (NULL);
   4056 	}
   4057 	if (rbp == NULL)
   4058 		return (NULL);
   4059 
   4060 	bp = rbp->bpool;
   4061 
   4062 	mutex_enter(&bp->buflock);
   4063 	if (bp->buffree < 0) {
   4064 		mutex_exit(&bp->buflock);
   4065 		return (NULL);
   4066 	}
   4067 
   4068 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
   4069 	buf = bp->buflist[bp->buffree];
   4070 	rdbuf->addr = buf;
   4071 	rdbuf->len = bp->rsize;
   4072 	for (i = bp->numelems - 1; i >= 0; i--) {
   4073 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
   4074 			rdbuf->handle.mrc_rmr =
   4075 			    (uint32_t)rbp->mr_desc[i].md_rkey;
   4076 			rdbuf->handle.mrc_linfo =
   4077 			    (uintptr_t)rbp->mr_hdl[i];
   4078 			rdbuf->handle.mrc_lmr =
   4079 			    (uint32_t)rbp->mr_desc[i].md_lkey;
   4080 			bp->buffree--;
   4081 
   4082 			mutex_exit(&bp->buflock);
   4083 
   4084 			return (buf);
   4085 		}
   4086 	}
   4087 
   4088 	mutex_exit(&bp->buflock);
   4089 
   4090 	return (NULL);
   4091 }
   4092 
   4093 static void
   4094 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
   4095 {
   4096 
   4097 	if (rdbuf->type == RDMA_LONG_BUFFER) {
   4098 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
   4099 		rdbuf->rb_private = NULL;
   4100 		return;
   4101 	}
   4102 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
   4103 }
   4104 
   4105 static void
   4106 rib_rbuf_free(CONN *conn, int ptype, void *buf)
   4107 {
   4108 	rib_qp_t *qp = ctoqp(conn);
   4109 	rib_hca_t *hca = qp->hca;
   4110 	rib_bufpool_t *rbp = NULL;
   4111 	bufpool_t *bp;
   4112 
   4113 	/*
   4114 	 * Obtain pool address based on type of pool
   4115 	 */
   4116 	switch (ptype) {
   4117 	case SEND_BUFFER:
   4118 		rbp = hca->send_pool;
   4119 		break;
   4120 	case RECV_BUFFER:
   4121 		rbp = hca->recv_pool;
   4122 		break;
   4123 	default:
   4124 		return;
   4125 	}
   4126 	if (rbp == NULL)
   4127 		return;
   4128 
   4129 	bp = rbp->bpool;
   4130 
   4131 	mutex_enter(&bp->buflock);
   4132 	if (++bp->buffree >= bp->numelems) {
   4133 		/*
   4134 		 * Should never happen
   4135 		 */
   4136 		bp->buffree--;
   4137 	} else {
   4138 		bp->buflist[bp->buffree] = buf;
   4139 	}
   4140 	mutex_exit(&bp->buflock);
   4141 }
   4142 
   4143 static rdma_stat
   4144 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
   4145 {
   4146 	rw_enter(&connlist->conn_lock, RW_WRITER);
   4147 	if (connlist->conn_hd) {
   4148 		cn->c_next = connlist->conn_hd;
   4149 		connlist->conn_hd->c_prev = cn;
   4150 	}
   4151 	connlist->conn_hd = cn;
   4152 	rw_exit(&connlist->conn_lock);
   4153 
   4154 	return (RDMA_SUCCESS);
   4155 }
   4156 
   4157 static rdma_stat
   4158 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
   4159 {
   4160 	rw_enter(&connlist->conn_lock, RW_WRITER);
   4161 	if (cn->c_prev) {
   4162 		cn->c_prev->c_next = cn->c_next;
   4163 	}
   4164 	if (cn->c_next) {
   4165 		cn->c_next->c_prev = cn->c_prev;
   4166 	}
   4167 	if (connlist->conn_hd == cn)
   4168 		connlist->conn_hd = cn->c_next;
   4169 	rw_exit(&connlist->conn_lock);
   4170 
   4171 	return (RDMA_SUCCESS);
   4172 }
   4173 
   4174 /* ARGSUSED */
   4175 static rdma_stat
   4176 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
   4177     int addr_type, void *handle, CONN **conn)
   4178 {
   4179 	rdma_stat status;
   4180 	rpcib_ping_t rpt;
   4181 
   4182 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
   4183 	return (status);
   4184 }
   4185 
   4186 /*
   4187  * rib_find_hca_connection
   4188  *
   4189  * if there is an existing connection to the specified address then
   4190  * it will be returned in conn, otherwise conn will be set to NULL.
   4191  * Also cleans up any connection that is in error state.
   4192  */
   4193 static int
   4194 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
   4195     struct netbuf *d_svcaddr, CONN **conn)
   4196 {
   4197 	CONN *cn;
   4198 	clock_t cv_stat, timout;
   4199 
   4200 	*conn = NULL;
   4201 again:
   4202 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
   4203 	cn = hca->cl_conn_list.conn_hd;
   4204 	while (cn != NULL) {
   4205 		/*
   4206 		 * First, clear up any connection in the ERROR state
   4207 		 */
   4208 		mutex_enter(&cn->c_lock);
   4209 		if (cn->c_state == C_ERROR_CONN) {
   4210 			if (cn->c_ref == 0) {
   4211 				/*
   4212 				 * Remove connection from list and destroy it.
   4213 				 */
   4214 				cn->c_state = C_DISCONN_PEND;
   4215 				mutex_exit(&cn->c_lock);
   4216 				rw_exit(&hca->cl_conn_list.conn_lock);
   4217 				rib_conn_close((void *)cn);
   4218 				goto again;
   4219 			}
   4220 			mutex_exit(&cn->c_lock);
   4221 			cn = cn->c_next;
   4222 			continue;
   4223 		}
   4224 		if (cn->c_state == C_DISCONN_PEND) {
   4225 			mutex_exit(&cn->c_lock);
   4226 			cn = cn->c_next;
   4227 			continue;
   4228 		}
   4229 
   4230 		/*
   4231 		 * source address is only checked for if there is one,
   4232 		 * this is the case for retries.
   4233 		 */
   4234 		if ((cn->c_raddr.len == d_svcaddr->len) &&
   4235 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
   4236 		    d_svcaddr->len) == 0) &&
   4237 		    ((s_svcaddr->len == 0) ||
   4238 		    ((cn->c_laddr.len == s_svcaddr->len) &&
   4239 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
   4240 		    s_svcaddr->len) == 0)))) {
   4241 			/*
   4242 			 * Our connection. Give up conn list lock
   4243 			 * as we are done traversing the list.
   4244 			 */
   4245 			rw_exit(&hca->cl_conn_list.conn_lock);
   4246 			if (cn->c_state == C_CONNECTED) {
   4247 				cn->c_ref++;	/* sharing a conn */
   4248 				mutex_exit(&cn->c_lock);
   4249 				*conn = cn;
   4250 				return (RDMA_SUCCESS);
   4251 			}
   4252 			if (cn->c_state == C_CONN_PEND) {
   4253 				/*
   4254 				 * Hold a reference to this conn before
   4255 				 * we give up the lock.
   4256 				 */
   4257 				cn->c_ref++;
   4258 				timout =  ddi_get_lbolt() +
   4259 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
   4260 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
   4261 				    &cn->c_lock, timout)) > 0 &&
   4262 				    cn->c_state == C_CONN_PEND)
   4263 					;
   4264 				if (cv_stat == 0) {
   4265 					(void) rib_conn_release_locked(cn);
   4266 					return (RDMA_INTR);
   4267 				}
   4268 				if (cv_stat < 0) {
   4269 					(void) rib_conn_release_locked(cn);
   4270 					return (RDMA_TIMEDOUT);
   4271 				}
   4272 				if (cn->c_state == C_CONNECTED) {
   4273 					*conn = cn;
   4274 					mutex_exit(&cn->c_lock);
   4275 					return (RDMA_SUCCESS);
   4276 				} else {
   4277 					(void) rib_conn_release_locked(cn);
   4278 					return (RDMA_TIMEDOUT);
   4279 				}
   4280 			}
   4281 		}
   4282 		mutex_exit(&cn->c_lock);
   4283 		cn = cn->c_next;
   4284 	}
   4285 	rw_exit(&hca->cl_conn_list.conn_lock);
   4286 	*conn = NULL;
   4287 	return (RDMA_FAILED);
   4288 }
   4289 
   4290 /*
   4291  * Connection management.
   4292  * IBTF does not support recycling of channels. So connections are only
   4293  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
   4294  * C_DISCONN_PEND state. No C_IDLE state.
   4295  * C_CONN_PEND state: Connection establishment in progress to the server.
   4296  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
   4297  * It has an RC channel associated with it. ibt_post_send/recv are allowed
   4298  * only in this state.
   4299  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
   4300  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
   4301  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
   4302  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
   4303  * c_ref drops to 0 (this indicates that RPC has no more references to this
   4304  * connection), the connection should be destroyed. A connection transitions
   4305  * into this state when it is being destroyed.
   4306  */
   4307 /* ARGSUSED */
   4308 static rdma_stat
   4309 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
   4310     int addr_type, rpcib_ping_t *rpt, CONN **conn)
   4311 {
   4312 	CONN *cn;
   4313 	int status;
   4314 	rib_hca_t *hca;
   4315 	rib_qp_t *qp;
   4316 	int s_addr_len;
   4317 	char *s_addr_buf;
   4318 
   4319 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
   4320 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
   4321 		rw_enter(&hca->state_lock, RW_READER);
   4322 		if (hca->state != HCA_DETACHED) {
   4323 			status = rib_find_hca_connection(hca, s_svcaddr,
   4324 			    d_svcaddr, conn);
   4325 			rw_exit(&hca->state_lock);
   4326 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
   4327 				rw_exit(&rib_stat->hcas_list_lock);
   4328 				return (status);
   4329 			}
   4330 		} else
   4331 			rw_exit(&hca->state_lock);
   4332 	}
   4333 	rw_exit(&rib_stat->hcas_list_lock);
   4334 
   4335 	/*
   4336 	 * No existing connection found, establish a new connection.
   4337 	 */
   4338 	bzero(rpt, sizeof (rpcib_ping_t));
   4339 
   4340 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
   4341 	if (status != RDMA_SUCCESS) {
   4342 		return (RDMA_FAILED);
   4343 	}
   4344 	hca = rpt->hca;
   4345 
   4346 	if (rpt->srcip.family == AF_INET) {
   4347 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
   4348 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
   4349 	} else if (rpt->srcip.family == AF_INET6) {
   4350 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
   4351 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
   4352 	} else {
   4353 		return (RDMA_FAILED);
   4354 	}
   4355 
   4356 	/*
   4357 	 * Channel to server doesn't exist yet, create one.
   4358 	 */
   4359 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
   4360 		return (RDMA_FAILED);
   4361 	}
   4362 	cn = qptoc(qp);
   4363 	cn->c_state = C_CONN_PEND;
   4364 	cn->c_ref = 1;
   4365 
   4366 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
   4367 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
   4368 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
   4369 
   4370 	if (rpt->srcip.family == AF_INET) {
   4371 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
   4372 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
   4373 	} else {
   4374 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
   4375 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
   4376 	}
   4377 
   4378 	/*
   4379 	 * Add to conn list.
   4380 	 * We had given up the READER lock. In the time since then,
   4381 	 * another thread might have created the connection we are
   4382 	 * trying here. But for now, that is quiet alright - there
   4383 	 * might be two connections between a pair of hosts instead
   4384 	 * of one. If we really want to close that window,
   4385 	 * then need to check the list after acquiring the
   4386 	 * WRITER lock.
   4387 	 */
   4388 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
   4389 	status = rib_conn_to_srv(hca, qp, rpt);
   4390 	mutex_enter(&cn->c_lock);
   4391 
   4392 	if (cn->c_flags & C_CLOSE_PENDING) {
   4393 		/*
   4394 		 * This handles a case where the module or
   4395 		 * HCA detached in the time a connection is
   4396 		 * established. In such a case close the
   4397 		 * connection immediately if this is the
   4398 		 * only reference.
   4399 		 */
   4400 		if (cn->c_ref == 1) {
   4401 			cn->c_ref--;
   4402 			cn->c_state = C_DISCONN_PEND;
   4403 			mutex_exit(&cn->c_lock);
   4404 			rib_conn_close((void *)cn);
   4405 			return (RDMA_FAILED);
   4406 		}
   4407 
   4408 		/*
   4409 		 * Connection to be closed later when c_ref = 0
   4410 		 */
   4411 		status = RDMA_FAILED;
   4412 	}
   4413 
   4414 	if (status == RDMA_SUCCESS) {
   4415 		cn->c_state = C_CONNECTED;
   4416 		*conn = cn;
   4417 	} else {
   4418 		cn->c_state = C_ERROR_CONN;
   4419 		cn->c_ref--;
   4420 	}
   4421 	cv_signal(&cn->c_cv);
   4422 	mutex_exit(&cn->c_lock);
   4423 	return (status);
   4424 }
   4425 
   4426 static void
   4427 rib_conn_close(void *rarg)
   4428 {
   4429 	CONN *conn = (CONN *)rarg;
   4430 	rib_qp_t *qp = ctoqp(conn);
   4431 
   4432 	mutex_enter(&conn->c_lock);
   4433 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
   4434 
   4435 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
   4436 
   4437 		/*
   4438 		 * Live connection in CONNECTED state.
   4439 		 */
   4440 		if (conn->c_state == C_CONNECTED) {
   4441 			conn->c_state = C_ERROR_CONN;
   4442 		}
   4443 		mutex_exit(&conn->c_lock);
   4444 
   4445 		rib_close_a_channel(conn);
   4446 
   4447 		mutex_enter(&conn->c_lock);
   4448 		conn->c_flags &= ~C_CLOSE_PENDING;
   4449 	}
   4450 
   4451 	mutex_exit(&conn->c_lock);
   4452 
   4453 	if (qp->mode == RIB_SERVER)
   4454 		(void) rib_disconnect_channel(conn,
   4455 		    &qp->hca->srv_conn_list);
   4456 	else
   4457 		(void) rib_disconnect_channel(conn,
   4458 		    &qp->hca->cl_conn_list);
   4459 }
   4460 
   4461 static void
   4462 rib_conn_timeout_call(void *carg)
   4463 {
   4464 	time_t idle_time;
   4465 	CONN *conn = (CONN *)carg;
   4466 	rib_hca_t *hca = ctoqp(conn)->hca;
   4467 	int error;
   4468 
   4469 	mutex_enter(&conn->c_lock);
   4470 	if ((conn->c_ref > 0) ||
   4471 	    (conn->c_state == C_DISCONN_PEND)) {
   4472 		conn->c_timeout = NULL;
   4473 		mutex_exit(&conn->c_lock);
   4474 		return;
   4475 	}
   4476 
   4477 	idle_time = (gethrestime_sec() - conn->c_last_used);
   4478 
   4479 	if ((idle_time <= rib_conn_timeout) &&
   4480 	    (conn->c_state != C_ERROR_CONN)) {
   4481 		/*
   4482 		 * There was activity after the last timeout.
   4483 		 * Extend the conn life. Unless the conn is
   4484 		 * already in error state.
   4485 		 */
   4486 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
   4487 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
   4488 		mutex_exit(&conn->c_lock);
   4489 		return;
   4490 	}
   4491 
   4492 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
   4493 	    (void *)conn, DDI_NOSLEEP);
   4494 
   4495 	/*
   4496 	 * If taskq dispatch fails above, then reset the timeout
   4497 	 * to try again after 10 secs.
   4498 	 */
   4499 
   4500 	if (error != DDI_SUCCESS) {
   4501 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
   4502 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
   4503 		mutex_exit(&conn->c_lock);
   4504 		return;
   4505 	}
   4506 
   4507 	conn->c_state = C_DISCONN_PEND;
   4508 	mutex_exit(&conn->c_lock);
   4509 }
   4510 
   4511 static rdma_stat
   4512 rib_conn_release(CONN *conn)
   4513 {
   4514 	mutex_enter(&conn->c_lock);
   4515 	return (rib_conn_release_locked(conn));
   4516 }
   4517 
   4518 /*
   4519  * Expects conn->c_lock to be held on entry.
   4520  * c_lock released on return
   4521  */
   4522 static rdma_stat
   4523 rib_conn_release_locked(CONN *conn)
   4524 {
   4525 	conn->c_ref--;
   4526 
   4527 	conn->c_last_used = gethrestime_sec();
   4528 	if (conn->c_ref > 0) {
   4529 		mutex_exit(&conn->c_lock);
   4530 		return (RDMA_SUCCESS);
   4531 	}
   4532 
   4533 	/*
   4534 	 * If a conn is C_ERROR_CONN, close the channel.
   4535 	 */
   4536 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
   4537 		conn->c_state = C_DISCONN_PEND;
   4538 		mutex_exit(&conn->c_lock);
   4539 		rib_conn_close((void *)conn);
   4540 		return (RDMA_SUCCESS);
   4541 	}
   4542 
   4543 	/*
   4544 	 * c_ref == 0, set a timeout for conn release
   4545 	 */
   4546 
   4547 	if (conn->c_timeout == NULL) {
   4548 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
   4549 		    SEC_TO_TICK(rib_conn_timeout));
   4550 	}
   4551 
   4552 	mutex_exit(&conn->c_lock);
   4553 	return (RDMA_SUCCESS);
   4554 }
   4555 
   4556 /*
   4557  * Add at front of list
   4558  */
   4559 static struct rdma_done_list *
   4560 rdma_done_add(rib_qp_t *qp, uint32_t xid)
   4561 {
   4562 	struct rdma_done_list *rd;
   4563 
   4564 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
   4565 
   4566 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
   4567 	rd->xid = xid;
   4568 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
   4569 
   4570 	rd->prev = NULL;
   4571 	rd->next = qp->rdlist;
   4572 	if (qp->rdlist != NULL)
   4573 		qp->rdlist->prev = rd;
   4574 	qp->rdlist = rd;
   4575 
   4576 	return (rd);
   4577 }
   4578 
   4579 static void
   4580 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
   4581 {
   4582 	struct rdma_done_list *r;
   4583 
   4584 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
   4585 
   4586 	r = rd->next;
   4587 	if (r != NULL) {
   4588 		r->prev = rd->prev;
   4589 	}
   4590 
   4591 	r = rd->prev;
   4592 	if (r != NULL) {
   4593 		r->next = rd->next;
   4594 	} else {
   4595 		qp->rdlist = rd->next;
   4596 	}
   4597 
   4598 	cv_destroy(&rd->rdma_done_cv);
   4599 	kmem_free(rd, sizeof (*rd));
   4600 }
   4601 
   4602 static void
   4603 rdma_done_rem_list(rib_qp_t *qp)
   4604 {
   4605 	struct rdma_done_list	*r, *n;
   4606 
   4607 	mutex_enter(&qp->rdlist_lock);
   4608 	for (r = qp->rdlist; r != NULL; r = n) {
   4609 		n = r->next;
   4610 		rdma_done_rm(qp, r);
   4611 	}
   4612 	mutex_exit(&qp->rdlist_lock);
   4613 }
   4614 
   4615 static void
   4616 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
   4617 {
   4618 	struct rdma_done_list *r = qp->rdlist;
   4619 
   4620 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
   4621 
   4622 	while (r) {
   4623 		if (r->xid == xid) {
   4624 			cv_signal(&r->rdma_done_cv);
   4625 			return;
   4626 		} else {
   4627 			r = r->next;
   4628 		}
   4629 	}
   4630 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
   4631 	    int, xid);
   4632 }
   4633 
   4634 /*
   4635  * Expects conn->c_lock to be held by the caller.
   4636  */
   4637 
   4638 static void
   4639 rib_close_a_channel(CONN *conn)
   4640 {
   4641 	rib_qp_t	*qp;
   4642 	qp = ctoqp(conn);
   4643 
   4644 	if (qp->qp_hdl == NULL) {
   4645 		/* channel already freed */
   4646 		return;
   4647 	}
   4648 
   4649 	/*
   4650 	 * Call ibt_close_rc_channel in blocking mode
   4651 	 * with no callbacks.
   4652 	 */
   4653 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
   4654 	    NULL, 0, NULL, NULL, 0);
   4655 }
   4656 
   4657 /*
   4658  * Goes through all connections and closes the channel
   4659  * This will cause all the WRs on those channels to be
   4660  * flushed.
   4661  */
   4662 static void
   4663 rib_close_channels(rib_conn_list_t *connlist)
   4664 {
   4665 	CONN 		*conn, *tmp;
   4666 
   4667 	rw_enter(&connlist->conn_lock, RW_READER);
   4668 	conn = connlist->conn_hd;
   4669 	while (conn != NULL) {
   4670 		mutex_enter(&conn->c_lock);
   4671 		tmp = conn->c_next;
   4672 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
   4673 
   4674 			if (conn->c_state == C_CONN_PEND) {
   4675 				conn->c_flags |= C_CLOSE_PENDING;
   4676 				goto next;
   4677 			}
   4678 
   4679 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
   4680 
   4681 			/*
   4682 			 * Live connection in CONNECTED state.
   4683 			 */
   4684 			if (conn->c_state == C_CONNECTED)
   4685 				conn->c_state = C_ERROR_CONN;
   4686 			mutex_exit(&conn->c_lock);
   4687 
   4688 			rib_close_a_channel(conn);
   4689 
   4690 			mutex_enter(&conn->c_lock);
   4691 			conn->c_flags &= ~C_CLOSE_PENDING;
   4692 			/* Signal a pending rib_disconnect_channel() */
   4693 			cv_signal(&conn->c_cv);
   4694 		}
   4695 next:
   4696 		mutex_exit(&conn->c_lock);
   4697 		conn = tmp;
   4698 	}
   4699 	rw_exit(&connlist->conn_lock);
   4700 }
   4701 
   4702 /*
   4703  * Frees up all connections that are no longer being referenced
   4704  */
   4705 static void
   4706 rib_purge_connlist(rib_conn_list_t *connlist)
   4707 {
   4708 	CONN 		*conn;
   4709 
   4710 top:
   4711 	rw_enter(&connlist->conn_lock, RW_READER);
   4712 	conn = connlist->conn_hd;
   4713 	while (conn != NULL) {
   4714 		mutex_enter(&conn->c_lock);
   4715 
   4716 		/*
   4717 		 * At this point connection is either in ERROR
   4718 		 * or DISCONN_PEND state. If in DISCONN_PEND state
   4719 		 * then some other thread is culling that connection.
   4720 		 * If not and if c_ref is 0, then destroy the connection.
   4721 		 */
   4722 		if (conn->c_ref == 0 &&
   4723 		    conn->c_state != C_DISCONN_PEND) {
   4724 			/*
   4725 			 * Cull the connection
   4726 			 */
   4727 			conn->c_state = C_DISCONN_PEND;
   4728 			mutex_exit(&conn->c_lock);
   4729 			rw_exit(&connlist->conn_lock);
   4730 			(void) rib_disconnect_channel(conn, connlist);
   4731 			goto top;
   4732 		} else {
   4733 			/*
   4734 			 * conn disconnect already scheduled or will
   4735 			 * happen from conn_release when c_ref drops to 0.
   4736 			 */
   4737 			mutex_exit(&conn->c_lock);
   4738 		}
   4739 		conn = conn->c_next;
   4740 	}
   4741 	rw_exit(&connlist->conn_lock);
   4742 
   4743 	/*
   4744 	 * At this point, only connections with c_ref != 0 are on the list
   4745 	 */
   4746 }
   4747 
   4748 /*
   4749  * Free all the HCA resources and close
   4750  * the hca.
   4751  */
   4752 
   4753 static void
   4754 rib_free_hca(rib_hca_t *hca)
   4755 {
   4756 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
   4757 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
   4758 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
   4759 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
   4760 
   4761 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
   4762 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
   4763 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
   4764 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
   4765 
   4766 	rib_rbufpool_destroy(hca, RECV_BUFFER);
   4767 	rib_rbufpool_destroy(hca, SEND_BUFFER);
   4768 	rib_destroy_cache(hca);
   4769 	if (rib_mod.rdma_count == 0)
   4770 		(void) rdma_unregister_mod(&rib_mod);
   4771 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
   4772 	(void) ibt_close_hca(hca->hca_hdl);
   4773 	hca->hca_hdl = NULL;
   4774 }
   4775 
   4776 
   4777 static void
   4778 rib_stop_hca_services(rib_hca_t *hca)
   4779 {
   4780 	rib_stop_services(hca);
   4781 	rib_close_channels(&hca->cl_conn_list);
   4782 	rib_close_channels(&hca->srv_conn_list);
   4783 
   4784 	rib_purge_connlist(&hca->cl_conn_list);
   4785 	rib_purge_connlist(&hca->srv_conn_list);
   4786 
   4787 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
   4788 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
   4789 		    GLOBAL_ZONEID);
   4790 		stats_enabled = FALSE;
   4791 	}
   4792 
   4793 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
   4794 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
   4795 	if (hca->srv_conn_list.conn_hd == NULL &&
   4796 	    hca->cl_conn_list.conn_hd == NULL) {
   4797 		/*
   4798 		 * conn_lists are NULL, so destroy
   4799 		 * buffers, close hca and be done.
   4800 		 */
   4801 		rib_free_hca(hca);
   4802 	}
   4803 	rw_exit(&hca->cl_conn_list.conn_lock);
   4804 	rw_exit(&hca->srv_conn_list.conn_lock);
   4805 
   4806 	if (hca->hca_hdl != NULL) {
   4807 		mutex_enter(&hca->inuse_lock);
   4808 		while (hca->inuse)
   4809 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
   4810 		mutex_exit(&hca->inuse_lock);
   4811 
   4812 		rib_free_hca(hca);
   4813 	}
   4814 	rw_destroy(&hca->bound_services_lock);
   4815 
   4816 	if (hca->cleanup_helper != NULL) {
   4817 		ddi_taskq_destroy(hca->cleanup_helper);
   4818 		hca->cleanup_helper = NULL;
   4819 	}
   4820 }
   4821 
   4822 /*
   4823  * Cleans and closes up all uses of the HCA
   4824  */
   4825 static void
   4826 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
   4827 {
   4828 	rib_hca_t *hca = NULL;
   4829 	rib_hca_t **hcap;
   4830 
   4831 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
   4832 	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
   4833 		hca = *hcap;
   4834 		rw_enter(&hca->state_lock, RW_WRITER);
   4835 		if (hca->hca_hdl == hca_hdl) {
   4836 			/*
   4837 			 * Mark as detached and remove from
   4838 			 * hca list.
   4839 			 */
   4840 			hca->state = HCA_DETACHED;
   4841 			*hcap = hca->next;
   4842 			rib_stat->nhca_inited--;
   4843 			rib_mod.rdma_count--;
   4844 			rw_exit(&hca->state_lock);
   4845 			break;
   4846 		}
   4847 		rw_exit(&hca->state_lock);
   4848 	}
   4849 	rw_exit(&rib_stat->hcas_list_lock);
   4850 
   4851 	if (hca == NULL)
   4852 		return;
   4853 	ASSERT(hca->hca_hdl == hca_hdl);
   4854 
   4855 	/*
   4856 	 * Stop all services on the HCA
   4857 	 * Go through cl_conn_list and close all rc_channels
   4858 	 * Go through svr_conn_list and close all rc_channels
   4859 	 * Free connections whose c_ref has dropped to 0
   4860 	 * Destroy all CQs
   4861 	 * Deregister and released all buffer pool memory after all
   4862 	 * connections are destroyed
   4863 	 * Free the protection domain
   4864 	 * ibt_close_hca()
   4865 	 */
   4866 	rib_stop_hca_services(hca);
   4867 
   4868 	kmem_free(hca, sizeof (*hca));
   4869 }
   4870 
   4871 static void
   4872 rib_server_side_cache_reclaim(void *argp)
   4873 {
   4874 	cache_avl_struct_t    *rcas;
   4875 	rib_lrc_entry_t		*rb;
   4876 	rib_hca_t *hca = (rib_hca_t *)argp;
   4877 
   4878 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
   4879 	rcas = avl_first(&hca->avl_tree);
   4880 	if (rcas != NULL)
   4881 		avl_remove(&hca->avl_tree, rcas);
   4882 
   4883 	while (rcas != NULL) {
   4884 		while (rcas->r.forw != &rcas->r) {
   4885 			rcas->elements--;
   4886 			rb = rcas->r.forw;
   4887 			remque(rb);
   4888 			if (rb->registered)
   4889 				(void) rib_deregistermem_via_hca(hca,
   4890 				    rb->lrc_buf, rb->lrc_mhandle);
   4891 
   4892 			hca->cache_allocation -= rb->lrc_len;
   4893 			kmem_free(rb->lrc_buf, rb->lrc_len);
   4894 			kmem_free(rb, sizeof (rib_lrc_entry_t));
   4895 		}
   4896 		mutex_destroy(&rcas->node_lock);
   4897 		kmem_cache_free(hca->server_side_cache, rcas);
   4898 		rcas = avl_first(&hca->avl_tree);
   4899 		if (rcas != NULL)
   4900 			avl_remove(&hca->avl_tree, rcas);
   4901 	}
   4902 	rw_exit(&hca->avl_rw_lock);
   4903 }
   4904 
   4905 static void
   4906 rib_server_side_cache_cleanup(void *argp)
   4907 {
   4908 	cache_avl_struct_t    *rcas;
   4909 	rib_lrc_entry_t		*rb;
   4910 	rib_hca_t *hca = (rib_hca_t *)argp;
   4911 
   4912 	mutex_enter(&hca->cache_allocation_lock);
   4913 	if (hca->cache_allocation < cache_limit) {
   4914 		mutex_exit(&hca->cache_allocation_lock);
   4915 		return;
   4916 	}
   4917 	mutex_exit(&hca->cache_allocation_lock);
   4918 
   4919 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
   4920 	rcas = avl_last(&hca->avl_tree);
   4921 	if (rcas != NULL)
   4922 		avl_remove(&hca->avl_tree, rcas);
   4923 
   4924 	while (rcas != NULL) {
   4925 		while (rcas->r.forw != &rcas->r) {
   4926 			rcas->elements--;
   4927 			rb = rcas->r.forw;
   4928 			remque(rb);
   4929 			if (rb->registered)
   4930 				(void) rib_deregistermem_via_hca(hca,
   4931 				    rb->lrc_buf, rb->lrc_mhandle);
   4932 
   4933 			hca->cache_allocation -= rb->lrc_len;
   4934 
   4935 			kmem_free(rb->lrc_buf, rb->lrc_len);
   4936 			kmem_free(rb, sizeof (rib_lrc_entry_t));
   4937 		}
   4938 		mutex_destroy(&rcas->node_lock);
   4939 		if (hca->server_side_cache) {
   4940 			kmem_cache_free(hca->server_side_cache, rcas);
   4941 		}
   4942 
   4943 		if (hca->cache_allocation < cache_limit) {
   4944 			rw_exit(&hca->avl_rw_lock);
   4945 			return;
   4946 		}
   4947 
   4948 		rcas = avl_last(&hca->avl_tree);
   4949 		if (rcas != NULL)
   4950 			avl_remove(&hca->avl_tree, rcas);
   4951 	}
   4952 	rw_exit(&hca->avl_rw_lock);
   4953 }
   4954 
   4955 static int
   4956 avl_compare(const void *t1, const void *t2)
   4957 {
   4958 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
   4959 		return (0);
   4960 
   4961 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
   4962 		return (-1);
   4963 
   4964 	return (1);
   4965 }
   4966 
   4967 static void
   4968 rib_destroy_cache(rib_hca_t *hca)
   4969 {
   4970 	if (hca->avl_init) {
   4971 		rib_server_side_cache_reclaim((void *)hca);
   4972 		if (hca->server_side_cache) {
   4973 			kmem_cache_destroy(hca->server_side_cache);
   4974 			hca->server_side_cache = NULL;
   4975 		}
   4976 		avl_destroy(&hca->avl_tree);
   4977 		mutex_destroy(&hca->cache_allocation_lock);
   4978 		rw_destroy(&hca->avl_rw_lock);
   4979 	}
   4980 	hca->avl_init = FALSE;
   4981 }
   4982 
   4983 static void
   4984 rib_force_cleanup(void *hca)
   4985 {
   4986 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
   4987 		(void) ddi_taskq_dispatch(
   4988 		    ((rib_hca_t *)hca)->cleanup_helper,
   4989 		    rib_server_side_cache_cleanup,
   4990 		    (void *)hca, DDI_NOSLEEP);
   4991 }
   4992 
   4993 static rib_lrc_entry_t *
   4994 rib_get_cache_buf(CONN *conn, uint32_t len)
   4995 {
   4996 	cache_avl_struct_t	cas, *rcas;
   4997 	rib_hca_t	*hca = (ctoqp(conn))->hca;
   4998 	rib_lrc_entry_t *reply_buf;
   4999 	avl_index_t where = NULL;
   5000 	uint64_t c_alloc = 0;
   5001 
   5002 	if (!hca->avl_init)
   5003 		goto  error_alloc;
   5004 
   5005 	cas.len = len;
   5006 
   5007 	rw_enter(&hca->avl_rw_lock, RW_READER);
   5008 
   5009 	mutex_enter(&hca->cache_allocation_lock);
   5010 	c_alloc = hca->cache_allocation;
   5011 	mutex_exit(&hca->cache_allocation_lock);
   5012 
   5013 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
   5014 	    &where)) == NULL) {
   5015 		/* Am I above the cache limit */
   5016 		if ((c_alloc + len) >= cache_limit) {
   5017 			rib_force_cleanup((void *)hca);
   5018 			rw_exit(&hca->avl_rw_lock);
   5019 			mutex_enter(&hca->cache_allocation_lock);
   5020 			hca->cache_misses_above_the_limit ++;
   5021 			mutex_exit(&hca->cache_allocation_lock);
   5022 
   5023 			/* Allocate and register the buffer directly */
   5024 			goto error_alloc;
   5025 		}
   5026 
   5027 		rw_exit(&hca->avl_rw_lock);
   5028 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
   5029 
   5030 		/* Recheck to make sure no other thread added the entry in */
   5031 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
   5032 		    &cas, &where)) == NULL) {
   5033 			/* Allocate an avl tree entry */
   5034 			rcas = (cache_avl_struct_t *)
   5035 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
   5036 
   5037 			bzero(rcas, sizeof (cache_avl_struct_t));
   5038 			rcas->elements = 0;
   5039 			rcas->r.forw = &rcas->r;
   5040 			rcas->r.back = &rcas->r;
   5041 			rcas->len = len;
   5042 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
   5043 			avl_insert(&hca->avl_tree, rcas, where);
   5044 		}
   5045 	}
   5046 
   5047 	mutex_enter(&rcas->node_lock);
   5048 
   5049 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
   5050 		reply_buf = rcas->r.forw;
   5051 		remque(reply_buf);
   5052 		rcas->elements--;
   5053 		mutex_exit(&rcas->node_lock);
   5054 		rw_exit(&hca->avl_rw_lock);
   5055 
   5056 		mutex_enter(&hca->cache_allocation_lock);
   5057 		hca->cache_hits++;
   5058 		hca->cache_allocation -= len;
   5059 		mutex_exit(&hca->cache_allocation_lock);
   5060 	} else {
   5061 		/* Am I above the cache limit */
   5062 		mutex_exit(&rcas->node_lock);
   5063 		if ((c_alloc + len) >= cache_limit) {
   5064 			rib_force_cleanup((void *)hca);
   5065 			rw_exit(&hca->avl_rw_lock);
   5066 
   5067 			mutex_enter(&hca->cache_allocation_lock);
   5068 			hca->cache_misses_above_the_limit++;
   5069 			mutex_exit(&hca->cache_allocation_lock);
   5070 			/* Allocate and register the buffer directly */
   5071 			goto error_alloc;
   5072 		}
   5073 		rw_exit(&hca->avl_rw_lock);
   5074 		mutex_enter(&hca->cache_allocation_lock);
   5075 		hca->cache_misses++;
   5076 		mutex_exit(&hca->cache_allocation_lock);
   5077 		/* Allocate a reply_buf entry */
   5078 		reply_buf = (rib_lrc_entry_t *)
   5079 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
   5080 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
   5081 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
   5082 		reply_buf->lrc_len  = len;
   5083 		reply_buf->registered = FALSE;
   5084 		reply_buf->avl_node = (void *)rcas;
   5085 	}
   5086 
   5087 	return (reply_buf);
   5088 
   5089 error_alloc:
   5090 	reply_buf = (rib_lrc_entry_t *)
   5091 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
   5092 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
   5093 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
   5094 	reply_buf->lrc_len = len;
   5095 	reply_buf->registered = FALSE;
   5096 	reply_buf->avl_node = NULL;
   5097 
   5098 	return (reply_buf);
   5099 }
   5100 
/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering the buffer).
 */
   5105 
   5106 static void
   5107 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
   5108 {
   5109 	cache_avl_struct_t    cas, *rcas;
   5110 	avl_index_t where = NULL;
   5111 	rib_hca_t	*hca = (ctoqp(conn))->hca;
   5112 
   5113 	if (!hca->avl_init)
   5114 		goto  error_free;
   5115 
   5116 	cas.len = reg_buf->lrc_len;
   5117 	rw_enter(&hca->avl_rw_lock, RW_READER);
   5118 	if ((rcas = (cache_avl_struct_t *)
   5119 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
   5120 		rw_exit(&hca->avl_rw_lock);
   5121 		goto error_free;
   5122 	} else {
   5123 		cas.len = reg_buf->lrc_len;
   5124 		mutex_enter(&rcas->node_lock);
   5125 		insque(reg_buf, &rcas->r);
   5126 		rcas->elements ++;
   5127 		mutex_exit(&rcas->node_lock);
   5128 		rw_exit(&hca->avl_rw_lock);
   5129 		mutex_enter(&hca->cache_allocation_lock);
   5130 		hca->cache_allocation += cas.len;
   5131 		mutex_exit(&hca->cache_allocation_lock);
   5132 	}
   5133 
   5134 	return;
   5135 
   5136 error_free:
   5137 
   5138 	if (reg_buf->registered)
   5139 		(void) rib_deregistermem_via_hca(hca,
   5140 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
   5141 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
   5142 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
   5143 }
   5144 
   5145 static rdma_stat
   5146 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
   5147 	uint_t buflen, struct mrc *buf_handle)
   5148 {
   5149 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
   5150 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
   5151 	rdma_stat	status;
   5152 
   5153 
   5154 	/*
   5155 	 * Note: ALL buffer pools use the same memory type RDMARW.
   5156 	 */
   5157 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
   5158 	if (status == RDMA_SUCCESS) {
   5159 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
   5160 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
   5161 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
   5162 	} else {
   5163 		buf_handle->mrc_linfo = NULL;
   5164 		buf_handle->mrc_lmr = 0;
   5165 		buf_handle->mrc_rmr = 0;
   5166 	}
   5167 	return (status);
   5168 }
   5169 
   5170 /* ARGSUSED */
   5171 static rdma_stat
   5172 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
   5173     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
   5174 {
   5175 
   5176 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
   5177 	return (RDMA_SUCCESS);
   5178 }
   5179 
   5180 /* ARGSUSED */
   5181 static rdma_stat
   5182 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
   5183 {
   5184 
   5185 	(void) ibt_deregister_mr(hca->hca_hdl,
   5186 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
   5187 	return (RDMA_SUCCESS);
   5188 }
   5189 
   5190 /*
   5191  * Check if the IP interface named by `lifrp' is RDMA-capable.
   5192  */
   5193 static boolean_t
   5194 rpcib_rdma_capable_interface(struct lifreq *lifrp)
   5195 {
   5196 	char ifname[LIFNAMSIZ];
   5197 	char *cp;
   5198 
   5199 	if (lifrp->lifr_type == IFT_IB)
   5200 		return (B_TRUE);
   5201 
   5202 	/*
   5203 	 * Strip off the logical interface portion before getting
   5204 	 * intimate with the name.
   5205 	 */
   5206 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
   5207 	if ((cp = strchr(ifname, ':')) != NULL)
   5208 		*cp = '\0';
   5209 
   5210 	return (strcmp("lo0", ifname) == 0);
   5211 }
   5212 
   5213 static int
   5214 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
   5215 {
   5216 	vnode_t *kkvp, *vp;
   5217 	TIUSER  *tiptr;
   5218 	struct  strioctl iocb;
   5219 	k_sigset_t smask;
   5220 	int	err = 0;
   5221 
   5222 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
   5223 		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
   5224 		    &tiptr, CRED()) == 0) {
   5225 			vp = tiptr->fp->f_vnode;
   5226 		} else {
   5227 			VN_RELE(kkvp);
   5228 			return (EPROTO);
   5229 		}
   5230 	} else {
   5231 		return (EPROTO);
   5232 	}
   5233 
   5234 	iocb.ic_cmd = cmd;
   5235 	iocb.ic_timout = 0;
   5236 	iocb.ic_len = len;
   5237 	iocb.ic_dp = (caddr_t)arg;
   5238 	sigintr(&smask, 0);
   5239 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
   5240 	sigunintr(&smask);
   5241 	(void) t_kclose(tiptr, 0);
   5242 	VN_RELE(kkvp);
   5243 	return (err);
   5244 }
   5245 
   5246 /*
   5247  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
   5248  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
   5249  */
   5250 static int
   5251 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
   5252 {
   5253 	int err;
   5254 	struct lifnum lifn;
   5255 
   5256 	bzero(&lifn, sizeof (struct lifnum));
   5257 	lifn.lifn_family = AF_UNSPEC;
   5258 
   5259 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
   5260 	if (err != 0)
   5261 		return (err);
   5262 
   5263 	/*
   5264 	 * Pad the interface count to account for additional interfaces that
   5265 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
   5266 	 */
   5267 	lifn.lifn_count += 4;
   5268 
   5269 	bzero(lifcp, sizeof (struct lifconf));
   5270 	lifcp->lifc_family = AF_UNSPEC;
   5271 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
   5272 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
   5273 
   5274 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
   5275 	if (err != 0) {
   5276 		kmem_free(lifcp->lifc_buf, *bufsizep);
   5277 		return (err);
   5278 	}
   5279 	return (0);
   5280 }
   5281 
   5282 static boolean_t
   5283 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
   5284 {
   5285 	uint_t i, nifs;
   5286 	uint_t bufsize;
   5287 	struct lifconf lifc;
   5288 	struct lifreq *lifrp;
   5289 	struct sockaddr_in *sinp;
   5290 	struct sockaddr_in6 *sin6p;
   5291 
   5292 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
   5293 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
   5294 
   5295 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
   5296 		return (B_FALSE);
   5297 
   5298 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
   5299 		kmem_free(lifc.lifc_buf, bufsize);
   5300 		return (B_FALSE);
   5301 	}
   5302 
   5303 	/*
   5304 	 * Worst case is that all of the addresses are IB-capable and have
   5305 	 * the same address family, so size our buffers accordingly.
   5306 	 */
   5307 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
   5308 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
   5309 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
   5310 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
   5311 
   5312 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
   5313 		if (!rpcib_rdma_capable_interface(lifrp))
   5314 			continue;
   5315 
   5316 		if (lifrp->lifr_addr.ss_family == AF_INET) {
   5317 			sinp = addrs4->ri_list;
   5318 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
   5319 			    sizeof (struct sockaddr_in));
   5320 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
   5321 			sin6p = addrs6->ri_list;
   5322 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
   5323 			    sizeof (struct sockaddr_in6));
   5324 		}
   5325 	}
   5326 
   5327 	kmem_free(lifc.lifc_buf, bufsize);
   5328 	return (B_TRUE);
   5329 }
   5330 
   5331 /* ARGSUSED */
   5332 static int
   5333 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
   5334 {
   5335 	rib_hca_t *hca;
   5336 
   5337 	if (KSTAT_WRITE == rw) {
   5338 		return (EACCES);
   5339 	}
   5340 
   5341 	rpcib_kstat.cache_limit.value.ui64 =
   5342 	    (uint64_t)cache_limit;
   5343 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
   5344 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
   5345 		rpcib_kstat.cache_allocation.value.ui64 +=
   5346 		    (uint64_t)hca->cache_allocation;
   5347 		rpcib_kstat.cache_hits.value.ui64 +=
   5348 		    (uint64_t)hca->cache_hits;
   5349 		rpcib_kstat.cache_misses.value.ui64 +=
   5350 		    (uint64_t)hca->cache_misses;
   5351 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
   5352 		    (uint64_t)hca->cache_misses_above_the_limit;
   5353 	}
   5354 	rw_exit(&rib_stat->hcas_list_lock);
   5355 	return (0);
   5356 }
   5357