Home | History | Annotate | Download | only in rds
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
     27  *
     28  * This software is available to you under a choice of one of two
     29  * licenses.  You may choose to be licensed under the terms of the GNU
     30  * General Public License (GPL) Version 2, available from the file
     31  * COPYING in the main directory of this source tree, or the
     32  * OpenIB.org BSD license below:
     33  *
     34  *     Redistribution and use in source and binary forms, with or
     35  *     without modification, are permitted provided that the following
     36  *     conditions are met:
     37  *
     38  *	- Redistributions of source code must retain the above
     39  *	  copyright notice, this list of conditions and the following
     40  *	  disclaimer.
     41  *
     42  *	- Redistributions in binary form must reproduce the above
     43  *	  copyright notice, this list of conditions and the following
     44  *	  disclaimer in the documentation and/or other materials
     45  *	  provided with the distribution.
     46  *
     47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     54  * SOFTWARE.
     55  *
     56  */
     57 /*
     58  * Sun elects to include this software in Sun product
     59  * under the OpenIB BSD license.
     60  *
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     72  * POSSIBILITY OF SUCH DAMAGE.
     73  */
     74 
     75 #include <sys/types.h>
     76 #include <sys/ddi.h>
     77 #include <sys/sunddi.h>
     78 #include <sys/ib/clients/rds/rdsib_cm.h>
     79 #include <sys/ib/clients/rds/rdsib_ib.h>
     80 #include <sys/ib/clients/rds/rdsib_buf.h>
     81 #include <sys/ib/clients/rds/rdsib_ep.h>
     82 #include <sys/ib/clients/rds/rds_kstat.h>
     83 
/*
 * Forward declaration of the asynchronous event handler that is handed to
 * IBTF through rds_ib_modinfo below.
 */
static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
    ibt_async_code_t code, ibt_async_event_t *event);

/*
 * Client registration information. rdsib_initialize_ib() passes the address
 * of this structure to ibt_attach().
 */
static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
	IBTI_V_CURR,
	IBT_NETWORK,
	rds_async_handler,
	NULL,
	"RDS"
};

/*
 * performance tunables
 * NOTE(review): none of these are referenced in the part of the file that
 * was reviewed; confirm their exact meaning against the code that uses them.
 */
uint_t		rds_no_interrupts = 0;
uint_t		rds_poll_percent_full = 25;
uint_t		rds_wc_signal = IBT_NEXT_SOLICITED;
uint_t		rds_waittime_ms = 100; /* ms */

/* Defined elsewhere; passed to ibt_attach() by rdsib_initialize_ib() */
extern dev_info_t *rdsib_dev_info;
/* Defined elsewhere; called by rdsib_deinitialize_ib() with a NULL argument */
extern void rds_close_sessions();
    103 
    104 static void
    105 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
    106 {
    107 	/* The SQ size should not be more than that supported by the HCA */
    108 	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
    109 	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
    110 		RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
    111 		    "than that supported by the HCA driver "
    112 		    "(%d + %d > %d or %d), lowering it to a supported value.",
    113 		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
    114 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
    115 
    116 		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
    117 		    hattrp->hca_max_cq_sz) ?
    118 		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
    119 		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
    120 	}
    121 
    122 	/* The RQ size should not be more than that supported by the HCA */
    123 	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
    124 	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
    125 		RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
    126 		    "supported by the HCA driver (%d > %d or %d), lowering it "
    127 		    "to a supported value.", MaxDataRecvBuffers,
    128 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
    129 
    130 		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
    131 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
    132 		    hattrp->hca_max_chan_sz;
    133 	}
    134 
    135 	/* The SQ size should not be more than that supported by the HCA */
    136 	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
    137 	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
    138 		RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
    139 		    "supported by the HCA driver (%d > %d or %d), lowering it "
    140 		    "to a supported value.", MaxCtrlSendBuffers,
    141 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
    142 
    143 		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
    144 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
    145 		    hattrp->hca_max_chan_sz;
    146 	}
    147 
    148 	/* The RQ size should not be more than that supported by the HCA */
    149 	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
    150 	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
    151 		RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
    152 		    "supported by the HCA driver (%d > %d or %d), lowering it "
    153 		    "to a supported value.", MaxCtrlRecvBuffers,
    154 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
    155 
    156 		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
    157 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
    158 		    hattrp->hca_max_chan_sz;
    159 	}
    160 
    161 	/* The MaxRecvMemory should be less than that supported by the HCA */
    162 	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
    163 		RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
    164 		    "supported by the HCA driver (%d > %d), lowering it to %d",
    165 		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
    166 		    hattrp->hca_max_memr_len);
    167 
    168 		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
    169 	}
    170 }
    171 
    172 /* Return hcap, given the hca guid */
    173 rds_hca_t *
    174 rds_lkup_hca(ib_guid_t hca_guid)
    175 {
    176 	rds_hca_t	*hcap;
    177 
    178 	RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
    179 	    "guid: %llx", rdsib_statep, hca_guid);
    180 
    181 	rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);
    182 
    183 	hcap = rdsib_statep->rds_hcalistp;
    184 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
    185 		hcap = hcap->hca_nextp;
    186 	}
    187 
    188 	rw_exit(&rdsib_statep->rds_hca_lock);
    189 
    190 	RDS_DPRINTF4("rds_lkup_hca", "return");
    191 
    192 	return (hcap);
    193 }
    194 
    195 void rds_randomize_qps(rds_hca_t *hcap);
    196 
    197 static rds_hca_t *
    198 rdsib_init_hca(ib_guid_t hca_guid)
    199 {
    200 	rds_hca_t	*hcap;
    201 	boolean_t	alloc = B_FALSE;
    202 	int		ret;
    203 
    204 	RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);
    205 
    206 	/* Do a HCA lookup */
    207 	hcap = rds_lkup_hca(hca_guid);
    208 
    209 	if (hcap != NULL && hcap->hca_hdl != NULL) {
    210 		/*
    211 		 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
    212 		 * that we have already opened. Just return NULL so that
    213 		 * we'll not end up reinitializing the HCA again.
    214 		 */
    215 		RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
    216 		return (NULL);
    217 	}
    218 
    219 	if (hcap == NULL) {
    220 		RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
    221 		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
    222 		alloc = B_TRUE;
    223 	}
    224 
    225 	hcap->hca_guid = hca_guid;
    226 	ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
    227 	    &hcap->hca_hdl);
    228 	if (ret != IBT_SUCCESS) {
    229 		if (ret == IBT_HCA_IN_USE) {
    230 			RDS_DPRINTF2("rdsib_init_hca",
    231 			    "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
    232 			    hca_guid);
    233 		} else {
    234 			RDS_DPRINTF2("rdsib_init_hca",
    235 			    "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
    236 		}
    237 		if (alloc == B_TRUE) {
    238 			kmem_free(hcap, sizeof (rds_hca_t));
    239 		}
    240 		return (NULL);
    241 	}
    242 
    243 	ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
    244 	if (ret != IBT_SUCCESS) {
    245 		RDS_DPRINTF2("rdsib_init_hca",
    246 		    "Query HCA: 0x%llx failed:  %d", hca_guid, ret);
    247 		ret = ibt_close_hca(hcap->hca_hdl);
    248 		ASSERT(ret == IBT_SUCCESS);
    249 		if (alloc == B_TRUE) {
    250 			kmem_free(hcap, sizeof (rds_hca_t));
    251 		} else {
    252 			hcap->hca_hdl = NULL;
    253 		}
    254 		return (NULL);
    255 	}
    256 
    257 	ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
    258 	    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
    259 	if (ret != IBT_SUCCESS) {
    260 		RDS_DPRINTF2("rdsib_init_hca",
    261 		    "Query HCA 0x%llx ports failed: %d", hca_guid,
    262 		    ret);
    263 		ret = ibt_close_hca(hcap->hca_hdl);
    264 		hcap->hca_hdl = NULL;
    265 		ASSERT(ret == IBT_SUCCESS);
    266 		if (alloc == B_TRUE) {
    267 			kmem_free(hcap, sizeof (rds_hca_t));
    268 		} else {
    269 			hcap->hca_hdl = NULL;
    270 		}
    271 		return (NULL);
    272 	}
    273 
    274 	/* Only one PD per HCA is allocated, so do it here */
    275 	ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
    276 	    &hcap->hca_pdhdl);
    277 	if (ret != IBT_SUCCESS) {
    278 		RDS_DPRINTF2("rdsib_init_hca",
    279 		    "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
    280 		(void) ibt_free_portinfo(hcap->hca_pinfop,
    281 		    hcap->hca_pinfo_sz);
    282 		ret = ibt_close_hca(hcap->hca_hdl);
    283 		ASSERT(ret == IBT_SUCCESS);
    284 		hcap->hca_hdl = NULL;
    285 		if (alloc == B_TRUE) {
    286 			kmem_free(hcap, sizeof (rds_hca_t));
    287 		} else {
    288 			hcap->hca_hdl = NULL;
    289 		}
    290 		return (NULL);
    291 	}
    292 
    293 	rdsib_validate_chan_sizes(&hcap->hca_attr);
    294 
    295 	/* To minimize stale connections after ungraceful reboots */
    296 	rds_randomize_qps(hcap);
    297 
    298 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
    299 	hcap->hca_state = RDS_HCA_STATE_OPEN;
    300 	if (alloc == B_TRUE) {
    301 		/* this is a new HCA, add it to the list */
    302 		rdsib_statep->rds_nhcas++;
    303 		hcap->hca_nextp = rdsib_statep->rds_hcalistp;
    304 		rdsib_statep->rds_hcalistp = hcap;
    305 	}
    306 	rw_exit(&rdsib_statep->rds_hca_lock);
    307 
    308 	RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);
    309 
    310 	return (hcap);
    311 }
    312 
    313 /*
    314  * Called from attach
    315  */
    316 int
    317 rdsib_initialize_ib()
    318 {
    319 	ib_guid_t	*guidp;
    320 	rds_hca_t	*hcap;
    321 	uint_t		ix, hcaix, nhcas;
    322 	int		ret;
    323 
    324 	RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
    325 
    326 	ASSERT(rdsib_statep != NULL);
    327 	if (rdsib_statep == NULL) {
    328 		RDS_DPRINTF1("rdsib_initialize_ib",
    329 		    "RDS Statep not initialized");
    330 		return (-1);
    331 	}
    332 
    333 	/* How many hcas are there? */
    334 	nhcas = ibt_get_hca_list(&guidp);
    335 	if (nhcas == 0) {
    336 		RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
    337 		return (-1);
    338 	}
    339 
    340 	RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
    341 
    342 	/* Register with IBTF */
    343 	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
    344 	    &rdsib_statep->rds_ibhdl);
    345 	if (ret != IBT_SUCCESS) {
    346 		RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
    347 		    ret);
    348 		(void) ibt_free_hca_list(guidp, nhcas);
    349 		return (-1);
    350 	}
    351 
    352 	/*
    353 	 * Open each HCA and gather its information. Don't care about HCAs
    354 	 * that cannot be opened. It is OK as long as atleast one HCA can be
    355 	 * opened.
    356 	 * Initialize a HCA only if all the information is available.
    357 	 */
    358 	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
    359 		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
    360 
    361 		hcap = rdsib_init_hca(guidp[ix]);
    362 		if (hcap != NULL) hcaix++;
    363 	}
    364 
    365 	/* free the HCA list, we are done with it */
    366 	(void) ibt_free_hca_list(guidp, nhcas);
    367 
    368 	if (hcaix == 0) {
    369 		/* Failed to Initialize even one HCA */
    370 		RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
    371 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
    372 		rdsib_statep->rds_ibhdl = NULL;
    373 		return (-1);
    374 	}
    375 
    376 	if (hcaix < nhcas) {
    377 		RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
    378 		    (nhcas - hcaix), nhcas);
    379 	}
    380 
    381 	RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
    382 
    383 	return (0);
    384 }
    385 
    386 /*
    387  * Called from detach
    388  */
    389 void
    390 rdsib_deinitialize_ib()
    391 {
    392 	rds_hca_t	*hcap, *nextp;
    393 	int		ret;
    394 
    395 	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);
    396 
    397 	/* close and destroy all the sessions */
    398 	rds_close_sessions(NULL);
    399 
    400 	/* Release all HCA resources */
    401 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
    402 	RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
    403 	    rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
    404 	hcap = rdsib_statep->rds_hcalistp;
    405 	rdsib_statep->rds_hcalistp = NULL;
    406 	rdsib_statep->rds_nhcas = 0;
    407 	rw_exit(&rdsib_statep->rds_hca_lock);
    408 
    409 	while (hcap != NULL) {
    410 		nextp = hcap->hca_nextp;
    411 
    412 		if (hcap->hca_hdl != NULL) {
    413 			ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
    414 			ASSERT(ret == IBT_SUCCESS);
    415 
    416 			(void) ibt_free_portinfo(hcap->hca_pinfop,
    417 			    hcap->hca_pinfo_sz);
    418 
    419 			ret = ibt_close_hca(hcap->hca_hdl);
    420 			ASSERT(ret == IBT_SUCCESS);
    421 		}
    422 
    423 		kmem_free(hcap, sizeof (rds_hca_t));
    424 		hcap = nextp;
    425 	}
    426 
    427 	/* Deregister with IBTF */
    428 	if (rdsib_statep->rds_ibhdl != NULL) {
    429 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
    430 		rdsib_statep->rds_ibhdl = NULL;
    431 	}
    432 
    433 	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
    434 	    rdsib_statep);
    435 }
    436 
    437 /*
    438  * Called on open of first RDS socket
    439  */
    440 int
    441 rdsib_open_ib()
    442 {
    443 	int	ret;
    444 
    445 	RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
    446 
    447 	/* Enable incoming connection requests */
    448 	if (rdsib_statep->rds_srvhdl == NULL) {
    449 		rdsib_statep->rds_srvhdl =
    450 		    rds_register_service(rdsib_statep->rds_ibhdl);
    451 		if (rdsib_statep->rds_srvhdl == NULL) {
    452 			RDS_DPRINTF2("rdsib_open_ib",
    453 			    "Service registration failed");
    454 			return (-1);
    455 		} else {
    456 			/* bind the service on all available ports */
    457 			ret = rds_bind_service(rdsib_statep);
    458 			if (ret != 0) {
    459 				RDS_DPRINTF2("rdsib_open_ib",
    460 				    "Bind service failed: %d", ret);
    461 			}
    462 		}
    463 	}
    464 
    465 	RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
    466 
    467 	return (0);
    468 }
    469 
    470 /*
    471  * Called when all ports are closed.
    472  */
    473 void
    474 rdsib_close_ib()
    475 {
    476 	int	ret;
    477 
    478 	RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
    479 
    480 	/* Disable incoming connection requests */
    481 	if (rdsib_statep->rds_srvhdl != NULL) {
    482 		ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
    483 		if (ret != 0) {
    484 			RDS_DPRINTF2("rdsib_close_ib",
    485 			    "ibt_unbind_all_services failed: %d\n", ret);
    486 		}
    487 		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
    488 		    rdsib_statep->rds_srvhdl);
    489 		if (ret != 0) {
    490 			RDS_DPRINTF2("rdsib_close_ib",
    491 			    "ibt_deregister_service failed: %d\n", ret);
    492 		} else {
    493 			rdsib_statep->rds_srvhdl = NULL;
    494 		}
    495 	}
    496 
    497 	RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
    498 }
    499 
    500 /* Return hcap, given the hca guid */
    501 rds_hca_t *
    502 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
    503 {
    504 	rds_hca_t	*hcap;
    505 
    506 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
    507 	    "guid: %llx", statep, hca_guid);
    508 
    509 	rw_enter(&statep->rds_hca_lock, RW_READER);
    510 
    511 	hcap = statep->rds_hcalistp;
    512 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
    513 		hcap = hcap->hca_nextp;
    514 	}
    515 
    516 	/*
    517 	 * don't let anyone use this HCA until the RECV memory
    518 	 * is registered with this HCA
    519 	 */
    520 	if ((hcap != NULL) &&
    521 	    (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
    522 		ASSERT(hcap->hca_mrhdl != NULL);
    523 		rw_exit(&statep->rds_hca_lock);
    524 		return (hcap);
    525 	}
    526 
    527 	RDS_DPRINTF2("rds_get_hcap",
    528 	    "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
    529 	rw_exit(&statep->rds_hca_lock);
    530 
    531 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
    532 
    533 	return (NULL);
    534 }
    535 
    536 /* Return hcap, given a gid */
    537 rds_hca_t *
    538 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
    539 {
    540 	rds_hca_t	*hcap;
    541 	uint_t		ix;
    542 
    543 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
    544 	    statep, gid.gid_prefix, gid.gid_guid);
    545 
    546 	rw_enter(&statep->rds_hca_lock, RW_READER);
    547 
    548 	hcap = statep->rds_hcalistp;
    549 	while (hcap != NULL) {
    550 
    551 		/*
    552 		 * don't let anyone use this HCA until the RECV memory
    553 		 * is registered with this HCA
    554 		 */
    555 		if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
    556 			RDS_DPRINTF3("rds_gid_to_hcap",
    557 			    "HCA (0x%p, 0x%llx) is not initialized",
    558 			    hcap, gid.gid_guid);
    559 			hcap = hcap->hca_nextp;
    560 			continue;
    561 		}
    562 
    563 		for (ix = 0; ix < hcap->hca_nports; ix++) {
    564 			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
    565 			    gid.gid_prefix) &&
    566 			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
    567 			    gid.gid_guid)) {
    568 				RDS_DPRINTF4("rds_gid_to_hcap",
    569 				    "gid found in hcap: 0x%p", hcap);
    570 				rw_exit(&statep->rds_hca_lock);
    571 				return (hcap);
    572 			}
    573 		}
    574 		hcap = hcap->hca_nextp;
    575 	}
    576 
    577 	rw_exit(&statep->rds_hca_lock);
    578 
    579 	return (NULL);
    580 }
    581 
    582 /* This is called from the send CQ handler */
    583 void
    584 rds_send_acknowledgement(rds_ep_t *ep)
    585 {
    586 	int	ret;
    587 	uint_t	ix;
    588 
    589 	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);
    590 
    591 	mutex_enter(&ep->ep_lock);
    592 
    593 	ASSERT(ep->ep_rdmacnt != 0);
    594 
    595 	/*
    596 	 * The previous ACK completed successfully, send the next one
    597 	 * if more messages were received after sending the last ACK
    598 	 */
    599 	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
    600 		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
    601 		mutex_exit(&ep->ep_lock);
    602 
    603 		/* send acknowledgement */
    604 		RDS_INCR_TXACKS();
    605 		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
    606 		if (ret != IBT_SUCCESS) {
    607 			RDS_DPRINTF2("rds_send_acknowledgement",
    608 			    "EP(%p): ibt_post_send for acknowledgement "
    609 			    "failed: %d, SQ depth: %d",
    610 			    ep, ret, ep->ep_sndpool.pool_nbusy);
    611 			mutex_enter(&ep->ep_lock);
    612 			ep->ep_rdmacnt--;
    613 			mutex_exit(&ep->ep_lock);
    614 		}
    615 	} else {
    616 		/* ACKed all messages, no more to ACK */
    617 		ep->ep_rdmacnt--;
    618 		mutex_exit(&ep->ep_lock);
    619 		return;
    620 	}
    621 
    622 	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
    623 }
    624 
    625 static int
    626 rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
    627 {
    628 	ibt_wc_t	wc;
    629 	uint_t		npolled;
    630 	rds_buf_t	*bp;
    631 	rds_ctrl_pkt_t	*cpkt;
    632 	rds_qp_t	*recvqp;
    633 	int		ret = IBT_SUCCESS;
    634 
    635 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);
    636 
    637 	bzero(&wc, sizeof (ibt_wc_t));
    638 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
    639 	if (ret != IBT_SUCCESS) {
    640 		if (ret != IBT_CQ_EMPTY) {
    641 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
    642 			    "returned: %d", ep, cq, ret);
    643 		} else {
    644 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
    645 			    "returned: IBT_CQ_EMPTY", ep, cq);
    646 		}
    647 		return (ret);
    648 	}
    649 
    650 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
    651 
    652 	if (wc.wc_status != IBT_WC_SUCCESS) {
    653 		mutex_enter(&ep->ep_recvqp.qp_lock);
    654 		ep->ep_recvqp.qp_level--;
    655 		mutex_exit(&ep->ep_recvqp.qp_lock);
    656 
    657 		/* Free the buffer */
    658 		bp->buf_state = RDS_RCVBUF_FREE;
    659 		rds_free_recv_buf(bp, 1);
    660 
    661 		/* Receive completion failure */
    662 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
    663 			RDS_DPRINTF2("rds_poll_ctrl_completions",
    664 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
    665 			    ep, cq, wc.wc_id, wc.wc_status);
    666 		}
    667 		return (ret);
    668 	}
    669 
    670 	/* there is one less in the RQ */
    671 	recvqp = &ep->ep_recvqp;
    672 	mutex_enter(&recvqp->qp_lock);
    673 	recvqp->qp_level--;
    674 	if ((recvqp->qp_taskqpending == B_FALSE) &&
    675 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
    676 		/* Time to post more buffers into the RQ */
    677 		recvqp->qp_taskqpending = B_TRUE;
    678 		mutex_exit(&recvqp->qp_lock);
    679 
    680 		ret = ddi_taskq_dispatch(rds_taskq,
    681 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
    682 		if (ret != DDI_SUCCESS) {
    683 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
    684 			    ret);
    685 			mutex_enter(&recvqp->qp_lock);
    686 			recvqp->qp_taskqpending = B_FALSE;
    687 			mutex_exit(&recvqp->qp_lock);
    688 		}
    689 	} else {
    690 		mutex_exit(&recvqp->qp_lock);
    691 	}
    692 
    693 	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
    694 	rds_handle_control_message(ep->ep_sp, cpkt);
    695 
    696 	bp->buf_state = RDS_RCVBUF_FREE;
    697 	rds_free_recv_buf(bp, 1);
    698 
    699 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);
    700 
    701 	return (ret);
    702 }
    703 
/* Maximum number of receive WRs handed to ibt_post_recv() in one call */
#define	RDS_POST_FEW_ATATIME	100
/*
 * Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented.
 *
 * arg - the channel handle (ibt_channel_hdl_t) of the endpoint, passed as
 *       void * because this function is dispatched through rds_taskq.
 *
 * Takes as many buffers from the data or control pool as there is free space
 * in the receive queue (qp_depth - qp_level), marks them as posted and hands
 * them to ibt_post_recv() in batches of at most RDS_POST_FEW_ATATIME. When
 * done, qp_level is raised by the number of WRs counted as posted and
 * qp_taskqpending is cleared, unless the function re-dispatches itself.
 *
 * NOTE(review): the two early returns for "HCA not found" and "session not
 * active" leave qp_taskqpending untouched - confirm that this is intended.
 */
void
rds_post_recv_buf(void *arg)
{
	ibt_channel_hdl_t	chanhdl;
	rds_ep_t		*ep;
	rds_session_t		*sp;
	rds_qp_t		*recvqp;
	rds_bufpool_t		*gp;
	rds_buf_t		*bp, *bp1;
	ibt_recv_wr_t		*wrp, wr[RDS_POST_FEW_ATATIME];
	rds_hca_t		*hcap;
	uint_t			npost, nspace, rcv_len;
	uint_t			ix, jx, kx;
	int			ret;

	chanhdl = (ibt_channel_hdl_t)arg;
	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
	RDS_INCR_POST_RCV_BUF_CALLS();

	/* the endpoint is stored as the private data of the channel */
	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
	ASSERT(ep != NULL);
	sp = ep->ep_sp;
	recvqp = &ep->ep_recvqp;

	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);

	/* get the hcap for the HCA hosting this channel */
	hcap = rds_lkup_hca(ep->ep_hca_guid);
	if (hcap == NULL) {
		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
		    ep->ep_hca_guid);
		return;
	}

	/* Make sure the session is still connected */
	rw_enter(&sp->session_lock, RW_READER);
	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
	    (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
	    (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
		    "in active state (%d)", ep, sp->session_state);
		rw_exit(&sp->session_lock);
		return;
	}
	rw_exit(&sp->session_lock);

	/* how many can be posted */
	mutex_enter(&recvqp->qp_lock);
	nspace = recvqp->qp_depth - recvqp->qp_level;
	if (nspace == 0) {
		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
		return;
	}
	mutex_exit(&recvqp->qp_lock);

	/* data and control endpoints use different pools and buffer sizes */
	if (ep->ep_type == RDS_EP_TYPE_DATA) {
		gp = &rds_dpool;
		rcv_len = RdsPktSize;
	} else {
		gp = &rds_cpool;
		rcv_len = RDS_CTRLPKT_SIZE;
	}

	/*
	 * Ask the pool for nspace buffers; jx is set by rds_get_buf() and is
	 * treated below as the number of buffers actually obtained.
	 */
	bp = rds_get_buf(gp, nspace, &jx);
	if (bp == NULL) {
		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
		/* try again later */
		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
		return;
	}

	/* fewer buffers than requested: post only what was obtained */
	if (jx != nspace) {
		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
		    "needed: %d available: %d", ep, nspace, jx);
		nspace = jx;
	}

	/* prepare every buffer of the chain for this endpoint and HCA */
	bp1 = bp;
	for (ix = 0; ix < nspace; ix++) {
		bp1->buf_ep = ep;
		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
		bp1->buf_state = RDS_RCVBUF_POSTED;
		bp1->buf_ds.ds_key = hcap->hca_lkey;
		bp1->buf_ds.ds_len = rcv_len;
		bp1 = bp1->buf_nextp;
	}

	/* the WR array lives on the stack; the kmem variant is disabled */
#if 0
	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
	    KM_SLEEP);
#else
	wrp = &wr[0];
#endif

	/* npost counts the WRs that still have to be posted */
	npost = nspace;
	while (npost) {
		/* jx is reused here as the size of the current batch */
		jx = (npost > RDS_POST_FEW_ATATIME) ?
		    RDS_POST_FEW_ATATIME : npost;
		for (ix = 0; ix < jx; ix++) {
			/* wr_id carries the buffer address back to the CQ poller */
			wrp[ix].wr_id = (uintptr_t)bp;
			wrp[ix].wr_nds = 1;
			wrp[ix].wr_sgl = &bp->buf_ds;
			bp = bp->buf_nextp;
		}

		/* kx is set by ibt_post_recv(); used as the count posted */
		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
		if ((ret != IBT_SUCCESS) || (kx != jx)) {
			/*
			 * NOTE(review): the buffers that were not posted stay
			 * marked RDS_RCVBUF_POSTED and are not returned to
			 * the pool in this function - confirm whether they
			 * are reclaimed elsewhere.
			 */
			RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
			    "%d", npost, ret);
			npost -= kx;
			break;
		}

		npost -= jx;
	}

	/* account for the WRs that made it into the RQ */
	mutex_enter(&recvqp->qp_lock);
	if (npost != 0) {
		RDS_DPRINTF2("rds_post_recv_buf",
		    "EP(%p) Failed to post %d WRs", ep, npost);
		recvqp->qp_level += (nspace - npost);
	} else {
		recvqp->qp_level += nspace;
	}

	/*
	 * sometimes, the recv WRs can get consumed as soon as they are
	 * posted. In that case, taskq thread to post more WRs to the RQ will
	 * not be scheduled as the taskqpending flag is still set.
	 */
	if (recvqp->qp_level == 0) {
		mutex_exit(&recvqp->qp_lock);
		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2("rds_post_recv_buf",
			    "ddi_taskq_dispatch failed: %d", ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
	}

#if 0
	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
#endif

	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
}
    868 
    869 static int
    870 rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
    871 {
    872 	ibt_wc_t	wc;
    873 	rds_buf_t	*bp;
    874 	rds_data_hdr_t	*pktp;
    875 	rds_qp_t	*recvqp;
    876 	uint_t		npolled;
    877 	int		ret = IBT_SUCCESS;
    878 
    879 
    880 	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);
    881 
    882 	bzero(&wc, sizeof (ibt_wc_t));
    883 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
    884 	if (ret != IBT_SUCCESS) {
    885 		if (ret != IBT_CQ_EMPTY) {
    886 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
    887 			    "returned: %d", ep, cq, ret);
    888 		} else {
    889 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
    890 			    "returned: IBT_CQ_EMPTY", ep, cq);
    891 		}
    892 		return (ret);
    893 	}
    894 
    895 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
    896 	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
    897 	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
    898 	bp->buf_nextp = NULL;
    899 
    900 	if (wc.wc_status != IBT_WC_SUCCESS) {
    901 		mutex_enter(&ep->ep_recvqp.qp_lock);
    902 		ep->ep_recvqp.qp_level--;
    903 		mutex_exit(&ep->ep_recvqp.qp_lock);
    904 
    905 		/* free the buffer */
    906 		bp->buf_state = RDS_RCVBUF_FREE;
    907 		rds_free_recv_buf(bp, 1);
    908 
    909 		/* Receive completion failure */
    910 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
    911 			RDS_DPRINTF2("rds_poll_data_completions",
    912 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
    913 			    ep, cq, wc.wc_id, wc.wc_status);
    914 			RDS_INCR_RXERRS();
    915 		}
    916 		return (ret);
    917 	}
    918 
    919 	/* there is one less in the RQ */
    920 	recvqp = &ep->ep_recvqp;
    921 	mutex_enter(&recvqp->qp_lock);
    922 	recvqp->qp_level--;
    923 	if ((recvqp->qp_taskqpending == B_FALSE) &&
    924 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
    925 		/* Time to post more buffers into the RQ */
    926 		recvqp->qp_taskqpending = B_TRUE;
    927 		mutex_exit(&recvqp->qp_lock);
    928 
    929 		ret = ddi_taskq_dispatch(rds_taskq,
    930 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
    931 		if (ret != DDI_SUCCESS) {
    932 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
    933 			    ret);
    934 			mutex_enter(&recvqp->qp_lock);
    935 			recvqp->qp_taskqpending = B_FALSE;
    936 			mutex_exit(&recvqp->qp_lock);
    937 		}
    938 	} else {
    939 		mutex_exit(&recvqp->qp_lock);
    940 	}
    941 
    942 	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
    943 	ASSERT(pktp->dh_datalen != 0);
    944 
    945 	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
    946 	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
    947 	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
    948 	    pktp->dh_npkts, pktp->dh_psn);
    949 
    950 	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
    951 	    pktp->dh_npkts, pktp->dh_psn);
    952 
    953 	if (pktp->dh_npkts == 1) {
    954 		/* single pkt or last packet */
    955 		if (pktp->dh_psn != 0) {
    956 			/* last packet of a segmented message */
    957 			ASSERT(ep->ep_seglbp != NULL);
    958 			ep->ep_seglbp->buf_nextp = bp;
    959 			ep->ep_seglbp = bp;
    960 			rds_received_msg(ep, ep->ep_segfbp);
    961 			ep->ep_segfbp = NULL;
    962 			ep->ep_seglbp = NULL;
    963 		} else {
    964 			/* single packet */
    965 			rds_received_msg(ep, bp);
    966 		}
    967 	} else {
    968 		/* multi-pkt msg */
    969 		if (pktp->dh_psn == 0) {
    970 			/* first packet */
    971 			ASSERT(ep->ep_segfbp == NULL);
    972 			ep->ep_segfbp = bp;
    973 			ep->ep_seglbp = bp;
    974 		} else {
    975 			/* intermediate packet */
    976 			ASSERT(ep->ep_segfbp != NULL);
    977 			ep->ep_seglbp->buf_nextp = bp;
    978 			ep->ep_seglbp = bp;
    979 		}
    980 	}
    981 
    982 	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);
    983 
    984 	return (ret);
    985 }
    986 
    987 void
    988 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
    989 {
    990 	rds_ep_t	*ep;
    991 	int		ret = IBT_SUCCESS;
    992 	int		(*func)(ibt_cq_hdl_t, rds_ep_t *);
    993 
    994 	ep = (rds_ep_t *)arg;
    995 
    996 	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
    997 
    998 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
    999 		func = rds_poll_data_completions;
   1000 	} else {
   1001 		func = rds_poll_ctrl_completions;
   1002 	}
   1003 
   1004 	do {
   1005 		ret = func(cq, ep);
   1006 	} while (ret != IBT_CQ_EMPTY);
   1007 
   1008 	/* enable the CQ */
   1009 	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
   1010 	if (ret != IBT_SUCCESS) {
   1011 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
   1012 		    "failed: %d", ep, cq, ret);
   1013 		return;
   1014 	}
   1015 
   1016 	do {
   1017 		ret = func(cq, ep);
   1018 	} while (ret != IBT_CQ_EMPTY);
   1019 
   1020 	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
   1021 }
   1022 
   1023 void
   1024 rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
   1025 {
   1026 	ibt_wc_t	wc[RDS_NUM_DATA_SEND_WCS];
   1027 	uint_t		npolled, nret, send_error = 0;
   1028 	rds_buf_t	*headp, *tailp, *bp;
   1029 	int		ret, ix;
   1030 
   1031 	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);
   1032 
   1033 	headp = NULL;
   1034 	tailp = NULL;
   1035 	npolled = 0;
   1036 	do {
   1037 		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
   1038 		if (ret != IBT_SUCCESS) {
   1039 			if (ret != IBT_CQ_EMPTY) {
   1040 				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
   1041 				    "ibt_poll_cq returned: %d", ep, cq, ret);
   1042 			} else {
   1043 				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
   1044 				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
   1045 				    ep, cq);
   1046 			}
   1047 
   1048 			break;
   1049 		}
   1050 
   1051 		for (ix = 0; ix < nret; ix++) {
   1052 			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
   1053 				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
   1054 					rds_send_acknowledgement(ep);
   1055 					continue;
   1056 				}
   1057 
   1058 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
   1059 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
   1060 				bp->buf_state = RDS_SNDBUF_FREE;
   1061 			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
   1062 				RDS_INCR_TXERRS();
   1063 				RDS_DPRINTF5("rds_poll_send_completions",
   1064 				    "EP(%p): WC ID: %p ERROR: %d", ep,
   1065 				    wc[ix].wc_id, wc[ix].wc_status);
   1066 
   1067 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
   1068 					mutex_enter(&ep->ep_lock);
   1069 					ep->ep_rdmacnt--;
   1070 					mutex_exit(&ep->ep_lock);
   1071 					continue;
   1072 				}
   1073 
   1074 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
   1075 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
   1076 				bp->buf_state = RDS_SNDBUF_FREE;
   1077 			} else {
   1078 				RDS_INCR_TXERRS();
   1079 				RDS_DPRINTF2("rds_poll_send_completions",
   1080 				    "EP(%p): WC ID: %p ERROR: %d", ep,
   1081 				    wc[ix].wc_id, wc[ix].wc_status);
   1082 				if (send_error == 0) {
   1083 					rds_session_t	*sp = ep->ep_sp;
   1084 
   1085 					/* don't let anyone send anymore */
   1086 					rw_enter(&sp->session_lock, RW_WRITER);
   1087 					if (sp->session_state !=
   1088 					    RDS_SESSION_STATE_ERROR) {
   1089 						sp->session_state =
   1090 						    RDS_SESSION_STATE_ERROR;
   1091 						/* Make this the active end */
   1092 						sp->session_type =
   1093 						    RDS_SESSION_ACTIVE;
   1094 					}
   1095 					rw_exit(&sp->session_lock);
   1096 				}
   1097 
   1098 				send_error++;
   1099 
   1100 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
   1101 					mutex_enter(&ep->ep_lock);
   1102 					ep->ep_rdmacnt--;
   1103 					mutex_exit(&ep->ep_lock);
   1104 					continue;
   1105 				}
   1106 
   1107 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
   1108 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
   1109 				bp->buf_state = RDS_SNDBUF_FREE;
   1110 			}
   1111 
   1112 			bp->buf_nextp = NULL;
   1113 			if (headp) {
   1114 				tailp->buf_nextp = bp;
   1115 				tailp = bp;
   1116 			} else {
   1117 				headp = bp;
   1118 				tailp = bp;
   1119 			}
   1120 
   1121 			npolled++;
   1122 		}
   1123 
   1124 		if (rds_no_interrupts && (npolled > 100)) {
   1125 			break;
   1126 		}
   1127 
   1128 		if (rds_no_interrupts == 1) {
   1129 			break;
   1130 		}
   1131 	} while (ret != IBT_CQ_EMPTY);
   1132 
   1133 	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
   1134 	    npolled, send_error);
   1135 
   1136 	/* put the buffers to the pool */
   1137 	if (npolled != 0) {
   1138 		rds_free_send_buf(ep, headp, tailp, npolled, lock);
   1139 	}
   1140 
   1141 	if (send_error != 0) {
   1142 		rds_handle_send_error(ep);
   1143 	}
   1144 
   1145 	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
   1146 }
   1147 
   1148 void
   1149 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
   1150 {
   1151 	rds_ep_t	*ep;
   1152 	int		ret;
   1153 
   1154 	ep = (rds_ep_t *)arg;
   1155 
   1156 	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
   1157 
   1158 	/* enable the CQ */
   1159 	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
   1160 	if (ret != IBT_SUCCESS) {
   1161 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
   1162 		    "failed: %d", ep, cq, ret);
   1163 		return;
   1164 	}
   1165 
   1166 	rds_poll_send_completions(cq, ep, B_FALSE);
   1167 
   1168 	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
   1169 }
   1170 
   1171 void
   1172 rds_ep_free_rc_channel(rds_ep_t *ep)
   1173 {
   1174 	int ret;
   1175 
   1176 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
   1177 
   1178 	ASSERT(mutex_owned(&ep->ep_lock));
   1179 
   1180 	/* free the QP */
   1181 	if (ep->ep_chanhdl != NULL) {
   1182 		/* wait until the RQ is empty */
   1183 		(void) ibt_flush_channel(ep->ep_chanhdl);
   1184 		(void) rds_is_recvq_empty(ep, B_TRUE);
   1185 		ret = ibt_free_channel(ep->ep_chanhdl);
   1186 		if (ret != IBT_SUCCESS) {
   1187 			RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
   1188 			    "ibt_free_channel returned: %d", ep, ret);
   1189 		}
   1190 		ep->ep_chanhdl = NULL;
   1191 	} else {
   1192 		RDS_DPRINTF2("rds_ep_free_rc_channel",
   1193 		    "EP(%p) Channel is ALREADY FREE", ep);
   1194 	}
   1195 
   1196 	/* free the Send CQ */
   1197 	if (ep->ep_sendcq != NULL) {
   1198 		ret = ibt_free_cq(ep->ep_sendcq);
   1199 		if (ret != IBT_SUCCESS) {
   1200 			RDS_DPRINTF2("rds_ep_free_rc_channel",
   1201 			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
   1202 			    ep, ret);
   1203 		}
   1204 		ep->ep_sendcq = NULL;
   1205 	} else {
   1206 		RDS_DPRINTF2("rds_ep_free_rc_channel",
   1207 		    "EP(%p) SendCQ is ALREADY FREE", ep);
   1208 	}
   1209 
   1210 	/* free the Recv CQ */
   1211 	if (ep->ep_recvcq != NULL) {
   1212 		ret = ibt_free_cq(ep->ep_recvcq);
   1213 		if (ret != IBT_SUCCESS) {
   1214 			RDS_DPRINTF2("rds_ep_free_rc_channel",
   1215 			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
   1216 			    ep, ret);
   1217 		}
   1218 		ep->ep_recvcq = NULL;
   1219 	} else {
   1220 		RDS_DPRINTF2("rds_ep_free_rc_channel",
   1221 		    "EP(%p) RecvCQ is ALREADY FREE", ep);
   1222 	}
   1223 
   1224 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
   1225 }
   1226 
   1227 /* Allocate resources for RC channel */
   1228 ibt_channel_hdl_t
   1229 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
   1230 {
   1231 	int				ret = IBT_SUCCESS;
   1232 	ibt_cq_attr_t			scqattr, rcqattr;
   1233 	ibt_rc_chan_alloc_args_t	chanargs;
   1234 	ibt_channel_hdl_t		chanhdl;
   1235 	rds_session_t			*sp;
   1236 	rds_hca_t			*hcap;
   1237 
   1238 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
   1239 	    ep, hca_port);
   1240 
   1241 	/* Update the EP with the right IP address and HCA guid */
   1242 	sp = ep->ep_sp;
   1243 	ASSERT(sp != NULL);
   1244 	rw_enter(&sp->session_lock, RW_READER);
   1245 	mutex_enter(&ep->ep_lock);
   1246 	ep->ep_myip = sp->session_myip;
   1247 	ep->ep_remip = sp->session_remip;
   1248 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
   1249 	ep->ep_hca_guid = hcap->hca_guid;
   1250 	mutex_exit(&ep->ep_lock);
   1251 	rw_exit(&sp->session_lock);
   1252 
   1253 	/* reset taskqpending flag here */
   1254 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
   1255 
   1256 	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
   1257 		scqattr.cq_size = MaxCtrlSendBuffers;
   1258 		scqattr.cq_sched = NULL;
   1259 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
   1260 
   1261 		rcqattr.cq_size = MaxCtrlRecvBuffers;
   1262 		rcqattr.cq_sched = NULL;
   1263 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
   1264 
   1265 		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
   1266 		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
   1267 		chanargs.rc_sizes.cs_sq_sgl = 1;
   1268 		chanargs.rc_sizes.cs_rq_sgl = 1;
   1269 	} else {
   1270 		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
   1271 		scqattr.cq_sched = NULL;
   1272 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
   1273 
   1274 		rcqattr.cq_size = MaxDataRecvBuffers;
   1275 		rcqattr.cq_sched = NULL;
   1276 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
   1277 
   1278 		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
   1279 		chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
   1280 		chanargs.rc_sizes.cs_sq_sgl = 1;
   1281 		chanargs.rc_sizes.cs_rq_sgl = 1;
   1282 	}
   1283 
   1284 	mutex_enter(&ep->ep_lock);
   1285 	if (ep->ep_sendcq == NULL) {
   1286 		/* returned size is always greater than the requested size */
   1287 		ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
   1288 		    &ep->ep_sendcq, NULL);
   1289 		if (ret != IBT_SUCCESS) {
   1290 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
   1291 			    "failed, size = %d: %d", scqattr.cq_size, ret);
   1292 			mutex_exit(&ep->ep_lock);
   1293 			return (NULL);
   1294 		}
   1295 
   1296 		(void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
   1297 		    ep);
   1298 
   1299 		if (rds_no_interrupts == 0) {
   1300 			ret = ibt_enable_cq_notify(ep->ep_sendcq,
   1301 			    IBT_NEXT_COMPLETION);
   1302 			if (ret != IBT_SUCCESS) {
   1303 				RDS_DPRINTF2(LABEL,
   1304 				    "ibt_enable_cq_notify failed: %d", ret);
   1305 				(void) ibt_free_cq(ep->ep_sendcq);
   1306 				ep->ep_sendcq = NULL;
   1307 				mutex_exit(&ep->ep_lock);
   1308 				return (NULL);
   1309 			}
   1310 		}
   1311 	}
   1312 
   1313 	if (ep->ep_recvcq == NULL) {
   1314 		/* returned size is always greater than the requested size */
   1315 		ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
   1316 		    &ep->ep_recvcq, NULL);
   1317 		if (ret != IBT_SUCCESS) {
   1318 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
   1319 			    "failed, size = %d: %d", rcqattr.cq_size, ret);
   1320 			(void) ibt_free_cq(ep->ep_sendcq);
   1321 			ep->ep_sendcq = NULL;
   1322 			mutex_exit(&ep->ep_lock);
   1323 			return (NULL);
   1324 		}
   1325 
   1326 		(void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
   1327 		    ep);
   1328 
   1329 		ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
   1330 		if (ret != IBT_SUCCESS) {
   1331 			RDS_DPRINTF2(LABEL,
   1332 			    "ibt_enable_cq_notify failed: %d", ret);
   1333 			(void) ibt_free_cq(ep->ep_recvcq);
   1334 			ep->ep_recvcq = NULL;
   1335 			(void) ibt_free_cq(ep->ep_sendcq);
   1336 			ep->ep_sendcq = NULL;
   1337 			mutex_exit(&ep->ep_lock);
   1338 			return (NULL);
   1339 		}
   1340 	}
   1341 
   1342 	chanargs.rc_flags = IBT_ALL_SIGNALED;
   1343 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
   1344 	    IBT_CEP_ATOMIC;
   1345 	chanargs.rc_hca_port_num = hca_port;
   1346 	chanargs.rc_scq = ep->ep_sendcq;
   1347 	chanargs.rc_rcq = ep->ep_recvcq;
   1348 	chanargs.rc_pd = hcap->hca_pdhdl;
   1349 	chanargs.rc_srq = NULL;
   1350 
   1351 	ret = ibt_alloc_rc_channel(hcap->hca_hdl,
   1352 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
   1353 	if (ret != IBT_SUCCESS) {
   1354 		RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
   1355 		    ret);
   1356 		(void) ibt_free_cq(ep->ep_recvcq);
   1357 		ep->ep_recvcq = NULL;
   1358 		(void) ibt_free_cq(ep->ep_sendcq);
   1359 		ep->ep_sendcq = NULL;
   1360 		mutex_exit(&ep->ep_lock);
   1361 		return (NULL);
   1362 	}
   1363 	mutex_exit(&ep->ep_lock);
   1364 
   1365 	/* Chan private should contain the ep */
   1366 	(void) ibt_set_chan_private(chanhdl, ep);
   1367 
   1368 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
   1369 
   1370 	return (chanhdl);
   1371 }
   1372 
   1373 
   1374 #if 0
   1375 
   1376 /* Return node guid given a port gid */
   1377 ib_guid_t
   1378 rds_gid_to_node_guid(ib_gid_t gid)
   1379 {
   1380 	ibt_node_info_t	nodeinfo;
   1381 	int		ret;
   1382 
   1383 	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
   1384 	    gid.gid_prefix, gid.gid_guid);
   1385 
   1386 	ret = ibt_gid_to_node_info(gid, &nodeinfo);
   1387 	if (ret != IBT_SUCCESS) {
   1388 		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
   1389 		    "failed", gid.gid_prefix, gid.gid_guid);
   1390 		return (0LL);
   1391 	}
   1392 
   1393 	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
   1394 	    nodeinfo.n_node_guid);
   1395 
   1396 	return (nodeinfo.n_node_guid);
   1397 }
   1398 
   1399 #endif
   1400 
   1401 static void
   1402 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
   1403     ibt_async_event_t *event)
   1404 {
   1405 	rds_hca_t		*hcap;
   1406 	ibt_hca_portinfo_t	*newpinfop, *oldpinfop;
   1407 	uint_t			newsize, oldsize, nport;
   1408 	ib_gid_t		gid;
   1409 	int			ret;
   1410 
   1411 	RDS_DPRINTF2("rds_handle_portup_event",
   1412 	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
   1413 
   1414 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
   1415 
   1416 	hcap = statep->rds_hcalistp;
   1417 	while ((hcap != NULL) && (hcap->hca_guid != event->ev_hca_guid)) {
   1418 		hcap = hcap->hca_nextp;
   1419 	}
   1420 
   1421 	if (hcap == NULL) {
   1422 		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
   1423 		    "not in our list", event->ev_hca_guid);
   1424 		rw_exit(&statep->rds_hca_lock);
   1425 		return;
   1426 	}
   1427 
   1428 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
   1429 	if (ret != IBT_SUCCESS) {
   1430 		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
   1431 		rw_exit(&statep->rds_hca_lock);
   1432 		return;
   1433 	}
   1434 
   1435 	oldpinfop = hcap->hca_pinfop;
   1436 	oldsize = hcap->hca_pinfo_sz;
   1437 	hcap->hca_pinfop = newpinfop;
   1438 	hcap->hca_pinfo_sz = newsize;
   1439 
   1440 	(void) ibt_free_portinfo(oldpinfop, oldsize);
   1441 
   1442 	/* If RDS service is not registered then no bind is needed */
   1443 	if (statep->rds_srvhdl == NULL) {
   1444 		RDS_DPRINTF2("rds_handle_portup_event",
   1445 		    "RDS Service is not registered, so no action needed");
   1446 		rw_exit(&statep->rds_hca_lock);
   1447 		return;
   1448 	}
   1449 
   1450 	/*
   1451 	 * If the service was previously bound on this port and
   1452 	 * if this port has changed state down and now up, we do not
   1453 	 * need to bind the service again. The bind is expected to
   1454 	 * persist across state changes. If the service was never bound
   1455 	 * before then we bind it this time.
   1456 	 */
   1457 	if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) {
   1458 
   1459 		/* structure copy */
   1460 		gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
   1461 
   1462 		/* bind RDS service on the port, pass statep as cm_private */
   1463 		ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep,
   1464 		    &hcap->hca_bindhdl[event->ev_port - 1]);
   1465 		if (ret != IBT_SUCCESS) {
   1466 			RDS_DPRINTF2("rds_handle_portup_event",
   1467 			    "Bind service for HCA: 0x%llx Port: %d "
   1468 			    "gid %llx:%llx returned: %d", event->ev_hca_guid,
   1469 			    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
   1470 		}
   1471 	}
   1472 
   1473 	rw_exit(&statep->rds_hca_lock);
   1474 
   1475 	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
   1476 	    event->ev_hca_guid);
   1477 }
   1478 
   1479 static void
   1480 rdsib_add_hca(ib_guid_t hca_guid)
   1481 {
   1482 	rds_hca_t	*hcap;
   1483 	ibt_mr_attr_t	mem_attr;
   1484 	ibt_mr_desc_t	mem_desc;
   1485 	int		ret;
   1486 
   1487 	RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid);
   1488 
   1489 	hcap = rdsib_init_hca(hca_guid);
   1490 	if (hcap == NULL)
   1491 		return;
   1492 
   1493 	/* register the recv memory with this hca */
   1494 	mutex_enter(&rds_dpool.pool_lock);
   1495 	if (rds_dpool.pool_memp == NULL) {
   1496 		/* no memory to register */
   1497 		RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
   1498 		mutex_exit(&rds_dpool.pool_lock);
   1499 		return;
   1500 	}
   1501 
   1502 	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp;
   1503 	mem_attr.mr_len = rds_dpool.pool_memsize;
   1504 	mem_attr.mr_as = NULL;
   1505 	mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
   1506 
   1507 	ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr,
   1508 	    &hcap->hca_mrhdl, &mem_desc);
   1509 
   1510 	mutex_exit(&rds_dpool.pool_lock);
   1511 
   1512 	if (ret != IBT_SUCCESS) {
   1513 		RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
   1514 		    ret);
   1515 	} else {
   1516 		rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
   1517 		hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED;
   1518 		hcap->hca_lkey = mem_desc.md_lkey;
   1519 		hcap->hca_rkey = mem_desc.md_rkey;
   1520 		rw_exit(&rdsib_statep->rds_hca_lock);
   1521 	}
   1522 
   1523 	RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid);
   1524 }
   1525 
   1526 void rds_close_this_session(rds_session_t *sp, uint8_t wait);
   1527 int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port);
   1528 
   1529 static void
   1530 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid)
   1531 {
   1532 	rds_session_t	*sp;
   1533 	rds_hca_t	*hcap;
   1534 	rds_hca_state_t	saved_state;
   1535 	int		ret, ix;
   1536 
   1537 	RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid);
   1538 
   1539 	/*
   1540 	 * This should be a write lock as we don't want anyone to get access
   1541 	 * to the hcap while we are modifing its contents
   1542 	 */
   1543 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
   1544 
   1545 	hcap = statep->rds_hcalistp;
   1546 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
   1547 		hcap = hcap->hca_nextp;
   1548 	}
   1549 
   1550 	/* Prevent initiating any new activity on this HCA */
   1551 	ASSERT(hcap != NULL);
   1552 	saved_state = hcap->hca_state;
   1553 	hcap->hca_state = RDS_HCA_STATE_STOPPING;
   1554 
   1555 	rw_exit(&statep->rds_hca_lock);
   1556 
   1557 	/*
   1558 	 * stop the outgoing traffic and close any active sessions on this hca.
   1559 	 * Any pending messages in the SQ will be allowed to complete.
   1560 	 */
   1561 	rw_enter(&statep->rds_sessionlock, RW_READER);
   1562 	sp = statep->rds_sessionlistp;
   1563 	while (sp) {
   1564 		if (sp->session_hca_guid != hca_guid) {
   1565 			sp = sp->session_nextp;
   1566 			continue;
   1567 		}
   1568 
   1569 		rw_enter(&sp->session_lock, RW_WRITER);
   1570 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
   1571 		    sp->session_state);
   1572 		/*
   1573 		 * We are changing the session state in advance. This prevents
   1574 		 * further messages to be posted to the SQ. We then
   1575 		 * send a control message to the remote and tell it close
   1576 		 * the session.
   1577 		 */
   1578 		sp->session_state = RDS_SESSION_STATE_HCA_CLOSING;
   1579 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
   1580 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
   1581 		rw_exit(&sp->session_lock);
   1582 
   1583 		/*
   1584 		 * wait until the sendq is empty then tell the remote to
   1585 		 * close this session. This enables for graceful shutdown of
   1586 		 * the session
   1587 		 */
   1588 		(void) rds_is_sendq_empty(&sp->session_dataep, 2);
   1589 		(void) rds_post_control_message(sp,
   1590 		    RDS_CTRL_CODE_CLOSE_SESSION, 0);
   1591 
   1592 		sp = sp->session_nextp;
   1593 	}
   1594 
   1595 	/* wait until all the sessions are off this HCA */
   1596 	sp = statep->rds_sessionlistp;
   1597 	while (sp) {
   1598 		if (sp->session_hca_guid != hca_guid) {
   1599 			sp = sp->session_nextp;
   1600 			continue;
   1601 		}
   1602 
   1603 		rw_enter(&sp->session_lock, RW_READER);
   1604 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
   1605 		    sp->session_state);
   1606 
   1607 		while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) ||
   1608 		    (sp->session_state == RDS_SESSION_STATE_ERROR) ||
   1609 		    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) ||
   1610 		    (sp->session_state == RDS_SESSION_STATE_CLOSED)) {
   1611 			rw_exit(&sp->session_lock);
   1612 			delay(drv_usectohz(1000000));
   1613 			rw_enter(&sp->session_lock, RW_READER);
   1614 			RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
   1615 			    sp->session_state);
   1616 		}
   1617 
   1618 		rw_exit(&sp->session_lock);
   1619 
   1620 		sp = sp->session_nextp;
   1621 	}
   1622 	rw_exit(&statep->rds_sessionlock);
   1623 
   1624 	/*
   1625 	 * if rdsib_close_ib was called before this, then that would have
   1626 	 * unbound the service on all ports. In that case, the HCA structs
   1627 	 * will contain stale bindhdls. Hence, we do not call unbind unless
   1628 	 * the service is still registered.
   1629 	 */
   1630 	if (statep->rds_srvhdl != NULL) {
   1631 		/* unbind RDS service on all ports on this HCA */
   1632 		for (ix = 0; ix < hcap->hca_nports; ix++) {
   1633 			if (hcap->hca_bindhdl[ix] == NULL) {
   1634 				continue;
   1635 			}
   1636 
   1637 			RDS_DPRINTF2("rdsib_del_hca",
   1638 			    "Unbinding Service: port: %d, bindhdl: %p",
   1639 			    ix + 1, hcap->hca_bindhdl[ix]);
   1640 			(void) ibt_unbind_service(rdsib_statep->rds_srvhdl,
   1641 			    hcap->hca_bindhdl[ix]);
   1642 			hcap->hca_bindhdl[ix] = NULL;
   1643 		}
   1644 	}
   1645 
   1646 	RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap,
   1647 	    hcap->hca_state);
   1648 
   1649 	switch (saved_state) {
   1650 	case RDS_HCA_STATE_MEM_REGISTERED:
   1651 		ASSERT(hcap->hca_mrhdl != NULL);
   1652 		ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl);
   1653 		if (ret != IBT_SUCCESS) {
   1654 			RDS_DPRINTF2("rdsib_del_hca",
   1655 			    "ibt_deregister_mr failed: %d", ret);
   1656 			return;
   1657 		}
   1658 		hcap->hca_mrhdl = NULL;
   1659 		/* FALLTHRU */
   1660 	case RDS_HCA_STATE_OPEN:
   1661 		ASSERT(hcap->hca_hdl != NULL);
   1662 		ASSERT(hcap->hca_pdhdl != NULL);
   1663 
   1664 
   1665 		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
   1666 		if (ret != IBT_SUCCESS) {
   1667 			RDS_DPRINTF2("rdsib_del_hca",
   1668 			    "ibt_free_pd failed: %d", ret);
   1669 		}
   1670 
   1671 		(void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
   1672 
   1673 		ret = ibt_close_hca(hcap->hca_hdl);
   1674 		if (ret != IBT_SUCCESS) {
   1675 			RDS_DPRINTF2("rdsib_del_hca",
   1676 			    "ibt_close_hca failed: %d", ret);
   1677 		}
   1678 
   1679 		hcap->hca_hdl = NULL;
   1680 		hcap->hca_pdhdl = NULL;
   1681 		hcap->hca_lkey = 0;
   1682 		hcap->hca_rkey = 0;
   1683 	}
   1684 
   1685 	/*
   1686 	 * This should be a write lock as we don't want anyone to get access
   1687 	 * to the hcap while we are modifing its contents
   1688 	 */
   1689 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
   1690 	hcap->hca_state = RDS_HCA_STATE_REMOVED;
   1691 	rw_exit(&statep->rds_hca_lock);
   1692 
   1693 	RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid);
   1694 }
   1695 
   1696 static void
   1697 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
   1698     ibt_async_event_t *event)
   1699 {
   1700 	rds_state_t		*statep = (rds_state_t *)clntp;
   1701 
   1702 	RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
   1703 
   1704 	switch (code) {
   1705 	case IBT_EVENT_PORT_UP:
   1706 		rds_handle_portup_event(statep, hdl, event);
   1707 		break;
   1708 	case IBT_HCA_ATTACH_EVENT:
   1709 		/*
   1710 		 * NOTE: In some error recovery paths, it is possible to
   1711 		 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
   1712 		 */
   1713 		(void) rdsib_add_hca(event->ev_hca_guid);
   1714 		break;
   1715 	case IBT_HCA_DETACH_EVENT:
   1716 		(void) rdsib_del_hca(statep, event->ev_hca_guid);
   1717 		break;
   1718 
   1719 	default:
   1720 		RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
   1721 	}
   1722 
   1723 	RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
   1724 }
   1725 
   1726 /*
   1727  * This routine exists to minimize stale connections across ungraceful
   1728  * reboots of nodes in a cluster.
   1729  */
   1730 void
   1731 rds_randomize_qps(rds_hca_t *hcap)
   1732 {
   1733 	ibt_cq_attr_t			cqattr;
   1734 	ibt_rc_chan_alloc_args_t	chanargs;
   1735 	ibt_channel_hdl_t		qp1, qp2;
   1736 	ibt_cq_hdl_t			cq_hdl;
   1737 	hrtime_t			nsec;
   1738 	uint8_t				i, j, rand1, rand2;
   1739 	int				ret;
   1740 
   1741 	bzero(&cqattr, sizeof (ibt_cq_attr_t));
   1742 	cqattr.cq_size = 1;
   1743 	cqattr.cq_sched = NULL;
   1744 	cqattr.cq_flags = IBT_CQ_NO_FLAGS;
   1745 	ret = ibt_alloc_cq(hcap->hca_hdl, &cqattr, &cq_hdl, NULL);
   1746 	if (ret != IBT_SUCCESS) {
   1747 		RDS_DPRINTF2("rds_randomize_qps",
   1748 		    "ibt_alloc_cq failed: %d", ret);
   1749 		return;
   1750 	}
   1751 
   1752 	bzero(&chanargs, sizeof (ibt_rc_chan_alloc_args_t));
   1753 	chanargs.rc_flags = IBT_ALL_SIGNALED;
   1754 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
   1755 	    IBT_CEP_ATOMIC;
   1756 	chanargs.rc_hca_port_num = 1;
   1757 	chanargs.rc_scq = cq_hdl;
   1758 	chanargs.rc_rcq = cq_hdl;
   1759 	chanargs.rc_pd = hcap->hca_pdhdl;
   1760 	chanargs.rc_srq = NULL;
   1761 
   1762 	nsec = gethrtime();
   1763 	rand1 = (nsec & 0xF);
   1764 	rand2 = (nsec >> 4) & 0xF;
   1765 	RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
   1766 	    rand1, rand2);
   1767 
   1768 	for (i = 0; i < rand1 + 3; i++) {
   1769 		if (ibt_alloc_rc_channel(hcap->hca_hdl,
   1770 		    IBT_ACHAN_NO_FLAGS, &chanargs, &qp1, NULL) !=
   1771 		    IBT_SUCCESS) {
   1772 			RDS_DPRINTF2("rds_randomize_qps",
   1773 			    "Bailing at i: %d", i);
   1774 			(void) ibt_free_cq(cq_hdl);
   1775 			return;
   1776 		}
   1777 		for (j = 0; j < rand2 + 3; j++) {
   1778 			if (ibt_alloc_rc_channel(hcap->hca_hdl,
   1779 			    IBT_ACHAN_NO_FLAGS, &chanargs, &qp2,
   1780 			    NULL) != IBT_SUCCESS) {
   1781 				RDS_DPRINTF2("rds_randomize_qps",
   1782 				    "Bailing at i: %d j: %d", i, j);
   1783 				(void) ibt_free_channel(qp1);
   1784 				(void) ibt_free_cq(cq_hdl);
   1785 				return;
   1786 			}
   1787 			(void) ibt_free_channel(qp2);
   1788 		}
   1789 		(void) ibt_free_channel(qp1);
   1790 	}
   1791 
   1792 	(void) ibt_free_cq(cq_hdl);
   1793 }
   1794