Home | History | Annotate | Download | only in rds
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
     27  *
     28  * This software is available to you under a choice of one of two
     29  * licenses.  You may choose to be licensed under the terms of the GNU
     30  * General Public License (GPL) Version 2, available from the file
     31  * COPYING in the main directory of this source tree, or the
     32  * OpenIB.org BSD license below:
     33  *
     34  *     Redistribution and use in source and binary forms, with or
     35  *     without modification, are permitted provided that the following
     36  *     conditions are met:
     37  *
     38  *	- Redistributions of source code must retain the above
     39  *	  copyright notice, this list of conditions and the following
     40  *	  disclaimer.
     41  *
     42  *	- Redistributions in binary form must reproduce the above
     43  *	  copyright notice, this list of conditions and the following
     44  *	  disclaimer in the documentation and/or other materials
     45  *	  provided with the distribution.
     46  *
     47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     54  * SOFTWARE.
     55  *
     56  */
     57 /*
     58  * Sun elects to include this software in Sun product
     59  * under the OpenIB BSD license.
     60  *
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     72  * POSSIBILITY OF SUCH DAMAGE.
     73  */
     74 
     75 #include <sys/stream.h>
     76 #include <sys/ib/clients/rds/rdsib_cm.h>
     77 #include <sys/ib/clients/rds/rdsib_ib.h>
     78 #include <sys/ib/clients/rds/rdsib_buf.h>
     79 #include <sys/ib/clients/rds/rdsib_ep.h>
     80 #include <sys/ib/clients/rds/rds_kstat.h>
     81 #include <sys/zone.h>
     82 
     83 #define	RDS_POLL_CQ_IN_2TICKS	1
     84 
/*
 * This file contains the endpoint-related calls.
 */
     88 
     89 extern boolean_t rds_islocal(ipaddr_t addr);
     90 extern uint_t rds_wc_signal;
     91 
     92 #define	RDS_LOOPBACK	0
     93 #define	RDS_LOCAL	1
     94 #define	RDS_REMOTE	2
     95 
     96 #define	IBT_IPADDR	1
     97 
     98 static uint8_t
     99 rds_is_port_marked(rds_session_t *sp, in_port_t port, uint_t qualifier)
    100 {
    101 	uint8_t	ret;
    102 
    103 	switch (qualifier) {
    104 	case RDS_LOOPBACK: /* loopback */
    105 		rw_enter(&rds_loopback_portmap_lock, RW_READER);
    106 		ret = (rds_loopback_portmap[port/8] & (1 << (port % 8)));
    107 		rw_exit(&rds_loopback_portmap_lock);
    108 		break;
    109 
    110 	case RDS_LOCAL: /* Session local */
    111 		ASSERT(sp != NULL);
    112 		rw_enter(&sp->session_local_portmap_lock, RW_READER);
    113 		ret = (sp->session_local_portmap[port/8] & (1 << (port % 8)));
    114 		rw_exit(&sp->session_local_portmap_lock);
    115 		break;
    116 
    117 	case RDS_REMOTE: /* Session remote */
    118 		ASSERT(sp != NULL);
    119 		rw_enter(&sp->session_remote_portmap_lock, RW_READER);
    120 		ret = (sp->session_remote_portmap[port/8] & (1 << (port % 8)));
    121 		rw_exit(&sp->session_remote_portmap_lock);
    122 		break;
    123 	}
    124 
    125 	return (ret);
    126 }
    127 
    128 static uint8_t
    129 rds_check_n_mark_port(rds_session_t *sp, in_port_t port, uint_t qualifier)
    130 {
    131 	uint8_t	ret;
    132 
    133 	switch (qualifier) {
    134 	case RDS_LOOPBACK: /* loopback */
    135 		rw_enter(&rds_loopback_portmap_lock, RW_WRITER);
    136 		ret = (rds_loopback_portmap[port/8] & (1 << (port % 8)));
    137 		if (!ret) {
    138 			/* port is not marked, mark it */
    139 			rds_loopback_portmap[port/8] =
    140 			    rds_loopback_portmap[port/8] | (1 << (port % 8));
    141 		}
    142 		rw_exit(&rds_loopback_portmap_lock);
    143 		break;
    144 
    145 	case RDS_LOCAL: /* Session local */
    146 		ASSERT(sp != NULL);
    147 		rw_enter(&sp->session_local_portmap_lock, RW_WRITER);
    148 		ret = (sp->session_local_portmap[port/8] & (1 << (port % 8)));
    149 		if (!ret) {
    150 			/* port is not marked, mark it */
    151 			sp->session_local_portmap[port/8] =
    152 			    sp->session_local_portmap[port/8] |
    153 			    (1 << (port % 8));
    154 		}
    155 		rw_exit(&sp->session_local_portmap_lock);
    156 		break;
    157 
    158 	case RDS_REMOTE: /* Session remote */
    159 		ASSERT(sp != NULL);
    160 		rw_enter(&sp->session_remote_portmap_lock, RW_WRITER);
    161 		ret = (sp->session_remote_portmap[port/8] & (1 << (port % 8)));
    162 		if (!ret) {
    163 			/* port is not marked, mark it */
    164 			sp->session_remote_portmap[port/8] =
    165 			    sp->session_remote_portmap[port/8] |
    166 			    (1 << (port % 8));
    167 		}
    168 		rw_exit(&sp->session_remote_portmap_lock);
    169 		break;
    170 	}
    171 
    172 	return (ret);
    173 }
    174 
    175 static uint8_t
    176 rds_check_n_unmark_port(rds_session_t *sp, in_port_t port, uint_t qualifier)
    177 {
    178 	uint8_t	ret;
    179 
    180 	switch (qualifier) {
    181 	case RDS_LOOPBACK: /* loopback */
    182 		rw_enter(&rds_loopback_portmap_lock, RW_WRITER);
    183 		ret = (rds_loopback_portmap[port/8] & (1 << (port % 8)));
    184 		if (ret) {
    185 			/* port is marked, unmark it */
    186 			rds_loopback_portmap[port/8] =
    187 			    rds_loopback_portmap[port/8] & ~(1 << (port % 8));
    188 		}
    189 		rw_exit(&rds_loopback_portmap_lock);
    190 		break;
    191 
    192 	case RDS_LOCAL: /* Session local */
    193 		ASSERT(sp != NULL);
    194 		rw_enter(&sp->session_local_portmap_lock, RW_WRITER);
    195 		ret = (sp->session_local_portmap[port/8] & (1 << (port % 8)));
    196 		if (ret) {
    197 			/* port is marked, unmark it */
    198 			sp->session_local_portmap[port/8] =
    199 			    sp->session_local_portmap[port/8] &
    200 			    ~(1 << (port % 8));
    201 		}
    202 		rw_exit(&sp->session_local_portmap_lock);
    203 		break;
    204 
    205 	case RDS_REMOTE: /* Session remote */
    206 		ASSERT(sp != NULL);
    207 		rw_enter(&sp->session_remote_portmap_lock, RW_WRITER);
    208 		ret = (sp->session_remote_portmap[port/8] & (1 << (port % 8)));
    209 		if (ret) {
    210 			/* port is marked, unmark it */
    211 			sp->session_remote_portmap[port/8] =
    212 			    sp->session_remote_portmap[port/8] &
    213 			    ~(1 << (port % 8));
    214 		}
    215 		rw_exit(&sp->session_remote_portmap_lock);
    216 		break;
    217 	}
    218 
    219 	return (ret);
    220 }
    221 
    222 static void
    223 rds_mark_all_ports(rds_session_t *sp, uint_t qualifier)
    224 {
    225 	switch (qualifier) {
    226 	case RDS_LOOPBACK: /* loopback */
    227 		rw_enter(&rds_loopback_portmap_lock, RW_WRITER);
    228 		(void) memset(rds_loopback_portmap, 0xFF, RDS_PORT_MAP_SIZE);
    229 		rw_exit(&rds_loopback_portmap_lock);
    230 		break;
    231 
    232 	case RDS_LOCAL: /* Session local */
    233 		ASSERT(sp != NULL);
    234 		rw_enter(&sp->session_local_portmap_lock, RW_WRITER);
    235 		(void) memset(sp->session_local_portmap, 0xFF,
    236 		    RDS_PORT_MAP_SIZE);
    237 		rw_exit(&sp->session_local_portmap_lock);
    238 		break;
    239 
    240 	case RDS_REMOTE: /* Session remote */
    241 		ASSERT(sp != NULL);
    242 		rw_enter(&sp->session_remote_portmap_lock, RW_WRITER);
    243 		(void) memset(sp->session_remote_portmap, 0xFF,
    244 		    RDS_PORT_MAP_SIZE);
    245 		rw_exit(&sp->session_remote_portmap_lock);
    246 		break;
    247 	}
    248 }
    249 
    250 static void
    251 rds_unmark_all_ports(rds_session_t *sp, uint_t qualifier)
    252 {
    253 	switch (qualifier) {
    254 	case RDS_LOOPBACK: /* loopback */
    255 		rw_enter(&rds_loopback_portmap_lock, RW_WRITER);
    256 		bzero(rds_loopback_portmap, RDS_PORT_MAP_SIZE);
    257 		rw_exit(&rds_loopback_portmap_lock);
    258 		break;
    259 
    260 	case RDS_LOCAL: /* Session local */
    261 		ASSERT(sp != NULL);
    262 		rw_enter(&sp->session_local_portmap_lock, RW_WRITER);
    263 		bzero(sp->session_local_portmap, RDS_PORT_MAP_SIZE);
    264 		rw_exit(&sp->session_local_portmap_lock);
    265 		break;
    266 
    267 	case RDS_REMOTE: /* Session remote */
    268 		ASSERT(sp != NULL);
    269 		rw_enter(&sp->session_remote_portmap_lock, RW_WRITER);
    270 		bzero(sp->session_remote_portmap, RDS_PORT_MAP_SIZE);
    271 		rw_exit(&sp->session_remote_portmap_lock);
    272 		break;
    273 	}
    274 }
    275 
    276 static boolean_t
    277 rds_add_session(rds_session_t *sp, boolean_t locked)
    278 {
    279 	boolean_t retval = B_TRUE;
    280 
    281 	RDS_DPRINTF2("rds_add_session", "Enter: SP(%p)", sp);
    282 
    283 	if (!locked) {
    284 		rw_enter(&rdsib_statep->rds_sessionlock, RW_WRITER);
    285 	}
    286 
    287 	/* Don't allow more sessions than configured in rdsib.conf */
    288 	if (rdsib_statep->rds_nsessions >= (MaxNodes - 1)) {
    289 		RDS_DPRINTF1("rds_add_session", "Max session limit reached");
    290 		retval = B_FALSE;
    291 	} else {
    292 		sp->session_nextp = rdsib_statep->rds_sessionlistp;
    293 		rdsib_statep->rds_sessionlistp = sp;
    294 		rdsib_statep->rds_nsessions++;
    295 		RDS_INCR_SESS();
    296 	}
    297 
    298 	if (!locked) {
    299 		rw_exit(&rdsib_statep->rds_sessionlock);
    300 	}
    301 
    302 	RDS_DPRINTF2("rds_add_session", "Return: SP(%p)", sp);
    303 
    304 	return (retval);
    305 }
    306 
    307 /* Session lookup based on destination IP or destination node guid */
    308 rds_session_t *
    309 rds_session_lkup(rds_state_t *statep, ipaddr_t remoteip, ib_guid_t node_guid)
    310 {
    311 	rds_session_t	*sp;
    312 
    313 	RDS_DPRINTF4("rds_session_lkup", "Enter: 0x%p 0x%x 0x%llx", statep,
    314 	    remoteip, node_guid);
    315 
    316 	/* A read/write lock is expected, will panic if none of them are held */
    317 	ASSERT(rw_lock_held(&statep->rds_sessionlock));
    318 	sp = statep->rds_sessionlistp;
    319 	while (sp) {
    320 		if ((sp->session_remip == remoteip) || ((node_guid != 0) &&
    321 		    (sp->session_rgid.gid_guid == node_guid))) {
    322 			break;
    323 		}
    324 
    325 		sp = sp->session_nextp;
    326 	}
    327 
    328 	RDS_DPRINTF4("rds_session_lkup", "Return: SP(%p)", sp);
    329 
    330 	return (sp);
    331 }
    332 
    333 boolean_t
    334 rds_session_lkup_by_sp(rds_session_t *sp)
    335 {
    336 	rds_session_t *sessionp;
    337 
    338 	RDS_DPRINTF4("rds_session_lkup_by_sp", "Enter: 0x%p", sp);
    339 
    340 	rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
    341 	sessionp = rdsib_statep->rds_sessionlistp;
    342 	while (sessionp) {
    343 		if (sessionp == sp) {
    344 			rw_exit(&rdsib_statep->rds_sessionlock);
    345 			return (B_TRUE);
    346 		}
    347 
    348 		sessionp = sessionp->session_nextp;
    349 	}
    350 	rw_exit(&rdsib_statep->rds_sessionlock);
    351 
    352 	return (B_FALSE);
    353 }
    354 
    355 static void
    356 rds_ep_fini(rds_ep_t *ep)
    357 {
    358 	RDS_DPRINTF3("rds_ep_fini", "Enter: EP(%p) type: %d", ep, ep->ep_type);
    359 
    360 	/* free send pool */
    361 	rds_free_send_pool(ep);
    362 
    363 	/* free recv pool */
    364 	rds_free_recv_pool(ep);
    365 
    366 	mutex_enter(&ep->ep_lock);
    367 	ep->ep_hca_guid = 0;
    368 	mutex_exit(&ep->ep_lock);
    369 
    370 	RDS_DPRINTF3("rds_ep_fini", "Return EP(%p)", ep);
    371 }
    372 
    373 /* Assumes SP write lock is held */
    374 int
    375 rds_ep_init(rds_ep_t *ep, ib_guid_t hca_guid)
    376 {
    377 	uint_t		ret;
    378 
    379 	RDS_DPRINTF3("rds_ep_init", "Enter: EP(%p) Type: %d", ep, ep->ep_type);
    380 
    381 	/* send pool */
    382 	ret = rds_init_send_pool(ep, hca_guid);
    383 	if (ret != 0) {
    384 		RDS_DPRINTF2(LABEL, "EP(%p): rds_init_send_pool failed: %d",
    385 		    ep, ret);
    386 		return (-1);
    387 	}
    388 
    389 	/* recv pool */
    390 	ret = rds_init_recv_pool(ep);
    391 	if (ret != 0) {
    392 		RDS_DPRINTF2(LABEL, "EP(%p): rds_init_recv_pool failed: %d",
    393 		    ep, ret);
    394 		rds_free_send_pool(ep);
    395 		return (-1);
    396 	}
    397 
    398 	/* reset the ep state */
    399 	mutex_enter(&ep->ep_lock);
    400 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
    401 	ep->ep_hca_guid = hca_guid;
    402 	ep->ep_lbufid = NULL;
    403 	ep->ep_rbufid = NULL;
    404 	ep->ep_segfbp = NULL;
    405 	ep->ep_seglbp = NULL;
    406 
    407 	/* Initialize the WR to send acknowledgements */
    408 	ep->ep_ackwr.wr_id = RDS_RDMAW_WRID;
    409 	ep->ep_ackwr.wr_flags = IBT_WR_SEND_SOLICIT;
    410 	ep->ep_ackwr.wr_trans = IBT_RC_SRV;
    411 	ep->ep_ackwr.wr_opcode = IBT_WRC_RDMAW;
    412 	ep->ep_ackwr.wr_nds = 1;
    413 	ep->ep_ackwr.wr_sgl = &ep->ep_ackds;
    414 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = NULL;
    415 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = 0;
    416 	mutex_exit(&ep->ep_lock);
    417 
    418 	RDS_DPRINTF3("rds_ep_init", "Return: EP(%p) type: %d", ep, ep->ep_type);
    419 
    420 	return (0);
    421 }
    422 
    423 static int
    424 rds_ep_reinit(rds_ep_t *ep, ib_guid_t hca_guid)
    425 {
    426 	int	ret;
    427 
    428 	RDS_DPRINTF3("rds_ep_reinit", "Enter: EP(%p) Type: %d",
    429 	    ep, ep->ep_type);
    430 
    431 	/* Re-initialize send pool */
    432 	ret = rds_reinit_send_pool(ep, hca_guid);
    433 	if (ret != 0) {
    434 		RDS_DPRINTF2("rds_ep_reinit",
    435 		    "EP(%p): rds_reinit_send_pool failed: %d", ep, ret);
    436 		return (-1);
    437 	}
    438 
    439 	/* free all the receive buffers in the pool */
    440 	rds_free_recv_pool(ep);
    441 
    442 	RDS_DPRINTF3("rds_ep_reinit", "Return: EP(%p) Type: %d",
    443 	    ep, ep->ep_type);
    444 
    445 	return (0);
    446 }
    447 
    448 void
    449 rds_session_fini(rds_session_t *sp)
    450 {
    451 	RDS_DPRINTF2("rds_session_fini", "Enter: SP(0x%p)", sp);
    452 
    453 	rds_ep_fini(&sp->session_dataep);
    454 	rds_ep_fini(&sp->session_ctrlep);
    455 
    456 	RDS_DPRINTF2("rds_session_fini", "Return: SP(0x%p)", sp);
    457 }
    458 
    459 /*
    460  * Allocate and initialize the resources needed for the control and
    461  * data channels
    462  */
    463 int
    464 rds_session_init(rds_session_t *sp)
    465 {
    466 	int		ret;
    467 	rds_hca_t	*hcap;
    468 	ib_guid_t	hca_guid;
    469 
    470 	RDS_DPRINTF2("rds_session_init", "Enter: SP(0x%p)", sp);
    471 
    472 	/* CALLED WITH SESSION WRITE LOCK */
    473 
    474 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
    475 	if (hcap == NULL) {
    476 		RDS_DPRINTF2("rds_session_init", "SGID is on an uninitialized "
    477 		    "HCA: %llx", sp->session_lgid.gid_guid);
    478 		return (-1);
    479 	}
    480 
    481 	hca_guid = hcap->hca_guid;
    482 	sp->session_hca_guid = hca_guid;
    483 
    484 	/* allocate and initialize the ctrl channel */
    485 	ret = rds_ep_init(&sp->session_ctrlep, hca_guid);
    486 	if (ret != 0) {
    487 		RDS_DPRINTF2(LABEL, "SP(%p): Ctrl EP(%p) initialization "
    488 		    "failed", sp, &sp->session_ctrlep);
    489 		return (-1);
    490 	}
    491 
    492 	RDS_DPRINTF2(LABEL, "SP(%p) Control EP(%p)", sp, &sp->session_ctrlep);
    493 
    494 	/* allocate and initialize the data channel */
    495 	ret = rds_ep_init(&sp->session_dataep, hca_guid);
    496 	if (ret != 0) {
    497 		RDS_DPRINTF2(LABEL, "SP(%p): Data EP(%p) initialization "
    498 		    "failed", sp, &sp->session_dataep);
    499 		rds_ep_fini(&sp->session_ctrlep);
    500 		return (-1);
    501 	}
    502 
    503 	/* Clear the portmaps */
    504 	rds_unmark_all_ports(sp, RDS_LOCAL);
    505 	rds_unmark_all_ports(sp, RDS_REMOTE);
    506 
    507 	RDS_DPRINTF2(LABEL, "SP(%p) Data EP(%p)", sp, &sp->session_dataep);
    508 
    509 	RDS_DPRINTF2("rds_session_init", "Return");
    510 
    511 	return (0);
    512 }
    513 
    514 /*
    515  * This should be called before moving a session from ERROR state to
    516  * INIT state. This will update the HCA keys incase the session has moved from
    517  * one HCA to another.
    518  */
    519 int
    520 rds_session_reinit(rds_session_t *sp, ib_gid_t lgid)
    521 {
    522 	rds_hca_t	*hcap, *hcap1;
    523 	int		ret;
    524 
    525 	RDS_DPRINTF2("rds_session_reinit", "Enter: SP(0x%p) - state: %d",
    526 	    sp, sp->session_state);
    527 
    528 	/* CALLED WITH SESSION WRITE LOCK */
    529 
    530 	/* Clear the portmaps */
    531 	rds_unmark_all_ports(sp, RDS_LOCAL);
    532 	rds_unmark_all_ports(sp, RDS_REMOTE);
    533 
    534 	/* This should not happen but just a safe guard */
    535 	if (sp->session_dataep.ep_ack_addr == NULL) {
    536 		RDS_DPRINTF2("rds_session_reinit",
    537 		    "ERROR: Unexpected: SP(0x%p) - state: %d",
    538 		    sp, sp->session_state);
    539 		return (-1);
    540 	}
    541 
    542 	/* make the last buffer as the acknowledged */
    543 	*(uintptr_t *)sp->session_dataep.ep_ack_addr =
    544 	    (uintptr_t)sp->session_dataep.ep_sndpool.pool_tailp;
    545 
    546 	hcap = rds_gid_to_hcap(rdsib_statep, lgid);
    547 	if (hcap == NULL) {
    548 		RDS_DPRINTF2("rds_session_reinit", "SGID is on an "
    549 		    "uninitialized HCA: %llx", lgid.gid_guid);
    550 		return (-1);
    551 	}
    552 
    553 	hcap1 = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
    554 	if (hcap1 == NULL) {
    555 		RDS_DPRINTF2("rds_session_reinit", "Seems like HCA %llx "
    556 		    "is unplugged", sp->session_lgid.gid_guid);
    557 	} else if (hcap->hca_guid == hcap1->hca_guid) {
    558 		/*
    559 		 * No action is needed as the session did not move across
    560 		 * HCAs
    561 		 */
    562 		RDS_DPRINTF2("rds_session_reinit", "Failover on the same HCA");
    563 		return (0);
    564 	}
    565 
    566 	RDS_DPRINTF2("rds_session_reinit", "Failover across HCAs");
    567 
    568 	sp->session_hca_guid = hcap->hca_guid;
    569 
    570 	/* re-initialize the control channel */
    571 	ret = rds_ep_reinit(&sp->session_ctrlep, hcap->hca_guid);
    572 	if (ret != 0) {
    573 		RDS_DPRINTF2("rds_session_reinit",
    574 		    "SP(%p): Ctrl EP(%p) re-initialization failed",
    575 		    sp, &sp->session_ctrlep);
    576 		return (-1);
    577 	}
    578 
    579 	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Control EP(%p)",
    580 	    sp, &sp->session_ctrlep);
    581 
    582 	/* re-initialize the data channel */
    583 	ret = rds_ep_reinit(&sp->session_dataep, hcap->hca_guid);
    584 	if (ret != 0) {
    585 		RDS_DPRINTF2("rds_session_reinit",
    586 		    "SP(%p): Data EP(%p) re-initialization failed",
    587 		    sp, &sp->session_dataep);
    588 		return (-1);
    589 	}
    590 
    591 	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Data EP(%p)",
    592 	    sp, &sp->session_dataep);
    593 
    594 	sp->session_lgid = lgid;
    595 
    596 	RDS_DPRINTF2("rds_session_reinit", "Return: SP(0x%p)", sp);
    597 
    598 	return (0);
    599 }
    600 
    601 static int
    602 rds_session_connect(rds_session_t *sp)
    603 {
    604 	ibt_channel_hdl_t	ctrlchan, datachan;
    605 	rds_ep_t		*ep;
    606 	int			ret;
    607 
    608 	RDS_DPRINTF2("rds_session_connect", "Enter SP(%p)", sp);
    609 
    610 	sp->session_pinfo.pi_sid = rdsib_statep->rds_service_id;
    611 
    612 	/* Override the packet life time based on the conf file */
    613 	if (IBPktLifeTime != 0) {
    614 		sp->session_pinfo.pi_prim_cep_path.cep_cm_opaque1 =
    615 		    IBPktLifeTime;
    616 	}
    617 
    618 	/* Session type may change if we run into peer-to-peer case. */
    619 	rw_enter(&sp->session_lock, RW_READER);
    620 	if (sp->session_type == RDS_SESSION_PASSIVE) {
    621 		RDS_DPRINTF2("rds_session_connect", "SP(%p) is no longer the "
    622 		    "active end", sp);
    623 		rw_exit(&sp->session_lock);
    624 		return (0); /* return success */
    625 	}
    626 	rw_exit(&sp->session_lock);
    627 
    628 	/* connect the data ep first */
    629 	ep = &sp->session_dataep;
    630 	mutex_enter(&ep->ep_lock);
    631 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
    632 		ep->ep_state = RDS_EP_STATE_ACTIVE_PENDING;
    633 		mutex_exit(&ep->ep_lock);
    634 		ret = rds_open_rc_channel(ep, &sp->session_pinfo, IBT_BLOCKING,
    635 		    &datachan);
    636 		if (ret != IBT_SUCCESS) {
    637 			RDS_DPRINTF2(LABEL, "EP(%p): rds_open_rc_channel "
    638 			    "failed: %d", ep, ret);
    639 			return (-1);
    640 		}
    641 		sp->session_dataep.ep_chanhdl = datachan;
    642 	} else {
    643 		RDS_DPRINTF2(LABEL, "SP(%p) Data EP(%p) is in "
    644 		    "unexpected state: %d", sp, ep, ep->ep_state);
    645 		mutex_exit(&ep->ep_lock);
    646 		return (-1);
    647 	}
    648 
    649 	RDS_DPRINTF3(LABEL, "SP(%p) EP(%p): Data channel is connected",
    650 	    sp, ep);
    651 
    652 	ep = &sp->session_ctrlep;
    653 	mutex_enter(&ep->ep_lock);
    654 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
    655 		ep->ep_state = RDS_EP_STATE_ACTIVE_PENDING;
    656 		mutex_exit(&ep->ep_lock);
    657 		ret = rds_open_rc_channel(ep, &sp->session_pinfo, IBT_BLOCKING,
    658 		    &ctrlchan);
    659 		if (ret != IBT_SUCCESS) {
    660 			RDS_DPRINTF2(LABEL, "EP(%p): rds_open_rc_channel "
    661 			    "failed: %d", ep, ret);
    662 			return (-1);
    663 		}
    664 		sp->session_ctrlep.ep_chanhdl = ctrlchan;
    665 	} else {
    666 		RDS_DPRINTF2(LABEL, "SP(%p) Control EP(%p) is in "
    667 		    "unexpected state: %d", sp, ep, ep->ep_state);
    668 		mutex_exit(&ep->ep_lock);
    669 		return (-1);
    670 	}
    671 
    672 	RDS_DPRINTF2(LABEL, "Session (%p) 0x%x <--> 0x%x is CONNECTED",
    673 	    sp, sp->session_myip, sp->session_remip);
    674 
    675 	RDS_DPRINTF2("rds_session_connect", "Return SP(%p)", sp);
    676 
    677 	return (0);
    678 }
    679 
    680 /*
    681  * Can be called with or without session_lock.
    682  */
    683 void
    684 rds_session_close(rds_session_t *sp, ibt_execution_mode_t mode, uint_t wait)
    685 {
    686 	rds_ep_t		*ep;
    687 
    688 	RDS_DPRINTF2("rds_session_close", "SP(%p) State: %d", sp,
    689 	    sp->session_state);
    690 
    691 	ep = &sp->session_dataep;
    692 	RDS_DPRINTF3(LABEL, "EP(%p) State: %d", ep, ep->ep_state);
    693 
    694 	/* wait until the SQ is empty before closing */
    695 	if (wait != 0) {
    696 		(void) rds_is_sendq_empty(ep, wait);
    697 	}
    698 
    699 	mutex_enter(&ep->ep_lock);
    700 	while (ep->ep_state == RDS_EP_STATE_CLOSING) {
    701 		mutex_exit(&ep->ep_lock);
    702 		delay(drv_usectohz(300000));
    703 		mutex_enter(&ep->ep_lock);
    704 	}
    705 
    706 	if (ep->ep_state == RDS_EP_STATE_CONNECTED) {
    707 		ep->ep_state = RDS_EP_STATE_CLOSING;
    708 		mutex_exit(&ep->ep_lock);
    709 		(void) rds_close_rc_channel(ep->ep_chanhdl, mode);
    710 		if (wait == 0) {
    711 			/* make sure all WCs are flushed before proceeding */
    712 			(void) rds_is_sendq_empty(ep, 1);
    713 		}
    714 		mutex_enter(&ep->ep_lock);
    715 	}
    716 	rds_ep_free_rc_channel(ep);
    717 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
    718 	ep->ep_segfbp = NULL;
    719 	ep->ep_seglbp = NULL;
    720 	mutex_exit(&ep->ep_lock);
    721 
    722 	ep = &sp->session_ctrlep;
    723 	RDS_DPRINTF3(LABEL, "EP(%p) State: %d", ep, ep->ep_state);
    724 
    725 	/* wait until the SQ is empty before closing */
    726 	if (wait != 0) {
    727 		(void) rds_is_sendq_empty(ep, wait);
    728 	}
    729 
    730 	mutex_enter(&ep->ep_lock);
    731 	while (ep->ep_state == RDS_EP_STATE_CLOSING) {
    732 		mutex_exit(&ep->ep_lock);
    733 		delay(drv_usectohz(300000));
    734 		mutex_enter(&ep->ep_lock);
    735 	}
    736 
    737 	if (ep->ep_state == RDS_EP_STATE_CONNECTED) {
    738 		ep->ep_state = RDS_EP_STATE_CLOSING;
    739 		mutex_exit(&ep->ep_lock);
    740 		(void) rds_close_rc_channel(ep->ep_chanhdl, mode);
    741 		if (wait == 0) {
    742 			/* make sure all WCs are flushed before proceeding */
    743 			(void) rds_is_sendq_empty(ep, 1);
    744 		}
    745 		mutex_enter(&ep->ep_lock);
    746 	}
    747 	rds_ep_free_rc_channel(ep);
    748 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
    749 	ep->ep_segfbp = NULL;
    750 	ep->ep_seglbp = NULL;
    751 	mutex_exit(&ep->ep_lock);
    752 
    753 	RDS_DPRINTF2("rds_session_close", "Return (%p)", sp);
    754 }
    755 
    756 /* Free the session */
    757 static void
    758 rds_destroy_session(rds_session_t *sp)
    759 {
    760 	rds_ep_t	*ep;
    761 	rds_bufpool_t	*pool;
    762 
    763 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
    764 	    (sp->session_state == RDS_SESSION_STATE_FAILED) ||
    765 	    (sp->session_state == RDS_SESSION_STATE_FINI) ||
    766 	    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING));
    767 
    768 	rw_enter(&sp->session_lock, RW_READER);
    769 	RDS_DPRINTF2("rds_destroy_session", "SP(%p) State: %d", sp,
    770 	    sp->session_state);
    771 	while (!((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
    772 	    (sp->session_state == RDS_SESSION_STATE_FAILED) ||
    773 	    (sp->session_state == RDS_SESSION_STATE_FINI))) {
    774 		rw_exit(&sp->session_lock);
    775 		delay(drv_usectohz(1000000));
    776 		rw_enter(&sp->session_lock, RW_READER);
    777 		RDS_DPRINTF2("rds_destroy_session", "SP(%p) State: %d WAITING "
    778 		    "ON SESSION", sp, sp->session_state);
    779 	}
    780 	rw_exit(&sp->session_lock);
    781 
    782 	/* data channel */
    783 	ep = &sp->session_dataep;
    784 
    785 	/* send pool locks */
    786 	pool = &ep->ep_sndpool;
    787 	cv_destroy(&pool->pool_cv);
    788 	mutex_destroy(&pool->pool_lock);
    789 
    790 	/* recv pool locks */
    791 	pool = &ep->ep_rcvpool;
    792 	cv_destroy(&pool->pool_cv);
    793 	mutex_destroy(&pool->pool_lock);
    794 	mutex_destroy(&ep->ep_recvqp.qp_lock);
    795 
    796 	/* control channel */
    797 	ep = &sp->session_ctrlep;
    798 
    799 	/* send pool locks */
    800 	pool = &ep->ep_sndpool;
    801 	cv_destroy(&pool->pool_cv);
    802 	mutex_destroy(&pool->pool_lock);
    803 
    804 	/* recv pool locks */
    805 	pool = &ep->ep_rcvpool;
    806 	cv_destroy(&pool->pool_cv);
    807 	mutex_destroy(&pool->pool_lock);
    808 	mutex_destroy(&ep->ep_recvqp.qp_lock);
    809 
    810 	/* session */
    811 	rw_destroy(&sp->session_lock);
    812 	rw_destroy(&sp->session_local_portmap_lock);
    813 	rw_destroy(&sp->session_remote_portmap_lock);
    814 
    815 	/* free the session */
    816 	kmem_free(sp, sizeof (rds_session_t));
    817 
    818 	RDS_DPRINTF2("rds_destroy_session", "SP(%p) Return", sp);
    819 }
    820 
    821 /* This is called on the taskq thread */
    822 void
    823 rds_failover_session(void *arg)
    824 {
    825 	rds_session_t	*sp = (rds_session_t *)arg;
    826 	ib_gid_t	lgid, rgid;
    827 	ipaddr_t	myip, remip;
    828 	int		ret, cnt = 0;
    829 	uint8_t		sp_state;
    830 
    831 	RDS_DPRINTF2("rds_failover_session", "Enter: (%p)", sp);
    832 
    833 	/* Make sure the session is still alive */
    834 	if (rds_session_lkup_by_sp(sp) == B_FALSE) {
    835 		RDS_DPRINTF2("rds_failover_session",
    836 		    "Return: SP(%p) not ALIVE", sp);
    837 		return;
    838 	}
    839 
    840 	RDS_INCR_FAILOVERS();
    841 
    842 	rw_enter(&sp->session_lock, RW_WRITER);
    843 	if (sp->session_type != RDS_SESSION_ACTIVE) {
    844 		/*
    845 		 * The remote side must have seen the error and initiated
    846 		 * a re-connect.
    847 		 */
    848 		RDS_DPRINTF2("rds_failover_session",
    849 		    "SP(%p) has become passive", sp);
    850 		rw_exit(&sp->session_lock);
    851 		return;
    852 	}
    853 	sp->session_failover = 1;
    854 	sp_state = sp->session_state;
    855 	rw_exit(&sp->session_lock);
    856 
    857 	/*
    858 	 * The session is in ERROR state but close both channels
    859 	 * for a clean start.
    860 	 */
    861 	if (sp_state == RDS_SESSION_STATE_ERROR) {
    862 		rds_session_close(sp, IBT_BLOCKING, 1);
    863 	}
    864 
    865 	/* wait 1 sec before re-connecting */
    866 	delay(drv_usectohz(1000000));
    867 
    868 	do {
    869 		ibt_ip_path_attr_t	ipattr;
    870 		ibt_ip_addr_t		dstip;
    871 
    872 		/* The ipaddr should be in the network order */
    873 		myip = sp->session_myip;
    874 		remip = sp->session_remip;
    875 		ret = rds_sc_path_lookup(&myip, &remip);
    876 		if (ret == 0) {
    877 			RDS_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
    878 			    myip, remip);
    879 		}
    880 		/* check if we have (new) path from the source to destination */
    881 		lgid.gid_prefix = 0;
    882 		lgid.gid_guid = 0;
    883 		rgid.gid_prefix = 0;
    884 		rgid.gid_guid = 0;
    885 
    886 		bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
    887 		dstip.family = AF_INET;
    888 		dstip.un.ip4addr = remip;
    889 		ipattr.ipa_dst_ip = &dstip;
    890 		ipattr.ipa_src_ip.family = AF_INET;
    891 		ipattr.ipa_src_ip.un.ip4addr = myip;
    892 		ipattr.ipa_ndst = 1;
    893 		ipattr.ipa_max_paths = 1;
    894 		RDS_DPRINTF2(LABEL, "ibt_get_ip_paths: 0x%x <-> 0x%x ",
    895 		    myip, remip);
    896 		ret = ibt_get_ip_paths(rdsib_statep->rds_ibhdl,
    897 		    IBT_PATH_NO_FLAGS, &ipattr, &sp->session_pinfo, NULL, NULL);
    898 		if (ret == IBT_SUCCESS) {
    899 			RDS_DPRINTF2(LABEL, "ibt_get_ip_paths success");
    900 			lgid = sp->session_pinfo.
    901 			    pi_prim_cep_path.cep_adds_vect.av_sgid;
    902 			rgid = sp->session_pinfo.
    903 			    pi_prim_cep_path.cep_adds_vect.av_dgid;
    904 			break;
    905 		}
    906 
    907 		RDS_DPRINTF2(LABEL, "ibt_get_ip_paths failed, ret: %d ", ret);
    908 
    909 		/* wait 1 sec before re-trying */
    910 		delay(drv_usectohz(1000000));
    911 		cnt++;
    912 	} while (cnt < 5);
    913 
    914 	if (ret != IBT_SUCCESS) {
    915 		rw_enter(&sp->session_lock, RW_WRITER);
    916 		if (sp->session_type == RDS_SESSION_ACTIVE) {
    917 			rds_session_fini(sp);
    918 			sp->session_state = RDS_SESSION_STATE_FAILED;
    919 			sp->session_failover = 0;
    920 			RDS_DPRINTF3("rds_failover_session",
    921 			    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
    922 		} else {
    923 			RDS_DPRINTF2("rds_failover_session",
    924 			    "SP(%p) has become passive", sp);
    925 		}
    926 		rw_exit(&sp->session_lock);
    927 		return;
    928 	}
    929 
    930 	RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
    931 	    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
    932 	    rgid.gid_guid);
    933 
    934 	rw_enter(&sp->session_lock, RW_WRITER);
    935 	if (sp->session_type != RDS_SESSION_ACTIVE) {
    936 		/*
    937 		 * The remote side must have seen the error and initiated
    938 		 * a re-connect.
    939 		 */
    940 		RDS_DPRINTF2("rds_failover_session",
    941 		    "SP(%p) has become passive", sp);
    942 		rw_exit(&sp->session_lock);
    943 		return;
    944 	}
    945 
    946 	/* move the session to init state */
    947 	ret = rds_session_reinit(sp, lgid);
    948 	sp->session_lgid = lgid;
    949 	sp->session_rgid = rgid;
    950 	if (ret != 0) {
    951 		rds_session_fini(sp);
    952 		sp->session_state = RDS_SESSION_STATE_FAILED;
    953 		sp->session_failover = 0;
    954 		RDS_DPRINTF3("rds_failover_session",
    955 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
    956 		rw_exit(&sp->session_lock);
    957 		return;
    958 	} else {
    959 		sp->session_state = RDS_SESSION_STATE_INIT;
    960 		RDS_DPRINTF3("rds_failover_session",
    961 		    "SP(%p) State RDS_SESSION_STATE_INIT", sp);
    962 	}
    963 	rw_exit(&sp->session_lock);
    964 
    965 	rds_session_open(sp);
    966 
    967 	RDS_DPRINTF2("rds_failover_session", "Return: (%p)", sp);
    968 }
    969 
    970 void
    971 rds_handle_send_error(rds_ep_t *ep)
    972 {
    973 	if (rds_is_sendq_empty(ep, 0)) {
    974 		/* Session should already be in ERROR, try to reconnect */
    975 		RDS_DPRINTF2("rds_handle_send_error",
    976 		    "Dispatching taskq to failover SP(%p)", ep->ep_sp);
    977 		(void) ddi_taskq_dispatch(rds_taskq, rds_failover_session,
    978 		    (void *)ep->ep_sp, DDI_SLEEP);
    979 	}
    980 }
    981 
    982 /*
    983  * Called in the CM handler on the passive side
    984  * Called on a taskq thread.
    985  */
    986 void
    987 rds_cleanup_passive_session(void *arg)
    988 {
    989 	rds_session_t	*sp = arg;
    990 
    991 	RDS_DPRINTF2("rds_cleanup_passive_session", "SP(%p) State: %d", sp,
    992 	    sp->session_state);
    993 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
    994 	    (sp->session_state == RDS_SESSION_STATE_ERROR));
    995 
    996 	rds_session_close(sp, IBT_BLOCKING, 1);
    997 
    998 	rw_enter(&sp->session_lock, RW_WRITER);
    999 	if (sp->session_state == RDS_SESSION_STATE_CLOSED) {
   1000 		rds_session_fini(sp);
   1001 		sp->session_state = RDS_SESSION_STATE_FINI;
   1002 		sp->session_failover = 0;
   1003 		RDS_DPRINTF3("rds_cleanup_passive_session",
   1004 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
   1005 	} else if (sp->session_state == RDS_SESSION_STATE_ERROR) {
   1006 		rds_session_fini(sp);
   1007 		sp->session_state = RDS_SESSION_STATE_FAILED;
   1008 		sp->session_failover = 0;
   1009 		RDS_DPRINTF3("rds_cleanup_passive_session",
   1010 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
   1011 	}
   1012 	rw_exit(&sp->session_lock);
   1013 
   1014 	RDS_DPRINTF2("rds_cleanup_passive_session", "Return: SP (%p)", sp);
   1015 }
   1016 
   1017 /*
   1018  * Called by the CM handler on the passive side
   1019  * Called with WRITE lock on the session
   1020  */
   1021 void
   1022 rds_passive_session_fini(rds_session_t *sp)
   1023 {
   1024 	rds_ep_t	*ep;
   1025 
   1026 	RDS_DPRINTF2("rds_passive_session_fini", "SP(%p) State: %d", sp,
   1027 	    sp->session_state);
   1028 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
   1029 	    (sp->session_state == RDS_SESSION_STATE_ERROR));
   1030 
   1031 	/* clean the data channel */
   1032 	ep = &sp->session_dataep;
   1033 	(void) rds_is_sendq_empty(ep, 1);
   1034 	mutex_enter(&ep->ep_lock);
   1035 	RDS_DPRINTF2("rds_passive_session_fini", "EP(%p) State: %d", ep,
   1036 	    ep->ep_state);
   1037 	rds_ep_free_rc_channel(ep);
   1038 	mutex_exit(&ep->ep_lock);
   1039 
   1040 	/* clean the control channel */
   1041 	ep = &sp->session_ctrlep;
   1042 	(void) rds_is_sendq_empty(ep, 1);
   1043 	mutex_enter(&ep->ep_lock);
   1044 	RDS_DPRINTF2("rds_passive_session_fini", "EP(%p) State: %d", ep,
   1045 	    ep->ep_state);
   1046 	rds_ep_free_rc_channel(ep);
   1047 	mutex_exit(&ep->ep_lock);
   1048 
   1049 	rds_session_fini(sp);
   1050 	sp->session_failover = 0;
   1051 
   1052 	RDS_DPRINTF2("rds_passive_session_fini", "Return: SP (%p)", sp);
   1053 }
   1054 
   1055 void
   1056 rds_close_this_session(rds_session_t *sp, uint8_t wait)
   1057 {
   1058 	switch (sp->session_state) {
   1059 	case RDS_SESSION_STATE_CONNECTED:
   1060 		sp->session_state = RDS_SESSION_STATE_ACTIVE_CLOSING;
   1061 		rw_exit(&sp->session_lock);
   1062 
   1063 		rds_session_close(sp, IBT_BLOCKING, wait);
   1064 
   1065 		rw_enter(&sp->session_lock, RW_WRITER);
   1066 		sp->session_state = RDS_SESSION_STATE_CLOSED;
   1067 		RDS_DPRINTF3("rds_close_sessions",
   1068 		    "SP(%p) State RDS_SESSION_STATE_CLOSED", sp);
   1069 		rds_session_fini(sp);
   1070 		sp->session_state = RDS_SESSION_STATE_FINI;
   1071 		sp->session_failover = 0;
   1072 		RDS_DPRINTF3("rds_close_sessions",
   1073 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
   1074 		break;
   1075 
   1076 	case RDS_SESSION_STATE_ERROR:
   1077 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
   1078 	case RDS_SESSION_STATE_INIT:
   1079 		sp->session_state = RDS_SESSION_STATE_ACTIVE_CLOSING;
   1080 		rw_exit(&sp->session_lock);
   1081 
   1082 		rds_session_close(sp, IBT_BLOCKING, wait);
   1083 
   1084 		rw_enter(&sp->session_lock, RW_WRITER);
   1085 		sp->session_state = RDS_SESSION_STATE_CLOSED;
   1086 		RDS_DPRINTF3("rds_close_sessions",
   1087 		    "SP(%p) State RDS_SESSION_STATE_CLOSED", sp);
   1088 		/* FALLTHRU */
   1089 	case RDS_SESSION_STATE_CLOSED:
   1090 		rds_session_fini(sp);
   1091 		sp->session_state = RDS_SESSION_STATE_FINI;
   1092 		sp->session_failover = 0;
   1093 		RDS_DPRINTF3("rds_close_sessions",
   1094 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
   1095 		break;
   1096 	}
   1097 }
   1098 
   1099 /*
   1100  * Can be called:
   1101  * 1. on driver detach
   1102  * 2. on taskq thread
   1103  * arg is always NULL
   1104  */
   1105 /* ARGSUSED */
   1106 void
   1107 rds_close_sessions(void *arg)
   1108 {
   1109 	rds_session_t *sp, *spnextp;
   1110 
   1111 	RDS_DPRINTF2("rds_close_sessions", "Enter");
   1112 
   1113 	/* wait until all the buffers are freed by the sockets */
   1114 	while (RDS_GET_RXPKTS_PEND() != 0) {
   1115 		/* wait one second and try again */
   1116 		RDS_DPRINTF2("rds_close_sessions", "waiting on "
   1117 		    "pending packets", RDS_GET_RXPKTS_PEND());
   1118 		delay(drv_usectohz(1000000));
   1119 	}
   1120 	RDS_DPRINTF2("rds_close_sessions", "No more RX packets pending");
   1121 
   1122 	/* close all the sessions */
   1123 	rw_enter(&rdsib_statep->rds_sessionlock, RW_WRITER);
   1124 	sp = rdsib_statep->rds_sessionlistp;
   1125 	while (sp) {
   1126 		rw_enter(&sp->session_lock, RW_WRITER);
   1127 		RDS_DPRINTF2("rds_close_sessions", "SP(%p) State: %d", sp,
   1128 		    sp->session_state);
   1129 		rds_close_this_session(sp, 2);
   1130 		rw_exit(&sp->session_lock);
   1131 		sp = sp->session_nextp;
   1132 	}
   1133 
   1134 	sp = rdsib_statep->rds_sessionlistp;
   1135 	rdsib_statep->rds_sessionlistp = NULL;
   1136 	rdsib_statep->rds_nsessions = 0;
   1137 	rw_exit(&rdsib_statep->rds_sessionlock);
   1138 
   1139 	while (sp) {
   1140 		spnextp = sp->session_nextp;
   1141 		rds_destroy_session(sp);
   1142 		RDS_DECR_SESS();
   1143 		sp = spnextp;
   1144 	}
   1145 
   1146 	/* free the global pool */
   1147 	rds_free_recv_caches(rdsib_statep);
   1148 
   1149 	RDS_DPRINTF2("rds_close_sessions", "Return");
   1150 }
   1151 
   1152 void
   1153 rds_session_open(rds_session_t *sp)
   1154 {
   1155 	int		ret;
   1156 
   1157 	RDS_DPRINTF2("rds_session_open", "Enter SP(%p)", sp);
   1158 
   1159 	ret = rds_session_connect(sp);
   1160 	if (ret == -1) {
   1161 		/*
   1162 		 * may be the session has become passive due to
   1163 		 * hitting peer-to-peer case
   1164 		 */
   1165 		rw_enter(&sp->session_lock, RW_READER);
   1166 		if (sp->session_type == RDS_SESSION_PASSIVE) {
   1167 			RDS_DPRINTF2("rds_session_open", "SP(%p) "
   1168 			    "has become passive from active", sp);
   1169 			rw_exit(&sp->session_lock);
   1170 			return;
   1171 		}
   1172 
   1173 		/* get the lock for writing */
   1174 		rw_exit(&sp->session_lock);
   1175 		rw_enter(&sp->session_lock, RW_WRITER);
   1176 		sp->session_state = RDS_SESSION_STATE_ERROR;
   1177 		RDS_DPRINTF3("rds_session_open",
   1178 		    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
   1179 		rw_exit(&sp->session_lock);
   1180 
   1181 		/* Connect request failed */
   1182 		rds_session_close(sp, IBT_BLOCKING, 1);
   1183 
   1184 		rw_enter(&sp->session_lock, RW_WRITER);
   1185 		rds_session_fini(sp);
   1186 		sp->session_state = RDS_SESSION_STATE_FAILED;
   1187 		sp->session_failover = 0;
   1188 		RDS_DPRINTF3("rds_session_open",
   1189 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
   1190 		rw_exit(&sp->session_lock);
   1191 
   1192 		return;
   1193 	}
   1194 
   1195 	RDS_DPRINTF2("rds_session_open", "Return: SP(%p)", sp);
   1196 }
   1197 
   1198 /*
   1199  * Creates a session and inserts it into the list of sessions. The session
   1200  * state would be CREATED.
   1201  * Return Values:
   1202  *	EWOULDBLOCK
   1203  */
   1204 rds_session_t *
   1205 rds_session_create(rds_state_t *statep, ipaddr_t localip, ipaddr_t remip,
   1206     ibt_cm_req_rcv_t *reqp, uint8_t type)
   1207 {
   1208 	ib_gid_t	lgid, rgid;
   1209 	rds_session_t	*newp, *oldp;
   1210 	rds_ep_t	*dataep, *ctrlep;
   1211 	rds_bufpool_t	*pool;
   1212 	int		ret;
   1213 
   1214 	RDS_DPRINTF2("rds_session_create", "Enter: 0x%p 0x%x 0x%x, type: %d",
   1215 	    statep, localip, remip, type);
   1216 
   1217 	/* Check if there is space for a new session */
   1218 	rw_enter(&statep->rds_sessionlock, RW_READER);
   1219 	if (statep->rds_nsessions >= (MaxNodes - 1)) {
   1220 		rw_exit(&statep->rds_sessionlock);
   1221 		RDS_DPRINTF1("rds_session_create", "No More Sessions allowed");
   1222 		return (NULL);
   1223 	}
   1224 	rw_exit(&statep->rds_sessionlock);
   1225 
   1226 	/* Allocate and initialize global buffer pool */
   1227 	ret = rds_init_recv_caches(statep);
   1228 	if (ret != 0) {
   1229 		RDS_DPRINTF2(LABEL, "Buffer Cache Initialization failed");
   1230 		return (NULL);
   1231 	}
   1232 
   1233 	/* enough memory for session (includes 2 endpoints) */
   1234 	newp = kmem_zalloc(sizeof (rds_session_t), KM_SLEEP);
   1235 
   1236 	newp->session_remip = remip;
   1237 	newp->session_myip = localip;
   1238 	newp->session_type = type;
   1239 	newp->session_state = RDS_SESSION_STATE_CREATED;
   1240 	RDS_DPRINTF3("rds_session_create",
   1241 	    "SP(%p) State RDS_SESSION_STATE_CREATED", newp);
   1242 	rw_init(&newp->session_lock, NULL, RW_DRIVER, NULL);
   1243 	rw_init(&newp->session_local_portmap_lock, NULL, RW_DRIVER, NULL);
   1244 	rw_init(&newp->session_remote_portmap_lock, NULL, RW_DRIVER, NULL);
   1245 
   1246 	/* Initialize data endpoint */
   1247 	dataep = &newp->session_dataep;
   1248 	dataep->ep_remip = newp->session_remip;
   1249 	dataep->ep_myip = newp->session_myip;
   1250 	dataep->ep_state = RDS_EP_STATE_UNCONNECTED;
   1251 	dataep->ep_sp = newp;
   1252 	dataep->ep_type = RDS_EP_TYPE_DATA;
   1253 	mutex_init(&dataep->ep_lock, NULL, MUTEX_DRIVER, NULL);
   1254 
   1255 	/* Initialize send pool locks */
   1256 	pool = &dataep->ep_sndpool;
   1257 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
   1258 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
   1259 
   1260 	/* Initialize recv pool locks */
   1261 	pool = &dataep->ep_rcvpool;
   1262 	mutex_init(&dataep->ep_recvqp.qp_lock, NULL, MUTEX_DRIVER, NULL);
   1263 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
   1264 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
   1265 
   1266 	/* Initialize control endpoint */
   1267 	ctrlep = &newp->session_ctrlep;
   1268 	ctrlep->ep_remip = newp->session_remip;
   1269 	ctrlep->ep_myip = newp->session_myip;
   1270 	ctrlep->ep_state = RDS_EP_STATE_UNCONNECTED;
   1271 	ctrlep->ep_sp = newp;
   1272 	ctrlep->ep_type = RDS_EP_TYPE_CTRL;
   1273 	mutex_init(&ctrlep->ep_lock, NULL, MUTEX_DRIVER, NULL);
   1274 
   1275 	/* Initialize send pool locks */
   1276 	pool = &ctrlep->ep_sndpool;
   1277 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
   1278 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
   1279 
   1280 	/* Initialize recv pool locks */
   1281 	pool = &ctrlep->ep_rcvpool;
   1282 	mutex_init(&ctrlep->ep_recvqp.qp_lock, NULL, MUTEX_DRIVER, NULL);
   1283 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
   1284 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
   1285 
   1286 	/* lkup if there is already a session */
   1287 	rw_enter(&statep->rds_sessionlock, RW_WRITER);
   1288 	oldp = rds_session_lkup(statep, remip, 0);
   1289 	if (oldp != NULL) {
   1290 		/* A session to this destination exists */
   1291 		rw_exit(&statep->rds_sessionlock);
   1292 		rw_destroy(&newp->session_lock);
   1293 		rw_destroy(&newp->session_local_portmap_lock);
   1294 		rw_destroy(&newp->session_remote_portmap_lock);
   1295 		mutex_destroy(&dataep->ep_lock);
   1296 		mutex_destroy(&ctrlep->ep_lock);
   1297 		kmem_free(newp, sizeof (rds_session_t));
   1298 		return (NULL);
   1299 	}
   1300 
   1301 	/* Insert this session into the list */
   1302 	if (rds_add_session(newp, B_TRUE) != B_TRUE) {
   1303 		/* No room to add this session */
   1304 		rw_exit(&statep->rds_sessionlock);
   1305 		rw_destroy(&newp->session_lock);
   1306 		rw_destroy(&newp->session_local_portmap_lock);
   1307 		rw_destroy(&newp->session_remote_portmap_lock);
   1308 		mutex_destroy(&dataep->ep_lock);
   1309 		mutex_destroy(&ctrlep->ep_lock);
   1310 		kmem_free(newp, sizeof (rds_session_t));
   1311 		return (NULL);
   1312 	}
   1313 
   1314 	/* unlock the session list */
   1315 	rw_exit(&statep->rds_sessionlock);
   1316 
   1317 	if (type == RDS_SESSION_ACTIVE) {
   1318 		ipaddr_t		localip1, remip1;
   1319 		ibt_ip_path_attr_t	ipattr;
   1320 		ibt_ip_addr_t		dstip;
   1321 
   1322 		/* The ipaddr should be in the network order */
   1323 		localip1 = localip;
   1324 		remip1 = remip;
   1325 		ret = rds_sc_path_lookup(&localip1, &remip1);
   1326 		if (ret == 0) {
   1327 			RDS_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
   1328 			    localip, remip);
   1329 		}
   1330 
   1331 		/* Get the gids for the source and destination ip addrs */
   1332 		lgid.gid_prefix = 0;
   1333 		lgid.gid_guid = 0;
   1334 		rgid.gid_prefix = 0;
   1335 		rgid.gid_guid = 0;
   1336 
   1337 		bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
   1338 		dstip.family = AF_INET;
   1339 		dstip.un.ip4addr = remip1;
   1340 		ipattr.ipa_dst_ip = &dstip;
   1341 		ipattr.ipa_src_ip.family = AF_INET;
   1342 		ipattr.ipa_src_ip.un.ip4addr = localip1;
   1343 		ipattr.ipa_ndst = 1;
   1344 		ipattr.ipa_max_paths = 1;
   1345 		RDS_DPRINTF2(LABEL, "ibt_get_ip_paths: 0x%x <-> 0x%x ",
   1346 		    localip1, remip1);
   1347 		ret = ibt_get_ip_paths(rdsib_statep->rds_ibhdl,
   1348 		    IBT_PATH_NO_FLAGS, &ipattr, &newp->session_pinfo,
   1349 		    NULL, NULL);
   1350 		if (ret != IBT_SUCCESS) {
   1351 			RDS_DPRINTF2(LABEL, "ibt_get_ip_paths failed, ret: %d "
   1352 			    "lgid: %llx:%llx rgid: %llx:%llx", lgid.gid_prefix,
   1353 			    lgid.gid_guid, rgid.gid_prefix, rgid.gid_guid);
   1354 
   1355 			RDS_SESSION_TRANSITION(newp, RDS_SESSION_STATE_FAILED);
   1356 			return (NULL);
   1357 		}
   1358 		RDS_DPRINTF2(LABEL, "ibt_get_ip_paths success");
   1359 		lgid =
   1360 		    newp->session_pinfo.pi_prim_cep_path.cep_adds_vect.av_sgid;
   1361 		rgid =
   1362 		    newp->session_pinfo.pi_prim_cep_path.cep_adds_vect.av_dgid;
   1363 
   1364 		RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
   1365 		    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
   1366 		    rgid.gid_guid);
   1367 	}
   1368 
   1369 	rw_enter(&newp->session_lock, RW_WRITER);
   1370 	/* check for peer-to-peer case */
   1371 	if (type == newp->session_type) {
   1372 		/* no peer-to-peer case */
   1373 		if (type == RDS_SESSION_ACTIVE) {
   1374 			newp->session_lgid = lgid;
   1375 			newp->session_rgid = rgid;
   1376 		} else {
   1377 			/* rgid is requester gid & lgid is receiver gid */
   1378 			newp->session_rgid = reqp->req_prim_addr.av_dgid;
   1379 			newp->session_lgid = reqp->req_prim_addr.av_sgid;
   1380 		}
   1381 	}
   1382 	rw_exit(&newp->session_lock);
   1383 
   1384 	RDS_DPRINTF2("rds_session_create", "Return SP(%p)", newp);
   1385 
   1386 	return (newp);
   1387 }
   1388 
   1389 void
   1390 rds_handle_close_session_request(void *arg)
   1391 {
   1392 	rds_session_t	*sp = (rds_session_t *)arg;
   1393 
   1394 	RDS_DPRINTF2("rds_handle_close_session_request",
   1395 	    "Enter: Closing this Session (%p)", sp);
   1396 
   1397 	rw_enter(&sp->session_lock, RW_WRITER);
   1398 	RDS_DPRINTF2("rds_handle_close_session_request",
   1399 	    "SP(%p) State: %d", sp, sp->session_state);
   1400 	rds_close_this_session(sp, 2);
   1401 	rw_exit(&sp->session_lock);
   1402 
   1403 	RDS_DPRINTF2("rds_handle_close_session_request", "Return SP(%p)", sp);
   1404 }
   1405 
   1406 void
   1407 rds_handle_control_message(rds_session_t *sp, rds_ctrl_pkt_t *cpkt)
   1408 {
   1409 	RDS_DPRINTF4("rds_handle_control_message", "Enter: SP(%p) code: %d "
   1410 	    "port: %d", sp, cpkt->rcp_code, cpkt->rcp_port);
   1411 
   1412 	switch (cpkt->rcp_code) {
   1413 	case RDS_CTRL_CODE_STALL:
   1414 		RDS_INCR_STALLS_RCVD();
   1415 		(void) rds_check_n_mark_port(sp, cpkt->rcp_port, RDS_REMOTE);
   1416 		break;
   1417 	case RDS_CTRL_CODE_UNSTALL:
   1418 		RDS_INCR_UNSTALLS_RCVD();
   1419 		(void) rds_check_n_unmark_port(sp, cpkt->rcp_port, RDS_REMOTE);
   1420 		break;
   1421 	case RDS_CTRL_CODE_STALL_PORTS:
   1422 		rds_mark_all_ports(sp, RDS_REMOTE);
   1423 		break;
   1424 	case RDS_CTRL_CODE_UNSTALL_PORTS:
   1425 		rds_unmark_all_ports(sp, RDS_REMOTE);
   1426 		break;
   1427 	case RDS_CTRL_CODE_HEARTBEAT:
   1428 		break;
   1429 	case RDS_CTRL_CODE_CLOSE_SESSION:
   1430 		RDS_DPRINTF2("rds_handle_control_message",
   1431 		    "SP(%p) Remote Requested to close this session", sp);
   1432 		(void) ddi_taskq_dispatch(rds_taskq,
   1433 		    rds_handle_close_session_request, (void *)sp, DDI_SLEEP);
   1434 		break;
   1435 	default:
   1436 		RDS_DPRINTF2(LABEL, "ERROR: Invalid Control code: %d",
   1437 		    cpkt->rcp_code);
   1438 		break;
   1439 	}
   1440 
   1441 	RDS_DPRINTF4("rds_handle_control_message", "Return");
   1442 }
   1443 
   1444 int
   1445 rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port)
   1446 {
   1447 	ibt_send_wr_t	wr;
   1448 	rds_ep_t	*ep;
   1449 	rds_buf_t	*bp;
   1450 	rds_ctrl_pkt_t	*cp;
   1451 	int		ret;
   1452 
   1453 	RDS_DPRINTF4("rds_post_control_message", "Enter: SP(%p) Code: %d "
   1454 	    "Port: %d", sp, code, port);
   1455 
   1456 	ep = &sp->session_ctrlep;
   1457 
   1458 	bp = rds_get_send_buf(ep, 1);
   1459 	if (bp == NULL) {
   1460 		RDS_DPRINTF2(LABEL, "No buffers available to send control "
   1461 		    "message: SP(%p) Code: %d Port: %d", sp, code,
   1462 		    port);
   1463 		return (-1);
   1464 	}
   1465 
   1466 	cp = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
   1467 	cp->rcp_code = code;
   1468 	cp->rcp_port = port;
   1469 	bp->buf_ds.ds_len = RDS_CTRLPKT_SIZE;
   1470 
   1471 	wr.wr_id = (uintptr_t)bp;
   1472 	wr.wr_flags = IBT_WR_SEND_SOLICIT;
   1473 	wr.wr_trans = IBT_RC_SRV;
   1474 	wr.wr_opcode = IBT_WRC_SEND;
   1475 	wr.wr_nds = 1;
   1476 	wr.wr_sgl = &bp->buf_ds;
   1477 	RDS_DPRINTF5(LABEL, "ds_va %p ds_len %d ds_lkey 0x%llx",
   1478 	    bp->buf_ds.ds_va, bp->buf_ds.ds_len, bp->buf_ds.ds_key);
   1479 	ret = ibt_post_send(ep->ep_chanhdl, &wr, 1, NULL);
   1480 	if (ret != IBT_SUCCESS) {
   1481 		RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
   1482 		    "%d", ep, ret);
   1483 		bp->buf_state = RDS_SNDBUF_FREE;
   1484 		rds_free_send_buf(ep, bp, NULL, 1, B_FALSE);
   1485 		return (-1);
   1486 	}
   1487 
   1488 	RDS_DPRINTF4("rds_post_control_message", "Return SP(%p) Code: %d "
   1489 	    "Port: %d", sp, code, port);
   1490 
   1491 	return (0);
   1492 }
   1493 
   1494 void
   1495 rds_stall_port(rds_session_t *sp, in_port_t port, uint_t qualifier)
   1496 {
   1497 	int		ret;
   1498 
   1499 	RDS_DPRINTF4("rds_stall_port", "Enter: SP(%p) Port %d", sp, port);
   1500 
   1501 	RDS_INCR_STALLS_TRIGGERED();
   1502 
   1503 	if (!rds_check_n_mark_port(sp, port, qualifier)) {
   1504 
   1505 		if (sp != NULL) {
   1506 			ret = rds_post_control_message(sp,
   1507 			    RDS_CTRL_CODE_STALL, port);
   1508 			if (ret != 0) {
   1509 				(void) rds_check_n_unmark_port(sp, port,
   1510 				    qualifier);
   1511 				return;
   1512 			}
   1513 			RDS_INCR_STALLS_SENT();
   1514 		}
   1515 	} else {
   1516 		RDS_DPRINTF3(LABEL,
   1517 		    "Port %d is already in stall state", port);
   1518 	}
   1519 
   1520 	RDS_DPRINTF4("rds_stall_port", "Return: SP(%p) Port %d", sp, port);
   1521 }
   1522 
   1523 void
   1524 rds_resume_port(in_port_t port)
   1525 {
   1526 	rds_session_t	*sp;
   1527 	uint_t		ix;
   1528 	int		ret;
   1529 
   1530 	RDS_DPRINTF4("rds_resume_port", "Enter: Port %d", port);
   1531 
   1532 	RDS_INCR_UNSTALLS_TRIGGERED();
   1533 
   1534 	/* resume loopback traffic */
   1535 	(void) rds_check_n_unmark_port(NULL, port, RDS_LOOPBACK);
   1536 
   1537 	/* send unstall messages to resume the remote traffic */
   1538 	rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
   1539 
   1540 	sp = rdsib_statep->rds_sessionlistp;
   1541 	for (ix = 0; ix < rdsib_statep->rds_nsessions; ix++) {
   1542 		ASSERT(sp != NULL);
   1543 		if ((sp->session_state == RDS_SESSION_STATE_CONNECTED) &&
   1544 		    (rds_check_n_unmark_port(sp, port, RDS_LOCAL))) {
   1545 				ret = rds_post_control_message(sp,
   1546 				    RDS_CTRL_CODE_UNSTALL, port);
   1547 				if (ret != 0) {
   1548 					(void) rds_check_n_mark_port(sp, port,
   1549 					    RDS_LOCAL);
   1550 				} else {
   1551 					RDS_INCR_UNSTALLS_SENT();
   1552 				}
   1553 		}
   1554 
   1555 		sp = sp->session_nextp;
   1556 	}
   1557 
   1558 	rw_exit(&rdsib_statep->rds_sessionlock);
   1559 
   1560 	RDS_DPRINTF4("rds_resume_port", "Return: Port %d", port);
   1561 }
   1562 
   1563 static int
   1564 rds_build_n_post_msg(rds_ep_t *ep, uio_t *uiop, in_port_t sendport,
   1565     in_port_t recvport)
   1566 {
   1567 	ibt_send_wr_t	*wrp, wr;
   1568 	rds_buf_t	*bp, *bp1;
   1569 	rds_data_hdr_t	*pktp;
   1570 	uint32_t	msgsize, npkts, residual, pktno, ix;
   1571 	int		ret;
   1572 
   1573 	RDS_DPRINTF4("rds_build_n_post_msg", "Enter: EP(%p) UIOP(%p)",
   1574 	    ep, uiop);
   1575 
   1576 	/* how many pkts are needed to carry this msg */
   1577 	msgsize = uiop->uio_resid;
   1578 	npkts = ((msgsize - 1) / UserBufferSize) + 1;
   1579 	residual = ((msgsize - 1) % UserBufferSize) + 1;
   1580 
   1581 	RDS_DPRINTF5(LABEL, "EP(%p) UIOP(%p) msg size: %d npkts: %d", ep, uiop,
   1582 	    msgsize, npkts);
   1583 
   1584 	/* Get the buffers needed to post this message */
   1585 	bp = rds_get_send_buf(ep, npkts);
   1586 	if (bp == NULL) {
   1587 		RDS_INCR_ENOBUFS();
   1588 		return (ENOBUFS);
   1589 	}
   1590 
   1591 	if (npkts > 1) {
   1592 		/*
   1593 		 * multi-pkt messages are posted at the same time as a list
   1594 		 * of WRs
   1595 		 */
   1596 		wrp = (ibt_send_wr_t *)kmem_zalloc(sizeof (ibt_send_wr_t) *
   1597 		    npkts, KM_SLEEP);
   1598 	}
   1599 
   1600 
   1601 	pktno = 0;
   1602 	bp1 = bp;
   1603 	do {
   1604 		/* prepare the header */
   1605 		pktp = (rds_data_hdr_t *)(uintptr_t)bp1->buf_ds.ds_va;
   1606 		pktp->dh_datalen = UserBufferSize;
   1607 		pktp->dh_npkts = npkts - pktno;
   1608 		pktp->dh_psn = pktno;
   1609 		pktp->dh_sendport = sendport;
   1610 		pktp->dh_recvport = recvport;
   1611 		bp1->buf_ds.ds_len = RdsPktSize;
   1612 
   1613 		/* copy the data */
   1614 		ret = uiomove((uint8_t *)pktp + RDS_DATA_HDR_SZ,
   1615 		    UserBufferSize, UIO_WRITE, uiop);
   1616 		if (ret != 0) {
   1617 			break;
   1618 		}
   1619 
   1620 		if (uiop->uio_resid == 0) {
   1621 			pktp->dh_datalen = residual;
   1622 			bp1->buf_ds.ds_len = residual + RDS_DATA_HDR_SZ;
   1623 			break;
   1624 		}
   1625 		pktno++;
   1626 		bp1 = bp1->buf_nextp;
   1627 	} while (uiop->uio_resid);
   1628 
   1629 	if (ret) {
   1630 		/* uiomove failed */
   1631 		RDS_DPRINTF2("rds_build_n_post_msg", "UIO(%p) Move FAILED: %d",
   1632 		    uiop, ret);
   1633 		if (npkts > 1) {
   1634 			kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
   1635 		}
   1636 		rds_free_send_buf(ep, bp, NULL, npkts, B_FALSE);
   1637 		return (ret);
   1638 	}
   1639 
   1640 	if (npkts > 1) {
   1641 		/* multi-pkt message */
   1642 		RDS_DPRINTF5(LABEL, "EP(%p) Sending Multiple Packets", ep);
   1643 
   1644 		bp1 = bp;
   1645 		for (ix = 0; ix < npkts; ix++) {
   1646 			wrp[ix].wr_id = (uintptr_t)bp1;
   1647 			wrp[ix].wr_flags = IBT_WR_NO_FLAGS;
   1648 			wrp[ix].wr_trans = IBT_RC_SRV;
   1649 			wrp[ix].wr_opcode = IBT_WRC_SEND;
   1650 			wrp[ix].wr_nds = 1;
   1651 			wrp[ix].wr_sgl = &bp1->buf_ds;
   1652 			bp1 = bp1->buf_nextp;
   1653 		}
   1654 		wrp[npkts - 1].wr_flags = IBT_WR_SEND_SOLICIT;
   1655 
   1656 		ret = ibt_post_send(ep->ep_chanhdl, wrp, npkts, &ix);
   1657 		if (ret != IBT_SUCCESS) {
   1658 			RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
   1659 			    "%d for %d pkts", ep, ret, npkts);
   1660 			rds_free_send_buf(ep, bp, NULL, npkts, B_FALSE);
   1661 			kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
   1662 			return (ret);
   1663 		}
   1664 
   1665 		kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
   1666 	} else {
   1667 		/* single pkt */
   1668 		RDS_DPRINTF5(LABEL, "EP(%p) Sending Single Packet", ep);
   1669 		wr.wr_id = (uintptr_t)bp;
   1670 		wr.wr_flags = IBT_WR_SEND_SOLICIT;
   1671 		wr.wr_trans = IBT_RC_SRV;
   1672 		wr.wr_opcode = IBT_WRC_SEND;
   1673 		wr.wr_nds = 1;
   1674 		wr.wr_sgl = &bp->buf_ds;
   1675 		RDS_DPRINTF5(LABEL, "ds_va %p ds_key 0x%llx ds_len %d ",
   1676 		    bp->buf_ds.ds_va, bp->buf_ds.ds_key, bp->buf_ds.ds_len);
   1677 		ret = ibt_post_send(ep->ep_chanhdl, &wr, 1, NULL);
   1678 		if (ret != IBT_SUCCESS) {
   1679 			RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
   1680 			    "%d", ep, ret);
   1681 			rds_free_send_buf(ep, bp, NULL, 1, B_FALSE);
   1682 			return (ret);
   1683 		}
   1684 	}
   1685 
   1686 	RDS_INCR_TXPKTS(npkts);
   1687 	RDS_INCR_TXBYTES(msgsize);
   1688 
   1689 	RDS_DPRINTF4("rds_build_n_post_msg", "Return: EP(%p) UIOP(%p)",
   1690 	    ep, uiop);
   1691 
   1692 	return (0);
   1693 }
   1694 
   1695 static int
   1696 rds_deliver_loopback_msg(uio_t *uiop, ipaddr_t recvip, ipaddr_t sendip,
   1697     in_port_t recvport, in_port_t sendport, zoneid_t zoneid)
   1698 {
   1699 	mblk_t		*mp;
   1700 	int		ret;
   1701 
   1702 	RDS_DPRINTF4("rds_deliver_loopback_msg", "Enter");
   1703 
   1704 	RDS_DPRINTF3(LABEL, "Loopback message: sendport: "
   1705 	    "%d to recvport: %d", sendport, recvport);
   1706 
   1707 	mp = allocb(uiop->uio_resid, BPRI_MED);
   1708 	if (mp == NULL) {
   1709 		RDS_DPRINTF2(LABEL, "allocb failed, size: %d\n",
   1710 		    uiop->uio_resid);
   1711 		return (ENOSPC);
   1712 	}
   1713 	mp->b_wptr = mp->b_rptr + uiop->uio_resid;
   1714 
   1715 	ret = uiomove(mp->b_rptr, uiop->uio_resid, UIO_WRITE, uiop);
   1716 	if (ret) {
   1717 		RDS_DPRINTF2(LABEL, "ERROR: uiomove returned: %d", ret);
   1718 		freeb(mp);
   1719 		return (ret);
   1720 	}
   1721 
   1722 	ret = rds_deliver_new_msg(mp, recvip, sendip, recvport, sendport,
   1723 	    zoneid);
   1724 	if (ret != 0) {
   1725 		if (ret == ENOSPC) {
   1726 			/*
   1727 			 * The message is delivered but cannot take more,
   1728 			 * stop further loopback traffic to this port
   1729 			 */
   1730 			RDS_DPRINTF3("rds_deliver_loopback_msg",
   1731 			    "Port %d NO SPACE", recvport);
   1732 			rds_stall_port(NULL, recvport, RDS_LOOPBACK);
   1733 		} else {
   1734 			RDS_DPRINTF2(LABEL, "Loopback message: port %d -> "
   1735 			    "port %d failed: %d", sendport, recvport, ret);
   1736 			return (ret);
   1737 		}
   1738 	}
   1739 
   1740 	RDS_DPRINTF4("rds_deliver_loopback_msg", "Return");
   1741 	return (0);
   1742 }
   1743 
   1744 static void
   1745 rds_resend_messages(void *arg)
   1746 {
   1747 	rds_session_t	*sp = (rds_session_t *)arg;
   1748 	rds_ep_t	*ep;
   1749 	rds_bufpool_t	*spool;
   1750 	rds_buf_t	*bp, *endp, *tmp;
   1751 	ibt_send_wr_t	*wrp;
   1752 	uint_t		nwr = 0, ix, jx;
   1753 	int		ret;
   1754 
   1755 	RDS_DPRINTF2("rds_resend_messages", "Enter: SP(%p)", sp);
   1756 
   1757 	ep = &sp->session_dataep;
   1758 
   1759 	spool = &ep->ep_sndpool;
   1760 	mutex_enter(&spool->pool_lock);
   1761 
   1762 	ASSERT(spool->pool_nfree == spool->pool_nbuffers);
   1763 
   1764 	if (ep->ep_lbufid == NULL) {
   1765 		RDS_DPRINTF2("rds_resend_messages",
   1766 		    "SP(%p) Remote session is cleaned up ", sp);
   1767 		/*
   1768 		 * The remote end cleaned up its session. There may be loss
   1769 		 * of messages. Mark all buffers as acknowledged.
   1770 		 */
   1771 		tmp = spool->pool_tailp;
   1772 	} else {
   1773 		tmp = (rds_buf_t *)ep->ep_lbufid;
   1774 		RDS_DPRINTF2("rds_resend_messages",
   1775 		    "SP(%p) Last successful BP(%p) ", sp, tmp);
   1776 	}
   1777 
   1778 	endp = spool->pool_tailp;
   1779 	bp = spool->pool_headp;
   1780 	jx = 0;
   1781 	while ((bp != NULL) && (bp != tmp)) {
   1782 		bp->buf_state = RDS_SNDBUF_FREE;
   1783 		jx++;
   1784 		bp = bp->buf_nextp;
   1785 	}
   1786 
   1787 	if (bp == NULL) {
   1788 		mutex_exit(&spool->pool_lock);
   1789 		RDS_DPRINTF2("rds_resend_messages", "Alert: lbufid(%p) is not "
   1790 		    "found in the list", tmp);
   1791 
   1792 		rw_enter(&sp->session_lock, RW_WRITER);
   1793 		if (sp->session_state == RDS_SESSION_STATE_INIT) {
   1794 			sp->session_state = RDS_SESSION_STATE_CONNECTED;
   1795 		} else {
   1796 			RDS_DPRINTF2("rds_resend_messages", "SP(%p) State: %d "
   1797 			    "Expected State: %d", sp, sp->session_state,
   1798 			    RDS_SESSION_STATE_CONNECTED);
   1799 		}
   1800 		sp->session_failover = 0;
   1801 		rw_exit(&sp->session_lock);
   1802 		return;
   1803 	}
   1804 
   1805 	/* Found the match */
   1806 	bp->buf_state = RDS_SNDBUF_FREE;
   1807 	jx++;
   1808 
   1809 	spool->pool_tailp = bp;
   1810 	bp = bp->buf_nextp;
   1811 	spool->pool_tailp->buf_nextp = NULL;
   1812 	nwr = spool->pool_nfree - jx;
   1813 	spool->pool_nfree = jx;
   1814 	mutex_exit(&spool->pool_lock);
   1815 
   1816 	RDS_DPRINTF2("rds_resend_messages", "SP(%p): Number of "
   1817 	    "bufs (BP %p) to re-send: %d", sp, bp, nwr);
   1818 
   1819 	if (bp) {
   1820 		wrp = (ibt_send_wr_t *)kmem_zalloc(sizeof (ibt_send_wr_t) * 100,
   1821 		    KM_SLEEP);
   1822 
   1823 		while (nwr) {
   1824 			jx = (nwr > 100) ? 100 : nwr;
   1825 
   1826 			tmp = bp;
   1827 			for (ix = 0; ix < jx; ix++) {
   1828 				bp->buf_state = RDS_SNDBUF_PENDING;
   1829 				wrp[ix].wr_id = (uintptr_t)bp;
   1830 				wrp[ix].wr_flags = IBT_WR_SEND_SOLICIT;
   1831 				wrp[ix].wr_trans = IBT_RC_SRV;
   1832 				wrp[ix].wr_opcode = IBT_WRC_SEND;
   1833 				wrp[ix].wr_nds = 1;
   1834 				wrp[ix].wr_sgl = &bp->buf_ds;
   1835 				bp = bp->buf_nextp;
   1836 			}
   1837 
   1838 			ret = ibt_post_send(ep->ep_chanhdl, wrp, jx, &ix);
   1839 			if (ret != IBT_SUCCESS) {
   1840 				RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send "
   1841 				    "failed: %d for % pkts", ep, ret, jx);
   1842 				break;
   1843 			}
   1844 
   1845 			mutex_enter(&spool->pool_lock);
   1846 			spool->pool_nbusy += jx;
   1847 			mutex_exit(&spool->pool_lock);
   1848 
   1849 			nwr -= jx;
   1850 		}
   1851 
   1852 		kmem_free(wrp, sizeof (ibt_send_wr_t) * 100);
   1853 
   1854 		if (nwr != 0) {
   1855 
   1856 			/*
   1857 			 * An error while failover is in progress. Some WRs are
   1858 			 * posted while other remain. If any of the posted WRs
   1859 			 * complete in error then they would dispatch a taskq to
   1860 			 * do a failover. Getting the session lock will prevent
   1861 			 * the taskq to wait until we are done here.
   1862 			 */
   1863 			rw_enter(&sp->session_lock, RW_READER);
   1864 
   1865 			/*
   1866 			 * Wait until all the previous WRs are completed and
   1867 			 * then queue the remaining, otherwise the order of
   1868 			 * the messages may change.
   1869 			 */
   1870 			(void) rds_is_sendq_empty(ep, 1);
   1871 
   1872 			/* free the remaining buffers */
   1873 			rds_free_send_buf(ep, tmp, endp, nwr, B_FALSE);
   1874 
   1875 			rw_exit(&sp->session_lock);
   1876 			return;
   1877 		}
   1878 	}
   1879 
   1880 	rw_enter(&sp->session_lock, RW_WRITER);
   1881 	if (sp->session_state == RDS_SESSION_STATE_INIT) {
   1882 		sp->session_state = RDS_SESSION_STATE_CONNECTED;
   1883 	} else {
   1884 		RDS_DPRINTF2("rds_resend_messages", "SP(%p) State: %d "
   1885 		    "Expected State: %d", sp, sp->session_state,
   1886 		    RDS_SESSION_STATE_CONNECTED);
   1887 	}
   1888 	sp->session_failover = 0;
   1889 	rw_exit(&sp->session_lock);
   1890 
   1891 	RDS_DPRINTF2("rds_resend_messages", "Return: SP(%p)", sp);
   1892 }
   1893 
   1894 /*
   1895  * This is called when a channel is connected. Transition the session to
   1896  * CONNECTED state iff both channels are connected.
   1897  */
   1898 void
   1899 rds_session_active(rds_session_t *sp)
   1900 {
   1901 	rds_ep_t	*ep;
   1902 	uint_t		failover;
   1903 
   1904 	RDS_DPRINTF2("rds_session_active", "Enter: 0x%p", sp);
   1905 
   1906 	rw_enter(&sp->session_lock, RW_READER);
   1907 
   1908 	failover = sp->session_failover;
   1909 
   1910 	/*
   1911 	 * we establish the data channel first, so check the control channel
   1912 	 * first but make sure it is initialized.
   1913 	 */
   1914 	ep = &sp->session_ctrlep;
   1915 	mutex_enter(&ep->ep_lock);
   1916 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
   1917 		/* the session is not ready yet */
   1918 		mutex_exit(&ep->ep_lock);
   1919 		rw_exit(&sp->session_lock);
   1920 		return;
   1921 	}
   1922 	mutex_exit(&ep->ep_lock);
   1923 
   1924 	/* control channel is connected, check the data channel */
   1925 	ep = &sp->session_dataep;
   1926 	mutex_enter(&ep->ep_lock);
   1927 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
   1928 		/* data channel is not yet connected */
   1929 		mutex_exit(&ep->ep_lock);
   1930 		rw_exit(&sp->session_lock);
   1931 		return;
   1932 	}
   1933 	mutex_exit(&ep->ep_lock);
   1934 
   1935 	if (failover) {
   1936 		rw_exit(&sp->session_lock);
   1937 
   1938 		/*
   1939 		 * The session has failed over. Previous msgs have to be
   1940 		 * re-sent before the session is moved to the connected
   1941 		 * state.
   1942 		 */
   1943 		RDS_DPRINTF2("rds_session_active", "SP(%p) Dispatching taskq "
   1944 		    "to re-send messages", sp);
   1945 		(void) ddi_taskq_dispatch(rds_taskq,
   1946 		    rds_resend_messages, (void *)sp, DDI_SLEEP);
   1947 		return;
   1948 	}
   1949 
   1950 	/* the session is ready */
   1951 	sp->session_state = RDS_SESSION_STATE_CONNECTED;
   1952 	RDS_DPRINTF3("rds_session_active",
   1953 	    "SP(%p) State RDS_SESSION_STATE_CONNECTED", sp);
   1954 
   1955 	rw_exit(&sp->session_lock);
   1956 
   1957 	RDS_DPRINTF2("rds_session_active", "Return: SP(%p) is CONNECTED", sp);
   1958 }
   1959 
   1960 static int
   1961 rds_ep_sendmsg(rds_ep_t *ep, uio_t *uiop, in_port_t sendport,
   1962     in_port_t recvport)
   1963 {
   1964 	int	ret;
   1965 
   1966 	RDS_DPRINTF4("rds_ep_sendmsg", "Enter: EP(%p) sendport: %d recvport: "
   1967 	    "%d", ep, sendport, recvport);
   1968 
   1969 	/* make sure the remote port is not stalled */
   1970 	if (rds_is_port_marked(ep->ep_sp, recvport, RDS_REMOTE)) {
   1971 		RDS_DPRINTF2(LABEL, "SP(%p) Port:%d is in stall state",
   1972 		    ep->ep_sp, recvport);
   1973 		RDS_INCR_EWOULDBLOCK();
   1974 		ret = ENOMEM;
   1975 	} else {
   1976 		ret = rds_build_n_post_msg(ep, uiop, sendport, recvport);
   1977 	}
   1978 
   1979 	RDS_DPRINTF4("rds_ep_sendmsg", "Return: EP(%p)", ep);
   1980 
   1981 	return (ret);
   1982 }
   1983 
   1984 /* Send a message to a destination socket */
   1985 int
   1986 rds_sendmsg(uio_t *uiop, ipaddr_t sendip, ipaddr_t recvip, in_port_t sendport,
   1987     in_port_t recvport, zoneid_t zoneid)
   1988 {
   1989 	rds_session_t	*sp;
   1990 	ib_gid_t	lgid, rgid;
   1991 	int		ret;
   1992 
   1993 	RDS_DPRINTF4("rds_sendmsg", "Enter: uiop: 0x%p, srcIP: 0x%x destIP: "
   1994 	    "0x%x sndport: %d recvport: %d", uiop, sendip, recvip,
   1995 	    sendport, recvport);
   1996 
   1997 	/* If msg length is 0, just return success */
   1998 	if (uiop->uio_resid == 0) {
   1999 		RDS_DPRINTF2("rds_sendmsg", "Zero sized message");
   2000 		return (0);
   2001 	}
   2002 
   2003 	/* Is there a session to the destination? */
   2004 	rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
   2005 	sp = rds_session_lkup(rdsib_statep, recvip, 0);
   2006 	rw_exit(&rdsib_statep->rds_sessionlock);
   2007 
   2008 	/* Is this a loopback message? */
   2009 	if ((sp == NULL) && (rds_islocal(recvip))) {
   2010 		/* make sure the port is not stalled */
   2011 		if (rds_is_port_marked(NULL, recvport, RDS_LOOPBACK)) {
   2012 			RDS_DPRINTF2(LABEL, "Local Port:%d is in stall state",
   2013 			    recvport);
   2014 			RDS_INCR_EWOULDBLOCK();
   2015 			return (ENOMEM);
   2016 		}
   2017 		ret = rds_deliver_loopback_msg(uiop, recvip, sendip, recvport,
   2018 		    sendport, zoneid);
   2019 		return (ret);
   2020 	}
   2021 
   2022 	/* Not a loopback message */
   2023 	if (sp == NULL) {
   2024 		/* There is no session to the destination, create one. */
   2025 		RDS_DPRINTF3(LABEL, "There is no session to the destination "
   2026 		    "IP: 0x%x", recvip);
   2027 		sp = rds_session_create(rdsib_statep, sendip, recvip, NULL,
   2028 		    RDS_SESSION_ACTIVE);
   2029 		if (sp != NULL) {
   2030 			rw_enter(&sp->session_lock, RW_WRITER);
   2031 			if (sp->session_type == RDS_SESSION_ACTIVE) {
   2032 				ret = rds_session_init(sp);
   2033 				if (ret != 0) {
   2034 					RDS_DPRINTF2("rds_sendmsg",
   2035 					    "SP(%p): rds_session_init failed",
   2036 					    sp);
   2037 					sp->session_state =
   2038 					    RDS_SESSION_STATE_FAILED;
   2039 					RDS_DPRINTF3("rds_sendmsg",
   2040 					    "SP(%p) State "
   2041 					    "RDS_SESSION_STATE_FAILED", sp);
   2042 					rw_exit(&sp->session_lock);
   2043 					return (EFAULT);
   2044 				}
   2045 				sp->session_state = RDS_SESSION_STATE_INIT;
   2046 				RDS_DPRINTF3("rds_sendmsg",
   2047 				    "SP(%p) State "
   2048 				    "RDS_SESSION_STATE_INIT", sp);
   2049 				rw_exit(&sp->session_lock);
   2050 				rds_session_open(sp);
   2051 			} else {
   2052 				rw_exit(&sp->session_lock);
   2053 			}
   2054 		} else {
   2055 			/* Is a session created for this destination */
   2056 			rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
   2057 			sp = rds_session_lkup(rdsib_statep, recvip, 0);
   2058 			rw_exit(&rdsib_statep->rds_sessionlock);
   2059 			if (sp == NULL) {
   2060 				return (EFAULT);
   2061 			}
   2062 		}
   2063 	}
   2064 
   2065 	/* There is a session to the destination */
   2066 	rw_enter(&sp->session_lock, RW_READER);
   2067 	if (sp->session_state == RDS_SESSION_STATE_CONNECTED) {
   2068 		rw_exit(&sp->session_lock);
   2069 
   2070 		ret = rds_ep_sendmsg(&sp->session_dataep, uiop, sendport,
   2071 		    recvport);
   2072 		return (ret);
   2073 	} else if ((sp->session_state == RDS_SESSION_STATE_FAILED) ||
   2074 	    (sp->session_state == RDS_SESSION_STATE_FINI)) {
   2075 		ipaddr_t sendip1, recvip1;
   2076 
   2077 		RDS_DPRINTF3("rds_sendmsg", "SP(%p) is not connected, State: "
   2078 		    "%d", sp, sp->session_state);
   2079 		rw_exit(&sp->session_lock);
   2080 		rw_enter(&sp->session_lock, RW_WRITER);
   2081 		if ((sp->session_state == RDS_SESSION_STATE_FAILED) ||
   2082 		    (sp->session_state == RDS_SESSION_STATE_FINI)) {
   2083 			ibt_ip_path_attr_t	ipattr;
   2084 			ibt_ip_addr_t		dstip;
   2085 
   2086 			sp->session_state = RDS_SESSION_STATE_CREATED;
   2087 			sp->session_type = RDS_SESSION_ACTIVE;
   2088 			RDS_DPRINTF3("rds_sendmsg", "SP(%p) State "
   2089 			    "RDS_SESSION_STATE_CREATED", sp);
   2090 			rw_exit(&sp->session_lock);
   2091 
   2092 
   2093 			/* The ipaddr should be in the network order */
   2094 			sendip1 = sendip;
   2095 			recvip1 = recvip;
   2096 			ret = rds_sc_path_lookup(&sendip1, &recvip1);
   2097 			if (ret == 0) {
   2098 				RDS_DPRINTF2(LABEL, "Path not found "
   2099 				    "(0x%x 0x%x)", sendip1, recvip1);
   2100 			}
   2101 
   2102 			/* Resolve the IP addresses */
   2103 			lgid.gid_prefix = 0;
   2104 			lgid.gid_guid = 0;
   2105 			rgid.gid_prefix = 0;
   2106 			rgid.gid_guid = 0;
   2107 
   2108 			bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
   2109 			dstip.family = AF_INET;
   2110 			dstip.un.ip4addr = recvip1;
   2111 			ipattr.ipa_dst_ip = &dstip;
   2112 			ipattr.ipa_src_ip.family = AF_INET;
   2113 			ipattr.ipa_src_ip.un.ip4addr = sendip1;
   2114 			ipattr.ipa_ndst = 1;
   2115 			ipattr.ipa_max_paths = 1;
   2116 			RDS_DPRINTF2(LABEL, "ibt_get_ip_paths: 0x%x <-> 0x%x ",
   2117 			    sendip1, recvip1);
   2118 			ret = ibt_get_ip_paths(rdsib_statep->rds_ibhdl,
   2119 			    IBT_PATH_NO_FLAGS, &ipattr, &sp->session_pinfo,
   2120 			    NULL, NULL);
   2121 			if (ret != IBT_SUCCESS) {
   2122 				RDS_DPRINTF2("rds_sendmsg",
   2123 				    "ibt_get_ip_paths failed, ret: %d ", ret);
   2124 
   2125 				rw_enter(&sp->session_lock, RW_WRITER);
   2126 				if (sp->session_type == RDS_SESSION_ACTIVE) {
   2127 					sp->session_state =
   2128 					    RDS_SESSION_STATE_FAILED;
   2129 					RDS_DPRINTF3("rds_sendmsg",
   2130 					    "SP(%p) State "
   2131 					    "RDS_SESSION_STATE_FAILED", sp);
   2132 					rw_exit(&sp->session_lock);
   2133 					return (EFAULT);
   2134 				} else {
   2135 					rw_exit(&sp->session_lock);
   2136 					return (ENOMEM);
   2137 				}
   2138 			}
   2139 			RDS_DPRINTF2(LABEL, "ibt_get_ip_paths success");
   2140 			lgid = sp->session_pinfo.
   2141 			    pi_prim_cep_path.cep_adds_vect.av_sgid;
   2142 			rgid = sp->session_pinfo.
   2143 			    pi_prim_cep_path.cep_adds_vect.av_dgid;
   2144 
   2145 			RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
   2146 			    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
   2147 			    rgid.gid_guid);
   2148 
   2149 			rw_enter(&sp->session_lock, RW_WRITER);
   2150 			if (sp->session_type == RDS_SESSION_ACTIVE) {
   2151 				sp->session_lgid = lgid;
   2152 				sp->session_rgid = rgid;
   2153 				ret = rds_session_init(sp);
   2154 				if (ret != 0) {
   2155 					RDS_DPRINTF2("rds_sendmsg",
   2156 					    "SP(%p): rds_session_init failed",
   2157 					    sp);
   2158 					sp->session_state =
   2159 					    RDS_SESSION_STATE_FAILED;
   2160 					RDS_DPRINTF3("rds_sendmsg",
   2161 					    "SP(%p) State "
   2162 					    "RDS_SESSION_STATE_FAILED", sp);
   2163 					rw_exit(&sp->session_lock);
   2164 					return (EFAULT);
   2165 				}
   2166 				sp->session_state = RDS_SESSION_STATE_INIT;
   2167 				rw_exit(&sp->session_lock);
   2168 
   2169 				rds_session_open(sp);
   2170 
   2171 			} else {
   2172 				RDS_DPRINTF2("rds_sendmsg",
   2173 				    "SP(%p): type changed to %d",
   2174 				    sp, sp->session_type);
   2175 				rw_exit(&sp->session_lock);
   2176 				return (ENOMEM);
   2177 			}
   2178 		} else {
   2179 			RDS_DPRINTF2("rds_sendmsg",
   2180 			    "SP(%p): Session state %d changed",
   2181 			    sp, sp->session_state);
   2182 			rw_exit(&sp->session_lock);
   2183 			return (ENOMEM);
   2184 		}
   2185 	} else {
   2186 		RDS_DPRINTF4("rds_sendmsg", "SP(%p): Session is in %d state",
   2187 		    sp, sp->session_state);
   2188 		rw_exit(&sp->session_lock);
   2189 		return (ENOMEM);
   2190 	}
   2191 
   2192 	rw_enter(&sp->session_lock, RW_READER);
   2193 	if (sp->session_state == RDS_SESSION_STATE_CONNECTED) {
   2194 		rw_exit(&sp->session_lock);
   2195 
   2196 		ret = rds_ep_sendmsg(&sp->session_dataep, uiop, sendport,
   2197 		    recvport);
   2198 	} else {
   2199 		RDS_DPRINTF2("rds_sendmsg", "SP(%p): state(%d) not connected",
   2200 		    sp, sp->session_state);
   2201 		rw_exit(&sp->session_lock);
   2202 	}
   2203 
   2204 	RDS_DPRINTF4("rds_sendmsg", "Return: SP(%p) ret: %d", sp, ret);
   2205 
   2206 	return (ret);
   2207 }
   2208 
   2209 /* Note: This is called on the CQ handler thread */
   2210 void
   2211 rds_received_msg(rds_ep_t *ep, rds_buf_t *bp)
   2212 {
   2213 	mblk_t		*mp, *mp1;
   2214 	rds_data_hdr_t	*pktp, *pktp1;
   2215 	uint8_t		*datap;
   2216 	rds_buf_t	*bp1;
   2217 	rds_bufpool_t	*rpool;
   2218 	uint_t		npkts, ix;
   2219 	int		ret;
   2220 
   2221 	RDS_DPRINTF4("rds_received_msg", "Enter: EP(%p)", ep);
   2222 
   2223 	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
   2224 	datap = ((uint8_t *)(uintptr_t)bp->buf_ds.ds_va) + RDS_DATA_HDR_SZ;
   2225 	npkts = pktp->dh_npkts;
   2226 
   2227 	/* increment rx pending here */
   2228 	rpool = &ep->ep_rcvpool;
   2229 	mutex_enter(&rpool->pool_lock);
   2230 	rpool->pool_nbusy += npkts;
   2231 	mutex_exit(&rpool->pool_lock);
   2232 
   2233 	/* this will get freed by sockfs */
   2234 	mp = esballoc(datap, pktp->dh_datalen, BPRI_HI, &bp->buf_frtn);
   2235 	if (mp == NULL) {
   2236 		RDS_DPRINTF2(LABEL, "EP(%p) BP(%p): allocb failed",
   2237 		    ep, bp);
   2238 		rds_free_recv_buf(bp, npkts);
   2239 		return;
   2240 	}
   2241 	mp->b_wptr = datap + pktp->dh_datalen;
   2242 	mp->b_datap->db_type = M_DATA;
   2243 
   2244 	mp1 = mp;
   2245 	bp1 = bp->buf_nextp;
   2246 	while (bp1 != NULL) {
   2247 		pktp1 = (rds_data_hdr_t *)(uintptr_t)bp1->buf_ds.ds_va;
   2248 		datap = ((uint8_t *)(uintptr_t)bp1->buf_ds.ds_va) +
   2249 		    RDS_DATA_HDR_SZ;
   2250 
   2251 		mp1->b_cont = esballoc(datap, pktp1->dh_datalen,
   2252 		    BPRI_HI, &bp1->buf_frtn);
   2253 		if (mp1->b_cont == NULL) {
   2254 			RDS_DPRINTF2(LABEL, "EP(%p) BP(%p): allocb failed",
   2255 			    ep, bp1);
   2256 			freemsg(mp);
   2257 			rds_free_recv_buf(bp1, pktp1->dh_npkts);
   2258 			return;
   2259 		}
   2260 		mp1 = mp1->b_cont;
   2261 		mp1->b_wptr = datap + pktp1->dh_datalen;
   2262 		mp1->b_datap->db_type = M_DATA;
   2263 
   2264 		bp1 = bp1->buf_nextp;
   2265 	}
   2266 
   2267 	RDS_INCR_RXPKTS_PEND(npkts);
   2268 	RDS_INCR_RXPKTS(npkts);
   2269 	RDS_INCR_RXBYTES(msgdsize(mp));
   2270 
   2271 	RDS_DPRINTF5(LABEL, "Deliver Message: sendIP: 0x%x recvIP: 0x%x "
   2272 	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
   2273 	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
   2274 	    npkts, pktp->dh_psn);
   2275 
   2276 	/* store the last buffer id, no lock needed */
   2277 	if (npkts > 1) {
   2278 		ep->ep_rbufid = pktp1->dh_bufid;
   2279 	} else {
   2280 		ep->ep_rbufid = pktp->dh_bufid;
   2281 	}
   2282 
   2283 	ret = rds_deliver_new_msg(mp, ep->ep_myip, ep->ep_remip,
   2284 	    pktp->dh_recvport, pktp->dh_sendport, ALL_ZONES);
   2285 	if (ret != 0) {
   2286 		if (ret == ENOSPC) {
   2287 			/*
   2288 			 * The message is delivered but cannot take more,
   2289 			 * stop further remote messages coming to this port
   2290 			 */
   2291 			RDS_DPRINTF3("rds_received_msg", "Port %d NO SPACE",
   2292 			    pktp->dh_recvport);
   2293 			rds_stall_port(ep->ep_sp, pktp->dh_recvport, RDS_LOCAL);
   2294 		} else {
   2295 			RDS_DPRINTF2(LABEL, "rds_deliver_new_msg returned: %d",
   2296 			    ret);
   2297 		}
   2298 	}
   2299 
   2300 	mutex_enter(&ep->ep_lock);
   2301 	/* The first message can come in before the conn est event */
   2302 	if ((ep->ep_rdmacnt == 0) && (ep->ep_state == RDS_EP_STATE_CONNECTED)) {
   2303 		ep->ep_rdmacnt++;
   2304 		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
   2305 		mutex_exit(&ep->ep_lock);
   2306 
   2307 		/* send acknowledgement */
   2308 		RDS_INCR_TXACKS();
   2309 		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
   2310 		if (ret != IBT_SUCCESS) {
   2311 			RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send for "
   2312 			    "acknowledgement failed: %d, SQ depth: %d",
   2313 			    ep, ret, ep->ep_sndpool.pool_nbusy);
   2314 			mutex_enter(&ep->ep_lock);
   2315 			ep->ep_rdmacnt--;
   2316 			mutex_exit(&ep->ep_lock);
   2317 		}
   2318 	} else {
   2319 		/* no room to send acknowledgement */
   2320 		mutex_exit(&ep->ep_lock);
   2321 	}
   2322 
   2323 	RDS_DPRINTF4("rds_received_msg", "Return: EP(%p)", ep);
   2324 }
   2325