Home | History | Annotate | Download | only in rds
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
     27  *
     28  * This software is available to you under a choice of one of two
     29  * licenses.  You may choose to be licensed under the terms of the GNU
     30  * General Public License (GPL) Version 2, available from the file
     31  * COPYING in the main directory of this source tree, or the
     32  * OpenIB.org BSD license below:
     33  *
     34  *     Redistribution and use in source and binary forms, with or
     35  *     without modification, are permitted provided that the following
     36  *     conditions are met:
     37  *
     38  *	- Redistributions of source code must retain the above
     39  *	  copyright notice, this list of conditions and the following
     40  *	  disclaimer.
     41  *
     42  *	- Redistributions in binary form must reproduce the above
     43  *	  copyright notice, this list of conditions and the following
     44  *	  disclaimer in the documentation and/or other materials
     45  *	  provided with the distribution.
     46  *
     47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     54  * SOFTWARE.
     55  *
     56  */
     57 /*
     58  * Sun elects to include this software in Sun product
     59  * under the OpenIB BSD license.
     60  *
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     72  * POSSIBILITY OF SUCH DAMAGE.
     73  */
     74 
     75 #include <sys/ib/clients/rds/rdsib_cm.h>
     76 #include <sys/ib/clients/rds/rdsib_ib.h>
     77 #include <sys/ib/clients/rds/rdsib_buf.h>
     78 #include <sys/ib/clients/rds/rdsib_ep.h>
     79 
     80 /*
     81  * This file contains CM related work:
     82  *
     83  * Service registration/deregistration
     84  * Path lookup
     85  * CM connection callbacks
     86  * CM active and passive connection establishment
     87  * Connection failover
     88  */
     89 
     90 #define	SRCIP	src_addr.un.ip4addr
     91 #define	DSTIP	dst_addr.un.ip4addr
     92 
     93 /*
     94  * Handle an incoming CM REQ
     95  */
     96 /* ARGSUSED */
     97 static ibt_cm_status_t
     98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
     99     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
    100 {
    101 	ibt_cm_req_rcv_t	*reqp;
    102 	ib_gid_t		lgid, rgid;
    103 	rds_cm_private_data_t	cmp;
    104 	rds_session_t		*sp;
    105 	rds_ep_t		*ep;
    106 	ibt_channel_hdl_t	chanhdl;
    107 	ibt_ip_cm_info_t	ipcm_info;
    108 	uint8_t			save_state, save_type;
    109 	int			ret;
    110 
    111 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
    112 
    113 	reqp = &evp->cm_event.req;
    114 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
    115 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
    116 
    117 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
    118 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
    119 
    120 	/*
    121 	 * CM private data brings IP information
    122 	 * Private data received is a stream of bytes and may not be properly
    123 	 * aligned. So, bcopy the data onto the stack before accessing it.
    124 	 */
    125 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
    126 	    sizeof (rds_cm_private_data_t));
    127 
    128 	/* extract the CM IP info */
    129 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
    130 	    &ipcm_info);
    131 	if (ret != IBT_SUCCESS) {
    132 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
    133 		    ret);
    134 		return (IBT_CM_REJECT);
    135 	}
    136 
    137 	RDS_DPRINTF2("rds_handle_cm_req",
    138 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
    139 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
    140 
    141 	if (cmp.cmp_version != RDS_VERSION) {
    142 		RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
    143 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
    144 		return (IBT_CM_REJECT);
    145 	}
    146 
    147 	/* RDS supports V4 addresses only */
    148 	if ((ipcm_info.src_addr.family != AF_INET) ||
    149 	    (ipcm_info.dst_addr.family != AF_INET)) {
    150 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
    151 		    "src: %d dst: %d", ipcm_info.src_addr.family,
    152 		    ipcm_info.dst_addr.family);
    153 		return (IBT_CM_REJECT);
    154 	}
    155 
    156 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
    157 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
    158 		    cmp.cmp_arch, RDS_THIS_ARCH);
    159 		return (IBT_CM_REJECT);
    160 	}
    161 
    162 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
    163 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
    164 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
    165 		return (IBT_CM_REJECT);
    166 	}
    167 
    168 	/* user_buffer_size should be same on all nodes */
    169 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
    170 		RDS_DPRINTF2(LABEL,
    171 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
    172 		    UserBufferSize, cmp.cmp_user_buffer_size);
    173 		return (IBT_CM_REJECT);
    174 	}
    175 
    176 	/*
    177 	 * RDS needs more time to process a failover REQ so send an MRA.
    178 	 * Otherwise, the remote may retry the REQ and fail the connection.
    179 	 */
    180 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
    181 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
    182 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
    183 		    10000000 /* 10 sec */, NULL, 0);
    184 	}
    185 
    186 	/* Is there a session to the destination node? */
    187 	rw_enter(&statep->rds_sessionlock, RW_READER);
    188 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
    189 	rw_exit(&statep->rds_sessionlock);
    190 
    191 	if (sp == NULL) {
    192 		/*
    193 		 * currently there is no session to the destination
    194 		 * remote ip in the private data is the local ip and vice
    195 		 * versa
    196 		 */
    197 		sp = rds_session_create(statep, ipcm_info.DSTIP,
    198 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
    199 		if (sp == NULL) {
    200 			/* Check the list anyway. */
    201 			rw_enter(&statep->rds_sessionlock, RW_READER);
    202 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
    203 			    rgid.gid_guid);
    204 			rw_exit(&statep->rds_sessionlock);
    205 			if (sp == NULL) {
    206 				/*
    207 				 * The only way this can fail is due to lack
    208 				 * of kernel resources
    209 				 */
    210 				return (IBT_CM_REJECT);
    211 			}
    212 		}
    213 	}
    214 
    215 	rw_enter(&sp->session_lock, RW_WRITER);
    216 
    217 	/* catch peer-to-peer case as soon as possible */
    218 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
    219 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
    220 		/* Check possible peer-to-peer case here */
    221 		if (sp->session_type != RDS_SESSION_PASSIVE) {
    222 			RDS_DPRINTF2("rds_handle_cm_req",
    223 			    "SP(%p) Peer-peer connection handling", sp);
    224 			if (lgid.gid_guid > rgid.gid_guid) {
    225 				/* this node is active so reject this request */
    226 				rw_exit(&sp->session_lock);
    227 				return (IBT_CM_REJECT);
    228 			} else {
    229 				/* this node is passive, change the session */
    230 				sp->session_type = RDS_SESSION_PASSIVE;
    231 				sp->session_lgid = lgid;
    232 				sp->session_rgid = rgid;
    233 			}
    234 		}
    235 	}
    236 
    237 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
    238 	save_state = sp->session_state;
    239 	save_type = sp->session_type;
    240 
    241 	switch (sp->session_state) {
    242 	case RDS_SESSION_STATE_CONNECTED:
    243 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
    244 		sp->session_state = RDS_SESSION_STATE_ERROR;
    245 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
    246 		    "RDS_SESSION_STATE_ERROR", sp);
    247 
    248 		/* FALLTHRU */
    249 	case RDS_SESSION_STATE_ERROR:
    250 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
    251 		/*
    252 		 * Some other thread must be processing this session,
    253 		 * this thread must wait until the other thread finishes.
    254 		 */
    255 		sp->session_type = RDS_SESSION_PASSIVE;
    256 		rw_exit(&sp->session_lock);
    257 
    258 		/* Handling this will take some time, so send an MRA */
    259 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
    260 		    10000000 /* 10 sec */, NULL, 0);
    261 
    262 		/*
    263 		 * Any pending completions don't get flushed until the channel
    264 		 * is closed. So, passing 0 here will not wait for pending
    265 		 * completions in rds_session_close before closing the channel
    266 		 */
    267 		rds_session_close(sp, IBT_NOCALLBACKS, 0);
    268 
    269 		rw_enter(&sp->session_lock, RW_WRITER);
    270 
    271 		/*
    272 		 * If the session was in ERROR, then either a failover thread
    273 		 * or event_failure thread would be processing this session.
    274 		 * This thread should wait for event_failure thread to
    275 		 * complete. This need not wait for failover thread.
    276 		 */
    277 		if ((save_state != RDS_SESSION_STATE_CONNECTED) &&
    278 		    (save_type == RDS_SESSION_PASSIVE)) {
    279 				/*
    280 				 * The other thread is event_failure thread,
    281 				 * wait until it finishes.
    282 				 */
    283 				while (!((sp->session_state ==
    284 				    RDS_SESSION_STATE_FAILED) ||
    285 				    (sp->session_state ==
    286 				    RDS_SESSION_STATE_FINI))) {
    287 					rw_exit(&sp->session_lock);
    288 					delay(drv_usectohz(1000000));
    289 					rw_enter(&sp->session_lock, RW_WRITER);
    290 				}
    291 		}
    292 
    293 		/* move the session to init state */
    294 		if ((sp->session_state == RDS_SESSION_STATE_ERROR) ||
    295 		    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING)) {
    296 			ret = rds_session_reinit(sp, lgid);
    297 			sp->session_myip = ipcm_info.DSTIP;
    298 			sp->session_lgid = lgid;
    299 			sp->session_rgid = rgid;
    300 			if (ret != 0) {
    301 				rds_session_fini(sp);
    302 				sp->session_state = RDS_SESSION_STATE_FAILED;
    303 				RDS_DPRINTF3("rds_handle_cm_req",
    304 				    "SP(%p) State RDS_SESSION_STATE_FAILED",
    305 				    sp);
    306 				rw_exit(&sp->session_lock);
    307 				return (IBT_CM_REJECT);
    308 			} else {
    309 				sp->session_state = RDS_SESSION_STATE_INIT;
    310 				RDS_DPRINTF3("rds_handle_cm_req",
    311 				    "SP(%p) State RDS_SESSION_STATE_INIT", sp);
    312 			}
    313 
    314 			if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
    315 				ep = &sp->session_ctrlep;
    316 			} else {
    317 				ep = &sp->session_dataep;
    318 			}
    319 			break;
    320 		}
    321 
    322 		/* FALLTHRU */
    323 	case RDS_SESSION_STATE_CREATED:
    324 	case RDS_SESSION_STATE_FAILED:
    325 	case RDS_SESSION_STATE_FINI:
    326 		/*
    327 		 * Initialize both channels, we accept this connection
    328 		 * only if both channels are initialized
    329 		 */
    330 		sp->session_type = RDS_SESSION_PASSIVE;
    331 		sp->session_lgid = lgid;
    332 		sp->session_rgid = rgid;
    333 		sp->session_state = RDS_SESSION_STATE_CREATED;
    334 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
    335 		    "RDS_SESSION_STATE_CREATED", sp);
    336 		ret = rds_session_init(sp);
    337 		if (ret != 0) {
    338 			/* Seems like there are not enough resources */
    339 			sp->session_state = RDS_SESSION_STATE_FAILED;
    340 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
    341 			    "RDS_SESSION_STATE_FAILED", sp);
    342 			rw_exit(&sp->session_lock);
    343 			return (IBT_CM_REJECT);
    344 		}
    345 		sp->session_state = RDS_SESSION_STATE_INIT;
    346 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
    347 		    "RDS_SESSION_STATE_INIT", sp);
    348 
    349 		/* FALLTHRU */
    350 	case RDS_SESSION_STATE_INIT:
    351 		/*
    352 		 * When re-using an existing session, make sure the
    353 		 * session is still through the same HCA. Otherwise, the
    354 		 * memory registrations have to moved to the new HCA.
    355 		 */
    356 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
    357 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
    358 				RDS_DPRINTF2("rds_handle_cm_req",
    359 				    "Existing Session but different gid "
    360 				    "existing: 0x%llx, new: 0x%llx, "
    361 				    "sending an MRA",
    362 				    sp->session_lgid.gid_guid, lgid.gid_guid);
    363 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
    364 				    evp->cm_session_id, 10000000 /* 10 sec */,
    365 				    NULL, 0);
    366 				ret = rds_session_reinit(sp, lgid);
    367 				if (ret != 0) {
    368 					rds_session_fini(sp);
    369 					sp->session_state =
    370 					    RDS_SESSION_STATE_FAILED;
    371 					sp->session_failover = 0;
    372 					RDS_DPRINTF3("rds_failover_session",
    373 					    "SP(%p) State "
    374 					    "RDS_SESSION_STATE_FAILED", sp);
    375 					rw_exit(&sp->session_lock);
    376 					return (IBT_CM_REJECT);
    377 				}
    378 			}
    379 			ep = &sp->session_dataep;
    380 		} else {
    381 			ep = &sp->session_ctrlep;
    382 		}
    383 
    384 		break;
    385 	default:
    386 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
    387 		    "state: %d", sp, sp->session_state);
    388 		rw_exit(&sp->session_lock);
    389 		return (IBT_CM_REJECT);
    390 	}
    391 
    392 	sp->session_failover = 0; /* reset any previous value */
    393 	if (cmp.cmp_failover) {
    394 		RDS_DPRINTF2("rds_handle_cm_req",
    395 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
    396 		sp->session_failover = 1;
    397 	}
    398 
    399 	mutex_enter(&ep->ep_lock);
    400 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
    401 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
    402 		sp->session_type = RDS_SESSION_PASSIVE;
    403 		rw_exit(&sp->session_lock);
    404 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
    405 		rw_exit(&sp->session_lock);
    406 		/*
    407 		 * Peer to peer connection. There is an active
    408 		 * connection pending on this ep. The one with
    409 		 * greater port guid becomes active and the
    410 		 * other becomes passive.
    411 		 */
    412 		RDS_DPRINTF2("rds_handle_cm_req",
    413 		    "EP(%p) Peer-peer connection handling", ep);
    414 		if (lgid.gid_guid > rgid.gid_guid) {
    415 			/* this node is active so reject this request */
    416 			mutex_exit(&ep->ep_lock);
    417 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
    418 			    "Rejecting passive in favor of active", sp, ep);
    419 			return (IBT_CM_REJECT);
    420 		} else {
    421 			/*
    422 			 * This session is not the active end, change it
    423 			 * to passive end.
    424 			 */
    425 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
    426 
    427 			rw_enter(&sp->session_lock, RW_WRITER);
    428 			sp->session_type = RDS_SESSION_PASSIVE;
    429 			sp->session_lgid = lgid;
    430 			sp->session_rgid = rgid;
    431 			rw_exit(&sp->session_lock);
    432 		}
    433 	} else {
    434 		rw_exit(&sp->session_lock);
    435 	}
    436 
    437 	ep->ep_lbufid = cmp.cmp_last_bufid;
    438 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
    439 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
    440 	cmp.cmp_last_bufid = ep->ep_rbufid;
    441 	cmp.cmp_ack_addr = ep->ep_ack_addr;
    442 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
    443 	mutex_exit(&ep->ep_lock);
    444 
    445 	/* continue with accepting the connection request for this channel */
    446 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
    447 	if (chanhdl == NULL) {
    448 		mutex_enter(&ep->ep_lock);
    449 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
    450 		mutex_exit(&ep->ep_lock);
    451 		return (IBT_CM_REJECT);
    452 	}
    453 
    454 	/* pre-post recv buffers in the RQ */
    455 	rds_post_recv_buf((void *)chanhdl);
    456 
    457 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
    458 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
    459 	rargsp->cm_ret.rep.cm_channel = chanhdl;
    460 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
    461 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
    462 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
    463 
    464 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
    465 	    sp, ep, chanhdl);
    466 
    467 	return (IBT_CM_ACCEPT);
    468 }
    469 
    470 /*
    471  * Handle an incoming CM REP
    472  * Pre-post recv buffers for the QP
    473  */
    474 /* ARGSUSED */
    475 static ibt_cm_status_t
    476 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
    477     void *rcmp, ibt_priv_data_len_t rcmp_len)
    478 {
    479 	rds_ep_t	*ep;
    480 	rds_cm_private_data_t	cmp;
    481 
    482 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
    483 
    484 	/* pre-post recv buffers in the RQ */
    485 	rds_post_recv_buf((void *)evp->cm_channel);
    486 
    487 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
    488 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
    489 	    sizeof (rds_cm_private_data_t));
    490 	ep->ep_lbufid = cmp.cmp_last_bufid;
    491 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
    492 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
    493 
    494 	rargsp->cm_ret_len = 0;
    495 
    496 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
    497 
    498 	return (IBT_CM_ACCEPT);
    499 }
    500 
    501 /*
    502  * Handle CONN EST
    503  */
    504 static ibt_cm_status_t
    505 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
    506 {
    507 	rds_session_t	*sp;
    508 	rds_ep_t	*ep;
    509 
    510 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
    511 
    512 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
    513 	    ep->ep_state);
    514 
    515 	mutex_enter(&ep->ep_lock);
    516 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
    517 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
    518 	ep->ep_state = RDS_EP_STATE_CONNECTED;
    519 	ep->ep_chanhdl = evp->cm_channel;
    520 	sp = ep->ep_sp;
    521 	mutex_exit(&ep->ep_lock);
    522 
    523 	(void) rds_session_active(sp);
    524 
    525 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
    526 	return (IBT_CM_ACCEPT);
    527 }
    528 
    529 /*
    530  * Handle CONN CLOSED
    531  */
    532 static ibt_cm_status_t
    533 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
    534 {
    535 	rds_ep_t	*ep;
    536 	rds_session_t	*sp;
    537 
    538 	/* Catch DREQs but ignore DREPs */
    539 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
    540 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
    541 		    "Ignoring Event: %d received", evp->cm_event.closed);
    542 		return (IBT_CM_ACCEPT);
    543 	}
    544 
    545 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
    546 	sp = ep->ep_sp;
    547 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
    548 	    ep, evp->cm_channel);
    549 
    550 	mutex_enter(&ep->ep_lock);
    551 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
    552 		/* Ignore this DREQ */
    553 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
    554 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
    555 		mutex_exit(&ep->ep_lock);
    556 		return (IBT_CM_ACCEPT);
    557 	}
    558 	ep->ep_state = RDS_EP_STATE_CLOSING;
    559 	mutex_exit(&ep->ep_lock);
    560 
    561 	rw_enter(&sp->session_lock, RW_WRITER);
    562 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
    563 	    sp->session_state);
    564 
    565 	switch (sp->session_state) {
    566 	case RDS_SESSION_STATE_CONNECTED:
    567 	case RDS_SESSION_STATE_HCA_CLOSING:
    568 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
    569 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
    570 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
    571 		break;
    572 
    573 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
    574 		sp->session_state = RDS_SESSION_STATE_CLOSED;
    575 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
    576 		    "RDS_SESSION_STATE_CLOSED", sp);
    577 		rds_passive_session_fini(sp);
    578 		sp->session_state = RDS_SESSION_STATE_FINI;
    579 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
    580 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
    581 		break;
    582 
    583 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
    584 	case RDS_SESSION_STATE_ERROR:
    585 	case RDS_SESSION_STATE_CLOSED:
    586 		break;
    587 
    588 	case RDS_SESSION_STATE_INIT:
    589 		sp->session_state = RDS_SESSION_STATE_ERROR;
    590 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
    591 		    "RDS_SESSION_STATE_ERROR", sp);
    592 		rds_passive_session_fini(sp);
    593 		sp->session_state = RDS_SESSION_STATE_FAILED;
    594 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
    595 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
    596 		break;
    597 
    598 	default:
    599 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
    600 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
    601 		rds_passive_session_fini(sp);
    602 		sp->session_state = RDS_SESSION_STATE_FAILED;
    603 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
    604 		    "RDS_SESSION_STATE_FAILED", sp);
    605 	}
    606 	rw_exit(&sp->session_lock);
    607 
    608 	mutex_enter(&ep->ep_lock);
    609 	ep->ep_state = RDS_EP_STATE_CLOSED;
    610 	mutex_exit(&ep->ep_lock);
    611 
    612 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
    613 	return (IBT_CM_ACCEPT);
    614 }
    615 
    616 /*
    617  * Handle EVENT FAILURE
    618  */
    619 static ibt_cm_status_t
    620 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
    621 {
    622 	rds_ep_t	*ep;
    623 	rds_session_t	*sp;
    624 	int		ret;
    625 
    626 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
    627 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
    628 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
    629 	    evp->cm_event.failed.cf_reason);
    630 
    631 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
    632 		RDS_DPRINTF2(LABEL,
    633 		    "Received REJ with reason IBT_CM_INVALID_SID: "
    634 		    "RDS may not be loaded on the remote system");
    635 	}
    636 
    637 	if (evp->cm_channel == NULL) {
    638 		return (IBT_CM_ACCEPT);
    639 	}
    640 
    641 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
    642 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
    643 		/*
    644 		 * This end is active, just ignore, ibt_open_rc_channel()
    645 		 * caller will take care of cleanup.
    646 		 */
    647 		RDS_DPRINTF2("rds_handle_cm_event_failure",
    648 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
    649 		return (IBT_CM_ACCEPT);
    650 	}
    651 
    652 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
    653 	sp = ep->ep_sp;
    654 
    655 	rw_enter(&sp->session_lock, RW_WRITER);
    656 	if (sp->session_type == RDS_SESSION_PASSIVE) {
    657 		RDS_DPRINTF2("rds_handle_cm_event_failure",
    658 		    "SP(%p) - state: %d", sp, sp->session_state);
    659 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
    660 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
    661 			sp->session_state = RDS_SESSION_STATE_ERROR;
    662 			RDS_DPRINTF3("rds_handle_cm_event_failure",
    663 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
    664 
    665 			/*
    666 			 * Store the cm_channel for freeing later
    667 			 * Active side frees it on ibt_open_rc_channel
    668 			 * failure
    669 			 */
    670 			if (ep->ep_chanhdl == NULL) {
    671 				ep->ep_chanhdl = evp->cm_channel;
    672 			}
    673 			rw_exit(&sp->session_lock);
    674 
    675 			/*
    676 			 * rds_passive_session_fini should not be called
    677 			 * directly in the CM handler. It will cause a deadlock.
    678 			 */
    679 			ret = ddi_taskq_dispatch(rds_taskq,
    680 			    rds_cleanup_passive_session, (void *)sp,
    681 			    DDI_NOSLEEP);
    682 			if (ret != DDI_SUCCESS) {
    683 				RDS_DPRINTF2("rds_handle_cm_event_failure",
    684 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
    685 			}
    686 			return (IBT_CM_ACCEPT);
    687 		}
    688 	}
    689 	rw_exit(&sp->session_lock);
    690 
    691 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
    692 	return (IBT_CM_ACCEPT);
    693 }
    694 
    695 /*
    696  * CM Handler
    697  *
    698  * Called by IBCM
    699  * The cm_private type differs for active and passive events.
    700  */
    701 ibt_cm_status_t
    702 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
    703     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    704     ibt_priv_data_len_t ret_len_max)
    705 {
    706 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
    707 
    708 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
    709 
    710 	switch (eventp->cm_type) {
    711 	case IBT_CM_EVENT_REQ_RCV:
    712 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
    713 		    ret_args, ret_priv_data, ret_len_max);
    714 		break;
    715 	case IBT_CM_EVENT_REP_RCV:
    716 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
    717 		    ret_len_max);
    718 		break;
    719 	case IBT_CM_EVENT_MRA_RCV:
    720 		/* Not supported */
    721 		break;
    722 	case IBT_CM_EVENT_CONN_EST:
    723 		ret = rds_handle_cm_conn_est(eventp);
    724 		break;
    725 	case IBT_CM_EVENT_CONN_CLOSED:
    726 		ret = rds_handle_cm_conn_closed(eventp);
    727 		break;
    728 	case IBT_CM_EVENT_FAILURE:
    729 		ret = rds_handle_cm_event_failure(eventp);
    730 		break;
    731 	case IBT_CM_EVENT_LAP_RCV:
    732 		/* Not supported */
    733 		RDS_DPRINTF2(LABEL, "LAP message received");
    734 		break;
    735 	case IBT_CM_EVENT_APR_RCV:
    736 		/* Not supported */
    737 		RDS_DPRINTF2(LABEL, "APR message received");
    738 		break;
    739 	default:
    740 		break;
    741 	}
    742 
    743 	RDS_DPRINTF2("rds_cm_handler", "Return");
    744 
    745 	return (ret);
    746 }
    747 
    748 /* This is based on OFED Linux RDS */
    749 #define	RDS_PORT_NUM	6556
    750 
    751 /*
    752  * Register the wellknown service with service id: RDS_SERVICE_ID
    753  * Incoming connection requests should arrive on this service id.
    754  */
    755 ibt_srv_hdl_t
    756 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
    757 {
    758 	ibt_srv_hdl_t	srvhdl;
    759 	ibt_srv_desc_t	srvdesc;
    760 	int		ret;
    761 
    762 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
    763 
    764 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
    765 	srvdesc.sd_handler = rds_cm_handler;
    766 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
    767 
    768 	/*
    769 	 * This is the new service id as per:
    770 	 * Annex A11: RDMA IP CM Service
    771 	 */
    772 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
    773 	    RDS_PORT_NUM);
    774 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
    775 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
    776 	if (ret != IBT_SUCCESS) {
    777 		RDS_DPRINTF2(LABEL,
    778 		    "RDS Service (0x%llx) Registration Failed: %d",
    779 		    rdsib_statep->rds_service_id, ret);
    780 		return (NULL);
    781 	}
    782 
    783 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
    784 	return (srvhdl);
    785 }
    786 
    787 /* Bind the RDS service on all ports */
    788 int
    789 rds_bind_service(rds_state_t *statep)
    790 {
    791 	rds_hca_t	*hcap;
    792 	ib_gid_t	gid;
    793 	uint_t		jx, nbinds = 0, nports = 0;
    794 	int		ret;
    795 
    796 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
    797 
    798 	rw_enter(&statep->rds_hca_lock, RW_READER);
    799 
    800 	hcap = statep->rds_hcalistp;
    801 	while (hcap != NULL) {
    802 
    803 		/* skip the HCAs that are not fully online */
    804 		if ((hcap->hca_state != RDS_HCA_STATE_OPEN) &&
    805 		    (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) {
    806 			RDS_DPRINTF2("rds_bind_service",
    807 			    "Skipping HCA: 0x%llx, state: %d",
    808 			    hcap->hca_guid, hcap->hca_state);
    809 			hcap = hcap->hca_nextp;
    810 			continue;
    811 		}
    812 
    813 		/* currently, we have space for only 4 bindhdls */
    814 		ASSERT(hcap->hca_nports < 4);
    815 		for (jx = 0; jx < hcap->hca_nports; jx++) {
    816 			nports++;
    817 			if (hcap->hca_pinfop[jx].p_linkstate !=
    818 			    IBT_PORT_ACTIVE) {
    819 				/*
    820 				 * service bind will be called in the async
    821 				 * handler when the port comes up. Clear any
    822 				 * stale bind handle.
    823 				 */
    824 				hcap->hca_bindhdl[jx] = NULL;
    825 				continue;
    826 			}
    827 
    828 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
    829 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
    830 			    "gid: %llx:%llx", hcap->hca_guid,
    831 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
    832 			    gid.gid_guid);
    833 
    834 			/* pass statep as cm_private */
    835 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
    836 			    NULL, statep, &hcap->hca_bindhdl[jx]);
    837 			if (ret != IBT_SUCCESS) {
    838 				RDS_DPRINTF2(LABEL, "Bind service for "
    839 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
    840 				    "failed: %d", hcap->hca_guid,
    841 				    hcap->hca_pinfop[jx].p_port_num,
    842 				    gid.gid_prefix, gid.gid_guid, ret);
    843 				continue;
    844 			}
    845 
    846 			nbinds++;
    847 		}
    848 		hcap = hcap->hca_nextp;
    849 	}
    850 
    851 	rw_exit(&statep->rds_hca_lock);
    852 
    853 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
    854 	    nbinds, nports);
    855 
    856 #if 0
    857 	if (nbinds == 0) {
    858 		return (-1);
    859 	}
    860 #endif
    861 
    862 	RDS_DPRINTF2("rds_bind_service", "Return");
    863 
    864 	return (0);
    865 }
    866 
    867 /* Open an RC connection */
    868 int
    869 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
    870     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
    871 {
    872 	rds_session_t		*sp;
    873 	ibt_chan_open_args_t	ocargs;
    874 	ibt_rc_returns_t	ocrets;
    875 	rds_cm_private_data_t	cmp;
    876 	uint8_t			hca_port;
    877 	ibt_channel_hdl_t	hdl;
    878 	ibt_status_t		ret = 0;
    879 	ibt_ip_cm_info_t	ipcm_info;
    880 
    881 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
    882 
    883 	sp = ep->ep_sp;
    884 
    885 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
    886 	ipcm_info.src_addr.family = AF_INET;
    887 	ipcm_info.SRCIP = sp->session_myip;
    888 	ipcm_info.dst_addr.family = AF_INET;
    889 	ipcm_info.DSTIP = sp->session_remip;
    890 	ipcm_info.src_port = RDS_PORT_NUM;
    891 	ret = ibt_format_ip_private_data(&ipcm_info,
    892 	    sizeof (rds_cm_private_data_t), &cmp);
    893 	if (ret != IBT_SUCCESS) {
    894 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
    895 		    "failed: %d", sp, ep, ret);
    896 		return (-1);
    897 	}
    898 
    899 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
    900 
    901 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
    902 	if (hdl == NULL) {
    903 		return (-1);
    904 	}
    905 
    906 	cmp.cmp_version = RDS_VERSION;
    907 	cmp.cmp_arch = RDS_THIS_ARCH;
    908 	cmp.cmp_eptype = ep->ep_type;
    909 	cmp.cmp_failover = sp->session_failover;
    910 	cmp.cmp_last_bufid = ep->ep_rbufid;
    911 	cmp.cmp_user_buffer_size = UserBufferSize;
    912 	cmp.cmp_ack_addr = ep->ep_ack_addr;
    913 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
    914 
    915 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
    916 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
    917 	ocargs.oc_path = pinfo;
    918 	ocargs.oc_cm_handler = rds_cm_handler;
    919 	ocargs.oc_cm_clnt_private = NULL;
    920 	ocargs.oc_rdma_ra_out = 4;
    921 	ocargs.oc_rdma_ra_in = 4;
    922 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
    923 	ocargs.oc_priv_data = &cmp;
    924 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
    925 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
    926 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
    927 	    mode, &ocargs, &ocrets);
    928 	if (ret != IBT_SUCCESS) {
    929 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
    930 		    "failed: %d", sp, ep, ret);
    931 		(void) ibt_flush_channel(hdl);
    932 		(void) ibt_free_channel(hdl);
    933 
    934 		mutex_enter(&ep->ep_lock);
    935 		/* don't cleanup if this failure is due to peer-peer race */
    936 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
    937 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
    938 			ep->ep_state = RDS_EP_STATE_ERROR;
    939 			rds_ep_free_rc_channel(ep);
    940 		}
    941 		mutex_exit(&ep->ep_lock);
    942 
    943 		return (-1);
    944 	}
    945 
    946 	*chanhdl = hdl;
    947 
    948 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
    949 	    *chanhdl);
    950 
    951 	return (0);
    952 }
    953 
    954 int
    955 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
    956 {
    957 	int	ret;
    958 
    959 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
    960 	    chanhdl, mode);
    961 
    962 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
    963 
    964 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
    965 
    966 	return (ret);
    967 }
    968