Home | History | Annotate | Download | only in rpc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
     27  *
     28  * Portions of this source code were developed by the team members of
     29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
     30  * headed by Professor Dhabaleswar K. (DK) Panda.
     31  *
     32  * Acknowledgements to contributions from developers:
     33  *   Ranjit Noronha: noronha (at) cse.ohio-state.edu
     34  *   Lei Chai      : chail (at) cse.ohio-state.edu
     35  *   Weikuan Yu    : yuw (at) cse.ohio-state.edu
     36  *
     37  */
     38 
     39 
     40 #ifndef _IB_H
     41 #define	_IB_H
     42 
     43 /*
     44  * ib.h, rpcib plugin interface.
     45  */
     46 
     47 #include <sys/types.h>
     48 #include <sys/ddi.h>
     49 #include <sys/sunddi.h>
     50 #include <sys/conf.h>
     51 #include <sys/stat.h>
     52 #include <rpc/rpc.h>
     53 #include <rpc/rpc_rdma.h>
     54 #include <sys/ib/ibtl/ibti.h>
     55 #include <sys/avl.h>
     56 
     57 #ifdef __cplusplus
     58 extern "C" {
     59 #endif
     60 
     61 #define	MAX_BUFS	1024	/* max no. of buffers per pool */
     62 
     63 #define	DEF_CQ_SIZE	4096 - 1	/* default CQ size */
     64 				/*
     65 				 * Tavor returns the next higher power of 2
     66 				 * CQ entries than the requested size.
     67 				 * For instance, if you request (2^12 - 1)
     68 				 * CQ entries, Tavor returns 2^12 entries.
     69 				 * 4K CQ entries suffice.  Hence, 4096 - 1.
     70 				 */
     71 #define	DEF_SQ_SIZE	128	/* default SendQ size */
     72 #define	DEF_RQ_SIZE	256	/* default RecvQ size */
     73 #define	DSEG_MAX	2
     74 #define	RQ_DSEG_MAX	1	/* default RQ data seg */
     75 #define	IBSRM_HB	0x8000	/* high order bit of pkey */
     76 
     77 /* max no. of refresh attempts on IBT_CM_CONN_STALE error */
     78 #define	REFRESH_ATTEMPTS	3
     79 
     80 typedef struct rib_hca_s rib_hca_t;	/* per-HCA structure */
     81 typedef struct rib_qp_s rib_qp_t;	/* per-QP data structure */
     82 typedef struct rib_cq_s rib_cq_t;	/* CQ structure */
     83 
     84 /*
     85  * Notification for RDMA_DONE is based on xid.
         * Each entry records one XID waiting for RDMA_DONE together with the
         * condition variable for that wait, and carries next/prev links.
     86  */
     87 struct rdma_done_list {
     88 	uint32_t	xid;		/* XID waiting for RDMA_DONE */
     89 	kcondvar_t	rdma_done_cv;	/* cv for RDMA_DONE */
     90 	struct rdma_done_list	*next;	/* next entry */
     91 	struct rdma_done_list	*prev;	/* previous entry */
     92 };
     93 
     94 /*
     95  * State of the plugin.
     96  * ACCEPT = accepting new connections and requests
     97  * NO_ACCEPT = not accepting new connection and requests
     98  */
     99 #define	ACCEPT		1
    100 #define	NO_ACCEPT	2
    101 
    102 /*
    103  * Send Wait states
    104  */
    105 #define	SEND_WAIT	-1
    106 
    107 /*
    108  * Reply states
    109  */
    110 #define	REPLY_WAIT	-1
    111 
    112 typedef void * rib_pvoid;		/* untyped pointer */
    113 typedef rib_pvoid RIB_SYNCMEM_HANDLE;	/* handle type; alias of rib_pvoid */
    114 
    115 /*
    116  * IB buffer pool management structures
    117  */
    118 
    119 /*
    120  * Buffer pool info: bookkeeping for one pool of equally sized elements.
         *
         * NOTE(review): buflist is declared with a single slot; presumably the
         * structure is over-allocated so that it can hold one pointer per
         * element -- confirm against the allocation site before changing it.
    121  */
    122 typedef struct {
    123 	kmutex_t	buflock;	/* lock for this structure */
    124 	caddr_t		buf;		/* pool address */
    125 	uint32_t	bufhandle;	/* rkey for this pool */
    126 	ulong_t		bufsize;	/* size of pool */
    127 	int		rsize;		/* size of each element */
    128 	int		numelems;	/* no. of elements allocated */
    129 	int		buffree;	/* no. of free elements */
    130 	void		*buflist[1];	/* free elements in pool */
    131 } bufpool_t;
    132 
    133 typedef struct {			/* a bufpool_t plus its MR data */
    134 	bufpool_t	*bpool;		/* buffer pool info */
    135 	ibt_mr_hdl_t	*mr_hdl;	/* memory region handle(s) */
    136 	ibt_mr_desc_t	*mr_desc;	/* vaddr, lkey, rkey */
    137 } rib_bufpool_t;
    138 
    139 /*
    140  * ATS related defines and structures.
    141  */
    142 #define	ATS_AR_DATA_LEN	16
    143 #define	IBD_NAME	"ibd"
    144 #define	N_IBD_INSTANCES	4
    145 
    146 
    147 /*
    148  * Service types supported by RPCIB
    149  * For now only NFS is supported.
         * NOTE(review): NLM is defined below even though only NFS is listed as
         * supported -- confirm whether NLM is actually used.
    150  */
    151 #define	NFS		1
    152 #define	NLM		2
    153 
    154 /*
    155  * Tracks consumer state (client or server).
         * Stored in rib_qp_s.mode.
    156  */
    157 typedef enum {
    158 	RIB_SERVER,		/* value 0 */
    159 	RIB_CLIENT		/* value 1 */
    160 } rib_mode_t;
    161 
    162 /*
    163  * CQ structure: pairs an IBT CQ handle with a pointer to an HCA.
    164  */
    165 struct rib_cq_s {
    166 	rib_hca_t		*rib_hca;	/* pointer to per-HCA structure */
    167 	ibt_cq_hdl_t		rib_cq_hdl;	/* IBT CQ handle */
    168 };
    169 
    170 /*
    171  * Each registered service's data structure.
         * Entries are chained through the next pointer.
    172  */
    173 typedef struct rib_service_s rib_service_t;
    174 struct rib_service_s {
    175 	uint32_t		srv_type;	/* i.e, NFS, NLM, v4CBD */
    176 	ibt_srv_hdl_t		srv_hdl;	/* from ibt_register call */
    177 	ib_svc_id_t		srv_id;		/* IB service id */
    178 	rib_service_t		*next;		/* next service entry */
    179 };
    180 
    181 /*
    182  * RPCIB plugin state
         * NOTE(review): fields without a comment are not described by this
         * header; their exact roles should be confirmed in the implementation.
    183  */
    184 typedef struct rpcib_state {
    185 	ibt_clnt_hdl_t		ibt_clnt_hdl;	/* IBT client handle */
    186 	uint32_t		hca_count;
    187 	uint32_t		nhca_inited;
    188 	rib_hca_t		*hcas_list;	/* chained via rib_hca_s.next */
    189 	krwlock_t		hcas_list_lock;	/* protects hcas_list */
    190 	int			refcount;
    191 	kmutex_t		open_hca_lock;
    192 	queue_t			*q;		/* up queue for a serv_type */
    193 	void			*private;
    194 	rib_service_t		*service_list;	/* chained via rib_service_s.next */
    195 	krwlock_t		service_list_lock; /* presumably for service_list */
    196 	kmutex_t		listen_lock;
    197 } rpcib_state_t;
    198 
    199 /*
    200  * Connection lists: a list head together with the rwlock for the list.
    201  */
    202 typedef struct {
    203 	krwlock_t	conn_lock;	/* list lock */
    204 	CONN		*conn_hd;	/* list head */
    205 } rib_conn_list_t;
    206 
    207 enum hca_state {		/* values of rib_hca_s.state */
    208 	HCA_DETACHED,		/* hca in detached state */
    209 	HCA_INITED,		/* hca in up and running state */
    210 };
    211 
    212 typedef struct rib_hca_service_s rib_hca_service_t; /* see bound_services */
    213 struct rib_hca_service_s {
    214 	ib_svc_id_t	srv_id;		/* IB service id */
    215 	ib_gid_t	gid;		/* IB GID */
    216 	ibt_sbind_hdl_t	sbind_hdl;	/* IBT service bind handle */
    217 	rib_hca_service_t *next;	/* next entry */
    218 };
    219 
    220 /*
    221  * RPCIB per HCA structure
         * Instances are chained through the next pointer at the end.
    222  */
    223 struct rib_hca_s {
    224 	ibt_clnt_hdl_t		ibt_clnt_hdl;	/* IBT client handle */
    225 
    226 	/*
    227 	 * per HCA.
    228 	 */
    229 	ibt_hca_hdl_t		hca_hdl;	/* HCA handle */
    230 	ibt_hca_attr_t		hca_attrs;	/* HCA attributes */
    231 	ibt_pd_hdl_t		pd_hdl;		/* IBT PD handle */
    232 	rib_hca_service_t	*bound_services; /* chained via ->next */
    233 	krwlock_t		bound_services_lock;
    234 	ib_guid_t		hca_guid;
    235 	uint32_t		hca_nports;
    236 	ibt_hca_portinfo_t	*hca_ports;
    237 	size_t			hca_pinfosz;
    238 	enum hca_state		state;		/* state of HCA */
    239 	krwlock_t		state_lock;	/* protects state field */
    240 	bool_t			inuse;		/* indicates HCA usage */
    241 	kmutex_t		inuse_lock;	/* protects inuse field */
    242 
    243 	rib_conn_list_t		cl_conn_list;	/* client conn list */
    244 	rib_conn_list_t		srv_conn_list;	/* server conn list */
    245 
	/*
	 * NOTE(review): presumably client/server send and receive CQs
	 * (compare send_cq/recv_cq in rib_qp_s) -- confirm.
	 */
    246 	rib_cq_t		*clnt_scq;
    247 	rib_cq_t		*clnt_rcq;
    248 	rib_cq_t		*svc_scq;
    249 	rib_cq_t		*svc_rcq;
    250 	kmutex_t		cb_lock;
    251 	kcondvar_t		cb_cv;
    252 
    253 	rib_bufpool_t		*recv_pool;	/* recv buf pool */
    254 	rib_bufpool_t		*send_pool;	/* send buf pool */
    255 
    256 	void			*iblock;	/* interrupt cookie */
    257 
    258 	kmem_cache_t	*server_side_cache;	/* long reply pool */
    259 	avl_tree_t	avl_tree;
    260 	kmutex_t	avl_lock;
    261 	krwlock_t	avl_rw_lock;
    262 	volatile bool_t avl_init;
    263 	kmutex_t	cache_allocation_lock;
    264 	ddi_taskq_t	*cleanup_helper;
    265 	ib_svc_id_t	srv_id;
    266 	ibt_srv_hdl_t 	srv_hdl;
    267 	uint_t		reg_state;
    268 
	/*
	 * NOTE(review): the counters below look like statistics for the
	 * cache above; what updates them is not visible here -- confirm.
	 */
    269 	volatile uint64_t	cache_allocation;
    270 	uint64_t	cache_hits;
    271 	uint64_t	cache_misses;
    272 	uint64_t	cache_cold_misses;
    273 	uint64_t	cache_hot_misses;
    274 	uint64_t	cache_misses_above_the_limit;
    275 
    276 	struct rib_hca_s *next;		/* next per-HCA structure */
    277 };
    278 
    279 
    280 /*
    281  * Structure on wait state of a post send
         * NOTE(review): the roles of c, c1/l1, c2/l2 and wl/rl are not
         * described by this header -- confirm in the code that fills them in.
    282  */
    283 struct send_wid {
    284 	uint32_t 	xid;
    285 	int		cv_sig;
    286 	kmutex_t	sendwait_lock;
    287 	kcondvar_t	wait_cv;
    288 	uint_t		status;
    289 	rib_qp_t	*qp;			/* pointer to per-QP data */
    290 	int		nsbufs;			/* # of send buffers posted */
    291 	uint64_t	sbufaddr[DSEG_MAX];	/* posted send buffers */
    292 	caddr_t		c;
    293 	caddr_t		c1;
    294 	int		l1;
    295 	caddr_t		c2;
    296 	int		l2;
    297 	int		wl, rl;
    298 };
    299 
    300 /*
    301  * Structure on reply descriptor for recv queue.
    302  * Different from the above posting of a descriptor.
         * Entries carry next/prev links; rib_qp_s.replylist points at them.
    303  */
    304 struct reply {
    305 	uint32_t 	xid;
    306 	uint_t		status;
    307 	uint64_t	vaddr_cq;	/* buf addr from CQ */
    308 	uint_t		bytes_xfer;
    309 	kcondvar_t	wait_cv;
    310 	struct reply	*next;		/* next entry */
    311 	struct reply 	*prev;		/* previous entry */
    312 };
    313 
    314 struct svc_recv {	/* NOTE(review): presumably server-side recv info */
    315 	rib_qp_t	*qp;		/* pointer to per-QP data */
    316 	uint64_t	vaddr;
    317 	uint_t		bytes_xfer;
    318 };
    319 
    320 struct recv_wid {	/* recv counterpart of send_wid: xid, QP, buffer */
    321 	uint32_t 	xid;
    322 	rib_qp_t	*qp;	/* pointer to per-QP data */
    323 	uint64_t	addr;	/* posted buf addr */
    324 };
    325 
    326 /*
    327  * Per QP data structure
         * Embeds the CONN (rdmaconn) that the qptoc() macro returns.
    328  */
    329 struct rib_qp_s {
    330 	rib_hca_t		*hca;	/* pointer to per-HCA structure */
    331 	rib_mode_t		mode;	/* RIB_SERVER or RIB_CLIENT */
    332 	CONN			rdmaconn;	/* embedded; see qptoc() */
    333 	ibt_channel_hdl_t	qp_hdl;
    334 	uint_t			port_num;
    335 	ib_qpn_t		qpn;
    336 	int			chan_flags;
    337 	clock_t			timeout;
    338 	ibt_rc_chan_query_attr_t	qp_q_attrs;
    339 	rib_cq_t		*send_cq;	/* send CQ */
    340 	rib_cq_t		*recv_cq;	/* recv CQ */
    341 
    342 	/*
    343 	 * Number of pre-posted rbufs
    344 	 */
    345 	uint_t			n_posted_rbufs;
    346 	kcondvar_t 		posted_rbufs_cv;
    347 	kmutex_t		posted_rbufs_lock;
    348 
    349 	/*
    350 	 * Number of SENDs pending completion
    351 	 */
    352 
    353 	uint_t			n_send_rbufs;
    354 	kcondvar_t 		send_rbufs_cv;
    355 	kmutex_t		send_rbufs_lock;
    356 
    357 	/*
    358 	 * RPC reply
    359 	 */
    360 	uint_t			rep_list_size;
    361 	struct reply		*replylist;
    362 	kmutex_t		replylist_lock;
    363 
    364 	/*
    365 	 * server only, RDMA_DONE
    366 	 */
    367 	struct rdma_done_list	*rdlist;
    368 	kmutex_t		rdlist_lock;
    369 
    370 	kmutex_t		cb_lock;
    371 	kcondvar_t 		cb_conn_cv;
    372 
    373 	caddr_t			q;	/* upstream queue */
    374 	struct send_wid		wd;	/* embedded post-send wait state */
    375 };
    376 
    377 #define	ctoqp(conn)	((rib_qp_t *)((conn)->c_private))	/* c_private as rib_qp_t * */
    378 #define	qptoc(rqp)	((CONN *)&((rqp)->rdmaconn))	/* address of embedded CONN */
    379 
    380 /*
    381  * Timeout for various calls
         * NOTE(review): the unit of these values is not stated in this header
         * -- confirm at the use sites before changing them.
    382  */
    383 #define	CONN_WAIT_TIME	40
    384 #define	SEND_WAIT_TIME	40	/* time for send completion */
    385 
    386 #define	REPLY_WAIT_TIME	40	/* time to get reply from remote QP */
    387 
    388 #ifdef __cplusplus
    389 }
    390 #endif
    391 
    392 #endif	/* !_IB_H */
    393