Home | History | Annotate | Download | only in ibd
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * An implementation of the IPoIB standard based on PSARC 2001/289.
     29  */
     30 
     31 #include <sys/types.h>
     32 #include <sys/conf.h>
     33 #include <sys/ddi.h>
     34 #include <sys/sunddi.h>
     35 #include <sys/modctl.h>
     36 #include <sys/stropts.h>
     37 #include <sys/stream.h>
     38 #include <sys/strsun.h>
     39 #include <sys/strsubr.h>
     40 #include <sys/dlpi.h>
     41 #include <sys/mac_provider.h>
     42 
     43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
     44 #include <sys/sysmacros.h>	/* for offsetof */
     45 #include <sys/disp.h>		/* for async thread pri */
     46 #include <sys/atomic.h>		/* for atomic_add*() */
     47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
     48 #include <netinet/in.h>		/* for netinet/ip.h below */
     49 #include <netinet/ip.h>		/* for struct ip */
     50 #include <netinet/udp.h>	/* for struct udphdr */
     51 #include <inet/common.h>	/* for inet/ip.h below */
     52 #include <inet/ip.h>		/* for ipha_t */
     53 #include <inet/ip6.h>		/* for ip6_t */
     54 #include <inet/tcp.h>		/* for tcph_t */
     55 #include <netinet/icmp6.h>	/* for icmp6_t */
     56 #include <sys/callb.h>
     57 #include <sys/modhash.h>
     58 
     59 #include <sys/ib/clients/ibd/ibd.h>
     60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
     61 #include <sys/note.h>
     62 #include <sys/multidata.h>
     63 
     64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
     65 
     66 /*
     67  * Per-interface tunables (for developers)
     68  *
     69  * ibd_tx_copy_thresh
     70  *     This sets the threshold at which ibd will attempt to do a bcopy of the
     71  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
     72  *     is restricted by various parameters, so this value must be changed
     73  *     only after careful consideration.  For instance, IB HCAs currently
     74  *     impose a relatively small limit (when compared to ethernet NICs) on the
     75  *     length of the SGL for transmit. On the other hand, the ip stack could
     76  *     send down mp chains that are quite long when LSO is enabled.
     77  *
     78  * ibd_num_swqe
     79  *     Number of "send WQE" elements that will be allocated and used by ibd.
     80  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
     81  *     buffer in each of these send wqes must be taken into account. This
     82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
     83  *     currently set to the same value as ibd_tx_copy_thresh, but may be
     84  *     changed independently if needed).
     85  *
     86  * ibd_num_rwqe
     87  *     Number of "receive WQE" elements that will be allocated and used by
     88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
     89  *     Each buffer in the receive wqe will be of MTU size.
     90  *
     91  * ibd_num_lso_bufs
     92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
     93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
     94  *     and too large to be used with regular MTU-sized copy buffers. It is
     95  *     not recommended to tune this variable without understanding the
     96  *     application environment and/or memory resources. The size of each of
     97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
     98  *
     99  * ibd_num_ah
    100  *     Number of AH cache entries to allocate
    101  *
    102  * ibd_hash_size
    103  *     Hash table size for the active AH list
    104  *
    105  * ibd_tx_softintr
    106  * ibd_rx_softintr
    107  *     The softintr mechanism allows ibd to avoid event queue overflows if
    108  *     the receive/completion handlers are expensive. These are enabled
    109  *     by default.
    110  *
    111  * ibd_log_sz
    112  *     This specifies the size of the ibd log buffer in bytes. The buffer is
    113  *     allocated and logging is enabled only when IBD_LOGGING is defined.
    114  *
    115  */
/*
 * Default values for the per-interface tunables described in the comment
 * above.  All are plain globals of type uint_t.
 */
uint_t ibd_tx_copy_thresh = 0x1000;	/* 4096; also used as IBD_TX_BUF_SZ */
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;	/* 1024 */
uint_t ibd_num_ah = 256;
uint_t ibd_hash_size = 32;
uint_t ibd_rx_softintr = 1;		/* enabled by default */
uint_t ibd_tx_softintr = 1;		/* enabled by default */
/*
 * NOTE(review): ibd_create_broadcast_group is not described in the
 * tunables comment above.  Presumably a nonzero value allows the driver
 * to create the broadcast group when none is found -- confirm against
 * the code that reads it.
 */
uint_t ibd_create_broadcast_group = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;		/* 131072 bytes */
#endif
    128 
    129 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
    130 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
    131 #define	IBD_NUM_SWQE			ibd_num_swqe
    132 #define	IBD_NUM_RWQE			ibd_num_rwqe
    133 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
    134 #define	IBD_NUM_AH			ibd_num_ah
    135 #define	IBD_HASH_SIZE			ibd_hash_size
    136 #ifdef IBD_LOGGING
    137 #define	IBD_LOG_SZ			ibd_log_sz
    138 #endif
    139 
    140 /*
    141  * ibd_rc_tx_copy_thresh
    142  *     This sets the threshold up to which ibd will attempt to do a bcopy of the
    143  *     outgoing data into a pre-mapped buffer.
    144  */
    145 uint_t ibd_rc_tx_copy_thresh = 0x1000;
    146 
    147 /*
    148  * Receive CQ moderation parameters: tunable (for developers)
    149  */
    150 uint_t ibd_rxcomp_count = 4;
    151 uint_t ibd_rxcomp_usec = 10;
    152 
    153 /*
    154  * Send CQ moderation parameters: tunable (for developers)
    155  */
    156 uint_t ibd_txcomp_count = 16;
    157 uint_t ibd_txcomp_usec = 300;
    158 
    159 /* Post IBD_RX_POST_CNT receive work requests at a time. */
    160 #define	IBD_RX_POST_CNT			8
    161 
    162 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
    163 #define	IBD_LOG_RX_POST			4
    164 
    165 /* Minimum number of receive work requests driver needs to always have */
    166 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
    167 
    168 /*
    169  * LSO parameters
    170  */
    171 #define	IBD_LSO_MAXLEN			65536
    172 #define	IBD_LSO_BUFSZ			8192
    173 #define	IBD_PROP_LSO_POLICY		"lso-policy"
    174 
    175 /*
    176  * Async operation states
    177  */
    178 #define	IBD_OP_NOTSTARTED		0
    179 #define	IBD_OP_ONGOING			1
    180 #define	IBD_OP_COMPLETED		2
    181 #define	IBD_OP_ERRORED			3
    182 #define	IBD_OP_ROUTERED			4
    183 
    184 /*
    185  * State of IBD driver initialization during attach/m_start
    186  */
    187 #define	IBD_DRV_STATE_INITIALIZED	0x00001
    188 #define	IBD_DRV_RXINTR_ADDED		0x00002
    189 #define	IBD_DRV_TXINTR_ADDED		0x00004
    190 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
    191 #define	IBD_DRV_HCA_OPENED		0x00010
    192 #define	IBD_DRV_PD_ALLOCD		0x00020
    193 #define	IBD_DRV_MAC_REGISTERED		0x00040
    194 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
    195 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
    196 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
    197 #define	IBD_DRV_CQS_ALLOCD		0x00400
    198 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
    199 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
    200 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
    201 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
    202 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
    203 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
    204 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
    205 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
    206 #define	IBD_DRV_STARTED			0x80000
    207 #define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
    208 #define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
    209 #define	IBD_DRV_RC_LISTEN		0x400000
    210 #ifdef DEBUG
    211 #define	IBD_DRV_RC_PRIVATE_STATE	0x800000
    212 #endif
    213 
    214 /*
    215  * Start/stop in-progress flags; note that restart must always remain
    216  * the OR of start and stop flag values.
    217  */
    218 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
    219 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
    220 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
    221 
    222 /*
    223  * Miscellaneous constants
    224  */
    225 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
    226 #define	IBD_DEF_MAX_SDU			2044
    227 #define	IBD_DEFAULT_QKEY		0xB1B
    228 #ifdef IBD_LOGGING
    229 #define	IBD_DMAX_LINE			100
    230 #endif
    231 
    232 /*
    233  * Enumerations for link states
    234  */
/*
 * Enumerators take the default values 0, 1 and 2 in declaration order.
 */
typedef enum {
	IBD_LINK_DOWN,		/* 0 */
	IBD_LINK_UP,		/* 1 */
	/* 2; NOTE(review): exact meaning of "absent" is not evident here */
	IBD_LINK_UP_ABSENT
} ibd_link_op_t;
    240 
    241 /*
    242  * Driver State Pointer
    243  */
    244 void *ibd_list;
    245 
    246 /*
    247  * Logging
    248  */
    249 #ifdef IBD_LOGGING
    250 kmutex_t ibd_lbuf_lock;
    251 uint8_t *ibd_lbuf;
    252 uint32_t ibd_lbuf_ndx;
    253 #endif
    254 
    255 /*
    256  * Required system entry points
    257  */
    258 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
    259 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
    260 
    261 /*
    262  * Required driver entry points for GLDv3
    263  */
    264 static int ibd_m_stat(void *, uint_t, uint64_t *);
    265 static int ibd_m_start(void *);
    266 static void ibd_m_stop(void *);
    267 static int ibd_m_promisc(void *, boolean_t);
    268 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
    269 static int ibd_m_unicst(void *, const uint8_t *);
    270 static mblk_t *ibd_m_tx(void *, mblk_t *);
    271 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
    272 
    273 /*
    274  * Private driver entry points for GLDv3
    275  */
    276 
    277 /*
    278  * Initialization
    279  */
    280 static int ibd_state_init(ibd_state_t *, dev_info_t *);
    281 static int ibd_init_txlist(ibd_state_t *);
    282 static int ibd_init_rxlist(ibd_state_t *);
    283 static int ibd_acache_init(ibd_state_t *);
    284 #ifdef IBD_LOGGING
    285 static void ibd_log_init(void);
    286 #endif
    287 
    288 /*
    289  * Termination/cleanup
    290  */
    291 static void ibd_state_fini(ibd_state_t *);
    292 static void ibd_fini_txlist(ibd_state_t *);
    293 static void ibd_fini_rxlist(ibd_state_t *);
    294 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
    295 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
    296 static void ibd_acache_fini(ibd_state_t *);
    297 #ifdef IBD_LOGGING
    298 static void ibd_log_fini(void);
    299 #endif
    300 
    301 /*
    302  * Allocation/acquire/map routines
    303  */
    304 static int ibd_alloc_tx_copybufs(ibd_state_t *);
    305 static int ibd_alloc_rx_copybufs(ibd_state_t *);
    306 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
    307 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
    308 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
    309     uint32_t *);
    310 
    311 /*
    312  * Free/release/unmap routines
    313  */
    314 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
    315 static void ibd_free_tx_copybufs(ibd_state_t *);
    316 static void ibd_free_rx_copybufs(ibd_state_t *);
    317 static void ibd_free_rx_rsrcs(ibd_state_t *);
    318 static void ibd_free_tx_lsobufs(ibd_state_t *);
    319 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
    320 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
    321 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
    322 
    323 /*
    324  * Handlers/callback routines
    325  */
    326 static uint_t ibd_intr(caddr_t);
    327 static uint_t ibd_tx_recycle(caddr_t);
    328 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
    329 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
    330 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
    331 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
    332 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
    333 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
    334 static void ibd_freemsg_cb(char *);
    335 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    336     ibt_async_event_t *);
    337 static void ibd_snet_notices_handler(void *, ib_gid_t,
    338     ibt_subnet_event_code_t, ibt_subnet_event_t *);
    339 
    340 /*
    341  * Send/receive routines
    342  */
    343 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
    344 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
    345 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
    346 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
    347 
    348 /*
    349  * Threads
    350  */
    351 static void ibd_async_work(ibd_state_t *);
    352 
    353 /*
    354  * Async tasks
    355  */
    356 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
    357 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
    358 static void ibd_async_setprom(ibd_state_t *);
    359 static void ibd_async_unsetprom(ibd_state_t *);
    360 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
    361 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
    362 static void ibd_async_txsched(ibd_state_t *);
    363 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
    364 
    365 /*
    366  * Async task helpers
    367  */
    368 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
    369 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
    370 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
    371 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
    372     ipoib_mac_t *, ipoib_mac_t *);
    373 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
    374 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
    375 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
    376 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
    377 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
    378 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
    379 static uint64_t ibd_get_portspeed(ibd_state_t *);
    380 static boolean_t ibd_async_safe(ibd_state_t *);
    381 static void ibd_async_done(ibd_state_t *);
    382 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
    383 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
    384 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
    385 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
    386 
    387 /*
    388  * Helpers for attach/start routines
    389  */
    390 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
    391 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
    392 static int ibd_unattach(ibd_state_t *, dev_info_t *);
    393 static int ibd_get_port_details(ibd_state_t *);
    394 static int ibd_alloc_cqs(ibd_state_t *);
    395 static int ibd_setup_ud_channel(ibd_state_t *);
    396 static int ibd_start(ibd_state_t *);
    397 static int ibd_undo_start(ibd_state_t *, link_state_t);
    398 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
    399 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
    400 
    401 
    402 /*
    403  * Miscellaneous helpers
    404  */
    405 static int ibd_sched_poll(ibd_state_t *, int, int);
    406 static void ibd_resume_transmission(ibd_state_t *);
    407 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
    408 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
    409 static void *list_get_head(list_t *);
    410 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
    411 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
    412 #ifdef IBD_LOGGING
    413 static void ibd_log(const char *, ...);
    414 #endif
    415 
    416 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    417     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
    418 
    419 /* Module Driver Info */
/*
 * Driver linkage record for this module.  It is referenced by
 * ibd_modlinkage below and points at ibd_dev_ops, which is defined by the
 * DDI_DEFINE_STREAM_OPS() invocation above.
 */
static struct modldrv ibd_modldrv = {
	&mod_driverops,			/* This one is a driver */
	"InfiniBand GLDv3 Driver",	/* short description */
	&ibd_dev_ops			/* driver specific ops */
};
    425 
    426 /* Module Linkage */
/*
 * Handed to mod_install() in _init().  Holds the revision constant and a
 * single linkage entry (ibd_modldrv).
 * NOTE(review): the trailing NULL presumably terminates the linkage
 * list -- confirm against struct modlinkage.
 */
static struct modlinkage ibd_modlinkage = {
	MODREV_1, (void *)&ibd_modldrv, NULL
};
    430 
    431 /*
    432  * Module (static) info passed to IBTL during ibt_attach
    433  */
/*
 * Positional initializer; the member names are not visible in this file.
 * NOTE(review): the per-field notes below are inferred from the values
 * only -- confirm against the definition of struct ibt_clnt_modinfo_s.
 */
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
	IBTI_V_CURR,		/* presumably the IBTI version in use */
	IBT_NETWORK,		/* presumably the client class */
	ibd_async_handler,	/* async event handler, declared above */
	NULL,			/* not supplied by this driver */
	"IPIB"			/* presumably the client name string */
};
    441 
    442 /*
    443  * GLDv3 entry points
    444  */
/*
 * IBD_M_CALLBACK_FLAGS is the first initializer of ibd_m_callbacks and
 * carries only MC_GETCAPAB.
 * NOTE(review): presumably this tells the MAC layer that the
 * ibd_m_getcapab slot is populated -- confirm against mac_provider.h.
 */
#define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
/*
 * Positional table of the GLDv3 entry points declared earlier in this
 * file.  The order of the initializers must match mac_callbacks_t.
 */
static mac_callbacks_t ibd_m_callbacks = {
	IBD_M_CALLBACK_FLAGS,
	ibd_m_stat,
	ibd_m_start,
	ibd_m_stop,
	ibd_m_promisc,
	ibd_m_multicst,
	ibd_m_unicst,
	ibd_m_tx,
	/* no callback supplied; NOTE(review): confirm which member this is */
	NULL,
	ibd_m_getcapab
};
    458 
    459 /*
    460  * Fill/clear <scope> and <p_key> in multicast/broadcast address
    461  */
    462 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
    463 {							\
    464 	*(uint32_t *)((char *)(maddr) + 4) |=		\
    465 	    htonl((uint32_t)(scope) << 16);		\
    466 	*(uint32_t *)((char *)(maddr) + 8) |=		\
    467 	    htonl((uint32_t)(pkey) << 16);		\
    468 }
    469 
    470 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
    471 {							\
    472 	*(uint32_t *)((char *)(maddr) + 4) &=		\
    473 	    htonl(~((uint32_t)0xF << 16));		\
    474 	*(uint32_t *)((char *)(maddr) + 8) &=		\
    475 	    htonl(~((uint32_t)0xFFFF << 16));		\
    476 }
    477 
    478 /*
    479  * Rudimentary debugging support
    480  */
    481 #ifdef DEBUG
    482 int ibd_debuglevel = 100;
    483 void
    484 debug_print(int l, char *fmt, ...)
    485 {
    486 	va_list ap;
    487 
    488 	if (l < ibd_debuglevel)
    489 		return;
    490 	va_start(ap, fmt);
    491 	vcmn_err(CE_CONT, fmt, ap);
    492 	va_end(ap);
    493 }
    494 #endif
    495 
    496 /*
    497  * Common routine to print warning messages; adds in hca guid, port number
    498  * and pkey to be able to identify the IBA interface.
    499  */
    500 void
    501 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
    502 {
    503 	ib_guid_t hca_guid;
    504 	char ibd_print_buf[256];
    505 	int len;
    506 	va_list ap;
    507 
    508 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
    509 	    0, "hca-guid", 0);
    510 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
    511 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
    512 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
    513 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
    514 	va_start(ap, fmt);
    515 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
    516 	    fmt, ap);
    517 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
    518 	va_end(ap);
    519 }
    520 
    521 /*
    522  * Warlock directives
    523  */
    524 
    525 /*
    526  * id_lso_lock
    527  *
    528  * state->id_lso->bkt_nfree may be accessed without a lock to
    529  * determine the threshold at which we have to ask the nw layer
    530  * to resume transmission (see ibd_resume_transmission()).
    531  */
    532 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
    533     ibd_state_t::id_lso))
    534 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
    535 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
    536 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
    537 
    538 /*
    539  * id_scq_poll_lock
    540  */
    541 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
    542     ibd_state_t::id_scq_poll_busy))
    543 
    544 /*
    545  * id_txpost_lock
    546  */
    547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    548     ibd_state_t::id_tx_head))
    549 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    550     ibd_state_t::id_tx_busy))
    551 
    552 /*
    553  * id_acache_req_lock
    554  */
    555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    556     ibd_state_t::id_acache_req_cv))
    557 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    558     ibd_state_t::id_req_list))
    559 _NOTE(SCHEME_PROTECTS_DATA("atomic",
    560     ibd_acache_s::ac_ref))
    561 
    562 /*
    563  * id_ac_mutex
    564  *
    565  * This mutex is actually supposed to protect id_ah_op as well,
    566  * but this path of the code isn't clean (see update of id_ah_op
    567  * in ibd_async_acache(), immediately after the call to
    568  * ibd_async_mcache()). For now, we'll skip this check by
    569  * declaring that id_ah_op is protected by some internal scheme
    570  * that warlock isn't aware of.
    571  */
    572 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    573     ibd_state_t::id_ah_active))
    574 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    575     ibd_state_t::id_ah_free))
    576 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    577     ibd_state_t::id_ah_addr))
    578 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
    579     ibd_state_t::id_ah_op))
    580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    581     ibd_state_t::id_ah_error))
    582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    583     ibd_state_t::id_ac_hot_ace))
    584 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
    585 
    586 /*
    587  * id_mc_mutex
    588  */
    589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    590     ibd_state_t::id_mc_full))
    591 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    592     ibd_state_t::id_mc_non))
    593 
    594 /*
    595  * id_trap_lock
    596  */
    597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    598     ibd_state_t::id_trap_cv))
    599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    600     ibd_state_t::id_trap_stop))
    601 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    602     ibd_state_t::id_trap_inprog))
    603 
    604 /*
    605  * id_prom_op
    606  */
    607 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
    608     ibd_state_t::id_prom_op))
    609 
    610 /*
    611  * id_sched_lock
    612  */
    613 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
    614     ibd_state_t::id_sched_needed))
    615 
    616 /*
    617  * id_link_mutex
    618  */
    619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
    620     ibd_state_t::id_link_state))
    621 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
    622 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
    623     ibd_state_t::id_link_speed))
    624 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
    625 
    626 /*
    627  * id_tx_list.dl_mutex
    628  */
    629 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    630     ibd_state_t::id_tx_list.dl_head))
    631 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    632     ibd_state_t::id_tx_list.dl_pending_sends))
    633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    634     ibd_state_t::id_tx_list.dl_cnt))
    635 
    636 /*
    637  * id_rx_list.dl_mutex
    638  */
    639 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    640     ibd_state_t::id_rx_list.dl_bufs_outstanding))
    641 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    642     ibd_state_t::id_rx_list.dl_cnt))
    643 
    644 
    645 /*
    646  * Items protected by atomic updates
    647  */
    648 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
    649     ibd_state_s::id_brd_rcv
    650     ibd_state_s::id_brd_xmt
    651     ibd_state_s::id_multi_rcv
    652     ibd_state_s::id_multi_xmt
    653     ibd_state_s::id_num_intrs
    654     ibd_state_s::id_rcv_bytes
    655     ibd_state_s::id_rcv_pkt
    656     ibd_state_s::id_rx_post_queue_index
    657     ibd_state_s::id_tx_short
    658     ibd_state_s::id_xmt_bytes
    659     ibd_state_s::id_xmt_pkt
    660     ibd_state_s::rc_rcv_trans_byte
    661     ibd_state_s::rc_rcv_trans_pkt
    662     ibd_state_s::rc_rcv_copy_byte
    663     ibd_state_s::rc_rcv_copy_pkt
    664     ibd_state_s::rc_xmt_bytes
    665     ibd_state_s::rc_xmt_small_pkt
    666     ibd_state_s::rc_xmt_fragmented_pkt
    667     ibd_state_s::rc_xmt_map_fail_pkt
    668     ibd_state_s::rc_xmt_map_succ_pkt))
    669 
    670 /*
    671  * Non-mutex protection schemes for data elements. Almost all of
    672  * these are non-shared items.
    673  */
    674 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
    675     callb_cpr
    676     ib_gid_s
    677     ib_header_info
    678     ibd_acache_rq
    679     ibd_acache_s::ac_mce
    680     ibd_acache_s::ac_chan
    681     ibd_mcache::mc_fullreap
    682     ibd_mcache::mc_jstate
    683     ibd_mcache::mc_req
    684     ibd_rwqe_s
    685     ibd_swqe_s
    686     ibd_wqe_s
    687     ibt_wr_ds_s::ds_va
    688     ibt_wr_lso_s
    689     ipoib_mac::ipoib_qpn
    690     mac_capab_lso_s
    691     msgb::b_next
    692     msgb::b_cont
    693     msgb::b_rptr
    694     msgb::b_wptr
    695     ibd_state_s::id_bgroup_created
    696     ibd_state_s::id_mac_state
    697     ibd_state_s::id_mtu
    698     ibd_state_s::id_num_rwqe
    699     ibd_state_s::id_num_swqe
    700     ibd_state_s::id_qpnum
    701     ibd_state_s::id_rcq_hdl
    702     ibd_state_s::id_rx_buf_sz
    703     ibd_state_s::id_rx_bufs
    704     ibd_state_s::id_rx_mr_hdl
    705     ibd_state_s::id_rx_wqes
    706     ibd_state_s::id_rxwcs
    707     ibd_state_s::id_rxwcs_size
    708     ibd_state_s::id_rx_nqueues
    709     ibd_state_s::id_rx_queues
    710     ibd_state_s::id_scope
    711     ibd_state_s::id_scq_hdl
    712     ibd_state_s::id_tx_buf_sz
    713     ibd_state_s::id_tx_bufs
    714     ibd_state_s::id_tx_mr_hdl
    715     ibd_state_s::id_tx_rel_list.dl_cnt
    716     ibd_state_s::id_tx_wqes
    717     ibd_state_s::id_txwcs
    718     ibd_state_s::id_txwcs_size
    719     ibd_state_s::rc_listen_hdl
    720     ibd_state_s::rc_listen_hdl_OFED_interop
    721     ibd_state_s::rc_srq_size
    722     ibd_state_s::rc_srq_rwqes
    723     ibd_state_s::rc_srq_rx_bufs
    724     ibd_state_s::rc_srq_rx_mr_hdl
    725     ibd_state_s::rc_tx_largebuf_desc_base
    726     ibd_state_s::rc_tx_mr_bufs
    727     ibd_state_s::rc_tx_mr_hdl
    728     ipha_s
    729     icmph_s
    730     ibt_path_info_s::pi_sid
    731     ibd_rc_chan_s::ace
    732     ibd_rc_chan_s::chan_hdl
    733     ibd_rc_chan_s::state
    734     ibd_rc_chan_s::chan_state
    735     ibd_rc_chan_s::is_tx_chan
    736     ibd_rc_chan_s::rcq_hdl
    737     ibd_rc_chan_s::rcq_size
    738     ibd_rc_chan_s::scq_hdl
    739     ibd_rc_chan_s::scq_size
    740     ibd_rc_chan_s::requester_gid
    741     ibd_rc_chan_s::requester_pkey
    742     ibd_rc_chan_s::rx_bufs
    743     ibd_rc_chan_s::rx_mr_hdl
    744     ibd_rc_chan_s::rx_rwqes
    745     ibd_rc_chan_s::tx_wqes
    746     ibd_rc_chan_s::tx_mr_bufs
    747     ibd_rc_chan_s::tx_mr_hdl
    748     ibd_rc_chan_s::tx_rel_list.dl_cnt
    749     ibd_rc_chan_s::tx_trans_error_cnt
    750     ibd_rc_tx_largebuf_s::lb_buf
    751     ibd_rc_msg_hello_s
    752     ibt_cm_return_args_s))
    753 
    754 /*
    755  * ibd_rc_chan_s::next is protected by two mutexes:
    756  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
    757  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
    758  */
    759 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
    760     ibd_rc_chan_s::next))
    761 
    762 /*
    763  * ibd_state_s.rc_tx_large_bufs_lock
    764  */
    765 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    766     ibd_state_s::rc_tx_largebuf_free_head))
    767 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    768     ibd_state_s::rc_tx_largebuf_nfree))
    769 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    770     ibd_rc_tx_largebuf_s::lb_next))
    771 
    772 /*
    773  * ibd_acache_s.tx_too_big_mutex
    774  */
    775 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
    776     ibd_acache_s::tx_too_big_ongoing))
    777 
    778 /*
    779  * tx_wqe_list.dl_mutex
    780  */
    781 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    782     ibd_rc_chan_s::tx_wqe_list.dl_head))
    783 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    784     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
    785 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    786     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
    787 
    788 /*
    789  * ibd_state_s.rc_ace_recycle_lock
    790  */
    791 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
    792     ibd_state_s::rc_ace_recycle))
    793 
    794 /*
    795  * rc_srq_rwqe_list.dl_mutex
    796  */
    797 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    798     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
    799 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    800     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
    801 
    802 /*
    803  * Non-mutex protection schemes for data elements. They are counters
    804  * for problem diagnosis and do not need to be protected.
    805  */
    806 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    807     ibd_state_s::rc_rcv_alloc_fail
    808     ibd_state_s::rc_rcq_invoke
    809     ibd_state_s::rc_rcq_err
    810     ibd_state_s::rc_ace_not_found
    811     ibd_state_s::rc_xmt_drop_too_long_pkt
    812     ibd_state_s::rc_xmt_icmp_too_long_pkt
    813     ibd_state_s::rc_xmt_reenter_too_long_pkt
    814     ibd_state_s::rc_swqe_short
    815     ibd_state_s::rc_swqe_mac_update
    816     ibd_state_s::rc_xmt_buf_short
    817     ibd_state_s::rc_xmt_buf_mac_update
    818     ibd_state_s::rc_scq_no_swqe
    819     ibd_state_s::rc_scq_no_largebuf
    820     ibd_state_s::rc_scq_invoke
    821     ibd_state_s::rc_conn_succ
    822     ibd_state_s::rc_conn_fail
    823     ibd_state_s::rc_null_conn
    824     ibd_state_s::rc_no_estab_conn
    825     ibd_state_s::rc_act_close
    826     ibd_state_s::rc_pas_close
    827     ibd_state_s::rc_delay_ace_recycle
    828     ibd_state_s::rc_act_close_simultaneous
    829     ibd_state_s::rc_reset_cnt))
    830 
    831 #ifdef DEBUG
    832 /*
    833  * Non-mutex protection schemes for data elements. They are counters
    834  * for problem diagnosis and do not need to be protected.
    835  */
    836 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    837     ibd_state_s::rc_rwqe_short
    838     ibd_rc_stat_s::rc_rcv_trans_byte
    839     ibd_rc_stat_s::rc_rcv_trans_pkt
    840     ibd_rc_stat_s::rc_rcv_copy_byte
    841     ibd_rc_stat_s::rc_rcv_copy_pkt
    842     ibd_rc_stat_s::rc_rcv_alloc_fail
    843     ibd_rc_stat_s::rc_rcq_invoke
    844     ibd_rc_stat_s::rc_rcq_err
    845     ibd_rc_stat_s::rc_scq_invoke
    846     ibd_rc_stat_s::rc_rwqe_short
    847     ibd_rc_stat_s::rc_xmt_bytes
    848     ibd_rc_stat_s::rc_xmt_small_pkt
    849     ibd_rc_stat_s::rc_xmt_fragmented_pkt
    850     ibd_rc_stat_s::rc_xmt_map_fail_pkt
    851     ibd_rc_stat_s::rc_xmt_map_succ_pkt
    852     ibd_rc_stat_s::rc_ace_not_found
    853     ibd_rc_stat_s::rc_scq_no_swqe
    854     ibd_rc_stat_s::rc_scq_no_largebuf
    855     ibd_rc_stat_s::rc_swqe_short
    856     ibd_rc_stat_s::rc_swqe_mac_update
    857     ibd_rc_stat_s::rc_xmt_buf_short
    858     ibd_rc_stat_s::rc_xmt_buf_mac_update
    859     ibd_rc_stat_s::rc_conn_succ
    860     ibd_rc_stat_s::rc_conn_fail
    861     ibd_rc_stat_s::rc_null_conn
    862     ibd_rc_stat_s::rc_no_estab_conn
    863     ibd_rc_stat_s::rc_act_close
    864     ibd_rc_stat_s::rc_pas_close
    865     ibd_rc_stat_s::rc_delay_ace_recycle
    866     ibd_rc_stat_s::rc_act_close_simultaneous
    867     ibd_rc_stat_s::rc_reset_cnt))
    868 #endif
    869 
    870 int
    871 _init()
    872 {
    873 	int status;
    874 
    875 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
    876 	    PAGESIZE), 0);
    877 	if (status != 0) {
    878 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
    879 		return (status);
    880 	}
    881 
    882 	mac_init_ops(&ibd_dev_ops, "ibd");
    883 	status = mod_install(&ibd_modlinkage);
    884 	if (status != 0) {
    885 		DPRINT(10, "_init:failed in mod_install()");
    886 		ddi_soft_state_fini(&ibd_list);
    887 		mac_fini_ops(&ibd_dev_ops);
    888 		return (status);
    889 	}
    890 
    891 #ifdef IBD_LOGGING
    892 	ibd_log_init();
    893 #endif
    894 	return (0);
    895 }
    896 
    897 int
    898 _info(struct modinfo *modinfop)
    899 {
    900 	return (mod_info(&ibd_modlinkage, modinfop));
    901 }
    902 
    903 int
    904 _fini()
    905 {
    906 	int status;
    907 
    908 	status = mod_remove(&ibd_modlinkage);
    909 	if (status != 0)
    910 		return (status);
    911 
    912 	mac_fini_ops(&ibd_dev_ops);
    913 	ddi_soft_state_fini(&ibd_list);
    914 #ifdef IBD_LOGGING
    915 	ibd_log_fini();
    916 #endif
    917 	return (0);
    918 }
    919 
    920 /*
    921  * Convert the GID part of the mac address from network byte order
    922  * to host order.
    923  */
    924 static void
    925 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
    926 {
    927 	ib_sn_prefix_t nbopref;
    928 	ib_guid_t nboguid;
    929 
    930 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
    931 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
    932 	dgid->gid_prefix = b2h64(nbopref);
    933 	dgid->gid_guid = b2h64(nboguid);
    934 }
    935 
    936 /*
    937  * Create the IPoIB address in network byte order from host order inputs.
    938  */
    939 static void
    940 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    941     ib_guid_t guid)
    942 {
    943 	ib_sn_prefix_t nbopref;
    944 	ib_guid_t nboguid;
    945 
    946 	mac->ipoib_qpn = htonl(qpn);
    947 	nbopref = h2b64(prefix);
    948 	nboguid = h2b64(guid);
    949 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
    950 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
    951 }
    952 
    953 /*
    954  * Send to the appropriate all-routers group when the IBA multicast group
    955  * does not exist, based on whether the target group is v4 or v6.
    956  */
    957 static boolean_t
    958 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    959     ipoib_mac_t *rmac)
    960 {
    961 	boolean_t retval = B_TRUE;
    962 	uint32_t adjscope = state->id_scope << 16;
    963 	uint32_t topword;
    964 
    965 	/*
    966 	 * Copy the first 4 bytes in without assuming any alignment of
    967 	 * input mac address; this will have IPoIB signature, flags and
    968 	 * scope bits.
    969 	 */
    970 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
    971 	topword = ntohl(topword);
    972 
    973 	/*
    974 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
    975 	 */
    976 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
    977 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
    978 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
    979 		    ((uint32_t)(state->id_pkey << 16))),
    980 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
    981 	else
    982 		/*
    983 		 * Does not have proper bits in the mgid address.
    984 		 */
    985 		retval = B_FALSE;
    986 
    987 	return (retval);
    988 }
    989 
    990 /*
    991  * Membership states for different mcg's are tracked by two lists:
    992  * the "non" list is used for promiscuous mode, when all mcg traffic
    993  * needs to be inspected. This type of membership is never used for
    994  * transmission, so there can not be an AH in the active list
    995  * corresponding to a member in this list. This list does not need
    996  * any protection, since all operations are performed by the async
    997  * thread.
    998  *
    999  * "Full" and "SendOnly" membership is tracked using a single list,
   1000  * the "full" list. This is because this single list can then be
   1001  * searched during transmit to a multicast group (if an AH for the
   1002  * mcg is not found in the active list), since at least one type
   1003  * of membership must be present before initiating the transmit.
   1004  * This list is also emptied during driver detach, since sendonly
   1005  * membership acquired during transmit is dropped at detach time
   1006  * along with ipv4 broadcast full membership. Insert/deletes to
   1007  * this list are done only by the async thread, but it is also
   1008  * searched in program context (see multicast disable case), thus
   1009  * the id_mc_mutex protects the list. The driver detach path also
   1010  * deconstructs the "full" list, but it ensures that the async
   1011  * thread will not be accessing the list (by blocking out mcg
   1012  * trap handling and making sure no more Tx reaping will happen).
   1013  *
   1014  * Currently, an IBA attach is done in the SendOnly case too,
   1015  * although this is not required.
   1016  */
   1017 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
   1018 	list_insert_head(&state->id_mc_full, mce)
   1019 #define	IBD_MCACHE_INSERT_NON(state, mce) \
   1020 	list_insert_head(&state->id_mc_non, mce)
   1021 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
   1022 	ibd_mcache_find(mgid, &state->id_mc_full)
   1023 #define	IBD_MCACHE_FIND_NON(state, mgid) \
   1024 	ibd_mcache_find(mgid, &state->id_mc_non)
   1025 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
   1026 	list_remove(&state->id_mc_full, mce)
   1027 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
   1028 	list_remove(&state->id_mc_non, mce)
   1029 
   1030 static void *
   1031 list_get_head(list_t *list)
   1032 {
   1033 	list_node_t *lhead = list_head(list);
   1034 
   1035 	if (lhead != NULL)
   1036 		list_remove(list, lhead);
   1037 	return (lhead);
   1038 }
   1039 
   1040 /*
   1041  * This is always guaranteed to be able to queue the work.
   1042  */
   1043 void
   1044 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
   1045 {
   1046 	/* Initialize request */
   1047 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
   1048 	ptr->rq_op = op;
   1049 
   1050 	/*
   1051 	 * Queue provided slot onto request pool.
   1052 	 */
   1053 	mutex_enter(&state->id_acache_req_lock);
   1054 	list_insert_tail(&state->id_req_list, ptr);
   1055 
   1056 	/* Go, fetch, async thread */
   1057 	cv_signal(&state->id_acache_req_cv);
   1058 	mutex_exit(&state->id_acache_req_lock);
   1059 }
   1060 
   1061 /*
   1062  * Main body of the per interface async thread.
   1063  */
   1064 static void
   1065 ibd_async_work(ibd_state_t *state)
   1066 {
   1067 	ibd_req_t *ptr;
   1068 	callb_cpr_t cprinfo;
   1069 
   1070 	mutex_enter(&state->id_acache_req_lock);
   1071 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
   1072 	    callb_generic_cpr, "ibd_async_work");
   1073 
   1074 	for (;;) {
   1075 		ptr = list_get_head(&state->id_req_list);
   1076 		if (ptr != NULL) {
   1077 			mutex_exit(&state->id_acache_req_lock);
   1078 
   1079 			/*
   1080 			 * Once we have done the operation, there is no
   1081 			 * guarantee the request slot is going to be valid,
   1082 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
   1083 			 * TRAP).
   1084 			 *
   1085 			 * Perform the request.
   1086 			 */
   1087 			switch (ptr->rq_op) {
   1088 				case IBD_ASYNC_GETAH:
   1089 					ibd_async_acache(state, &ptr->rq_mac);
   1090 					break;
   1091 				case IBD_ASYNC_JOIN:
   1092 				case IBD_ASYNC_LEAVE:
   1093 					ibd_async_multicast(state,
   1094 					    ptr->rq_gid, ptr->rq_op);
   1095 					break;
   1096 				case IBD_ASYNC_PROMON:
   1097 					ibd_async_setprom(state);
   1098 					break;
   1099 				case IBD_ASYNC_PROMOFF:
   1100 					ibd_async_unsetprom(state);
   1101 					break;
   1102 				case IBD_ASYNC_REAP:
   1103 					ibd_async_reap_group(state,
   1104 					    ptr->rq_ptr, ptr->rq_gid,
   1105 					    IB_MC_JSTATE_FULL);
   1106 					/*
   1107 					 * the req buf contains in mce
   1108 					 * structure, so we do not need
   1109 					 * to free it here.
   1110 					 */
   1111 					ptr = NULL;
   1112 					break;
   1113 				case IBD_ASYNC_TRAP:
   1114 					ibd_async_trap(state, ptr);
   1115 					break;
   1116 				case IBD_ASYNC_SCHED:
   1117 					ibd_async_txsched(state);
   1118 					break;
   1119 				case IBD_ASYNC_LINK:
   1120 					ibd_async_link(state, ptr);
   1121 					break;
   1122 				case IBD_ASYNC_EXIT:
   1123 					mutex_enter(&state->id_acache_req_lock);
   1124 #ifndef __lock_lint
   1125 					CALLB_CPR_EXIT(&cprinfo);
   1126 #else
   1127 					mutex_exit(&state->id_acache_req_lock);
   1128 #endif
   1129 					return;
   1130 				case IBD_ASYNC_RC_TOO_BIG:
   1131 					ibd_async_rc_process_too_big(state,
   1132 					    ptr);
   1133 					break;
   1134 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
   1135 					ibd_async_rc_close_act_chan(state, ptr);
   1136 					break;
   1137 				case IBD_ASYNC_RC_RECYCLE_ACE:
   1138 					ibd_async_rc_recycle_ace(state, ptr);
   1139 					break;
   1140 			}
   1141 			if (ptr != NULL)
   1142 				kmem_cache_free(state->id_req_kmc, ptr);
   1143 
   1144 			mutex_enter(&state->id_acache_req_lock);
   1145 		} else {
   1146 #ifndef __lock_lint
   1147 			/*
   1148 			 * Nothing to do: wait till new request arrives.
   1149 			 */
   1150 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1151 			cv_wait(&state->id_acache_req_cv,
   1152 			    &state->id_acache_req_lock);
   1153 			CALLB_CPR_SAFE_END(&cprinfo,
   1154 			    &state->id_acache_req_lock);
   1155 #endif
   1156 		}
   1157 	}
   1158 
   1159 	/*NOTREACHED*/
   1160 	_NOTE(NOT_REACHED)
   1161 }
   1162 
   1163 /*
   1164  * Return when it is safe to queue requests to the async daemon; primarily
   1165  * for subnet trap and async event handling. Disallow requests before the
   1166  * daemon is created, and when interface deinitilization starts.
   1167  */
   1168 static boolean_t
   1169 ibd_async_safe(ibd_state_t *state)
   1170 {
   1171 	mutex_enter(&state->id_trap_lock);
   1172 	if (state->id_trap_stop) {
   1173 		mutex_exit(&state->id_trap_lock);
   1174 		return (B_FALSE);
   1175 	}
   1176 	state->id_trap_inprog++;
   1177 	mutex_exit(&state->id_trap_lock);
   1178 	return (B_TRUE);
   1179 }
   1180 
   1181 /*
   1182  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
   1183  * trap or event handling to complete to kill the async thread and deconstruct
   1184  * the mcg/ace list.
   1185  */
   1186 static void
   1187 ibd_async_done(ibd_state_t *state)
   1188 {
   1189 	mutex_enter(&state->id_trap_lock);
   1190 	if (--state->id_trap_inprog == 0)
   1191 		cv_signal(&state->id_trap_cv);
   1192 	mutex_exit(&state->id_trap_lock);
   1193 }
   1194 
   1195 /*
   1196  * Hash functions:
   1197  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
   1198  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
   1199  * These operate on mac addresses input into ibd_send, but there is no
   1200  * guarantee on the alignment of the ipoib_mac_t structure.
   1201  */
   1202 /*ARGSUSED*/
   1203 static uint_t
   1204 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
   1205 {
   1206 	ulong_t ptraddr = (ulong_t)key;
   1207 	uint_t hval;
   1208 
   1209 	/*
   1210 	 * If the input address is 4 byte aligned, we can just dereference
   1211 	 * it. This is most common, since IP will send in a 4 byte aligned
   1212 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
   1213 	 * 4 byte aligned too.
   1214 	 */
   1215 	if ((ptraddr & 3) == 0)
   1216 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
   1217 
   1218 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
   1219 	return (hval);
   1220 }
   1221 
   1222 static int
   1223 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
   1224 {
   1225 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
   1226 		return (0);
   1227 	else
   1228 		return (1);
   1229 }
   1230 
   1231 /*
   1232  * Initialize all the per interface caches and lists; AH cache,
   1233  * MCG list etc.
   1234  */
   1235 static int
   1236 ibd_acache_init(ibd_state_t *state)
   1237 {
   1238 	ibd_ace_t *ce;
   1239 	int i;
   1240 
   1241 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
   1242 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
   1243 
   1244 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
   1245 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
   1246 	mutex_enter(&state->id_ac_mutex);
   1247 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
   1248 	    offsetof(ibd_ace_t, ac_list));
   1249 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
   1250 	    offsetof(ibd_ace_t, ac_list));
   1251 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
   1252 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
   1253 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
   1254 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
   1255 	    offsetof(ibd_mce_t, mc_list));
   1256 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
   1257 	    offsetof(ibd_mce_t, mc_list));
   1258 	list_create(&state->id_req_list, sizeof (ibd_req_t),
   1259 	    offsetof(ibd_req_t, rq_list));
   1260 	state->id_ac_hot_ace = NULL;
   1261 
   1262 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
   1263 	    IBD_NUM_AH, KM_SLEEP);
   1264 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
   1265 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
   1266 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
   1267 			mutex_exit(&state->id_ac_mutex);
   1268 			ibd_acache_fini(state);
   1269 			return (DDI_FAILURE);
   1270 		} else {
   1271 			CLEAR_REFCYCLE(ce);
   1272 			ce->ac_mce = NULL;
   1273 			mutex_init(&ce->tx_too_big_mutex, NULL,
   1274 			    MUTEX_DRIVER, NULL);
   1275 			IBD_ACACHE_INSERT_FREE(state, ce);
   1276 		}
   1277 	}
   1278 	mutex_exit(&state->id_ac_mutex);
   1279 	return (DDI_SUCCESS);
   1280 }
   1281 
   1282 static void
   1283 ibd_acache_fini(ibd_state_t *state)
   1284 {
   1285 	ibd_ace_t *ptr;
   1286 
   1287 	mutex_enter(&state->id_ac_mutex);
   1288 
   1289 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
   1290 		ASSERT(GET_REF(ptr) == 0);
   1291 		mutex_destroy(&ptr->tx_too_big_mutex);
   1292 		(void) ibt_free_ud_dest(ptr->ac_dest);
   1293 	}
   1294 
   1295 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
   1296 		ASSERT(GET_REF(ptr) == 0);
   1297 		mutex_destroy(&ptr->tx_too_big_mutex);
   1298 		(void) ibt_free_ud_dest(ptr->ac_dest);
   1299 	}
   1300 
   1301 	list_destroy(&state->id_ah_free);
   1302 	list_destroy(&state->id_ah_active);
   1303 	list_destroy(&state->id_mc_full);
   1304 	list_destroy(&state->id_mc_non);
   1305 	list_destroy(&state->id_req_list);
   1306 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
   1307 	mutex_exit(&state->id_ac_mutex);
   1308 	mutex_destroy(&state->id_ac_mutex);
   1309 	mutex_destroy(&state->id_mc_mutex);
   1310 	mutex_destroy(&state->id_acache_req_lock);
   1311 	cv_destroy(&state->id_acache_req_cv);
   1312 }
   1313 
   1314 /*
   1315  * Search AH active hash list for a cached path to input destination.
   1316  * If we are "just looking", hold == F. When we are in the Tx path,
   1317  * we set hold == T to grab a reference on the AH so that it can not
   1318  * be recycled to a new destination while the Tx request is posted.
   1319  */
   1320 ibd_ace_t *
   1321 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
   1322 {
   1323 	ibd_ace_t *ptr;
   1324 
   1325 	ASSERT(mutex_owned(&state->id_ac_mutex));
   1326 
   1327 	/*
   1328 	 * Do hash search.
   1329 	 */
   1330 	if (mod_hash_find(state->id_ah_active_hash,
   1331 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
   1332 		if (hold)
   1333 			INC_REF(ptr, num);
   1334 		return (ptr);
   1335 	}
   1336 	return (NULL);
   1337 }
   1338 
   1339 /*
   1340  * This is called by the tx side; if an initialized AH is found in
   1341  * the active list, it is locked down and can be used; if no entry
   1342  * is found, an async request is queued to do path resolution.
   1343  */
   1344 static ibd_ace_t *
   1345 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
   1346 {
   1347 	ibd_ace_t *ptr;
   1348 	ibd_req_t *req;
   1349 
   1350 	/*
   1351 	 * Only attempt to print when we can; in the mdt pattr case, the
   1352 	 * address is not aligned properly.
   1353 	 */
   1354 	if (((ulong_t)mac & 3) == 0) {
   1355 		DPRINT(4,
   1356 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
   1357 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
   1358 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
   1359 		    htonl(mac->ipoib_gidsuff[1]));
   1360 	}
   1361 
   1362 	mutex_enter(&state->id_ac_mutex);
   1363 
   1364 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
   1365 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
   1366 		INC_REF(ptr, numwqe);
   1367 		mutex_exit(&state->id_ac_mutex);
   1368 		return (ptr);
   1369 	}
   1370 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
   1371 		state->id_ac_hot_ace = ptr;
   1372 		mutex_exit(&state->id_ac_mutex);
   1373 		return (ptr);
   1374 	}
   1375 
   1376 	/*
   1377 	 * Implementation of a single outstanding async request; if
   1378 	 * the operation is not started yet, queue a request and move
   1379 	 * to ongoing state. Remember in id_ah_addr for which address
   1380 	 * we are queueing the request, in case we need to flag an error;
   1381 	 * Any further requests, for the same or different address, until
   1382 	 * the operation completes, is sent back to GLDv3 to be retried.
   1383 	 * The async thread will update id_ah_op with an error indication
   1384 	 * or will set it to indicate the next look up can start; either
   1385 	 * way, it will mac_tx_update() so that all blocked requests come
   1386 	 * back here.
   1387 	 */
   1388 	*err = EAGAIN;
   1389 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
   1390 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
   1391 		if (req != NULL) {
   1392 			/*
   1393 			 * We did not even find the entry; queue a request
   1394 			 * for it.
   1395 			 */
   1396 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
   1397 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
   1398 			state->id_ah_op = IBD_OP_ONGOING;
   1399 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
   1400 		}
   1401 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
   1402 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
   1403 		/*
   1404 		 * Check the status of the pathrecord lookup request
   1405 		 * we had queued before.
   1406 		 */
   1407 		if (state->id_ah_op == IBD_OP_ERRORED) {
   1408 			*err = EFAULT;
   1409 			state->id_ah_error++;
   1410 		} else {
   1411 			/*
   1412 			 * IBD_OP_ROUTERED case: We need to send to the
   1413 			 * all-router MCG. If we can find the AH for
   1414 			 * the mcg, the Tx will be attempted. If we
   1415 			 * do not find the AH, we return NORESOURCES
   1416 			 * to retry.
   1417 			 */
   1418 			ipoib_mac_t routermac;
   1419 
   1420 			(void) ibd_get_allroutergroup(state, mac, &routermac);
   1421 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
   1422 			    numwqe);
   1423 		}
   1424 		state->id_ah_op = IBD_OP_NOTSTARTED;
   1425 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
   1426 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
   1427 		/*
   1428 		 * This case can happen when we get a higher band
   1429 		 * packet. The easiest way is to reset the state machine
   1430 		 * to accommodate the higher priority packet.
   1431 		 */
   1432 		state->id_ah_op = IBD_OP_NOTSTARTED;
   1433 	}
   1434 	mutex_exit(&state->id_ac_mutex);
   1435 
   1436 	return (ptr);
   1437 }
   1438 
   1439 /*
   1440  * Grab a not-currently-in-use AH/PathRecord from the active
   1441  * list to recycle to a new destination. Only the async thread
   1442  * executes this code.
   1443  */
   1444 static ibd_ace_t *
   1445 ibd_acache_get_unref(ibd_state_t *state)
   1446 {
   1447 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
   1448 	boolean_t try_rc_chan_recycle = B_FALSE;
   1449 
   1450 	ASSERT(mutex_owned(&state->id_ac_mutex));
   1451 
   1452 	/*
   1453 	 * Do plain linear search.
   1454 	 */
   1455 	while (ptr != NULL) {
   1456 		/*
   1457 		 * Note that it is possible that the "cycle" bit
   1458 		 * is set on the AH w/o any reference count. The
   1459 		 * mcg must have been deleted, and the tx cleanup
   1460 		 * just decremented the reference count to 0, but
   1461 		 * hasn't gotten around to grabbing the id_ac_mutex
   1462 		 * to move the AH into the free list.
   1463 		 */
   1464 		if (GET_REF(ptr) == 0) {
   1465 			if (ptr->ac_chan != NULL) {
   1466 				ASSERT(state->id_enable_rc == B_TRUE);
   1467 				if (!try_rc_chan_recycle) {
   1468 					try_rc_chan_recycle = B_TRUE;
   1469 					ibd_rc_signal_ace_recycle(state, ptr);
   1470 				}
   1471 			} else {
   1472 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
   1473 				break;
   1474 			}
   1475 		}
   1476 		ptr = list_prev(&state->id_ah_active, ptr);
   1477 	}
   1478 	return (ptr);
   1479 }
   1480 
   1481 /*
   1482  * Invoked to clean up AH from active list in case of multicast
   1483  * disable and to handle sendonly memberships during mcg traps.
   1484  * And for port up processing for multicast and unicast AHs.
   1485  * Normally, the AH is taken off the active list, and put into
   1486  * the free list to be recycled for a new destination. In case
   1487  * Tx requests on the AH have not completed yet, the AH is marked
   1488  * for reaping (which will put the AH on the free list) once the Tx's
   1489  * complete; in this case, depending on the "force" input, we take
   1490  * out the AH from the active list right now, or leave it also for
   1491  * the reap operation. Returns TRUE if the AH is taken off the active
   1492  * list (and either put into the free list right now, or arranged for
   1493  * later), FALSE otherwise.
   1494  */
   1495 boolean_t
   1496 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
   1497 {
   1498 	ibd_ace_t *acactive;
   1499 	boolean_t ret = B_TRUE;
   1500 
   1501 	ASSERT(mutex_owned(&state->id_ac_mutex));
   1502 
   1503 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
   1504 
   1505 		/*
   1506 		 * Note that the AH might already have the cycle bit set
   1507 		 * on it; this might happen if sequences of multicast
   1508 		 * enables and disables are coming so fast, that posted
   1509 		 * Tx's to the mcg have not completed yet, and the cycle
   1510 		 * bit is set successively by each multicast disable.
   1511 		 */
   1512 		if (SET_CYCLE_IF_REF(acactive)) {
   1513 			if (!force) {
   1514 				/*
   1515 				 * The ace is kept on the active list, further
   1516 				 * Tx's can still grab a reference on it; the
   1517 				 * ace is reaped when all pending Tx's
   1518 				 * referencing the AH complete.
   1519 				 */
   1520 				ret = B_FALSE;
   1521 			} else {
   1522 				/*
   1523 				 * In the mcg trap case, we always pull the
   1524 				 * AH from the active list. And also the port
   1525 				 * up multi/unicast case.
   1526 				 */
   1527 				ASSERT(acactive->ac_chan == NULL);
   1528 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
   1529 				acactive->ac_mce = NULL;
   1530 			}
   1531 		} else {
   1532 			/*
   1533 			 * Determined the ref count is 0, thus reclaim
   1534 			 * immediately after pulling out the ace from
   1535 			 * the active list.
   1536 			 */
   1537 			ASSERT(acactive->ac_chan == NULL);
   1538 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
   1539 			acactive->ac_mce = NULL;
   1540 			IBD_ACACHE_INSERT_FREE(state, acactive);
   1541 		}
   1542 
   1543 	}
   1544 	return (ret);
   1545 }
   1546 
   1547 /*
   1548  * Helper function for async path record lookup. If we are trying to
   1549  * Tx to a MCG, check our membership, possibly trying to join the
   1550  * group if required. If that fails, try to send the packet to the
   1551  * all router group (indicated by the redirect output), pointing
   1552  * the input mac address to the router mcg address.
   1553  */
   1554 static ibd_mce_t *
   1555 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
   1556 {
   1557 	ib_gid_t mgid;
   1558 	ibd_mce_t *mce;
   1559 	ipoib_mac_t routermac;
   1560 
   1561 	*redirect = B_FALSE;
   1562 	ibd_n2h_gid(mac, &mgid);
   1563 
   1564 	/*
   1565 	 * Check the FullMember+SendOnlyNonMember list.
   1566 	 * Since we are the only one who manipulates the
   1567 	 * id_mc_full list, no locks are needed.
   1568 	 */
   1569 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
   1570 	if (mce != NULL) {
   1571 		DPRINT(4, "ibd_async_mcache : already joined to group");
   1572 		return (mce);
   1573 	}
   1574 
   1575 	/*
   1576 	 * Not found; try to join(SendOnlyNonMember) and attach.
   1577 	 */
   1578 	DPRINT(4, "ibd_async_mcache : not joined to group");
   1579 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
   1580 	    NULL) {
   1581 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
   1582 		return (mce);
   1583 	}
   1584 
   1585 	/*
   1586 	 * MCGroup not present; try to join the all-router group. If
   1587 	 * any of the following steps succeed, we will be redirecting
   1588 	 * to the all router group.
   1589 	 */
   1590 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
   1591 	if (!ibd_get_allroutergroup(state, mac, &routermac))
   1592 		return (NULL);
   1593 	*redirect = B_TRUE;
   1594 	ibd_n2h_gid(&routermac, &mgid);
   1595 	bcopy(&routermac, mac, IPOIB_ADDRL);
   1596 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
   1597 	    mgid.gid_prefix, mgid.gid_guid);
   1598 
   1599 	/*
   1600 	 * Are we already joined to the router group?
   1601 	 */
   1602 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
   1603 		DPRINT(4, "ibd_async_mcache : using already joined router"
   1604 		    "group\n");
   1605 		return (mce);
   1606 	}
   1607 
   1608 	/*
   1609 	 * Can we join(SendOnlyNonMember) the router group?
   1610 	 */
   1611 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
   1612 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
   1613 	    NULL) {
   1614 		DPRINT(4, "ibd_async_mcache : joined to router grp");
   1615 		return (mce);
   1616 	}
   1617 
   1618 	return (NULL);
   1619 }
   1620 
   1621 /*
   1622  * Async path record lookup code.
   1623  */
   1624 static void
   1625 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
   1626 {
   1627 	ibd_ace_t *ce;
   1628 	ibd_mce_t *mce = NULL;
   1629 	ibt_path_attr_t path_attr;
   1630 	ibt_path_info_t path_info;
   1631 	ib_gid_t destgid;
   1632 	char ret = IBD_OP_NOTSTARTED;
   1633 
   1634 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
   1635 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
   1636 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
   1637 	    htonl(mac->ipoib_gidsuff[1]));
   1638 
   1639 	/*
   1640 	 * Check whether we are trying to transmit to a MCG.
   1641 	 * In that case, we need to make sure we are a member of
   1642 	 * the MCG.
   1643 	 */
   1644 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
   1645 		boolean_t redirected;
   1646 
   1647 		/*
   1648 		 * If we can not find or join the group or even
   1649 		 * redirect, error out.
   1650 		 */
   1651 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
   1652 		    NULL) {
   1653 			state->id_ah_op = IBD_OP_ERRORED;
   1654 			return;
   1655 		}
   1656 
   1657 		/*
   1658 		 * If we got redirected, we need to determine whether
   1659 		 * the AH for the new mcg is in the cache already, and
   1660 		 * not pull it in then; otherwise proceed to get the
   1661 		 * path for the new mcg. There is no guarantee that
   1662 		 * if the AH is currently in the cache, it will still be
   1663 		 * there when we look in ibd_acache_lookup(), but that's
   1664 		 * okay, we will come back here.
   1665 		 */
   1666 		if (redirected) {
   1667 			ret = IBD_OP_ROUTERED;
   1668 			DPRINT(4, "ibd_async_acache :  redirected to "
   1669 			    "%08X:%08X:%08X:%08X:%08X",
   1670 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
   1671 			    htonl(mac->ipoib_gidpref[1]),
   1672 			    htonl(mac->ipoib_gidsuff[0]),
   1673 			    htonl(mac->ipoib_gidsuff[1]));
   1674 
   1675 			mutex_enter(&state->id_ac_mutex);
   1676 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
   1677 				state->id_ah_op = IBD_OP_ROUTERED;
   1678 				mutex_exit(&state->id_ac_mutex);
   1679 				DPRINT(4, "ibd_async_acache : router AH found");
   1680 				return;
   1681 			}
   1682 			mutex_exit(&state->id_ac_mutex);
   1683 		}
   1684 	}
   1685 
   1686 	/*
   1687 	 * Get an AH from the free list.
   1688 	 */
   1689 	mutex_enter(&state->id_ac_mutex);
   1690 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
   1691 		/*
   1692 		 * No free ones; try to grab an unreferenced active
   1693 		 * one. Maybe we need to make the active list LRU,
   1694 		 * but that will create more work for Tx callbacks.
   1695 		 * Is there a way of not having to pull out the
   1696 		 * entry from the active list, but just indicate it
   1697 		 * is being recycled? Yes, but that creates one more
   1698 		 * check in the fast lookup path.
   1699 		 */
   1700 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
   1701 			/*
   1702 			 * Pretty serious shortage now.
   1703 			 */
   1704 			state->id_ah_op = IBD_OP_NOTSTARTED;
   1705 			mutex_exit(&state->id_ac_mutex);
   1706 			DPRINT(10, "ibd_async_acache : failed to find AH "
   1707 			    "slot\n");
   1708 			return;
   1709 		}
   1710 		/*
   1711 		 * We could check whether ac_mce points to a SendOnly
   1712 		 * member and drop that membership now. Or do it lazily
   1713 		 * at detach time.
   1714 		 */
   1715 		ce->ac_mce = NULL;
   1716 	}
   1717 	mutex_exit(&state->id_ac_mutex);
   1718 	ASSERT(ce->ac_mce == NULL);
   1719 
   1720 	/*
   1721 	 * Update the entry.
   1722 	 */
   1723 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
   1724 
   1725 	bzero(&path_info, sizeof (path_info));
   1726 	bzero(&path_attr, sizeof (ibt_path_attr_t));
   1727 	path_attr.pa_sgid = state->id_sgid;
   1728 	path_attr.pa_num_dgids = 1;
   1729 	ibd_n2h_gid(&ce->ac_mac, &destgid);
   1730 	path_attr.pa_dgids = &destgid;
   1731 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
   1732 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
   1733 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
   1734 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
   1735 		goto error;
   1736 	}
   1737 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
   1738 	    ntohl(ce->ac_mac.ipoib_qpn),
   1739 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
   1740 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
   1741 		goto error;
   1742 	}
   1743 
   1744 	/*
   1745 	 * mce is set whenever an AH is being associated with a
   1746 	 * MCG; this will come in handy when we leave the MCG. The
   1747 	 * lock protects Tx fastpath from scanning the active list.
   1748 	 */
   1749 	if (mce != NULL)
   1750 		ce->ac_mce = mce;
   1751 
   1752 	/*
   1753 	 * initiate a RC mode connection for unicast address
   1754 	 */
   1755 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
   1756 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
   1757 		ASSERT(ce->ac_chan == NULL);
   1758 		DPRINT(10, "ibd_async_acache: call "
   1759 		    "ibd_rc_try_connect(ace=%p)", ce);
   1760 		ibd_rc_try_connect(state, ce, &path_info);
   1761 		if (ce->ac_chan == NULL) {
   1762 			DPRINT(10, "ibd_async_acache: fail to setup RC"
   1763 			    " channel");
   1764 			state->rc_conn_fail++;
   1765 			goto error;
   1766 		}
   1767 	}
   1768 
   1769 	mutex_enter(&state->id_ac_mutex);
   1770 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
   1771 	state->id_ah_op = ret;
   1772 	mutex_exit(&state->id_ac_mutex);
   1773 	return;
   1774 error:
   1775 	/*
   1776 	 * We might want to drop SendOnly membership here if we
   1777 	 * joined above. The lock protects Tx callbacks inserting
   1778 	 * into the free list.
   1779 	 */
   1780 	mutex_enter(&state->id_ac_mutex);
   1781 	state->id_ah_op = IBD_OP_ERRORED;
   1782 	IBD_ACACHE_INSERT_FREE(state, ce);
   1783 	mutex_exit(&state->id_ac_mutex);
   1784 }
   1785 
   1786 /*
   1787  * While restoring port's presence on the subnet on a port up, it is possible
   1788  * that the port goes down again.
   1789  */
   1790 static void
   1791 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
   1792 {
   1793 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
   1794 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
   1795 	    LINK_STATE_UP;
   1796 	ibd_mce_t *mce, *pmce;
   1797 	ibd_ace_t *ace, *pace;
   1798 
   1799 	DPRINT(10, "ibd_async_link(): %d", opcode);
   1800 
   1801 	/*
   1802 	 * On a link up, revalidate the link speed/width. No point doing
   1803 	 * this on a link down, since we will be unable to do SA operations,
   1804 	 * defaulting to the lowest speed. Also notice that we update our
   1805 	 * notion of speed before calling mac_link_update(), which will do
   1806 	 * necessary higher level notifications for speed changes.
   1807 	 */
   1808 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
   1809 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
   1810 		state->id_link_speed = ibd_get_portspeed(state);
   1811 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
   1812 	}
   1813 
   1814 	/*
   1815 	 * Do all the work required to establish our presence on
   1816 	 * the subnet.
   1817 	 */
   1818 	if (opcode == IBD_LINK_UP_ABSENT) {
   1819 		/*
   1820 		 * If in promiscuous mode ...
   1821 		 */
   1822 		if (state->id_prom_op == IBD_OP_COMPLETED) {
   1823 			/*
   1824 			 * Drop all nonmembership.
   1825 			 */
   1826 			ibd_async_unsetprom(state);
   1827 
   1828 			/*
   1829 			 * Then, try to regain nonmembership to all mcg's.
   1830 			 */
   1831 			ibd_async_setprom(state);
   1832 
   1833 		}
   1834 
   1835 		/*
   1836 		 * Drop all sendonly membership (which also gets rid of the
   1837 		 * AHs); try to reacquire all full membership.
   1838 		 */
   1839 		mce = list_head(&state->id_mc_full);
   1840 		while ((pmce = mce) != NULL) {
   1841 			mce = list_next(&state->id_mc_full, mce);
   1842 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
   1843 				ibd_leave_group(state,
   1844 				    pmce->mc_info.mc_adds_vect.av_dgid,
   1845 				    IB_MC_JSTATE_SEND_ONLY_NON);
   1846 			else
   1847 				ibd_reacquire_group(state, pmce);
   1848 		}
   1849 
   1850 		/*
   1851 		 * Recycle all active AHs to free list (and if there are
   1852 		 * pending posts, make sure they will go into the free list
   1853 		 * once the Tx's complete). Grab the lock to prevent
   1854 		 * concurrent Tx's as well as Tx cleanups.
   1855 		 */
   1856 		mutex_enter(&state->id_ac_mutex);
   1857 		ace = list_head(&state->id_ah_active);
   1858 		while ((pace = ace) != NULL) {
   1859 			boolean_t cycled;
   1860 
   1861 			ace = list_next(&state->id_ah_active, ace);
   1862 			mce = pace->ac_mce;
   1863 			if (pace->ac_chan != NULL) {
   1864 				ASSERT(mce == NULL);
   1865 				ASSERT(state->id_enable_rc == B_TRUE);
   1866 				if (pace->ac_chan->chan_state ==
   1867 				    IBD_RC_STATE_ACT_ESTAB) {
   1868 					INC_REF(pace, 1);
   1869 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
   1870 					pace->ac_chan->chan_state =
   1871 					    IBD_RC_STATE_ACT_CLOSING;
   1872 					ibd_rc_signal_act_close(state, pace);
   1873 				} else {
   1874 					state->rc_act_close_simultaneous++;
   1875 					DPRINT(40, "ibd_async_link: other "
   1876 					    "thread is closing it, ace=%p, "
   1877 					    "ac_chan=%p, chan_state=%d",
   1878 					    pace, pace->ac_chan,
   1879 					    pace->ac_chan->chan_state);
   1880 				}
   1881 			} else {
   1882 				cycled = ibd_acache_recycle(state,
   1883 				    &pace->ac_mac, B_TRUE);
   1884 			}
   1885 			/*
   1886 			 * If this is for an mcg, it must be for a fullmember,
   1887 			 * since we got rid of send-only members above when
   1888 			 * processing the mce list.
   1889 			 */
   1890 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
   1891 			    IB_MC_JSTATE_FULL)));
   1892 
   1893 			/*
   1894 			 * Check if the fullmember mce needs to be torn down,
   1895 			 * ie whether the DLPI disable has already been done.
   1896 			 * If so, do some of the work of tx_cleanup, namely
   1897 			 * causing leave (which will fail), detach and
   1898 			 * mce-freeing. tx_cleanup will put the AH into free
   1899 			 * list. The reason to duplicate some of this
   1900 			 * tx_cleanup work is because we want to delete the
   1901 			 * AH right now instead of waiting for tx_cleanup, to
   1902 			 * force subsequent Tx's to reacquire an AH.
   1903 			 */
   1904 			if ((mce != NULL) && (mce->mc_fullreap))
   1905 				ibd_async_reap_group(state, mce,
   1906 				    mce->mc_info.mc_adds_vect.av_dgid,
   1907 				    mce->mc_jstate);
   1908 		}
   1909 		mutex_exit(&state->id_ac_mutex);
   1910 	}
   1911 
   1912 	/*
   1913 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
   1914 	 * (which stops further events from being delivered) before
   1915 	 * mac_unregister(). At this point, it is guaranteed that mac_register
   1916 	 * has already been done.
   1917 	 */
   1918 	mutex_enter(&state->id_link_mutex);
   1919 	state->id_link_state = lstate;
   1920 	mac_link_update(state->id_mh, lstate);
   1921 	mutex_exit(&state->id_link_mutex);
   1922 
   1923 	ibd_async_done(state);
   1924 }
   1925 
   1926 /*
   1927  * Check the pkey table to see if we can find the pkey we're looking for.
   1928  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
   1929  * failure.
   1930  */
   1931 static int
   1932 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
   1933     uint16_t *pkix)
   1934 {
   1935 	uint16_t ndx;
   1936 
   1937 	ASSERT(pkix != NULL);
   1938 
   1939 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
   1940 		if (pkey_tbl[ndx] == pkey) {
   1941 			*pkix = ndx;
   1942 			return (0);
   1943 		}
   1944 	}
   1945 	return (-1);
   1946 }
   1947 
   1948 /*
   1949  * When the link is notified up, we need to do a few things, based
   1950  * on the port's current p_init_type_reply claiming a reinit has been
   1951  * done or not. The reinit steps are:
   1952  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
   1953  *    the old Pkey and GID0 are correct.
   1954  * 2. Register for mcg traps (already done by ibmf).
   1955  * 3. If PreservePresenceReply indicates the SM has restored port's presence
   1956  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
   1957  * 4. Give up all sendonly memberships.
   1958  * 5. Acquire all full memberships.
   1959  * 6. In promiscuous mode, acquire all non memberships.
   1960  * 7. Recycle all AHs to free list.
   1961  */
   1962 static void
   1963 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
   1964 {
   1965 	ibt_hca_portinfo_t *port_infop = NULL;
   1966 	ibt_status_t ibt_status;
   1967 	uint_t psize, port_infosz;
   1968 	ibd_link_op_t opcode;
   1969 	ibd_req_t *req;
   1970 	link_state_t new_link_state = LINK_STATE_UP;
   1971 	uint8_t itreply;
   1972 	uint16_t pkix;
   1973 	int ret;
   1974 
   1975 	/*
   1976 	 * Let's not race with a plumb or an unplumb; if we detect a
   1977 	 * pkey relocation event later on here, we may have to restart.
   1978 	 */
   1979 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
   1980 
   1981 	mutex_enter(&state->id_link_mutex);
   1982 
   1983 	/*
   1984 	 * If the init code in ibd_m_start hasn't yet set up the
   1985 	 * pkey/gid, nothing to do; that code will set the link state.
   1986 	 */
   1987 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
   1988 		mutex_exit(&state->id_link_mutex);
   1989 		goto link_mod_return;
   1990 	}
   1991 
   1992 	/*
   1993 	 * If this routine was called in response to a port down event,
   1994 	 * we just need to see if this should be informed.
   1995 	 */
   1996 	if (code == IBT_ERROR_PORT_DOWN) {
   1997 		new_link_state = LINK_STATE_DOWN;
   1998 		goto update_link_state;
   1999 	}
   2000 
   2001 	/*
   2002 	 * If it's not a port down event we've received, try to get the port
   2003 	 * attributes first. If we fail here, the port is as good as down.
   2004 	 * Otherwise, if the link went down by the time the handler gets
   2005 	 * here, give up - we cannot even validate the pkey/gid since those
   2006 	 * are not valid and this is as bad as a port down anyway.
   2007 	 */
   2008 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
   2009 	    &port_infop, &psize, &port_infosz);
   2010 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
   2011 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
   2012 		new_link_state = LINK_STATE_DOWN;
   2013 		goto update_link_state;
   2014 	}
   2015 
   2016 	/*
   2017 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
   2018 	 * PreserveContentReply are 0, we don't know anything about the
   2019 	 * data loaded into the port attributes, so we need to verify
   2020 	 * if gid0 and pkey are still valid.
   2021 	 */
   2022 	itreply = port_infop->p_init_type_reply;
   2023 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
   2024 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
   2025 		/*
   2026 		 * Check to see if the subnet part of GID0 has changed. If
   2027 		 * not, check the simple case first to see if the pkey
   2028 		 * index is the same as before; finally check to see if the
   2029 		 * pkey has been relocated to a different index in the table.
   2030 		 */
   2031 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
   2032 		if (bcmp(port_infop->p_sgid_tbl,
   2033 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
   2034 
   2035 			new_link_state = LINK_STATE_DOWN;
   2036 
   2037 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
   2038 		    state->id_pkey) {
   2039 
   2040 			new_link_state = LINK_STATE_UP;
   2041 
   2042 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
   2043 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
   2044 
   2045 			ibt_free_portinfo(port_infop, port_infosz);
   2046 			mutex_exit(&state->id_link_mutex);
   2047 
   2048 			/*
   2049 			 * Currently a restart is required if our pkey has moved
   2050 			 * in the pkey table. If we get the ibt_recycle_ud() to
   2051 			 * work as documented (expected), we may be able to
   2052 			 * avoid a complete restart.  Note that we've already
   2053 			 * marked both the start and stop 'in-progress' flags,
   2054 			 * so it is ok to go ahead and do this restart.
   2055 			 */
   2056 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
   2057 			if ((ret = ibd_start(state)) != 0) {
   2058 				DPRINT(10, "ibd_restart: cannot restart, "
   2059 				    "ret=%d", ret);
   2060 			}
   2061 
   2062 			goto link_mod_return;
   2063 		} else {
   2064 			new_link_state = LINK_STATE_DOWN;
   2065 		}
   2066 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
   2067 	}
   2068 
   2069 update_link_state:
   2070 	if (port_infop) {
   2071 		ibt_free_portinfo(port_infop, port_infosz);
   2072 	}
   2073 
   2074 	/*
   2075 	 * If the old state is the same as the new state, nothing to do
   2076 	 */
   2077 	if (state->id_link_state == new_link_state) {
   2078 		mutex_exit(&state->id_link_mutex);
   2079 		goto link_mod_return;
   2080 	}
   2081 
   2082 	/*
   2083 	 * Ok, so there was a link state change; see if it's safe to ask
   2084 	 * the async thread to do the work
   2085 	 */
   2086 	if (!ibd_async_safe(state)) {
   2087 		state->id_link_state = new_link_state;
   2088 		mutex_exit(&state->id_link_mutex);
   2089 		goto link_mod_return;
   2090 	}
   2091 
   2092 	mutex_exit(&state->id_link_mutex);
   2093 
   2094 	/*
   2095 	 * If we're reporting a link up, check InitTypeReply to see if
   2096 	 * the SM has ensured that the port's presence in mcg, traps,
   2097 	 * etc. is intact.
   2098 	 */
   2099 	if (new_link_state == LINK_STATE_DOWN) {
   2100 		opcode = IBD_LINK_DOWN;
   2101 	} else {
   2102 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
   2103 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
   2104 			opcode = IBD_LINK_UP;
   2105 		} else {
   2106 			opcode = IBD_LINK_UP_ABSENT;
   2107 		}
   2108 	}
   2109 
   2110 	/*
   2111 	 * Queue up a request for ibd_async_link() to handle this link
   2112 	 * state change event
   2113 	 */
   2114 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
   2115 	req->rq_ptr = (void *)opcode;
   2116 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
   2117 
   2118 link_mod_return:
   2119 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
   2120 }
   2121 
   2122 /*
   2123  * For the port up/down events, IBTL guarantees there will not be concurrent
   2124  * invocations of the handler. IBTL might coalesce link transition events,
   2125  * and not invoke the handler for _each_ up/down transition, but it will
   2126  * invoke the handler with last known state
   2127  */
   2128 static void
   2129 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
   2130     ibt_async_code_t code, ibt_async_event_t *event)
   2131 {
   2132 	ibd_state_t *state = (ibd_state_t *)clnt_private;
   2133 
   2134 	switch (code) {
   2135 	case IBT_ERROR_CATASTROPHIC_CHAN:
   2136 		ibd_print_warn(state, "catastrophic channel error");
   2137 		break;
   2138 	case IBT_ERROR_CQ:
   2139 		ibd_print_warn(state, "completion queue error");
   2140 		break;
   2141 	case IBT_PORT_CHANGE_EVENT:
   2142 		/*
   2143 		 * Events will be delivered to all instances that have
   2144 		 * done ibt_open_hca() but not yet done ibt_close_hca().
   2145 		 * Only need to do work for our port; IBTF will deliver
   2146 		 * events for other ports on the hca we have ibt_open_hca'ed
   2147 		 * too. Note that id_port is initialized in ibd_attach()
   2148 		 * before we do an ibt_open_hca() in ibd_attach().
   2149 		 */
   2150 		ASSERT(state->id_hca_hdl == hca_hdl);
   2151 		if (state->id_port != event->ev_port)
   2152 			break;
   2153 
   2154 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
   2155 		    IBT_PORT_CHANGE_PKEY) {
   2156 			ibd_link_mod(state, code);
   2157 		}
   2158 		break;
   2159 	case IBT_ERROR_PORT_DOWN:
   2160 	case IBT_CLNT_REREG_EVENT:
   2161 	case IBT_EVENT_PORT_UP:
   2162 		/*
   2163 		 * Events will be delivered to all instances that have
   2164 		 * done ibt_open_hca() but not yet done ibt_close_hca().
   2165 		 * Only need to do work for our port; IBTF will deliver
   2166 		 * events for other ports on the hca we have ibt_open_hca'ed
   2167 		 * too. Note that id_port is initialized in ibd_attach()
   2168 		 * before we do an ibt_open_hca() in ibd_attach().
   2169 		 */
   2170 		ASSERT(state->id_hca_hdl == hca_hdl);
   2171 		if (state->id_port != event->ev_port)
   2172 			break;
   2173 
   2174 		ibd_link_mod(state, code);
   2175 		break;
   2176 
   2177 	case IBT_HCA_ATTACH_EVENT:
   2178 	case IBT_HCA_DETACH_EVENT:
   2179 		/*
   2180 		 * When a new card is plugged to the system, attach_event is
   2181 		 * invoked. Additionally, a cfgadm needs to be run to make the
   2182 		 * card known to the system, and an ifconfig needs to be run to
   2183 		 * plumb up any ibd interfaces on the card. In the case of card
   2184 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
   2185 		 * unplumb the ibd interfaces on the card; when the card is
   2186 		 * actually unplugged, the detach_event is invoked;
   2187 		 * additionally, if any ibd instances are still active on the
   2188 		 * card (eg there were no associated RCM scripts), driver's
   2189 		 * detach routine is invoked.
   2190 		 */
   2191 		break;
   2192 	default:
   2193 		break;
   2194 	}
   2195 }
   2196 
   2197 static int
   2198 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
   2199 {
   2200 	mac_register_t *macp;
   2201 	int ret;
   2202 
   2203 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
   2204 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
   2205 		return (DDI_FAILURE);
   2206 	}
   2207 
   2208 	/*
   2209 	 * Note that when we register with mac during attach, we don't
   2210 	 * have the id_macaddr yet, so we'll simply be registering a
   2211 	 * zero macaddr that we'll overwrite later during plumb (in
   2212 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
   2213 	 * update the mac layer with the correct mtu during plumb.
   2214 	 */
   2215 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
   2216 	macp->m_driver = state;
   2217 	macp->m_dip = dip;
   2218 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
   2219 	macp->m_callbacks = &ibd_m_callbacks;
   2220 	macp->m_min_sdu = 0;
   2221 	if (state->id_enable_rc) {
   2222 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
   2223 	} else {
   2224 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
   2225 	}
   2226 
   2227 	/*
   2228 	 *  Register ourselves with the GLDv3 interface
   2229 	 */
   2230 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
   2231 		mac_free(macp);
   2232 		DPRINT(10,
   2233 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
   2234 		return (DDI_FAILURE);
   2235 	}
   2236 
   2237 	mac_free(macp);
   2238 	return (DDI_SUCCESS);
   2239 }
   2240 
   2241 static int
   2242 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
   2243 {
   2244 	ibt_hca_attr_t hca_attrs;
   2245 	ibt_status_t ibt_status;
   2246 
   2247 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
   2248 
   2249 	/*
   2250 	 * Query the HCA and fetch its attributes
   2251 	 */
   2252 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
   2253 	ASSERT(ibt_status == IBT_SUCCESS);
   2254 
   2255 	/*
   2256 	 * 1. Set the Hardware Checksum capability. Currently we only consider
   2257 	 *    full checksum offload.
   2258 	 */
   2259 	if (state->id_enable_rc) {
   2260 			state->id_hwcksum_capab = 0;
   2261 	} else {
   2262 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
   2263 		    == IBT_HCA_CKSUM_FULL) {
   2264 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
   2265 		}
   2266 	}
   2267 
   2268 	/*
   2269 	 * 2. Set LSO policy, capability and maximum length
   2270 	 */
   2271 	if (state->id_enable_rc) {
   2272 		state->id_lso_policy = B_FALSE;
   2273 		state->id_lso_capable = B_FALSE;
   2274 		state->id_lso_maxlen = 0;
   2275 	} else {
   2276 		if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS
   2277 		    |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
   2278 			state->id_lso_policy = B_TRUE;
   2279 		} else {
   2280 			state->id_lso_policy = B_FALSE;
   2281 		}
   2282 
   2283 		if (hca_attrs.hca_max_lso_size > 0) {
   2284 			state->id_lso_capable = B_TRUE;
   2285 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
   2286 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
   2287 			else
   2288 				state->id_lso_maxlen =
   2289 				    hca_attrs.hca_max_lso_size;
   2290 		} else {
   2291 			state->id_lso_capable = B_FALSE;
   2292 			state->id_lso_maxlen = 0;
   2293 		}
   2294 	}
   2295 
   2296 	/*
   2297 	 * 3. Set Reserved L_Key capability
   2298 	 */
   2299 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
   2300 		state->id_hca_res_lkey_capab = 1;
   2301 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
   2302 		state->rc_enable_iov_map = B_TRUE;
   2303 	} else {
   2304 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
   2305 		state->rc_enable_iov_map = B_FALSE;
   2306 	}
   2307 
   2308 	/*
   2309 	 * 4. Set maximum sqseg value after checking to see if extended sgl
   2310 	 *    size information is provided by the hca
   2311 	 */
   2312 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
   2313 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
   2314 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
   2315 	} else {
   2316 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
   2317 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
   2318 	}
   2319 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
   2320 		state->id_max_sqseg = IBD_MAX_SQSEG;
   2321 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
   2322 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
   2323 		    state->id_max_sqseg, IBD_MAX_SQSEG);
   2324 	}
   2325 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
   2326 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
   2327 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
   2328 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
   2329 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
   2330 	}
   2331 
   2332 	/*
   2333 	 * Translating the virtual address regions into physical regions
   2334 	 * for using the Reserved LKey feature results in a wr sgl that
   2335 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
   2336 	 * we'll fix a high-water mark (65%) for when we should stop.
   2337 	 */
   2338 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
   2339 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
   2340 
   2341 	/*
   2342 	 * 5. Set number of recv and send wqes after checking hca maximum
   2343 	 *    channel size
   2344 	 */
   2345 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
   2346 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
   2347 	} else {
   2348 		state->id_num_rwqe = IBD_NUM_RWQE;
   2349 	}
   2350 	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
   2351 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
   2352 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
   2353 	} else {
   2354 		state->id_num_swqe = IBD_NUM_SWQE;
   2355 	}
   2356 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
   2357 
   2358 	return (DDI_SUCCESS);
   2359 }
   2360 
   2361 static int
   2362 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
   2363 {
   2364 	int instance;
   2365 	uint32_t progress = state->id_mac_state;
   2366 	ibt_status_t ret;
   2367 
   2368 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
   2369 		cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n");
   2370 		return (DDI_FAILURE);
   2371 	}
   2372 
   2373 	/* make sure rx resources are freed */
   2374 	ibd_free_rx_rsrcs(state);
   2375 
   2376 	if (progress & IBD_DRV_MAC_REGISTERED) {
   2377 		(void) mac_unregister(state->id_mh);
   2378 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
   2379 	}
   2380 
   2381 	if (progress & IBD_DRV_PD_ALLOCD) {
   2382 		if ((ret = ibt_free_pd(state->id_hca_hdl,
   2383 		    state->id_pd_hdl)) != IBT_SUCCESS) {
   2384 			ibd_print_warn(state, "failed to free "
   2385 			    "protection domain, ret=%d", ret);
   2386 		}
   2387 		state->id_pd_hdl = NULL;
   2388 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
   2389 	}
   2390 
   2391 	if (progress & IBD_DRV_HCA_OPENED) {
   2392 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
   2393 		    IBT_SUCCESS) {
   2394 			ibd_print_warn(state, "failed to close "
   2395 			    "HCA device, ret=%d", ret);
   2396 		}
   2397 		state->id_hca_hdl = NULL;
   2398 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
   2399 	}
   2400 
   2401 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
   2402 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
   2403 			ibd_print_warn(state,
   2404 			    "ibt_detach() failed, ret=%d", ret);
   2405 		}
   2406 		state->id_ibt_hdl = NULL;
   2407 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
   2408 	}
   2409 
   2410 	if (progress & IBD_DRV_TXINTR_ADDED) {
   2411 		ddi_remove_softintr(state->id_tx);
   2412 		state->id_tx = NULL;
   2413 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
   2414 	}
   2415 
   2416 	if (progress & IBD_DRV_RXINTR_ADDED) {
   2417 		ddi_remove_softintr(state->id_rx);
   2418 		state->id_rx = NULL;
   2419 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
   2420 	}
   2421 
   2422 #ifdef DEBUG
   2423 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
   2424 		kstat_delete(state->rc_ksp);
   2425 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
   2426 	}
   2427 #endif
   2428 
   2429 	if (progress & IBD_DRV_STATE_INITIALIZED) {
   2430 		ibd_state_fini(state);
   2431 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
   2432 	}
   2433 
   2434 	instance = ddi_get_instance(dip);
   2435 	ddi_soft_state_free(ibd_list, instance);
   2436 
   2437 	return (DDI_SUCCESS);
   2438 }
   2439 
   2440 /*
   2441  * Attach device to the IO framework.
   2442  */
   2443 static int
   2444 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
   2445 {
   2446 	ibd_state_t *state = NULL;
   2447 	ib_guid_t hca_guid;
   2448 	int instance;
   2449 	ibt_status_t ret;
   2450 	int rv;
   2451 
   2452 	/*
   2453 	 * IBD doesn't support suspend/resume
   2454 	 */
   2455 	if (cmd != DDI_ATTACH)
   2456 		return (DDI_FAILURE);
   2457 
   2458 	/*
   2459 	 * Allocate softstate structure
   2460 	 */
   2461 	instance = ddi_get_instance(dip);
   2462 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
   2463 		return (DDI_FAILURE);
   2464 	state = ddi_get_soft_state(ibd_list, instance);
   2465 
   2466 	/*
   2467 	 * Initialize mutexes and condition variables
   2468 	 */
   2469 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
   2470 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
   2471 		goto attach_fail;
   2472 	}
   2473 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
   2474 
   2475 	/*
   2476 	 * Allocate rx,tx softintr
   2477 	 */
   2478 	if (ibd_rx_softintr == 1) {
   2479 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
   2480 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
   2481 			DPRINT(10, "ibd_attach: failed in "
   2482 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
   2483 			goto attach_fail;
   2484 		}
   2485 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
   2486 	}
   2487 	if (ibd_tx_softintr == 1) {
   2488 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
   2489 		    NULL, NULL, ibd_tx_recycle,
   2490 		    (caddr_t)state)) != DDI_SUCCESS) {
   2491 			DPRINT(10, "ibd_attach: failed in "
   2492 			    "ddi_add_softintr(id_tx), ret=%d", rv);
   2493 			goto attach_fail;
   2494 		}
   2495 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
   2496 	}
   2497 
   2498 	/*
   2499 	 * Obtain IBA P_Key, port number and HCA guid and validate
   2500 	 * them (for P_Key, only full members are allowed as per
   2501 	 * IPoIB specification; neither port number nor HCA guid
   2502 	 * can be zero)
   2503 	 */
   2504 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
   2505 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
   2506 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
   2507 		    state->id_pkey);
   2508 		goto attach_fail;
   2509 	}
   2510 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
   2511 	    "port-number", 0)) == 0) {
   2512 		DPRINT(10, "ibd_attach: invalid port number (%d)",
   2513 		    state->id_port);
   2514 		goto attach_fail;
   2515 	}
   2516 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
   2517 	    "hca-guid", 0)) == 0) {
   2518 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
   2519 		    hca_guid);
   2520 		goto attach_fail;
   2521 	}
   2522 
   2523 	/*
   2524 	 * Attach to IBTL
   2525 	 */
   2526 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
   2527 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
   2528 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
   2529 		goto attach_fail;
   2530 	}
   2531 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
   2532 
   2533 	/*
   2534 	 * Open the HCA
   2535 	 */
   2536 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
   2537 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
   2538 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
   2539 		goto attach_fail;
   2540 	}
   2541 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
   2542 
   2543 	/* Get RC config before ibd_record_capab */
   2544 	ibd_rc_get_conf(state);
   2545 
   2546 #ifdef DEBUG
   2547 	/* Initialize Driver Counters for Reliable Connected Mode */
   2548 	if (state->id_enable_rc) {
   2549 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
   2550 			DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats");
   2551 			goto attach_fail;
   2552 		}
   2553 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
   2554 	}
   2555 #endif
   2556 
   2557 	/*
   2558 	 * Record capabilities
   2559 	 */
   2560 	(void) ibd_record_capab(state, dip);
   2561 
   2562 	/*
   2563 	 * Allocate a protection domain on the HCA
   2564 	 */
   2565 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
   2566 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
   2567 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
   2568 		goto attach_fail;
   2569 	}
   2570 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
   2571 
   2572 
   2573 	/*
   2574 	 * Register ibd interfaces with the Nemo framework
   2575 	 */
   2576 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
   2577 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
   2578 		goto attach_fail;
   2579 	}
   2580 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
   2581 
   2582 	/*
   2583 	 * We're done with everything we could to make the attach
   2584 	 * succeed.  All the buffer allocations and IPoIB broadcast
   2585 	 * group joins are deferred to when the interface instance
   2586 	 * is actually plumbed to avoid wasting memory.
   2587 	 */
   2588 	return (DDI_SUCCESS);
   2589 
   2590 attach_fail:
   2591 	(void) ibd_unattach(state, dip);
   2592 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
   2593 	return (DDI_FAILURE);
   2594 }
   2595 
   2596 /*
   2597  * Detach device from the IO framework.
   2598  */
   2599 static int
   2600 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
   2601 {
   2602 	ibd_state_t *state;
   2603 	int instance;
   2604 
   2605 	/*
   2606 	 * IBD doesn't support suspend/resume
   2607 	 */
   2608 	if (cmd != DDI_DETACH)
   2609 		return (DDI_FAILURE);
   2610 
   2611 	/*
   2612 	 * Get the instance softstate
   2613 	 */
   2614 	instance = ddi_get_instance(dip);
   2615 	state = ddi_get_soft_state(ibd_list, instance);
   2616 
   2617 	/*
   2618 	 * Release all resources we're holding still.  Note that if we'd
   2619 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
   2620 	 * so far, we should find all the flags we need in id_mac_state.
   2621 	 */
   2622 	return (ibd_unattach(state, dip));
   2623 }
   2624 
   2625 /*
   2626  * Pre ibt_attach() driver initialization
   2627  */
   2628 static int
   2629 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
   2630 {
   2631 	char buf[64];
   2632 
   2633 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
   2634 	state->id_link_state = LINK_STATE_UNKNOWN;
   2635 
   2636 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
   2637 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
   2638 	state->id_trap_stop = B_TRUE;
   2639 	state->id_trap_inprog = 0;
   2640 
   2641 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
   2642 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
   2643 	state->id_dip = dip;
   2644 
   2645 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
   2646 
   2647 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2648 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2649 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
   2650 	state->id_tx_busy = 0;
   2651 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
   2652 
   2653 	state->id_rx_list.dl_bufs_outstanding = 0;
   2654 	state->id_rx_list.dl_cnt = 0;
   2655 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2656 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2657 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
   2658 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
   2659 	    0, NULL, NULL, NULL, NULL, NULL, 0);
   2660 
   2661 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
   2662 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
   2663 
   2664 	/* For Reliable Connected Mode */
   2665 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
   2666 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
   2667 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2668 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
   2669 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
   2670 	    MUTEX_DRIVER, NULL);
   2671 
   2672 	return (DDI_SUCCESS);
   2673 }
   2674 
   2675 /*
   2676  * Post ibt_detach() driver deconstruction
   2677  */
   2678 static void
   2679 ibd_state_fini(ibd_state_t *state)
   2680 {
   2681 	cv_destroy(&state->id_macst_cv);
   2682 	mutex_destroy(&state->id_macst_lock);
   2683 
   2684 	kmem_cache_destroy(state->id_req_kmc);
   2685 
   2686 	mutex_destroy(&state->id_rx_list.dl_mutex);
   2687 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
   2688 
   2689 	mutex_destroy(&state->id_txpost_lock);
   2690 	mutex_destroy(&state->id_tx_list.dl_mutex);
   2691 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
   2692 	mutex_destroy(&state->id_lso_lock);
   2693 
   2694 	mutex_destroy(&state->id_sched_lock);
   2695 	mutex_destroy(&state->id_scq_poll_lock);
   2696 	mutex_destroy(&state->id_rcq_poll_lock);
   2697 
   2698 	cv_destroy(&state->id_trap_cv);
   2699 	mutex_destroy(&state->id_trap_lock);
   2700 	mutex_destroy(&state->id_link_mutex);
   2701 
   2702 	/* For Reliable Connected Mode */
   2703 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
   2704 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
   2705 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
   2706 	mutex_destroy(&state->rc_tx_large_bufs_lock);
   2707 	mutex_destroy(&state->rc_rx_lock);
   2708 }
   2709 
   2710 /*
   2711  * Fetch link speed from SA for snmp ifspeed reporting.
   2712  */
   2713 static uint64_t
   2714 ibd_get_portspeed(ibd_state_t *state)
   2715 {
   2716 	int			ret;
   2717 	ibt_path_info_t		path;
   2718 	ibt_path_attr_t		path_attr;
   2719 	uint8_t			num_paths;
   2720 	uint64_t		ifspeed;
   2721 
   2722 	/*
   2723 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
   2724 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
   2725 	 * 2000000000. Start with that as default.
   2726 	 */
   2727 	ifspeed = 2000000000;
   2728 
   2729 	bzero(&path_attr, sizeof (path_attr));
   2730 
   2731 	/*
   2732 	 * Get the port speed from Loopback path information.
   2733 	 */
   2734 	path_attr.pa_dgids = &state->id_sgid;
   2735 	path_attr.pa_num_dgids = 1;
   2736 	path_attr.pa_sgid = state->id_sgid;
   2737 
   2738 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
   2739 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
   2740 		goto earlydone;
   2741 
   2742 	if (num_paths < 1)
   2743 		goto earlydone;
   2744 
   2745 	/*
   2746 	 * In case SA does not return an expected value, report the default
   2747 	 * speed as 1X.
   2748 	 */
   2749 	ret = 1;
   2750 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
   2751 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
   2752 			ret = 1;
   2753 			break;
   2754 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
   2755 			ret = 4;
   2756 			break;
   2757 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
   2758 			ret = 12;
   2759 			break;
   2760 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
   2761 			ret = 2;
   2762 			break;
   2763 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
   2764 			ret = 8;
   2765 			break;
   2766 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
   2767 			ret = 16;
   2768 			break;
   2769 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
   2770 			ret = 24;
   2771 			break;
   2772 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
   2773 			ret = 32;
   2774 			break;
   2775 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
   2776 			ret = 48;
   2777 			break;
   2778 	}
   2779 
   2780 	ifspeed *= ret;
   2781 
   2782 earlydone:
   2783 	return (ifspeed);
   2784 }
   2785 
   2786 /*
   2787  * Search input mcg list (id_mc_full or id_mc_non) for an entry
   2788  * representing the input mcg mgid.
   2789  */
   2790 static ibd_mce_t *
   2791 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
   2792 {
   2793 	ibd_mce_t *ptr = list_head(mlist);
   2794 
   2795 	/*
   2796 	 * Do plain linear search.
   2797 	 */
   2798 	while (ptr != NULL) {
   2799 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
   2800 		    sizeof (ib_gid_t)) == 0)
   2801 			return (ptr);
   2802 		ptr = list_next(mlist, ptr);
   2803 	}
   2804 	return (NULL);
   2805 }
   2806 
   2807 /*
   2808  * Execute IBA JOIN.
   2809  */
   2810 static ibt_status_t
   2811 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
   2812 {
   2813 	ibt_mcg_attr_t mcg_attr;
   2814 
   2815 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
   2816 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
   2817 	mcg_attr.mc_mgid = mgid;
   2818 	mcg_attr.mc_join_state = mce->mc_jstate;
   2819 	mcg_attr.mc_scope = state->id_scope;
   2820 	mcg_attr.mc_pkey = state->id_pkey;
   2821 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
   2822 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
   2823 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
   2824 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
   2825 	    NULL, NULL));
   2826 }
   2827 
   2828 /*
   2829  * This code JOINs the port in the proper way (depending on the join
   2830  * state) so that IBA fabric will forward mcg packets to/from the port.
   2831  * It also attaches the QPN to the mcg so it can receive those mcg
   2832  * packets. This code makes sure not to attach the mcg to the QP if
   2833  * that has been previously done due to the mcg being joined with a
   2834  * different join state, even though this is not required by SWG_0216,
   2835  * refid 3610.
   2836  */
   2837 static ibd_mce_t *
   2838 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
   2839 {
   2840 	ibt_status_t ibt_status;
   2841 	ibd_mce_t *mce, *tmce, *omce = NULL;
   2842 	boolean_t do_attach = B_TRUE;
   2843 
   2844 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
   2845 	    jstate, mgid.gid_prefix, mgid.gid_guid);
   2846 
   2847 	/*
   2848 	 * For enable_multicast Full member joins, we need to do some
   2849 	 * extra work. If there is already an mce on the list that
   2850 	 * indicates full membership, that means the membership has
   2851 	 * not yet been dropped (since the disable_multicast was issued)
   2852 	 * because there are pending Tx's to the mcg; in that case, just
   2853 	 * mark the mce not to be reaped when the Tx completion queues
   2854 	 * an async reap operation.
   2855 	 *
   2856 	 * If there is already an mce on the list indicating sendonly
   2857 	 * membership, try to promote to full membership. Be careful
   2858 	 * not to deallocate the old mce, since there might be an AH
   2859 	 * pointing to it; instead, update the old mce with new data
   2860 	 * that tracks the full membership.
   2861 	 */
   2862 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
   2863 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
   2864 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
   2865 			ASSERT(omce->mc_fullreap);
   2866 			omce->mc_fullreap = B_FALSE;
   2867 			return (omce);
   2868 		} else {
   2869 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
   2870 		}
   2871 	}
   2872 
   2873 	/*
   2874 	 * Allocate the ibd_mce_t to track this JOIN.
   2875 	 */
   2876 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
   2877 	mce->mc_fullreap = B_FALSE;
   2878 	mce->mc_jstate = jstate;
   2879 
   2880 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
   2881 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
   2882 		    ibt_status);
   2883 		kmem_free(mce, sizeof (ibd_mce_t));
   2884 		return (NULL);
   2885 	}
   2886 
   2887 	/*
   2888 	 * Is an IBA attach required? Not if the interface is already joined
   2889 	 * to the mcg in a different appropriate join state.
   2890 	 */
   2891 	if (jstate == IB_MC_JSTATE_NON) {
   2892 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
   2893 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
   2894 			do_attach = B_FALSE;
   2895 	} else if (jstate == IB_MC_JSTATE_FULL) {
   2896 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
   2897 			do_attach = B_FALSE;
   2898 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
   2899 		do_attach = B_FALSE;
   2900 	}
   2901 
   2902 	if (do_attach) {
   2903 		/*
   2904 		 * Do the IBA attach.
   2905 		 */
   2906 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
   2907 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
   2908 		    &mce->mc_info)) != IBT_SUCCESS) {
   2909 			DPRINT(10, "ibd_join_group : failed qp attachment "
   2910 			    "%d\n", ibt_status);
   2911 			/*
   2912 			 * NOTE that we should probably preserve the join info
   2913 			 * in the list and later try to leave again at detach
   2914 			 * time.
   2915 			 */
   2916 			(void) ibt_leave_mcg(state->id_sgid, mgid,
   2917 			    state->id_sgid, jstate);
   2918 			kmem_free(mce, sizeof (ibd_mce_t));
   2919 			return (NULL);
   2920 		}
   2921 	}
   2922 
   2923 	/*
   2924 	 * Insert the ibd_mce_t in the proper list.
   2925 	 */
   2926 	if (jstate == IB_MC_JSTATE_NON) {
   2927 		IBD_MCACHE_INSERT_NON(state, mce);
   2928 	} else {
   2929 		/*
   2930 		 * Set up the mc_req fields used for reaping the
   2931 		 * mcg in case of delayed tx completion (see
   2932 		 * ibd_tx_cleanup()). Also done for sendonly join in
   2933 		 * case we are promoted to fullmembership later and
   2934 		 * keep using the same mce.
   2935 		 */
   2936 		mce->mc_req.rq_gid = mgid;
   2937 		mce->mc_req.rq_ptr = mce;
   2938 		/*
   2939 		 * Check whether this is the case of trying to join
   2940 		 * full member, and we were already joined send only.
   2941 		 * We try to drop our SendOnly membership, but it is
   2942 		 * possible that the mcg does not exist anymore (and
   2943 		 * the subnet trap never reached us), so the leave
   2944 		 * operation might fail.
   2945 		 */
   2946 		if (omce != NULL) {
   2947 			(void) ibt_leave_mcg(state->id_sgid, mgid,
   2948 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
   2949 			omce->mc_jstate = IB_MC_JSTATE_FULL;
   2950 			bcopy(&mce->mc_info, &omce->mc_info,
   2951 			    sizeof (ibt_mcg_info_t));
   2952 			kmem_free(mce, sizeof (ibd_mce_t));
   2953 			return (omce);
   2954 		}
   2955 		mutex_enter(&state->id_mc_mutex);
   2956 		IBD_MCACHE_INSERT_FULL(state, mce);
   2957 		mutex_exit(&state->id_mc_mutex);
   2958 	}
   2959 
   2960 	return (mce);
   2961 }
   2962 
   2963 /*
   2964  * Called during port up event handling to attempt to reacquire full
   2965  * membership to an mcg. Stripped down version of ibd_join_group().
   2966  * Note that it is possible that the mcg might have gone away, and
   2967  * gets recreated at this point.
   2968  */
   2969 static void
   2970 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
   2971 {
   2972 	ib_gid_t mgid;
   2973 
   2974 	/*
   2975 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
   2976 	 * reap/leave is going to try to leave the group. We could prevent
   2977 	 * that by adding a boolean flag into ibd_mce_t, if required.
   2978 	 */
   2979 	if (mce->mc_fullreap)
   2980 		return;
   2981 
   2982 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
   2983 
   2984 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
   2985 	    mgid.gid_guid);
   2986 
   2987 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
   2988 		ibd_print_warn(state, "Failure on port up to rejoin "
   2989 		    "multicast gid %016llx:%016llx",
   2990 		    (u_longlong_t)mgid.gid_prefix,
   2991 		    (u_longlong_t)mgid.gid_guid);
   2992 }
   2993 
   2994 /*
   2995  * This code handles delayed Tx completion cleanups for mcg's to which
   2996  * disable_multicast has been issued, regular mcg related cleanups during
   2997  * disable_multicast, disable_promiscuous and mcg traps, as well as
   2998  * cleanups during driver detach time. Depending on the join state,
   2999  * it deletes the mce from the appropriate list and issues the IBA
   3000  * leave/detach; except in the disable_multicast case when the mce
   3001  * is left on the active list for a subsequent Tx completion cleanup.
   3002  */
   3003 static void
   3004 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
   3005     uint8_t jstate)
   3006 {
   3007 	ibd_mce_t *tmce;
   3008 	boolean_t do_detach = B_TRUE;
   3009 
   3010 	/*
   3011 	 * Before detaching, we must check whether the other list
   3012 	 * contains the mcg; if we detach blindly, the consumer
   3013 	 * who set up the other list will also stop receiving
   3014 	 * traffic.
   3015 	 */
   3016 	if (jstate == IB_MC_JSTATE_FULL) {
   3017 		/*
   3018 		 * The following check is only relevant while coming
   3019 		 * from the Tx completion path in the reap case.
   3020 		 */
   3021 		if (!mce->mc_fullreap)
   3022 			return;
   3023 		mutex_enter(&state->id_mc_mutex);
   3024 		IBD_MCACHE_PULLOUT_FULL(state, mce);
   3025 		mutex_exit(&state->id_mc_mutex);
   3026 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
   3027 			do_detach = B_FALSE;
   3028 	} else if (jstate == IB_MC_JSTATE_NON) {
   3029 		IBD_MCACHE_PULLOUT_NON(state, mce);
   3030 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
   3031 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
   3032 			do_detach = B_FALSE;
   3033 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
   3034 		mutex_enter(&state->id_mc_mutex);
   3035 		IBD_MCACHE_PULLOUT_FULL(state, mce);
   3036 		mutex_exit(&state->id_mc_mutex);
   3037 		do_detach = B_FALSE;
   3038 	}
   3039 
   3040 	/*
   3041 	 * If we are reacting to a mcg trap and leaving our sendonly or
   3042 	 * non membership, the mcg is possibly already gone, so attempting
   3043 	 * to leave might fail. On the other hand, we must try to leave
   3044 	 * anyway, since this might be a trap from long ago, and we could
   3045 	 * have potentially sendonly joined to a recent incarnation of
   3046 	 * the mcg and are about to loose track of this information.
   3047 	 */
   3048 	if (do_detach) {
   3049 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
   3050 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
   3051 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
   3052 	}
   3053 
   3054 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
   3055 	kmem_free(mce, sizeof (ibd_mce_t));
   3056 }
   3057 
   3058 /*
   3059  * Async code executed due to multicast and promiscuous disable requests
   3060  * and mcg trap handling; also executed during driver detach. Mostly, a
   3061  * leave and detach is done; except for the fullmember case when Tx
   3062  * requests are pending, whence arrangements are made for subsequent
   3063  * cleanup on Tx completion.
   3064  */
   3065 static void
   3066 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
   3067 {
   3068 	ipoib_mac_t mcmac;
   3069 	boolean_t recycled;
   3070 	ibd_mce_t *mce;
   3071 
   3072 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
   3073 	    jstate, mgid.gid_prefix, mgid.gid_guid);
   3074 
   3075 	if (jstate == IB_MC_JSTATE_NON) {
   3076 		recycled = B_TRUE;
   3077 		mce = IBD_MCACHE_FIND_NON(state, mgid);
   3078 		/*
   3079 		 * In case we are handling a mcg trap, we might not find
   3080 		 * the mcg in the non list.
   3081 		 */
   3082 		if (mce == NULL) {
   3083 			return;
   3084 		}
   3085 	} else {
   3086 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
   3087 
   3088 		/*
   3089 		 * In case we are handling a mcg trap, make sure the trap
   3090 		 * is not arriving late; if we have an mce that indicates
   3091 		 * that we are already a fullmember, that would be a clear
   3092 		 * indication that the trap arrived late (ie, is for a
   3093 		 * previous incarnation of the mcg).
   3094 		 */
   3095 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
   3096 			if ((mce == NULL) || (mce->mc_jstate ==
   3097 			    IB_MC_JSTATE_FULL)) {
   3098 				return;
   3099 			}
   3100 		} else {
   3101 			ASSERT(jstate == IB_MC_JSTATE_FULL);
   3102 
   3103 			/*
   3104 			 * If join group failed, mce will be NULL here.
   3105 			 * This is because in GLDv3 driver, set multicast
   3106 			 *  will always return success.
   3107 			 */
   3108 			if (mce == NULL) {
   3109 				return;
   3110 			}
   3111 
   3112 			mce->mc_fullreap = B_TRUE;
   3113 		}
   3114 
   3115 		/*
   3116 		 * If no pending Tx's remain that reference the AH
   3117 		 * for the mcg, recycle it from active to free list.
   3118 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
   3119 		 * so the last completing Tx will cause an async reap
   3120 		 * operation to be invoked, at which time we will drop our
   3121 		 * membership to the mcg so that the pending Tx's complete
   3122 		 * successfully. Refer to comments on "AH and MCE active
   3123 		 * list manipulation" at top of this file. The lock protects
   3124 		 * against Tx fast path and Tx cleanup code.
   3125 		 */
   3126 		mutex_enter(&state->id_ac_mutex);
   3127 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
   3128 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
   3129 		    IB_MC_JSTATE_SEND_ONLY_NON));
   3130 		mutex_exit(&state->id_ac_mutex);
   3131 	}
   3132 
   3133 	if (recycled) {
   3134 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
   3135 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
   3136 		ibd_async_reap_group(state, mce, mgid, jstate);
   3137 	}
   3138 }
   3139 
   3140 /*
   3141  * Find the broadcast address as defined by IPoIB; implicitly
   3142  * determines the IBA scope, mtu, tclass etc of the link the
   3143  * interface is going to be a member of.
   3144  */
   3145 static ibt_status_t
   3146 ibd_find_bgroup(ibd_state_t *state)
   3147 {
   3148 	ibt_mcg_attr_t mcg_attr;
   3149 	uint_t numg;
   3150 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
   3151 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
   3152 	    IB_MC_SCOPE_GLOBAL };
   3153 	int i, mcgmtu;
   3154 	boolean_t found = B_FALSE;
   3155 	int ret;
   3156 	ibt_mcg_info_t mcg_info;
   3157 
   3158 	state->id_bgroup_created = B_FALSE;
   3159 
   3160 query_bcast_grp:
   3161 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
   3162 	mcg_attr.mc_pkey = state->id_pkey;
   3163 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3164 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
   3165 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3166 
   3167 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
   3168 		state->id_scope = mcg_attr.mc_scope = scopes[i];
   3169 
   3170 		/*
   3171 		 * Look for the IPoIB broadcast group.
   3172 		 */
   3173 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3174 		state->id_mgid.gid_prefix =
   3175 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
   3176 		    ((uint64_t)state->id_scope << 48) |
   3177 		    ((uint32_t)(state->id_pkey << 16)));
   3178 		mcg_attr.mc_mgid = state->id_mgid;
   3179 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3180 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
   3181 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
   3182 			found = B_TRUE;
   3183 			break;
   3184 		}
   3185 	}
   3186 
   3187 	if (!found) {
   3188 		if (ibd_create_broadcast_group) {
   3189 			/*
   3190 			 * If we created the broadcast group, but failed to
   3191 			 * find it, we can't do anything except leave the
   3192 			 * one we created and return failure.
   3193 			 */
   3194 			if (state->id_bgroup_created) {
   3195 				ibd_print_warn(state, "IPoIB broadcast group "
   3196 				    "absent. Unable to query after create.");
   3197 				goto find_bgroup_fail;
   3198 			}
   3199 
   3200 			/*
   3201 			 * Create the ipoib broadcast group if it didn't exist
   3202 			 */
   3203 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
   3204 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
   3205 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
   3206 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
   3207 			mcg_attr.mc_pkey = state->id_pkey;
   3208 			mcg_attr.mc_flow = 0;
   3209 			mcg_attr.mc_sl = 0;
   3210 			mcg_attr.mc_tclass = 0;
   3211 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3212 			state->id_mgid.gid_prefix =
   3213 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
   3214 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
   3215 			    ((uint32_t)(state->id_pkey << 16)));
   3216 			mcg_attr.mc_mgid = state->id_mgid;
   3217 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
   3218 
   3219 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
   3220 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
   3221 				ibd_print_warn(state, "IPoIB broadcast group "
   3222 				    "absent, create failed: ret = %d\n", ret);
   3223 				state->id_bgroup_created = B_FALSE;
   3224 				return (IBT_FAILURE);
   3225 			}
   3226 			state->id_bgroup_created = B_TRUE;
   3227 			goto query_bcast_grp;
   3228 		} else {
   3229 			ibd_print_warn(state, "IPoIB broadcast group absent");
   3230 			return (IBT_FAILURE);
   3231 		}
   3232 	}
   3233 
   3234 	/*
   3235 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
   3236 	 */
   3237 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
   3238 	if (state->id_mtu < mcgmtu) {
   3239 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
   3240 		    "greater than port's maximum MTU %d", mcgmtu,
   3241 		    state->id_mtu);
   3242 		ibt_free_mcg_info(state->id_mcinfo, 1);
   3243 		goto find_bgroup_fail;
   3244 	}
   3245 	state->id_mtu = mcgmtu;
   3246 
   3247 	return (IBT_SUCCESS);
   3248 
   3249 find_bgroup_fail:
   3250 	if (state->id_bgroup_created) {
   3251 		(void) ibt_leave_mcg(state->id_sgid,
   3252 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
   3253 		    IB_MC_JSTATE_FULL);
   3254 	}
   3255 
   3256 	return (IBT_FAILURE);
   3257 }
   3258 
   3259 static int
   3260 ibd_alloc_tx_copybufs(ibd_state_t *state)
   3261 {
   3262 	ibt_mr_attr_t mem_attr;
   3263 
   3264 	/*
   3265 	 * Allocate one big chunk for all regular tx copy bufs
   3266 	 */
   3267 	state->id_tx_buf_sz = state->id_mtu;
   3268 	if (state->id_lso_policy && state->id_lso_capable &&
   3269 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
   3270 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
   3271 	}
   3272 
   3273 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
   3274 	    state->id_tx_buf_sz, KM_SLEEP);
   3275 
   3276 	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
   3277 	    sizeof (ibd_swqe_t), KM_SLEEP);
   3278 
   3279 	/*
   3280 	 * Do one memory registration on the entire txbuf area
   3281 	 */
   3282 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
   3283 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
   3284 	mem_attr.mr_as = NULL;
   3285 	mem_attr.mr_flags = IBT_MR_SLEEP;
   3286 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
   3287 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
   3288 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
   3289 		kmem_free(state->id_tx_wqes,
   3290 		    state->id_num_swqe * sizeof (ibd_swqe_t));
   3291 		kmem_free(state->id_tx_bufs,
   3292 		    state->id_num_swqe * state->id_tx_buf_sz);
   3293 		state->id_tx_bufs = NULL;
   3294 		return (DDI_FAILURE);
   3295 	}
   3296 
   3297 	return (DDI_SUCCESS);
   3298 }
   3299 
   3300 static int
   3301 ibd_alloc_tx_lsobufs(ibd_state_t *state)
   3302 {
   3303 	ibt_mr_attr_t mem_attr;
   3304 	ibd_lsobuf_t *buflist;
   3305 	ibd_lsobuf_t *lbufp;
   3306 	ibd_lsobuf_t *tail;
   3307 	ibd_lsobkt_t *bktp;
   3308 	uint8_t *membase;
   3309 	uint8_t *memp;
   3310 	uint_t memsz;
   3311 	int i;
   3312 
   3313 	/*
   3314 	 * Allocate the lso bucket
   3315 	 */
   3316 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
   3317 
   3318 	/*
   3319 	 * Allocate the entire lso memory and register it
   3320 	 */
   3321 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
   3322 	membase = kmem_zalloc(memsz, KM_SLEEP);
   3323 
   3324 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
   3325 	mem_attr.mr_len = memsz;
   3326 	mem_attr.mr_as = NULL;
   3327 	mem_attr.mr_flags = IBT_MR_SLEEP;
   3328 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
   3329 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
   3330 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
   3331 		kmem_free(membase, memsz);
   3332 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
   3333 		return (DDI_FAILURE);
   3334 	}
   3335 
   3336 	mutex_enter(&state->id_lso_lock);
   3337 
   3338 	/*
   3339 	 * Now allocate the buflist.  Note that the elements in the buflist and
   3340 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
   3341 	 * can always derive the address of a buflist entry from the address of
   3342 	 * an lso buffer.
   3343 	 */
   3344 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
   3345 	    KM_SLEEP);
   3346 
   3347 	/*
   3348 	 * Set up the lso buf chain
   3349 	 */
   3350 	memp = membase;
   3351 	lbufp = buflist;
   3352 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
   3353 		lbufp->lb_isfree = 1;
   3354 		lbufp->lb_buf = memp;
   3355 		lbufp->lb_next = lbufp + 1;
   3356 
   3357 		tail = lbufp;
   3358 
   3359 		memp += IBD_LSO_BUFSZ;
   3360 		lbufp++;
   3361 	}
   3362 	tail->lb_next = NULL;
   3363 
   3364 	/*
   3365 	 * Set up the LSO buffer information in ibd state
   3366 	 */
   3367 	bktp->bkt_bufl = buflist;
   3368 	bktp->bkt_free_head = buflist;
   3369 	bktp->bkt_mem = membase;
   3370 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
   3371 	bktp->bkt_nfree = bktp->bkt_nelem;
   3372 
   3373 	state->id_lso = bktp;
   3374 	mutex_exit(&state->id_lso_lock);
   3375 
   3376 	return (DDI_SUCCESS);
   3377 }
   3378 
   3379 /*
   3380  * Statically allocate Tx buffer list(s).
   3381  */
   3382 static int
   3383 ibd_init_txlist(ibd_state_t *state)
   3384 {
   3385 	ibd_swqe_t *swqe;
   3386 	ibt_lkey_t lkey;
   3387 	int i;
   3388 	uint_t len;
   3389 	uint8_t *bufaddr;
   3390 
   3391 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
   3392 		return (DDI_FAILURE);
   3393 
   3394 	if (state->id_lso_policy && state->id_lso_capable) {
   3395 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
   3396 			state->id_lso_policy = B_FALSE;
   3397 	}
   3398 
   3399 	mutex_enter(&state->id_tx_list.dl_mutex);
   3400 	state->id_tx_list.dl_head = NULL;
   3401 	state->id_tx_list.dl_pending_sends = B_FALSE;
   3402 	state->id_tx_list.dl_cnt = 0;
   3403 	mutex_exit(&state->id_tx_list.dl_mutex);
   3404 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
   3405 	state->id_tx_rel_list.dl_head = NULL;
   3406 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
   3407 	state->id_tx_rel_list.dl_cnt = 0;
   3408 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
   3409 
   3410 	/*
   3411 	 * Allocate and setup the swqe list
   3412 	 */
   3413 	lkey = state->id_tx_mr_desc.md_lkey;
   3414 	bufaddr = state->id_tx_bufs;
   3415 	len = state->id_tx_buf_sz;
   3416 	swqe = state->id_tx_wqes;
   3417 	mutex_enter(&state->id_tx_list.dl_mutex);
   3418 	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
   3419 		swqe->swqe_next = NULL;
   3420 		swqe->swqe_im_mblk = NULL;
   3421 
   3422 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
   3423 		    bufaddr;
   3424 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
   3425 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
   3426 
   3427 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
   3428 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
   3429 		swqe->w_swr.wr_trans = IBT_UD_SRV;
   3430 
   3431 		/* These are set in send */
   3432 		swqe->w_swr.wr_nds = 0;
   3433 		swqe->w_swr.wr_sgl = NULL;
   3434 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
   3435 
   3436 		/* add to list */
   3437 		state->id_tx_list.dl_cnt++;
   3438 		swqe->swqe_next = state->id_tx_list.dl_head;
   3439 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
   3440 	}
   3441 	mutex_exit(&state->id_tx_list.dl_mutex);
   3442 
   3443 	return (DDI_SUCCESS);
   3444 }
   3445 
   3446 static int
   3447 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
   3448     uint32_t *nds_p)
   3449 {
   3450 	ibd_lsobkt_t *bktp;
   3451 	ibd_lsobuf_t *lbufp;
   3452 	ibd_lsobuf_t *nextp;
   3453 	ibt_lkey_t lso_lkey;
   3454 	uint_t frag_sz;
   3455 	uint_t num_needed;
   3456 	int i;
   3457 
   3458 	ASSERT(sgl_p != NULL);
   3459 	ASSERT(nds_p != NULL);
   3460 	ASSERT(req_sz != 0);
   3461 
   3462 	/*
   3463 	 * Determine how many bufs we'd need for the size requested
   3464 	 */
   3465 	num_needed = req_sz / IBD_LSO_BUFSZ;
   3466 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
   3467 		num_needed++;
   3468 
   3469 	mutex_enter(&state->id_lso_lock);
   3470 
   3471 	/*
   3472 	 * If we don't have enough lso bufs, return failure
   3473 	 */
   3474 	ASSERT(state->id_lso != NULL);
   3475 	bktp = state->id_lso;
   3476 	if (bktp->bkt_nfree < num_needed) {
   3477 		mutex_exit(&state->id_lso_lock);
   3478 		return (-1);
   3479 	}
   3480 
   3481 	/*
   3482 	 * Pick the first 'num_needed' bufs from the free list
   3483 	 */
   3484 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
   3485 	lbufp = bktp->bkt_free_head;
   3486 	for (i = 0; i < num_needed; i++) {
   3487 		ASSERT(lbufp->lb_isfree != 0);
   3488 		ASSERT(lbufp->lb_buf != NULL);
   3489 
   3490 		nextp = lbufp->lb_next;
   3491 
   3492 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
   3493 		sgl_p[i].ds_key = lso_lkey;
   3494 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
   3495 
   3496 		lbufp->lb_isfree = 0;
   3497 		lbufp->lb_next = NULL;
   3498 
   3499 		lbufp = nextp;
   3500 	}
   3501 	bktp->bkt_free_head = lbufp;
   3502 
   3503 	/*
   3504 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
   3505 	 * to adjust the last sgl entry's length. Since we know we need atleast
   3506 	 * one, the i-1 use below is ok.
   3507 	 */
   3508 	if (frag_sz) {
   3509 		sgl_p[i-1].ds_len = frag_sz;
   3510 	}
   3511 
   3512 	/*
   3513 	 * Update nfree count and return
   3514 	 */
   3515 	bktp->bkt_nfree -= num_needed;
   3516 
   3517 	mutex_exit(&state->id_lso_lock);
   3518 
   3519 	*nds_p = num_needed;
   3520 
   3521 	return (0);
   3522 }
   3523 
   3524 static void
   3525 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
   3526 {
   3527 	ibd_lsobkt_t *bktp;
   3528 	ibd_lsobuf_t *lbufp;
   3529 	uint8_t *lso_mem_end;
   3530 	uint_t ndx;
   3531 	int i;
   3532 
   3533 	mutex_enter(&state->id_lso_lock);
   3534 
   3535 	bktp = state->id_lso;
   3536 	ASSERT(bktp != NULL);
   3537 
   3538 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
   3539 	for (i = 0; i < nds; i++) {
   3540 		uint8_t *va;
   3541 
   3542 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
   3543 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
   3544 
   3545 		/*
   3546 		 * Figure out the buflist element this sgl buffer corresponds
   3547 		 * to and put it back at the head
   3548 		 */
   3549 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
   3550 		lbufp = bktp->bkt_bufl + ndx;
   3551 
   3552 		ASSERT(lbufp->lb_isfree == 0);
   3553 		ASSERT(lbufp->lb_buf == va);
   3554 
   3555 		lbufp->lb_isfree = 1;
   3556 		lbufp->lb_next = bktp->bkt_free_head;
   3557 		bktp->bkt_free_head = lbufp;
   3558 	}
   3559 	bktp->bkt_nfree += nds;
   3560 
   3561 	mutex_exit(&state->id_lso_lock);
   3562 }
   3563 
   3564 static void
   3565 ibd_free_tx_copybufs(ibd_state_t *state)
   3566 {
   3567 	/*
   3568 	 * Unregister txbuf mr
   3569 	 */
   3570 	if (ibt_deregister_mr(state->id_hca_hdl,
   3571 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
   3572 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
   3573 	}
   3574 	state->id_tx_mr_hdl = NULL;
   3575 
   3576 	/*
   3577 	 * Free txbuf memory
   3578 	 */
   3579 	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
   3580 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
   3581 	state->id_tx_wqes = NULL;
   3582 	state->id_tx_bufs = NULL;
   3583 }
   3584 
   3585 static void
   3586 ibd_free_tx_lsobufs(ibd_state_t *state)
   3587 {
   3588 	ibd_lsobkt_t *bktp;
   3589 
   3590 	mutex_enter(&state->id_lso_lock);
   3591 
   3592 	if ((bktp = state->id_lso) == NULL) {
   3593 		mutex_exit(&state->id_lso_lock);
   3594 		return;
   3595 	}
   3596 
   3597 	/*
   3598 	 * First, free the buflist
   3599 	 */
   3600 	ASSERT(bktp->bkt_bufl != NULL);
   3601 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
   3602 
   3603 	/*
   3604 	 * Unregister the LSO memory and free it
   3605 	 */
   3606 	ASSERT(bktp->bkt_mr_hdl != NULL);
   3607 	if (ibt_deregister_mr(state->id_hca_hdl,
   3608 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
   3609 		DPRINT(10,
   3610 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
   3611 	}
   3612 	ASSERT(bktp->bkt_mem);
   3613 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
   3614 
   3615 	/*
   3616 	 * Finally free the bucket
   3617 	 */
   3618 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
   3619 	state->id_lso = NULL;
   3620 
   3621 	mutex_exit(&state->id_lso_lock);
   3622 }
   3623 
   3624 /*
   3625  * Free the statically allocated Tx buffer list.
   3626  */
   3627 static void
   3628 ibd_fini_txlist(ibd_state_t *state)
   3629 {
   3630 	/*
   3631 	 * Free the allocated swqes
   3632 	 */
   3633 	mutex_enter(&state->id_tx_list.dl_mutex);
   3634 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
   3635 	state->id_tx_list.dl_head = NULL;
   3636 	state->id_tx_list.dl_pending_sends = B_FALSE;
   3637 	state->id_tx_list.dl_cnt = 0;
   3638 	state->id_tx_rel_list.dl_head = NULL;
   3639 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
   3640 	state->id_tx_rel_list.dl_cnt = 0;
   3641 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
   3642 	mutex_exit(&state->id_tx_list.dl_mutex);
   3643 
   3644 	ibd_free_tx_lsobufs(state);
   3645 	ibd_free_tx_copybufs(state);
   3646 }
   3647 
   3648 /*
   3649  * post a list of rwqes, NULL terminated.
   3650  */
   3651 static void
   3652 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
   3653 {
   3654 	uint_t		i;
   3655 	uint_t		num_posted;
   3656 	ibt_status_t	ibt_status;
   3657 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
   3658 
   3659 	while (rwqe) {
   3660 		/* Post up to IBD_RX_POST_CNT receive work requests */
   3661 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
   3662 			wrs[i] = rwqe->w_rwr;
   3663 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
   3664 			if (rwqe == NULL) {
   3665 				i++;
   3666 				break;
   3667 			}
   3668 		}
   3669 
   3670 		/*
   3671 		 * If posting fails for some reason, we'll never receive
   3672 		 * completion intimation, so we'll need to cleanup. But
   3673 		 * we need to make sure we don't clean up nodes whose
   3674 		 * wrs have been successfully posted. We assume that the
   3675 		 * hca driver returns on the first failure to post and
   3676 		 * therefore the first 'num_posted' entries don't need
   3677 		 * cleanup here.
   3678 		 */
   3679 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
   3680 
   3681 		num_posted = 0;
   3682 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
   3683 		    &num_posted);
   3684 		if (ibt_status != IBT_SUCCESS) {
   3685 			/* This cannot happen unless the device has an error. */
   3686 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
   3687 			    "posting multiple wrs failed: "
   3688 			    "requested=%d, done=%d, ret=%d",
   3689 			    IBD_RX_POST_CNT, num_posted, ibt_status);
   3690 			atomic_add_32(&state->id_rx_list.dl_cnt,
   3691 			    num_posted - i);
   3692 		}
   3693 	}
   3694 }
   3695 
   3696 /*
   3697  * Grab a list of rwqes from the array of lists, and post the list.
   3698  */
   3699 static void
   3700 ibd_post_recv_intr(ibd_state_t *state)
   3701 {
   3702 	ibd_rx_queue_t	*rxp;
   3703 	ibd_rwqe_t *list;
   3704 
   3705 	/* rotate through the rx_queue array, expecting an adequate number */
   3706 	state->id_rx_post_queue_index =
   3707 	    (state->id_rx_post_queue_index + 1) &
   3708 	    (state->id_rx_nqueues - 1);
   3709 
   3710 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
   3711 	mutex_enter(&rxp->rx_post_lock);
   3712 	list = WQE_TO_RWQE(rxp->rx_head);
   3713 	rxp->rx_head = NULL;
   3714 	rxp->rx_cnt = 0;
   3715 	mutex_exit(&rxp->rx_post_lock);
   3716 	ibd_post_recv_list(state, list);
   3717 }
   3718 
   3719 /* macro explained below */
   3720 #define	RX_QUEUE_HASH(rwqe) \
   3721 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
   3722 
   3723 /*
   3724  * Add a rwqe to one of the the Rx lists.  If the list is large enough
   3725  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
   3726  *
   3727  * Note: one of 2^N lists is chosen via a hash.  This is done
   3728  * because using one list is contentious.  If the first list is busy
   3729  * (mutex_tryenter fails), use a second list (just call mutex_enter).
   3730  *
   3731  * The number 8 in RX_QUEUE_HASH is a random choice that provides
   3732  * even distribution of mapping rwqes to the 2^N queues.
   3733  */
   3734 static void
   3735 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
   3736 {
   3737 	ibd_rx_queue_t	*rxp;
   3738 
   3739 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
   3740 
   3741 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
   3742 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
   3743 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
   3744 		mutex_enter(&rxp->rx_post_lock);
   3745 	}
   3746 	rwqe->rwqe_next = rxp->rx_head;
   3747 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
   3748 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
   3749 
   3750 		/* only call ibt_post_recv() every Nth time through here */
   3751 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
   3752 			rxp->rx_head = NULL;
   3753 			rxp->rx_cnt = 0;
   3754 			mutex_exit(&rxp->rx_post_lock);
   3755 			ibd_post_recv_list(state, rwqe);
   3756 			return;
   3757 		}
   3758 	}
   3759 	rxp->rx_head = RWQE_TO_WQE(rwqe);
   3760 	mutex_exit(&rxp->rx_post_lock);
   3761 }
   3762 
   3763 static int
   3764 ibd_alloc_rx_copybufs(ibd_state_t *state)
   3765 {
   3766 	ibt_mr_attr_t mem_attr;
   3767 	int i;
   3768 
   3769 	/*
   3770 	 * Allocate one big chunk for all regular rx copy bufs
   3771 	 */
   3772 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
   3773 
   3774 	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
   3775 	    state->id_rx_buf_sz, KM_SLEEP);
   3776 
   3777 	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
   3778 	    sizeof (ibd_rwqe_t), KM_SLEEP);
   3779 
   3780 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
   3781 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
   3782 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
   3783 	for (i = 0; i < state->id_rx_nqueues; i++) {
   3784 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
   3785 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
   3786 	}
   3787 
   3788 	/*
   3789 	 * Do one memory registration on the entire rxbuf area
   3790 	 */
   3791 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
   3792 	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
   3793 	mem_attr.mr_as = NULL;
   3794 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
   3795 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
   3796 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
   3797 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
   3798 		kmem_free(state->id_rx_wqes,
   3799 		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
   3800 		kmem_free(state->id_rx_bufs,
   3801 		    state->id_num_rwqe * state->id_rx_buf_sz);
   3802 		state->id_rx_bufs = NULL;
   3803 		state->id_rx_wqes = NULL;
   3804 		return (DDI_FAILURE);
   3805 	}
   3806 
   3807 	return (DDI_SUCCESS);
   3808 }
   3809 
   3810 /*
   3811  * Allocate the statically allocated Rx buffer list.
   3812  */
   3813 static int
   3814 ibd_init_rxlist(ibd_state_t *state)
   3815 {
   3816 	ibd_rwqe_t *rwqe, *next;
   3817 	ibd_wqe_t *list;
   3818 	ibt_lkey_t lkey;
   3819 	int i;
   3820 	uint_t len;
   3821 	uint8_t *bufaddr;
   3822 
   3823 	mutex_enter(&state->id_rx_free_list.dl_mutex);
   3824 	if (state->id_rx_free_list.dl_head != NULL) {
   3825 		/* rx rsrcs were never freed.  Just repost them */
   3826 		len = state->id_rx_buf_sz;
   3827 		list = state->id_rx_free_list.dl_head;
   3828 		state->id_rx_free_list.dl_head = NULL;
   3829 		state->id_rx_free_list.dl_cnt = 0;
   3830 		mutex_exit(&state->id_rx_free_list.dl_mutex);
   3831 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
   3832 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
   3833 			if ((rwqe->rwqe_im_mblk = desballoc(
   3834 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
   3835 			    &rwqe->w_freemsg_cb)) == NULL) {
   3836 				/* allow freemsg_cb to free the rwqes */
   3837 				if (atomic_dec_32_nv(&state->id_running) != 0) {
   3838 					cmn_err(CE_WARN, "ibd_init_rxlist: "
   3839 					    "id_running was not 1\n");
   3840 				}
   3841 				DPRINT(10, "ibd_init_rxlist : "
   3842 				    "failed in desballoc()");
   3843 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
   3844 				    rwqe = next) {
   3845 					next = WQE_TO_RWQE(rwqe->rwqe_next);
   3846 					freemsg(rwqe->rwqe_im_mblk);
   3847 				}
   3848 				atomic_inc_32(&state->id_running);
   3849 				return (DDI_FAILURE);
   3850 			}
   3851 		}
   3852 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
   3853 		return (DDI_SUCCESS);
   3854 	}
   3855 	mutex_exit(&state->id_rx_free_list.dl_mutex);
   3856 
   3857 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
   3858 		return (DDI_FAILURE);
   3859 
   3860 	/*
   3861 	 * Allocate and setup the rwqe list
   3862 	 */
   3863 	len = state->id_rx_buf_sz;
   3864 	lkey = state->id_rx_mr_desc.md_lkey;
   3865 	rwqe = state->id_rx_wqes;
   3866 	bufaddr = state->id_rx_bufs;
   3867 	list = NULL;
   3868 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
   3869 		rwqe->w_state = state;
   3870 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
   3871 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
   3872 
   3873 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
   3874 
   3875 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
   3876 		    &rwqe->w_freemsg_cb)) == NULL) {
   3877 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
   3878 			/* allow freemsg_cb to free the rwqes */
   3879 			if (atomic_dec_32_nv(&state->id_running) != 0) {
   3880 				cmn_err(CE_WARN, "ibd_init_rxlist: "
   3881 				    "id_running was not 1\n");
   3882 			}
   3883 			DPRINT(10, "ibd_init_rxlist : "
   3884 			    "failed in desballoc()");
   3885 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
   3886 			    rwqe = next) {
   3887 				next = WQE_TO_RWQE(rwqe->rwqe_next);
   3888 				freemsg(rwqe->rwqe_im_mblk);
   3889 			}
   3890 			atomic_inc_32(&state->id_running);
   3891 			return (DDI_FAILURE);
   3892 		}
   3893 
   3894 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
   3895 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
   3896 		    (ib_vaddr_t)(uintptr_t)bufaddr;
   3897 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
   3898 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
   3899 		rwqe->w_rwr.wr_nds = 1;
   3900 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
   3901 
   3902 		rwqe->rwqe_next = list;
   3903 		list = RWQE_TO_WQE(rwqe);
   3904 	}
   3905 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
   3906 
   3907 	return (DDI_SUCCESS);
   3908 }
   3909 
   3910 static void
   3911 ibd_free_rx_copybufs(ibd_state_t *state)
   3912 {
   3913 	int i;
   3914 
   3915 	/*
   3916 	 * Unregister rxbuf mr
   3917 	 */
   3918 	if (ibt_deregister_mr(state->id_hca_hdl,
   3919 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
   3920 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
   3921 	}
   3922 	state->id_rx_mr_hdl = NULL;
   3923 
   3924 	/*
   3925 	 * Free rxbuf memory
   3926 	 */
   3927 	for (i = 0; i < state->id_rx_nqueues; i++) {
   3928 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
   3929 		mutex_destroy(&rxp->rx_post_lock);
   3930 	}
   3931 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
   3932 	    sizeof (ibd_rx_queue_t));
   3933 	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
   3934 	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
   3935 	state->id_rx_queues = NULL;
   3936 	state->id_rx_wqes = NULL;
   3937 	state->id_rx_bufs = NULL;
   3938 }
   3939 
   3940 static void
   3941 ibd_free_rx_rsrcs(ibd_state_t *state)
   3942 {
   3943 	mutex_enter(&state->id_rx_free_list.dl_mutex);
   3944 	if (state->id_rx_free_list.dl_head == NULL) {
   3945 		/* already freed */
   3946 		mutex_exit(&state->id_rx_free_list.dl_mutex);
   3947 		return;
   3948 	}
   3949 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe);
   3950 	ibd_free_rx_copybufs(state);
   3951 	state->id_rx_free_list.dl_cnt = 0;
   3952 	state->id_rx_free_list.dl_head = NULL;
   3953 	mutex_exit(&state->id_rx_free_list.dl_mutex);
   3954 }
   3955 
   3956 /*
   3957  * Free the statically allocated Rx buffer list.
   3958  */
   3959 static void
   3960 ibd_fini_rxlist(ibd_state_t *state)
   3961 {
   3962 	ibd_rwqe_t *rwqe;
   3963 	int i;
   3964 
   3965 	/* run through the rx_queue's, calling freemsg() */
   3966 	for (i = 0; i < state->id_rx_nqueues; i++) {
   3967 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
   3968 		mutex_enter(&rxp->rx_post_lock);
   3969 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
   3970 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
   3971 			freemsg(rwqe->rwqe_im_mblk);
   3972 			rxp->rx_cnt--;
   3973 		}
   3974 		rxp->rx_head = NULL;
   3975 		mutex_exit(&rxp->rx_post_lock);
   3976 	}
   3977 
   3978 	/* cannot free rx resources unless gld returned everything */
   3979 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
   3980 		ibd_free_rx_rsrcs(state);
   3981 }
   3982 
   3983 /*
   3984  * Free an allocated recv wqe.
   3985  */
   3986 /* ARGSUSED */
   3987 static void
   3988 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
   3989 {
   3990 	/*
   3991 	 * desballoc() failed (no memory).
   3992 	 *
   3993 	 * This rwqe is placed on a free list so that it
   3994 	 * can be reinstated when memory is available.
   3995 	 *
   3996 	 * NOTE: no code currently exists to reinstate
   3997 	 * these "lost" rwqes.
   3998 	 */
   3999 	mutex_enter(&state->id_rx_free_list.dl_mutex);
   4000 	state->id_rx_free_list.dl_cnt++;
   4001 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
   4002 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
   4003 	mutex_exit(&state->id_rx_free_list.dl_mutex);
   4004 }
   4005 
   4006 /*
   4007  * IBA Rx completion queue handler. Guaranteed to be single
   4008  * threaded and nonreentrant for this CQ.
   4009  */
   4010 /* ARGSUSED */
   4011 static void
   4012 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   4013 {
   4014 	ibd_state_t *state = (ibd_state_t *)arg;
   4015 
   4016 	atomic_inc_64(&state->id_num_intrs);
   4017 
   4018 	if (ibd_rx_softintr == 1) {
   4019 		mutex_enter(&state->id_rcq_poll_lock);
   4020 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
   4021 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
   4022 			mutex_exit(&state->id_rcq_poll_lock);
   4023 			return;
   4024 		} else {
   4025 			mutex_exit(&state->id_rcq_poll_lock);
   4026 			ddi_trigger_softintr(state->id_rx);
   4027 		}
   4028 	} else
   4029 		(void) ibd_intr((caddr_t)state);
   4030 }
   4031 
   4032 /*
   4033  * CQ handler for Tx completions, when the Tx CQ is in
   4034  * interrupt driven mode.
   4035  */
   4036 /* ARGSUSED */
   4037 static void
   4038 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
   4039 {
   4040 	ibd_state_t *state = (ibd_state_t *)arg;
   4041 
   4042 	atomic_inc_64(&state->id_num_intrs);
   4043 
   4044 	if (ibd_tx_softintr == 1) {
   4045 		mutex_enter(&state->id_scq_poll_lock);
   4046 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
   4047 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
   4048 			mutex_exit(&state->id_scq_poll_lock);
   4049 			return;
   4050 		} else {
   4051 			mutex_exit(&state->id_scq_poll_lock);
   4052 			ddi_trigger_softintr(state->id_tx);
   4053 		}
   4054 	} else
   4055 		(void) ibd_tx_recycle((caddr_t)state);
   4056 }
   4057 
   4058 /*
   4059  * Multicast group create/delete trap handler. These will be delivered
   4060  * on a kernel thread (handling can thus block) and can be invoked
   4061  * concurrently. The handler can be invoked anytime after it is
   4062  * registered and before ibt_detach().
   4063  */
   4064 /* ARGSUSED */
   4065 static void
   4066 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
   4067     ibt_subnet_event_t *event)
   4068 {
   4069 	ibd_state_t *state = (ibd_state_t *)arg;
   4070 	ibd_req_t *req;
   4071 
   4072 	/*
   4073 	 * The trap handler will get invoked once for every event for
   4074 	 * every port. The input "gid" is the GID0 of the port the
   4075 	 * trap came in on; we just need to act on traps that came
   4076 	 * to our port, meaning the port on which the ipoib interface
   4077 	 * resides. Since ipoib uses GID0 of the port, we just match
   4078 	 * the gids to check whether we need to handle the trap.
   4079 	 */
   4080 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
   4081 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
   4082 		return;
   4083 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
   4084 
   4085 	DPRINT(10, "ibd_notices_handler : %d\n", code);
   4086 
   4087 	switch (code) {
   4088 		case IBT_SM_EVENT_UNAVAILABLE:
   4089 			/*
   4090 			 * If we are in promiscuous mode or have
   4091 			 * sendnonmembers, we need to print a warning
   4092 			 * message right now. Else, just store the
   4093 			 * information, print when we enter promiscuous
   4094 			 * mode or attempt nonmember send. We might
   4095 			 * also want to stop caching sendnonmember.
   4096 			 */
   4097 			ibd_print_warn(state, "IBA multicast support "
   4098 			    "degraded due to unavailability of multicast "
   4099 			    "traps");
   4100 			break;
   4101 		case IBT_SM_EVENT_AVAILABLE:
   4102 			/*
   4103 			 * If we printed a warning message above or
   4104 			 * while trying to nonmember send or get into
   4105 			 * promiscuous mode, print an okay message.
   4106 			 */
   4107 			ibd_print_warn(state, "IBA multicast support "
   4108 			    "restored due to availability of multicast "
   4109 			    "traps");
   4110 			break;
   4111 		case IBT_SM_EVENT_MCG_CREATED:
   4112 		case IBT_SM_EVENT_MCG_DELETED:
   4113 			/*
   4114 			 * Common processing of creation/deletion traps.
   4115 			 * First check if the instance is being
   4116 			 * [de]initialized; back off then, without doing
   4117 			 * anything more, since we are not sure if the
   4118 			 * async thread is around, or whether we might
   4119 			 * be racing with the detach code in ibd_m_stop()
   4120 			 * that scans the mcg list.
   4121 			 */
   4122 			if (!ibd_async_safe(state))
   4123 				return;
   4124 
   4125 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
   4126 			req->rq_gid = event->sm_notice_gid;
   4127 			req->rq_ptr = (void *)code;
   4128 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
   4129 			break;
   4130 	}
   4131 }
   4132 
   4133 static void
   4134 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
   4135 {
   4136 	ib_gid_t mgid = req->rq_gid;
   4137 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
   4138 
   4139 	DPRINT(10, "ibd_async_trap : %d\n", code);
   4140 
   4141 	/*
   4142 	 * Atomically search the nonmember and sendonlymember lists and
   4143 	 * delete.
   4144 	 */
   4145 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
   4146 
   4147 	if (state->id_prom_op == IBD_OP_COMPLETED) {
   4148 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
   4149 
   4150 		/*
   4151 		 * If in promiscuous mode, try to join/attach to the new
   4152 		 * mcg. Given the unreliable out-of-order mode of trap
   4153 		 * delivery, we can never be sure whether it is a problem
   4154 		 * if the join fails. Thus, we warn the admin of a failure
   4155 		 * if this was a creation trap. Note that the trap might
   4156 		 * actually be reporting a long past event, and the mcg
   4157 		 * might already have been deleted, thus we might be warning
   4158 		 * in vain.
   4159 		 */
   4160 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
   4161 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
   4162 			ibd_print_warn(state, "IBA promiscuous mode missed "
   4163 			    "new multicast gid %016llx:%016llx",
   4164 			    (u_longlong_t)mgid.gid_prefix,
   4165 			    (u_longlong_t)mgid.gid_guid);
   4166 	}
   4167 
   4168 	/*
   4169 	 * Free the request slot allocated by the subnet event thread.
   4170 	 */
   4171 	ibd_async_done(state);
   4172 }
   4173 
   4174 /*
   4175  * GLDv3 entry point to get capabilities.
   4176  */
   4177 static boolean_t
   4178 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
   4179 {
   4180 	ibd_state_t *state = arg;
   4181 
   4182 	switch (cap) {
   4183 	case MAC_CAPAB_HCKSUM: {
   4184 		uint32_t *txflags = cap_data;
   4185 
   4186 		/*
   4187 		 * We either do full checksum or not do it at all
   4188 		 */
   4189 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
   4190 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
   4191 		else
   4192 			return (B_FALSE);
   4193 		break;
   4194 	}
   4195 
   4196 	case MAC_CAPAB_LSO: {
   4197 		mac_capab_lso_t *cap_lso = cap_data;
   4198 
   4199 		/*
   4200 		 * In addition to the capability and policy, since LSO
   4201 		 * relies on hw checksum, we'll not enable LSO if we
   4202 		 * don't have hw checksum.  Of course, if the HCA doesn't
   4203 		 * provide the reserved lkey capability, enabling LSO will
   4204 		 * actually affect performance adversely, so we'll disable
   4205 		 * LSO even for that case.
   4206 		 */
   4207 		if (!state->id_lso_policy || !state->id_lso_capable)
   4208 			return (B_FALSE);
   4209 
   4210 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
   4211 			return (B_FALSE);
   4212 
   4213 		if (state->id_hca_res_lkey_capab == 0) {
   4214 			ibd_print_warn(state, "no reserved-lkey capability, "
   4215 			    "disabling LSO");
   4216 			return (B_FALSE);
   4217 		}
   4218 
   4219 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
   4220 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
   4221 		break;
   4222 	}
   4223 
   4224 	default:
   4225 		return (B_FALSE);
   4226 	}
   4227 
   4228 	return (B_TRUE);
   4229 }
   4230 
   4231 static int
   4232 ibd_get_port_details(ibd_state_t *state)
   4233 {
   4234 	ibt_hca_portinfo_t *port_infop;
   4235 	ibt_status_t ret;
   4236 	uint_t psize, port_infosz;
   4237 
   4238 	mutex_enter(&state->id_link_mutex);
   4239 
   4240 	/*
   4241 	 * Query for port information
   4242 	 */
   4243 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
   4244 	    &port_infop, &psize, &port_infosz);
   4245 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
   4246 		mutex_exit(&state->id_link_mutex);
   4247 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
   4248 		    "failed, ret=%d", ret);
   4249 		return (ENETDOWN);
   4250 	}
   4251 
   4252 	/*
   4253 	 * If the link already went down by the time we get here,
   4254 	 * give up
   4255 	 */
   4256 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
   4257 		mutex_exit(&state->id_link_mutex);
   4258 		ibt_free_portinfo(port_infop, port_infosz);
   4259 		DPRINT(10, "ibd_get_port_details: port is not active");
   4260 		return (ENETDOWN);
   4261 	}
   4262 
   4263 	/*
   4264 	 * If the link is active, verify the pkey
   4265 	 */
   4266 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
   4267 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
   4268 		mutex_exit(&state->id_link_mutex);
   4269 		ibt_free_portinfo(port_infop, port_infosz);
   4270 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
   4271 		    "failed, ret=%d", ret);
   4272 		return (ENONET);
   4273 	}
   4274 
   4275 	state->id_mtu = (128 << port_infop->p_mtu);
   4276 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
   4277 	state->id_sgid = *port_infop->p_sgid_tbl;
   4278 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
   4279 	state->id_link_state = LINK_STATE_UP;
   4280 
   4281 	mutex_exit(&state->id_link_mutex);
   4282 	ibt_free_portinfo(port_infop, port_infosz);
   4283 
   4284 	/*
   4285 	 * Now that the port is active, record the port speed
   4286 	 */
   4287 	state->id_link_speed = ibd_get_portspeed(state);
   4288 
   4289 	return (0);
   4290 }
   4291 
   4292 static int
   4293 ibd_alloc_cqs(ibd_state_t *state)
   4294 {
   4295 	ibt_hca_attr_t hca_attrs;
   4296 	ibt_cq_attr_t cq_attr;
   4297 	ibt_status_t ret;
   4298 	uint32_t real_size;
   4299 
   4300 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
   4301 	ASSERT(ret == IBT_SUCCESS);
   4302 
   4303 	/*
   4304 	 * Allocate Rx/combined CQ:
   4305 	 * Theoretically, there is no point in having more than #rwqe
   4306 	 * plus #swqe cqe's, except that the CQ will be signaled for
   4307 	 * overflow when the last wqe completes, if none of the previous
   4308 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
   4309 	 * to make sure such overflow does not occur.
   4310 	 */
   4311 	cq_attr.cq_sched = NULL;
   4312 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
   4313 
   4314 	/*
   4315 	 * Allocate Receive CQ.
   4316 	 */
   4317 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
   4318 		cq_attr.cq_size = state->id_num_rwqe + 1;
   4319 	} else {
   4320 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
   4321 		state->id_num_rwqe = cq_attr.cq_size - 1;
   4322 	}
   4323 
   4324 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
   4325 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
   4326 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
   4327 		    "failed, ret=%d\n", ret);
   4328 		return (DDI_FAILURE);
   4329 	}
   4330 
   4331 	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
   4332 	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
   4333 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
   4334 		    "moderation failed, ret=%d\n", ret);
   4335 	}
   4336 
   4337 	/* make the #rx wc's the same as max rx chain size */
   4338 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
   4339 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
   4340 	    state->id_rxwcs_size, KM_SLEEP);
   4341 
   4342 	/*
   4343 	 * Allocate Send CQ.
   4344 	 */
   4345 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
   4346 		cq_attr.cq_size = state->id_num_swqe + 1;
   4347 	} else {
   4348 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
   4349 		state->id_num_swqe = cq_attr.cq_size - 1;
   4350 	}
   4351 
   4352 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
   4353 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
   4354 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
   4355 		    "failed, ret=%d\n", ret);
   4356 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
   4357 		    state->id_rxwcs_size);
   4358 		(void) ibt_free_cq(state->id_rcq_hdl);
   4359 		return (DDI_FAILURE);
   4360 	}
   4361 	if ((ret = ibt_modify_cq(state->id_scq_hdl,
   4362 	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
   4363 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
   4364 		    "moderation failed, ret=%d\n", ret);
   4365 	}
   4366 
   4367 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
   4368 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
   4369 	    state->id_txwcs_size, KM_SLEEP);
   4370 
   4371 	/*
   4372 	 * Print message in case we could not allocate as many wqe's
   4373 	 * as was requested.
   4374 	 */
   4375 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
   4376 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
   4377 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
   4378 	}
   4379 	if (state->id_num_swqe != IBD_NUM_SWQE) {
   4380 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
   4381 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
   4382 	}
   4383 
   4384 	return (DDI_SUCCESS);
   4385 }
   4386 
   4387 static int
   4388 ibd_setup_ud_channel(ibd_state_t *state)
   4389 {
   4390 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
   4391 	ibt_ud_chan_query_attr_t ud_chan_attr;
   4392 	ibt_status_t ret;
   4393 
   4394 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
   4395 	if (state->id_hca_res_lkey_capab)
   4396 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
   4397 	if (state->id_lso_policy && state->id_lso_capable)
   4398 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
   4399 
   4400 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
   4401 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
   4402 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
   4403 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
   4404 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
   4405 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
   4406 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
   4407 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
   4408 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
   4409 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
   4410 	ud_alloc_attr.ud_clone_chan	= NULL;
   4411 
   4412 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
   4413 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
   4414 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
   4415 		    "failed, ret=%d\n", ret);
   4416 		return (DDI_FAILURE);
   4417 	}
   4418 
   4419 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
   4420 	    &ud_chan_attr)) != IBT_SUCCESS) {
   4421 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
   4422 		    "failed, ret=%d\n", ret);
   4423 		(void) ibt_free_channel(state->id_chnl_hdl);
   4424 		return (DDI_FAILURE);
   4425 	}
   4426 
   4427 	state->id_qpnum = ud_chan_attr.ud_qpn;
   4428 
   4429 	return (DDI_SUCCESS);
   4430 }
   4431 
/*
 * Undo whatever ibd_start() managed to set up. The set of steps to undo
 * is taken from a snapshot of state->id_mac_state made on entry; each
 * step clears its own IBD_DRV_* bit in id_mac_state once it is done.
 *
 * cur_link_state: if LINK_STATE_DOWN, the link is reported as down;
 * any other value results in LINK_STATE_UNKNOWN being reported.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if RC mode is enabled and
 * ibd_rc_close_all_chan() fails; in that case the function returns
 * early and none of the later teardown steps are performed.
 */
static int
ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
{
	uint32_t progress = state->id_mac_state;
	uint_t attempts;
	ibt_status_t ret;
	ib_gid_t mgid;
	ibd_mce_t *mce;
	uint8_t jstate;

	/* id_running is expected to go from 1 to 0 here; warn if not */
	if (atomic_dec_32_nv(&state->id_running) != 0)
		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");

	/*
	 * Before we try to stop/undo whatever we did in ibd_start(),
	 * we need to mark the link state appropriately to prevent the
	 * ip layer from using this instance for any new transfers. Note
	 * that if the original state of the link was "up" when we're
	 * here, we'll set the final link state to "unknown", to behave
	 * in the same fashion as other ethernet drivers.
	 */
	mutex_enter(&state->id_link_mutex);
	if (cur_link_state == LINK_STATE_DOWN) {
		state->id_link_state = cur_link_state;
	} else {
		state->id_link_state = LINK_STATE_UNKNOWN;
	}
	mutex_exit(&state->id_link_mutex);
	mac_link_update(state->id_mh, state->id_link_state);

	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
	if (progress & IBD_DRV_STARTED) {
		state->id_mac_state &= (~IBD_DRV_STARTED);
	}

	/* Stop listen under Reliable Connected Mode */
	if (progress & IBD_DRV_RC_LISTEN) {
		ASSERT(state->id_enable_rc);
		if (state->rc_listen_hdl != NULL) {
			ibd_rc_stop_listen(state);
		}
		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
	}

	/*
	 * If the RC channels cannot all be closed, resume listening,
	 * set the listen bit again (regardless of what ibd_rc_listen()
	 * returned) and give up on the rest of the teardown.
	 */
	if (state->id_enable_rc) {
		if (ibd_rc_close_all_chan(state) != DDI_SUCCESS) {
			(void) ibd_rc_listen(state);
			state->id_mac_state |= IBD_DRV_RC_LISTEN;
			return (DDI_FAILURE);
		}
	}

	/*
	 * First, stop receive interrupts; this stops the driver from
	 * handing up buffers to higher layers.  Wait for receive buffers
	 * to be returned and give up after 1 second.
	 */
	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
		/*
		 * Up to 10 polls, 100ms apart. Adding 0 atomically is
		 * used here to read the counter's current value.
		 */
		attempts = 10;
		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
		    0) > 0) {
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
				DPRINT(2, "ibd_undo_start: "
				    "reclaiming failed");
				break;
			}
		}
		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
		ibd_rc_fini_tx_largebuf_list(state);
		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
	}

	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
		ASSERT(state->id_enable_rc);
		ibd_rc_fini_srq_list(state);
		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
	}

	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
		/*
		 * NOTE(review): a NULL handler presumably unregisters the
		 * subnet notices callback - confirm against IBTF docs.
		 */
		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);

		/*
		 * Flag trap handling as stopped and wait until the
		 * in-progress count drops to zero.
		 */
		mutex_enter(&state->id_trap_lock);
		state->id_trap_stop = B_TRUE;
		while (state->id_trap_inprog > 0)
			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
		mutex_exit(&state->id_trap_lock);

		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
	}

	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
		/*
		 * Flushing the channel ensures that all pending WQE's
		 * are marked with flush_error and handed to the CQ. It
		 * does not guarantee the invocation of the CQ handler.
		 * This call is guaranteed to return successfully for
		 * UD QPNs.
		 */
		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: flush_channel "
			    "failed, ret=%d", ret);
		}

		/*
		 * Give some time for the TX CQ handler to process the
		 * completions.
		 *
		 * Both tx list locks are taken (id_tx_list first, then
		 * id_tx_rel_list) to compare the combined count against
		 * id_num_swqe, and are dropped around each 100ms delay.
		 */
		mutex_enter(&state->id_tx_list.dl_mutex);
		mutex_enter(&state->id_tx_rel_list.dl_mutex);
		attempts = 10;
		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
		    != state->id_num_swqe) {
			if (--attempts == 0)
				break;
			mutex_exit(&state->id_tx_rel_list.dl_mutex);
			mutex_exit(&state->id_tx_list.dl_mutex);
			delay(drv_usectohz(100000));
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
		}
		/* Clear the send CQ handler while still holding both locks */
		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
		    state->id_num_swqe) {
			cmn_err(CE_WARN, "tx resources not freed\n");
		}
		mutex_exit(&state->id_tx_rel_list.dl_mutex);
		mutex_exit(&state->id_tx_list.dl_mutex);

		/*
		 * Likewise poll for the rx list count to reach zero before
		 * clearing the receive CQ handler.
		 */
		attempts = 10;
		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
			if (--attempts == 0)
				break;
			delay(drv_usectohz(100000));
		}
		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
			cmn_err(CE_WARN, "rx resources not freed\n");
		}

		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
		/*
		 * No new async requests will be posted since the device
		 * link state has been marked as unknown; completion handlers
		 * have been turned off, so Tx handler will not cause any
		 * more IBD_ASYNC_REAP requests.
		 *
		 * Queue a request for the async thread to exit, which will
		 * be serviced after any pending ones. This can take a while,
		 * specially if the SM is unreachable, since IBMF will slowly
		 * timeout each SM request issued by the async thread.  Reap
		 * the thread before continuing on, we do not want it to be
		 * lingering in modunloaded code (or we could move the reap
		 * to ibd_detach(), provided we keep track of the current
		 * id_async_thrid somewhere safe).
		 */
		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
		thread_join(state->id_async_thrid);

		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
		/*
		 * Drop all residual full/non membership. This includes full
		 * membership to the broadcast group, and any nonmembership
		 * acquired during transmits. We do this after the Tx completion
		 * handlers are done, since those might result in some late
		 * leaves; this also eliminates a potential race with that
		 * path wrt the mc full list insert/delete. Trap handling
		 * has also been suppressed at this point. Thus, no locks
		 * are required while traversing the mc full list.
		 */
		DPRINT(2, "ibd_undo_start: clear full cache entries");
		mce = list_head(&state->id_mc_full);
		while (mce != NULL) {
			/*
			 * Everything needed from this entry, including the
			 * next pointer, is fetched before the leave call;
			 * presumably ibd_leave_group() may remove the entry
			 * from the list - TODO confirm.
			 */
			mgid = mce->mc_info.mc_adds_vect.av_dgid;
			jstate = mce->mc_jstate;
			mce = list_next(&state->id_mc_full, mce);
			ibd_leave_group(state, mgid, jstate);
		}
		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
	}

	if (progress & IBD_DRV_RXLIST_ALLOCD) {
		ibd_fini_rxlist(state);
		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_TXLIST_ALLOCD) {
		ibd_fini_txlist(state);
		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: free_channel "
			    "failed, ret=%d", ret);
		}

		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
	}

	if (progress & IBD_DRV_CQS_ALLOCD) {
		/* Release each work-completion array along with its CQ */
		kmem_free(state->id_txwcs,
		    sizeof (ibt_wc_t) * state->id_txwcs_size);
		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: free_cq(scq) "
			    "failed, ret=%d", ret);
		}

		kmem_free(state->id_rxwcs,
		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
			    "ret=%d", ret);
		}

		/* Do not leave stale pointers/handles behind */
		state->id_txwcs = NULL;
		state->id_rxwcs = NULL;
		state->id_scq_hdl = NULL;
		state->id_rcq_hdl = NULL;

		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
	}

	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
		mutex_enter(&state->id_ac_mutex);
		mod_hash_destroy_hash(state->id_ah_active_hash);
		mutex_exit(&state->id_ac_mutex);
		ibd_acache_fini(state);

		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
		/*
		 * If we'd created the ipoib broadcast group and had
		 * successfully joined it, leave it now
		 */
		if (state->id_bgroup_created) {
			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
			jstate = IB_MC_JSTATE_FULL;
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, jstate);
		}
		ibt_free_mcg_info(state->id_mcinfo, 1);

		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
	}

	return (DDI_SUCCESS);
}
   4703 
/*
 * This pair of routines is used to set/clear the condition that
 * the caller is likely to do something to change the id_mac_state.
 * If there's already someone doing either a start or a stop (possibly
 * due to the async handler detecting a pkey relocation event, a plumb
 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
 * that's done.
 */
   4712 static void
   4713 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
   4714 {
   4715 	mutex_enter(&state->id_macst_lock);
   4716 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
   4717 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
   4718 
   4719 	state->id_mac_state |= flag;
   4720 	mutex_exit(&state->id_macst_lock);
   4721 }
   4722 
   4723 static void
   4724 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
   4725 {
   4726 	mutex_enter(&state->id_macst_lock);
   4727 	state->id_mac_state &= (~flag);
   4728 	cv_signal(&state->id_macst_cv);
   4729 	mutex_exit(&state->id_macst_lock);
   4730 }
   4731 
   4732 /*
   4733  * GLDv3 entry point to start hardware.
   4734  */
   4735 /*ARGSUSED*/
   4736 static int
   4737 ibd_m_start(void *arg)
   4738 {
   4739 	ibd_state_t *state = arg;
   4740 	int	ret;
   4741 
   4742 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
   4743 
   4744 	ret = ibd_start(state);
   4745 
   4746 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
   4747 
   4748 	return (ret);
   4749 }
   4750 
   4751 static int
   4752 ibd_start(ibd_state_t *state)
   4753 {
   4754 	kthread_t *kht;
   4755 	int err;
   4756 	ibt_status_t ret;
   4757 
   4758 	if (state->id_mac_state & IBD_DRV_STARTED)
   4759 		return (DDI_SUCCESS);
   4760 
   4761 	if (atomic_inc_32_nv(&state->id_running) != 1) {
   4762 		DPRINT(10, "ibd_start: id_running is non-zero");
   4763 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
   4764 		atomic_dec_32(&state->id_running);
   4765 		return (EINVAL);
   4766 	}
   4767 
   4768 	/*
   4769 	 * Get port details; if we fail here, very likely the port
   4770 	 * state is inactive or the pkey can't be verified.
   4771 	 */
   4772 	if ((err = ibd_get_port_details(state)) != 0) {
   4773 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
   4774 		goto start_fail;
   4775 	}
   4776 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
   4777 
   4778 	/*
   4779 	 * Find the IPoIB broadcast group
   4780 	 */
   4781 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
   4782 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
   4783 		err = ENOTACTIVE;
   4784 		goto start_fail;
   4785 	}
   4786 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
   4787 
   4788 	/*
   4789 	 * Initialize per-interface caches and lists; if we fail here,
   4790 	 * it is most likely due to a lack of resources
   4791 	 */
   4792 	if (ibd_acache_init(state) != DDI_SUCCESS) {
   4793 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
   4794 		err = ENOMEM;
   4795 		goto start_fail;
   4796 	}
   4797 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
   4798 
   4799 	/*
   4800 	 * Allocate send and receive completion queues
   4801 	 */
   4802 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
   4803 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
   4804 		err = ENOMEM;
   4805 		goto start_fail;
   4806 	}
   4807 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
   4808 
   4809 	/*
   4810 	 * Setup a UD channel
   4811 	 */
   4812 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
   4813 		err = ENOMEM;
   4814 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
   4815 		goto start_fail;
   4816 	}
   4817 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
   4818 
   4819 	/*
   4820 	 * Allocate and initialize the tx buffer list
   4821 	 */
   4822 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
   4823 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
   4824 		err = ENOMEM;
   4825 		goto start_fail;
   4826 	}
   4827 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
   4828 
   4829 	/*
   4830 	 * Create the send cq handler here
   4831 	 */
   4832 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
   4833 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
   4834 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
   4835 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
   4836 		    "failed, ret=%d", ret);
   4837 		err = EINVAL;
   4838 		goto start_fail;
   4839 	}
   4840 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
   4841 
   4842 	/*
   4843 	 * Allocate and initialize the rx buffer list
   4844 	 */
   4845 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
   4846 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
   4847 		err = ENOMEM;
   4848 		goto start_fail;
   4849 	}
   4850 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
   4851 
   4852 	/*
   4853 	 * Join IPoIB broadcast group
   4854 	 */
   4855 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
   4856 		DPRINT(10, "ibd_start: ibd_join_group() failed");
   4857 		err = ENOTACTIVE;
   4858 		goto start_fail;
   4859 	}
   4860 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
   4861 
   4862 	/*
   4863 	 * Create the async thread; thread_create never fails.
   4864 	 */
   4865 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
   4866 	    TS_RUN, minclsyspri);
   4867 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
   4868 	state->id_async_thrid = kht->t_did;
   4869 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
   4870 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
   4871 
   4872 	/*
   4873 	 * When we did mac_register() in ibd_attach(), we didn't register
   4874 	 * the real macaddr and we didn't have the true port mtu. Now that
   4875 	 * we're almost ready, set the local mac address and broadcast
   4876 	 * addresses and update gldv3 about the real values of these
   4877 	 * parameters.
   4878 	 */
   4879 	if (state->id_enable_rc) {
   4880 		ibd_h2n_mac(&state->id_macaddr,
   4881 		    IBD_MAC_ADDR_RC + state->id_qpnum,
   4882 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
   4883 	} else {
   4884 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
   4885 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
   4886 	}
   4887 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
   4888 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
   4889 
   4890 	if (!state->id_enable_rc) {
   4891 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
   4892 		    - IPOIB_HDRSIZE);
   4893 	}
   4894 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
   4895 
   4896 	/*
   4897 	 * Setup the receive cq handler
   4898 	 */
   4899 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
   4900 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
   4901 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
   4902 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
   4903 		    "failed, ret=%d", ret);
   4904 		err = EINVAL;
   4905 		goto start_fail;
   4906 	}
   4907 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
   4908 
   4909 	/*
   4910 	 * Setup the subnet notices handler after we've initialized the acache/
   4911 	 * mcache and started the async thread, both of which are required for
   4912 	 * the trap handler to function properly.
   4913 	 *
   4914 	 * Now that the async thread has been started (and we've already done
   4915 	 * a mac_register() during attach so mac_tx_update() can be called
   4916 	 * if necessary without any problem), we can enable the trap handler
   4917 	 * to queue requests to the async thread.
   4918 	 */
   4919 	ibt_register_subnet_notices(state->id_ibt_hdl,
   4920 	    ibd_snet_notices_handler, state);
   4921 	mutex_enter(&state->id_trap_lock);
   4922 	state->id_trap_stop = B_FALSE;
   4923 	mutex_exit(&state->id_trap_lock);
   4924 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
   4925 
   4926 	if (state->id_enable_rc) {
   4927 		if (state->rc_enable_srq) {
   4928 			/* Allocate SRQ resource */
   4929 			if (ibd_rc_init_srq_list(state) != IBT_SUCCESS)
   4930 				goto start_fail;
   4931 			state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
   4932 		}
   4933 
   4934 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
   4935 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
   4936 			    "failed");
   4937 			goto start_fail;
   4938 		}
   4939 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
   4940 
   4941 		/* RC: begin to listen only after everything is available */
   4942 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
   4943 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
   4944 			goto start_fail;
   4945 		}
   4946 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
   4947 	}
   4948 
   4949 	/*
   4950 	 * Indicate link status to GLDv3 and higher layers. By default,
   4951 	 * we assume we are in up state (which must have been true at
   4952 	 * least at the time the broadcast mcg's were probed); if there
   4953 	 * were any up/down transitions till the time we come here, the
   4954 	 * async handler will have updated last known state, which we
   4955 	 * use to tell GLDv3. The async handler will not send any
   4956 	 * notifications to GLDv3 till we reach here in the initialization
   4957 	 * sequence.
   4958 	 */
   4959 	state->id_mac_state |= IBD_DRV_STARTED;
   4960 	mac_link_update(state->id_mh, state->id_link_state);
   4961 
   4962 	return (DDI_SUCCESS);
   4963 
   4964 start_fail:
   4965 	/*
   4966 	 * If we ran into a problem during ibd_start() and ran into
   4967 	 * some other problem during undoing our partial work, we can't
   4968 	 * do anything about it.  Ignore any errors we might get from
   4969 	 * ibd_undo_start() and just return the original error we got.
   4970 	 */
   4971 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
   4972 	return (err);
   4973 }
   4974 
   4975 /*
   4976  * GLDv3 entry point to stop hardware from receiving packets.
   4977  */
   4978 /*ARGSUSED*/
   4979 static void
   4980 ibd_m_stop(void *arg)
   4981 {
   4982 	ibd_state_t *state = (ibd_state_t *)arg;
   4983 
   4984 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
   4985 
   4986 	(void) ibd_undo_start(state, state->id_link_state);
   4987 
   4988 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
   4989 }
   4990 
   4991 /*
   4992  * GLDv3 entry point to modify device's mac address. We do not
   4993  * allow address modifications.
   4994  */
   4995 static int
   4996 ibd_m_unicst(void *arg, const uint8_t *macaddr)
   4997 {
   4998 	ibd_state_t *state = arg;
   4999 
   5000 	/*
   5001 	 * Don't bother even comparing the macaddr if we haven't
   5002 	 * completed ibd_m_start().
   5003 	 */
   5004 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
   5005 		return (0);
   5006 
   5007 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
   5008 		return (0);
   5009 	else
   5010 		return (EINVAL);
   5011 }
   5012 
   5013 /*
   5014  * The blocking part of the IBA join/leave operations are done out
   5015  * of here on the async thread.
   5016  */
   5017 static void
   5018 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
   5019 {
   5020 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
   5021 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
   5022 
   5023 	if (op == IBD_ASYNC_JOIN) {
   5024 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
   5025 			ibd_print_warn(state, "Join multicast group failed :"
   5026 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
   5027 		}
   5028 	} else {
   5029 		/*
   5030 		 * Here, we must search for the proper mcg_info and
   5031 		 * use that to leave the group.
   5032 		 */
   5033 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
   5034 	}
   5035 }
   5036 
   5037 /*
   5038  * GLDv3 entry point for multicast enable/disable requests.
   5039  * This function queues the operation to the async thread and
   5040  * return success for a valid multicast address.
   5041  */
   5042 static int
   5043 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
   5044 {
   5045 	ibd_state_t *state = (ibd_state_t *)arg;
   5046 	ipoib_mac_t maddr, *mcast;
   5047 	ib_gid_t mgid;
   5048 	ibd_req_t *req;
   5049 
   5050 	/*
   5051 	 * If we haven't completed ibd_m_start(), async thread wouldn't
   5052 	 * have been started and id_bcaddr wouldn't be set, so there's
   5053 	 * no point in continuing.
   5054 	 */
   5055 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
   5056 		return (0);
   5057 
   5058 	/*
   5059 	 * The incoming multicast address might not be aligned properly
   5060 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
   5061 	 * it to look like one though, to get the offsets of the mc gid,
   5062 	 * since we know we are not going to dereference any values with
   5063 	 * the ipoib_mac_t pointer.
   5064 	 */
   5065 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
   5066 	mcast = &maddr;
   5067 
   5068 	/*
   5069 	 * Check validity of MCG address. We could additionally check
   5070 	 * that a enable/disable is not being issued on the "broadcast"
   5071 	 * mcg, but since this operation is only invokable by privileged
   5072 	 * programs anyway, we allow the flexibility to those dlpi apps.
   5073 	 * Note that we do not validate the "scope" of the IBA mcg.
   5074 	 */
   5075 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
   5076 		return (EINVAL);
   5077 
   5078 	/*
   5079 	 * fill in multicast pkey and scope
   5080 	 */
   5081 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
   5082 
   5083 	/*
   5084 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
   5085 	 * nothing (i.e. we stay JOINed to the broadcast group done in
   5086 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
   5087 	 * requires to be joined to broadcast groups at all times.
   5088 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
   5089 	 * depends on this.
   5090 	 */
   5091 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
   5092 		return (0);
   5093 
   5094 	ibd_n2h_gid(mcast, &mgid);
   5095 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
   5096 	if (req == NULL)
   5097 		return (ENOMEM);
   5098 
   5099 	req->rq_gid = mgid;
   5100 
   5101 	if (add) {
   5102 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
   5103 		    mgid.gid_prefix, mgid.gid_guid);
   5104 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
   5105 	} else {
   5106 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
   5107 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
   5108 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
   5109 	}
   5110 	return (0);
   5111 }
   5112 
   5113 /*
   5114  * The blocking part of the IBA promiscuous operations are done
   5115  * out of here on the async thread. The dlpireq parameter indicates
   5116  * whether this invocation is due to a dlpi request or due to
   5117  * a port up/down event.
   5118  */
   5119 static void
   5120 ibd_async_unsetprom(ibd_state_t *state)
   5121 {
   5122 	ibd_mce_t *mce = list_head(&state->id_mc_non);
   5123 	ib_gid_t mgid;
   5124 
   5125 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
   5126 
   5127 	while (mce != NULL) {
   5128 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
   5129 		mce = list_next(&state->id_mc_non, mce);
   5130 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
   5131 	}
   5132 	state->id_prom_op = IBD_OP_NOTSTARTED;
   5133 }
   5134 
   5135 /*
   5136  * The blocking part of the IBA promiscuous operations are done
   5137  * out of here on the async thread. The dlpireq parameter indicates
   5138  * whether this invocation is due to a dlpi request or due to
   5139  * a port up/down event.
   5140  */
   5141 static void
   5142 ibd_async_setprom(ibd_state_t *state)
   5143 {
   5144 	ibt_mcg_attr_t mcg_attr;
   5145 	ibt_mcg_info_t *mcg_info;
   5146 	ib_gid_t mgid;
   5147 	uint_t numg;
   5148 	int i;
   5149 	char ret = IBD_OP_COMPLETED;
   5150 
   5151 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
   5152 
   5153 	/*
   5154 	 * Obtain all active MC groups on the IB fabric with
   5155 	 * specified criteria (scope + Pkey + Qkey + mtu).
   5156 	 */
   5157 	bzero(&mcg_attr, sizeof (mcg_attr));
   5158 	mcg_attr.mc_pkey = state->id_pkey;
   5159 	mcg_attr.mc_scope = state->id_scope;
   5160 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
   5161 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
   5162 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
   5163 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
   5164 	    IBT_SUCCESS) {
   5165 		ibd_print_warn(state, "Could not get list of IBA multicast "
   5166 		    "groups");
   5167 		ret = IBD_OP_ERRORED;
   5168 		goto done;
   5169 	}
   5170 
   5171 	/*
   5172 	 * Iterate over the returned mcg's and join as NonMember
   5173 	 * to the IP mcg's.
   5174 	 */
   5175 	for (i = 0; i < numg; i++) {
   5176 		/*
   5177 		 * Do a NonMember JOIN on the MC group.
   5178 		 */
   5179 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
   5180 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
   5181 			ibd_print_warn(state, "IBA promiscuous mode missed "
   5182 			    "multicast gid %016llx:%016llx",
   5183 			    (u_longlong_t)mgid.gid_prefix,
   5184 			    (u_longlong_t)mgid.gid_guid);
   5185 	}
   5186 
   5187 	ibt_free_mcg_info(mcg_info, numg);
   5188 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
   5189 done:
   5190 	state->id_prom_op = ret;
   5191 }
   5192 
   5193 /*
   5194  * GLDv3 entry point for multicast promiscuous enable/disable requests.
   5195  * GLDv3 assumes phys state receives more packets than multi state,
   5196  * which is not true for IPoIB. Thus, treat the multi and phys
   5197  * promiscuous states the same way to work with GLDv3's assumption.
   5198  */
   5199 static int
   5200 ibd_m_promisc(void *arg, boolean_t on)
   5201 {
   5202 	ibd_state_t *state = (ibd_state_t *)arg;
   5203 	ibd_req_t *req;
   5204 
   5205 	/*
   5206 	 * Async thread wouldn't have been started if we haven't
   5207 	 * passed ibd_m_start()
   5208 	 */
   5209 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
   5210 		return (0);
   5211 
   5212 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
   5213 	if (req == NULL)
   5214 		return (ENOMEM);
   5215 	if (on) {
   5216 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
   5217 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
   5218 	} else {
   5219 		DPRINT(1, "ibd_m_promisc : unset_promisc");
   5220 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
   5221 	}
   5222 
   5223 	return (0);
   5224 }
   5225 
   5226 /*
   5227  * GLDv3 entry point for gathering statistics.
   5228  */
   5229 static int
   5230 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
   5231 {
   5232 	ibd_state_t *state = (ibd_state_t *)arg;
   5233 
   5234 	switch (stat) {
   5235 	case MAC_STAT_IFSPEED:
   5236 		*val = state->id_link_speed;
   5237 		break;
   5238 	case MAC_STAT_MULTIRCV:
   5239 		*val = state->id_multi_rcv;
   5240 		break;
   5241 	case MAC_STAT_BRDCSTRCV:
   5242 		*val = state->id_brd_rcv;
   5243 		break;
   5244 	case MAC_STAT_MULTIXMT:
   5245 		*val = state->id_multi_xmt;
   5246 		break;
   5247 	case MAC_STAT_BRDCSTXMT:
   5248 		*val = state->id_brd_xmt;
   5249 		break;
   5250 	case MAC_STAT_RBYTES:
   5251 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
   5252 		    + state->rc_rcv_copy_byte;
   5253 		break;
   5254 	case MAC_STAT_IPACKETS:
   5255 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
   5256 		    + state->rc_rcv_copy_pkt;
   5257 		break;
   5258 	case MAC_STAT_OBYTES:
   5259 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
   5260 		break;
   5261 	case MAC_STAT_OPACKETS:
   5262 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
   5263 		    state->rc_xmt_fragmented_pkt +
   5264 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
   5265 		break;
   5266 	case MAC_STAT_OERRORS:
   5267 		*val = state->id_ah_error;	/* failed AH translation */
   5268 		break;
   5269 	case MAC_STAT_IERRORS:
   5270 		*val = 0;
   5271 		break;
   5272 	case MAC_STAT_NOXMTBUF:
   5273 		*val = state->id_tx_short + state->rc_swqe_short +
   5274 		    state->rc_xmt_buf_short;
   5275 		break;
   5276 	case MAC_STAT_NORCVBUF:
   5277 	default:
   5278 		return (ENOTSUP);
   5279 	}
   5280 
   5281 	return (0);
   5282 }
   5283 
   5284 static void
   5285 ibd_async_txsched(ibd_state_t *state)
   5286 {
   5287 	ibd_resume_transmission(state);
   5288 }
   5289 
   5290 static void
   5291 ibd_resume_transmission(ibd_state_t *state)
   5292 {
   5293 	int flag;
   5294 	int met_thresh = 0;
   5295 	int thresh = 0;
   5296 	int ret = -1;
   5297 
   5298 	mutex_enter(&state->id_sched_lock);
   5299 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
   5300 		mutex_enter(&state->id_tx_list.dl_mutex);
   5301 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
   5302 		met_thresh = state->id_tx_list.dl_cnt +
   5303 		    state->id_tx_rel_list.dl_cnt;
   5304 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
   5305 		mutex_exit(&state->id_tx_list.dl_mutex);
   5306 		thresh = IBD_FREE_SWQES_THRESH;
   5307 		flag = IBD_RSRC_SWQE;
   5308 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
   5309 		ASSERT(state->id_lso != NULL);
   5310 		mutex_enter(&state->id_lso_lock);
   5311 		met_thresh = state->id_lso->bkt_nfree;
   5312 		thresh = IBD_FREE_LSOS_THRESH;
   5313 		mutex_exit(&state->id_lso_lock);
   5314 		flag = IBD_RSRC_LSOBUF;
   5315 		if (met_thresh > thresh)
   5316 			state->id_sched_lso_cnt++;
   5317 	}
   5318 	if (met_thresh > thresh) {
   5319 		state->id_sched_needed &= ~flag;
   5320 		state->id_sched_cnt++;
   5321 		ret = 0;
   5322 	}
   5323 	mutex_exit(&state->id_sched_lock);
   5324 
   5325 	if (ret == 0)
   5326 		mac_tx_update(state->id_mh);
   5327 }
   5328 
   5329 /*
   5330  * Release the send wqe back into free list.
   5331  */
   5332 static void
   5333 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
   5334 {
   5335 	/*
   5336 	 * Add back on Tx list for reuse.
   5337 	 */
   5338 	ASSERT(tail->swqe_next == NULL);
   5339 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
   5340 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
   5341 	tail->swqe_next = state->id_tx_rel_list.dl_head;
   5342 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
   5343 	state->id_tx_rel_list.dl_cnt += n;
   5344 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
   5345 }
   5346 
   5347 /*
   5348  * Acquire a send wqe from free list.
   5349  * Returns error number and send wqe pointer.
   5350  */
   5351 static ibd_swqe_t *
   5352 ibd_acquire_swqe(ibd_state_t *state)
   5353 {
   5354 	ibd_swqe_t *wqe;
   5355 
   5356 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
   5357 	if (state->id_tx_rel_list.dl_head != NULL) {
   5358 		/* transfer id_tx_rel_list to id_tx_list */
   5359 		state->id_tx_list.dl_head =
   5360 		    state->id_tx_rel_list.dl_head;
   5361 		state->id_tx_list.dl_cnt =
   5362 		    state->id_tx_rel_list.dl_cnt;
   5363 		state->id_tx_list.dl_pending_sends = B_FALSE;
   5364 
   5365 		/* clear id_tx_rel_list */
   5366 		state->id_tx_rel_list.dl_head = NULL;
   5367 		state->id_tx_rel_list.dl_cnt = 0;
   5368 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
   5369 
   5370 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
   5371 		state->id_tx_list.dl_cnt -= 1;
   5372 		state->id_tx_list.dl_head = wqe->swqe_next;
   5373 	} else {	/* no free swqe */
   5374 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
   5375 		state->id_tx_list.dl_pending_sends = B_TRUE;
   5376 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
   5377 		state->id_tx_short++;
   5378 		wqe = NULL;
   5379 	}
   5380 	return (wqe);
   5381 }
   5382 
   5383 static int
   5384 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
   5385     ibt_ud_dest_hdl_t ud_dest)
   5386 {
   5387 	mblk_t	*nmp;
   5388 	int iph_len, tcph_len;
   5389 	ibt_wr_lso_t *lso;
   5390 	uintptr_t ip_start, tcp_start;
   5391 	uint8_t *dst;
   5392 	uint_t pending, mblen;
   5393 
   5394 	/*
   5395 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
   5396 	 * we need to adjust it here for lso.
   5397 	 */
   5398 	lso = &(node->w_swr.wr.ud_lso);
   5399 	lso->lso_ud_dest = ud_dest;
   5400 	lso->lso_mss = mss;
   5401 
   5402 	/*
   5403 	 * Calculate the LSO header size and set it in the UD LSO structure.
   5404 	 * Note that the only assumption we make is that each of the IPoIB,
   5405 	 * IP and TCP headers will be contained in a single mblk fragment;
   5406 	 * together, the headers may span multiple mblk fragments.
   5407 	 */
   5408 	nmp = mp;
   5409 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
   5410 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
   5411 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
   5412 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
   5413 		nmp = nmp->b_cont;
   5414 
   5415 	}
   5416 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
   5417 
   5418 	tcp_start = ip_start + iph_len;
   5419 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
   5420 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
   5421 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
   5422 		nmp = nmp->b_cont;
   5423 	}
   5424 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
   5425 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
   5426 
   5427 	/*
   5428 	 * If the lso header fits entirely within a single mblk fragment,
   5429 	 * we'll avoid an additional copy of the lso header here and just
   5430 	 * pass the b_rptr of the mblk directly.
   5431 	 *
   5432 	 * If this isn't true, we'd have to allocate for it explicitly.
   5433 	 */
   5434 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
   5435 		lso->lso_hdr = mp->b_rptr;
   5436 	} else {
   5437 		/* On work completion, remember to free this allocated hdr */
   5438 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
   5439 		if (lso->lso_hdr == NULL) {
   5440 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
   5441 			    "sz = %d", lso->lso_hdr_sz);
   5442 			lso->lso_hdr_sz = 0;
   5443 			lso->lso_mss = 0;
   5444 			return (-1);
   5445 		}
   5446 	}
   5447 
   5448 	/*
   5449 	 * Copy in the lso header only if we need to
   5450 	 */
   5451 	if (lso->lso_hdr != mp->b_rptr) {
   5452 		dst = lso->lso_hdr;
   5453 		pending = lso->lso_hdr_sz;
   5454 
   5455 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
   5456 			mblen = MBLKL(nmp);
   5457 			if (pending > mblen) {
   5458 				bcopy(nmp->b_rptr, dst, mblen);
   5459 				dst += mblen;
   5460 				pending -= mblen;
   5461 			} else {
   5462 				bcopy(nmp->b_rptr, dst, pending);
   5463 				break;
   5464 			}
   5465 		}
   5466 	}
   5467 
   5468 	return (0);
   5469 }
   5470 
   5471 static void
   5472 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
   5473 {
   5474 	ibt_wr_lso_t *lso;
   5475 
   5476 	if ((!node) || (!mp))
   5477 		return;
   5478 
   5479 	/*
   5480 	 * Free any header space that we might've allocated if we
   5481 	 * did an LSO
   5482 	 */
   5483 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
   5484 		lso = &(node->w_swr.wr.ud_lso);
   5485 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
   5486 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
   5487 			lso->lso_hdr = NULL;
   5488 			lso->lso_hdr_sz = 0;
   5489 		}
   5490 	}
   5491 }
   5492 
   5493 static void
   5494 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
   5495 {
   5496 	uint_t		i;
   5497 	uint_t		num_posted;
   5498 	uint_t		n_wrs;
   5499 	ibt_status_t	ibt_status;
   5500 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
   5501 	ibd_swqe_t	*tx_head, *elem;
   5502 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
   5503 
   5504 	/* post the one request, then check for more */
   5505 	ibt_status = ibt_post_send(state->id_chnl_hdl,
   5506 	    &node->w_swr, 1, NULL);
   5507 	if (ibt_status != IBT_SUCCESS) {
   5508 		ibd_print_warn(state, "ibd_post_send: "
   5509 		    "posting one wr failed: ret=%d", ibt_status);
   5510 		ibd_tx_cleanup(state, node);
   5511 	}
   5512 
   5513 	tx_head = NULL;
   5514 	for (;;) {
   5515 		if (tx_head == NULL) {
   5516 			mutex_enter(&state->id_txpost_lock);
   5517 			tx_head = state->id_tx_head;
   5518 			if (tx_head == NULL) {
   5519 				state->id_tx_busy = 0;
   5520 				mutex_exit(&state->id_txpost_lock);
   5521 				return;
   5522 			}
   5523 			state->id_tx_head = NULL;
   5524 			mutex_exit(&state->id_txpost_lock);
   5525 		}
   5526 
   5527 		/*
   5528 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
   5529 		 * at a time if possible, and keep posting them.
   5530 		 */
   5531 		for (n_wrs = 0, elem = tx_head;
   5532 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
   5533 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
   5534 			nodes[n_wrs] = elem;
   5535 			wrs[n_wrs] = elem->w_swr;
   5536 		}
   5537 		tx_head = elem;
   5538 
   5539 		ASSERT(n_wrs != 0);
   5540 
   5541 		/*
   5542 		 * If posting fails for some reason, we'll never receive
   5543 		 * completion intimation, so we'll need to cleanup. But
   5544 		 * we need to make sure we don't clean up nodes whose
   5545 		 * wrs have been successfully posted. We assume that the
   5546 		 * hca driver returns on the first failure to post and
   5547 		 * therefore the first 'num_posted' entries don't need
   5548 		 * cleanup here.
   5549 		 */
   5550 		num_posted = 0;
   5551 		ibt_status = ibt_post_send(state->id_chnl_hdl,
   5552 		    wrs, n_wrs, &num_posted);
   5553 		if (ibt_status != IBT_SUCCESS) {
   5554 			ibd_print_warn(state, "ibd_post_send: "
   5555 			    "posting multiple wrs failed: "
   5556 			    "requested=%d, done=%d, ret=%d",
   5557 			    n_wrs, num_posted, ibt_status);
   5558 
   5559 			for (i = num_posted; i < n_wrs; i++)
   5560 				ibd_tx_cleanup(state, nodes[i]);
   5561 		}
   5562 	}
   5563 }
   5564 
   5565 static int
   5566 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
   5567     uint_t lsohdr_sz)
   5568 {
   5569 	ibt_wr_ds_t *sgl;
   5570 	ibt_status_t ibt_status;
   5571 	mblk_t *nmp;
   5572 	mblk_t *data_mp;
   5573 	uchar_t *bufp;
   5574 	size_t blksize;
   5575 	size_t skip;
   5576 	size_t avail;
   5577 	uint_t pktsize;
   5578 	uint_t frag_len;
   5579 	uint_t pending_hdr;
   5580 	int nmblks;
   5581 	int i;
   5582 
   5583 	/*
   5584 	 * Let's skip ahead to the data if this is LSO
   5585 	 */
   5586 	data_mp = mp;
   5587 	pending_hdr = 0;
   5588 	if (lsohdr_sz) {
   5589 		pending_hdr = lsohdr_sz;
   5590 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
   5591 			frag_len = nmp->b_wptr - nmp->b_rptr;
   5592 			if (frag_len > pending_hdr)
   5593 				break;
   5594 			pending_hdr -= frag_len;
   5595 		}
   5596 		data_mp = nmp;	/* start of data past lso header */
   5597 		ASSERT(data_mp != NULL);
   5598 	}
   5599 
   5600 	/*
   5601 	 * Calculate the size of message data and number of msg blocks
   5602 	 */
   5603 	pktsize = 0;
   5604 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
   5605 	    nmp = nmp->b_cont, nmblks++) {
   5606 		pktsize += MBLKL(nmp);
   5607 	}
   5608 	pktsize -= pending_hdr;
   5609 
   5610 	/*
   5611 	 * We only do ibt_map_mem_iov() if the pktsize is above the
   5612 	 * "copy-threshold", and if the number of mp fragments is less than
   5613 	 * the maximum acceptable.
   5614 	 */
   5615 	if ((state->id_hca_res_lkey_capab) &&
   5616 	    (pktsize > IBD_TX_COPY_THRESH) &&
   5617 	    (nmblks < state->id_max_sqseg_hiwm)) {
   5618 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
   5619 		ibt_iov_attr_t iov_attr;
   5620 
   5621 		iov_attr.iov_as = NULL;
   5622 		iov_attr.iov = iov_arr;
   5623 		iov_attr.iov_buf = NULL;
   5624 		iov_attr.iov_list_len = nmblks;
   5625 		iov_attr.iov_wr_nds = state->id_max_sqseg;
   5626 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
   5627 		iov_attr.iov_flags = IBT_IOV_SLEEP;
   5628 
   5629 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
   5630 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
   5631 			iov_arr[i].iov_len = MBLKL(nmp);
   5632 			if (i == 0) {
   5633 				iov_arr[i].iov_addr += pending_hdr;
   5634 				iov_arr[i].iov_len -= pending_hdr;
   5635 			}
   5636 		}
   5637 
   5638 		node->w_buftype = IBD_WQE_MAPPED;
   5639 		node->w_swr.wr_sgl = node->w_sgl;
   5640 
   5641 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
   5642 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
   5643 		if (ibt_status != IBT_SUCCESS) {
   5644 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
   5645 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
   5646 			goto ibd_copy_path;
   5647 		}
   5648 
   5649 		return (0);
   5650 	}
   5651 
   5652 ibd_copy_path:
   5653 	if (pktsize <= state->id_tx_buf_sz) {
   5654 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
   5655 		node->w_swr.wr_nds = 1;
   5656 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
   5657 		node->w_buftype = IBD_WQE_TXBUF;
   5658 
   5659 		/*
   5660 		 * Even though this is the copy path for transfers less than
   5661 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
   5662 		 * is possible the first data mblk fragment (data_mp) still
   5663 		 * contains part of the LSO header that we need to skip.
   5664 		 */
   5665 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
   5666 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
   5667 			blksize = MBLKL(nmp) - pending_hdr;
   5668 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
   5669 			bufp += blksize;
   5670 			pending_hdr = 0;
   5671 		}
   5672 
   5673 		return (0);
   5674 	}
   5675 
   5676 	/*
   5677 	 * Copy path for transfers greater than id_tx_buf_sz
   5678 	 */
   5679 	node->w_swr.wr_sgl = node->w_sgl;
   5680 	if (ibd_acquire_lsobufs(state, pktsize,
   5681 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
   5682 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
   5683 		return (-1);
   5684 	}
   5685 	node->w_buftype = IBD_WQE_LSOBUF;
   5686 
   5687 	/*
   5688 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
   5689 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
   5690 	 * need to skip part of the LSO header in the first fragment
   5691 	 * as before.
   5692 	 */
   5693 	nmp = data_mp;
   5694 	skip = pending_hdr;
   5695 	for (i = 0; i < node->w_swr.wr_nds; i++) {
   5696 		sgl = node->w_swr.wr_sgl + i;
   5697 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
   5698 		avail = IBD_LSO_BUFSZ;
   5699 		while (nmp && avail) {
   5700 			blksize = MBLKL(nmp) - skip;
   5701 			if (blksize > avail) {
   5702 				bcopy(nmp->b_rptr + skip, bufp, avail);
   5703 				skip += avail;
   5704 				avail = 0;
   5705 			} else {
   5706 				bcopy(nmp->b_rptr + skip, bufp, blksize);
   5707 				skip = 0;
   5708 				avail -= blksize;
   5709 				bufp += blksize;
   5710 				nmp = nmp->b_cont;
   5711 			}
   5712 		}
   5713 	}
   5714 
   5715 	return (0);
   5716 }
   5717 
   5718 /*
   5719  * Schedule a completion queue polling to reap the resource we're
   5720  * short on.  If we implement the change to reap tx completions
   5721  * in a separate thread, we'll need to wake up that thread here.
   5722  */
   5723 static int
   5724 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
   5725 {
   5726 	ibd_req_t *req;
   5727 
   5728 	mutex_enter(&state->id_sched_lock);
   5729 	state->id_sched_needed |= resource_type;
   5730 	mutex_exit(&state->id_sched_lock);
   5731 
   5732 	/*
   5733 	 * If we are asked to queue a work entry, we need to do it
   5734 	 */
   5735 	if (q_flag) {
   5736 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
   5737 		if (req == NULL)
   5738 			return (-1);
   5739 
   5740 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
   5741 	}
   5742 
   5743 	return (0);
   5744 }
   5745 
   5746 /*
   5747  * The passed in packet has this format:
   5748  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
   5749  */
   5750 static boolean_t
   5751 ibd_send(ibd_state_t *state, mblk_t *mp)
   5752 {
   5753 	ibd_ace_t *ace;
   5754 	ibd_swqe_t *node;
   5755 	ipoib_mac_t *dest;
   5756 	ib_header_info_t *ipibp;
   5757 	ip6_t *ip6h;
   5758 	uint_t pktsize;
   5759 	uint32_t mss;
   5760 	uint32_t hckflags;
   5761 	uint32_t lsoflags = 0;
   5762 	uint_t lsohdr_sz = 0;
   5763 	int ret, len;
   5764 	boolean_t dofree = B_FALSE;
   5765 	boolean_t rc;
   5766 	/* if (rc_chan == NULL) send by UD; else send by RC; */
   5767 	ibd_rc_chan_t *rc_chan;
   5768 	int nmblks;
   5769 	mblk_t *nmp;
   5770 
   5771 	/*
   5772 	 * If we aren't done with the device initialization and start,
   5773 	 * we shouldn't be here.
   5774 	 */
   5775 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
   5776 		return (B_FALSE);
   5777 
   5778 	/*
   5779 	 * Obtain an address handle for the destination.
   5780 	 */
   5781 	ipibp = (ib_header_info_t *)mp->b_rptr;
   5782 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
   5783 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
   5784 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
   5785 
   5786 	rc_chan = NULL;
   5787 	ace = ibd_acache_lookup(state, dest, &ret, 1);
   5788 	if (state->id_enable_rc && (ace != NULL) &&
   5789 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
   5790 		if (ace->ac_chan == NULL) {
   5791 			state->rc_null_conn++;
   5792 		} else {
   5793 			if (ace->ac_chan->chan_state ==
   5794 			    IBD_RC_STATE_ACT_ESTAB) {
   5795 				rc_chan = ace->ac_chan;
   5796 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
   5797 				node = WQE_TO_SWQE(
   5798 				    rc_chan->tx_wqe_list.dl_head);
   5799 				if (node != NULL) {
   5800 					rc_chan->tx_wqe_list.dl_cnt -= 1;
   5801 					rc_chan->tx_wqe_list.dl_head =
   5802 					    node->swqe_next;
   5803 				} else {
   5804 					node = ibd_rc_acquire_swqes(rc_chan);
   5805 				}
   5806 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
   5807 
   5808 				if (node == NULL) {
   5809 					state->rc_swqe_short++;
   5810 					mutex_enter(&state->id_sched_lock);
   5811 					state->id_sched_needed |=
   5812 					    IBD_RSRC_RC_SWQE;
   5813 					mutex_exit(&state->id_sched_lock);
   5814 					ibd_dec_ref_ace(state, ace);
   5815 					return (B_FALSE);
   5816 				}
   5817 			} else {
   5818 				state->rc_no_estab_conn++;
   5819 			}
   5820 		}
   5821 	}
   5822 
   5823 	if (rc_chan == NULL) {
   5824 		mutex_enter(&state->id_tx_list.dl_mutex);
   5825 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
   5826 		if (node != NULL) {
   5827 			state->id_tx_list.dl_cnt -= 1;
   5828 			state->id_tx_list.dl_head = node->swqe_next;
   5829 		} else {
   5830 			node = ibd_acquire_swqe(state);
   5831 		}
   5832 		mutex_exit(&state->id_tx_list.dl_mutex);
   5833 		if (node == NULL) {
   5834 			/*
   5835 			 * If we don't have an swqe available, schedule a
   5836 			 * transmit completion queue cleanup and hold off on
   5837 			 * sending more packets until we have some free swqes
   5838 			 */
   5839 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
   5840 				if (ace != NULL) {
   5841 					ibd_dec_ref_ace(state, ace);
   5842 				}
   5843 				return (B_FALSE);
   5844 			}
   5845 
   5846 			/*
   5847 			 * If a poll cannot be scheduled, we have no choice but
   5848 			 * to drop this packet
   5849 			 */
   5850 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
   5851 			if (ace != NULL) {
   5852 				ibd_dec_ref_ace(state, ace);
   5853 			}
   5854 			return (B_TRUE);
   5855 		}
   5856 	}
   5857 
   5858 	/*
   5859 	 * Initialize the commonly used fields in swqe to NULL to protect
   5860 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
   5861 	 * failure.
   5862 	 */
   5863 	node->swqe_im_mblk = NULL;
   5864 	node->w_swr.wr_nds = 0;
   5865 	node->w_swr.wr_sgl = NULL;
   5866 	node->w_swr.wr_opcode = IBT_WRC_SEND;
   5867 
   5868 	/*
   5869 	 * Calculate the size of message data and number of msg blocks
   5870 	 */
   5871 	pktsize = 0;
   5872 	for (nmblks = 0, nmp = mp; nmp != NULL;
   5873 	    nmp = nmp->b_cont, nmblks++) {
   5874 		pktsize += MBLKL(nmp);
   5875 	}
   5876 
   5877 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
   5878 		atomic_inc_64(&state->id_brd_xmt);
   5879 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
   5880 		atomic_inc_64(&state->id_multi_xmt);
   5881 
   5882 	if (ace != NULL) {
   5883 		node->w_ahandle = ace;
   5884 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
   5885 	} else {
   5886 		DPRINT(5,
   5887 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
   5888 		    ((ret == EFAULT) ? "failed" : "queued"),
   5889 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
   5890 		    htonl(dest->ipoib_gidpref[1]),
   5891 		    htonl(dest->ipoib_gidsuff[0]),
   5892 		    htonl(dest->ipoib_gidsuff[1]));
   5893 		state->rc_ace_not_found++;
   5894 		node->w_ahandle = NULL;
   5895 
   5896 		/*
   5897 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
   5898 		 * can not find a path for the specific dest address. We
   5899 		 * should get rid of this kind of packet.  We also should get
   5900 		 * rid of the packet if we cannot schedule a poll via the
   5901 		 * async thread.  For the normal case, ibd will return the
   5902 		 * packet to upper layer and wait for AH creating.
   5903 		 *
   5904 		 * Note that we always queue a work slot entry for the async
   5905 		 * thread when we fail AH lookup (even in intr mode); this is
   5906 		 * due to the convoluted way the code currently looks for AH.
   5907 		 */
   5908 		if (ret == EFAULT) {
   5909 			dofree = B_TRUE;
   5910 			rc = B_TRUE;
   5911 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
   5912 			dofree = B_TRUE;
   5913 			rc = B_TRUE;
   5914 		} else {
   5915 			dofree = B_FALSE;
   5916 			rc = B_FALSE;
   5917 		}
   5918 		goto ibd_send_fail;
   5919 	}
   5920 
   5921 	/*
   5922 	 * For ND6 packets, padding is at the front of the source lladdr.
   5923 	 * Insert the padding at front.
   5924 	 */
   5925 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
   5926 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
   5927 			if (!pullupmsg(mp, IPV6_HDR_LEN +
   5928 			    sizeof (ib_header_info_t))) {
   5929 				DPRINT(10, "ibd_send: pullupmsg failure ");
   5930 				dofree = B_TRUE;
   5931 				rc = B_TRUE;
   5932 				goto ibd_send_fail;
   5933 			}
   5934 			ipibp = (ib_header_info_t *)mp->b_rptr;
   5935 		}
   5936 		ip6h = (ip6_t *)((uchar_t *)ipibp +
   5937 		    sizeof (ib_header_info_t));
   5938 		len = ntohs(ip6h->ip6_plen);
   5939 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
   5940 			mblk_t	*pad;
   5941 
   5942 			pad = allocb(4, 0);
   5943 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
   5944 			linkb(mp, pad);
   5945 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
   5946 			    IPV6_HDR_LEN + len + 4) {
   5947 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
   5948 				    IPV6_HDR_LEN + len + 4)) {
   5949 					DPRINT(10, "ibd_send: pullupmsg "
   5950 					    "failure ");
   5951 					dofree = B_TRUE;
   5952 					rc = B_TRUE;
   5953 					goto ibd_send_fail;
   5954 				}
   5955 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
   5956 				    sizeof (ib_header_info_t));
   5957 			}
   5958 
   5959 			/* LINTED: E_CONSTANT_CONDITION */
   5960 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
   5961 		}
   5962 	}
   5963 
   5964 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
   5965 	mp->b_rptr += sizeof (ib_addrs_t);
   5966 	pktsize -= sizeof (ib_addrs_t);
   5967 
   5968 	if (rc_chan) {	/* send in RC mode */
   5969 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
   5970 		ibt_iov_attr_t iov_attr;
   5971 		uint_t		i;
   5972 		size_t	blksize;
   5973 		uchar_t *bufp;
   5974 		ibd_rc_tx_largebuf_t *lbufp;
   5975 
   5976 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
   5977 
   5978 		/*
   5979 		 * Upper layer does Tx checksum, we don't need do any
   5980 		 * checksum here.
   5981 		 */
   5982 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
   5983 
   5984 		/*
   5985 		 * We only do ibt_map_mem_iov() if the pktsize is above
   5986 		 * the "copy-threshold", and if the number of mp
   5987 		 * fragments is less than the maximum acceptable.
   5988 		 */
   5989 		if (pktsize <= ibd_rc_tx_copy_thresh) {
   5990 			atomic_inc_64(&state->rc_xmt_small_pkt);
   5991 			/*
   5992 			 * Only process unicast packet in Reliable Connected
   5993 			 * mode.
   5994 			 */
   5995 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
   5996 			node->w_swr.wr_nds = 1;
   5997 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
   5998 			node->w_buftype = IBD_WQE_TXBUF;
   5999 
   6000 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
   6001 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
   6002 				blksize = MBLKL(nmp);
   6003 				bcopy(nmp->b_rptr, bufp, blksize);
   6004 				bufp += blksize;
   6005 			}
   6006 			freemsg(mp);
   6007 			ASSERT(node->swqe_im_mblk == NULL);
   6008 		} else {
   6009 			if ((state->rc_enable_iov_map) &&
   6010 			    (nmblks < state->rc_max_sqseg_hiwm)) {
   6011 
   6012 				/* do ibt_map_mem_iov() */
   6013 				iov_attr.iov_as = NULL;
   6014 				iov_attr.iov = iov_arr;
   6015 				iov_attr.iov_buf = NULL;
   6016 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
   6017 				iov_attr.iov_lso_hdr_sz = 0;
   6018 				iov_attr.iov_flags = IBT_IOV_SLEEP;
   6019 
   6020 				i = 0;
   6021 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
   6022 					iov_arr[i].iov_len = MBLKL(nmp);
   6023 					if (iov_arr[i].iov_len != 0) {
   6024 						iov_arr[i].iov_addr = (caddr_t)
   6025 						    (void *)nmp->b_rptr;
   6026 						i++;
   6027 					}
   6028 				}
   6029 				iov_attr.iov_list_len = i;
   6030 				node->w_swr.wr_sgl = node->w_sgl;
   6031 
   6032 				ret = ibt_map_mem_iov(state->id_hca_hdl,
   6033 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
   6034 				    &node->w_mi_hdl);
   6035 				if (ret != IBT_SUCCESS) {
   6036 					atomic_inc_64(
   6037 					    &state->rc_xmt_map_fail_pkt);
   6038 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
   6039 					    ") failed, nmblks=%d, real_nmblks"
   6040 					    "=%d, ret=0x%x", nmblks, i, ret);
   6041 					goto ibd_rc_large_copy;
   6042 				}
   6043 
   6044 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
   6045 				node->w_buftype = IBD_WQE_MAPPED;
   6046 				node->swqe_im_mblk = mp;
   6047 			} else {
   6048 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
   6049 ibd_rc_large_copy:
   6050 				mutex_enter(&state->rc_tx_large_bufs_lock);
   6051 				if (state->rc_tx_largebuf_nfree == 0) {
   6052 					state->rc_xmt_buf_short++;
   6053 					mutex_exit
   6054 					    (&state->rc_tx_large_bufs_lock);
   6055 					mutex_enter(&state->id_sched_lock);
   6056 					state->id_sched_needed |=
   6057 					    IBD_RSRC_RC_TX_LARGEBUF;
   6058 					mutex_exit(&state->id_sched_lock);
   6059 					dofree = B_FALSE;
   6060 					rc = B_FALSE;
   6061 					/*
   6062 					 * If we don't have Tx large bufs,
   6063 					 * return failure. node->w_buftype
   6064 					 * should not be IBD_WQE_RC_COPYBUF,
   6065 					 * otherwise it will cause problem
   6066 					 * in ibd_rc_tx_cleanup()
   6067 					 */
   6068 					node->w_buftype = IBD_WQE_TXBUF;
   6069 					goto ibd_send_fail;
   6070 				}
   6071 
   6072 				lbufp = state->rc_tx_largebuf_free_head;
   6073 				ASSERT(lbufp->lb_buf != NULL);
   6074 				state->rc_tx_largebuf_free_head =
   6075 				    lbufp->lb_next;
   6076 				lbufp->lb_next = NULL;
   6077 				/* Update nfree count */
   6078 				state->rc_tx_largebuf_nfree --;
   6079 				mutex_exit(&state->rc_tx_large_bufs_lock);
   6080 				bufp = lbufp->lb_buf;
   6081 				node->w_sgl[0].ds_va =
   6082 				    (ib_vaddr_t)(uintptr_t)bufp;
   6083 				node->w_sgl[0].ds_key =
   6084 				    state->rc_tx_mr_desc.md_lkey;
   6085 				node->w_sgl[0].ds_len = pktsize;
   6086 				node->w_swr.wr_sgl = node->w_sgl;
   6087 				node->w_swr.wr_nds = 1;
   6088 				node->w_buftype = IBD_WQE_RC_COPYBUF;
   6089 				node->w_rc_tx_largebuf = lbufp;
   6090 
   6091 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
   6092 					blksize = MBLKL(nmp);
   6093 					if (blksize != 0) {
   6094 						bcopy(nmp->b_rptr, bufp,
   6095 						    blksize);
   6096 						bufp += blksize;
   6097 					}
   6098 				}
   6099 				freemsg(mp);
   6100 				ASSERT(node->swqe_im_mblk == NULL);
   6101 			}
   6102 		}
   6103 
   6104 		node->swqe_next = NULL;
   6105 		mutex_enter(&rc_chan->tx_post_lock);
   6106 		if (rc_chan->tx_busy) {
   6107 			if (rc_chan->tx_head) {
   6108 				rc_chan->tx_tail->swqe_next =
   6109 				    SWQE_TO_WQE(node);
   6110 			} else {
   6111 				rc_chan->tx_head = node;
   6112 			}
   6113 			rc_chan->tx_tail = node;
   6114 			mutex_exit(&rc_chan->tx_post_lock);
   6115 		} else {
   6116 			rc_chan->tx_busy = 1;
   6117 			mutex_exit(&rc_chan->tx_post_lock);
   6118 			ibd_rc_post_send(rc_chan, node);
   6119 		}
   6120 
   6121 		return (B_TRUE);
   6122 	} /* send by RC */
   6123 
   6124 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
   6125 		/*
   6126 		 * Too long pktsize. The packet size from GLD should <=
   6127 		 * state->id_mtu + sizeof (ib_addrs_t)
   6128 		 */
   6129 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
   6130 			ibd_req_t *req;
   6131 
   6132 			mutex_enter(&ace->tx_too_big_mutex);
   6133 			if (ace->tx_too_big_ongoing) {
   6134 				mutex_exit(&ace->tx_too_big_mutex);
   6135 				state->rc_xmt_reenter_too_long_pkt++;
   6136 				dofree = B_TRUE;
   6137 			} else {
   6138 				ace->tx_too_big_ongoing = B_TRUE;
   6139 				mutex_exit(&ace->tx_too_big_mutex);
   6140 				state->rc_xmt_icmp_too_long_pkt++;
   6141 
   6142 				req = kmem_cache_alloc(state->id_req_kmc,
   6143 				    KM_NOSLEEP);
   6144 				if (req == NULL) {
   6145 					ibd_print_warn(state, "ibd_send: alloc "
   6146 					    "ibd_req_t fail");
   6147 					/* Drop it. */
   6148 					dofree = B_TRUE;
   6149 				} else {
   6150 					req->rq_ptr = mp;
   6151 					req->rq_ptr2 = ace;
   6152 					ibd_queue_work_slot(state, req,
   6153 					    IBD_ASYNC_RC_TOO_BIG);
   6154 					dofree = B_FALSE;
   6155 				}
   6156 			}
   6157 		} else {
   6158 			ibd_print_warn(state, "Reliable Connected mode is on. "
   6159 			    "Multicast packet length %d > %d is too long to "
   6160 			    "send packet (%d > %d), drop it",
   6161 			    pktsize, state->id_mtu);
   6162 			state->rc_xmt_drop_too_long_pkt++;
   6163 			/* Drop it. */
   6164 			dofree = B_TRUE;
   6165 		}
   6166 		rc = B_TRUE;
   6167 		goto ibd_send_fail;
   6168 	}
   6169 
   6170 	atomic_add_64(&state->id_xmt_bytes, pktsize);
   6171 	atomic_inc_64(&state->id_xmt_pkt);
   6172 
   6173 	/*
   6174 	 * Do LSO and checksum related work here.  For LSO send, adjust the
   6175 	 * ud destination, the opcode and the LSO header information to the
   6176 	 * work request.
   6177 	 */
   6178 	lso_info_get(mp, &mss, &lsoflags);
   6179 	if ((lsoflags & HW_LSO) != HW_LSO) {
   6180 		node->w_swr.wr_opcode = IBT_WRC_SEND;
   6181 		lsohdr_sz = 0;
   6182 	} else {
   6183 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
   6184 			/*
   6185 			 * The routine can only fail if there's no memory; we
   6186 			 * can only drop the packet if this happens
   6187 			 */
   6188 			ibd_print_warn(state,
   6189 			    "ibd_send: no memory, lso posting failed");
   6190 			dofree = B_TRUE;
   6191 			rc = B_TRUE;
   6192 			goto ibd_send_fail;
   6193 		}
   6194 
   6195 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
   6196 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
   6197 	}
   6198 
   6199 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
   6200 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
   6201 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
   6202 	else
   6203 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
   6204 
   6205 	/*
   6206 	 * Prepare the sgl for posting; the routine can only fail if there's
   6207 	 * no lso buf available for posting. If this is the case, we should
   6208 	 * probably resched for lso bufs to become available and then try again.
   6209 	 */
   6210 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
   6211 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
   6212 			dofree = B_TRUE;
   6213 			rc = B_TRUE;
   6214 		} else {
   6215 			dofree = B_FALSE;
   6216 			rc = B_FALSE;
   6217 		}
   6218 		goto ibd_send_fail;
   6219 	}
   6220 	node->swqe_im_mblk = mp;
   6221 
   6222 	/*
   6223 	 * Queue the wqe to hardware; since we can now simply queue a
   6224 	 * post instead of doing it serially, we cannot assume anything
   6225 	 * about the 'node' after ibd_post_send() returns.
   6226 	 */
   6227 	node->swqe_next = NULL;
   6228 
   6229 	mutex_enter(&state->id_txpost_lock);
   6230 	if (state->id_tx_busy) {
   6231 		if (state->id_tx_head) {
   6232 			state->id_tx_tail->swqe_next =
   6233 			    SWQE_TO_WQE(node);
   6234 		} else {
   6235 			state->id_tx_head = node;
   6236 		}
   6237 		state->id_tx_tail = node;
   6238 		mutex_exit(&state->id_txpost_lock);
   6239 	} else {
   6240 		state->id_tx_busy = 1;
   6241 		mutex_exit(&state->id_txpost_lock);
   6242 		ibd_post_send(state, node);
   6243 	}
   6244 
   6245 	return (B_TRUE);
   6246 
   6247 ibd_send_fail:
   6248 	if (node && mp)
   6249 		ibd_free_lsohdr(node, mp);
   6250 
   6251 	if (dofree)
   6252 		freemsg(mp);
   6253 
   6254 	if (node != NULL) {
   6255 		if (rc_chan) {
   6256 			ibd_rc_tx_cleanup(node);
   6257 		} else {
   6258 			ibd_tx_cleanup(state, node);
   6259 		}
   6260 	}
   6261 
   6262 	return (rc);
   6263 }
   6264 
   6265 /*
   6266  * GLDv3 entry point for transmitting datagram.
   6267  */
   6268 static mblk_t *
   6269 ibd_m_tx(void *arg, mblk_t *mp)
   6270 {
   6271 	ibd_state_t *state = (ibd_state_t *)arg;
   6272 	mblk_t *next;
   6273 
   6274 	if (state->id_link_state != LINK_STATE_UP) {
   6275 		freemsgchain(mp);
   6276 		mp = NULL;
   6277 	}
   6278 
   6279 	while (mp != NULL) {
   6280 		next = mp->b_next;
   6281 		mp->b_next = NULL;
   6282 		if (ibd_send(state, mp) == B_FALSE) {
   6283 			/* Send fail */
   6284 			mp->b_next = next;
   6285 			break;
   6286 		}
   6287 		mp = next;
   6288 	}
   6289 
   6290 	return (mp);
   6291 }
   6292 
   6293 /*
   6294  * this handles Tx and Rx completions. With separate CQs, this handles
   6295  * only Rx completions.
   6296  */
   6297 static uint_t
   6298 ibd_intr(caddr_t arg)
   6299 {
   6300 	ibd_state_t *state = (ibd_state_t *)arg;
   6301 
   6302 	ibd_poll_rcq(state, state->id_rcq_hdl);
   6303 
   6304 	return (DDI_INTR_CLAIMED);
   6305 }
   6306 
   6307 /*
   6308  * Poll and fully drain the send cq
   6309  */
   6310 static void
   6311 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
   6312 {
   6313 	ibt_wc_t *wcs = state->id_txwcs;
   6314 	uint_t numwcs = state->id_txwcs_size;
   6315 	ibd_wqe_t *wqe;
   6316 	ibd_swqe_t *head, *tail;
   6317 	ibt_wc_t *wc;
   6318 	uint_t num_polled;
   6319 	int i;
   6320 
   6321 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
   6322 		head = tail = NULL;
   6323 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
   6324 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
   6325 			if (wc->wc_status != IBT_WC_SUCCESS) {
   6326 				/*
   6327 				 * Channel being torn down.
   6328 				 */
   6329 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
   6330 					DPRINT(5, "ibd_drain_scq: flush error");
   6331 					DPRINT(10, "ibd_drain_scq: Bad "
   6332 					    "status %d", wc->wc_status);
   6333 				} else {
   6334 					DPRINT(10, "ibd_drain_scq: "
   6335 					    "unexpected wc_status %d",
   6336 					    wc->wc_status);
   6337 				}
   6338 				/*
   6339 				 * Fallthrough to invoke the Tx handler to
   6340 				 * release held resources, e.g., AH refcount.
   6341 				 */
   6342 			}
   6343 			/*
   6344 			 * Add this swqe to the list to be cleaned up.
   6345 			 */
   6346 			if (head)
   6347 				tail->swqe_next = wqe;
   6348 			else
   6349 				head = WQE_TO_SWQE(wqe);
   6350 			tail = WQE_TO_SWQE(wqe);
   6351 		}
   6352 		tail->swqe_next = NULL;
   6353 		ibd_tx_cleanup_list(state, head, tail);
   6354 
   6355 		/*
   6356 		 * Resume any blocked transmissions if possible
   6357 		 */
   6358 		ibd_resume_transmission(state);
   6359 	}
   6360 }
   6361 
   6362 /*
   6363  * Poll and fully drain the receive cq
   6364  */
   6365 static void
   6366 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
   6367 {
   6368 	ibt_wc_t *wcs = state->id_rxwcs;
   6369 	uint_t numwcs = state->id_rxwcs_size;
   6370 	ibd_rwqe_t *rwqe;
   6371 	ibt_wc_t *wc;
   6372 	uint_t num_polled;
   6373 	int i;
   6374 	mblk_t *head, *tail, *mp;
   6375 
   6376 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
   6377 		head = tail = NULL;
   6378 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
   6379 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
   6380 			if (wc->wc_status != IBT_WC_SUCCESS) {
   6381 				/*
   6382 				 * Channel being torn down.
   6383 				 */
   6384 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
   6385 					DPRINT(5, "ibd_drain_rcq: "
   6386 					    "expected flushed rwqe");
   6387 				} else {
   6388 					DPRINT(5, "ibd_drain_rcq: "
   6389 					    "unexpected wc_status %d",
   6390 					    wc->wc_status);
   6391 				}
   6392 				atomic_inc_32(
   6393 				    &state->id_rx_list.dl_bufs_outstanding);
   6394 				freemsg(rwqe->rwqe_im_mblk);
   6395 				continue;
   6396 			}
   6397 			mp = ibd_process_rx(state, rwqe, wc);
   6398 			if (mp == NULL)
   6399 				continue;
   6400 
   6401 			/*
   6402 			 * Add this mp to the list to send to the nw layer.
   6403 			 */
   6404 			if (head)
   6405 				tail->b_next = mp;
   6406 			else
   6407 				head = mp;
   6408 			tail = mp;
   6409 		}
   6410 		if (head)
   6411 			mac_rx(state->id_mh, state->id_rh, head);
   6412 
   6413 		/*
   6414 		 * Account for #rwqes polled.
   6415 		 * Post more here, if less than one fourth full.
   6416 		 */
   6417 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
   6418 		    (state->id_num_rwqe / 4))
   6419 			ibd_post_recv_intr(state);
   6420 	}
   6421 }
   6422 
   6423 /*
   6424  * Common code for interrupt handling as well as for polling
   6425  * for all completed wqe's while detaching.
   6426  */
   6427 static void
   6428 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
   6429 {
   6430 	int flag, redo_flag;
   6431 	int redo = 1;
   6432 
   6433 	flag = IBD_CQ_POLLING;
   6434 	redo_flag = IBD_REDO_CQ_POLLING;
   6435 
   6436 	mutex_enter(&state->id_scq_poll_lock);
   6437 	if (state->id_scq_poll_busy & flag) {
   6438 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
   6439 		state->id_scq_poll_busy |= redo_flag;
   6440 		mutex_exit(&state->id_scq_poll_lock);
   6441 		return;
   6442 	}
   6443 	state->id_scq_poll_busy |= flag;
   6444 	mutex_exit(&state->id_scq_poll_lock);
   6445 
   6446 	/*
   6447 	 * In some cases (eg detaching), this code can be invoked on
   6448 	 * any cpu after disabling cq notification (thus no concurrency
   6449 	 * exists). Apart from that, the following applies normally:
   6450 	 * Transmit completion handling could be from any cpu if
   6451 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
   6452 	 * is interrupt driven.
   6453 	 */
   6454 
   6455 	/*
   6456 	 * Poll and drain the CQ
   6457 	 */
   6458 	ibd_drain_scq(state, cq_hdl);
   6459 
   6460 	/*
   6461 	 * Enable CQ notifications and redrain the cq to catch any
   6462 	 * completions we might have missed after the ibd_drain_scq()
   6463 	 * above and before the ibt_enable_cq_notify() that follows.
   6464 	 * Finally, service any new requests to poll the cq that
   6465 	 * could've come in after the ibt_enable_cq_notify().
   6466 	 */
   6467 	do {
   6468 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
   6469 		    IBT_SUCCESS) {
   6470 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
   6471 		}
   6472 
   6473 		ibd_drain_scq(state, cq_hdl);
   6474 
   6475 		mutex_enter(&state->id_scq_poll_lock);
   6476 		if (state->id_scq_poll_busy & redo_flag)
   6477 			state->id_scq_poll_busy &= ~redo_flag;
   6478 		else {
   6479 			state->id_scq_poll_busy &= ~flag;
   6480 			redo = 0;
   6481 		}
   6482 		mutex_exit(&state->id_scq_poll_lock);
   6483 
   6484 	} while (redo);
   6485 }
   6486 
   6487 /*
   6488  * Common code for interrupt handling as well as for polling
   6489  * for all completed wqe's while detaching.
   6490  */
   6491 static void
   6492 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
   6493 {
   6494 	int flag, redo_flag;
   6495 	int redo = 1;
   6496 
   6497 	flag = IBD_CQ_POLLING;
   6498 	redo_flag = IBD_REDO_CQ_POLLING;
   6499 
   6500 	mutex_enter(&state->id_rcq_poll_lock);
   6501 	if (state->id_rcq_poll_busy & flag) {
   6502 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
   6503 		state->id_rcq_poll_busy |= redo_flag;
   6504 		mutex_exit(&state->id_rcq_poll_lock);
   6505 		return;
   6506 	}
   6507 	state->id_rcq_poll_busy |= flag;
   6508 	mutex_exit(&state->id_rcq_poll_lock);
   6509 
   6510 	/*
   6511 	 * Poll and drain the CQ
   6512 	 */
   6513 	ibd_drain_rcq(state, rcq);
   6514 
   6515 	/*
   6516 	 * Enable CQ notifications and redrain the cq to catch any
   6517 	 * completions we might have missed after the ibd_drain_cq()
   6518 	 * above and before the ibt_enable_cq_notify() that follows.
   6519 	 * Finally, service any new requests to poll the cq that
   6520 	 * could've come in after the ibt_enable_cq_notify().
   6521 	 */
   6522 	do {
   6523 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
   6524 		    IBT_SUCCESS) {
   6525 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
   6526 		}
   6527 
   6528 		ibd_drain_rcq(state, rcq);
   6529 
   6530 		mutex_enter(&state->id_rcq_poll_lock);
   6531 		if (state->id_rcq_poll_busy & redo_flag)
   6532 			state->id_rcq_poll_busy &= ~redo_flag;
   6533 		else {
   6534 			state->id_rcq_poll_busy &= ~flag;
   6535 			redo = 0;
   6536 		}
   6537 		mutex_exit(&state->id_rcq_poll_lock);
   6538 
   6539 	} while (redo);
   6540 }
   6541 
   6542 /*
   6543  * Unmap the memory area associated with a given swqe.
   6544  */
   6545 void
   6546 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
   6547 {
   6548 	ibt_status_t stat;
   6549 
   6550 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
   6551 
   6552 	if (swqe->w_mi_hdl) {
   6553 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
   6554 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
   6555 			DPRINT(10,
   6556 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
   6557 		}
   6558 		swqe->w_mi_hdl = NULL;
   6559 	}
   6560 	swqe->w_swr.wr_nds = 0;
   6561 }
   6562 
   6563 void
   6564 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
   6565 {
   6566 	/*
   6567 	 * The recycling logic can be eliminated from here
   6568 	 * and put into the async thread if we create another
   6569 	 * list to hold ACE's for unjoined mcg's.
   6570 	 */
   6571 	if (DEC_REF_DO_CYCLE(ace)) {
   6572 		ibd_mce_t *mce;
   6573 
   6574 		/*
   6575 		 * Check with the lock taken: we decremented
   6576 		 * reference count without the lock, and some
   6577 		 * transmitter might already have bumped the
   6578 		 * reference count (possible in case of multicast
   6579 		 * disable when we leave the AH on the active
   6580 		 * list). If not still 0, get out, leaving the
   6581 		 * recycle bit intact.
   6582 		 *
   6583 		 * Atomically transition the AH from active
   6584 		 * to free list, and queue a work request to
   6585 		 * leave the group and destroy the mce. No
   6586 		 * transmitter can be looking at the AH or
   6587 		 * the MCE in between, since we have the
   6588 		 * ac_mutex lock. In the SendOnly reap case,
   6589 		 * it is not necessary to hold the ac_mutex
   6590 		 * and recheck the ref count (since the AH was
   6591 		 * taken off the active list), we just do it
   6592 		 * to have uniform processing with the Full
   6593 		 * reap case.
   6594 		 */
   6595 		mutex_enter(&state->id_ac_mutex);
   6596 		mce = ace->ac_mce;
   6597 		if (GET_REF_CYCLE(ace) == 0) {
   6598 			CLEAR_REFCYCLE(ace);
   6599 			/*
   6600 			 * Identify the case of fullmember reap as
   6601 			 * opposed to mcg trap reap. Also, port up
   6602 			 * might set ac_mce to NULL to indicate Tx
   6603 			 * cleanup should do no more than put the
   6604 			 * AH in the free list (see ibd_async_link).
   6605 			 */
   6606 			if (mce != NULL) {
   6607 				ace->ac_mce = NULL;
   6608 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
   6609 				/*
   6610 				 * mc_req was initialized at mce
   6611 				 * creation time.
   6612 				 */
   6613 				ibd_queue_work_slot(state,
   6614 				    &mce->mc_req, IBD_ASYNC_REAP);
   6615 			}
   6616 			IBD_ACACHE_INSERT_FREE(state, ace);
   6617 		}
   6618 		mutex_exit(&state->id_ac_mutex);
   6619 	}
   6620 }
   6621 
   6622 /*
   6623  * Common code that deals with clean ups after a successful or
   6624  * erroneous transmission attempt.
   6625  */
   6626 static void
   6627 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
   6628 {
   6629 	ibd_ace_t *ace = swqe->w_ahandle;
   6630 
   6631 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
   6632 
   6633 	/*
   6634 	 * If this was a dynamic mapping in ibd_send(), we need to
   6635 	 * unmap here. If this was an lso buffer we'd used for sending,
   6636 	 * we need to release the lso buf to the pool, since the resource
   6637 	 * is scarce. However, if this was simply a normal send using
   6638 	 * the copybuf (present in each swqe), we don't need to release it.
   6639 	 */
   6640 	if (swqe->swqe_im_mblk != NULL) {
   6641 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
   6642 			ibd_unmap_mem(state, swqe);
   6643 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
   6644 			ibd_release_lsobufs(state,
   6645 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
   6646 		}
   6647 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
   6648 		freemsg(swqe->swqe_im_mblk);
   6649 		swqe->swqe_im_mblk = NULL;
   6650 	}
   6651 
   6652 	/*
   6653 	 * Drop the reference count on the AH; it can be reused
   6654 	 * now for a different destination if there are no more
   6655 	 * posted sends that will use it. This can be eliminated
   6656 	 * if we can always associate each Tx buffer with an AH.
   6657 	 * The ace can be null if we are cleaning up from the
   6658 	 * ibd_send() error path.
   6659 	 */
   6660 	if (ace != NULL) {
   6661 		ibd_dec_ref_ace(state, ace);
   6662 	}
   6663 
   6664 	/*
   6665 	 * Release the send wqe for reuse.
   6666 	 */
   6667 	swqe->swqe_next = NULL;
   6668 	ibd_release_swqe(state, swqe, swqe, 1);
   6669 }
   6670 
   6671 static void
   6672 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
   6673 {
   6674 	ibd_ace_t *ace;
   6675 	ibd_swqe_t *swqe;
   6676 	int n = 0;
   6677 
   6678 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
   6679 
   6680 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
   6681 
   6682 		/*
   6683 		 * If this was a dynamic mapping in ibd_send(), we need to
   6684 		 * unmap here. If this was an lso buffer we'd used for sending,
   6685 		 * we need to release the lso buf to the pool, since the
   6686 		 * resource is scarce. However, if this was simply a normal
   6687 		 * send using the copybuf (present in each swqe), we don't need
   6688 		 * to release it.
   6689 		 */
   6690 		if (swqe->swqe_im_mblk != NULL) {
   6691 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
   6692 				ibd_unmap_mem(state, swqe);
   6693 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
   6694 				ibd_release_lsobufs(state,
   6695 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
   6696 			}
   6697 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
   6698 			freemsg(swqe->swqe_im_mblk);
   6699 			swqe->swqe_im_mblk = NULL;
   6700 		}
   6701 
   6702 		/*
   6703 		 * Drop the reference count on the AH; it can be reused
   6704 		 * now for a different destination if there are no more
   6705 		 * posted sends that will use it. This can be eliminated
   6706 		 * if we can always associate each Tx buffer with an AH.
   6707 		 * The ace can be null if we are cleaning up from the
   6708 		 * ibd_send() error path.
   6709 		 */
   6710 		ace = swqe->w_ahandle;
   6711 		if (ace != NULL) {
   6712 			ibd_dec_ref_ace(state, ace);
   6713 		}
   6714 		n++;
   6715 	}
   6716 
   6717 	/*
   6718 	 * Release the send wqes for reuse.
   6719 	 */
   6720 	ibd_release_swqe(state, head, tail, n);
   6721 }
   6722 
   6723 /*
   6724  * Processing to be done after receipt of a packet; hand off to GLD
   6725  * in the format expected by GLD.  The received packet has this
   6726  * format: 2b sap :: 00 :: data.
   6727  */
   6728 static mblk_t *
   6729 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
   6730 {
   6731 	ib_header_info_t *phdr;
   6732 	mblk_t *mp;
   6733 	ipoib_hdr_t *ipibp;
   6734 	ipha_t *iphap;
   6735 	ip6_t *ip6h;
   6736 	int len;
   6737 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
   6738 	uint32_t bufs;
   6739 
   6740 	/*
   6741 	 * Track number handed to upper layer that need to be returned.
   6742 	 */
   6743 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
   6744 
   6745 	/* Never run out of rwqes, use allocb when running low */
   6746 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
   6747 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
   6748 		atomic_inc_32(&state->id_rx_allocb);
   6749 		mp = allocb(pkt_len, BPRI_HI);
   6750 		if (mp) {
   6751 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
   6752 			ibd_post_recv(state, rwqe);
   6753 		} else {	/* no memory */
   6754 			atomic_inc_32(&state->id_rx_allocb_failed);
   6755 			ibd_post_recv(state, rwqe);
   6756 			return (NULL);
   6757 		}
   6758 	} else {
   6759 		mp = rwqe->rwqe_im_mblk;
   6760 	}
   6761 
   6762 
   6763 	/*
   6764 	 * Adjust write pointer depending on how much data came in.
   6765 	 */
   6766 	mp->b_wptr = mp->b_rptr + pkt_len;
   6767 
   6768 	/*
   6769 	 * Make sure this is NULL or we're in trouble.
   6770 	 */
   6771 	if (mp->b_next != NULL) {
   6772 		ibd_print_warn(state,
   6773 		    "ibd_process_rx: got duplicate mp from rcq?");
   6774 		mp->b_next = NULL;
   6775 	}
   6776 
   6777 	/*
   6778 	 * the IB link will deliver one of the IB link layer
   6779 	 * headers called, the Global Routing Header (GRH).
   6780 	 * ibd driver uses the information in GRH to build the
   6781 	 * Header_info structure and pass it with the datagram up
   6782 	 * to GLDv3.
   6783 	 * If the GRH is not valid, indicate to GLDv3 by setting
   6784 	 * the VerTcFlow field to 0.
   6785 	 */
   6786 	phdr = (ib_header_info_t *)mp->b_rptr;
   6787 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
   6788 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
   6789 
   6790 		/* if it is loop back packet, just drop it. */
   6791 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
   6792 		    IPOIB_ADDRL) == 0) {
   6793 			freemsg(mp);
   6794 			return (NULL);
   6795 		}
   6796 
   6797 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
   6798 		    sizeof (ipoib_mac_t));
   6799 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
   6800 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
   6801 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
   6802 		} else {
   6803 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
   6804 		}
   6805 	} else {
   6806 		/*
   6807 		 * It can not be a IBA multicast packet. Must have been
   6808 		 * unicast for us. Just copy the interface address to dst.
   6809 		 */
   6810 		phdr->ib_grh.ipoib_vertcflow = 0;
   6811 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
   6812 		    sizeof (ipoib_mac_t));
   6813 	}
   6814 
   6815 	/*
   6816 	 * For ND6 packets, padding is at the front of the source/target
   6817 	 * lladdr. However the inet6 layer is not aware of it, hence remove
   6818 	 * the padding from such packets.
   6819 	 */
   6820 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
   6821 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
   6822 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
   6823 		len = ntohs(ip6h->ip6_plen);
   6824 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
   6825 			/* LINTED: E_CONSTANT_CONDITION */
   6826 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
   6827 		}
   6828 	}
   6829 
   6830 	/*
   6831 	 * Update statistics
   6832 	 */
   6833 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
   6834 	atomic_inc_64(&state->id_rcv_pkt);
   6835 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
   6836 		atomic_inc_64(&state->id_brd_rcv);
   6837 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
   6838 		atomic_inc_64(&state->id_multi_rcv);
   6839 
   6840 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
   6841 	/*
   6842 	 * Set receive checksum status in mp
   6843 	 * Hardware checksumming can be considered valid only if:
   6844 	 * 1. CQE.IP_OK bit is set
   6845 	 * 2. CQE.CKSUM = 0xffff
   6846 	 * 3. IPv6 routing header is not present in the packet
   6847 	 * 4. If there are no IP_OPTIONS in the IP HEADER
   6848 	 */
   6849 
   6850 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
   6851 	    (wc->wc_cksum == 0xFFFF) &&
   6852 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
   6853 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
   6854 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
   6855 	}
   6856 
   6857 	return (mp);
   6858 }
   6859 
   6860 /*
   6861  * Callback code invoked from STREAMs when the receive data buffer is
   6862  * free for recycling.
   6863  */
   6864 static void
   6865 ibd_freemsg_cb(char *arg)
   6866 {
   6867 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
   6868 	ibd_state_t *state = rwqe->w_state;
   6869 
   6870 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
   6871 
   6872 	/*
   6873 	 * If the driver is stopped, just free the rwqe.
   6874 	 */
   6875 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
   6876 		DPRINT(6, "ibd_freemsg: wqe being freed");
   6877 		rwqe->rwqe_im_mblk = NULL;
   6878 		ibd_free_rwqe(state, rwqe);
   6879 		return;
   6880 	}
   6881 
   6882 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
   6883 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
   6884 	if (rwqe->rwqe_im_mblk == NULL) {
   6885 		ibd_free_rwqe(state, rwqe);
   6886 		DPRINT(6, "ibd_freemsg: desballoc failed");
   6887 		return;
   6888 	}
   6889 
   6890 	ibd_post_recv(state, rwqe);
   6891 }
   6892 
   6893 static uint_t
   6894 ibd_tx_recycle(caddr_t arg)
   6895 {
   6896 	ibd_state_t *state = (ibd_state_t *)arg;
   6897 
   6898 	/*
   6899 	 * Poll for completed entries
   6900 	 */
   6901 	ibd_poll_scq(state, state->id_scq_hdl);
   6902 
   6903 	return (DDI_INTR_CLAIMED);
   6904 }
   6905 
   6906 #ifdef IBD_LOGGING
   6907 static void
   6908 ibd_log_init(void)
   6909 {
   6910 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
   6911 	ibd_lbuf_ndx = 0;
   6912 
   6913 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
   6914 }
   6915 
   6916 static void
   6917 ibd_log_fini(void)
   6918 {
   6919 	if (ibd_lbuf)
   6920 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
   6921 	ibd_lbuf_ndx = 0;
   6922 	ibd_lbuf = NULL;
   6923 
   6924 	mutex_destroy(&ibd_lbuf_lock);
   6925 }
   6926 
   6927 static void
   6928 ibd_log(const char *fmt, ...)
   6929 {
   6930 	va_list	ap;
   6931 	uint32_t off;
   6932 	uint32_t msglen;
   6933 	char tmpbuf[IBD_DMAX_LINE];
   6934 
   6935 	if (ibd_lbuf == NULL)
   6936 		return;
   6937 
   6938 	va_start(ap, fmt);
   6939 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
   6940 	va_end(ap);
   6941 
   6942 	if (msglen >= IBD_DMAX_LINE)
   6943 		msglen = IBD_DMAX_LINE - 1;
   6944 
   6945 	mutex_enter(&ibd_lbuf_lock);
   6946 
   6947 	off = ibd_lbuf_ndx;		/* current msg should go here */
   6948 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
   6949 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
   6950 
   6951 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
   6952 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
   6953 
   6954 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
   6955 		ibd_lbuf_ndx = 0;
   6956 
   6957 	mutex_exit(&ibd_lbuf_lock);
   6958 
   6959 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
   6960 }
   6961 #endif
   6962