1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the License). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/CDDL.txt 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/CDDL.txt. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets [] replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * RSMRDT driver for RAC 27 * 28 */ 29 30 #pragma ident "@(#)rsmrdt.c 1.70 08/05/20 SMI" 31 32 const char opsrsm_version[] = "@(#)rsmrdt.c 1.70 08/05/20 SMI"; 33 34 #include <sys/types.h> 35 #include <sys/errno.h> 36 #include <sys/debug.h> 37 #include <sys/stropts.h> 38 #include <sys/stream.h> 39 #include <sys/strlog.h> 40 #include <sys/cmn_err.h> 41 #include <sys/kmem.h> 42 #include <sys/conf.h> 43 #include <sys/stat.h> 44 #include <sys/dlpi.h> 45 #include <sys/modctl.h> 46 #include <sys/kstat.h> 47 #include <sys/ddi.h> 48 #include <sys/sunddi.h> 49 #include <sys/strsun.h> 50 #include <sys/taskq.h> 51 #include <sys/open.h> 52 #include <sys/uio.h> 53 #include <sys/cpuvar.h> 54 #include <sys/atomic.h> 55 56 #include <sys/rsm/rsm_common.h> 57 #include <sys/rsm/rsmpi.h> 58 59 #include "rsmrdt.h" /* This driver's data structures */ 60 61 /* inter-module dependencies */ 62 char _depends_on[] = "misc/rsmops"; 63 64 /* 65 * Lock hierarchy: 66 * 67 * opsrsmp->opsrsm_lock 68 * opsrsmdevlock 69 * 70 * rd->rd_lock 71 * rd->rd_xmit_lock 72 * 
rd->rd_net_lock 73 * 74 * opsrsm->opsrsm_dest_lock 75 * opsrsm->opsrsm_runq_lock 76 * 77 * rd->rd_nlb_lock -- currently never taken while another lock is held 78 * opsrsmattlock 79 * opsrsmdbglock 80 */ 81 82 83 /* 84 * Defining DEBUG on the compile line (-DDEBUG) will compile 85 * debugging code into the driver. Whether any debug output actually gets 86 * printed depends on the value of opsrsmdbg, which determines the class of 87 * messages that the user is interested in, and opsrsmdbgmode, which 88 * determines how the user wants the messages to be produced. 89 * 90 * See the #defines for D1(), D2(), etc. below for which bits in opsrsmdbg 91 * cause which messages to get printed. 92 * 93 * The various types of output are controlled by bits in opsrsmdbgmode, as 94 * follows. Multiple types of output may be used at once, if desired. 95 * 96 * (opsrsmdbgmode & 1) Use debugging log. 97 * (opsrsmdbgmode & 2) Use kernel printfs. 98 */ 99 100 #ifndef lint 101 102 #ifdef DEBUG 103 104 int opsrsmdbg = 0x0100; 105 int opsrsmdbgmode = 0x1; 106 static void opsrsmconsole(const char *, ...); 107 108 /* opsrsm function enter/exit, parameters, return values. */ 109 #define D1 \ 110 if (opsrsmdbg & 0x01) \ 111 opsrsmdebug 112 113 /* Additional function debugging. */ 114 #define D2 \ 115 if (opsrsmdbg & 0x02) \ 116 opsrsmdebug 117 118 /* rsmpi interface routine enter/exit, parameters, return */ 119 #define D4 \ 120 if (opsrsmdbg & 0x08) \ 121 opsrsmdebug 122 123 /* Latency timing output. 
*/ 124 #define D5 \ 125 if (opsrsmdbg & 0x10) \ 126 opsrsmdebug 127 128 /* Excessive debugging output */ 129 #define D6 \ 130 if (opsrsmdbg & 0x20) \ 131 opsrsmdebug 132 133 /* debug message on the console */ 134 #define DINFO \ 135 opsrsmconsole 136 137 /* error message logged to the debug buffer */ 138 #define DERR \ 139 if (opsrsmdbg & 0x100) \ 140 opsrsmdebug 141 142 #else /* DEBUG */ 143 144 #define D1 if (0) printf 145 #define D2 if (0) printf 146 #define D4 if (0) printf 147 #define D5 if (0) printf 148 #define D6 if (0) printf 149 #define DINFO if (0) printf 150 #define DERR if (0) printf 151 152 #endif /* DEBUG */ 153 154 #else /* lint */ 155 156 #ifdef DEBUG 157 int opsrsmdbg; 158 int opsrsmdbgmode; 159 #endif 160 161 #define D1 printf 162 #define D2 printf 163 #define D4 printf 164 #define D5 printf 165 #define D6 printf 166 #define DINFO printf 167 #define DERR printf 168 #endif /* lint */ 169 170 /* 171 * Function prototypes. 172 */ 173 static int opsrsm_open(dev_t *, int, int, struct cred *); 174 static int opsrsm_close(dev_t, int, int, struct cred *); 175 static int opsrsm_attach(dev_info_t *, ddi_attach_cmd_t); 176 static int opsrsm_detach(dev_info_t *, ddi_detach_cmd_t); 177 static int opsrsm_info(dev_info_t *, ddi_info_cmd_t, void *, void **); 178 static int opsrsm_chpoll(dev_t, short, int, short *, struct pollhead **); 179 static int opsrsm_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 180 181 static void opsrsmwsrv(void *); 182 static int opsrsmcrexfer(opsrsm_t *, opsrsm_dest_t *); 183 static int opsrsmsconn(opsrsm_t *, opsrsm_dest_t *, int); 184 static int opsrsmconnxfer(opsrsm_t *, opsrsm_dest_t *); 185 static int opsrsmsack(opsrsm_dest_t *); 186 static int opsrsmsaccept(opsrsm_t *opsrsmp, opsrsm_dest_t *rd); 187 188 static opsrsm_dest_t *opsrsm_connect(int, opsrsmresource_t *); 189 static opsrsm_dest_t *opsrsmmkdest(adapter_t *, rsm_addr_t); 190 static void opsrsmsconntmo(void *); 191 static void opsrsmacktmo(void *); 192 static void 
opsrsmaccepttmo(void * arg); 193 static void opsrsmfreedesttmo(void *); 194 195 static void opsrsmmsghdlr_req_connect(opsrsm_dest_t *, opsrsm_msg_t *); 196 static void opsrsmmsghdlr_con_accept(opsrsm_dest_t *, opsrsm_msg_t *); 197 static void opsrsmmsghdlr_syncdqe(opsrsm_dest_t *, opsrsm_msg_t *); 198 static void opsrsmmsghdlr_default(opsrsm_dest_t *, opsrsm_msg_t *); 199 200 static int opsrsmgetstate(opsrsm_dest_t *); 201 static void opsrsmsetstate(opsrsm_dest_t *, int); 202 static int opsrsmmovestate(opsrsm_dest_t *, int, int newstate); 203 static int opsrsmread(opsrsm_dest_t *, int, int, int, ushort_t sap); 204 static int opsrsmuninit(adapter_t *adapterp); 205 static boolean_t opsrsmdest_refcnt_0(opsrsm_dest_t *); 206 static int opsrsmfreedest(adapter_t *adapter, rsm_addr_t); 207 208 /* LINTED: E_STATIC_FUNC_CALLD_NOT_DEFINED */ 209 static void opsrsmdebug(const char *, ...); 210 static void opsrsmerror(dev_info_t *, const char *, ...); 211 static void opsrsmkstatinit(opsrsm_t *); 212 static void opsrsmkstatremove(opsrsm_t *opsrsmp); 213 static void opsrsmgetparam(dev_info_t *, opsrsm_t *); 214 static void opsrsmtakedown(opsrsm_t *, int); 215 216 static void opsrsmfreebuf(opsrsmbuf_t *); 217 static void opsrsmputfqe(opsrsm_dest_t *, int); 218 static void opsrsmputfqe_nolock(opsrsm_dest_t *, int); 219 static opsrsm_queued_fqe_t *opsrsm_queued_fqe_alloc(opsrsm_dest_t *); 220 static void opsrsm_queued_fqe_free(opsrsm_dest_t *, opsrsm_queued_fqe_t *); 221 222 static void opsrsmputdqes(opsrsm_dest_t *); 223 static int opsrsmavailfqe(opsrsm_dest_t *); 224 static int opsrsmavailfqe2(opsrsm_dest_t *); 225 static int opsrsmgetfqe(opsrsm_dest_t *, int *); 226 static int opsrsmgetdqe(opsrsm_dest_t *, int *, int *, int *, ushort_t *); 227 static int opsrsmsendmsg(opsrsm_dest_t *, uint8_t, opsrsm_msg_t *); 228 229 rsm_intr_hand_ret_t opsrsm_rsm_intr_handler(rsm_controller_object_t *, 230 rsm_intr_q_op_t, rsm_addr_t, void *, size_t, rsm_intr_hand_arg_t); 231 232 static 
opsrsm_failover_info_t *opsrsm_finfo_add(opsrsm_dest_t *); 233 static opsrsm_failover_info_t *opsrsm_finfo_lookup_by_local_skey(uint32_t); 234 static opsrsm_failover_info_t *opsrsm_finfo_lookup_by_remote_skey(uint32_t); 235 static int opsrsm_finfo_wait(uint32_t); 236 static void opsrsm_finfo_wakeup(opsrsm_failover_info_t *, int); 237 static void opsrsm_finfo_init(void); 238 static void opsrsm_finfo_fini(void); 239 static void opsrsm_finfo_destroy(void *); 240 static void opsrsm_failover_thread(void *); 241 static void opsrsm_lostconn(opsrsm_dest_t *); 242 static void opsrsm_reset_all_rps(opsrsm_dest_t *rd); 243 static void opsrsmmsghdlr_finfo(opsrsm_dest_t *, opsrsm_msg_t *); 244 static void opsrsm_option_rexmit_end(mblk_t *, opsrsm_dest_t *); 245 static int opsrsm_finfo_sendmsg(opsrsm_dest_t *, uint8_t, uint32_t); 246 static mblk_t *opsrsm_alloc_ack_msg(uint32_t); 247 static void opsrsm_queued_msg_send(opsrsm_dest_t *); 248 static void opsrsm_queued_msg_flush(opsrsm_dest_t *); 249 static void opsrsm_queued_msg_append(opsrsm_dest_t *, opsrsm_queued_msg_t *); 250 251 extern void apply_on_all_adapters(void (*)(adapter_t *, void *), void *); 252 static uint32_t opsrsm_pending_bytes = 0; 253 254 static kmutex_t opsrsm_flow_tmo_lock; 255 static timeout_id_t opsrsm_flow_tmo_id; 256 static int opsrsm_flow_tmo_retries = 0; 257 258 static void opsrsm_flow_tmo(void *); 259 static void opsrsm_flow_tmo_cancel(void); 260 static void opsrsm_flow_enable(adapter_t *, void *); 261 static void opsrsm_sync_flow_ctl(void *); 262 static void opsrsm_sync_flow_tmo(void *); 263 static void opsrsm_set_sync_flow_tmo(opsrsm_dest_t *); 264 static void opsrsm_cancel_sync_flow_tmo(opsrsm_dest_t *); 265 static void opsrsm_check_flow_ctl(opsrsm_dest_t *); 266 static int opsrsmdemux_loopback(mblk_t *); 267 static void opsrsm_status_check_tmo(void *); 268 269 taskq_t *opsrsm_failover_taskq; 270 taskq_t *opsrsm_events_taskq; 271 272 #define OPSRSM_Q_LEN(q) ((q)->q_len) 273 #define OPSRSM_Q_HEAD(q) 
((q)->q_head) 274 #define OPSRSM_Q_NEXT(q, mp) ((mp)->b_next) 275 276 #define OPSRSM_Q_INIT(q) { \ 277 (q)->q_head = NULL; \ 278 (q)->q_tail = NULL; \ 279 (q)->q_len = 0; \ 280 } 281 282 #define OPSRSM_Q_APPEND(q, mp) { \ 283 ASSERT((mp)->b_next == NULL); \ 284 if ((q)->q_head == NULL) { \ 285 (q)->q_head = (mp); \ 286 (q)->q_tail = (mp); \ 287 } else { \ 288 (q)->q_tail->b_next = (mp); \ 289 (q)->q_tail = (mp); \ 290 } \ 291 (q)->q_len++; \ 292 } 293 294 #define OPSRSM_Q_REMOVE(q, mp) { \ 295 ASSERT((q)->q_len > 0); \ 296 (mp) = (q)->q_head; \ 297 if ((q)->q_head == (q)->q_tail) { \ 298 (q)->q_tail = NULL; \ 299 } \ 300 (q)->q_head = (mp)->b_next; \ 301 (mp)->b_next = NULL; \ 302 (q)->q_len--; \ 303 } 304 305 #define OPSRSM_Q_FLUSH(q) { \ 306 while ((q)->q_head != NULL) { \ 307 mblk_t *mp; \ 308 \ 309 mp = (q)->q_head; \ 310 (q)->q_head = mp->b_next; \ 311 mp->b_prev = mp->b_next = NULL; \ 312 mp->b_cont = NULL; \ 313 freemsg(mp); \ 314 } \ 315 (q)->q_tail = NULL; \ 316 (q)->q_len = 0; \ 317 } 318 319 #define OPSRSM_Q_CONCAT(q1, q2) { \ 320 if ((q1)->q_head == NULL) { \ 321 (q1)->q_head = (q2)->q_head; \ 322 (q1)->q_tail = (q2)->q_tail; \ 323 (q1)->q_len = (q2)->q_len; \ 324 } else { \ 325 if ((q2)->q_len > 0) { \ 326 (q1)->q_tail->b_next = (q2)->q_head; \ 327 (q1)->q_tail = (q2)->q_tail; \ 328 (q1)->q_len += (q2)->q_len; \ 329 } \ 330 } \ 331 (q2)->q_head = NULL; \ 332 (q2)->q_tail = NULL; \ 333 (q2)->q_len = 0; \ 334 } 335 336 #define OPSRSM_REACHED_STATIC_DATA_THRESHOLD(rd) \ 337 ((rd)->rd_data_collected >= opsrsmdev-> \ 338 opsrsm_param.opsrsm_data_threshold) 339 340 #define OPSRSM_REACHED_DATA_THRESHOLD(rd) \ 341 ((opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 1) ? 
\ 342 ((rd)->rd_data_collected >= (rd)->rd_adaptive_threshold) : \ 343 (OPSRSM_REACHED_STATIC_DATA_THRESHOLD(rd))) 344 345 #define OPSRSM_ADAPT_THRESHOLD(rd, pktlen) { \ 346 if (opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 1) { \ 347 uint32_t diff = \ 348 (uint32_t)ddi_get_lbolt() - (rd)->rd_last_sent; \ 349 (rd)->rd_last_sent = (uint32_t)ddi_get_lbolt(); \ 350 if (diff == 0) { \ 351 (rd)->rd_pkt_freq++; \ 352 if ((rd)->rd_pkt_freq > opsrsmdev-> \ 353 opsrsm_param.opsrsm_adaptive_rate) { \ 354 (rd)->rd_adaptive_threshold += pktlen; \ 355 (rd)->rd_pkt_freq = 0; \ 356 } \ 357 if ((rd)->rd_adaptive_threshold > \ 358 opsrsmdev->opsrsm_param. \ 359 opsrsm_data_threshold) \ 360 (rd)->rd_adaptive_threshold = \ 361 opsrsmdev->opsrsm_param. \ 362 opsrsm_data_threshold; \ 363 } else { \ 364 uint32_t reduce = 2 * pktlen * diff; \ 365 if (reduce > (rd)->rd_adaptive_threshold) { \ 366 (rd)->rd_adaptive_threshold = 0; \ 367 } else { \ 368 (rd)->rd_adaptive_threshold -= reduce; \ 369 } \ 370 } \ 371 } \ 372 } 373 374 #define OPSRSM_NO_PENDING_WRITES(rd) \ 375 ((rd)->rd_writes_completed == OPSRSM_Q_LEN(&(rd)->rd_pendq)) 376 377 #define OPSRSM_RSREF(rp) { \ 378 mutex_enter(&(rp)->rs_lock); \ 379 (rp)->rs_refcnt++; \ 380 mutex_exit(&(rp)->rs_lock); \ 381 } 382 383 #define OPSRSM_RSUNREF(rp) { \ 384 mutex_enter(&(rp)->rs_lock); \ 385 (rp)->rs_refcnt--; \ 386 if ((rp)->rs_refcnt == 0) { \ 387 cv_broadcast(&(rp)->rs_close_cv); \ 388 } \ 389 mutex_exit(&(rp)->rs_lock); \ 390 } 391 392 #define OPSRSM_LOOPBACK 0x100b 393 #define OPSRSM_IS_LOOPBACK(rd) ((uint32_t)(rd) == OPSRSM_LOOPBACK) 394 395 static int opsrsm_start_batch(opsrsm_dest_t *, uint32_t); 396 static int opsrsm_end_batch(opsrsm_dest_t *); 397 static void opsrsm_xmit_tmo(void *); 398 static void opsrsm_fqe_tmo(void *); 399 static void opsrsm_dispatch_tmo(void *); 400 static void opsrsm_set_xmit_tmo(opsrsm_dest_t *, int); 401 static void opsrsm_set_fqe_tmo(opsrsm_dest_t *, int); 402 static void 
opsrsm_cancel_xmit_tmo(opsrsm_dest_t *); 403 static void opsrsm_cancel_fqe_tmo(opsrsm_dest_t *); 404 static void opsrsmxmit_thread(void *); 405 static int opsrsmxmit(opsrsm_dest_t *, mblk_t *); 406 static int opsrsmrexmit(opsrsm_dest_t *); 407 static int opsrsm_write_data(opsrsm_dest_t *, mblk_t *); 408 static int opsrsm_sync_dqe(opsrsm_dest_t *); 409 static int opsrsm_sync_fqe(opsrsm_dest_t *); 410 static void opsrsm_wake_senders(opsrsm_dest_t *, short); 411 static void opsrsm_sync_dqe_tmo(void *); 412 static void opsrsm_sync_fqe_tmo(void *); 413 static void opsrsmdemux(mblk_t *, opsrsm_dest_t *); 414 static void opsrsm_event_thread(void *); 415 static void opsrsm_event_add(opsrsm_dest_t *, uint32_t); 416 417 static opsrsmresource_t *opsrsmresource_alloc(minor_t *); 418 static opsrsmresource_t *opsrsm_resstruct_alloc(); 419 static opsrsmresource_t *opsrsmresource_free(minor_t rnum); 420 static opsrsmresource_t *opsrsmresource_lookup(minor_t, int); 421 static void opsrsmresource_destroy(void); 422 static int opsrsm_resstruct_free(minor_t); 423 static void opsrsmresource_init(void); 424 static void opsrsmresource_fini(void); 425 static struct opsrsmresource_table opsrsm_resource; 426 427 static opsrsm_failover_info_t *opsrsm_finfo_list; 428 static kmutex_t opsrsm_finfo_lock; 429 static kcondvar_t opsrsm_finfo_cv; 430 static int opsrsm_failover_threads; 431 static int opsrsm_failover_max_retries = 6000; 432 static int opsrsm_failover_destruct_time = 3000; 433 static int opsrsm_queued_msg_max_retries = 2000; 434 435 int rsmrdt_adapterinit(adapter_t *); 436 int rsmrdt_adapterfini(adapter_t *); 437 void rsmrdt_failover(adapter_t *, rsm_addr_t); 438 int rsmrdt_check_openhandles(void); 439 440 /* LINTED: E_STATIC_FUNC_CALLD_NOT_DEFINED */ 441 extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *); 442 extern void rsmrdt_pathmanager_init(void); 443 extern void rsmrdt_pathmanager_cleanup(void); 444 extern rsm_addr_t rsmrdt_get_remote_hwaddr(adapter_t *, 
rsm_node_id_t); 445 extern adapter_t *rsmrdt_select_adapter(rsm_node_id_t, int); 446 extern void rsmrdt_get_remote_ids(adapter_t *, rsm_addr_t, int *, int *); 447 448 /* 449 * The opsrsm driver implements a reference count scheme for destination 450 * structures. The idea behind the scheme is to prevent the driver from 451 * deleting a destination structure while it is being used elsewhere, for 452 * example in a message handling routine. (Failures to protect against 453 * this occurrence have led to a fair array of baffling bugs over the 454 * lifetime of the driver.) 455 * 456 * The following set of macros implement the reference count scheme, 457 * translation from RSM address to destination structure, and removal of 458 * destinations from the run queue. All must be intertwined, since 459 * otherwise it would be possible to get a destination pointer from an RSM 460 * address , or from the run queue, but have some other part of the driver 461 * delete the destination before you could bump its reference count. The 462 * incorporation of reference count code in FINDDEST/MAKEDEST/GETRUNQ 463 * solves this race condition. 464 */ 465 466 /* 467 * FINDDEST attempts to find the destination with RSM address rsm_addr. If the 468 * destination exists, rd is set to point to it. If the destination exists, 469 * isdel is set to indicate whether the destination is currently being deleted 470 * (nonzero implies a delete is in progress). If the destination exists and 471 * is not being deleted, its reference count is increased by one. 472 */ 473 #define FINDDEST(rd, isdel, rsm_addr, adapter) { \ 474 mutex_enter(&adapter->opsrsm_dest_lock); \ 475 (rd) = (((rsm_addr) >= RSM_MAX_DESTADDR) ? 
NULL : \ 476 (adapter)->opsrsm_desttbl[(rsm_addr)]); \ 477 if (rd) \ 478 if (((isdel) = (rd)->rd_dstate) == 0) { \ 479 (rd)->rd_refcnt++; \ 480 D6("FINDDEST ctlr %d addr %ld refcnt++ is %d\n", \ 481 adapter->instance, rsm_addr, \ 482 (rd)->rd_refcnt); \ 483 } \ 484 mutex_exit(&(adapter)->opsrsm_dest_lock); \ 485 } 486 487 488 /* 489 * MAKEDEST attempts to find the destination with RSM address rsm_addr. If the 490 * destination exists, rd and isdel are set as in the description of FINDDEST, 491 * above. If the destination does not exist, a new destination structure is 492 * allocated and installed, rd is set to point to it, and isnew is set to 1. 493 */ 494 #define MAKEDEST(rd, isdel, isnew, rsm_addr, adapter) { \ 495 mutex_enter(&(adapter)->opsrsm_dest_lock); \ 496 (rd) = (((rsm_addr) >= RSM_MAX_DESTADDR) ? NULL : \ 497 (adapter)->opsrsm_desttbl[(rsm_addr)]); \ 498 if (!(rd)) { \ 499 (rd) = opsrsmmkdest((adapter), (rsm_addr)); \ 500 (isnew) = 1; \ 501 } \ 502 if (rd) \ 503 if (((isdel) = (rd)->rd_dstate) == 0) { \ 504 (rd)->rd_refcnt++; \ 505 D6("MAKEDEST ctlr %d addr %ld refcnt++ is %d\n", \ 506 adapter->instance, (uint64_t)rsm_addr, \ 507 (rd)->rd_refcnt); \ 508 } \ 509 mutex_exit(&(adapter)->opsrsm_dest_lock); \ 510 } 511 512 513 /* 514 * GETRUNQ attempts to return the destination which is at the head of opsrsm's 515 * run queue. If the run queue is non-empty, the head of the queue is removed, 516 * and rd is set to point to it; otherwise, rd is set to NULL. If rd is 517 * nonzero, isdel is set to 1 if the destination pointed to by rd is being 518 * deleted, or to 0 otherwise. Finally, if rd is nonzero, and isdel is zero, 519 * then rd's reference count is increased by one. 
520 */ 521 #define GETRUNQ(rd, isdel, adapterp) { \ 522 mutex_enter(&(adapterp)->opsrsm_dest_lock); \ 523 mutex_enter(&(adapterp)->opsrsm_runq_lock); \ 524 rd = (adapterp)->opsrsm_runq; \ 525 if (rd) { \ 526 (adapterp)->opsrsm_runq = rd->rd_next; \ 527 if (((isdel) = (rd)->rd_dstate) == 0) { \ 528 (rd)->rd_refcnt++; \ 529 D6("GETRUNQ ctlr %d addr %ld refcnt++ is %d\n", \ 530 adapterp->instance, \ 531 (rd)->rd_rsm_addr, \ 532 (rd)->rd_refcnt); \ 533 } \ 534 } \ 535 mutex_exit(&(adapterp)->opsrsm_runq_lock); \ 536 mutex_exit(&(adapterp)->opsrsm_dest_lock); \ 537 } 538 539 540 /* 541 * REFDEST checks to see if the destination pointed to by rd is currently being 542 * deleted. If so, isdel is set to a nonzero value; otherwise, it is set to 543 * zero, and the destination's reference count is incremented. 544 */ 545 #define REFDEST(rd, isdel) { \ 546 mutex_enter(&(rd)->rd_adapter->opsrsm_dest_lock); \ 547 if (((isdel) = (rd)->rd_dstate) == 0) { \ 548 (rd)->rd_refcnt++; \ 549 } \ 550 mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock); \ 551 } 552 553 554 /* 555 * UNREFDEST decrements the reference count of the destination pointed to by 556 * rd. If the reference count becomes zero, we start the deletion process for 557 * the destination. 558 */ 559 #define UNREFDEST(rd) { \ 560 mutex_enter(&(rd)->rd_adapter->opsrsm_dest_lock); \ 561 D6("UNREFDEST ctlr %d addr %ld refcnt-- is %d\n", \ 562 (rd)->rd_adapter->instance, (rd)->rd_rsm_addr, \ 563 (rd)->rd_refcnt - 1); \ 564 if (--(rd)->rd_refcnt <= 0) { \ 565 mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock); \ 566 if (opsrsmdest_refcnt_0(rd)) { rd = NULL; } \ 567 } else \ 568 mutex_exit(&(rd)->rd_adapter->opsrsm_dest_lock); \ 569 } 570 571 572 573 /* Local Static def's */ 574 575 /* 576 * Lock and variable to allow attach routines to initialize global mutexes 577 */ 578 579 static kmutex_t opsrsmattlock; /* Protects opsrsmdbginit */ 580 581 /* 582 * Pointer to 'opsrsm' global structure. 
 */
opsrsm_t *opsrsmdev = NULL;	/* Head of list */

static kmutex_t opsrsmdevlock;	/* Protects list contents */
static void *opsrsm_state;	/* opaque handle for soft state structs */

extern rsm_node_id_t rsmrdt_my_nodeid;

/*
 * ****************************************************************
 * *                                                              *
 * *      B E G I N   BASIC MODULE BOILERPLATE                    *
 * *                                                              *
 * ****************************************************************
 */


/* Module Loading/Unloading and Autoconfiguration declarations */

/*
 * cb_ops contains the driver entry points and is roughly equivalent
 * to the cdevsw and bdevsw structures in previous releases.
 *
 * dev_ops contains, in addition to the pointer to cb_ops, the routines
 * that support loading and unloading our driver.
 *
 * Only open, close, ioctl and chpoll are implemented by this driver;
 * every other character/block entry point is routed to nodev.
 */

static struct cb_ops opsrsm_cb_ops = {
	opsrsm_open,		/* cb_open */
	opsrsm_close,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	opsrsm_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	opsrsm_chpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	D_NEW | D_MP,		/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/* Autoconfiguration entry points; attach/detach/getinfo are ours. */
static struct dev_ops opsrsm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	opsrsm_info,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	opsrsm_attach,		/* devo_attach */
	opsrsm_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&opsrsm_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	nulldev			/* power */
};


/*
 * Module linkage information for the kernel.
649 */ 650 static struct modldrv modldrv = { 651 &mod_driverops, 652 "Reliable Datagram Transport driver - v1.0", 653 &opsrsm_ops, 654 }; 655 656 static struct modlinkage modlinkage = { 657 #ifdef _LP64 658 MODREV_1, { (void *) &modldrv, NULL, NULL, NULL, NULL, NULL, NULL } 659 #else 660 MODREV_1, { (void *) &modldrv, NULL, NULL, NULL } 661 #endif 662 }; 663 664 /* 665 * Module Loading and Installation Routines. 666 */ 667 668 /* 669 * Module Installation 670 * Install the driver, initialize soft state system, initialize opsrsmattlock 671 */ 672 673 int 674 _init(void) 675 { 676 int status; 677 678 status = ddi_soft_state_init(&opsrsm_state, sizeof (opsrsm_t), 1); 679 if (status != 0) { 680 #ifdef DEBUG 681 cmn_err(CE_CONT, 682 "opsrsm:_init - soft_state_init failed: 0x%x\n", status); 683 #endif /* DEBUG */ 684 return (status); 685 } 686 687 /* initialize global locks here */ 688 mutex_init(&opsrsmattlock, NULL, MUTEX_DRIVER, NULL); 689 mutex_init(&opsrsmdevlock, NULL, MUTEX_DRIVER, NULL); 690 mutex_init(&opsrsm_flow_tmo_lock, NULL, MUTEX_DRIVER, NULL); 691 opsrsm_flow_tmo_id = 0; 692 693 opsrsm_events_taskq = taskq_create("events", 8, maxclsyspri, 1, 8, 694 TASKQ_PREPOPULATE); 695 696 opsrsm_finfo_init(); 697 opsrsmresource_init(); 698 699 status = mod_install(&modlinkage); 700 if (status != DDI_SUCCESS) { 701 mutex_destroy(&opsrsmattlock); 702 mutex_destroy(&opsrsmdevlock); 703 } 704 705 /* Init rsmrdt pm client */ 706 rsmrdt_pathmanager_init(); 707 708 return (status); 709 } 710 711 /* 712 * Module Removal 713 */ 714 715 int 716 _fini(void) 717 { 718 int status; 719 720 if ((status = mod_remove(&modlinkage)) != 0) { 721 DERR("opsrsm_fini - mod_remove failed: 0x%x\n", status); 722 return (status); 723 } 724 725 /* Un-init the rsmrdt pm client */ 726 rsmrdt_pathmanager_cleanup(); 727 728 ddi_soft_state_fini(&opsrsm_state); 729 opsrsmresource_fini(); 730 opsrsm_finfo_fini(); 731 opsrsm_flow_tmo_cancel(); 732 taskq_destroy(opsrsm_events_taskq); 733 734 
mutex_destroy(&opsrsm_flow_tmo_lock); 735 mutex_destroy(&opsrsmattlock); 736 mutex_destroy(&opsrsmdevlock); 737 738 return (status); 739 } 740 741 /* 742 * Return Module Info. 743 */ 744 745 int 746 _info(struct modinfo *modinfop) 747 { 748 return (mod_info(&modlinkage, modinfop)); 749 } 750 751 752 753 /* 754 * Autoconfiguration Routines 755 */ 756 757 758 /* 759 * Attach the device, create and fill in the device-specific structure. 760 */ 761 762 static int 763 opsrsm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 764 { 765 opsrsm_t *opsrsmp; 766 int instance; 767 int progress = 0; 768 minor_t rnum; 769 770 D1("opsrsmattach: dip 0x%p, cmd %d", (void *)dip, cmd); 771 772 if (cmd != DDI_ATTACH) { 773 return (DDI_FAILURE); 774 } 775 776 /* 777 * Allocate soft data structure 778 */ 779 780 instance = ddi_get_instance(dip); 781 782 if (ddi_soft_state_zalloc(opsrsm_state, instance) != DDI_SUCCESS) { 783 DERR("opsrsmattach: bad state zalloc, returning DDI_FAILURE"); 784 return (DDI_FAILURE); 785 } 786 787 opsrsmp = ddi_get_soft_state(opsrsm_state, instance); 788 if (opsrsmp == NULL) { 789 return (DDI_FAILURE); 790 } 791 792 /* 793 * Stuff private info into dip. 794 */ 795 opsrsmp->opsrsm_dip = dip; 796 ddi_set_driver_private(dip, (caddr_t)opsrsmp); 797 798 /* 799 * Get device parameters from the device tree and save them in our 800 * per-device structure for later use. 801 */ 802 opsrsmgetparam(dip, opsrsmp); 803 804 /* 805 * Initialize kernel statistics. 806 */ 807 opsrsmkstatinit(opsrsmp); 808 progress |= OPSRSM_ATT_KSTAT; 809 /* 810 * Link this per-device structure in with the rest. 
811 */ 812 mutex_enter(&opsrsmdevlock); 813 814 opsrsmdev = opsrsmp; 815 mutex_exit(&opsrsmdevlock); 816 817 /* 818 * Create minor number 819 */ 820 if (opsrsmresource_alloc(&rnum) == NULL) { 821 DERR("opsrsmattach: Unable to get minor number\n"); 822 opsrsmtakedown(opsrsmp, progress); 823 return (DDI_FAILURE); 824 } 825 826 D1("opsrsmattach: rnum %d : ddi %d", rnum, ddi_get_instance(dip)); 827 828 /* 829 * Create the filesystem device node. 830 */ 831 if (ddi_create_minor_node(dip, OPSRSMNAME, S_IFCHR, 832 rnum, DDI_PSEUDO, NULL) != DDI_SUCCESS) { 833 DERR("opsrsmattach: bad create_minor_node, returning " 834 "DDI_FAILURE"); 835 opsrsmtakedown(opsrsmp, progress); 836 return (DDI_FAILURE); 837 } 838 839 progress |= OPSRSM_ATT_MINOR; 840 841 opsrsmp->opsrsm_max_batch_size = 0; 842 opsrsmp->opsrsm_min_batch_size = 0; 843 844 ddi_report_dev(dip); 845 846 D1("opsrsmattach: returning DDI_SUCCESS"); 847 return (DDI_SUCCESS); 848 } 849 850 /* 851 * Detach - Free resources allocated in attach 852 */ 853 854 /*ARGSUSED*/ 855 static int 856 opsrsm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 857 { 858 int instance; 859 opsrsm_t *opsrsmp; 860 861 D1("opsrsmdetach: dip 0x%p, cmd %d", (void *)dip, cmd); 862 863 if (cmd != DDI_DETACH) { 864 return (DDI_FAILURE); 865 } 866 867 if (rsmrdt_check_openhandles() != 0) { 868 DERR("opsrsmdetach: Failed to detach due to open handles"); 869 return (DDI_FAILURE); 870 } 871 872 instance = ddi_get_instance(dip); 873 opsrsmp = ddi_get_soft_state(opsrsm_state, instance); 874 if (opsrsmp == NULL) { 875 return (DDI_FAILURE); 876 } 877 878 879 /* 880 * Release all our resources. At this point, all attachment 881 * setup must have completed, so must all be torn down. 
882 */ 883 opsrsmtakedown(opsrsmp, OPSRSM_ATT_ALL); 884 885 /* 886 * Free resource table 887 */ 888 opsrsmresource_destroy(); 889 return (DDI_SUCCESS); 890 } 891 892 /*ARGSUSED*/ 893 static int 894 opsrsm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 895 { 896 switch (infocmd) { 897 case DDI_INFO_DEVT2DEVINFO: 898 if (opsrsmdev != NULL) { 899 *result = opsrsmdev->opsrsm_dip; 900 return (DDI_SUCCESS); 901 } else { 902 return (DDI_FAILURE); 903 } 904 905 case DDI_INFO_DEVT2INSTANCE: 906 *result = 0; 907 return (DDI_SUCCESS); 908 909 default: 910 return (DDI_FAILURE); 911 } 912 } 913 914 /* 915 * Return local node id. 916 */ 917 /* ARGSUSED */ 918 static int 919 opsrsm_ioctl_getnodeid(opsrsmresource_t *rp, intptr_t arg, int mode) 920 { 921 if (rsmrdt_my_nodeid == (rsm_node_id_t)-1) 922 return (ENXIO); 923 924 (void) ddi_copyout((caddr_t)&rsmrdt_my_nodeid, (caddr_t)arg, 925 sizeof (rsmrdt_my_nodeid), mode); 926 return (0); 927 } 928 929 /* 930 * Return a unique(wrt the local node) number associated with 931 * the communication endpoint. 932 */ 933 static int 934 opsrsm_ioctl_bind(opsrsmresource_t *rp, intptr_t arg, int mode) 935 { 936 if (ddi_copyout((caddr_t)&rp->rs_lportnum, (caddr_t)arg, 937 sizeof (rp->rs_lportnum), mode) != DDI_SUCCESS) { 938 DERR("ioctl_bind: unable to copyout portnum"); 939 return (EFAULT); 940 } 941 rp->rs_state |= OPSRSM_RS_BOUND; 942 return (0); 943 } 944 945 /* 946 * The local communication endpoint will simply remember and will use 947 * the address specified here as the target address for all the future 948 * outgoing messages. 949 * 950 * Note that it will not verify the validity of the remote "portnum". 951 * It's up to the applications to make sure the remote endpoint 952 * exists before send. 
953 */ 954 static int 955 opsrsm_ioctl_connect(opsrsmresource_t *rp, intptr_t arg, int mode) 956 { 957 rsmrdt_connect_arg_t io_args; 958 959 if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) { 960 DERR("ioctl_connect: port not bound yet"); 961 return (EADDRNOTAVAIL); 962 } 963 964 rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER); 965 if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) { 966 DERR("ioctl_connect: Unloading in progress"); 967 rw_exit(&opsrsm_resource.opsrsmrct_lock); 968 return (ENETDOWN); 969 } 970 rw_exit(&opsrsm_resource.opsrsmrct_lock); 971 972 if (rp->rs_dest != NULL) { 973 DERR("ioctl_connect: reconnect not supported"); 974 return (EISCONN); 975 } 976 977 /* 978 * Copy in the connect ioctl arg structure 979 */ 980 (void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args), 981 mode); 982 983 D1("ioctl_connect: nodeid = %d", io_args.nodeid); 984 D1("ioctl_connect: portnum = %d", io_args.portnum); 985 986 if (io_args.nodeid < 0) 987 return (EINVAL); 988 989 if (io_args.nodeid == (int)rsmrdt_my_nodeid) { 990 /* 991 * Loopback mode 992 */ 993 mutex_enter(&rp->rs_lock); 994 rp->rs_dest = (opsrsm_dest_t *)OPSRSM_LOOPBACK; 995 rp->rs_local_skey = 0; 996 rp->rs_rportnum = io_args.portnum; 997 mutex_exit(&rp->rs_lock); 998 } else { 999 opsrsm_dest_t *rd = NULL; 1000 1001 mutex_enter(&rp->rs_lock); 1002 rp->rs_nodeid = io_args.nodeid; 1003 rp->rs_rportnum = io_args.portnum; 1004 rp->rs_state |= OPSRSM_RS_CONNECTING; 1005 mutex_exit(&rp->rs_lock); 1006 1007 rd = opsrsm_connect(rp->rs_nodeid, rp); 1008 mutex_enter(&rp->rs_lock); 1009 rp->rs_dest = rd; 1010 if (rd != NULL) { 1011 rp->rs_local_skey = rd->rd_local_skey; 1012 rp->rs_state |= OPSRSM_RS_REFDEST; 1013 } else { 1014 rp->rs_local_skey = 0; 1015 } 1016 rp->rs_state &= ~OPSRSM_RS_CONNECTING; 1017 cv_broadcast(&rp->rs_conn_cv); 1018 mutex_exit(&rp->rs_lock); 1019 1020 if (rd == NULL) { 1021 return (ENETDOWN); 1022 } 1023 } 1024 return (0); 1025 } 1026 1027 1028 #define RETRY_DELAY 1 
1029 static int opsrsm_connect_max_retries = 20; 1030 1031 static opsrsm_dest_t * 1032 opsrsm_connect(int nodeid, opsrsmresource_t *rp) 1033 { 1034 adapter_t *adp = NULL; 1035 rsm_addr_t rem_hwaddr; 1036 opsrsm_dest_t *rd; 1037 int isdel = 0, isnew = 0; 1038 int newdest = 0; 1039 uint32_t old_skey = 0; 1040 1041 again:; 1042 /* Find local adapter */ 1043 if (newdest > opsrsm_connect_max_retries) { 1044 DINFO("connect: failed to connect to node %d\n", nodeid); 1045 return (NULL); 1046 } 1047 adp = rsmrdt_select_adapter((rsm_node_id_t)nodeid, newdest); 1048 if (adp == NULL) { 1049 DINFO("connect: node %d is unreachable\n", nodeid); 1050 return (NULL); 1051 } 1052 1053 /* 1054 * if path down happens after we've chosen an adapter, that's 1055 * still ok because the connection handshake will fail. 1056 */ 1057 1058 /* Find remote hw addr */ 1059 rem_hwaddr = rsmrdt_get_remote_hwaddr(adp, (rsm_node_id_t)nodeid); 1060 if (rem_hwaddr == (rsm_addr_t)-1 || rem_hwaddr > RSM_MAX_DESTADDR) { 1061 if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) { 1062 if (adp->sel_cnt > 0) adp->sel_cnt--; 1063 } 1064 return (NULL); 1065 } 1066 1067 MAKEDEST(rd, isdel, isnew, rem_hwaddr, adp); 1068 if (isdel) { 1069 if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) { 1070 if (adp->sel_cnt > 0) adp->sel_cnt--; 1071 } 1072 /* 1073 * need sufficient delay to ensure the old rd gets freed up 1074 * completely before MAKEDEST gets called again 1075 */ 1076 delay(RETRY_DELAY); 1077 goto again; 1078 } 1079 if (isnew) { 1080 isnew = 0; 1081 if (rd == NULL) goto again; 1082 (void) opsrsmmovestate(rd, OPSRSM_STATE_NEW, 1083 OPSRSM_STATE_S_REQ_CONNECT); 1084 } 1085 if (rd->rd_local_skey != old_skey) { 1086 old_skey = rd->rd_local_skey; 1087 newdest++; 1088 } 1089 1090 mutex_enter(&rd->rd_xmit_lock); 1091 if (rd->rd_nodeid != nodeid) { 1092 cmn_err(CE_PANIC, "invalid nodeid %d, expected %d\n", 1093 rd->rd_nodeid, nodeid); 1094 } 1095 if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED) { 1096 if 
(rd->rd_xmit_state != OPSRSM_XMIT_DISCONNECTED) { 1097 int retval; 1098 1099 retval = cv_wait_sig(&rd->rd_conn_cv, 1100 &rd->rd_xmit_lock); 1101 if (retval == 0) { 1102 mutex_exit(&rd->rd_xmit_lock); 1103 if (opsrsmdev->opsrsm_param. 1104 rsmrdt_enable_loadbalance) { 1105 if (adp->sel_cnt > 0) 1106 adp->sel_cnt--; 1107 } 1108 UNREFDEST(rd); 1109 return (NULL); 1110 } else { 1111 if (rd->rd_xmit_state >= 1112 OPSRSM_XMIT_BARRIER_CLOSED) { 1113 mutex_exit(&rd->rd_xmit_lock); 1114 return (rd); 1115 } else { 1116 mutex_exit(&rd->rd_xmit_lock); 1117 if (opsrsmdev->opsrsm_param. 1118 rsmrdt_enable_loadbalance) { 1119 if (adp->sel_cnt > 0) 1120 adp->sel_cnt--; 1121 } 1122 UNREFDEST(rd); 1123 goto again; 1124 } 1125 } 1126 } else { 1127 mutex_exit(&rd->rd_xmit_lock); 1128 if (opsrsmdev->opsrsm_param. 1129 rsmrdt_enable_loadbalance) { 1130 if (adp->sel_cnt > 0) adp->sel_cnt--; 1131 } 1132 UNREFDEST(rd); 1133 /* 1134 * if rd is still in DISCONNECTED state, we need to 1135 * wait until it is completely freed up. 1136 */ 1137 if (rp != NULL) { 1138 mutex_enter(&rp->rs_lock); 1139 cv_broadcast(&rp->rs_conn_cv); 1140 mutex_exit(&rp->rs_lock); 1141 } 1142 delay(RETRY_DELAY); 1143 goto again; 1144 } 1145 } 1146 mutex_exit(&rd->rd_xmit_lock); 1147 return (rd); 1148 } 1149 1150 /* 1151 * RSMRDT_IOCTL_SENDMSG deals with one message at a time. It maintains 1152 * message boundary, and guarantees either the whole message, or none 1153 * of it is delivered to the destination successfully. 1154 * 1155 * EMSGSIZE is returned when the message is too big to be handled as a 1156 * single message by the opsrsm driver. 1157 * 1158 * Note that SKGXP library doesn't require the send socket to have a 1159 * portnum associated with it. In other words, the send side can send 1160 * msgs through a socket without calling ioctl.bind() first. 1161 * 1162 * RSMRDT_IOCTL_SENDMSG is always non-blocking. 
1163 * 1164 * Normally messages are guaranteed to be delivered eventually to the 1165 * destination endpoint, in the same order RSMRDT_IOCTL_SENDMSG calls 1166 * are made. There is one notable exception. That is, if the destination 1167 * endpoint doesn't exist, the msg will be dropped by the receiving node. 1168 */ 1169 static int 1170 opsrsm_ioctl_sendmsg(opsrsmresource_t *rp, intptr_t arg, int mode) 1171 { 1172 rsmrdt_send_arg_t io_args; 1173 struct iovec vptr[OPSRSM_MAXVECS]; 1174 #ifdef _MULTI_DATAMODEL 1175 rsmrdt_send_arg32_t io_args32; 1176 struct iovec32 vptr32[OPSRSM_MAXVECS]; 1177 model_t model; 1178 #endif /* _MULTI_DATAMODEL */ 1179 uio_t phys_uio; 1180 mblk_t *mp; 1181 int bytecount = 0; 1182 uint_t nvecs; 1183 int i; 1184 int err = 0; 1185 1186 if ((rp->rs_state & OPSRSM_RS_NORECVR) != 0) { 1187 DERR("ioctl_sendmsg: Receiver doesn't exist"); 1188 return (ESRCH); 1189 } 1190 1191 if ((rp->rs_state & OPSRSM_RS_PKEYMISMATCH) != 0) { 1192 DERR("ioctl_sendmsg: pkey mismatch"); 1193 return (EACCES); 1194 } 1195 1196 if (rp->rs_dest == NULL) { 1197 return (ENOTCONN); 1198 } 1199 #ifdef _MULTI_DATAMODEL 1200 model = ddi_model_convert_from(mode & FMODELS); 1201 if (model == DDI_MODEL_ILP32) { 1202 /* 1203 * Copy in sendmsg arg structure to driver buffer 1204 */ 1205 ddi_copyin((caddr_t)arg, &io_args32, sizeof (io_args32), 1206 mode); 1207 1208 /* 1209 * Find number of iovecs for this message 1210 */ 1211 nvecs = io_args32.iovcnt; 1212 1213 if (nvecs > OPSRSM_MAXVECS) { 1214 DERR("ioctl_sendmsg: invalid vec size 0x%x", nvecs); 1215 return (EINVAL); 1216 } 1217 1218 D1("ioctl_sendmsg: nvecs = %d, sz = 0x%x, vptr32 = %lx", 1219 nvecs, nvecs * sizeof (struct iovec32), vptr32); 1220 1221 /* 1222 * Copy in iovec structures to driver buffer 1223 */ 1224 if (ddi_copyin((struct iovec32 *)io_args32.iov, (caddr_t)vptr32, 1225 nvecs * sizeof (struct iovec32), mode)) { 1226 DERR("ioctl_sendmsg: invalid iovec pointer"); 1227 err = EFAULT; 1228 goto done; 1229 } 1230 1231 
/* 1232 * Find out the size of this message 1233 */ 1234 bytecount = 0; 1235 for (i = 0; i < nvecs; i++) { 1236 ssize32_t iovlen32 = vptr32[i].iov_len; 1237 bytecount += iovlen32; 1238 if (iovlen32 < 0 || bytecount < 0) { 1239 err = EINVAL; 1240 goto done; 1241 } 1242 vptr[i].iov_len = iovlen32; 1243 vptr[i].iov_base = (caddr_t)vptr32[i].iov_base; 1244 } 1245 } else 1246 #endif /* _MULTI_DATAMODEL */ 1247 { 1248 /* 1249 * Copy in sendmsg arg structure to driver buffer 1250 */ 1251 (void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, 1252 sizeof (io_args), mode); 1253 1254 /* 1255 * Find number of iovecs for this message 1256 */ 1257 nvecs = io_args.iovcnt; 1258 1259 if (nvecs > OPSRSM_MAXVECS) { 1260 DERR("ioctl_sendmsg: invalid vec size 0x%x", nvecs); 1261 return (EINVAL); 1262 } 1263 1264 D1("ioctl_sendmsg: nvecs = %d, sz = 0x%x, vptr = %p", 1265 nvecs, nvecs * (int)sizeof (iovec_t), vptr); 1266 1267 /* 1268 * Copy in iovec structures to driver buffer 1269 */ 1270 if (ddi_copyin(io_args.iov, (caddr_t)vptr, 1271 (size_t)nvecs * sizeof (iovec_t), mode)) { 1272 DERR("ioctl_sendmsg: invalid iovec pointer"); 1273 err = EFAULT; 1274 goto done; 1275 } 1276 1277 /* 1278 * Find out the size of this message 1279 */ 1280 bytecount = 0; 1281 for (i = 0; i < (int)nvecs; i++) { 1282 ssize_t iovlen = vptr[i].iov_len; 1283 bytecount += iovlen; 1284 if (iovlen < 0 || bytecount < 0) { 1285 err = EINVAL; 1286 goto done; 1287 } 1288 } 1289 } 1290 1291 D1("ioctl_sendmsg: Message Size 0x%x", bytecount); 1292 1293 /* 1294 * Check if message + header size is bigger than MTU size 1295 */ 1296 if ((bytecount + OPSRSM_CACHELINE_SIZE) > 1297 OPSRSM_MAX_BUFFER_SIZE_DFLT) { 1298 DERR("ioctl_sendmsg: message too big"); 1299 err = EMSGSIZE; 1300 goto done; 1301 } 1302 1303 /* 1304 * Allocate mblk 1305 */ 1306 mp = allocb(OPSRSM_CACHELINE_SIZE + OPSRSM_MESSAGE_HDRSZ + 1307 (size_t)bytecount + OPSRSM_CACHELINE_SIZE, BPRI_LO); 1308 if (mp == NULL) { 1309 DERR("ioctl_sendmsg: allocb failed"); 1310 
err = ENOMEM; 1311 goto done; 1312 } 1313 mp->b_rptr = (uchar_t *)OPSRSM_CACHELINE_ROUNDUP(mp->b_rptr); 1314 1315 /* 1316 * Stuff in the message header 1317 */ 1318 OPSRSM_MESSAGE_HDRPTR(mp)->lportnum = rp->rs_lportnum; 1319 OPSRSM_MESSAGE_HDRPTR(mp)->rportnum = rp->rs_rportnum; 1320 OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz = (uint32_t)bytecount; 1321 OPSRSM_MESSAGE_HDRPTR(mp)->nodeid = (int)rsmrdt_my_nodeid; 1322 OPSRSM_MESSAGE_HDRPTR(mp)->pkey = rp->rs_pkey; 1323 OPSRSM_MESSAGE_HDRPTR(mp)->seqno = 0; 1324 OPSRSM_MESSAGE_HDRPTR(mp)->option = 0; 1325 1326 D1("ioctl_sendmsg: lportnum = %d, rportnum = %d, msz_sz = 0x%x", 1327 rp->rs_lportnum, rp->rs_rportnum, bytecount); 1328 D1("ioctl_sendmsg: my node id = %d", rsmrdt_my_nodeid); 1329 1330 /* 1331 * Initialize uio structure 1332 */ 1333 phys_uio.uio_iov = vptr; 1334 phys_uio.uio_iovcnt = (int)nvecs; 1335 phys_uio.uio_resid = bytecount; 1336 phys_uio.uio_segflg = UIO_USERSPACE; 1337 1338 if (uiomove((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ, 1339 (size_t)bytecount, UIO_WRITE, &phys_uio)) { 1340 DERR("ioctl_sendmsg: uiomove failed"); 1341 err = EFAULT; 1342 freemsg(mp); 1343 goto done; 1344 } 1345 1346 mp->b_wptr = mp->b_rptr + bytecount + OPSRSM_MESSAGE_HDRSZ; 1347 mp->b_prev = mp->b_cont = NULL; 1348 1349 if (OPSRSM_IS_LOOPBACK(rp->rs_dest)) { 1350 err = opsrsmdemux_loopback(mp); 1351 } else { 1352 int isdel = 0; 1353 1354 mutex_enter(&rp->rs_lock); 1355 if ((rp->rs_state & OPSRSM_RS_FAILOVER) != 0) { 1356 opsrsm_dest_t *new_rd = NULL; 1357 1358 ASSERT((rp->rs_state & OPSRSM_RS_REFDEST) == 0); 1359 mutex_exit(&rp->rs_lock); 1360 err = opsrsm_finfo_wait(rp->rs_local_skey); 1361 if (err == 0) { 1362 mutex_enter(&rp->rs_lock); 1363 rp->rs_state |= OPSRSM_RS_CONNECTING; 1364 mutex_exit(&rp->rs_lock); 1365 new_rd = opsrsm_connect(rp->rs_nodeid, rp); 1366 } 1367 mutex_enter(&rp->rs_lock); 1368 rp->rs_dest = new_rd; 1369 rp->rs_state &= ~OPSRSM_RS_CONNECTING; 1370 if (new_rd != NULL) { 1371 rp->rs_local_skey = 
new_rd->rd_local_skey; 1372 rp->rs_state &= ~OPSRSM_RS_FAILOVER; 1373 rp->rs_state |= OPSRSM_RS_REFDEST; 1374 cv_broadcast(&rp->rs_conn_cv); 1375 } else { 1376 rp->rs_local_skey = 0; 1377 freemsg(mp); 1378 cv_broadcast(&rp->rs_conn_cv); 1379 mutex_exit(&rp->rs_lock); 1380 goto done; 1381 } 1382 } 1383 REFDEST(rp->rs_dest, isdel); 1384 if (isdel != 0) { 1385 err = ENETDOWN; 1386 freemsg(mp); 1387 mutex_exit(&rp->rs_lock); 1388 goto done; 1389 } 1390 1391 if ((bytecount + OPSRSM_CACHELINE_SIZE) > 1392 (int)rp->rs_dest->rd_buffer_size) { 1393 err = EMSGSIZE; 1394 freemsg(mp); 1395 mutex_exit(&rp->rs_lock); 1396 goto done; 1397 } 1398 mutex_exit(&rp->rs_lock); 1399 err = opsrsmxmit(rp->rs_dest, mp); 1400 if (err != EWOULDBLOCK) err = 0; 1401 } 1402 done:; 1403 return (err); 1404 } 1405 1406 1407 /* 1408 * RSMRDT_IOCTL_RECVMSG can return more than one msg at a time. 1409 * 1410 * If a message is too long to fit in the supplied buffer, excessive 1411 * bytes will be discarded. 1412 * 1413 * It will return EWOULDBLOCK if there is no message in the receive 1414 * queue. Caller can then use poll() to poll for the POLLIN event. 1415 * 1416 * In case of memory allocation errors, it will not drop the packets. 
1417 */ 1418 1419 static int 1420 opsrsm_ioctl_recvmsgs(opsrsmresource_t *rp, intptr_t arg, int mode) 1421 { 1422 rsmrdt_recvmsgs_arg_t io_args; 1423 rdt_recvmsg_t *rm_ptr; 1424 struct iovec vptr[OPSRSM_MAXVECS]; 1425 #ifdef _MULTI_DATAMODEL 1426 rsmrdt_recvmsgs_arg32_t io_args32; 1427 rsmrdt_recvmsg32_t *rm_ptr32; 1428 struct iovec32 vptr32[OPSRSM_MAXVECS]; 1429 model_t model; 1430 #endif /* _MULTI_DATAMODEL */ 1431 int nmsgs, count; 1432 int err = 0; 1433 int32_t total_bytes = 0; 1434 1435 if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) { 1436 DERR("ioctl_recvmsgs: port not bound yet"); 1437 return (EADDRNOTAVAIL); 1438 } 1439 1440 /* 1441 * Copy in recvmsgs arg structure to driver buffer 1442 */ 1443 #ifdef _MULTI_DATAMODEL 1444 model = ddi_model_convert_from(mode & FMODELS); 1445 if (model == DDI_MODEL_ILP32) { 1446 ddi_copyin((caddr_t)arg, (caddr_t)&io_args32, 1447 sizeof (io_args32), mode); 1448 1449 io_args.msgcnt = io_args32.msgcnt; 1450 io_args.timeout = io_args32.timeout; 1451 } else 1452 #endif /* _MULTI_DATAMODEL */ 1453 (void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args), 1454 mode); 1455 1456 /* 1457 * Check if messages are waiting in the queue 1458 */ 1459 mutex_enter(&rp->rs_lock); 1460 if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) { 1461 if (io_args.timeout == 0) { 1462 mutex_exit(&rp->rs_lock); 1463 return (EWOULDBLOCK); 1464 } else if (io_args.timeout > 0) { 1465 clock_t timeout_time; 1466 int retval; 1467 /* 1468 * Wait only for 'timeout' time. 
1469 */ 1470 timeout_time = ddi_get_lbolt(); 1471 timeout_time += drv_usectohz 1472 ((clock_t)io_args.timeout * (clock_t)1000); 1473 rp->rs_state |= OPSRSM_RS_SIG; 1474 retval = cv_timedwait_sig(&rp->rs_cv, &rp->rs_lock, 1475 timeout_time); 1476 rp->rs_state &= ~OPSRSM_RS_SIG; 1477 if (retval == 0) { 1478 mutex_exit(&rp->rs_lock); 1479 return (EINTR); 1480 } else if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) { 1481 mutex_exit(&rp->rs_lock); 1482 err = 0; 1483 count = 0; 1484 goto done; 1485 } 1486 } else { 1487 int retval; 1488 1489 /* 1490 * Wait until you get the wakeup signal 1491 */ 1492 rp->rs_state |= OPSRSM_RS_SIG; 1493 retval = cv_wait_sig(&rp->rs_cv, &rp->rs_lock); 1494 rp->rs_state &= ~OPSRSM_RS_SIG; 1495 if (retval == 0) { 1496 mutex_exit(&rp->rs_lock); 1497 return (EINTR); 1498 } else if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) { 1499 mutex_exit(&rp->rs_lock); 1500 return (EWOULDBLOCK); 1501 } 1502 } 1503 } 1504 mutex_exit(&rp->rs_lock); 1505 1506 /* 1507 * Find number of messages 1508 */ 1509 nmsgs = (int)io_args.msgcnt; 1510 if (nmsgs > (int)opsrsmdev->opsrsm_param.opsrsm_max_recv_msgs || 1511 nmsgs < 0) { 1512 DERR("ioctl_recvmsgs: invalid nmsgs"); 1513 return (EINVAL); 1514 } 1515 1516 D1("ioctl_recvmsgs: nmsgs = %d\n", nmsgs); 1517 1518 /* 1519 * Copy in receive messages structures to driver buffer 1520 */ 1521 #ifdef _MULTI_DATAMODEL 1522 if (model == DDI_MODEL_ILP32) { 1523 rm_ptr32 = (rsmrdt_recvmsg32_t *)rp->rs_rmptr; 1524 if (ddi_copyin((rsmrdt_recvmsg32_t *)io_args32.msg_iov, 1525 (caddr_t)rm_ptr32, nmsgs * sizeof (rsmrdt_recvmsg32_t), 1526 mode)) { 1527 DERR("ioctl_recvmsgs: cannot copy in msg structs"); 1528 return (EFAULT); 1529 } 1530 1531 } else 1532 #endif /* _MULTI_DATAMODEL */ 1533 { 1534 rm_ptr = (rdt_recvmsg_t *)rp->rs_rmptr; 1535 if (ddi_copyin(io_args.msg_iov, (caddr_t)rm_ptr, 1536 (size_t)nmsgs * sizeof (rdt_recvmsg_t), mode)) { 1537 DERR("ioctl_recvmsgs: cannot copy in msg structs"); 1538 return (EFAULT); 1539 } 1540 } 1541 1542 count = 0; 
1543 do { 1544 uint_t nvecs; 1545 int i; 1546 uint32_t bytecount; 1547 uint32_t org_msglen; 1548 int nodeid; 1549 uint32_t portnum; 1550 mblk_t *mp; 1551 1552 #ifdef _MULTI_DATAMODEL 1553 if (model == DDI_MODEL_ILP32) { 1554 /* 1555 * Find number of iovecs for this message 1556 */ 1557 nvecs = rm_ptr32[count].iovcnt; 1558 1559 if (nvecs > OPSRSM_MAXVECS) { 1560 DERR("ioctl_recvmsgs: invalid vec size"); 1561 err = EINVAL; 1562 break; 1563 } 1564 1565 D1("ioctl_recvmsgs: nvecs = %d", nvecs); 1566 1567 /* 1568 * Copy in iovec structures to driver buffer 1569 */ 1570 if (ddi_copyin((struct iovec32 *)rm_ptr32[count].iov, 1571 (caddr_t)vptr32, nvecs * sizeof (struct iovec32), 1572 mode)) { 1573 DERR("ioctl_recvmsgs: invalid iovec pointer"); 1574 err = EFAULT; 1575 break; 1576 } 1577 1578 /* 1579 * Calculate buffer size 1580 */ 1581 bytecount = 0; 1582 for (i = 0; i < nvecs; i++) { 1583 if (vptr32[i].iov_len < 0) { 1584 DERR("ioctl_recvmsgs: invalid iovlen"); 1585 err = EINVAL; 1586 break; 1587 } 1588 bytecount += vptr32[i].iov_len; 1589 1590 vptr[i].iov_len = vptr32[i].iov_len; 1591 vptr[i].iov_base = (caddr_t)vptr32[i].iov_base; 1592 } 1593 } else 1594 #endif /* _MULTI_DATAMODEL */ 1595 { 1596 /* 1597 * Find number of iovecs for this message 1598 */ 1599 nvecs = rm_ptr[count].iovcnt; 1600 1601 if (nvecs > OPSRSM_MAXVECS) { 1602 DERR("ioctl_recvmsgs: invalid vec size"); 1603 err = EINVAL; 1604 break; 1605 } 1606 1607 D1("ioctl_recvmsgs: nvecs = %d", nvecs); 1608 1609 /* 1610 * Copy in iovec structures to driver buffer 1611 */ 1612 if (ddi_copyin(rm_ptr[count].iov, (caddr_t)vptr, 1613 (size_t)nvecs * sizeof (iovec_t), mode)) { 1614 DERR("ioctl_recvmsgs: invalid iovec pointer"); 1615 err = EFAULT; 1616 break; 1617 } 1618 1619 /* 1620 * Calculate buffer size 1621 */ 1622 bytecount = 0; 1623 for (i = 0; i < (int)nvecs; i++) { 1624 if (vptr[i].iov_len < 0) { 1625 DERR("ioctl_recvmsgs: invalid iovlen"); 1626 err = EINVAL; 1627 break; 1628 } 1629 bytecount += 
(uint32_t)vptr[i].iov_len; 1630 } 1631 } 1632 1633 if (err != 0) break; 1634 1635 /* 1636 * Grab a buffer 1637 */ 1638 mutex_enter(&rp->rs_lock); 1639 OPSRSM_Q_REMOVE(&rp->rs_recvq, mp); 1640 if (OPSRSM_Q_LEN(&rp->rs_recvq) == 0) { 1641 rp->rs_events = 0; 1642 } 1643 mutex_exit(&rp->rs_lock); 1644 ASSERT(mp != NULL && MBLKL(mp) >= (int)OPSRSM_MESSAGE_HDRSZ); 1645 total_bytes += MBLKL(mp); 1646 1647 /* 1648 * Extract the length from the header 1649 * 1650 * Truncate the message if the incoming message size 1651 * is greater than buffer size. 1652 */ 1653 1654 org_msglen = OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz; 1655 nodeid = OPSRSM_MESSAGE_HDRPTR(mp)->nodeid; 1656 portnum = OPSRSM_MESSAGE_HDRPTR(mp)->lportnum; 1657 if (OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz < bytecount) { 1658 bytecount = OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz; 1659 } 1660 1661 /* 1662 * Initialize uio structure 1663 */ 1664 if (bytecount > 0) { 1665 uio_t phys_uio; 1666 1667 phys_uio.uio_iov = vptr; 1668 phys_uio.uio_iovcnt = (int)nvecs; 1669 phys_uio.uio_segflg = UIO_USERSPACE; 1670 phys_uio.uio_resid = (ssize_t)bytecount; 1671 1672 if (uiomove((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ, 1673 bytecount, UIO_READ, &phys_uio)) { 1674 DERR("ioctl_recvmsg: uiomove failed"); 1675 err = EFAULT; 1676 freemsg(mp); 1677 mp = NULL; 1678 break; 1679 } 1680 } 1681 1682 opsrsmdev->opsrsm_packets_consumed++; 1683 freemsg(mp); 1684 1685 /* 1686 * Return the number of bytes received and original message 1687 * length. 
1688 */ 1689 #ifdef _MULTI_DATAMODEL 1690 if (model == DDI_MODEL_ILP32) { 1691 rm_ptr32[count].bytes_recvd = bytecount; 1692 rm_ptr32[count].msglen = org_msglen; 1693 rm_ptr32[count].portnum = portnum; 1694 rm_ptr32[count].nodeid = nodeid; 1695 1696 if (ddi_copyout((caddr_t *)&(rm_ptr32[count]), 1697 (caddr_t)&((((rsmrdt_recvmsg32_t *) 1698 (io_args32.msg_iov))[count])), 1699 sizeof (rsmrdt_recvmsg32_t), mode) 1700 != DDI_SUCCESS) { 1701 DERR("ioctl_recvmsgs:unable to copyout rdata"); 1702 err = EFAULT; 1703 break; 1704 } 1705 } else 1706 #endif /* _MULTI_DATAMODEL */ 1707 { 1708 rm_ptr[count].bytes_recvd = bytecount; 1709 rm_ptr[count].msglen = org_msglen; 1710 rm_ptr[count].portnum = portnum; 1711 rm_ptr[count].nodeid = nodeid; 1712 1713 if (ddi_copyout((caddr_t *)&(rm_ptr[count]), 1714 (caddr_t)&((((rdt_recvmsg_t *) 1715 (io_args.msg_iov))[count])), 1716 sizeof (rdt_recvmsg_t), mode) != DDI_SUCCESS) { 1717 DERR("ioctl_recvmsgs:unable to copyout rdata"); 1718 err = EFAULT; 1719 break; 1720 } 1721 } 1722 1723 /* 1724 * Increment the received messages count 1725 */ 1726 count++; 1727 1728 } while (OPSRSM_Q_LEN(&rp->rs_recvq) > 0 && count < nmsgs); 1729 1730 atomic_add_32(&opsrsm_pending_bytes, -total_bytes); 1731 /* 1732 * Copy out the number of messages received 1733 */ 1734 done:; 1735 #ifdef _MULTI_DATAMODEL 1736 if (model == DDI_MODEL_ILP32) { 1737 if (ddi_copyout((caddr_t)&count, 1738 (caddr_t)&(((rsmrdt_recvmsgs_arg32_t *)arg)->msgcnt), 1739 sizeof (uint32_t), mode) != DDI_SUCCESS) { 1740 DERR("ioctl_recvmsgs: unable to copyout buffer count"); 1741 err = EFAULT; 1742 } 1743 } else 1744 #endif /* _MULTI_DATAMODEL */ 1745 if (ddi_copyout((caddr_t)&count, 1746 (caddr_t)&(((rsmrdt_recvmsgs_arg_t *)arg)->msgcnt), 1747 sizeof (uint32_t), mode) != DDI_SUCCESS) { 1748 DERR("ioctl_recvmsgs: unable to copyout buffer count"); 1749 err = EFAULT; 1750 } 1751 1752 return (err); 1753 } 1754 1755 /* 1756 * This call is used to set some per-endpoint parameters. 
1757 * o per fd protection key 1758 * 1759 */ 1760 static int 1761 opsrsm_ioctl_setparam(opsrsmresource_t *rp, intptr_t arg, int mode) 1762 { 1763 void *value; 1764 rsmrdt_getsetparam_arg_t io_args; 1765 int error = RSM_SUCCESS; 1766 #ifdef _MULTI_DATAMODEL 1767 rsmrdt_getsetparam_arg32_t io_args32; 1768 1769 model_t model = ddi_model_convert_from(mode & FMODELS); 1770 1771 /* 1772 * Copy in the setparam ioctl arg structure 1773 */ 1774 if (model == DDI_MODEL_ILP32) { 1775 ddi_copyin((caddr_t)arg, (caddr_t)&io_args32, 1776 sizeof (io_args32), mode); 1777 io_args.cmd = io_args32.cmd; 1778 io_args.size = io_args32.size; 1779 } else 1780 #endif /* _MULTI_DATAMODEL */ 1781 /* 1782 * Copy in the setparam ioctl arg structure 1783 */ 1784 (void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args), 1785 mode); 1786 1787 value = (void *)kmem_zalloc(io_args.size, KM_NOSLEEP); 1788 if (value == NULL) { 1789 DERR("ioctl_setparam: kmem_zalloc failed"); 1790 return (ENOMEM); 1791 } 1792 1793 #ifdef _MULTI_DATAMODEL 1794 if (model == DDI_MODEL_ILP32) { 1795 if (ddi_copyin((caddr_t)io_args32.value, (caddr_t)value, 1796 io_args32.size, mode)) { 1797 kmem_free((void *)value, io_args32.size); 1798 DERR("ioctl_setparam: cannot copy args"); 1799 return (EFAULT); 1800 } 1801 } else 1802 #endif /* _MULTI_DATAMODEL */ 1803 if (ddi_copyin((caddr_t)io_args.value, (caddr_t)value, 1804 io_args.size, mode)) { 1805 kmem_free((void *)value, io_args.size); 1806 DERR("ioctl_setparam: cannot copy args"); 1807 return (EFAULT); 1808 } 1809 1810 switch (io_args.cmd) { 1811 case RDT_MAXMSGSIZE: 1812 /* 1813 * Buffer length must be multiple of 64 (0x40) and 1814 * must be between 64 and 64k bytes. 1815 * Add the cache line size. 
1816 */ 1817 if (((*(uint_t *)value & ~OPSRSM_CACHELINE_MASK) == 0) && 1818 (*(int *)value > 0) && 1819 (*(uint_t *)value <= OPSRSM_MAX_BUFFER_SIZE_DFLT)) { 1820 opsrsmdev->opsrsm_param.opsrsm_buffer_size = 1821 *(uint_t *)value + OPSRSM_CACHELINE_SIZE; 1822 D1("ioctl_setparam: MTU sz 0x%x", *(uint_t *)value); 1823 } else { 1824 DERR("ioctl_setparam: invalid MTU sz\n"); 1825 error = EINVAL; 1826 } 1827 break; 1828 case RDT_PROTECTION_KEY: 1829 /* 1830 * Set the pkey 1831 */ 1832 rp->rs_pkey = *(uint32_t *)value; 1833 D1("ioctl_setparam: pkey 0x%x", rp->rs_pkey); 1834 break; 1835 default: 1836 DERR("ioctl_setparam: invalid cmd\n"); 1837 error = EINVAL; 1838 } 1839 1840 kmem_free((void *)value, io_args.size); 1841 return (error); 1842 } 1843 1844 static int 1845 opsrsm_ioctl_getparam(opsrsmresource_t *rp, intptr_t arg, int mode) 1846 { 1847 void *value; 1848 rsmrdt_getsetparam_arg_t io_args; 1849 #ifdef _MULTI_DATAMODEL 1850 rsmrdt_getsetparam_arg32_t io_args32; 1851 1852 model_t model = ddi_model_convert_from(mode & FMODELS); 1853 1854 /* 1855 * Copy in the getparam ioctl arg structure 1856 */ 1857 if (model == DDI_MODEL_ILP32) { 1858 ddi_copyin((caddr_t)arg, (caddr_t)&io_args32, 1859 sizeof (io_args32), mode); 1860 io_args.cmd = io_args32.cmd; 1861 io_args.size = io_args32.size; 1862 } else 1863 #endif /* _MULTI_DATAMODEL */ 1864 /* 1865 * Copy in the getparam ioctl arg structure 1866 */ 1867 (void) ddi_copyin((caddr_t)arg, (caddr_t)&io_args, sizeof (io_args), 1868 mode); 1869 1870 value = (void *)kmem_zalloc(io_args.size, KM_NOSLEEP); 1871 if (value == NULL) { 1872 DERR("ioctl_getparam: kmem_zalloc failed"); 1873 return (ENOMEM); 1874 } 1875 1876 #ifdef _MULTI_DATAMODEL 1877 if (model == DDI_MODEL_ILP32) { 1878 if (ddi_copyin((caddr_t)io_args32.value, (caddr_t)value, 1879 io_args32.size, mode)) { 1880 kmem_free((void *)value, io_args32.size); 1881 DERR("ioctl_getparam: cannot copy args"); 1882 return (EFAULT); 1883 } 1884 } else 1885 #endif /* _MULTI_DATAMODEL */ 
1886 if (ddi_copyin((caddr_t)io_args.value, (caddr_t)value, 1887 io_args.size, mode)) { 1888 kmem_free((void *)value, io_args.size); 1889 DERR("ioctl_getparam: cannot copy args"); 1890 return (EFAULT); 1891 } 1892 1893 switch (io_args.cmd) { 1894 case RDT_MAXMSGSIZE: 1895 /* 1896 * Get the message size 1897 * Subtract the cache line size. 1898 */ 1899 #ifdef _MULTI_DATAMODEL 1900 if (model == DDI_MODEL_ILP32) { 1901 *(uint_t *)value = 1902 opsrsmdev->opsrsm_param.opsrsm_buffer_size - 1903 OPSRSM_CACHELINE_SIZE; 1904 D1("ioctl_getparam: MTU sz 0x%x", *(uint_t *)value); 1905 } else 1906 #endif /* _MULTI_DATAMODEL */ 1907 { 1908 *(size_t *)value = 1909 opsrsmdev->opsrsm_param.opsrsm_buffer_size - 1910 OPSRSM_CACHELINE_SIZE; 1911 1912 D1("ioctl_getparam: MTU sz 0x%x", *(size_t *)value); 1913 } 1914 break; 1915 case RDT_PROTECTION_KEY: 1916 /* 1917 * Get the protection key 1918 */ 1919 *(uint32_t *)value = rp->rs_pkey; 1920 1921 D1("ioctl_getparam: pkey 0x%x", *(uint32_t *)value); 1922 break; 1923 default: 1924 DERR("ioctl_getparam: invalid cmd\n"); 1925 kmem_free((void *)value, io_args.size); 1926 return (EINVAL); 1927 } 1928 1929 #ifdef _MULTI_DATAMODEL 1930 if (model == DDI_MODEL_ILP32) { 1931 if (ddi_copyout((caddr_t)value, (caddr_t)io_args32.value, 1932 io_args32.size, mode) != DDI_SUCCESS) { 1933 kmem_free((void *)value, io_args32.size); 1934 DERR("ioctl_getparam: unable to copyout value"); 1935 return (EFAULT); 1936 } 1937 } else 1938 #endif /* _MULTI_DATAMODEL */ 1939 if (ddi_copyout((caddr_t)value, (caddr_t)io_args.value, 1940 io_args.size, mode) != DDI_SUCCESS) { 1941 kmem_free((void *)value, io_args.size); 1942 DERR("ioctl_getparam: unable to copyout value"); 1943 return (EFAULT); 1944 } 1945 kmem_free((void *)value, io_args.size); 1946 return (0); 1947 } 1948 1949 /* 1950 * Free minor resource 1951 */ 1952 static int 1953 opsrsm_resstruct_free(minor_t rnum) 1954 { 1955 opsrsmresource_t *rp; 1956 opsrsm_queue_t *q; 1957 int32_t total_bytes = 0; 1958 1959 /* 
1960 * remove resource from global table 1961 */ 1962 rp = opsrsmresource_free(rnum); 1963 if (rp == NULL) { 1964 return (DDI_FAILURE); 1965 } 1966 /* 1967 * check if refcnt is 0 before we destroy rp. if it 1968 * is non-zero, we need to wait until it becomes zero. 1969 */ 1970 mutex_enter(&rp->rs_lock); 1971 while (rp->rs_refcnt > 0) { 1972 cv_wait(&rp->rs_close_cv, &rp->rs_lock); 1973 } 1974 /* 1975 * we can be sure that no other thread can increment 1976 * rp->rs_refcnt since we already removed rp from the 1977 * global table. 1978 */ 1979 ASSERT(rp->rs_refcnt == 0); 1980 mutex_exit(&rp->rs_lock); 1981 /* 1982 * at this point, no other thread has a reference to 1983 * rp. we can safely cleanup rp without holding 1984 * rs_lock. 1985 */ 1986 1987 /* 1988 * flush all remaining messages in the recvq 1989 */ 1990 q = &rp->rs_recvq; 1991 while ((q)->q_head != NULL) { 1992 mblk_t *mp; 1993 1994 mp = (q)->q_head; 1995 total_bytes += MBLKL(mp); 1996 (q)->q_head = mp->b_next; 1997 mp->b_prev = mp->b_next = NULL; 1998 mp->b_cont = NULL; 1999 freemsg(mp); 2000 } 2001 (q)->q_tail = NULL; 2002 (q)->q_len = 0; 2003 atomic_add_32(&opsrsm_pending_bytes, -total_bytes); 2004 2005 /* 2006 * if rs_dest is still valid, we need to release our 2007 * reference to it. 
2008 */ 2009 if (rp->rs_dest != NULL && !OPSRSM_IS_LOOPBACK(rp->rs_dest) && 2010 (rp->rs_state & OPSRSM_RS_FAILOVER) == 0 && 2011 (rp->rs_state & OPSRSM_RS_REFDEST) != 0) { 2012 rp->rs_state &= ~OPSRSM_RS_REFDEST; 2013 if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) { 2014 if (rp->rs_dest->rd_adapter->sel_cnt > 0) 2015 rp->rs_dest->rd_adapter->sel_cnt--; 2016 } 2017 UNREFDEST(rp->rs_dest); 2018 } 2019 2020 /* 2021 * cleanup and free the resource structure 2022 */ 2023 if (rp->rs_pollhd.ph_list != NULL) 2024 pollhead_clean(&rp->rs_pollhd); 2025 mutex_destroy(&rp->rs_lock); 2026 cv_destroy(&rp->rs_cv); 2027 cv_destroy(&rp->rs_conn_cv); 2028 cv_destroy(&rp->rs_close_cv); 2029 kmem_free((void *)rp->rs_rmptr, opsrsmdev-> 2030 opsrsm_param.opsrsm_max_recv_msgs * sizeof (rdt_recvmsg_t)); 2031 kmem_free((void *)rp, sizeof (*rp)); 2032 2033 return (DDI_SUCCESS); 2034 } 2035 2036 2037 /* 2038 * Allocate a resource struct 2039 */ 2040 static opsrsmresource_t * 2041 opsrsm_resstruct_alloc() 2042 { 2043 opsrsmresource_t *rp; 2044 2045 rp = (opsrsmresource_t *)kmem_zalloc(sizeof (*rp), KM_SLEEP); 2046 if (rp == NULL) { 2047 DERR("opsrsm_resstruct_alloc: kmem_zalloc failed"); 2048 return (NULL); 2049 } 2050 2051 rp->rs_rmptr = (void *)kmem_zalloc( 2052 opsrsmdev->opsrsm_param.opsrsm_max_recv_msgs * 2053 sizeof (rdt_recvmsg_t), KM_SLEEP); 2054 if (rp->rs_rmptr == NULL) { 2055 DERR("opsrsm_resstruct_alloc: kmem_zalloc failed"); 2056 kmem_free((void *)rp, sizeof (*rp)); 2057 return (NULL); 2058 } 2059 2060 rp->rs_events = 0; 2061 rp->rs_nodeid = 0; 2062 rp->rs_lportnum = 0; 2063 rp->rs_rportnum = 0; 2064 rp->rs_dest = NULL; 2065 rp->rs_pollhd.ph_list = NULL; 2066 rp->rs_refcnt = 0; 2067 rp->rs_state = 0; 2068 rp->rs_poll_index = -1; 2069 rp->rs_pkey = 0; 2070 rp->rs_local_skey = 0; 2071 OPSRSM_Q_INIT(&rp->rs_recvq); 2072 2073 mutex_init(&rp->rs_lock, NULL, MUTEX_DRIVER, NULL); 2074 cv_init(&rp->rs_cv, NULL, CV_DRIVER, NULL); 2075 cv_init(&rp->rs_conn_cv, NULL, CV_DRIVER, 
NULL); 2076 cv_init(&rp->rs_close_cv, NULL, CV_DRIVER, NULL); 2077 2078 return (rp); 2079 } 2080 2081 /*ARGSUSED*/ 2082 static int 2083 opsrsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, 2084 int *rvalp) 2085 { 2086 int error = RSM_SUCCESS; 2087 opsrsmresource_t *rp; 2088 minor_t rnum; 2089 2090 rnum = getminor(dev); 2091 rp = opsrsmresource_lookup(rnum, OPSRSM_RO_DEFAULT); 2092 if (rp == NULL) { 2093 return (ENXIO); 2094 } 2095 2096 D1("opsrsm_ioctl: rnum = %d ; rp = %p", rnum, rp); 2097 2098 switch (cmd) { 2099 case RSMRDT_IOCTL_BIND: 2100 error = opsrsm_ioctl_bind(rp, arg, mode); 2101 break; 2102 case RSMRDT_IOCTL_CONNECT: 2103 error = opsrsm_ioctl_connect(rp, arg, mode); 2104 break; 2105 case RSMRDT_IOCTL_SENDMSG: 2106 error = opsrsm_ioctl_sendmsg(rp, arg, mode); 2107 break; 2108 case RSMRDT_IOCTL_RECVMSGS: 2109 error = opsrsm_ioctl_recvmsgs(rp, arg, mode); 2110 break; 2111 case RSMRDT_IOCTL_GETPARAM: 2112 error = opsrsm_ioctl_getparam(rp, arg, mode); 2113 break; 2114 case RSMRDT_IOCTL_SETPARAM: 2115 error = opsrsm_ioctl_setparam(rp, arg, mode); 2116 break; 2117 case RSMRDT_IOCTL_GETNODEID: 2118 error = opsrsm_ioctl_getnodeid(rp, arg, mode); 2119 break; 2120 default: 2121 DERR("opsrsm_ioctl: cmd not supported\n"); 2122 error = DDI_FAILURE; 2123 } 2124 return (error); 2125 } 2126 2127 /* ********************* Driver Open/Close/Poll *************** */ 2128 2129 /* ARGSUSED */ 2130 static int 2131 opsrsm_open(dev_t *devp, int flag, int otyp, struct cred *cred) 2132 { 2133 minor_t rnum; 2134 opsrsmresource_t *rp; 2135 2136 /* 2137 * Char only 2138 */ 2139 if (otyp != OTYP_CHR) { 2140 return (EINVAL); 2141 } 2142 2143 rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER); 2144 if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) { 2145 DERR("opsrsm_open: Unloading in progress"); 2146 rw_exit(&opsrsm_resource.opsrsmrct_lock); 2147 return (ENODEV); 2148 } 2149 rw_exit(&opsrsm_resource.opsrsmrct_lock); 2150 2151 /* 2152 * Only zero can be 
opened, clones are used for resources. 2153 */ 2154 if (getminor(*devp) != OPSRSM_DRIVER_MINOR) { 2155 DERR("opsrsm_open: bad minor %d\n", getminor(*devp)); 2156 return (ENODEV); 2157 } 2158 2159 /* 2160 * - allocate new minor number 2161 * - update devp argument to new device 2162 */ 2163 if ((rp = opsrsmresource_alloc(&rnum)) != NULL) { 2164 *devp = makedevice(getmajor(*devp), rnum); 2165 rp->rs_lportnum = rnum; 2166 } else { 2167 return (ENOMEM); 2168 } 2169 2170 return (DDI_SUCCESS); 2171 } 2172 2173 /* ARGSUSED */ 2174 static int 2175 opsrsm_close(dev_t dev, int flag, int otyp, struct cred *cred) 2176 { 2177 minor_t rnum = getminor(dev); 2178 2179 /* 2180 * Char only 2181 */ 2182 if (otyp != OTYP_CHR) { 2183 return (EINVAL); 2184 } 2185 D1("opsrsm_close: rnum = %d", rnum); 2186 2187 /* 2188 * remove resource from resource table and destroy resource 2189 */ 2190 if (opsrsm_resstruct_free(rnum) != DDI_SUCCESS) { 2191 DERR("opsrsm_close: cannot free resource structure\n"); 2192 return (DDI_FAILURE); 2193 } 2194 return (DDI_SUCCESS); 2195 } 2196 2197 /*ARGSUSED*/ 2198 static int 2199 opsrsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp, 2200 struct pollhead **phpp) 2201 { 2202 opsrsmresource_t *rp; 2203 minor_t rnum; 2204 int error = 0; 2205 2206 rnum = getminor(dev); 2207 rp = opsrsmresource_lookup(rnum, OPSRSM_RO_DEFAULT); 2208 if (rp == NULL) { 2209 return (ENXIO); 2210 } 2211 2212 D1("opsrsm_chpoll: rnum = %d : rp = %p\n", rnum, rp); 2213 2214 *reventsp = 0; 2215 2216 /* 2217 * Valid device events are: 2218 * POLLIN | POLLRDNORM | POLLOUT | POLLERR 2219 */ 2220 if ((events & POLLIN) != 0) { 2221 mutex_enter(&rp->rs_lock); 2222 if ((events & POLLOUT) != 0) { 2223 error = ENOTSUP; 2224 mutex_exit(&rp->rs_lock); 2225 goto done; 2226 } 2227 if ((rp->rs_events & POLLIN) != 0 || 2228 OPSRSM_Q_LEN(&rp->rs_recvq) > 0) { 2229 *reventsp = POLLIN; 2230 } else { 2231 if (!anyyet) { 2232 *phpp = &rp->rs_pollhd; 2233 } 2234 } 2235 mutex_exit(&rp->rs_lock); 
2236 } else if ((events & POLLOUT) != 0) { 2237 boolean_t not_full; 2238 opsrsm_dest_t *rd; 2239 int isdel; 2240 2241 if (OPSRSM_IS_LOOPBACK(rp->rs_dest)) { 2242 *reventsp = POLLOUT; 2243 goto done; 2244 } 2245 if (rp->rs_dest == NULL) { 2246 error = ENOTCONN; 2247 goto done; 2248 } 2249 2250 mutex_enter(&rp->rs_lock); 2251 if ((rp->rs_state & OPSRSM_RS_FAILOVER) != 0) { 2252 *reventsp = POLLOUT; 2253 mutex_exit(&rp->rs_lock); 2254 goto done; 2255 } 2256 rd = rp->rs_dest; 2257 REFDEST(rd, isdel); 2258 if (isdel != 0) { 2259 error = ENETDOWN; 2260 mutex_exit(&rp->rs_lock); 2261 goto done; 2262 } 2263 mutex_exit(&rp->rs_lock); 2264 2265 mutex_enter(&rd->rd_sendq_lock); 2266 not_full = (OPSRSM_Q_LEN(&rd->rd_sendq) < opsrsmdev-> 2267 opsrsm_param.opsrsm_max_queued_pkts); 2268 mutex_exit(&rd->rd_sendq_lock); 2269 2270 if (not_full) { 2271 *reventsp = POLLOUT; 2272 } else { 2273 if (!anyyet) { 2274 if (rd->rd_dstate == 0) 2275 *phpp = &rd->rd_pollhd; 2276 } 2277 } 2278 UNREFDEST(rd); 2279 } else if ((events & POLLERR) != 0) { 2280 error = ENOTSUP; 2281 } 2282 2283 done:; 2284 return (error); 2285 } 2286 2287 2288 2289 /* 2290 * Undo tasks done by opsrsmattach(), either because we're detaching or because 2291 * attach() got partly done then failed. progress is a bitmap that tells 2292 * us what has been done so far. 
2293 */ 2294 static void 2295 opsrsmtakedown( 2296 opsrsm_t *opsrsmp, /* OPSRSM device (RSM controller) pointer */ 2297 int progress) /* Mask of RSMPI_ATT_xxx values */ 2298 { 2299 int instance; 2300 dev_info_t *dip; 2301 2302 D1("opsrsmtakedown: opsrsmp 0x%p, progress 0x%x", 2303 (void *)opsrsmp, progress); 2304 2305 ASSERT(opsrsmp); 2306 2307 dip = opsrsmp->opsrsm_dip; 2308 instance = ddi_get_instance(dip); 2309 2310 if (progress & OPSRSM_ATT_KSTAT) { 2311 opsrsmkstatremove(opsrsmp); 2312 progress &= ~OPSRSM_ATT_KSTAT; 2313 } 2314 2315 if (progress & OPSRSM_ATT_MINOR) { 2316 ddi_remove_minor_node(dip, NULL); 2317 progress &= ~OPSRSM_ATT_MINOR; 2318 } 2319 2320 ASSERT(progress == 0); 2321 2322 ddi_soft_state_free(opsrsm_state, instance); 2323 2324 D1("opsrsmtakedown: returning DDI_SUCCESS"); 2325 } 2326 2327 2328 /* 2329 * **************************************************************** 2330 * * 2331 * E N D BASIC MODULE BOILERPLATE * 2332 * * 2333 * **************************************************************** 2334 */ 2335 2336 2337 /* 2338 * **************************************************************** 2339 * * 2340 * B E G I N STATUS REPORTING STUFF * 2341 * * 2342 * **************************************************************** 2343 */ 2344 2345 /* 2346 * This routine makes the data in our kernel statistics structure reflect 2347 * the current state of the device; it's called whenever a user requests 2348 * the kstat data. Basically, all we do is copy the stats from the RSMPI 2349 * controller structure, where they're maintained, to the kstat's data 2350 * portion. 
2351 */ 2352 static int 2353 opsrsmstat_kstat_update( 2354 kstat_t *ksp, /* Pointer to kstat that will be updated */ 2355 int rw) /* Indicates read or write (we don't support write) */ 2356 { 2357 opsrsm_t *opsrsmp; 2358 opsrsm_stat_t *opsrsmsp; 2359 2360 if (rw == KSTAT_WRITE) 2361 return (EACCES); 2362 2363 opsrsmp = (opsrsm_t *)ksp->ks_private; 2364 opsrsmsp = (opsrsm_stat_t *)ksp->ks_data; 2365 2366 opsrsmsp->rsm_ipackets.value.ul = (uint_t)opsrsmp->opsrsm_ipackets; 2367 opsrsmsp->rsm_ipackets64.value.ui64 = opsrsmp->opsrsm_ipackets; 2368 opsrsmsp->rsm_ierrors.value.ul = opsrsmp->opsrsm_ierrors; 2369 opsrsmsp->rsm_opackets.value.ul = (uint_t)opsrsmp->opsrsm_opackets; 2370 opsrsmsp->rsm_opackets64.value.ui64 = opsrsmp->opsrsm_opackets; 2371 opsrsmsp->rsm_oerrors.value.ul = opsrsmp->opsrsm_oerrors; 2372 opsrsmsp->rsm_collisions.value.ul = opsrsmp->opsrsm_collisions; 2373 2374 opsrsmsp->rsm_xfers.value.ul = opsrsmp->opsrsm_xfers; 2375 opsrsmsp->rsm_xfer_pkts.value.ul = opsrsmp->opsrsm_xfer_pkts; 2376 opsrsmsp->rsm_syncdqes.value.ul = opsrsmp->opsrsm_syncdqes; 2377 opsrsmsp->rsm_lbufs.value.ul = opsrsmp->opsrsm_lbufs; 2378 opsrsmsp->rsm_nlbufs.value.ul = opsrsmp->opsrsm_nlbufs; 2379 opsrsmsp->rsm_pullup.value.ul = opsrsmp->opsrsm_pullup; 2380 opsrsmsp->rsm_pullup_fail.value.ul = opsrsmp->opsrsm_pullup_fail; 2381 opsrsmsp->rsm_starts.value.ul = opsrsmp->opsrsm_starts; 2382 opsrsmsp->rsm_start_xfers.value.ul = opsrsmp->opsrsm_start_xfers; 2383 opsrsmsp->rsm_fqetmo_hint.value.ul = opsrsmp->opsrsm_fqetmo_hint; 2384 opsrsmsp->rsm_fqetmo_drops.value.ul = opsrsmp->opsrsm_fqetmo_drops; 2385 opsrsmsp->rsm_no_fqes.value.ul = opsrsmp->opsrsm_no_fqes; 2386 opsrsmsp->rsm_pending_writes.value.ul = opsrsmp->opsrsm_pending_writes; 2387 opsrsmsp->rsm_pkts_queued.value.ul = opsrsmp->opsrsm_pkts_queued; 2388 opsrsmsp->rsm_pkts_discarded.value.ul = opsrsmp->opsrsm_pkts_discarded; 2389 opsrsmsp->rsm_pkts_pending.value.ul = opsrsmp->opsrsm_pkts_pending; 2390 
opsrsmsp->rsm_last_sendq_len.value.ul = opsrsmp->opsrsm_last_sendq_len; 2391 opsrsmsp->rsm_last_pendq_len.value.ul = opsrsmp->opsrsm_last_pendq_len; 2392 opsrsmsp->rsm_last_wr_comp.value.ul = opsrsmp->opsrsm_last_wr_comp; 2393 opsrsmsp->rsm_errs.value.ul = opsrsmp->opsrsm_errs; 2394 opsrsmsp->rsm_in_bytes.value.ul = (uint_t)opsrsmp->opsrsm_in_bytes; 2395 opsrsmsp->rsm_in_bytes64.value.ui64 = opsrsmp->opsrsm_in_bytes; 2396 opsrsmsp->rsm_out_bytes.value.ul = (uint_t)opsrsmp->opsrsm_out_bytes; 2397 opsrsmsp->rsm_out_bytes64.value.ui64 = opsrsmp->opsrsm_out_bytes; 2398 opsrsmsp->rsm_intr_send_errs.value.ul = opsrsmp->opsrsm_intr_send_errs; 2399 opsrsmsp->rsm_max_batch_size.value.ul = opsrsmp->opsrsm_max_batch_size; 2400 opsrsmsp->rsm_min_batch_size.value.ul = opsrsmp->opsrsm_min_batch_size; 2401 opsrsmsp->rsm_put_fqes.value.ul = opsrsmp->opsrsm_put_fqes; 2402 opsrsmsp->rsm_queued_fqes.value.ul = opsrsmp->opsrsm_queued_fqes; 2403 opsrsmsp->rsm_packets_consumed.value.ul = 2404 opsrsmp->opsrsm_packets_consumed; 2405 return (0); 2406 } 2407 2408 /* 2409 * This routine initializes the kernel statistics structures for an 2410 * OPSRSM device. 2411 */ 2412 static void 2413 opsrsmkstatinit( 2414 opsrsm_t *opsrsmp) /* OPSRSM device (RSM controller) pointer */ 2415 { 2416 struct kstat *ksp; 2417 opsrsm_stat_t *opsrsmsp; 2418 2419 /* 2420 * We create a kstat for the device, then create a whole bunch of 2421 * named stats inside that first kstat. 2422 */ 2423 if ((ksp = kstat_create("rsmrdt", ddi_get_instance(opsrsmp->opsrsm_dip), 2424 NULL, "net", KSTAT_TYPE_NAMED, sizeof (opsrsm_stat_t) / 2425 sizeof (kstat_named_t), 0)) == NULL) { 2426 opsrsmerror(opsrsmp->opsrsm_dip, "kstat_create failed"); 2427 return; 2428 } 2429 opsrsmsp = (opsrsm_stat_t *)(ksp->ks_data); 2430 2431 /* 2432 * The first five named stats we create have well-known names, and are 2433 * used by standard SunOS utilities (e.g., netstat). 
(There is actually 2434 * a sixth well-known stat, called "queue", which we don't support.) 2435 */ 2436 kstat_named_init(&opsrsmsp->rsm_ipackets, "ipackets", KSTAT_DATA_ULONG); 2437 kstat_named_init(&opsrsmsp->rsm_ierrors, "ierrors", KSTAT_DATA_ULONG); 2438 kstat_named_init(&opsrsmsp->rsm_opackets, "opackets", KSTAT_DATA_ULONG); 2439 kstat_named_init(&opsrsmsp->rsm_oerrors, "oerrors", KSTAT_DATA_ULONG); 2440 kstat_named_init(&opsrsmsp->rsm_collisions, "collisions", 2441 KSTAT_DATA_ULONG); 2442 2443 /* 2444 * MIB II kstat variables 2445 */ 2446 kstat_named_init(&opsrsmsp->rsm_in_bytes, "rbytes", KSTAT_DATA_ULONG); 2447 kstat_named_init(&opsrsmsp->rsm_out_bytes, "obytes", KSTAT_DATA_ULONG); 2448 2449 /* 2450 * PSARC 1997/198 2451 */ 2452 kstat_named_init(&opsrsmsp->rsm_ipackets64, "ipackets64", 2453 KSTAT_DATA_ULONGLONG); 2454 kstat_named_init(&opsrsmsp->rsm_opackets64, "opackets64", 2455 KSTAT_DATA_ULONGLONG); 2456 kstat_named_init(&opsrsmsp->rsm_in_bytes64, "rbytes64", 2457 KSTAT_DATA_ULONGLONG); 2458 kstat_named_init(&opsrsmsp->rsm_out_bytes64, "obytes64", 2459 KSTAT_DATA_ULONGLONG); 2460 2461 2462 /* 2463 * The remainder of the named stats are specific to our driver, and 2464 * are extracted using the kstat utility. 
2465 */ 2466 kstat_named_init(&opsrsmsp->rsm_xfers, "xfers", KSTAT_DATA_ULONG); 2467 kstat_named_init(&opsrsmsp->rsm_xfer_pkts, "xfer_pkts", 2468 KSTAT_DATA_ULONG); 2469 kstat_named_init(&opsrsmsp->rsm_syncdqes, "syncdqes", KSTAT_DATA_ULONG); 2470 kstat_named_init(&opsrsmsp->rsm_lbufs, "lbufs", KSTAT_DATA_ULONG); 2471 kstat_named_init(&opsrsmsp->rsm_nlbufs, "nlbufs", KSTAT_DATA_ULONG); 2472 kstat_named_init(&opsrsmsp->rsm_pullup, "pullup", KSTAT_DATA_ULONG); 2473 kstat_named_init(&opsrsmsp->rsm_pullup_fail, "pullup_fail", 2474 KSTAT_DATA_ULONG); 2475 kstat_named_init(&opsrsmsp->rsm_starts, "starts", KSTAT_DATA_ULONG); 2476 kstat_named_init(&opsrsmsp->rsm_start_xfers, "start_xfers", 2477 KSTAT_DATA_ULONG); 2478 kstat_named_init(&opsrsmsp->rsm_fqetmo_hint, "fqetmo_hint", 2479 KSTAT_DATA_ULONG); 2480 kstat_named_init(&opsrsmsp->rsm_fqetmo_drops, "fqetmo_drops", 2481 KSTAT_DATA_ULONG); 2482 kstat_named_init(&opsrsmsp->rsm_no_fqes, "no_fqes", 2483 KSTAT_DATA_ULONG); 2484 kstat_named_init(&opsrsmsp->rsm_pending_writes, "pending_writes", 2485 KSTAT_DATA_ULONG); 2486 kstat_named_init(&opsrsmsp->rsm_pkts_queued, "pkts_queued", 2487 KSTAT_DATA_ULONG); 2488 kstat_named_init(&opsrsmsp->rsm_pkts_discarded, "pkts_discarded", 2489 KSTAT_DATA_ULONG); 2490 kstat_named_init(&opsrsmsp->rsm_pkts_pending, "pkts_pending", 2491 KSTAT_DATA_ULONG); 2492 kstat_named_init(&opsrsmsp->rsm_last_sendq_len, "last_sendq_len", 2493 KSTAT_DATA_ULONG); 2494 kstat_named_init(&opsrsmsp->rsm_last_pendq_len, "last_pendq_len", 2495 KSTAT_DATA_ULONG); 2496 kstat_named_init(&opsrsmsp->rsm_last_wr_comp, "last_wr_comp", 2497 KSTAT_DATA_ULONG); 2498 kstat_named_init(&opsrsmsp->rsm_errs, "errs", KSTAT_DATA_ULONG); 2499 kstat_named_init(&opsrsmsp->rsm_intr_send_errs, "intr_send_errs", 2500 KSTAT_DATA_ULONG); 2501 2502 kstat_named_init(&opsrsmsp->rsm_min_batch_size, "min_batch_size", 2503 KSTAT_DATA_ULONG); 2504 kstat_named_init(&opsrsmsp->rsm_max_batch_size, "max_batch_size", 2505 KSTAT_DATA_ULONG); 2506 
kstat_named_init(&opsrsmsp->rsm_put_fqes, "put_fqes", 2507 KSTAT_DATA_ULONG); 2508 kstat_named_init(&opsrsmsp->rsm_queued_fqes, "queued_fqes", 2509 KSTAT_DATA_ULONG); 2510 kstat_named_init(&opsrsmsp->rsm_packets_consumed, "packets_consumed", 2511 KSTAT_DATA_ULONG); 2512 2513 ksp->ks_update = opsrsmstat_kstat_update; 2514 ksp->ks_private = (void *) opsrsmp; 2515 opsrsmp->opsrsm_ksp = ksp; 2516 kstat_install(ksp); 2517 2518 } 2519 2520 /* 2521 * This routine removes any kstats we might have created. 2522 */ 2523 static void 2524 opsrsmkstatremove( 2525 opsrsm_t *opsrsmp) /* OPSRSM device (RSM controller) pointer */ 2526 { 2527 2528 if (opsrsmp->opsrsm_ksp) 2529 kstat_delete(opsrsmp->opsrsm_ksp); 2530 } 2531 2532 /* 2533 * Print an error message to the console. 2534 */ 2535 static void 2536 opsrsmerror( 2537 dev_info_t *dip, /* Dev info for the device in question */ 2538 const char *fmt, /* Format of output */ 2539 ...) /* Parameters for output */ 2540 { 2541 char name[16]; 2542 char buff[1024]; 2543 va_list ap; 2544 2545 if (dip) { 2546 (void) sprintf(name, "%s%d", ddi_get_name(dip), 2547 ddi_get_instance(dip)); 2548 } else { 2549 (void) sprintf(name, "opsrsm"); 2550 } 2551 2552 /* lint -e40 Undeclared identifier (__builtin_va_alist) */ 2553 va_start(ap, fmt); 2554 /* lint +e40 */ 2555 (void) vsprintf(buff, fmt, ap); 2556 va_end(ap); 2557 2558 D1("%s:\t%s", name, buff); 2559 #ifdef DEBUG 2560 cmn_err(CE_CONT, "%s:\t%s", name, buff); 2561 #endif /* DEBUG */ 2562 } 2563 2564 2565 #ifdef DEBUG 2566 2567 /* 2568 * The following variables support the debug log buffer scheme. 
2569 */ 2570 2571 char opsrsmdbgbuf[0x80000]; /* The log buffer */ 2572 int opsrsmdbgsize = sizeof (opsrsmdbgbuf); /* Size of the log buffer */ 2573 size_t opsrsmdbgnext; /* Next byte to write in buffer (note */ 2574 /* this is an index, not a pointer */ 2575 int opsrsmdbginit = 0; /* Nonzero if opsrsmdbglock's inited */ 2576 kmutex_t opsrsmdbglock; 2577 2578 /* 2579 * Add the string str to the end of the debug log, followed by a newline. 2580 */ 2581 static void 2582 opsrsmdbglog(char *str) 2583 { 2584 size_t length, remlen; 2585 2586 /* 2587 * If this is the first time we've written to the log, initialize it. 2588 */ 2589 if (!opsrsmdbginit) { 2590 mutex_enter(&opsrsmattlock); 2591 if (!opsrsmdbginit) { 2592 mutex_init(&opsrsmdbglock, NULL, MUTEX_DRIVER, 2593 NULL); 2594 bzero(opsrsmdbgbuf, sizeof (opsrsmdbgbuf)); 2595 opsrsmdbgnext = 0; 2596 opsrsmdbginit = 1; 2597 } 2598 mutex_exit(&opsrsmattlock); 2599 } 2600 2601 mutex_enter(&opsrsmdbglock); 2602 2603 /* 2604 * Note the log is circular; if this string would run over the end, 2605 * we copy the first piece to the end and then the last piece to 2606 * the beginning of the log. 2607 */ 2608 length = strlen(str); 2609 2610 remlen = (size_t)sizeof (opsrsmdbgbuf) - opsrsmdbgnext; 2611 2612 if (length > remlen) { 2613 if (remlen) 2614 bcopy(str, opsrsmdbgbuf + opsrsmdbgnext, remlen); 2615 str += remlen; 2616 length -= remlen; 2617 opsrsmdbgnext = 0; 2618 } 2619 2620 bcopy(str, opsrsmdbgbuf + opsrsmdbgnext, length); 2621 opsrsmdbgnext += length; 2622 2623 if (opsrsmdbgnext >= sizeof (opsrsmdbgbuf)) 2624 opsrsmdbgnext = 0; 2625 opsrsmdbgbuf[opsrsmdbgnext++] = '\n'; 2626 2627 mutex_exit(&opsrsmdbglock); 2628 } 2629 2630 2631 /* 2632 * Add a printf-style message to whichever debug logs we're currently using. 2633 */ 2634 static void 2635 opsrsmdebug(const char *fmt, ...) 
2636 { 2637 char buff[512]; 2638 va_list ap; 2639 2640 /*lint -e40 Undeclared identifier (__builtin_va_alist) */ 2641 va_start(ap, fmt); 2642 /*lint +e40 */ 2643 (void) vsprintf(buff, fmt, ap); 2644 va_end(ap); 2645 2646 if (opsrsmdbgmode & 0x1) 2647 opsrsmdbglog(buff); 2648 if (opsrsmdbgmode & 0x2) 2649 cmn_err(CE_CONT, "%s\n", buff); 2650 } 2651 2652 static void 2653 opsrsmconsole(const char *fmt, ...) 2654 { 2655 char buff[512]; 2656 va_list ap; 2657 2658 /*lint -e40 Undeclared identifier (__builtin_va_alist) */ 2659 va_start(ap, fmt); 2660 /*lint +e40 */ 2661 (void) vsprintf(buff, fmt, ap); 2662 va_end(ap); 2663 2664 cmn_err(CE_CONT, "%s", buff); 2665 } 2666 2667 #endif 2668 2669 2670 /* 2671 * **************************************************************** 2672 * * 2673 * E N D STATUS REPORTING STUFF * 2674 * * 2675 * **************************************************************** 2676 */ 2677 2678 2679 /* 2680 * **************************************************************** 2681 * * 2682 * B E G I N BASIC STREAMS OPERATIONS * 2683 * * 2684 * **************************************************************** 2685 */ 2686 2687 2688 /* 2689 * Write service routine. This routine processes any messages put on the queue 2690 * via a putq() in the write put routine. It also handles any destinations put 2691 * on the destination run queue. 
2692 */ 2693 static void 2694 opsrsmwsrv(void *arg) 2695 { 2696 adapter_t *adapterp; 2697 opsrsm_t *opsrsmp = opsrsmdev; 2698 opsrsm_dest_t *rd; 2699 int isdel = 0; 2700 2701 D5("opsrsmwsrv: time 0x%llx", gethrtime()); 2702 2703 adapterp = (adapter_t *)arg; 2704 2705 /* 2706 * rd's refcnt is incremented by GETRUNQ 2707 */ 2708 GETRUNQ(rd, isdel, adapterp); 2709 while (rd) { 2710 int oldstate, delete; 2711 2712 if (isdel) { 2713 D2("opsrsmwsrv: dest 0x%p being deleted, ignored", 2714 (void *)rd); 2715 GETRUNQ(rd, isdel, adapterp); 2716 continue; 2717 } 2718 2719 mutex_enter(&rd->rd_lock); 2720 delete = 0; 2721 2722 oldstate = opsrsmgetstate(rd); 2723 D5("opsrsmwsrv: running state %s time 0x%llx", 2724 OPSRSM_STATE_STR(oldstate), gethrtime()); 2725 switch (oldstate) { 2726 2727 case OPSRSM_STATE_S_XFER: { 2728 cmn_err(CE_PANIC, "impossible state\n"); 2729 break; 2730 } 2731 2732 case OPSRSM_STATE_S_REQ_CONNECT: { 2733 if (opsrsmcrexfer(opsrsmp, rd) != 0 || 2734 opsrsmsconn(opsrsmp, rd, 0) != 0) { 2735 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2736 delete = 1; 2737 } 2738 break; 2739 } 2740 2741 case OPSRSM_STATE_S_NEWCONN: { 2742 if (opsrsmcrexfer(opsrsmp, rd) != 0 || 2743 opsrsmconnxfer(opsrsmp, rd) != 0 || 2744 opsrsmsaccept(opsrsmp, rd) != 0) { 2745 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2746 delete = 1; 2747 } 2748 break; 2749 } 2750 2751 case OPSRSM_STATE_S_CONNXFER_ACCEPT: { 2752 if (opsrsmconnxfer(opsrsmp, rd) != 0 || 2753 opsrsmsaccept(opsrsmp, rd) != 0) { 2754 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2755 delete = 1; 2756 } 2757 break; 2758 } 2759 2760 case OPSRSM_STATE_S_CONNXFER_ACK: { 2761 if (opsrsmconnxfer(opsrsmp, rd) != 0 || 2762 opsrsmsack(rd) != 0) { 2763 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2764 delete = 1; 2765 } 2766 break; 2767 } 2768 2769 /* 2770 * Delete this connection. This causes a message 2771 * to be sent to the remote side when RSM_SENDQ_DESTROY 2772 * is called, so there is no need to send an additional 2773 * message. 
2774 */ 2775 case OPSRSM_STATE_S_DELETE: { 2776 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2777 delete = 1; 2778 break; 2779 } 2780 2781 /* 2782 * Retry the SCONN. 2783 */ 2784 case OPSRSM_STATE_S_SCONN: { 2785 if (opsrsmsconn(opsrsmp, rd, 1) != 0) { 2786 opsrsmsetstate(rd, OPSRSM_STATE_DELETING); 2787 delete = 1; 2788 } 2789 break; 2790 } 2791 2792 default: 2793 D1("opsrsm: bad state %s in wsrv " 2794 " for dest 0x%lx", OPSRSM_STATE_STR(oldstate), 2795 (uintptr_t)rd); 2796 cmn_err(CE_PANIC, "opsrsm: bad state %s in wsrv " 2797 " for dest 0x%lx", OPSRSM_STATE_STR(oldstate), 2798 (uintptr_t)rd); 2799 break; 2800 } 2801 2802 mutex_exit(&rd->rd_lock); 2803 2804 if (delete) 2805 (void) opsrsmfreedest(adapterp, rd->rd_rsm_addr); 2806 2807 UNREFDEST(rd); 2808 2809 GETRUNQ(rd, isdel, adapterp); 2810 } 2811 2812 D1("opsrsmwsrv: returning"); 2813 } 2814 2815 2816 2817 2818 /* 2819 * **************************************************************** 2820 * * 2821 * E N D BASIC STREAMS OPERATIONS * 2822 * * 2823 * **************************************************************** 2824 */ 2825 2826 2827 /* 2828 * **************************************************************** 2829 * * 2830 * B E G I N NEW DATA TRANSFER LOGIC * 2831 * * 2832 * **************************************************************** 2833 */ 2834 2835 static int 2836 opsrsm_start_batch(opsrsm_dest_t *rd, uint32_t start_time) 2837 { 2838 int err = 0; 2839 2840 switch (rd->rd_xmit_state) { 2841 case OPSRSM_XMIT_BARRIER_CLOSED: 2842 case OPSRSM_XMIT_RETRY_DATA: 2843 err = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj, 2844 rd->rd_rxferhand, &rd->rd_barrier); 2845 ASSERT(err == RSM_SUCCESS); 2846 if (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_CLOSED) { 2847 rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_OPENED; 2848 rd->rd_start_time = start_time; 2849 rd->rd_data_collected = 0; 2850 rd->rd_writes_completed = 0; 2851 err = 0; 2852 } 2853 break; 2854 default: 2855 cmn_err(CE_PANIC, "invalid state = %d\n", 
rd->rd_xmit_state); 2856 break; 2857 } 2858 return (err); 2859 } 2860 2861 static int 2862 opsrsm_end_batch(opsrsm_dest_t *rd) 2863 { 2864 int err = 0; 2865 uint32_t qlen = 0; 2866 2867 switch (rd->rd_xmit_state) { 2868 case OPSRSM_XMIT_BARRIER_OPENED: 2869 case OPSRSM_XMIT_RETRY_DATA: 2870 err = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, 2871 &rd->rd_barrier); 2872 if (err != RSM_SUCCESS) { 2873 rd->rd_xmit_state = OPSRSM_XMIT_RETRY_DATA; 2874 break; 2875 } 2876 qlen = (uint32_t)OPSRSM_Q_LEN(&rd->rd_pendq); 2877 opsrsmputdqes(rd); 2878 2879 mutex_enter(&rd->rd_freeq_lock); 2880 OPSRSM_Q_CONCAT(&rd->rd_freeq, &rd->rd_pendq); 2881 mutex_exit(&rd->rd_freeq_lock); 2882 2883 opsrsmdev->opsrsm_xfers++; 2884 opsrsmdev->opsrsm_xfer_pkts += qlen; 2885 opsrsmdev->opsrsm_max_batch_size = 2886 max(opsrsmdev->opsrsm_max_batch_size, qlen); 2887 2888 if (opsrsmdev->opsrsm_min_batch_size == 0) { 2889 opsrsmdev->opsrsm_min_batch_size = qlen; 2890 } else { 2891 opsrsmdev->opsrsm_min_batch_size = 2892 min(opsrsmdev->opsrsm_min_batch_size, qlen); 2893 } 2894 2895 rd->rd_start_time = 0; 2896 rd->rd_data_collected = 0; 2897 rd->rd_writes_completed = 0; 2898 rd->rd_nretries = 0; 2899 rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_CLOSED; 2900 break; 2901 case OPSRSM_XMIT_DISCONNECTED: 2902 err = RSM_SUCCESS; 2903 break; 2904 default: 2905 cmn_err(CE_PANIC, "invalid state = %d\n", rd->rd_xmit_state); 2906 break; 2907 } 2908 2909 if (err != RSM_SUCCESS) { 2910 opsrsmdev->opsrsm_collisions++; 2911 if (++rd->rd_nretries > opsrsmdev-> 2912 opsrsm_param.opsrsm_retry_limit) { 2913 rd->rd_nretries = 0; 2914 err = ENETDOWN; 2915 } else { 2916 opsrsm_set_xmit_tmo(rd, opsrsmdev->opsrsm_param. 
2917 opsrsm_retry_delay); 2918 } 2919 } else { 2920 err = 0; 2921 } 2922 return (err); 2923 } 2924 2925 static void 2926 opsrsm_dispatch_tmo(void *arg) 2927 { 2928 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 2929 2930 /* keep rescheduling itself until opsrsmxmit_thread is dispatched */ 2931 if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, opsrsmxmit_thread, 2932 rd, KM_NOSLEEP) == 0) { 2933 (void) timeout(opsrsm_dispatch_tmo, rd, 1); 2934 } 2935 } 2936 2937 static void 2938 opsrsm_xmit_tmo(void *arg) 2939 { 2940 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 2941 2942 mutex_enter(&rd->rd_tmo_lock); 2943 if (rd->rd_xmit_tmo_id == 0) { 2944 mutex_exit(&rd->rd_tmo_lock); 2945 return; 2946 } 2947 if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, opsrsmxmit_thread, 2948 rd, KM_NOSLEEP) == 0) { 2949 rd->rd_xmit_tmo_id = timeout(opsrsm_xmit_tmo, rd, 2950 rd->rd_xmit_tmo_int); 2951 } else { 2952 rd->rd_xmit_tmo_id = 0; 2953 rd->rd_xmit_tmo_int = 0; 2954 } 2955 mutex_exit(&rd->rd_tmo_lock); 2956 } 2957 2958 static void 2959 opsrsm_set_xmit_tmo(opsrsm_dest_t *rd, int interval) 2960 { 2961 int isdel = 0; 2962 2963 mutex_enter(&rd->rd_tmo_lock); 2964 if (rd->rd_xmit_tmo_id != 0) { 2965 goto out; 2966 } 2967 REFDEST(rd, isdel); 2968 if (isdel != 0) goto out; 2969 rd->rd_xmit_tmo_int = interval; 2970 rd->rd_xmit_tmo_id = timeout(opsrsm_xmit_tmo, rd, rd->rd_xmit_tmo_int); 2971 out:; 2972 mutex_exit(&rd->rd_tmo_lock); 2973 } 2974 2975 static void 2976 opsrsm_cancel_xmit_tmo(opsrsm_dest_t *rd) 2977 { 2978 timeout_id_t tmoid; 2979 2980 mutex_enter(&rd->rd_tmo_lock); 2981 if (rd->rd_xmit_tmo_id == 0) { 2982 mutex_exit(&rd->rd_tmo_lock); 2983 return; 2984 } 2985 UNREFDEST(rd); 2986 tmoid = rd->rd_xmit_tmo_id; 2987 rd->rd_xmit_tmo_id = 0; 2988 mutex_exit(&rd->rd_tmo_lock); 2989 (void) untimeout(tmoid); 2990 } 2991 2992 static void 2993 opsrsm_wake_senders(opsrsm_dest_t *rd, short events) 2994 { 2995 pollwakeup(&rd->rd_pollhd, events); 2996 } 2997 2998 static void 2999 opsrsm_fqe_tmo(void *arg) 
3000 { 3001 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 3002 3003 mutex_enter(&rd->rd_tmo_lock); 3004 if (rd->rd_fqe_tmo_id == 0) { 3005 mutex_exit(&rd->rd_tmo_lock); 3006 return; 3007 } 3008 if (opsrsmavailfqe(rd)) { 3009 if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, 3010 opsrsmxmit_thread, rd, KM_NOSLEEP) != 0) { 3011 rd->rd_fqe_tmo_id = 0; 3012 rd->rd_fqe_tmo_int = 0; 3013 mutex_exit(&rd->rd_tmo_lock); 3014 return; 3015 } 3016 } 3017 rd->rd_fqe_tmo_id = timeout(opsrsm_fqe_tmo, rd, rd->rd_fqe_tmo_int); 3018 mutex_exit(&rd->rd_tmo_lock); 3019 } 3020 3021 static void 3022 opsrsm_set_fqe_tmo(opsrsm_dest_t *rd, int interval) 3023 { 3024 int isdel = 0; 3025 3026 mutex_enter(&rd->rd_tmo_lock); 3027 if (rd->rd_fqe_tmo_id != 0) { 3028 goto out; 3029 } 3030 REFDEST(rd, isdel); 3031 if (isdel != 0) goto out; 3032 rd->rd_fqe_tmo_int = interval; 3033 rd->rd_fqe_tmo_id = timeout(opsrsm_fqe_tmo, rd, rd->rd_fqe_tmo_int); 3034 out:; 3035 mutex_exit(&rd->rd_tmo_lock); 3036 } 3037 3038 static void 3039 opsrsm_cancel_fqe_tmo(opsrsm_dest_t *rd) 3040 { 3041 timeout_id_t tmoid; 3042 3043 mutex_enter(&rd->rd_tmo_lock); 3044 if (rd->rd_fqe_tmo_id == 0) { 3045 mutex_exit(&rd->rd_tmo_lock); 3046 return; 3047 } 3048 UNREFDEST(rd); 3049 tmoid = rd->rd_fqe_tmo_id; 3050 rd->rd_fqe_tmo_id = 0; 3051 mutex_exit(&rd->rd_tmo_lock); 3052 (void) untimeout(tmoid); 3053 } 3054 3055 static int 3056 opsrsm_write_data(opsrsm_dest_t *rd, mblk_t *mp) 3057 { 3058 uint_t bufnum; 3059 int write_err; 3060 uint_t pktlen; 3061 uint_t start_offset, end_offset; 3062 uchar_t *srcaddr, *endaddr; 3063 3064 pktlen = (uint_t)MBLKL(mp); 3065 if (pktlen > rd->rd_rbuflen) 3066 pktlen = rd->rd_rbuflen; 3067 3068 bufnum = (uint_t)mp->b_prev; 3069 srcaddr = mp->b_rptr; 3070 start_offset = (uint_t)((uint64_t)srcaddr & OPSRSM_CACHELINE_OFFSET); 3071 ASSERT(start_offset == 0); 3072 endaddr = srcaddr + pktlen; 3073 end_offset = (uint_t)(OPSRSM_CACHELINE_ROUNDUP(endaddr) - 3074 (uint64_t)endaddr); 3075 3076 ASSERT((pktlen + 
start_offset + end_offset) <= rd->rd_rbuflen); 3077 ASSERT(((rd->rd_rbufoff + (off_t)(bufnum * rd->rd_rbuflen)) & 3078 OPSRSM_CACHELINE_OFFSET) == 0); 3079 3080 D6("write_data: put 0x%x bytes at segoffset 0x%lx from addr 0x%p", 3081 pktlen + start_offset + end_offset, rd->rd_rbufoff + 3082 (off_t)(bufnum * rd->rd_rbuflen), (void *)(srcaddr - start_offset)); 3083 3084 write_err = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rd_rxferhand, 3085 rd->rd_rbufoff + (off_t)(bufnum * rd->rd_rbuflen), 3086 srcaddr - start_offset, 3087 (size_t)(pktlen + start_offset + end_offset)); 3088 3089 if (write_err != RSM_SUCCESS) DERR("write_err = %d\n", write_err); 3090 return (0); 3091 } 3092 3093 static int 3094 opsrsm_sync_dqe(opsrsm_dest_t *rd) 3095 { 3096 opsrsm_t *opsrsmp = opsrsmdev; 3097 uint64_t start_offset, end_offset; 3098 opsrsm_dqe_t *new_shdwdqw_o = NULL; 3099 rsm_send_t send_obj; 3100 ushort_t new_dqw_seq = 0; 3101 rsm_barrier_t dq_barrier; 3102 opsrsm_msg_t msg; 3103 uint32_t msg_cnt; 3104 int stat = RSM_SUCCESS; 3105 3106 mutex_enter(&rd->rd_net_lock); 3107 /* If network down, nothing to do either */ 3108 if (rd->rd_stopq) { 3109 D1("opsrsm_sync_dqe: stopq on, done"); 3110 mutex_exit(&rd->rd_net_lock); 3111 return (0); 3112 } 3113 3114 /* If nothing's queued, nothing to do */ 3115 if (rd->rd_shdwdqw_i == rd->rd_shdwdqw_o) { 3116 D1("opsrsm_sync_dqe: no work, done"); 3117 if (rd->rd_retry_int) goto retry_int; 3118 mutex_exit(&rd->rd_net_lock); 3119 return (0); 3120 } 3121 3122 stat = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj, 3123 rd->rd_rxferhand, &dq_barrier); 3124 3125 if (stat != RSM_SUCCESS) { 3126 goto done; 3127 } 3128 /* 3129 * remember any updates to the DQ; commit changes when 3130 * opsrsm_end_batch succeeds 3131 */ 3132 new_shdwdqw_o = rd->rd_shdwdqw_o; 3133 new_dqw_seq = rd->rd_dqw_seq; 3134 3135 /* 3136 * If we've wrapped around, so that the next element to go comes from 3137 * a lower address than where we started, do it in two segments. 
3138 */ 3139 if (new_shdwdqw_o > rd->rd_shdwdqw_i) { 3140 /* 3141 * handle elements from current (shdwdqw_o) to end of list 3142 * (shdwdqw_l) 3143 */ 3144 opsrsm_dqe_t *tmpdqe = new_shdwdqw_o; 3145 /* 3146 * update entries being sent with current sequence number 3147 */ 3148 while (tmpdqe <= rd->rd_shdwdqw_l) { 3149 tmpdqe->s.dq_seqnum = 3150 new_dqw_seq & OPSRSM_DQE_SEQ_MASK; 3151 tmpdqe++; 3152 } 3153 3154 /* 3155 * get DQE offset for these DQ entries 3156 */ 3157 start_offset = (uint64_t)((char *)new_shdwdqw_o - 3158 (char *)rd->rd_shdwdqw_f); 3159 end_offset = (uint64_t)((char *)(rd->rd_shdwdqw_l + 1) - 3160 (char *)rd->rd_shdwdqw_f); 3161 D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset, 3162 end_offset); 3163 3164 /* 3165 * Round down and up to 64-byte boundaries 3166 */ 3167 start_offset = start_offset & OPSRSM_CACHELINE_MASK; 3168 end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset); 3169 D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset, 3170 end_offset); 3171 3172 /* 3173 * Push to remote side 3174 */ 3175 stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, 3176 rd->rd_rxferhand, rd->rd_dqw_f_off + (off_t)start_offset, 3177 ((char *)rd->rd_shdwdqw_f) + start_offset, 3178 (size_t)(end_offset - start_offset)); 3179 3180 if (stat != RSM_SUCCESS) { 3181 goto done; 3182 } 3183 /* 3184 * Successfully processed these entries. 3185 * Wrap around to the beginning of DQE list, and 3186 * update the sequence number for the next round. 
3187 */ 3188 new_shdwdqw_o = rd->rd_shdwdqw_f; 3189 new_dqw_seq++; 3190 if (new_dqw_seq == 0) 3191 new_dqw_seq++; 3192 } 3193 3194 /* 3195 * Handle remaining sequential DQEs 3196 */ 3197 if (new_shdwdqw_o != rd->rd_shdwdqw_i) { 3198 opsrsm_dqe_t *tmpdqe = new_shdwdqw_o; 3199 while (tmpdqe < rd->rd_shdwdqw_i) { 3200 tmpdqe->s.dq_seqnum = 3201 new_dqw_seq & OPSRSM_DQE_SEQ_MASK; 3202 tmpdqe++; 3203 } 3204 3205 /* 3206 * get DQE offset for these DQ entries 3207 */ 3208 start_offset = (uint64_t)((char *)new_shdwdqw_o - 3209 (char *)rd->rd_shdwdqw_f); 3210 end_offset = (uint64_t)((char *)rd->rd_shdwdqw_i - 3211 (char *)rd->rd_shdwdqw_f); 3212 D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset, 3213 end_offset); 3214 3215 /* 3216 * Round down and up to 64-byte cacheline boundaries 3217 */ 3218 start_offset = start_offset & OPSRSM_CACHELINE_MASK; 3219 end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset); 3220 D6("opsrsm_sync_dqe: start 0x%lx end 0x%lx", start_offset, 3221 end_offset); 3222 3223 /* 3224 * Push to remote side 3225 */ 3226 stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, 3227 rd->rd_rxferhand, rd->rd_dqw_f_off + (off_t)start_offset, 3228 ((char *)rd->rd_shdwdqw_f) + start_offset, 3229 (size_t)(end_offset - start_offset)); 3230 3231 if (stat != RSM_SUCCESS) { 3232 goto done; 3233 } 3234 new_shdwdqw_o = rd->rd_shdwdqw_i; 3235 } 3236 3237 stat = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &dq_barrier); 3238 3239 done:; 3240 if (stat != RSM_SUCCESS) { 3241 /* set timer to retry */ 3242 opsrsmdev->opsrsm_oerrors++; 3243 if (rd->rd_sync_dqe_tmo_id == 0 && 3244 rd->rd_state == OPSRSM_STATE_W_READY) { 3245 rd->rd_sync_dqe_tmo_id = timeout(opsrsm_sync_dqe_tmo, 3246 rd, (clock_t)opsrsmp-> 3247 opsrsm_param.opsrsm_sync_tmo); 3248 } 3249 mutex_exit(&rd->rd_net_lock); 3250 return (stat); 3251 } else { 3252 rd->rd_shdwdqw_o = new_shdwdqw_o; 3253 rd->rd_dqw_seq = new_dqw_seq; 3254 } 3255 3256 retry_int: 3257 rd->rd_retry_int = B_FALSE; 3258 msg_cnt = 
rd->rd_pkts_delivered; 3259 rd->rd_pkts_delivered = 0; 3260 3261 msg.p.hdr.reqtype = OPSRSM_MSG_SYNC_DQE; 3262 msg.p.hdr.seqno = 0; 3263 msg.p.hdr.opsrsm_version = OPSRSM_VERSION; 3264 msg.p.m.syncdqe.rcv_segid = rd->rd_rxfersegid; 3265 msg.p.m.syncdqe.msg_cnt = msg_cnt; 3266 3267 send_obj.is_data = &msg; 3268 send_obj.is_size = sizeof (opsrsm_msg_t); 3269 send_obj.is_flags = RSM_DLPI_SQFLAGS; 3270 send_obj.is_wait = 0; 3271 mutex_exit(&rd->rd_net_lock); 3272 /* 3273 * send interrupt to remote node. need to release rd_net_lock 3274 * first because RSM_SEND can block. 3275 */ 3276 stat = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq, 3277 &send_obj, NULL); 3278 3279 mutex_enter(&rd->rd_net_lock); 3280 if (stat == RSMERR_CONN_ABORTED) { 3281 DERR("RSM_SEND: connection aborted"); 3282 opsrsmdev->opsrsm_intr_send_errs++; 3283 mutex_exit(&rd->rd_net_lock); 3284 opsrsm_lostconn(rd); 3285 return (stat); 3286 } else if (stat != RSM_SUCCESS) { 3287 opsrsmdev->opsrsm_collisions++; 3288 rd->rd_nretries++; 3289 rd->rd_retry_int = B_TRUE; 3290 rd->rd_pkts_delivered += msg_cnt; 3291 if (rd->rd_sync_dqe_tmo_id == 0) { 3292 rd->rd_sync_dqe_tmo_id = timeout(opsrsm_sync_dqe_tmo, 3293 rd, (clock_t)opsrsmp-> 3294 opsrsm_param.opsrsm_sync_tmo); 3295 } 3296 mutex_exit(&rd->rd_net_lock); 3297 return (stat); 3298 } 3299 mutex_exit(&rd->rd_net_lock); 3300 3301 /* free up only the messages that were delivered */ 3302 mutex_enter(&rd->rd_freeq_lock); 3303 if (!rd->rd_freeq_freeze) { 3304 opsrsm_queue_t *q; 3305 uint_t cnt = 0; 3306 3307 q = &rd->rd_freeq; 3308 while ((q)->q_head != NULL) { 3309 mblk_t *mp; 3310 3311 cnt++; 3312 mp = (q)->q_head; 3313 if ((q)->q_head == (q)->q_tail) { 3314 ASSERT(mp->b_next == NULL); 3315 ASSERT((q)->q_len == 1); 3316 (q)->q_tail = NULL; 3317 } 3318 (q)->q_head = mp->b_next; 3319 mp->b_prev = mp->b_next = NULL; 3320 mp->b_cont = NULL; 3321 freemsg(mp); 3322 (q)->q_len--; 3323 if (cnt == msg_cnt) { 3324 break; 3325 } 3326 } 3327 } 3328 
mutex_exit(&rd->rd_freeq_lock); 3329 return (0); 3330 } 3331 3332 3333 static int 3334 opsrsm_sync_fqe(opsrsm_dest_t *rd) 3335 { 3336 uint64_t start_offset, end_offset; 3337 opsrsm_fqe_t *new_shdwfqw_o = NULL; 3338 ushort_t new_fqw_seq = 0; 3339 rsm_barrier_t fq_barrier; 3340 int stat = RSM_SUCCESS; 3341 3342 mutex_enter(&rd->rd_fqr_lock); 3343 3344 ASSERT((rd->rd_fqr_flags & OPSRSM_FQR_LOCKED) == 0); 3345 /* 3346 * setting this flag guarantees that no other 3347 * thread can access the shadow queue pointers 3348 * (rd_shdwfqw_*) used by the RSM calls below. 3349 */ 3350 rd->rd_fqr_flags |= OPSRSM_FQR_LOCKED; 3351 new_shdwfqw_o = rd->rd_shdwfqw_o; 3352 new_fqw_seq = rd->rd_fqw_seq; 3353 3354 /* If nothing's queued, nothing to do */ 3355 if (rd->rd_shdwfqw_i == rd->rd_shdwfqw_o) { 3356 boolean_t putfqes; 3357 3358 D1("opsrsmsyncfqe: no work, done"); 3359 putfqes = (rd->rd_queued_fqe_list != NULL); 3360 if (!putfqes) { 3361 rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED; 3362 } 3363 mutex_exit(&rd->rd_fqr_lock); 3364 if (putfqes) goto done; 3365 return (0); 3366 } 3367 3368 /* If network down, nothing to do either */ 3369 if (rd->rd_stopq) { 3370 D1("opsrsmsyncfqe: stopq on, done"); 3371 rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED; 3372 mutex_exit(&rd->rd_fqr_lock); 3373 return (0); 3374 } 3375 3376 mutex_exit(&rd->rd_fqr_lock); 3377 3378 stat = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj, 3379 rd->rd_rxferhand, &fq_barrier); 3380 3381 if (stat != RSM_SUCCESS) { 3382 goto done; 3383 } 3384 3385 /* 3386 * If we've wrapped around, so that the next element to go comes from 3387 * a lower address than where we started, do it in two segments. 3388 */ 3389 if (new_shdwfqw_o > rd->rd_shdwfqw_i) { 3390 /* 3391 * Process the elements from the current to the end of 3392 * the list, then adjust pointers to point to start of 3393 * list. 3394 */ 3395 3396 /* 3397 * Set the sequence numbers in these FQEs. 
3398 */ 3399 opsrsm_fqe_t *tmpfqe = new_shdwfqw_o; 3400 while (tmpfqe <= rd->rd_shdwfqw_l) { 3401 tmpfqe->s.fq_seqnum = 3402 new_fqw_seq & OPSRSM_FQE_SEQ_MASK; 3403 tmpfqe++; 3404 } 3405 3406 /* 3407 * Get FQE offsets for FQ range being updated 3408 */ 3409 start_offset = (uint64_t)((char *)new_shdwfqw_o - 3410 (char *)rd->rd_shdwfqw_f); 3411 end_offset = (uint64_t)((char *)(rd->rd_shdwfqw_l + 1) - 3412 (char *)rd->rd_shdwfqw_f); 3413 3414 D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset, 3415 end_offset); 3416 3417 /* 3418 * Round down and up to 64-byte boundaries 3419 */ 3420 start_offset = start_offset & OPSRSM_CACHELINE_MASK; 3421 end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset); 3422 3423 D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset, 3424 end_offset); 3425 3426 /* 3427 * Push to remote side 3428 */ 3429 3430 stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, 3431 rd->rd_rxferhand, rd->rd_fqw_f_off + (off_t)start_offset, 3432 ((char *)rd->rd_shdwfqw_f) + start_offset, 3433 (size_t)(end_offset - start_offset)); 3434 3435 if (stat != RSM_SUCCESS) { 3436 goto done; 3437 } 3438 /* 3439 * Successfully processed these entries. 3440 * Wrap around to the beginning of FQE list, and 3441 * update sequence number for the next round 3442 */ 3443 new_shdwfqw_o = rd->rd_shdwfqw_f; 3444 new_fqw_seq++; 3445 if (new_fqw_seq == 0) 3446 new_fqw_seq++; 3447 } 3448 3449 /* 3450 * Handle remaining sequential FQEs 3451 */ 3452 if ((stat == RSM_SUCCESS) && (new_shdwfqw_o != rd->rd_shdwfqw_i)) { 3453 opsrsm_fqe_t *tmpfqe = new_shdwfqw_o; 3454 /* 3455 * Set the sequence numbers in these FQEs. 
3456 */ 3457 while (tmpfqe < rd->rd_shdwfqw_i) { 3458 tmpfqe->s.fq_seqnum = 3459 new_fqw_seq & OPSRSM_FQE_SEQ_MASK; 3460 tmpfqe++; 3461 } 3462 3463 /* 3464 * Get FQE offsets for FQ range being updated 3465 */ 3466 start_offset = (uint64_t)((char *)new_shdwfqw_o - 3467 (char *)rd->rd_shdwfqw_f); 3468 end_offset = (uint64_t)((char *)rd->rd_shdwfqw_i - 3469 (char *)rd->rd_shdwfqw_f); 3470 D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset, 3471 end_offset); 3472 3473 /* 3474 * Round down and up to 64-byte boundaries 3475 */ 3476 start_offset = start_offset & OPSRSM_CACHELINE_MASK; 3477 end_offset = OPSRSM_CACHELINE_ROUNDUP(end_offset); 3478 D6("opsrsmsyncfqe: start 0x%lx end 0x%lx", start_offset, 3479 end_offset); 3480 3481 /* 3482 * Push to remote side 3483 */ 3484 stat = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, 3485 rd->rd_rxferhand, rd->rd_fqw_f_off + (off_t)start_offset, 3486 ((char *)rd->rd_shdwfqw_f) + start_offset, 3487 (size_t)(end_offset - start_offset)); 3488 3489 if (stat != RSM_SUCCESS) { 3490 goto done; 3491 } 3492 new_shdwfqw_o = rd->rd_shdwfqw_i; 3493 } 3494 stat = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &fq_barrier); 3495 3496 done:; 3497 mutex_enter(&rd->rd_fqr_lock); 3498 ASSERT((rd->rd_fqr_flags & OPSRSM_FQR_LOCKED) != 0); 3499 if (stat != RSM_SUCCESS) { 3500 opsrsmdev->opsrsm_errs++; 3501 if (rd->rd_sync_fqe_tmo_id == 0 && 3502 rd->rd_state == OPSRSM_STATE_W_READY) { 3503 rd->rd_sync_fqe_tmo_id = timeout(opsrsm_sync_fqe_tmo, 3504 rd, (clock_t)opsrsmdev-> 3505 opsrsm_param.opsrsm_sync_tmo); 3506 } 3507 } else { 3508 rd->rd_shdwfqw_o = new_shdwfqw_o; 3509 rd->rd_fqw_seq = new_fqw_seq; 3510 } 3511 3512 /* 3513 * an interrupt thread might have enqueued fqe entries 3514 * while we were in the RSM calls. we now need to take 3515 * these entries and update the actual shadow fq. we also 3516 * need to schedule a sync_fqe event after updating our 3517 * shadow fq. 
3518 */ 3519 if (rd->rd_queued_fqe_list != NULL && 3520 rd->rd_state == OPSRSM_STATE_W_READY) { 3521 opsrsm_queued_fqe_t *q, *qfqe = rd->rd_queued_fqe_list; 3522 3523 for (;;) { 3524 q = qfqe; 3525 qfqe = qfqe->qf_next; 3526 3527 q->qf_next = NULL; 3528 ASSERT(q->qf_bufnum != -1); 3529 opsrsmputfqe_nolock(rd, q->qf_bufnum); 3530 opsrsm_queued_fqe_free(rd, q); 3531 if (qfqe == NULL) break; 3532 } 3533 rd->rd_queued_fqe_list = NULL; 3534 rd->rd_queued_fqe_tail = NULL; 3535 opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE); 3536 } 3537 rd->rd_fqr_flags &= ~OPSRSM_FQR_LOCKED; 3538 mutex_exit(&rd->rd_fqr_lock); 3539 return (0); 3540 } 3541 3542 static void 3543 opsrsm_queued_msg_append(opsrsm_dest_t *rd, opsrsm_queued_msg_t *qmsg) 3544 { 3545 mutex_enter(&rd->rd_msgs_lock); 3546 if (rd->rd_msgs == NULL) { 3547 rd->rd_msgs = qmsg; 3548 rd->rd_msgs_tail = qmsg; 3549 } else { 3550 rd->rd_msgs_tail->qm_next = qmsg; 3551 rd->rd_msgs_tail = qmsg; 3552 } 3553 mutex_exit(&rd->rd_msgs_lock); 3554 } 3555 3556 static void 3557 opsrsm_queued_msg_flush(opsrsm_dest_t *rd) 3558 { 3559 opsrsm_queued_msg_t *qmsg; 3560 int cnt = 0; 3561 3562 mutex_enter(&rd->rd_msgs_lock); 3563 while (rd->rd_msgs != NULL) { 3564 cnt++; 3565 qmsg = rd->rd_msgs; 3566 rd->rd_msgs = qmsg->qm_next; 3567 kmem_free(qmsg, sizeof (opsrsm_queued_msg_t)); 3568 } 3569 mutex_exit(&rd->rd_msgs_lock); 3570 if (cnt > 0) { 3571 DINFO("0x%x flushed %d queued msgs\n", 3572 rd->rd_local_skey, cnt); 3573 } 3574 } 3575 3576 /* 3577 * This function is called by the event thread to send 3578 * queued interrupt messages to a peer node. some interrupt 3579 * messages need to be delivered in this manner because RSMPI 3580 * prohibits the sending of interrupt messages inside an 3581 * RSM interrupt handler or callout. 
3582 */ 3583 static void 3584 opsrsm_queued_msg_send(opsrsm_dest_t *rd) 3585 { 3586 opsrsm_queued_msg_t *qmsg; 3587 rsm_send_t send_obj; 3588 boolean_t more_msgs; 3589 int status; 3590 3591 mutex_enter(&rd->rd_msgs_lock); 3592 qmsg = rd->rd_msgs; 3593 if (qmsg != NULL) { 3594 rd->rd_msgs = qmsg->qm_next; 3595 if (rd->rd_msgs == NULL) { 3596 rd->rd_msgs_tail = NULL; 3597 } 3598 } else { 3599 mutex_exit(&rd->rd_msgs_lock); 3600 return; 3601 } 3602 mutex_exit(&rd->rd_msgs_lock); 3603 3604 send_obj.is_data = &qmsg->qm_msg; 3605 send_obj.is_size = sizeof (opsrsm_msg_t); 3606 send_obj.is_flags = RSM_DLPI_SQFLAGS; 3607 send_obj.is_wait = 0; 3608 status = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq, 3609 &send_obj, NULL); 3610 3611 mutex_enter(&rd->rd_msgs_lock); 3612 if (status != RSM_SUCCESS) { 3613 if (++qmsg->qm_retries > opsrsm_queued_msg_max_retries) { 3614 DINFO("0x%x cannot send msg %d, err = %d, " 3615 "retrying...", rd->rd_local_skey, 3616 qmsg->qm_msg.p.hdr.reqtype, status); 3617 qmsg->qm_retries = 0; 3618 } 3619 qmsg->qm_next = rd->rd_msgs; 3620 rd->rd_msgs = qmsg; 3621 if (rd->rd_msgs_tail == NULL) { 3622 rd->rd_msgs_tail = qmsg; 3623 } 3624 } else { 3625 kmem_free(qmsg, sizeof (opsrsm_queued_msg_t)); 3626 } 3627 more_msgs = (rd->rd_msgs != NULL); 3628 mutex_exit(&rd->rd_msgs_lock); 3629 3630 if (more_msgs) { 3631 if (status != RSM_SUCCESS) { 3632 delay(1); 3633 } 3634 opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG); 3635 } 3636 } 3637 3638 static void 3639 opsrsm_event_add(opsrsm_dest_t *rd, uint32_t evt_type) 3640 { 3641 mutex_enter(&rd->rd_evt_lock); 3642 rd->rd_evt_flags |= evt_type; 3643 cv_signal(&rd->rd_evt_cv); 3644 mutex_exit(&rd->rd_evt_lock); 3645 } 3646 3647 /* 3648 * This thread is used for processing events that cannot 3649 * be done in interrupt context. 
 */
static void
opsrsm_event_thread(void *arg)
{
	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
	boolean_t sync_dqe, sync_fqe, send_msg;
	int events;

	mutex_enter(&rd->rd_evt_lock);
again:;
	/*
	 * Snapshot and clear each pending event bit while holding
	 * rd_evt_lock; the work itself is done after the lock is dropped.
	 */
	events = 0;
	sync_dqe = ((rd->rd_evt_flags & OPSRSM_EVT_SYNC_DQE) != 0);
	if (sync_dqe) {
		events++;
		rd->rd_evt_flags &= ~OPSRSM_EVT_SYNC_DQE;
	}
	sync_fqe = ((rd->rd_evt_flags & OPSRSM_EVT_SYNC_FQE) != 0);
	if (sync_fqe) {
		events++;
		rd->rd_evt_flags &= ~OPSRSM_EVT_SYNC_FQE;
	}
	send_msg = ((rd->rd_evt_flags & OPSRSM_EVT_SEND_MSG) != 0);
	if (send_msg) {
		events++;
		rd->rd_evt_flags &= ~OPSRSM_EVT_SEND_MSG;
	}

	/* A stop request wins: acknowledge with EVT_DONE and exit. */
	if ((rd->rd_evt_flags & OPSRSM_EVT_STOP) != 0) {
		rd->rd_evt_flags |= OPSRSM_EVT_DONE;
		cv_signal(&rd->rd_evt_wait_cv);
		mutex_exit(&rd->rd_evt_lock);
		DINFO("0x%x event thread exiting\n", rd->rd_local_skey);
		return;
	}
	/*
	 * NOTE(review): the second test also sleeps when events were just
	 * consumed and no other bit remains set, which would discard them.
	 * Presumably some bit outside the three above stays set while the
	 * thread is live -- confirm against the flag definitions.
	 */
	if (events == 0 || rd->rd_evt_flags == 0) {
		cv_wait(&rd->rd_evt_cv, &rd->rd_evt_lock);
		goto again;
	}
	mutex_exit(&rd->rd_evt_lock);

	if (sync_dqe) {
		(void) opsrsm_sync_dqe(rd);
	}
	if (sync_fqe) {
		(void) opsrsm_sync_fqe(rd);
	}
	if (send_msg) {
		opsrsm_queued_msg_send(rd);
	}

	mutex_enter(&rd->rd_evt_lock);
	goto again;
}

/*
 * timeout() callback: clear the pending SYNC_DQE retry id and hand the
 * retry to the event thread.  Does nothing if the id was already cleared.
 */
static void
opsrsm_sync_dqe_tmo(void *arg)
{
	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
	if (rd->rd_sync_dqe_tmo_id == 0) {
		return;
	}
	rd->rd_sync_dqe_tmo_id = 0;
	opsrsm_event_add(rd, OPSRSM_EVT_SYNC_DQE);
}

/*
 * timeout() callback: clear the pending SYNC_FQE retry id and hand the
 * retry to the event thread.  Does nothing if the id was already cleared.
 */
static void
opsrsm_sync_fqe_tmo(void *arg)
{
	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;
	if (rd->rd_sync_fqe_tmo_id == 0) {
		return;
	}
	rd->rd_sync_fqe_tmo_id = 0;
	opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE);
}

/*
 * Self-rearming timeout() callback that watches for a stalled send queue.
 * Stops rearming once rd_status_tmo_id is cleared or the destination
 * leaves OPSRSM_STATE_W_READY.
 *
 * The 1000 / 9000 thresholds and the rearm interval are in lbolt ticks;
 * the "/100" in the log message presumably assumes hz == 100 -- TODO
 * confirm.
 */
static void
opsrsm_status_check_tmo(void *arg)
{
	opsrsm_dest_t *rd = (opsrsm_dest_t *)arg;

	if (rd->rd_status_tmo_id == 0 ||
	    rd->rd_state != OPSRSM_STATE_W_READY) {
		return;
	}
	/* Packets are queued but nothing has been sent for > 1000 ticks. */
	if (OPSRSM_Q_LEN(&rd->rd_sendq) > 0 &&
	    ((uint32_t)ddi_get_lbolt() - rd->rd_last_sent) > 1000) {
		DINFO("RDT packets queued but not sent for %d "
		    "seconds: active_threads = %d, availfqe = %d, "
		    "qlen = %d, rd->rd_dstate = %d\n",
		    ((uint32_t)ddi_get_lbolt() - rd->rd_last_sent)/100,
		    rd->rd_active_threads, opsrsmavailfqe2(rd),
		    OPSRSM_Q_LEN(&rd->rd_sendq), rd->rd_dstate);

		/* Stalled for > 9000 ticks: treat the connection as lost. */
		if (((uint32_t)ddi_get_lbolt() - rd->rd_last_sent) > 9000) {
			opsrsm_lostconn(rd);
		}
		opsrsm_set_fqe_tmo(rd, (clock_t)opsrsmdev->
		    opsrsm_param.opsrsm_xmit_delay);
	}
	rd->rd_status_tmo_id = timeout(opsrsm_status_check_tmo, rd, 1000);
}

/*
 * Rewrite every packet on rd_pendq as one batch.  Only valid in
 * OPSRSM_XMIT_RETRY_DATA; any other state panics.
 *
 * Returns the result of opsrsm_end_batch(); the results of
 * opsrsm_start_batch() and of the individual opsrsm_write_data() calls
 * are overwritten and not reported.
 */
static int
opsrsmrexmit(opsrsm_dest_t *rd)
{
	int error = 0;
	mblk_t *mp;

	switch (rd->rd_xmit_state) {
	case OPSRSM_XMIT_RETRY_DATA:
		error = opsrsm_start_batch(rd, 0);
		mp = OPSRSM_Q_HEAD(&rd->rd_pendq);
		while (mp) {
			error = opsrsm_write_data(rd, mp);
			mp = OPSRSM_Q_NEXT(&rd->rd_pendq, mp);
		}
		ASSERT(mp == NULL);
		error = opsrsm_end_batch(rd);
		break;
	default:
		cmn_err(CE_PANIC, "opsrsmrexmit: invalid state = %d\n",
		    rd->rd_xmit_state);
		break;
	}
	return (error);
}

/*
 * Thread/callback entry point: run the transmit loop for a destination
 * without queueing a new packet.
 */
static void
opsrsmxmit_thread(void *arg)
{
	(void) opsrsmxmit((opsrsm_dest_t *)arg, NULL);
}

/*
 * Queue mp (if non-NULL) on rd_sendq and then drain rd_sendq to the
 * remote side.
 *
 * mp is consumed in all cases: it is either appended to rd_sendq or
 * freed when the queue already holds opsrsm_max_queued_pkts packets.
 * Every return path calls UNREFDEST(rd) exactly once (presumably dropping
 * a reference taken by the caller -- TODO confirm).
 *
 * rd_active_threads counts the threads currently inside this function;
 * it is only modified under rd_sendq_lock and is decremented exactly once
 * per call, on whichever path leaves the drain logic.
 *
 * Returns EWOULDBLOCK if mp was discarded, ENETDOWN if the destination
 * was found disconnected, 0 otherwise.
 */
static int
opsrsmxmit(opsrsm_dest_t *rd, mblk_t *mp)
{
	opsrsm_t *opsrsmp = opsrsmdev;
	boolean_t discarded = B_FALSE;
	boolean_t do_sync = B_FALSE;
	boolean_t nofqe, nopendw;
	int err = 0, pkts_sent, sendq_len;

	mutex_enter(&rd->rd_sendq_lock);
	rd->rd_active_threads++;
	if (mp != NULL) {
		if (OPSRSM_Q_LEN(&rd->rd_sendq) >=
		    opsrsmp->opsrsm_param.opsrsm_max_queued_pkts) {
			/* send queue full: drop the new packet */
			opsrsmp->opsrsm_pkts_discarded++;
			freemsg(mp);
			mp = NULL;
			discarded = B_TRUE;
			err = EWOULDBLOCK;
		} else {
			/*
			 * A seqno of 0 means "not yet assigned"; stamp the
			 * packet with the next local sequence number
			 * (skipping 0 on wrap) and our session key.
			 */
			uint32_t seqno = OPSRSM_MESSAGE_HDRPTR(mp)->seqno;
			if (seqno == 0) {
				OPSRSM_MESSAGE_HDRPTR(mp)->seqno =
				    rd->rd_next_lseqno;
				OPSRSM_MESSAGE_HDRPTR(mp)->skey =
				    rd->rd_local_skey;

				rd->rd_next_lseqno++;
				if (rd->rd_next_lseqno == 0)
					rd->rd_next_lseqno++;
			}
			opsrsmp->opsrsm_pkts_queued++;
			OPSRSM_Q_APPEND(&rd->rd_sendq, mp);
			mp->b_prev = (mblk_t *)0;
		}
		/*
		 * Enough threads are already draining the queue; leave the
		 * packet queued for them and return.
		 */
		if (!discarded && rd->rd_active_threads > ncpus_online) {
			rd->rd_active_threads--;
			opsrsmp->opsrsm_last_sendq_len =
			    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
			mutex_exit(&rd->rd_sendq_lock);
			UNREFDEST(rd);
			return (err);
		}
	}
	mutex_exit(&rd->rd_sendq_lock);

	mutex_enter(&rd->rd_xmit_lock);
	/* States below BARRIER_CLOSED: no transmission is possible. */
	if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED) {
		if (rd->rd_xmit_state == OPSRSM_XMIT_DISCONNECTED &&
		    !discarded) {
			err = ENETDOWN;
		}
		mutex_enter(&rd->rd_sendq_lock);
		rd->rd_active_threads--;
		opsrsmp->opsrsm_last_sendq_len =
		    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
		mutex_exit(&rd->rd_sendq_lock);
		mutex_exit(&rd->rd_xmit_lock);
		UNREFDEST(rd);
		return (err);
	}
	/*
	 * States above BARRIER_OPENED: a retransmit is needed.  Only a
	 * call made without a packet (mp == NULL) performs it; a call that
	 * carried a packet returns and leaves it queued.
	 */
	if (rd->rd_xmit_state > OPSRSM_XMIT_BARRIER_OPENED) {
		boolean_t dont_rexmit = B_FALSE;

		dont_rexmit = (mp != NULL);
		if (dont_rexmit || opsrsmrexmit(rd) != 0) {
			if (!discarded) err = 0;
			mutex_enter(&rd->rd_sendq_lock);
			rd->rd_active_threads--;
			opsrsmp->opsrsm_last_sendq_len =
			    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
			mutex_exit(&rd->rd_sendq_lock);
			mutex_exit(&rd->rd_xmit_lock);
			UNREFDEST(rd);
			return (err);
		} else {
			/* rd_xmit_lock is not held across opsrsm_sync_dqe() */
			mutex_exit(&rd->rd_xmit_lock);
			(void) opsrsm_sync_dqe(rd);
			mutex_enter(&rd->rd_xmit_lock);
		}
	} else if (discarded) {
		mutex_enter(&rd->rd_sendq_lock);
		rd->rd_active_threads--;
		opsrsmp->opsrsm_last_sendq_len =
		    (uint32_t)OPSRSM_Q_LEN(&rd->rd_sendq);
		mutex_exit(&rd->rd_sendq_lock);
		mutex_exit(&rd->rd_xmit_lock);
		UNREFDEST(rd);
		return (err);
	}

	nofqe = B_FALSE;
	nopendw = B_TRUE;
	pkts_sent = 0;
	opsrsm_cancel_xmit_tmo(rd);
	/*
	 * Drain loop.  Entered and left with rd_xmit_lock held; every
	 * "break" has already decremented rd_active_threads.
	 */
	for (;;) {
		mblk_t *nmp;
		int bufnum, qlen, pktlen;

		if (rd->rd_xmit_state < OPSRSM_XMIT_BARRIER_CLOSED ||
		    rd->rd_xmit_state > OPSRSM_XMIT_BARRIER_OPENED) {
			mutex_enter(&rd->rd_sendq_lock);
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		mutex_enter(&rd->rd_sendq_lock);
		qlen = OPSRSM_Q_LEN(&rd->rd_sendq);
		if (qlen == 0) {
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		/* No free remote buffer: arm the FQE timer and stop. */
		if (opsrsmgetfqe(rd, &bufnum) == 0) {
			opsrsm_set_fqe_tmo(rd, (clock_t)opsrsmp->
			    opsrsm_param.opsrsm_xmit_delay);
			nofqe = B_TRUE;
			rd->rd_active_threads--;
			mutex_exit(&rd->rd_sendq_lock);
			break;
		}
		OPSRSM_Q_REMOVE(&rd->rd_sendq, nmp);
		mutex_exit(&rd->rd_sendq_lock);

		if (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_CLOSED) {
			ASSERT(OPSRSM_Q_LEN(&rd->rd_pendq) == 0 &&
			    rd->rd_start_time == 0 &&
			    rd->rd_data_collected == 0 &&
			    rd->rd_writes_completed == 0);
			err = opsrsm_start_batch(rd, (uint32_t)nmp->b_prev);
		}
		opsrsmp->opsrsm_pkts_pending++;
		OPSRSM_Q_APPEND(&rd->rd_pendq, nmp);
		/* b_prev carries the remote buffer number from here on */
		nmp->b_prev = (mblk_t *)bufnum;
		pktlen = MBLKL(nmp);
		mutex_exit(&rd->rd_xmit_lock);

		/* the write itself runs without rd_xmit_lock */
		err = opsrsm_write_data(rd, nmp);
		pkts_sent++;

		mutex_enter(&rd->rd_xmit_lock);
		ASSERT(rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_OPENED ||
		    rd->rd_xmit_state == OPSRSM_XMIT_DISCONNECTED);
		rd->rd_writes_completed++;
		rd->rd_data_collected += (uint32_t)pktlen;
		OPSRSM_ADAPT_THRESHOLD(rd, (uint32_t)pktlen);
		if (opsrsmdev->opsrsm_param.opsrsm_adaptive_intr == 0) {
			rd->rd_last_sent = (uint32_t)ddi_get_lbolt();
		}
		nopendw = OPSRSM_NO_PENDING_WRITES(rd);
		if (nopendw && OPSRSM_REACHED_DATA_THRESHOLD(rd)) {
			err = opsrsm_end_batch(rd);
			if (err != 0) {
				mutex_enter(&rd->rd_sendq_lock);
				rd->rd_active_threads--;
				mutex_exit(&rd->rd_sendq_lock);
				break;
			} else {
				mutex_exit(&rd->rd_xmit_lock);
				(void) opsrsm_sync_dqe(rd);
				mutex_enter(&rd->rd_xmit_lock);
			}
		} else if (!nopendw) {
			opsrsmp->opsrsm_pending_writes++;
			nopendw = B_TRUE;
		}
	}
	if (nofqe) opsrsmp->opsrsm_no_fqes++;
	/* A packet-less call that sent nothing resets the adaptive state. */
	if (pkts_sent == 0 && mp == NULL) {
		rd->rd_adaptive_threshold = 0;
		rd->rd_pkt_freq = 0;
		opsrsmp->opsrsm_starts++;
	}

	/*
	 * Either close the open batch now or arm the transmit timer so it
	 * is closed later.
	 */
	nopendw = OPSRSM_NO_PENDING_WRITES(rd);
	if (nopendw && (rd->rd_xmit_state == OPSRSM_XMIT_BARRIER_OPENED)) {
		if (OPSRSM_REACHED_DATA_THRESHOLD(rd) ||
		    (pkts_sent == 0 && mp == NULL) ||
		    (nofqe && OPSRSM_Q_LEN(&rd->rd_pendq) == opsrsmp->
		    opsrsm_param.opsrsm_buffers)) {
			err = opsrsm_end_batch(rd);
			if (err == 0) {
				do_sync = B_TRUE;
			}
		} else {
			opsrsm_set_xmit_tmo(rd,
			    (int)opsrsmp->opsrsm_param.opsrsm_xmit_delay);
		}
	} else if (!nopendw) {
		opsrsmp->opsrsm_pending_writes++;
		nopendw = B_TRUE;
	}
	opsrsmp->opsrsm_last_pendq_len =
	    (uint32_t)OPSRSM_Q_LEN(&rd->rd_pendq);
	mutex_exit(&rd->rd_xmit_lock);

	if (do_sync) {
		(void) opsrsm_sync_dqe(rd);
	}

	/* NOTE(review): rd_sendq length is read here without rd_sendq_lock. */
	sendq_len = OPSRSM_Q_LEN(&rd->rd_sendq);
	opsrsmp->opsrsm_last_sendq_len = (uint32_t)sendq_len;
	if (sendq_len <= 1024) {
		opsrsm_wake_senders(rd, POLLOUT);
	}
	UNREFDEST(rd);

	/* Only ENETDOWN and a discard are reported to the caller. */
	if (err != ENETDOWN) err = 0;
	if (discarded) err = EWOULDBLOCK;
	return (err);
}

/*
 * Callback routine, called when a desballoc'ed buffer is eventually freed.
3998 */ 3999 static void 4000 opsrsmfreebuf( 4001 opsrsmbuf_t *rbp) /* Structure describing freed buffer */ 4002 { 4003 opsrsm_dest_t *rd = rbp->rb_rd; 4004 int delflg, zerflg; 4005 4006 D1("opsrsmfreebuf: rbp 0x%p", (void *)rbp); 4007 4008 /* 4009 * Find out if this is the last outstanding buffer, and whether we're 4010 * being deleted. 4011 */ 4012 mutex_enter(&rd->rd_nlb_lock); 4013 4014 rd->rd_nlb--; 4015 delflg = rd->rd_nlb_del; 4016 zerflg = (rd->rd_nlb == 0); 4017 4018 mutex_exit(&rd->rd_nlb_lock); 4019 4020 /* 4021 * If we're being deleted, we don't put this buffer on the free queue. 4022 * Also, if we're being deleted, and this was the last outstanding 4023 * buffer, we do an UNREF. Otherwise we send this buffer to the other 4024 * system for reuse. 4025 */ 4026 if (delflg) { 4027 if (zerflg) 4028 UNREFDEST(rd); 4029 } else { 4030 opsrsmputfqe(rd, rbp->rb_bufnum); 4031 opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE); 4032 } 4033 4034 D1("opsrsmfreebuf: done"); 4035 } 4036 4037 static void 4038 opsrsmdemux(mblk_t *mp, opsrsm_dest_t *rd) 4039 { 4040 opsrsmresource_t *rp; 4041 uint32_t rportnum = OPSRSM_MESSAGE_HDRPTR(mp)->rportnum; 4042 4043 /* control messages do not belong here */ 4044 if (OPSRSM_MESSAGE_HDRPTR(mp)->option != 0) { 4045 freemsg(mp); 4046 return; 4047 } 4048 if (rd != NULL) { 4049 uint32_t seq_key, seq_no; 4050 4051 seq_key = OPSRSM_MESSAGE_HDRPTR(mp)->skey; 4052 seq_no = OPSRSM_MESSAGE_HDRPTR(mp)->seqno; 4053 ASSERT(seq_no != 0); 4054 if (seq_key == rd->rd_remote_skey) { 4055 if (seq_no == rd->rd_next_rseqno) { 4056 rd->rd_next_rseqno++; 4057 if (rd->rd_next_rseqno == 0) 4058 rd->rd_next_rseqno++; 4059 } else { 4060 cmn_err(CE_PANIC, "opsrsmdemux: seqno = %d " 4061 "expected seqno = %d\n", seq_no, 4062 rd->rd_next_rseqno); 4063 freemsg(mp); 4064 return; 4065 } 4066 } else { 4067 opsrsm_failover_info_t *finfo = 4068 opsrsm_finfo_lookup_by_remote_skey(seq_key); 4069 4070 if (finfo == NULL) { 4071 cmn_err(CE_CONT, "opsrsmdemux: cannot find " 4072 
"skey = 0x%x\n", seq_key); 4073 freemsg(mp); 4074 return; 4075 } else { 4076 if (seq_no != finfo->fi_next_rseqno) { 4077 /* 4078 * We get here if a rexmitted packet is 4079 * a duplicate or if it is out of 4080 * sequence. 4081 */ 4082 DERR("received duplicate packet, " 4083 "seqno = %d, expected = %d\n", 4084 seq_no, finfo->fi_next_rseqno); 4085 freemsg(mp); 4086 return; 4087 } else { 4088 finfo->fi_next_rseqno++; 4089 if (finfo->fi_next_rseqno == 0) 4090 finfo->fi_next_rseqno++; 4091 } 4092 } 4093 } 4094 } 4095 4096 rp = opsrsmresource_lookup(rportnum, OPSRSM_RO_INCREFCNT); 4097 if (rp == NULL) { 4098 opsrsm_queued_msg_t *qmsg; 4099 4100 DERR("opsrsmdemux: port %d has no receiver, dropping message", 4101 rportnum); 4102 4103 /* 4104 * Do not send error notifications if a segment is about to 4105 * be torn down. 4106 */ 4107 if (rd != NULL && rd->rd_state != OPSRSM_STATE_W_READY) { 4108 freemsg(mp); 4109 return; 4110 } 4111 qmsg = (opsrsm_queued_msg_t *)kmem_zalloc(sizeof (*qmsg), 4112 KM_NOSLEEP); 4113 if (qmsg == NULL) { 4114 DERR("opsrsmdemux: cannot allocate qmsg"); 4115 freemsg(mp); 4116 return; 4117 } 4118 /* 4119 * Tell the sender that receiver doesn't exist. 
4120 */ 4121 qmsg->qm_msg.p.hdr.reqtype = RSMRDT_MSG_SEND_ERR; 4122 qmsg->qm_msg.p.hdr.seqno = 0; 4123 qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION; 4124 qmsg->qm_msg.p.m.senderr.sender_portnum = 4125 OPSRSM_MESSAGE_HDRPTR(mp)->lportnum; 4126 qmsg->qm_msg.p.m.senderr.sender_pkey = 4127 OPSRSM_MESSAGE_HDRPTR(mp)->pkey; 4128 qmsg->qm_msg.p.m.senderr.errstate = OPSRSM_RS_NORECVR; 4129 qmsg->qm_retries = 0; 4130 4131 opsrsm_queued_msg_append(rd, qmsg); 4132 opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG); 4133 freemsg(mp); 4134 return; 4135 } 4136 4137 if (rp->rs_pkey != OPSRSM_MESSAGE_HDRPTR(mp)->pkey) { 4138 opsrsm_queued_msg_t *qmsg; 4139 4140 DERR("opsrsmdemux: Invalid pkey: sender %d local %d, " 4141 "dropping message", OPSRSM_MESSAGE_HDRPTR(mp)->pkey, 4142 rp->rs_pkey); 4143 4144 /* 4145 * Do not send error notifications if a segment is about to 4146 * be torn down. 4147 */ 4148 if (rd != NULL && rd->rd_state != OPSRSM_STATE_W_READY) { 4149 freemsg(mp); 4150 OPSRSM_RSUNREF(rp); 4151 return; 4152 } 4153 4154 qmsg = (opsrsm_queued_msg_t *)kmem_zalloc(sizeof (*qmsg), 4155 KM_NOSLEEP); 4156 if (qmsg == NULL) { 4157 DERR("opsrsmdemux: cannot allocate qmsg"); 4158 freemsg(mp); 4159 OPSRSM_RSUNREF(rp); 4160 return; 4161 } 4162 /* 4163 * Tell the sender about pkey mismatch. 
4164 */ 4165 qmsg->qm_msg.p.hdr.reqtype = RSMRDT_MSG_SEND_ERR; 4166 qmsg->qm_msg.p.hdr.seqno = 0; 4167 qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION; 4168 qmsg->qm_msg.p.m.senderr.sender_portnum = 4169 OPSRSM_MESSAGE_HDRPTR(mp)->lportnum; 4170 qmsg->qm_msg.p.m.senderr.sender_pkey = 4171 OPSRSM_MESSAGE_HDRPTR(mp)->pkey; 4172 qmsg->qm_msg.p.m.senderr.errstate = OPSRSM_RS_PKEYMISMATCH; 4173 qmsg->qm_retries = 0; 4174 4175 opsrsm_queued_msg_append(rd, qmsg); 4176 opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG); 4177 freemsg(mp); 4178 OPSRSM_RSUNREF(rp); 4179 return; 4180 } 4181 4182 if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) { 4183 DERR("opsrsmdemux: port %d not bound (rp->rs_state = %d), " 4184 "dropping message", rportnum, rp->rs_state); 4185 freemsg(mp); 4186 OPSRSM_RSUNREF(rp); 4187 return; 4188 } 4189 4190 mutex_enter(&rp->rs_lock); 4191 OPSRSM_Q_APPEND(&rp->rs_recvq, mp); 4192 atomic_add_32(&opsrsm_pending_bytes, MBLKL(mp)); 4193 4194 if ((rp->rs_state & OPSRSM_RS_SIG) != 0) { 4195 cv_signal(&rp->rs_cv); 4196 } 4197 if ((rp->rs_events & POLLIN) == 0) { 4198 rp->rs_events |= POLLIN; 4199 mutex_exit(&rp->rs_lock); 4200 pollwakeup(&rp->rs_pollhd, POLLIN); 4201 } else { 4202 mutex_exit(&rp->rs_lock); 4203 } 4204 4205 OPSRSM_RSUNREF(rp); 4206 } 4207 4208 static int 4209 opsrsmdemux_loopback(mblk_t *mp) 4210 { 4211 opsrsmresource_t *rp; 4212 uint32_t rportnum = OPSRSM_MESSAGE_HDRPTR(mp)->rportnum; 4213 4214 rp = opsrsmresource_lookup(rportnum, OPSRSM_RO_INCREFCNT); 4215 if (rp == NULL) { 4216 freemsg(mp); 4217 return (ESRCH); 4218 } 4219 4220 if (rp->rs_pkey != OPSRSM_MESSAGE_HDRPTR(mp)->pkey) { 4221 freemsg(mp); 4222 OPSRSM_RSUNREF(rp); 4223 return (EACCES); 4224 } 4225 4226 if ((rp->rs_state & OPSRSM_RS_BOUND) == 0) { 4227 DERR("opsrsmdemux: port %d not bound (rp->rs_state = %d), " 4228 "dropping message", rportnum, rp->rs_state); 4229 freemsg(mp); 4230 OPSRSM_RSUNREF(rp); 4231 return (EADDRNOTAVAIL); 4232 } 4233 4234 mutex_enter(&rp->rs_lock); 4235 if 
((uint_t)OPSRSM_Q_LEN(&rp->rs_recvq) >= 4236 opsrsmdev->opsrsm_param.opsrsm_max_loopback_pkts) { 4237 mutex_exit(&rp->rs_lock); 4238 freemsg(mp); 4239 OPSRSM_RSUNREF(rp); 4240 return (EWOULDBLOCK); 4241 } 4242 OPSRSM_Q_APPEND(&rp->rs_recvq, mp); 4243 atomic_add_32(&opsrsm_pending_bytes, MBLKL(mp)); 4244 4245 if ((rp->rs_state & OPSRSM_RS_SIG) != 0) { 4246 cv_signal(&rp->rs_cv); 4247 } 4248 if ((rp->rs_events & POLLIN) == 0) { 4249 rp->rs_events |= POLLIN; 4250 mutex_exit(&rp->rs_lock); 4251 pollwakeup(&rp->rs_pollhd, POLLIN); 4252 } else { 4253 mutex_exit(&rp->rs_lock); 4254 } 4255 OPSRSM_RSUNREF(rp); 4256 return (0); 4257 } 4258 4259 4260 /* 4261 * opsrsmread() takes the packet described by the arguments and sends it 4262 * upstream. 4263 */ 4264 static int 4265 opsrsmread( 4266 opsrsm_dest_t *rd, /* Destination pointer */ 4267 int bufnum, /* Index of buffer containing packet */ 4268 int offset, /* Offset of packet within buffer */ 4269 int length, /* Length of packet */ 4270 ushort_t sap) /* SAP for packet */ 4271 { 4272 opsrsm_t *opsrsmp = opsrsmdev; 4273 mblk_t *mp; 4274 boolean_t canloan = B_FALSE; 4275 boolean_t nobufs = B_FALSE; 4276 caddr_t bufptr; 4277 int buffree = 0; 4278 int calc_sz; 4279 4280 D1("opsrsmread: rd 0x%p, bufnum %d, offset %d, length %d, sap 0x%x", 4281 (void *)rd, bufnum, offset, length, sap); 4282 4283 bufptr = (caddr_t)rd->rd_lbuf + ((uint_t)bufnum * rd->rd_lbuflen); 4284 4285 /* Figure out if we can loan this buffer up or not */ 4286 mutex_enter(&rd->rd_nlb_lock); 4287 nobufs = (rd->rd_rawmem_base_size > freemem * PAGESIZE) && 4288 ((rd->rd_sstate & OPSRSM_RSMS_LXFER_C) != 0); 4289 if (nobufs || rd->rd_nlb < (opsrsmp->opsrsm_param.opsrsm_buffers - 4290 opsrsmp->opsrsm_param.opsrsm_buffers_retained)) { 4291 rd->rd_nlb++; 4292 canloan = B_TRUE; 4293 } 4294 mutex_exit(&rd->rd_nlb_lock); 4295 4296 4297 if (canloan) { 4298 /* 4299 * We make the mblk cover the whole buffer in case anybody 4300 * wants the leading/trailing space; below we 
adjust the 4301 * rptr/wptr to describe the actual packet. 4302 */ 4303 mp = desballoc((uchar_t *)bufptr, rd->rd_lbuflen, 4304 BPRI_LO, &(rd->rd_bufbase+bufnum)->rb_frtn); 4305 4306 if (mp == NULL) { 4307 mutex_enter(&rd->rd_nlb_lock); 4308 rd->rd_nlb--; 4309 mutex_exit(&rd->rd_nlb_lock); 4310 4311 opsrsmputfqe(rd, bufnum); 4312 buffree = 1; 4313 4314 opsrsmp->opsrsm_ierrors++; 4315 cmn_err(CE_PANIC, "desballoc failed, dropping " 4316 "packet\n"); 4317 return (1); 4318 } 4319 mp->b_rptr += offset; 4320 mp->b_wptr = mp->b_rptr + length; 4321 4322 opsrsmp->opsrsm_lbufs++; 4323 } else { 4324 /* 4325 * We make the destination (within the new mblk) have the 4326 * same address mod 64 as our source, so that the kernel 4327 * bcopy is as efficient as possible. (This is a sun4u 4328 * bcopy optimization, not a RSM optimization.) 4329 */ 4330 mp = allocb((size_t)(length + 0x40), BPRI_LO); 4331 if (mp) { 4332 intptr_t dstoffset = (intptr_t)mp->b_rptr; 4333 4334 dstoffset = offset - (dstoffset & 0x3f); 4335 if (dstoffset < 0) 4336 dstoffset += 0x40; 4337 4338 mp->b_rptr += dstoffset; 4339 mp->b_wptr = mp->b_rptr + length; 4340 bcopy((void *)(bufptr + offset), (void *)mp->b_rptr, 4341 (size_t)length); 4342 4343 opsrsmp->opsrsm_nlbufs++; 4344 opsrsmputfqe(rd, bufnum); 4345 buffree = 1; 4346 } else { 4347 opsrsmputfqe(rd, bufnum); 4348 buffree = 1; 4349 opsrsmp->opsrsm_ierrors++; 4350 cmn_err(CE_PANIC, "allocb failed, dropping packet\n"); 4351 return (1); 4352 } 4353 } 4354 4355 calc_sz = (int)(OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz + 4356 OPSRSM_MESSAGE_HDRSZ); 4357 if (length != calc_sz) { 4358 cmn_err(CE_PANIC, "corrupted packet, length = %d, " 4359 "calculated size = %d\n", length, calc_sz); 4360 } 4361 4362 if (OPSRSM_MESSAGE_HDRPTR(mp)->option == OPSRSM_OPT_REXMIT_END) { 4363 opsrsm_option_rexmit_end(mp, rd); 4364 } else { 4365 opsrsmp->opsrsm_in_bytes += (uint32_t)length; 4366 opsrsmp->opsrsm_ipackets++; 4367 /* 4368 * Do demux right here. 
4369 */ 4370 opsrsmdemux(mp, rd); 4371 } 4372 D1("opsrsmread: canloan was %d, done", canloan); 4373 return (buffree); 4374 } 4375 4376 /* 4377 * **************************************************************** 4378 * * 4379 * E N D NEW DATA TRANSFER LOGIC * 4380 * * 4381 * **************************************************************** 4382 */ 4383 4384 /* 4385 * **************************************************************** 4386 * * 4387 * B E G I N RSM FAILOVER * 4388 * * 4389 * **************************************************************** 4390 */ 4391 4392 static opsrsm_failover_info_t * 4393 opsrsm_finfo_add(opsrsm_dest_t *rd) 4394 { 4395 opsrsm_failover_info_t *finfo = opsrsm_finfo_list; 4396 4397 D1("adding finfo for dest = %p, remote_skey = %d\n", 4398 rd, rd->rd_remote_skey); 4399 mutex_enter(&opsrsm_finfo_lock); 4400 while (finfo != NULL) { 4401 if (finfo->fi_dest == rd && 4402 finfo->fi_remote_skey == rd->rd_remote_skey && 4403 finfo->fi_local_skey == rd->rd_local_skey) { 4404 mutex_exit(&opsrsm_finfo_lock); 4405 return (finfo); 4406 } 4407 finfo = finfo->fi_next; 4408 } 4409 ASSERT(finfo == NULL); 4410 finfo = (opsrsm_failover_info_t *)kmem_zalloc(sizeof (*finfo), 4411 KM_SLEEP); 4412 ASSERT(finfo != NULL); 4413 finfo->fi_nodeid = rd->rd_nodeid; 4414 finfo->fi_dest = rd; 4415 finfo->fi_remote_skey = rd->rd_remote_skey; 4416 finfo->fi_local_skey = rd->rd_local_skey; 4417 finfo->fi_next_rseqno = 0; 4418 finfo->fi_retval = -1; 4419 finfo->fi_waiters = 0; 4420 finfo->fi_status = 0; 4421 finfo->fi_last_accessed = ddi_get_lbolt(); 4422 OPSRSM_Q_INIT(&finfo->fi_rexmitq); 4423 cv_init(&finfo->fi_cv, NULL, CV_DRIVER, NULL); 4424 cv_init(&finfo->fi_wait_cv, NULL, CV_DRIVER, NULL); 4425 mutex_init(&finfo->fi_lock, NULL, MUTEX_DRIVER, NULL); 4426 4427 finfo->fi_next = opsrsm_finfo_list; 4428 opsrsm_finfo_list = finfo; 4429 mutex_exit(&opsrsm_finfo_lock); 4430 return (finfo); 4431 } 4432 4433 static opsrsm_failover_info_t * 4434 
opsrsm_finfo_lookup_by_remote_skey(uint32_t skey) 4435 { 4436 opsrsm_failover_info_t *finfo = opsrsm_finfo_list; 4437 4438 mutex_enter(&opsrsm_finfo_lock); 4439 while (finfo != NULL) { 4440 if (finfo->fi_remote_skey == skey) { 4441 break; 4442 } 4443 finfo = finfo->fi_next; 4444 } 4445 if (finfo != NULL) { 4446 finfo->fi_last_accessed = ddi_get_lbolt(); 4447 } 4448 mutex_exit(&opsrsm_finfo_lock); 4449 return (finfo); 4450 } 4451 4452 static opsrsm_failover_info_t * 4453 opsrsm_finfo_lookup_by_local_skey(uint32_t skey) 4454 { 4455 opsrsm_failover_info_t *finfo = opsrsm_finfo_list; 4456 4457 mutex_enter(&opsrsm_finfo_lock); 4458 while (finfo != NULL) { 4459 if (finfo->fi_local_skey == skey) { 4460 break; 4461 } 4462 finfo = finfo->fi_next; 4463 } 4464 if (finfo != NULL) { 4465 finfo->fi_last_accessed = ddi_get_lbolt(); 4466 } 4467 mutex_exit(&opsrsm_finfo_lock); 4468 return (finfo); 4469 } 4470 4471 static int 4472 opsrsm_finfo_wait(uint32_t skey) 4473 { 4474 opsrsm_failover_info_t *finfo = opsrsm_finfo_list; 4475 int error; 4476 4477 mutex_enter(&opsrsm_finfo_lock); 4478 while (finfo != NULL) { 4479 if (finfo->fi_local_skey == skey) { 4480 break; 4481 } 4482 finfo = finfo->fi_next; 4483 } 4484 if (finfo == NULL) { 4485 mutex_exit(&opsrsm_finfo_lock); 4486 return (0); 4487 } 4488 4489 finfo->fi_last_accessed = ddi_get_lbolt(); 4490 error = finfo->fi_retval; 4491 if (error == -1) { 4492 int retval; 4493 4494 finfo->fi_waiters++; 4495 retval = cv_wait_sig(&finfo->fi_cv, &opsrsm_finfo_lock); 4496 finfo->fi_waiters--; 4497 if (retval == 0) { 4498 error = EINTR; 4499 } else { 4500 error = finfo->fi_retval; 4501 } 4502 } 4503 mutex_exit(&opsrsm_finfo_lock); 4504 return (error); 4505 } 4506 4507 static void 4508 opsrsm_finfo_wakeup(opsrsm_failover_info_t *finfo, int retval) 4509 { 4510 mutex_enter(&opsrsm_finfo_lock); 4511 finfo->fi_retval = retval; 4512 cv_broadcast(&finfo->fi_cv); 4513 mutex_exit(&opsrsm_finfo_lock); 4514 } 4515 4516 static void 4517 
opsrsm_finfo_init(void) 4518 { 4519 opsrsm_finfo_list = NULL; 4520 opsrsm_failover_taskq = taskq_create("failover", 8, maxclsyspri, 1, 8, 4521 TASKQ_PREPOPULATE); 4522 opsrsm_failover_threads = 0; 4523 mutex_init(&opsrsm_finfo_lock, NULL, MUTEX_DRIVER, NULL); 4524 cv_init(&opsrsm_finfo_cv, NULL, CV_DRIVER, NULL); 4525 } 4526 4527 static void 4528 opsrsm_finfo_fini(void) 4529 { 4530 opsrsm_failover_info_t *finfo = opsrsm_finfo_list, *f; 4531 4532 mutex_enter(&opsrsm_finfo_lock); 4533 if (opsrsm_failover_threads > 0) { 4534 DINFO("cannot detach yet, failover_threads = %d, " 4535 "need to wait for approx. %d secs\n", 4536 opsrsm_failover_threads, 180 * opsrsm_failover_threads); 4537 cv_wait(&opsrsm_finfo_cv, &opsrsm_finfo_lock); 4538 } 4539 while (finfo != NULL) { 4540 ASSERT(finfo->fi_waiters == 0); 4541 OPSRSM_Q_FLUSH(&finfo->fi_rexmitq); 4542 cv_destroy(&finfo->fi_cv); 4543 cv_destroy(&finfo->fi_wait_cv); 4544 mutex_destroy(&finfo->fi_lock); 4545 f = finfo; 4546 finfo = finfo->fi_next; 4547 kmem_free(f, sizeof (*f)); 4548 } 4549 mutex_exit(&opsrsm_finfo_lock); 4550 taskq_destroy(opsrsm_failover_taskq); 4551 cv_destroy(&opsrsm_finfo_cv); 4552 mutex_destroy(&opsrsm_finfo_lock); 4553 } 4554 4555 4556 static void 4557 opsrsm_finfo_destroy(void *arg) 4558 { 4559 opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg; 4560 opsrsm_failover_info_t *fptr = opsrsm_finfo_list; 4561 4562 mutex_enter(&opsrsm_finfo_lock); 4563 /* 4564 * delay the destruction if it was touched recently 4565 */ 4566 if ((ddi_get_lbolt() - finfo->fi_last_accessed) <= 3000) { 4567 mutex_exit(&opsrsm_finfo_lock); 4568 (void) timeout(opsrsm_finfo_destroy, finfo, 3000); 4569 return; 4570 } 4571 /* 4572 * delay the destruction if there are waiters yet to 4573 * be woken up. 
4574 */ 4575 if (finfo->fi_waiters > 0) { 4576 mutex_exit(&opsrsm_finfo_lock); 4577 (void) timeout(opsrsm_finfo_destroy, finfo, 10); 4578 return; 4579 } 4580 DINFO("failover: destroying finfo, local_skey 0x%x, " 4581 "remote_skey 0x%x\n", finfo->fi_local_skey, 4582 finfo->fi_remote_skey); 4583 if (fptr == finfo) { 4584 opsrsm_finfo_list = finfo->fi_next; 4585 } else { 4586 while (fptr != NULL) { 4587 if (fptr->fi_next == finfo) break; 4588 fptr = fptr->fi_next; 4589 } 4590 ASSERT(fptr != NULL); 4591 fptr->fi_next = finfo->fi_next; 4592 } 4593 4594 ASSERT(finfo->fi_waiters == 0); 4595 cv_destroy(&finfo->fi_cv); 4596 cv_destroy(&finfo->fi_wait_cv); 4597 mutex_destroy(&finfo->fi_lock); 4598 kmem_free(finfo, sizeof (*finfo)); 4599 opsrsm_failover_threads--; 4600 ASSERT(opsrsm_failover_threads >= 0); 4601 if (opsrsm_failover_threads == 0) { 4602 cv_broadcast(&opsrsm_finfo_cv); 4603 } 4604 mutex_exit(&opsrsm_finfo_lock); 4605 } 4606 4607 static void 4608 opsrsm_dispatch_failover(void *arg) 4609 { 4610 opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg; 4611 4612 ASSERT(opsrsm_failover_taskq != NULL); 4613 if (taskq_dispatch(opsrsm_failover_taskq, 4614 opsrsm_failover_thread, finfo, KM_NOSLEEP) == 0) { 4615 (void) timeout(opsrsm_dispatch_failover, finfo, 1); 4616 } 4617 } 4618 4619 /* handler for failover related messages */ 4620 /*ARGSUSED*/ 4621 static void 4622 opsrsmmsghdlr_finfo(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 4623 { 4624 opsrsm_failover_info_t *finfo; 4625 uint32_t skey = msg->p.m.finfoquery.skey; 4626 uint32_t flag = 0; 4627 4628 finfo = opsrsm_finfo_lookup_by_local_skey(skey); 4629 if (finfo == NULL) { 4630 return; 4631 } 4632 switch (msg->p.hdr.reqtype) { 4633 case OPSRSM_MSG_FINFO_DEMUX_DONE: 4634 flag = OPSRSM_FINFO_DEMUX_DONE; 4635 if ((finfo->fi_status & flag) != 0) break; 4636 DINFO("failover: 0x%x peer node demux done\n", skey); 4637 break; 4638 case OPSRSM_MSG_FINFO_REPLY: 4639 flag = OPSRSM_FINFO_DEMUX_DONE | OPSRSM_FINFO_OLD_PROTO; 4640 
DINFO("failover: 0x%x peer node uses old protocol\n", skey); 4641 break; 4642 case OPSRSM_MSG_FINFO_REXMIT_ACK: 4643 flag = OPSRSM_FINFO_REXMIT_ACKED; 4644 DINFO("failover: 0x%x rexmit marker acked\n", skey); 4645 break; 4646 default: 4647 cmn_err(CE_PANIC, "unknown message type\n"); 4648 } 4649 4650 mutex_enter(&finfo->fi_lock); 4651 finfo->fi_status |= flag; 4652 cv_broadcast(&finfo->fi_wait_cv); 4653 mutex_exit(&finfo->fi_lock); 4654 } 4655 4656 static void 4657 opsrsm_option_rexmit_end(mblk_t *mp, opsrsm_dest_t *rd) 4658 { 4659 opsrsm_queued_msg_t *qmsg; 4660 opsrsm_ack_msg_t *ack; 4661 opsrsm_failover_info_t *finfo; 4662 4663 ack = (opsrsm_ack_msg_t *)((caddr_t)mp->b_rptr + 4664 OPSRSM_MESSAGE_HDRSZ); 4665 finfo = opsrsm_finfo_lookup_by_local_skey(ack->am_skey); 4666 if (finfo == NULL) { 4667 freemsg(mp); 4668 return; 4669 } 4670 freemsg(mp); 4671 4672 mutex_enter(&finfo->fi_lock); 4673 finfo->fi_status |= OPSRSM_FINFO_REXMIT_DONE; 4674 cv_broadcast(&finfo->fi_wait_cv); 4675 mutex_exit(&finfo->fi_lock); 4676 4677 if (rd->rd_state != OPSRSM_STATE_W_READY) { 4678 return; 4679 } 4680 qmsg = kmem_zalloc(sizeof (opsrsm_queued_msg_t), KM_NOSLEEP); 4681 if (qmsg == NULL) { 4682 return; 4683 } 4684 4685 qmsg->qm_msg.p.hdr.reqtype = OPSRSM_MSG_FINFO_REXMIT_ACK; 4686 qmsg->qm_msg.p.hdr.seqno = 0; 4687 qmsg->qm_msg.p.hdr.opsrsm_version = OPSRSM_VERSION; 4688 qmsg->qm_msg.p.m.finfoquery.skey = finfo->fi_remote_skey; 4689 qmsg->qm_retries = 0; 4690 4691 opsrsm_queued_msg_append(rd, qmsg); 4692 opsrsm_event_add(rd, OPSRSM_EVT_SEND_MSG); 4693 } 4694 4695 static int 4696 opsrsm_finfo_sendmsg(opsrsm_dest_t *rd, uint8_t type, uint32_t skey) 4697 { 4698 rsm_send_t send_obj; 4699 opsrsm_msg_t msg; 4700 int retval; 4701 4702 msg.p.hdr.reqtype = type; 4703 msg.p.hdr.seqno = 0; 4704 msg.p.hdr.opsrsm_version = OPSRSM_VERSION; 4705 msg.p.m.finfoquery.skey = skey; 4706 send_obj.is_data = &msg; 4707 send_obj.is_size = sizeof (opsrsm_msg_t); 4708 send_obj.is_flags = RSM_DLPI_SQFLAGS; 
4709 send_obj.is_wait = 0; 4710 retval = RSM_SEND(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rsm_sendq, 4711 &send_obj, NULL); 4712 4713 return (((retval == RSM_SUCCESS) ? 0 : retval)); 4714 } 4715 4716 /* 4717 * Create a special packet that is used as a marker to the end of 4718 * a retransmitted stream of packets. 4719 */ 4720 static mblk_t * 4721 opsrsm_alloc_ack_msg(uint32_t skey) 4722 { 4723 mblk_t *mp; 4724 opsrsm_ack_msg_t *msg; 4725 4726 mp = allocb(OPSRSM_CACHELINE_SIZE + OPSRSM_MESSAGE_HDRSZ + 4727 sizeof (opsrsm_ack_msg_t) + OPSRSM_CACHELINE_SIZE, BPRI_LO); 4728 if (mp == NULL) { 4729 return (NULL); 4730 } 4731 mp->b_rptr = (uchar_t *)OPSRSM_CACHELINE_ROUNDUP(mp->b_rptr); 4732 4733 OPSRSM_MESSAGE_HDRPTR(mp)->lportnum = 0; 4734 OPSRSM_MESSAGE_HDRPTR(mp)->rportnum = 0; 4735 OPSRSM_MESSAGE_HDRPTR(mp)->msg_sz = sizeof (opsrsm_ack_msg_t); 4736 OPSRSM_MESSAGE_HDRPTR(mp)->nodeid = (int)rsmrdt_my_nodeid; 4737 OPSRSM_MESSAGE_HDRPTR(mp)->pkey = 0; 4738 OPSRSM_MESSAGE_HDRPTR(mp)->seqno = 0; 4739 OPSRSM_MESSAGE_HDRPTR(mp)->option = OPSRSM_OPT_REXMIT_END; 4740 4741 msg = (opsrsm_ack_msg_t *)((caddr_t)mp->b_rptr + OPSRSM_MESSAGE_HDRSZ); 4742 msg->am_skey = skey; 4743 4744 mp->b_prev = mp->b_cont = NULL; 4745 mp->b_wptr = mp->b_rptr + sizeof (opsrsm_ack_msg_t) + 4746 OPSRSM_MESSAGE_HDRSZ; 4747 4748 return (mp); 4749 } 4750 4751 /* 4752 * This thread handles packet retransmission 4753 */ 4754 static void 4755 opsrsm_failover_thread(void *arg) 4756 { 4757 opsrsm_failover_info_t *finfo = (opsrsm_failover_info_t *)arg; 4758 int qlen, isdel, retries, error = 0; 4759 opsrsm_dest_t *rd; 4760 clock_t stime, wait_time, curr_time; 4761 4762 /* wait time interval size is 100ms */ 4763 wait_time = drv_usectohz(100000); 4764 4765 /* reconnect to the remote node. 
*/ 4766 rd = opsrsm_connect(finfo->fi_nodeid, NULL); 4767 if (rd == NULL) { 4768 cmn_err(CE_CONT, "failover: 0x%x failed to reconnect " 4769 "to node %d\n", finfo->fi_local_skey, finfo->fi_nodeid); 4770 error = ENETDOWN; 4771 goto done; 4772 } 4773 4774 retries = 0; 4775 stime = ddi_get_lbolt(); 4776 for (;;) { 4777 /* tell remote node that demux has completed */ 4778 error = opsrsm_finfo_sendmsg(rd, OPSRSM_MSG_FINFO_DEMUX_DONE, 4779 finfo->fi_remote_skey); 4780 4781 mutex_enter(&finfo->fi_lock); 4782 /* 4783 * wait for DEMUX_DONE message from remote node. 4784 * this is necessary because we must not start 4785 * retransmission before the remote node has demuxed 4786 * all remaining packets in its delivery queue. 4787 */ 4788 if ((finfo->fi_status & OPSRSM_FINFO_DEMUX_DONE) != 0) { 4789 break; 4790 } 4791 curr_time = ddi_get_lbolt(); 4792 (void) cv_timedwait(&finfo->fi_wait_cv, &finfo->fi_lock, 4793 curr_time + wait_time); 4794 4795 /* exit from loop if connection got reset */ 4796 if (rd->rd_state != OPSRSM_STATE_W_READY) { 4797 error = ENETDOWN; 4798 break; 4799 } 4800 if (++retries > opsrsm_failover_max_retries) { 4801 error = ETIMEDOUT; 4802 opsrsm_lostconn(rd); 4803 break; 4804 } 4805 mutex_exit(&finfo->fi_lock); 4806 } 4807 mutex_exit(&finfo->fi_lock); 4808 curr_time = ddi_get_lbolt(); 4809 if (error != 0) { 4810 DINFO("failover: 0x%x step 1 error %d, wait time %d ms\n", 4811 finfo->fi_local_skey, error, 4812 drv_hztousec(curr_time - stime)/1000); 4813 goto done; 4814 } 4815 DINFO("failover: 0x%x demux done in %d ms, retries = %d\n", 4816 finfo->fi_local_skey, drv_hztousec(curr_time - stime)/1000, 4817 retries); 4818 ASSERT((finfo->fi_status & OPSRSM_FINFO_DEMUX_DONE) != 0); 4819 4820 /* 4821 * retransmission is done by queueing the packets in the rexmitq 4822 * to the sendq of the rd and dispatching a thread to drain the 4823 * sendq. 
4824 */ 4825 qlen = OPSRSM_Q_LEN(&finfo->fi_rexmitq); 4826 DINFO("failover: 0x%x rexmiting remaining %d packets\n", 4827 finfo->fi_local_skey, qlen); 4828 mutex_enter(&rd->rd_sendq_lock); 4829 /* 4830 * need to increment the refcnt before dispatching the xmit 4831 * thread to make sure that the rd does not disappear before 4832 * the xmit thread runs. 4833 */ 4834 REFDEST(rd, isdel); 4835 if (isdel == 0) { 4836 mblk_t *mp; 4837 uint32_t skey; 4838 4839 if (qlen > 0) { 4840 OPSRSM_Q_CONCAT(&rd->rd_sendq, &finfo->fi_rexmitq); 4841 } 4842 /* 4843 * a marker is appended at the end of the retransmitted 4844 * stream. an acknowledgement will be sent back when the 4845 * receiver gets this marker. send local skey if old proto 4846 * is used. 4847 */ 4848 if ((finfo->fi_status & OPSRSM_FINFO_OLD_PROTO) != 0) { 4849 skey = finfo->fi_local_skey; 4850 } else { 4851 skey = finfo->fi_remote_skey; 4852 } 4853 mp = opsrsm_alloc_ack_msg(skey); 4854 if (mp != NULL) { 4855 OPSRSM_Q_APPEND(&rd->rd_sendq, mp); 4856 opsrsm_dispatch_tmo((void *)rd); 4857 } else { 4858 error = ENOMEM; 4859 UNREFDEST(rd); 4860 opsrsm_lostconn(rd); 4861 } 4862 } else { 4863 /* 4864 * failure to increment the refcnt indicates that the 4865 * rd is about to be torn down. 4866 */ 4867 error = ENETDOWN; 4868 } 4869 mutex_exit(&rd->rd_sendq_lock); 4870 if (error != 0) { 4871 DINFO("failover: 0x%x step 2 error %d\n", 4872 finfo->fi_local_skey, error); 4873 goto done; 4874 } 4875 4876 retries = 0; 4877 stime = ddi_get_lbolt(); 4878 mutex_enter(&finfo->fi_lock); 4879 /* 4880 * we need to wait for two conditions: 4881 * REXMIT_DONE - this indicates that the remote node 4882 * has finished retransmitting packets to us. 4883 * REXMIT_ACKED - this indicates that we have finished 4884 * retransmitting packets to the remote node. 4885 * 4886 * Note that for backward compatibility with an older version 4887 * of RDT we also need to check the OLD_PROTO flag. 
if this 4888 * flag is set, we do not wait for the REXMIT_DONE condition. 4889 * this has to be done because the older protocol only transmits 4890 * a marker if there are packets to rexmit and the REXMIT_DONE 4891 * condition cannot be met if the marker never arrives. 4892 */ 4893 for (;;) { 4894 uint_t flag; 4895 4896 flag = OPSRSM_FINFO_REXMIT_DONE | OPSRSM_FINFO_OLD_PROTO; 4897 if ((finfo->fi_status & OPSRSM_FINFO_REXMIT_ACKED) != 0 && 4898 (finfo->fi_status & flag) != 0) { 4899 break; 4900 } 4901 curr_time = ddi_get_lbolt(); 4902 (void) cv_timedwait(&finfo->fi_wait_cv, &finfo->fi_lock, 4903 curr_time + wait_time); 4904 if (rd->rd_state != OPSRSM_STATE_W_READY) { 4905 error = ENETDOWN; 4906 break; 4907 } 4908 if (++retries > opsrsm_failover_max_retries) { 4909 error = ETIMEDOUT; 4910 opsrsm_lostconn(rd); 4911 break; 4912 } 4913 } 4914 mutex_exit(&finfo->fi_lock); 4915 curr_time = ddi_get_lbolt(); 4916 if (error != 0) { 4917 DINFO("failover: 0x%x step 3 error %d, wait time %d ms\n", 4918 finfo->fi_local_skey, error, (curr_time - stime) * 10); 4919 goto done; 4920 } 4921 ASSERT((finfo->fi_status & OPSRSM_FINFO_REXMIT_ACKED) != 0); 4922 DINFO("failover: 0x%x retransmission done in %d ms\n", 4923 finfo->fi_local_skey, drv_hztousec(curr_time - stime)/1000); 4924 4925 done:; 4926 if (rd != NULL) { 4927 if (opsrsmdev->opsrsm_param.rsmrdt_enable_loadbalance) { 4928 if (rd->rd_adapter->sel_cnt > 0) 4929 rd->rd_adapter->sel_cnt--; 4930 } 4931 UNREFDEST(rd); 4932 } 4933 qlen = OPSRSM_Q_LEN(&finfo->fi_rexmitq); 4934 if (qlen > 0) { 4935 DINFO("failover: 0x%x reconnect failed, discarding " 4936 "%d packets\n", finfo->fi_local_skey, qlen); 4937 } 4938 OPSRSM_Q_FLUSH(&finfo->fi_rexmitq); 4939 /* senders can now be woken up */ 4940 opsrsm_finfo_wakeup(finfo, error); 4941 4942 /* schedule a timeout to destroy the finfo structure */ 4943 (void) timeout(opsrsm_finfo_destroy, finfo, 4944 opsrsm_failover_destruct_time * wait_time); 4945 } 4946 4947 4948 static void 4949 
opsrsm_reset_rp(opsrsmresource_t *rp, opsrsm_dest_t *rd) 4950 { 4951 mutex_enter(&rp->rs_lock); 4952 if ((rp->rs_state & OPSRSM_RS_CONNECTING) != 0) { 4953 cv_wait(&rp->rs_conn_cv, &rp->rs_lock); 4954 } 4955 if (rp->rs_dest == rd && rp->rs_local_skey == rd->rd_local_skey) { 4956 rp->rs_state |= OPSRSM_RS_FAILOVER; 4957 if ((rp->rs_state & OPSRSM_RS_REFDEST) != 0) { 4958 if (opsrsmdev->opsrsm_param. 4959 rsmrdt_enable_loadbalance) { 4960 if (rd->rd_adapter->sel_cnt > 0) 4961 rp->rs_dest->rd_adapter->sel_cnt--; 4962 } 4963 rp->rs_state &= ~OPSRSM_RS_REFDEST; 4964 UNREFDEST(rp->rs_dest); 4965 } 4966 } 4967 mutex_exit(&rp->rs_lock); 4968 } 4969 4970 static void 4971 opsrsm_reset_all_rps(opsrsm_dest_t *rd) 4972 { 4973 int i, j; 4974 opsrsmresource_blk_t *blk; 4975 opsrsmresource_t *rp; 4976 4977 rw_enter(&opsrsm_resource.opsrsmrct_lock, RW_READER); 4978 for (i = 0; i < opsrsm_resource.opsrsmrc_len; i++) { 4979 blk = opsrsm_resource.opsrsmrc_root[i]; 4980 if (blk != NULL && blk->opsrsmrcblk_avail < OPSRSMRC_BLKSZ) { 4981 for (j = 0; j < OPSRSMRC_BLKSZ; j++) { 4982 rp = blk->opsrsmrcblk_blks[j]; 4983 if (rp != NULL) { 4984 opsrsm_reset_rp(rp, rd); 4985 } 4986 } 4987 } 4988 } 4989 rw_exit(&opsrsm_resource.opsrsmrct_lock); 4990 } 4991 4992 void 4993 rsmrdt_failover(adapter_t *adapterp, rsm_addr_t hwaddr) 4994 { 4995 opsrsm_dest_t *rd; 4996 int isdel = 0; 4997 4998 /* 4999 * Scan through all the rds associated with this adapter and 5000 * mark them down. 
5001 */ 5002 5003 DINFO("failover: entering, adapter 0x%p\n", adapterp); 5004 5005 FINDDEST(rd, isdel, hwaddr, adapterp); 5006 if (isdel || !rd) { 5007 goto out; 5008 } 5009 5010 DINFO("failover: local_skey 0x%x, rd 0x%p\n", rd->rd_local_skey, rd); 5011 opsrsm_lostconn(rd); 5012 UNREFDEST(rd); 5013 5014 out:; 5015 /* call lower failover function */ 5016 DINFO("failover: exiting, adapter 0x%p\n", adapterp); 5017 } 5018 5019 /* 5020 * **************************************************************** 5021 * * 5022 * B E G I N RSM SETUP/TAKEDOWN * 5023 * * 5024 * **************************************************************** 5025 */ 5026 5027 int 5028 rsmrdt_check_openhandles() { 5029 int rval = -1; 5030 5031 while (rw_tryenter(&opsrsm_resource.opsrsmrct_lock, RW_WRITER) == 0) { 5032 delay(1); 5033 } 5034 if (opsrsm_resource.opsrsmrc_cnt <= 1) { 5035 /* 5036 * Can unload module. 5037 */ 5038 opsrsm_resource.opsrsmrc_flag = OPSRSMRC_UNLOAD_INPROGRESS; 5039 rval = 0; 5040 } 5041 rw_exit(&opsrsm_resource.opsrsmrct_lock); 5042 return (rval); 5043 } 5044 5045 /* 5046 * Initialize per adapter RSMRDT resources. 5047 * Return 0 on success, nonzero on error. 5048 */ 5049 int 5050 rsmrdt_adapterinit(adapter_t *adapterp) 5051 { 5052 char tqname[32]; 5053 5054 /* 5055 * Initialize mutexes for this device. 5056 */ 5057 mutex_init(&adapterp->opsrsm_dest_lock, NULL, MUTEX_DRIVER, NULL); 5058 mutex_init(&adapterp->opsrsm_runq_lock, NULL, MUTEX_DRIVER, NULL); 5059 cv_init(&adapterp->opsrsm_uninit_cv, NULL, CV_DRIVER, NULL); 5060 5061 (void) sprintf(tqname, "opsrsm%d", adapterp->adapterid); 5062 adapterp->opsrsm_taskq = taskq_create(tqname, 4, maxclsyspri, 1, 4, 5063 TASKQ_PREPOPULATE); 5064 return (0); 5065 } 5066 5067 5068 /* 5069 * Un-initialize per adapter RSMRDT resources. 5070 * Returns 0 if completely successful. 5071 * Returns -1 if not in a state where uninitialize makes sense. 
5072 */ 5073 int 5074 rsmrdt_adapterfini(adapter_t *adapterp) 5075 { 5076 /* 5077 * If we can't release all destination and RSMPI resources, we can't 5078 * detach. The user will have to try later to unload the driver. 5079 */ 5080 D1("rsmrdt_adapterfini: adapterp->adapterid = %d\n", 5081 adapterp->adapterid); 5082 if (opsrsmuninit(adapterp) != 0) { 5083 return (-1); 5084 } 5085 taskq_destroy(adapterp->opsrsm_taskq); 5086 adapterp->opsrsm_taskq = NULL; 5087 5088 cv_destroy(&adapterp->opsrsm_uninit_cv); 5089 mutex_destroy(&adapterp->opsrsm_runq_lock); 5090 mutex_destroy(&adapterp->opsrsm_dest_lock); 5091 5092 return (0); 5093 } 5094 5095 5096 /* 5097 * Un-initialize OPSRSM resources. Returns 0 if completely successful. 5098 * Returns -1 if not in a state where uninitialize makes sense. Returns >0 5099 * if uninitialize was started, but hasn't completed because not all 5100 * connections have been torn down yet. 5101 */ 5102 static int 5103 opsrsmuninit(adapter_t *adapterp) 5104 { 5105 int dests_not_cleaned_up; 5106 int total_refcnt = 0; 5107 rsm_addr_t i; 5108 5109 D1("opsrsmuninit: adapterp 0x%p", (void *)adapterp); 5110 5111 for (i = 0; i < RSM_MAX_DESTADDR; i++) 5112 total_refcnt += opsrsmfreedest(adapterp, i); 5113 5114 mutex_enter(&adapterp->opsrsm_dest_lock); 5115 dests_not_cleaned_up = adapterp->opsrsm_numdest; 5116 if (total_refcnt > adapterp->opsrsm_numdest) { 5117 mutex_exit(&adapterp->opsrsm_dest_lock); 5118 DERR("opsrsmuninit: total_refcnt = %d", total_refcnt); 5119 return (dests_not_cleaned_up); 5120 } 5121 5122 if (dests_not_cleaned_up > 0) { 5123 cv_wait(&adapterp->opsrsm_uninit_cv, 5124 &adapterp->opsrsm_dest_lock); 5125 dests_not_cleaned_up = adapterp->opsrsm_numdest; 5126 } 5127 mutex_exit(&adapterp->opsrsm_dest_lock); 5128 5129 D1("opsrsmuninit: returning %d", dests_not_cleaned_up); 5130 return (dests_not_cleaned_up); 5131 } 5132 5133 /* 5134 * Get all the opsrsm parameters out of the device tree and store them in a 5135 * OPSRSM device (RSM 
controller) structure. 5136 */ 5137 static void 5138 opsrsmgetparam( 5139 dev_info_t *dip, /* Device's info pointer */ 5140 opsrsm_t *opsrsmp) /* OPSRSM device (RSM controller) pointer */ 5141 { 5142 struct opsrsm_param *sp = &opsrsmp->opsrsm_param; 5143 5144 /* Get parameters */ 5145 5146 sp->opsrsm_buffers = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5147 "rsmrdt-buffers", OPSRSM_BUFFERS_DFLT); 5148 sp->opsrsm_buffer_size = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5149 "rsmrdt-buffer-size", OPSRSM_BUFFER_SIZE_DFLT) + 5150 OPSRSM_CACHELINE_SIZE; 5151 sp->opsrsm_queue_size = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5152 "rsmrdt-queue-size", OPSRSM_QUEUE_SIZE_DFLT); 5153 sp->opsrsm_buffers_retained = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, 5154 dip, 0, "rsmrdt-buffers-retained", OPSRSM_BUFFERS_RETAINED_DFLT); 5155 sp->opsrsm_max_queued_pkts = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, 5156 dip, 0, "rsmrdt-max-queued-pkts", OPSRSM_MAX_QUEUED_PKTS_DFLT); 5157 sp->opsrsm_msg_init_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5158 "rsmrdt-msg-init-tmo", OPSRSM_MSG_INIT_TMO_DFLT); 5159 sp->opsrsm_msg_max_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5160 "rsmrdt-msg-max-tmo", OPSRSM_MSG_MAX_TMO_DFLT); 5161 sp->opsrsm_msg_drop_tmo = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5162 "rsmrdt-msg-drop-tmo", OPSRSM_MSG_DROP_TMO_DFLT); 5163 sp->opsrsm_ack_tmo = ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5164 "rsmrdt-ack-tmo", OPSRSM_ACK_TMO_DFLT); 5165 sp->opsrsm_sync_tmo = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5166 "rsmrdt-sync-tmo", OPSRSM_SYNC_TMO_DFLT); 5167 sp->opsrsm_fqe_sync_size = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5168 "rsmrdt-fqe-sync-size", OPSRSM_FQE_SYNC_SIZE_DFLT); 5169 sp->opsrsm_retry_limit = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5170 "rsmrdt-retry-limit", OPSRSM_RETRY_LIMIT_DFLT); 5171 sp->opsrsm_retry_delay = ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5172 "rsmrdt-retry-delay", OPSRSM_RETRY_DELAY_DFLT); 5173 sp->opsrsm_xmit_delay = (uint_t)ddi_getprop(DDI_DEV_T_ANY, 
dip, 0, 5174 "rsmrdt-xmit-delay", OPSRSM_XMIT_DELAY_DFLT); 5175 sp->opsrsm_data_threshold = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5176 "rsmrdt-data-threshold", OPSRSM_DATA_THRESHOLD_DFLT); 5177 sp->opsrsm_max_recv_msgs = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5178 "rsmrdt-max-recv-msgs", OPSRSM_MAX_RECV_MSGS_DFLT); 5179 sp->opsrsm_adaptive_intr = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5180 "rsmrdt-adaptive-intr", OPSRSM_ADAPTIVE_INTR_DFLT); 5181 sp->opsrsm_adaptive_rate = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5182 "rsmrdt-adaptive-rate", OPSRSM_ADAPTIVE_RATE_DFLT); 5183 sp->opsrsm_mem_hi_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5184 "rsmrdt-mem-hi-wat", OPSRSM_MEM_HI_WAT_DFLT); 5185 sp->opsrsm_mem_lo_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5186 "rsmrdt-mem-lo-wat", OPSRSM_MEM_LO_WAT_DFLT); 5187 sp->opsrsm_recv_hi_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5188 "rsmrdt-recv-hi-wat", OPSRSM_RECV_HI_WAT_DFLT); 5189 sp->opsrsm_recv_lo_wat = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5190 "rsmrdt-recv-lo-wat", OPSRSM_RECV_LO_WAT_DFLT); 5191 sp->opsrsm_flow_tmo_int = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 0, 5192 "rsmrdt-flow-tmo-int", OPSRSM_FLOW_TMO_INT_DFLT); 5193 sp->opsrsm_max_loopback_pkts = (uint_t)ddi_getprop(DDI_DEV_T_ANY, dip, 5194 0, "rsmrdt-max-loopback-pkts", OPSRSM_MAX_LOOPBACK_PKTS_DFLT); 5195 sp->rsmrdt_enable_loadbalance = (ushort_t)ddi_getprop(DDI_DEV_T_ANY, 5196 dip, 0, "rsmrdt-enable-loadbalance", 5197 RSMRDT_ENABLE_LOADBALANCE_DFLT); 5198 5199 /* 5200 * Sanity check parameters, modify if needed. Note that we mainly 5201 * check to make sure parameters won't make the driver malfunction; 5202 * we don't necessarily prevent them from being stupid. 5203 */ 5204 5205 /* Need to have at least one buffer. */ 5206 5207 if (sp->opsrsm_buffers == 0) 5208 sp->opsrsm_buffers = 1; 5209 5210 /* Buffer length must be multiple of 64 (0x40). 
*/ 5211 5212 if (sp->opsrsm_buffer_size & ~OPSRSM_CACHELINE_MASK) { 5213 sp->opsrsm_buffer_size &= OPSRSM_CACHELINE_MASK; 5214 } 5215 5216 /* 5217 * Must have at least one more queue element then the number of 5218 * buffers. This is so that we can track when all queue elements 5219 * need to be flushed to remote. 5220 */ 5221 5222 if (sp->opsrsm_queue_size <= sp->opsrsm_buffers) { 5223 sp->opsrsm_queue_size = sp->opsrsm_buffers + 1; 5224 } 5225 5226 /* Can't retain more buffers than we have. */ 5227 5228 if (sp->opsrsm_buffers_retained > sp->opsrsm_buffers) { 5229 sp->opsrsm_buffers_retained = sp->opsrsm_buffers; 5230 } 5231 5232 /* Have to be able to queue at least 1 packet. */ 5233 5234 if (sp->opsrsm_max_queued_pkts < 1) { 5235 sp->opsrsm_max_queued_pkts = 1; 5236 } 5237 5238 if (sp->opsrsm_msg_init_tmo < 1) { 5239 sp->opsrsm_msg_init_tmo = 1; 5240 } 5241 if (sp->opsrsm_msg_max_tmo < 1) { 5242 sp->opsrsm_msg_max_tmo = 1; 5243 } 5244 if (sp->opsrsm_ack_tmo < 1) { 5245 sp->opsrsm_ack_tmo = 1; 5246 } 5247 if (sp->opsrsm_sync_tmo < 1) { 5248 sp->opsrsm_sync_tmo = 1; 5249 } 5250 if (sp->opsrsm_retry_limit < 1) { 5251 sp->opsrsm_retry_limit = 1; 5252 } 5253 if (sp->opsrsm_retry_delay < 1) { 5254 sp->opsrsm_retry_delay = 1; 5255 } 5256 if (sp->opsrsm_xmit_delay < 1) { 5257 sp->opsrsm_xmit_delay = 1; 5258 } 5259 5260 if (sp->opsrsm_max_recv_msgs < 1) { 5261 sp->opsrsm_max_recv_msgs = OPSRSM_MAX_RECV_MSGS_DFLT; 5262 } 5263 5264 } 5265 5266 /* 5267 * **************************************************************** 5268 * * 5269 * E N D RSM SETUP/TAKEDOWN * 5270 * * 5271 * **************************************************************** 5272 */ 5273 5274 5275 /* 5276 * **************************************************************** 5277 * * 5278 * B E G I N CONNECTION DATA STRUCTURE MANAGEMENT * 5279 * * 5280 * **************************************************************** 5281 */ 5282 5283 5284 /* 5285 * Create the indicated destination structure, and return a pointer 
to it. 5286 * NOTE: this should never be called directly; use the MAKEDEST macro 5287 * instead. The macro checks that the destination structure does not yet 5288 * exist before calling this function. 5289 */ 5290 5291 5292 static opsrsm_dest_t * 5293 opsrsmmkdest(adapter_t *adapterp, 5294 rsm_addr_t rsm_addr) /* Address of destination to find/create */ 5295 { 5296 opsrsm_dest_t *rd; 5297 clock_t lbolttime; 5298 int adapterid, nodeid; 5299 5300 D1("opsrsmmkdest:cltr %d, rsmaddr %ld", adapterp->instance, rsm_addr); 5301 5302 /* Is the destination reasonable? */ 5303 5304 if (rsm_addr >= RSM_MAX_DESTADDR) { 5305 DERR("opsrsmmkdest: too big, returning NULL"); 5306 return (NULL); 5307 } 5308 5309 if ((rd = adapterp->opsrsm_desttbl[rsm_addr]) != NULL) { 5310 return (rd); 5311 } 5312 5313 ASSERT(MUTEX_HELD(&adapterp->opsrsm_dest_lock)); 5314 5315 /* retrieve the remote adapter id and remote node id */ 5316 rsmrdt_get_remote_ids(adapterp, rsm_addr, &adapterid, &nodeid); 5317 if (adapterid == -1 || nodeid == -1) { 5318 DERR("opsrsmmkdest: Unable to find remote ids\n"); 5319 return (NULL); 5320 } 5321 D1("opsrsmmkdest: Remote adapter id %d\n", adapterid); 5322 5323 if ((rd = (opsrsm_dest_t *)kmem_zalloc(sizeof (*rd), KM_NOSLEEP)) == 5324 NULL) { 5325 DERR("opsrsmmkdest: can't alloc, returning NULL"); 5326 return (NULL); 5327 } 5328 5329 rd->rd_evt_taskq = taskq_create("rd_events", 1, maxclsyspri, 1, 1, 5330 TASKQ_PREPOPULATE); 5331 5332 if (rd->rd_evt_taskq == NULL) { 5333 kmem_free(rd, sizeof (*rd)); 5334 return (NULL); 5335 } 5336 mutex_init(&rd->rd_msgs_lock, NULL, MUTEX_DRIVER, NULL); 5337 mutex_init(&rd->rd_evt_lock, NULL, MUTEX_DRIVER, NULL); 5338 cv_init(&rd->rd_evt_cv, NULL, CV_DRIVER, NULL); 5339 cv_init(&rd->rd_evt_wait_cv, NULL, CV_DRIVER, NULL); 5340 rd->rd_evt_flags = 0; 5341 rd->rd_msgs = NULL; 5342 rd->rd_msgs_tail = NULL; 5343 5344 if (taskq_dispatch(rd->rd_evt_taskq, opsrsm_event_thread, 5345 rd, KM_NOSLEEP) == 0) { 5346 mutex_destroy(&rd->rd_msgs_lock); 
5347 mutex_destroy(&rd->rd_evt_lock); 5348 cv_destroy(&rd->rd_evt_cv); 5349 cv_destroy(&rd->rd_evt_wait_cv); 5350 kmem_free(rd, sizeof (*rd)); 5351 return (NULL); 5352 } 5353 5354 rd->rd_buffer_size = opsrsmdev->opsrsm_param.opsrsm_buffer_size; 5355 rd->rd_rsm_addr = rsm_addr; 5356 rd->rd_rem_adapterid = adapterid; 5357 rd->rd_adapter = adapterp; 5358 5359 mutex_init(&rd->rd_net_lock, NULL, MUTEX_DRIVER, NULL); 5360 mutex_init(&rd->rd_fqr_lock, NULL, MUTEX_DRIVER, NULL); 5361 mutex_init(&rd->rd_xmit_lock, NULL, MUTEX_DRIVER, NULL); 5362 mutex_init(&rd->rd_lock, NULL, MUTEX_DRIVER, NULL); 5363 mutex_init(&rd->rd_sendq_lock, NULL, MUTEX_DRIVER, NULL); 5364 mutex_init(&rd->rd_tmo_lock, NULL, MUTEX_DRIVER, NULL); 5365 mutex_init(&rd->rd_freeq_lock, NULL, MUTEX_DRIVER, NULL); 5366 cv_init(&rd->rd_conn_cv, NULL, CV_DRIVER, NULL); 5367 5368 /* 5369 * Use the time to generate a pseudo-random initial sequence 5370 * number. 5371 */ 5372 (void) drv_getparm(LBOLT, &lbolttime); 5373 rd->rd_nseq = (ushort_t)lbolttime; 5374 5375 rd->rd_state = OPSRSM_STATE_NEW; 5376 rd->rd_xmit_state = OPSRSM_XMIT_UNINITIALIZED; 5377 rd->rd_nodeid = nodeid; 5378 rd->rd_start_time = 0; 5379 rd->rd_data_collected = 0; 5380 rd->rd_writes_completed = 0; 5381 rd->rd_active_threads = 0; 5382 rd->rd_xmit_tmo_id = 0; 5383 rd->rd_fqe_tmo_id = 0; 5384 rd->rd_sync_dqe_tmo_id = 0; 5385 rd->rd_sync_fqe_tmo_id = 0; 5386 rd->rd_xmit_tmo_int = 0; 5387 rd->rd_fqe_tmo_int = 0; 5388 rd->rd_refcnt = 1; 5389 rd->rd_pollhd.ph_list = NULL; 5390 rd->rd_events = 0; 5391 rd->rd_adaptive_threshold = 0; 5392 rd->rd_last_sent = (uint32_t)lbolttime; 5393 rd->rd_pkt_freq = 0; 5394 rd->rd_freeq_freeze = B_FALSE; 5395 rd->rd_next_rseqno = 1; 5396 rd->rd_next_lseqno = 1; 5397 rd->rd_local_skey = (uint32_t)gethrtime(); 5398 rd->rd_local_skey ^= (uint32_t)rd + (uint32_t)rsm_addr; 5399 rd->rd_remote_skey = 0; 5400 rd->rd_retry_int = B_FALSE; 5401 rd->rd_freeq_freeze = B_FALSE; 5402 rd->rd_freed = B_FALSE; 5403 rd->rd_unpublish_errs 
= 0; 5404 OPSRSM_Q_INIT(&rd->rd_sendq); 5405 OPSRSM_Q_INIT(&rd->rd_pendq); 5406 OPSRSM_Q_INIT(&rd->rd_freeq); 5407 rd->rd_remote_flow_stop = 0; 5408 rd->rd_remote_flow_ctl = 0; 5409 rd->rd_flow_ctl = NULL; 5410 rd->rd_flow_tmo_id = 0; 5411 rd->rd_pkts_delivered = 0; 5412 rd->rd_status_tmo_id = 0; 5413 rd->rd_queued_fqe_freelist = NULL; 5414 rd->rd_queued_fqe_list = NULL; 5415 rd->rd_queued_fqe_tail = NULL; 5416 rd->rd_queued_fqe_array = NULL; 5417 rd->rd_queued_fqe_cnt = 0; 5418 rd->rd_fqr_flags = 0; 5419 5420 adapterp->opsrsm_desttbl[rsm_addr] = rd; 5421 adapterp->opsrsm_numdest++; 5422 5423 D1("opsrsmmkdest: created new dest, returning 0x%p", (void *)rd); 5424 return (rd); 5425 } 5426 5427 /* 5428 * Destination deletion 5429 * 5430 * As mentioned above (way above), we maintain a reference count on all 5431 * destinations, which is incremented and decremented around uses of the 5432 * destination structure. When this reference count goes to zero, we delete 5433 * the destination. 5434 * 5435 * Because of the possibility of other threads trying to use the destination 5436 * while we're deleting it, deletion is actually a multiple-step process, 5437 * which works as follows. 5438 * 5439 * 1. When a destination is created, its dstate (deletion state) is set to 5440 * zero, and its reference count is set to one. 5441 * 5442 * 2. When the service routine or some other routine decides that a destination 5443 * should be deleted, it calls opsrsmfreedest(). That routine sets dstate 5444 * to 1 and cancels any pending sync timeouts. It then decrements the 5445 * destination's reference count. This deletes the reference set in 5446 * opsrsmmkdest. (Note that since dstate is now 1, the FINDDEST and REFDEST 5447 * macros will now note that the destination is being deleted; thus, any 5448 * interrupt referring to the destination will no longer modify the 5449 * reference count.) 5450 * 5451 * 3. Soon after this, opsrsmdest_refcnt_0 is called. 
 *    (This may either be
 *    directly from opsrsmfreedest(), or perhaps from another routine if it
 *    was running concurrently with freedest() and its UNREF happened last).
 *    This routine sees that dstate is 1, and immediately dispatches a taskq
 *    job which will execute opsrsmfreedesttmo().  (This is necessary because
 *    we may not be able to do everything in the phase 1 deletion from the
 *    routine that we're currently in.)
 *
 * 4. opsrsmfreedesttmo() runs; it checks if there are any outstanding
 *    loaned-up buffers.  If so, it sets a flag to cause the loan returning
 *    code to decrement the refcnt, and returns without performing cleanup.
 *    When all loaned buffers are returned and the refcnt is decremented, we
 *    go back to step 3, above.  When opsrsmfreedesttmo() finally runs with
 *    no loaned buffers, it gets rid of most of the OPSRSM resources attached
 *    to the destination.  It also throws away any queued packets and gets
 *    rid of any allocated DVMA resources.  It changes dstate to 2 and takes
 *    this destination structure out of the base-ID => destination table.
 *    It then decrements the reference count that had been added by
 *    opsrsmdest_refcnt_0().
 *
 * 5. When the reference count becomes 0, opsrsmdest_refcnt_0 is again called.
 *    It notices that dstate is 2, and frees the destination structure.
 */

/*
 * A destination's reference count went to 0, deal with it.
5477 */ 5478 static boolean_t 5479 opsrsmdest_refcnt_0( 5480 opsrsm_dest_t *rd) /* Destination pointer */ 5481 { 5482 opsrsm_t *opsrsmp = opsrsmdev; 5483 adapter_t *adapterp = rd->rd_adapter; 5484 boolean_t freed = B_FALSE; 5485 5486 mutex_enter(&adapterp->opsrsm_dest_lock); 5487 5488 D1("opsrsmdest_refcnt_0: rd 0x%p (addr %ld ctlr %d), refcnt %d, " 5489 "dstate %d", 5490 (void *)rd, rd->rd_rsm_addr, adapterp->instance, 5491 rd->rd_refcnt, rd->rd_dstate); 5492 5493 if (rd->rd_dstate == 1) { 5494 rd->rd_refcnt++; /* Inline REFDEST */ 5495 5496 DINFO("failover: 0x%x start destruction\n", rd->rd_local_skey); 5497 /* 5498 * We may be called from a routine that can't actually do the 5499 * work that needs to be done, so we schedule a thread to do 5500 * the next phase of the deletion. 5501 */ 5502 (void) taskq_dispatch(opsrsm_events_taskq, opsrsmfreedesttmo, 5503 rd, KM_SLEEP); 5504 5505 } else if (rd->rd_dstate == 2) { 5506 5507 /* Destroy all the mutexes */ 5508 DINFO("failover: 0x%x end destruction\n", rd->rd_local_skey); 5509 opsrsm_queued_msg_flush(rd); 5510 5511 mutex_destroy(&rd->rd_lock); 5512 mutex_destroy(&rd->rd_net_lock); 5513 mutex_destroy(&rd->rd_fqr_lock); 5514 mutex_destroy(&rd->rd_xmit_lock); 5515 mutex_destroy(&rd->rd_nlb_lock); 5516 mutex_destroy(&rd->rd_sendq_lock); 5517 mutex_destroy(&rd->rd_tmo_lock); 5518 mutex_destroy(&rd->rd_freeq_lock); 5519 mutex_destroy(&rd->rd_msgs_lock); 5520 mutex_destroy(&rd->rd_evt_lock); 5521 5522 cv_destroy(&rd->rd_conn_cv); 5523 cv_destroy(&rd->rd_evt_cv); 5524 cv_destroy(&rd->rd_evt_wait_cv); 5525 5526 /* 5527 * Free any allocated memory hanging off the dest structure. 
5528 */ 5529 if (rd->rd_queued_fqe_array) { 5530 rd->rd_queued_fqe_freelist = NULL; 5531 rd->rd_queued_fqe_list = NULL; 5532 rd->rd_queued_fqe_tail = NULL; 5533 kmem_free(rd->rd_queued_fqe_array, 5534 opsrsmp->opsrsm_param.opsrsm_queue_size * 5535 sizeof (opsrsm_queued_fqe_t)); 5536 rd->rd_queued_fqe_array = NULL; 5537 rd->rd_queued_fqe_cnt = 0; 5538 } 5539 5540 if (rd->rd_cached_fqr) { 5541 kmem_free(rd->rd_cached_fqr, 5542 sizeof (*rd->rd_cached_fqr) * rd->rd_num_fqrs); 5543 } 5544 if (rd->rd_shdwfqw_f) { 5545 kmem_free(rd->rd_shdwfqw_f, 5546 sizeof (*rd->rd_shdwfqw_f) * rd->rd_num_fqws); 5547 } 5548 if (rd->rd_shdwdqw_f) { 5549 kmem_free(rd->rd_shdwdqw_f, 5550 sizeof (*rd->rd_shdwdqw_f) * rd->rd_num_dqws); 5551 } 5552 if (rd->rd_bufbase) { 5553 kmem_free(rd->rd_bufbase, 5554 opsrsmp->opsrsm_param.opsrsm_buffers * 5555 sizeof (*rd->rd_bufbase)); 5556 } 5557 if (rd->rd_rawmem_base_addr) { 5558 kmem_free(rd->rd_rawmem_base_addr, 5559 rd->rd_rawmem_base_size); 5560 } 5561 5562 /* Finally free the dest structure */ 5563 5564 kmem_free(rd, sizeof (*rd)); 5565 freed = B_TRUE; 5566 5567 adapterp->opsrsm_numdest--; 5568 D1("opsrsmdest_refcnt_0: freed rd data structures"); 5569 } 5570 5571 if (freed && adapterp->opsrsm_numdest <= 0) { 5572 cv_signal(&adapterp->opsrsm_uninit_cv); 5573 } 5574 mutex_exit(&adapterp->opsrsm_dest_lock); 5575 5576 D1("opsrsmdest_refcnt_0: done"); 5577 return (freed); 5578 } 5579 5580 /* 5581 * Do deletion work. 5582 */ 5583 static void 5584 opsrsmfreedesttmo(void * arg) 5585 { 5586 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 5587 int bufnum, offset, length; 5588 opsrsm_failover_info_t *finfo; 5589 ushort_t sap; 5590 int err, cnt; 5591 boolean_t read_pkts = B_FALSE; 5592 5593 /* 5594 * See if there are any more outstanding loaned buffers. If so, 5595 * set flag so that freebuf will eventually do an UNREF when it 5596 * frees the last buffer. This removes the reference added in 5597 * opsrsmdest_refcnt_0(), causing the count to again go to 0. 
5598 * opsrsmdest_refcnt_0() will again be called, increment the refcnt 5599 * and cause this routine to be called to complete cleanup. 5600 */ 5601 5602 mutex_enter(&rd->rd_nlb_lock); 5603 5604 rd->rd_nlb_del = 1; 5605 if (rd->rd_nlb != 0) { 5606 DERR("opsrsmfreedesttmo: loaned buffers outstanding %d, dest " 5607 "%ld", rd->rd_nlb, rd->rd_rsm_addr); 5608 mutex_exit(&rd->rd_nlb_lock); 5609 return; 5610 } 5611 5612 mutex_exit(&rd->rd_nlb_lock); 5613 5614 /* 5615 * Perform the sendq destroy first -- this notifies the 5616 * remote side that the connection is going away, so 5617 * it can immediately start cleaning up. This helps 5618 * to avoid a situation where a segment is unpublished 5619 * while there is still a connection to it (which is legal, 5620 * but can cause overhead in some specific RSM drivers). 5621 */ 5622 if (rd->rd_sstate & OPSRSM_RSMS_RXFER_S) { 5623 ASSERT(rd->rsm_sendq); 5624 D4("opsrsmfreedesttmo: destroying sendq\n"); 5625 err = RSM_SENDQ_DESTROY(rd->rd_adapter->rsmrdt_ctlr_obj, 5626 rd->rsm_sendq); 5627 if (err) { 5628 DERR("RSM_SENDQ_DESTROY failed! err %d\n", err); 5629 } 5630 rd->rd_sstate &= ~OPSRSM_RSMS_RXFER_S; 5631 } 5632 5633 if (rd->rd_sstate & OPSRSM_RSMS_RXFER_C) { 5634 ASSERT(rd->rd_rxferhand); 5635 D1("opsrsmfreedesttmo: disconnecting from remote segment\n"); 5636 err = RSM_DISCONNECT(rd->rd_adapter->rsmrdt_ctlr_obj, 5637 rd->rd_rxferhand); 5638 if (err) { 5639 DERR("RSM_DISCONNECT failed! err %d\n", err); 5640 } 5641 rd->rd_sstate &= ~OPSRSM_RSMS_RXFER_C; 5642 } 5643 5644 if (rd->rd_sstate & OPSRSM_RSMS_LXFER_P) { 5645 ASSERT(rd->rd_lxferhand); 5646 D1("opsrsmfreedesttmo: unpublishing local segment\n"); 5647 retry:; 5648 err = RSM_UNPUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj, 5649 rd->rd_lxferhand); 5650 5651 if (err && rd->rd_unpublish_errs < RSMRDT_UNPUBLISH_TRY) { 5652 D1("RSM_UNPUBLISH failed! 
err %d\n", err); 5653 opsrsmdev->opsrsm_ierrors++; 5654 rd->rd_unpublish_errs++; 5655 /* 5656 * if RSM_UNPUBLISH fails, we need a slight delay 5657 * before retrying. 5658 */ 5659 delay(1); 5660 goto retry; 5661 } 5662 rd->rd_unpublish_errs = 0; 5663 rd->rd_sstate &= ~OPSRSM_RSMS_LXFER_P; 5664 } 5665 5666 if (rd->rd_sstate & OPSRSM_RSMS_LXFER_C) { 5667 ASSERT(rd->rd_lxferhand); 5668 D1("opsrsmfreedesttmo: destroying local segment\n"); 5669 err = RSM_SEG_DESTROY(rd->rd_adapter->rsmrdt_ctlr_obj, 5670 rd->rd_lxferhand); 5671 if (err) { 5672 DERR("RSM_SEG_DESTROY failed! err %d\n", err); 5673 } 5674 rd->rd_sstate &= ~OPSRSM_RSMS_LXFER_C; 5675 read_pkts = B_TRUE; 5676 } 5677 5678 /* empty out packets remaining in the buffer pool */ 5679 5680 if (read_pkts) { 5681 cnt = 0; 5682 /* Loop through all valid DQE's and process their packets. */ 5683 while (opsrsmgetdqe(rd, &bufnum, &offset, &length, &sap)) { 5684 /* Don't try to send up DQE with zero length */ 5685 cnt++; 5686 if (length) 5687 (void) opsrsmread(rd, bufnum, offset, length, 5688 sap); 5689 else { 5690 cmn_err(CE_PANIC, "received corrupted " 5691 "packet\n"); 5692 } 5693 } 5694 if (cnt > 0) { 5695 DINFO("failover: 0x%x read %d remaining packets\n", 5696 rd->rd_local_skey, cnt); 5697 } 5698 } 5699 opsrsm_wake_senders(rd, POLLOUT); 5700 5701 /* Take out of desttbl */ 5702 mutex_enter(&rd->rd_adapter->opsrsm_dest_lock); 5703 rd->rd_adapter->opsrsm_desttbl[rd->rd_rsm_addr] = NULL; 5704 ASSERT(rd->rd_dstate == 1); 5705 rd->rd_dstate = 2; 5706 mutex_exit(&rd->rd_adapter->opsrsm_dest_lock); 5707 5708 finfo = opsrsm_finfo_lookup_by_local_skey(rd->rd_local_skey); 5709 ASSERT(finfo != NULL); 5710 5711 OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_freeq); 5712 OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_pendq); 5713 OPSRSM_Q_CONCAT(&finfo->fi_rexmitq, &rd->rd_sendq); 5714 finfo->fi_next_rseqno = rd->rd_next_rseqno; 5715 5716 /* 5717 * no need to rexmit if there are no open fds or if driver 5718 * is being unloaded 5719 */ 
5720 if (opsrsm_resource.opsrsmrc_flag == OPSRSMRC_UNLOAD_INPROGRESS) { 5721 DINFO("failover: rexmit thread not dispatched\n"); 5722 OPSRSM_Q_FLUSH(&finfo->fi_rexmitq); 5723 opsrsm_finfo_wakeup(finfo, ENETDOWN); 5724 if (opsrsm_resource.opsrsmrc_flag != 5725 OPSRSMRC_UNLOAD_INPROGRESS) { 5726 mutex_enter(&opsrsm_finfo_lock); 5727 opsrsm_failover_threads++; 5728 mutex_exit(&opsrsm_finfo_lock); 5729 (void) timeout(opsrsm_finfo_destroy, finfo, 10); 5730 } 5731 } else { 5732 DINFO("failover: rd 0x%p adapter 0x%p (cltr %d) addr 0x%llx\n", 5733 rd, rd->rd_adapter, rd->rd_adapter->instance, 5734 rd->rd_rsm_addr); 5735 DINFO("failover: local_skey 0x%x, remote_skey 0x%x, " 5736 "finfo 0x%p\n", rd->rd_local_skey, rd->rd_remote_skey, 5737 finfo); 5738 mutex_enter(&opsrsm_finfo_lock); 5739 opsrsm_failover_threads++; 5740 mutex_exit(&opsrsm_finfo_lock); 5741 if (OPSRSM_Q_LEN(&finfo->fi_rexmitq) == 0 && 5742 finfo->fi_remote_skey == 0) { 5743 /* 5744 * this occurs when segment establishment 5745 * failed. there is no need to keep the finfo 5746 * since no data transfer ever occurred. 5747 */ 5748 opsrsm_finfo_wakeup(finfo, ENETDOWN); 5749 (void) timeout(opsrsm_finfo_destroy, finfo, 10); 5750 } else { 5751 (void) timeout(opsrsm_dispatch_failover, finfo, 100); 5752 } 5753 } 5754 /* Make sure dest isn't on service queue */ 5755 mutex_enter(&rd->rd_adapter->opsrsm_runq_lock); 5756 5757 if (rd->rd_adapter->opsrsm_runq == rd) 5758 rd->rd_adapter->opsrsm_runq = rd->rd_next; 5759 else { 5760 opsrsm_dest_t *lastrd = rd->rd_adapter->opsrsm_runq; 5761 5762 while (lastrd) { 5763 if (lastrd->rd_next == rd) { 5764 lastrd->rd_next = rd->rd_next; 5765 break; 5766 } 5767 lastrd = lastrd->rd_next; 5768 } 5769 } 5770 5771 mutex_exit(&rd->rd_adapter->opsrsm_runq_lock); 5772 5773 ASSERT(rd->rd_sstate == 0); 5774 5775 /* 5776 * Removes the reference added in opsrsmdest_refcnt_0(). 
5777 */ 5778 UNREFDEST(rd); 5779 5780 D1("opsrsmfreedesttmo: done"); 5781 } 5782 5783 5784 /* 5785 * Start the deletion process for a destination. 5786 */ 5787 static int 5788 opsrsmfreedest(adapter_t *adapter, rsm_addr_t rsm_addr) 5789 { 5790 opsrsm_dest_t *rd; 5791 timeout_id_t tmoid; 5792 int refcnt = 0; 5793 opsrsm_t *opsrsmp = opsrsmdev; 5794 5795 D2("opsrsmfreedest: remote rsmaddr %ld", rsm_addr); 5796 mutex_enter(&adapter->opsrsm_dest_lock); 5797 rd = adapter->opsrsm_desttbl[rsm_addr]; 5798 if (rd == NULL || rd->rd_dstate != 0) { 5799 #ifdef DEBUG 5800 if (rd != NULL) { 5801 cmn_err(CE_CONT, "opsrsmfreedest: dstate = %d, " 5802 "exiting\n", rd->rd_dstate); 5803 } 5804 #endif /* DEBUG */ 5805 mutex_exit(&adapter->opsrsm_dest_lock); 5806 return (refcnt); 5807 } 5808 if (rd->rd_freed) { 5809 #ifdef DEBUG 5810 cmn_err(CE_CONT, "opsrsmfreedest: already freed\n"); 5811 #endif /* DEBUG */ 5812 mutex_exit(&adapter->opsrsm_dest_lock); 5813 return (refcnt); 5814 } 5815 rd->rd_freed = B_TRUE; 5816 (void) opsrsm_finfo_add(rd); 5817 mutex_exit(&adapter->opsrsm_dest_lock); 5818 mutex_enter(&rd->rd_xmit_lock); 5819 5820 mutex_enter(&rd->rd_freeq_lock); 5821 rd->rd_freeq_freeze = B_TRUE; 5822 mutex_exit(&rd->rd_freeq_lock); 5823 5824 rd->rd_xmit_state = OPSRSM_XMIT_DISCONNECTED; 5825 cv_broadcast(&rd->rd_conn_cv); 5826 mutex_exit(&rd->rd_xmit_lock); 5827 5828 opsrsm_reset_all_rps(rd); 5829 5830 mutex_enter(&adapter->opsrsm_dest_lock); 5831 D1("opsrsmfreedest: opsrsmp 0x%p (cltr %d) rsmaddr %ld", 5832 (void *)opsrsmp, rd->rd_adapter->instance, rsm_addr); 5833 rd->rd_dstate = 1; 5834 refcnt = rd->rd_refcnt; 5835 mutex_exit(&adapter->opsrsm_dest_lock); 5836 5837 mutex_enter(&rd->rd_evt_lock); 5838 rd->rd_evt_flags |= OPSRSM_EVT_STOP; 5839 cv_signal(&rd->rd_evt_cv); 5840 cv_wait(&rd->rd_evt_wait_cv, &rd->rd_evt_lock); 5841 ASSERT((rd->rd_evt_flags & OPSRSM_EVT_DONE) != 0); 5842 mutex_exit(&rd->rd_evt_lock); 5843 5844 opsrsm_queued_msg_flush(rd); 5845 
taskq_destroy(rd->rd_evt_taskq); 5846 rd->rd_evt_taskq = NULL; 5847 5848 /* 5849 * Turn off any timeouts. The sync timeout reschedules itself, so we 5850 * have to go to great lengths to kill it. 5851 */ 5852 mutex_enter(&rd->rd_xmit_lock); 5853 tmoid = rd->rd_tmo_id; 5854 rd->rd_tmo_id = 0; 5855 rd->rd_stopq = B_TRUE; 5856 mutex_exit(&rd->rd_xmit_lock); 5857 5858 if (tmoid) 5859 (void) untimeout(tmoid); 5860 5861 tmoid = rd->rd_sync_dqe_tmo_id; 5862 while (tmoid) { 5863 (void) untimeout(tmoid); 5864 /* 5865 * untimeout guarantees the either the function was 5866 * cancelled, or it has completed. If timeout was 5867 * cancelled before the function ran, the timout id will 5868 * not have changed. 5869 */ 5870 if (tmoid == rd->rd_sync_dqe_tmo_id) 5871 rd->rd_sync_dqe_tmo_id = 0; 5872 tmoid = rd->rd_sync_dqe_tmo_id; 5873 } 5874 5875 tmoid = rd->rd_sync_fqe_tmo_id; 5876 while (tmoid) { 5877 (void) untimeout(tmoid); 5878 if (tmoid == rd->rd_sync_fqe_tmo_id) 5879 rd->rd_sync_fqe_tmo_id = 0; 5880 tmoid = rd->rd_sync_fqe_tmo_id; 5881 } 5882 5883 opsrsm_cancel_xmit_tmo(rd); 5884 opsrsm_cancel_fqe_tmo(rd); 5885 opsrsm_cancel_sync_flow_tmo(rd); 5886 opsrsm_wake_senders(rd, POLLOUT); 5887 if (rd->rd_pollhd.ph_list != NULL) 5888 pollhead_clean(&rd->rd_pollhd); 5889 5890 D1("opsrsmfreedest: done"); 5891 5892 /* remove reference added in opsrsmmkdest() */ 5893 UNREFDEST(rd); 5894 return (refcnt); 5895 } 5896 5897 /* 5898 * **************************************************************** 5899 * * 5900 * E N D CONNECTION DATA STRUCTURE MANAGEMENT * 5901 * * 5902 * **************************************************************** 5903 */ 5904 5905 5906 5907 5908 /* 5909 * **************************************************************** 5910 * * 5911 * B E G I N MAIN STATE MACHINE * 5912 * * 5913 * **************************************************************** 5914 */ 5915 5916 5917 /* 5918 * We change a destination's state in a number of routines; we define these 5919 * macros to 
make sure it gets done the same way every time. 5920 */ 5921 #define OPSRSM_SETSTATE(rd, adapter, routine, newstate) \ 5922 rd->rd_state = (ushort_t)newstate; \ 5923 if (OPSRSM_SCHED_STATE(newstate)) { \ 5924 rd->rd_next = adapter->opsrsm_runq; \ 5925 adapter->opsrsm_runq = rd; \ 5926 D1(routine ": added to runq"); \ 5927 if (adapter->opsrsm_taskq) { \ 5928 (void) taskq_dispatch(adapter->opsrsm_taskq,\ 5929 opsrsmwsrv, adapter, KM_NOSLEEP); \ 5930 D1(routine ": enabled 0x%p", \ 5931 (void *)adapter->opsrsm_taskq); \ 5932 } \ 5933 } \ 5934 5935 5936 /* 5937 * This routine processes a notification that a destination has become 5938 * unreachable. Delete our record of it, so that when it comes back up we 5939 * will re-establish our association. We do this by changing its state to 5940 * S_DELETE; the service routine will then start the deletion 5941 * process. 5942 * 5943 * Since other parts of the driver may have operations in progress that 5944 * involve this destination, most of the time we cannot just whack the 5945 * state to the new value. Instead, we record (in rd_estate) that the 5946 * connection was lost. The next time someone else attempts to change the 5947 * state, the state change routines recognize that there is a pending event 5948 * and change the state to the one we wanted instead. (There are 5949 * exceptions in cases where the new state indicates that we've enabled 5950 * some sort of timeout; in this case, we may wait until the following 5951 * state change to take note of the event.) 
5952 */ 5953 static void 5954 opsrsm_lostconn(opsrsm_dest_t *rd) 5955 { 5956 adapter_t *adapter = rd->rd_adapter; 5957 5958 D1("opsrsm_lostconn: rd 0x%p (addr %ld ctlr %d)", (void *)rd, 5959 rd->rd_rsm_addr, adapter->instance); 5960 5961 mutex_enter(&adapter->opsrsm_runq_lock); 5962 if ((rd->rd_state == OPSRSM_STATE_W_READY) || 5963 (rd->rd_state == OPSRSM_STATE_NEW) || 5964 (rd->rd_state == OPSRSM_STATE_W_ACCEPT) || 5965 (rd->rd_state == OPSRSM_STATE_W_ACK) || 5966 (rd->rd_state == OPSRSM_STATE_W_FQE)) { 5967 /* LINTED: E_CONSTANT_CONDITION */ 5968 OPSRSM_SETSTATE(rd, adapter, "opsrsm_lostconn", 5969 OPSRSM_STATE_S_DELETE); 5970 } else { 5971 rd->rd_estate = OPSRSM_STATE_S_DELETE; 5972 } 5973 DERR("opsrsm_lostconn: state now %s, estate now %s", 5974 OPSRSM_STATE_STR(rd->rd_state), OPSRSM_STATE_STR(rd->rd_estate)); 5975 5976 mutex_exit(&adapter->opsrsm_runq_lock); 5977 5978 /* 5979 * Stop trying to flush queue entries to the other side. 5980 */ 5981 rd->rd_stopq = B_TRUE; 5982 D1("opsrsm_lostconn: done"); 5983 } 5984 5985 5986 /* 5987 * Figure out what state transition should actually occur after an event 5988 * has happened. 5989 */ 5990 static int 5991 opsrsmestate_newstate(opsrsm_dest_t *rd, int newstate) 5992 { 5993 int retval = newstate; 5994 5995 /* 5996 * If we're going to a state where we've just set a timeout, don't 5997 * mess with the state. When the timeout happens, it will change 5998 * state again, and we'll nab 'em there. If we're about to delete 5999 * rd, don't bother worrying about the event. 
6000 */ 6001 switch (newstate) { 6002 case OPSRSM_STATE_W_SCONNTMO: 6003 case OPSRSM_STATE_W_ACCEPT: 6004 case OPSRSM_STATE_W_ACK: 6005 case OPSRSM_STATE_W_FQE: 6006 case OPSRSM_STATE_DELETING: 6007 case OPSRSM_STATE_S_DELETE: 6008 return (retval); 6009 default: 6010 break; 6011 } 6012 6013 if (rd->rd_estate) { 6014 retval = rd->rd_estate; 6015 rd->rd_estate = OPSRSM_STATE_NEW; /* clear event state */ 6016 } 6017 6018 D1("opsrsmestate_newstate: %d %d -> %d", rd->rd_estate, 6019 newstate, retval); 6020 6021 return (retval); 6022 } 6023 6024 6025 /* 6026 * Return destination's state, then set its state to INPROGRESS. 6027 */ 6028 static int 6029 opsrsmgetstate( 6030 opsrsm_dest_t *rd) /* Destination pointer */ 6031 { 6032 int state; 6033 6034 D1("opsrsmgetstate: rd 0x%p", (void *)rd); 6035 6036 mutex_enter(&rd->rd_adapter->opsrsm_runq_lock); 6037 6038 state = rd->rd_state; 6039 rd->rd_state = OPSRSM_STATE_INPROGRESS; 6040 6041 mutex_exit(&rd->rd_adapter->opsrsm_runq_lock); 6042 6043 D1("opsrsmgetstate: returning %s", OPSRSM_STATE_STR(state)); 6044 6045 return (state); 6046 } 6047 6048 /* 6049 * Set destination's state; must be preceded by a getstate call. (i.e., 6050 * destination's current state must be INPROGRESS.) 
6051 */ 6052 static void 6053 opsrsmsetstate( 6054 opsrsm_dest_t *rd, /* Destination pointer */ 6055 int newstate) /* State to set */ 6056 { 6057 adapter_t *adapter = rd->rd_adapter; 6058 6059 D1("opsrsmsetstate: rd 0x%p, newstate %s", (void *)rd, 6060 OPSRSM_STATE_STR(newstate)); 6061 6062 mutex_enter(&adapter->opsrsm_runq_lock); 6063 6064 if (rd->rd_state == OPSRSM_STATE_INPROGRESS) { 6065 if (rd->rd_estate) 6066 newstate = opsrsmestate_newstate(rd, newstate); 6067 OPSRSM_SETSTATE(rd, adapter, "opsrsmsetstate", newstate); 6068 } else { 6069 D1("opsrsm: setstate without getstate"); 6070 cmn_err(CE_PANIC, "opsrsm: setstate without getstate"); 6071 } 6072 6073 mutex_exit(&adapter->opsrsm_runq_lock); 6074 6075 D1("opsrsmsetstate: done"); 6076 } 6077 6078 6079 /* 6080 * Set state to newstate iff state is oldstate. Return 1 if move happened, 6081 * else 0. 6082 */ 6083 static int 6084 opsrsmmovestate( 6085 opsrsm_dest_t *rd, /* Destination pointer */ 6086 int oldstate, /* State to check against */ 6087 int newstate) /* State to set if check succeeds */ 6088 { 6089 adapter_t *adapter = rd->rd_adapter; 6090 int retval; 6091 6092 D1("opsrsmmovestate: rd 0x%p, oldstate %s, newstate %s", 6093 (void *)rd, OPSRSM_STATE_STR(oldstate), OPSRSM_STATE_STR(newstate)); 6094 6095 mutex_enter(&adapter->opsrsm_runq_lock); 6096 6097 if (rd->rd_state == oldstate) { 6098 if (rd->rd_estate) 6099 newstate = opsrsmestate_newstate(rd, newstate); 6100 OPSRSM_SETSTATE(rd, adapter, "opsrsmmovestate", newstate); 6101 retval = 1; 6102 D1("opsrsmmovestate: state changed, returning 1"); 6103 } else { 6104 retval = 0; 6105 D1("opsrsmmovestate: oldstate really %s, returning 0", 6106 OPSRSM_STATE_STR(rd->rd_state)); 6107 } 6108 6109 mutex_exit(&adapter->opsrsm_runq_lock); 6110 6111 return (retval); 6112 } 6113 6114 6115 6116 /* 6117 * **************************************************************** 6118 * * 6119 * E N D MAIN STATE MACHINE * 6120 * * 6121 * 
**************************************************************** 6122 */ 6123 6124 6125 6126 /* 6127 * **************************************************************** 6128 * * 6129 * B E G I N HANDLERS FOR INCOMING RSM MESSAGES * 6130 * * 6131 * **************************************************************** 6132 */ 6133 6134 6135 /* 6136 * Handlers for the various messages that may arrive. All of these happen 6137 * during interrupt handling, and will not actually use RSMPI calls. 6138 * Rather, they will schedule actions to happen. 6139 */ 6140 6141 6142 /* 6143 * Received CONNECT REQUEST message. Cause this side to set up 6144 * connection to xfer segment and send back an ACCEPT message. 6145 * 6146 * We must have everything set up before sending the ACCEPT. 6147 * However, we must not transmit any data until we receive the ACK 6148 * of the ACCEPT. 6149 */ 6150 static void 6151 opsrsmmsghdlr_req_connect(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6152 { 6153 adapter_t *adapter = rd->rd_adapter; 6154 boolean_t utmo = B_FALSE; 6155 timeout_id_t tmoid = NULL; 6156 6157 D1("opsrsmmsghdlr_req_connect: rd 0x%p (addr %ld ctlr %d)", 6158 (void *)rd, rd->rd_rsm_addr, adapter->instance); 6159 6160 /* 6161 * xmit lock guarantees that timeout has really been set 6162 * for any wait conditions. 6163 */ 6164 mutex_enter(&rd->rd_xmit_lock); 6165 mutex_enter(&adapter->opsrsm_runq_lock); 6166 6167 if (rd->rd_segid_valid) { 6168 /* 6169 * Another connect message - is it a duplicate? 6170 * If so, just ignore. Otherwise, there is a 6171 * problem, so force a connection teardown. 
6172 */ 6173 6174 mutex_exit(&adapter->opsrsm_runq_lock); 6175 mutex_exit(&rd->rd_xmit_lock); 6176 6177 if ((rd->rd_rxfersegid != msg->p.m.con_request.send_segid) || 6178 (rd->rd_lastconnmsg_seq != msg->p.hdr.seqno)) { 6179 /* Not the same connect request, drop connection */ 6180 opsrsm_lostconn(rd); 6181 } 6182 6183 return; 6184 } 6185 6186 /* remember the message sequence number of this connection request */ 6187 rd->rd_lastconnmsg_seq = msg->p.hdr.seqno; 6188 6189 if (rd->rd_state == OPSRSM_STATE_W_ACCEPT) { 6190 /* 6191 * Crossed connection requests. If we're the higher 6192 * numbered address, cancel the ACCEPT timeout and accept 6193 * the remote request. If we're the lower numbered 6194 * address, ignore this request because the remote side 6195 * will accept ours. If the W_ACCEPT timeout expires prior 6196 * to cancelling the timeout, the timeout function will 6197 * notice the state is no longer W_ACCEPT, and will not 6198 * cause the connection to be torn down. If the timeout 6199 * has already occured (and the rd state is S_DELETE), 6200 * we're out of luck, and will have to wait for a new 6201 * connection request from the remote side. 6202 */ 6203 if (rd->rd_rsm_addr > 6204 adapter->rsmrdt_attr.attr_controller_addr) { 6205 rd->rd_segid_valid = B_TRUE; 6206 rd->rd_rxfersegid = msg->p.m.con_request.send_segid; 6207 /* LINTED: E_CONSTANT_CONDITION */ 6208 OPSRSM_SETSTATE(rd, adapter, 6209 "opsrsmmsghdlr_req_connect", 6210 OPSRSM_STATE_S_CONNXFER_ACCEPT); 6211 utmo = B_TRUE; 6212 tmoid = rd->rd_tmo_id; 6213 rd->rd_tmo_id = 0; 6214 rd->rd_tmo_int = 0; 6215 } 6216 } else { 6217 6218 /* 6219 * Save away the connection information. If possible, 6220 * change the state to cause the request to be immediately 6221 * acted upon. If the state is currently INPROGRESS 6222 * in the early stages of connection (during crexfer 6223 * or the start of sconn), then this request will 6224 * eventually be noticed when sconn() is called. 
The 6225 * sconn() function will notice that the segid is valid, 6226 * and perform the CONNXER_ACCEPT tasks instead. 6227 * 6228 * If this rd's state was in a later stage of the 6229 * connection dance (or after a connection exists), a 6230 * previous connection request should have been received, 6231 * the new connection request will not be expected, and 6232 * this will have been caught by noticing the segid was 6233 * already valid, and cause a failure, above. 6234 */ 6235 6236 rd->rd_segid_valid = B_TRUE; 6237 rd->rd_rxfersegid = msg->p.m.con_request.send_segid; 6238 6239 if (rd->rd_state == OPSRSM_STATE_NEW) { 6240 /* 6241 * No connection was in progress. Start a new 6242 * connection setup process. 6243 */ 6244 /* LINTED: E_CONSTANT_CONDITION */ 6245 OPSRSM_SETSTATE(rd, adapter, 6246 "opsrsmmsghdlr_req_connect", 6247 OPSRSM_STATE_S_NEWCONN); 6248 6249 } else if (rd->rd_state == OPSRSM_STATE_W_SCONNTMO) { 6250 /* 6251 * Accept this request instead of resending our 6252 * connect request. Cancel the timeout. If the 6253 * SCONNTMO timeout function is called prior to 6254 * cancelling the timeout, it will notice the state 6255 * is no longer W_SCONNTMO, and will not cause a 6256 * new connection request to be sent. If the 6257 * timeout already occured (and rd is in the 6258 * S_SCONN state), the sconn() function will notice 6259 * that the segid is valid, and perform the 6260 * CONNXER_ACCEPT tasks instead. 6261 */ 6262 /* LINTED: E_CONSTANT_CONDITION */ 6263 OPSRSM_SETSTATE(rd, adapter, 6264 "opsrsmmsghdlr_req_connect", 6265 OPSRSM_STATE_S_CONNXFER_ACCEPT); 6266 utmo = B_TRUE; 6267 tmoid = rd->rd_tmo_id; 6268 rd->rd_tmo_id = 0; 6269 rd->rd_tmo_int = 0; 6270 } 6271 } 6272 6273 mutex_exit(&adapter->opsrsm_runq_lock); 6274 mutex_exit(&rd->rd_xmit_lock); 6275 6276 if (utmo) 6277 (void) untimeout(tmoid); 6278 } 6279 6280 6281 6282 /* 6283 * Received ACCEPT message. 
Cause this side to set up a connection 6284 * to the remote transfer segment and send back an ACK message. 6285 */ 6286 static void 6287 opsrsmmsghdlr_con_accept(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6288 { 6289 adapter_t *adapter = rd->rd_adapter; 6290 boolean_t utmo = B_FALSE; 6291 timeout_id_t tmoid; 6292 6293 D1("opsrsmmsghdlr_con_accept: rd 0x%p (addr %ld ctlr %d)", 6294 (void *)rd, rd->rd_rsm_addr, adapter->instance); 6295 6296 /* 6297 * xmit lock protects segid field 6298 */ 6299 mutex_enter(&rd->rd_xmit_lock); 6300 mutex_enter(&adapter->opsrsm_runq_lock); 6301 6302 if (rd->rd_state == OPSRSM_STATE_W_ACCEPT && 6303 rd->rd_lxfersegid == msg->p.m.con_accept.rcv_segid) { 6304 rd->rd_segid_valid = B_TRUE; 6305 rd->rd_rxfersegid = msg->p.m.con_accept.send_segid; 6306 utmo = B_TRUE; 6307 tmoid = rd->rd_tmo_id; 6308 rd->rd_tmo_id = 0; 6309 /* LINTED: E_CONSTANT_CONDITION */ 6310 OPSRSM_SETSTATE(rd, adapter, "opsrsmmsghdlr_con_accept", 6311 OPSRSM_STATE_S_CONNXFER_ACK); 6312 mutex_exit(&adapter->opsrsm_runq_lock); 6313 mutex_exit(&rd->rd_xmit_lock); 6314 6315 if (utmo) 6316 (void) untimeout(tmoid); 6317 } else { 6318 mutex_exit(&adapter->opsrsm_runq_lock); 6319 mutex_exit(&rd->rd_xmit_lock); 6320 opsrsm_lostconn(rd); 6321 return; 6322 } 6323 6324 } 6325 6326 6327 /* 6328 * Received ACK message. Now ok to proceed with DLPI data transfer. 
6329 */ 6330 static void 6331 opsrsmmsghdlr_con_ack(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6332 { 6333 adapter_t *adapter = rd->rd_adapter; 6334 boolean_t utmo = B_FALSE; 6335 timeout_id_t tmoid; 6336 6337 D1("opsrsmmsghdlr_con_ack: rd 0x%p (addr %ld ctlr %d)", 6338 (void *)rd, rd->rd_rsm_addr, adapter->instance); 6339 6340 mutex_enter(&adapter->opsrsm_runq_lock); 6341 6342 if (rd->rd_state == OPSRSM_STATE_W_ACK && 6343 msg->p.m.con_ack.rcv_segid == rd->rd_lxfersegid && 6344 msg->p.m.con_ack.send_segid == rd->rd_rxfersegid) { 6345 int isdel = 0; 6346 6347 utmo = B_TRUE; 6348 tmoid = rd->rd_tmo_id; 6349 rd->rd_tmo_id = 0; 6350 /* LINTED: E_CONSTANT_CONDITION */ 6351 /*lint -e778 */ 6352 OPSRSM_SETSTATE(rd, adapter, "opsrsmmsghdlr_con_ack", 6353 OPSRSM_STATE_W_READY); 6354 /*lint +e778 */ 6355 mutex_exit(&adapter->opsrsm_runq_lock); 6356 if (utmo) { 6357 (void) untimeout(tmoid); 6358 } 6359 if (rd->rd_status_tmo_id == 0) { 6360 rd->rd_status_tmo_id = 6361 timeout(opsrsm_status_check_tmo, rd, 1); 6362 } 6363 mutex_enter(&rd->rd_evt_lock); 6364 rd->rd_evt_flags = OPSRSM_EVT_READY; 6365 cv_signal(&rd->rd_evt_cv); 6366 mutex_exit(&rd->rd_evt_lock); 6367 6368 mutex_enter(&rd->rd_xmit_lock); 6369 rd->rd_xmit_state = OPSRSM_XMIT_BARRIER_CLOSED; 6370 cv_broadcast(&rd->rd_conn_cv); 6371 mutex_exit(&rd->rd_xmit_lock); 6372 6373 mutex_enter(&rd->rd_sendq_lock); 6374 if (OPSRSM_Q_LEN(&rd->rd_sendq) > 0) { 6375 REFDEST(rd, isdel); 6376 if (isdel == 0) { 6377 opsrsm_dispatch_tmo((void *)rd); 6378 } 6379 } 6380 mutex_exit(&rd->rd_sendq_lock); 6381 } else { 6382 mutex_exit(&adapter->opsrsm_runq_lock); 6383 opsrsm_lostconn(rd); 6384 return; 6385 } 6386 } 6387 6388 static void 6389 opsrsm_sync_flow_tmo(void *arg) 6390 { 6391 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 6392 6393 mutex_enter(&rd->rd_tmo_lock); 6394 if (rd->rd_flow_tmo_id == 0) { 6395 mutex_exit(&rd->rd_tmo_lock); 6396 return; 6397 } 6398 if (taskq_dispatch(rd->rd_adapter->opsrsm_taskq, 6399 opsrsm_sync_flow_ctl, rd, 
KM_NOSLEEP) == 0) { 6400 rd->rd_flow_tmo_id = timeout(opsrsm_sync_flow_tmo, rd, 0); 6401 } else { 6402 rd->rd_flow_tmo_id = 0; 6403 } 6404 mutex_exit(&rd->rd_tmo_lock); 6405 } 6406 6407 static void 6408 opsrsm_set_sync_flow_tmo(opsrsm_dest_t *rd) 6409 { 6410 int isdel = 0; 6411 6412 mutex_enter(&rd->rd_tmo_lock); 6413 if (rd->rd_flow_tmo_id != 0) { 6414 goto out; 6415 } 6416 REFDEST(rd, isdel); 6417 if (isdel != 0) goto out; 6418 rd->rd_flow_tmo_id = timeout(opsrsm_sync_flow_tmo, rd, 0); 6419 out:; 6420 mutex_exit(&rd->rd_tmo_lock); 6421 } 6422 6423 static void 6424 opsrsm_cancel_sync_flow_tmo(opsrsm_dest_t *rd) 6425 { 6426 timeout_id_t tmoid; 6427 6428 mutex_enter(&rd->rd_tmo_lock); 6429 if (rd->rd_flow_tmo_id == 0) { 6430 mutex_exit(&rd->rd_tmo_lock); 6431 return; 6432 } 6433 UNREFDEST(rd); 6434 tmoid = rd->rd_flow_tmo_id; 6435 rd->rd_flow_tmo_id = 0; 6436 mutex_exit(&rd->rd_tmo_lock); 6437 (void) untimeout(tmoid); 6438 } 6439 6440 static void 6441 opsrsm_sync_flow_ctl(void *arg) 6442 { 6443 opsrsm_dest_t *rd = (opsrsm_dest_t *)arg; 6444 uchar_t srcaddr[64]; 6445 rsm_barrier_t fc_barrier; 6446 opsrsm_flow_ctl_t *fctl; 6447 int err, errcnt = 0; 6448 6449 again:; 6450 fctl = (opsrsm_flow_ctl_t *)&srcaddr[0]; 6451 fctl->fc_stop = rd->rd_remote_flow_stop; 6452 6453 err = RSM_OPEN_BARRIER_REGION(rd->rd_adapter->rsmrdt_ctlr_obj, 6454 rd->rd_rxferhand, &fc_barrier); 6455 ASSERT(err == RSM_SUCCESS); 6456 ASSERT((rd->rd_remote_flow_ctl & OPSRSM_CACHELINE_OFFSET) == 0); 6457 6458 err = RSM_PUT(rd->rd_adapter->rsmrdt_ctlr_obj, rd->rd_rxferhand, 6459 rd->rd_remote_flow_ctl, srcaddr, 64); 6460 ASSERT(err == RSM_SUCCESS); 6461 6462 err = RSM_CLOSE_BARRIER(rd->rd_adapter->rsmrdt_ctlr_obj, &fc_barrier); 6463 if (err != RSM_SUCCESS) { 6464 opsrsmdev->opsrsm_ierrors++; 6465 if ((uint_t)++errcnt <= 6466 opsrsmdev->opsrsm_param.opsrsm_retry_limit) { 6467 delay(2); 6468 goto again; 6469 } else { 6470 cmn_err(CE_CONT, "unable to sync flow control info\n"); 6471 } 6472 } 6473 
UNREFDEST(rd); 6474 } 6475 6476 static void 6477 opsrsm_check_flow_ctl(opsrsm_dest_t *rd) 6478 { 6479 uint_t mem_lo_wat = max(opsrsmdev->opsrsm_param.opsrsm_mem_lo_wat, 6480 opsrsmdev->opsrsm_param.opsrsm_buffer_size * 6481 opsrsmdev->opsrsm_param.opsrsm_buffers); 6482 6483 /* stop sender if pending_bytes gets too large */ 6484 if (opsrsm_pending_bytes > 6485 opsrsmdev->opsrsm_param.opsrsm_recv_hi_wat || 6486 freemem * PAGESIZE < mem_lo_wat) { 6487 if (rd->rd_remote_flow_stop == 0) { 6488 DERR("opsrsm_pending_bytes = %d " 6489 "opsrsm_recv_hi_wat = %d", opsrsm_pending_bytes, 6490 opsrsmdev->opsrsm_param.opsrsm_recv_hi_wat); 6491 rd->rd_remote_flow_stop = 1; 6492 if (rd->rd_state == OPSRSM_STATE_W_READY) { 6493 opsrsm_set_sync_flow_tmo(rd); 6494 } 6495 } 6496 mutex_enter(&opsrsm_flow_tmo_lock); 6497 if (opsrsm_flow_tmo_id == 0) { 6498 opsrsm_flow_tmo_retries = 0; 6499 opsrsm_flow_tmo_id = timeout(opsrsm_flow_tmo, NULL, 6500 (long)opsrsmdev->opsrsm_param.opsrsm_flow_tmo_int); 6501 } 6502 mutex_exit(&opsrsm_flow_tmo_lock); 6503 } 6504 } 6505 6506 /*ARGSUSED*/ 6507 static void 6508 opsrsm_flow_enable(adapter_t *adp, void *arg) 6509 { 6510 int i; 6511 6512 for (i = 0; i < RSM_MAX_DESTADDR; i++) { 6513 opsrsm_dest_t *rd = NULL; 6514 int isdel = 0; 6515 6516 FINDDEST(rd, isdel, i, adp); 6517 if (isdel != 0 || rd == NULL) { 6518 continue; 6519 } 6520 if (rd->rd_state != OPSRSM_STATE_W_READY) { 6521 UNREFDEST(rd); 6522 continue; 6523 } 6524 6525 if (rd->rd_remote_flow_stop == 1) { 6526 rd->rd_remote_flow_stop = 0; 6527 opsrsm_set_sync_flow_tmo(rd); 6528 } 6529 UNREFDEST(rd); 6530 } 6531 } 6532 6533 static void 6534 opsrsm_flow_tmo_cancel(void) 6535 { 6536 timeout_id_t tmoid; 6537 6538 mutex_enter(&opsrsm_flow_tmo_lock); 6539 if (opsrsm_flow_tmo_id == 0) { 6540 mutex_exit(&opsrsm_flow_tmo_lock); 6541 return; 6542 } 6543 tmoid = opsrsm_flow_tmo_id; 6544 opsrsm_flow_tmo_id = 0; 6545 mutex_exit(&opsrsm_flow_tmo_lock); 6546 (void) untimeout(tmoid); 6547 } 6548 6549 /*ARGSUSED*/ 
6550 static void 6551 opsrsm_flow_tmo(void *arg) 6552 { 6553 uint_t mem_hi_wat = max(opsrsmdev->opsrsm_param.opsrsm_mem_hi_wat, 6554 opsrsmdev->opsrsm_param.opsrsm_buffer_size * 6555 opsrsmdev->opsrsm_param.opsrsm_buffers + 6556 opsrsmdev->opsrsm_param.opsrsm_mem_hi_wat - 6557 opsrsmdev->opsrsm_param.opsrsm_mem_lo_wat); 6558 6559 mutex_enter(&opsrsm_flow_tmo_lock); 6560 if (opsrsm_flow_tmo_id == 0) { 6561 mutex_exit(&opsrsm_flow_tmo_lock); 6562 return; 6563 } 6564 opsrsm_flow_tmo_id = 0; 6565 if (opsrsm_pending_bytes >= 6566 opsrsmdev->opsrsm_param.opsrsm_recv_lo_wat || 6567 freemem * PAGESIZE <= mem_hi_wat) { 6568 /* cannot unblock senders yet, rescheduling timeout */ 6569 opsrsm_flow_tmo_id = timeout(opsrsm_flow_tmo, NULL, 6570 (long)opsrsmdev->opsrsm_param.opsrsm_flow_tmo_int); 6571 opsrsm_flow_tmo_retries++; 6572 if ((opsrsm_flow_tmo_retries % 36000) == 0) { 6573 cmn_err(CE_CONT, "remained in flow control " 6574 "condition for %d intervals\n", 6575 opsrsm_flow_tmo_retries); 6576 } 6577 mutex_exit(&opsrsm_flow_tmo_lock); 6578 return; 6579 } 6580 opsrsm_flow_tmo_retries = 0; 6581 mutex_exit(&opsrsm_flow_tmo_lock); 6582 6583 /* unblock senders */ 6584 apply_on_all_adapters(opsrsm_flow_enable, NULL); 6585 } 6586 6587 /* 6588 * Remote side has just sync'ed up the local DQE with its copy, so there 6589 * may be buffers to deliver. 
6590 */ 6591 static void 6592 opsrsmmsghdlr_syncdqe(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6593 { 6594 int bufnum, offset, length; 6595 ushort_t sap; 6596 int freebufs = 0; 6597 uint32_t msg_cnt = 0; 6598 6599 D1("opsrsmmsghdlr_syncdqe: rd 0x%p (addr %ld ctlr %d)", 6600 (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance); 6601 6602 ASSERT(rd->rd_sstate == OPSRSM_RSMS_ALL); 6603 6604 opsrsm_check_flow_ctl(rd); 6605 /* 6606 * message sanity check 6607 */ 6608 if (msg->p.m.syncdqe.rcv_segid != rd->rd_lxfersegid || 6609 msg->p.m.syncdqe.msg_cnt == 0) { 6610 cmn_err(CE_CONT, "opsrsmmsghdlr_syncdqe: bad rcv_segid"); 6611 opsrsm_lostconn(rd); 6612 return; 6613 } 6614 6615 /* Loop through all valid DQE's and process their packets. */ 6616 while (msg_cnt < msg->p.m.syncdqe.msg_cnt && 6617 opsrsmgetdqe(rd, &bufnum, &offset, &length, &sap)) { 6618 /* Don't try to send up DQE with zero length */ 6619 if (length) 6620 freebufs += opsrsmread(rd, bufnum, offset, length, sap); 6621 else { 6622 cmn_err(CE_PANIC, "received corrupted packet\n"); 6623 opsrsmputfqe(rd, bufnum); 6624 freebufs++; 6625 } 6626 6627 if (freebufs == 6628 opsrsmdev->opsrsm_param.opsrsm_fqe_sync_size) { 6629 freebufs = 0; 6630 opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE); 6631 } 6632 msg_cnt++; 6633 } 6634 if (msg_cnt != msg->p.m.syncdqe.msg_cnt) { 6635 cmn_err(CE_CONT, "DQ corruption detected, msg_cnt = %d, " 6636 "dqes read = %d\n", msg->p.m.syncdqe.msg_cnt, msg_cnt); 6637 } 6638 if (freebufs) { 6639 opsrsm_event_add(rd, OPSRSM_EVT_SYNC_FQE); 6640 } 6641 D1("opsrsmmsghdlr_syncdqe: success"); 6642 } 6643 6644 static void 6645 rsmrdtmsghdlr_senderr(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6646 { 6647 opsrsmresource_t *rp; 6648 6649 D1("opsrsmmsghdlr_senderr: rd 0x%p (addr %ld ctlr %d)", 6650 (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance); 6651 6652 rp = opsrsmresource_lookup(msg->p.m.senderr.sender_portnum, 6653 OPSRSM_RO_DEFAULT); 6654 if (rp == NULL) { 6655 return; 6656 } 6657 6658 if 
(msg->p.m.senderr.sender_pkey == rp->rs_pkey) { 6659 rp->rs_state |= msg->p.m.senderr.errstate; 6660 } 6661 } 6662 6663 6664 /* ARGSUSED */ 6665 static void 6666 opsrsmmsghdlr_default(opsrsm_dest_t *rd, opsrsm_msg_t *msg) 6667 { 6668 opsrsmerror(opsrsmdev->opsrsm_dip, "Unknown message type %d", 6669 msg->p.hdr.reqtype); 6670 } 6671 6672 6673 /* 6674 * Handler for connection-related RSMPI messages from remote OPSRSM drivers 6675 */ 6676 /* ARGSUSED */ 6677 rsm_intr_hand_ret_t 6678 opsrsm_rsm_intr_handler(rsm_controller_object_t *controller, 6679 rsm_intr_q_op_t operation, 6680 rsm_addr_t sender, 6681 void *data, 6682 size_t size, 6683 rsm_intr_hand_arg_t handler_arg) 6684 { 6685 adapter_t *adapter = (adapter_t *)handler_arg; 6686 opsrsm_t *opsrsmp = opsrsmdev; 6687 opsrsm_dest_t *rd; 6688 opsrsm_msg_t *msg; 6689 int isdel = 0; 6690 int isnew = 0; 6691 6692 if (adapter == NULL || opsrsmp == NULL) 6693 return (RSM_INTR_HAND_UNCLAIMED); 6694 /* 6695 * We only handle RSM addresses that fit in 48 bits. 6696 */ 6697 ASSERT(sender <= (rsm_addr_t)0xffffffffffffLL); 6698 6699 D1("opsrsm_intr_handle: opsrsmp 0x%p (cltr %d) sender-addr %ld", 6700 (void *)opsrsmp, 6701 adapter ? adapter->instance : -1, sender); 6702 6703 /* Is this our interrupt? */ 6704 mutex_enter(&adapter->mutex); 6705 if (controller->handle != adapter->rsmrdt_ctlr_obj.handle) { 6706 mutex_exit(&adapter->mutex); 6707 D1("opsrsm_intr_handle: bad controller handle"); 6708 return (RSM_INTR_HAND_UNCLAIMED); 6709 } 6710 mutex_exit(&adapter->mutex); 6711 /* 6712 * We don't really care about anything but a received packet 6713 * or a queue destroy 6714 */ 6715 switch (operation) { 6716 6717 case RSM_INTR_Q_OP_CREATE: { 6718 /* 6719 * Create a dest structure, on the assumption that 6720 * somebody's about to communicate with us. 
6721 */ 6722 MAKEDEST(rd, isdel, isnew, sender, adapter); 6723 if (isdel || !rd) { 6724 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6725 } 6726 UNREFDEST(rd); 6727 6728 D1("opsrsm_intr_handle: op-create/config mkdset for addr %ld", 6729 sender); 6730 6731 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6732 } 6733 6734 case RSM_INTR_Q_OP_CONFIGURE: 6735 /* ignore configure messages */ 6736 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6737 6738 case RSM_INTR_Q_OP_DESTROY: { 6739 /* 6740 * The remote side has shut down the connection. We need 6741 * to shut local side of the connection down as well. 6742 */ 6743 FINDDEST(rd, isdel, sender, adapter); 6744 if (isdel || !rd) { 6745 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6746 } 6747 D1("opsrsm_intr_handle: op-destroy for addr %ld", sender); 6748 opsrsm_lostconn(rd); 6749 UNREFDEST(rd); 6750 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6751 } 6752 6753 case RSM_INTR_Q_OP_RECEIVE: 6754 /* 6755 * A DLPI message from the remote node. Handle in the main 6756 * body. 6757 */ 6758 break; 6759 6760 default: 6761 /* ignore */ 6762 return (RSM_INTR_HAND_UNCLAIMED); 6763 } 6764 6765 /* 6766 * Dest should already exist, having been created by the 6767 * RSM_INTR_Q_OP_CREATE, above. 6768 */ 6769 FINDDEST(rd, isdel, sender, adapter); 6770 if (isdel) { 6771 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6772 } else if (rd == NULL) { 6773 D1("opsrsm_rsm_intr_handler: can't finddest"); 6774 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6775 } 6776 6777 6778 msg = (opsrsm_msg_t *)data; 6779 6780 if (msg->p.hdr.opsrsm_version != OPSRSM_VERSION) { 6781 /* 6782 * Non-matching driver version! 6783 * Toss message. 
6784 */ 6785 DINFO("version mismatch: version = %d, expected = %d\n", 6786 msg->p.hdr.opsrsm_version, OPSRSM_VERSION); 6787 UNREFDEST(rd); 6788 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6789 } 6790 6791 switch (msg->p.hdr.reqtype) { 6792 6793 case OPSRSM_MSG_REQ_CONNECT: 6794 opsrsmmsghdlr_req_connect(rd, msg); 6795 break; 6796 6797 case OPSRSM_MSG_CON_ACCEPT: 6798 opsrsmmsghdlr_con_accept(rd, msg); 6799 break; 6800 6801 case OPSRSM_MSG_CON_ACK: 6802 opsrsmmsghdlr_con_ack(rd, msg); 6803 break; 6804 6805 /* 6806 * Maybe scan the incoming queue at this time? 6807 */ 6808 case OPSRSM_MSG_SYNC_DQE: 6809 opsrsmmsghdlr_syncdqe(rd, msg); 6810 break; 6811 6812 case RSMRDT_MSG_SEND_ERR: 6813 rsmrdtmsghdlr_senderr(rd, msg); 6814 break; 6815 6816 case OPSRSM_MSG_RESET: 6817 if (rd->rd_state == OPSRSM_STATE_W_READY) { 6818 (void) opsrsmgetstate(rd); 6819 opsrsmsetstate(rd, OPSRSM_STATE_S_DELETE); 6820 } 6821 break; 6822 case OPSRSM_MSG_FINFO_DEMUX_DONE: 6823 case OPSRSM_MSG_FINFO_REPLY: 6824 case OPSRSM_MSG_FINFO_REXMIT_ACK: 6825 opsrsmmsghdlr_finfo(rd, msg); 6826 break; 6827 default: 6828 opsrsmmsghdlr_default(rd, msg); 6829 break; 6830 } 6831 6832 UNREFDEST(rd); 6833 6834 return (RSM_INTR_HAND_CLAIMED_EXCLUSIVE); 6835 /* 6836 * Supress lint Warning(550) [c:0]: isnew not accessed 6837 */ 6838 } /*lint !e550 */ 6839 6840 /* 6841 * **************************************************************** 6842 * * 6843 * E N D HANDLERS FOR INCOMING RSM MESSAGES * 6844 * * 6845 * **************************************************************** 6846 */ 6847 6848 6849 6850 /* 6851 * **************************************************************** 6852 * * 6853 * B E G I N CONNECTION MANAGEMENT * 6854 * * 6855 * **************************************************************** 6856 */ 6857 6858 /* 6859 * Create and initialize a transfer segment for the remote destination. If 6860 * successful, return 0, else 1. The destination's state must be 6861 * INPROGRESS. 
In remains INPROGRESS during this function. 6862 */ 6863 6864 static int 6865 opsrsmcrexfer(opsrsm_t *opsrsmp, opsrsm_dest_t *rd) 6866 { 6867 volatile opsrsm_xfer_hdr_t *xfer; 6868 opsrsm_fqe_t fqe; 6869 volatile opsrsm_fqe_t *fqep; 6870 opsrsm_dqe_t dqe; 6871 volatile opsrsm_dqe_t *dqep; 6872 opsrsmbuf_t *rbp; 6873 uint_t bufsize; 6874 int i, stat, bufalign = 0; 6875 uint32_t buf_offset, fq_offset, dq_offset; 6876 size_t xfer_size; 6877 caddr_t xfer_start; 6878 size_t roundup; 6879 size_t transport_pgsize = 0; 6880 rsm_access_entry_t perms; 6881 6882 D1("opsrsmcrexfer: rd 0x%p (addr %ld ctlr %d)", 6883 (void *)rd, rd->rd_rsm_addr, rd->rd_adapter->instance); 6884 6885 ASSERT(rd->rd_rawmem_base_addr == NULL); 6886 ASSERT(rd->rd_rawmem_base_size == 0); 6887 6888 bufsize = rd->rd_buffer_size; 6889 6890 transport_pgsize = PAGESIZE; 6891 6892 D1("opsrsmcrexfer: remote adapter id = %d", rd->rd_rem_adapterid); 6893 6894 /* 6895 * Make sure the remote side is responding before setting 6896 * up the local xfer segment. 6897 */ 6898 stat = RSM_SENDQ_CREATE(rd->rd_adapter->rsmrdt_ctlr_obj, 6899 rd->rd_rsm_addr, 6900 (rsm_intr_t)(RSMRDT_INTR_T_BASE + rd->rd_rem_adapterid), 6901 RSM_DLPI_QPRI, RSM_DLPI_QDEPTH, RSM_DLPI_QFLAGS, 6902 RSM_RESOURCE_DONTWAIT, 0, &(rd->rsm_sendq)); 6903 6904 if (stat != RSM_SUCCESS) { 6905 cmn_err(CE_CONT, "sendq create failed, stat 0x%x\n", stat); 6906 return (1); 6907 } 6908 6909 rd->rd_sstate |= OPSRSM_RSMS_RXFER_S; 6910 6911 6912 /* 6913 * Allocate memory for segment. Allow for alignment of DQE list 6914 * and FQE list. Also allow buffers to be aligned on 6915 * RSM-page-sized boundaries. 6916 */ 6917 6918 /* 6919 if (65536 % bufsize == 0) 6920 bufalign = 1; 6921 */ 6922 6923 /* 6924 * Even after round up to transport page alignments, 6925 * to make sure that xfer_size can still accomodate 6926 * all the queue elements, 2 * transport_pgsize has 6927 * been used in the following calculation. 
6928 */ 6929 6930 xfer_size = (size_t)(sizeof (*xfer) + 64 + 6931 (sizeof (opsrsm_dqe_t) * opsrsmp->opsrsm_param.opsrsm_queue_size) 6932 + 64 + 6933 (sizeof (opsrsm_fqe_t) * opsrsmp->opsrsm_param.opsrsm_queue_size) 6934 + 64 + OPSRSM_FLOW_CTL_SZ + 6935 (bufsize * (uint_t)(opsrsmp->opsrsm_param.opsrsm_buffers + 6936 bufalign)) + (2 * (transport_pgsize - 1))); 6937 6938 xfer_start = kmem_alloc(xfer_size, KM_NOSLEEP); 6939 if (!xfer_start) { 6940 D1("opsrsmcrexfer: can't allocate memory, returning 1"); 6941 #ifdef DEBUG 6942 cmn_err(CE_CONT, "?opsrsm: crexfer, failed to alloc"); 6943 #endif /* DEBUG */ 6944 return (1); 6945 } 6946 rd->rd_rawmem_base_addr = xfer_start; 6947 rd->rd_rawmem_base_size = xfer_size; 6948 6949 /* 6950 * Round up memory pointer and round down size to allow alignment 6951 * within the transport's supported page size. 6952 */ 6953 roundup = transport_pgsize - ((uint64_t)xfer_start & 6954 (transport_pgsize - 1)); 6955 if (roundup != transport_pgsize) { 6956 xfer_size -= roundup; 6957 6958 /* Align the xfer_start with transport_pgsize */ 6959 xfer_start += roundup; 6960 } 6961 6962 /* Align the xfer_size with the transport_pgsize */ 6963 xfer_size = xfer_size & ~(transport_pgsize - 1); 6964 6965 rd->rd_memory.ms_type = RSM_MEM_VADDR; 6966 rd->rd_memory.ms_memory.vr.length = xfer_size; 6967 rd->rd_memory.ms_memory.vr.as = NULL; /* kas */ 6968 rd->rd_memory.ms_memory.vr.vaddr = xfer_start; 6969 6970 D1("opsrsmcrexfer: rawsize 0x%lx rawmem 0x%p xfersize 0x%lx " 6971 "xfermem 0x%p pgsize 0x%lx\n", 6972 rd->rd_rawmem_base_size, 6973 (void *)rd->rd_rawmem_base_addr, 6974 xfer_size, 6975 (void *)xfer_start, 6976 transport_pgsize); 6977 6978 xfer = (volatile struct opsrsm_xfer_hdr *)xfer_start; 6979 6980 /* Force FQ to start on a 64-byte boundary. */ 6981 fq_offset = sizeof (struct opsrsm_xfer_hdr); 6982 fq_offset = OPSRSM_CACHELINE_ROUNDUP(fq_offset); 6983 6984 /* Force DQ to start on a 64-byte boundary. 
*/ 6985 dq_offset = fq_offset + (sizeof (opsrsm_fqe_t) * 6986 opsrsmp->opsrsm_param.opsrsm_queue_size + OPSRSM_FLOW_CTL_SZ); 6987 dq_offset = OPSRSM_CACHELINE_ROUNDUP(dq_offset); 6988 6989 /* Force buffers to start on a 64-byte boundary. */ 6990 buf_offset = dq_offset + (sizeof (opsrsm_dqe_t) * 6991 opsrsmp->opsrsm_param.opsrsm_queue_size); 6992 buf_offset = OPSRSM_CACHELINE_ROUNDUP(buf_offset); 6993 6994 if (bufalign == 1 && (buf_offset & (bufsize - 1)) != 0) { 6995 buf_offset += bufsize - (buf_offset & (bufsize - 1)); 6996 } 6997 /* 6998 * Note that while we set the _f and _n queue pointers and the 6999 * queue lengths here, the _l pointers will be set (and the lengths 7000 * may be adjusted) when we connect to the remote xfer segment (see 7001 * connxfer). 7002 */ 7003 mutex_enter(&rd->rd_net_lock); 7004 7005 rd->rd_fqr_f = rd->rd_fqr_n = (volatile opsrsm_fqe_t *) (xfer_start + 7006 fq_offset); 7007 rd->rd_fqr_seq = 1; 7008 rd->rd_num_fqrs = opsrsmp->opsrsm_param.opsrsm_queue_size; 7009 7010 /* 7011 * flow control structure is located at 128 bytes before the 7012 * start of the DQ 7013 */ 7014 rd->rd_flow_ctl = (volatile opsrsm_flow_ctl_t *)(xfer_start + 7015 dq_offset - OPSRSM_FLOW_CTL_SZ); 7016 rd->rd_flow_ctl->fc_stop = 0; 7017 7018 rd->rd_dqr_f = rd->rd_dqr_n = (volatile opsrsm_dqe_t *) (xfer_start + 7019 dq_offset); 7020 rd->rd_dqr_seq = 1; 7021 rd->rd_num_dqrs = opsrsmp->opsrsm_param.opsrsm_queue_size; 7022 7023 rd->rd_lbuf = xfer_start + buf_offset; 7024 rd->rd_lbuflen = bufsize; 7025 rd->rd_numlbufs = opsrsmp->opsrsm_param.opsrsm_buffers; 7026 7027 /* 7028 * Initialize the delivery and free queues: elements in the free 7029 * queue are valid, and elements in the delivery queue are invalid 7030 * (seqno == 0). 
7031 */ 7032 fqep = rd->rd_fqr_f; 7033 dqep = rd->rd_dqr_f; 7034 7035 dqe.s.dq_seqnum = 0; 7036 dqe.s.dq_bufnum = (ushort_t)~0; 7037 7038 fqe.s.fq_seqnum = 1; 7039 7040 for (i = 0; i < opsrsmp->opsrsm_param.opsrsm_queue_size; i++) { 7041 fqe.s.fq_bufnum = (ushort_t)i; 7042 7043 *fqep++ = fqe; 7044 *dqep++ = dqe; 7045 } 7046 7047 mutex_exit(&rd->rd_net_lock); 7048 7049 /* allocate space for queued fqes */ 7050 mutex_enter(&rd->rd_fqr_lock); 7051 rd->rd_queued_fqe_array = (opsrsm_queued_fqe_t *)kmem_zalloc(opsrsmp-> 7052 opsrsm_param.opsrsm_queue_size * sizeof (opsrsm_queued_fqe_t), 7053 KM_NOSLEEP); 7054 7055 if (rd->rd_queued_fqe_array == NULL) { 7056 DINFO("opsrsmcrexfer: cannot alloc queued_fqe_array\n"); 7057 mutex_exit(&rd->rd_fqr_lock); 7058 return (1); 7059 } 7060 /* construct freelist */ 7061 for (i = 0; i < opsrsmp->opsrsm_param.opsrsm_queue_size; i++) { 7062 opsrsm_queued_fqe_t *qp; 7063 7064 qp = &rd->rd_queued_fqe_array[i]; 7065 /* enqueue element onto freelist */ 7066 opsrsm_queued_fqe_free(rd, qp); 7067 } 7068 mutex_exit(&rd->rd_fqr_lock); 7069 7070 /* 7071 * Allocate and init our structures to describe loaned-up buffers. 7072 */ 7073 rbp = rd->rd_bufbase = (opsrsmbuf_t *)kmem_zalloc(opsrsmp-> 7074 opsrsm_param.opsrsm_buffers * sizeof (*rd->rd_bufbase), KM_NOSLEEP); 7075 7076 if (rbp == NULL) { 7077 D1("opsrsmcrexfer: can't alloc rbp structs, returning 1"); 7078 #ifdef DEBUG 7079 cmn_err(CE_CONT, "?opsrsm: abuf"); 7080 #endif /* DEBUG */ 7081 return (1); 7082 } 7083 7084 for (i = 0; i < rd->rd_numlbufs; i++) { 7085 rbp->rb_rd = rd; 7086 rbp->rb_frtn.free_func = opsrsmfreebuf; 7087 rbp->rb_frtn.free_arg = (char *)rbp; 7088 rbp->rb_bufnum = i; 7089 rbp++; 7090 } 7091 7092 mutex_init(&rd->rd_nlb_lock, NULL, MUTEX_DRIVER, NULL); 7093 mutex_enter(&rd->rd_nlb_lock); 7094 rd->rd_nlb = 0; 7095 mutex_exit(&rd->rd_nlb_lock); 7096 7097 /* 7098 * Set everything in the header of the segment. 
7099 */ 7100 7101 xfer->rx_segsize = xfer_size; 7102 xfer->rx_buf_offset = buf_offset; 7103 xfer->rx_fq_offset = fq_offset; 7104 xfer->rx_dq_offset = dq_offset; 7105 xfer->rx_numbufs = rd->rd_numlbufs; 7106 xfer->rx_bufsize = rd->rd_lbuflen; 7107 xfer->rx_numfqes = rd->rd_num_fqrs; 7108 xfer->rx_numdqes = rd->rd_num_dqrs; 7109 xfer->rx_skey = rd->rd_local_skey; 7110 D1("opsrsmcrexfer: rx_buf_offset 0x%x fq_offset 0x%x dq_offset 0x%x " 7111 "rd_numlbufs 0x%x rd_lbuflen 0x%x rd_num_fqrs 0x%x " 7112 "rd_num_dqrs 0x%x\n", 7113 buf_offset, 7114 fq_offset, 7115 dq_offset, 7116 rd->rd_numlbufs, 7117 rd->rd_lbuflen, 7118 rd->rd_num_fqrs, 7119 rd->rd_num_dqrs); 7120 7121 xfer->rx_cookie = OPSRSM_XFER_COOKIE; 7122 7123 /* 7124 * Local xfer segment is now initialized; make it available to the 7125 * remote node. 7126 */ 7127 7128 stat = RSM_SEG_CREATE(rd->rd_adapter->rsmrdt_ctlr_obj, 7129 &(rd->rd_lxferhand), 7130 xfer_size, 0, &(rd->rd_memory), RSM_RESOURCE_DONTWAIT, 0); 7131 7132 if (stat != RSM_SUCCESS) { 7133 D1("opsrsmcrexfer: can't create RSM segment, stat 0x%x, " 7134 "return 1", stat); 7135 #ifdef DEBUG 7136 cmn_err(CE_CONT, "?opsrsm: crexfer, stat 0x%x", stat); 7137 #endif /* DEBUG */ 7138 return (1); 7139 } 7140 rd->rd_sstate |= OPSRSM_RSMS_LXFER_C; 7141 7142 7143 /* 7144 * Publish this segment. First try using an id that is likely 7145 * to be unique. 
7146 */ 7147 perms.ae_addr = rd->rd_rsm_addr; 7148 perms.ae_permission = RSM_PERM_RDWR; 7149 stat = RSMERR_SEGID_IN_USE; 7150 if (rd->rd_rsm_addr <= 7151 (RSMRDT_SEGID_END - RSMRDT_SEGID_BASE)) { 7152 rd->rd_lxfersegid = RSMRDT_SEGID_BASE + 7153 (uint32_t)rd->rd_rsm_addr; 7154 stat = (RSM_PUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj, 7155 rd->rd_lxferhand, 7156 &perms, 1, rd->rd_lxfersegid, NULL, 0)); 7157 } 7158 if (stat == RSMERR_SEGID_IN_USE) { 7159 /* Couldn't use default id; try other ids in allowed range */ 7160 rd->rd_lxfersegid = RSMRDT_SEGID_BASE; 7161 while ((stat = (RSM_PUBLISH(rd->rd_adapter->rsmrdt_ctlr_obj, 7162 rd->rd_lxferhand, 7163 &perms, 1, rd->rd_lxfersegid, NULL, 0))) == 7164 RSMERR_SEGID_IN_USE && rd->rd_lxfersegid < 7165 RSMRDT_SEGID_END) 7166 rd->rd_lxfersegid++; 7167 } 7168 7169 if (stat != RSM_SUCCESS) { 7170 D1("opsrsmcrexfer: can't publish, stat 0x%x, returning 1", 7171 stat); 7172 #ifdef DEBUG 7173 cmn_err(CE_CONT, "?opsrsm: expxfer, stat 0x%x", stat); 7174 #endif /* DEBUG */ 7175 return (1); 7176 } 7177 rd->rd_sstate |= OPSRSM_RSMS_LXFER_P; 7178 7179 D1("opsrsmcrexfer: returning 0"); 7180 return (0); 7181 } 7182 7183 /* 7184 * Send a connect request to the remote. 7185 * 7186 * If we've received a Connect message from the destination, connect to the 7187 * remote transfer segment. Otherwise, send them a Connect Request 7188 * message. On success, return 0. If the connect fails return 1. A 7189 * failure in sending a Connect Request message will result in a retry 7190 * timeout being scheduled, but will not return 1 unless the total timeout 7191 * period has expired. Destination's state must be INPROGRESS when called. 7192 * Destination's state is set to a new state prior to returning. 
7193 */ 7194 static int 7195 opsrsmsconn( 7196 opsrsm_t *opsrsmp, /* OPSRSM device (RSM controller) pointer */ 7197 opsrsm_dest_t *rd, /* Destination pointer */ 7198 int fromtmo) /* 0 if this is our first attempt; nonzero if this */ 7199 /* is a retry, requested by a timeout routine. */ 7200 { 7201 int stat, seq; 7202 7203 D1("opsrsmsconn: rd 0x%p (addr %ld ctlr %d)", 7204 (void *)rd, rd->rd_rs