Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/errno.h>
     29 #include <sys/debug.h>
     30 #include <sys/time.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/systm.h>
     33 #include <sys/user.h>
     34 #include <sys/stropts.h>
     35 #include <sys/stream.h>
     36 #include <sys/strlog.h>
     37 #include <sys/strsubr.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/cpu.h>
     40 #include <sys/kmem.h>
     41 #include <sys/conf.h>
     42 #include <sys/ddi.h>
     43 #include <sys/sunddi.h>
     44 #include <sys/ksynch.h>
     45 #include <sys/stat.h>
     46 #include <sys/kstat.h>
     47 #include <sys/vtrace.h>
     48 #include <sys/strsun.h>
     49 #include <sys/dlpi.h>
     50 #include <sys/ethernet.h>
     51 #include <net/if.h>
     52 #include <sys/varargs.h>
     53 #include <sys/machsystm.h>
     54 #include <sys/modctl.h>
     55 #include <sys/modhash.h>
     56 #include <sys/mac.h>
     57 #include <sys/mac_ether.h>
     58 #include <sys/taskq.h>
     59 #include <sys/note.h>
     60 #include <sys/mach_descrip.h>
     61 #include <sys/mdeg.h>
     62 #include <sys/ldc.h>
     63 #include <sys/vsw_fdb.h>
     64 #include <sys/vsw.h>
     65 #include <sys/vio_mailbox.h>
     66 #include <sys/vnet_mailbox.h>
     67 #include <sys/vnet_common.h>
     68 #include <sys/vio_util.h>
     69 #include <sys/sdt.h>
     70 #include <sys/atomic.h>
     71 #include <sys/callb.h>
     72 #include <sys/vlan.h>
     73 
     74 /* Port add/deletion/etc routines */
     75 static	void vsw_port_delete(vsw_port_t *port);
     76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
     77 static	void vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
     78 static	int vsw_init_ldcs(vsw_port_t *port);
     79 static	void vsw_uninit_ldcs(vsw_port_t *port);
     80 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
     81 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
     82 static	void vsw_drain_ldcs(vsw_port_t *port);
     83 static	void vsw_drain_port_taskq(vsw_port_t *port);
     84 static	void vsw_marker_task(void *);
     85 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
     86 void vsw_detach_ports(vsw_t *vswp);
     87 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
     88 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
     89 int vsw_port_detach(vsw_t *vswp, int p_instance);
     90 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
     91 int vsw_port_attach(vsw_port_t *portp);
     92 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
     93 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
     94 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
     95 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
     96 void vsw_reset_ports(vsw_t *vswp);
     97 void vsw_port_reset(vsw_port_t *portp);
     98 void vsw_physlink_update_ports(vsw_t *vswp);
     99 static	void vsw_port_physlink_update(vsw_port_t *portp);
    100 
    101 /* Interrupt routines */
    102 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
    103 
    104 /* Handshake routines */
    105 static	void vsw_ldc_reinit(vsw_ldc_t *);
    106 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
    107 static	void vsw_conn_task(void *);
    108 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
    109 static	void vsw_next_milestone(vsw_ldc_t *);
    110 static	int vsw_supported_version(vio_ver_msg_t *);
    111 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
    112 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
    113 
    114 /* Data processing routines */
    115 static void vsw_process_pkt(void *);
    116 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
    117 static void vsw_process_ctrl_pkt(void *);
    118 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
    119 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
    120 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
    121 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
    122 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
    123 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
    124 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
    125 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
    126 	uint32_t);
    127 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
    128 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
    129 static void vsw_process_pkt_data(void *, void *, uint32_t);
    130 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
    131 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
    132 
    133 /* Switching/data transmit routines */
    134 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
    135 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
    136 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
    137 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
    138 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
    139 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
    140 
    141 /* Packet creation routines */
    142 static void vsw_send_ver(void *);
    143 static void vsw_send_attr(vsw_ldc_t *);
    144 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
    145 static void vsw_send_dring_info(vsw_ldc_t *);
    146 static void vsw_send_rdx(vsw_ldc_t *);
    147 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
    148 
    149 /* Dring routines */
    150 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
    151 static void vsw_create_privring(vsw_ldc_t *);
    152 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
    153 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    154     int *);
    155 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
    156 static int vsw_reclaim_dring(dring_info_t *dp, int start);
    157 
    158 static void vsw_set_lane_attr(vsw_t *, lane_t *);
    159 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
    160 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
    161 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
    162 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
    163 
    164 /* Rcv/Tx thread routines */
    165 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
    166 static void vsw_ldc_tx_worker(void *arg);
    167 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
    168 static void vsw_ldc_rx_worker(void *arg);
    169 
    170 /* Misc support routines */
    171 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
    172 static void vsw_free_ring(dring_info_t *);
    173 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
    174 static int vsw_get_same_dest_list(struct ether_header *ehp,
    175     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
    176 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
    177 
    178 /* Debugging routines */
    179 static void dump_flags(uint64_t);
    180 static void display_state(void);
    181 static void display_lane(lane_t *);
    182 static void display_ring(dring_info_t *);
    183 
    184 /*
    185  * Functions imported from other files.
    186  */
    187 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
    188 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
    189 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
    190 extern void vsw_del_mcst_port(vsw_port_t *port);
    191 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
    192 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
    193 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
    194 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
    195 extern void vsw_create_vlans(void *arg, int type);
    196 extern void vsw_destroy_vlans(void *arg, int type);
    197 extern void vsw_vlan_add_ids(void *arg, int type);
    198 extern void vsw_vlan_remove_ids(void *arg, int type);
    199 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
    200 	struct ether_header *ehp, uint16_t *vidp);
    201 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
    202 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
    203 	mblk_t **npt);
    204 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
    205 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
    206 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
    207 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
    208 extern void vsw_hio_stop_port(vsw_port_t *portp);
    209 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
    210 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
    211 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
    212 extern void vsw_destroy_rxpools(void *arg);
    213 
    214 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
    215 
    216 /*
    217  * Tunables used in this file.
    218  */
    219 extern int vsw_num_handshakes;
    220 extern int vsw_wretries;
    221 extern int vsw_desc_delay;
    222 extern int vsw_read_attempts;
    223 extern int vsw_ldc_tx_delay;
    224 extern int vsw_ldc_tx_retries;
    225 extern int vsw_ldc_retries;
    226 extern int vsw_ldc_delay;
    227 extern boolean_t vsw_ldc_rxthr_enabled;
    228 extern boolean_t vsw_ldc_txthr_enabled;
    229 extern uint32_t vsw_ntxds;
    230 extern uint32_t vsw_max_tx_qcount;
    231 extern uint32_t vsw_chain_len;
    232 extern uint32_t vsw_mblk_size1;
    233 extern uint32_t vsw_mblk_size2;
    234 extern uint32_t vsw_mblk_size3;
    235 extern uint32_t vsw_mblk_size4;
    236 extern uint32_t vsw_num_mblks1;
    237 extern uint32_t vsw_num_mblks2;
    238 extern uint32_t vsw_num_mblks3;
    239 extern uint32_t vsw_num_mblks4;
    240 extern boolean_t vsw_obp_ver_proto_workaround;
    241 extern uint32_t vsw_publish_macaddr_count;
    242 extern boolean_t vsw_jumbo_rxpools;
    243 
    244 #define	LDC_ENTER_LOCK(ldcp)	\
    245 				mutex_enter(&((ldcp)->ldc_cblock));\
    246 				mutex_enter(&((ldcp)->ldc_rxlock));\
    247 				mutex_enter(&((ldcp)->ldc_txlock));
    248 #define	LDC_EXIT_LOCK(ldcp)	\
    249 				mutex_exit(&((ldcp)->ldc_txlock));\
    250 				mutex_exit(&((ldcp)->ldc_rxlock));\
    251 				mutex_exit(&((ldcp)->ldc_cblock));
    252 
    253 #define	VSW_VER_EQ(ldcp, major, minor)	\
    254 	((ldcp)->lane_out.ver_major == (major) &&	\
    255 	    (ldcp)->lane_out.ver_minor == (minor))
    256 
    257 #define	VSW_VER_LT(ldcp, major, minor)	\
    258 	(((ldcp)->lane_out.ver_major < (major)) ||	\
    259 	    ((ldcp)->lane_out.ver_major == (major) &&	\
    260 	    (ldcp)->lane_out.ver_minor < (minor)))
    261 
    262 #define	VSW_VER_GTEQ(ldcp, major, minor)	\
    263 	(((ldcp)->lane_out.ver_major > (major)) ||	\
    264 	    ((ldcp)->lane_out.ver_major == (major) &&	\
    265 	    (ldcp)->lane_out.ver_minor >= (minor)))
    266 
    267 /*
    268  * VIO Protocol Version Info:
    269  *
    270  * The version specified below represents the version of protocol currently
    271  * supported in the driver. It means the driver can negotiate with peers with
    272  * versions <= this version. Here is a summary of the feature(s) that are
    273  * supported at each version of the protocol:
    274  *
    275  * 1.0			Basic VIO protocol.
    276  * 1.1			vDisk protocol update (no virtual network update).
    277  * 1.2			Support for priority frames (priority-ether-types).
    278  * 1.3			VLAN and HybridIO support.
    279  * 1.4			Jumbo Frame support.
    280  * 1.5			Link State Notification support with optional support
    281  * 			for Physical Link information.
    282  */
    283 static	ver_sup_t	vsw_versions[] = { {1, 5} };
    284 
    285 /*
    286  * For the moment the state dump routines have their own
    287  * private flag.
    288  */
    289 #define	DUMP_STATE	0
    290 
    291 #if DUMP_STATE
    292 
    293 #define	DUMP_TAG(tag) \
    294 {			\
    295 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
    296 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
    297 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
    298 }
    299 
    300 #define	DUMP_TAG_PTR(tag) \
    301 {			\
    302 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
    303 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
    304 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
    305 }
    306 
    307 #define	DUMP_FLAGS(flags) dump_flags(flags);
    308 #define	DISPLAY_STATE()	display_state()
    309 
    310 #else
    311 
    312 #define	DUMP_TAG(tag)
    313 #define	DUMP_TAG_PTR(tag)
    314 #define	DUMP_FLAGS(state)
    315 #define	DISPLAY_STATE()
    316 
    317 #endif	/* DUMP_STATE */
    318 
    319 /*
    320  * Attach the specified port.
    321  *
    322  * Returns 0 on success, 1 on failure.
    323  */
    324 int
    325 vsw_port_attach(vsw_port_t *port)
    326 {
    327 	vsw_t			*vswp = port->p_vswp;
    328 	vsw_port_list_t		*plist = &vswp->plist;
    329 	vsw_port_t		*p, **pp;
    330 	int			i;
    331 	int			nids = port->num_ldcs;
    332 	uint64_t		*ldcids;
    333 	int			rv;
    334 
    335 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
    336 
    337 	/* port already exists? */
    338 	READ_ENTER(&plist->lockrw);
    339 	for (p = plist->head; p != NULL; p = p->p_next) {
    340 		if (p->p_instance == port->p_instance) {
    341 			DWARN(vswp, "%s: port instance %d already attached",
    342 			    __func__, p->p_instance);
    343 			RW_EXIT(&plist->lockrw);
    344 			return (1);
    345 		}
    346 	}
    347 	RW_EXIT(&plist->lockrw);
    348 
    349 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
    350 
    351 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
    352 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
    353 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
    354 
    355 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
    356 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
    357 	port->state = VSW_PORT_INIT;
    358 
    359 	D2(vswp, "%s: %d nids", __func__, nids);
    360 	ldcids = port->ldc_ids;
    361 	for (i = 0; i < nids; i++) {
    362 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
    363 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
    364 			DERR(vswp, "%s: ldc_attach failed", __func__);
    365 			goto exit_error;
    366 		}
    367 	}
    368 
    369 	if (vswp->switching_setup_done == B_TRUE) {
    370 		/*
    371 		 * If the underlying network device has been setup,
    372 		 * then open a mac client and porgram the mac address
    373 		 * for this port.
    374 		 */
    375 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
    376 		if (rv != 0) {
    377 			goto exit_error;
    378 		}
    379 	}
    380 
    381 	/* create the fdb entry for this port/mac address */
    382 	vsw_fdbe_add(vswp, port);
    383 
    384 	vsw_create_vlans(port, VSW_VNETPORT);
    385 
    386 	WRITE_ENTER(&plist->lockrw);
    387 
    388 	/* link it into the list of ports for this vsw instance */
    389 	pp = (vsw_port_t **)(&plist->head);
    390 	port->p_next = *pp;
    391 	*pp = port;
    392 	plist->num_ports++;
    393 
    394 	RW_EXIT(&plist->lockrw);
    395 
    396 	/*
    397 	 * Initialise the port and any ldc's under it.
    398 	 */
    399 	(void) vsw_init_ldcs(port);
    400 
    401 	/* announce macaddr of vnet to the physical switch */
    402 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
    403 		vsw_publish_macaddr(vswp, port);
    404 	}
    405 
    406 	D1(vswp, "%s: exit", __func__);
    407 	return (0);
    408 
    409 exit_error:
    410 	rw_destroy(&port->p_ldclist.lockrw);
    411 
    412 	cv_destroy(&port->state_cv);
    413 	mutex_destroy(&port->state_lock);
    414 
    415 	rw_destroy(&port->maccl_rwlock);
    416 	mutex_destroy(&port->tx_lock);
    417 	mutex_destroy(&port->mca_lock);
    418 	kmem_free(port, sizeof (vsw_port_t));
    419 	return (1);
    420 }
    421 
    422 /*
    423  * Detach the specified port.
    424  *
    425  * Returns 0 on success, 1 on failure.
    426  */
    427 int
    428 vsw_port_detach(vsw_t *vswp, int p_instance)
    429 {
    430 	vsw_port_t	*port = NULL;
    431 	vsw_port_list_t	*plist = &vswp->plist;
    432 
    433 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
    434 
    435 	WRITE_ENTER(&plist->lockrw);
    436 
    437 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
    438 		RW_EXIT(&plist->lockrw);
    439 		return (1);
    440 	}
    441 
    442 	if (vsw_plist_del_node(vswp, port)) {
    443 		RW_EXIT(&plist->lockrw);
    444 		return (1);
    445 	}
    446 
    447 	/* cleanup any HybridIO for this port */
    448 	vsw_hio_stop_port(port);
    449 
    450 	/*
    451 	 * No longer need to hold writer lock on port list now
    452 	 * that we have unlinked the target port from the list.
    453 	 */
    454 	RW_EXIT(&plist->lockrw);
    455 
    456 	/* Cleanup and close the mac client */
    457 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
    458 
    459 	/* Remove the fdb entry for this port/mac address */
    460 	vsw_fdbe_del(vswp, &(port->p_macaddr));
    461 	vsw_destroy_vlans(port, VSW_VNETPORT);
    462 
    463 	/* Remove any multicast addresses.. */
    464 	vsw_del_mcst_port(port);
    465 
    466 	vsw_port_delete(port);
    467 
    468 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
    469 	return (0);
    470 }
    471 
    472 /*
    473  * Detach all active ports.
    474  */
    475 void
    476 vsw_detach_ports(vsw_t *vswp)
    477 {
    478 	vsw_port_list_t 	*plist = &vswp->plist;
    479 	vsw_port_t		*port = NULL;
    480 
    481 	D1(vswp, "%s: enter", __func__);
    482 
    483 	WRITE_ENTER(&plist->lockrw);
    484 
    485 	while ((port = plist->head) != NULL) {
    486 		(void) vsw_plist_del_node(vswp, port);
    487 
    488 		/* cleanup any HybridIO for this port */
    489 		vsw_hio_stop_port(port);
    490 
    491 		/* Cleanup and close the mac client */
    492 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
    493 
    494 		/* Remove the fdb entry for this port/mac address */
    495 		vsw_fdbe_del(vswp, &(port->p_macaddr));
    496 		vsw_destroy_vlans(port, VSW_VNETPORT);
    497 
    498 		/* Remove any multicast addresses.. */
    499 		vsw_del_mcst_port(port);
    500 
    501 		/*
    502 		 * No longer need to hold the lock on the port list
    503 		 * now that we have unlinked the target port from the
    504 		 * list.
    505 		 */
    506 		RW_EXIT(&plist->lockrw);
    507 		vsw_port_delete(port);
    508 		WRITE_ENTER(&plist->lockrw);
    509 	}
    510 	RW_EXIT(&plist->lockrw);
    511 
    512 	D1(vswp, "%s: exit", __func__);
    513 }
    514 
    515 /*
    516  * Delete the specified port.
    517  */
    518 static void
    519 vsw_port_delete(vsw_port_t *port)
    520 {
    521 	vsw_ldc_list_t 		*ldcl;
    522 	vsw_t			*vswp = port->p_vswp;
    523 	int			num_ldcs;
    524 
    525 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
    526 
    527 	vsw_uninit_ldcs(port);
    528 
    529 	/*
    530 	 * Wait for any pending ctrl msg tasks which reference this
    531 	 * port to finish.
    532 	 */
    533 	vsw_drain_port_taskq(port);
    534 
    535 	/*
    536 	 * Wait for any active callbacks to finish
    537 	 */
    538 	vsw_drain_ldcs(port);
    539 
    540 	ldcl = &port->p_ldclist;
    541 	num_ldcs = port->num_ldcs;
    542 	WRITE_ENTER(&ldcl->lockrw);
    543 	while (num_ldcs > 0) {
    544 		vsw_ldc_detach(port, ldcl->head->ldc_id);
    545 		num_ldcs--;
    546 	}
    547 	RW_EXIT(&ldcl->lockrw);
    548 
    549 	rw_destroy(&port->p_ldclist.lockrw);
    550 
    551 	rw_destroy(&port->maccl_rwlock);
    552 	mutex_destroy(&port->mca_lock);
    553 	mutex_destroy(&port->tx_lock);
    554 
    555 	cv_destroy(&port->state_cv);
    556 	mutex_destroy(&port->state_lock);
    557 
    558 	if (port->num_ldcs != 0) {
    559 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
    560 		port->num_ldcs = 0;
    561 	}
    562 
    563 	if (port->nvids != 0) {
    564 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
    565 	}
    566 
    567 	kmem_free(port, sizeof (vsw_port_t));
    568 
    569 	D1(vswp, "%s: exit", __func__);
    570 }
    571 
    572 static int
    573 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
    574 {
    575 	size_t		data_sz;
    576 	int		rv;
    577 	uint32_t	sz1 = 0;
    578 	uint32_t	sz2 = 0;
    579 	uint32_t	sz3 = 0;
    580 	uint32_t	sz4 = 0;
    581 
    582 	/*
    583 	 * We round up the mtu specified to be a multiple of 2K to limit the
    584 	 * number of rx buffer pools created for a given mtu.
    585 	 */
    586 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
    587 	data_sz = VNET_ROUNDUP_2K(data_sz);
    588 
    589 	/*
    590 	 * If pool sizes are specified, use them. Note that the presence of
    591 	 * the first tunable will be used as a hint.
    592 	 */
    593 	if (vsw_mblk_size1 != 0) {
    594 		sz1 = vsw_mblk_size1;
    595 		sz2 = vsw_mblk_size2;
    596 		sz3 = vsw_mblk_size3;
    597 		sz4 = vsw_mblk_size4;
    598 
    599 		if (sz4 == 0) { /* need 3 pools */
    600 
    601 			ldcp->max_rxpool_size = sz3;
    602 			rv = vio_init_multipools(&ldcp->vmp,
    603 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
    604 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
    605 
    606 		} else {
    607 
    608 			ldcp->max_rxpool_size = sz4;
    609 			rv = vio_init_multipools(&ldcp->vmp,
    610 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
    611 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
    612 			    vsw_num_mblks4);
    613 
    614 		}
    615 
    616 		return (rv);
    617 	}
    618 
    619 	/*
    620 	 * Pool sizes are not specified. We select the pool sizes based on the
    621 	 * mtu if vnet_jumbo_rxpools is enabled.
    622 	 */
    623 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
    624 		/*
    625 		 * Receive buffer pool allocation based on mtu is disabled.
    626 		 * Use the default mechanism of standard size pool allocation.
    627 		 */
    628 		sz1 = VSW_MBLK_SZ_128;
    629 		sz2 = VSW_MBLK_SZ_256;
    630 		sz3 = VSW_MBLK_SZ_2048;
    631 		ldcp->max_rxpool_size = sz3;
    632 
    633 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
    634 		    sz1, sz2, sz3,
    635 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
    636 
    637 		return (rv);
    638 	}
    639 
    640 	switch (data_sz) {
    641 
    642 	case VNET_4K:
    643 
    644 		sz1 = VSW_MBLK_SZ_128;
    645 		sz2 = VSW_MBLK_SZ_256;
    646 		sz3 = VSW_MBLK_SZ_2048;
    647 		sz4 = sz3 << 1;			/* 4K */
    648 		ldcp->max_rxpool_size = sz4;
    649 
    650 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
    651 		    sz1, sz2, sz3, sz4,
    652 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
    653 		    vsw_num_mblks4);
    654 		break;
    655 
    656 	default:	/* data_sz:  4K+ to 16K */
    657 
    658 		sz1 = VSW_MBLK_SZ_256;
    659 		sz2 = VSW_MBLK_SZ_2048;
    660 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
    661 		sz4 = data_sz;	/* Jumbo-size */
    662 		ldcp->max_rxpool_size = sz4;
    663 
    664 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
    665 		    sz1, sz2, sz3, sz4,
    666 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
    667 		    vsw_num_mblks4);
    668 		break;
    669 	}
    670 
    671 	return (rv);
    672 
    673 }
    674 
    675 /*
    676  * Attach a logical domain channel (ldc) under a specified port.
    677  *
    678  * Returns 0 on success, 1 on failure.
    679  */
    680 static int
    681 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
    682 {
    683 	vsw_t 		*vswp = port->p_vswp;
    684 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
    685 	vsw_ldc_t 	*ldcp = NULL;
    686 	ldc_attr_t 	attr;
    687 	ldc_status_t	istatus;
    688 	int 		status = DDI_FAILURE;
    689 	char		kname[MAXNAMELEN];
    690 	enum		{ PROG_init = 0x0,
    691 			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
    692 			    PROG_tx_thread = 0x4}
    693 			progress;
    694 
    695 	progress = PROG_init;
    696 
    697 	D1(vswp, "%s: enter", __func__);
    698 
    699 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
    700 	if (ldcp == NULL) {
    701 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
    702 		return (1);
    703 	}
    704 	ldcp->ldc_id = ldc_id;
    705 
    706 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
    707 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
    708 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
    709 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
    710 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
    711 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
    712 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
    713 
    714 	/* required for handshake with peer */
    715 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
    716 	ldcp->peer_session = 0;
    717 	ldcp->session_status = 0;
    718 	ldcp->hss_id = 1;	/* Initial handshake session id */
    719 
    720 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
    721 
    722 	/* only set for outbound lane, inbound set by peer */
    723 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
    724 
    725 	attr.devclass = LDC_DEV_NT_SVC;
    726 	attr.instance = ddi_get_instance(vswp->dip);
    727 	attr.mode = LDC_MODE_UNRELIABLE;
    728 	attr.mtu = VSW_LDC_MTU;
    729 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
    730 	if (status != 0) {
    731 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
    732 		    __func__, ldc_id, status);
    733 		goto ldc_attach_fail;
    734 	}
    735 
    736 	if (vsw_ldc_rxthr_enabled) {
    737 		ldcp->rx_thr_flags = 0;
    738 
    739 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
    740 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
    741 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
    742 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
    743 
    744 		progress |= PROG_rx_thread;
    745 		if (ldcp->rx_thread == NULL) {
    746 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
    747 			    __func__, ldc_id);
    748 			goto ldc_attach_fail;
    749 		}
    750 	}
    751 
    752 	if (vsw_ldc_txthr_enabled) {
    753 		ldcp->tx_thr_flags = 0;
    754 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
    755 
    756 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
    757 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
    758 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
    759 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
    760 
    761 		progress |= PROG_tx_thread;
    762 		if (ldcp->tx_thread == NULL) {
    763 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
    764 			    __func__, ldc_id);
    765 			goto ldc_attach_fail;
    766 		}
    767 	}
    768 
    769 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
    770 	if (status != 0) {
    771 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
    772 		    __func__, ldc_id, status);
    773 		(void) ldc_fini(ldcp->ldc_handle);
    774 		goto ldc_attach_fail;
    775 	}
    776 	/*
    777 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
    778 	 * data msgs, including raw data msgs used to recv priority frames.
    779 	 */
    780 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
    781 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
    782 
    783 	progress |= PROG_callback;
    784 
    785 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
    786 
    787 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
    788 		DERR(vswp, "%s: ldc_status failed", __func__);
    789 		mutex_destroy(&ldcp->status_lock);
    790 		goto ldc_attach_fail;
    791 	}
    792 
    793 	ldcp->ldc_status = istatus;
    794 	ldcp->ldc_port = port;
    795 	ldcp->ldc_vswp = vswp;
    796 
    797 	vsw_reset_vnet_proto_ops(ldcp);
    798 
    799 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
    800 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
    801 	    kname, &ldcp->ldc_stats);
    802 	if (ldcp->ksp == NULL) {
    803 		DERR(vswp, "%s: kstats setup failed", __func__);
    804 		goto ldc_attach_fail;
    805 	}
    806 
    807 	/* link it into the list of channels for this port */
    808 	WRITE_ENTER(&ldcl->lockrw);
    809 	ldcp->ldc_next = ldcl->head;
    810 	ldcl->head = ldcp;
    811 	RW_EXIT(&ldcl->lockrw);
    812 
    813 	D1(vswp, "%s: exit", __func__);
    814 	return (0);
    815 
    816 ldc_attach_fail:
    817 
    818 	if (progress & PROG_callback) {
    819 		(void) ldc_unreg_callback(ldcp->ldc_handle);
    820 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
    821 	}
    822 
    823 	if (progress & PROG_rx_thread) {
    824 		if (ldcp->rx_thread != NULL) {
    825 			vsw_stop_rx_thread(ldcp);
    826 		}
    827 		mutex_destroy(&ldcp->rx_thr_lock);
    828 		cv_destroy(&ldcp->rx_thr_cv);
    829 	}
    830 
    831 	if (progress & PROG_tx_thread) {
    832 		if (ldcp->tx_thread != NULL) {
    833 			vsw_stop_tx_thread(ldcp);
    834 		}
    835 		mutex_destroy(&ldcp->tx_thr_lock);
    836 		cv_destroy(&ldcp->tx_thr_cv);
    837 	}
    838 	if (ldcp->ksp != NULL) {
    839 		vgen_destroy_kstats(ldcp->ksp);
    840 	}
    841 	mutex_destroy(&ldcp->ldc_txlock);
    842 	mutex_destroy(&ldcp->ldc_rxlock);
    843 	mutex_destroy(&ldcp->ldc_cblock);
    844 	mutex_destroy(&ldcp->drain_cv_lock);
    845 
    846 	cv_destroy(&ldcp->drain_cv);
    847 
    848 	rw_destroy(&ldcp->lane_in.dlistrw);
    849 	rw_destroy(&ldcp->lane_out.dlistrw);
    850 
    851 	kmem_free(ldcp, sizeof (vsw_ldc_t));
    852 
    853 	return (1);
    854 }
    855 
    856 /*
    857  * Detach a logical domain channel (ldc) belonging to a
    858  * particular port.
    859  */
    860 static void
    861 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
    862 {
    863 	vsw_t 		*vswp = port->p_vswp;
    864 	vsw_ldc_t 	*ldcp, *prev_ldcp;
    865 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
    866 	int 		rv;
    867 	int		retries = 0;
    868 	vio_mblk_pool_t *fvmp = NULL;
    869 
    870 	prev_ldcp = ldcl->head;
    871 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
    872 		if (ldcp->ldc_id == ldc_id) {
    873 			break;
    874 		}
    875 	}
    876 
    877 	/* specified ldc id not found */
    878 	ASSERT(ldcp != NULL);
    879 
    880 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
    881 
    882 	/* Stop the receive thread */
    883 	if (ldcp->rx_thread != NULL) {
    884 		vsw_stop_rx_thread(ldcp);
    885 		mutex_destroy(&ldcp->rx_thr_lock);
    886 		cv_destroy(&ldcp->rx_thr_cv);
    887 	}
    888 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
    889 
    890 	/* Stop the tx thread */
    891 	if (ldcp->tx_thread != NULL) {
    892 		vsw_stop_tx_thread(ldcp);
    893 		mutex_destroy(&ldcp->tx_thr_lock);
    894 		cv_destroy(&ldcp->tx_thr_cv);
    895 		if (ldcp->tx_mhead != NULL) {
    896 			freemsgchain(ldcp->tx_mhead);
    897 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
    898 			ldcp->tx_cnt = 0;
    899 		}
    900 	}
    901 
    902 	/* Destory kstats */
    903 	vgen_destroy_kstats(ldcp->ksp);
    904 
    905 	/*
    906 	 * Before we can close the channel we must release any mapped
    907 	 * resources (e.g. drings).
    908 	 */
    909 	vsw_free_lane_resources(ldcp, INBOUND);
    910 	vsw_free_lane_resources(ldcp, OUTBOUND);
    911 
    912 	/*
    913 	 * Close the channel, retry on EAAGIN.
    914 	 */
    915 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
    916 		if (++retries > vsw_ldc_retries) {
    917 			break;
    918 		}
    919 		drv_usecwait(vsw_ldc_delay);
    920 	}
    921 	if (rv != 0) {
    922 		cmn_err(CE_NOTE,
    923 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
    924 		    vswp->instance, rv, ldcp->ldc_id);
    925 	}
    926 
    927 	(void) ldc_fini(ldcp->ldc_handle);
    928 
    929 	ldcp->ldc_status = LDC_INIT;
    930 	ldcp->ldc_handle = NULL;
    931 	ldcp->ldc_vswp = NULL;
    932 
    933 
    934 	/*
    935 	 * If we can't destroy all the rx pools for this channel, dispatch
    936 	 * a task to retry and clean up those rx pools. Note that we don't
    937 	 * need to wait for the task to complete. If the vsw device itself
    938 	 * gets detached (vsw_detach()), it will wait for the task to complete
    939 	 * implicitly in ddi_taskq_destroy().
    940 	 */
    941 	vio_destroy_multipools(&ldcp->vmp, &fvmp);
    942 	if (fvmp != NULL) {
    943 		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
    944 		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
    945 	}
    946 
    947 	/* unlink it from the list */
    948 	prev_ldcp = ldcp->ldc_next;
    949 
    950 	mutex_destroy(&ldcp->ldc_txlock);
    951 	mutex_destroy(&ldcp->ldc_rxlock);
    952 	mutex_destroy(&ldcp->ldc_cblock);
    953 	cv_destroy(&ldcp->drain_cv);
    954 	mutex_destroy(&ldcp->drain_cv_lock);
    955 	mutex_destroy(&ldcp->status_lock);
    956 	rw_destroy(&ldcp->lane_in.dlistrw);
    957 	rw_destroy(&ldcp->lane_out.dlistrw);
    958 
    959 	kmem_free(ldcp, sizeof (vsw_ldc_t));
    960 }
    961 
    962 /*
    963  * Open and attempt to bring up the channel. Note that channel
    964  * can only be brought up if peer has also opened channel.
    965  *
    966  * Returns 0 if can open and bring up channel, otherwise
    967  * returns 1.
    968  */
    969 static int
    970 vsw_ldc_init(vsw_ldc_t *ldcp)
    971 {
    972 	vsw_t 		*vswp = ldcp->ldc_vswp;
    973 	ldc_status_t	istatus = 0;
    974 	int		rv;
    975 
    976 	D1(vswp, "%s: enter", __func__);
    977 
    978 	LDC_ENTER_LOCK(ldcp);
    979 
    980 	/* don't start at 0 in case clients don't like that */
    981 	ldcp->next_ident = 1;
    982 
    983 	rv = ldc_open(ldcp->ldc_handle);
    984 	if (rv != 0) {
    985 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
    986 		    __func__, ldcp->ldc_id, rv);
    987 		LDC_EXIT_LOCK(ldcp);
    988 		return (1);
    989 	}
    990 
    991 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
    992 		DERR(vswp, "%s: unable to get status", __func__);
    993 		LDC_EXIT_LOCK(ldcp);
    994 		return (1);
    995 
    996 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
    997 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
    998 		    __func__, ldcp->ldc_id, istatus);
    999 		LDC_EXIT_LOCK(ldcp);
   1000 		return (1);
   1001 	}
   1002 
   1003 	mutex_enter(&ldcp->status_lock);
   1004 	ldcp->ldc_status = istatus;
   1005 	mutex_exit(&ldcp->status_lock);
   1006 
   1007 	rv = ldc_up(ldcp->ldc_handle);
   1008 	if (rv != 0) {
   1009 		/*
   1010 		 * Not a fatal error for ldc_up() to fail, as peer
   1011 		 * end point may simply not be ready yet.
   1012 		 */
   1013 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
   1014 		    ldcp->ldc_id, rv);
   1015 		LDC_EXIT_LOCK(ldcp);
   1016 		return (1);
   1017 	}
   1018 
   1019 	/*
   1020 	 * ldc_up() call is non-blocking so need to explicitly
   1021 	 * check channel status to see if in fact the channel
   1022 	 * is UP.
   1023 	 */
   1024 	mutex_enter(&ldcp->status_lock);
   1025 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
   1026 		DERR(vswp, "%s: unable to get status", __func__);
   1027 		mutex_exit(&ldcp->status_lock);
   1028 		LDC_EXIT_LOCK(ldcp);
   1029 		return (1);
   1030 
   1031 	}
   1032 
   1033 	if (ldcp->ldc_status == LDC_UP) {
   1034 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
   1035 		    ldcp->ldc_id, istatus);
   1036 		mutex_exit(&ldcp->status_lock);
   1037 		LDC_EXIT_LOCK(ldcp);
   1038 
   1039 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
   1040 		return (0);
   1041 	}
   1042 
   1043 	mutex_exit(&ldcp->status_lock);
   1044 	LDC_EXIT_LOCK(ldcp);
   1045 
   1046 	D1(vswp, "%s: exit", __func__);
   1047 	return (0);
   1048 }
   1049 
   1050 /* disable callbacks on the channel */
   1051 static void
   1052 vsw_ldc_uninit(vsw_ldc_t *ldcp)
   1053 {
   1054 	vsw_t	*vswp = ldcp->ldc_vswp;
   1055 	int	rv;
   1056 
   1057 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
   1058 
   1059 	LDC_ENTER_LOCK(ldcp);
   1060 
   1061 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
   1062 	if (rv != 0) {
   1063 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
   1064 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
   1065 	}
   1066 
   1067 	mutex_enter(&ldcp->status_lock);
   1068 	ldcp->ldc_status = LDC_INIT;
   1069 	mutex_exit(&ldcp->status_lock);
   1070 
   1071 	LDC_EXIT_LOCK(ldcp);
   1072 
   1073 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
   1074 }
   1075 
   1076 static int
   1077 vsw_init_ldcs(vsw_port_t *port)
   1078 {
   1079 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
   1080 	vsw_ldc_t	*ldcp;
   1081 
   1082 	READ_ENTER(&ldcl->lockrw);
   1083 	ldcp =  ldcl->head;
   1084 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
   1085 		(void) vsw_ldc_init(ldcp);
   1086 	}
   1087 	RW_EXIT(&ldcl->lockrw);
   1088 
   1089 	return (0);
   1090 }
   1091 
   1092 static void
   1093 vsw_uninit_ldcs(vsw_port_t *port)
   1094 {
   1095 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
   1096 	vsw_ldc_t	*ldcp;
   1097 
   1098 	D1(NULL, "vsw_uninit_ldcs: enter\n");
   1099 
   1100 	READ_ENTER(&ldcl->lockrw);
   1101 	ldcp =  ldcl->head;
   1102 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
   1103 		vsw_ldc_uninit(ldcp);
   1104 	}
   1105 	RW_EXIT(&ldcl->lockrw);
   1106 
   1107 	D1(NULL, "vsw_uninit_ldcs: exit\n");
   1108 }
   1109 
   1110 /*
   1111  * Wait until the callback(s) associated with the ldcs under the specified
   1112  * port have completed.
   1113  *
   1114  * Prior to this function being invoked each channel under this port
   1115  * should have been quiesced via ldc_set_cb_mode(DISABLE).
   1116  *
   1117  * A short explaination of what we are doing below..
   1118  *
   1119  * The simplest approach would be to have a reference counter in
   1120  * the ldc structure which is increment/decremented by the callbacks as
   1121  * they use the channel. The drain function could then simply disable any
   1122  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
   1123  * there is a tiny window here - before the callback is able to get the lock
   1124  * on the channel it is interrupted and this function gets to execute. It
   1125  * sees that the ref count is zero and believes its free to delete the
   1126  * associated data structures.
   1127  *
   1128  * We get around this by taking advantage of the fact that before the ldc
   1129  * framework invokes a callback it sets a flag to indicate that there is a
   1130  * callback active (or about to become active). If when we attempt to
   1131  * unregister a callback when this active flag is set then the unregister
   1132  * will fail with EWOULDBLOCK.
   1133  *
   1134  * If the unregister fails we do a cv_timedwait. We will either be signaled
   1135  * by the callback as it is exiting (note we have to wait a short period to
   1136  * allow the callback to return fully to the ldc framework and it to clear
   1137  * the active flag), or by the timer expiring. In either case we again attempt
   1138  * the unregister. We repeat this until we can succesfully unregister the
   1139  * callback.
   1140  *
   1141  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
   1142  * the case where the callback has finished but the ldc framework has not yet
   1143  * cleared the active flag. In this case we would never get a cv_signal.
   1144  */
   1145 static void
   1146 vsw_drain_ldcs(vsw_port_t *port)
   1147 {
   1148 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
   1149 	vsw_ldc_t	*ldcp;
   1150 	vsw_t		*vswp = port->p_vswp;
   1151 
   1152 	D1(vswp, "%s: enter", __func__);
   1153 
   1154 	READ_ENTER(&ldcl->lockrw);
   1155 
   1156 	ldcp = ldcl->head;
   1157 
   1158 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
   1159 		/*
   1160 		 * If we can unregister the channel callback then we
   1161 		 * know that there is no callback either running or
   1162 		 * scheduled to run for this channel so move on to next
   1163 		 * channel in the list.
   1164 		 */
   1165 		mutex_enter(&ldcp->drain_cv_lock);
   1166 
   1167 		/* prompt active callbacks to quit */
   1168 		ldcp->drain_state = VSW_LDC_DRAINING;
   1169 
   1170 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
   1171 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
   1172 			    ldcp->ldc_id);
   1173 			mutex_exit(&ldcp->drain_cv_lock);
   1174 			continue;
   1175 		} else {
   1176 			/*
   1177 			 * If we end up here we know that either 1) a callback
   1178 			 * is currently executing, 2) is about to start (i.e.
   1179 			 * the ldc framework has set the active flag but
   1180 			 * has not actually invoked the callback yet, or 3)
   1181 			 * has finished and has returned to the ldc framework
   1182 			 * but the ldc framework has not yet cleared the
   1183 			 * active bit.
   1184 			 *
   1185 			 * Wait for it to finish.
   1186 			 */
   1187 			while (ldc_unreg_callback(ldcp->ldc_handle)
   1188 			    == EWOULDBLOCK)
   1189 				(void) cv_reltimedwait(&ldcp->drain_cv,
   1190 				    &ldcp->drain_cv_lock, hz, TR_CLOCK_TICK);
   1191 
   1192 			mutex_exit(&ldcp->drain_cv_lock);
   1193 			D2(vswp, "%s: unreg callback for chan %ld after "
   1194 			    "timeout", __func__, ldcp->ldc_id);
   1195 		}
   1196 	}
   1197 	RW_EXIT(&ldcl->lockrw);
   1198 
   1199 	D1(vswp, "%s: exit", __func__);
   1200 }
   1201 
   1202 /*
   1203  * Wait until all tasks which reference this port have completed.
   1204  *
   1205  * Prior to this function being invoked each channel under this port
   1206  * should have been quiesced via ldc_set_cb_mode(DISABLE).
   1207  */
   1208 static void
   1209 vsw_drain_port_taskq(vsw_port_t *port)
   1210 {
   1211 	vsw_t		*vswp = port->p_vswp;
   1212 
   1213 	D1(vswp, "%s: enter", __func__);
   1214 
   1215 	/*
   1216 	 * Mark the port as in the process of being detached, and
   1217 	 * dispatch a marker task to the queue so we know when all
   1218 	 * relevant tasks have completed.
   1219 	 */
   1220 	mutex_enter(&port->state_lock);
   1221 	port->state = VSW_PORT_DETACHING;
   1222 
   1223 	if ((vswp->taskq_p == NULL) ||
   1224 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
   1225 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
   1226 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
   1227 		    vswp->instance);
   1228 		mutex_exit(&port->state_lock);
   1229 		return;
   1230 	}
   1231 
   1232 	/*
   1233 	 * Wait for the marker task to finish.
   1234 	 */
   1235 	while (port->state != VSW_PORT_DETACHABLE)
   1236 		cv_wait(&port->state_cv, &port->state_lock);
   1237 
   1238 	mutex_exit(&port->state_lock);
   1239 
   1240 	D1(vswp, "%s: exit", __func__);
   1241 }
   1242 
   1243 static void
   1244 vsw_marker_task(void *arg)
   1245 {
   1246 	vsw_port_t	*port = arg;
   1247 	vsw_t		*vswp = port->p_vswp;
   1248 
   1249 	D1(vswp, "%s: enter", __func__);
   1250 
   1251 	mutex_enter(&port->state_lock);
   1252 
   1253 	/*
   1254 	 * No further tasks should be dispatched which reference
   1255 	 * this port so ok to mark it as safe to detach.
   1256 	 */
   1257 	port->state = VSW_PORT_DETACHABLE;
   1258 
   1259 	cv_signal(&port->state_cv);
   1260 
   1261 	mutex_exit(&port->state_lock);
   1262 
   1263 	D1(vswp, "%s: exit", __func__);
   1264 }
   1265 
   1266 vsw_port_t *
   1267 vsw_lookup_port(vsw_t *vswp, int p_instance)
   1268 {
   1269 	vsw_port_list_t *plist = &vswp->plist;
   1270 	vsw_port_t	*port;
   1271 
   1272 	for (port = plist->head; port != NULL; port = port->p_next) {
   1273 		if (port->p_instance == p_instance) {
   1274 			D2(vswp, "vsw_lookup_port: found p_instance\n");
   1275 			return (port);
   1276 		}
   1277 	}
   1278 
   1279 	return (NULL);
   1280 }
   1281 
   1282 void
   1283 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
   1284 {
   1285 	vsw_ldc_list_t 	*ldclp;
   1286 	vsw_ldc_t	*ldcp;
   1287 
   1288 	ldclp = &portp->p_ldclist;
   1289 
   1290 	READ_ENTER(&ldclp->lockrw);
   1291 
   1292 	/*
   1293 	 * NOTE: for now, we will assume we have a single channel.
   1294 	 */
   1295 	if (ldclp->head == NULL) {
   1296 		RW_EXIT(&ldclp->lockrw);
   1297 		return;
   1298 	}
   1299 	ldcp = ldclp->head;
   1300 
   1301 	mutex_enter(&ldcp->ldc_cblock);
   1302 
   1303 	/*
   1304 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
   1305 	 * the connection. See comments in vsw_set_vnet_proto_ops().
   1306 	 */
   1307 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
   1308 	    portp->nvids != 0) {
   1309 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1310 	}
   1311 
   1312 	mutex_exit(&ldcp->ldc_cblock);
   1313 
   1314 	RW_EXIT(&ldclp->lockrw);
   1315 }
   1316 
   1317 void
   1318 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
   1319 {
   1320 	vsw_ldc_list_t	*ldclp;
   1321 	vsw_ldc_t	*ldcp;
   1322 
   1323 	ldclp = &portp->p_ldclist;
   1324 
   1325 	READ_ENTER(&ldclp->lockrw);
   1326 
   1327 	/*
   1328 	 * NOTE: for now, we will assume we have a single channel.
   1329 	 */
   1330 	if (ldclp->head == NULL) {
   1331 		RW_EXIT(&ldclp->lockrw);
   1332 		return;
   1333 	}
   1334 	ldcp = ldclp->head;
   1335 
   1336 	mutex_enter(&ldcp->ldc_cblock);
   1337 
   1338 	/*
   1339 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
   1340 	 * to trigger re-negotiation, which inturn trigger HybridIO
   1341 	 * setup/cleanup.
   1342 	 */
   1343 	if ((ldcp->hphase == VSW_MILESTONE4) &&
   1344 	    (portp->p_hio_capable == B_TRUE)) {
   1345 		if (immediate == B_TRUE) {
   1346 			(void) ldc_down(ldcp->ldc_handle);
   1347 		} else {
   1348 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1349 		}
   1350 	}
   1351 
   1352 	mutex_exit(&ldcp->ldc_cblock);
   1353 
   1354 	RW_EXIT(&ldclp->lockrw);
   1355 }
   1356 
   1357 void
   1358 vsw_port_reset(vsw_port_t *portp)
   1359 {
   1360 	vsw_ldc_list_t 	*ldclp;
   1361 	vsw_ldc_t	*ldcp;
   1362 
   1363 	ldclp = &portp->p_ldclist;
   1364 
   1365 	READ_ENTER(&ldclp->lockrw);
   1366 
   1367 	/*
   1368 	 * NOTE: for now, we will assume we have a single channel.
   1369 	 */
   1370 	if (ldclp->head == NULL) {
   1371 		RW_EXIT(&ldclp->lockrw);
   1372 		return;
   1373 	}
   1374 	ldcp = ldclp->head;
   1375 
   1376 	mutex_enter(&ldcp->ldc_cblock);
   1377 
   1378 	/*
   1379 	 * reset channel and terminate the connection.
   1380 	 */
   1381 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1382 
   1383 	mutex_exit(&ldcp->ldc_cblock);
   1384 
   1385 	RW_EXIT(&ldclp->lockrw);
   1386 }
   1387 
   1388 void
   1389 vsw_reset_ports(vsw_t *vswp)
   1390 {
   1391 	vsw_port_list_t	*plist = &vswp->plist;
   1392 	vsw_port_t	*portp;
   1393 
   1394 	READ_ENTER(&plist->lockrw);
   1395 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
   1396 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
   1397 			vsw_hio_stop_port(portp);
   1398 		}
   1399 		vsw_port_reset(portp);
   1400 	}
   1401 	RW_EXIT(&plist->lockrw);
   1402 }
   1403 
   1404 static void
   1405 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
   1406 {
   1407 	vnet_physlink_msg_t	msg;
   1408 	vnet_physlink_msg_t	*msgp = &msg;
   1409 	uint32_t		physlink_info = 0;
   1410 
   1411 	if (plink_state == LINK_STATE_UP) {
   1412 		physlink_info |= VNET_PHYSLINK_STATE_UP;
   1413 	} else {
   1414 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
   1415 	}
   1416 
   1417 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
   1418 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
   1419 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
   1420 	msgp->tag.vio_sid = ldcp->local_session;
   1421 	msgp->physlink_info = physlink_info;
   1422 
   1423 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
   1424 }
   1425 
   1426 static void
   1427 vsw_port_physlink_update(vsw_port_t *portp)
   1428 {
   1429 	vsw_ldc_list_t 	*ldclp;
   1430 	vsw_ldc_t	*ldcp;
   1431 	vsw_t		*vswp;
   1432 
   1433 	vswp = portp->p_vswp;
   1434 	ldclp = &portp->p_ldclist;
   1435 
   1436 	READ_ENTER(&ldclp->lockrw);
   1437 
   1438 	/*
   1439 	 * NOTE: for now, we will assume we have a single channel.
   1440 	 */
   1441 	if (ldclp->head == NULL) {
   1442 		RW_EXIT(&ldclp->lockrw);
   1443 		return;
   1444 	}
   1445 	ldcp = ldclp->head;
   1446 
   1447 	mutex_enter(&ldcp->ldc_cblock);
   1448 
   1449 	/*
   1450 	 * If handshake has completed successfully and if the vnet device
   1451 	 * has negotiated to get physical link state updates, send a message
   1452 	 * with the current state.
   1453 	 */
   1454 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
   1455 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
   1456 	}
   1457 
   1458 	mutex_exit(&ldcp->ldc_cblock);
   1459 
   1460 	RW_EXIT(&ldclp->lockrw);
   1461 }
   1462 
   1463 void
   1464 vsw_physlink_update_ports(vsw_t *vswp)
   1465 {
   1466 	vsw_port_list_t	*plist = &vswp->plist;
   1467 	vsw_port_t	*portp;
   1468 
   1469 	READ_ENTER(&plist->lockrw);
   1470 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
   1471 		vsw_port_physlink_update(portp);
   1472 	}
   1473 	RW_EXIT(&plist->lockrw);
   1474 }
   1475 
   1476 /*
   1477  * Search for and remove the specified port from the port
   1478  * list. Returns 0 if able to locate and remove port, otherwise
   1479  * returns 1.
   1480  */
   1481 static int
   1482 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
   1483 {
   1484 	vsw_port_list_t *plist = &vswp->plist;
   1485 	vsw_port_t	*curr_p, *prev_p;
   1486 
   1487 	if (plist->head == NULL)
   1488 		return (1);
   1489 
   1490 	curr_p = prev_p = plist->head;
   1491 
   1492 	while (curr_p != NULL) {
   1493 		if (curr_p == port) {
   1494 			if (prev_p == curr_p) {
   1495 				plist->head = curr_p->p_next;
   1496 			} else {
   1497 				prev_p->p_next = curr_p->p_next;
   1498 			}
   1499 			plist->num_ports--;
   1500 			break;
   1501 		} else {
   1502 			prev_p = curr_p;
   1503 			curr_p = curr_p->p_next;
   1504 		}
   1505 	}
   1506 	return (0);
   1507 }
   1508 
   1509 /*
   1510  * Interrupt handler for ldc messages.
   1511  */
   1512 static uint_t
   1513 vsw_ldc_cb(uint64_t event, caddr_t arg)
   1514 {
   1515 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
   1516 	vsw_t 		*vswp = ldcp->ldc_vswp;
   1517 
   1518 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
   1519 
   1520 	mutex_enter(&ldcp->ldc_cblock);
   1521 	ldcp->ldc_stats.callbacks++;
   1522 
   1523 	mutex_enter(&ldcp->status_lock);
   1524 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
   1525 		mutex_exit(&ldcp->status_lock);
   1526 		mutex_exit(&ldcp->ldc_cblock);
   1527 		return (LDC_SUCCESS);
   1528 	}
   1529 	mutex_exit(&ldcp->status_lock);
   1530 
   1531 	if (event & LDC_EVT_UP) {
   1532 		/*
   1533 		 * Channel has come up.
   1534 		 */
   1535 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
   1536 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
   1537 
   1538 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
   1539 
   1540 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
   1541 	}
   1542 
   1543 	if (event & LDC_EVT_READ) {
   1544 		/*
   1545 		 * Data available for reading.
   1546 		 */
   1547 		D2(vswp, "%s: id(ld) event(%llx) data READ",
   1548 		    __func__, ldcp->ldc_id, event);
   1549 
   1550 		if (ldcp->rx_thread != NULL) {
   1551 			/*
   1552 			 * If the receive thread is enabled, then
   1553 			 * wakeup the receive thread to process the
   1554 			 * LDC messages.
   1555 			 */
   1556 			mutex_exit(&ldcp->ldc_cblock);
   1557 			mutex_enter(&ldcp->rx_thr_lock);
   1558 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
   1559 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
   1560 				cv_signal(&ldcp->rx_thr_cv);
   1561 			}
   1562 			mutex_exit(&ldcp->rx_thr_lock);
   1563 			mutex_enter(&ldcp->ldc_cblock);
   1564 		} else {
   1565 			vsw_process_pkt(ldcp);
   1566 		}
   1567 
   1568 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
   1569 
   1570 		goto vsw_cb_exit;
   1571 	}
   1572 
   1573 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
   1574 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
   1575 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
   1576 
   1577 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
   1578 	}
   1579 
   1580 	/*
   1581 	 * Catch either LDC_EVT_WRITE which we don't support or any
   1582 	 * unknown event.
   1583 	 */
   1584 	if (event &
   1585 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
   1586 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
   1587 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
   1588 	}
   1589 
   1590 vsw_cb_exit:
   1591 	mutex_exit(&ldcp->ldc_cblock);
   1592 
   1593 	/*
   1594 	 * Let the drain function know we are finishing if it
   1595 	 * is waiting.
   1596 	 */
   1597 	mutex_enter(&ldcp->drain_cv_lock);
   1598 	if (ldcp->drain_state == VSW_LDC_DRAINING)
   1599 		cv_signal(&ldcp->drain_cv);
   1600 	mutex_exit(&ldcp->drain_cv_lock);
   1601 
   1602 	return (LDC_SUCCESS);
   1603 }
   1604 
   1605 /*
   1606  * Reinitialise data structures associated with the channel.
   1607  */
   1608 static void
   1609 vsw_ldc_reinit(vsw_ldc_t *ldcp)
   1610 {
   1611 	vsw_t		*vswp = ldcp->ldc_vswp;
   1612 	vsw_port_t	*port;
   1613 	vsw_ldc_list_t	*ldcl;
   1614 	vio_mblk_pool_t *fvmp = NULL;
   1615 
   1616 	D1(vswp, "%s: enter", __func__);
   1617 
   1618 	/*
   1619 	 * If we can't destroy all the rx pools for this channel, dispatch
   1620 	 * a task to retry and clean up those rx pools. Note that we don't
   1621 	 * need to wait for the task to complete. If the vsw device itself
   1622 	 * gets detached (vsw_detach()), it will wait for the task to complete
   1623 	 * implicitly in ddi_taskq_destroy().
   1624 	 */
   1625 	vio_destroy_multipools(&ldcp->vmp, &fvmp);
   1626 	if (fvmp != NULL) {
   1627 		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
   1628 		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
   1629 	}
   1630 
   1631 	port = ldcp->ldc_port;
   1632 	ldcl = &port->p_ldclist;
   1633 
   1634 	READ_ENTER(&ldcl->lockrw);
   1635 
   1636 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
   1637 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
   1638 
   1639 	vsw_free_lane_resources(ldcp, INBOUND);
   1640 	vsw_free_lane_resources(ldcp, OUTBOUND);
   1641 	RW_EXIT(&ldcl->lockrw);
   1642 
   1643 	ldcp->lane_in.lstate = 0;
   1644 	ldcp->lane_out.lstate = 0;
   1645 
   1646 	/* Remove the fdb entry for this port/mac address */
   1647 	vsw_fdbe_del(vswp, &(port->p_macaddr));
   1648 
   1649 	/* remove the port from vlans it has been assigned to */
   1650 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
   1651 
   1652 	/*
   1653 	 * Remove parent port from any multicast groups
   1654 	 * it may have registered with. Client must resend
   1655 	 * multicast add command after handshake completes.
   1656 	 */
   1657 	vsw_del_mcst_port(port);
   1658 
   1659 	ldcp->peer_session = 0;
   1660 	ldcp->session_status = 0;
   1661 	ldcp->hcnt = 0;
   1662 	ldcp->hphase = VSW_MILESTONE0;
   1663 
   1664 	vsw_reset_vnet_proto_ops(ldcp);
   1665 
   1666 	D1(vswp, "%s: exit", __func__);
   1667 }
   1668 
   1669 /*
   1670  * Process a connection event.
   1671  *
   1672  * Note - care must be taken to ensure that this function is
   1673  * not called with the dlistrw lock held.
   1674  */
   1675 static void
   1676 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
   1677 {
   1678 	vsw_t		*vswp = ldcp->ldc_vswp;
   1679 	vsw_conn_evt_t	*conn = NULL;
   1680 
   1681 	D1(vswp, "%s: enter", __func__);
   1682 
   1683 	/*
   1684 	 * Check if either a reset or restart event is pending
   1685 	 * or in progress. If so just return.
   1686 	 *
   1687 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
   1688 	 * being received by the callback handler, or a ECONNRESET error
   1689 	 * code being returned from a ldc_read() or ldc_write() call.
   1690 	 *
   1691 	 * A VSW_CONN_RESTART event occurs when some error checking code
   1692 	 * decides that there is a problem with data from the channel,
   1693 	 * and that the handshake should be restarted.
   1694 	 */
   1695 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
   1696 	    (ldstub((uint8_t *)&ldcp->reset_active)))
   1697 		return;
   1698 
   1699 	/*
   1700 	 * If it is an LDC_UP event we first check the recorded
   1701 	 * state of the channel. If this is UP then we know that
   1702 	 * the channel moving to the UP state has already been dealt
   1703 	 * with and don't need to dispatch a  new task.
   1704 	 *
   1705 	 * The reason for this check is that when we do a ldc_up(),
   1706 	 * depending on the state of the peer, we may or may not get
   1707 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
   1708 	 * every time we do ldc_up() we explicitly check the channel
   1709 	 * status to see has it come up (ldc_up() is asynch and will
   1710 	 * complete at some undefined time), and take the appropriate
   1711 	 * action.
   1712 	 *
   1713 	 * The flip side of this is that we may get a LDC_UP event
   1714 	 * when we have already seen that the channel is up and have
   1715 	 * dealt with that.
   1716 	 */
   1717 	mutex_enter(&ldcp->status_lock);
   1718 	if (evt == VSW_CONN_UP) {
   1719 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
   1720 			mutex_exit(&ldcp->status_lock);
   1721 			return;
   1722 		}
   1723 	}
   1724 	mutex_exit(&ldcp->status_lock);
   1725 
   1726 	/*
   1727 	 * The transaction group id allows us to identify and discard
   1728 	 * any tasks which are still pending on the taskq and refer
   1729 	 * to the handshake session we are about to restart or reset.
   1730 	 * These stale messages no longer have any real meaning.
   1731 	 */
   1732 	(void) atomic_inc_32(&ldcp->hss_id);
   1733 
   1734 	ASSERT(vswp->taskq_p != NULL);
   1735 
   1736 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
   1737 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
   1738 		    " connection event", vswp->instance);
   1739 		goto err_exit;
   1740 	}
   1741 
   1742 	conn->evt = evt;
   1743 	conn->ldcp = ldcp;
   1744 
   1745 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
   1746 	    DDI_NOSLEEP) != DDI_SUCCESS) {
   1747 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
   1748 		    vswp->instance);
   1749 
   1750 		kmem_free(conn, sizeof (vsw_conn_evt_t));
   1751 		goto err_exit;
   1752 	}
   1753 
   1754 	D1(vswp, "%s: exit", __func__);
   1755 	return;
   1756 
   1757 err_exit:
   1758 	/*
   1759 	 * Have mostly likely failed due to memory shortage. Clear the flag so
   1760 	 * that future requests will at least be attempted and will hopefully
   1761 	 * succeed.
   1762 	 */
   1763 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
   1764 		ldcp->reset_active = 0;
   1765 }
   1766 
   1767 /*
   1768  * Deal with events relating to a connection. Invoked from a taskq.
   1769  */
   1770 static void
   1771 vsw_conn_task(void *arg)
   1772 {
   1773 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
   1774 	vsw_ldc_t	*ldcp = NULL;
   1775 	vsw_port_t	*portp;
   1776 	vsw_t		*vswp = NULL;
   1777 	uint16_t	evt;
   1778 	ldc_status_t	curr_status;
   1779 
   1780 	ldcp = conn->ldcp;
   1781 	evt = conn->evt;
   1782 	vswp = ldcp->ldc_vswp;
   1783 	portp = ldcp->ldc_port;
   1784 
   1785 	D1(vswp, "%s: enter", __func__);
   1786 
   1787 	/* can safely free now have copied out data */
   1788 	kmem_free(conn, sizeof (vsw_conn_evt_t));
   1789 
   1790 	mutex_enter(&ldcp->status_lock);
   1791 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
   1792 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
   1793 		    "channel %ld", vswp->instance, ldcp->ldc_id);
   1794 		mutex_exit(&ldcp->status_lock);
   1795 		return;
   1796 	}
   1797 
   1798 	/*
   1799 	 * If we wish to restart the handshake on this channel, then if
   1800 	 * the channel is UP we bring it DOWN to flush the underlying
   1801 	 * ldc queue.
   1802 	 */
   1803 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
   1804 		(void) ldc_down(ldcp->ldc_handle);
   1805 
   1806 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
   1807 		vsw_hio_stop(vswp, ldcp);
   1808 	}
   1809 
   1810 	/*
   1811 	 * re-init all the associated data structures.
   1812 	 */
   1813 	vsw_ldc_reinit(ldcp);
   1814 
   1815 	/*
   1816 	 * Bring the channel back up (note it does no harm to
   1817 	 * do this even if the channel is already UP, Just
   1818 	 * becomes effectively a no-op).
   1819 	 */
   1820 	(void) ldc_up(ldcp->ldc_handle);
   1821 
   1822 	/*
   1823 	 * Check if channel is now UP. This will only happen if
   1824 	 * peer has also done a ldc_up().
   1825 	 */
   1826 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
   1827 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
   1828 		    "channel %ld", vswp->instance, ldcp->ldc_id);
   1829 		mutex_exit(&ldcp->status_lock);
   1830 		return;
   1831 	}
   1832 
   1833 	ldcp->ldc_status = curr_status;
   1834 
   1835 	/* channel UP so restart handshake by sending version info */
   1836 	if (curr_status == LDC_UP) {
   1837 		if (ldcp->hcnt++ > vsw_num_handshakes) {
   1838 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
   1839 			    " handshake attempts (%d) on channel %ld",
   1840 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
   1841 			mutex_exit(&ldcp->status_lock);
   1842 			return;
   1843 		}
   1844 
   1845 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
   1846 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
   1847 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
   1848 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
   1849 			    vswp->instance);
   1850 
   1851 			/*
   1852 			 * Don't count as valid restart attempt if couldn't
   1853 			 * send version msg.
   1854 			 */
   1855 			if (ldcp->hcnt > 0)
   1856 				ldcp->hcnt--;
   1857 		}
   1858 	}
   1859 
   1860 	/*
   1861 	 * Mark that the process is complete by clearing the flag.
   1862 	 *
   1863 	 * Note is it possible that the taskq dispatch above may have failed,
   1864 	 * most likely due to memory shortage. We still clear the flag so
   1865 	 * future attempts will at least be attempted and will hopefully
   1866 	 * succeed.
   1867 	 */
   1868 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
   1869 		ldcp->reset_active = 0;
   1870 
   1871 	mutex_exit(&ldcp->status_lock);
   1872 
   1873 	D1(vswp, "%s: exit", __func__);
   1874 }
   1875 
   1876 /*
   1877  * returns 0 if legal for event signified by flag to have
   1878  * occured at the time it did. Otherwise returns 1.
   1879  */
   1880 int
   1881 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
   1882 {
   1883 	vsw_t		*vswp = ldcp->ldc_vswp;
   1884 	uint64_t	state;
   1885 	uint64_t	phase;
   1886 
   1887 	if (dir == INBOUND)
   1888 		state = ldcp->lane_in.lstate;
   1889 	else
   1890 		state = ldcp->lane_out.lstate;
   1891 
   1892 	phase = ldcp->hphase;
   1893 
   1894 	switch (flag) {
   1895 	case VSW_VER_INFO_RECV:
   1896 		if (phase > VSW_MILESTONE0) {
   1897 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
   1898 			    " when in state %d\n", ldcp->ldc_id, phase);
   1899 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1900 			return (1);
   1901 		}
   1902 		break;
   1903 
   1904 	case VSW_VER_ACK_RECV:
   1905 	case VSW_VER_NACK_RECV:
   1906 		if (!(state & VSW_VER_INFO_SENT)) {
   1907 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
   1908 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
   1909 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1910 			return (1);
   1911 		} else
   1912 			state &= ~VSW_VER_INFO_SENT;
   1913 		break;
   1914 
   1915 	case VSW_ATTR_INFO_RECV:
   1916 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
   1917 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
   1918 			    " when in state %d\n", ldcp->ldc_id, phase);
   1919 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1920 			return (1);
   1921 		}
   1922 		break;
   1923 
   1924 	case VSW_ATTR_ACK_RECV:
   1925 	case VSW_ATTR_NACK_RECV:
   1926 		if (!(state & VSW_ATTR_INFO_SENT)) {
   1927 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
   1928 			    " or ATTR_NACK when in state %d\n",
   1929 			    ldcp->ldc_id, phase);
   1930 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1931 			return (1);
   1932 		} else
   1933 			state &= ~VSW_ATTR_INFO_SENT;
   1934 		break;
   1935 
   1936 	case VSW_DRING_INFO_RECV:
   1937 		if (phase < VSW_MILESTONE1) {
   1938 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
   1939 			    " when in state %d\n", ldcp->ldc_id, phase);
   1940 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1941 			return (1);
   1942 		}
   1943 		break;
   1944 
   1945 	case VSW_DRING_ACK_RECV:
   1946 	case VSW_DRING_NACK_RECV:
   1947 		if (!(state & VSW_DRING_INFO_SENT)) {
   1948 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
   1949 			    " or DRING_NACK when in state %d\n",
   1950 			    ldcp->ldc_id, phase);
   1951 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1952 			return (1);
   1953 		} else
   1954 			state &= ~VSW_DRING_INFO_SENT;
   1955 		break;
   1956 
   1957 	case VSW_RDX_INFO_RECV:
   1958 		if (phase < VSW_MILESTONE3) {
   1959 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
   1960 			    " when in state %d\n", ldcp->ldc_id, phase);
   1961 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1962 			return (1);
   1963 		}
   1964 		break;
   1965 
   1966 	case VSW_RDX_ACK_RECV:
   1967 	case VSW_RDX_NACK_RECV:
   1968 		if (!(state & VSW_RDX_INFO_SENT)) {
   1969 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
   1970 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
   1971 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1972 			return (1);
   1973 		} else
   1974 			state &= ~VSW_RDX_INFO_SENT;
   1975 		break;
   1976 
   1977 	case VSW_MCST_INFO_RECV:
   1978 		if (phase < VSW_MILESTONE3) {
   1979 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
   1980 			    " when in state %d\n", ldcp->ldc_id, phase);
   1981 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   1982 			return (1);
   1983 		}
   1984 		break;
   1985 
   1986 	default:
   1987 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
   1988 		    ldcp->ldc_id, flag);
   1989 		return (1);
   1990 	}
   1991 
   1992 	if (dir == INBOUND)
   1993 		ldcp->lane_in.lstate = state;
   1994 	else
   1995 		ldcp->lane_out.lstate = state;
   1996 
   1997 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
   1998 
   1999 	return (0);
   2000 }
   2001 
   2002 void
   2003 vsw_next_milestone(vsw_ldc_t *ldcp)
   2004 {
   2005 	vsw_t		*vswp = ldcp->ldc_vswp;
   2006 	vsw_port_t	*portp = ldcp->ldc_port;
   2007 
   2008 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
   2009 	    ldcp->ldc_id, ldcp->hphase);
   2010 
   2011 	DUMP_FLAGS(ldcp->lane_in.lstate);
   2012 	DUMP_FLAGS(ldcp->lane_out.lstate);
   2013 
   2014 	switch (ldcp->hphase) {
   2015 
   2016 	case VSW_MILESTONE0:
   2017 		/*
   2018 		 * If we haven't started to handshake with our peer,
   2019 		 * start to do so now.
   2020 		 */
   2021 		if (ldcp->lane_out.lstate == 0) {
   2022 			D2(vswp, "%s: (chan %lld) starting handshake "
   2023 			    "with peer", __func__, ldcp->ldc_id);
   2024 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
   2025 		}
   2026 
   2027 		/*
   2028 		 * Only way to pass this milestone is to have successfully
   2029 		 * negotiated version info.
   2030 		 */
   2031 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
   2032 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
   2033 
   2034 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
   2035 			    __func__, ldcp->ldc_id);
   2036 
   2037 			vsw_set_vnet_proto_ops(ldcp);
   2038 
   2039 			/*
   2040 			 * Next milestone is passed when attribute
   2041 			 * information has been successfully exchanged.
   2042 			 */
   2043 			ldcp->hphase = VSW_MILESTONE1;
   2044 			vsw_send_attr(ldcp);
   2045 
   2046 		}
   2047 		break;
   2048 
   2049 	case VSW_MILESTONE1:
   2050 		/*
   2051 		 * Only way to pass this milestone is to have successfully
   2052 		 * negotiated attribute information.
   2053 		 */
   2054 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
   2055 
   2056 			ldcp->hphase = VSW_MILESTONE2;
   2057 
   2058 			/*
   2059 			 * If the peer device has said it wishes to
   2060 			 * use descriptor rings then we send it our ring
   2061 			 * info, otherwise we just set up a private ring
   2062 			 * which we use an internal buffer
   2063 			 */
   2064 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
   2065 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
   2066 			    (VSW_VER_LT(ldcp, 1, 2) &&
   2067 			    (ldcp->lane_in.xfer_mode ==
   2068 			    VIO_DRING_MODE_V1_0))) {
   2069 				vsw_send_dring_info(ldcp);
   2070 			}
   2071 		}
   2072 		break;
   2073 
   2074 	case VSW_MILESTONE2:
   2075 		/*
   2076 		 * If peer has indicated in its attribute message that
   2077 		 * it wishes to use descriptor rings then the only way
   2078 		 * to pass this milestone is for us to have received
   2079 		 * valid dring info.
   2080 		 *
   2081 		 * If peer is not using descriptor rings then just fall
   2082 		 * through.
   2083 		 */
   2084 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
   2085 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
   2086 		    (VSW_VER_LT(ldcp, 1, 2) &&
   2087 		    (ldcp->lane_in.xfer_mode ==
   2088 		    VIO_DRING_MODE_V1_0))) {
   2089 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
   2090 				break;
   2091 		}
   2092 
   2093 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
   2094 		    __func__, ldcp->ldc_id);
   2095 
   2096 		ldcp->hphase = VSW_MILESTONE3;
   2097 		vsw_send_rdx(ldcp);
   2098 		break;
   2099 
   2100 	case VSW_MILESTONE3:
   2101 		/*
   2102 		 * Pass this milestone when all paramaters have been
   2103 		 * successfully exchanged and RDX sent in both directions.
   2104 		 *
   2105 		 * Mark outbound lane as available to transmit data.
   2106 		 */
   2107 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
   2108 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
   2109 
   2110 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
   2111 			    __func__, ldcp->ldc_id);
   2112 			D2(vswp, "%s: ** handshake complete (0x%llx : "
   2113 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
   2114 			    ldcp->lane_out.lstate);
   2115 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
   2116 			ldcp->hphase = VSW_MILESTONE4;
   2117 			ldcp->hcnt = 0;
   2118 			DISPLAY_STATE();
   2119 			/* Start HIO if enabled and capable */
   2120 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
   2121 				D2(vswp, "%s: start HybridIO setup", __func__);
   2122 				vsw_hio_start(vswp, ldcp);
   2123 			}
   2124 
   2125 			if (ldcp->pls_negotiated == B_TRUE) {
   2126 				/*
   2127 				 * The vnet device has negotiated to get phys
   2128 				 * link updates. Now that the handshake with
   2129 				 * the vnet device is complete, send an initial
   2130 				 * update with the current physical link state.
   2131 				 */
   2132 				vsw_send_physlink_msg(ldcp,
   2133 				    vswp->phys_link_state);
   2134 			}
   2135 
   2136 		} else {
   2137 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
   2138 			    __func__, ldcp->lane_in.lstate,
   2139 			    ldcp->lane_out.lstate);
   2140 		}
   2141 		break;
   2142 
   2143 	case VSW_MILESTONE4:
   2144 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
   2145 		    ldcp->ldc_id);
   2146 		break;
   2147 
   2148 	default:
   2149 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
   2150 		    ldcp->ldc_id, ldcp->hphase);
   2151 	}
   2152 
   2153 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
   2154 	    ldcp->hphase);
   2155 }
   2156 
   2157 /*
   2158  * Check if major version is supported.
   2159  *
   2160  * Returns 0 if finds supported major number, and if necessary
   2161  * adjusts the minor field.
   2162  *
   2163  * Returns 1 if can't match major number exactly. Sets mjor/minor
   2164  * to next lowest support values, or to zero if no other values possible.
   2165  */
   2166 static int
   2167 vsw_supported_version(vio_ver_msg_t *vp)
   2168 {
   2169 	int	i;
   2170 
   2171 	D1(NULL, "vsw_supported_version: enter");
   2172 
   2173 	for (i = 0; i < VSW_NUM_VER; i++) {
   2174 		if (vsw_versions[i].ver_major == vp->ver_major) {
   2175 			/*
   2176 			 * Matching or lower major version found. Update
   2177 			 * minor number if necessary.
   2178 			 */
   2179 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
   2180 				D2(NULL, "%s: adjusting minor value from %d "
   2181 				    "to %d", __func__, vp->ver_minor,
   2182 				    vsw_versions[i].ver_minor);
   2183 				vp->ver_minor = vsw_versions[i].ver_minor;
   2184 			}
   2185 
   2186 			return (0);
   2187 		}
   2188 
   2189 		/*
   2190 		 * If the message contains a higher major version number, set
   2191 		 * the message's major/minor versions to the current values
   2192 		 * and return false, so this message will get resent with
   2193 		 * these values.
   2194 		 */
   2195 		if (vsw_versions[i].ver_major < vp->ver_major) {
   2196 			D2(NULL, "%s: adjusting major and minor "
   2197 			    "values to %d, %d\n",
   2198 			    __func__, vsw_versions[i].ver_major,
   2199 			    vsw_versions[i].ver_minor);
   2200 			vp->ver_major = vsw_versions[i].ver_major;
   2201 			vp->ver_minor = vsw_versions[i].ver_minor;
   2202 			return (1);
   2203 		}
   2204 	}
   2205 
   2206 	/* No match was possible, zero out fields */
   2207 	vp->ver_major = 0;
   2208 	vp->ver_minor = 0;
   2209 
   2210 	D1(NULL, "vsw_supported_version: exit");
   2211 
   2212 	return (1);
   2213 }
   2214 
   2215 /*
   2216  * Set vnet-protocol-version dependent functions based on version.
   2217  */
   2218 static void
   2219 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
   2220 {
   2221 	vsw_t	*vswp = ldcp->ldc_vswp;
   2222 	lane_t	*lp = &ldcp->lane_out;
   2223 
   2224 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
   2225 		/*
   2226 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
   2227 		 * Support), set the mtu in our attributes to max_frame_size.
   2228 		 */
   2229 		lp->mtu = vswp->max_frame_size;
   2230 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
   2231 		/*
   2232 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
   2233 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
   2234 		 */
   2235 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
   2236 	} else {
   2237 		vsw_port_t	*portp = ldcp->ldc_port;
   2238 		/*
   2239 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
   2240 		 * We can negotiate that size with those peers provided only
   2241 		 * pvid is defined for our peer and there are no vids. Then we
   2242 		 * can send/recv only untagged frames of max size ETHERMAX.
   2243 		 * Note that pvid of the peer can be different, as vsw has to
   2244 		 * serve the vnet in that vlan even if itself is not assigned
   2245 		 * to that vlan.
   2246 		 */
   2247 		if (portp->nvids == 0) {
   2248 			lp->mtu = ETHERMAX;
   2249 		}
   2250 	}
   2251 
   2252 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
   2253 		/* Versions >= 1.2 */
   2254 
   2255 		if (VSW_PRI_ETH_DEFINED(vswp)) {
   2256 			/*
   2257 			 * enable priority routines and pkt mode only if
   2258 			 * at least one pri-eth-type is specified in MD.
   2259 			 */
   2260 			ldcp->tx = vsw_ldctx_pri;
   2261 			ldcp->rx_pktdata = vsw_process_pkt_data;
   2262 
   2263 			/* set xfer mode for vsw_send_attr() */
   2264 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
   2265 		} else {
   2266 			/* no priority eth types defined in MD */
   2267 
   2268 			ldcp->tx = vsw_ldctx;
   2269 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
   2270 
   2271 			/* set xfer mode for vsw_send_attr() */
   2272 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
   2273 		}
   2274 
   2275 	} else {
   2276 		/* Versions prior to 1.2  */
   2277 
   2278 		vsw_reset_vnet_proto_ops(ldcp);
   2279 	}
   2280 }
   2281 
   2282 /*
   2283  * Reset vnet-protocol-version dependent functions to v1.0.
   2284  */
   2285 static void
   2286 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
   2287 {
   2288 	lane_t	*lp = &ldcp->lane_out;
   2289 
   2290 	ldcp->tx = vsw_ldctx;
   2291 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
   2292 
   2293 	/* set xfer mode for vsw_send_attr() */
   2294 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
   2295 }
   2296 
   2297 /*
   2298  * Main routine for processing messages received over LDC.
   2299  */
   2300 static void
   2301 vsw_process_pkt(void *arg)
   2302 {
   2303 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
   2304 	vsw_t 		*vswp = ldcp->ldc_vswp;
   2305 	size_t		msglen;
   2306 	vio_msg_tag_t	*tagp;
   2307 	uint64_t	*ldcmsg;
   2308 	int 		rv = 0;
   2309 
   2310 
   2311 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
   2312 
   2313 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
   2314 
   2315 	ldcmsg = ldcp->ldcmsg;
   2316 	/*
   2317 	 * If channel is up read messages until channel is empty.
   2318 	 */
   2319 	do {
   2320 		msglen = ldcp->msglen;
   2321 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
   2322 
   2323 		if (rv != 0) {
   2324 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
   2325 			    __func__, ldcp->ldc_id, rv, msglen);
   2326 		}
   2327 
   2328 		/* channel has been reset */
   2329 		if (rv == ECONNRESET) {
   2330 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
   2331 			break;
   2332 		}
   2333 
   2334 		if (msglen == 0) {
   2335 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
   2336 			    ldcp->ldc_id);
   2337 			break;
   2338 		}
   2339 
   2340 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
   2341 		    ldcp->ldc_id, msglen);
   2342 
   2343 		/*
   2344 		 * Figure out what sort of packet we have gotten by
   2345 		 * examining the msg tag, and then switch it appropriately.
   2346 		 */
   2347 		tagp = (vio_msg_tag_t *)ldcmsg;
   2348 
   2349 		switch (tagp->vio_msgtype) {
   2350 		case VIO_TYPE_CTRL:
   2351 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
   2352 			break;
   2353 		case VIO_TYPE_DATA:
   2354 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
   2355 			break;
   2356 		case VIO_TYPE_ERR:
   2357 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
   2358 			break;
   2359 		default:
   2360 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
   2361 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
   2362 			break;
   2363 		}
   2364 	} while (msglen);
   2365 
   2366 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
   2367 }
   2368 
   2369 /*
   2370  * Dispatch a task to process a VIO control message.
   2371  */
   2372 static void
   2373 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
   2374 {
   2375 	vsw_ctrl_task_t		*ctaskp = NULL;
   2376 	vsw_port_t		*port = ldcp->ldc_port;
   2377 	vsw_t			*vswp = port->p_vswp;
   2378 
   2379 	D1(vswp, "%s: enter", __func__);
   2380 
   2381 	/*
   2382 	 * We need to handle RDX ACK messages in-band as once they
   2383 	 * are exchanged it is possible that we will get an
   2384 	 * immediate (legitimate) data packet.
   2385 	 */
   2386 	if ((tagp->vio_subtype_env == VIO_RDX) &&
   2387 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
   2388 
   2389 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
   2390 			return;
   2391 
   2392 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
   2393 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
   2394 		    "(ostate 0x%llx : hphase %d)", __func__,
   2395 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
   2396 		vsw_next_milestone(ldcp);
   2397 		return;
   2398 	}
   2399 
   2400 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
   2401 
   2402 	if (ctaskp == NULL) {
   2403 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
   2404 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   2405 		return;
   2406 	}
   2407 
   2408 	ctaskp->ldcp = ldcp;
   2409 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
   2410 	ctaskp->hss_id = ldcp->hss_id;
   2411 
   2412 	/*
   2413 	 * Dispatch task to processing taskq if port is not in
   2414 	 * the process of being detached.
   2415 	 */
   2416 	mutex_enter(&port->state_lock);
   2417 	if (port->state == VSW_PORT_INIT) {
   2418 		if ((vswp->taskq_p == NULL) ||
   2419 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
   2420 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
   2421 			mutex_exit(&port->state_lock);
   2422 			DERR(vswp, "%s: unable to dispatch task to taskq",
   2423 			    __func__);
   2424 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   2425 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
   2426 			return;
   2427 		}
   2428 	} else {
   2429 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
   2430 		DWARN(vswp, "%s: port %d detaching, not dispatching "
   2431 		    "task", __func__, port->p_instance);
   2432 	}
   2433 
   2434 	mutex_exit(&port->state_lock);
   2435 
   2436 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
   2437 	    ldcp->ldc_id);
   2438 	D1(vswp, "%s: exit", __func__);
   2439 }
   2440 
   2441 /*
   2442  * Process a VIO ctrl message. Invoked from taskq.
   2443  */
   2444 static void
   2445 vsw_process_ctrl_pkt(void *arg)
   2446 {
   2447 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
   2448 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
   2449 	vsw_t 		*vswp = ldcp->ldc_vswp;
   2450 	vio_msg_tag_t	tag;
   2451 	uint16_t	env;
   2452 
   2453 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   2454 
   2455 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
   2456 	env = tag.vio_subtype_env;
   2457 
   2458 	/* stale pkt check */
   2459 	if (ctaskp->hss_id < ldcp->hss_id) {
   2460 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
   2461 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
   2462 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
   2463 		return;
   2464 	}
   2465 
   2466 	/* session id check */
   2467 	if (ldcp->session_status & VSW_PEER_SESSION) {
   2468 		if (ldcp->peer_session != tag.vio_sid) {
   2469 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
   2470 			    __func__, ldcp->ldc_id, tag.vio_sid);
   2471 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
   2472 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   2473 			return;
   2474 		}
   2475 	}
   2476 
   2477 	/*
   2478 	 * Switch on vio_subtype envelope, then let lower routines
   2479 	 * decide if its an INFO, ACK or NACK packet.
   2480 	 */
   2481 	switch (env) {
   2482 	case VIO_VER_INFO:
   2483 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
   2484 		break;
   2485 	case VIO_DRING_REG:
   2486 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
   2487 		break;
   2488 	case VIO_DRING_UNREG:
   2489 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
   2490 		break;
   2491 	case VIO_ATTR_INFO:
   2492 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
   2493 		break;
   2494 	case VNET_MCAST_INFO:
   2495 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
   2496 		break;
   2497 	case VIO_RDX:
   2498 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
   2499 		break;
   2500 	case VIO_DDS_INFO:
   2501 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
   2502 		break;
   2503 
   2504 	case VNET_PHYSLINK_INFO:
   2505 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
   2506 		break;
   2507 	default:
   2508 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
   2509 	}
   2510 
   2511 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
   2512 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   2513 }
   2514 
   2515 /*
   2516  * Version negotiation. We can end up here either because our peer
   2517  * has responded to a handshake message we have sent it, or our peer
   2518  * has initiated a handshake with us. If its the former then can only
   2519  * be ACK or NACK, if its the later can only be INFO.
   2520  *
   2521  * If its an ACK we move to the next stage of the handshake, namely
   2522  * attribute exchange. If its a NACK we see if we can specify another
   2523  * version, if we can't we stop.
   2524  *
   2525  * If it is an INFO we reset all params associated with communication
   2526  * in that direction over this channel (remember connection is
   2527  * essentially 2 independent simplex channels).
   2528  */
   2529 void
   2530 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
   2531 {
   2532 	vio_ver_msg_t	*ver_pkt;
   2533 	vsw_t 		*vswp = ldcp->ldc_vswp;
   2534 
   2535 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   2536 
   2537 	/*
   2538 	 * We know this is a ctrl/version packet so
   2539 	 * cast it into the correct structure.
   2540 	 */
   2541 	ver_pkt = (vio_ver_msg_t *)pkt;
   2542 
   2543 	switch (ver_pkt->tag.vio_subtype) {
   2544 	case VIO_SUBTYPE_INFO:
   2545 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
   2546 
   2547 		/*
   2548 		 * Record the session id, which we will use from now
   2549 		 * until we see another VER_INFO msg. Even then the
   2550 		 * session id in most cases will be unchanged, execpt
   2551 		 * if channel was reset.
   2552 		 */
   2553 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
   2554 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
   2555 			DERR(vswp, "%s: updating session id for chan %lld "
   2556 			    "from %llx to %llx", __func__, ldcp->ldc_id,
   2557 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
   2558 		}
   2559 
   2560 		ldcp->peer_session = ver_pkt->tag.vio_sid;
   2561 		ldcp->session_status |= VSW_PEER_SESSION;
   2562 
   2563 		/* Legal message at this time ? */
   2564 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
   2565 			return;
   2566 
   2567 		/*
   2568 		 * First check the device class. Currently only expect
   2569 		 * to be talking to a network device. In the future may
   2570 		 * also talk to another switch.
   2571 		 */
   2572 		if (ver_pkt->dev_class != VDEV_NETWORK) {
   2573 			DERR(vswp, "%s: illegal device class %d", __func__,
   2574 			    ver_pkt->dev_class);
   2575 
   2576 			ver_pkt->tag.vio_sid = ldcp->local_session;
   2577 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   2578 
   2579 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
   2580 
   2581 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
   2582 			    sizeof (vio_ver_msg_t), B_TRUE);
   2583 
   2584 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
   2585 			vsw_next_milestone(ldcp);
   2586 			return;
   2587 		} else {
   2588 			ldcp->dev_class = ver_pkt->dev_class;
   2589 		}
   2590 
   2591 		/*
   2592 		 * Now check the version.
   2593 		 */
   2594 		if (vsw_supported_version(ver_pkt) == 0) {
   2595 			/*
   2596 			 * Support this major version and possibly
   2597 			 * adjusted minor version.
   2598 			 */
   2599 
   2600 			D2(vswp, "%s: accepted ver %d:%d", __func__,
   2601 			    ver_pkt->ver_major, ver_pkt->ver_minor);
   2602 
   2603 			/* Store accepted values */
   2604 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
   2605 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
   2606 
   2607 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   2608 
   2609 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
   2610 
   2611 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
   2612 				/*
   2613 				 * Send a version info message
   2614 				 * using the accepted version that
   2615 				 * we are about to ack. Also note that
   2616 				 * we send our ver info before we ack.
   2617 				 * Otherwise, as soon as receiving the
   2618 				 * ack, obp sends attr info msg, which
   2619 				 * breaks vsw_check_flag() invoked
   2620 				 * from vsw_process_ctrl_attr_pkt();
   2621 				 * as we also need VSW_VER_ACK_RECV to
   2622 				 * be set in lane_out.lstate, before
   2623 				 * we can receive attr info.
   2624 				 */
   2625 				vsw_send_ver(ldcp);
   2626 			}
   2627 		} else {
   2628 			/*
   2629 			 * NACK back with the next lower major/minor
   2630 			 * pairing we support (if don't suuport any more
   2631 			 * versions then they will be set to zero.
   2632 			 */
   2633 
   2634 			D2(vswp, "%s: replying with ver %d:%d", __func__,
   2635 			    ver_pkt->ver_major, ver_pkt->ver_minor);
   2636 
   2637 			/* Store updated values */
   2638 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
   2639 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
   2640 
   2641 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   2642 
   2643 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
   2644 		}
   2645 
   2646 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
   2647 		ver_pkt->tag.vio_sid = ldcp->local_session;
   2648 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
   2649 		    sizeof (vio_ver_msg_t), B_TRUE);
   2650 
   2651 		vsw_next_milestone(ldcp);
   2652 		break;
   2653 
   2654 	case VIO_SUBTYPE_ACK:
   2655 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
   2656 
   2657 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
   2658 			return;
   2659 
   2660 		/* Store updated values */
   2661 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
   2662 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
   2663 
   2664 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
   2665 		vsw_next_milestone(ldcp);
   2666 
   2667 		break;
   2668 
   2669 	case VIO_SUBTYPE_NACK:
   2670 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
   2671 
   2672 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
   2673 			return;
   2674 
   2675 		/*
   2676 		 * If our peer sent us a NACK with the ver fields set to
   2677 		 * zero then there is nothing more we can do. Otherwise see
   2678 		 * if we support either the version suggested, or a lesser
   2679 		 * one.
   2680 		 */
   2681 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
   2682 			DERR(vswp, "%s: peer unable to negotiate any "
   2683 			    "further.", __func__);
   2684 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
   2685 			vsw_next_milestone(ldcp);
   2686 			return;
   2687 		}
   2688 
   2689 		/*
   2690 		 * Check to see if we support this major version or
   2691 		 * a lower one. If we don't then maj/min will be set
   2692 		 * to zero.
   2693 		 */
   2694 		(void) vsw_supported_version(ver_pkt);
   2695 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
   2696 			/* Nothing more we can do */
   2697 			DERR(vswp, "%s: version negotiation failed.\n",
   2698 			    __func__);
   2699 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
   2700 			vsw_next_milestone(ldcp);
   2701 		} else {
   2702 			/* found a supported major version */
   2703 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
   2704 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
   2705 
   2706 			D2(vswp, "%s: resending with updated values (%x, %x)",
   2707 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
   2708 
   2709 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
   2710 			ver_pkt->tag.vio_sid = ldcp->local_session;
   2711 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
   2712 
   2713 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
   2714 
   2715 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
   2716 			    sizeof (vio_ver_msg_t), B_TRUE);
   2717 
   2718 			vsw_next_milestone(ldcp);
   2719 
   2720 		}
   2721 		break;
   2722 
   2723 	default:
   2724 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
   2725 		    ver_pkt->tag.vio_subtype);
   2726 	}
   2727 
   2728 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
   2729 }
   2730 
   2731 /*
   2732  * Process an attribute packet. We can end up here either because our peer
   2733  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
   2734  * peer has sent us an attribute INFO message
   2735  *
   2736  * If its an ACK we then move to the next stage of the handshake which
   2737  * is to send our descriptor ring info to our peer. If its a NACK then
   2738  * there is nothing more we can (currently) do.
   2739  *
   2740  * If we get a valid/acceptable INFO packet (and we have already negotiated
   2741  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
   2742  * NACK back and reset channel state to INACTIV.
   2743  *
   2744  * FUTURE: in time we will probably negotiate over attributes, but for
   2745  * the moment unacceptable attributes are regarded as a fatal error.
   2746  *
   2747  */
   2748 void
   2749 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
   2750 {
   2751 	vnet_attr_msg_t		*attr_pkt;
   2752 	vsw_t			*vswp = ldcp->ldc_vswp;
   2753 	vsw_port_t		*port = ldcp->ldc_port;
   2754 	uint64_t		macaddr = 0;
   2755 	lane_t			*lane_out = &ldcp->lane_out;
   2756 	lane_t			*lane_in = &ldcp->lane_in;
   2757 	uint32_t		mtu;
   2758 	boolean_t		ack = B_TRUE;
   2759 	int			i;
   2760 
   2761 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
   2762 
   2763 	/*
   2764 	 * We know this is a ctrl/attr packet so
   2765 	 * cast it into the correct structure.
   2766 	 */
   2767 	attr_pkt = (vnet_attr_msg_t *)pkt;
   2768 
   2769 	switch (attr_pkt->tag.vio_subtype) {
   2770 	case VIO_SUBTYPE_INFO:
   2771 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   2772 
   2773 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
   2774 			return;
   2775 
   2776 		/*
   2777 		 * If the attributes are unacceptable then we NACK back.
   2778 		 */
   2779 		if (vsw_check_attr(attr_pkt, ldcp)) {
   2780 			ack = B_FALSE;
   2781 
   2782 			DERR(vswp, "%s (chan %d): invalid attributes",
   2783 			    __func__, ldcp->ldc_id);
   2784 
   2785 		} else {
   2786 
   2787 			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
   2788 				/*
   2789 				 * Versions >= 1.4:
   2790 				 * The mtu is negotiated down to the
   2791 				 * minimum of our mtu and peer's mtu.
   2792 				 */
   2793 				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
   2794 
   2795 				/*
   2796 				 * If we have received an ack for the attr info
   2797 				 * that we sent, then check if the mtu computed
   2798 				 * above matches the mtu that the peer had ack'd
   2799 				 * (saved in local hparams). If they don't
   2800 				 * match, we fail the handshake.
   2801 				 */
   2802 				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
   2803 					if (mtu != lane_out->mtu) {
   2804 						/* send NACK */
   2805 						ack = B_FALSE;
   2806 					}
   2807 				} else {
   2808 					/*
   2809 					 * Save the mtu computed above in our
   2810 					 * attr parameters, so it gets sent in
   2811 					 * the attr info from us to the peer.
   2812 					 */
   2813 					lane_out->mtu = mtu;
   2814 				}
   2815 			}
   2816 
   2817 		}
   2818 
   2819 		if (ack == B_FALSE) {
   2820 
   2821 			vsw_free_lane_resources(ldcp, INBOUND);
   2822 
   2823 			attr_pkt->tag.vio_sid = ldcp->local_session;
   2824 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   2825 
   2826 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
   2827 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
   2828 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
   2829 			    sizeof (vnet_attr_msg_t), B_TRUE);
   2830 
   2831 			vsw_next_milestone(ldcp);
   2832 			return;
   2833 		}
   2834 
   2835 		/*
   2836 		 * Otherwise store attributes for this lane and update
   2837 		 * lane state.
   2838 		 */
   2839 		lane_in->mtu = attr_pkt->mtu;
   2840 		lane_in->addr = attr_pkt->addr;
   2841 		lane_in->addr_type = attr_pkt->addr_type;
   2842 		lane_in->xfer_mode = attr_pkt->xfer_mode;
   2843 		lane_in->ack_freq = attr_pkt->ack_freq;
   2844 		lane_in->physlink_update = attr_pkt->physlink_update;
   2845 
   2846 		/*
   2847 		 * Check if the client has requested physlink state updates.
   2848 		 * If there is a physical device bound to this vswitch (L2
   2849 		 * mode), set the ack bits to indicate it is supported.
   2850 		 * Otherwise, set the nack bits.
   2851 		 */
   2852 		if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
   2853 
   2854 			/* Does the vnet need phys link state updates ? */
   2855 			if ((lane_in->physlink_update &
   2856 			    PHYSLINK_UPDATE_STATE_MASK) ==
   2857 			    PHYSLINK_UPDATE_STATE) {
   2858 
   2859 				if (vswp->smode & VSW_LAYER2) {
   2860 					/* is a net-dev assigned to us ? */
   2861 					attr_pkt->physlink_update =
   2862 					    PHYSLINK_UPDATE_STATE_ACK;
   2863 					ldcp->pls_negotiated = B_TRUE;
   2864 				} else {
   2865 					/* not in L2 mode */
   2866 					attr_pkt->physlink_update =
   2867 					    PHYSLINK_UPDATE_STATE_NACK;
   2868 					ldcp->pls_negotiated = B_FALSE;
   2869 				}
   2870 
   2871 			} else {
   2872 				attr_pkt->physlink_update =
   2873 				    PHYSLINK_UPDATE_NONE;
   2874 				ldcp->pls_negotiated = B_FALSE;
   2875 			}
   2876 
   2877 		} else {
   2878 			/*
   2879 			 * physlink_update bits are ignored
   2880 			 * if set by clients < v1.5 protocol.
   2881 			 */
   2882 			attr_pkt->physlink_update = PHYSLINK_UPDATE_NONE;
   2883 			ldcp->pls_negotiated = B_FALSE;
   2884 		}
   2885 
   2886 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
   2887 			/* save the MIN mtu in the msg to be replied */
   2888 			attr_pkt->mtu = mtu;
   2889 		}
   2890 
   2891 		macaddr = lane_in->addr;
   2892 		for (i = ETHERADDRL - 1; i >= 0; i--) {
   2893 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
   2894 			macaddr >>= 8;
   2895 		}
   2896 
   2897 		/* create the fdb entry for this port/mac address */
   2898 		vsw_fdbe_add(vswp, port);
   2899 
   2900 		/* add the port to the specified vlans */
   2901 		vsw_vlan_add_ids(port, VSW_VNETPORT);
   2902 
   2903 		/* setup device specifc xmit routines */
   2904 		mutex_enter(&port->tx_lock);
   2905 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
   2906 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
   2907 		    (VSW_VER_LT(ldcp, 1, 2) &&
   2908 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
   2909 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
   2910 			port->transmit = vsw_dringsend;
   2911 		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
   2912 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
   2913 			vsw_create_privring(ldcp);
   2914 			port->transmit = vsw_descrsend;
   2915 			lane_out->xfer_mode = VIO_DESC_MODE;
   2916 		}
   2917 
   2918 		/*
   2919 		 * HybridIO is supported only vnet, not by OBP.
   2920 		 * So, set hio_capable to true only when in DRING mode.
   2921 		 */
   2922 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
   2923 		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
   2924 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
   2925 		} else {
   2926 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
   2927 		}
   2928 
   2929 		mutex_exit(&port->tx_lock);
   2930 
   2931 		attr_pkt->tag.vio_sid = ldcp->local_session;
   2932 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   2933 
   2934 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
   2935 
   2936 		lane_in->lstate |= VSW_ATTR_ACK_SENT;
   2937 
   2938 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
   2939 		    sizeof (vnet_attr_msg_t), B_TRUE);
   2940 
   2941 		vsw_next_milestone(ldcp);
   2942 		break;
   2943 
   2944 	case VIO_SUBTYPE_ACK:
   2945 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   2946 
   2947 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
   2948 			return;
   2949 
   2950 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
   2951 			/*
   2952 			 * Versions >= 1.4:
   2953 			 * The ack msg sent by the peer contains the minimum of
   2954 			 * our mtu (that we had sent in our attr info) and the
   2955 			 * peer's mtu.
   2956 			 *
   2957 			 * If we have sent an ack for the attr info msg from
   2958 			 * the peer, check if the mtu that was computed then
   2959 			 * (saved in lane_out params) matches the mtu that the
   2960 			 * peer has ack'd. If they don't match, we fail the
   2961 			 * handshake.
   2962 			 */
   2963 			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
   2964 				if (lane_out->mtu != attr_pkt->mtu) {
   2965 					return;
   2966 				}
   2967 			} else {
   2968 				/*
   2969 				 * If the mtu ack'd by the peer is > our mtu
   2970 				 * fail handshake. Otherwise, save the mtu, so
   2971 				 * we can validate it when we receive attr info
   2972 				 * from our peer.
   2973 				 */
   2974 				if (attr_pkt->mtu > lane_out->mtu) {
   2975 					return;
   2976 				}
   2977 				if (attr_pkt->mtu <= lane_out->mtu) {
   2978 					lane_out->mtu = attr_pkt->mtu;
   2979 				}
   2980 			}
   2981 		}
   2982 
   2983 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
   2984 		vsw_next_milestone(ldcp);
   2985 		break;
   2986 
   2987 	case VIO_SUBTYPE_NACK:
   2988 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   2989 
   2990 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
   2991 			return;
   2992 
   2993 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
   2994 		vsw_next_milestone(ldcp);
   2995 		break;
   2996 
   2997 	default:
   2998 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
   2999 		    attr_pkt->tag.vio_subtype);
   3000 	}
   3001 
   3002 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
   3003 }
   3004 
   3005 /*
   3006  * Process a dring info packet. We can end up here either because our peer
   3007  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
   3008  * peer has sent us a dring INFO message.
   3009  *
   3010  * If we get a valid/acceptable INFO packet (and we have already negotiated
   3011  * a version) we ACK back and update the lane state, otherwise we NACK back.
   3012  *
   3013  * FUTURE: nothing to stop client from sending us info on multiple dring's
   3014  * but for the moment we will just use the first one we are given.
   3015  *
   3016  */
   3017 void
   3018 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
   3019 {
   3020 	vio_dring_reg_msg_t	*dring_pkt;
   3021 	vsw_t			*vswp = ldcp->ldc_vswp;
   3022 	ldc_mem_info_t		minfo;
   3023 	dring_info_t		*dp, *dbp;
   3024 	int			dring_found = 0;
   3025 
   3026 	/*
   3027 	 * We know this is a ctrl/dring packet so
   3028 	 * cast it into the correct structure.
   3029 	 */
   3030 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
   3031 
   3032 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
   3033 
   3034 	switch (dring_pkt->tag.vio_subtype) {
   3035 	case VIO_SUBTYPE_INFO:
   3036 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   3037 
   3038 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
   3039 			return;
   3040 
   3041 		/*
   3042 		 * If the dring params are unacceptable then we NACK back.
   3043 		 */
   3044 		if (vsw_check_dring_info(dring_pkt)) {
   3045 
   3046 			DERR(vswp, "%s (%lld): invalid dring info",
   3047 			    __func__, ldcp->ldc_id);
   3048 
   3049 			vsw_free_lane_resources(ldcp, INBOUND);
   3050 
   3051 			dring_pkt->tag.vio_sid = ldcp->local_session;
   3052 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   3053 
   3054 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
   3055 
   3056 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
   3057 
   3058 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
   3059 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
   3060 
   3061 			vsw_next_milestone(ldcp);
   3062 			return;
   3063 		}
   3064 
   3065 		/*
   3066 		 * Otherwise, attempt to map in the dring using the
   3067 		 * cookie. If that succeeds we send back a unique dring
   3068 		 * identifier that the sending side will use in future
   3069 		 * to refer to this descriptor ring.
   3070 		 */
   3071 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
   3072 
   3073 		dp->num_descriptors = dring_pkt->num_descriptors;
   3074 		dp->descriptor_size = dring_pkt->descriptor_size;
   3075 		dp->options = dring_pkt->options;
   3076 		dp->ncookies = dring_pkt->ncookies;
   3077 
   3078 		/*
   3079 		 * Note: should only get one cookie. Enforced in
   3080 		 * the ldc layer.
   3081 		 */
   3082 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
   3083 		    sizeof (ldc_mem_cookie_t));
   3084 
   3085 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
   3086 		    dp->num_descriptors, dp->descriptor_size);
   3087 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
   3088 		    dp->options, dp->ncookies);
   3089 
   3090 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
   3091 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
   3092 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
   3093 
   3094 			DERR(vswp, "%s: dring_map failed\n", __func__);
   3095 
   3096 			kmem_free(dp, sizeof (dring_info_t));
   3097 			vsw_free_lane_resources(ldcp, INBOUND);
   3098 
   3099 			dring_pkt->tag.vio_sid = ldcp->local_session;
   3100 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   3101 
   3102 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
   3103 
   3104 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
   3105 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
   3106 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
   3107 
   3108 			vsw_next_milestone(ldcp);
   3109 			return;
   3110 		}
   3111 
   3112 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
   3113 
   3114 			DERR(vswp, "%s: dring_addr failed\n", __func__);
   3115 
   3116 			kmem_free(dp, sizeof (dring_info_t));
   3117 			vsw_free_lane_resources(ldcp, INBOUND);
   3118 
   3119 			dring_pkt->tag.vio_sid = ldcp->local_session;
   3120 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
   3121 
   3122 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
   3123 
   3124 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
   3125 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
   3126 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
   3127 
   3128 			vsw_next_milestone(ldcp);
   3129 			return;
   3130 		} else {
   3131 			/* store the address of the pub part of ring */
   3132 			dp->pub_addr = minfo.vaddr;
   3133 
   3134 			/* cache the dring mtype */
   3135 			dp->dring_mtype = minfo.mtype;
   3136 		}
   3137 
   3138 		/* no private section as we are importing */
   3139 		dp->priv_addr = NULL;
   3140 
   3141 		/*
   3142 		 * Using simple mono increasing int for ident at
   3143 		 * the moment.
   3144 		 */
   3145 		dp->ident = ldcp->next_ident;
   3146 		ldcp->next_ident++;
   3147 
   3148 		dp->end_idx = 0;
   3149 		dp->next = NULL;
   3150 
   3151 		/*
   3152 		 * Link it onto the end of the list of drings
   3153 		 * for this lane.
   3154 		 */
   3155 		if (ldcp->lane_in.dringp == NULL) {
   3156 			D2(vswp, "%s: adding first INBOUND dring", __func__);
   3157 			ldcp->lane_in.dringp = dp;
   3158 		} else {
   3159 			dbp = ldcp->lane_in.dringp;
   3160 
   3161 			while (dbp->next != NULL)
   3162 				dbp = dbp->next;
   3163 
   3164 			dbp->next = dp;
   3165 		}
   3166 
   3167 		/* acknowledge it */
   3168 		dring_pkt->tag.vio_sid = ldcp->local_session;
   3169 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   3170 		dring_pkt->dring_ident = dp->ident;
   3171 
   3172 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
   3173 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
   3174 
   3175 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
   3176 		vsw_next_milestone(ldcp);
   3177 		break;
   3178 
   3179 	case VIO_SUBTYPE_ACK:
   3180 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   3181 
   3182 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
   3183 			return;
   3184 
   3185 		/*
   3186 		 * Peer is acknowledging our dring info and will have
   3187 		 * sent us a dring identifier which we will use to
   3188 		 * refer to this ring w.r.t. our peer.
   3189 		 */
   3190 		dp = ldcp->lane_out.dringp;
   3191 		if (dp != NULL) {
   3192 			/*
   3193 			 * Find the ring this ident should be associated
   3194 			 * with.
   3195 			 */
   3196 			if (vsw_dring_match(dp, dring_pkt)) {
   3197 				dring_found = 1;
   3198 
   3199 			} else while (dp != NULL) {
   3200 				if (vsw_dring_match(dp, dring_pkt)) {
   3201 					dring_found = 1;
   3202 					break;
   3203 				}
   3204 				dp = dp->next;
   3205 			}
   3206 
   3207 			if (dring_found == 0) {
   3208 				DERR(NULL, "%s: unrecognised ring cookie",
   3209 				    __func__);
   3210 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3211 				return;
   3212 			}
   3213 
   3214 		} else {
   3215 			DERR(vswp, "%s: DRING ACK received but no drings "
   3216 			    "allocated", __func__);
   3217 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3218 			return;
   3219 		}
   3220 
   3221 		/* store ident */
   3222 		dp->ident = dring_pkt->dring_ident;
   3223 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
   3224 		vsw_next_milestone(ldcp);
   3225 		break;
   3226 
   3227 	case VIO_SUBTYPE_NACK:
   3228 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   3229 
   3230 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
   3231 			return;
   3232 
   3233 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
   3234 		vsw_next_milestone(ldcp);
   3235 		break;
   3236 
   3237 	default:
   3238 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
   3239 		    dring_pkt->tag.vio_subtype);
   3240 	}
   3241 
   3242 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
   3243 }
   3244 
   3245 /*
   3246  * Process a request from peer to unregister a dring.
   3247  *
   3248  * For the moment we just restart the handshake if our
   3249  * peer endpoint attempts to unregister a dring.
   3250  */
   3251 void
   3252 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
   3253 {
   3254 	vsw_t			*vswp = ldcp->ldc_vswp;
   3255 	vio_dring_unreg_msg_t	*dring_pkt;
   3256 
   3257 	/*
   3258 	 * We know this is a ctrl/dring packet so
   3259 	 * cast it into the correct structure.
   3260 	 */
   3261 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
   3262 
   3263 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   3264 
   3265 	switch (dring_pkt->tag.vio_subtype) {
   3266 	case VIO_SUBTYPE_INFO:
   3267 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   3268 
   3269 		DWARN(vswp, "%s: restarting handshake..", __func__);
   3270 		break;
   3271 
   3272 	case VIO_SUBTYPE_ACK:
   3273 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   3274 
   3275 		DWARN(vswp, "%s: restarting handshake..", __func__);
   3276 		break;
   3277 
   3278 	case VIO_SUBTYPE_NACK:
   3279 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   3280 
   3281 		DWARN(vswp, "%s: restarting handshake..", __func__);
   3282 		break;
   3283 
   3284 	default:
   3285 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
   3286 		    dring_pkt->tag.vio_subtype);
   3287 	}
   3288 
   3289 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3290 
   3291 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   3292 }
   3293 
   3294 #define	SND_MCST_NACK(ldcp, pkt) \
   3295 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
   3296 	pkt->tag.vio_sid = ldcp->local_session; \
   3297 	(void) vsw_send_msg(ldcp, (void *)pkt, \
   3298 			sizeof (vnet_mcast_msg_t), B_TRUE);
   3299 
   3300 /*
   3301  * Process a multicast request from a vnet.
   3302  *
   3303  * Vnet's specify a multicast address that they are interested in. This
   3304  * address is used as a key into the hash table which forms the multicast
   3305  * forwarding database (mFDB).
   3306  *
   3307  * The table keys are the multicast addresses, while the table entries
   3308  * are pointers to lists of ports which wish to receive packets for the
   3309  * specified multicast address.
   3310  *
   3311  * When a multicast packet is being switched we use the address as a key
   3312  * into the hash table, and then walk the appropriate port list forwarding
   3313  * the pkt to each port in turn.
   3314  *
   3315  * If a vnet is no longer interested in a particular multicast grouping
   3316  * we simply find the correct location in the hash table and then delete
   3317  * the relevant port from the port list.
   3318  *
   3319  * To deal with the case whereby a port is being deleted without first
   3320  * removing itself from the lists in the hash table, we maintain a list
   3321  * of multicast addresses the port has registered an interest in, within
   3322  * the port structure itself. We then simply walk that list of addresses
   3323  * using them as keys into the hash table and remove the port from the
   3324  * appropriate lists.
   3325  */
   3326 static void
   3327 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
   3328 {
   3329 	vnet_mcast_msg_t	*mcst_pkt;
   3330 	vsw_port_t		*port = ldcp->ldc_port;
   3331 	vsw_t			*vswp = ldcp->ldc_vswp;
   3332 	int			i;
   3333 
   3334 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   3335 
   3336 	/*
   3337 	 * We know this is a ctrl/mcast packet so
   3338 	 * cast it into the correct structure.
   3339 	 */
   3340 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
   3341 
   3342 	switch (mcst_pkt->tag.vio_subtype) {
   3343 	case VIO_SUBTYPE_INFO:
   3344 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   3345 
   3346 		/*
   3347 		 * Check if in correct state to receive a multicast
   3348 		 * message (i.e. handshake complete). If not reset
   3349 		 * the handshake.
   3350 		 */
   3351 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
   3352 			return;
   3353 
   3354 		/*
   3355 		 * Before attempting to add or remove address check
   3356 		 * that they are valid multicast addresses.
   3357 		 * If not, then NACK back.
   3358 		 */
   3359 		for (i = 0; i < mcst_pkt->count; i++) {
   3360 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
   3361 				DERR(vswp, "%s: invalid multicast address",
   3362 				    __func__);
   3363 				SND_MCST_NACK(ldcp, mcst_pkt);
   3364 				return;
   3365 			}
   3366 		}
   3367 
   3368 		/*
   3369 		 * Now add/remove the addresses. If this fails we
   3370 		 * NACK back.
   3371 		 */
   3372 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
   3373 			SND_MCST_NACK(ldcp, mcst_pkt);
   3374 			return;
   3375 		}
   3376 
   3377 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   3378 		mcst_pkt->tag.vio_sid = ldcp->local_session;
   3379 
   3380 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
   3381 
   3382 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
   3383 		    sizeof (vnet_mcast_msg_t), B_TRUE);
   3384 		break;
   3385 
   3386 	case VIO_SUBTYPE_ACK:
   3387 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   3388 
   3389 		/*
   3390 		 * We shouldn't ever get a multicast ACK message as
   3391 		 * at the moment we never request multicast addresses
   3392 		 * to be set on some other device. This may change in
   3393 		 * the future if we have cascading switches.
   3394 		 */
   3395 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
   3396 			return;
   3397 
   3398 				/* Do nothing */
   3399 		break;
   3400 
   3401 	case VIO_SUBTYPE_NACK:
   3402 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   3403 
   3404 		/*
   3405 		 * We shouldn't get a multicast NACK packet for the
   3406 		 * same reasons as we shouldn't get a ACK packet.
   3407 		 */
   3408 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
   3409 			return;
   3410 
   3411 				/* Do nothing */
   3412 		break;
   3413 
   3414 	default:
   3415 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
   3416 		    mcst_pkt->tag.vio_subtype);
   3417 	}
   3418 
   3419 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   3420 }
   3421 
   3422 static void
   3423 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
   3424 {
   3425 	vio_rdx_msg_t	*rdx_pkt;
   3426 	vsw_t		*vswp = ldcp->ldc_vswp;
   3427 
   3428 	/*
   3429 	 * We know this is a ctrl/rdx packet so
   3430 	 * cast it into the correct structure.
   3431 	 */
   3432 	rdx_pkt = (vio_rdx_msg_t *)pkt;
   3433 
   3434 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
   3435 
   3436 	switch (rdx_pkt->tag.vio_subtype) {
   3437 	case VIO_SUBTYPE_INFO:
   3438 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   3439 
   3440 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
   3441 			return;
   3442 
   3443 		rdx_pkt->tag.vio_sid = ldcp->local_session;
   3444 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   3445 
   3446 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
   3447 
   3448 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
   3449 
   3450 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
   3451 		    sizeof (vio_rdx_msg_t), B_TRUE);
   3452 
   3453 		vsw_next_milestone(ldcp);
   3454 		break;
   3455 
   3456 	case VIO_SUBTYPE_ACK:
   3457 		/*
   3458 		 * Should be handled in-band by callback handler.
   3459 		 */
   3460 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
   3461 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3462 		break;
   3463 
   3464 	case VIO_SUBTYPE_NACK:
   3465 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   3466 
   3467 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
   3468 			return;
   3469 
   3470 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
   3471 		vsw_next_milestone(ldcp);
   3472 		break;
   3473 
   3474 	default:
   3475 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
   3476 		    rdx_pkt->tag.vio_subtype);
   3477 	}
   3478 
   3479 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   3480 }
   3481 
   3482 static void
   3483 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
   3484 {
   3485 	vnet_physlink_msg_t	*msgp;
   3486 	vsw_t			*vswp = ldcp->ldc_vswp;
   3487 
   3488 	msgp = (vnet_physlink_msg_t *)pkt;
   3489 
   3490 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
   3491 
   3492 	switch (msgp->tag.vio_subtype) {
   3493 	case VIO_SUBTYPE_INFO:
   3494 
   3495 		/* vsw shouldn't recv physlink info */
   3496 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
   3497 		break;
   3498 
   3499 	case VIO_SUBTYPE_ACK:
   3500 
   3501 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   3502 		break;
   3503 
   3504 	case VIO_SUBTYPE_NACK:
   3505 
   3506 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   3507 		break;
   3508 
   3509 	default:
   3510 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
   3511 		    msgp->tag.vio_subtype);
   3512 	}
   3513 
   3514 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   3515 }
   3516 
   3517 static void
   3518 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
   3519 	uint32_t msglen)
   3520 {
   3521 	uint16_t	env = tagp->vio_subtype_env;
   3522 	vsw_t		*vswp = ldcp->ldc_vswp;
   3523 
   3524 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   3525 
   3526 	/* session id check */
   3527 	if (ldcp->session_status & VSW_PEER_SESSION) {
   3528 		if (ldcp->peer_session != tagp->vio_sid) {
   3529 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
   3530 			    __func__, ldcp->ldc_id, tagp->vio_sid);
   3531 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3532 			return;
   3533 		}
   3534 	}
   3535 
   3536 	/*
   3537 	 * It is an error for us to be getting data packets
   3538 	 * before the handshake has completed.
   3539 	 */
   3540 	if (ldcp->hphase != VSW_MILESTONE4) {
   3541 		DERR(vswp, "%s: got data packet before handshake complete "
   3542 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
   3543 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
   3544 		DUMP_FLAGS(ldcp->lane_in.lstate);
   3545 		DUMP_FLAGS(ldcp->lane_out.lstate);
   3546 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3547 		return;
   3548 	}
   3549 
   3550 	/*
   3551 	 * To reduce the locking contention, release the
   3552 	 * ldc_cblock here and re-acquire it once we are done
   3553 	 * receiving packets.
   3554 	 */
   3555 	mutex_exit(&ldcp->ldc_cblock);
   3556 	mutex_enter(&ldcp->ldc_rxlock);
   3557 
   3558 	/*
   3559 	 * Switch on vio_subtype envelope, then let lower routines
   3560 	 * decide if its an INFO, ACK or NACK packet.
   3561 	 */
   3562 	if (env == VIO_DRING_DATA) {
   3563 		vsw_process_data_dring_pkt(ldcp, dpkt);
   3564 	} else if (env == VIO_PKT_DATA) {
   3565 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
   3566 	} else if (env == VIO_DESC_DATA) {
   3567 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
   3568 	} else {
   3569 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
   3570 	}
   3571 
   3572 	mutex_exit(&ldcp->ldc_rxlock);
   3573 	mutex_enter(&ldcp->ldc_cblock);
   3574 
   3575 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   3576 }
   3577 
   3578 #define	SND_DRING_NACK(ldcp, pkt) \
   3579 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
   3580 	pkt->tag.vio_sid = ldcp->local_session; \
   3581 	(void) vsw_send_msg(ldcp, (void *)pkt, \
   3582 			sizeof (vio_dring_msg_t), B_TRUE);
   3583 
   3584 static void
   3585 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
   3586 {
   3587 	vio_dring_msg_t		*dring_pkt;
   3588 	vnet_public_desc_t	desc, *pub_addr = NULL;
   3589 	vsw_private_desc_t	*priv_addr = NULL;
   3590 	dring_info_t		*dp = NULL;
   3591 	vsw_t			*vswp = ldcp->ldc_vswp;
   3592 	mblk_t			*mp = NULL;
   3593 	mblk_t			*bp = NULL;
   3594 	mblk_t			*bpt = NULL;
   3595 	size_t			nbytes = 0;
   3596 	uint64_t		chain = 0;
   3597 	uint64_t		len;
   3598 	uint32_t		pos, start;
   3599 	uint32_t		range_start, range_end;
   3600 	int32_t			end, num, cnt = 0;
   3601 	int			i, rv, rng_rv = 0, msg_rv = 0;
   3602 	boolean_t		prev_desc_ack = B_FALSE;
   3603 	int			read_attempts = 0;
   3604 	struct ether_header	*ehp;
   3605 	lane_t			*lp = &ldcp->lane_out;
   3606 
   3607 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   3608 
   3609 	/*
   3610 	 * We know this is a data/dring packet so
   3611 	 * cast it into the correct structure.
   3612 	 */
   3613 	dring_pkt = (vio_dring_msg_t *)dpkt;
   3614 
   3615 	/*
   3616 	 * Switch on the vio_subtype. If its INFO then we need to
   3617 	 * process the data. If its an ACK we need to make sure
   3618 	 * it makes sense (i.e did we send an earlier data/info),
   3619 	 * and if its a NACK then we maybe attempt a retry.
   3620 	 */
   3621 	switch (dring_pkt->tag.vio_subtype) {
   3622 	case VIO_SUBTYPE_INFO:
   3623 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
   3624 
   3625 		READ_ENTER(&ldcp->lane_in.dlistrw);
   3626 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
   3627 		    dring_pkt->dring_ident)) == NULL) {
   3628 			RW_EXIT(&ldcp->lane_in.dlistrw);
   3629 
   3630 			DERR(vswp, "%s(%lld): unable to find dring from "
   3631 			    "ident 0x%llx", __func__, ldcp->ldc_id,
   3632 			    dring_pkt->dring_ident);
   3633 
   3634 			SND_DRING_NACK(ldcp, dring_pkt);
   3635 			return;
   3636 		}
   3637 
   3638 		start = pos = dring_pkt->start_idx;
   3639 		end = dring_pkt->end_idx;
   3640 		len = dp->num_descriptors;
   3641 
   3642 		range_start = range_end = pos;
   3643 
   3644 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
   3645 		    __func__, ldcp->ldc_id, start, end);
   3646 
   3647 		if (end == -1) {
   3648 			num = -1;
   3649 		} else if (end >= 0) {
   3650 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
   3651 
   3652 			/* basic sanity check */
   3653 			if (end > len) {
   3654 				RW_EXIT(&ldcp->lane_in.dlistrw);
   3655 				DERR(vswp, "%s(%lld): endpoint %lld outside "
   3656 				    "ring length %lld", __func__,
   3657 				    ldcp->ldc_id, end, len);
   3658 
   3659 				SND_DRING_NACK(ldcp, dring_pkt);
   3660 				return;
   3661 			}
   3662 		} else {
   3663 			RW_EXIT(&ldcp->lane_in.dlistrw);
   3664 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
   3665 			    __func__, ldcp->ldc_id, end);
   3666 			SND_DRING_NACK(ldcp, dring_pkt);
   3667 			return;
   3668 		}
   3669 
   3670 		while (cnt != num) {
   3671 vsw_recheck_desc:
   3672 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
   3673 
   3674 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
   3675 			    &desc, dp->dring_mtype, dp->handle,
   3676 			    pos, pos)) != 0) {
   3677 				DERR(vswp, "%s(%lld): unable to copy "
   3678 				    "descriptor at pos %d: err %d",
   3679 				    __func__, pos, ldcp->ldc_id, rng_rv);
   3680 				ldcp->ldc_stats.ierrors++;
   3681 				break;
   3682 			}
   3683 
   3684 			/*
   3685 			 * When given a bounded range of descriptors
   3686 			 * to process, its an error to hit a descriptor
   3687 			 * which is not ready. In the non-bounded case
   3688 			 * (end_idx == -1) this simply indicates we have
   3689 			 * reached the end of the current active range.
   3690 			 */
   3691 			if (desc.hdr.dstate != VIO_DESC_READY) {
   3692 				/* unbound - no error */
   3693 				if (end == -1) {
   3694 					if (read_attempts == vsw_read_attempts)
   3695 						break;
   3696 
   3697 					delay(drv_usectohz(vsw_desc_delay));
   3698 					read_attempts++;
   3699 					goto vsw_recheck_desc;
   3700 				}
   3701 
   3702 				/* bounded - error - so NACK back */
   3703 				RW_EXIT(&ldcp->lane_in.dlistrw);
   3704 				DERR(vswp, "%s(%lld): descriptor not READY "
   3705 				    "(%d)", __func__, ldcp->ldc_id,
   3706 				    desc.hdr.dstate);
   3707 				SND_DRING_NACK(ldcp, dring_pkt);
   3708 				return;
   3709 			}
   3710 
   3711 			DTRACE_PROBE1(read_attempts, int, read_attempts);
   3712 
   3713 			range_end = pos;
   3714 
   3715 			/*
   3716 			 * If we ACK'd the previous descriptor then now
   3717 			 * record the new range start position for later
   3718 			 * ACK's.
   3719 			 */
   3720 			if (prev_desc_ack) {
   3721 				range_start = pos;
   3722 
   3723 				D2(vswp, "%s(%lld): updating range start to be "
   3724 				    "%d", __func__, ldcp->ldc_id, range_start);
   3725 
   3726 				prev_desc_ack = B_FALSE;
   3727 			}
   3728 
   3729 			D2(vswp, "%s(%lld): processing desc %lld at pos"
   3730 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
   3731 			    __func__, ldcp->ldc_id, pos, &desc,
   3732 			    desc.hdr.dstate, desc.nbytes);
   3733 
   3734 			if ((desc.nbytes < ETHERMIN) ||
   3735 			    (desc.nbytes > lp->mtu)) {
   3736 				/* invalid size; drop the packet */
   3737 				ldcp->ldc_stats.ierrors++;
   3738 				goto vsw_process_desc_done;
   3739 			}
   3740 
   3741 			/*
   3742 			 * Ensure that we ask ldc for an aligned
   3743 			 * number of bytes. Data is padded to align on 8
   3744 			 * byte boundary, desc.nbytes is actual data length,
   3745 			 * i.e. minus that padding.
   3746 			 */
   3747 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
   3748 			if (nbytes > ldcp->max_rxpool_size) {
   3749 				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
   3750 				    BPRI_MED);
   3751 			} else {
   3752 				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
   3753 				if (mp == NULL) {
   3754 					ldcp->ldc_stats.rx_vio_allocb_fail++;
   3755 					/*
   3756 					 * No free receive buffers available,
   3757 					 * so fallback onto allocb(9F). Make
   3758 					 * sure that we get a data buffer which
   3759 					 * is a multiple of 8 as this is
   3760 					 * required by ldc_mem_copy.
   3761 					 */
   3762 					DTRACE_PROBE(allocb);
   3763 					mp = allocb(desc.nbytes +
   3764 					    VNET_IPALIGN + 8, BPRI_MED);
   3765 				}
   3766 			}
   3767 			if (mp == NULL) {
   3768 				DERR(vswp, "%s(%ld): allocb failed",
   3769 				    __func__, ldcp->ldc_id);
   3770 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
   3771 				    dp->dring_mtype, dp->handle, pos, pos,
   3772 				    VIO_DESC_DONE);
   3773 				ldcp->ldc_stats.ierrors++;
   3774 				ldcp->ldc_stats.rx_allocb_fail++;
   3775 				break;
   3776 			}
   3777 
   3778 			rv = ldc_mem_copy(ldcp->ldc_handle,
   3779 			    (caddr_t)mp->b_rptr, 0, &nbytes,
   3780 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
   3781 			if (rv != 0) {
   3782 				DERR(vswp, "%s(%d): unable to copy in data "
   3783 				    "from %d cookies in desc %d (rv %d)",
   3784 				    __func__, ldcp->ldc_id, desc.ncookies,
   3785 				    pos, rv);
   3786 				freemsg(mp);
   3787 
   3788 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
   3789 				    dp->dring_mtype, dp->handle, pos, pos,
   3790 				    VIO_DESC_DONE);
   3791 				ldcp->ldc_stats.ierrors++;
   3792 				break;
   3793 			} else {
   3794 				D2(vswp, "%s(%d): copied in %ld bytes"
   3795 				    " using %d cookies", __func__,
   3796 				    ldcp->ldc_id, nbytes, desc.ncookies);
   3797 			}
   3798 
   3799 			/* adjust the read pointer to skip over the padding */
   3800 			mp->b_rptr += VNET_IPALIGN;
   3801 
   3802 			/* point to the actual end of data */
   3803 			mp->b_wptr = mp->b_rptr + desc.nbytes;
   3804 
   3805 			/* update statistics */
   3806 			ehp = (struct ether_header *)mp->b_rptr;
   3807 			if (IS_BROADCAST(ehp))
   3808 				ldcp->ldc_stats.brdcstrcv++;
   3809 			else if (IS_MULTICAST(ehp))
   3810 				ldcp->ldc_stats.multircv++;
   3811 
   3812 			ldcp->ldc_stats.ipackets++;
   3813 			ldcp->ldc_stats.rbytes += desc.nbytes;
   3814 
   3815 			/*
   3816 			 * IPALIGN space can be used for VLAN_TAG
   3817 			 */
   3818 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
   3819 			    VSW_VNETPORT, mp);
   3820 
   3821 			/* build a chain of received packets */
   3822 			if (bp == NULL) {
   3823 				/* first pkt */
   3824 				bp = mp;
   3825 				bp->b_next = bp->b_prev = NULL;
   3826 				bpt = bp;
   3827 				chain = 1;
   3828 			} else {
   3829 				mp->b_next = mp->b_prev = NULL;
   3830 				bpt->b_next = mp;
   3831 				bpt = mp;
   3832 				chain++;
   3833 			}
   3834 
   3835 vsw_process_desc_done:
   3836 			/* mark we are finished with this descriptor */
   3837 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
   3838 			    dp->dring_mtype, dp->handle, pos, pos,
   3839 			    VIO_DESC_DONE)) != 0) {
   3840 				DERR(vswp, "%s(%lld): unable to update "
   3841 				    "dstate at pos %d: err %d",
   3842 				    __func__, pos, ldcp->ldc_id, rng_rv);
   3843 				ldcp->ldc_stats.ierrors++;
   3844 				break;
   3845 			}
   3846 
   3847 			/*
   3848 			 * Send an ACK back to peer if requested.
   3849 			 */
   3850 			if (desc.hdr.ack) {
   3851 				dring_pkt->start_idx = range_start;
   3852 				dring_pkt->end_idx = range_end;
   3853 
   3854 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
   3855 				    " requested", __func__, ldcp->ldc_id,
   3856 				    dring_pkt->start_idx, dring_pkt->end_idx);
   3857 
   3858 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
   3859 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   3860 				dring_pkt->tag.vio_sid = ldcp->local_session;
   3861 
   3862 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
   3863 				    sizeof (vio_dring_msg_t), B_FALSE);
   3864 
   3865 				/*
   3866 				 * Check if ACK was successfully sent. If not
   3867 				 * we break and deal with that below.
   3868 				 */
   3869 				if (msg_rv != 0)
   3870 					break;
   3871 
   3872 				prev_desc_ack = B_TRUE;
   3873 				range_start = pos;
   3874 			}
   3875 
   3876 			/* next descriptor */
   3877 			pos = (pos + 1) % len;
   3878 			cnt++;
   3879 
   3880 			/*
   3881 			 * Break out of loop here and stop processing to
   3882 			 * allow some other network device (or disk) to
   3883 			 * get access to the cpu.
   3884 			 */
   3885 			if (chain > vsw_chain_len) {
   3886 				D3(vswp, "%s(%lld): switching chain of %d "
   3887 				    "msgs", __func__, ldcp->ldc_id, chain);
   3888 				break;
   3889 			}
   3890 		}
   3891 		RW_EXIT(&ldcp->lane_in.dlistrw);
   3892 
   3893 		/* send the chain of packets to be switched */
   3894 		if (bp != NULL) {
   3895 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
   3896 			D3(vswp, "%s(%lld): switching chain of %d msgs",
   3897 			    __func__, ldcp->ldc_id, chain);
   3898 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
   3899 			    ldcp->ldc_port, NULL);
   3900 		}
   3901 
   3902 		/*
   3903 		 * If when we encountered an error when attempting to
   3904 		 * access an imported dring, initiate a connection reset.
   3905 		 */
   3906 		if (rng_rv != 0) {
   3907 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   3908 			break;
   3909 		}
   3910 
   3911 		/*
   3912 		 * If when we attempted to send the ACK we found that the
   3913 		 * channel had been reset then now handle this. We deal with
   3914 		 * it here as we cannot reset the channel while holding the
   3915 		 * dlistrw lock, and we don't want to acquire/release it
   3916 		 * continuously in the above loop, as a channel reset should
   3917 		 * be a rare event.
   3918 		 */
   3919 		if (msg_rv == ECONNRESET) {
   3920 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
   3921 			break;
   3922 		}
   3923 
   3924 		DTRACE_PROBE1(msg_cnt, int, cnt);
   3925 
   3926 		/*
   3927 		 * We are now finished so ACK back with the state
   3928 		 * set to STOPPING so our peer knows we are finished
   3929 		 */
   3930 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
   3931 		dring_pkt->tag.vio_sid = ldcp->local_session;
   3932 
   3933 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
   3934 
   3935 		DTRACE_PROBE(stop_process_sent);
   3936 
   3937 		/*
   3938 		 * We have not processed any more descriptors beyond
   3939 		 * the last one we ACK'd.
   3940 		 */
   3941 		if (prev_desc_ack)
   3942 			range_start = range_end;
   3943 
   3944 		dring_pkt->start_idx = range_start;
   3945 		dring_pkt->end_idx = range_end;
   3946 
   3947 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
   3948 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
   3949 		    dring_pkt->end_idx);
   3950 
   3951 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
   3952 		    sizeof (vio_dring_msg_t), B_TRUE);
   3953 		break;
   3954 
   3955 	case VIO_SUBTYPE_ACK:
   3956 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
   3957 		/*
   3958 		 * Verify that the relevant descriptors are all
   3959 		 * marked as DONE
   3960 		 */
   3961 		READ_ENTER(&ldcp->lane_out.dlistrw);
   3962 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
   3963 		    dring_pkt->dring_ident)) == NULL) {
   3964 			RW_EXIT(&ldcp->lane_out.dlistrw);
   3965 			DERR(vswp, "%s: unknown ident in ACK", __func__);
   3966 			return;
   3967 		}
   3968 
   3969 		start = end = 0;
   3970 		start = dring_pkt->start_idx;
   3971 		end = dring_pkt->end_idx;
   3972 		len = dp->num_descriptors;
   3973 
   3974 
   3975 		mutex_enter(&dp->dlock);
   3976 		dp->last_ack_recv = end;
   3977 		ldcp->ldc_stats.dring_data_acks++;
   3978 		mutex_exit(&dp->dlock);
   3979 
   3980 		(void) vsw_reclaim_dring(dp, start);
   3981 
   3982 		/*
   3983 		 * If our peer is stopping processing descriptors then
   3984 		 * we check to make sure it has processed all the descriptors
   3985 		 * we have updated. If not then we send it a new message
   3986 		 * to prompt it to restart.
   3987 		 */
   3988 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
   3989 			DTRACE_PROBE(stop_process_recv);
   3990 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
   3991 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
   3992 			    dring_pkt->end_idx);
   3993 
   3994 			/*
   3995 			 * Check next descriptor in public section of ring.
   3996 			 * If its marked as READY then we need to prompt our
   3997 			 * peer to start processing the ring again.
   3998 			 */
   3999 			i = (end + 1) % len;
   4000 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
   4001 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
   4002 
   4003 			/*
   4004 			 * Hold the restart lock across all of this to
   4005 			 * make sure that its not possible for us to
   4006 			 * decide that a msg needs to be sent in the future
   4007 			 * but the sending code having already checked is
   4008 			 * about to exit.
   4009 			 */
   4010 			mutex_enter(&dp->restart_lock);
   4011 			ldcp->ldc_stats.dring_stopped_acks++;
   4012 			mutex_enter(&priv_addr->dstate_lock);
   4013 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
   4014 
   4015 				mutex_exit(&priv_addr->dstate_lock);
   4016 
   4017 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
   4018 				dring_pkt->tag.vio_sid = ldcp->local_session;
   4019 
   4020 				dring_pkt->start_idx = (end + 1) % len;
   4021 				dring_pkt->end_idx = -1;
   4022 
   4023 				D2(vswp, "%s(%lld) : sending restart msg:"
   4024 				    " %d : %d", __func__, ldcp->ldc_id,
   4025 				    dring_pkt->start_idx, dring_pkt->end_idx);
   4026 
   4027 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
   4028 				    sizeof (vio_dring_msg_t), B_FALSE);
   4029 				ldcp->ldc_stats.dring_data_msgs++;
   4030 
   4031 			} else {
   4032 				mutex_exit(&priv_addr->dstate_lock);
   4033 				dp->restart_reqd = B_TRUE;
   4034 			}
   4035 			mutex_exit(&dp->restart_lock);
   4036 		}
   4037 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4038 
   4039 		/* only do channel reset after dropping dlistrw lock */
   4040 		if (msg_rv == ECONNRESET)
   4041 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
   4042 
   4043 		break;
   4044 
   4045 	case VIO_SUBTYPE_NACK:
   4046 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
   4047 		    __func__, ldcp->ldc_id);
   4048 		/*
   4049 		 * Something is badly wrong if we are getting NACK's
   4050 		 * for our data pkts. So reset the channel.
   4051 		 */
   4052 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
   4053 
   4054 		break;
   4055 
   4056 	default:
   4057 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
   4058 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
   4059 	}
   4060 
   4061 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
   4062 }
   4063 
   4064 /*
   4065  * dummy pkt data handler function for vnet protocol version 1.0
   4066  */
   4067 static void
   4068 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
   4069 {
   4070 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
   4071 }
   4072 
   4073 /*
   4074  * This function handles raw pkt data messages received over the channel.
   4075  * Currently, only priority-eth-type frames are received through this mechanism.
   4076  * In this case, the frame(data) is present within the message itself which
   4077  * is copied into an mblk before switching it.
   4078  */
   4079 static void
   4080 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
   4081 {
   4082 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
   4083 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
   4084 	uint32_t		size;
   4085 	mblk_t			*mp;
   4086 	vsw_t			*vswp = ldcp->ldc_vswp;
   4087 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
   4088 	lane_t			*lp = &ldcp->lane_out;
   4089 
   4090 	size = msglen - VIO_PKT_DATA_HDRSIZE;
   4091 	if (size < ETHERMIN || size > lp->mtu) {
   4092 		(void) atomic_inc_32(&statsp->rx_pri_fail);
   4093 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
   4094 		    ldcp->ldc_id, size);
   4095 		return;
   4096 	}
   4097 
   4098 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
   4099 	if (mp == NULL) {
   4100 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
   4101 		if (mp == NULL) {
   4102 			(void) atomic_inc_32(&statsp->rx_pri_fail);
   4103 			DWARN(vswp, "%s(%lld) allocb failure, "
   4104 			    "unable to process priority frame\n", __func__,
   4105 			    ldcp->ldc_id);
   4106 			return;
   4107 		}
   4108 	}
   4109 
   4110 	/* skip over the extra space for vlan tag */
   4111 	mp->b_rptr += VLAN_TAGSZ;
   4112 
   4113 	/* copy the frame from the payload of raw data msg into the mblk */
   4114 	bcopy(dpkt->data, mp->b_rptr, size);
   4115 	mp->b_wptr = mp->b_rptr + size;
   4116 
   4117 	/* update stats */
   4118 	(void) atomic_inc_64(&statsp->rx_pri_packets);
   4119 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
   4120 
   4121 	/*
   4122 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
   4123 	 */
   4124 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
   4125 
   4126 	/* switch the frame to destination */
   4127 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
   4128 }
   4129 
   4130 /*
   4131  * Process an in-band descriptor message (most likely from
   4132  * OBP).
   4133  */
   4134 static void
   4135 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
   4136 {
   4137 	vnet_ibnd_desc_t	*ibnd_desc;
   4138 	dring_info_t		*dp = NULL;
   4139 	vsw_private_desc_t	*priv_addr = NULL;
   4140 	vsw_t			*vswp = ldcp->ldc_vswp;
   4141 	mblk_t			*mp = NULL;
   4142 	size_t			nbytes = 0;
   4143 	size_t			off = 0;
   4144 	uint64_t		idx = 0;
   4145 	uint32_t		num = 1, len, datalen = 0;
   4146 	uint64_t		ncookies = 0;
   4147 	int			i, rv;
   4148 	int			j = 0;
   4149 
   4150 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   4151 
   4152 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
   4153 
   4154 	switch (ibnd_desc->hdr.tag.vio_subtype) {
   4155 	case VIO_SUBTYPE_INFO:
   4156 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
   4157 
   4158 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
   4159 			return;
   4160 
   4161 		/*
   4162 		 * Data is padded to align on a 8 byte boundary,
   4163 		 * nbytes is actual data length, i.e. minus that
   4164 		 * padding.
   4165 		 */
   4166 		datalen = ibnd_desc->nbytes;
   4167 
   4168 		D2(vswp, "%s(%lld): processing inband desc : "
   4169 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
   4170 
   4171 		ncookies = ibnd_desc->ncookies;
   4172 
   4173 		/*
   4174 		 * allocb(9F) returns an aligned data block. We
   4175 		 * need to ensure that we ask ldc for an aligned
   4176 		 * number of bytes also.
   4177 		 */
   4178 		nbytes = datalen;
   4179 		if (nbytes & 0x7) {
   4180 			off = 8 - (nbytes & 0x7);
   4181 			nbytes += off;
   4182 		}
   4183 
   4184 		/* alloc extra space for VLAN_TAG */
   4185 		mp = allocb(datalen + 8, BPRI_MED);
   4186 		if (mp == NULL) {
   4187 			DERR(vswp, "%s(%lld): allocb failed",
   4188 			    __func__, ldcp->ldc_id);
   4189 			ldcp->ldc_stats.rx_allocb_fail++;
   4190 			return;
   4191 		}
   4192 
   4193 		/* skip over the extra space for VLAN_TAG */
   4194 		mp->b_rptr += 8;
   4195 
   4196 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
   4197 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
   4198 		    LDC_COPY_IN);
   4199 
   4200 		if (rv != 0) {
   4201 			DERR(vswp, "%s(%d): unable to copy in data from "
   4202 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
   4203 			freemsg(mp);
   4204 			ldcp->ldc_stats.ierrors++;
   4205 			return;
   4206 		}
   4207 
   4208 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
   4209 		    __func__, ldcp->ldc_id, nbytes, ncookies);
   4210 
   4211 		/* point to the actual end of data */
   4212 		mp->b_wptr = mp->b_rptr + datalen;
   4213 		ldcp->ldc_stats.ipackets++;
   4214 		ldcp->ldc_stats.rbytes += datalen;
   4215 
   4216 		/*
   4217 		 * We ACK back every in-band descriptor message we process
   4218 		 */
   4219 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
   4220 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
   4221 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
   4222 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
   4223 
   4224 		/*
   4225 		 * there is extra space alloc'd for VLAN_TAG
   4226 		 */
   4227 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
   4228 
   4229 		/* send the packet to be switched */
   4230 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
   4231 		    ldcp->ldc_port, NULL);
   4232 
   4233 		break;
   4234 
   4235 	case VIO_SUBTYPE_ACK:
   4236 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
   4237 
   4238 		/* Verify the ACK is valid */
   4239 		idx = ibnd_desc->hdr.desc_handle;
   4240 
   4241 		if (idx >= vsw_ntxds) {
   4242 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
   4243 			    "(idx %ld)", vswp->instance, idx);
   4244 			return;
   4245 		}
   4246 
   4247 		if ((dp = ldcp->lane_out.dringp) == NULL) {
   4248 			DERR(vswp, "%s: no dring found", __func__);
   4249 			return;
   4250 		}
   4251 
   4252 		len = dp->num_descriptors;
   4253 		/*
   4254 		 * If the descriptor we are being ACK'ed for is not the
   4255 		 * one we expected, then pkts were lost somwhere, either
   4256 		 * when we tried to send a msg, or a previous ACK msg from
   4257 		 * our peer. In either case we now reclaim the descriptors
   4258 		 * in the range from the last ACK we received up to the
   4259 		 * current ACK.
   4260 		 */
   4261 		if (idx != dp->last_ack_recv) {
   4262 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
   4263 			    __func__, dp->last_ack_recv, idx);
   4264 			num = idx >= dp->last_ack_recv ?
   4265 			    idx - dp->last_ack_recv + 1:
   4266 			    (len - dp->last_ack_recv + 1) + idx;
   4267 		}
   4268 
   4269 		/*
   4270 		 * When we sent the in-band message to our peer we
   4271 		 * marked the copy in our private ring as READY. We now
   4272 		 * check that the descriptor we are being ACK'ed for is in
   4273 		 * fact READY, i.e. it is one we have shared with our peer.
   4274 		 *
   4275 		 * If its not we flag an error, but still reset the descr
   4276 		 * back to FREE.
   4277 		 */
   4278 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
   4279 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
   4280 			mutex_enter(&priv_addr->dstate_lock);
   4281 			if (priv_addr->dstate != VIO_DESC_READY) {
   4282 				DERR(vswp, "%s: (%ld) desc at index %ld not "
   4283 				    "READY (0x%lx)", __func__,
   4284 				    ldcp->ldc_id, idx, priv_addr->dstate);
   4285 				DERR(vswp, "%s: bound %d: ncookies %ld : "
   4286 				    "datalen %ld", __func__,
   4287 				    priv_addr->bound, priv_addr->ncookies,
   4288 				    priv_addr->datalen);
   4289 			}
   4290 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
   4291 			    ldcp->ldc_id, idx);
   4292 			/* release resources associated with sent msg */
   4293 			priv_addr->datalen = 0;
   4294 			priv_addr->dstate = VIO_DESC_FREE;
   4295 			mutex_exit(&priv_addr->dstate_lock);
   4296 		}
   4297 		/* update to next expected value */
   4298 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
   4299 
   4300 		break;
   4301 
   4302 	case VIO_SUBTYPE_NACK:
   4303 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
   4304 
   4305 		/*
   4306 		 * We should only get a NACK if our peer doesn't like
   4307 		 * something about a message we have sent it. If this
   4308 		 * happens we just release the resources associated with
   4309 		 * the message. (We are relying on higher layers to decide
   4310 		 * whether or not to resend.
   4311 		 */
   4312 
   4313 		/* limit check */
   4314 		idx = ibnd_desc->hdr.desc_handle;
   4315 
   4316 		if (idx >= vsw_ntxds) {
   4317 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
   4318 			    __func__, idx);
   4319 			return;
   4320 		}
   4321 
   4322 		if ((dp = ldcp->lane_out.dringp) == NULL) {
   4323 			DERR(vswp, "%s: no dring found", __func__);
   4324 			return;
   4325 		}
   4326 
   4327 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
   4328 
   4329 		/* move to correct location in ring */
   4330 		priv_addr += idx;
   4331 
   4332 		/* release resources associated with sent msg */
   4333 		mutex_enter(&priv_addr->dstate_lock);
   4334 		priv_addr->datalen = 0;
   4335 		priv_addr->dstate = VIO_DESC_FREE;
   4336 		mutex_exit(&priv_addr->dstate_lock);
   4337 
   4338 		break;
   4339 
   4340 	default:
   4341 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
   4342 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
   4343 	}
   4344 
   4345 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
   4346 }
   4347 
   4348 static void
   4349 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
   4350 {
   4351 	_NOTE(ARGUNUSED(epkt))
   4352 
   4353 	vsw_t		*vswp = ldcp->ldc_vswp;
   4354 	uint16_t	env = tagp->vio_subtype_env;
   4355 
   4356 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
   4357 
   4358 	/*
   4359 	 * Error vio_subtypes have yet to be defined. So for
   4360 	 * the moment we can't do anything.
   4361 	 */
   4362 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
   4363 
   4364 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
   4365 }
   4366 
   4367 /* transmit the packet over the given port */
   4368 int
   4369 vsw_portsend(vsw_port_t *port, mblk_t *mp)
   4370 {
   4371 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
   4372 	vsw_ldc_t 	*ldcp;
   4373 	mblk_t		*mpt;
   4374 	int		count;
   4375 	int		status = 0;
   4376 
   4377 	READ_ENTER(&ldcl->lockrw);
   4378 	/*
   4379 	 * Note for now, we have a single channel.
   4380 	 */
   4381 	ldcp = ldcl->head;
   4382 	if (ldcp == NULL) {
   4383 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
   4384 		freemsgchain(mp);
   4385 		RW_EXIT(&ldcl->lockrw);
   4386 		return (1);
   4387 	}
   4388 
   4389 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
   4390 
   4391 	if (count != 0) {
   4392 		status = ldcp->tx(ldcp, mp, mpt, count);
   4393 	}
   4394 
   4395 	RW_EXIT(&ldcl->lockrw);
   4396 	return (status);
   4397 }
   4398 
   4399 /*
   4400  * Break up frames into 2 seperate chains: normal and
   4401  * priority, based on the frame type. The number of
   4402  * priority frames is also counted and returned.
   4403  *
   4404  * Params:
   4405  * 	vswp:	pointer to the instance of vsw
   4406  *	np:	head of packet chain to be broken
   4407  *	npt:	tail of packet chain to be broken
   4408  *
   4409  * Returns:
   4410  *	np:	head of normal data packets
   4411  *	npt:	tail of normal data packets
   4412  *	hp:	head of high priority packets
   4413  *	hpt:	tail of high priority packets
   4414  */
   4415 static uint32_t
   4416 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
   4417 	mblk_t **hp, mblk_t **hpt)
   4418 {
   4419 	mblk_t			*tmp = NULL;
   4420 	mblk_t			*smp = NULL;
   4421 	mblk_t			*hmp = NULL;	/* high prio pkts head */
   4422 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
   4423 	mblk_t			*nmp = NULL;	/* normal pkts head */
   4424 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
   4425 	uint32_t		count = 0;
   4426 	int			i;
   4427 	struct ether_header	*ehp;
   4428 	uint32_t		num_types;
   4429 	uint16_t		*types;
   4430 
   4431 	tmp = *np;
   4432 	while (tmp != NULL) {
   4433 
   4434 		smp = tmp;
   4435 		tmp = tmp->b_next;
   4436 		smp->b_next = NULL;
   4437 		smp->b_prev = NULL;
   4438 
   4439 		ehp = (struct ether_header *)smp->b_rptr;
   4440 		num_types = vswp->pri_num_types;
   4441 		types = vswp->pri_types;
   4442 		for (i = 0; i < num_types; i++) {
   4443 			if (ehp->ether_type == types[i]) {
   4444 				/* high priority frame */
   4445 
   4446 				if (hmp != NULL) {
   4447 					hmpt->b_next = smp;
   4448 					hmpt = smp;
   4449 				} else {
   4450 					hmp = hmpt = smp;
   4451 				}
   4452 				count++;
   4453 				break;
   4454 			}
   4455 		}
   4456 		if (i == num_types) {
   4457 			/* normal data frame */
   4458 
   4459 			if (nmp != NULL) {
   4460 				nmpt->b_next = smp;
   4461 				nmpt = smp;
   4462 			} else {
   4463 				nmp = nmpt = smp;
   4464 			}
   4465 		}
   4466 	}
   4467 
   4468 	*hp = hmp;
   4469 	*hpt = hmpt;
   4470 	*np = nmp;
   4471 	*npt = nmpt;
   4472 
   4473 	return (count);
   4474 }
   4475 
   4476 /*
   4477  * Wrapper function to transmit normal and/or priority frames over the channel.
   4478  */
   4479 static int
   4480 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
   4481 {
   4482 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
   4483 	mblk_t			*tmp;
   4484 	mblk_t			*smp;
   4485 	mblk_t			*hmp;	/* high prio pkts head */
   4486 	mblk_t			*hmpt;	/* high prio pkts tail */
   4487 	mblk_t			*nmp;	/* normal pkts head */
   4488 	mblk_t			*nmpt;	/* normal pkts tail */
   4489 	uint32_t		n = 0;
   4490 	vsw_t			*vswp = ldcp->ldc_vswp;
   4491 
   4492 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
   4493 	ASSERT(count != 0);
   4494 
   4495 	nmp = mp;
   4496 	nmpt = mpt;
   4497 
   4498 	/* gather any priority frames from the chain of packets */
   4499 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
   4500 
   4501 	/* transmit priority frames */
   4502 	tmp = hmp;
   4503 	while (tmp != NULL) {
   4504 		smp = tmp;
   4505 		tmp = tmp->b_next;
   4506 		smp->b_next = NULL;
   4507 		vsw_ldcsend_pkt(ldcp, smp);
   4508 	}
   4509 
   4510 	count -= n;
   4511 
   4512 	if (count == 0) {
   4513 		/* no normal data frames to process */
   4514 		return (0);
   4515 	}
   4516 
   4517 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
   4518 }
   4519 
   4520 /*
   4521  * Wrapper function to transmit normal frames over the channel.
   4522  */
   4523 static int
   4524 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
   4525 {
   4526 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
   4527 	mblk_t		*tmp = NULL;
   4528 
   4529 	ASSERT(count != 0);
   4530 	/*
   4531 	 * If the TX thread is enabled, then queue the
   4532 	 * ordinary frames and signal the tx thread.
   4533 	 */
   4534 	if (ldcp->tx_thread != NULL) {
   4535 
   4536 		mutex_enter(&ldcp->tx_thr_lock);
   4537 
   4538 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
   4539 			/*
   4540 			 * If we reached queue limit,
   4541 			 * do not queue new packets,
   4542 			 * drop them.
   4543 			 */
   4544 			ldcp->ldc_stats.tx_qfull += count;
   4545 			mutex_exit(&ldcp->tx_thr_lock);
   4546 			freemsgchain(mp);
   4547 			goto exit;
   4548 		}
   4549 		if (ldcp->tx_mhead == NULL) {
   4550 			ldcp->tx_mhead = mp;
   4551 			ldcp->tx_mtail = mpt;
   4552 			cv_signal(&ldcp->tx_thr_cv);
   4553 		} else {
   4554 			ldcp->tx_mtail->b_next = mp;
   4555 			ldcp->tx_mtail = mpt;
   4556 		}
   4557 		ldcp->tx_cnt += count;
   4558 		mutex_exit(&ldcp->tx_thr_lock);
   4559 	} else {
   4560 		while (mp != NULL) {
   4561 			tmp = mp->b_next;
   4562 			mp->b_next = mp->b_prev = NULL;
   4563 			(void) vsw_ldcsend(ldcp, mp, 1);
   4564 			mp = tmp;
   4565 		}
   4566 	}
   4567 
   4568 exit:
   4569 	return (0);
   4570 }
   4571 
   4572 /*
   4573  * This function transmits the frame in the payload of a raw data
   4574  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
   4575  * send special frames with high priorities, without going through
   4576  * the normal data path which uses descriptor ring mechanism.
   4577  */
   4578 static void
   4579 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
   4580 {
   4581 	vio_raw_data_msg_t	*pkt;
   4582 	mblk_t			*bp;
   4583 	mblk_t			*nmp = NULL;
   4584 	caddr_t			dst;
   4585 	uint32_t		mblksz;
   4586 	uint32_t		size;
   4587 	uint32_t		nbytes;
   4588 	int			rv;
   4589 	vsw_t			*vswp = ldcp->ldc_vswp;
   4590 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
   4591 
   4592 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
   4593 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
   4594 		(void) atomic_inc_32(&statsp->tx_pri_fail);
   4595 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
   4596 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
   4597 		    ldcp->lane_out.lstate);
   4598 		goto send_pkt_exit;
   4599 	}
   4600 
   4601 	size = msgsize(mp);
   4602 
   4603 	/* frame size bigger than available payload len of raw data msg ? */
   4604 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
   4605 		(void) atomic_inc_32(&statsp->tx_pri_fail);
   4606 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
   4607 		    ldcp->ldc_id, size);
   4608 		goto send_pkt_exit;
   4609 	}
   4610 
   4611 	if (size < ETHERMIN)
   4612 		size = ETHERMIN;
   4613 
   4614 	/* alloc space for a raw data message */
   4615 	nmp = vio_allocb(vswp->pri_tx_vmp);
   4616 	if (nmp == NULL) {
   4617 		(void) atomic_inc_32(&statsp->tx_pri_fail);
   4618 		DWARN(vswp, "vio_allocb failed\n");
   4619 		goto send_pkt_exit;
   4620 	}
   4621 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
   4622 
   4623 	/* copy frame into the payload of raw data message */
   4624 	dst = (caddr_t)pkt->data;
   4625 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
   4626 		mblksz = MBLKL(bp);
   4627 		bcopy(bp->b_rptr, dst, mblksz);
   4628 		dst += mblksz;
   4629 	}
   4630 
   4631 	/* setup the raw data msg */
   4632 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
   4633 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
   4634 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
   4635 	pkt->tag.vio_sid = ldcp->local_session;
   4636 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
   4637 
   4638 	/* send the msg over ldc */
   4639 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
   4640 	if (rv != 0) {
   4641 		(void) atomic_inc_32(&statsp->tx_pri_fail);
   4642 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
   4643 		    ldcp->ldc_id);
   4644 		goto send_pkt_exit;
   4645 	}
   4646 
   4647 	/* update stats */
   4648 	(void) atomic_inc_64(&statsp->tx_pri_packets);
   4649 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
   4650 
   4651 send_pkt_exit:
   4652 	if (nmp != NULL)
   4653 		freemsg(nmp);
   4654 	freemsg(mp);
   4655 }
   4656 
   4657 /*
   4658  * Transmit the packet over the given LDC channel.
   4659  *
   4660  * The 'retries' argument indicates how many times a packet
   4661  * is retried before it is dropped. Note, the retry is done
   4662  * only for a resource related failure, for all other failures
   4663  * the packet is dropped immediately.
   4664  */
   4665 static int
   4666 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
   4667 {
   4668 	int i;
   4669 	int rc;
   4670 	int status = 0;
   4671 	vsw_port_t *port = ldcp->ldc_port;
   4672 	dring_info_t *dp = NULL;
   4673 
   4674 
   4675 	for (i = 0; i < retries; ) {
   4676 		/*
   4677 		 * Send the message out using the appropriate
   4678 		 * transmit function which will free mblock when it
   4679 		 * is finished with it.
   4680 		 */
   4681 		mutex_enter(&port->tx_lock);
   4682 		if (port->transmit != NULL) {
   4683 			status = (*port->transmit)(ldcp, mp);
   4684 		}
   4685 		if (status == LDC_TX_SUCCESS) {
   4686 			mutex_exit(&port->tx_lock);
   4687 			break;
   4688 		}
   4689 		i++;	/* increment the counter here */
   4690 
   4691 		/* If its the last retry, then update the oerror */
   4692 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
   4693 			ldcp->ldc_stats.oerrors++;
   4694 		}
   4695 		mutex_exit(&port->tx_lock);
   4696 
   4697 		if (status != LDC_TX_NORESOURCES) {
   4698 			/*
   4699 			 * No retrying required for errors un-related
   4700 			 * to resources.
   4701 			 */
   4702 			break;
   4703 		}
   4704 		READ_ENTER(&ldcp->lane_out.dlistrw);
   4705 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
   4706 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
   4707 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
   4708 		    ((VSW_VER_LT(ldcp, 1, 2) &&
   4709 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
   4710 			rc = vsw_reclaim_dring(dp, dp->end_idx);
   4711 		} else {
   4712 			/*
   4713 			 * If there is no dring or the xfer_mode is
   4714 			 * set to DESC_MODE(ie., OBP), then simply break here.
   4715 			 */
   4716 			RW_EXIT(&ldcp->lane_out.dlistrw);
   4717 			break;
   4718 		}
   4719 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4720 
   4721 		/*
   4722 		 * Delay only if none were reclaimed
   4723 		 * and its not the last retry.
   4724 		 */
   4725 		if ((rc == 0) && (i < retries)) {
   4726 			delay(drv_usectohz(vsw_ldc_tx_delay));
   4727 		}
   4728 	}
   4729 	freemsg(mp);
   4730 	return (status);
   4731 }
   4732 
   4733 /*
   4734  * Send packet out via descriptor ring to a logical device.
   4735  */
   4736 static int
   4737 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
   4738 {
   4739 	vio_dring_msg_t		dring_pkt;
   4740 	dring_info_t		*dp = NULL;
   4741 	vsw_private_desc_t	*priv_desc = NULL;
   4742 	vnet_public_desc_t	*pub = NULL;
   4743 	vsw_t			*vswp = ldcp->ldc_vswp;
   4744 	mblk_t			*bp;
   4745 	size_t			n, size;
   4746 	caddr_t			bufp;
   4747 	int			idx;
   4748 	int			status = LDC_TX_SUCCESS;
   4749 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
   4750 	lane_t			*lp = &ldcp->lane_out;
   4751 
   4752 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
   4753 
   4754 	/* TODO: make test a macro */
   4755 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
   4756 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
   4757 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
   4758 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
   4759 		    ldcp->lane_out.lstate);
   4760 		ldcp->ldc_stats.oerrors++;
   4761 		return (LDC_TX_FAILURE);
   4762 	}
   4763 
   4764 	/*
   4765 	 * Note - using first ring only, this may change
   4766 	 * in the future.
   4767 	 */
   4768 	READ_ENTER(&ldcp->lane_out.dlistrw);
   4769 	if ((dp = ldcp->lane_out.dringp) == NULL) {
   4770 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4771 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
   4772 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
   4773 		ldcp->ldc_stats.oerrors++;
   4774 		return (LDC_TX_FAILURE);
   4775 	}
   4776 
   4777 	size = msgsize(mp);
   4778 	if (size > (size_t)lp->mtu) {
   4779 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4780 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
   4781 		    ldcp->ldc_id, size);
   4782 		ldcp->ldc_stats.oerrors++;
   4783 		return (LDC_TX_FAILURE);
   4784 	}
   4785 
   4786 	/*
   4787 	 * Find a free descriptor
   4788 	 *
   4789 	 * Note: for the moment we are assuming that we will only
   4790 	 * have one dring going from the switch to each of its
   4791 	 * peers. This may change in the future.
   4792 	 */
   4793 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
   4794 		D2(vswp, "%s(%lld): no descriptor available for ring "
   4795 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
   4796 
   4797 		/* nothing more we can do */
   4798 		status = LDC_TX_NORESOURCES;
   4799 		ldcp->ldc_stats.tx_no_desc++;
   4800 		goto vsw_dringsend_free_exit;
   4801 	} else {
   4802 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
   4803 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
   4804 	}
   4805 
   4806 	/* copy data into the descriptor */
   4807 	bufp = priv_desc->datap;
   4808 	bufp += VNET_IPALIGN;
   4809 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
   4810 		n = MBLKL(bp);
   4811 		bcopy(bp->b_rptr, bufp, n);
   4812 		bufp += n;
   4813 	}
   4814 
   4815 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
   4816 
   4817 	pub = priv_desc->descp;
   4818 	pub->nbytes = priv_desc->datalen;
   4819 
   4820 	/* update statistics */
   4821 	if (IS_BROADCAST(ehp))
   4822 		ldcp->ldc_stats.brdcstxmt++;
   4823 	else if (IS_MULTICAST(ehp))
   4824 		ldcp->ldc_stats.multixmt++;
   4825 	ldcp->ldc_stats.opackets++;
   4826 	ldcp->ldc_stats.obytes += priv_desc->datalen;
   4827 
   4828 	mutex_enter(&priv_desc->dstate_lock);
   4829 	pub->hdr.dstate = VIO_DESC_READY;
   4830 	mutex_exit(&priv_desc->dstate_lock);
   4831 
   4832 	/*
   4833 	 * Determine whether or not we need to send a message to our
   4834 	 * peer prompting them to read our newly updated descriptor(s).
   4835 	 */
   4836 	mutex_enter(&dp->restart_lock);
   4837 	if (dp->restart_reqd) {
   4838 		dp->restart_reqd = B_FALSE;
   4839 		ldcp->ldc_stats.dring_data_msgs++;
   4840 		mutex_exit(&dp->restart_lock);
   4841 
   4842 		/*
   4843 		 * Send a vio_dring_msg to peer to prompt them to read
   4844 		 * the updated descriptor ring.
   4845 		 */
   4846 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
   4847 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
   4848 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
   4849 		dring_pkt.tag.vio_sid = ldcp->local_session;
   4850 
   4851 		/* Note - for now using first ring */
   4852 		dring_pkt.dring_ident = dp->ident;
   4853 
   4854 		/*
   4855 		 * If last_ack_recv is -1 then we know we've not
   4856 		 * received any ack's yet, so this must be the first
   4857 		 * msg sent, so set the start to the begining of the ring.
   4858 		 */
   4859 		mutex_enter(&dp->dlock);
   4860 		if (dp->last_ack_recv == -1) {
   4861 			dring_pkt.start_idx = 0;
   4862 		} else {
   4863 			dring_pkt.start_idx =
   4864 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
   4865 		}
   4866 		dring_pkt.end_idx = -1;
   4867 		mutex_exit(&dp->dlock);
   4868 
   4869 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
   4870 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
   4871 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
   4872 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
   4873 		    dring_pkt.end_idx);
   4874 
   4875 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4876 
   4877 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
   4878 		    sizeof (vio_dring_msg_t), B_TRUE);
   4879 
   4880 		return (status);
   4881 
   4882 	} else {
   4883 		mutex_exit(&dp->restart_lock);
   4884 		D2(vswp, "%s(%lld): updating descp %d", __func__,
   4885 		    ldcp->ldc_id, idx);
   4886 	}
   4887 
   4888 vsw_dringsend_free_exit:
   4889 
   4890 	RW_EXIT(&ldcp->lane_out.dlistrw);
   4891 
   4892 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
   4893 	return (status);
   4894 }
   4895 
   4896 /*
   4897  * Send an in-band descriptor message over ldc.
   4898  */
   4899 static int
   4900 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
   4901 {
   4902 	vsw_t			*vswp = ldcp->ldc_vswp;
   4903 	vnet_ibnd_desc_t	ibnd_msg;
   4904 	vsw_private_desc_t	*priv_desc = NULL;
   4905 	dring_info_t		*dp = NULL;
   4906 	size_t			n, size = 0;
   4907 	caddr_t			bufp;
   4908 	mblk_t			*bp;
   4909 	int			idx, i;
   4910 	int			status = LDC_TX_SUCCESS;
   4911 	static int		warn_msg = 1;
   4912 	lane_t			*lp = &ldcp->lane_out;
   4913 
   4914 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   4915 
   4916 	ASSERT(mp != NULL);
   4917 
   4918 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
   4919 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
   4920 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
   4921 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
   4922 		    ldcp->lane_out.lstate);
   4923 		ldcp->ldc_stats.oerrors++;
   4924 		return (LDC_TX_FAILURE);
   4925 	}
   4926 
   4927 	/*
   4928 	 * only expect single dring to exist, which we use
   4929 	 * as an internal buffer, rather than a transfer channel.
   4930 	 */
   4931 	READ_ENTER(&ldcp->lane_out.dlistrw);
   4932 	if ((dp = ldcp->lane_out.dringp) == NULL) {
   4933 		DERR(vswp, "%s(%lld): no dring for outbound lane",
   4934 		    __func__, ldcp->ldc_id);
   4935 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
   4936 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
   4937 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4938 		ldcp->ldc_stats.oerrors++;
   4939 		return (LDC_TX_FAILURE);
   4940 	}
   4941 
   4942 	size = msgsize(mp);
   4943 	if (size > (size_t)lp->mtu) {
   4944 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4945 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
   4946 		    ldcp->ldc_id, size);
   4947 		ldcp->ldc_stats.oerrors++;
   4948 		return (LDC_TX_FAILURE);
   4949 	}
   4950 
   4951 	/*
   4952 	 * Find a free descriptor in our buffer ring
   4953 	 */
   4954 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
   4955 		RW_EXIT(&ldcp->lane_out.dlistrw);
   4956 		if (warn_msg) {
   4957 			DERR(vswp, "%s(%lld): no descriptor available for ring "
   4958 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
   4959 			warn_msg = 0;
   4960 		}
   4961 
   4962 		/* nothing more we can do */
   4963 		status = LDC_TX_NORESOURCES;
   4964 		goto vsw_descrsend_free_exit;
   4965 	} else {
   4966 		D2(vswp, "%s(%lld): free private descriptor found at pos "
   4967 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
   4968 		warn_msg = 1;
   4969 	}
   4970 
   4971 	/* copy data into the descriptor */
   4972 	bufp = priv_desc->datap;
   4973 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
   4974 		n = MBLKL(bp);
   4975 		bcopy(bp->b_rptr, bufp, n);
   4976 		bufp += n;
   4977 	}
   4978 
   4979 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
   4980 
   4981 	/* create and send the in-band descp msg */
   4982 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
   4983 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
   4984 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
   4985 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
   4986 
   4987 	/*
   4988 	 * Copy the mem cookies describing the data from the
   4989 	 * private region of the descriptor ring into the inband
   4990 	 * descriptor.
   4991 	 */
   4992 	for (i = 0; i < priv_desc->ncookies; i++) {
   4993 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
   4994 		    sizeof (ldc_mem_cookie_t));
   4995 	}
   4996 
   4997 	ibnd_msg.hdr.desc_handle = idx;
   4998 	ibnd_msg.ncookies = priv_desc->ncookies;
   4999 	ibnd_msg.nbytes = size;
   5000 
   5001 	ldcp->ldc_stats.opackets++;
   5002 	ldcp->ldc_stats.obytes += size;
   5003 
   5004 	RW_EXIT(&ldcp->lane_out.dlistrw);
   5005 
   5006 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
   5007 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
   5008 
   5009 vsw_descrsend_free_exit:
   5010 
   5011 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   5012 	return (status);
   5013 }
   5014 
   5015 static void
   5016 vsw_send_ver(void *arg)
   5017 {
   5018 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
   5019 	vsw_t		*vswp = ldcp->ldc_vswp;
   5020 	lane_t		*lp = &ldcp->lane_out;
   5021 	vio_ver_msg_t	ver_msg;
   5022 
   5023 	D1(vswp, "%s enter", __func__);
   5024 
   5025 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
   5026 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
   5027 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
   5028 	ver_msg.tag.vio_sid = ldcp->local_session;
   5029 
   5030 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
   5031 		ver_msg.ver_major = vsw_versions[0].ver_major;
   5032 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
   5033 	} else {
   5034 		/* use the major,minor that we've ack'd */
   5035 		lane_t	*lpi = &ldcp->lane_in;
   5036 		ver_msg.ver_major = lpi->ver_major;
   5037 		ver_msg.ver_minor = lpi->ver_minor;
   5038 	}
   5039 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
   5040 
   5041 	lp->lstate |= VSW_VER_INFO_SENT;
   5042 	lp->ver_major = ver_msg.ver_major;
   5043 	lp->ver_minor = ver_msg.ver_minor;
   5044 
   5045 	DUMP_TAG(ver_msg.tag);
   5046 
   5047 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
   5048 
   5049 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
   5050 }
   5051 
   5052 static void
   5053 vsw_send_attr(vsw_ldc_t *ldcp)
   5054 {
   5055 	vsw_t			*vswp = ldcp->ldc_vswp;
   5056 	lane_t			*lp = &ldcp->lane_out;
   5057 	vnet_attr_msg_t		attr_msg;
   5058 
   5059 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
   5060 
   5061 	/*
   5062 	 * Subtype is set to INFO by default
   5063 	 */
   5064 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
   5065 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
   5066 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
   5067 	attr_msg.tag.vio_sid = ldcp->local_session;
   5068 
   5069 	/* payload copied from default settings for lane */
   5070 	attr_msg.mtu = lp->mtu;
   5071 	attr_msg.addr_type = lp->addr_type;
   5072 	attr_msg.xfer_mode = lp->xfer_mode;
   5073 	attr_msg.ack_freq = lp->xfer_mode;
   5074 
   5075 	READ_ENTER(&vswp->if_lockrw);
   5076 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
   5077 	RW_EXIT(&vswp->if_lockrw);
   5078 
   5079 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
   5080 
   5081 	DUMP_TAG(attr_msg.tag);
   5082 
   5083 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
   5084 
   5085 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
   5086 }
   5087 
   5088 /*
   5089  * Create dring info msg (which also results in the creation of
   5090  * a dring).
   5091  */
   5092 static vio_dring_reg_msg_t *
   5093 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
   5094 {
   5095 	vio_dring_reg_msg_t	*mp;
   5096 	dring_info_t		*dp;
   5097 	vsw_t			*vswp = ldcp->ldc_vswp;
   5098 	int			rv;
   5099 
   5100 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
   5101 
   5102 	/*
   5103 	 * If we can't create a dring, obviously no point sending
   5104 	 * a message.
   5105 	 */
   5106 	if ((dp = vsw_create_dring(ldcp)) == NULL)
   5107 		return (NULL);
   5108 
   5109 	/* Allocate pools of receive mblks */
   5110 	rv = vsw_init_multipools(ldcp, vswp);
   5111 	if (rv) {
   5112 		/*
   5113 		 * We do not return failure if receive mblk pools can't be
   5114 		 * allocated, instead allocb(9F) will be used to dynamically
   5115 		 * allocate buffers during receive.
   5116 		 */
   5117 		DWARN(vswp, "%s: unable to create free mblk pools for"
   5118 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
   5119 	}
   5120 
   5121 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
   5122 
   5123 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
   5124 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
   5125 	mp->tag.vio_subtype_env = VIO_DRING_REG;
   5126 	mp->tag.vio_sid = ldcp->local_session;
   5127 
   5128 	/* payload */
   5129 	mp->num_descriptors = dp->num_descriptors;
   5130 	mp->descriptor_size = dp->descriptor_size;
   5131 	mp->options = dp->options;
   5132 	mp->ncookies = dp->ncookies;
   5133 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
   5134 
   5135 	mp->dring_ident = 0;
   5136 
   5137 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
   5138 
   5139 	return (mp);
   5140 }
   5141 
   5142 static void
   5143 vsw_send_dring_info(vsw_ldc_t *ldcp)
   5144 {
   5145 	vio_dring_reg_msg_t	*dring_msg;
   5146 	vsw_t			*vswp = ldcp->ldc_vswp;
   5147 
   5148 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
   5149 
   5150 	dring_msg = vsw_create_dring_info_pkt(ldcp);
   5151 	if (dring_msg == NULL) {
   5152 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
   5153 		    vswp->instance, __func__);
   5154 		return;
   5155 	}
   5156 
   5157 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
   5158 
   5159 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
   5160 
   5161 	(void) vsw_send_msg(ldcp, dring_msg,
   5162 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
   5163 
   5164 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
   5165 
   5166 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
   5167 }
   5168 
   5169 static void
   5170 vsw_send_rdx(vsw_ldc_t *ldcp)
   5171 {
   5172 	vsw_t		*vswp = ldcp->ldc_vswp;
   5173 	vio_rdx_msg_t	rdx_msg;
   5174 
   5175 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
   5176 
   5177 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
   5178 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
   5179 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
   5180 	rdx_msg.tag.vio_sid = ldcp->local_session;
   5181 
   5182 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
   5183 
   5184 	DUMP_TAG(rdx_msg.tag);
   5185 
   5186 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
   5187 
   5188 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
   5189 }
   5190 
   5191 /*
   5192  * Generic routine to send message out over ldc channel.
   5193  *
   5194  * It is possible that when we attempt to write over the ldc channel
   5195  * that we get notified that it has been reset. Depending on the value
   5196  * of the handle_reset flag we either handle that event here or simply
   5197  * notify the caller that the channel was reset.
   5198  */
   5199 int
   5200 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
   5201 {
   5202 	int			rv;
   5203 	size_t			msglen = size;
   5204 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
   5205 	vsw_t			*vswp = ldcp->ldc_vswp;
   5206 	vio_dring_msg_t		*dmsg;
   5207 	vio_raw_data_msg_t	*rmsg;
   5208 	vnet_ibnd_desc_t	*imsg;
   5209 	boolean_t		data_msg = B_FALSE;
   5210 
   5211 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
   5212 	    ldcp->ldc_id, size);
   5213 
   5214 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
   5215 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
   5216 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
   5217 
   5218 	mutex_enter(&ldcp->ldc_txlock);
   5219 
   5220 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
   5221 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
   5222 			dmsg = (vio_dring_msg_t *)tag;
   5223 			dmsg->seq_num = ldcp->lane_out.seq_num;
   5224 			data_msg = B_TRUE;
   5225 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
   5226 			rmsg = (vio_raw_data_msg_t *)tag;
   5227 			rmsg->seq_num = ldcp->lane_out.seq_num;
   5228 			data_msg = B_TRUE;
   5229 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
   5230 			imsg = (vnet_ibnd_desc_t *)tag;
   5231 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
   5232 			data_msg = B_TRUE;
   5233 		}
   5234 	}
   5235 
   5236 	do {
   5237 		msglen = size;
   5238 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
   5239 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
   5240 
   5241 	if (rv == 0 && data_msg == B_TRUE) {
   5242 		ldcp->lane_out.seq_num++;
   5243 	}
   5244 
   5245 	if ((rv != 0) || (msglen != size)) {
   5246 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
   5247 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
   5248 		ldcp->ldc_stats.oerrors++;
   5249 	}
   5250 
   5251 	mutex_exit(&ldcp->ldc_txlock);
   5252 
   5253 	/*
   5254 	 * If channel has been reset we either handle it here or
   5255 	 * simply report back that it has been reset and let caller
   5256 	 * decide what to do.
   5257 	 */
   5258 	if (rv == ECONNRESET) {
   5259 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
   5260 
   5261 		/*
   5262 		 * N.B - must never be holding the dlistrw lock when
   5263 		 * we do a reset of the channel.
   5264 		 */
   5265 		if (handle_reset) {
   5266 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
   5267 		}
   5268 	}
   5269 
   5270 	return (rv);
   5271 }
   5272 
   5273 /*
   5274  * Remove the specified address from the list of address maintained
   5275  * in this port node.
   5276  */
   5277 mcst_addr_t *
   5278 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
   5279 {
   5280 	vsw_t		*vswp = NULL;
   5281 	vsw_port_t	*port = NULL;
   5282 	mcst_addr_t	*prev_p = NULL;
   5283 	mcst_addr_t	*curr_p = NULL;
   5284 
   5285 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
   5286 	    __func__, devtype, addr);
   5287 
   5288 	if (devtype == VSW_VNETPORT) {
   5289 		port = (vsw_port_t *)arg;
   5290 		mutex_enter(&port->mca_lock);
   5291 		prev_p = curr_p = port->mcap;
   5292 	} else {
   5293 		vswp = (vsw_t *)arg;
   5294 		mutex_enter(&vswp->mca_lock);
   5295 		prev_p = curr_p = vswp->mcap;
   5296 	}
   5297 
   5298 	while (curr_p != NULL) {
   5299 		if (curr_p->addr == addr) {
   5300 			D2(NULL, "%s: address found", __func__);
   5301 			/* match found */
   5302 			if (prev_p == curr_p) {
   5303 				/* list head */
   5304 				if (devtype == VSW_VNETPORT)
   5305 					port->mcap = curr_p->nextp;
   5306 				else
   5307 					vswp->mcap = curr_p->nextp;
   5308 			} else {
   5309 				prev_p->nextp = curr_p->nextp;
   5310 			}
   5311 			break;
   5312 		} else {
   5313 			prev_p = curr_p;
   5314 			curr_p = curr_p->nextp;
   5315 		}
   5316 	}
   5317 
   5318 	if (devtype == VSW_VNETPORT)
   5319 		mutex_exit(&port->mca_lock);
   5320 	else
   5321 		mutex_exit(&vswp->mca_lock);
   5322 
   5323 	D1(NULL, "%s: exit", __func__);
   5324 
   5325 	return (curr_p);
   5326 }
   5327 
   5328 /*
   5329  * Creates a descriptor ring (dring) and links it into the
   5330  * link of outbound drings for this channel.
   5331  *
   5332  * Returns NULL if creation failed.
   5333  */
   5334 static dring_info_t *
   5335 vsw_create_dring(vsw_ldc_t *ldcp)
   5336 {
   5337 	vsw_private_desc_t	*priv_addr = NULL;
   5338 	vsw_t			*vswp = ldcp->ldc_vswp;
   5339 	ldc_mem_info_t		minfo;
   5340 	dring_info_t		*dp, *tp;
   5341 	int			i;
   5342 
   5343 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
   5344 
   5345 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
   5346 
   5347 	/* create public section of ring */
   5348 	if ((ldc_mem_dring_create(vsw_ntxds,
   5349 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
   5350 
   5351 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
   5352 		    "failed", ldcp->ldc_id);
   5353 		goto create_fail_exit;
   5354 	}
   5355 
   5356 	ASSERT(dp->handle != NULL);
   5357 
   5358 	/*
   5359 	 * Get the base address of the public section of the ring.
   5360 	 */
   5361 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
   5362 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
   5363 		    ldcp->ldc_id);
   5364 		goto dring_fail_exit;
   5365 	} else {
   5366 		ASSERT(minfo.vaddr != 0);
   5367 		dp->pub_addr = minfo.vaddr;
   5368 	}
   5369 
   5370 	dp->num_descriptors = vsw_ntxds;
   5371 	dp->descriptor_size = VSW_PUB_SIZE;
   5372 	dp->options = VIO_TX_DRING;
   5373 	dp->ncookies = 1;	/* guaranteed by ldc */
   5374 
   5375 	/*
   5376 	 * create private portion of ring
   5377 	 */
   5378 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
   5379 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
   5380 
   5381 	if (vsw_setup_ring(ldcp, dp)) {
   5382 		DERR(vswp, "%s: unable to setup ring", __func__);
   5383 		goto dring_fail_exit;
   5384 	}
   5385 
   5386 	/* haven't used any descriptors yet */
   5387 	dp->end_idx = 0;
   5388 	dp->last_ack_recv = -1;
   5389 
   5390 	/* bind dring to the channel */
   5391 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
   5392 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
   5393 	    &dp->cookie[0], &dp->ncookies)) != 0) {
   5394 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
   5395 		    "%lld", ldcp->ldc_id);
   5396 		goto dring_fail_exit;
   5397 	}
   5398 
   5399 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
   5400 	dp->restart_reqd = B_TRUE;
   5401 
   5402 	/*
   5403 	 * Only ever create rings for outgoing lane. Link it onto
   5404 	 * end of list.
   5405 	 */
   5406 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
   5407 	if (ldcp->lane_out.dringp == NULL) {
   5408 		D2(vswp, "vsw_create_dring: adding first outbound ring");
   5409 		ldcp->lane_out.dringp = dp;
   5410 	} else {
   5411 		tp = ldcp->lane_out.dringp;
   5412 		while (tp->next != NULL)
   5413 			tp = tp->next;
   5414 
   5415 		tp->next = dp;
   5416 	}
   5417 	RW_EXIT(&ldcp->lane_out.dlistrw);
   5418 
   5419 	return (dp);
   5420 
   5421 dring_fail_exit:
   5422 	(void) ldc_mem_dring_destroy(dp->handle);
   5423 
   5424 create_fail_exit:
   5425 	if (dp->priv_addr != NULL) {
   5426 		priv_addr = dp->priv_addr;
   5427 		for (i = 0; i < vsw_ntxds; i++) {
   5428 			if (priv_addr->memhandle != NULL)
   5429 				(void) ldc_mem_free_handle(
   5430 				    priv_addr->memhandle);
   5431 			priv_addr++;
   5432 		}
   5433 		kmem_free(dp->priv_addr,
   5434 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
   5435 	}
   5436 	mutex_destroy(&dp->dlock);
   5437 
   5438 	kmem_free(dp, sizeof (dring_info_t));
   5439 	return (NULL);
   5440 }
   5441 
   5442 /*
   5443  * Create a ring consisting of just a private portion and link
   5444  * it into the list of rings for the outbound lane.
   5445  *
   5446  * These type of rings are used primarily for temporary data
   5447  * storage (i.e. as data buffers).
   5448  */
   5449 void
   5450 vsw_create_privring(vsw_ldc_t *ldcp)
   5451 {
   5452 	dring_info_t		*dp, *tp;
   5453 	vsw_t			*vswp = ldcp->ldc_vswp;
   5454 
   5455 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
   5456 
   5457 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
   5458 
   5459 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
   5460 
   5461 	/* no public section */
   5462 	dp->pub_addr = NULL;
   5463 
   5464 	dp->priv_addr = kmem_zalloc(
   5465 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
   5466 
   5467 	dp->num_descriptors = vsw_ntxds;
   5468 
   5469 	if (vsw_setup_ring(ldcp, dp)) {
   5470 		DERR(vswp, "%s: setup of ring failed", __func__);
   5471 		kmem_free(dp->priv_addr,
   5472 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
   5473 		mutex_destroy(&dp->dlock);
   5474 		kmem_free(dp, sizeof (dring_info_t));
   5475 		return;
   5476 	}
   5477 
   5478 	/* haven't used any descriptors yet */
   5479 	dp->end_idx = 0;
   5480 
   5481 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
   5482 	dp->restart_reqd = B_TRUE;
   5483 
   5484 	/*
   5485 	 * Only ever create rings for outgoing lane. Link it onto
   5486 	 * end of list.
   5487 	 */
   5488 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
   5489 	if (ldcp->lane_out.dringp == NULL) {
   5490 		D2(vswp, "%s: adding first outbound privring", __func__);
   5491 		ldcp->lane_out.dringp = dp;
   5492 	} else {
   5493 		tp = ldcp->lane_out.dringp;
   5494 		while (tp->next != NULL)
   5495 			tp = tp->next;
   5496 
   5497 		tp->next = dp;
   5498 	}
   5499 	RW_EXIT(&ldcp->lane_out.dlistrw);
   5500 
   5501 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
   5502 }
   5503 
   5504 /*
   5505  * Setup the descriptors in the dring. Returns 0 on success, 1 on
   5506  * failure.
   5507  */
   5508 int
   5509 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
   5510 {
   5511 	vnet_public_desc_t	*pub_addr = NULL;
   5512 	vsw_private_desc_t	*priv_addr = NULL;
   5513 	vsw_t			*vswp = ldcp->ldc_vswp;
   5514 	uint64_t		*tmpp;
   5515 	uint64_t		offset = 0;
   5516 	uint32_t		ncookies = 0;
   5517 	static char		*name = "vsw_setup_ring";
   5518 	int			i, j, nc, rv;
   5519 	size_t			data_sz;
   5520 	void			*data_addr;
   5521 
   5522 	priv_addr = dp->priv_addr;
   5523 	pub_addr = dp->pub_addr;
   5524 
   5525 	/* public section may be null but private should never be */
   5526 	ASSERT(priv_addr != NULL);
   5527 
   5528 	/*
   5529 	 * Allocate the region of memory which will be used to hold
   5530 	 * the data the descriptors will refer to.
   5531 	 */
   5532 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
   5533 
   5534 	/*
   5535 	 * In order to ensure that the number of ldc cookies per descriptor is
   5536 	 * limited to be within the default MAX_COOKIES (2), we take the steps
   5537 	 * outlined below:
   5538 	 *
   5539 	 * Align the entire data buffer area to 8K and carve out per descriptor
   5540 	 * data buffers starting from this 8K aligned base address.
   5541 	 *
   5542 	 * We round up the mtu specified to be a multiple of 2K or 4K.
   5543 	 * For sizes up to 12K we round up the size to the next 2K.
   5544 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
   5545 	 * 14K could end up needing 3 cookies, with the buffer spread across
   5546 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
   5547 	 */
   5548 	if (data_sz <= VNET_12K) {
   5549 		data_sz = VNET_ROUNDUP_2K(data_sz);
   5550 	} else {
   5551 		data_sz = VNET_ROUNDUP_4K(data_sz);
   5552 	}
   5553 
   5554 	dp->desc_data_sz = data_sz;
   5555 
   5556 	/* allocate extra 8K bytes for alignment */
   5557 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
   5558 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
   5559 	dp->data_addr = data_addr;
   5560 
   5561 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
   5562 	    dp->data_sz, dp->data_addr);
   5563 
   5564 	/* align the starting address of the data area to 8K */
   5565 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
   5566 
   5567 	tmpp = (uint64_t *)data_addr;
   5568 	offset = dp->desc_data_sz/sizeof (tmpp);
   5569 
   5570 	/*
   5571 	 * Initialise some of the private and public (if they exist)
   5572 	 * descriptor fields.
   5573 	 */
   5574 	for (i = 0; i < vsw_ntxds; i++) {
   5575 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
   5576 
   5577 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
   5578 		    &priv_addr->memhandle)) != 0) {
   5579 			DERR(vswp, "%s: alloc mem handle failed", name);
   5580 			goto setup_ring_cleanup;
   5581 		}
   5582 
   5583 		priv_addr->datap = (void *)tmpp;
   5584 
   5585 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
   5586 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
   5587 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
   5588 		    &(priv_addr->memcookie[0]), &ncookies);
   5589 		if (rv != 0) {
   5590 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
   5591 			    "(rv %d)", name, ldcp->ldc_id, rv);
   5592 			goto setup_ring_cleanup;
   5593 		}
   5594 		priv_addr->bound = 1;
   5595 
   5596 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
   5597 		    name, i, priv_addr->memcookie[0].addr,
   5598 		    priv_addr->memcookie[0].size);
   5599 
   5600 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
   5601 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
   5602 			    "invalid num of cookies (%d) for size 0x%llx",
   5603 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
   5604 
   5605 			goto setup_ring_cleanup;
   5606 		} else {
   5607 			for (j = 1; j < ncookies; j++) {
   5608 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
   5609 				    &(priv_addr->memcookie[j]));
   5610 				if (rv != 0) {
   5611 					DERR(vswp, "%s: ldc_mem_nextcookie "
   5612 					    "failed rv (%d)", name, rv);
   5613 					goto setup_ring_cleanup;
   5614 				}
   5615 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
   5616 				    "size 0x%llx", name, j,
   5617 				    priv_addr->memcookie[j].addr,
   5618 				    priv_addr->memcookie[j].size);
   5619 			}
   5620 
   5621 		}
   5622 		priv_addr->ncookies = ncookies;
   5623 		priv_addr->dstate = VIO_DESC_FREE;
   5624 
   5625 		if (pub_addr != NULL) {
   5626 
   5627 			/* link pub and private sides */
   5628 			priv_addr->descp = pub_addr;
   5629 
   5630 			pub_addr->ncookies = priv_addr->ncookies;
   5631 
   5632 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
   5633 				bcopy(&priv_addr->memcookie[nc],
   5634 				    &pub_addr->memcookie[nc],
   5635 				    sizeof (ldc_mem_cookie_t));
   5636 			}
   5637 
   5638 			pub_addr->hdr.dstate = VIO_DESC_FREE;
   5639 			pub_addr++;
   5640 		}
   5641 
   5642 		/*
   5643 		 * move to next element in the dring and the next
   5644 		 * position in the data buffer.
   5645 		 */
   5646 		priv_addr++;
   5647 		tmpp += offset;
   5648 	}
   5649 
   5650 	return (0);
   5651 
   5652 setup_ring_cleanup:
   5653 	priv_addr = dp->priv_addr;
   5654 
   5655 	for (j = 0; j < i; j++) {
   5656 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
   5657 		(void) ldc_mem_free_handle(priv_addr->memhandle);
   5658 
   5659 		mutex_destroy(&priv_addr->dstate_lock);
   5660 
   5661 		priv_addr++;
   5662 	}
   5663 	kmem_free(dp->data_addr, dp->data_sz);
   5664 
   5665 	return (1);
   5666 }
   5667 
   5668 /*
   5669  * Searches the private section of a ring for a free descriptor,
   5670  * starting at the location of the last free descriptor found
   5671  * previously.
   5672  *
   5673  * Returns 0 if free descriptor is available, and updates state
   5674  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
   5675  *
   5676  * FUTURE: might need to return contiguous range of descriptors
   5677  * as dring info msg assumes all will be contiguous.
   5678  */
   5679 static int
   5680 vsw_dring_find_free_desc(dring_info_t *dringp,
   5681 		vsw_private_desc_t **priv_p, int *idx)
   5682 {
   5683 	vsw_private_desc_t	*addr = NULL;
   5684 	int			num = vsw_ntxds;
   5685 	int			ret = 1;
   5686 
   5687 	D1(NULL, "%s enter\n", __func__);
   5688 
   5689 	ASSERT(dringp->priv_addr != NULL);
   5690 
   5691 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
   5692 	    __func__, dringp, dringp->end_idx);
   5693 
   5694 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
   5695 
   5696 	mutex_enter(&addr->dstate_lock);
   5697 	if (addr->dstate == VIO_DESC_FREE) {
   5698 		addr->dstate = VIO_DESC_READY;
   5699 		*priv_p = addr;
   5700 		*idx = dringp->end_idx;
   5701 		dringp->end_idx = (dringp->end_idx + 1) % num;
   5702 		ret = 0;
   5703 
   5704 	}
   5705 	mutex_exit(&addr->dstate_lock);
   5706 
   5707 	/* ring full */
   5708 	if (ret == 1) {
   5709 		D2(NULL, "%s: no desp free: started at %d", __func__,
   5710 		    dringp->end_idx);
   5711 	}
   5712 
   5713 	D1(NULL, "%s: exit\n", __func__);
   5714 
   5715 	return (ret);
   5716 }
   5717 
   5718 /*
   5719  * Map from a dring identifier to the ring itself. Returns
   5720  * pointer to ring or NULL if no match found.
   5721  *
   5722  * Should be called with dlistrw rwlock held as reader.
   5723  */
   5724 static dring_info_t *
   5725 vsw_ident2dring(lane_t *lane, uint64_t ident)
   5726 {
   5727 	dring_info_t	*dp = NULL;
   5728 
   5729 	if ((dp = lane->dringp) == NULL) {
   5730 		return (NULL);
   5731 	} else {
   5732 		if (dp->ident == ident)
   5733 			return (dp);
   5734 
   5735 		while (dp != NULL) {
   5736 			if (dp->ident == ident)
   5737 				break;
   5738 			dp = dp->next;
   5739 		}
   5740 	}
   5741 
   5742 	return (dp);
   5743 }
   5744 
   5745 /*
   5746  * Set the default lane attributes. These are copied into
   5747  * the attr msg we send to our peer. If they are not acceptable
   5748  * then (currently) the handshake ends.
   5749  */
   5750 static void
   5751 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
   5752 {
   5753 	bzero(lp, sizeof (lane_t));
   5754 
   5755 	READ_ENTER(&vswp->if_lockrw);
   5756 	ether_copy(&(vswp->if_addr), &(lp->addr));
   5757 	RW_EXIT(&vswp->if_lockrw);
   5758 
   5759 	lp->mtu = vswp->max_frame_size;
   5760 	lp->addr_type = ADDR_TYPE_MAC;
   5761 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
   5762 	lp->ack_freq = 0;	/* for shared mode */
   5763 	lp->seq_num = VNET_ISS;
   5764 }
   5765 
   5766 /*
   5767  * Verify that the attributes are acceptable.
   5768  *
   5769  * FUTURE: If some attributes are not acceptable, change them
   5770  * our desired values.
   5771  */
   5772 static int
   5773 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
   5774 {
   5775 	int			ret = 0;
   5776 	struct ether_addr	ea;
   5777 	vsw_port_t		*port = ldcp->ldc_port;
   5778 	lane_t			*lp = &ldcp->lane_out;
   5779 
   5780 	D1(NULL, "vsw_check_attr enter\n");
   5781 
   5782 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
   5783 	    (pkt->xfer_mode != lp->xfer_mode)) {
   5784 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
   5785 		ret = 1;
   5786 	}
   5787 
   5788 	/* Only support MAC addresses at moment. */
   5789 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
   5790 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
   5791 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
   5792 		ret = 1;
   5793 	}
   5794 
   5795 	/*
   5796 	 * MAC address supplied by device should match that stored
   5797 	 * in the vsw-port OBP node. Need to decide what to do if they
   5798 	 * don't match, for the moment just warn but don't fail.
   5799 	 */
   5800 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
   5801 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
   5802 		DERR(NULL, "vsw_check_attr: device supplied address "
   5803 		    "0x%llx doesn't match node address 0x%llx\n",
   5804 		    pkt->addr, port->p_macaddr);
   5805 	}
   5806 
   5807 	/*
   5808 	 * Ack freq only makes sense in pkt mode, in shared
   5809 	 * mode the ring descriptors say whether or not to
   5810 	 * send back an ACK.
   5811 	 */
   5812 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
   5813 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
   5814 	    (VSW_VER_LT(ldcp, 1, 2) &&
   5815 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
   5816 		if (pkt->ack_freq > 0) {
   5817 			D2(NULL, "vsw_check_attr: non zero ack freq "
   5818 			    " in SHM mode\n");
   5819 			ret = 1;
   5820 		}
   5821 	}
   5822 
   5823 	if (VSW_VER_LT(ldcp, 1, 4)) {
   5824 		/* versions < 1.4, mtu must match */
   5825 		if (pkt->mtu != lp->mtu) {
   5826 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
   5827 			    pkt->mtu);
   5828 			ret = 1;
   5829 		}
   5830 	} else {
   5831 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
   5832 		if (pkt->mtu < ETHERMAX) {
   5833 			ret = 1;
   5834 		}
   5835 	}
   5836 
   5837 	D1(NULL, "vsw_check_attr exit\n");
   5838 
   5839 	return (ret);
   5840 }
   5841 
   5842 /*
   5843  * Returns 1 if there is a problem, 0 otherwise.
   5844  */
   5845 static int
   5846 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
   5847 {
   5848 	_NOTE(ARGUNUSED(pkt))
   5849 
   5850 	int	ret = 0;
   5851 
   5852 	D1(NULL, "vsw_check_dring_info enter\n");
   5853 
   5854 	if ((pkt->num_descriptors == 0) ||
   5855 	    (pkt->descriptor_size == 0) ||
   5856 	    (pkt->ncookies != 1)) {
   5857 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
   5858 		ret = 1;
   5859 	}
   5860 
   5861 	D1(NULL, "vsw_check_dring_info exit\n");
   5862 
   5863 	return (ret);
   5864 }
   5865 
   5866 /*
   5867  * Returns 1 if two memory cookies match. Otherwise returns 0.
   5868  */
   5869 static int
   5870 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
   5871 {
   5872 	if ((m1->addr != m2->addr) ||
   5873 	    (m2->size != m2->size)) {
   5874 		return (0);
   5875 	} else {
   5876 		return (1);
   5877 	}
   5878 }
   5879 
   5880 /*
   5881  * Returns 1 if ring described in reg message matches that
   5882  * described by dring_info structure. Otherwise returns 0.
   5883  */
   5884 static int
   5885 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
   5886 {
   5887 	if ((msg->descriptor_size != dp->descriptor_size) ||
   5888 	    (msg->num_descriptors != dp->num_descriptors) ||
   5889 	    (msg->ncookies != dp->ncookies) ||
   5890 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
   5891 		return (0);
   5892 	} else {
   5893 		return (1);
   5894 	}
   5895 
   5896 }
   5897 
   5898 /*
   5899  * Reset and free all the resources associated with
   5900  * the channel.
   5901  */
   5902 static void
   5903 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
   5904 {
   5905 	dring_info_t		*dp, *dpp;
   5906 	lane_t			*lp = NULL;
   5907 
   5908 	ASSERT(ldcp != NULL);
   5909 
   5910 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
   5911 
   5912 	if (dir == INBOUND) {
   5913 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
   5914 		    " of channel %lld", __func__, ldcp->ldc_id);
   5915 		lp = &ldcp->lane_in;
   5916 	} else {
   5917 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
   5918 		    " of channel %lld", __func__, ldcp->ldc_id);
   5919 		lp = &ldcp->lane_out;
   5920 	}
   5921 
   5922 	lp->lstate = VSW_LANE_INACTIV;
   5923 	lp->seq_num = VNET_ISS;
   5924 
   5925 	if (lp->dringp) {
   5926 		if (dir == INBOUND) {
   5927 			WRITE_ENTER(&lp->dlistrw);
   5928 			dp = lp->dringp;
   5929 			while (dp != NULL) {
   5930 				dpp = dp->next;
   5931 				if (dp->handle != NULL)
   5932 					(void) ldc_mem_dring_unmap(dp->handle);
   5933 				kmem_free(dp, sizeof (dring_info_t));
   5934 				dp = dpp;
   5935 			}
   5936 			RW_EXIT(&lp->dlistrw);
   5937 		} else {
   5938 			/*
   5939 			 * unbind, destroy exported dring, free dring struct
   5940 			 */
   5941 			WRITE_ENTER(&lp->dlistrw);
   5942 			dp = lp->dringp;
   5943 			vsw_free_ring(dp);
   5944 			RW_EXIT(&lp->dlistrw);
   5945 		}
   5946 		lp->dringp = NULL;
   5947 	}
   5948 
   5949 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
   5950 }
   5951 
   5952 /*
   5953  * Free ring and all associated resources.
   5954  *
   5955  * Should be called with dlistrw rwlock held as writer.
   5956  */
   5957 static void
   5958 vsw_free_ring(dring_info_t *dp)
   5959 {
   5960 	vsw_private_desc_t	*paddr = NULL;
   5961 	dring_info_t		*dpp;
   5962 	int			i;
   5963 
   5964 	while (dp != NULL) {
   5965 		mutex_enter(&dp->dlock);
   5966 		dpp = dp->next;
   5967 		if (dp->priv_addr != NULL) {
   5968 			/*
   5969 			 * First unbind and free the memory handles
   5970 			 * stored in each descriptor within the ring.
   5971 			 */
   5972 			for (i = 0; i < vsw_ntxds; i++) {
   5973 				paddr = (vsw_private_desc_t *)
   5974 				    dp->priv_addr + i;
   5975 				if (paddr->memhandle != NULL) {
   5976 					if (paddr->bound == 1) {
   5977 						if (ldc_mem_unbind_handle(
   5978 						    paddr->memhandle) != 0) {
   5979 							DERR(NULL, "error "
   5980 							"unbinding handle for "
   5981 							"ring 0x%llx at pos %d",
   5982 							    dp, i);
   5983 							continue;
   5984 						}
   5985 						paddr->bound = 0;
   5986 					}
   5987 
   5988 					if (ldc_mem_free_handle(
   5989 					    paddr->memhandle) != 0) {
   5990 						DERR(NULL, "error freeing "
   5991 						    "handle for ring 0x%llx "
   5992 						    "at pos %d", dp, i);
   5993 						continue;
   5994 					}
   5995 					paddr->memhandle = NULL;
   5996 				}
   5997 				mutex_destroy(&paddr->dstate_lock);
   5998 			}
   5999 			kmem_free(dp->priv_addr,
   6000 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
   6001 		}
   6002 
   6003 		/*
   6004 		 * Now unbind and destroy the ring itself.
   6005 		 */
   6006 		if (dp->handle != NULL) {
   6007 			(void) ldc_mem_dring_unbind(dp->handle);
   6008 			(void) ldc_mem_dring_destroy(dp->handle);
   6009 		}
   6010 
   6011 		if (dp->data_addr != NULL) {
   6012 			kmem_free(dp->data_addr, dp->data_sz);
   6013 		}
   6014 
   6015 		mutex_exit(&dp->dlock);
   6016 		mutex_destroy(&dp->dlock);
   6017 		mutex_destroy(&dp->restart_lock);
   6018 		kmem_free(dp, sizeof (dring_info_t));
   6019 
   6020 		dp = dpp;
   6021 	}
   6022 }
   6023 
   6024 /*
   6025  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
   6026  * This thread is woken up by the LDC interrupt handler to process
   6027  * LDC packets and receive data.
   6028  */
   6029 static void
   6030 vsw_ldc_rx_worker(void *arg)
   6031 {
   6032 	callb_cpr_t	cprinfo;
   6033 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
   6034 	vsw_t *vswp = ldcp->ldc_vswp;
   6035 
   6036 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
   6037 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
   6038 	    "vsw_rx_thread");
   6039 	mutex_enter(&ldcp->rx_thr_lock);
   6040 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
   6041 
   6042 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   6043 		/*
   6044 		 * Wait until the data is received or a stop
   6045 		 * request is received.
   6046 		 */
   6047 		while (!(ldcp->rx_thr_flags &
   6048 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
   6049 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
   6050 		}
   6051 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
   6052 
   6053 		/*
   6054 		 * First process the stop request.
   6055 		 */
   6056 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
   6057 			D2(vswp, "%s(%lld):Rx thread stopped\n",
   6058 			    __func__, ldcp->ldc_id);
   6059 			break;
   6060 		}
   6061 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
   6062 		mutex_exit(&ldcp->rx_thr_lock);
   6063 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
   6064 		    __func__, ldcp->ldc_id);
   6065 		mutex_enter(&ldcp->ldc_cblock);
   6066 		vsw_process_pkt(ldcp);
   6067 		mutex_exit(&ldcp->ldc_cblock);
   6068 		mutex_enter(&ldcp->rx_thr_lock);
   6069 	}
   6070 
   6071 	/*
   6072 	 * Update the run status and wakeup the thread that
   6073 	 * has sent the stop request.
   6074 	 */
   6075 	ldcp->rx_thr_flags &= ~VSW_WTHR_STOP;
   6076 	ldcp->rx_thread = NULL;
   6077 	CALLB_CPR_EXIT(&cprinfo);
   6078 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
   6079 	thread_exit();
   6080 }
   6081 
   6082 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
   6083 static void
   6084 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
   6085 {
   6086 	kt_did_t	tid = 0;
   6087 	vsw_t		*vswp = ldcp->ldc_vswp;
   6088 
   6089 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
   6090 	/*
   6091 	 * Send a stop request by setting the stop flag and
   6092 	 * wait until the receive thread stops.
   6093 	 */
   6094 	mutex_enter(&ldcp->rx_thr_lock);
   6095 	if (ldcp->rx_thread != NULL) {
   6096 		tid = ldcp->rx_thread->t_did;
   6097 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
   6098 		cv_signal(&ldcp->rx_thr_cv);
   6099 	}
   6100 	mutex_exit(&ldcp->rx_thr_lock);
   6101 
   6102 	if (tid != 0) {
   6103 		thread_join(tid);
   6104 	}
   6105 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
   6106 }
   6107 
   6108 /*
   6109  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
   6110  * This thread is woken up by the vsw_portsend to transmit
   6111  * packets.
   6112  */
   6113 static void
   6114 vsw_ldc_tx_worker(void *arg)
   6115 {
   6116 	callb_cpr_t	cprinfo;
   6117 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
   6118 	vsw_t *vswp = ldcp->ldc_vswp;
   6119 	mblk_t *mp;
   6120 	mblk_t *tmp;
   6121 
   6122 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
   6123 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
   6124 	    "vnet_tx_thread");
   6125 	mutex_enter(&ldcp->tx_thr_lock);
   6126 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
   6127 
   6128 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   6129 		/*
   6130 		 * Wait until the data is received or a stop
   6131 		 * request is received.
   6132 		 */
   6133 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
   6134 		    (ldcp->tx_mhead == NULL)) {
   6135 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
   6136 		}
   6137 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
   6138 
   6139 		/*
   6140 		 * First process the stop request.
   6141 		 */
   6142 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
   6143 			D2(vswp, "%s(%lld):tx thread stopped\n",
   6144 			    __func__, ldcp->ldc_id);
   6145 			break;
   6146 		}
   6147 		mp = ldcp->tx_mhead;
   6148 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
   6149 		ldcp->tx_cnt = 0;
   6150 		mutex_exit(&ldcp->tx_thr_lock);
   6151 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
   6152 		    __func__, ldcp->ldc_id);
   6153 		while (mp != NULL) {
   6154 			tmp = mp->b_next;
   6155 			mp->b_next = mp->b_prev = NULL;
   6156 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
   6157 			mp = tmp;
   6158 		}
   6159 		mutex_enter(&ldcp->tx_thr_lock);
   6160 	}
   6161 
   6162 	/*
   6163 	 * Update the run status and wakeup the thread that
   6164 	 * has sent the stop request.
   6165 	 */
   6166 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
   6167 	ldcp->tx_thread = NULL;
   6168 	CALLB_CPR_EXIT(&cprinfo);
   6169 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
   6170 	thread_exit();
   6171 }
   6172 
   6173 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
   6174 static void
   6175 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
   6176 {
   6177 	kt_did_t	tid = 0;
   6178 	vsw_t		*vswp = ldcp->ldc_vswp;
   6179 
   6180 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
   6181 	/*
   6182 	 * Send a stop request by setting the stop flag and
   6183 	 * wait until the receive thread stops.
   6184 	 */
   6185 	mutex_enter(&ldcp->tx_thr_lock);
   6186 	if (ldcp->tx_thread != NULL) {
   6187 		tid = ldcp->tx_thread->t_did;
   6188 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
   6189 		cv_signal(&ldcp->tx_thr_cv);
   6190 	}
   6191 	mutex_exit(&ldcp->tx_thr_lock);
   6192 
   6193 	if (tid != 0) {
   6194 		thread_join(tid);
   6195 	}
   6196 
   6197 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
   6198 }
   6199 
   6200 /* vsw_reclaim_dring -- reclaim descriptors */
   6201 static int
   6202 vsw_reclaim_dring(dring_info_t *dp, int start)
   6203 {
   6204 	int i, j, len;
   6205 	vsw_private_desc_t *priv_addr;
   6206 	vnet_public_desc_t *pub_addr;
   6207 
   6208 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
   6209 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
   6210 	len = dp->num_descriptors;
   6211 
   6212 	D2(NULL, "%s: start index %ld\n", __func__, start);
   6213 
   6214 	j = 0;
   6215 	for (i = start; j < len; i = (i + 1) % len, j++) {
   6216 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
   6217 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
   6218 
   6219 		mutex_enter(&priv_addr->dstate_lock);
   6220 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
   6221 			mutex_exit(&priv_addr->dstate_lock);
   6222 			break;
   6223 		}
   6224 		pub_addr->hdr.dstate = VIO_DESC_FREE;
   6225 		priv_addr->dstate = VIO_DESC_FREE;
   6226 		/* clear all the fields */
   6227 		priv_addr->datalen = 0;
   6228 		pub_addr->hdr.ack = 0;
   6229 		mutex_exit(&priv_addr->dstate_lock);
   6230 
   6231 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
   6232 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
   6233 	}
   6234 	return (j);
   6235 }
   6236 
   6237 /*
   6238  * Debugging routines
   6239  */
   6240 static void
   6241 display_state(void)
   6242 {
   6243 	vsw_t		*vswp;
   6244 	vsw_port_list_t	*plist;
   6245 	vsw_port_t 	*port;
   6246 	vsw_ldc_list_t	*ldcl;
   6247 	vsw_ldc_t 	*ldcp;
   6248 	extern vsw_t 	*vsw_head;
   6249 
   6250 	cmn_err(CE_NOTE, "***** system state *****");
   6251 
   6252 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
   6253 		plist = &vswp->plist;
   6254 		READ_ENTER(&plist->lockrw);
   6255 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
   6256 		    vswp->instance, plist->num_ports);
   6257 
   6258 		for (port = plist->head; port != NULL; port = port->p_next) {
   6259 			ldcl = &port->p_ldclist;
   6260 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
   6261 			    port->p_instance, port->num_ldcs);
   6262 			READ_ENTER(&ldcl->lockrw);
   6263 			ldcp = ldcl->head;
   6264 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
   6265 				cmn_err(CE_CONT, "chan %lu : dev %d : "
   6266 				    "status %d : phase %u\n",
   6267 				    ldcp->ldc_id, ldcp->dev_class,
   6268 				    ldcp->ldc_status, ldcp->hphase);
   6269 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
   6270 				    "psession %lu\n", ldcp->ldc_id,
   6271 				    ldcp->local_session, ldcp->peer_session);
   6272 
   6273 				cmn_err(CE_CONT, "Inbound lane:\n");
   6274 				display_lane(&ldcp->lane_in);
   6275 				cmn_err(CE_CONT, "Outbound lane:\n");
   6276 				display_lane(&ldcp->lane_out);
   6277 			}
   6278 			RW_EXIT(&ldcl->lockrw);
   6279 		}
   6280 		RW_EXIT(&plist->lockrw);
   6281 	}
   6282 	cmn_err(CE_NOTE, "***** system state *****");
   6283 }
   6284 
   6285 static void
   6286 display_lane(lane_t *lp)
   6287 {
   6288 	dring_info_t	*drp;
   6289 
   6290 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
   6291 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
   6292 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
   6293 	    lp->addr_type, lp->addr, lp->xfer_mode);
   6294 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
   6295 
   6296 	cmn_err(CE_CONT, "Dring info:\n");
   6297 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
   6298 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
   6299 		    drp->num_descriptors, drp->descriptor_size);
   6300 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
   6301 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
   6302 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
   6303 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
   6304 		    drp->ident, drp->end_idx);
   6305 		display_ring(drp);
   6306 	}
   6307 }
   6308 
   6309 static void
   6310 display_ring(dring_info_t *dringp)
   6311 {
   6312 	uint64_t		i;
   6313 	uint64_t		priv_count = 0;
   6314 	uint64_t		pub_count = 0;
   6315 	vnet_public_desc_t	*pub_addr = NULL;
   6316 	vsw_private_desc_t	*priv_addr = NULL;
   6317 
   6318 	for (i = 0; i < vsw_ntxds; i++) {
   6319 		if (dringp->pub_addr != NULL) {
   6320 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
   6321 
   6322 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
   6323 				pub_count++;
   6324 		}
   6325 
   6326 		if (dringp->priv_addr != NULL) {
   6327 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
   6328 
   6329 			if (priv_addr->dstate == VIO_DESC_FREE)
   6330 				priv_count++;
   6331 		}
   6332 	}
   6333 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
   6334 	    i, priv_count, pub_count);
   6335 }
   6336 
   6337 static void
   6338 dump_flags(uint64_t state)
   6339 {
   6340 	int	i;
   6341 
   6342 	typedef struct flag_name {
   6343 		int	flag_val;
   6344 		char	*flag_name;
   6345 	} flag_name_t;
   6346 
   6347 	flag_name_t	flags[] = {
   6348 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
   6349 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
   6350 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
   6351 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
   6352 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
   6353 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
   6354 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
   6355 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
   6356 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
   6357 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
   6358 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
   6359 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
   6360 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
   6361 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
   6362 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
   6363 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
   6364 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
   6365 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
   6366 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
   6367 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
   6368 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
   6369 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
   6370 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
   6371 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
   6372 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
   6373 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
   6374 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
   6375 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
   6376 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
   6377 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
   6378 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
   6379 
   6380 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
   6381 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
   6382 		if (state & flags[i].flag_val)
   6383 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
   6384 	}
   6385 }
   6386