Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #ifdef DEBUG
     28 #define	XNB_DEBUG 1
     29 #endif /* DEBUG */
     30 
     31 #include "xnb.h"
     32 
     33 #include <sys/sunddi.h>
     34 #include <sys/sunndi.h>
     35 #include <sys/modctl.h>
     36 #include <sys/conf.h>
     37 #include <sys/mac.h>
     38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
     39 #include <sys/dlpi.h>
     40 #include <sys/strsubr.h>
     41 #include <sys/strsun.h>
     42 #include <sys/types.h>
     43 #include <sys/pattr.h>
     44 #include <vm/seg_kmem.h>
     45 #include <vm/hat_i86.h>
     46 #include <xen/sys/xenbus_impl.h>
     47 #include <xen/sys/xendev.h>
     48 #include <sys/balloon_impl.h>
     49 #include <sys/evtchn_impl.h>
     50 #include <sys/gnttab.h>
     51 #include <vm/vm_dep.h>
     52 #include <sys/note.h>
     53 #include <sys/gld.h>
     54 #include <inet/ip.h>
     55 #include <inet/ip_impl.h>
     56 
     57 /*
     58  * The terms "transmit" and "receive" are used in alignment with domU,
     59  * which means that packets originating from the peer domU are "transmitted"
     60  * to other parts of the system and packets are "received" from them.
     61  */
     62 
/*
 * Should we allow guests to manipulate multicast group membership?
 *
 * The value is advertised to the peer by xnb_attach() through the
 * "feature-multicast-control" xenstore node.
 */
static boolean_t	xnb_multicast_control = B_TRUE;

/* Ring connection management and xenbus/hotplug event callbacks. */
static boolean_t	xnb_connect_rings(dev_info_t *);
static void		xnb_disconnect_rings(dev_info_t *);
static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

/* Transmit buffer cache callbacks and transmit ring helpers. */
static int	xnb_txbuf_constructor(void *, void *, int);
static void	xnb_txbuf_destructor(void *, void *);
static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);

/* Entry points (non-static) for handing packets to the peer. */
mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);

/* Small helpers used on the copy path; both are requested inline. */
static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
    size_t, size_t, size_t, grant_ref_t);
#pragma inline(setup_gop)
static boolean_t	is_foreign(void *);
#pragma inline(is_foreign)

/* Sentinel values meaning "no grant handle" / "no grant reference". */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/* Serialises access to the shared page batch in xnb_alloc_page(). */
static kmutex_t	xnb_alloc_page_lock;
     93 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here:
 * the address is shifted right by PAGESHIFT in its own type.
 */
#define	xnb_btop(addr)	((addr) >> PAGESHIFT)

/*
 * DMA attributes for transmit and receive data.
 *
 * Buffers are constrained to a single segment aligned on MMU_PAGESIZE;
 * the address range is otherwise unrestricted.
 * NOTE(review): no user of this structure is visible in this part of
 * the file.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/*
 * DMA access attributes for data: NOT to be byte swapped, and accessed
 * with strict ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_NEVERSWAP_ACC,	/* never byte swap */
	DDI_STRICTORDER_ACC	/* strictly ordered accesses */
};
    124 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats.  xnb_ks_init() uses this table
 * both to size the kstat and to label each entry; xnb_ks_aux_update()
 * fills the values in positionally, so the order of the names here must
 * match the order of the assignments there.
 */
static char *aux_statistics[] = {
	"rx_cksum_deferred",
	"tx_cksum_no_need",
	"rx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"rx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"rx_pageboundary_crossed",
	"rx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
};
    153 
    154 static int
    155 xnb_ks_aux_update(kstat_t *ksp, int flag)
    156 {
    157 	xnb_t *xnbp;
    158 	kstat_named_t *knp;
    159 
    160 	if (flag != KSTAT_READ)
    161 		return (EACCES);
    162 
    163 	xnbp = ksp->ks_private;
    164 	knp = ksp->ks_data;
    165 
    166 	/*
    167 	 * Assignment order should match that of the names in
    168 	 * aux_statistics.
    169 	 */
    170 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
    171 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
    172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
    173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
    174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
    175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
    176 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
    177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
    178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
    179 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
    180 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
    181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
    182 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
    183 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
    184 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
    185 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
    186 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
    187 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
    188 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
    189 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
    190 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
    191 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
    192 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
    193 
    194 	return (0);
    195 }
    196 
    197 static boolean_t
    198 xnb_ks_init(xnb_t *xnbp)
    199 {
    200 	int nstat = sizeof (aux_statistics) /
    201 	    sizeof (aux_statistics[0]);
    202 	char **cp = aux_statistics;
    203 	kstat_named_t *knp;
    204 
    205 	/*
    206 	 * Create and initialise kstats.
    207 	 */
    208 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
    209 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
    210 	    KSTAT_TYPE_NAMED, nstat, 0);
    211 	if (xnbp->xnb_kstat_aux == NULL)
    212 		return (B_FALSE);
    213 
    214 	xnbp->xnb_kstat_aux->ks_private = xnbp;
    215 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
    216 
    217 	knp = xnbp->xnb_kstat_aux->ks_data;
    218 	while (nstat > 0) {
    219 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
    220 
    221 		knp++;
    222 		cp++;
    223 		nstat--;
    224 	}
    225 
    226 	kstat_install(xnbp->xnb_kstat_aux);
    227 
    228 	return (B_TRUE);
    229 }
    230 
    231 static void
    232 xnb_ks_free(xnb_t *xnbp)
    233 {
    234 	kstat_delete(xnbp->xnb_kstat_aux);
    235 }
    236 
    237 /*
    238  * Calculate and insert the transport checksum for an arbitrary packet.
    239  */
    240 static mblk_t *
    241 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
    242 {
    243 	_NOTE(ARGUNUSED(xnbp));
    244 
    245 	/*
    246 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
    247 	 * because it doesn't cover all of the interesting cases :-(
    248 	 */
    249 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
    250 	    HCK_FULLCKSUM, KM_NOSLEEP);
    251 
    252 	return (mac_fix_cksum(mp));
    253 }
    254 
    255 mblk_t *
    256 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
    257 {
    258 	struct ether_header *ehp;
    259 	uint16_t sap;
    260 	uint32_t offset;
    261 	ipha_t *ipha;
    262 
    263 	ASSERT(mp->b_next == NULL);
    264 
    265 	/*
    266 	 * Check that the packet is contained in a single mblk.  In
    267 	 * the "from peer" path this is true today, but may change
    268 	 * when scatter gather support is added.  In the "to peer"
    269 	 * path we cannot be sure, but in most cases it will be true
    270 	 * (in the xnbo case the packet has come from a MAC device
    271 	 * which is unlikely to split packets).
    272 	 */
    273 	if (mp->b_cont != NULL)
    274 		goto software;
    275 
    276 	/*
    277 	 * If the MAC has no hardware capability don't do any further
    278 	 * checking.
    279 	 */
    280 	if (capab == 0)
    281 		goto software;
    282 
    283 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
    284 	ehp = (struct ether_header *)mp->b_rptr;
    285 
    286 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
    287 		struct ether_vlan_header *evhp;
    288 
    289 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
    290 		evhp = (struct ether_vlan_header *)mp->b_rptr;
    291 		sap = ntohs(evhp->ether_type);
    292 		offset = sizeof (struct ether_vlan_header);
    293 	} else {
    294 		sap = ntohs(ehp->ether_type);
    295 		offset = sizeof (struct ether_header);
    296 	}
    297 
    298 	/*
    299 	 * We only attempt to do IPv4 packets in hardware.
    300 	 */
    301 	if (sap != ETHERTYPE_IP)
    302 		goto software;
    303 
    304 	/*
    305 	 * We know that this is an IPv4 packet.
    306 	 */
    307 	ipha = (ipha_t *)(mp->b_rptr + offset);
    308 
    309 	switch (ipha->ipha_protocol) {
    310 	case IPPROTO_TCP:
    311 	case IPPROTO_UDP: {
    312 		uint32_t start, length, stuff, cksum;
    313 		uint16_t *stuffp;
    314 
    315 		/*
    316 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
    317 		 * can use full IPv4 and partial checksum offload.
    318 		 */
    319 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
    320 			break;
    321 
    322 		start = IP_SIMPLE_HDR_LENGTH;
    323 		length = ntohs(ipha->ipha_length);
    324 		if (ipha->ipha_protocol == IPPROTO_TCP) {
    325 			stuff = start + TCP_CHECKSUM_OFFSET;
    326 			cksum = IP_TCP_CSUM_COMP;
    327 		} else {
    328 			stuff = start + UDP_CHECKSUM_OFFSET;
    329 			cksum = IP_UDP_CSUM_COMP;
    330 		}
    331 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
    332 
    333 		if (capab & HCKSUM_INET_FULL_V4) {
    334 			/*
    335 			 * Some devices require that the checksum
    336 			 * field of the packet is zero for full
    337 			 * offload.
    338 			 */
    339 			*stuffp = 0;
    340 
    341 			(void) hcksum_assoc(mp, NULL, NULL,
    342 			    0, 0, 0, 0,
    343 			    HCK_FULLCKSUM, KM_NOSLEEP);
    344 
    345 			xnbp->xnb_stat_csum_hardware++;
    346 
    347 			return (mp);
    348 		}
    349 
    350 		if (capab & HCKSUM_INET_PARTIAL) {
    351 			if (*stuffp == 0) {
    352 				ipaddr_t src, dst;
    353 
    354 				/*
    355 				 * Older Solaris guests don't insert
    356 				 * the pseudo-header checksum, so we
    357 				 * calculate it here.
    358 				 */
    359 				src = ipha->ipha_src;
    360 				dst = ipha->ipha_dst;
    361 
    362 				cksum += (dst >> 16) + (dst & 0xFFFF);
    363 				cksum += (src >> 16) + (src & 0xFFFF);
    364 				cksum += length - IP_SIMPLE_HDR_LENGTH;
    365 
    366 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    367 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    368 
    369 				ASSERT(cksum <= 0xFFFF);
    370 
    371 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
    372 			}
    373 
    374 			(void) hcksum_assoc(mp, NULL, NULL,
    375 			    start, stuff, length, 0,
    376 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
    377 
    378 			xnbp->xnb_stat_csum_hardware++;
    379 
    380 			return (mp);
    381 		}
    382 
    383 		/* NOTREACHED */
    384 		break;
    385 	}
    386 
    387 	default:
    388 		/* Use software. */
    389 		break;
    390 	}
    391 
    392 software:
    393 	/*
    394 	 * We are not able to use any offload so do the whole thing in
    395 	 * software.
    396 	 */
    397 	xnbp->xnb_stat_csum_software++;
    398 
    399 	return (xnb_software_csum(xnbp, mp));
    400 }
    401 
    402 int
    403 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
    404 {
    405 	xnb_t *xnbp;
    406 	char *xsname;
    407 	char cachename[32];
    408 
    409 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
    410 
    411 	xnbp->xnb_flavour = flavour;
    412 	xnbp->xnb_flavour_data = flavour_data;
    413 	xnbp->xnb_devinfo = dip;
    414 	xnbp->xnb_evtchn = INVALID_EVTCHN;
    415 	xnbp->xnb_irq = B_FALSE;
    416 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
    417 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
    418 	xnbp->xnb_connected = B_FALSE;
    419 	xnbp->xnb_hotplugged = B_FALSE;
    420 	xnbp->xnb_detachable = B_FALSE;
    421 	xnbp->xnb_peer = xvdi_get_oeid(dip);
    422 	xnbp->xnb_be_status = XNB_STATE_INIT;
    423 	xnbp->xnb_fe_status = XNB_STATE_INIT;
    424 
    425 	xnbp->xnb_tx_buf_count = 0;
    426 
    427 	xnbp->xnb_rx_hv_copy = B_FALSE;
    428 	xnbp->xnb_multicast_control = B_FALSE;
    429 
    430 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
    431 	ASSERT(xnbp->xnb_rx_va != NULL);
    432 
    433 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
    434 	    != DDI_SUCCESS)
    435 		goto failure;
    436 
    437 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
    438 	xnbp->xnb_rx_cpop = NULL;
    439 	xnbp->xnb_rx_cpop_count = 0;
    440 
    441 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
    442 	    xnbp->xnb_icookie);
    443 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
    444 	    xnbp->xnb_icookie);
    445 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
    446 	    xnbp->xnb_icookie);
    447 
    448 	/* Set driver private pointer now. */
    449 	ddi_set_driver_private(dip, xnbp);
    450 
    451 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
    452 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
    453 	    sizeof (xnb_txbuf_t), 0,
    454 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
    455 	    NULL, xnbp, NULL, 0);
    456 	if (xnbp->xnb_tx_buf_cache == NULL)
    457 		goto failure_0;
    458 
    459 	if (!xnb_ks_init(xnbp))
    460 		goto failure_1;
    461 
    462 	/*
    463 	 * Receive notification of changes in the state of the
    464 	 * driver in the guest domain.
    465 	 */
    466 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
    467 	    NULL) != DDI_SUCCESS)
    468 		goto failure_2;
    469 
    470 	/*
    471 	 * Receive notification of hotplug events.
    472 	 */
    473 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
    474 	    NULL) != DDI_SUCCESS)
    475 		goto failure_2;
    476 
    477 	xsname = xvdi_get_xsname(dip);
    478 
    479 	if (xenbus_printf(XBT_NULL, xsname,
    480 	    "feature-multicast-control", "%d",
    481 	    xnb_multicast_control ? 1 : 0) != 0)
    482 		goto failure_3;
    483 
    484 	if (xenbus_printf(XBT_NULL, xsname,
    485 	    "feature-rx-copy", "%d",  1) != 0)
    486 		goto failure_3;
    487 	/*
    488 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
    489 	 * in addition to "feature-rx-copy" being 1. It seems strange
    490 	 * to use four possible states to describe a binary decision,
    491 	 * but we might as well play nice.
    492 	 */
    493 	if (xenbus_printf(XBT_NULL, xsname,
    494 	    "feature-rx-flip", "%d", 0) != 0)
    495 		goto failure_3;
    496 
    497 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
    498 	(void) xvdi_post_event(dip, XEN_HP_ADD);
    499 
    500 	return (DDI_SUCCESS);
    501 
    502 failure_3:
    503 	xvdi_remove_event_handler(dip, NULL);
    504 
    505 failure_2:
    506 	xnb_ks_free(xnbp);
    507 
    508 failure_1:
    509 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    510 
    511 failure_0:
    512 	mutex_destroy(&xnbp->xnb_state_lock);
    513 	mutex_destroy(&xnbp->xnb_rx_lock);
    514 	mutex_destroy(&xnbp->xnb_tx_lock);
    515 
    516 failure:
    517 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    518 	kmem_free(xnbp, sizeof (*xnbp));
    519 	return (DDI_FAILURE);
    520 }
    521 
    522 void
    523 xnb_detach(dev_info_t *dip)
    524 {
    525 	xnb_t *xnbp = ddi_get_driver_private(dip);
    526 
    527 	ASSERT(xnbp != NULL);
    528 	ASSERT(!xnbp->xnb_connected);
    529 	ASSERT(xnbp->xnb_tx_buf_count == 0);
    530 
    531 	xnb_disconnect_rings(dip);
    532 
    533 	xvdi_remove_event_handler(dip, NULL);
    534 
    535 	xnb_ks_free(xnbp);
    536 
    537 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    538 
    539 	ddi_set_driver_private(dip, NULL);
    540 
    541 	mutex_destroy(&xnbp->xnb_state_lock);
    542 	mutex_destroy(&xnbp->xnb_rx_lock);
    543 	mutex_destroy(&xnbp->xnb_tx_lock);
    544 
    545 	if (xnbp->xnb_rx_cpop_count > 0)
    546 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
    547 		    * xnbp->xnb_rx_cpop_count);
    548 
    549 	ASSERT(xnbp->xnb_rx_va != NULL);
    550 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    551 
    552 	kmem_free(xnbp, sizeof (*xnbp));
    553 }
    554 
    555 /*
    556  * Allocate a page from the hypervisor to be flipped to the peer.
    557  *
    558  * Try to get pages in batches to reduce the overhead of calls into
    559  * the balloon driver.
    560  */
    561 static mfn_t
    562 xnb_alloc_page(xnb_t *xnbp)
    563 {
    564 #define	WARNING_RATE_LIMIT 100
    565 #define	BATCH_SIZE 256
    566 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
    567 	static int nth = BATCH_SIZE;
    568 	mfn_t mfn;
    569 
    570 	mutex_enter(&xnb_alloc_page_lock);
    571 	if (nth == BATCH_SIZE) {
    572 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
    573 			xnbp->xnb_stat_allocation_failure++;
    574 			mutex_exit(&xnb_alloc_page_lock);
    575 
    576 			/*
    577 			 * Try for a single page in low memory situations.
    578 			 */
    579 			if (balloon_alloc_pages(1, &mfn) != 1) {
    580 				if ((xnbp->xnb_stat_small_allocation_failure++
    581 				    % WARNING_RATE_LIMIT) == 0)
    582 					cmn_err(CE_WARN, "xnb_alloc_page: "
    583 					    "Cannot allocate memory to "
    584 					    "transfer packets to peer.");
    585 				return (0);
    586 			} else {
    587 				xnbp->xnb_stat_small_allocation_success++;
    588 				return (mfn);
    589 			}
    590 		}
    591 
    592 		nth = 0;
    593 		xnbp->xnb_stat_allocation_success++;
    594 	}
    595 
    596 	mfn = mfns[nth++];
    597 	mutex_exit(&xnb_alloc_page_lock);
    598 
    599 	ASSERT(mfn != 0);
    600 
    601 	return (mfn);
    602 #undef BATCH_SIZE
    603 #undef WARNING_RATE_LIMIT
    604 }
    605 
    606 /*
    607  * Free a page back to the hypervisor.
    608  *
    609  * This happens only in the error path, so batching is not worth the
    610  * complication.
    611  */
    612 static void
    613 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
    614 {
    615 	_NOTE(ARGUNUSED(xnbp));
    616 	int r;
    617 	pfn_t pfn;
    618 
    619 	pfn = xen_assign_pfn(mfn);
    620 	pfnzero(pfn, 0, PAGESIZE);
    621 	xen_release_pfn(pfn);
    622 
    623 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
    624 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
    625 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
    626 		    r, mfn);
    627 	}
    628 }
    629 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring) but using
 * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
 *
 * The expansion refers to two variables that are NOT macro arguments
 * and must be in scope where the macro is used: 'loop' (the caller's
 * working request index) and 'prod' (the caller's working response
 * index).
 *
 * Evaluates to the smaller of (req_prod - loop) and
 * (RING_SIZE(_r) - (loop - prod)), so it is non-zero exactly when both
 * quantities are non-zero.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
    639 
    640 /*
    641  * Pass packets to the peer using page flipping.
    642  */
    643 mblk_t *
    644 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
    645 {
    646 	mblk_t *free = mp, *prev = NULL;
    647 	size_t len;
    648 	gnttab_transfer_t *gop;
    649 	boolean_t notify;
    650 	RING_IDX loop, prod, end;
    651 
    652 	/*
    653 	 * For each packet the sequence of operations is:
    654 	 *
    655 	 * 1. get a new page from the hypervisor.
    656 	 * 2. get a request slot from the ring.
    657 	 * 3. copy the data into the new page.
    658 	 * 4. transfer the page to the peer.
    659 	 * 5. update the request slot.
    660 	 * 6. kick the peer.
    661 	 * 7. free mp.
    662 	 *
    663 	 * In order to reduce the number of hypercalls, we prepare
    664 	 * several packets for the peer and perform a single hypercall
    665 	 * to transfer them.
    666 	 */
    667 
    668 	mutex_enter(&xnbp->xnb_rx_lock);
    669 
    670 	/*
    671 	 * If we are not connected to the peer or have not yet
    672 	 * finished hotplug it is too early to pass packets to the
    673 	 * peer.
    674 	 */
    675 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
    676 		mutex_exit(&xnbp->xnb_rx_lock);
    677 		DTRACE_PROBE(flip_rx_too_early);
    678 		xnbp->xnb_stat_rx_too_early++;
    679 		return (mp);
    680 	}
    681 
    682 	loop = xnbp->xnb_rx_ring.req_cons;
    683 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    684 	gop = xnbp->xnb_rx_top;
    685 
    686 	while ((mp != NULL) &&
    687 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
    688 
    689 		mfn_t mfn;
    690 		pfn_t pfn;
    691 		netif_rx_request_t *rxreq;
    692 		netif_rx_response_t *rxresp;
    693 		char *valoop;
    694 		mblk_t *ml;
    695 		uint16_t cksum_flags;
    696 
    697 		/* 1 */
    698 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
    699 			xnbp->xnb_stat_rx_defer++;
    700 			break;
    701 		}
    702 
    703 		/* 2 */
    704 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
    705 
    706 #ifdef XNB_DEBUG
    707 		if (!(rxreq->id < NET_RX_RING_SIZE))
    708 			cmn_err(CE_PANIC, "xnb_to_peer: "
    709 			    "id %d out of range in request 0x%p",
    710 			    rxreq->id, (void *)rxreq);
    711 #endif /* XNB_DEBUG */
    712 
    713 		/* Assign a pfn and map the new page at the allocated va. */
    714 		pfn = xen_assign_pfn(mfn);
    715 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    716 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
    717 
    718 		/* 3 */
    719 		len = 0;
    720 		valoop = xnbp->xnb_rx_va;
    721 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
    722 			size_t chunk = ml->b_wptr - ml->b_rptr;
    723 
    724 			bcopy(ml->b_rptr, valoop, chunk);
    725 			valoop += chunk;
    726 			len += chunk;
    727 		}
    728 
    729 		ASSERT(len < PAGESIZE);
    730 
    731 		/* Release the pfn. */
    732 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    733 		    HAT_UNLOAD_UNMAP);
    734 		xen_release_pfn(pfn);
    735 
    736 		/* 4 */
    737 		gop->mfn = mfn;
    738 		gop->domid = xnbp->xnb_peer;
    739 		gop->ref = rxreq->gref;
    740 
    741 		/* 5.1 */
    742 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
    743 		rxresp->offset = 0;
    744 		rxresp->flags = 0;
    745 
    746 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
    747 		if (cksum_flags != 0)
    748 			xnbp->xnb_stat_rx_cksum_deferred++;
    749 		rxresp->flags |= cksum_flags;
    750 
    751 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
    752 		rxresp->status = len;
    753 
    754 		loop++;
    755 		prod++;
    756 		gop++;
    757 		prev = mp;
    758 		mp = mp->b_next;
    759 	}
    760 
    761 	/*
    762 	 * Did we actually do anything?
    763 	 */
    764 	if (loop == xnbp->xnb_rx_ring.req_cons) {
    765 		mutex_exit(&xnbp->xnb_rx_lock);
    766 		return (mp);
    767 	}
    768 
    769 	end = loop;
    770 
    771 	/*
    772 	 * Unlink the end of the 'done' list from the remainder.
    773 	 */
    774 	ASSERT(prev != NULL);
    775 	prev->b_next = NULL;
    776 
    777 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
    778 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
    779 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
    780 	}
    781 
    782 	loop = xnbp->xnb_rx_ring.req_cons;
    783 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    784 	gop = xnbp->xnb_rx_top;
    785 
    786 	while (loop < end) {
    787 		int16_t status = NETIF_RSP_OKAY;
    788 
    789 		if (gop->status != 0) {
    790 			status = NETIF_RSP_ERROR;
    791 
    792 			/*
    793 			 * If the status is anything other than
    794 			 * GNTST_bad_page then we don't own the page
    795 			 * any more, so don't try to give it back.
    796 			 */
    797 			if (gop->status != GNTST_bad_page)
    798 				gop->mfn = 0;
    799 		} else {
    800 			/* The page is no longer ours. */
    801 			gop->mfn = 0;
    802 		}
    803 
    804 		if (gop->mfn != 0)
    805 			/*
    806 			 * Give back the page, as we won't be using
    807 			 * it.
    808 			 */
    809 			xnb_free_page(xnbp, gop->mfn);
    810 		else
    811 			/*
    812 			 * We gave away a page, update our accounting
    813 			 * now.
    814 			 */
    815 			balloon_drv_subtracted(1);
    816 
    817 		/* 5.2 */
    818 		if (status != NETIF_RSP_OKAY) {
    819 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
    820 			    status;
    821 		} else {
    822 			xnbp->xnb_stat_ipackets++;
    823 			xnbp->xnb_stat_rbytes += len;
    824 		}
    825 
    826 		loop++;
    827 		prod++;
    828 		gop++;
    829 	}
    830 
    831 	xnbp->xnb_rx_ring.req_cons = loop;
    832 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
    833 
    834 	/* 6 */
    835 	/* LINTED: constant in conditional context */
    836 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
    837 	if (notify) {
    838 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
    839 		xnbp->xnb_stat_rx_notify_sent++;
    840 	} else {
    841 		xnbp->xnb_stat_rx_notify_deferred++;
    842 	}
    843 
    844 	if (mp != NULL)
    845 		xnbp->xnb_stat_rx_defer++;
    846 
    847 	mutex_exit(&xnbp->xnb_rx_lock);
    848 
    849 	/* Free mblk_t's that we consumed. */
    850 	freemsgchain(free);
    851 
    852 	return (mp);
    853 }
    854 
    855 /* Helper functions for xnb_copy_to_peer(). */
    856 
    857 /*
    858  * Grow the array of copy operation descriptors.
    859  */
    860 static boolean_t
    861 grow_cpop_area(xnb_t *xnbp)
    862 {
    863 	size_t count;
    864 	gnttab_copy_t *new;
    865 
    866 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
    867 
    868 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
    869 
    870 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
    871 		xnbp->xnb_stat_other_allocation_failure++;
    872 		return (B_FALSE);
    873 	}
    874 
    875 	bcopy(xnbp->xnb_rx_cpop, new,
    876 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    877 
    878 	kmem_free(xnbp->xnb_rx_cpop,
    879 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    880 
    881 	xnbp->xnb_rx_cpop = new;
    882 	xnbp->xnb_rx_cpop_count = count;
    883 
    884 	xnbp->xnb_stat_rx_cpoparea_grown++;
    885 
    886 	return (B_TRUE);
    887 }
    888 
    889 /*
    890  * Check whether an address is on a page that's foreign to this domain.
    891  */
    892 static boolean_t
    893 is_foreign(void *addr)
    894 {
    895 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
    896 
    897 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
    898 }
    899 
    900 /*
    901  * Insert a newly allocated mblk into a chain, replacing the old one.
    902  */
    903 static mblk_t *
    904 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
    905 {
    906 	uint32_t	start, stuff, end, value, flags;
    907 	mblk_t		*new_mp;
    908 
    909 	new_mp = copyb(mp);
    910 	if (new_mp == NULL)
    911 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
    912 		    "for %p, len %lu", (void *) mp, len);
    913 
    914 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
    915 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
    916 	    flags, KM_NOSLEEP);
    917 
    918 	new_mp->b_next = mp->b_next;
    919 	new_mp->b_prev = mp->b_prev;
    920 	new_mp->b_cont = mp->b_cont;
    921 
    922 	/* Make sure we only overwrite pointers to the mblk being replaced. */
    923 	if (mp_prev != NULL && mp_prev->b_next == mp)
    924 		mp_prev->b_next = new_mp;
    925 
    926 	if (ml_prev != NULL && ml_prev->b_cont == mp)
    927 		ml_prev->b_cont = new_mp;
    928 
    929 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
    930 	freemsg(mp);
    931 
    932 	return (new_mp);
    933 }
    934 
    935 /*
    936  * Set all the fields in a gnttab_copy_t.
    937  */
    938 static void
    939 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
    940     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
    941 {
    942 	ASSERT(xnbp != NULL && gp != NULL);
    943 
    944 	gp->source.offset = s_off;
    945 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
    946 	gp->source.domid = DOMID_SELF;
    947 
    948 	gp->len = (uint16_t)len;
    949 	gp->flags = GNTCOPY_dest_gref;
    950 	gp->status = 0;
    951 
    952 	gp->dest.u.ref = d_ref;
    953 	gp->dest.offset = d_off;
    954 	gp->dest.domid = xnbp->xnb_peer;
    955 }
    956 
    957 /*
    958  * Pass packets to the peer using hypervisor copy operations.
    959  */
    960 mblk_t *
    961 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
    962 {
    963 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
    964 	mblk_t		*ml, *ml_prev;
    965 	boolean_t	notify;
    966 	RING_IDX	loop, prod;
    967 	int		i;
    968 
    969 	/*
    970 	 * If the peer does not pre-post buffers for received packets,
    971 	 * use page flipping to pass packets to it.
    972 	 */
    973 	if (!xnbp->xnb_rx_hv_copy)
    974 		return (xnb_to_peer(xnbp, mp));
    975 
    976 	/*
    977 	 * For each packet the sequence of operations is:
    978 	 *
    979 	 *  1. get a request slot from the ring.
    980 	 *  2. set up data for hypercall (see NOTE below)
    981 	 *  3. have the hypervisore copy the data
    982 	 *  4. update the request slot.
    983 	 *  5. kick the peer.
    984 	 *
    985 	 * NOTE ad 2.
    986 	 *  In order to reduce the number of hypercalls, we prepare
    987 	 *  several mblks (mp->b_cont != NULL) for the peer and
    988 	 *  perform a single hypercall to transfer them.  We also have
    989 	 *  to set up a seperate copy operation for every page.
    990 	 *
    991 	 * If we have more than one packet (mp->b_next != NULL), we do
    992 	 * this whole dance repeatedly.
    993 	 */
    994 
    995 	mutex_enter(&xnbp->xnb_rx_lock);
    996 
    997 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
    998 		mutex_exit(&xnbp->xnb_rx_lock);
    999 		DTRACE_PROBE(copy_rx_too_early);
   1000 		xnbp->xnb_stat_rx_too_early++;
   1001 		return (mp);
   1002 	}
   1003 
   1004 	loop = xnbp->xnb_rx_ring.req_cons;
   1005 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
   1006 
   1007 	while ((mp != NULL) &&
   1008 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
   1009 		netif_rx_request_t	*rxreq;
   1010 		size_t			d_offset, len;
   1011 		int			item_count;
   1012 		gnttab_copy_t		*gop_cp;
   1013 		netif_rx_response_t	*rxresp;
   1014 		uint16_t		cksum_flags;
   1015 		int16_t			status = NETIF_RSP_OKAY;
   1016 
   1017 		/* 1 */
   1018 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
   1019 
   1020 #ifdef XNB_DEBUG
   1021 		if (!(rxreq->id < NET_RX_RING_SIZE))
   1022 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
   1023 			    "id %d out of range in request 0x%p",
   1024 			    rxreq->id, (void *)rxreq);
   1025 #endif /* XNB_DEBUG */
   1026 
   1027 		/* 2 */
   1028 		d_offset = 0;
   1029 		len = 0;
   1030 		item_count = 0;
   1031 
   1032 		gop_cp = xnbp->xnb_rx_cpop;
   1033 
   1034 		/*
   1035 		 * We walk the b_cont pointers and set up a
   1036 		 * gnttab_copy_t for each sub-page chunk in each data
   1037 		 * block.
   1038 		 */
   1039 		/* 2a */
   1040 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
   1041 			size_t	chunk = ml->b_wptr - ml->b_rptr;
   1042 			uchar_t	*r_tmp,	*rpt_align;
   1043 			size_t	r_offset;
   1044 
   1045 			/*
   1046 			 * The hypervisor will not allow us to
   1047 			 * reference a foreign page (e.g. one
   1048 			 * belonging to another domain) by mfn in the
   1049 			 * copy operation. If the data in this mblk is
   1050 			 * on such a page we must copy the data into a
   1051 			 * local page before initiating the hypervisor
   1052 			 * copy operation.
   1053 			 */
   1054 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
   1055 				mblk_t *ml_new = replace_msg(ml, chunk,
   1056 				    mp_prev, ml_prev);
   1057 
   1058 				/* We can still use old ml, but not *ml! */
   1059 				if (free == ml)
   1060 					free = ml_new;
   1061 				if (mp == ml)
   1062 					mp = ml_new;
   1063 				ml = ml_new;
   1064 
   1065 				xnbp->xnb_stat_rx_foreign_page++;
   1066 			}
   1067 
   1068 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
   1069 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
   1070 			r_tmp = ml->b_rptr;
   1071 
   1072 			if (d_offset + chunk > PAGESIZE)
   1073 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
   1074 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
   1075 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
   1076 				    (void *)mp, (void *)saved_mp, (void *)ml,
   1077 				    (void *)rpt_align,
   1078 				    d_offset, chunk, (int)PAGESIZE);
   1079 
   1080 			while (chunk > 0) {
   1081 				size_t part_len;
   1082 
   1083 				if (item_count == xnbp->xnb_rx_cpop_count) {
   1084 					if (!grow_cpop_area(xnbp))
   1085 						goto failure;
   1086 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
   1087 				}
   1088 				/*
   1089 				 * If our mblk crosses a page boundary, we need
   1090 				 * to do a seperate copy for each page.
   1091 				 */
   1092 				if (r_offset + chunk > PAGESIZE) {
   1093 					part_len = PAGESIZE - r_offset;
   1094 
   1095 					DTRACE_PROBE3(mblk_page_crossed,
   1096 					    (mblk_t *), ml, int, chunk, int,
   1097 					    (int)r_offset);
   1098 
   1099 					xnbp->xnb_stat_rx_pagebndry_crossed++;
   1100 				} else {
   1101 					part_len = chunk;
   1102 				}
   1103 
   1104 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
   1105 				    d_offset, part_len, rxreq->gref);
   1106 
   1107 				chunk -= part_len;
   1108 
   1109 				len += part_len;
   1110 				d_offset += part_len;
   1111 				r_tmp += part_len;
   1112 				/*
   1113 				 * The 2nd, 3rd ... last copies will always
   1114 				 * start at r_tmp, therefore r_offset is 0.
   1115 				 */
   1116 				r_offset = 0;
   1117 				gop_cp++;
   1118 				item_count++;
   1119 			}
   1120 			ml_prev = ml;
   1121 
   1122 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
   1123 			    chunk, int, len, int, item_count);
   1124 		}
   1125 		/* 3 */
   1126 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
   1127 		    item_count) != 0) {
   1128 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
   1129 			DTRACE_PROBE(HV_granttableopfailed);
   1130 		}
   1131 
   1132 		/* 4 */
   1133 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
   1134 		rxresp->offset = 0;
   1135 
   1136 		rxresp->flags = 0;
   1137 
   1138 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
   1139 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1140 		    (int)rxresp->status);
   1141 
   1142 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
   1143 		if (cksum_flags != 0)
   1144 			xnbp->xnb_stat_rx_cksum_deferred++;
   1145 		rxresp->flags |= cksum_flags;
   1146 
   1147 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
   1148 		rxresp->status = len;
   1149 
   1150 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
   1151 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1152 		    (int)rxresp->status);
   1153 
   1154 		for (i = 0; i < item_count; i++) {
   1155 			if (xnbp->xnb_rx_cpop[i].status != 0) {
   1156 				DTRACE_PROBE2(cpop_status_nonnull, int,
   1157 				    (int)xnbp->xnb_rx_cpop[i].status,
   1158 				    int, i);
   1159 				status = NETIF_RSP_ERROR;
   1160 			}
   1161 		}
   1162 
   1163 		/* 5.2 */
   1164 		if (status != NETIF_RSP_OKAY) {
   1165 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
   1166 			    status;
   1167 			xnbp->xnb_stat_rx_rsp_notok++;
   1168 		} else {
   1169 			xnbp->xnb_stat_ipackets++;
   1170 			xnbp->xnb_stat_rbytes += len;
   1171 		}
   1172 
   1173 		loop++;
   1174 		prod++;
   1175 		mp_prev = mp;
   1176 		mp = mp->b_next;
   1177 	}
   1178 failure:
   1179 	/*
   1180 	 * Did we actually do anything?
   1181 	 */
   1182 	if (loop == xnbp->xnb_rx_ring.req_cons) {
   1183 		mutex_exit(&xnbp->xnb_rx_lock);
   1184 		return (mp);
   1185 	}
   1186 
   1187 	/*
   1188 	 * Unlink the end of the 'done' list from the remainder.
   1189 	 */
   1190 	ASSERT(mp_prev != NULL);
   1191 	mp_prev->b_next = NULL;
   1192 
   1193 	xnbp->xnb_rx_ring.req_cons = loop;
   1194 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
   1195 
   1196 	/* 6 */
   1197 	/* LINTED: constant in conditional context */
   1198 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
   1199 	if (notify) {
   1200 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1201 		xnbp->xnb_stat_rx_notify_sent++;
   1202 	} else {
   1203 		xnbp->xnb_stat_rx_notify_deferred++;
   1204 	}
   1205 
   1206 	if (mp != NULL)
   1207 		xnbp->xnb_stat_rx_defer++;
   1208 
   1209 	mutex_exit(&xnbp->xnb_rx_lock);
   1210 
   1211 	/* Free mblk_t structs we have consumed. */
   1212 	freemsgchain(free);
   1213 
   1214 	return (mp);
   1215 }
   1216 
   1217 
   1218 static void
   1219 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
   1220 {
   1221 	boolean_t notify;
   1222 
   1223 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1224 
   1225 	/* LINTED: constant in conditional context */
   1226 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
   1227 	if (notify || force) {
   1228 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1229 		xnbp->xnb_stat_tx_notify_sent++;
   1230 	} else {
   1231 		xnbp->xnb_stat_tx_notify_deferred++;
   1232 	}
   1233 }
   1234 
   1235 static void
   1236 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
   1237 {
   1238 	RING_IDX i;
   1239 	netif_tx_response_t *txresp;
   1240 
   1241 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1242 
   1243 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
   1244 
   1245 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
   1246 	txresp->id = id;
   1247 	txresp->status = status;
   1248 
   1249 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
   1250 
   1251 	/*
   1252 	 * Note that we don't push the change to the peer here - that
   1253 	 * is the callers responsibility.
   1254 	 */
   1255 }
   1256 
   1257 static void
   1258 xnb_txbuf_recycle(xnb_txbuf_t *txp)
   1259 {
   1260 	xnb_t *xnbp = txp->xt_xnbp;
   1261 
   1262 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1263 
   1264 	xnbp->xnb_tx_buf_outstanding--;
   1265 }
   1266 
   1267 static int
   1268 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
   1269 {
   1270 	_NOTE(ARGUNUSED(kmflag));
   1271 	xnb_txbuf_t *txp = buf;
   1272 	xnb_t *xnbp = arg;
   1273 	size_t len;
   1274 	ddi_dma_cookie_t dma_cookie;
   1275 	uint_t ncookies;
   1276 
   1277 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
   1278 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
   1279 	txp->xt_xnbp = xnbp;
   1280 	txp->xt_next = NULL;
   1281 
   1282 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
   1283 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
   1284 		goto failure;
   1285 
   1286 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
   1287 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
   1288 	    &txp->xt_acc_handle) != DDI_SUCCESS)
   1289 		goto failure_1;
   1290 
   1291 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
   1292 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
   1293 	    &dma_cookie, &ncookies)
   1294 	    != DDI_DMA_MAPPED)
   1295 		goto failure_2;
   1296 	ASSERT(ncookies == 1);
   1297 
   1298 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
   1299 	txp->xt_buflen = dma_cookie.dmac_size;
   1300 
   1301 	DTRACE_PROBE(txbuf_allocated);
   1302 
   1303 	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
   1304 	xnbp->xnb_tx_buf_outstanding++;
   1305 
   1306 	return (0);
   1307 
   1308 failure_2:
   1309 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1310 
   1311 failure_1:
   1312 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1313 
   1314 failure:
   1315 
   1316 	return (-1);
   1317 }
   1318 
   1319 static void
   1320 xnb_txbuf_destructor(void *buf, void *arg)
   1321 {
   1322 	xnb_txbuf_t *txp = buf;
   1323 	xnb_t *xnbp = arg;
   1324 
   1325 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
   1326 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1327 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1328 
   1329 	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
   1330 }
   1331 
   1332 /*
   1333  * Take packets from the peer and deliver them onward.
   1334  */
   1335 static mblk_t *
   1336 xnb_from_peer(xnb_t *xnbp)
   1337 {
   1338 	RING_IDX start, end, loop;
   1339 	gnttab_copy_t *cop;
   1340 	xnb_txbuf_t **txpp;
   1341 	netif_tx_request_t *txreq;
   1342 	boolean_t work_to_do, need_notify = B_FALSE;
   1343 	mblk_t *head, *tail;
   1344 	int n_data_req, i;
   1345 
   1346 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1347 
   1348 	head = tail = NULL;
   1349 around:
   1350 
   1351 	/* LINTED: constant in conditional context */
   1352 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
   1353 	if (!work_to_do) {
   1354 finished:
   1355 		xnb_tx_notify_peer(xnbp, need_notify);
   1356 
   1357 		return (head);
   1358 	}
   1359 
   1360 	start = xnbp->xnb_tx_ring.req_cons;
   1361 	end = xnbp->xnb_tx_ring.sring->req_prod;
   1362 
   1363 	if ((end - start) > NET_TX_RING_SIZE) {
   1364 		/*
   1365 		 * This usually indicates that the frontend driver is
   1366 		 * misbehaving, as it's not possible to have more than
   1367 		 * NET_TX_RING_SIZE ring elements in play at any one
   1368 		 * time.
   1369 		 *
   1370 		 * We reset the ring pointers to the state declared by
   1371 		 * the frontend and try to carry on.
   1372 		 */
   1373 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
   1374 		    "items in the ring, resetting and trying to recover.",
   1375 		    xnbp->xnb_peer, (end - start));
   1376 
   1377 		/* LINTED: constant in conditional context */
   1378 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
   1379 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1380 
   1381 		goto around;
   1382 	}
   1383 
   1384 	loop = start;
   1385 	cop = xnbp->xnb_tx_cop;
   1386 	txpp = xnbp->xnb_tx_bufp;
   1387 	n_data_req = 0;
   1388 
   1389 	while (loop < end) {
   1390 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1391 
   1392 		if (txreq->flags & NETTXF_extra_info) {
   1393 			struct netif_extra_info *erp;
   1394 			boolean_t status;
   1395 
   1396 			loop++; /* Consume another slot in the ring. */
   1397 			ASSERT(loop <= end);
   1398 
   1399 			erp = (struct netif_extra_info *)
   1400 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1401 
   1402 			switch (erp->type) {
   1403 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
   1404 				ASSERT(xnbp->xnb_multicast_control);
   1405 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
   1406 				    &erp->u.mcast.addr);
   1407 				break;
   1408 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
   1409 				ASSERT(xnbp->xnb_multicast_control);
   1410 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
   1411 				    &erp->u.mcast.addr);
   1412 				break;
   1413 			default:
   1414 				status = B_FALSE;
   1415 				cmn_err(CE_WARN, "xnb_from_peer: "
   1416 				    "unknown extra type %d", erp->type);
   1417 				break;
   1418 			}
   1419 
   1420 			xnb_tx_mark_complete(xnbp, txreq->id,
   1421 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
   1422 			need_notify = B_TRUE;
   1423 		} else {
   1424 			xnb_txbuf_t *txp;
   1425 
   1426 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
   1427 			    KM_NOSLEEP);
   1428 			if (txp == NULL)
   1429 				break;
   1430 
   1431 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
   1432 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
   1433 			if (txp->xt_mblk == NULL) {
   1434 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1435 				break;
   1436 			}
   1437 
   1438 			txp->xt_idx = loop;
   1439 			txp->xt_id = txreq->id;
   1440 
   1441 			cop->source.u.ref = txreq->gref;
   1442 			cop->source.domid = xnbp->xnb_peer;
   1443 			cop->source.offset = txreq->offset;
   1444 
   1445 			cop->dest.u.gmfn = txp->xt_mfn;
   1446 			cop->dest.domid = DOMID_SELF;
   1447 			cop->dest.offset = 0;
   1448 
   1449 			cop->len = txreq->size;
   1450 			cop->flags = GNTCOPY_source_gref;
   1451 			cop->status = 0;
   1452 
   1453 			*txpp = txp;
   1454 
   1455 			txpp++;
   1456 			cop++;
   1457 			n_data_req++;
   1458 
   1459 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
   1460 		}
   1461 
   1462 		loop++;
   1463 	}
   1464 
   1465 	xnbp->xnb_tx_ring.req_cons = loop;
   1466 
   1467 	if (n_data_req == 0)
   1468 		goto around;
   1469 
   1470 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
   1471 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
   1472 
   1473 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
   1474 
   1475 		txpp = xnbp->xnb_tx_bufp;
   1476 		i = n_data_req;
   1477 		while (i > 0) {
   1478 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
   1479 			txpp++;
   1480 			i--;
   1481 		}
   1482 
   1483 		goto finished;
   1484 	}
   1485 
   1486 	txpp = xnbp->xnb_tx_bufp;
   1487 	cop = xnbp->xnb_tx_cop;
   1488 	i = n_data_req;
   1489 
   1490 	while (i > 0) {
   1491 		xnb_txbuf_t *txp = *txpp;
   1492 
   1493 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
   1494 
   1495 		if (cop->status != 0) {
   1496 #ifdef XNB_DEBUG
   1497 			cmn_err(CE_WARN, "xnb_from_peer: "
   1498 			    "txpp 0x%p failed (%d)",
   1499 			    (void *)*txpp, cop->status);
   1500 #endif /* XNB_DEBUG */
   1501 			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
   1502 			freemsg(txp->xt_mblk);
   1503 		} else {
   1504 			mblk_t *mp;
   1505 
   1506 			mp = txp->xt_mblk;
   1507 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
   1508 			mp->b_wptr += txreq->size;
   1509 			mp->b_next = NULL;
   1510 
   1511 			/*
   1512 			 * If there are checksum flags, process them
   1513 			 * appropriately.
   1514 			 */
   1515 			if ((txreq->flags &
   1516 			    (NETTXF_csum_blank | NETTXF_data_validated))
   1517 			    != 0) {
   1518 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
   1519 				    mp, txreq->flags);
   1520 				xnbp->xnb_stat_tx_cksum_no_need++;
   1521 
   1522 				txp->xt_mblk = mp;
   1523 			}
   1524 
   1525 			if (head == NULL) {
   1526 				ASSERT(tail == NULL);
   1527 				head = mp;
   1528 			} else {
   1529 				ASSERT(tail != NULL);
   1530 				tail->b_next = mp;
   1531 			}
   1532 			tail = mp;
   1533 
   1534 			xnbp->xnb_stat_opackets++;
   1535 			xnbp->xnb_stat_obytes += txreq->size;
   1536 
   1537 			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
   1538 		}
   1539 
   1540 		txpp++;
   1541 		cop++;
   1542 		i--;
   1543 	}
   1544 
   1545 	goto around;
   1546 	/* NOTREACHED */
   1547 }
   1548 
   1549 static uint_t
   1550 xnb_intr(caddr_t arg)
   1551 {
   1552 	xnb_t *xnbp = (xnb_t *)arg;
   1553 	mblk_t *mp;
   1554 
   1555 	xnbp->xnb_stat_intr++;
   1556 
   1557 	mutex_enter(&xnbp->xnb_tx_lock);
   1558 
   1559 	ASSERT(xnbp->xnb_connected);
   1560 
   1561 	mp = xnb_from_peer(xnbp);
   1562 
   1563 	mutex_exit(&xnbp->xnb_tx_lock);
   1564 
   1565 	if (!xnbp->xnb_hotplugged) {
   1566 		xnbp->xnb_stat_tx_too_early++;
   1567 		goto fail;
   1568 	}
   1569 	if (mp == NULL) {
   1570 		xnbp->xnb_stat_spurious_intr++;
   1571 		goto fail;
   1572 	}
   1573 
   1574 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
   1575 
   1576 	return (DDI_INTR_CLAIMED);
   1577 
   1578 fail:
   1579 	freemsgchain(mp);
   1580 	return (DDI_INTR_CLAIMED);
   1581 }
   1582 
   1583 /*
   1584  * Read our configuration from xenstore.
   1585  */
   1586 boolean_t
   1587 xnb_read_xs_config(xnb_t *xnbp)
   1588 {
   1589 	char *xsname;
   1590 	char mac[ETHERADDRL * 3];
   1591 
   1592 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
   1593 
   1594 	if (xenbus_scanf(XBT_NULL, xsname,
   1595 	    "mac", "%s", mac) != 0) {
   1596 		cmn_err(CE_WARN, "xnb_attach: "
   1597 		    "cannot read mac address from %s",
   1598 		    xsname);
   1599 		return (B_FALSE);
   1600 	}
   1601 
   1602 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
   1603 		cmn_err(CE_WARN,
   1604 		    "xnb_attach: cannot parse mac address %s",
   1605 		    mac);
   1606 		return (B_FALSE);
   1607 	}
   1608 
   1609 	return (B_TRUE);
   1610 }
   1611 
   1612 /*
   1613  * Read the configuration of the peer from xenstore.
   1614  */
   1615 boolean_t
   1616 xnb_read_oe_config(xnb_t *xnbp)
   1617 {
   1618 	char *oename;
   1619 	int i;
   1620 
   1621 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
   1622 
   1623 	if (xenbus_gather(XBT_NULL, oename,
   1624 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
   1625 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
   1626 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
   1627 	    NULL) != 0) {
   1628 		cmn_err(CE_WARN, "xnb_read_oe_config: "
   1629 		    "cannot read other-end details from %s",
   1630 		    oename);
   1631 		return (B_FALSE);
   1632 	}
   1633 
   1634 	/*
   1635 	 * Check whether our peer requests receive side hypervisor
   1636 	 * copy.
   1637 	 */
   1638 	if (xenbus_scanf(XBT_NULL, oename,
   1639 	    "request-rx-copy", "%d", &i) != 0)
   1640 		i = 0;
   1641 	if (i != 0)
   1642 		xnbp->xnb_rx_hv_copy = B_TRUE;
   1643 
   1644 	/*
   1645 	 * Check whether our peer requests multicast_control.
   1646 	 */
   1647 	if (xenbus_scanf(XBT_NULL, oename,
   1648 	    "request-multicast-control", "%d", &i) != 0)
   1649 		i = 0;
   1650 	if (i != 0)
   1651 		xnbp->xnb_multicast_control = B_TRUE;
   1652 
   1653 	/*
   1654 	 * The Linux backend driver here checks to see if the peer has
   1655 	 * set 'feature-no-csum-offload'. This is used to indicate
   1656 	 * that the guest cannot handle receiving packets without a
   1657 	 * valid checksum. We don't check here, because packets passed
   1658 	 * to the peer _always_ have a valid checksum.
   1659 	 *
   1660 	 * There are three cases:
   1661 	 *
   1662 	 * - the NIC is dedicated: packets from the wire should always
   1663 	 *   have a valid checksum. If the hardware validates the
   1664 	 *   checksum then the relevant bit will be set in the packet
   1665 	 *   attributes and we will inform the peer. It can choose to
   1666 	 *   ignore the hardware verification.
   1667 	 *
   1668 	 * - the NIC is shared (VNIC) and a packet originates from the
   1669 	 *   wire: this is the same as the case above - the packets
   1670 	 *   will have a valid checksum.
   1671 	 *
   1672 	 * - the NIC is shared (VNIC) and a packet originates from the
   1673 	 *   host: the MAC layer ensures that all such packets have a
   1674 	 *   valid checksum by calculating one if the stack did not.
   1675 	 */
   1676 
   1677 	return (B_TRUE);
   1678 }
   1679 
   1680 void
   1681 xnb_start_connect(xnb_t *xnbp)
   1682 {
   1683 	dev_info_t  *dip = xnbp->xnb_devinfo;
   1684 
   1685 	if (!xnb_connect_rings(dip)) {
   1686 		cmn_err(CE_WARN, "xnb_start_connect: "
   1687 		    "cannot connect rings");
   1688 		goto failed;
   1689 	}
   1690 
   1691 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
   1692 		cmn_err(CE_WARN, "xnb_start_connect: "
   1693 		    "flavour failed to connect");
   1694 		goto failed;
   1695 	}
   1696 
   1697 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
   1698 	return;
   1699 
   1700 failed:
   1701 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1702 	xnb_disconnect_rings(dip);
   1703 	(void) xvdi_switch_state(dip, XBT_NULL,
   1704 	    XenbusStateClosed);
   1705 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1706 }
   1707 
   1708 static boolean_t
   1709 xnb_connect_rings(dev_info_t *dip)
   1710 {
   1711 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1712 	struct gnttab_map_grant_ref map_op;
   1713 
   1714 	/*
   1715 	 * Cannot attempt to connect the rings if already connected.
   1716 	 */
   1717 	ASSERT(!xnbp->xnb_connected);
   1718 
   1719 	/*
   1720 	 * 1. allocate a vaddr for the tx page, one for the rx page.
   1721 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
   1722 	 *    into the allocated vaddr (one for tx, one for rx).
   1723 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
   1724 	 *    bound to this domain.
   1725 	 * 4. associate the event channel with an interrupt.
   1726 	 * 5. enable the interrupt.
   1727 	 */
   1728 
   1729 	/* 1.tx */
   1730 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1731 	    0, 0, 0, 0, VM_SLEEP);
   1732 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
   1733 
   1734 	/* 2.tx */
   1735 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
   1736 	map_op.flags = GNTMAP_host_map;
   1737 	map_op.ref = xnbp->xnb_tx_ring_ref;
   1738 	map_op.dom = xnbp->xnb_peer;
   1739 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
   1740 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1741 	    map_op.status != 0) {
   1742 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
   1743 		goto fail;
   1744 	}
   1745 	xnbp->xnb_tx_ring_handle = map_op.handle;
   1746 
   1747 	/* LINTED: constant in conditional context */
   1748 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
   1749 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1750 
   1751 	/* 1.rx */
   1752 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1753 	    0, 0, 0, 0, VM_SLEEP);
   1754 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
   1755 
   1756 	/* 2.rx */
   1757 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
   1758 	map_op.flags = GNTMAP_host_map;
   1759 	map_op.ref = xnbp->xnb_rx_ring_ref;
   1760 	map_op.dom = xnbp->xnb_peer;
   1761 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
   1762 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1763 	    map_op.status != 0) {
   1764 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
   1765 		goto fail;
   1766 	}
   1767 	xnbp->xnb_rx_ring_handle = map_op.handle;
   1768 
   1769 	/* LINTED: constant in conditional context */
   1770 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
   1771 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
   1772 
   1773 	/* 3 */
   1774 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
   1775 		cmn_err(CE_WARN, "xnb_connect_rings: "
   1776 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
   1777 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1778 		goto fail;
   1779 	}
   1780 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
   1781 
   1782 	/*
   1783 	 * It would be good to set the state to XenbusStateConnected
   1784 	 * here as well, but then what if ddi_add_intr() failed?
   1785 	 * Changing the state in the store will be noticed by the peer
   1786 	 * and cannot be "taken back".
   1787 	 */
   1788 	mutex_enter(&xnbp->xnb_tx_lock);
   1789 	mutex_enter(&xnbp->xnb_rx_lock);
   1790 
   1791 	xnbp->xnb_connected = B_TRUE;
   1792 
   1793 	mutex_exit(&xnbp->xnb_rx_lock);
   1794 	mutex_exit(&xnbp->xnb_tx_lock);
   1795 
   1796 	/* 4, 5 */
   1797 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
   1798 	    != DDI_SUCCESS) {
   1799 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
   1800 		goto fail;
   1801 	}
   1802 	xnbp->xnb_irq = B_TRUE;
   1803 
   1804 	return (B_TRUE);
   1805 
   1806 fail:
   1807 	mutex_enter(&xnbp->xnb_tx_lock);
   1808 	mutex_enter(&xnbp->xnb_rx_lock);
   1809 
   1810 	xnbp->xnb_connected = B_FALSE;
   1811 
   1812 	mutex_exit(&xnbp->xnb_rx_lock);
   1813 	mutex_exit(&xnbp->xnb_tx_lock);
   1814 
   1815 	return (B_FALSE);
   1816 }
   1817 
   1818 static void
   1819 xnb_disconnect_rings(dev_info_t *dip)
   1820 {
   1821 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1822 
   1823 	if (xnbp->xnb_irq) {
   1824 		ddi_remove_intr(dip, 0, NULL);
   1825 		xnbp->xnb_irq = B_FALSE;
   1826 	}
   1827 
   1828 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
   1829 		xvdi_free_evtchn(dip);
   1830 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1831 	}
   1832 
   1833 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
   1834 		struct gnttab_unmap_grant_ref unmap_op;
   1835 
   1836 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1837 		    xnbp->xnb_rx_ring_addr;
   1838 		unmap_op.dev_bus_addr = 0;
   1839 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
   1840 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1841 		    &unmap_op, 1) != 0)
   1842 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1843 			    "cannot unmap rx-ring page (%d)",
   1844 			    unmap_op.status);
   1845 
   1846 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
   1847 	}
   1848 
   1849 	if (xnbp->xnb_rx_ring_addr != NULL) {
   1850 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
   1851 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
   1852 		xnbp->xnb_rx_ring_addr = NULL;
   1853 	}
   1854 
   1855 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
   1856 		struct gnttab_unmap_grant_ref unmap_op;
   1857 
   1858 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1859 		    xnbp->xnb_tx_ring_addr;
   1860 		unmap_op.dev_bus_addr = 0;
   1861 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
   1862 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1863 		    &unmap_op, 1) != 0)
   1864 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1865 			    "cannot unmap tx-ring page (%d)",
   1866 			    unmap_op.status);
   1867 
   1868 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
   1869 	}
   1870 
   1871 	if (xnbp->xnb_tx_ring_addr != NULL) {
   1872 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
   1873 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
   1874 		xnbp->xnb_tx_ring_addr = NULL;
   1875 	}
   1876 }
   1877 
   1878 static void
   1879 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   1880     void *arg, void *impl_data)
   1881 {
   1882 	_NOTE(ARGUNUSED(id, arg));
   1883 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1884 	XenbusState new_state = *(XenbusState *)impl_data;
   1885 
   1886 	ASSERT(xnbp != NULL);
   1887 
   1888 	switch (new_state) {
   1889 	case XenbusStateConnected:
   1890 		/* spurious state change */
   1891 		if (xnbp->xnb_connected)
   1892 			return;
   1893 
   1894 		if (!xnb_read_oe_config(xnbp) ||
   1895 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
   1896 			cmn_err(CE_WARN, "xnb_oe_state_change: "
   1897 			    "read otherend config error");
   1898 			(void) xvdi_switch_state(dip, XBT_NULL,
   1899 			    XenbusStateClosed);
   1900 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1901 
   1902 			break;
   1903 		}
   1904 
   1905 
   1906 		mutex_enter(&xnbp->xnb_state_lock);
   1907 		xnbp->xnb_fe_status = XNB_STATE_READY;
   1908 		if (xnbp->xnb_be_status == XNB_STATE_READY)
   1909 			xnb_start_connect(xnbp);
   1910 		mutex_exit(&xnbp->xnb_state_lock);
   1911 
   1912 		/*
   1913 		 * Now that we've attempted to connect it's reasonable
   1914 		 * to allow an attempt to detach.
   1915 		 */
   1916 		xnbp->xnb_detachable = B_TRUE;
   1917 
   1918 		break;
   1919 
   1920 	case XenbusStateClosing:
   1921 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
   1922 
   1923 		break;
   1924 
   1925 	case XenbusStateClosed:
   1926 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1927 
   1928 		mutex_enter(&xnbp->xnb_tx_lock);
   1929 		mutex_enter(&xnbp->xnb_rx_lock);
   1930 
   1931 		xnb_disconnect_rings(dip);
   1932 		xnbp->xnb_connected = B_FALSE;
   1933 
   1934 		mutex_exit(&xnbp->xnb_rx_lock);
   1935 		mutex_exit(&xnbp->xnb_tx_lock);
   1936 
   1937 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
   1938 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1939 		/*
   1940 		 * In all likelyhood this is already set (in the above
   1941 		 * case), but if the peer never attempted to connect
   1942 		 * and the domain is destroyed we get here without
   1943 		 * having been through the case above, so we set it to
   1944 		 * be sure.
   1945 		 */
   1946 		xnbp->xnb_detachable = B_TRUE;
   1947 
   1948 		break;
   1949 
   1950 	default:
   1951 		break;
   1952 	}
   1953 }
   1954 
   1955 static void
   1956 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   1957     void *arg, void *impl_data)
   1958 {
   1959 	_NOTE(ARGUNUSED(id, arg));
   1960 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1961 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
   1962 
   1963 	ASSERT(xnbp != NULL);
   1964 
   1965 	switch (state) {
   1966 	case Connected:
   1967 		/* spurious hotplug event */
   1968 		if (xnbp->xnb_hotplugged)
   1969 			break;
   1970 
   1971 		if (!xnb_read_xs_config(xnbp))
   1972 			break;
   1973 
   1974 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
   1975 			break;
   1976 
   1977 		mutex_enter(&xnbp->xnb_tx_lock);
   1978 		mutex_enter(&xnbp->xnb_rx_lock);
   1979 
   1980 		xnbp->xnb_hotplugged = B_TRUE;
   1981 
   1982 		mutex_exit(&xnbp->xnb_rx_lock);
   1983 		mutex_exit(&xnbp->xnb_tx_lock);
   1984 
   1985 		mutex_enter(&xnbp->xnb_state_lock);
   1986 		xnbp->xnb_be_status = XNB_STATE_READY;
   1987 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
   1988 			xnb_start_connect(xnbp);
   1989 		mutex_exit(&xnbp->xnb_state_lock);
   1990 
   1991 		break;
   1992 
   1993 	default:
   1994 		break;
   1995 	}
   1996 }
   1997 
/*
 * Module description: registered with the mod_miscops operations
 * vector under the name "xnb".
 */
static struct modldrv modldrv = {
	&mod_miscops, "xnb",
};

/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove()
 * by _init(), _info() and _fini() below.
 */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
   2005 
   2006 int
   2007 _init(void)
   2008 {
   2009 	int i;
   2010 
   2011 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
   2012 
   2013 	i = mod_install(&modlinkage);
   2014 	if (i != DDI_SUCCESS)
   2015 		mutex_destroy(&xnb_alloc_page_lock);
   2016 
   2017 	return (i);
   2018 }
   2019 
   2020 int
   2021 _info(struct modinfo *modinfop)
   2022 {
   2023 	return (mod_info(&modlinkage, modinfop));
   2024 }
   2025 
   2026 int
   2027 _fini(void)
   2028 {
   2029 	int i;
   2030 
   2031 	i = mod_remove(&modlinkage);
   2032 	if (i == DDI_SUCCESS)
   2033 		mutex_destroy(&xnb_alloc_page_lock);
   2034 
   2035 	return (i);
   2036 }
   2037