Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #ifdef DEBUG
     28 #define	XNB_DEBUG 1
     29 #endif /* DEBUG */
     30 
     31 #include "xnb.h"
     32 
     33 #include <sys/sunddi.h>
     34 #include <sys/sunndi.h>
     35 #include <sys/modctl.h>
     36 #include <sys/conf.h>
     37 #include <sys/mac.h>
     38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
     39 #include <sys/dlpi.h>
     40 #include <sys/strsubr.h>
     41 #include <sys/strsun.h>
     42 #include <sys/types.h>
     43 #include <sys/pattr.h>
     44 #include <vm/seg_kmem.h>
     45 #include <vm/hat_i86.h>
     46 #include <xen/sys/xenbus_impl.h>
     47 #include <xen/sys/xendev.h>
     48 #include <sys/balloon_impl.h>
     49 #include <sys/evtchn_impl.h>
     50 #include <sys/gnttab.h>
     51 #include <vm/vm_dep.h>
     52 #include <sys/note.h>
     53 #include <sys/gld.h>
     54 #include <inet/ip.h>
     55 #include <inet/ip_impl.h>
     56 
     57 /*
     58  * The terms "transmit" and "receive" are used in alignment with domU,
     59  * which means that packets originating from the peer domU are "transmitted"
     60  * to other parts of the system and packets are "received" from them.
     61  */
     62 
/*
 * Should we allow guests to manipulate multicast group membership?
 * xnb_attach() advertises this value to the peer through the
 * "feature-multicast-control" xenstore node.
 */
static boolean_t	xnb_multicast_control = B_TRUE;

/* Forward declarations of ring setup and teardown. */
static boolean_t	xnb_connect_rings(dev_info_t *);
static void		xnb_disconnect_rings(dev_info_t *);
/* Event handlers registered for XS_OE_STATE and XS_HP_STATE in xnb_attach(). */
static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

/* Constructor and destructor of the per-instance xnb_tx_buf_cache. */
static int	xnb_txbuf_constructor(void *, void *, int);
static void	xnb_txbuf_destructor(void *, void *);
static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);

/* Non-static entry points used to pass packets to the peer. */
mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);

/* Small helpers for xnb_copy_to_peer(); the compiler is asked to inline them. */
static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
    size_t, size_t, size_t, grant_ref_t);
#pragma inline(setup_gop)
static boolean_t	is_foreign(void *);
#pragma inline(is_foreign)

/* Sentinel values; xnb_attach() starts both ring handles out as invalid. */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/* Protects the batch of pages shared by all instances in xnb_alloc_page(). */
static kmutex_t	xnb_alloc_page_lock;
     93 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 *
 * The shift is performed in the type of the argument, so the caller
 * must pass a value whose type is wide enough for the whole address.
 */
#define	xnb_btop(addr)	((addr) >> PAGESHIFT)
    101 
/*
 * DMA attributes for transmit and receive data.
 *
 * Buffers may live anywhere in the 64 bit address space, must be
 * aligned on an MMU page and are described by a single segment.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
    117 
/* DMA access attributes for data: NOT to be byte swapped. */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_NEVERSWAP_ACC,	/* never byte swap the data */
	DDI_STRICTORDER_ACC	/* strictly ordered accesses */
};
    124 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats exported for each instance.
 * xnb_ks_init() creates one KSTAT_DATA_UINT64 entry per name, in the
 * order given here, and xnb_ks_aux_update() fills the entries in
 * positionally.  Any addition, removal or reordering in this table
 * must therefore be mirrored in xnb_ks_aux_update().
 */
static const char * const aux_statistics[] = {
	"rx_cksum_deferred",
	"tx_cksum_no_need",
	"rx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"rx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"rx_pageboundary_crossed",
	"rx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
	"tx_overflow_page",
	"tx_unexpected_flags",
};
    155 
    156 static int
    157 xnb_ks_aux_update(kstat_t *ksp, int flag)
    158 {
    159 	xnb_t *xnbp;
    160 	kstat_named_t *knp;
    161 
    162 	if (flag != KSTAT_READ)
    163 		return (EACCES);
    164 
    165 	xnbp = ksp->ks_private;
    166 	knp = ksp->ks_data;
    167 
    168 	/*
    169 	 * Assignment order should match that of the names in
    170 	 * aux_statistics.
    171 	 */
    172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
    173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
    174 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
    175 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
    176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
    177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
    178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
    179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
    180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
    181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
    182 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
    183 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
    184 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
    185 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
    186 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
    187 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
    188 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
    189 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
    190 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
    191 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
    192 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
    193 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
    194 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
    195 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
    196 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
    197 
    198 	return (0);
    199 }
    200 
    201 static boolean_t
    202 xnb_ks_init(xnb_t *xnbp)
    203 {
    204 	int nstat = sizeof (aux_statistics) /
    205 	    sizeof (aux_statistics[0]);
    206 	const char * const *cp = aux_statistics;
    207 	kstat_named_t *knp;
    208 
    209 	/*
    210 	 * Create and initialise kstats.
    211 	 */
    212 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
    213 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
    214 	    KSTAT_TYPE_NAMED, nstat, 0);
    215 	if (xnbp->xnb_kstat_aux == NULL)
    216 		return (B_FALSE);
    217 
    218 	xnbp->xnb_kstat_aux->ks_private = xnbp;
    219 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
    220 
    221 	knp = xnbp->xnb_kstat_aux->ks_data;
    222 	while (nstat > 0) {
    223 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
    224 
    225 		knp++;
    226 		cp++;
    227 		nstat--;
    228 	}
    229 
    230 	kstat_install(xnbp->xnb_kstat_aux);
    231 
    232 	return (B_TRUE);
    233 }
    234 
    235 static void
    236 xnb_ks_free(xnb_t *xnbp)
    237 {
    238 	kstat_delete(xnbp->xnb_kstat_aux);
    239 }
    240 
    241 /*
    242  * Calculate and insert the transport checksum for an arbitrary packet.
    243  */
    244 static mblk_t *
    245 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
    246 {
    247 	_NOTE(ARGUNUSED(xnbp));
    248 
    249 	/*
    250 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
    251 	 * because it doesn't cover all of the interesting cases :-(
    252 	 */
    253 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
    254 	    HCK_FULLCKSUM, KM_NOSLEEP);
    255 
    256 	return (mac_fix_cksum(mp));
    257 }
    258 
    259 mblk_t *
    260 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
    261 {
    262 	struct ether_header *ehp;
    263 	uint16_t sap;
    264 	uint32_t offset;
    265 	ipha_t *ipha;
    266 
    267 	ASSERT(mp->b_next == NULL);
    268 
    269 	/*
    270 	 * Check that the packet is contained in a single mblk.  In
    271 	 * the "from peer" path this is true today, but may change
    272 	 * when scatter gather support is added.  In the "to peer"
    273 	 * path we cannot be sure, but in most cases it will be true
    274 	 * (in the xnbo case the packet has come from a MAC device
    275 	 * which is unlikely to split packets).
    276 	 */
    277 	if (mp->b_cont != NULL)
    278 		goto software;
    279 
    280 	/*
    281 	 * If the MAC has no hardware capability don't do any further
    282 	 * checking.
    283 	 */
    284 	if (capab == 0)
    285 		goto software;
    286 
    287 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
    288 	ehp = (struct ether_header *)mp->b_rptr;
    289 
    290 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
    291 		struct ether_vlan_header *evhp;
    292 
    293 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
    294 		evhp = (struct ether_vlan_header *)mp->b_rptr;
    295 		sap = ntohs(evhp->ether_type);
    296 		offset = sizeof (struct ether_vlan_header);
    297 	} else {
    298 		sap = ntohs(ehp->ether_type);
    299 		offset = sizeof (struct ether_header);
    300 	}
    301 
    302 	/*
    303 	 * We only attempt to do IPv4 packets in hardware.
    304 	 */
    305 	if (sap != ETHERTYPE_IP)
    306 		goto software;
    307 
    308 	/*
    309 	 * We know that this is an IPv4 packet.
    310 	 */
    311 	ipha = (ipha_t *)(mp->b_rptr + offset);
    312 
    313 	switch (ipha->ipha_protocol) {
    314 	case IPPROTO_TCP:
    315 	case IPPROTO_UDP: {
    316 		uint32_t start, length, stuff, cksum;
    317 		uint16_t *stuffp;
    318 
    319 		/*
    320 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
    321 		 * can use full IPv4 and partial checksum offload.
    322 		 */
    323 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
    324 			break;
    325 
    326 		start = IP_SIMPLE_HDR_LENGTH;
    327 		length = ntohs(ipha->ipha_length);
    328 		if (ipha->ipha_protocol == IPPROTO_TCP) {
    329 			stuff = start + TCP_CHECKSUM_OFFSET;
    330 			cksum = IP_TCP_CSUM_COMP;
    331 		} else {
    332 			stuff = start + UDP_CHECKSUM_OFFSET;
    333 			cksum = IP_UDP_CSUM_COMP;
    334 		}
    335 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
    336 
    337 		if (capab & HCKSUM_INET_FULL_V4) {
    338 			/*
    339 			 * Some devices require that the checksum
    340 			 * field of the packet is zero for full
    341 			 * offload.
    342 			 */
    343 			*stuffp = 0;
    344 
    345 			(void) hcksum_assoc(mp, NULL, NULL,
    346 			    0, 0, 0, 0,
    347 			    HCK_FULLCKSUM, KM_NOSLEEP);
    348 
    349 			xnbp->xnb_stat_csum_hardware++;
    350 
    351 			return (mp);
    352 		}
    353 
    354 		if (capab & HCKSUM_INET_PARTIAL) {
    355 			if (*stuffp == 0) {
    356 				ipaddr_t src, dst;
    357 
    358 				/*
    359 				 * Older Solaris guests don't insert
    360 				 * the pseudo-header checksum, so we
    361 				 * calculate it here.
    362 				 */
    363 				src = ipha->ipha_src;
    364 				dst = ipha->ipha_dst;
    365 
    366 				cksum += (dst >> 16) + (dst & 0xFFFF);
    367 				cksum += (src >> 16) + (src & 0xFFFF);
    368 				cksum += length - IP_SIMPLE_HDR_LENGTH;
    369 
    370 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    371 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    372 
    373 				ASSERT(cksum <= 0xFFFF);
    374 
    375 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
    376 			}
    377 
    378 			(void) hcksum_assoc(mp, NULL, NULL,
    379 			    start, stuff, length, 0,
    380 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
    381 
    382 			xnbp->xnb_stat_csum_hardware++;
    383 
    384 			return (mp);
    385 		}
    386 
    387 		/* NOTREACHED */
    388 		break;
    389 	}
    390 
    391 	default:
    392 		/* Use software. */
    393 		break;
    394 	}
    395 
    396 software:
    397 	/*
    398 	 * We are not able to use any offload so do the whole thing in
    399 	 * software.
    400 	 */
    401 	xnbp->xnb_stat_csum_software++;
    402 
    403 	return (xnb_software_csum(xnbp, mp));
    404 }
    405 
    406 int
    407 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
    408 {
    409 	xnb_t *xnbp;
    410 	char *xsname;
    411 	char cachename[32];
    412 
    413 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
    414 
    415 	xnbp->xnb_flavour = flavour;
    416 	xnbp->xnb_flavour_data = flavour_data;
    417 	xnbp->xnb_devinfo = dip;
    418 	xnbp->xnb_evtchn = INVALID_EVTCHN;
    419 	xnbp->xnb_irq = B_FALSE;
    420 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
    421 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
    422 	xnbp->xnb_connected = B_FALSE;
    423 	xnbp->xnb_hotplugged = B_FALSE;
    424 	xnbp->xnb_detachable = B_FALSE;
    425 	xnbp->xnb_peer = xvdi_get_oeid(dip);
    426 	xnbp->xnb_be_status = XNB_STATE_INIT;
    427 	xnbp->xnb_fe_status = XNB_STATE_INIT;
    428 
    429 	xnbp->xnb_tx_buf_count = 0;
    430 
    431 	xnbp->xnb_rx_hv_copy = B_FALSE;
    432 	xnbp->xnb_multicast_control = B_FALSE;
    433 
    434 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
    435 	ASSERT(xnbp->xnb_rx_va != NULL);
    436 
    437 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
    438 	    != DDI_SUCCESS)
    439 		goto failure;
    440 
    441 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
    442 	xnbp->xnb_rx_cpop = NULL;
    443 	xnbp->xnb_rx_cpop_count = 0;
    444 
    445 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
    446 	    xnbp->xnb_icookie);
    447 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
    448 	    xnbp->xnb_icookie);
    449 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
    450 	    xnbp->xnb_icookie);
    451 
    452 	/* Set driver private pointer now. */
    453 	ddi_set_driver_private(dip, xnbp);
    454 
    455 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
    456 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
    457 	    sizeof (xnb_txbuf_t), 0,
    458 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
    459 	    NULL, xnbp, NULL, 0);
    460 	if (xnbp->xnb_tx_buf_cache == NULL)
    461 		goto failure_0;
    462 
    463 	if (!xnb_ks_init(xnbp))
    464 		goto failure_1;
    465 
    466 	/*
    467 	 * Receive notification of changes in the state of the
    468 	 * driver in the guest domain.
    469 	 */
    470 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
    471 	    NULL) != DDI_SUCCESS)
    472 		goto failure_2;
    473 
    474 	/*
    475 	 * Receive notification of hotplug events.
    476 	 */
    477 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
    478 	    NULL) != DDI_SUCCESS)
    479 		goto failure_2;
    480 
    481 	xsname = xvdi_get_xsname(dip);
    482 
    483 	if (xenbus_printf(XBT_NULL, xsname,
    484 	    "feature-multicast-control", "%d",
    485 	    xnb_multicast_control ? 1 : 0) != 0)
    486 		goto failure_3;
    487 
    488 	if (xenbus_printf(XBT_NULL, xsname,
    489 	    "feature-rx-copy", "%d",  1) != 0)
    490 		goto failure_3;
    491 	/*
    492 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
    493 	 * in addition to "feature-rx-copy" being 1. It seems strange
    494 	 * to use four possible states to describe a binary decision,
    495 	 * but we might as well play nice.
    496 	 */
    497 	if (xenbus_printf(XBT_NULL, xsname,
    498 	    "feature-rx-flip", "%d", 0) != 0)
    499 		goto failure_3;
    500 
    501 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
    502 	(void) xvdi_post_event(dip, XEN_HP_ADD);
    503 
    504 	return (DDI_SUCCESS);
    505 
    506 failure_3:
    507 	xvdi_remove_event_handler(dip, NULL);
    508 
    509 failure_2:
    510 	xnb_ks_free(xnbp);
    511 
    512 failure_1:
    513 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    514 
    515 failure_0:
    516 	mutex_destroy(&xnbp->xnb_state_lock);
    517 	mutex_destroy(&xnbp->xnb_rx_lock);
    518 	mutex_destroy(&xnbp->xnb_tx_lock);
    519 
    520 failure:
    521 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    522 	kmem_free(xnbp, sizeof (*xnbp));
    523 	return (DDI_FAILURE);
    524 }
    525 
    526 void
    527 xnb_detach(dev_info_t *dip)
    528 {
    529 	xnb_t *xnbp = ddi_get_driver_private(dip);
    530 
    531 	ASSERT(xnbp != NULL);
    532 	ASSERT(!xnbp->xnb_connected);
    533 	ASSERT(xnbp->xnb_tx_buf_count == 0);
    534 
    535 	xnb_disconnect_rings(dip);
    536 
    537 	xvdi_remove_event_handler(dip, NULL);
    538 
    539 	xnb_ks_free(xnbp);
    540 
    541 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    542 
    543 	ddi_set_driver_private(dip, NULL);
    544 
    545 	mutex_destroy(&xnbp->xnb_state_lock);
    546 	mutex_destroy(&xnbp->xnb_rx_lock);
    547 	mutex_destroy(&xnbp->xnb_tx_lock);
    548 
    549 	if (xnbp->xnb_rx_cpop_count > 0)
    550 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
    551 		    * xnbp->xnb_rx_cpop_count);
    552 
    553 	ASSERT(xnbp->xnb_rx_va != NULL);
    554 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    555 
    556 	kmem_free(xnbp, sizeof (*xnbp));
    557 }
    558 
    559 /*
    560  * Allocate a page from the hypervisor to be flipped to the peer.
    561  *
    562  * Try to get pages in batches to reduce the overhead of calls into
    563  * the balloon driver.
    564  */
    565 static mfn_t
    566 xnb_alloc_page(xnb_t *xnbp)
    567 {
    568 #define	WARNING_RATE_LIMIT 100
    569 #define	BATCH_SIZE 256
    570 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
    571 	static int nth = BATCH_SIZE;
    572 	mfn_t mfn;
    573 
    574 	mutex_enter(&xnb_alloc_page_lock);
    575 	if (nth == BATCH_SIZE) {
    576 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
    577 			xnbp->xnb_stat_allocation_failure++;
    578 			mutex_exit(&xnb_alloc_page_lock);
    579 
    580 			/*
    581 			 * Try for a single page in low memory situations.
    582 			 */
    583 			if (balloon_alloc_pages(1, &mfn) != 1) {
    584 				if ((xnbp->xnb_stat_small_allocation_failure++
    585 				    % WARNING_RATE_LIMIT) == 0)
    586 					cmn_err(CE_WARN, "xnb_alloc_page: "
    587 					    "Cannot allocate memory to "
    588 					    "transfer packets to peer.");
    589 				return (0);
    590 			} else {
    591 				xnbp->xnb_stat_small_allocation_success++;
    592 				return (mfn);
    593 			}
    594 		}
    595 
    596 		nth = 0;
    597 		xnbp->xnb_stat_allocation_success++;
    598 	}
    599 
    600 	mfn = mfns[nth++];
    601 	mutex_exit(&xnb_alloc_page_lock);
    602 
    603 	ASSERT(mfn != 0);
    604 
    605 	return (mfn);
    606 #undef BATCH_SIZE
    607 #undef WARNING_RATE_LIMIT
    608 }
    609 
    610 /*
    611  * Free a page back to the hypervisor.
    612  *
    613  * This happens only in the error path, so batching is not worth the
    614  * complication.
    615  */
    616 static void
    617 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
    618 {
    619 	_NOTE(ARGUNUSED(xnbp));
    620 	int r;
    621 	pfn_t pfn;
    622 
    623 	pfn = xen_assign_pfn(mfn);
    624 	pfnzero(pfn, 0, PAGESIZE);
    625 	xen_release_pfn(pfn);
    626 
    627 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
    628 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
    629 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
    630 		    r, mfn);
    631 	}
    632 }
    633 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
 * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
 *
 * The expansion refers to `loop' and `prod', which must be variables
 * in scope where the macro is used: the caller's private request
 * consumer and response producer indices.  It evaluates to the
 * smaller of (req_prod - loop) and (RING_SIZE - (loop - prod)), and
 * is therefore non-zero only while both of those are non-zero.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
    643 
    644 /*
    645  * Pass packets to the peer using page flipping.
    646  */
    647 mblk_t *
    648 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
    649 {
    650 	mblk_t *free = mp, *prev = NULL;
    651 	size_t len;
    652 	gnttab_transfer_t *gop;
    653 	boolean_t notify;
    654 	RING_IDX loop, prod, end;
    655 
    656 	/*
    657 	 * For each packet the sequence of operations is:
    658 	 *
    659 	 * 1. get a new page from the hypervisor.
    660 	 * 2. get a request slot from the ring.
    661 	 * 3. copy the data into the new page.
    662 	 * 4. transfer the page to the peer.
    663 	 * 5. update the request slot.
    664 	 * 6. kick the peer.
    665 	 * 7. free mp.
    666 	 *
    667 	 * In order to reduce the number of hypercalls, we prepare
    668 	 * several packets for the peer and perform a single hypercall
    669 	 * to transfer them.
    670 	 */
    671 
    672 	mutex_enter(&xnbp->xnb_rx_lock);
    673 
    674 	/*
    675 	 * If we are not connected to the peer or have not yet
    676 	 * finished hotplug it is too early to pass packets to the
    677 	 * peer.
    678 	 */
    679 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
    680 		mutex_exit(&xnbp->xnb_rx_lock);
    681 		DTRACE_PROBE(flip_rx_too_early);
    682 		xnbp->xnb_stat_rx_too_early++;
    683 		return (mp);
    684 	}
    685 
    686 	loop = xnbp->xnb_rx_ring.req_cons;
    687 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    688 	gop = xnbp->xnb_rx_top;
    689 
    690 	while ((mp != NULL) &&
    691 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
    692 
    693 		mfn_t mfn;
    694 		pfn_t pfn;
    695 		netif_rx_request_t *rxreq;
    696 		netif_rx_response_t *rxresp;
    697 		char *valoop;
    698 		mblk_t *ml;
    699 		uint16_t cksum_flags;
    700 
    701 		/* 1 */
    702 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
    703 			xnbp->xnb_stat_rx_defer++;
    704 			break;
    705 		}
    706 
    707 		/* 2 */
    708 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
    709 
    710 #ifdef XNB_DEBUG
    711 		if (!(rxreq->id < NET_RX_RING_SIZE))
    712 			cmn_err(CE_PANIC, "xnb_to_peer: "
    713 			    "id %d out of range in request 0x%p",
    714 			    rxreq->id, (void *)rxreq);
    715 #endif /* XNB_DEBUG */
    716 
    717 		/* Assign a pfn and map the new page at the allocated va. */
    718 		pfn = xen_assign_pfn(mfn);
    719 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    720 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
    721 
    722 		/* 3 */
    723 		len = 0;
    724 		valoop = xnbp->xnb_rx_va;
    725 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
    726 			size_t chunk = ml->b_wptr - ml->b_rptr;
    727 
    728 			bcopy(ml->b_rptr, valoop, chunk);
    729 			valoop += chunk;
    730 			len += chunk;
    731 		}
    732 
    733 		ASSERT(len < PAGESIZE);
    734 
    735 		/* Release the pfn. */
    736 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    737 		    HAT_UNLOAD_UNMAP);
    738 		xen_release_pfn(pfn);
    739 
    740 		/* 4 */
    741 		gop->mfn = mfn;
    742 		gop->domid = xnbp->xnb_peer;
    743 		gop->ref = rxreq->gref;
    744 
    745 		/* 5.1 */
    746 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
    747 		rxresp->offset = 0;
    748 		rxresp->flags = 0;
    749 
    750 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
    751 		if (cksum_flags != 0)
    752 			xnbp->xnb_stat_rx_cksum_deferred++;
    753 		rxresp->flags |= cksum_flags;
    754 
    755 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
    756 		rxresp->status = len;
    757 
    758 		loop++;
    759 		prod++;
    760 		gop++;
    761 		prev = mp;
    762 		mp = mp->b_next;
    763 	}
    764 
    765 	/*
    766 	 * Did we actually do anything?
    767 	 */
    768 	if (loop == xnbp->xnb_rx_ring.req_cons) {
    769 		mutex_exit(&xnbp->xnb_rx_lock);
    770 		return (mp);
    771 	}
    772 
    773 	end = loop;
    774 
    775 	/*
    776 	 * Unlink the end of the 'done' list from the remainder.
    777 	 */
    778 	ASSERT(prev != NULL);
    779 	prev->b_next = NULL;
    780 
    781 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
    782 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
    783 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
    784 	}
    785 
    786 	loop = xnbp->xnb_rx_ring.req_cons;
    787 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    788 	gop = xnbp->xnb_rx_top;
    789 
    790 	while (loop < end) {
    791 		int16_t status = NETIF_RSP_OKAY;
    792 
    793 		if (gop->status != 0) {
    794 			status = NETIF_RSP_ERROR;
    795 
    796 			/*
    797 			 * If the status is anything other than
    798 			 * GNTST_bad_page then we don't own the page
    799 			 * any more, so don't try to give it back.
    800 			 */
    801 			if (gop->status != GNTST_bad_page)
    802 				gop->mfn = 0;
    803 		} else {
    804 			/* The page is no longer ours. */
    805 			gop->mfn = 0;
    806 		}
    807 
    808 		if (gop->mfn != 0)
    809 			/*
    810 			 * Give back the page, as we won't be using
    811 			 * it.
    812 			 */
    813 			xnb_free_page(xnbp, gop->mfn);
    814 		else
    815 			/*
    816 			 * We gave away a page, update our accounting
    817 			 * now.
    818 			 */
    819 			balloon_drv_subtracted(1);
    820 
    821 		/* 5.2 */
    822 		if (status != NETIF_RSP_OKAY) {
    823 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
    824 			    status;
    825 		} else {
    826 			xnbp->xnb_stat_ipackets++;
    827 			xnbp->xnb_stat_rbytes += len;
    828 		}
    829 
    830 		loop++;
    831 		prod++;
    832 		gop++;
    833 	}
    834 
    835 	xnbp->xnb_rx_ring.req_cons = loop;
    836 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
    837 
    838 	/* 6 */
    839 	/* LINTED: constant in conditional context */
    840 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
    841 	if (notify) {
    842 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
    843 		xnbp->xnb_stat_rx_notify_sent++;
    844 	} else {
    845 		xnbp->xnb_stat_rx_notify_deferred++;
    846 	}
    847 
    848 	if (mp != NULL)
    849 		xnbp->xnb_stat_rx_defer++;
    850 
    851 	mutex_exit(&xnbp->xnb_rx_lock);
    852 
    853 	/* Free mblk_t's that we consumed. */
    854 	freemsgchain(free);
    855 
    856 	return (mp);
    857 }
    858 
    859 /* Helper functions for xnb_copy_to_peer(). */
    860 
    861 /*
    862  * Grow the array of copy operation descriptors.
    863  */
    864 static boolean_t
    865 grow_cpop_area(xnb_t *xnbp)
    866 {
    867 	size_t count;
    868 	gnttab_copy_t *new;
    869 
    870 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
    871 
    872 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
    873 
    874 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
    875 		xnbp->xnb_stat_other_allocation_failure++;
    876 		return (B_FALSE);
    877 	}
    878 
    879 	bcopy(xnbp->xnb_rx_cpop, new,
    880 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    881 
    882 	kmem_free(xnbp->xnb_rx_cpop,
    883 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    884 
    885 	xnbp->xnb_rx_cpop = new;
    886 	xnbp->xnb_rx_cpop_count = count;
    887 
    888 	xnbp->xnb_stat_rx_cpoparea_grown++;
    889 
    890 	return (B_TRUE);
    891 }
    892 
    893 /*
    894  * Check whether an address is on a page that's foreign to this domain.
    895  */
    896 static boolean_t
    897 is_foreign(void *addr)
    898 {
    899 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
    900 
    901 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
    902 }
    903 
    904 /*
    905  * Insert a newly allocated mblk into a chain, replacing the old one.
    906  */
    907 static mblk_t *
    908 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
    909 {
    910 	uint32_t	start, stuff, end, value, flags;
    911 	mblk_t		*new_mp;
    912 
    913 	new_mp = copyb(mp);
    914 	if (new_mp == NULL)
    915 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
    916 		    "for %p, len %lu", (void *) mp, len);
    917 
    918 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
    919 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
    920 	    flags, KM_NOSLEEP);
    921 
    922 	new_mp->b_next = mp->b_next;
    923 	new_mp->b_prev = mp->b_prev;
    924 	new_mp->b_cont = mp->b_cont;
    925 
    926 	/* Make sure we only overwrite pointers to the mblk being replaced. */
    927 	if (mp_prev != NULL && mp_prev->b_next == mp)
    928 		mp_prev->b_next = new_mp;
    929 
    930 	if (ml_prev != NULL && ml_prev->b_cont == mp)
    931 		ml_prev->b_cont = new_mp;
    932 
    933 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
    934 	freemsg(mp);
    935 
    936 	return (new_mp);
    937 }
    938 
    939 /*
    940  * Set all the fields in a gnttab_copy_t.
    941  */
    942 static void
    943 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
    944     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
    945 {
    946 	ASSERT(xnbp != NULL && gp != NULL);
    947 
    948 	gp->source.offset = s_off;
    949 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
    950 	gp->source.domid = DOMID_SELF;
    951 
    952 	gp->len = (uint16_t)len;
    953 	gp->flags = GNTCOPY_dest_gref;
    954 	gp->status = 0;
    955 
    956 	gp->dest.u.ref = d_ref;
    957 	gp->dest.offset = d_off;
    958 	gp->dest.domid = xnbp->xnb_peer;
    959 }
    960 
    961 /*
    962  * Pass packets to the peer using hypervisor copy operations.
    963  */
    964 mblk_t *
    965 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
    966 {
    967 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
    968 	mblk_t		*ml, *ml_prev;
    969 	boolean_t	notify;
    970 	RING_IDX	loop, prod;
    971 	int		i;
    972 
    973 	/*
    974 	 * If the peer does not pre-post buffers for received packets,
    975 	 * use page flipping to pass packets to it.
    976 	 */
    977 	if (!xnbp->xnb_rx_hv_copy)
    978 		return (xnb_to_peer(xnbp, mp));
    979 
    980 	/*
    981 	 * For each packet the sequence of operations is:
    982 	 *
    983 	 *  1. get a request slot from the ring.
    984 	 *  2. set up data for hypercall (see NOTE below)
    985 	 *  3. have the hypervisore copy the data
    986 	 *  4. update the request slot.
    987 	 *  5. kick the peer.
    988 	 *
    989 	 * NOTE ad 2.
    990 	 *  In order to reduce the number of hypercalls, we prepare
    991 	 *  several mblks (mp->b_cont != NULL) for the peer and
    992 	 *  perform a single hypercall to transfer them.  We also have
    993 	 *  to set up a seperate copy operation for every page.
    994 	 *
    995 	 * If we have more than one packet (mp->b_next != NULL), we do
    996 	 * this whole dance repeatedly.
    997 	 */
    998 
    999 	mutex_enter(&xnbp->xnb_rx_lock);
   1000 
   1001 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
   1002 		mutex_exit(&xnbp->xnb_rx_lock);
   1003 		DTRACE_PROBE(copy_rx_too_early);
   1004 		xnbp->xnb_stat_rx_too_early++;
   1005 		return (mp);
   1006 	}
   1007 
   1008 	loop = xnbp->xnb_rx_ring.req_cons;
   1009 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
   1010 
   1011 	while ((mp != NULL) &&
   1012 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
   1013 		netif_rx_request_t	*rxreq;
   1014 		size_t			d_offset, len;
   1015 		int			item_count;
   1016 		gnttab_copy_t		*gop_cp;
   1017 		netif_rx_response_t	*rxresp;
   1018 		uint16_t		cksum_flags;
   1019 		int16_t			status = NETIF_RSP_OKAY;
   1020 
   1021 		/* 1 */
   1022 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
   1023 
   1024 #ifdef XNB_DEBUG
   1025 		if (!(rxreq->id < NET_RX_RING_SIZE))
   1026 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
   1027 			    "id %d out of range in request 0x%p",
   1028 			    rxreq->id, (void *)rxreq);
   1029 #endif /* XNB_DEBUG */
   1030 
   1031 		/* 2 */
   1032 		d_offset = 0;
   1033 		len = 0;
   1034 		item_count = 0;
   1035 
   1036 		gop_cp = xnbp->xnb_rx_cpop;
   1037 
   1038 		/*
   1039 		 * We walk the b_cont pointers and set up a
   1040 		 * gnttab_copy_t for each sub-page chunk in each data
   1041 		 * block.
   1042 		 */
   1043 		/* 2a */
   1044 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
   1045 			size_t	chunk = ml->b_wptr - ml->b_rptr;
   1046 			uchar_t	*r_tmp,	*rpt_align;
   1047 			size_t	r_offset;
   1048 
   1049 			/*
   1050 			 * The hypervisor will not allow us to
   1051 			 * reference a foreign page (e.g. one
   1052 			 * belonging to another domain) by mfn in the
   1053 			 * copy operation. If the data in this mblk is
   1054 			 * on such a page we must copy the data into a
   1055 			 * local page before initiating the hypervisor
   1056 			 * copy operation.
   1057 			 */
   1058 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
   1059 				mblk_t *ml_new = replace_msg(ml, chunk,
   1060 				    mp_prev, ml_prev);
   1061 
   1062 				/* We can still use old ml, but not *ml! */
   1063 				if (free == ml)
   1064 					free = ml_new;
   1065 				if (mp == ml)
   1066 					mp = ml_new;
   1067 				ml = ml_new;
   1068 
   1069 				xnbp->xnb_stat_rx_foreign_page++;
   1070 			}
   1071 
   1072 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
   1073 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
   1074 			r_tmp = ml->b_rptr;
   1075 
   1076 			if (d_offset + chunk > PAGESIZE)
   1077 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
   1078 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
   1079 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
   1080 				    (void *)mp, (void *)saved_mp, (void *)ml,
   1081 				    (void *)rpt_align,
   1082 				    d_offset, chunk, (int)PAGESIZE);
   1083 
   1084 			while (chunk > 0) {
   1085 				size_t part_len;
   1086 
   1087 				if (item_count == xnbp->xnb_rx_cpop_count) {
   1088 					if (!grow_cpop_area(xnbp))
   1089 						goto failure;
   1090 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
   1091 				}
   1092 				/*
   1093 				 * If our mblk crosses a page boundary, we need
   1094 				 * to do a seperate copy for each page.
   1095 				 */
   1096 				if (r_offset + chunk > PAGESIZE) {
   1097 					part_len = PAGESIZE - r_offset;
   1098 
   1099 					DTRACE_PROBE3(mblk_page_crossed,
   1100 					    (mblk_t *), ml, int, chunk, int,
   1101 					    (int)r_offset);
   1102 
   1103 					xnbp->xnb_stat_rx_pagebndry_crossed++;
   1104 				} else {
   1105 					part_len = chunk;
   1106 				}
   1107 
   1108 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
   1109 				    d_offset, part_len, rxreq->gref);
   1110 
   1111 				chunk -= part_len;
   1112 
   1113 				len += part_len;
   1114 				d_offset += part_len;
   1115 				r_tmp += part_len;
   1116 				/*
   1117 				 * The 2nd, 3rd ... last copies will always
   1118 				 * start at r_tmp, therefore r_offset is 0.
   1119 				 */
   1120 				r_offset = 0;
   1121 				gop_cp++;
   1122 				item_count++;
   1123 			}
   1124 			ml_prev = ml;
   1125 
   1126 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
   1127 			    chunk, int, len, int, item_count);
   1128 		}
   1129 		/* 3 */
   1130 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
   1131 		    item_count) != 0) {
   1132 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
   1133 			DTRACE_PROBE(HV_granttableopfailed);
   1134 		}
   1135 
   1136 		/* 4 */
   1137 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
   1138 		rxresp->offset = 0;
   1139 
   1140 		rxresp->flags = 0;
   1141 
   1142 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
   1143 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1144 		    (int)rxresp->status);
   1145 
   1146 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
   1147 		if (cksum_flags != 0)
   1148 			xnbp->xnb_stat_rx_cksum_deferred++;
   1149 		rxresp->flags |= cksum_flags;
   1150 
   1151 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
   1152 		rxresp->status = len;
   1153 
   1154 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
   1155 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1156 		    (int)rxresp->status);
   1157 
   1158 		for (i = 0; i < item_count; i++) {
   1159 			if (xnbp->xnb_rx_cpop[i].status != 0) {
   1160 				DTRACE_PROBE2(cpop_status_nonnull, int,
   1161 				    (int)xnbp->xnb_rx_cpop[i].status,
   1162 				    int, i);
   1163 				status = NETIF_RSP_ERROR;
   1164 			}
   1165 		}
   1166 
   1167 		/* 5.2 */
   1168 		if (status != NETIF_RSP_OKAY) {
   1169 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
   1170 			    status;
   1171 			xnbp->xnb_stat_rx_rsp_notok++;
   1172 		} else {
   1173 			xnbp->xnb_stat_ipackets++;
   1174 			xnbp->xnb_stat_rbytes += len;
   1175 		}
   1176 
   1177 		loop++;
   1178 		prod++;
   1179 		mp_prev = mp;
   1180 		mp = mp->b_next;
   1181 	}
   1182 failure:
   1183 	/*
   1184 	 * Did we actually do anything?
   1185 	 */
   1186 	if (loop == xnbp->xnb_rx_ring.req_cons) {
   1187 		mutex_exit(&xnbp->xnb_rx_lock);
   1188 		return (mp);
   1189 	}
   1190 
   1191 	/*
   1192 	 * Unlink the end of the 'done' list from the remainder.
   1193 	 */
   1194 	ASSERT(mp_prev != NULL);
   1195 	mp_prev->b_next = NULL;
   1196 
   1197 	xnbp->xnb_rx_ring.req_cons = loop;
   1198 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
   1199 
   1200 	/* 6 */
   1201 	/* LINTED: constant in conditional context */
   1202 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
   1203 	if (notify) {
   1204 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1205 		xnbp->xnb_stat_rx_notify_sent++;
   1206 	} else {
   1207 		xnbp->xnb_stat_rx_notify_deferred++;
   1208 	}
   1209 
   1210 	if (mp != NULL)
   1211 		xnbp->xnb_stat_rx_defer++;
   1212 
   1213 	mutex_exit(&xnbp->xnb_rx_lock);
   1214 
   1215 	/* Free mblk_t structs we have consumed. */
   1216 	freemsgchain(free);
   1217 
   1218 	return (mp);
   1219 }
   1220 
   1221 
   1222 static void
   1223 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
   1224 {
   1225 	boolean_t notify;
   1226 
   1227 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1228 
   1229 	/* LINTED: constant in conditional context */
   1230 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
   1231 	if (notify || force) {
   1232 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1233 		xnbp->xnb_stat_tx_notify_sent++;
   1234 	} else {
   1235 		xnbp->xnb_stat_tx_notify_deferred++;
   1236 	}
   1237 }
   1238 
   1239 static void
   1240 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
   1241 {
   1242 	RING_IDX i;
   1243 	netif_tx_response_t *txresp;
   1244 
   1245 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1246 
   1247 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
   1248 
   1249 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
   1250 	txresp->id = id;
   1251 	txresp->status = status;
   1252 
   1253 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
   1254 
   1255 	/*
   1256 	 * Note that we don't push the change to the peer here - that
   1257 	 * is the callers responsibility.
   1258 	 */
   1259 }
   1260 
   1261 static void
   1262 xnb_txbuf_recycle(xnb_txbuf_t *txp)
   1263 {
   1264 	xnb_t *xnbp = txp->xt_xnbp;
   1265 
   1266 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1267 
   1268 	xnbp->xnb_tx_buf_outstanding--;
   1269 }
   1270 
   1271 static int
   1272 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
   1273 {
   1274 	_NOTE(ARGUNUSED(kmflag));
   1275 	xnb_txbuf_t *txp = buf;
   1276 	xnb_t *xnbp = arg;
   1277 	size_t len;
   1278 	ddi_dma_cookie_t dma_cookie;
   1279 	uint_t ncookies;
   1280 
   1281 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
   1282 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
   1283 	txp->xt_xnbp = xnbp;
   1284 	txp->xt_next = NULL;
   1285 
   1286 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
   1287 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
   1288 		goto failure;
   1289 
   1290 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
   1291 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
   1292 	    &txp->xt_acc_handle) != DDI_SUCCESS)
   1293 		goto failure_1;
   1294 
   1295 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
   1296 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
   1297 	    &dma_cookie, &ncookies)
   1298 	    != DDI_DMA_MAPPED)
   1299 		goto failure_2;
   1300 	ASSERT(ncookies == 1);
   1301 
   1302 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
   1303 	txp->xt_buflen = dma_cookie.dmac_size;
   1304 
   1305 	DTRACE_PROBE(txbuf_allocated);
   1306 
   1307 	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
   1308 	xnbp->xnb_tx_buf_outstanding++;
   1309 
   1310 	return (0);
   1311 
   1312 failure_2:
   1313 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1314 
   1315 failure_1:
   1316 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1317 
   1318 failure:
   1319 
   1320 	return (-1);
   1321 }
   1322 
   1323 static void
   1324 xnb_txbuf_destructor(void *buf, void *arg)
   1325 {
   1326 	xnb_txbuf_t *txp = buf;
   1327 	xnb_t *xnbp = arg;
   1328 
   1329 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
   1330 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1331 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1332 
   1333 	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
   1334 }
   1335 
   1336 /*
   1337  * Take packets from the peer and deliver them onward.
   1338  */
   1339 static mblk_t *
   1340 xnb_from_peer(xnb_t *xnbp)
   1341 {
   1342 	RING_IDX start, end, loop;
   1343 	gnttab_copy_t *cop;
   1344 	xnb_txbuf_t **txpp;
   1345 	netif_tx_request_t *txreq;
   1346 	boolean_t work_to_do, need_notify = B_FALSE;
   1347 	mblk_t *head, *tail;
   1348 	int n_data_req, i;
   1349 
   1350 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1351 
   1352 	head = tail = NULL;
   1353 around:
   1354 
   1355 	/* LINTED: constant in conditional context */
   1356 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
   1357 	if (!work_to_do) {
   1358 finished:
   1359 		xnb_tx_notify_peer(xnbp, need_notify);
   1360 
   1361 		return (head);
   1362 	}
   1363 
   1364 	start = xnbp->xnb_tx_ring.req_cons;
   1365 	end = xnbp->xnb_tx_ring.sring->req_prod;
   1366 
   1367 	if ((end - start) > NET_TX_RING_SIZE) {
   1368 		/*
   1369 		 * This usually indicates that the frontend driver is
   1370 		 * misbehaving, as it's not possible to have more than
   1371 		 * NET_TX_RING_SIZE ring elements in play at any one
   1372 		 * time.
   1373 		 *
   1374 		 * We reset the ring pointers to the state declared by
   1375 		 * the frontend and try to carry on.
   1376 		 */
   1377 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
   1378 		    "items in the ring, resetting and trying to recover.",
   1379 		    xnbp->xnb_peer, (end - start));
   1380 
   1381 		/* LINTED: constant in conditional context */
   1382 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
   1383 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1384 
   1385 		goto around;
   1386 	}
   1387 
   1388 	loop = start;
   1389 	cop = xnbp->xnb_tx_cop;
   1390 	txpp = xnbp->xnb_tx_bufp;
   1391 	n_data_req = 0;
   1392 
   1393 	while (loop < end) {
   1394 		static const uint16_t acceptable_flags =
   1395 		    NETTXF_csum_blank |
   1396 		    NETTXF_data_validated |
   1397 		    NETTXF_extra_info;
   1398 		uint16_t unexpected_flags;
   1399 
   1400 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1401 
   1402 		unexpected_flags = txreq->flags & ~acceptable_flags;
   1403 		if (unexpected_flags != 0) {
   1404 			/*
   1405 			 * The peer used flag bits that we do not
   1406 			 * recognize.
   1407 			 */
   1408 			cmn_err(CE_WARN, "xnb_from_peer: "
   1409 			    "unexpected flag bits (0x%x) from peer "
   1410 			    "in transmit request",
   1411 			    unexpected_flags);
   1412 			xnbp->xnb_stat_tx_unexpected_flags++;
   1413 
   1414 			/* Mark this entry as failed. */
   1415 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
   1416 			need_notify = B_TRUE;
   1417 
   1418 		} else if (txreq->flags & NETTXF_extra_info) {
   1419 			struct netif_extra_info *erp;
   1420 			boolean_t status;
   1421 
   1422 			loop++; /* Consume another slot in the ring. */
   1423 			ASSERT(loop <= end);
   1424 
   1425 			erp = (struct netif_extra_info *)
   1426 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1427 
   1428 			switch (erp->type) {
   1429 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
   1430 				ASSERT(xnbp->xnb_multicast_control);
   1431 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
   1432 				    &erp->u.mcast.addr);
   1433 				break;
   1434 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
   1435 				ASSERT(xnbp->xnb_multicast_control);
   1436 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
   1437 				    &erp->u.mcast.addr);
   1438 				break;
   1439 			default:
   1440 				status = B_FALSE;
   1441 				cmn_err(CE_WARN, "xnb_from_peer: "
   1442 				    "unknown extra type %d", erp->type);
   1443 				break;
   1444 			}
   1445 
   1446 			xnb_tx_mark_complete(xnbp, txreq->id,
   1447 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
   1448 			need_notify = B_TRUE;
   1449 
   1450 		} else if ((txreq->offset > PAGESIZE) ||
   1451 		    (txreq->offset + txreq->size > PAGESIZE)) {
   1452 			/*
   1453 			 * Peer attempted to refer to data beyond the
   1454 			 * end of the granted page.
   1455 			 */
   1456 			cmn_err(CE_WARN, "xnb_from_peer: "
   1457 			    "attempt to refer beyond the end of granted "
   1458 			    "page in txreq (offset %d, size %d).",
   1459 			    txreq->offset, txreq->size);
   1460 			xnbp->xnb_stat_tx_overflow_page++;
   1461 
   1462 			/* Mark this entry as failed. */
   1463 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
   1464 			need_notify = B_TRUE;
   1465 
   1466 		} else {
   1467 			xnb_txbuf_t *txp;
   1468 
   1469 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
   1470 			    KM_NOSLEEP);
   1471 			if (txp == NULL)
   1472 				break;
   1473 
   1474 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
   1475 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
   1476 			if (txp->xt_mblk == NULL) {
   1477 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1478 				break;
   1479 			}
   1480 
   1481 			txp->xt_idx = loop;
   1482 			txp->xt_id = txreq->id;
   1483 
   1484 			cop->source.u.ref = txreq->gref;
   1485 			cop->source.domid = xnbp->xnb_peer;
   1486 			cop->source.offset = txreq->offset;
   1487 
   1488 			cop->dest.u.gmfn = txp->xt_mfn;
   1489 			cop->dest.domid = DOMID_SELF;
   1490 			cop->dest.offset = 0;
   1491 
   1492 			cop->len = txreq->size;
   1493 			cop->flags = GNTCOPY_source_gref;
   1494 			cop->status = 0;
   1495 
   1496 			*txpp = txp;
   1497 
   1498 			txpp++;
   1499 			cop++;
   1500 			n_data_req++;
   1501 
   1502 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
   1503 		}
   1504 
   1505 		loop++;
   1506 	}
   1507 
   1508 	xnbp->xnb_tx_ring.req_cons = loop;
   1509 
   1510 	if (n_data_req == 0)
   1511 		goto around;
   1512 
   1513 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
   1514 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
   1515 
   1516 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
   1517 
   1518 		txpp = xnbp->xnb_tx_bufp;
   1519 		i = n_data_req;
   1520 		while (i > 0) {
   1521 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
   1522 			txpp++;
   1523 			i--;
   1524 		}
   1525 
   1526 		goto finished;
   1527 	}
   1528 
   1529 	txpp = xnbp->xnb_tx_bufp;
   1530 	cop = xnbp->xnb_tx_cop;
   1531 	i = n_data_req;
   1532 
   1533 	while (i > 0) {
   1534 		xnb_txbuf_t *txp = *txpp;
   1535 
   1536 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
   1537 
   1538 		if (cop->status != 0) {
   1539 #ifdef XNB_DEBUG
   1540 			cmn_err(CE_WARN, "xnb_from_peer: "
   1541 			    "txpp 0x%p failed (%d)",
   1542 			    (void *)*txpp, cop->status);
   1543 #endif /* XNB_DEBUG */
   1544 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
   1545 			freemsg(txp->xt_mblk);
   1546 		} else {
   1547 			mblk_t *mp;
   1548 
   1549 			mp = txp->xt_mblk;
   1550 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
   1551 			mp->b_wptr += txreq->size;
   1552 			mp->b_next = NULL;
   1553 
   1554 			/*
   1555 			 * If there are checksum flags, process them
   1556 			 * appropriately.
   1557 			 */
   1558 			if ((txreq->flags &
   1559 			    (NETTXF_csum_blank | NETTXF_data_validated))
   1560 			    != 0) {
   1561 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
   1562 				    mp, txreq->flags);
   1563 				xnbp->xnb_stat_tx_cksum_no_need++;
   1564 
   1565 				txp->xt_mblk = mp;
   1566 			}
   1567 
   1568 			if (head == NULL) {
   1569 				ASSERT(tail == NULL);
   1570 				head = mp;
   1571 			} else {
   1572 				ASSERT(tail != NULL);
   1573 				tail->b_next = mp;
   1574 			}
   1575 			tail = mp;
   1576 
   1577 			xnbp->xnb_stat_opackets++;
   1578 			xnbp->xnb_stat_obytes += txreq->size;
   1579 
   1580 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
   1581 		}
   1582 
   1583 		txpp++;
   1584 		cop++;
   1585 		i--;
   1586 	}
   1587 
   1588 	goto around;
   1589 	/* NOTREACHED */
   1590 }
   1591 
   1592 static uint_t
   1593 xnb_intr(caddr_t arg)
   1594 {
   1595 	xnb_t *xnbp = (xnb_t *)arg;
   1596 	mblk_t *mp;
   1597 
   1598 	xnbp->xnb_stat_intr++;
   1599 
   1600 	mutex_enter(&xnbp->xnb_tx_lock);
   1601 
   1602 	ASSERT(xnbp->xnb_connected);
   1603 
   1604 	mp = xnb_from_peer(xnbp);
   1605 
   1606 	mutex_exit(&xnbp->xnb_tx_lock);
   1607 
   1608 	if (!xnbp->xnb_hotplugged) {
   1609 		xnbp->xnb_stat_tx_too_early++;
   1610 		goto fail;
   1611 	}
   1612 	if (mp == NULL) {
   1613 		xnbp->xnb_stat_spurious_intr++;
   1614 		goto fail;
   1615 	}
   1616 
   1617 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
   1618 
   1619 	return (DDI_INTR_CLAIMED);
   1620 
   1621 fail:
   1622 	freemsgchain(mp);
   1623 	return (DDI_INTR_CLAIMED);
   1624 }
   1625 
   1626 /*
   1627  * Read our configuration from xenstore.
   1628  */
   1629 boolean_t
   1630 xnb_read_xs_config(xnb_t *xnbp)
   1631 {
   1632 	char *xsname;
   1633 	char mac[ETHERADDRL * 3];
   1634 
   1635 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
   1636 
   1637 	if (xenbus_scanf(XBT_NULL, xsname,
   1638 	    "mac", "%s", mac) != 0) {
   1639 		cmn_err(CE_WARN, "xnb_attach: "
   1640 		    "cannot read mac address from %s",
   1641 		    xsname);
   1642 		return (B_FALSE);
   1643 	}
   1644 
   1645 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
   1646 		cmn_err(CE_WARN,
   1647 		    "xnb_attach: cannot parse mac address %s",
   1648 		    mac);
   1649 		return (B_FALSE);
   1650 	}
   1651 
   1652 	return (B_TRUE);
   1653 }
   1654 
   1655 /*
   1656  * Read the configuration of the peer from xenstore.
   1657  */
   1658 boolean_t
   1659 xnb_read_oe_config(xnb_t *xnbp)
   1660 {
   1661 	char *oename;
   1662 	int i;
   1663 
   1664 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
   1665 
   1666 	if (xenbus_gather(XBT_NULL, oename,
   1667 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
   1668 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
   1669 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
   1670 	    NULL) != 0) {
   1671 		cmn_err(CE_WARN, "xnb_read_oe_config: "
   1672 		    "cannot read other-end details from %s",
   1673 		    oename);
   1674 		return (B_FALSE);
   1675 	}
   1676 
   1677 	/*
   1678 	 * Check whether our peer requests receive side hypervisor
   1679 	 * copy.
   1680 	 */
   1681 	if (xenbus_scanf(XBT_NULL, oename,
   1682 	    "request-rx-copy", "%d", &i) != 0)
   1683 		i = 0;
   1684 	if (i != 0)
   1685 		xnbp->xnb_rx_hv_copy = B_TRUE;
   1686 
   1687 	/*
   1688 	 * Check whether our peer requests multicast_control.
   1689 	 */
   1690 	if (xenbus_scanf(XBT_NULL, oename,
   1691 	    "request-multicast-control", "%d", &i) != 0)
   1692 		i = 0;
   1693 	if (i != 0)
   1694 		xnbp->xnb_multicast_control = B_TRUE;
   1695 
   1696 	/*
   1697 	 * The Linux backend driver here checks to see if the peer has
   1698 	 * set 'feature-no-csum-offload'. This is used to indicate
   1699 	 * that the guest cannot handle receiving packets without a
   1700 	 * valid checksum. We don't check here, because packets passed
   1701 	 * to the peer _always_ have a valid checksum.
   1702 	 *
   1703 	 * There are three cases:
   1704 	 *
   1705 	 * - the NIC is dedicated: packets from the wire should always
   1706 	 *   have a valid checksum. If the hardware validates the
   1707 	 *   checksum then the relevant bit will be set in the packet
   1708 	 *   attributes and we will inform the peer. It can choose to
   1709 	 *   ignore the hardware verification.
   1710 	 *
   1711 	 * - the NIC is shared (VNIC) and a packet originates from the
   1712 	 *   wire: this is the same as the case above - the packets
   1713 	 *   will have a valid checksum.
   1714 	 *
   1715 	 * - the NIC is shared (VNIC) and a packet originates from the
   1716 	 *   host: the MAC layer ensures that all such packets have a
   1717 	 *   valid checksum by calculating one if the stack did not.
   1718 	 */
   1719 
   1720 	return (B_TRUE);
   1721 }
   1722 
   1723 void
   1724 xnb_start_connect(xnb_t *xnbp)
   1725 {
   1726 	dev_info_t  *dip = xnbp->xnb_devinfo;
   1727 
   1728 	if (!xnb_connect_rings(dip)) {
   1729 		cmn_err(CE_WARN, "xnb_start_connect: "
   1730 		    "cannot connect rings");
   1731 		goto failed;
   1732 	}
   1733 
   1734 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
   1735 		cmn_err(CE_WARN, "xnb_start_connect: "
   1736 		    "flavour failed to connect");
   1737 		goto failed;
   1738 	}
   1739 
   1740 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
   1741 	return;
   1742 
   1743 failed:
   1744 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1745 	xnb_disconnect_rings(dip);
   1746 	(void) xvdi_switch_state(dip, XBT_NULL,
   1747 	    XenbusStateClosed);
   1748 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1749 }
   1750 
   1751 static boolean_t
   1752 xnb_connect_rings(dev_info_t *dip)
   1753 {
   1754 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1755 	struct gnttab_map_grant_ref map_op;
   1756 
   1757 	/*
   1758 	 * Cannot attempt to connect the rings if already connected.
   1759 	 */
   1760 	ASSERT(!xnbp->xnb_connected);
   1761 
   1762 	/*
   1763 	 * 1. allocate a vaddr for the tx page, one for the rx page.
   1764 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
   1765 	 *    into the allocated vaddr (one for tx, one for rx).
   1766 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
   1767 	 *    bound to this domain.
   1768 	 * 4. associate the event channel with an interrupt.
   1769 	 * 5. enable the interrupt.
   1770 	 */
   1771 
   1772 	/* 1.tx */
   1773 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1774 	    0, 0, 0, 0, VM_SLEEP);
   1775 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
   1776 
   1777 	/* 2.tx */
   1778 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
   1779 	map_op.flags = GNTMAP_host_map;
   1780 	map_op.ref = xnbp->xnb_tx_ring_ref;
   1781 	map_op.dom = xnbp->xnb_peer;
   1782 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
   1783 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1784 	    map_op.status != 0) {
   1785 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
   1786 		goto fail;
   1787 	}
   1788 	xnbp->xnb_tx_ring_handle = map_op.handle;
   1789 
   1790 	/* LINTED: constant in conditional context */
   1791 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
   1792 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1793 
   1794 	/* 1.rx */
   1795 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1796 	    0, 0, 0, 0, VM_SLEEP);
   1797 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
   1798 
   1799 	/* 2.rx */
   1800 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
   1801 	map_op.flags = GNTMAP_host_map;
   1802 	map_op.ref = xnbp->xnb_rx_ring_ref;
   1803 	map_op.dom = xnbp->xnb_peer;
   1804 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
   1805 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1806 	    map_op.status != 0) {
   1807 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
   1808 		goto fail;
   1809 	}
   1810 	xnbp->xnb_rx_ring_handle = map_op.handle;
   1811 
   1812 	/* LINTED: constant in conditional context */
   1813 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
   1814 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
   1815 
   1816 	/* 3 */
   1817 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
   1818 		cmn_err(CE_WARN, "xnb_connect_rings: "
   1819 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
   1820 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1821 		goto fail;
   1822 	}
   1823 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
   1824 
   1825 	/*
   1826 	 * It would be good to set the state to XenbusStateConnected
   1827 	 * here as well, but then what if ddi_add_intr() failed?
   1828 	 * Changing the state in the store will be noticed by the peer
   1829 	 * and cannot be "taken back".
   1830 	 */
   1831 	mutex_enter(&xnbp->xnb_tx_lock);
   1832 	mutex_enter(&xnbp->xnb_rx_lock);
   1833 
   1834 	xnbp->xnb_connected = B_TRUE;
   1835 
   1836 	mutex_exit(&xnbp->xnb_rx_lock);
   1837 	mutex_exit(&xnbp->xnb_tx_lock);
   1838 
   1839 	/* 4, 5 */
   1840 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
   1841 	    != DDI_SUCCESS) {
   1842 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
   1843 		goto fail;
   1844 	}
   1845 	xnbp->xnb_irq = B_TRUE;
   1846 
   1847 	return (B_TRUE);
   1848 
   1849 fail:
   1850 	mutex_enter(&xnbp->xnb_tx_lock);
   1851 	mutex_enter(&xnbp->xnb_rx_lock);
   1852 
   1853 	xnbp->xnb_connected = B_FALSE;
   1854 
   1855 	mutex_exit(&xnbp->xnb_rx_lock);
   1856 	mutex_exit(&xnbp->xnb_tx_lock);
   1857 
   1858 	return (B_FALSE);
   1859 }
   1860 
   1861 static void
   1862 xnb_disconnect_rings(dev_info_t *dip)
   1863 {
   1864 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1865 
   1866 	if (xnbp->xnb_irq) {
   1867 		ddi_remove_intr(dip, 0, NULL);
   1868 		xnbp->xnb_irq = B_FALSE;
   1869 	}
   1870 
   1871 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
   1872 		xvdi_free_evtchn(dip);
   1873 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1874 	}
   1875 
   1876 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
   1877 		struct gnttab_unmap_grant_ref unmap_op;
   1878 
   1879 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1880 		    xnbp->xnb_rx_ring_addr;
   1881 		unmap_op.dev_bus_addr = 0;
   1882 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
   1883 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1884 		    &unmap_op, 1) != 0)
   1885 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1886 			    "cannot unmap rx-ring page (%d)",
   1887 			    unmap_op.status);
   1888 
   1889 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
   1890 	}
   1891 
   1892 	if (xnbp->xnb_rx_ring_addr != NULL) {
   1893 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
   1894 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
   1895 		xnbp->xnb_rx_ring_addr = NULL;
   1896 	}
   1897 
   1898 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
   1899 		struct gnttab_unmap_grant_ref unmap_op;
   1900 
   1901 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1902 		    xnbp->xnb_tx_ring_addr;
   1903 		unmap_op.dev_bus_addr = 0;
   1904 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
   1905 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1906 		    &unmap_op, 1) != 0)
   1907 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1908 			    "cannot unmap tx-ring page (%d)",
   1909 			    unmap_op.status);
   1910 
   1911 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
   1912 	}
   1913 
   1914 	if (xnbp->xnb_tx_ring_addr != NULL) {
   1915 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
   1916 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
   1917 		xnbp->xnb_tx_ring_addr = NULL;
   1918 	}
   1919 }
   1920 
   1921 static void
   1922 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   1923     void *arg, void *impl_data)
   1924 {
   1925 	_NOTE(ARGUNUSED(id, arg));
   1926 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1927 	XenbusState new_state = *(XenbusState *)impl_data;
   1928 
   1929 	ASSERT(xnbp != NULL);
   1930 
   1931 	switch (new_state) {
   1932 	case XenbusStateConnected:
   1933 		/* spurious state change */
   1934 		if (xnbp->xnb_connected)
   1935 			return;
   1936 
   1937 		if (!xnb_read_oe_config(xnbp) ||
   1938 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
   1939 			cmn_err(CE_WARN, "xnb_oe_state_change: "
   1940 			    "read otherend config error");
   1941 			(void) xvdi_switch_state(dip, XBT_NULL,
   1942 			    XenbusStateClosed);
   1943 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1944 
   1945 			break;
   1946 		}
   1947 
   1948 
   1949 		mutex_enter(&xnbp->xnb_state_lock);
   1950 		xnbp->xnb_fe_status = XNB_STATE_READY;
   1951 		if (xnbp->xnb_be_status == XNB_STATE_READY)
   1952 			xnb_start_connect(xnbp);
   1953 		mutex_exit(&xnbp->xnb_state_lock);
   1954 
   1955 		/*
   1956 		 * Now that we've attempted to connect it's reasonable
   1957 		 * to allow an attempt to detach.
   1958 		 */
   1959 		xnbp->xnb_detachable = B_TRUE;
   1960 
   1961 		break;
   1962 
   1963 	case XenbusStateClosing:
   1964 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
   1965 
   1966 		break;
   1967 
   1968 	case XenbusStateClosed:
   1969 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1970 
   1971 		mutex_enter(&xnbp->xnb_tx_lock);
   1972 		mutex_enter(&xnbp->xnb_rx_lock);
   1973 
   1974 		xnb_disconnect_rings(dip);
   1975 		xnbp->xnb_connected = B_FALSE;
   1976 
   1977 		mutex_exit(&xnbp->xnb_rx_lock);
   1978 		mutex_exit(&xnbp->xnb_tx_lock);
   1979 
   1980 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
   1981 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1982 		/*
   1983 		 * In all likelyhood this is already set (in the above
   1984 		 * case), but if the peer never attempted to connect
   1985 		 * and the domain is destroyed we get here without
   1986 		 * having been through the case above, so we set it to
   1987 		 * be sure.
   1988 		 */
   1989 		xnbp->xnb_detachable = B_TRUE;
   1990 
   1991 		break;
   1992 
   1993 	default:
   1994 		break;
   1995 	}
   1996 }
   1997 
   1998 static void
   1999 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   2000     void *arg, void *impl_data)
   2001 {
   2002 	_NOTE(ARGUNUSED(id, arg));
   2003 	xnb_t *xnbp = ddi_get_driver_private(dip);
   2004 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
   2005 
   2006 	ASSERT(xnbp != NULL);
   2007 
   2008 	switch (state) {
   2009 	case Connected:
   2010 		/* spurious hotplug event */
   2011 		if (xnbp->xnb_hotplugged)
   2012 			break;
   2013 
   2014 		if (!xnb_read_xs_config(xnbp))
   2015 			break;
   2016 
   2017 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
   2018 			break;
   2019 
   2020 		mutex_enter(&xnbp->xnb_tx_lock);
   2021 		mutex_enter(&xnbp->xnb_rx_lock);
   2022 
   2023 		xnbp->xnb_hotplugged = B_TRUE;
   2024 
   2025 		mutex_exit(&xnbp->xnb_rx_lock);
   2026 		mutex_exit(&xnbp->xnb_tx_lock);
   2027 
   2028 		mutex_enter(&xnbp->xnb_state_lock);
   2029 		xnbp->xnb_be_status = XNB_STATE_READY;
   2030 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
   2031 			xnb_start_connect(xnbp);
   2032 		mutex_exit(&xnbp->xnb_state_lock);
   2033 
   2034 		break;
   2035 
   2036 	default:
   2037 		break;
   2038 	}
   2039 }
   2040 
   2041 static struct modldrv modldrv = {
   2042 	&mod_miscops, "xnb",
   2043 };
   2044 
   2045 static struct modlinkage modlinkage = {
   2046 	MODREV_1, &modldrv, NULL
   2047 };
   2048 
   2049 int
   2050 _init(void)
   2051 {
   2052 	int i;
   2053 
   2054 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
   2055 
   2056 	i = mod_install(&modlinkage);
   2057 	if (i != DDI_SUCCESS)
   2058 		mutex_destroy(&xnb_alloc_page_lock);
   2059 
   2060 	return (i);
   2061 }
   2062 
   2063 int
   2064 _info(struct modinfo *modinfop)
   2065 {
   2066 	return (mod_info(&modlinkage, modinfop));
   2067 }
   2068 
   2069 int
   2070 _fini(void)
   2071 {
   2072 	int i;
   2073 
   2074 	i = mod_remove(&modlinkage);
   2075 	if (i == DDI_SUCCESS)
   2076 		mutex_destroy(&xnb_alloc_page_lock);
   2077 
   2078 	return (i);
   2079 }
   2080