Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  *
     29  * Copyright (c) 2004 Christian Limpach.
     30  * All rights reserved.
     31  *
     32  * Redistribution and use in source and binary forms, with or without
     33  * modification, are permitted provided that the following conditions
     34  * are met:
     35  * 1. Redistributions of source code must retain the above copyright
     36  *    notice, this list of conditions and the following disclaimer.
     37  * 2. Redistributions in binary form must reproduce the above copyright
     38  *    notice, this list of conditions and the following disclaimer in the
     39  *    documentation and/or other materials provided with the distribution.
     40  * 3. This section intentionally left blank.
     41  * 4. The name of the author may not be used to endorse or promote products
     42  *    derived from this software without specific prior written permission.
     43  *
     44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     54  */
     55 /*
     56  * Section 3 of the above license was updated in response to bug 6379571.
     57  */
     58 
     59 /*
     60  * xnf.c - GLDv3 network driver for domU.
     61  */
     62 
     63 /*
     64  * This driver uses four per-instance locks:
     65  *
     66  * xnf_gref_lock:
     67  *
     68  *    Protects access to the grant reference list stored in
     69  *    xnf_gref_head. Grant references should be acquired and released
     70  *    using gref_get() and gref_put() respectively.
     71  *
     72  * xnf_schedlock:
     73  *
     74  *    Protects:
     75  *    xnf_need_sched - used to record that a previous transmit attempt
     76  *       failed (and consequently it will be necessary to call
     77  *       mac_tx_update() when transmit resources are available).
     78  *    xnf_pending_multicast - the number of multicast requests that
     79  *       have been submitted to the backend for which we have not
     80  *       processed responses.
     81  *
     82  * xnf_txlock:
     83  *
     84  *    Protects the transmit ring (xnf_tx_ring) and associated
     85  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
     86  *
     87  * xnf_rxlock:
     88  *
     89  *    Protects the receive ring (xnf_rx_ring) and associated
     90  *    structures (notably xnf_rx_pkt_info).
     91  *
     92  * If driver-global state that affects both the transmit and receive
     93  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
     94  * held, in that order.
     95  *
     96  * xnf_schedlock is acquired both whilst holding xnf_txlock and
     97  * without. It should always be acquired after xnf_txlock if both are
     98  * held.
     99  *
    100  * Notes:
    101  * - atomic_add_64() is used to manipulate counters where we require
    102  *   accuracy. For counters intended only for observation by humans,
    103  *   post increment/decrement are used instead.
    104  */
    105 
    106 #include <sys/types.h>
    107 #include <sys/errno.h>
    108 #include <sys/param.h>
    109 #include <sys/sysmacros.h>
    110 #include <sys/systm.h>
    111 #include <sys/stream.h>
    112 #include <sys/strsubr.h>
    113 #include <sys/strsun.h>
    114 #include <sys/conf.h>
    115 #include <sys/ddi.h>
    116 #include <sys/devops.h>
    117 #include <sys/sunddi.h>
    118 #include <sys/sunndi.h>
    119 #include <sys/dlpi.h>
    120 #include <sys/ethernet.h>
    121 #include <sys/strsun.h>
    122 #include <sys/pattr.h>
    123 #include <inet/ip.h>
    124 #include <inet/ip_impl.h>
    125 #include <sys/gld.h>
    126 #include <sys/modctl.h>
    127 #include <sys/mac_provider.h>
    128 #include <sys/mac_ether.h>
    129 #include <sys/bootinfo.h>
    130 #include <sys/mach_mmu.h>
    131 #ifdef	XPV_HVM_DRIVER
    132 #include <sys/xpv_support.h>
    133 #include <sys/hypervisor.h>
    134 #else
    135 #include <sys/hypervisor.h>
    136 #include <sys/evtchn_impl.h>
    137 #include <sys/balloon_impl.h>
    138 #endif
    139 #include <xen/public/io/netif.h>
    140 #include <sys/gnttab.h>
    141 #include <xen/sys/xendev.h>
    142 #include <sys/sdt.h>
    143 #include <sys/note.h>
    144 #include <sys/debug.h>
    145 
    146 #include <io/xnf.h>
    147 
/*
 * XNF_DEBUG is enabled for DEBUG and lint builds; it controls whether
 * the debugging variables below exist.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
#endif

#ifdef XNF_DEBUG
/* Debug flag word; xnf_attach() tests it against XNF_DEBUG_DDI. */
int xnf_debug = 0;
/* Set by xnf_attach() to the attaching instance if still NULL. */
xnf_t *xnf_debug_instance = NULL;
#endif
    156 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 *
 * The result has the type of `addr', so callers should pass a value
 * wide enough to hold the full address.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
    164 
/*
 * NOTE(review): presumably the maximum number of fragments used for a
 * single transmitted packet; the use site is not visible here -- confirm.
 */
unsigned int	xnf_max_tx_frags = 1;

/*
 * Should we use the multicast control feature if the backend provides
 * it?  Combined with the backend's "feature-multicast-control" value
 * in xnf_read_config().
 */
boolean_t xnf_multicast_control = B_TRUE;
    172 
/*
 * Received packets below this size are copied to a new streams buffer
 * rather than being desballoc'ed.
 *
 * This value is chosen to accommodate traffic where there are a large
 * number of small packets. For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;
    191 
/* Sentinel values: -1 cast to the respective type. */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/* Address of the xnf_txid_t for transmit id `id' in (p)->xnf_tx_pkt_id[]. */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
/*
 * True if `i' is neither the sentinel nor out of range for the transmit
 * ring.  Note that `i' is evaluated twice.
 */
#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
    198 
/* Required system entry points (passed to DDI_DEFINE_STREAM_OPS below) */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo.
 *
 * All of these except xnf_intr() are installed in xnf_callbacks;
 * xnf_intr() is registered as the interrupt handler by xnf_attach().
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static void xnf_release_mblks(xnf_t *);

/* Constructor/destructor pair for the per-instance xnf_buf_t kmem cache. */
static int xnf_buf_constructor(void *, void *, int);
static void xnf_buf_destructor(void *, void *);
static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
#pragma inline(xnf_buf_get)
static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
#pragma inline(xnf_buf_put)
static void xnf_buf_refresh(xnf_buf_t *);
#pragma inline(xnf_buf_refresh)
static void xnf_buf_recycle(xnf_buf_t *);

/* Constructor/destructor pair for the per-instance xnf_txbuf_t kmem cache. */
static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);

/* Grant reference acquire/release; these take xnf_gref_lock internally. */
static grant_ref_t gref_get(xnf_t *);
#pragma inline(gref_get)
static void gref_put(xnf_t *, grant_ref_t);
#pragma inline(gref_put)

/* Transmit id acquire/release; the caller must hold xnf_txlock. */
static xnf_txid_t *txid_get(xnf_t *);
#pragma inline(txid_get)
static void txid_put(xnf_t *, xnf_txid_t *);
#pragma inline(txid_put)

/* Note: xnf_send_driver_status() is the only non-static prototype here. */
void xnf_send_driver_status(int, int);
static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);
    249 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  This is a positional initializer, so the order of the
 * entries must match the member order of mac_callbacks_t.
 */
static mac_callbacks_t xnf_callbacks = {
	MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	NULL,			/* no handler supplied for this entry */
	xnf_getcapab
};
    262 
/*
 * DMA attributes for network ring buffer.
 *
 * Page aligned and limited to a single segment.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/*
 * DMA attributes for transmit and receive data.
 *
 * Currently identical in value to ringbuf_dma_attr.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
    294 
/* DMA access attributes for registers and descriptors */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC	/* strictly ordered access */
};

/* DMA access attributes for data: NOT to be byte swapped. */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_NEVERSWAP_ACC,	/* never byte swap */
	DDI_STRICTORDER_ACC	/* strictly ordered access */
};
    308 
/*
 * Device operations: only attach and detach are provided by this file;
 * quiesce is declared as not supported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Loadable driver description referenced by modlinkage below. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage used by _init() and _info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
    321 
    322 int
    323 _init(void)
    324 {
    325 	int r;
    326 
    327 	mac_init_ops(&xnf_dev_ops, "xnf");
    328 	r = mod_install(&modlinkage);
    329 	if (r != DDI_SUCCESS)
    330 		mac_fini_ops(&xnf_dev_ops);
    331 
    332 	return (r);
    333 }
    334 
    335 int
    336 _fini(void)
    337 {
    338 	return (EBUSY); /* XXPV should be removable */
    339 }
    340 
    341 int
    342 _info(struct modinfo *modinfop)
    343 {
    344 	return (mod_info(&modlinkage, modinfop));
    345 }
    346 
    347 /*
    348  * Acquire a grant reference.
    349  */
    350 static grant_ref_t
    351 gref_get(xnf_t *xnfp)
    352 {
    353 	grant_ref_t gref;
    354 
    355 	mutex_enter(&xnfp->xnf_gref_lock);
    356 
    357 	do {
    358 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
    359 
    360 	} while ((gref == INVALID_GRANT_REF) &&
    361 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
    362 
    363 	mutex_exit(&xnfp->xnf_gref_lock);
    364 
    365 	if (gref == INVALID_GRANT_REF) {
    366 		xnfp->xnf_stat_gref_failure++;
    367 	} else {
    368 		atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1);
    369 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
    370 			xnfp->xnf_stat_gref_peak =
    371 			    xnfp->xnf_stat_gref_outstanding;
    372 	}
    373 
    374 	return (gref);
    375 }
    376 
    377 /*
    378  * Release a grant reference.
    379  */
    380 static void
    381 gref_put(xnf_t *xnfp, grant_ref_t gref)
    382 {
    383 	ASSERT(gref != INVALID_GRANT_REF);
    384 
    385 	mutex_enter(&xnfp->xnf_gref_lock);
    386 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
    387 	mutex_exit(&xnfp->xnf_gref_lock);
    388 
    389 	atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1);
    390 }
    391 
    392 /*
    393  * Acquire a transmit id.
    394  */
    395 static xnf_txid_t *
    396 txid_get(xnf_t *xnfp)
    397 {
    398 	xnf_txid_t *tidp;
    399 
    400 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
    401 
    402 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
    403 		return (NULL);
    404 
    405 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
    406 
    407 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
    408 	xnfp->xnf_tx_pkt_id_head = tidp->next;
    409 	tidp->next = INVALID_TX_ID;
    410 
    411 	ASSERT(tidp->txbuf == NULL);
    412 
    413 	return (tidp);
    414 }
    415 
    416 /*
    417  * Release a transmit id.
    418  */
    419 static void
    420 txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
    421 {
    422 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
    423 	ASSERT(TX_ID_VALID(tidp->id));
    424 	ASSERT(tidp->next == INVALID_TX_ID);
    425 
    426 	tidp->txbuf = NULL;
    427 	tidp->next = xnfp->xnf_tx_pkt_id_head;
    428 	xnfp->xnf_tx_pkt_id_head = tidp->id;
    429 }
    430 
    431 /*
    432  * Get `wanted' slots in the transmit ring, waiting for at least that
    433  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
    434  * `wanted' to zero.
    435  *
    436  * Return the number of slots available.
    437  */
    438 static int
    439 tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
    440 {
    441 	int slotsfree;
    442 	boolean_t forced_clean = (wanted == 0);
    443 
    444 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
    445 
    446 	/* LINTED: constant in conditional context */
    447 	while (B_TRUE) {
    448 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
    449 
    450 		if ((slotsfree < wanted) || forced_clean)
    451 			slotsfree = xnf_tx_clean_ring(xnfp);
    452 
    453 		/*
    454 		 * If there are more than we need free, tell other
    455 		 * people to come looking again. We hold txlock, so we
    456 		 * are able to take our slots before anyone else runs.
    457 		 */
    458 		if (slotsfree > wanted)
    459 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
    460 
    461 		if (slotsfree >= wanted)
    462 			break;
    463 
    464 		if (!wait)
    465 			break;
    466 
    467 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
    468 	}
    469 
    470 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
    471 
    472 	return (slotsfree);
    473 }
    474 
    475 static int
    476 xnf_setup_rings(xnf_t *xnfp)
    477 {
    478 	domid_t			oeid;
    479 	struct xenbus_device	*xsd;
    480 	RING_IDX		i;
    481 	int			err;
    482 	xnf_txid_t		*tidp;
    483 	xnf_buf_t **bdescp;
    484 
    485 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
    486 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
    487 
    488 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
    489 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
    490 
    491 	err = gnttab_grant_foreign_access(oeid,
    492 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
    493 	if (err <= 0) {
    494 		err = -err;
    495 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
    496 		goto out;
    497 	}
    498 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
    499 
    500 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
    501 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
    502 
    503 	err = gnttab_grant_foreign_access(oeid,
    504 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
    505 	if (err <= 0) {
    506 		err = -err;
    507 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
    508 		goto out;
    509 	}
    510 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
    511 
    512 	mutex_enter(&xnfp->xnf_txlock);
    513 
    514 	/*
    515 	 * Setup/cleanup the TX ring.  Note that this can lose packets
    516 	 * after a resume, but we expect to stagger on.
    517 	 */
    518 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
    519 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
    520 	    i < NET_TX_RING_SIZE;
    521 	    i++, tidp++) {
    522 		xnf_txbuf_t *txp;
    523 
    524 		tidp->id = i;
    525 
    526 		txp = tidp->txbuf;
    527 		if (txp == NULL) {
    528 			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
    529 			txid_put(xnfp, tidp);
    530 			continue;
    531 		}
    532 
    533 		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
    534 		ASSERT(txp->tx_mp != NULL);
    535 
    536 		switch (txp->tx_type) {
    537 		case TX_DATA:
    538 			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
    539 			    == 0);
    540 
    541 			if (txp->tx_bdesc == NULL) {
    542 				(void) gnttab_end_foreign_access_ref(
    543 				    txp->tx_txreq.gref, 1);
    544 				gref_put(xnfp, txp->tx_txreq.gref);
    545 				(void) ddi_dma_unbind_handle(
    546 				    txp->tx_dma_handle);
    547 			} else {
    548 				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
    549 			}
    550 
    551 			freemsg(txp->tx_mp);
    552 			txid_put(xnfp, tidp);
    553 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
    554 
    555 			break;
    556 
    557 		case TX_MCAST_REQ:
    558 			txp->tx_type = TX_MCAST_RSP;
    559 			txp->tx_status = NETIF_RSP_DROPPED;
    560 			cv_broadcast(&xnfp->xnf_cv_multicast);
    561 
    562 			/*
    563 			 * The request consumed two slots in the ring,
    564 			 * yet only a single xnf_txid_t is used. Step
    565 			 * over the empty slot.
    566 			 */
    567 			i++;
    568 			ASSERT(i < NET_TX_RING_SIZE);
    569 
    570 			break;
    571 
    572 		case TX_MCAST_RSP:
    573 			break;
    574 		}
    575 	}
    576 
    577 	/* LINTED: constant in conditional context */
    578 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
    579 	/* LINTED: constant in conditional context */
    580 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
    581 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
    582 
    583 	mutex_exit(&xnfp->xnf_txlock);
    584 
    585 	mutex_enter(&xnfp->xnf_rxlock);
    586 
    587 	/*
    588 	 * Clean out any buffers currently posted to the receive ring
    589 	 * before we reset it.
    590 	 */
    591 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
    592 	    i < NET_RX_RING_SIZE;
    593 	    i++, bdescp++) {
    594 		if (*bdescp != NULL) {
    595 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
    596 			*bdescp = NULL;
    597 		}
    598 	}
    599 
    600 	/* LINTED: constant in conditional context */
    601 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
    602 	/* LINTED: constant in conditional context */
    603 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
    604 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
    605 
    606 	/*
    607 	 * Fill the ring with buffers.
    608 	 */
    609 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
    610 		xnf_buf_t *bdesc;
    611 
    612 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
    613 		VERIFY(bdesc != NULL);
    614 		xnf_rxbuf_hang(xnfp, bdesc);
    615 	}
    616 
    617 	/* LINTED: constant in conditional context */
    618 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
    619 
    620 	mutex_exit(&xnfp->xnf_rxlock);
    621 
    622 	return (0);
    623 
    624 out:
    625 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
    626 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
    627 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
    628 
    629 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
    630 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
    631 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
    632 
    633 	return (err);
    634 }
    635 
    636 /*
    637  * Connect driver to back end, called to set up communication with
    638  * back end driver both initially and on resume after restore/migrate.
    639  */
    640 void
    641 xnf_be_connect(xnf_t *xnfp)
    642 {
    643 	const char	*message;
    644 	xenbus_transaction_t xbt;
    645 	struct		xenbus_device *xsd;
    646 	char		*xsname;
    647 	int		err;
    648 
    649 	ASSERT(!xnfp->xnf_connected);
    650 
    651 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
    652 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
    653 
    654 	err = xnf_setup_rings(xnfp);
    655 	if (err != 0) {
    656 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
    657 		xenbus_dev_error(xsd, err, "setting up ring");
    658 		return;
    659 	}
    660 
    661 again:
    662 	err = xenbus_transaction_start(&xbt);
    663 	if (err != 0) {
    664 		xenbus_dev_error(xsd, EIO, "starting transaction");
    665 		return;
    666 	}
    667 
    668 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
    669 	    xnfp->xnf_tx_ring_ref);
    670 	if (err != 0) {
    671 		message = "writing tx ring-ref";
    672 		goto abort_transaction;
    673 	}
    674 
    675 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
    676 	    xnfp->xnf_rx_ring_ref);
    677 	if (err != 0) {
    678 		message = "writing rx ring-ref";
    679 		goto abort_transaction;
    680 	}
    681 
    682 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
    683 	    xnfp->xnf_evtchn);
    684 	if (err != 0) {
    685 		message = "writing event-channel";
    686 		goto abort_transaction;
    687 	}
    688 
    689 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
    690 	if (err != 0) {
    691 		message = "writing feature-rx-notify";
    692 		goto abort_transaction;
    693 	}
    694 
    695 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
    696 	if (err != 0) {
    697 		message = "writing request-rx-copy";
    698 		goto abort_transaction;
    699 	}
    700 
    701 	if (xnfp->xnf_be_mcast_control) {
    702 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
    703 		    "%d", 1);
    704 		if (err != 0) {
    705 			message = "writing request-multicast-control";
    706 			goto abort_transaction;
    707 		}
    708 	}
    709 
    710 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
    711 	if (err != 0) {
    712 		message = "switching state to XenbusStateConnected";
    713 		goto abort_transaction;
    714 	}
    715 
    716 	err = xenbus_transaction_end(xbt, 0);
    717 	if (err != 0) {
    718 		if (err == EAGAIN)
    719 			goto again;
    720 		xenbus_dev_error(xsd, err, "completing transaction");
    721 	}
    722 
    723 	return;
    724 
    725 abort_transaction:
    726 	(void) xenbus_transaction_end(xbt, 1);
    727 	xenbus_dev_error(xsd, err, "%s", message);
    728 }
    729 
    730 /*
    731  * Read configuration information from xenstore.
    732  */
    733 void
    734 xnf_read_config(xnf_t *xnfp)
    735 {
    736 	int err, be_cap;
    737 	char mac[ETHERADDRL * 3];
    738 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
    739 
    740 	err = xenbus_scanf(XBT_NULL, oename, "mac",
    741 	    "%s", (char *)&mac[0]);
    742 	if (err != 0) {
    743 		/*
    744 		 * bad: we're supposed to be set up with a proper mac
    745 		 * addr. at this point
    746 		 */
    747 		cmn_err(CE_WARN, "%s%d: no mac address",
    748 		    ddi_driver_name(xnfp->xnf_devinfo),
    749 		    ddi_get_instance(xnfp->xnf_devinfo));
    750 			return;
    751 	}
    752 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
    753 		err = ENOENT;
    754 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
    755 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
    756 		return;
    757 	}
    758 
    759 	err = xenbus_scanf(XBT_NULL, oename,
    760 	    "feature-rx-copy", "%d", &be_cap);
    761 	/*
    762 	 * If we fail to read the store we assume that the key is
    763 	 * absent, implying an older domain at the far end.  Older
    764 	 * domains cannot do HV copy.
    765 	 */
    766 	if (err != 0)
    767 		be_cap = 0;
    768 	xnfp->xnf_be_rx_copy = (be_cap != 0);
    769 
    770 	err = xenbus_scanf(XBT_NULL, oename,
    771 	    "feature-multicast-control", "%d", &be_cap);
    772 	/*
    773 	 * If we fail to read the store we assume that the key is
    774 	 * absent, implying an older domain at the far end.  Older
    775 	 * domains do not support multicast control.
    776 	 */
    777 	if (err != 0)
    778 		be_cap = 0;
    779 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
    780 }
    781 
    782 /*
    783  *  attach(9E) -- Attach a device to the system
    784  */
    785 static int
    786 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
    787 {
    788 	mac_register_t *macp;
    789 	xnf_t *xnfp;
    790 	int err;
    791 	char cachename[32];
    792 
    793 #ifdef XNF_DEBUG
    794 	if (xnf_debug & XNF_DEBUG_DDI)
    795 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
    796 		    (void *)devinfo);
    797 #endif
    798 
    799 	switch (cmd) {
    800 	case DDI_RESUME:
    801 		xnfp = ddi_get_driver_private(devinfo);
    802 		xnfp->xnf_gen++;
    803 
    804 		(void) xvdi_resume(devinfo);
    805 		(void) xvdi_alloc_evtchn(devinfo);
    806 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
    807 #ifdef XPV_HVM_DRIVER
    808 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
    809 		    xnfp);
    810 #else
    811 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
    812 		    (caddr_t)xnfp);
    813 #endif
    814 		return (DDI_SUCCESS);
    815 
    816 	case DDI_ATTACH:
    817 		break;
    818 
    819 	default:
    820 		return (DDI_FAILURE);
    821 	}
    822 
    823 	/*
    824 	 *  Allocate gld_mac_info_t and xnf_instance structures
    825 	 */
    826 	macp = mac_alloc(MAC_VERSION);
    827 	if (macp == NULL)
    828 		return (DDI_FAILURE);
    829 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
    830 
    831 	macp->m_dip = devinfo;
    832 	macp->m_driver = xnfp;
    833 	xnfp->xnf_devinfo = devinfo;
    834 
    835 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
    836 	macp->m_src_addr = xnfp->xnf_mac_addr;
    837 	macp->m_callbacks = &xnf_callbacks;
    838 	macp->m_min_sdu = 0;
    839 	macp->m_max_sdu = XNF_MAXPKT;
    840 
    841 	xnfp->xnf_running = B_FALSE;
    842 	xnfp->xnf_connected = B_FALSE;
    843 	xnfp->xnf_be_rx_copy = B_FALSE;
    844 	xnfp->xnf_be_mcast_control = B_FALSE;
    845 	xnfp->xnf_need_sched = B_FALSE;
    846 
    847 	xnfp->xnf_rx_head = NULL;
    848 	xnfp->xnf_rx_tail = NULL;
    849 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
    850 
    851 #ifdef XPV_HVM_DRIVER
    852 	/*
    853 	 * Report our version to dom0.
    854 	 */
    855 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
    856 	    HVMPV_XNF_VERS))
    857 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
    858 #endif
    859 
    860 	/*
    861 	 * Get the iblock cookie with which to initialize the mutexes.
    862 	 */
    863 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
    864 	    != DDI_SUCCESS)
    865 		goto failure;
    866 
    867 	mutex_init(&xnfp->xnf_txlock,
    868 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    869 	mutex_init(&xnfp->xnf_rxlock,
    870 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    871 	mutex_init(&xnfp->xnf_schedlock,
    872 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    873 	mutex_init(&xnfp->xnf_gref_lock,
    874 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    875 
    876 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
    877 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
    878 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
    879 
    880 	(void) sprintf(cachename, "xnf_buf_cache_%d",
    881 	    ddi_get_instance(devinfo));
    882 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
    883 	    sizeof (xnf_buf_t), 0,
    884 	    xnf_buf_constructor, xnf_buf_destructor,
    885 	    NULL, xnfp, NULL, 0);
    886 	if (xnfp->xnf_buf_cache == NULL)
    887 		goto failure_0;
    888 
    889 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
    890 	    ddi_get_instance(devinfo));
    891 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
    892 	    sizeof (xnf_txbuf_t), 0,
    893 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
    894 	    NULL, xnfp, NULL, 0);
    895 	if (xnfp->xnf_tx_buf_cache == NULL)
    896 		goto failure_1;
    897 
    898 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
    899 
    900 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
    901 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
    902 		    "driver data structures",
    903 		    ddi_get_instance(xnfp->xnf_devinfo));
    904 		goto failure_2;
    905 	}
    906 
    907 	xnfp->xnf_rx_ring.sring->rsp_event =
    908 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
    909 
    910 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
    911 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
    912 
    913 	/* set driver private pointer now */
    914 	ddi_set_driver_private(devinfo, xnfp);
    915 
    916 	if (!xnf_kstat_init(xnfp))
    917 		goto failure_3;
    918 
    919 	/*
    920 	 * Allocate an event channel, add the interrupt handler and
    921 	 * bind it to the event channel.
    922 	 */
    923 	(void) xvdi_alloc_evtchn(devinfo);
    924 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
    925 #ifdef XPV_HVM_DRIVER
    926 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
    927 #else
    928 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
    929 #endif
    930 
    931 	err = mac_register(macp, &xnfp->xnf_mh);
    932 	mac_free(macp);
    933 	macp = NULL;
    934 	if (err != 0)
    935 		goto failure_4;
    936 
    937 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
    938 	    != DDI_SUCCESS)
    939 		goto failure_5;
    940 
    941 #ifdef XPV_HVM_DRIVER
    942 	/*
    943 	 * In the HVM case, this driver essentially replaces a driver for
    944 	 * a 'real' PCI NIC. Without the "model" property set to
    945 	 * "Ethernet controller", like the PCI code does, netbooting does
    946 	 * not work correctly, as strplumb_get_netdev_path() will not find
    947 	 * this interface.
    948 	 */
    949 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
    950 	    "Ethernet controller");
    951 #endif
    952 
    953 #ifdef XNF_DEBUG
    954 	if (xnf_debug_instance == NULL)
    955 		xnf_debug_instance = xnfp;
    956 #endif
    957 
    958 	return (DDI_SUCCESS);
    959 
    960 failure_5:
    961 	(void) mac_unregister(xnfp->xnf_mh);
    962 
    963 failure_4:
    964 #ifdef XPV_HVM_DRIVER
    965 	ec_unbind_evtchn(xnfp->xnf_evtchn);
    966 	xvdi_free_evtchn(devinfo);
    967 #else
    968 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
    969 #endif
    970 	xnfp->xnf_evtchn = INVALID_EVTCHN;
    971 	kstat_delete(xnfp->xnf_kstat_aux);
    972 
    973 failure_3:
    974 	xnf_release_dma_resources(xnfp);
    975 
    976 failure_2:
    977 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
    978 
    979 failure_1:
    980 	kmem_cache_destroy(xnfp->xnf_buf_cache);
    981 
    982 failure_0:
    983 	cv_destroy(&xnfp->xnf_cv_tx_slots);
    984 	cv_destroy(&xnfp->xnf_cv_multicast);
    985 	cv_destroy(&xnfp->xnf_cv_state);
    986 
    987 	mutex_destroy(&xnfp->xnf_gref_lock);
    988 	mutex_destroy(&xnfp->xnf_schedlock);
    989 	mutex_destroy(&xnfp->xnf_rxlock);
    990 	mutex_destroy(&xnfp->xnf_txlock);
    991 
    992 failure:
    993 	kmem_free(xnfp, sizeof (*xnfp));
    994 	if (macp != NULL)
    995 		mac_free(macp);
    996 
    997 	return (DDI_FAILURE);
    998 }
    999 
   1000 /*  detach(9E) -- Detach a device from the system */
   1001 static int
   1002 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
   1003 {
   1004 	xnf_t *xnfp;		/* Our private device info */
   1005 
   1006 #ifdef XNF_DEBUG
   1007 	if (xnf_debug & XNF_DEBUG_DDI)
   1008 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
   1009 #endif
   1010 
   1011 	xnfp = ddi_get_driver_private(devinfo);
   1012 
   1013 	switch (cmd) {
   1014 	case DDI_SUSPEND:
   1015 #ifdef XPV_HVM_DRIVER
   1016 		ec_unbind_evtchn(xnfp->xnf_evtchn);
   1017 		xvdi_free_evtchn(devinfo);
   1018 #else
   1019 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
   1020 #endif
   1021 
   1022 		xvdi_suspend(devinfo);
   1023 
   1024 		mutex_enter(&xnfp->xnf_rxlock);
   1025 		mutex_enter(&xnfp->xnf_txlock);
   1026 
   1027 		xnfp->xnf_evtchn = INVALID_EVTCHN;
   1028 		xnfp->xnf_connected = B_FALSE;
   1029 		mutex_exit(&xnfp->xnf_txlock);
   1030 		mutex_exit(&xnfp->xnf_rxlock);
   1031 
   1032 		/* claim link to be down after disconnect */
   1033 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
   1034 		return (DDI_SUCCESS);
   1035 
   1036 	case DDI_DETACH:
   1037 		break;
   1038 
   1039 	default:
   1040 		return (DDI_FAILURE);
   1041 	}
   1042 
   1043 	if (xnfp->xnf_connected)
   1044 		return (DDI_FAILURE);
   1045 
   1046 	/*
   1047 	 * Cannot detach if we have xnf_buf_t outstanding.
   1048 	 */
   1049 	if (xnfp->xnf_stat_buf_allocated > 0)
   1050 		return (DDI_FAILURE);
   1051 
   1052 	if (mac_unregister(xnfp->xnf_mh) != 0)
   1053 		return (DDI_FAILURE);
   1054 
   1055 	kstat_delete(xnfp->xnf_kstat_aux);
   1056 
   1057 	/* Stop the receiver */
   1058 	xnf_stop(xnfp);
   1059 
   1060 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
   1061 
   1062 	/* Remove the interrupt */
   1063 #ifdef XPV_HVM_DRIVER
   1064 	ec_unbind_evtchn(xnfp->xnf_evtchn);
   1065 	xvdi_free_evtchn(devinfo);
   1066 #else
   1067 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
   1068 #endif
   1069 
   1070 	/* Release any pending xmit mblks */
   1071 	xnf_release_mblks(xnfp);
   1072 
   1073 	/* Release all DMA resources */
   1074 	xnf_release_dma_resources(xnfp);
   1075 
   1076 	cv_destroy(&xnfp->xnf_cv_tx_slots);
   1077 	cv_destroy(&xnfp->xnf_cv_multicast);
   1078 	cv_destroy(&xnfp->xnf_cv_state);
   1079 
   1080 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
   1081 	kmem_cache_destroy(xnfp->xnf_buf_cache);
   1082 
   1083 	mutex_destroy(&xnfp->xnf_gref_lock);
   1084 	mutex_destroy(&xnfp->xnf_schedlock);
   1085 	mutex_destroy(&xnfp->xnf_rxlock);
   1086 	mutex_destroy(&xnfp->xnf_txlock);
   1087 
   1088 	kmem_free(xnfp, sizeof (*xnfp));
   1089 
   1090 	return (DDI_SUCCESS);
   1091 }
   1092 
   1093 /*
   1094  *  xnf_set_mac_addr() -- set the physical network address on the board.
   1095  */
   1096 static int
   1097 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
   1098 {
   1099 	_NOTE(ARGUNUSED(arg, macaddr));
   1100 
   1101 	/*
   1102 	 * We can't set our macaddr.
   1103 	 */
   1104 	return (ENOTSUP);
   1105 }
   1106 
   1107 /*
   1108  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
   1109  *
   1110  *  Program the hardware to enable/disable the multicast address
   1111  *  in "mca".  Enable if "add" is true, disable if false.
   1112  */
   1113 static int
   1114 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
   1115 {
   1116 	xnf_t *xnfp = arg;
   1117 	xnf_txbuf_t *txp;
   1118 	int n_slots;
   1119 	RING_IDX slot;
   1120 	xnf_txid_t *tidp;
   1121 	netif_tx_request_t *txrp;
   1122 	struct netif_extra_info *erp;
   1123 	boolean_t notify, result;
   1124 
   1125 	/*
   1126 	 * If the backend does not support multicast control then we
   1127 	 * must assume that the right packets will just arrive.
   1128 	 */
   1129 	if (!xnfp->xnf_be_mcast_control)
   1130 		return (0);
   1131 
   1132 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
   1133 	if (txp == NULL)
   1134 		return (1);
   1135 
   1136 	mutex_enter(&xnfp->xnf_txlock);
   1137 
   1138 	/*
   1139 	 * If we're not yet connected then claim success. This is
   1140 	 * acceptable because we refresh the entire set of multicast
   1141 	 * addresses when we get connected.
   1142 	 *
   1143 	 * We can't wait around here because the MAC layer expects
   1144 	 * this to be a non-blocking operation - waiting ends up
   1145 	 * causing a deadlock during resume.
   1146 	 */
   1147 	if (!xnfp->xnf_connected) {
   1148 		mutex_exit(&xnfp->xnf_txlock);
   1149 		return (0);
   1150 	}
   1151 
   1152 	/*
   1153 	 * 1. Acquire two slots in the ring.
   1154 	 * 2. Fill in the slots.
   1155 	 * 3. Request notification when the operation is done.
   1156 	 * 4. Kick the peer.
   1157 	 * 5. Wait for the response via xnf_tx_clean_ring().
   1158 	 */
   1159 
   1160 	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
   1161 	ASSERT(n_slots >= 2);
   1162 
   1163 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
   1164 	tidp = txid_get(xnfp);
   1165 	VERIFY(tidp != NULL);
   1166 
   1167 	txp->tx_type = TX_MCAST_REQ;
   1168 	txp->tx_slot = slot;
   1169 
   1170 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
   1171 	erp = (struct netif_extra_info *)
   1172 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
   1173 
   1174 	txrp->gref = 0;
   1175 	txrp->size = 0;
   1176 	txrp->offset = 0;
   1177 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
   1178 	txrp->id = txp->tx_txreq.id = tidp->id;
   1179 	txrp->flags = NETTXF_extra_info;
   1180 
   1181 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
   1182 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
   1183 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
   1184 
   1185 	tidp->txbuf = txp;
   1186 
   1187 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
   1188 
   1189 	mutex_enter(&xnfp->xnf_schedlock);
   1190 	xnfp->xnf_pending_multicast++;
   1191 	mutex_exit(&xnfp->xnf_schedlock);
   1192 
   1193 	/* LINTED: constant in conditional context */
   1194 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
   1195 	    notify);
   1196 	if (notify)
   1197 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1198 
   1199 	while (txp->tx_type == TX_MCAST_REQ)
   1200 		cv_wait(&xnfp->xnf_cv_multicast,
   1201 		    &xnfp->xnf_txlock);
   1202 
   1203 	ASSERT(txp->tx_type == TX_MCAST_RSP);
   1204 
   1205 	mutex_enter(&xnfp->xnf_schedlock);
   1206 	xnfp->xnf_pending_multicast--;
   1207 	mutex_exit(&xnfp->xnf_schedlock);
   1208 
   1209 	result = (txp->tx_status == NETIF_RSP_OKAY);
   1210 
   1211 	txid_put(xnfp, tidp);
   1212 
   1213 	mutex_exit(&xnfp->xnf_txlock);
   1214 
   1215 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1216 
   1217 	return (result ? 0 : 1);
   1218 }
   1219 
   1220 /*
   1221  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
   1222  *
   1223  *  Program the hardware to enable/disable promiscuous mode.
   1224  */
   1225 static int
   1226 xnf_set_promiscuous(void *arg, boolean_t on)
   1227 {
   1228 	_NOTE(ARGUNUSED(arg, on));
   1229 
   1230 	/*
   1231 	 * We can't really do this, but we pretend that we can in
   1232 	 * order that snoop will work.
   1233 	 */
   1234 	return (0);
   1235 }
   1236 
   1237 /*
   1238  * Clean buffers that we have responses for from the transmit ring.
   1239  */
   1240 static int
   1241 xnf_tx_clean_ring(xnf_t *xnfp)
   1242 {
   1243 	boolean_t work_to_do;
   1244 
   1245 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
   1246 
   1247 loop:
   1248 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
   1249 		RING_IDX cons, prod, i;
   1250 
   1251 		cons = xnfp->xnf_tx_ring.rsp_cons;
   1252 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
   1253 		membar_consumer();
   1254 		/*
   1255 		 * Clean tx requests from ring that we have responses
   1256 		 * for.
   1257 		 */
   1258 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
   1259 		for (i = cons; i != prod; i++) {
   1260 			netif_tx_response_t *trp;
   1261 			xnf_txid_t *tidp;
   1262 			xnf_txbuf_t *txp;
   1263 
   1264 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
   1265 			ASSERT(TX_ID_VALID(trp->id));
   1266 
   1267 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
   1268 			ASSERT(tidp->id == trp->id);
   1269 			ASSERT(tidp->next == INVALID_TX_ID);
   1270 
   1271 			txp = tidp->txbuf;
   1272 			ASSERT(txp != NULL);
   1273 			ASSERT(txp->tx_txreq.id == trp->id);
   1274 
   1275 			switch (txp->tx_type) {
   1276 			case TX_DATA:
   1277 				if (gnttab_query_foreign_access(
   1278 				    txp->tx_txreq.gref) != 0)
   1279 					cmn_err(CE_PANIC,
   1280 					    "tx grant %d still in use by "
   1281 					    "backend domain",
   1282 					    txp->tx_txreq.gref);
   1283 
   1284 				if (txp->tx_bdesc == NULL) {
   1285 					(void) gnttab_end_foreign_access_ref(
   1286 					    txp->tx_txreq.gref, 1);
   1287 					gref_put(xnfp, txp->tx_txreq.gref);
   1288 					(void) ddi_dma_unbind_handle(
   1289 					    txp->tx_dma_handle);
   1290 				} else {
   1291 					xnf_buf_put(xnfp, txp->tx_bdesc,
   1292 					    B_TRUE);
   1293 				}
   1294 
   1295 				freemsg(txp->tx_mp);
   1296 				txid_put(xnfp, tidp);
   1297 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1298 
   1299 				break;
   1300 
   1301 			case TX_MCAST_REQ:
   1302 				txp->tx_type = TX_MCAST_RSP;
   1303 				txp->tx_status = trp->status;
   1304 				cv_broadcast(&xnfp->xnf_cv_multicast);
   1305 
   1306 				break;
   1307 
   1308 			case TX_MCAST_RSP:
   1309 				break;
   1310 
   1311 			default:
   1312 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
   1313 				    "invalid xnf_txbuf_t type: %d",
   1314 				    txp->tx_type);
   1315 				break;
   1316 			}
   1317 		}
   1318 		/*
   1319 		 * Record the last response we dealt with so that we
   1320 		 * know where to start next time around.
   1321 		 */
   1322 		xnfp->xnf_tx_ring.rsp_cons = prod;
   1323 		membar_enter();
   1324 	}
   1325 
   1326 	/* LINTED: constant in conditional context */
   1327 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
   1328 	if (work_to_do)
   1329 		goto loop;
   1330 
   1331 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
   1332 }
   1333 
   1334 /*
   1335  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
   1336  * to ensure that the packet is physically contiguous and contained
   1337  * within a single page.
   1338  */
   1339 static xnf_buf_t *
   1340 xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
   1341 {
   1342 	xnf_buf_t *bd;
   1343 	caddr_t bp;
   1344 
   1345 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
   1346 	if (bd == NULL)
   1347 		return (NULL);
   1348 
   1349 	bp = bd->buf;
   1350 	while (mp != NULL) {
   1351 		size_t len = MBLKL(mp);
   1352 
   1353 		bcopy(mp->b_rptr, bp, len);
   1354 		bp += len;
   1355 
   1356 		mp = mp->b_cont;
   1357 	}
   1358 
   1359 	ASSERT((bp - bd->buf) <= PAGESIZE);
   1360 
   1361 	xnfp->xnf_stat_tx_pullup++;
   1362 
   1363 	return (bd);
   1364 }
   1365 
   1366 /*
   1367  * Insert the pseudo-header checksum into the packet `buf'.
   1368  */
   1369 void
   1370 xnf_pseudo_cksum(caddr_t buf, int length)
   1371 {
   1372 	struct ether_header *ehp;
   1373 	uint16_t sap, len, *stuff;
   1374 	uint32_t cksum;
   1375 	size_t offset;
   1376 	ipha_t *ipha;
   1377 	ipaddr_t src, dst;
   1378 
   1379 	ASSERT(length >= sizeof (*ehp));
   1380 	ehp = (struct ether_header *)buf;
   1381 
   1382 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
   1383 		struct ether_vlan_header *evhp;
   1384 
   1385 		ASSERT(length >= sizeof (*evhp));
   1386 		evhp = (struct ether_vlan_header *)buf;
   1387 		sap = ntohs(evhp->ether_type);
   1388 		offset = sizeof (*evhp);
   1389 	} else {
   1390 		sap = ntohs(ehp->ether_type);
   1391 		offset = sizeof (*ehp);
   1392 	}
   1393 
   1394 	ASSERT(sap == ETHERTYPE_IP);
   1395 
   1396 	/* Packet should have been pulled up by the caller. */
   1397 	if ((offset + sizeof (ipha_t)) > length) {
   1398 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
   1399 		return;
   1400 	}
   1401 
   1402 	ipha = (ipha_t *)(buf + offset);
   1403 
   1404 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
   1405 
   1406 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
   1407 
   1408 	switch (ipha->ipha_protocol) {
   1409 	case IPPROTO_TCP:
   1410 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
   1411 		cksum = IP_TCP_CSUM_COMP;
   1412 		break;
   1413 	case IPPROTO_UDP:
   1414 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
   1415 		cksum = IP_UDP_CSUM_COMP;
   1416 		break;
   1417 	default:
   1418 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
   1419 		    ipha->ipha_protocol);
   1420 		return;
   1421 	}
   1422 
   1423 	src = ipha->ipha_src;
   1424 	dst = ipha->ipha_dst;
   1425 
   1426 	cksum += (dst >> 16) + (dst & 0xFFFF);
   1427 	cksum += (src >> 16) + (src & 0xFFFF);
   1428 	cksum += htons(len);
   1429 
   1430 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   1431 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   1432 
   1433 	ASSERT(cksum <= 0xFFFF);
   1434 
   1435 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
   1436 }
   1437 
   1438 /*
   1439  * Push a list of prepared packets (`txp') into the transmit ring.
   1440  */
   1441 static xnf_txbuf_t *
   1442 tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
   1443 {
   1444 	int slots_free;
   1445 	RING_IDX slot;
   1446 	boolean_t notify;
   1447 
   1448 	mutex_enter(&xnfp->xnf_txlock);
   1449 
   1450 	ASSERT(xnfp->xnf_running);
   1451 
   1452 	/*
   1453 	 * Wait until we are connected to the backend.
   1454 	 */
   1455 	while (!xnfp->xnf_connected)
   1456 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
   1457 
   1458 	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
   1459 	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
   1460 
   1461 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
   1462 
   1463 	while ((txp != NULL) && (slots_free > 0)) {
   1464 		xnf_txid_t *tidp;
   1465 		netif_tx_request_t *txrp;
   1466 
   1467 		tidp = txid_get(xnfp);
   1468 		VERIFY(tidp != NULL);
   1469 
   1470 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
   1471 
   1472 		txp->tx_slot = slot;
   1473 		txp->tx_txreq.id = tidp->id;
   1474 		*txrp = txp->tx_txreq;
   1475 
   1476 		tidp->txbuf = txp;
   1477 
   1478 		xnfp->xnf_stat_opackets++;
   1479 		xnfp->xnf_stat_obytes += txp->tx_txreq.size;
   1480 
   1481 		txp = txp->tx_next;
   1482 		slots_free--;
   1483 		slot++;
   1484 
   1485 	}
   1486 
   1487 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
   1488 
   1489 	/*
   1490 	 * Tell the peer that we sent something, if it cares.
   1491 	 */
   1492 	/* LINTED: constant in conditional context */
   1493 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
   1494 	    notify);
   1495 	if (notify)
   1496 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1497 
   1498 	mutex_exit(&xnfp->xnf_txlock);
   1499 
   1500 	return (txp);
   1501 }
   1502 
   1503 /*
   1504  * Send the chain of packets `mp'. Called by the MAC framework.
   1505  */
   1506 static mblk_t *
   1507 xnf_send(void *arg, mblk_t *mp)
   1508 {
   1509 	xnf_t *xnfp = arg;
   1510 	domid_t oeid;
   1511 	xnf_txbuf_t *head, *tail;
   1512 	mblk_t *ml;
   1513 	int prepared;
   1514 
   1515 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
   1516 
   1517 	/*
   1518 	 * Prepare packets for transmission.
   1519 	 */
   1520 	head = tail = NULL;
   1521 	prepared = 0;
   1522 	while (mp != NULL) {
   1523 		xnf_txbuf_t *txp;
   1524 		int n_chunks, length;
   1525 		boolean_t page_oops;
   1526 		uint32_t pflags;
   1527 
   1528 		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
   1529 		    ml != NULL;
   1530 		    ml = ml->b_cont, n_chunks++) {
   1531 
   1532 			/*
   1533 			 * Test if this buffer includes a page
   1534 			 * boundary. The test assumes that the range
   1535 			 * b_rptr...b_wptr can include only a single
   1536 			 * boundary.
   1537 			 */
   1538 			if (xnf_btop((size_t)ml->b_rptr) !=
   1539 			    xnf_btop((size_t)ml->b_wptr)) {
   1540 				xnfp->xnf_stat_tx_pagebndry++;
   1541 				page_oops = B_TRUE;
   1542 			}
   1543 
   1544 			length += MBLKL(ml);
   1545 		}
   1546 		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
   1547 
   1548 		/*
   1549 		 * Make sure packet isn't too large.
   1550 		 */
   1551 		if (length > XNF_FRAMESIZE) {
   1552 			cmn_err(CE_WARN,
   1553 			    "xnf%d: oversized packet (%d bytes) dropped",
   1554 			    ddi_get_instance(xnfp->xnf_devinfo), length);
   1555 			freemsg(mp);
   1556 			continue;
   1557 		}
   1558 
   1559 		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
   1560 		if (txp == NULL)
   1561 			break;
   1562 
   1563 		txp->tx_type = TX_DATA;
   1564 
   1565 		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
   1566 			/*
   1567 			 * Loan a side buffer rather than the mblk
   1568 			 * itself.
   1569 			 */
   1570 			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
   1571 			if (txp->tx_bdesc == NULL) {
   1572 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1573 				break;
   1574 			}
   1575 
   1576 			txp->tx_bufp = txp->tx_bdesc->buf;
   1577 			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
   1578 			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
   1579 
   1580 		} else {
   1581 			int rc;
   1582 			ddi_dma_cookie_t dma_cookie;
   1583 			uint_t ncookies;
   1584 
   1585 			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
   1586 			    NULL, (char *)mp->b_rptr, length,
   1587 			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
   1588 			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
   1589 			    &ncookies);
   1590 			if (rc != DDI_DMA_MAPPED) {
   1591 				ASSERT(rc != DDI_DMA_INUSE);
   1592 				ASSERT(rc != DDI_DMA_PARTIAL_MAP);
   1593 
   1594 #ifdef XNF_DEBUG
   1595 				if (rc != DDI_DMA_NORESOURCES)
   1596 					cmn_err(CE_WARN,
   1597 					    "xnf%d: bind_handle failed (%x)",
   1598 					    ddi_get_instance(xnfp->xnf_devinfo),
   1599 					    rc);
   1600 #endif
   1601 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1602 				break;
   1603 			}
   1604 			ASSERT(ncookies == 1);
   1605 
   1606 			txp->tx_bdesc = NULL;
   1607 			txp->tx_bufp = (caddr_t)mp->b_rptr;
   1608 			txp->tx_mfn =
   1609 			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
   1610 			txp->tx_txreq.gref = gref_get(xnfp);
   1611 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
   1612 				(void) ddi_dma_unbind_handle(
   1613 				    txp->tx_dma_handle);
   1614 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1615 				break;
   1616 			}
   1617 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
   1618 			    oeid, txp->tx_mfn, 1);
   1619 		}
   1620 
   1621 		txp->tx_next = NULL;
   1622 		txp->tx_mp = mp;
   1623 		txp->tx_txreq.size = length;
   1624 		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
   1625 		txp->tx_txreq.flags = 0;
   1626 		hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
   1627 		    &pflags);
   1628 		if (pflags != 0) {
   1629 			/*
   1630 			 * If the local protocol stack requests checksum
   1631 			 * offload we set the 'checksum blank' flag,
   1632 			 * indicating to the peer that we need the checksum
   1633 			 * calculated for us.
   1634 			 *
   1635 			 * We _don't_ set the validated flag, because we haven't
   1636 			 * validated that the data and the checksum match.
   1637 			 */
   1638 			xnf_pseudo_cksum(txp->tx_bufp, length);
   1639 			txp->tx_txreq.flags |= NETTXF_csum_blank;
   1640 
   1641 			xnfp->xnf_stat_tx_cksum_deferred++;
   1642 		}
   1643 
   1644 		if (head == NULL) {
   1645 			ASSERT(tail == NULL);
   1646 
   1647 			head = txp;
   1648 		} else {
   1649 			ASSERT(tail != NULL);
   1650 
   1651 			tail->tx_next = txp;
   1652 		}
   1653 		tail = txp;
   1654 
   1655 		mp = mp->b_next;
   1656 		prepared++;
   1657 
   1658 		/*
   1659 		 * There is no point in preparing more than
   1660 		 * NET_TX_RING_SIZE, as we won't be able to push them
   1661 		 * into the ring in one go and would hence have to
   1662 		 * un-prepare the extra.
   1663 		 */
   1664 		if (prepared == NET_TX_RING_SIZE)
   1665 			break;
   1666 	}
   1667 
   1668 	DTRACE_PROBE1(xnf_send_prepared, int, prepared);
   1669 
   1670 	if (mp != NULL) {
   1671 #ifdef XNF_DEBUG
   1672 		int notprepared = 0;
   1673 		mblk_t *l = mp;
   1674 
   1675 		while (l != NULL) {
   1676 			notprepared++;
   1677 			l = l->b_next;
   1678 		}
   1679 
   1680 		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
   1681 #else /* !XNF_DEBUG */
   1682 		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
   1683 #endif /* XNF_DEBUG */
   1684 	}
   1685 
   1686 	/*
   1687 	 * Push the packets we have prepared into the ring. They may
   1688 	 * not all go.
   1689 	 */
   1690 	if (head != NULL)
   1691 		head = tx_push_packets(xnfp, head);
   1692 
   1693 	/*
   1694 	 * If some packets that we prepared were not sent, unprepare
   1695 	 * them and add them back to the head of those we didn't
   1696 	 * prepare.
   1697 	 */
   1698 	{
   1699 		xnf_txbuf_t *loop;
   1700 		mblk_t *mp_head, *mp_tail;
   1701 		int unprepared = 0;
   1702 
   1703 		mp_head = mp_tail = NULL;
   1704 		loop = head;
   1705 
   1706 		while (loop != NULL) {
   1707 			xnf_txbuf_t *next = loop->tx_next;
   1708 
   1709 			if (loop->tx_bdesc == NULL) {
   1710 				(void) gnttab_end_foreign_access_ref(
   1711 				    loop->tx_txreq.gref, 1);
   1712 				gref_put(xnfp, loop->tx_txreq.gref);
   1713 				(void) ddi_dma_unbind_handle(
   1714 				    loop->tx_dma_handle);
   1715 			} else {
   1716 				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
   1717 			}
   1718 
   1719 			ASSERT(loop->tx_mp != NULL);
   1720 			if (mp_head == NULL)
   1721 				mp_head = loop->tx_mp;
   1722 			mp_tail = loop->tx_mp;
   1723 
   1724 			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
   1725 			loop = next;
   1726 			unprepared++;
   1727 		}
   1728 
   1729 		if (mp_tail == NULL) {
   1730 			ASSERT(mp_head == NULL);
   1731 		} else {
   1732 			ASSERT(mp_head != NULL);
   1733 
   1734 			mp_tail->b_next = mp;
   1735 			mp = mp_head;
   1736 		}
   1737 
   1738 		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
   1739 	}
   1740 
   1741 	/*
   1742 	 * If any mblks are left then we have deferred for some reason
   1743 	 * and need to ask for a re-schedule later. This is typically
   1744 	 * due to the ring filling.
   1745 	 */
   1746 	if (mp != NULL) {
   1747 		mutex_enter(&xnfp->xnf_schedlock);
   1748 		xnfp->xnf_need_sched = B_TRUE;
   1749 		mutex_exit(&xnfp->xnf_schedlock);
   1750 
   1751 		xnfp->xnf_stat_tx_defer++;
   1752 	}
   1753 
   1754 	return (mp);
   1755 }
   1756 
   1757 /*
   1758  * Notification of RX packets. Currently no TX-complete interrupt is
   1759  * used, as we clean the TX ring lazily.
   1760  */
   1761 static uint_t
   1762 xnf_intr(caddr_t arg)
   1763 {
   1764 	xnf_t *xnfp = (xnf_t *)arg;
   1765 	mblk_t *mp;
   1766 	boolean_t need_sched, clean_ring;
   1767 
   1768 	mutex_enter(&xnfp->xnf_rxlock);
   1769 
   1770 	/*
   1771 	 * Interrupts before we are connected are spurious.
   1772 	 */
   1773 	if (!xnfp->xnf_connected) {
   1774 		mutex_exit(&xnfp->xnf_rxlock);
   1775 		xnfp->xnf_stat_unclaimed_interrupts++;
   1776 		return (DDI_INTR_UNCLAIMED);
   1777 	}
   1778 
   1779 	/*
   1780 	 * Receive side processing.
   1781 	 */
   1782 	do {
   1783 		/*
   1784 		 * Collect buffers from the ring.
   1785 		 */
   1786 		xnf_rx_collect(xnfp);
   1787 
   1788 		/*
   1789 		 * Interrupt me when the next receive buffer is consumed.
   1790 		 */
   1791 		xnfp->xnf_rx_ring.sring->rsp_event =
   1792 		    xnfp->xnf_rx_ring.rsp_cons + 1;
   1793 		xen_mb();
   1794 
   1795 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
   1796 
   1797 	if (xnfp->xnf_rx_new_buffers_posted) {
   1798 		boolean_t notify;
   1799 
   1800 		/*
   1801 		 * Indicate to the peer that we have re-filled the
   1802 		 * receive ring, if it cares.
   1803 		 */
   1804 		/* LINTED: constant in conditional context */
   1805 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
   1806 		if (notify)
   1807 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1808 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
   1809 	}
   1810 
   1811 	mp = xnfp->xnf_rx_head;
   1812 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
   1813 
   1814 	xnfp->xnf_stat_interrupts++;
   1815 	mutex_exit(&xnfp->xnf_rxlock);
   1816 
   1817 	if (mp != NULL)
   1818 		mac_rx(xnfp->xnf_mh, NULL, mp);
   1819 
   1820 	/*
   1821 	 * Transmit side processing.
   1822 	 *
   1823 	 * If a previous transmit attempt failed or we have pending
   1824 	 * multicast requests, clean the ring.
   1825 	 *
   1826 	 * If we previously stalled transmission and cleaning produces
   1827 	 * some free slots, tell upstream to attempt sending again.
   1828 	 *
   1829 	 * The odd style is to avoid acquiring xnf_txlock unless we
   1830 	 * will actually look inside the tx machinery.
   1831 	 */
   1832 	mutex_enter(&xnfp->xnf_schedlock);
   1833 	need_sched = xnfp->xnf_need_sched;
   1834 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
   1835 	mutex_exit(&xnfp->xnf_schedlock);
   1836 
   1837 	if (clean_ring) {
   1838 		int free_slots;
   1839 
   1840 		mutex_enter(&xnfp->xnf_txlock);
   1841 		free_slots = tx_slots_get(xnfp, 0, B_FALSE);
   1842 
   1843 		if (need_sched && (free_slots > 0)) {
   1844 			mutex_enter(&xnfp->xnf_schedlock);
   1845 			xnfp->xnf_need_sched = B_FALSE;
   1846 			mutex_exit(&xnfp->xnf_schedlock);
   1847 
   1848 			mac_tx_update(xnfp->xnf_mh);
   1849 		}
   1850 		mutex_exit(&xnfp->xnf_txlock);
   1851 	}
   1852 
   1853 	return (DDI_INTR_CLAIMED);
   1854 }
   1855 
   1856 /*
   1857  *  xnf_start() -- start the board receiving and enable interrupts.
   1858  */
   1859 static int
   1860 xnf_start(void *arg)
   1861 {
   1862 	xnf_t *xnfp = arg;
   1863 
   1864 #ifdef XNF_DEBUG
   1865 	if (xnf_debug & XNF_DEBUG_TRACE)
   1866 		printf("xnf%d start(0x%p)\n",
   1867 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
   1868 #endif
   1869 
   1870 	mutex_enter(&xnfp->xnf_rxlock);
   1871 	mutex_enter(&xnfp->xnf_txlock);
   1872 
   1873 	/* Accept packets from above. */
   1874 	xnfp->xnf_running = B_TRUE;
   1875 
   1876 	mutex_exit(&xnfp->xnf_txlock);
   1877 	mutex_exit(&xnfp->xnf_rxlock);
   1878 
   1879 	return (0);
   1880 }
   1881 
   1882 /* xnf_stop() - disable hardware */
   1883 static void
   1884 xnf_stop(void *arg)
   1885 {
   1886 	xnf_t *xnfp = arg;
   1887 
   1888 #ifdef XNF_DEBUG
   1889 	if (xnf_debug & XNF_DEBUG_TRACE)
   1890 		printf("xnf%d stop(0x%p)\n",
   1891 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
   1892 #endif
   1893 
   1894 	mutex_enter(&xnfp->xnf_rxlock);
   1895 	mutex_enter(&xnfp->xnf_txlock);
   1896 
   1897 	xnfp->xnf_running = B_FALSE;
   1898 
   1899 	mutex_exit(&xnfp->xnf_txlock);
   1900 	mutex_exit(&xnfp->xnf_rxlock);
   1901 }
   1902 
   1903 /*
   1904  * Hang buffer `bdesc' on the RX ring.
   1905  */
   1906 static void
   1907 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
   1908 {
   1909 	netif_rx_request_t *reqp;
   1910 	RING_IDX hang_ix;
   1911 
   1912 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
   1913 
   1914 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
   1915 	    xnfp->xnf_rx_ring.req_prod_pvt);
   1916 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
   1917 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
   1918 
   1919 	reqp->id = bdesc->id = hang_ix;
   1920 	reqp->gref = bdesc->grant_ref;
   1921 
   1922 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
   1923 	xnfp->xnf_rx_ring.req_prod_pvt++;
   1924 
   1925 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
   1926 }
   1927 
   1928 /*
   1929  * Collect packets from the RX ring, storing them in `xnfp' for later
   1930  * use.
   1931  */
   1932 static void
   1933 xnf_rx_collect(xnf_t *xnfp)
   1934 {
   1935 	mblk_t *head, *tail;
   1936 
   1937 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
   1938 
   1939 	/*
   1940 	 * Loop over unconsumed responses:
   1941 	 * 1. get a response
   1942 	 * 2. take corresponding buffer off recv. ring
   1943 	 * 3. indicate this by setting slot to NULL
   1944 	 * 4. create a new message and
   1945 	 * 5. copy data in, adjust ptr
   1946 	 */
   1947 
   1948 	head = tail = NULL;
   1949 
   1950 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
   1951 		netif_rx_response_t *rxpkt;
   1952 		xnf_buf_t *bdesc;
   1953 		ssize_t len;
   1954 		size_t off;
   1955 		mblk_t *mp = NULL;
   1956 		boolean_t hwcsum = B_FALSE;
   1957 		grant_ref_t ref;
   1958 
   1959 		/* 1. */
   1960 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
   1961 		    xnfp->xnf_rx_ring.rsp_cons);
   1962 
   1963 		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
   1964 		    int, (int)rxpkt->offset,
   1965 		    int, (int)rxpkt->flags,
   1966 		    int, (int)rxpkt->status);
   1967 
   1968 		/*
   1969 		 * 2.
   1970 		 */
   1971 		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
   1972 
   1973 		/*
   1974 		 * 3.
   1975 		 */
   1976 		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
   1977 		ASSERT(bdesc->id == rxpkt->id);
   1978 
   1979 		ref = bdesc->grant_ref;
   1980 		off = rxpkt->offset;
   1981 		len = rxpkt->status;
   1982 
   1983 		if (!xnfp->xnf_running) {
   1984 			DTRACE_PROBE4(xnf_rx_not_running,
   1985 			    int, rxpkt->status,
   1986 			    char *, bdesc->buf, int, rxpkt->offset,
   1987 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
   1988 
   1989 			xnfp->xnf_stat_drop++;
   1990 
   1991 		} else if (len <= 0) {
   1992 			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
   1993 			    int, rxpkt->status,
   1994 			    char *, bdesc->buf, int, rxpkt->offset,
   1995 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
   1996 
   1997 			xnfp->xnf_stat_errrx++;
   1998 
   1999 			switch (len) {
   2000 			case 0:
   2001 				xnfp->xnf_stat_runt++;
   2002 				break;
   2003 			case NETIF_RSP_ERROR:
   2004 				xnfp->xnf_stat_mac_rcv_error++;
   2005 				break;
   2006 			case NETIF_RSP_DROPPED:
   2007 				xnfp->xnf_stat_norxbuf++;
   2008 				break;
   2009 			}
   2010 
   2011 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
   2012 			cmn_err(CE_WARN, "Bad rx grant reference %d "
   2013 			    "from domain %d", ref,
   2014 			    xvdi_get_oeid(xnfp->xnf_devinfo));
   2015 
   2016 		} else if ((off + len) > PAGESIZE) {
   2017 			cmn_err(CE_WARN, "Rx packet overflows page "
   2018 			    "(offset %ld, length %ld) from domain %d",
   2019 			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
   2020 		} else {
   2021 			xnf_buf_t *nbuf = NULL;
   2022 
   2023 			DTRACE_PROBE4(xnf_rx_packet, int, len,
   2024 			    char *, bdesc->buf, int, off,
   2025 			    char *, ((char *)bdesc->buf) + off);
   2026 
   2027 			ASSERT(off + len <= PAGEOFFSET);
   2028 
   2029 			if (rxpkt->flags & NETRXF_data_validated)
   2030 				hwcsum = B_TRUE;
   2031 
   2032 			/*
   2033 			 * If the packet is below a pre-determined
   2034 			 * size we will copy data out rather than
   2035 			 * replace it.
   2036 			 */
   2037 			if (len > xnf_rx_copy_limit)
   2038 				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
   2039 
   2040 			/*
   2041 			 * If we have a replacement buffer, attempt to
   2042 			 * wrap the existing one with an mblk_t in
   2043 			 * order that the upper layers of the stack
   2044 			 * might use it directly.
   2045 			 */
   2046 			if (nbuf != NULL) {
   2047 				mp = desballoc((unsigned char *)bdesc->buf,
   2048 				    bdesc->len, 0, &bdesc->free_rtn);
   2049 				if (mp == NULL) {
   2050 					xnfp->xnf_stat_rx_desballoc_fail++;
   2051 					xnfp->xnf_stat_norxbuf++;
   2052 
   2053 					xnf_buf_put(xnfp, nbuf, B_FALSE);
   2054 					nbuf = NULL;
   2055 				} else {
   2056 					mp->b_rptr = mp->b_rptr + off;
   2057 					mp->b_wptr = mp->b_rptr + len;
   2058 
   2059 					/*
   2060 					 * Release the grant reference
   2061 					 * associated with this buffer
   2062 					 * - they are scarce and the
   2063 					 * upper layers of the stack
   2064 					 * don't need it.
   2065 					 */
   2066 					(void) gnttab_end_foreign_access_ref(
   2067 					    bdesc->grant_ref, 0);
   2068 					gref_put(xnfp, bdesc->grant_ref);
   2069 					bdesc->grant_ref = INVALID_GRANT_REF;
   2070 
   2071 					bdesc = nbuf;
   2072 				}
   2073 			}
   2074 
   2075 			if (nbuf == NULL) {
   2076 				/*
   2077 				 * No replacement buffer allocated -
   2078 				 * attempt to copy the data out and
   2079 				 * re-hang the existing buffer.
   2080 				 */
   2081 
   2082 				/* 4. */
   2083 				mp = allocb(len, BPRI_MED);
   2084 				if (mp == NULL) {
   2085 					xnfp->xnf_stat_rx_allocb_fail++;
   2086 					xnfp->xnf_stat_norxbuf++;
   2087 				} else {
   2088 					/* 5. */
   2089 					bcopy(bdesc->buf + off, mp->b_wptr,
   2090 					    len);
   2091 					mp->b_wptr += len;
   2092 				}
   2093 			}
   2094 		}
   2095 
   2096 		/* Re-hang the buffer. */
   2097 		xnf_rxbuf_hang(xnfp, bdesc);
   2098 
   2099 		if (mp != NULL) {
   2100 			if (hwcsum) {
   2101 				/*
   2102 				 * If the peer says that the data has
   2103 				 * been validated then we declare that
   2104 				 * the full checksum has been
   2105 				 * verified.
   2106 				 *
   2107 				 * We don't look at the "checksum
   2108 				 * blank" flag, and hence could have a
   2109 				 * packet here that we are asserting
   2110 				 * is good with a blank checksum.
   2111 				 *
   2112 				 * The hardware checksum offload
   2113 				 * specification says that we must
   2114 				 * provide the actual checksum as well
   2115 				 * as an assertion that it is valid,
   2116 				 * but the protocol stack doesn't
   2117 				 * actually use it and some other
   2118 				 * drivers don't bother, so we don't.
   2119 				 * If it was necessary we could grovel
   2120 				 * in the packet to find it.
   2121 				 */
   2122 				(void) hcksum_assoc(mp, NULL,
   2123 				    NULL, 0, 0, 0, 0,
   2124 				    HCK_FULLCKSUM |
   2125 				    HCK_FULLCKSUM_OK, 0);
   2126 				xnfp->xnf_stat_rx_cksum_no_need++;
   2127 			}
   2128 			if (head == NULL) {
   2129 				ASSERT(tail == NULL);
   2130 
   2131 				head = mp;
   2132 			} else {
   2133 				ASSERT(tail != NULL);
   2134 
   2135 				tail->b_next = mp;
   2136 			}
   2137 			tail = mp;
   2138 
   2139 			ASSERT(mp->b_next == NULL);
   2140 
   2141 			xnfp->xnf_stat_ipackets++;
   2142 			xnfp->xnf_stat_rbytes += len;
   2143 		}
   2144 
   2145 		xnfp->xnf_rx_ring.rsp_cons++;
   2146 	}
   2147 
   2148 	/*
   2149 	 * Store the mblks we have collected.
   2150 	 */
   2151 	if (head != NULL) {
   2152 		ASSERT(tail != NULL);
   2153 
   2154 		if (xnfp->xnf_rx_head == NULL) {
   2155 			ASSERT(xnfp->xnf_rx_tail == NULL);
   2156 
   2157 			xnfp->xnf_rx_head = head;
   2158 		} else {
   2159 			ASSERT(xnfp->xnf_rx_tail != NULL);
   2160 
   2161 			xnfp->xnf_rx_tail->b_next = head;
   2162 		}
   2163 		xnfp->xnf_rx_tail = tail;
   2164 	}
   2165 }
   2166 
   2167 /*
   2168  *  xnf_alloc_dma_resources() -- initialize the drivers structures
   2169  */
   2170 static int
   2171 xnf_alloc_dma_resources(xnf_t *xnfp)
   2172 {
   2173 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
   2174 	size_t			len;
   2175 	ddi_dma_cookie_t	dma_cookie;
   2176 	uint_t			ncookies;
   2177 	int			rc;
   2178 	caddr_t			rptr;
   2179 
   2180 	/*
   2181 	 * The code below allocates all the DMA data structures that
   2182 	 * need to be released when the driver is detached.
   2183 	 *
   2184 	 * Allocate page for the transmit descriptor ring.
   2185 	 */
   2186 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
   2187 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
   2188 		goto alloc_error;
   2189 
   2190 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
   2191 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
   2192 	    DDI_DMA_SLEEP, 0, &rptr, &len,
   2193 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
   2194 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2195 		xnfp->xnf_tx_ring_dma_handle = NULL;
   2196 		goto alloc_error;
   2197 	}
   2198 
   2199 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
   2200 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
   2201 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
   2202 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
   2203 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2204 		xnfp->xnf_tx_ring_dma_handle = NULL;
   2205 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
   2206 		if (rc == DDI_DMA_NORESOURCES)
   2207 			goto alloc_error;
   2208 		else
   2209 			goto error;
   2210 	}
   2211 
   2212 	ASSERT(ncookies == 1);
   2213 	bzero(rptr, PAGESIZE);
   2214 	/* LINTED: constant in conditional context */
   2215 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
   2216 	/* LINTED: constant in conditional context */
   2217 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
   2218 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
   2219 
   2220 	/*
   2221 	 * Allocate page for the receive descriptor ring.
   2222 	 */
   2223 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
   2224 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
   2225 		goto alloc_error;
   2226 
   2227 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
   2228 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
   2229 	    DDI_DMA_SLEEP, 0, &rptr, &len,
   2230 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
   2231 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2232 		xnfp->xnf_rx_ring_dma_handle = NULL;
   2233 		goto alloc_error;
   2234 	}
   2235 
   2236 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
   2237 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
   2238 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
   2239 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
   2240 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2241 		xnfp->xnf_rx_ring_dma_handle = NULL;
   2242 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
   2243 		if (rc == DDI_DMA_NORESOURCES)
   2244 			goto alloc_error;
   2245 		else
   2246 			goto error;
   2247 	}
   2248 
   2249 	ASSERT(ncookies == 1);
   2250 	bzero(rptr, PAGESIZE);
   2251 	/* LINTED: constant in conditional context */
   2252 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
   2253 	/* LINTED: constant in conditional context */
   2254 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
   2255 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
   2256 
   2257 	return (DDI_SUCCESS);
   2258 
   2259 alloc_error:
   2260 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
   2261 	    ddi_get_instance(xnfp->xnf_devinfo));
   2262 error:
   2263 	xnf_release_dma_resources(xnfp);
   2264 	return (DDI_FAILURE);
   2265 }
   2266 
   2267 /*
   2268  * Release all DMA resources in the opposite order from acquisition
   2269  */
   2270 static void
   2271 xnf_release_dma_resources(xnf_t *xnfp)
   2272 {
   2273 	int i;
   2274 
   2275 	/*
   2276 	 * Free receive buffers which are currently associated with
   2277 	 * descriptors.
   2278 	 */
   2279 	mutex_enter(&xnfp->xnf_rxlock);
   2280 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
   2281 		xnf_buf_t *bp;
   2282 
   2283 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
   2284 			continue;
   2285 		xnfp->xnf_rx_pkt_info[i] = NULL;
   2286 		xnf_buf_put(xnfp, bp, B_FALSE);
   2287 	}
   2288 	mutex_exit(&xnfp->xnf_rxlock);
   2289 
   2290 	/* Free the receive ring buffer. */
   2291 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
   2292 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
   2293 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
   2294 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2295 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
   2296 	}
   2297 	/* Free the transmit ring buffer. */
   2298 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
   2299 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
   2300 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
   2301 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2302 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
   2303 	}
   2304 
   2305 }
   2306 
   2307 /*
   2308  * Release any packets and associated structures used by the TX ring.
   2309  */
   2310 static void
   2311 xnf_release_mblks(xnf_t *xnfp)
   2312 {
   2313 	RING_IDX i;
   2314 	xnf_txid_t *tidp;
   2315 
   2316 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
   2317 	    i < NET_TX_RING_SIZE;
   2318 	    i++, tidp++) {
   2319 		xnf_txbuf_t *txp = tidp->txbuf;
   2320 
   2321 		if (txp != NULL) {
   2322 			ASSERT(txp->tx_mp != NULL);
   2323 			freemsg(txp->tx_mp);
   2324 
   2325 			txid_put(xnfp, tidp);
   2326 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   2327 		}
   2328 	}
   2329 }
   2330 
   2331 static int
   2332 xnf_buf_constructor(void *buf, void *arg, int kmflag)
   2333 {
   2334 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
   2335 	xnf_buf_t *bdesc = buf;
   2336 	xnf_t *xnfp = arg;
   2337 	ddi_dma_cookie_t dma_cookie;
   2338 	uint_t ncookies;
   2339 	size_t len;
   2340 
   2341 	if (kmflag & KM_NOSLEEP)
   2342 		ddiflags = DDI_DMA_DONTWAIT;
   2343 
   2344 	/* Allocate a DMA access handle for the buffer. */
   2345 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
   2346 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
   2347 		goto failure;
   2348 
   2349 	/* Allocate DMA-able memory for buffer. */
   2350 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
   2351 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
   2352 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
   2353 		goto failure_1;
   2354 
   2355 	/* Bind to virtual address of buffer to get physical address. */
   2356 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
   2357 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
   2358 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
   2359 		goto failure_2;
   2360 	ASSERT(ncookies == 1);
   2361 
   2362 	bdesc->free_rtn.free_func = xnf_buf_recycle;
   2363 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
   2364 	bdesc->xnfp = xnfp;
   2365 	bdesc->buf_phys = dma_cookie.dmac_laddress;
   2366 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
   2367 	bdesc->len = dma_cookie.dmac_size;
   2368 	bdesc->grant_ref = INVALID_GRANT_REF;
   2369 	bdesc->gen = xnfp->xnf_gen;
   2370 
   2371 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1);
   2372 
   2373 	return (0);
   2374 
   2375 failure_2:
   2376 	ddi_dma_mem_free(&bdesc->acc_handle);
   2377 
   2378 failure_1:
   2379 	ddi_dma_free_handle(&bdesc->dma_handle);
   2380 
   2381 failure:
   2382 
   2383 	return (-1);
   2384 }
   2385 
   2386 static void
   2387 xnf_buf_destructor(void *buf, void *arg)
   2388 {
   2389 	xnf_buf_t *bdesc = buf;
   2390 	xnf_t *xnfp = arg;
   2391 
   2392 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
   2393 	ddi_dma_mem_free(&bdesc->acc_handle);
   2394 	ddi_dma_free_handle(&bdesc->dma_handle);
   2395 
   2396 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1);
   2397 }
   2398 
   2399 static xnf_buf_t *
   2400 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
   2401 {
   2402 	grant_ref_t gref;
   2403 	xnf_buf_t *bufp;
   2404 
   2405 	/*
   2406 	 * Usually grant references are more scarce than memory, so we
   2407 	 * attempt to acquire a grant reference first.
   2408 	 */
   2409 	gref = gref_get(xnfp);
   2410 	if (gref == INVALID_GRANT_REF)
   2411 		return (NULL);
   2412 
   2413 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
   2414 	if (bufp == NULL) {
   2415 		gref_put(xnfp, gref);
   2416 		return (NULL);
   2417 	}
   2418 
   2419 	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
   2420 
   2421 	bufp->grant_ref = gref;
   2422 
   2423 	if (bufp->gen != xnfp->xnf_gen)
   2424 		xnf_buf_refresh(bufp);
   2425 
   2426 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
   2427 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
   2428 	    bufp->buf_mfn, readonly ? 1 : 0);
   2429 
   2430 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1);
   2431 
   2432 	return (bufp);
   2433 }
   2434 
   2435 static void
   2436 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
   2437 {
   2438 	if (bufp->grant_ref != INVALID_GRANT_REF) {
   2439 		(void) gnttab_end_foreign_access_ref(
   2440 		    bufp->grant_ref, readonly ? 1 : 0);
   2441 		gref_put(xnfp, bufp->grant_ref);
   2442 		bufp->grant_ref = INVALID_GRANT_REF;
   2443 	}
   2444 
   2445 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
   2446 
   2447 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1);
   2448 }
   2449 
   2450 /*
   2451  * Refresh any cached data about a buffer after resume.
   2452  */
   2453 static void
   2454 xnf_buf_refresh(xnf_buf_t *bdesc)
   2455 {
   2456 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
   2457 	bdesc->gen = bdesc->xnfp->xnf_gen;
   2458 }
   2459 
   2460 /*
   2461  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
   2462  * look-aside buffers.
   2463  */
   2464 static void
   2465 xnf_buf_recycle(xnf_buf_t *bdesc)
   2466 {
   2467 	xnf_t *xnfp = bdesc->xnfp;
   2468 
   2469 	xnf_buf_put(xnfp, bdesc, B_TRUE);
   2470 }
   2471 
   2472 static int
   2473 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
   2474 {
   2475 	_NOTE(ARGUNUSED(kmflag));
   2476 	xnf_txbuf_t *txp = buf;
   2477 	xnf_t *xnfp = arg;
   2478 
   2479 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
   2480 	    0, 0, &txp->tx_dma_handle) != DDI_SUCCESS)
   2481 		return (-1);
   2482 
   2483 	return (0);
   2484 }
   2485 
   2486 static void
   2487 xnf_tx_buf_destructor(void *buf, void *arg)
   2488 {
   2489 	_NOTE(ARGUNUSED(arg));
   2490 	xnf_txbuf_t *txp = buf;
   2491 
   2492 	ddi_dma_free_handle(&txp->tx_dma_handle);
   2493 }
   2494 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux") named kstats.  The number of
 * entries here determines how many kstats xnf_kstat_init() creates,
 * and xnf_kstat_aux_update() fills the values in positionally, so
 * the order here must match the assignment order there.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};
   2514 
   2515 static int
   2516 xnf_kstat_aux_update(kstat_t *ksp, int flag)
   2517 {
   2518 	xnf_t *xnfp;
   2519 	kstat_named_t *knp;
   2520 
   2521 	if (flag != KSTAT_READ)
   2522 		return (EACCES);
   2523 
   2524 	xnfp = ksp->ks_private;
   2525 	knp = ksp->ks_data;
   2526 
   2527 	/*
   2528 	 * Assignment order must match that of the names in
   2529 	 * xnf_aux_statistics.
   2530 	 */
   2531 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
   2532 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
   2533 
   2534 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
   2535 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
   2536 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
   2537 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
   2538 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
   2539 
   2540 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
   2541 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
   2542 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
   2543 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
   2544 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
   2545 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
   2546 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
   2547 
   2548 	return (0);
   2549 }
   2550 
   2551 static boolean_t
   2552 xnf_kstat_init(xnf_t *xnfp)
   2553 {
   2554 	int nstat = sizeof (xnf_aux_statistics) /
   2555 	    sizeof (xnf_aux_statistics[0]);
   2556 	char **cp = xnf_aux_statistics;
   2557 	kstat_named_t *knp;
   2558 
   2559 	/*
   2560 	 * Create and initialise kstats.
   2561 	 */
   2562 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
   2563 	    ddi_get_instance(xnfp->xnf_devinfo),
   2564 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
   2565 	    nstat, 0)) == NULL)
   2566 		return (B_FALSE);
   2567 
   2568 	xnfp->xnf_kstat_aux->ks_private = xnfp;
   2569 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
   2570 
   2571 	knp = xnfp->xnf_kstat_aux->ks_data;
   2572 	while (nstat > 0) {
   2573 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
   2574 
   2575 		knp++;
   2576 		cp++;
   2577 		nstat--;
   2578 	}
   2579 
   2580 	kstat_install(xnfp->xnf_kstat_aux);
   2581 
   2582 	return (B_TRUE);
   2583 }
   2584 
   2585 static int
   2586 xnf_stat(void *arg, uint_t stat, uint64_t *val)
   2587 {
   2588 	xnf_t *xnfp = arg;
   2589 
   2590 	mutex_enter(&xnfp->xnf_rxlock);
   2591 	mutex_enter(&xnfp->xnf_txlock);
   2592 
   2593 #define	mac_stat(q, r)				\
   2594 	case (MAC_STAT_##q):			\
   2595 		*val = xnfp->xnf_stat_##r;	\
   2596 		break
   2597 
   2598 #define	ether_stat(q, r)			\
   2599 	case (ETHER_STAT_##q):			\
   2600 		*val = xnfp->xnf_stat_##r;	\
   2601 		break
   2602 
   2603 	switch (stat) {
   2604 
   2605 	mac_stat(IPACKETS, ipackets);
   2606 	mac_stat(OPACKETS, opackets);
   2607 	mac_stat(RBYTES, rbytes);
   2608 	mac_stat(OBYTES, obytes);
   2609 	mac_stat(NORCVBUF, norxbuf);
   2610 	mac_stat(IERRORS, errrx);
   2611 	mac_stat(NOXMTBUF, tx_defer);
   2612 
   2613 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
   2614 	ether_stat(TOOSHORT_ERRORS, runt);
   2615 
   2616 	/* always claim to be in full duplex mode */
   2617 	case ETHER_STAT_LINK_DUPLEX:
   2618 		*val = LINK_DUPLEX_FULL;
   2619 		break;
   2620 
   2621 	/* always claim to be at 1Gb/s link speed */
   2622 	case MAC_STAT_IFSPEED:
   2623 		*val = 1000000000ull;
   2624 		break;
   2625 
   2626 	default:
   2627 		mutex_exit(&xnfp->xnf_txlock);
   2628 		mutex_exit(&xnfp->xnf_rxlock);
   2629 
   2630 		return (ENOTSUP);
   2631 	}
   2632 
   2633 #undef mac_stat
   2634 #undef ether_stat
   2635 
   2636 	mutex_exit(&xnfp->xnf_txlock);
   2637 	mutex_exit(&xnfp->xnf_rxlock);
   2638 
   2639 	return (0);
   2640 }
   2641 
   2642 static boolean_t
   2643 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
   2644 {
   2645 	_NOTE(ARGUNUSED(arg));
   2646 
   2647 	switch (cap) {
   2648 	case MAC_CAPAB_HCKSUM: {
   2649 		uint32_t *capab = cap_data;
   2650 
   2651 		/*
   2652 		 * Whilst the flag used to communicate with the IO
   2653 		 * domain is called "NETTXF_csum_blank", the checksum
   2654 		 * in the packet must contain the pseudo-header
   2655 		 * checksum and not zero.
   2656 		 *
   2657 		 * To help out the IO domain, we might use
   2658 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
   2659 		 * then use checksum offload for IPv6 packets, which
   2660 		 * the IO domain can't handle.
   2661 		 *
   2662 		 * As a result, we declare outselves capable of
   2663 		 * HCKSUM_INET_FULL_V4. This means that we receive
   2664 		 * IPv4 packets from the stack with a blank checksum
   2665 		 * field and must insert the pseudo-header checksum
   2666 		 * before passing the packet to the IO domain.
   2667 		 */
   2668 		*capab = HCKSUM_INET_FULL_V4;
   2669 		break;
   2670 	}
   2671 	default:
   2672 		return (B_FALSE);
   2673 	}
   2674 
   2675 	return (B_TRUE);
   2676 }
   2677 
   2678 /*
   2679  * The state of the peer has changed - react accordingly.
   2680  */
   2681 static void
   2682 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   2683     void *arg, void *impl_data)
   2684 {
   2685 	_NOTE(ARGUNUSED(id, arg));
   2686 	xnf_t *xnfp = ddi_get_driver_private(dip);
   2687 	XenbusState new_state = *(XenbusState *)impl_data;
   2688 
   2689 	ASSERT(xnfp != NULL);
   2690 
   2691 	switch (new_state) {
   2692 	case XenbusStateUnknown:
   2693 	case XenbusStateInitialising:
   2694 	case XenbusStateInitialised:
   2695 	case XenbusStateClosing:
   2696 	case XenbusStateClosed:
   2697 	case XenbusStateReconfiguring:
   2698 	case XenbusStateReconfigured:
   2699 		break;
   2700 
   2701 	case XenbusStateInitWait:
   2702 		xnf_read_config(xnfp);
   2703 
   2704 		if (!xnfp->xnf_be_rx_copy) {
   2705 			cmn_err(CE_WARN,
   2706 			    "The xnf driver requires a dom0 that "
   2707 			    "supports 'feature-rx-copy'.");
   2708 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
   2709 			    XBT_NULL, XenbusStateClosed);
   2710 			break;
   2711 		}
   2712 
   2713 		/*
   2714 		 * Connect to the backend.
   2715 		 */
   2716 		xnf_be_connect(xnfp);
   2717 
   2718 		/*
   2719 		 * Our MAC address as discovered by xnf_read_config().
   2720 		 */
   2721 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
   2722 
   2723 		break;
   2724 
   2725 	case XenbusStateConnected:
   2726 		mutex_enter(&xnfp->xnf_rxlock);
   2727 		mutex_enter(&xnfp->xnf_txlock);
   2728 
   2729 		xnfp->xnf_connected = B_TRUE;
   2730 		/*
   2731 		 * Wake up any threads waiting to send data to
   2732 		 * backend.
   2733 		 */
   2734 		cv_broadcast(&xnfp->xnf_cv_state);
   2735 
   2736 		mutex_exit(&xnfp->xnf_txlock);
   2737 		mutex_exit(&xnfp->xnf_rxlock);
   2738 
   2739 		/*
   2740 		 * Kick the peer in case it missed any transmits
   2741 		 * request in the TX ring.
   2742 		 */
   2743 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   2744 
   2745 		/*
   2746 		 * There may already be completed receive requests in
   2747 		 * the ring sent by backend after it gets connected
   2748 		 * but before we see its state change here, so we call
   2749 		 * xnf_intr() to handle them, if any.
   2750 		 */
   2751 		(void) xnf_intr((caddr_t)xnfp);
   2752 
   2753 		/*
   2754 		 * Mark the link up now that we are connected.
   2755 		 */
   2756 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
   2757 
   2758 		/*
   2759 		 * Tell the backend about the multicast addresses in
   2760 		 * which we are interested.
   2761 		 */
   2762 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
   2763 
   2764 		break;
   2765 
   2766 	default:
   2767 		break;
   2768 	}
   2769 }
   2770