Home | History | Annotate | Download | only in io
      1   5084   johnlev /*
      2   5084   johnlev  * CDDL HEADER START
      3   5084   johnlev  *
      4   5084   johnlev  * The contents of this file are subject to the terms of the
      5   5084   johnlev  * Common Development and Distribution License (the "License").
      6   5084   johnlev  * You may not use this file except in compliance with the License.
      7   5084   johnlev  *
      8   5084   johnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   5084   johnlev  * or http://www.opensolaris.org/os/licensing.
     10   5084   johnlev  * See the License for the specific language governing permissions
     11   5084   johnlev  * and limitations under the License.
     12   5084   johnlev  *
     13   5084   johnlev  * When distributing Covered Code, include this CDDL HEADER in each
     14   5084   johnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   5084   johnlev  * If applicable, add the following below this CDDL HEADER, with the
     16   5084   johnlev  * fields enclosed by brackets "[]" replaced with your own identifying
     17   5084   johnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   5084   johnlev  *
     19   5084   johnlev  * CDDL HEADER END
     20   5084   johnlev  */
     21   5084   johnlev 
     22   5084   johnlev /*
     23   8757       dme  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24   5084   johnlev  * Use is subject to license terms.
     25   5084   johnlev  */
     26   5084   johnlev 
     27   5084   johnlev #ifdef DEBUG
     28   5084   johnlev #define	XNB_DEBUG 1
     29   5084   johnlev #endif /* DEBUG */
     30   5084   johnlev 
     31   5084   johnlev #include "xnb.h"
     32   5084   johnlev 
     33   5084   johnlev #include <sys/sunddi.h>
     34   5084   johnlev #include <sys/sunndi.h>
     35   5084   johnlev #include <sys/modctl.h>
     36   5084   johnlev #include <sys/conf.h>
     37   5084   johnlev #include <sys/mac.h>
     38  10958       dme #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
     39   5084   johnlev #include <sys/dlpi.h>
     40   5084   johnlev #include <sys/strsubr.h>
     41   5084   johnlev #include <sys/strsun.h>
     42   5741       mrj #include <sys/types.h>
     43   5084   johnlev #include <sys/pattr.h>
     44   5084   johnlev #include <vm/seg_kmem.h>
     45   5084   johnlev #include <vm/hat_i86.h>
     46   5084   johnlev #include <xen/sys/xenbus_impl.h>
     47   5084   johnlev #include <xen/sys/xendev.h>
     48   5084   johnlev #include <sys/balloon_impl.h>
     49   5084   johnlev #include <sys/evtchn_impl.h>
     50   5084   johnlev #include <sys/gnttab.h>
     51   5262    rscott #include <vm/vm_dep.h>
     52  10958       dme #include <sys/note.h>
     53   5084   johnlev #include <sys/gld.h>
     54   5084   johnlev #include <inet/ip.h>
     55   5084   johnlev #include <inet/ip_impl.h>
     56   5084   johnlev 
     57   5084   johnlev /*
     58   7615       Max  * The terms "transmit" and "receive" are used in alignment with domU,
     59   7615       Max  * which means that packets originating from the peer domU are "transmitted"
     60   7615       Max  * to other parts of the system and packets are "received" from them.
     61   5084   johnlev  */
     62   5084   johnlev 
     63   5084   johnlev /*
     64  10958       dme  * Should we allow guests to manipulate multicast group membership?
     65   5084   johnlev  */
     66  10958       dme static boolean_t	xnb_multicast_control = B_TRUE;
     67   5084   johnlev 
     68   5084   johnlev static boolean_t	xnb_connect_rings(dev_info_t *);
     69   5084   johnlev static void		xnb_disconnect_rings(dev_info_t *);
     70   5084   johnlev static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
     71   5084   johnlev     void *, void *);
     72   5084   johnlev static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
     73   5084   johnlev     void *, void *);
     74   5084   johnlev 
     75   7615       Max static int	xnb_txbuf_constructor(void *, void *, int);
     76   7615       Max static void	xnb_txbuf_destructor(void *, void *);
     77  10958       dme static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
     78   7615       Max static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
     79  10958       dme 
     80  10958       dme mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
     81   5741       mrj mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
     82   5741       mrj 
     83  10958       dme static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
     84  10958       dme     size_t, size_t, size_t, grant_ref_t);
     85  10958       dme #pragma inline(setup_gop)
     86  10958       dme static boolean_t	is_foreign(void *);
     87  10958       dme #pragma inline(is_foreign)
     88   5741       mrj 
     89   5084   johnlev #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
     90   5084   johnlev #define	INVALID_GRANT_REF	((grant_ref_t)-1)
     91   5084   johnlev 
     92   5084   johnlev static kmutex_t	xnb_alloc_page_lock;
     93  10958       dme 
     94  10958       dme /*
     95  10958       dme  * On a 32 bit PAE system physical and machine addresses are larger
     96  10958       dme  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
     97  10958       dme  * argument, and so addresses above 4G are truncated before ddi_btop()
     98  10958       dme  * gets to see them.  To avoid this, code the shift operation here.
     99  10958       dme  */
    100  10958       dme #define	xnb_btop(addr)	((addr) >> PAGESHIFT)
    101  10958       dme 
    102  10958       dme /* DMA attributes for transmit and receive data */
    103  10958       dme static ddi_dma_attr_t buf_dma_attr = {
    104  10958       dme 	DMA_ATTR_V0,		/* version of this structure */
    105  10958       dme 	0,			/* lowest usable address */
    106  10958       dme 	0xffffffffffffffffULL,	/* highest usable address */
    107  10958       dme 	0x7fffffff,		/* maximum DMAable byte count */
    108  10958       dme 	MMU_PAGESIZE,		/* alignment in bytes */
    109  10958       dme 	0x7ff,			/* bitmap of burst sizes */
    110  10958       dme 	1,			/* minimum transfer */
    111  10958       dme 	0xffffffffU,		/* maximum transfer */
    112  10958       dme 	0xffffffffffffffffULL,	/* maximum segment length */
    113  10958       dme 	1,			/* maximum number of segments */
    114  10958       dme 	1,			/* granularity */
    115  10958       dme 	0,			/* flags (reserved) */
    116  10958       dme };
    117  10958       dme 
    118  10958       dme /* DMA access attributes for data: NOT to be byte swapped. */
    119  10958       dme static ddi_device_acc_attr_t data_accattr = {
    120  10958       dme 	DDI_DEVICE_ATTR_V0,
    121  10958       dme 	DDI_NEVERSWAP_ACC,
    122  10958       dme 	DDI_STRICTORDER_ACC
    123  10958       dme };
    124   5084   johnlev 
    125   5084   johnlev /*
    126   5084   johnlev  * Statistics.
    127   5084   johnlev  */
    128   5084   johnlev static char *aux_statistics[] = {
    129   7615       Max 	"rx_cksum_deferred",
    130   7615       Max 	"tx_cksum_no_need",
    131   7615       Max 	"rx_rsp_notok",
    132   5084   johnlev 	"tx_notify_deferred",
    133   5084   johnlev 	"tx_notify_sent",
    134   5084   johnlev 	"rx_notify_deferred",
    135   5084   johnlev 	"rx_notify_sent",
    136   5084   johnlev 	"tx_too_early",
    137   5084   johnlev 	"rx_too_early",
    138   5084   johnlev 	"rx_allocb_failed",
    139   5741       mrj 	"tx_allocb_failed",
    140   7615       Max 	"rx_foreign_page",
    141   5084   johnlev 	"mac_full",
    142   5084   johnlev 	"spurious_intr",
    143   5084   johnlev 	"allocation_success",
    144   5084   johnlev 	"allocation_failure",
    145   5084   johnlev 	"small_allocation_success",
    146   5084   johnlev 	"small_allocation_failure",
    147   5741       mrj 	"other_allocation_failure",
    148   7615       Max 	"rx_pageboundary_crossed",
    149   7615       Max 	"rx_cpoparea_grown",
    150   5084   johnlev 	"csum_hardware",
    151   5084   johnlev 	"csum_software",
    152   5084   johnlev };
    153   5084   johnlev 
    154   5084   johnlev static int
    155   5084   johnlev xnb_ks_aux_update(kstat_t *ksp, int flag)
    156   5084   johnlev {
    157   5084   johnlev 	xnb_t *xnbp;
    158   5084   johnlev 	kstat_named_t *knp;
    159   5084   johnlev 
    160   5084   johnlev 	if (flag != KSTAT_READ)
    161   5084   johnlev 		return (EACCES);
    162   5084   johnlev 
    163   5084   johnlev 	xnbp = ksp->ks_private;
    164   5084   johnlev 	knp = ksp->ks_data;
    165   5084   johnlev 
    166   5084   johnlev 	/*
    167   5084   johnlev 	 * Assignment order should match that of the names in
    168   5084   johnlev 	 * aux_statistics.
    169   5084   johnlev 	 */
    170   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
    171   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
    172   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
    173   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
    174   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
    175   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
    176   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
    177   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
    178   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
    179   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
    180   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
    181   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
    182   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
    183   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
    184   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
    185   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
    186   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
    187   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
    188   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
    189   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
    190   7615       Max 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
    191   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
    192   5741       mrj 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
    193   5084   johnlev 
    194   5084   johnlev 	return (0);
    195   5084   johnlev }
    196   5084   johnlev 
    197   5084   johnlev static boolean_t
    198   5084   johnlev xnb_ks_init(xnb_t *xnbp)
    199   5084   johnlev {
    200   5084   johnlev 	int nstat = sizeof (aux_statistics) /
    201   5084   johnlev 	    sizeof (aux_statistics[0]);
    202   5084   johnlev 	char **cp = aux_statistics;
    203   5084   johnlev 	kstat_named_t *knp;
    204   5084   johnlev 
    205   5084   johnlev 	/*
    206   5084   johnlev 	 * Create and initialise kstats.
    207   5084   johnlev 	 */
    208   5741       mrj 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
    209   5741       mrj 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
    210   5084   johnlev 	    KSTAT_TYPE_NAMED, nstat, 0);
    211   5741       mrj 	if (xnbp->xnb_kstat_aux == NULL)
    212   5084   johnlev 		return (B_FALSE);
    213   5084   johnlev 
    214   5741       mrj 	xnbp->xnb_kstat_aux->ks_private = xnbp;
    215   5741       mrj 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
    216   5084   johnlev 
    217   5741       mrj 	knp = xnbp->xnb_kstat_aux->ks_data;
    218   5084   johnlev 	while (nstat > 0) {
    219   5084   johnlev 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
    220   5084   johnlev 
    221   5084   johnlev 		knp++;
    222   5084   johnlev 		cp++;
    223   5084   johnlev 		nstat--;
    224   5084   johnlev 	}
    225   5084   johnlev 
    226   5741       mrj 	kstat_install(xnbp->xnb_kstat_aux);
    227   5084   johnlev 
    228   5084   johnlev 	return (B_TRUE);
    229   5084   johnlev }
    230   5084   johnlev 
    231   5084   johnlev static void
    232   5084   johnlev xnb_ks_free(xnb_t *xnbp)
    233   5084   johnlev {
    234   5741       mrj 	kstat_delete(xnbp->xnb_kstat_aux);
    235   5084   johnlev }
    236   5084   johnlev 
    237   5084   johnlev /*
    238  10958       dme  * Calculate and insert the transport checksum for an arbitrary packet.
    239   5084   johnlev  */
    240   5084   johnlev static mblk_t *
    241   5084   johnlev xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
    242   5084   johnlev {
    243  10958       dme 	_NOTE(ARGUNUSED(xnbp));
    244  10958       dme 
    245   5084   johnlev 	/*
    246  10958       dme 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
    247   5084   johnlev 	 * because it doesn't cover all of the interesting cases :-(
    248   5084   johnlev 	 */
    249   5084   johnlev 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
    250   5084   johnlev 	    HCK_FULLCKSUM, KM_NOSLEEP);
    251   5084   johnlev 
    252   8275      Eric 	return (mac_fix_cksum(mp));
    253   5084   johnlev }
    254   5084   johnlev 
    255   5084   johnlev mblk_t *
    256   5084   johnlev xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
    257   5084   johnlev {
    258   5084   johnlev 	struct ether_header *ehp;
    259   5084   johnlev 	uint16_t sap;
    260   5084   johnlev 	uint32_t offset;
    261   5084   johnlev 	ipha_t *ipha;
    262   5084   johnlev 
    263   5084   johnlev 	ASSERT(mp->b_next == NULL);
    264   5084   johnlev 
    265   5084   johnlev 	/*
    266   5084   johnlev 	 * Check that the packet is contained in a single mblk.  In
    267  10958       dme 	 * the "from peer" path this is true today, but may change
    268   5084   johnlev 	 * when scatter gather support is added.  In the "to peer"
    269   5084   johnlev 	 * path we cannot be sure, but in most cases it will be true
    270   5084   johnlev 	 * (in the xnbo case the packet has come from a MAC device
    271   5084   johnlev 	 * which is unlikely to split packets).
    272   5084   johnlev 	 */
    273   5084   johnlev 	if (mp->b_cont != NULL)
    274   5084   johnlev 		goto software;
    275   5084   johnlev 
    276   5084   johnlev 	/*
    277   5084   johnlev 	 * If the MAC has no hardware capability don't do any further
    278   5084   johnlev 	 * checking.
    279   5084   johnlev 	 */
    280   5084   johnlev 	if (capab == 0)
    281   5084   johnlev 		goto software;
    282   5084   johnlev 
    283   5084   johnlev 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
    284   5084   johnlev 	ehp = (struct ether_header *)mp->b_rptr;
    285   5084   johnlev 
    286   5084   johnlev 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
    287   5084   johnlev 		struct ether_vlan_header *evhp;
    288   5084   johnlev 
    289   5084   johnlev 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
    290   5084   johnlev 		evhp = (struct ether_vlan_header *)mp->b_rptr;
    291   5084   johnlev 		sap = ntohs(evhp->ether_type);
    292   5084   johnlev 		offset = sizeof (struct ether_vlan_header);
    293   5084   johnlev 	} else {
    294   5084   johnlev 		sap = ntohs(ehp->ether_type);
    295   5084   johnlev 		offset = sizeof (struct ether_header);
    296   5084   johnlev 	}
    297   5084   johnlev 
    298   5084   johnlev 	/*
    299   5084   johnlev 	 * We only attempt to do IPv4 packets in hardware.
    300   5084   johnlev 	 */
    301   5084   johnlev 	if (sap != ETHERTYPE_IP)
    302   5084   johnlev 		goto software;
    303   5084   johnlev 
    304   5084   johnlev 	/*
    305   5084   johnlev 	 * We know that this is an IPv4 packet.
    306   5084   johnlev 	 */
    307   5084   johnlev 	ipha = (ipha_t *)(mp->b_rptr + offset);
    308   5084   johnlev 
    309   5084   johnlev 	switch (ipha->ipha_protocol) {
    310   5084   johnlev 	case IPPROTO_TCP:
    311   7351       dme 	case IPPROTO_UDP: {
    312   7351       dme 		uint32_t start, length, stuff, cksum;
    313   7351       dme 		uint16_t *stuffp;
    314   7351       dme 
    315   5084   johnlev 		/*
    316   7351       dme 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
    317   7351       dme 		 * can use full IPv4 and partial checksum offload.
    318   5084   johnlev 		 */
    319   7351       dme 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
    320   7351       dme 			break;
    321   7351       dme 
    322   7351       dme 		start = IP_SIMPLE_HDR_LENGTH;
    323   7351       dme 		length = ntohs(ipha->ipha_length);
    324   7351       dme 		if (ipha->ipha_protocol == IPPROTO_TCP) {
    325   7351       dme 			stuff = start + TCP_CHECKSUM_OFFSET;
    326   7351       dme 			cksum = IP_TCP_CSUM_COMP;
    327   7351       dme 		} else {
    328   7351       dme 			stuff = start + UDP_CHECKSUM_OFFSET;
    329   7351       dme 			cksum = IP_UDP_CSUM_COMP;
    330   7351       dme 		}
    331   7351       dme 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
    332   7351       dme 
    333   7351       dme 		if (capab & HCKSUM_INET_FULL_V4) {
    334   7351       dme 			/*
    335   7351       dme 			 * Some devices require that the checksum
    336   7351       dme 			 * field of the packet is zero for full
    337   7351       dme 			 * offload.
    338   7351       dme 			 */
    339   7351       dme 			*stuffp = 0;
    340   7351       dme 
    341   5084   johnlev 			(void) hcksum_assoc(mp, NULL, NULL,
    342   5084   johnlev 			    0, 0, 0, 0,
    343   5084   johnlev 			    HCK_FULLCKSUM, KM_NOSLEEP);
    344   5084   johnlev 
    345   5741       mrj 			xnbp->xnb_stat_csum_hardware++;
    346   5084   johnlev 
    347   5084   johnlev 			return (mp);
    348   5084   johnlev 		}
    349   5084   johnlev 
    350   7351       dme 		if (capab & HCKSUM_INET_PARTIAL) {
    351   7351       dme 			if (*stuffp == 0) {
    352   7351       dme 				ipaddr_t src, dst;
    353   5084   johnlev 
    354   7351       dme 				/*
    355   7351       dme 				 * Older Solaris guests don't insert
    356   7351       dme 				 * the pseudo-header checksum, so we
    357   7351       dme 				 * calculate it here.
    358   7351       dme 				 */
    359   7351       dme 				src = ipha->ipha_src;
    360   7351       dme 				dst = ipha->ipha_dst;
    361   7351       dme 
    362   7351       dme 				cksum += (dst >> 16) + (dst & 0xFFFF);
    363   7351       dme 				cksum += (src >> 16) + (src & 0xFFFF);
    364   7351       dme 				cksum += length - IP_SIMPLE_HDR_LENGTH;
    365   7351       dme 
    366   7351       dme 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    367   7351       dme 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
    368   7351       dme 
    369   7351       dme 				ASSERT(cksum <= 0xFFFF);
    370   7351       dme 
    371   7351       dme 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
    372   7351       dme 			}
    373   7351       dme 
    374   7351       dme 			(void) hcksum_assoc(mp, NULL, NULL,
    375   7351       dme 			    start, stuff, length, 0,
    376   7351       dme 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
    377   7351       dme 
    378   7351       dme 			xnbp->xnb_stat_csum_hardware++;
    379   7351       dme 
    380   7351       dme 			return (mp);
    381   7351       dme 		}
    382   7351       dme 
    383   7351       dme 		/* NOTREACHED */
    384   5084   johnlev 		break;
    385   7351       dme 	}
    386   5084   johnlev 
    387   5084   johnlev 	default:
    388   5084   johnlev 		/* Use software. */
    389   5084   johnlev 		break;
    390   5084   johnlev 	}
    391   5084   johnlev 
    392   5084   johnlev software:
    393   5084   johnlev 	/*
    394   5084   johnlev 	 * We are not able to use any offload so do the whole thing in
    395   5084   johnlev 	 * software.
    396   5084   johnlev 	 */
    397   5741       mrj 	xnbp->xnb_stat_csum_software++;
    398   5084   johnlev 
    399   5084   johnlev 	return (xnb_software_csum(xnbp, mp));
    400   5084   johnlev }
    401   5084   johnlev 
    402   5084   johnlev int
    403   5084   johnlev xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
    404   5084   johnlev {
    405   5084   johnlev 	xnb_t *xnbp;
    406  10958       dme 	char *xsname;
    407  10958       dme 	char cachename[32];
    408   5084   johnlev 
    409   5084   johnlev 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
    410   5084   johnlev 
    411   5741       mrj 	xnbp->xnb_flavour = flavour;
    412   5741       mrj 	xnbp->xnb_flavour_data = flavour_data;
    413   5741       mrj 	xnbp->xnb_devinfo = dip;
    414   5741       mrj 	xnbp->xnb_evtchn = INVALID_EVTCHN;
    415   5741       mrj 	xnbp->xnb_irq = B_FALSE;
    416   5741       mrj 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
    417   5741       mrj 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
    418   5741       mrj 	xnbp->xnb_connected = B_FALSE;
    419   5741       mrj 	xnbp->xnb_hotplugged = B_FALSE;
    420   5741       mrj 	xnbp->xnb_detachable = B_FALSE;
    421   5741       mrj 	xnbp->xnb_peer = xvdi_get_oeid(dip);
    422  10958       dme 	xnbp->xnb_be_status = XNB_STATE_INIT;
    423  10958       dme 	xnbp->xnb_fe_status = XNB_STATE_INIT;
    424   5084   johnlev 
    425   7615       Max 	xnbp->xnb_tx_buf_count = 0;
    426   5084   johnlev 
    427  10958       dme 	xnbp->xnb_rx_hv_copy = B_FALSE;
    428  10958       dme 	xnbp->xnb_multicast_control = B_FALSE;
    429   5084   johnlev 
    430   7615       Max 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
    431   7615       Max 	ASSERT(xnbp->xnb_rx_va != NULL);
    432   5741       mrj 
    433   5741       mrj 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
    434   5084   johnlev 	    != DDI_SUCCESS)
    435   5084   johnlev 		goto failure;
    436   5084   johnlev 
    437  10958       dme 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
    438   7615       Max 	xnbp->xnb_rx_cpop = NULL;
    439  10958       dme 	xnbp->xnb_rx_cpop_count = 0;
    440   5741       mrj 
    441   5741       mrj 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
    442   5741       mrj 	    xnbp->xnb_icookie);
    443   5741       mrj 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
    444   5741       mrj 	    xnbp->xnb_icookie);
    445  10958       dme 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
    446  10958       dme 	    xnbp->xnb_icookie);
    447   5084   johnlev 
    448  10958       dme 	/* Set driver private pointer now. */
    449   5084   johnlev 	ddi_set_driver_private(dip, xnbp);
    450  10958       dme 
    451  10958       dme 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
    452  10958       dme 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
    453  10958       dme 	    sizeof (xnb_txbuf_t), 0,
    454  10958       dme 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
    455  10958       dme 	    NULL, xnbp, NULL, 0);
    456  10958       dme 	if (xnbp->xnb_tx_buf_cache == NULL)
    457  10958       dme 		goto failure_0;
    458   5084   johnlev 
    459   5084   johnlev 	if (!xnb_ks_init(xnbp))
    460   5741       mrj 		goto failure_1;
    461   5084   johnlev 
    462   5084   johnlev 	/*
    463   5084   johnlev 	 * Receive notification of changes in the state of the
    464   5084   johnlev 	 * driver in the guest domain.
    465   5084   johnlev 	 */
    466   7756      Mark 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
    467   7756      Mark 	    NULL) != DDI_SUCCESS)
    468   5741       mrj 		goto failure_2;
    469   5084   johnlev 
    470   5084   johnlev 	/*
    471   5084   johnlev 	 * Receive notification of hotplug events.
    472   5084   johnlev 	 */
    473   7756      Mark 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
    474   7756      Mark 	    NULL) != DDI_SUCCESS)
    475   5741       mrj 		goto failure_2;
    476   5084   johnlev 
    477   5084   johnlev 	xsname = xvdi_get_xsname(dip);
    478   5084   johnlev 
    479   5084   johnlev 	if (xenbus_printf(XBT_NULL, xsname,
    480  10958       dme 	    "feature-multicast-control", "%d",
    481  10958       dme 	    xnb_multicast_control ? 1 : 0) != 0)
    482   5741       mrj 		goto failure_3;
    483   5741       mrj 
    484   5741       mrj 	if (xenbus_printf(XBT_NULL, xsname,
    485  10958       dme 	    "feature-rx-copy", "%d",  1) != 0)
    486   5741       mrj 		goto failure_3;
    487   5741       mrj 	/*
    488   5741       mrj 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
    489   5741       mrj 	 * in addition to "feature-rx-copy" being 1. It seems strange
    490   5741       mrj 	 * to use four possible states to describe a binary decision,
    491   5741       mrj 	 * but we might as well play nice.
    492   5741       mrj 	 */
    493   5741       mrj 	if (xenbus_printf(XBT_NULL, xsname,
    494  10958       dme 	    "feature-rx-flip", "%d", 0) != 0)
    495   5741       mrj 		goto failure_3;
    496   5084   johnlev 
    497   5084   johnlev 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
    498   5084   johnlev 	(void) xvdi_post_event(dip, XEN_HP_ADD);
    499   5084   johnlev 
    500   5084   johnlev 	return (DDI_SUCCESS);
    501   5084   johnlev 
    502   5741       mrj failure_3:
    503   5084   johnlev 	xvdi_remove_event_handler(dip, NULL);
    504   5084   johnlev 
    505   5741       mrj failure_2:
    506   5084   johnlev 	xnb_ks_free(xnbp);
    507   5084   johnlev 
    508   5741       mrj failure_1:
    509  10958       dme 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    510  10958       dme 
    511  10958       dme failure_0:
    512  10958       dme 	mutex_destroy(&xnbp->xnb_state_lock);
    513   5741       mrj 	mutex_destroy(&xnbp->xnb_rx_lock);
    514   5741       mrj 	mutex_destroy(&xnbp->xnb_tx_lock);
    515   5084   johnlev 
    516   5084   johnlev failure:
    517   7615       Max 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    518   5084   johnlev 	kmem_free(xnbp, sizeof (*xnbp));
    519   5084   johnlev 	return (DDI_FAILURE);
    520   5084   johnlev }
    521   5084   johnlev 
    522   5084   johnlev void
    523   5084   johnlev xnb_detach(dev_info_t *dip)
    524   5084   johnlev {
    525   5084   johnlev 	xnb_t *xnbp = ddi_get_driver_private(dip);
    526   5084   johnlev 
    527   5084   johnlev 	ASSERT(xnbp != NULL);
    528   5741       mrj 	ASSERT(!xnbp->xnb_connected);
    529   7615       Max 	ASSERT(xnbp->xnb_tx_buf_count == 0);
    530   5084   johnlev 
    531   5084   johnlev 	xnb_disconnect_rings(dip);
    532   5084   johnlev 
    533   5084   johnlev 	xvdi_remove_event_handler(dip, NULL);
    534   5084   johnlev 
    535   5084   johnlev 	xnb_ks_free(xnbp);
    536   5084   johnlev 
    537  10958       dme 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
    538  10958       dme 
    539   5084   johnlev 	ddi_set_driver_private(dip, NULL);
    540   5084   johnlev 
    541  10958       dme 	mutex_destroy(&xnbp->xnb_state_lock);
    542  10958       dme 	mutex_destroy(&xnbp->xnb_rx_lock);
    543   5741       mrj 	mutex_destroy(&xnbp->xnb_tx_lock);
    544   5084   johnlev 
    545  10958       dme 	if (xnbp->xnb_rx_cpop_count > 0)
    546  10958       dme 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
    547  10958       dme 		    * xnbp->xnb_rx_cpop_count);
    548   5741       mrj 
    549   7615       Max 	ASSERT(xnbp->xnb_rx_va != NULL);
    550   7615       Max 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
    551   5084   johnlev 
    552   5084   johnlev 	kmem_free(xnbp, sizeof (*xnbp));
    553   5084   johnlev }
    554   5084   johnlev 
    555  10958       dme /*
    556  10958       dme  * Allocate a page from the hypervisor to be flipped to the peer.
    557  10958       dme  *
    558  10958       dme  * Try to get pages in batches to reduce the overhead of calls into
    559  10958       dme  * the balloon driver.
    560  10958       dme  */
    561   5084   johnlev static mfn_t
    562   5084   johnlev xnb_alloc_page(xnb_t *xnbp)
    563   5084   johnlev {
    564   5084   johnlev #define	WARNING_RATE_LIMIT 100
    565   5084   johnlev #define	BATCH_SIZE 256
    566   5084   johnlev 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
    567   5084   johnlev 	static int nth = BATCH_SIZE;
    568   5084   johnlev 	mfn_t mfn;
    569   5084   johnlev 
    570   5084   johnlev 	mutex_enter(&xnb_alloc_page_lock);
    571   5084   johnlev 	if (nth == BATCH_SIZE) {
    572   5084   johnlev 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
    573   5741       mrj 			xnbp->xnb_stat_allocation_failure++;
    574   5084   johnlev 			mutex_exit(&xnb_alloc_page_lock);
    575   5084   johnlev 
    576   5084   johnlev 			/*
    577   5084   johnlev 			 * Try for a single page in low memory situations.
    578   5084   johnlev 			 */
    579   5084   johnlev 			if (balloon_alloc_pages(1, &mfn) != 1) {
    580   5741       mrj 				if ((xnbp->xnb_stat_small_allocation_failure++
    581   5741       mrj 				    % WARNING_RATE_LIMIT) == 0)
    582   5084   johnlev 					cmn_err(CE_WARN, "xnb_alloc_page: "
    583   5084   johnlev 					    "Cannot allocate memory to "
    584   5084   johnlev 					    "transfer packets to peer.");
    585   5084   johnlev 				return (0);
    586   5084   johnlev 			} else {
    587   5741       mrj 				xnbp->xnb_stat_small_allocation_success++;
    588   5084   johnlev 				return (mfn);
    589   5084   johnlev 			}
    590   5084   johnlev 		}
    591   5084   johnlev 
    592   5084   johnlev 		nth = 0;
    593   5741       mrj 		xnbp->xnb_stat_allocation_success++;
    594   5084   johnlev 	}
    595   5084   johnlev 
    596   5084   johnlev 	mfn = mfns[nth++];
    597   5084   johnlev 	mutex_exit(&xnb_alloc_page_lock);
    598   5084   johnlev 
    599   5084   johnlev 	ASSERT(mfn != 0);
    600   5084   johnlev 
    601   5084   johnlev 	return (mfn);
    602   5084   johnlev #undef BATCH_SIZE
    603   5084   johnlev #undef WARNING_RATE_LIMIT
    604   5084   johnlev }
    605   5084   johnlev 
    606  10958       dme /*
    607  10958       dme  * Free a page back to the hypervisor.
    608  10958       dme  *
    609  10958       dme  * This happens only in the error path, so batching is not worth the
    610  10958       dme  * complication.
    611  10958       dme  */
    612   5084   johnlev static void
    613   5084   johnlev xnb_free_page(xnb_t *xnbp, mfn_t mfn)
    614   5084   johnlev {
    615  10958       dme 	_NOTE(ARGUNUSED(xnbp));
    616   5084   johnlev 	int r;
    617   5262    rscott 	pfn_t pfn;
    618   5262    rscott 
    619   5262    rscott 	pfn = xen_assign_pfn(mfn);
    620   5262    rscott 	pfnzero(pfn, 0, PAGESIZE);
    621   5262    rscott 	xen_release_pfn(pfn);
    622   5084   johnlev 
    623   5084   johnlev 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
    624   5084   johnlev 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
    625   5084   johnlev 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
    626   5084   johnlev 		    r, mfn);
    627   5084   johnlev 	}
    628   5084   johnlev }
    629   5084   johnlev 
    630   5741       mrj /*
    631  10958       dme  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
    632  10958       dme  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
    633   5741       mrj  */
    634   5741       mrj #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
    635   5741       mrj 	((((_r)->sring->req_prod - loop) <		\
    636   5741       mrj 		(RING_SIZE(_r) - (loop - prod))) ?	\
    637   5741       mrj 	    ((_r)->sring->req_prod - loop) :		\
    638   5741       mrj 	    (RING_SIZE(_r) - (loop - prod)))
    639   5741       mrj 
    640  10958       dme /*
    641  10958       dme  * Pass packets to the peer using page flipping.
    642  10958       dme  */
    643   5084   johnlev mblk_t *
    644   5084   johnlev xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
    645   5084   johnlev {
    646   5084   johnlev 	mblk_t *free = mp, *prev = NULL;
    647   5084   johnlev 	size_t len;
    648   5084   johnlev 	gnttab_transfer_t *gop;
    649   5084   johnlev 	boolean_t notify;
    650   5084   johnlev 	RING_IDX loop, prod, end;
    651   5084   johnlev 
    652   5084   johnlev 	/*
    653   5084   johnlev 	 * For each packet the sequence of operations is:
    654   5084   johnlev 	 *
    655   5084   johnlev 	 * 1. get a new page from the hypervisor.
    656   5084   johnlev 	 * 2. get a request slot from the ring.
    657   5084   johnlev 	 * 3. copy the data into the new page.
    658   5084   johnlev 	 * 4. transfer the page to the peer.
    659   5084   johnlev 	 * 5. update the request slot.
    660   5084   johnlev 	 * 6. kick the peer.
    661   5084   johnlev 	 * 7. free mp.
    662   5084   johnlev 	 *
    663   5084   johnlev 	 * In order to reduce the number of hypercalls, we prepare
    664   5084   johnlev 	 * several packets for the peer and perform a single hypercall
    665   5084   johnlev 	 * to transfer them.
    666   5084   johnlev 	 */
    667   5084   johnlev 
    668   7615       Max 	mutex_enter(&xnbp->xnb_rx_lock);
    669   5084   johnlev 
    670   5084   johnlev 	/*
    671   5084   johnlev 	 * If we are not connected to the peer or have not yet
    672   5084   johnlev 	 * finished hotplug it is too early to pass packets to the
    673   5084   johnlev 	 * peer.
    674   5084   johnlev 	 */
    675   5741       mrj 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
    676   7615       Max 		mutex_exit(&xnbp->xnb_rx_lock);
    677   7615       Max 		DTRACE_PROBE(flip_rx_too_early);
    678   7615       Max 		xnbp->xnb_stat_rx_too_early++;
    679   5084   johnlev 		return (mp);
    680   5084   johnlev 	}
    681   5084   johnlev 
    682   5741       mrj 	loop = xnbp->xnb_rx_ring.req_cons;
    683   5741       mrj 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    684   7615       Max 	gop = xnbp->xnb_rx_top;
    685   5084   johnlev 
    686   5084   johnlev 	while ((mp != NULL) &&
    687   5741       mrj 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
    688   5084   johnlev 
    689   5084   johnlev 		mfn_t mfn;
    690   5084   johnlev 		pfn_t pfn;
    691   5084   johnlev 		netif_rx_request_t *rxreq;
    692   5084   johnlev 		netif_rx_response_t *rxresp;
    693   5084   johnlev 		char *valoop;
    694   5084   johnlev 		mblk_t *ml;
    695   5084   johnlev 		uint16_t cksum_flags;
    696   5084   johnlev 
    697   5084   johnlev 		/* 1 */
    698   5084   johnlev 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
    699   7615       Max 			xnbp->xnb_stat_rx_defer++;
    700   5084   johnlev 			break;
    701   5084   johnlev 		}
    702   5084   johnlev 
    703   5084   johnlev 		/* 2 */
    704   5741       mrj 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
    705   5084   johnlev 
    706   5084   johnlev #ifdef XNB_DEBUG
    707   5084   johnlev 		if (!(rxreq->id < NET_RX_RING_SIZE))
    708   5084   johnlev 			cmn_err(CE_PANIC, "xnb_to_peer: "
    709   5084   johnlev 			    "id %d out of range in request 0x%p",
    710   5084   johnlev 			    rxreq->id, (void *)rxreq);
    711   5084   johnlev #endif /* XNB_DEBUG */
    712   5084   johnlev 
    713   5084   johnlev 		/* Assign a pfn and map the new page at the allocated va. */
    714   5084   johnlev 		pfn = xen_assign_pfn(mfn);
    715   7615       Max 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    716   5084   johnlev 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
    717   5084   johnlev 
    718   5084   johnlev 		/* 3 */
    719   5084   johnlev 		len = 0;
    720   8757       dme 		valoop = xnbp->xnb_rx_va;
    721   5084   johnlev 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
    722   5084   johnlev 			size_t chunk = ml->b_wptr - ml->b_rptr;
    723   5084   johnlev 
    724   5084   johnlev 			bcopy(ml->b_rptr, valoop, chunk);
    725   5084   johnlev 			valoop += chunk;
    726   5084   johnlev 			len += chunk;
    727   5084   johnlev 		}
    728   5084   johnlev 
    729   8757       dme 		ASSERT(len < PAGESIZE);
    730   5084   johnlev 
    731   5084   johnlev 		/* Release the pfn. */
    732   7615       Max 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
    733   5084   johnlev 		    HAT_UNLOAD_UNMAP);
    734   5084   johnlev 		xen_release_pfn(pfn);
    735   5084   johnlev 
    736   5084   johnlev 		/* 4 */
    737   5084   johnlev 		gop->mfn = mfn;
    738   5741       mrj 		gop->domid = xnbp->xnb_peer;
    739   5084   johnlev 		gop->ref = rxreq->gref;
    740   5084   johnlev 
    741   5084   johnlev 		/* 5.1 */
    742   5741       mrj 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
    743   8757       dme 		rxresp->offset = 0;
    744   5084   johnlev 		rxresp->flags = 0;
    745   5084   johnlev 
    746   5741       mrj 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
    747   5084   johnlev 		if (cksum_flags != 0)
    748   7615       Max 			xnbp->xnb_stat_rx_cksum_deferred++;
    749   5084   johnlev 		rxresp->flags |= cksum_flags;
    750   5084   johnlev 
    751   5741       mrj 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
    752   5084   johnlev 		rxresp->status = len;
    753   5084   johnlev 
    754   5084   johnlev 		loop++;
    755   5084   johnlev 		prod++;
    756   5084   johnlev 		gop++;
    757   5084   johnlev 		prev = mp;
    758   5084   johnlev 		mp = mp->b_next;
    759   5084   johnlev 	}
    760   5084   johnlev 
    761   5084   johnlev 	/*
    762   5084   johnlev 	 * Did we actually do anything?
    763   5084   johnlev 	 */
    764   5741       mrj 	if (loop == xnbp->xnb_rx_ring.req_cons) {
    765   7615       Max 		mutex_exit(&xnbp->xnb_rx_lock);
    766   5084   johnlev 		return (mp);
    767   5084   johnlev 	}
    768   5084   johnlev 
    769   5084   johnlev 	end = loop;
    770   5084   johnlev 
    771   5084   johnlev 	/*
    772   5084   johnlev 	 * Unlink the end of the 'done' list from the remainder.
    773   5084   johnlev 	 */
    774   5084   johnlev 	ASSERT(prev != NULL);
    775   5084   johnlev 	prev->b_next = NULL;
    776   5084   johnlev 
    777   7615       Max 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
    778   5741       mrj 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
    779   5084   johnlev 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
    780   5084   johnlev 	}
    781   5084   johnlev 
    782   5741       mrj 	loop = xnbp->xnb_rx_ring.req_cons;
    783   5741       mrj 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
    784   7615       Max 	gop = xnbp->xnb_rx_top;
    785   5084   johnlev 
    786   5084   johnlev 	while (loop < end) {
    787   5084   johnlev 		int16_t status = NETIF_RSP_OKAY;
    788   5084   johnlev 
    789   5084   johnlev 		if (gop->status != 0) {
    790   5084   johnlev 			status = NETIF_RSP_ERROR;
    791   5084   johnlev 
    792   5084   johnlev 			/*
    793   5084   johnlev 			 * If the status is anything other than
    794   5084   johnlev 			 * GNTST_bad_page then we don't own the page
    795   5084   johnlev 			 * any more, so don't try to give it back.
    796   5084   johnlev 			 */
    797   5084   johnlev 			if (gop->status != GNTST_bad_page)
    798   5084   johnlev 				gop->mfn = 0;
    799   5084   johnlev 		} else {
    800   5084   johnlev 			/* The page is no longer ours. */
    801   5084   johnlev 			gop->mfn = 0;
    802   5084   johnlev 		}
    803   5084   johnlev 
    804   5084   johnlev 		if (gop->mfn != 0)
    805   5084   johnlev 			/*
    806   5084   johnlev 			 * Give back the page, as we won't be using
    807   5084   johnlev 			 * it.
    808   5084   johnlev 			 */
    809   5084   johnlev 			xnb_free_page(xnbp, gop->mfn);
    810   5084   johnlev 		else
    811   5084   johnlev 			/*
    812   5084   johnlev 			 * We gave away a page, update our accounting
    813   5084   johnlev 			 * now.
    814   5084   johnlev 			 */
    815   5084   johnlev 			balloon_drv_subtracted(1);
    816   5084   johnlev 
    817   5084   johnlev 		/* 5.2 */
    818   5084   johnlev 		if (status != NETIF_RSP_OKAY) {
    819   5741       mrj 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
    820   5084   johnlev 			    status;
    821   5084   johnlev 		} else {
    822   7615       Max 			xnbp->xnb_stat_ipackets++;
    823   7615       Max 			xnbp->xnb_stat_rbytes += len;
    824   5084   johnlev 		}
    825   5084   johnlev 
    826   5084   johnlev 		loop++;
    827   5084   johnlev 		prod++;
    828   5084   johnlev 		gop++;
    829   5084   johnlev 	}
    830   5084   johnlev 
    831   5741       mrj 	xnbp->xnb_rx_ring.req_cons = loop;
    832   5741       mrj 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
    833   5084   johnlev 
    834   5084   johnlev 	/* 6 */
    835   5741       mrj 	/* LINTED: constant in conditional context */
    836   5741       mrj 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
    837   5084   johnlev 	if (notify) {
    838   5741       mrj 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
    839   7615       Max 		xnbp->xnb_stat_rx_notify_sent++;
    840   5084   johnlev 	} else {
    841   7615       Max 		xnbp->xnb_stat_rx_notify_deferred++;
    842   5084   johnlev 	}
    843   5084   johnlev 
    844   5084   johnlev 	if (mp != NULL)
    845   7615       Max 		xnbp->xnb_stat_rx_defer++;
    846   5084   johnlev 
    847   7615       Max 	mutex_exit(&xnbp->xnb_rx_lock);
    848   5084   johnlev 
    849   5084   johnlev 	/* Free mblk_t's that we consumed. */
    850   5741       mrj 	freemsgchain(free);
    851   5741       mrj 
    852   5741       mrj 	return (mp);
    853   5741       mrj }
    854   5741       mrj 
    855  10958       dme /* Helper functions for xnb_copy_to_peer(). */
    856   5741       mrj 
    857   5741       mrj /*
    858   5741       mrj  * Grow the array of copy operation descriptors.
    859   5741       mrj  */
    860  10958       dme static boolean_t
    861  10958       dme grow_cpop_area(xnb_t *xnbp)
    862   5741       mrj {
    863  10958       dme 	size_t count;
    864  10958       dme 	gnttab_copy_t *new;
    865   5741       mrj 
    866   7615       Max 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
    867   5741       mrj 
    868  10958       dme 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
    869   5741       mrj 
    870  10958       dme 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
    871   5741       mrj 		xnbp->xnb_stat_other_allocation_failure++;
    872  10958       dme 		return (B_FALSE);
    873   5741       mrj 	}
    874   5741       mrj 
    875  10958       dme 	bcopy(xnbp->xnb_rx_cpop, new,
    876  10958       dme 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    877   5741       mrj 
    878  10958       dme 	kmem_free(xnbp->xnb_rx_cpop,
    879  10958       dme 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
    880   5741       mrj 
    881  10958       dme 	xnbp->xnb_rx_cpop = new;
    882  10958       dme 	xnbp->xnb_rx_cpop_count = count;
    883   5741       mrj 
    884   7615       Max 	xnbp->xnb_stat_rx_cpoparea_grown++;
    885   5741       mrj 
    886  10958       dme 	return (B_TRUE);
    887   5741       mrj }
    888   5741       mrj 
    889   5741       mrj /*
    890   5741       mrj  * Check whether an address is on a page that's foreign to this domain.
    891   5741       mrj  */
    892   5741       mrj static boolean_t
    893   5741       mrj is_foreign(void *addr)
    894   5741       mrj {
    895  10958       dme 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
    896   5741       mrj 
    897  10958       dme 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
    898   5741       mrj }
    899   5741       mrj 
    900   5741       mrj /*
    901   5741       mrj  * Insert a newly allocated mblk into a chain, replacing the old one.
    902   5741       mrj  */
    903   5741       mrj static mblk_t *
    904   5741       mrj replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
    905   5741       mrj {
    906   5741       mrj 	uint32_t	start, stuff, end, value, flags;
    907   5741       mrj 	mblk_t		*new_mp;
    908   5741       mrj 
    909   5741       mrj 	new_mp = copyb(mp);
    910   5741       mrj 	if (new_mp == NULL)
    911   5741       mrj 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
    912   5741       mrj 		    "for %p, len %lu", (void *) mp, len);
    913   5741       mrj 
    914   5741       mrj 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
    915   5741       mrj 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
    916   5741       mrj 	    flags, KM_NOSLEEP);
    917   5741       mrj 
    918   5741       mrj 	new_mp->b_next = mp->b_next;
    919   5741       mrj 	new_mp->b_prev = mp->b_prev;
    920   5741       mrj 	new_mp->b_cont = mp->b_cont;
    921   5741       mrj 
    922   5741       mrj 	/* Make sure we only overwrite pointers to the mblk being replaced. */
    923   5741       mrj 	if (mp_prev != NULL && mp_prev->b_next == mp)
    924   5741       mrj 		mp_prev->b_next = new_mp;
    925   5741       mrj 
    926   5741       mrj 	if (ml_prev != NULL && ml_prev->b_cont == mp)
    927   5741       mrj 		ml_prev->b_cont = new_mp;
    928   5741       mrj 
    929   5741       mrj 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
    930   5741       mrj 	freemsg(mp);
    931   5741       mrj 
    932   5741       mrj 	return (new_mp);
    933   5741       mrj }
    934   5741       mrj 
    935   5741       mrj /*
    936   5741       mrj  * Set all the fields in a gnttab_copy_t.
    937   5741       mrj  */
    938   5741       mrj static void
    939   5741       mrj setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
    940   5741       mrj     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
    941   5741       mrj {
    942   5741       mrj 	ASSERT(xnbp != NULL && gp != NULL);
    943   5741       mrj 
    944   5741       mrj 	gp->source.offset = s_off;
    945   5741       mrj 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
    946   5741       mrj 	gp->source.domid = DOMID_SELF;
    947   5741       mrj 
    948   5741       mrj 	gp->len = (uint16_t)len;
    949   5741       mrj 	gp->flags = GNTCOPY_dest_gref;
    950   5741       mrj 	gp->status = 0;
    951   5741       mrj 
    952   5741       mrj 	gp->dest.u.ref = d_ref;
    953   5741       mrj 	gp->dest.offset = d_off;
    954   5741       mrj 	gp->dest.domid = xnbp->xnb_peer;
    955   5741       mrj }
    956   5741       mrj 
    957  10958       dme /*
    958  10958       dme  * Pass packets to the peer using hypervisor copy operations.
    959  10958       dme  */
    960   5741       mrj mblk_t *
    961   5741       mrj xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
    962   5741       mrj {
    963   5741       mrj 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
    964   5741       mrj 	mblk_t		*ml, *ml_prev;
    965   5741       mrj 	boolean_t	notify;
    966   5741       mrj 	RING_IDX	loop, prod;
    967   5741       mrj 	int		i;
    968   5741       mrj 
    969  10958       dme 	/*
    970  10958       dme 	 * If the peer does not pre-post buffers for received packets,
    971  10958       dme 	 * use page flipping to pass packets to it.
    972  10958       dme 	 */
    973  10958       dme 	if (!xnbp->xnb_rx_hv_copy)
    974   5741       mrj 		return (xnb_to_peer(xnbp, mp));
    975   5741       mrj 
    976   5741       mrj 	/*
    977   5741       mrj 	 * For each packet the sequence of operations is:
    978   5741       mrj 	 *
    979   5741       mrj 	 *  1. get a request slot from the ring.
    980   5741       mrj 	 *  2. set up data for hypercall (see NOTE below)
    981   5741       mrj 	 *  3. have the hypervisore copy the data
    982   5741       mrj 	 *  4. update the request slot.
    983   5741       mrj 	 *  5. kick the peer.
    984   5741       mrj 	 *
    985   5741       mrj 	 * NOTE ad 2.
    986   5741       mrj 	 *  In order to reduce the number of hypercalls, we prepare
    987  10958       dme 	 *  several mblks (mp->b_cont != NULL) for the peer and
    988  10958       dme 	 *  perform a single hypercall to transfer them.  We also have
    989  10958       dme 	 *  to set up a seperate copy operation for every page.
    990   5741       mrj 	 *
    991  10958       dme 	 * If we have more than one packet (mp->b_next != NULL), we do
    992  10958       dme 	 * this whole dance repeatedly.
    993   5741       mrj 	 */
    994   5741       mrj 
    995   7615       Max 	mutex_enter(&xnbp->xnb_rx_lock);
    996   5741       mrj 
    997   5741       mrj 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
    998   7615       Max 		mutex_exit(&xnbp->xnb_rx_lock);
    999   7615       Max 		DTRACE_PROBE(copy_rx_too_early);
   1000   7615       Max 		xnbp->xnb_stat_rx_too_early++;
   1001   5741       mrj 		return (mp);
   1002   5741       mrj 	}
   1003   5741       mrj 
   1004   5741       mrj 	loop = xnbp->xnb_rx_ring.req_cons;
   1005   5741       mrj 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
   1006   5741       mrj 
   1007   5741       mrj 	while ((mp != NULL) &&
   1008   5741       mrj 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
   1009   5741       mrj 		netif_rx_request_t	*rxreq;
   1010  10958       dme 		size_t			d_offset, len;
   1011  10958       dme 		int			item_count;
   1012  10958       dme 		gnttab_copy_t		*gop_cp;
   1013   5741       mrj 		netif_rx_response_t	*rxresp;
   1014   5741       mrj 		uint16_t		cksum_flags;
   1015   5741       mrj 		int16_t			status = NETIF_RSP_OKAY;
   1016   5741       mrj 
   1017   5741       mrj 		/* 1 */
   1018   5741       mrj 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
   1019   5741       mrj 
   1020   5741       mrj #ifdef XNB_DEBUG
   1021   5741       mrj 		if (!(rxreq->id < NET_RX_RING_SIZE))
   1022   5741       mrj 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
   1023   5741       mrj 			    "id %d out of range in request 0x%p",
   1024   5741       mrj 			    rxreq->id, (void *)rxreq);
   1025   5741       mrj #endif /* XNB_DEBUG */
   1026   5741       mrj 
   1027   5741       mrj 		/* 2 */
   1028   8757       dme 		d_offset = 0;
   1029   5741       mrj 		len = 0;
   1030   5741       mrj 		item_count = 0;
   1031   5741       mrj 
   1032   7615       Max 		gop_cp = xnbp->xnb_rx_cpop;
   1033   5741       mrj 
   1034   5741       mrj 		/*
   1035  10958       dme 		 * We walk the b_cont pointers and set up a
   1036  10958       dme 		 * gnttab_copy_t for each sub-page chunk in each data
   1037  10958       dme 		 * block.
   1038   5741       mrj 		 */
   1039   5741       mrj 		/* 2a */
   1040   5741       mrj 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
   1041   5741       mrj 			size_t	chunk = ml->b_wptr - ml->b_rptr;
   1042   5741       mrj 			uchar_t	*r_tmp,	*rpt_align;
   1043   5741       mrj 			size_t	r_offset;
   1044   5741       mrj 
   1045   5741       mrj 			/*
   1046  10958       dme 			 * The hypervisor will not allow us to
   1047  10958       dme 			 * reference a foreign page (e.g. one
   1048  10958       dme 			 * belonging to another domain) by mfn in the
   1049  10958       dme 			 * copy operation. If the data in this mblk is
   1050  10958       dme 			 * on such a page we must copy the data into a
   1051  10958       dme 			 * local page before initiating the hypervisor
   1052  10958       dme 			 * copy operation.
   1053   5741       mrj 			 */
   1054   5741       mrj 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
   1055   5741       mrj 				mblk_t *ml_new = replace_msg(ml, chunk,
   1056   5741       mrj 				    mp_prev, ml_prev);
   1057   5741       mrj 
   1058   5741       mrj 				/* We can still use old ml, but not *ml! */
   1059   5741       mrj 				if (free == ml)
   1060   5741       mrj 					free = ml_new;
   1061   5741       mrj 				if (mp == ml)
   1062   5741       mrj 					mp = ml_new;
   1063   5741       mrj 				ml = ml_new;
   1064   5741       mrj 
   1065   7615       Max 				xnbp->xnb_stat_rx_foreign_page++;
   1066   5741       mrj 			}
   1067   5741       mrj 
   1068   5741       mrj 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
   1069   5741       mrj 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
   1070   5741       mrj 			r_tmp = ml->b_rptr;
   1071   5741       mrj 
   1072   5741       mrj 			if (d_offset + chunk > PAGESIZE)
   1073   5741       mrj 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
   1074   5741       mrj 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
   1075   5741       mrj 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
   1076   5741       mrj 				    (void *)mp, (void *)saved_mp, (void *)ml,
   1077   5741       mrj 				    (void *)rpt_align,
   1078   5741       mrj 				    d_offset, chunk, (int)PAGESIZE);
   1079   5741       mrj 
   1080   5741       mrj 			while (chunk > 0) {
   1081   5741       mrj 				size_t part_len;
   1082   5741       mrj 
   1083  10958       dme 				if (item_count == xnbp->xnb_rx_cpop_count) {
   1084  10958       dme 					if (!grow_cpop_area(xnbp))
   1085   5741       mrj 						goto failure;
   1086  10958       dme 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
   1087   5741       mrj 				}
   1088   5741       mrj 				/*
   1089   5741       mrj 				 * If our mblk crosses a page boundary, we need
   1090  10958       dme 				 * to do a seperate copy for each page.
   1091   5741       mrj 				 */
   1092   5741       mrj 				if (r_offset + chunk > PAGESIZE) {
   1093   5741       mrj 					part_len = PAGESIZE - r_offset;
   1094   5741       mrj 
   1095   5741       mrj 					DTRACE_PROBE3(mblk_page_crossed,
   1096   5741       mrj 					    (mblk_t *), ml, int, chunk, int,
   1097   5741       mrj 					    (int)r_offset);
   1098   5741       mrj 
   1099   7615       Max 					xnbp->xnb_stat_rx_pagebndry_crossed++;
   1100   5741       mrj 				} else {
   1101   5741       mrj 					part_len = chunk;
   1102   5741       mrj 				}
   1103   5741       mrj 
   1104   5741       mrj 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
   1105   5741       mrj 				    d_offset, part_len, rxreq->gref);
   1106   5741       mrj 
   1107   5741       mrj 				chunk -= part_len;
   1108   5741       mrj 
   1109   5741       mrj 				len += part_len;
   1110   5741       mrj 				d_offset += part_len;
   1111   5741       mrj 				r_tmp += part_len;
   1112   5741       mrj 				/*
   1113   5741       mrj 				 * The 2nd, 3rd ... last copies will always
   1114   5741       mrj 				 * start at r_tmp, therefore r_offset is 0.
   1115   5741       mrj 				 */
   1116   5741       mrj 				r_offset = 0;
   1117   5741       mrj 				gop_cp++;
   1118  10958       dme 				item_count++;
   1119   5741       mrj 			}
   1120   5741       mrj 			ml_prev = ml;
   1121  10958       dme 
   1122   5741       mrj 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
   1123   5741       mrj 			    chunk, int, len, int, item_count);
   1124   5741       mrj 		}
   1125   5741       mrj 		/* 3 */
   1126   7615       Max 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
   1127   5741       mrj 		    item_count) != 0) {
   1128   5741       mrj 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
   1129   5741       mrj 			DTRACE_PROBE(HV_granttableopfailed);
   1130   5741       mrj 		}
   1131   5741       mrj 
   1132   5741       mrj 		/* 4 */
   1133   5741       mrj 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
   1134   8757       dme 		rxresp->offset = 0;
   1135   5741       mrj 
   1136   5741       mrj 		rxresp->flags = 0;
   1137   5741       mrj 
   1138   5741       mrj 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
   1139   5741       mrj 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1140   5741       mrj 		    (int)rxresp->status);
   1141   5741       mrj 
   1142   5741       mrj 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
   1143   5741       mrj 		if (cksum_flags != 0)
   1144   7615       Max 			xnbp->xnb_stat_rx_cksum_deferred++;
   1145   5741       mrj 		rxresp->flags |= cksum_flags;
   1146   5741       mrj 
   1147   5741       mrj 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
   1148   5741       mrj 		rxresp->status = len;
   1149   5741       mrj 
   1150   5741       mrj 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
   1151   5741       mrj 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
   1152   5741       mrj 		    (int)rxresp->status);
   1153   5741       mrj 
   1154   5741       mrj 		for (i = 0; i < item_count; i++) {
   1155   7615       Max 			if (xnbp->xnb_rx_cpop[i].status != 0) {
   1156  10958       dme 				DTRACE_PROBE2(cpop_status_nonnull, int,
   1157   7615       Max 				    (int)xnbp->xnb_rx_cpop[i].status,
   1158   5741       mrj 				    int, i);
   1159   5741       mrj 				status = NETIF_RSP_ERROR;
   1160   5741       mrj 			}
   1161   5741       mrj 		}
   1162   5741       mrj 
   1163   5741       mrj 		/* 5.2 */
   1164   5741       mrj 		if (status != NETIF_RSP_OKAY) {
   1165   5741       mrj 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
   1166   5741       mrj 			    status;
   1167   7615       Max 			xnbp->xnb_stat_rx_rsp_notok++;
   1168   5741       mrj 		} else {
   1169   7615       Max 			xnbp->xnb_stat_ipackets++;
   1170   7615       Max 			xnbp->xnb_stat_rbytes += len;
   1171   5741       mrj 		}
   1172   5741       mrj 
   1173   5741       mrj 		loop++;
   1174   5741       mrj 		prod++;
   1175   5741       mrj 		mp_prev = mp;
   1176   5741       mrj 		mp = mp->b_next;
   1177   5741       mrj 	}
   1178   5741       mrj failure:
   1179   5741       mrj 	/*
   1180   5741       mrj 	 * Did we actually do anything?
   1181   5741       mrj 	 */
   1182   5741       mrj 	if (loop == xnbp->xnb_rx_ring.req_cons) {
   1183   7615       Max 		mutex_exit(&xnbp->xnb_rx_lock);
   1184   5741       mrj 		return (mp);
   1185   5741       mrj 	}
   1186   5741       mrj 
   1187   5741       mrj 	/*
   1188   5741       mrj 	 * Unlink the end of the 'done' list from the remainder.
   1189   5741       mrj 	 */
   1190   5741       mrj 	ASSERT(mp_prev != NULL);
   1191   5741       mrj 	mp_prev->b_next = NULL;
   1192   5741       mrj 
   1193   5741       mrj 	xnbp->xnb_rx_ring.req_cons = loop;
   1194   5741       mrj 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
   1195   5741       mrj 
   1196   5741       mrj 	/* 6 */
   1197   5741       mrj 	/* LINTED: constant in conditional context */
   1198   5741       mrj 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
   1199   5741       mrj 	if (notify) {
   1200   5741       mrj 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1201   7615       Max 		xnbp->xnb_stat_rx_notify_sent++;
   1202   5741       mrj 	} else {
   1203   7615       Max 		xnbp->xnb_stat_rx_notify_deferred++;
   1204   5741       mrj 	}
   1205   5741       mrj 
   1206   5741       mrj 	if (mp != NULL)
   1207   7615       Max 		xnbp->xnb_stat_rx_defer++;
   1208   5741       mrj 
   1209   7615       Max 	mutex_exit(&xnbp->xnb_rx_lock);
   1210   5741       mrj 
   1211   5741       mrj 	/* Free mblk_t structs we have consumed. */
   1212   5084   johnlev 	freemsgchain(free);
   1213   5084   johnlev 
   1214   5084   johnlev 	return (mp);
   1215   5084   johnlev }
   1216   5084   johnlev 
   1217   5084   johnlev 
   1218   5084   johnlev static void
   1219  10958       dme xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
   1220   5084   johnlev {
   1221   5084   johnlev 	boolean_t notify;
   1222   5084   johnlev 
   1223   7615       Max 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1224   5084   johnlev 
   1225   5741       mrj 	/* LINTED: constant in conditional context */
   1226   5741       mrj 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
   1227  10958       dme 	if (notify || force) {
   1228   5741       mrj 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
   1229   7615       Max 		xnbp->xnb_stat_tx_notify_sent++;
   1230   5084   johnlev 	} else {
   1231   7615       Max 		xnbp->xnb_stat_tx_notify_deferred++;
   1232   5084   johnlev 	}
   1233   5084   johnlev }
   1234   5084   johnlev 
   1235   5084   johnlev static void
   1236   7615       Max xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
   1237   5084   johnlev {
   1238   5084   johnlev 	RING_IDX i;
   1239   5084   johnlev 	netif_tx_response_t *txresp;
   1240   5084   johnlev 
   1241   7615       Max 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1242   5084   johnlev 
   1243   5741       mrj 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
   1244   5084   johnlev 
   1245   5741       mrj 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
   1246   5084   johnlev 	txresp->id = id;
   1247   5084   johnlev 	txresp->status = status;
   1248   5084   johnlev 
   1249   5741       mrj 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
   1250   5084   johnlev 
   1251   5084   johnlev 	/*
   1252   5084   johnlev 	 * Note that we don't push the change to the peer here - that
   1253   5084   johnlev 	 * is the callers responsibility.
   1254   5084   johnlev 	 */
   1255   5084   johnlev }
   1256   5084   johnlev 
   1257   5084   johnlev static void
   1258  10958       dme xnb_txbuf_recycle(xnb_txbuf_t *txp)
   1259   5084   johnlev {
   1260  10958       dme 	xnb_t *xnbp = txp->xt_xnbp;
   1261   5084   johnlev 
   1262  10958       dme 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1263   5084   johnlev 
   1264  10958       dme 	xnbp->xnb_tx_buf_outstanding--;
   1265  10958       dme }
   1266   5084   johnlev 
   1267  10958       dme static int
   1268  10958       dme xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
   1269  10958       dme {
   1270  10958       dme 	_NOTE(ARGUNUSED(kmflag));
   1271  10958       dme 	xnb_txbuf_t *txp = buf;
   1272  10958       dme 	xnb_t *xnbp = arg;
   1273  10958       dme 	size_t len;
   1274  10958       dme 	ddi_dma_cookie_t dma_cookie;
   1275  10958       dme 	uint_t ncookies;
   1276   5741       mrj 
   1277  10958       dme 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
   1278  10958       dme 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
   1279  10958       dme 	txp->xt_xnbp = xnbp;
   1280  10958       dme 	txp->xt_next = NULL;
   1281   5084   johnlev 
   1282  10958       dme 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
   1283  10958       dme 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
   1284  10958       dme 		goto failure;
   1285   5084   johnlev 
   1286  10958       dme 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
   1287  10958       dme 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
   1288  10958       dme 	    &txp->xt_acc_handle) != DDI_SUCCESS)
   1289  10958       dme 		goto failure_1;
   1290   5741       mrj 
   1291  10958       dme 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
   1292  10958       dme 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
   1293  10958       dme 	    &dma_cookie, &ncookies)
   1294  10958       dme 	    != DDI_DMA_MAPPED)
   1295  10958       dme 		goto failure_2;
   1296  10958       dme 	ASSERT(ncookies == 1);
   1297  10958       dme 
   1298  10958       dme 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
   1299  10958       dme 	txp->xt_buflen = dma_cookie.dmac_size;
   1300  10958       dme 
   1301  10958       dme 	DTRACE_PROBE(txbuf_allocated);
   1302  10958       dme 
   1303  10958       dme 	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
   1304  10958       dme 	xnbp->xnb_tx_buf_outstanding++;
   1305  10958       dme 
   1306  10958       dme 	return (0);
   1307  10958       dme 
   1308  10958       dme failure_2:
   1309  10958       dme 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1310  10958       dme 
   1311  10958       dme failure_1:
   1312  10958       dme 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1313  10958       dme 
   1314  10958       dme failure:
   1315  10958       dme 
   1316  10958       dme 	return (-1);
   1317  10958       dme }
   1318  10958       dme 
   1319  10958       dme static void
   1320  10958       dme xnb_txbuf_destructor(void *buf, void *arg)
   1321  10958       dme {
   1322  10958       dme 	xnb_txbuf_t *txp = buf;
   1323  10958       dme 	xnb_t *xnbp = arg;
   1324  10958       dme 
   1325  10958       dme 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
   1326  10958       dme 	ddi_dma_mem_free(&txp->xt_acc_handle);
   1327  10958       dme 	ddi_dma_free_handle(&txp->xt_dma_handle);
   1328  10958       dme 
   1329  10958       dme 	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
   1330   5084   johnlev }
   1331   5084   johnlev 
   1332   5741       mrj /*
   1333  10958       dme  * Take packets from the peer and deliver them onward.
   1334   5741       mrj  */
   1335   5084   johnlev static mblk_t *
   1336   7615       Max xnb_from_peer(xnb_t *xnbp)
   1337   5084   johnlev {
   1338   5084   johnlev 	RING_IDX start, end, loop;
   1339  10958       dme 	gnttab_copy_t *cop;
   1340   7615       Max 	xnb_txbuf_t **txpp;
   1341   5084   johnlev 	netif_tx_request_t *txreq;
   1342  10958       dme 	boolean_t work_to_do, need_notify = B_FALSE;
   1343   5084   johnlev 	mblk_t *head, *tail;
   1344  10958       dme 	int n_data_req, i;
   1345   5084   johnlev 
   1346  10958       dme 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
   1347   5084   johnlev 
   1348   5084   johnlev 	head = tail = NULL;
   1349   5084   johnlev around:
   1350   5084   johnlev 
   1351   5741       mrj 	/* LINTED: constant in conditional context */
   1352   5741       mrj 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
   1353   5084   johnlev 	if (!work_to_do) {
   1354   5084   johnlev finished:
   1355  10958       dme 		xnb_tx_notify_peer(xnbp, need_notify);
   1356  10958       dme 
   1357   5084   johnlev 		return (head);
   1358   5084   johnlev 	}
   1359   5084   johnlev 
   1360   5741       mrj 	start = xnbp->xnb_tx_ring.req_cons;
   1361   5741       mrj 	end = xnbp->xnb_tx_ring.sring->req_prod;
   1362   5084   johnlev 
   1363   7676       dme 	if ((end - start) > NET_TX_RING_SIZE) {
   1364   7676       dme 		/*
   1365   7676       dme 		 * This usually indicates that the frontend driver is
   1366   7676       dme 		 * misbehaving, as it's not possible to have more than
   1367   7676       dme 		 * NET_TX_RING_SIZE ring elements in play at any one
   1368   7676       dme 		 * time.
   1369   7676       dme 		 *
   1370   7676       dme 		 * We reset the ring pointers to the state declared by
   1371   7676       dme 		 * the frontend and try to carry on.
   1372   7676       dme 		 */
   1373   7676       dme 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
   1374   7676       dme 		    "items in the ring, resetting and trying to recover.",
   1375   7676       dme 		    xnbp->xnb_peer, (end - start));
   1376   7676       dme 
   1377   7676       dme 		/* LINTED: constant in conditional context */
   1378   7676       dme 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
   1379   7676       dme 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1380   7676       dme 
   1381   7676       dme 		goto around;
   1382   7676       dme 	}
   1383   7676       dme 
   1384  10958       dme 	loop = start;
   1385  10958       dme 	cop = xnbp->xnb_tx_cop;
   1386  10958       dme 	txpp = xnbp->xnb_tx_bufp;
   1387  10958       dme 	n_data_req = 0;
   1388   5084   johnlev 
   1389  10958       dme 	while (loop < end) {
   1390  10958       dme 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1391   5084   johnlev 
   1392  10958       dme 		if (txreq->flags & NETTXF_extra_info) {
   1393  10958       dme 			struct netif_extra_info *erp;
   1394  10958       dme 			boolean_t status;
   1395   5084   johnlev 
   1396  10958       dme 			loop++; /* Consume another slot in the ring. */
   1397  10958       dme 			ASSERT(loop <= end);
   1398   5084   johnlev 
   1399  10958       dme 			erp = (struct netif_extra_info *)
   1400  10958       dme 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
   1401  10958       dme 
   1402  10958       dme 			switch (erp->type) {
   1403  10958       dme 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
   1404  10958       dme 				ASSERT(xnbp->xnb_multicast_control);
   1405  10958       dme 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
   1406  10958       dme 				    &erp->u.mcast.addr);
   1407  10958       dme 				break;
   1408  10958       dme 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
   1409  10958       dme 				ASSERT(xnbp->xnb_multicast_control);
   1410  10958       dme 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
   1411  10958       dme 				    &erp->u.mcast.addr);
   1412  10958       dme 				break;
   1413  10958       dme 			default:
   1414  10958       dme 				status = B_FALSE;
   1415  10958       dme 				cmn_err(CE_WARN, "xnb_from_peer: "
   1416  10958       dme 				    "unknown extra type %d", erp->type);
   1417  10958       dme 				break;
   1418  10958       dme 			}
   1419  10958       dme 
   1420  10958       dme 			xnb_tx_mark_complete(xnbp, txreq->id,
   1421  10958       dme 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
   1422  10958       dme 			need_notify = B_TRUE;
   1423  10958       dme 		} else {
   1424  10958       dme 			xnb_txbuf_t *txp;
   1425  10958       dme 
   1426  10958       dme 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
   1427  10958       dme 			    KM_NOSLEEP);
   1428  10958       dme 			if (txp == NULL)
   1429  10958       dme 				break;
   1430  10958       dme 
   1431  10958       dme 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
   1432  10958       dme 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
   1433  10958       dme 			if (txp->xt_mblk == NULL) {
   1434  10958       dme 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
   1435  10958       dme 				break;
   1436  10958       dme 			}
   1437  10958       dme 
   1438  10958       dme 			txp->xt_idx = loop;
   1439  10958       dme 			txp->xt_id = txreq->id;
   1440  10958       dme 
   1441  10958       dme 			cop->source.u.ref = txreq->gref;
   1442  10958       dme 			cop->source.domid = xnbp->xnb_peer;
   1443  10958       dme 			cop->source.offset = txreq->offset;
   1444  10958       dme 
   1445  10958       dme 			cop->dest.u.gmfn = txp->xt_mfn;
   1446  10958       dme 			cop->dest.domid = DOMID_SELF;
   1447  10958       dme 			cop->dest.offset = 0;
   1448  10958       dme 
   1449  10958       dme 			cop->len = txreq->size;
   1450  10958       dme 			cop->flags = GNTCOPY_source_gref;
   1451  10958       dme 			cop->status = 0;
   1452  10958       dme 
   1453  10958       dme 			*txpp = txp;
   1454  10958       dme 
   1455  10958       dme 			txpp++;
   1456  10958       dme 			cop++;
   1457  10958       dme 			n_data_req++;
   1458  10958       dme 
   1459  10958       dme 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
   1460  10958       dme 		}
   1461  10958       dme 
   1462  10958       dme 		loop++;
   1463   5084   johnlev 	}
   1464   5084   johnlev 
   1465  10958       dme 	xnbp->xnb_tx_ring.req_cons = loop;
   1466   5084   johnlev 
   1467  10958       dme 	if (n_data_req == 0)
   1468  10958       dme 		goto around;
   1469   5084   johnlev 
   1470  10958       dme 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
   1471  10958       dme 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
   1472   5084   johnlev 
   1473  10958       dme 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
   1474   5084   johnlev 
   1475   7615       Max 		txpp = xnbp->xnb_tx_bufp;
   1476  10958       dme 		i = n_data_req;
   1477  10958       dme 		while (i > 0) {
   1478  10958       dme 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
   1479   7615       Max 			txpp++;
   1480  10958       dme 			i--;
   1481   5084   johnlev 		}
   1482   5084   johnlev 
   1483   5084   johnlev 		goto finished;
   1484   5084   johnlev 	}
   1485   5084   johnlev 
   1486  10958       dme 	txpp = xnbp->xnb_tx_bufp;
   1487  10958       dme 	cop = xnbp->xnb_tx_cop;
   1488  10958       dme 	i = n_data_req;
   1489  10958       dme 
   1490  10958       dme 	while (i > 0) {
   1491   7615       Max 		xnb_txbuf_t *txp = *txpp;
   1492   5084   johnlev 
   1493  10958       dme 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
   1494  10958       dme 
   1495  10958       dme 		if (cop->status != 0) {
   1496  10958       dme #ifdef XNB_DEBUG
   1497   7615       Max 			cmn_err(CE_WARN, "xnb_from_peer: "
   1498  10958       dme 			    "txpp 0x%p failed (%d)",
   1499  10958       dme 			    (void *)*txpp, cop->status);
   1500  10958       dme #endif /* XNB_DEBUG */
   1501  10958       dme 			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
   1502  10958       dme 			freemsg(txp->xt_mblk);
   1503  10958       dme 		} else {
   1504  10958       dme 			mblk_t *mp;
   1505   5084   johnlev 
   1506  10958       dme 			mp = txp->xt_mblk;
   1507  10958       dme 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
   1508  10958       dme 			mp->b_wptr += txreq->size;
   1509  10958       dme 			mp->b_next = NULL;
   1510   5084   johnlev 
   1511   5084   johnlev 			/*
   1512  10958       dme 			 * If there are checksum flags, process them
   1513  10958       dme 			 * appropriately.
   1514   5084   johnlev 			 */
   1515  10958       dme 			if ((txreq->flags &
   1516   5084   johnlev 			    (NETTXF_csum_blank | NETTXF_data_validated))
   1517  10958       dme 			    != 0) {
   1518   5741       mrj 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
   1519   5084   johnlev 				    mp, txreq->flags);
   1520   7615       Max 				xnbp->xnb_stat_tx_cksum_no_need++;
   1521  10958       dme 
   1522  10958       dme 				txp->xt_mblk = mp;
   1523   5084   johnlev 			}
   1524   5084   johnlev 
   1525   5084   johnlev 			if (head == NULL) {
   1526   5084   johnlev 				ASSERT(tail == NULL);
   1527   5084   johnlev 				head = mp;
   1528   5084   johnlev 			} else {
   1529   5084   johnlev 				ASSERT(tail != NULL);
   1530   5084   johnlev 				tail->b_next = mp;
   1531   5084   johnlev 			}
   1532   5084   johnlev 			tail = mp;
   1533  10958       dme 
   1534  10958       dme 			xnbp->xnb_stat_opackets++;
   1535  10958       dme 			xnbp->xnb_stat_obytes += txreq->size;
   1536  10958       dme 
   1537  10958       dme 			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
   1538   5084   johnlev 		}
   1539  10958       dme 
   1540  10958       dme 		txpp++;
   1541  10958       dme 		cop++;
   1542  10958       dme 		i--;
   1543   5084   johnlev 	}
   1544   5084   johnlev 
   1545   5084   johnlev 	goto around;
   1546   5084   johnlev 	/* NOTREACHED */
   1547   5084   johnlev }
   1548   5084   johnlev 
   1549   5084   johnlev static uint_t
   1550   5084   johnlev xnb_intr(caddr_t arg)
   1551   5084   johnlev {
   1552   5084   johnlev 	xnb_t *xnbp = (xnb_t *)arg;
   1553   5084   johnlev 	mblk_t *mp;
   1554   5084   johnlev 
   1555   5741       mrj 	xnbp->xnb_stat_intr++;
   1556   5084   johnlev 
   1557   7615       Max 	mutex_enter(&xnbp->xnb_tx_lock);
   1558   5084   johnlev 
   1559   5741       mrj 	ASSERT(xnbp->xnb_connected);
   1560   5084   johnlev 
   1561   7615       Max 	mp = xnb_from_peer(xnbp);
   1562   5084   johnlev 
   1563   7615       Max 	mutex_exit(&xnbp->xnb_tx_lock);
   1564   5084   johnlev 
   1565   5741       mrj 	if (!xnbp->xnb_hotplugged) {
   1566   7615       Max 		xnbp->xnb_stat_tx_too_early++;
   1567   5084   johnlev 		goto fail;
   1568   5084   johnlev 	}
   1569   5084   johnlev 	if (mp == NULL) {
   1570   5741       mrj 		xnbp->xnb_stat_spurious_intr++;
   1571   5084   johnlev 		goto fail;
   1572   5084   johnlev 	}
   1573   5084   johnlev 
   1574   7615       Max 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
   1575   5084   johnlev 
   1576   5084   johnlev 	return (DDI_INTR_CLAIMED);
   1577   5084   johnlev 
   1578   5084   johnlev fail:
   1579   5084   johnlev 	freemsgchain(mp);
   1580   5084   johnlev 	return (DDI_INTR_CLAIMED);
   1581   5084   johnlev }
   1582   5084   johnlev 
   1583  10958       dme /*
   1584  10958       dme  * Read our configuration from xenstore.
   1585  10958       dme  */
   1586  10958       dme boolean_t
   1587  10958       dme xnb_read_xs_config(xnb_t *xnbp)
   1588  10958       dme {
   1589  10958       dme 	char *xsname;
   1590  10958       dme 	char mac[ETHERADDRL * 3];
   1591  10958       dme 
   1592  10958       dme 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
   1593  10958       dme 
   1594  10958       dme 	if (xenbus_scanf(XBT_NULL, xsname,
   1595  10958       dme 	    "mac", "%s", mac) != 0) {
   1596  10958       dme 		cmn_err(CE_WARN, "xnb_attach: "
   1597  10958       dme 		    "cannot read mac address from %s",
   1598  10958       dme 		    xsname);
   1599  10958       dme 		return (B_FALSE);
   1600  10958       dme 	}
   1601  10958       dme 
   1602  10958       dme 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
   1603  10958       dme 		cmn_err(CE_WARN,
   1604  10958       dme 		    "xnb_attach: cannot parse mac address %s",
   1605  10958       dme 		    mac);
   1606  10958       dme 		return (B_FALSE);
   1607  10958       dme 	}
   1608  10958       dme 
   1609  10958       dme 	return (B_TRUE);
   1610  10958       dme }
   1611  10958       dme 
   1612  10958       dme /*
   1613  10958       dme  * Read the configuration of the peer from xenstore.
   1614  10958       dme  */
   1615  10958       dme boolean_t
   1616  10958       dme xnb_read_oe_config(xnb_t *xnbp)
   1617  10958       dme {
   1618  10958       dme 	char *oename;
   1619  10958       dme 	int i;
   1620  10958       dme 
   1621  10958       dme 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
   1622  10958       dme 
   1623  10958       dme 	if (xenbus_gather(XBT_NULL, oename,
   1624  10958       dme 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
   1625  10958       dme 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
   1626  10958       dme 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
   1627  10958       dme 	    NULL) != 0) {
   1628  10958       dme 		cmn_err(CE_WARN, "xnb_read_oe_config: "
   1629  10958       dme 		    "cannot read other-end details from %s",
   1630  10958       dme 		    oename);
   1631  10958       dme 		return (B_FALSE);
   1632  10958       dme 	}
   1633  10958       dme 
   1634  10958       dme 	/*
   1635  10958       dme 	 * Check whether our peer requests receive side hypervisor
   1636  10958       dme 	 * copy.
   1637  10958       dme 	 */
   1638  10958       dme 	if (xenbus_scanf(XBT_NULL, oename,
   1639  10958       dme 	    "request-rx-copy", "%d", &i) != 0)
   1640  10958       dme 		i = 0;
   1641  10958       dme 	if (i != 0)
   1642  10958       dme 		xnbp->xnb_rx_hv_copy = B_TRUE;
   1643  10958       dme 
   1644  10958       dme 	/*
   1645  10958       dme 	 * Check whether our peer requests multicast_control.
   1646  10958       dme 	 */
   1647  10958       dme 	if (xenbus_scanf(XBT_NULL, oename,
   1648  10958       dme 	    "request-multicast-control", "%d", &i) != 0)
   1649  10958       dme 		i = 0;
   1650  10958       dme 	if (i != 0)
   1651  10958       dme 		xnbp->xnb_multicast_control = B_TRUE;
   1652  10958       dme 
   1653  10958       dme 	/*
   1654  10958       dme 	 * The Linux backend driver here checks to see if the peer has
   1655  10958       dme 	 * set 'feature-no-csum-offload'. This is used to indicate
   1656  10958       dme 	 * that the guest cannot handle receiving packets without a
   1657  10958       dme 	 * valid checksum. We don't check here, because packets passed
   1658  10958       dme 	 * to the peer _always_ have a valid checksum.
   1659  10958       dme 	 *
   1660  10958       dme 	 * There are three cases:
   1661  10958       dme 	 *
   1662  10958       dme 	 * - the NIC is dedicated: packets from the wire should always
   1663  10958       dme 	 *   have a valid checksum. If the hardware validates the
   1664  10958       dme 	 *   checksum then the relevant bit will be set in the packet
   1665  10958       dme 	 *   attributes and we will inform the peer. It can choose to
   1666  10958       dme 	 *   ignore the hardware verification.
   1667  10958       dme 	 *
   1668  10958       dme 	 * - the NIC is shared (VNIC) and a packet originates from the
   1669  10958       dme 	 *   wire: this is the same as the case above - the packets
   1670  10958       dme 	 *   will have a valid checksum.
   1671  10958       dme 	 *
   1672  10958       dme 	 * - the NIC is shared (VNIC) and a packet originates from the
   1673  10958       dme 	 *   host: the MAC layer ensures that all such packets have a
   1674  10958       dme 	 *   valid checksum by calculating one if the stack did not.
   1675  10958       dme 	 */
   1676  10958       dme 
   1677  10958       dme 	return (B_TRUE);
   1678  10958       dme }
   1679  10958       dme 
   1680  10958       dme void
   1681  10958       dme xnb_start_connect(xnb_t *xnbp)
   1682  10958       dme {
   1683  10958       dme 	dev_info_t  *dip = xnbp->xnb_devinfo;
   1684  10958       dme 
   1685  10958       dme 	if (!xnb_connect_rings(dip)) {
   1686  10958       dme 		cmn_err(CE_WARN, "xnb_start_connect: "
   1687  10958       dme 		    "cannot connect rings");
   1688  10958       dme 		goto failed;
   1689  10958       dme 	}
   1690  10958       dme 
   1691  10958       dme 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
   1692  10958       dme 		cmn_err(CE_WARN, "xnb_start_connect: "
   1693  10958       dme 		    "flavour failed to connect");
   1694  10958       dme 		goto failed;
   1695  10958       dme 	}
   1696  10958       dme 
   1697  10958       dme 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
   1698  10958       dme 	return;
   1699  10958       dme 
   1700  10958       dme failed:
   1701  10958       dme 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1702  10958       dme 	xnb_disconnect_rings(dip);
   1703  10958       dme 	(void) xvdi_switch_state(dip, XBT_NULL,
   1704  10958       dme 	    XenbusStateClosed);
   1705  10958       dme 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1706  10958       dme }
   1707  10958       dme 
   1708   5084   johnlev static boolean_t
   1709   5084   johnlev xnb_connect_rings(dev_info_t *dip)
   1710   5084   johnlev {
   1711   5084   johnlev 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1712   5084   johnlev 	struct gnttab_map_grant_ref map_op;
   1713   5084   johnlev 
   1714   5084   johnlev 	/*
   1715   5084   johnlev 	 * Cannot attempt to connect the rings if already connected.
   1716   5084   johnlev 	 */
   1717   5741       mrj 	ASSERT(!xnbp->xnb_connected);
   1718   5084   johnlev 
   1719   5084   johnlev 	/*
   1720   5084   johnlev 	 * 1. allocate a vaddr for the tx page, one for the rx page.
   1721   5084   johnlev 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
   1722   5084   johnlev 	 *    into the allocated vaddr (one for tx, one for rx).
   1723   5084   johnlev 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
   1724   5084   johnlev 	 *    bound to this domain.
   1725   5084   johnlev 	 * 4. associate the event channel with an interrupt.
   1726  10958       dme 	 * 5. enable the interrupt.
   1727   5084   johnlev 	 */
   1728   5084   johnlev 
   1729   5084   johnlev 	/* 1.tx */
   1730   5741       mrj 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1731   5084   johnlev 	    0, 0, 0, 0, VM_SLEEP);
   1732   5741       mrj 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
   1733   5084   johnlev 
   1734   5084   johnlev 	/* 2.tx */
   1735   5741       mrj 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
   1736   5084   johnlev 	map_op.flags = GNTMAP_host_map;
   1737   5741       mrj 	map_op.ref = xnbp->xnb_tx_ring_ref;
   1738   5741       mrj 	map_op.dom = xnbp->xnb_peer;
   1739   7756      Mark 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
   1740   7756      Mark 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1741   7756      Mark 	    map_op.status != 0) {
   1742   5084   johnlev 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
   1743   5084   johnlev 		goto fail;
   1744   5084   johnlev 	}
   1745   5741       mrj 	xnbp->xnb_tx_ring_handle = map_op.handle;
   1746   5084   johnlev 
   1747   5741       mrj 	/* LINTED: constant in conditional context */
   1748   5741       mrj 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
   1749   5741       mrj 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
   1750   5084   johnlev 
   1751   5084   johnlev 	/* 1.rx */
   1752   5741       mrj 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
   1753   5084   johnlev 	    0, 0, 0, 0, VM_SLEEP);
   1754   5741       mrj 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
   1755   5084   johnlev 
   1756   5084   johnlev 	/* 2.rx */
   1757   5741       mrj 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
   1758   5084   johnlev 	map_op.flags = GNTMAP_host_map;
   1759   5741       mrj 	map_op.ref = xnbp->xnb_rx_ring_ref;
   1760   5741       mrj 	map_op.dom = xnbp->xnb_peer;
   1761   7756      Mark 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
   1762   7756      Mark 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
   1763   7756      Mark 	    map_op.status != 0) {
   1764   5084   johnlev 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
   1765   5084   johnlev 		goto fail;
   1766   5084   johnlev 	}
   1767   5741       mrj 	xnbp->xnb_rx_ring_handle = map_op.handle;
   1768   5084   johnlev 
   1769   5741       mrj 	/* LINTED: constant in conditional context */
   1770   5741       mrj 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
   1771   5741       mrj 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
   1772   5084   johnlev 
   1773   5084   johnlev 	/* 3 */
   1774  10958       dme 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
   1775   5084   johnlev 		cmn_err(CE_WARN, "xnb_connect_rings: "
   1776   5741       mrj 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
   1777   5741       mrj 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1778   5084   johnlev 		goto fail;
   1779   5084   johnlev 	}
   1780   5741       mrj 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
   1781   5084   johnlev 
   1782   5084   johnlev 	/*
   1783   5084   johnlev 	 * It would be good to set the state to XenbusStateConnected
   1784   5084   johnlev 	 * here as well, but then what if ddi_add_intr() failed?
   1785   5084   johnlev 	 * Changing the state in the store will be noticed by the peer
   1786   5084   johnlev 	 * and cannot be "taken back".
   1787   5084   johnlev 	 */
   1788   5741       mrj 	mutex_enter(&xnbp->xnb_tx_lock);
   1789   5741       mrj 	mutex_enter(&xnbp->xnb_rx_lock);
   1790   5084   johnlev 
   1791   5741       mrj 	xnbp->xnb_connected = B_TRUE;
   1792   5084   johnlev 
   1793   5741       mrj 	mutex_exit(&xnbp->xnb_rx_lock);
   1794   5741       mrj 	mutex_exit(&xnbp->xnb_tx_lock);
   1795   5084   johnlev 
   1796  10958       dme 	/* 4, 5 */
   1797   5084   johnlev 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
   1798   5084   johnlev 	    != DDI_SUCCESS) {
   1799   5084   johnlev 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
   1800   5084   johnlev 		goto fail;
   1801   5084   johnlev 	}
   1802   5741       mrj 	xnbp->xnb_irq = B_TRUE;
   1803   5084   johnlev 
   1804   5084   johnlev 	return (B_TRUE);
   1805   5084   johnlev 
   1806   5084   johnlev fail:
   1807   5741       mrj 	mutex_enter(&xnbp->xnb_tx_lock);
   1808   5741       mrj 	mutex_enter(&xnbp->xnb_rx_lock);
   1809   5084   johnlev 
   1810   5741       mrj 	xnbp->xnb_connected = B_FALSE;
   1811  10958       dme 
   1812   5741       mrj 	mutex_exit(&xnbp->xnb_rx_lock);
   1813   5741       mrj 	mutex_exit(&xnbp->xnb_tx_lock);
   1814   5084   johnlev 
   1815   5084   johnlev 	return (B_FALSE);
   1816   5084   johnlev }
   1817   5084   johnlev 
   1818   5084   johnlev static void
   1819   5084   johnlev xnb_disconnect_rings(dev_info_t *dip)
   1820   5084   johnlev {
   1821   5084   johnlev 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1822   5084   johnlev 
   1823   5741       mrj 	if (xnbp->xnb_irq) {
   1824   5084   johnlev 		ddi_remove_intr(dip, 0, NULL);
   1825   5741       mrj 		xnbp->xnb_irq = B_FALSE;
   1826   5084   johnlev 	}
   1827   5741       mrj 
   1828   5741       mrj 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
   1829   5084   johnlev 		xvdi_free_evtchn(dip);
   1830   5741       mrj 		xnbp->xnb_evtchn = INVALID_EVTCHN;
   1831   5084   johnlev 	}
   1832   5084   johnlev 
   1833   5741       mrj 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
   1834   5084   johnlev 		struct gnttab_unmap_grant_ref unmap_op;
   1835   5084   johnlev 
   1836   5741       mrj 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1837   5741       mrj 		    xnbp->xnb_rx_ring_addr;
   1838   5084   johnlev 		unmap_op.dev_bus_addr = 0;
   1839   5741       mrj 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
   1840   5084   johnlev 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1841   5084   johnlev 		    &unmap_op, 1) != 0)
   1842   5084   johnlev 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1843   5084   johnlev 			    "cannot unmap rx-ring page (%d)",
   1844   5084   johnlev 			    unmap_op.status);
   1845   5084   johnlev 
   1846   5741       mrj 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
   1847   5084   johnlev 	}
   1848   5084   johnlev 
   1849   5741       mrj 	if (xnbp->xnb_rx_ring_addr != NULL) {
   1850   5741       mrj 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
   1851   5741       mrj 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
   1852   5741       mrj 		xnbp->xnb_rx_ring_addr = NULL;
   1853   5084   johnlev 	}
   1854   5084   johnlev 
   1855   5741       mrj 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
   1856   5084   johnlev 		struct gnttab_unmap_grant_ref unmap_op;
   1857   5084   johnlev 
   1858   5741       mrj 		unmap_op.host_addr = (uint64_t)(uintptr_t)
   1859   5741       mrj 		    xnbp->xnb_tx_ring_addr;
   1860   5084   johnlev 		unmap_op.dev_bus_addr = 0;
   1861   5741       mrj 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
   1862   5084   johnlev 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1863   5084   johnlev 		    &unmap_op, 1) != 0)
   1864   5084   johnlev 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
   1865   5084   johnlev 			    "cannot unmap tx-ring page (%d)",
   1866   5084   johnlev 			    unmap_op.status);
   1867   5084   johnlev 
   1868   5741       mrj 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
   1869   5084   johnlev 	}
   1870   5084   johnlev 
   1871   5741       mrj 	if (xnbp->xnb_tx_ring_addr != NULL) {
   1872   5741       mrj 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
   1873   5741       mrj 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
   1874   5741       mrj 		xnbp->xnb_tx_ring_addr = NULL;
   1875   5084   johnlev 	}
   1876   5084   johnlev }
   1877   5084   johnlev 
   1878   5084   johnlev static void
   1879   5084   johnlev xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   1880   5084   johnlev     void *arg, void *impl_data)
   1881   5084   johnlev {
   1882  10958       dme 	_NOTE(ARGUNUSED(id, arg));
   1883   5084   johnlev 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1884   5084   johnlev 	XenbusState new_state = *(XenbusState *)impl_data;
   1885   5084   johnlev 
   1886   5084   johnlev 	ASSERT(xnbp != NULL);
   1887   5084   johnlev 
   1888   5084   johnlev 	switch (new_state) {
   1889   5084   johnlev 	case XenbusStateConnected:
   1890   7005  cz147101 		/* spurious state change */
   1891   7005  cz147101 		if (xnbp->xnb_connected)
   1892   7005  cz147101 			return;
   1893   7005  cz147101 
   1894  10958       dme 		if (!xnb_read_oe_config(xnbp) ||
   1895  10958       dme 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
   1896  10958       dme 			cmn_err(CE_WARN, "xnb_oe_state_change: "
   1897  10958       dme 			    "read otherend config error");
   1898   5084   johnlev 			(void) xvdi_switch_state(dip, XBT_NULL,
   1899   5084   johnlev 			    XenbusStateClosed);
   1900   5084   johnlev 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1901  10958       dme 
   1902  10958       dme 			break;
   1903   5084   johnlev 		}
   1904  10958       dme 
   1905  10958       dme 
   1906  10958       dme 		mutex_enter(&xnbp->xnb_state_lock);
   1907  10958       dme 		xnbp->xnb_fe_status = XNB_STATE_READY;
   1908  10958       dme 		if (xnbp->xnb_be_status == XNB_STATE_READY)
   1909  10958       dme 			xnb_start_connect(xnbp);
   1910  10958       dme 		mutex_exit(&xnbp->xnb_state_lock);
   1911   5084   johnlev 
   1912   5084   johnlev 		/*
   1913   5084   johnlev 		 * Now that we've attempted to connect it's reasonable
   1914   5084   johnlev 		 * to allow an attempt to detach.
   1915   5084   johnlev 		 */
   1916   5741       mrj 		xnbp->xnb_detachable = B_TRUE;
   1917   5084   johnlev 
   1918   5084   johnlev 		break;
   1919   5084   johnlev 
   1920   5084   johnlev 	case XenbusStateClosing:
   1921   5084   johnlev 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
   1922   5084   johnlev 
   1923   5084   johnlev 		break;
   1924   5084   johnlev 
   1925   5084   johnlev 	case XenbusStateClosed:
   1926   5741       mrj 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
   1927   5084   johnlev 
   1928   5741       mrj 		mutex_enter(&xnbp->xnb_tx_lock);
   1929   5741       mrj 		mutex_enter(&xnbp->xnb_rx_lock);
   1930   5084   johnlev 
   1931   5084   johnlev 		xnb_disconnect_rings(dip);
   1932   5741       mrj 		xnbp->xnb_connected = B_FALSE;
   1933   5084   johnlev 
   1934   5741       mrj 		mutex_exit(&xnbp->xnb_rx_lock);
   1935   5741       mrj 		mutex_exit(&xnbp->xnb_tx_lock);
   1936   5084   johnlev 
   1937   5084   johnlev 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
   1938   5084   johnlev 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   1939   5084   johnlev 		/*
   1940   5084   johnlev 		 * In all likelyhood this is already set (in the above
   1941   5084   johnlev 		 * case), but if the peer never attempted to connect
   1942   5084   johnlev 		 * and the domain is destroyed we get here without
   1943   5084   johnlev 		 * having been through the case above, so we set it to
   1944   5084   johnlev 		 * be sure.
   1945   5084   johnlev 		 */
   1946   5741       mrj 		xnbp->xnb_detachable = B_TRUE;
   1947   5084   johnlev 
   1948   5084   johnlev 		break;
   1949   5084   johnlev 
   1950   5084   johnlev 	default:
   1951   5084   johnlev 		break;
   1952   5084   johnlev 	}
   1953   5084   johnlev }
   1954   5084   johnlev 
   1955   5084   johnlev static void
   1956   5084   johnlev xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   1957   5084   johnlev     void *arg, void *impl_data)
   1958   5084   johnlev {
   1959  10958       dme 	_NOTE(ARGUNUSED(id, arg));
   1960   5084   johnlev 	xnb_t *xnbp = ddi_get_driver_private(dip);
   1961   5084   johnlev 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
   1962   5084   johnlev 
   1963   5084   johnlev 	ASSERT(xnbp != NULL);
   1964   5084   johnlev 
   1965   5084   johnlev 	switch (state) {
   1966   5084   johnlev 	case Connected:
   1967   7005  cz147101 		/* spurious hotplug event */
   1968   7005  cz147101 		if (xnbp->xnb_hotplugged)
   1969  10958       dme 			break;
   1970   7005  cz147101 
   1971  10958       dme 		if (!xnb_read_xs_config(xnbp))
   1972  10958       dme 			break;
   1973  10958       dme 
   1974  10958       dme 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
   1975  10958       dme 			break;
   1976   5084   johnlev 
   1977   5741       mrj 		mutex_enter(&xnbp->xnb_tx_lock);
   1978   5741       mrj 		mutex_enter(&xnbp->xnb_rx_lock);
   1979   5084   johnlev 
   1980  10958       dme 		xnbp->xnb_hotplugged = B_TRUE;
   1981   5084   johnlev 
   1982   5741       mrj 		mutex_exit(&xnbp->xnb_rx_lock);
   1983   5741       mrj 		mutex_exit(&xnbp->xnb_tx_lock);
   1984  10958       dme 
   1985  10958       dme 		mutex_enter(&xnbp->xnb_state_lock);
   1986  10958       dme 		xnbp->xnb_be_status = XNB_STATE_READY;
   1987  10958       dme 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
   1988  10958       dme 			xnb_start_connect(xnbp);
   1989  10958       dme 		mutex_exit(&xnbp->xnb_state_lock);
   1990  10958       dme 
   1991   5084   johnlev 		break;
   1992   5084   johnlev 
   1993   5084   johnlev 	default:
   1994   5084   johnlev 		break;
   1995   5084   johnlev 	}
   1996   5084   johnlev }
   1997   5084   johnlev 
   1998   5084   johnlev static struct modldrv modldrv = {
   1999   7351       dme 	&mod_miscops, "xnb",
   2000   5084   johnlev };
   2001   5084   johnlev 
   2002   5084   johnlev static struct modlinkage modlinkage = {
   2003   5084   johnlev 	MODREV_1, &modldrv, NULL
   2004   5084   johnlev };
   2005   5084   johnlev 
   2006   5084   johnlev int
   2007   5084   johnlev _init(void)
   2008   5084   johnlev {
   2009   5084   johnlev 	int i;
   2010   5084   johnlev 
   2011   5084   johnlev 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
   2012   5084   johnlev 
   2013  10958       dme 	i = mod_install(&modlinkage);
   2014  10958       dme 	if (i != DDI_SUCCESS)
   2015  10958       dme 		mutex_destroy(&xnb_alloc_page_lock);
   2016   5084   johnlev 
   2017   5084   johnlev 	return (i);
   2018   5084   johnlev }
   2019   5084   johnlev 
   2020   5084   johnlev int
   2021   5084   johnlev _info(struct modinfo *modinfop)
   2022   5084   johnlev {
   2023   5084   johnlev 	return (mod_info(&modlinkage, modinfop));
   2024   5084   johnlev }
   2025   5084   johnlev 
   2026   5084   johnlev int
   2027   5084   johnlev _fini(void)
   2028   5084   johnlev {
   2029   5084   johnlev 	int i;
   2030   5084   johnlev 
   2031   5084   johnlev 	i = mod_remove(&modlinkage);
   2032  10958       dme 	if (i == DDI_SUCCESS)
   2033   5084   johnlev 		mutex_destroy(&xnb_alloc_page_lock);
   2034  10958       dme 
   2035   5084   johnlev 	return (i);
   2036   5084   johnlev }
   2037